forked from moreo/QuaPy
Compare commits
37 Commits
|
@ -27,6 +27,40 @@ share/python-wheels/
|
||||||
.installed.cfg
|
.installed.cfg
|
||||||
*.egg
|
*.egg
|
||||||
MANIFEST
|
MANIFEST
|
||||||
|
.idea
|
||||||
|
.vscode
|
||||||
|
LeQua2022
|
||||||
|
MultiLabel/results_generales
|
||||||
|
MultiLabel/mlqtables
|
||||||
|
NewMethods/plots*
|
||||||
|
NewMethods/results*
|
||||||
|
NewMethods/tables*
|
||||||
|
NewMethods/latex*
|
||||||
|
Ordinal/data*
|
||||||
|
Ordinal/roberta*
|
||||||
|
Ordinal/tables*
|
||||||
|
Ordinal/results*
|
||||||
|
eDiscovery/plots*
|
||||||
|
eDiscovery/results*
|
||||||
|
examples/results*
|
||||||
|
poster-cikm*
|
||||||
|
slides-cikm*
|
||||||
|
slides-short-cikm*
|
||||||
|
quick_experiment/figures*
|
||||||
|
quick_experiment/figures*
|
||||||
|
svm_perf_quantification/*
|
||||||
|
TweetSentQuant/plots*
|
||||||
|
TweetSentQuant/results*
|
||||||
|
TweetSentQuant/tables*
|
||||||
|
TweetSentQuant/Tweet Sentiment Quantification_NPP
|
||||||
|
TweetSentQuant/checkpoint
|
||||||
|
TweetSentQuant/*.tex
|
||||||
|
checkpoint
|
||||||
|
*.png
|
||||||
|
*.zip
|
||||||
|
*.pkl
|
||||||
|
*.pickle
|
||||||
|
*.pdf
|
||||||
|
|
||||||
# PyInstaller
|
# PyInstaller
|
||||||
# Usually these files are written by a python script from a template
|
# Usually these files are written by a python script from a template
|
||||||
|
@ -130,3 +164,32 @@ dmypy.json
|
||||||
.pyre/
|
.pyre/
|
||||||
|
|
||||||
*__pycache__*
|
*__pycache__*
|
||||||
|
*.dataframe
|
||||||
|
*.svg
|
||||||
|
|
||||||
|
cross_lingual.py
|
||||||
|
|
||||||
|
# results
|
||||||
|
*.csv
|
||||||
|
|
||||||
|
# other projects
|
||||||
|
*.tex
|
||||||
|
*.md
|
||||||
|
*.doctree
|
||||||
|
*.rst
|
||||||
|
MultiLabel/
|
||||||
|
NewMethods/
|
||||||
|
Ordinal/
|
||||||
|
Retrieval/
|
||||||
|
quick_experiment/
|
||||||
|
Transduction/
|
||||||
|
Transduction_office/
|
||||||
|
TweetSentQuant/
|
||||||
|
laboratory/
|
||||||
|
docs/
|
||||||
|
dataset_drifts.py
|
||||||
|
notes.txt
|
||||||
|
eDiscovery/
|
||||||
|
examples/experiment_Dirichlet_calibration.py
|
||||||
|
examples/experiments_dirichlet_cal.txt
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,124 @@
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from distribution_matching.method.kdex import KDExML
|
||||||
|
from distribution_matching.method.method_kdey import KDEy
|
||||||
|
from distribution_matching.method.method_kdey_closed_efficient_correct import KDEyclosed_efficient_corr
|
||||||
|
from distribution_matching.method.kdey import KDEyCS, KDEyHD, KDEyML
|
||||||
|
from quapy.method.aggregative import EMQ, CC, PCC, DistributionMatching, PACC, HDy, OneVsAllAggregative, ACC
|
||||||
|
from distribution_matching.method.dirichlety import DIRy
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
|
||||||
|
# set to True to get the full list of methods tested in the paper (reported in the appendix)
|
||||||
|
# set to False to get the reduced list (shown in the body of the paper)
|
||||||
|
FULL_METHOD_LIST = False
|
||||||
|
|
||||||
|
if FULL_METHOD_LIST:
|
||||||
|
ADJUSTMENT_METHODS = ['ACC', 'PACC']
|
||||||
|
DISTR_MATCH_METHODS = ['HDy-OvA', 'DM-T', 'DM-HD', 'KDEy-HD', 'DM-CS', 'KDEy-CS']
|
||||||
|
MAX_LIKE_METHODS = ['DIR', 'EMQ', 'EMQ-BCTS', 'KDEy-ML']
|
||||||
|
else:
|
||||||
|
ADJUSTMENT_METHODS = ['PACC']
|
||||||
|
DISTR_MATCH_METHODS = ['DM-T', 'DM-HD', 'KDEy-HD', 'DM-CS', 'KDEy-CS']
|
||||||
|
MAX_LIKE_METHODS = ['EMQ', 'KDEy-ML']
|
||||||
|
|
||||||
|
# list of methods to consider
|
||||||
|
METHODS = ADJUSTMENT_METHODS + DISTR_MATCH_METHODS + MAX_LIKE_METHODS
|
||||||
|
BIN_METHODS = [x.replace('-OvA', '') for x in METHODS]
|
||||||
|
|
||||||
|
# common hyperparameterss
|
||||||
|
hyper_LR = {
|
||||||
|
'classifier__C': np.logspace(-3,3,7),
|
||||||
|
'classifier__class_weight': ['balanced', None]
|
||||||
|
}
|
||||||
|
|
||||||
|
hyper_kde = {
|
||||||
|
'bandwidth': np.linspace(0.01, 0.2, 20)
|
||||||
|
}
|
||||||
|
|
||||||
|
nbins_range = [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 64]
|
||||||
|
|
||||||
|
|
||||||
|
# instances a new quantifier based on a string name
|
||||||
|
def new_method(method, **lr_kwargs):
|
||||||
|
lr = LogisticRegression(**lr_kwargs)
|
||||||
|
|
||||||
|
if method == 'CC':
|
||||||
|
param_grid = hyper_LR
|
||||||
|
quantifier = CC(lr)
|
||||||
|
elif method == 'PCC':
|
||||||
|
param_grid = hyper_LR
|
||||||
|
quantifier = PCC(lr)
|
||||||
|
elif method == 'ACC':
|
||||||
|
param_grid = hyper_LR
|
||||||
|
quantifier = ACC(lr)
|
||||||
|
elif method == 'PACC':
|
||||||
|
param_grid = hyper_LR
|
||||||
|
quantifier = PACC(lr)
|
||||||
|
elif method in ['KDEy-HD']:
|
||||||
|
param_grid = {**hyper_kde, **hyper_LR}
|
||||||
|
quantifier = KDEyHD(lr)
|
||||||
|
elif method == 'KDEy-CS':
|
||||||
|
param_grid = {**hyper_kde, **hyper_LR}
|
||||||
|
quantifier = KDEyCS(lr)
|
||||||
|
elif method == 'KDEy-ML':
|
||||||
|
param_grid = {**hyper_kde, **hyper_LR}
|
||||||
|
quantifier = KDEyML(lr)
|
||||||
|
elif method == 'KDEx-ML':
|
||||||
|
param_grid = {
|
||||||
|
'bandwidth': np.linspace(0.001, 2, 501)
|
||||||
|
}
|
||||||
|
quantifier = KDExML()
|
||||||
|
elif method == 'DIR':
|
||||||
|
param_grid = hyper_LR
|
||||||
|
quantifier = DIRy(lr)
|
||||||
|
elif method == 'EMQ':
|
||||||
|
param_grid = hyper_LR
|
||||||
|
quantifier = EMQ(lr)
|
||||||
|
elif method == 'EMQ-BCTS':
|
||||||
|
method_params = {'exact_train_prev': [False], 'recalib': ['bcts']}
|
||||||
|
param_grid = {**method_params, **hyper_LR}
|
||||||
|
quantifier = EMQ(lr)
|
||||||
|
elif method == 'HDy':
|
||||||
|
param_grid = hyper_LR
|
||||||
|
quantifier = HDy(lr)
|
||||||
|
elif method == 'HDy-OvA':
|
||||||
|
param_grid = {'binary_quantifier__' + key: val for key, val in hyper_LR.items()}
|
||||||
|
quantifier = OneVsAllAggregative(HDy(lr))
|
||||||
|
elif method == 'DM-T':
|
||||||
|
method_params = {
|
||||||
|
'nbins': nbins_range,
|
||||||
|
'val_split': [10],
|
||||||
|
'divergence': ['topsoe']
|
||||||
|
}
|
||||||
|
param_grid = {**method_params, **hyper_LR}
|
||||||
|
quantifier = DistributionMatching(lr)
|
||||||
|
elif method == 'DM-HD':
|
||||||
|
method_params = {
|
||||||
|
'nbins': nbins_range,
|
||||||
|
'val_split': [10],
|
||||||
|
'divergence': ['HD']
|
||||||
|
}
|
||||||
|
param_grid = {**method_params, **hyper_LR}
|
||||||
|
quantifier = DistributionMatching(lr)
|
||||||
|
elif method == 'DM-CS':
|
||||||
|
method_params = {
|
||||||
|
'nbins': nbins_range,
|
||||||
|
'val_split': [10],
|
||||||
|
'divergence': ['CS']
|
||||||
|
}
|
||||||
|
param_grid = {**method_params, **hyper_LR}
|
||||||
|
quantifier = DistributionMatching(lr)
|
||||||
|
else:
|
||||||
|
raise NotImplementedError('unknown method', method)
|
||||||
|
|
||||||
|
return param_grid, quantifier
|
||||||
|
|
||||||
|
|
||||||
|
def show_results(result_path):
|
||||||
|
df = pd.read_csv(result_path+'.csv', sep='\t')
|
||||||
|
|
||||||
|
pd.set_option('display.max_columns', None)
|
||||||
|
pd.set_option('display.max_rows', None)
|
||||||
|
pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"])
|
||||||
|
print(pv)
|
||||||
|
|
|
@ -0,0 +1,73 @@
|
||||||
|
|
||||||
|
import math
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from sklearn.model_selection import train_test_split, cross_val_predict
|
||||||
|
from sklearn.neighbors import KernelDensity
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from data import LabelledCollection
|
||||||
|
|
||||||
|
scale = 100
|
||||||
|
|
||||||
|
|
||||||
|
import quapy as qp
|
||||||
|
|
||||||
|
data = qp.datasets.fetch_twitter('wb', min_df=3, pickle=True, for_model_selection=False)
|
||||||
|
|
||||||
|
X, y = data.training.Xy
|
||||||
|
|
||||||
|
cls = LogisticRegression(C=0.0001, random_state=0)
|
||||||
|
|
||||||
|
|
||||||
|
posteriors = cross_val_predict(cls, X=X, y=y, method='predict_proba', n_jobs=-1, cv=3)
|
||||||
|
|
||||||
|
cls.fit(X, y)
|
||||||
|
|
||||||
|
Xte, yte = data.test.Xy
|
||||||
|
|
||||||
|
post_c1 = posteriors[y==0]
|
||||||
|
post_c2 = posteriors[y==1]
|
||||||
|
post_c3 = posteriors[y==2]
|
||||||
|
|
||||||
|
|
||||||
|
print(len(post_c1))
|
||||||
|
print(len(post_c2))
|
||||||
|
print(len(post_c3))
|
||||||
|
|
||||||
|
post_test = cls.predict_proba(Xte)
|
||||||
|
|
||||||
|
alpha = qp.functional.prevalence_from_labels(yte, classes=[0, 1, 2])
|
||||||
|
|
||||||
|
|
||||||
|
nbins = 20
|
||||||
|
|
||||||
|
plt.rcParams.update({'font.size': 7})
|
||||||
|
|
||||||
|
fig = plt.figure()
|
||||||
|
positions = np.asarray([2,1,0])
|
||||||
|
colors = ['r', 'g', 'b']
|
||||||
|
|
||||||
|
for i, post_set in enumerate([post_c1, post_c2, post_c3, post_test]):
|
||||||
|
ax = fig.add_subplot(141+i, projection='3d')
|
||||||
|
for post, c, z in zip(post_set.T, colors, positions):
|
||||||
|
|
||||||
|
hist, bins = np.histogram(post, bins=nbins, density=True, range=[0,1])
|
||||||
|
xs = (bins[:-1] + bins[1:])/2
|
||||||
|
|
||||||
|
ax.bar(xs, hist, width=1/nbins, zs=z, zdir='y', color=c, ec=c, alpha=0.6)
|
||||||
|
|
||||||
|
ax.yaxis.set_ticks(positions)
|
||||||
|
ax.yaxis.set_ticklabels(['$y=1$', '$y=2$', '$y=3$'])
|
||||||
|
ax.xaxis.set_ticks([])
|
||||||
|
ax.xaxis.set_ticklabels([], minor=True)
|
||||||
|
ax.zaxis.set_ticks([])
|
||||||
|
ax.zaxis.set_ticklabels([], minor=True)
|
||||||
|
|
||||||
|
|
||||||
|
#plt.figure(figsize=(10,6))
|
||||||
|
#plt.show()
|
||||||
|
plt.savefig('./histograms.pdf')
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,29 @@
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import seaborn as sns
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
"""
|
||||||
|
This script generates plots of sensibility for the number of classes
|
||||||
|
Plots results for MAE, MRAE, and KLD
|
||||||
|
The hyperparameters were set as:
|
||||||
|
quantifier.set_params(classifier__C=0.01, classifier__class_weight='balanced', bandwidth=0.2)
|
||||||
|
"""
|
||||||
|
|
||||||
|
methods = ['DM', 'KDEy-ML', 'EMQ']
|
||||||
|
optim = 'mae'
|
||||||
|
dfs = [pd.read_csv(f'../results/lequa/nclasses/{optim}/{method}.csv', sep='\t') for method in methods]
|
||||||
|
df = pd.concat(dfs)
|
||||||
|
|
||||||
|
|
||||||
|
for err in ['MAE', 'MRAE']:
|
||||||
|
piv = df.pivot_table(index='nClasses', columns='Method', values=err)
|
||||||
|
g = sns.lineplot(data=piv, markers=True, dashes=False)
|
||||||
|
g.set(xlim=(1, 28))
|
||||||
|
g.legend(loc="center left", bbox_to_anchor=(1, 0.5))
|
||||||
|
g.set_ylabel(err)
|
||||||
|
g.set_xticks(np.linspace(1, 28, 28))
|
||||||
|
plt.xticks(rotation=90)
|
||||||
|
plt.grid()
|
||||||
|
plt.savefig(f'./nclasses_{err}.pdf', bbox_inches='tight')
|
||||||
|
plt.clf()
|
|
@ -0,0 +1,50 @@
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import seaborn as sns
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
"""
|
||||||
|
This script generates plots of sensibility for the hyperparameter "bandwidth".
|
||||||
|
Plots results for MAE, MRAE, and KLD
|
||||||
|
The rest of hyperparameters were set to their default values
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
log_mrae = True
|
||||||
|
|
||||||
|
for method, param, xlim, xticks in [
|
||||||
|
('KDEy-ML', 'Bandwidth', (0.01, 0.2), np.linspace(0.01, 0.2, 20)),
|
||||||
|
('DM-HD', 'nbins', (2,32), list(range(2,10)) + list(range(10,34,2)))
|
||||||
|
]:
|
||||||
|
|
||||||
|
for dataset in ['tweet', 'lequa', 'uciml']:
|
||||||
|
|
||||||
|
if dataset == 'tweet':
|
||||||
|
df = pd.read_csv(f'../results/tweet/sensibility/{method}.csv', sep='\t')
|
||||||
|
ylim = (0.03, 0.21)
|
||||||
|
elif dataset == 'lequa':
|
||||||
|
df = pd.read_csv(f'../results/lequa/T1B/sensibility/{method}.csv', sep='\t')
|
||||||
|
ylim = (0.0125, 0.03)
|
||||||
|
elif dataset == 'uciml':
|
||||||
|
ylim = (0, 0.23)
|
||||||
|
df = pd.read_csv(f'../results/ucimulti/sensibility/{method}.csv', sep='\t')
|
||||||
|
|
||||||
|
for err in ['MAE']: #, 'MRAE']:
|
||||||
|
piv = df.pivot_table(index=param, columns='Dataset', values=err)
|
||||||
|
g = sns.lineplot(data=piv, markers=True, dashes=False)
|
||||||
|
g.set(xlim=xlim)
|
||||||
|
g.legend(loc="center left", bbox_to_anchor=(1, 0.5))
|
||||||
|
|
||||||
|
if log_mrae and err=='MRAE':
|
||||||
|
plt.yscale('log')
|
||||||
|
g.set_ylabel('log('+err+')')
|
||||||
|
else:
|
||||||
|
g.set_ylabel(err)
|
||||||
|
|
||||||
|
g.set_ylim(ylim)
|
||||||
|
g.set_xticks(xticks)
|
||||||
|
plt.xticks(rotation=90)
|
||||||
|
plt.grid()
|
||||||
|
plt.savefig(f'./sensibility_{method}_{dataset}_{err}.pdf', bbox_inches='tight')
|
||||||
|
plt.clf()
|
|
@ -0,0 +1,154 @@
|
||||||
|
import math
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
from sklearn.neighbors import KernelDensity
|
||||||
|
|
||||||
|
from data import LabelledCollection
|
||||||
|
|
||||||
|
scale = 10
|
||||||
|
|
||||||
|
|
||||||
|
# con ternary (una lib de matplotlib) salen bien pero no puedo crear contornos, o no se
|
||||||
|
# con plotly salen los contornos bien, pero es un poco un jaleo porque utiliza el navegador...
|
||||||
|
|
||||||
|
def plot_simplex_(ax, density, title='', fontsize=9, points=None):
|
||||||
|
import ternary
|
||||||
|
|
||||||
|
tax = ternary.TernaryAxesSubplot(ax=ax, scale=scale)
|
||||||
|
tax.heatmapf(density, boundary=True, style="triangular", colorbar=False, cmap='viridis') #cmap='magma')
|
||||||
|
tax.boundary(linewidth=1.0)
|
||||||
|
corner_fontsize = 5*fontsize//6
|
||||||
|
tax.right_corner_label("$y=3$", fontsize=corner_fontsize)
|
||||||
|
tax.top_corner_label("$y=2$", fontsize=corner_fontsize)
|
||||||
|
tax.left_corner_label("$y=1$", fontsize=corner_fontsize)
|
||||||
|
if title:
|
||||||
|
tax.set_title(title, loc='center', y=-0.11, fontsize=fontsize)
|
||||||
|
if points is not None:
|
||||||
|
tax.scatter(points*scale, marker='o', color='w', alpha=0.25, zorder=10)
|
||||||
|
tax.get_axes().axis('off')
|
||||||
|
tax.clear_matplotlib_ticks()
|
||||||
|
|
||||||
|
return tax
|
||||||
|
|
||||||
|
|
||||||
|
def plot_simplex(ax, coord, kde_scores, title='', fontsize=11, points=None, savepath=None):
|
||||||
|
import plotly.figure_factory as ff
|
||||||
|
|
||||||
|
tax = ff.create_ternary_contour(coord.T, kde_scores, pole_labels=['y=1', 'y=2', 'y=3'],
|
||||||
|
interp_mode='cartesian',
|
||||||
|
ncontours=20,
|
||||||
|
colorscale='Viridis',
|
||||||
|
showscale=True,
|
||||||
|
title=title)
|
||||||
|
if savepath is None:
|
||||||
|
tax.show()
|
||||||
|
else:
|
||||||
|
tax.write_image(savepath)
|
||||||
|
return tax
|
||||||
|
|
||||||
|
from mpl_toolkits.axes_grid1 import make_axes_locatable
|
||||||
|
def plot_3class_problem(post_c1, post_c2, post_c3, post_test, alpha, bandwidth):
|
||||||
|
import ternary
|
||||||
|
|
||||||
|
post_c1 = np.flip(post_c1, axis=1)
|
||||||
|
post_c2 = np.flip(post_c2, axis=1)
|
||||||
|
post_c3 = np.flip(post_c3, axis=1)
|
||||||
|
post_test = np.flip(post_test, axis=1)
|
||||||
|
|
||||||
|
size_=10
|
||||||
|
fig = ternary.plt.figure(figsize=(4*size_, 1*size_))
|
||||||
|
fig.tight_layout()
|
||||||
|
ax1 = fig.add_subplot(1, 4, 1)
|
||||||
|
divider = make_axes_locatable(ax1)
|
||||||
|
ax2 = fig.add_subplot(1, 4, 2)
|
||||||
|
divider = make_axes_locatable(ax2)
|
||||||
|
ax3 = fig.add_subplot(1, 4, 3)
|
||||||
|
divider = make_axes_locatable(ax3)
|
||||||
|
ax4 = fig.add_subplot(1, 4, 4)
|
||||||
|
divider = make_axes_locatable(ax4)
|
||||||
|
|
||||||
|
kde1 = KernelDensity(bandwidth=bandwidth).fit(post_c1)
|
||||||
|
kde2 = KernelDensity(bandwidth=bandwidth).fit(post_c2)
|
||||||
|
kde3 = KernelDensity(bandwidth=bandwidth).fit(post_c3)
|
||||||
|
|
||||||
|
#post_c1 = np.concatenate([post_c1, np.eye(3, dtype=float)])
|
||||||
|
#post_c2 = np.concatenate([post_c2, np.eye(3, dtype=float)])
|
||||||
|
#post_c3 = np.concatenate([post_c3, np.eye(3, dtype=float)])
|
||||||
|
|
||||||
|
#plot_simplex_(ax1, lambda x:0, title='$f_1(\mathbf{x})=p(s(\mathbf{x})|y=1)$')
|
||||||
|
#plot_simplex_(ax2, lambda x:0, title='$f_1(\mathbf{x})=p(s(\mathbf{x})|y=1)$')
|
||||||
|
#plot_simplex_(ax3, lambda x:0, title='$f_1(\mathbf{x})=p(s(\mathbf{x})|y=1)$')
|
||||||
|
def density(kde):
|
||||||
|
def d(p):
|
||||||
|
return np.exp(kde([p])).item()
|
||||||
|
return d
|
||||||
|
|
||||||
|
plot_simplex_(ax1, density(kde1.score_samples), title='$p_1$')
|
||||||
|
plot_simplex_(ax2, density(kde2.score_samples), title='$p_2$')
|
||||||
|
plot_simplex_(ax3, density(kde3.score_samples), title='$p_3$')
|
||||||
|
#plot_simplex(ax1, post_c1, np.exp(kde1.score_samples(post_c1)), title='$f_1(\mathbf{x})=p(s(\mathbf{x})|y=1)$') #, savepath='figure/y1.png')
|
||||||
|
#plot_simplex(ax2, post_c2, np.exp(kde2.score_samples(post_c2)), title='$f_2(\mathbf{x})=p(s(\mathbf{x})|y=2)$') #, savepath='figure/y2.png')
|
||||||
|
#plot_simplex(ax3, post_c3, np.exp(kde3.score_samples(post_c3)), title='$f_3(\mathbf{x})=p(s(\mathbf{x})|y=3)$') #, savepath='figure/y3.png')
|
||||||
|
|
||||||
|
def mixture_(prevs, kdes):
|
||||||
|
def m(p):
|
||||||
|
total_density = 0
|
||||||
|
for prev, kde in zip(prevs, kdes):
|
||||||
|
log_density = kde.score_samples([p]).item()
|
||||||
|
density = np.exp(log_density)
|
||||||
|
density *= prev
|
||||||
|
total_density += density
|
||||||
|
#print(total_density)
|
||||||
|
return total_density
|
||||||
|
return m
|
||||||
|
|
||||||
|
title = '$p_{\mathbf{\\alpha}} = \sum_{i \in n} \\alpha_i p_i$'
|
||||||
|
|
||||||
|
plot_simplex_(ax4, mixture_(alpha, [kde1, kde2, kde3]), title=title, points=post_test)
|
||||||
|
#mixture(alpha, [kde1, kde2, kde3])
|
||||||
|
|
||||||
|
#post_test = np.concatenate([post_test, np.eye(3, dtype=float)])
|
||||||
|
#test_scores = sum(alphai*np.exp(kdei.score_samples(post_test)) for alphai, kdei in zip(alpha, [kde1,kde2,kde3]))
|
||||||
|
#plot_simplex(ax4, post_test, test_scores, title=title, points=post_test)
|
||||||
|
|
||||||
|
#ternary.plt.show()
|
||||||
|
ternary.plt.savefig('./simplex.png')
|
||||||
|
|
||||||
|
|
||||||
|
import quapy as qp
|
||||||
|
|
||||||
|
|
||||||
|
data = qp.datasets.fetch_twitter('wb', min_df=3, pickle=True, for_model_selection=False)
|
||||||
|
|
||||||
|
X, y = data.training.Xy
|
||||||
|
|
||||||
|
cls = LogisticRegression(C=0.0001, random_state=0)
|
||||||
|
|
||||||
|
Xtr, Xte, ytr, yte = train_test_split(X, y, train_size=0.7, stratify=y, random_state=0)
|
||||||
|
|
||||||
|
cls.fit(Xtr, ytr)
|
||||||
|
|
||||||
|
test = LabelledCollection(Xte, yte)
|
||||||
|
test = test.sampling(100, *[0.2, 0.1, 0.7])
|
||||||
|
|
||||||
|
Xte, yte = test.Xy
|
||||||
|
|
||||||
|
post_c1 = cls.predict_proba(Xte[yte==0])
|
||||||
|
post_c2 = cls.predict_proba(Xte[yte==1])
|
||||||
|
post_c3 = cls.predict_proba(Xte[yte==2])
|
||||||
|
|
||||||
|
post_test = cls.predict_proba(Xte)
|
||||||
|
print(post_test)
|
||||||
|
alpha = qp.functional.prevalence_from_labels(yte, classes=[0, 1, 2])
|
||||||
|
|
||||||
|
#post_c1 = np.random.dirichlet([10,3,1], 30)
|
||||||
|
#post_c2 = np.random.dirichlet([1,11,6], 30)
|
||||||
|
#post_c3 = np.random.dirichlet([1,5,20], 30)
|
||||||
|
#post_test = np.random.dirichlet([5,1,6], 100)
|
||||||
|
#alpha = [0.5, 0.3, 0.2]
|
||||||
|
|
||||||
|
|
||||||
|
print(f'test alpha {alpha}')
|
||||||
|
plot_3class_problem(post_c1, post_c2, post_c3, post_test, alpha, bandwidth=0.1)
|
||||||
|
|
|
@ -0,0 +1,129 @@
|
||||||
|
import ternary
|
||||||
|
import math
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from sklearn.model_selection import train_test_split, cross_val_predict
|
||||||
|
from sklearn.neighbors import KernelDensity
|
||||||
|
import plotly.figure_factory as ff
|
||||||
|
|
||||||
|
from data import LabelledCollection
|
||||||
|
|
||||||
|
scale = 100
|
||||||
|
|
||||||
|
|
||||||
|
# con ternary (una lib de matplotlib) salen bien pero no puedo crear contornos, o no se
|
||||||
|
# con plotly salen los contornos bien, pero es un poco un jaleo porque utiliza el navegador...
|
||||||
|
|
||||||
|
def plot_simplex_(ax, density, title='', fontsize=30, points=None):
|
||||||
|
|
||||||
|
tax = ternary.TernaryAxesSubplot(ax=ax, scale=scale)
|
||||||
|
tax.heatmapf(density, boundary=True, style="triangular", colorbar=False, cmap='viridis') #cmap='magma')
|
||||||
|
tax.boundary(linewidth=1.0)
|
||||||
|
corner_fontsize = int(5*fontsize//6)
|
||||||
|
tax.right_corner_label("$y=3$", fontsize=corner_fontsize)
|
||||||
|
tax.top_corner_label("$y=2$", fontsize=corner_fontsize)
|
||||||
|
tax.left_corner_label("$y=1$", fontsize=corner_fontsize)
|
||||||
|
if title:
|
||||||
|
tax.set_title(title, loc='center', y=-0.11, fontsize=fontsize)
|
||||||
|
if points is not None:
|
||||||
|
tax.scatter(points*scale, marker='o', color='w', alpha=0.25, zorder=10, s=5*scale)
|
||||||
|
tax.get_axes().axis('off')
|
||||||
|
tax.clear_matplotlib_ticks()
|
||||||
|
|
||||||
|
return tax
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
from mpl_toolkits.axes_grid1 import make_axes_locatable
|
||||||
|
def plot_3class_problem(post_c1, post_c2, post_c3, post_test, alpha, bandwidth):
|
||||||
|
post_c1 = np.flip(post_c1, axis=1)
|
||||||
|
post_c2 = np.flip(post_c2, axis=1)
|
||||||
|
post_c3 = np.flip(post_c3, axis=1)
|
||||||
|
post_test = np.flip(post_test, axis=1)
|
||||||
|
|
||||||
|
size_=10
|
||||||
|
fig = ternary.plt.figure(figsize=(5*size_, 1*size_))
|
||||||
|
fig.tight_layout()
|
||||||
|
ax1 = fig.add_subplot(1, 4, 1)
|
||||||
|
divider = make_axes_locatable(ax1)
|
||||||
|
ax2 = fig.add_subplot(1, 4, 2)
|
||||||
|
divider = make_axes_locatable(ax2)
|
||||||
|
ax3 = fig.add_subplot(1, 4, 3)
|
||||||
|
divider = make_axes_locatable(ax3)
|
||||||
|
ax4 = fig.add_subplot(1, 4, 4)
|
||||||
|
divider = make_axes_locatable(ax4)
|
||||||
|
|
||||||
|
kde1 = KernelDensity(bandwidth=bandwidth).fit(post_c1)
|
||||||
|
kde2 = KernelDensity(bandwidth=bandwidth).fit(post_c2)
|
||||||
|
kde3 = KernelDensity(bandwidth=bandwidth).fit(post_c3)
|
||||||
|
|
||||||
|
#post_c1 = np.concatenate([post_c1, np.eye(3, dtype=float)])
|
||||||
|
#post_c2 = np.concatenate([post_c2, np.eye(3, dtype=float)])
|
||||||
|
#post_c3 = np.concatenate([post_c3, np.eye(3, dtype=float)])
|
||||||
|
|
||||||
|
#plot_simplex_(ax1, lambda x:0, title='$f_1(\mathbf{x})=p(s(\mathbf{x})|y=1)$')
|
||||||
|
#plot_simplex_(ax2, lambda x:0, title='$f_1(\mathbf{x})=p(s(\mathbf{x})|y=1)$')
|
||||||
|
#plot_simplex_(ax3, lambda x:0, title='$f_1(\mathbf{x})=p(s(\mathbf{x})|y=1)$')
|
||||||
|
def density(kde):
|
||||||
|
def d(p):
|
||||||
|
return np.exp(kde([p])).item()
|
||||||
|
return d
|
||||||
|
|
||||||
|
plot_simplex_(ax1, density(kde1.score_samples), title='$p_1$')
|
||||||
|
plot_simplex_(ax2, density(kde2.score_samples), title='$p_2$')
|
||||||
|
plot_simplex_(ax3, density(kde3.score_samples), title='$p_3$')
|
||||||
|
#plot_simplex(ax1, post_c1, np.exp(kde1.score_samples(post_c1)), title='$f_1(\mathbf{x})=p(s(\mathbf{x})|y=1)$') #, savepath='figure/y1.png')
|
||||||
|
#plot_simplex(ax2, post_c2, np.exp(kde2.score_samples(post_c2)), title='$f_2(\mathbf{x})=p(s(\mathbf{x})|y=2)$') #, savepath='figure/y2.png')
|
||||||
|
#plot_simplex(ax3, post_c3, np.exp(kde3.score_samples(post_c3)), title='$f_3(\mathbf{x})=p(s(\mathbf{x})|y=3)$') #, savepath='figure/y3.png')
|
||||||
|
|
||||||
|
def mixture_(prevs, kdes):
|
||||||
|
def m(p):
|
||||||
|
total_density = 0
|
||||||
|
for prev, kde in zip(prevs, kdes):
|
||||||
|
log_density = kde.score_samples([p]).item()
|
||||||
|
density = np.exp(log_density)
|
||||||
|
density *= prev
|
||||||
|
total_density += density
|
||||||
|
#print(total_density)
|
||||||
|
return total_density
|
||||||
|
return m
|
||||||
|
|
||||||
|
title = '$\mathbf{p}_{\mathbf{\\alpha}} = \sum_{i \in n} \\alpha_i p_i$'
|
||||||
|
|
||||||
|
plot_simplex_(ax4, mixture_(alpha, [kde1, kde2, kde3]), title=title, points=post_test)
|
||||||
|
|
||||||
|
#ternary.plt.show()
|
||||||
|
ternary.plt.savefig('./simplex.pdf')
|
||||||
|
|
||||||
|
|
||||||
|
import quapy as qp
|
||||||
|
|
||||||
|
|
||||||
|
data = qp.datasets.fetch_twitter('wb', min_df=3, pickle=True, for_model_selection=False)
|
||||||
|
|
||||||
|
Xtr, ytr = data.training.Xy
|
||||||
|
Xte, yte = data.test.sampling(150, *[0.5, 0.1, 0.4]).Xy
|
||||||
|
|
||||||
|
cls = LogisticRegression(C=0.0001, random_state=0)
|
||||||
|
|
||||||
|
draw_from_training = False
|
||||||
|
if draw_from_training:
|
||||||
|
post_tr = cross_val_predict(cls, Xtr, ytr, n_jobs=-1, method='predict_proba')
|
||||||
|
post_c1 = post_tr[ytr==0]
|
||||||
|
post_c2 = post_tr[ytr==1]
|
||||||
|
post_c3 = post_tr[ytr==2]
|
||||||
|
cls.fit(Xtr, ytr)
|
||||||
|
else:
|
||||||
|
cls.fit(Xtr, ytr)
|
||||||
|
post_te = cls.predict_proba(Xte)
|
||||||
|
post_c1 = post_te[yte == 0]
|
||||||
|
post_c2 = post_te[yte == 1]
|
||||||
|
post_c3 = post_te[yte == 2]
|
||||||
|
|
||||||
|
post_test = cls.predict_proba(Xte)
|
||||||
|
|
||||||
|
alpha = qp.functional.prevalence_from_labels(yte, classes=[0, 1, 2])
|
||||||
|
|
||||||
|
print(f'test alpha {alpha}')
|
||||||
|
plot_3class_problem(post_c1, post_c2, post_c3, post_test, alpha, bandwidth=0.1)
|
||||||
|
|
|
@ -0,0 +1,65 @@
|
||||||
|
import pickle
|
||||||
|
import numpy as np
|
||||||
|
import os
|
||||||
|
import pandas as pd
|
||||||
|
from distribution_matching.commons import METHODS, BIN_METHODS, new_method, show_results
|
||||||
|
|
||||||
|
import quapy as qp
|
||||||
|
from quapy.model_selection import GridSearchQ
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
for task in ['T1A', 'T1B']:
|
||||||
|
qp.environ['SAMPLE_SIZE'] = qp.datasets.LEQUA2022_SAMPLE_SIZE[task]
|
||||||
|
qp.environ['N_JOBS'] = -1
|
||||||
|
for optim in ['mae', 'mrae']:
|
||||||
|
|
||||||
|
result_dir = f'results/lequa/{task}/{optim}'
|
||||||
|
|
||||||
|
os.makedirs(result_dir, exist_ok=True)
|
||||||
|
|
||||||
|
for method in (METHODS if task=='T1B' else BIN_METHODS):
|
||||||
|
|
||||||
|
print('Init method', method)
|
||||||
|
|
||||||
|
result_path = f'{result_dir}/{method}'
|
||||||
|
|
||||||
|
if os.path.exists(result_path+'.csv'):
|
||||||
|
print(f'file {result_path}.csv already exist; skipping')
|
||||||
|
continue
|
||||||
|
|
||||||
|
with open(result_path+'.csv', 'wt') as csv:
|
||||||
|
csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')
|
||||||
|
|
||||||
|
dataset = task
|
||||||
|
train, val_gen, test_gen = qp.datasets.fetch_lequa2022(dataset)
|
||||||
|
print(f'init {dataset} #instances: {len(train)}')
|
||||||
|
param_grid, quantifier = new_method(method)
|
||||||
|
|
||||||
|
if param_grid is not None:
|
||||||
|
modsel = GridSearchQ(quantifier, param_grid, protocol=val_gen, refit=False, n_jobs=-1, verbose=1, error=optim)
|
||||||
|
|
||||||
|
modsel.fit(train)
|
||||||
|
print(f'best params {modsel.best_params_}')
|
||||||
|
print(f'best score {modsel.best_score_}')
|
||||||
|
pickle.dump(
|
||||||
|
(modsel.best_params_, modsel.best_score_,),
|
||||||
|
open(f'{result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||||
|
|
||||||
|
quantifier = modsel.best_model()
|
||||||
|
else:
|
||||||
|
print('debug mode... skipping model selection')
|
||||||
|
quantifier.fit(train)
|
||||||
|
|
||||||
|
report = qp.evaluation.evaluation_report(
|
||||||
|
quantifier, protocol=test_gen, error_metrics=['mae', 'mrae', 'kld'],
|
||||||
|
verbose=True, verbose_error=optim[1:], n_jobs=-1
|
||||||
|
)
|
||||||
|
means = report.mean()
|
||||||
|
report.to_csv(result_path+'.dataframe')
|
||||||
|
csv.write(f'{method}\tLeQua-{task}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
|
||||||
|
csv.flush()
|
||||||
|
print(means)
|
||||||
|
|
||||||
|
show_results(result_path)
|
|
@ -0,0 +1,56 @@
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
import os
|
||||||
|
import quapy as qp
|
||||||
|
from distribution_matching.commons import show_results
|
||||||
|
from distribution_matching.method.method_kdey import KDEy
|
||||||
|
from quapy.method.aggregative import DistributionMatching
|
||||||
|
|
||||||
|
|
||||||
|
SEED=1
|
||||||
|
|
||||||
|
def task(val):
|
||||||
|
print('job-init', val)
|
||||||
|
train, val_gen, test_gen = qp.datasets.fetch_lequa2022('T1B')
|
||||||
|
|
||||||
|
with qp.util.temp_seed(SEED):
|
||||||
|
if method=='KDEy-ML':
|
||||||
|
quantifier = KDEy(LogisticRegression(), target='max_likelihood', val_split=10, bandwidth=val)
|
||||||
|
elif method == 'DM-HD':
|
||||||
|
quantifier = DistributionMatching(LogisticRegression(), val_split=10, nbins=val, divergence='HD')
|
||||||
|
|
||||||
|
quantifier.fit(train)
|
||||||
|
report = qp.evaluation.evaluation_report(
|
||||||
|
quantifier, protocol=test_gen, error_metrics=['mae', 'mrae', 'kld'], verbose=True)
|
||||||
|
return report
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
qp.environ['SAMPLE_SIZE'] = qp.datasets.LEQUA2022_SAMPLE_SIZE['T1B']
|
||||||
|
qp.environ['N_JOBS'] = -1
|
||||||
|
result_dir = f'results/lequa/T1B/sensibility'
|
||||||
|
|
||||||
|
os.makedirs(result_dir, exist_ok=True)
|
||||||
|
|
||||||
|
for method, param, grid in [
|
||||||
|
('KDEy-ML', 'Bandwidth', np.linspace(0.01, 0.2, 20)),
|
||||||
|
('DM-HD', 'nbins', list(range(2, 10)) + list(range(10, 34, 2)))
|
||||||
|
]:
|
||||||
|
|
||||||
|
global_result_path = f'{result_dir}/{method}'
|
||||||
|
|
||||||
|
if not os.path.exists(global_result_path+'.csv'):
|
||||||
|
with open(global_result_path+'.csv', 'wt') as csv:
|
||||||
|
csv.write(f'Method\tDataset\t{param}\tMAE\tMRAE\tKLD\n')
|
||||||
|
|
||||||
|
reports = qp.util.parallel(task, grid, n_jobs=-1)
|
||||||
|
with open(global_result_path + '.csv', 'at') as csv:
|
||||||
|
for val, report in zip(grid, reports):
|
||||||
|
means = report.mean()
|
||||||
|
local_result_path = global_result_path + '_T1B' + (f'_{val:.3f}' if isinstance(val, float) else f'{val}')
|
||||||
|
report.to_csv(f'{local_result_path}.dataframe')
|
||||||
|
csv.write(f'{method}\tLeQua-T1B\t{val}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
|
||||||
|
csv.flush()
|
||||||
|
|
||||||
|
show_results(global_result_path)
|
|
@ -0,0 +1,105 @@
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from typing import Union, Callable
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.base import BaseEstimator
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.model_selection import GridSearchCV
|
||||||
|
from sklearn.neighbors import KernelDensity
|
||||||
|
|
||||||
|
import quapy as qp
|
||||||
|
from quapy.data import LabelledCollection
|
||||||
|
from quapy.protocol import APP, UPP
|
||||||
|
from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \
|
||||||
|
DistributionMatching, _get_divergence
|
||||||
|
import scipy
|
||||||
|
from scipy import optimize
|
||||||
|
from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional
|
||||||
|
import dirichlet
|
||||||
|
|
||||||
|
|
||||||
|
class DIRy(AggregativeProbabilisticQuantifier):
|
||||||
|
|
||||||
|
MAXITER = 100000
|
||||||
|
|
||||||
|
def __init__(self, classifier: BaseEstimator, val_split=0.4, n_jobs=None, target='max_likelihood'):
|
||||||
|
self.classifier = classifier
|
||||||
|
self.val_split = val_split
|
||||||
|
self.n_jobs = n_jobs
|
||||||
|
self.target = target
|
||||||
|
|
||||||
|
def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
|
||||||
|
|
||||||
|
if val_split is None:
|
||||||
|
val_split = self.val_split
|
||||||
|
|
||||||
|
self.classifier, y, posteriors, _, _ = cross_generate_predictions(
|
||||||
|
data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
|
||||||
|
)
|
||||||
|
|
||||||
|
self.val_parameters = []
|
||||||
|
for cat in range(data.n_classes):
|
||||||
|
dir_i = dirichlet.mle(posteriors[y == cat], maxiter=DIRy.MAXITER)
|
||||||
|
self.val_parameters.append(dir_i)
|
||||||
|
# print(cat)
|
||||||
|
# self.val_parameters = [dirichlet.mle(posteriors[y == cat], maxiter=DIRy.MAXITER) for cat in range(data.n_classes)]
|
||||||
|
|
||||||
|
return self
|
||||||
|
|
||||||
|
def val_pdf(self, prev):
|
||||||
|
"""
|
||||||
|
Returns a function that computes the mixture model with the given prev as mixture factor
|
||||||
|
:param prev: a prevalence vector, ndarray
|
||||||
|
:return: a function implementing the validation distribution with fixed mixture factor
|
||||||
|
"""
|
||||||
|
return lambda posteriors: sum(prev_i * dirichlet.pdf(parameters_i)(posteriors) for parameters_i, prev_i in zip(self.val_parameters, prev))
|
||||||
|
|
||||||
|
def aggregate(self, posteriors: np.ndarray):
|
||||||
|
if self.target == 'min_divergence':
|
||||||
|
raise NotImplementedError('not yet')
|
||||||
|
return self._target_divergence(posteriors)
|
||||||
|
elif self.target == 'max_likelihood':
|
||||||
|
return self._target_likelihood(posteriors)
|
||||||
|
else:
|
||||||
|
raise ValueError('unknown target')
|
||||||
|
|
||||||
|
def _target_divergence(self, posteriors):
|
||||||
|
test_density = self.get_kde(posteriors)
|
||||||
|
# val_test_posteriors = np.concatenate([self.val_posteriors, posteriors])
|
||||||
|
test_likelihood = self.pdf(test_density, posteriors)
|
||||||
|
divergence = _get_divergence(self.divergence)
|
||||||
|
|
||||||
|
n_classes = len(self.val_densities)
|
||||||
|
|
||||||
|
def match(prev):
|
||||||
|
val_pdf = self.val_pdf(prev)
|
||||||
|
val_likelihood = val_pdf(posteriors)
|
||||||
|
return divergence(val_likelihood, test_likelihood)
|
||||||
|
|
||||||
|
# the initial point is set as the uniform distribution
|
||||||
|
uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
|
||||||
|
|
||||||
|
# solutions are bounded to those contained in the unit-simplex
|
||||||
|
bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1]
|
||||||
|
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
|
||||||
|
r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
|
||||||
|
return r.x
|
||||||
|
|
||||||
|
def _target_likelihood(self, posteriors, eps=0.000001):
|
||||||
|
n_classes = len(self.val_parameters)
|
||||||
|
|
||||||
|
def neg_loglikelihood(prev):
|
||||||
|
val_pdf = self.val_pdf(prev)
|
||||||
|
test_likelihood = val_pdf(posteriors)
|
||||||
|
test_loglikelihood = np.log(test_likelihood + eps)
|
||||||
|
return -np.sum(test_loglikelihood)
|
||||||
|
|
||||||
|
# the initial point is set as the uniform distribution
|
||||||
|
uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
|
||||||
|
|
||||||
|
# solutions are bounded to those contained in the unit-simplex
|
||||||
|
bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1]
|
||||||
|
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
|
||||||
|
r = optimize.minimize(neg_loglikelihood, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
|
||||||
|
return r.x
|
|
@ -0,0 +1,50 @@
|
||||||
|
from quapy.method.base import BaseQuantifier
|
||||||
|
import numpy as np
|
||||||
|
from distribution_matching.method.kdey import KDEBase
|
||||||
|
|
||||||
|
import quapy as qp
|
||||||
|
from quapy.data import LabelledCollection
|
||||||
|
import quapy.functional as F
|
||||||
|
from sklearn.preprocessing import StandardScaler
|
||||||
|
|
||||||
|
|
||||||
|
class KDExML(BaseQuantifier, KDEBase):
|
||||||
|
|
||||||
|
def __init__(self, bandwidth=0.1, standardize=True):
|
||||||
|
self._check_bandwidth(bandwidth)
|
||||||
|
self.bandwidth = bandwidth
|
||||||
|
self.standardize = standardize
|
||||||
|
|
||||||
|
def fit(self, data: LabelledCollection):
|
||||||
|
X, y = data.Xy
|
||||||
|
if self.standardize:
|
||||||
|
self.scaler = StandardScaler()
|
||||||
|
X = self.scaler.fit_transform(X)
|
||||||
|
|
||||||
|
self.mix_densities = self.get_mixture_components(X, y, data.n_classes, self.bandwidth)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def quantify(self, X):
|
||||||
|
"""
|
||||||
|
Searches for the mixture model parameter (the sought prevalence values) that maximizes the likelihood
|
||||||
|
of the data (i.e., that minimizes the negative log-likelihood)
|
||||||
|
|
||||||
|
:param X: instances in the sample
|
||||||
|
:return: a vector of class prevalence estimates
|
||||||
|
"""
|
||||||
|
epsilon = 1e-10
|
||||||
|
n_classes = len(self.mix_densities)
|
||||||
|
if self.standardize:
|
||||||
|
X = self.scaler.transform(X)
|
||||||
|
test_densities = [self.pdf(kde_i, X) for kde_i in self.mix_densities]
|
||||||
|
|
||||||
|
def neg_loglikelihood(prev):
|
||||||
|
test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities))
|
||||||
|
test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
|
||||||
|
return -np.sum(test_loglikelihood)
|
||||||
|
|
||||||
|
return F.optim_minimize(neg_loglikelihood, n_classes)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,234 @@
|
||||||
|
from typing import Union
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.base import BaseEstimator
|
||||||
|
from sklearn.neighbors import KernelDensity
|
||||||
|
|
||||||
|
import quapy as qp
|
||||||
|
from quapy.data import LabelledCollection
|
||||||
|
from quapy.method.aggregative import AggregativeProbabilisticQuantifier, cross_generate_predictions
|
||||||
|
import quapy.functional as F
|
||||||
|
|
||||||
|
from sklearn.metrics.pairwise import rbf_kernel
|
||||||
|
|
||||||
|
|
||||||
|
class KDEBase:
|
||||||
|
|
||||||
|
BANDWIDTH_METHOD = ['scott', 'silverman']
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def _check_bandwidth(cls, bandwidth):
|
||||||
|
assert bandwidth in KDEBase.BANDWIDTH_METHOD or isinstance(bandwidth, float), \
|
||||||
|
f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values'
|
||||||
|
if isinstance(bandwidth, float):
|
||||||
|
assert 0 < bandwidth < 1, "the bandwith for KDEy should be in (0,1), since this method models the unit simplex"
|
||||||
|
|
||||||
|
def get_kde_function(self, X, bandwidth):
|
||||||
|
return KernelDensity(bandwidth=bandwidth).fit(X)
|
||||||
|
|
||||||
|
def pdf(self, kde, X):
|
||||||
|
return np.exp(kde.score_samples(X))
|
||||||
|
|
||||||
|
def get_mixture_components(self, X, y, n_classes, bandwidth):
|
||||||
|
return [self.get_kde_function(X[y == cat], bandwidth) for cat in range(n_classes)]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class KDEyML(AggregativeProbabilisticQuantifier, KDEBase):
|
||||||
|
|
||||||
|
def __init__(self, classifier: BaseEstimator, val_split=10, bandwidth=0.1, n_jobs=None, random_state=0):
|
||||||
|
self._check_bandwidth(bandwidth)
|
||||||
|
self.classifier = classifier
|
||||||
|
self.val_split = val_split
|
||||||
|
self.bandwidth = bandwidth
|
||||||
|
self.n_jobs = n_jobs
|
||||||
|
self.random_state=random_state
|
||||||
|
|
||||||
|
def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
|
||||||
|
if val_split is None:
|
||||||
|
val_split = self.val_split
|
||||||
|
|
||||||
|
self.classifier, y, posteriors, _, _ = cross_generate_predictions(
|
||||||
|
data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
|
||||||
|
)
|
||||||
|
|
||||||
|
self.mix_densities = self.get_mixture_components(posteriors, y, data.n_classes, self.bandwidth)
|
||||||
|
|
||||||
|
return self
|
||||||
|
|
||||||
|
def aggregate(self, posteriors: np.ndarray):
|
||||||
|
"""
|
||||||
|
Searches for the mixture model parameter (the sought prevalence values) that maximizes the likelihood
|
||||||
|
of the data (i.e., that minimizes the negative log-likelihood)
|
||||||
|
|
||||||
|
:param posteriors: instances in the sample converted into posterior probabilities
|
||||||
|
:return: a vector of class prevalence estimates
|
||||||
|
"""
|
||||||
|
np.random.RandomState(self.random_state)
|
||||||
|
epsilon = 1e-10
|
||||||
|
n_classes = len(self.mix_densities)
|
||||||
|
test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities]
|
||||||
|
|
||||||
|
def neg_loglikelihood(prev):
|
||||||
|
test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities))
|
||||||
|
test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
|
||||||
|
return -np.sum(test_loglikelihood)
|
||||||
|
|
||||||
|
return F.optim_minimize(neg_loglikelihood, n_classes)
|
||||||
|
|
||||||
|
|
||||||
|
class KDEyHD(AggregativeProbabilisticQuantifier, KDEBase):
|
||||||
|
|
||||||
|
def __init__(self, classifier: BaseEstimator, val_split=10, divergence: str='HD',
|
||||||
|
bandwidth=0.1, n_jobs=None, random_state=0, montecarlo_trials=10000):
|
||||||
|
|
||||||
|
self._check_bandwidth(bandwidth)
|
||||||
|
self.classifier = classifier
|
||||||
|
self.val_split = val_split
|
||||||
|
self.divergence = divergence
|
||||||
|
self.bandwidth = bandwidth
|
||||||
|
self.n_jobs = n_jobs
|
||||||
|
self.random_state=random_state
|
||||||
|
self.montecarlo_trials = montecarlo_trials
|
||||||
|
|
||||||
|
def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
|
||||||
|
if val_split is None:
|
||||||
|
val_split = self.val_split
|
||||||
|
|
||||||
|
self.classifier, y, posteriors, _, _ = cross_generate_predictions(
|
||||||
|
data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
|
||||||
|
)
|
||||||
|
|
||||||
|
self.mix_densities = self.get_mixture_components(posteriors, y, data.n_classes, self.bandwidth)
|
||||||
|
|
||||||
|
N = self.montecarlo_trials
|
||||||
|
rs = self.random_state
|
||||||
|
n = data.n_classes
|
||||||
|
self.reference_samples = np.vstack([kde_i.sample(N//n, random_state=rs) for kde_i in self.mix_densities])
|
||||||
|
self.reference_classwise_densities = np.asarray([self.pdf(kde_j, self.reference_samples) for kde_j in self.mix_densities])
|
||||||
|
self.reference_density = np.mean(self.reference_classwise_densities, axis=0) # equiv. to (uniform @ self.reference_classwise_densities)
|
||||||
|
|
||||||
|
return self
|
||||||
|
|
||||||
|
def aggregate(self, posteriors: np.ndarray):
|
||||||
|
# we retain all n*N examples (sampled from a mixture with uniform parameter), and then
|
||||||
|
# apply importance sampling (IS). In this version we compute D(p_alpha||q) with IS
|
||||||
|
n_classes = len(self.mix_densities)
|
||||||
|
|
||||||
|
test_kde = self.get_kde_function(posteriors, self.bandwidth)
|
||||||
|
test_densities = self.pdf(test_kde, self.reference_samples)
|
||||||
|
|
||||||
|
def f_squared_hellinger(u):
|
||||||
|
return (np.sqrt(u)-1)**2
|
||||||
|
|
||||||
|
# todo: this will fail when self.divergence is a callable, and is not the right place to do it anyway
|
||||||
|
if self.divergence.lower() == 'hd':
|
||||||
|
f = f_squared_hellinger
|
||||||
|
else:
|
||||||
|
raise ValueError('only squared HD is currently implemented')
|
||||||
|
|
||||||
|
epsilon = 1e-10
|
||||||
|
qs = test_densities + epsilon
|
||||||
|
rs = self.reference_density + epsilon
|
||||||
|
iw = qs/rs #importance weights
|
||||||
|
p_class = self.reference_classwise_densities + epsilon
|
||||||
|
fracs = p_class/qs
|
||||||
|
|
||||||
|
def divergence(prev):
|
||||||
|
# ps / qs = (prev @ p_class) / qs = prev @ (p_class / qs) = prev @ fracs
|
||||||
|
ps_div_qs = prev @ fracs
|
||||||
|
return np.mean( f(ps_div_qs) * iw )
|
||||||
|
|
||||||
|
return F.optim_minimize(divergence, n_classes)
|
||||||
|
|
||||||
|
|
||||||
|
class KDEyCS(AggregativeProbabilisticQuantifier):
|
||||||
|
|
||||||
|
def __init__(self, classifier: BaseEstimator, val_split=10, bandwidth=0.1, n_jobs=None, random_state=0):
|
||||||
|
KDEBase._check_bandwidth(bandwidth)
|
||||||
|
self.classifier = classifier
|
||||||
|
self.val_split = val_split
|
||||||
|
self.bandwidth = bandwidth
|
||||||
|
self.n_jobs = n_jobs
|
||||||
|
self.random_state=random_state
|
||||||
|
|
||||||
|
def gram_matrix_mix_sum(self, X, Y=None):
|
||||||
|
# this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
|
||||||
|
# to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y and Sigma1 and Sigma2 are
|
||||||
|
# two "scalar matrices" (h^2)*I each, so Sigma1+Sigma2 has scalar 2(h^2) (h is the bandwidth)
|
||||||
|
h = self.bandwidth
|
||||||
|
variance = 2 * (h**2)
|
||||||
|
nD = X.shape[1]
|
||||||
|
gamma = 1/(2*variance)
|
||||||
|
norm_factor = 1/np.sqrt(((2*np.pi)**nD) * (variance**(nD)))
|
||||||
|
gram = norm_factor * rbf_kernel(X, Y, gamma=gamma)
|
||||||
|
return gram.sum()
|
||||||
|
|
||||||
|
def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
|
||||||
|
if val_split is None:
|
||||||
|
val_split = self.val_split
|
||||||
|
|
||||||
|
self.classifier, y, posteriors, _, _ = cross_generate_predictions(
|
||||||
|
data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
|
||||||
|
)
|
||||||
|
|
||||||
|
assert all(sorted(np.unique(y)) == np.arange(data.n_classes)), \
|
||||||
|
'label name gaps not allowed in current implementation'
|
||||||
|
|
||||||
|
n = data.n_classes
|
||||||
|
P = posteriors
|
||||||
|
|
||||||
|
# counts_inv keeps track of the relative weight of each datapoint within its class
|
||||||
|
# (i.e., the weight in its KDE model)
|
||||||
|
counts_inv = 1 / (data.counts())
|
||||||
|
|
||||||
|
# tr_tr_sums corresponds to symbol \overline{B} in the paper
|
||||||
|
tr_tr_sums = np.zeros(shape=(n,n), dtype=float)
|
||||||
|
for i in range(n):
|
||||||
|
for j in range(n):
|
||||||
|
if i > j:
|
||||||
|
tr_tr_sums[i,j] = tr_tr_sums[j,i]
|
||||||
|
else:
|
||||||
|
block = self.gram_matrix_mix_sum(P[y == i], P[y == j] if i!=j else None)
|
||||||
|
tr_tr_sums[i, j] = block
|
||||||
|
|
||||||
|
# keep track of these data structures for the test phase
|
||||||
|
self.Ptr = P
|
||||||
|
self.ytr = y
|
||||||
|
self.tr_tr_sums = tr_tr_sums
|
||||||
|
self.counts_inv = counts_inv
|
||||||
|
|
||||||
|
return self
|
||||||
|
|
||||||
|
|
||||||
|
def aggregate(self, posteriors: np.ndarray):
|
||||||
|
Ptr = self.Ptr
|
||||||
|
Pte = posteriors
|
||||||
|
y = self.ytr
|
||||||
|
tr_tr_sums = self.tr_tr_sums
|
||||||
|
|
||||||
|
M, nD = Pte.shape
|
||||||
|
Minv = (1/M) # t in the paper
|
||||||
|
n = Ptr.shape[1]
|
||||||
|
|
||||||
|
|
||||||
|
# becomes a constant that does not affect the optimization, no need to compute it
|
||||||
|
# partC = 0.5*np.log(self.gram_matrix_mix_sum(Pte) * Kinv * Kinv)
|
||||||
|
|
||||||
|
# tr_te_sums corresponds to \overline{a}*(1/Li)*(1/M) in the paper (note the constants
|
||||||
|
# are already aggregated to tr_te_sums, so these multiplications are not carried out
|
||||||
|
# at each iteration of the optimization phase)
|
||||||
|
tr_te_sums = np.zeros(shape=n, dtype=float)
|
||||||
|
for i in range(n):
|
||||||
|
tr_te_sums[i] = self.gram_matrix_mix_sum(Ptr[y==i], Pte)
|
||||||
|
|
||||||
|
def divergence(alpha):
|
||||||
|
# called \overline{r} in the paper
|
||||||
|
alpha_ratio = alpha * self.counts_inv
|
||||||
|
|
||||||
|
# recal that tr_te_sums already accounts for the constant terms (1/Li)*(1/M)
|
||||||
|
partA = -np.log((alpha_ratio @ tr_te_sums) * Minv)
|
||||||
|
partB = 0.5 * np.log(alpha_ratio @ tr_tr_sums @ alpha_ratio)
|
||||||
|
return partA + partB #+ partC
|
||||||
|
|
||||||
|
return F.optim_minimize(divergence, n)
|
||||||
|
|
|
@ -0,0 +1,321 @@
|
||||||
|
from cgi import test
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from typing import Union, Callable
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.base import BaseEstimator
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.model_selection import GridSearchCV
|
||||||
|
from sklearn.neighbors import KernelDensity
|
||||||
|
|
||||||
|
import quapy as qp
|
||||||
|
from quapy.data import LabelledCollection
|
||||||
|
from quapy.protocol import APP, UPP
|
||||||
|
from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \
|
||||||
|
DistributionMatching, _get_divergence
|
||||||
|
import quapy.functional as F
|
||||||
|
import scipy
|
||||||
|
from scipy import optimize
|
||||||
|
#from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: optimize the bandwidth automatically
|
||||||
|
# TODO: think of a MMD-y variant, i.e., a MMD variant that uses the points in the simplex and possibly any non-linear kernel
|
||||||
|
|
||||||
|
|
||||||
|
class KDEy(AggregativeProbabilisticQuantifier):
|
||||||
|
|
||||||
|
BANDWIDTH_METHOD = ['auto', 'scott', 'silverman']
|
||||||
|
ENGINE = ['scipy', 'sklearn', 'statsmodels']
|
||||||
|
TARGET = ['min_divergence', 'min_divergence_uniform', 'max_likelihood']
|
||||||
|
|
||||||
|
def __init__(self, classifier: BaseEstimator, val_split=0.4, divergence: Union[str, Callable]='L2',
|
||||||
|
bandwidth='scott', engine='sklearn', target='min_divergence', n_jobs=None, random_state=0, montecarlo_trials=1000):
|
||||||
|
assert bandwidth in KDEy.BANDWIDTH_METHOD or isinstance(bandwidth, float), \
|
||||||
|
f'unknown bandwidth_method, valid ones are {KDEy.BANDWIDTH_METHOD}'
|
||||||
|
assert engine in KDEy.ENGINE, f'unknown engine, valid ones are {KDEy.ENGINE}'
|
||||||
|
assert target in KDEy.TARGET, f'unknown target, valid ones are {KDEy.TARGET}'
|
||||||
|
assert target=='max_likelihood' or divergence in ['KLD', 'HD', 'JS'], \
|
||||||
|
'in this version I will only allow KLD or squared HD as a divergence'
|
||||||
|
self.classifier = classifier
|
||||||
|
self.val_split = val_split
|
||||||
|
self.divergence = divergence
|
||||||
|
self.bandwidth = bandwidth
|
||||||
|
self.engine = engine
|
||||||
|
self.target = target
|
||||||
|
self.n_jobs = n_jobs
|
||||||
|
self.random_state=random_state
|
||||||
|
self.montecarlo_trials = montecarlo_trials
|
||||||
|
|
||||||
|
def search_bandwidth_maxlikelihood(self, posteriors, labels):
|
||||||
|
grid = {'bandwidth': np.linspace(0.001, 0.2, 100)}
|
||||||
|
search = GridSearchCV(
|
||||||
|
KernelDensity(), param_grid=grid, n_jobs=-1, cv=50, verbose=1, refit=True
|
||||||
|
)
|
||||||
|
search.fit(posteriors, labels)
|
||||||
|
bandwidth = search.best_params_['bandwidth']
|
||||||
|
print(f'auto: bandwidth={bandwidth:.5f}')
|
||||||
|
return bandwidth
|
||||||
|
|
||||||
|
def get_kde_function(self, posteriors):
|
||||||
|
# if self.bandwidth == 'auto':
|
||||||
|
# print('adjusting bandwidth')
|
||||||
|
#
|
||||||
|
# if self.engine == 'sklearn':
|
||||||
|
# grid = {'bandwidth': np.linspace(0.001,0.2,41)}
|
||||||
|
# search = GridSearchCV(
|
||||||
|
# KernelDensity(), param_grid=grid, n_jobs=-1, cv=10, verbose=1, refit=True
|
||||||
|
# )
|
||||||
|
# search.fit(posteriors)
|
||||||
|
# print(search.best_score_)
|
||||||
|
# print(search.best_params_)
|
||||||
|
#
|
||||||
|
# import pandas as pd
|
||||||
|
# df = pd.DataFrame(search.cv_results_)
|
||||||
|
# pd.set_option('display.max_columns', None)
|
||||||
|
# pd.set_option('display.max_rows', None)
|
||||||
|
# pd.set_option('expand_frame_repr', False)
|
||||||
|
# print(df)
|
||||||
|
# sys.exit(0)
|
||||||
|
#
|
||||||
|
# kde = search
|
||||||
|
|
||||||
|
#else:
|
||||||
|
if self.engine == 'scipy':
|
||||||
|
# scipy treats columns as datapoints, and need the datapoints not to lie in a lower-dimensional subspace, which
|
||||||
|
# requires removing the last dimension which is constrained
|
||||||
|
posteriors = posteriors[:,:-1].T
|
||||||
|
kde = scipy.stats.gaussian_kde(posteriors)
|
||||||
|
kde.set_bandwidth(self.bandwidth)
|
||||||
|
elif self.engine == 'sklearn':
|
||||||
|
#print('fitting kde')
|
||||||
|
kde = KernelDensity(bandwidth=self.bandwidth).fit(posteriors)
|
||||||
|
#print('[fitting done]')
|
||||||
|
return kde
|
||||||
|
|
||||||
|
def pdf(self, kde, posteriors):
|
||||||
|
if self.engine == 'scipy':
|
||||||
|
return kde(posteriors[:, :-1].T)
|
||||||
|
elif self.engine == 'sklearn':
|
||||||
|
return np.exp(kde.score_samples(posteriors))
|
||||||
|
|
||||||
|
def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
|
||||||
|
"""
|
||||||
|
|
||||||
|
:param data: the training set
|
||||||
|
:param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
|
||||||
|
:param val_split: either a float in (0,1) indicating the proportion of training instances to use for
|
||||||
|
validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
|
||||||
|
indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
|
||||||
|
to estimate the parameters
|
||||||
|
"""
|
||||||
|
if val_split is None:
|
||||||
|
val_split = self.val_split
|
||||||
|
|
||||||
|
self.classifier, y, posteriors, classes, class_count = cross_generate_predictions(
|
||||||
|
data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
|
||||||
|
)
|
||||||
|
|
||||||
|
if self.bandwidth == 'auto':
|
||||||
|
self.bandwidth = self.search_bandwidth_maxlikelihood(posteriors, y)
|
||||||
|
|
||||||
|
self.val_densities = [self.get_kde_function(posteriors[y == cat]) for cat in range(data.n_classes)]
|
||||||
|
self.val_posteriors = posteriors
|
||||||
|
|
||||||
|
if self.target == 'min_divergence_uniform':
|
||||||
|
self.samples = qp.functional.uniform_prevalence_sampling(n_classes=data.n_classes, size=self.montecarlo_trials)
|
||||||
|
self.sample_densities = [self.pdf(kde_i, self.samples) for kde_i in self.val_densities]
|
||||||
|
elif self.target == 'min_divergence':
|
||||||
|
N = self.montecarlo_trials
|
||||||
|
rs = self.random_state
|
||||||
|
n = data.n_classes
|
||||||
|
self.reference_samples = np.vstack([kde_i.sample(N//n, random_state=rs) for kde_i in self.val_densities])
|
||||||
|
self.reference_classwise_densities = np.asarray([self.pdf(kde_j, self.reference_samples) for kde_j in self.val_densities])
|
||||||
|
self.reference_density = np.mean(self.reference_classwise_densities, axis=0) # equiv. to (uniform @ self.reference_classwise_densities)
|
||||||
|
elif self.target == 'min_divergence_deprecated': # the version of the first draft, with n*N presampled, then alpha*N chosen for class
|
||||||
|
self.class_samples = [kde_i.sample(self.montecarlo_trials, random_state=self.random_state) for kde_i in self.val_densities]
|
||||||
|
self.class_sample_densities = {}
|
||||||
|
for ci, samples_i in enumerate(self.class_samples):
|
||||||
|
self.class_sample_densities[ci] = np.asarray([self.pdf(kde_j, samples_i) for kde_j in self.val_densities]).T
|
||||||
|
|
||||||
|
return self
|
||||||
|
|
||||||
|
def aggregate(self, posteriors: np.ndarray):
|
||||||
|
if self.target == 'min_divergence':
|
||||||
|
return self._target_divergence(posteriors)
|
||||||
|
elif self.target == 'min_divergence_uniform':
|
||||||
|
return self._target_divergence_uniform(posteriors)
|
||||||
|
elif self.target == 'max_likelihood':
|
||||||
|
return self._target_likelihood(posteriors)
|
||||||
|
else:
|
||||||
|
raise ValueError('unknown target')
|
||||||
|
|
||||||
|
|
||||||
|
# new version in which we retain all n*N examples (sampled from a mixture with uniform parameter), and then
|
||||||
|
# apply importance sampling (IS). In this version we compute D(p_alpha||q) with IS, and not D(q||p_alpha) as
|
||||||
|
# in the first draft
|
||||||
|
def _target_divergence(self, posteriors):
|
||||||
|
# in this variant we evaluate the divergence using a Montecarlo approach
|
||||||
|
n_classes = len(self.val_densities)
|
||||||
|
|
||||||
|
test_kde = self.get_kde_function(posteriors)
|
||||||
|
test_densities = self.pdf(test_kde, self.reference_samples)
|
||||||
|
|
||||||
|
def f_squared_hellinger(u):
|
||||||
|
return (np.sqrt(u)-1)**2
|
||||||
|
|
||||||
|
# todo: this will fail when self.divergence is a callable, and is not the right place to do it anyway
|
||||||
|
if self.divergence.lower() == 'hd':
|
||||||
|
f = f_squared_hellinger
|
||||||
|
else:
|
||||||
|
raise ValueError('only squared HD is currently implemented')
|
||||||
|
|
||||||
|
epsilon = 1e-10
|
||||||
|
qs = test_densities + epsilon
|
||||||
|
rs = self.reference_density + epsilon
|
||||||
|
iw = qs/rs #importance weights
|
||||||
|
p_class = self.reference_classwise_densities + epsilon
|
||||||
|
fracs = p_class/qs
|
||||||
|
|
||||||
|
def divergence(prev):
|
||||||
|
# ps / qs = (prev @ p_class) / qs = prev @ (p_class / qs) = prev @ fracs
|
||||||
|
ps_div_qs = prev @ fracs
|
||||||
|
return np.mean( f(ps_div_qs) * iw )
|
||||||
|
|
||||||
|
return F.optim_minimize(divergence, n_classes)
|
||||||
|
|
||||||
|
# new version in which we retain all n*N examples (sampled from a mixture with uniform parameter), and then
|
||||||
|
# apply importance sampling (IS). In this version we compute D(q||p_alpha) with IS, and not D(p_alpha||q) as
|
||||||
|
# in the reformulation proposed above
|
||||||
|
def _target_divergence_q__p(self, posteriors):
|
||||||
|
# in this variant we evaluate the divergence using a Montecarlo approach
|
||||||
|
n_classes = len(self.val_densities)
|
||||||
|
|
||||||
|
test_kde = self.get_kde_function(posteriors)
|
||||||
|
test_densities = self.pdf(test_kde, self.reference_samples)
|
||||||
|
|
||||||
|
def f_squared_hellinger(u):
|
||||||
|
return (np.sqrt(u)-1)**2
|
||||||
|
|
||||||
|
# todo: this will fail when self.divergence is a callable, and is not the right place to do it anyway
|
||||||
|
if self.divergence.lower() == 'hd':
|
||||||
|
f = f_squared_hellinger
|
||||||
|
else:
|
||||||
|
raise ValueError('only squared HD is currently implemented')
|
||||||
|
|
||||||
|
epsilon = 1e-10
|
||||||
|
qs = test_densities + epsilon
|
||||||
|
rs = self.reference_density + epsilon
|
||||||
|
p_class = self.reference_classwise_densities + epsilon
|
||||||
|
|
||||||
|
# D(q||p_a) = 1/N sum f(q/p_a) * (p_a / p_u)
|
||||||
|
def divergence(prev):
|
||||||
|
p_a = prev @ p_class
|
||||||
|
return np.mean( f(qs/p_a) * (p_a/rs) )
|
||||||
|
|
||||||
|
return F.optim_minimize(divergence, n_classes)
|
||||||
|
|
||||||
|
|
||||||
|
# the first version we explain in the draft, choosing alpha*N from a pool of N for each class and w/o importance sampling
|
||||||
|
def _target_divergence_deprecated(self, posteriors):
|
||||||
|
# in this variant we evaluate the divergence using a Montecarlo approach
|
||||||
|
n_classes = len(self.val_densities)
|
||||||
|
|
||||||
|
test_kde = self.get_kde_function(posteriors)
|
||||||
|
test_densities_per_class = [self.pdf(test_kde, samples_i) for samples_i in self.class_samples]
|
||||||
|
|
||||||
|
# divergence = _get_divergence(self.divergence)
|
||||||
|
def kld_monte(pi, qi, eps=1e-8):
|
||||||
|
# there is no pi in front of the log because the samples are already drawn according to pi
|
||||||
|
smooth_pi = pi+eps
|
||||||
|
smooth_qi = qi+eps
|
||||||
|
return np.mean(np.log(smooth_pi / smooth_qi))
|
||||||
|
|
||||||
|
def squared_hellinger(pi, qi, eps=1e-8):
|
||||||
|
smooth_pi = pi + eps
|
||||||
|
smooth_qi = qi + eps
|
||||||
|
return np.mean((np.sqrt(smooth_pi/smooth_qi)-1)**2)
|
||||||
|
|
||||||
|
# todo: this will fail when self.divergence is a callable, and is not the right place to do it anyway
|
||||||
|
if self.divergence.lower() == 'kld':
|
||||||
|
fdivergence = kld_monte
|
||||||
|
elif self.divergence.lower() == 'hd':
|
||||||
|
fdivergence = squared_hellinger
|
||||||
|
|
||||||
|
|
||||||
|
def dissimilarity(prev):
|
||||||
|
# choose the samples according to the prevalence vector
|
||||||
|
# e.g., prev = [0.5, 0.3, 0.2] will draw 50% from KDE_0, 30% from KDE_1, and 20% from KDE_2
|
||||||
|
# the points are already pre-sampled and de densities are pre-computed, so that now all that remains
|
||||||
|
# is to pick a proportional number of each from each class (same for test)
|
||||||
|
num_variates_per_class = np.round(prev * self.montecarlo_trials).astype(int)
|
||||||
|
sample_densities = np.vstack(
|
||||||
|
[self.class_sample_densities[ci][:num_i] for ci, num_i in enumerate(num_variates_per_class)]
|
||||||
|
)
|
||||||
|
#val_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, sample_densities.T))
|
||||||
|
val_likelihood = prev @ sample_densities.T
|
||||||
|
#test_likelihood = []
|
||||||
|
#for samples_i, num_i in zip(test_densities_per_class, num_variates_per_class):
|
||||||
|
# test_likelihood.append(samples_i[:num_i])
|
||||||
|
#test_likelihood = np.concatenate[test]
|
||||||
|
test_likelihood = np.concatenate(
|
||||||
|
[samples_i[:num_i] for samples_i, num_i in zip(test_densities_per_class, num_variates_per_class)]
|
||||||
|
)
|
||||||
|
# return fdivergence(val_likelihood, test_likelihood) # this is wrong, If I sample from the val distribution
|
||||||
|
# then I am computing D(Test||Val), so it should be E_{x ~ Val}[f(Test(x)/Val(x))]
|
||||||
|
return fdivergence(test_likelihood, val_likelihood)
|
||||||
|
|
||||||
|
return F.optim_minimize(dissimilarity, n_classes)
|
||||||
|
|
||||||
|
# this version explores the entire simplex, and then applies importance sampling. We have not really tested it in deep but
|
||||||
|
# seems not to be promising
|
||||||
|
def _target_divergence_uniform(self, posteriors):
|
||||||
|
# in this variant we evaluate the divergence using a Montecarlo approach
|
||||||
|
n_classes = len(self.val_densities)
|
||||||
|
|
||||||
|
test_kde = self.get_kde_function(posteriors)
|
||||||
|
test_likelihood = self.pdf(test_kde, self.samples)
|
||||||
|
|
||||||
|
def f_squared_hellinger(t):
|
||||||
|
return (np.sqrt(t) - 1)**2
|
||||||
|
|
||||||
|
def f_jensen_shannon(t):
|
||||||
|
return -(t+1)*np.log((t+1)/2) + t*np.log(t)
|
||||||
|
|
||||||
|
def fdivergence(pi, qi, f, eps=1e-10):
|
||||||
|
spi = pi+eps
|
||||||
|
sqi = qi+eps
|
||||||
|
return np.mean(f(spi/sqi)*sqi)
|
||||||
|
|
||||||
|
if self.divergence.lower() == 'hd':
|
||||||
|
f = f_squared_hellinger
|
||||||
|
elif self.divergence.lower() == 'js':
|
||||||
|
f = f_jensen_shannon
|
||||||
|
|
||||||
|
def dissimilarity(prev):
|
||||||
|
val_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, self.sample_densities))
|
||||||
|
return fdivergence(val_likelihood, test_likelihood, f)
|
||||||
|
|
||||||
|
return F.optim_minimize(dissimilarity, n_classes)
|
||||||
|
|
||||||
|
def _target_likelihood(self, posteriors, eps=0.000001):
|
||||||
|
"""
|
||||||
|
Searches for the mixture model parameter (the sought prevalence values) that yields a validation distribution
|
||||||
|
(the mixture) that best matches the test distribution, in terms of the divergence measure of choice.
|
||||||
|
|
||||||
|
:param instances: instances in the sample
|
||||||
|
:return: a vector of class prevalence estimates
|
||||||
|
"""
|
||||||
|
np.random.RandomState(self.random_state)
|
||||||
|
n_classes = len(self.val_densities)
|
||||||
|
test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.val_densities]
|
||||||
|
|
||||||
|
#return lambda posteriors: sum(prev_i * self.pdf(kde_i, posteriors) for kde_i, prev_i in zip(self.val_densities, prev))
|
||||||
|
def neg_loglikelihood(prev):
|
||||||
|
test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities))
|
||||||
|
test_loglikelihood = np.log(test_mixture_likelihood + eps)
|
||||||
|
return -np.sum(test_loglikelihood)
|
||||||
|
|
||||||
|
return F.optim_minimize(neg_loglikelihood, n_classes)
|
||||||
|
|
|
@ -0,0 +1,174 @@
|
||||||
|
from cgi import test
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from typing import Union, Callable
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.base import BaseEstimator
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.model_selection import GridSearchCV
|
||||||
|
from sklearn.neighbors import KernelDensity
|
||||||
|
from scipy.stats import multivariate_normal
|
||||||
|
import quapy as qp
|
||||||
|
from quapy.data import LabelledCollection
|
||||||
|
from quapy.protocol import APP, UPP
|
||||||
|
from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \
|
||||||
|
DistributionMatching, _get_divergence
|
||||||
|
import scipy
|
||||||
|
from scipy import optimize
|
||||||
|
from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional
|
||||||
|
from time import time
|
||||||
|
from sklearn.metrics.pairwise import rbf_kernel
|
||||||
|
|
||||||
|
|
||||||
|
def gram_matrix_mix(bandwidth, X, Y=None):
|
||||||
|
# this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
|
||||||
|
# to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y and Sigma1 and Sigma2 are
|
||||||
|
# two "scalar matrices" (h^2) I each, so Sigma1+Sigma2 has scalar 2(h^2) (h is the bandwidth)
|
||||||
|
variance = 2 * (bandwidth**2)
|
||||||
|
nD = X.shape[1]
|
||||||
|
gamma = 1/(2*variance)
|
||||||
|
gram = rbf_kernel(X, Y, gamma=gamma)
|
||||||
|
norm_factor = 1/np.sqrt(((2*np.pi)**nD) * (variance**(nD)))
|
||||||
|
gram *= norm_factor
|
||||||
|
print('GRAM SUM:', gram.sum())
|
||||||
|
return gram
|
||||||
|
|
||||||
|
def weighted_prod(pi, tau, G):
|
||||||
|
return pi[:,np.newaxis] * G * tau
|
||||||
|
|
||||||
|
def tril_weighted_prod(pi, G):
|
||||||
|
M = weighted_prod(pi, pi, G)
|
||||||
|
return np.triu(M,1)
|
||||||
|
|
||||||
|
|
||||||
|
class KDEyclosed(AggregativeProbabilisticQuantifier):
|
||||||
|
|
||||||
|
def __init__(self, classifier: BaseEstimator, val_split=0.4, bandwidth=0.1, n_jobs=None, random_state=0):
|
||||||
|
self.classifier = classifier
|
||||||
|
self.val_split = val_split
|
||||||
|
self.bandwidth = bandwidth
|
||||||
|
self.n_jobs = n_jobs
|
||||||
|
self.random_state=random_state
|
||||||
|
|
||||||
|
def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
|
||||||
|
"""
|
||||||
|
|
||||||
|
:param data: the training set
|
||||||
|
:param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
|
||||||
|
:param val_split: either a float in (0,1) indicating the proportion of training instances to use for
|
||||||
|
validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
|
||||||
|
indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
|
||||||
|
to estimate the parameters
|
||||||
|
"""
|
||||||
|
# print('[fit] enter')
|
||||||
|
if val_split is None:
|
||||||
|
val_split = self.val_split
|
||||||
|
|
||||||
|
self.classifier, y, posteriors, classes, class_count = cross_generate_predictions(
|
||||||
|
data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
|
||||||
|
)
|
||||||
|
|
||||||
|
# from distribution_matching.binary_debug import HACK
|
||||||
|
# posteriors, y = HACK(posteriors, y)
|
||||||
|
|
||||||
|
# print('[fit] precomputing stuff')
|
||||||
|
|
||||||
|
n = data.n_classes
|
||||||
|
#L = [posteriors[y==i] for i in range(n)]
|
||||||
|
#l = [len(Li) for Li in L]
|
||||||
|
|
||||||
|
D = n
|
||||||
|
h = self.bandwidth
|
||||||
|
#cov_mix_scalar = 2 * h * h # corresponds to a bandwidth of sqrt(2)*h
|
||||||
|
#Kernel = multivariate_normal(mean=np.zeros(D), cov=cov_mix_scalar)
|
||||||
|
|
||||||
|
# print('[fit] classifier ready; precomputing gram')
|
||||||
|
self.gram_tr_tr = gram_matrix_mix(h, posteriors)
|
||||||
|
|
||||||
|
# li_inv keeps track of the relative weight of each datapoint within its class
|
||||||
|
# (i.e., the weight in its KDE model)
|
||||||
|
counts_inv = 1/(data.counts())
|
||||||
|
self.li_inv = counts_inv[y]
|
||||||
|
|
||||||
|
# Khash = {}
|
||||||
|
# for a in range(n):
|
||||||
|
# for b in range(l[a]):
|
||||||
|
# for i in range(n):
|
||||||
|
# Khash[(a,b,i)] = sum(Kernel.pdf(L[i][j] - L[a][b]) for j in range(l[i]))
|
||||||
|
# for j in range(l[i]): # this for, and index j, can be supressed and store the sum across j
|
||||||
|
# Khash[(a, b, i, j)] = Kernel.pdf(L[i][j] - L[a][b])
|
||||||
|
|
||||||
|
self.n = n
|
||||||
|
#self.L = L
|
||||||
|
#self.l = l
|
||||||
|
#self.Kernel = Kernel
|
||||||
|
#self.Khash = Khash
|
||||||
|
self.C = ((2 * np.pi) ** (-D / 2)) * h ** (-D)
|
||||||
|
print('C:', self.C)
|
||||||
|
self.Ptr = posteriors
|
||||||
|
self.ytr = y
|
||||||
|
|
||||||
|
assert all(sorted(np.unique(y)) == np.arange(data.n_classes)), 'label name gaps not allowed in current implementation'
|
||||||
|
|
||||||
|
# print('[fit] exit')
|
||||||
|
|
||||||
|
return self
|
||||||
|
|
||||||
|
|
||||||
|
def aggregate(self, posteriors: np.ndarray):
|
||||||
|
|
||||||
|
# print('[aggregate] enter')
|
||||||
|
|
||||||
|
Ptr = self.Ptr
|
||||||
|
Pte = posteriors
|
||||||
|
|
||||||
|
gram_te_te = gram_matrix_mix(self.bandwidth, Pte, Pte)
|
||||||
|
gram_tr_te = gram_matrix_mix(self.bandwidth, Ptr, Pte)
|
||||||
|
|
||||||
|
K = Pte.shape[0]
|
||||||
|
tau = np.full(shape=K, fill_value=1/K, dtype=float)
|
||||||
|
|
||||||
|
h = self.bandwidth
|
||||||
|
D = Ptr.shape[1]
|
||||||
|
C = self.C
|
||||||
|
|
||||||
|
partC = 0.5 * np.log( C/K + 2 * tril_weighted_prod(tau, gram_te_te).sum())
|
||||||
|
|
||||||
|
def match(alpha):
|
||||||
|
|
||||||
|
pi = alpha[self.ytr] * self.li_inv
|
||||||
|
|
||||||
|
partA = -np.log(weighted_prod(pi, tau, gram_tr_te).sum())
|
||||||
|
# print('gram_Tr_Tr sum', self.gram_tr_tr.sum())
|
||||||
|
# print('pretril', (np.triu(self.gram_tr_tr,1).sum()))
|
||||||
|
# print('tril', (2 * tril_weighted_prod(pi, self.gram_tr_tr).sum()))
|
||||||
|
# print('pi', pi.sum(), pi[:10])
|
||||||
|
# print('Cs', C*(pi**2).sum())
|
||||||
|
partB = 0.5 * np.log(C*(pi**2).sum() + 2*tril_weighted_prod(pi, self.gram_tr_tr).sum())
|
||||||
|
|
||||||
|
Dcs = partA + partB + partC
|
||||||
|
|
||||||
|
# print(f'{alpha=}\t{partA=}\t{partB=}\t{partC}')
|
||||||
|
# print()
|
||||||
|
|
||||||
|
return Dcs
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# print('[aggregate] starts search')
|
||||||
|
|
||||||
|
# the initial point is set as the uniform distribution
|
||||||
|
uniform_distribution = np.full(fill_value=1 / self.n, shape=(self.n,))
|
||||||
|
# uniform_distribution = [0.2, 0.8]
|
||||||
|
|
||||||
|
# solutions are bounded to those contained in the unit-simplex
|
||||||
|
bounds = tuple((0, 1) for _ in range(self.n)) # values in [0,1]
|
||||||
|
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
|
||||||
|
r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
|
||||||
|
# print('[aggregate] end')
|
||||||
|
return r.x
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,145 @@
|
||||||
|
from cgi import test
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from typing import Union, Callable
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.base import BaseEstimator
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.model_selection import GridSearchCV
|
||||||
|
from sklearn.neighbors import KernelDensity
|
||||||
|
from scipy.stats import multivariate_normal
|
||||||
|
import quapy as qp
|
||||||
|
from quapy.data import LabelledCollection
|
||||||
|
from quapy.protocol import APP, UPP
|
||||||
|
from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \
|
||||||
|
DistributionMatching, _get_divergence
|
||||||
|
import scipy
|
||||||
|
from scipy import optimize
|
||||||
|
from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional
|
||||||
|
from time import time
|
||||||
|
from sklearn.metrics.pairwise import rbf_kernel
|
||||||
|
|
||||||
|
|
||||||
|
def gram_matrix_mix_sum(bandwidth, X, Y=None, reduce=True):
|
||||||
|
# this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
|
||||||
|
# to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y and Sigma1 and Sigma2 are
|
||||||
|
# two "scalar matrices" (h^2) I each, so Sigma1+Sigma2 has scalar 2(h^2) (h is the bandwidth)
|
||||||
|
variance = 2 * (bandwidth**2)
|
||||||
|
nRows,nD = X.shape
|
||||||
|
gamma = 1/(2*variance)
|
||||||
|
gram = rbf_kernel(X, Y, gamma=gamma)
|
||||||
|
|
||||||
|
norm_factor = 1/np.sqrt(((2*np.pi)**nD) * (variance**(nD)))
|
||||||
|
gram *= norm_factor
|
||||||
|
if Y is None:
|
||||||
|
# ignores the diagonal
|
||||||
|
aggr = (2 * np.triu(gram, 1)).sum()
|
||||||
|
else:
|
||||||
|
aggr = gram.sum()
|
||||||
|
return aggr
|
||||||
|
|
||||||
|
|
||||||
|
class KDEyclosed_efficient(AggregativeProbabilisticQuantifier):
|
||||||
|
|
||||||
|
def __init__(self, classifier: BaseEstimator, val_split=0.4, bandwidth=0.1, n_jobs=None, random_state=0):
|
||||||
|
self.classifier = classifier
|
||||||
|
self.val_split = val_split
|
||||||
|
self.bandwidth = bandwidth
|
||||||
|
self.n_jobs = n_jobs
|
||||||
|
self.random_state=random_state
|
||||||
|
|
||||||
|
def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
|
||||||
|
"""
|
||||||
|
|
||||||
|
:param data: the training set
|
||||||
|
:param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
|
||||||
|
:param val_split: either a float in (0,1) indicating the proportion of training instances to use for
|
||||||
|
validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
|
||||||
|
indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
|
||||||
|
to estimate the parameters
|
||||||
|
"""
|
||||||
|
|
||||||
|
# print('[fit] enter')
|
||||||
|
if val_split is None:
|
||||||
|
val_split = self.val_split
|
||||||
|
|
||||||
|
self.classifier, y, posteriors, classes, class_count = cross_generate_predictions(
|
||||||
|
data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
|
||||||
|
)
|
||||||
|
|
||||||
|
assert all(sorted(np.unique(y)) == np.arange(data.n_classes)), \
|
||||||
|
'label name gaps not allowed in current implementation'
|
||||||
|
|
||||||
|
n = data.n_classes
|
||||||
|
h = self.bandwidth
|
||||||
|
|
||||||
|
P = posteriors
|
||||||
|
counts_inv = 1 / (data.counts())
|
||||||
|
|
||||||
|
nD = P.shape[1]
|
||||||
|
C = ((2 * np.pi) ** (-nD / 2)) * (self.bandwidth ** (-nD))
|
||||||
|
tr_tr_sums = np.zeros(shape=(n,n), dtype=float)
|
||||||
|
self.tr_C = []
|
||||||
|
for i in range(n):
|
||||||
|
for j in range(n):
|
||||||
|
if i > j:
|
||||||
|
tr_tr_sums[i,j] = tr_tr_sums[j,i]
|
||||||
|
else:
|
||||||
|
if i == j:
|
||||||
|
tr_tr_sums[i, j] = gram_matrix_mix_sum(h, P[y == i])
|
||||||
|
self.tr_C.append(C * sum(y == i))
|
||||||
|
else:
|
||||||
|
block = gram_matrix_mix_sum(h, P[y == i], P[y == j])
|
||||||
|
tr_tr_sums[i, j] = block
|
||||||
|
self.tr_C = np.asarray(self.tr_C)
|
||||||
|
self.Ptr = posteriors
|
||||||
|
self.ytr = y
|
||||||
|
self.tr_tr_sums = tr_tr_sums
|
||||||
|
self.counts_inv = counts_inv
|
||||||
|
|
||||||
|
return self
|
||||||
|
|
||||||
|
|
||||||
|
def aggregate(self, posteriors: np.ndarray):
|
||||||
|
|
||||||
|
# print('[aggregate] enter')
|
||||||
|
|
||||||
|
Ptr = self.Ptr
|
||||||
|
Pte = posteriors
|
||||||
|
|
||||||
|
K,nD = Pte.shape
|
||||||
|
Kinv = (1/K)
|
||||||
|
h = self.bandwidth
|
||||||
|
n = Ptr.shape[1]
|
||||||
|
y = self.ytr
|
||||||
|
tr_tr_sums = self.tr_tr_sums
|
||||||
|
|
||||||
|
C = ((2 * np.pi) ** (-nD / 2)) * (self.bandwidth ** (-nD))
|
||||||
|
partC = 0.5*np.log(gram_matrix_mix_sum(h, Pte) * Kinv * Kinv + C*Kinv)
|
||||||
|
|
||||||
|
|
||||||
|
tr_te_sums = np.zeros(shape=n, dtype=float)
|
||||||
|
for i in range(n):
|
||||||
|
tr_te_sums[i] = gram_matrix_mix_sum(h, Ptr[y==i], Pte) * self.counts_inv[i] * Kinv
|
||||||
|
|
||||||
|
def match(alpha):
|
||||||
|
partA = -np.log((alpha * tr_te_sums).sum())
|
||||||
|
alpha_l = alpha * self.counts_inv
|
||||||
|
partB = 0.5 * np.log((alpha_l[:,np.newaxis] * tr_tr_sums * alpha_l).sum() + (self.tr_C*(alpha_l**2)).sum())
|
||||||
|
return partA + partB + partC
|
||||||
|
|
||||||
|
# print('[aggregate] starts search')
|
||||||
|
|
||||||
|
# the initial point is set as the uniform distribution
|
||||||
|
uniform_distribution = np.full(fill_value=1 / n, shape=(n,))
|
||||||
|
# uniform_distribution = [0.2, 0.8]
|
||||||
|
|
||||||
|
# solutions are bounded to those contained in the unit-simplex
|
||||||
|
bounds = tuple((0, 1) for _ in range(n)) # values in [0,1]
|
||||||
|
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
|
||||||
|
r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
|
||||||
|
# print('[aggregate] end')
|
||||||
|
return r.x
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,139 @@
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from typing import Union, Callable
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.base import BaseEstimator
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.model_selection import GridSearchCV
|
||||||
|
from sklearn.neighbors import KernelDensity
|
||||||
|
from scipy.stats import multivariate_normal
|
||||||
|
import quapy as qp
|
||||||
|
from quapy.data import LabelledCollection
|
||||||
|
from quapy.protocol import APP, UPP
|
||||||
|
from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \
|
||||||
|
DistributionMatching, _get_divergence
|
||||||
|
import scipy
|
||||||
|
from scipy import optimize
|
||||||
|
from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional
|
||||||
|
from time import time
|
||||||
|
from sklearn.metrics.pairwise import rbf_kernel
|
||||||
|
|
||||||
|
|
||||||
|
def gram_matrix_mix_sum(bandwidth, X, Y=None, reduce=True):
|
||||||
|
# this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
|
||||||
|
# to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y and Sigma1 and Sigma2 are
|
||||||
|
# two "scalar matrices" (h^2) I each, so Sigma1+Sigma2 has scalar 2(h^2) (h is the bandwidth)
|
||||||
|
variance = 2 * (bandwidth**2)
|
||||||
|
nRows,nD = X.shape
|
||||||
|
gamma = 1/(2*variance)
|
||||||
|
norm_factor = 1/np.sqrt(((2*np.pi)**nD) * (variance**(nD)))
|
||||||
|
gram = norm_factor * rbf_kernel(X, Y, gamma=gamma)
|
||||||
|
return gram.sum()
|
||||||
|
|
||||||
|
|
||||||
|
class KDEyclosed_efficient_corr(AggregativeProbabilisticQuantifier):
|
||||||
|
|
||||||
|
def __init__(self, classifier: BaseEstimator, val_split=0.4, bandwidth=0.1, n_jobs=None, random_state=0):
|
||||||
|
self.classifier = classifier
|
||||||
|
self.val_split = val_split
|
||||||
|
self.bandwidth = bandwidth
|
||||||
|
self.n_jobs = n_jobs
|
||||||
|
self.random_state=random_state
|
||||||
|
|
||||||
|
def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
|
||||||
|
"""
|
||||||
|
|
||||||
|
:param data: the training set
|
||||||
|
:param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
|
||||||
|
:param val_split: either a float in (0,1) indicating the proportion of training instances to use for
|
||||||
|
validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
|
||||||
|
indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
|
||||||
|
to estimate the parameters
|
||||||
|
"""
|
||||||
|
|
||||||
|
# print('[fit] enter')
|
||||||
|
if val_split is None:
|
||||||
|
val_split = self.val_split
|
||||||
|
|
||||||
|
self.classifier, y, posteriors, classes, class_count = cross_generate_predictions(
|
||||||
|
data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
|
||||||
|
)
|
||||||
|
|
||||||
|
print('training over')
|
||||||
|
|
||||||
|
assert all(sorted(np.unique(y)) == np.arange(data.n_classes)), \
|
||||||
|
'label name gaps not allowed in current implementation'
|
||||||
|
|
||||||
|
# print('[fit] precomputing stuff')
|
||||||
|
|
||||||
|
# from distribution_matching.binary_debug import HACK
|
||||||
|
# posteriors, y = HACK(posteriors, y)
|
||||||
|
|
||||||
|
n = data.n_classes
|
||||||
|
h = self.bandwidth
|
||||||
|
|
||||||
|
# print('[fit] classifier ready; precomputing gram')
|
||||||
|
P = posteriors
|
||||||
|
|
||||||
|
# li_inv keeps track of the relative weight of each datapoint within its class
|
||||||
|
# (i.e., the weight in its KDE model)
|
||||||
|
counts_inv = 1 / (data.counts())
|
||||||
|
|
||||||
|
tr_tr_sums = np.zeros(shape=(n,n), dtype=float)
|
||||||
|
for i in range(n):
|
||||||
|
for j in range(n):
|
||||||
|
if i > j:
|
||||||
|
tr_tr_sums[i,j] = tr_tr_sums[j,i]
|
||||||
|
else:
|
||||||
|
block = gram_matrix_mix_sum(h, P[y == i], P[y == j] if i!=j else None)
|
||||||
|
tr_tr_sums[i, j] = block
|
||||||
|
|
||||||
|
# compute class-class block-sums
|
||||||
|
self.Ptr = posteriors
|
||||||
|
self.ytr = y
|
||||||
|
self.tr_tr_sums = tr_tr_sums
|
||||||
|
self.counts_inv = counts_inv
|
||||||
|
|
||||||
|
print('fit over')
|
||||||
|
|
||||||
|
return self
|
||||||
|
|
||||||
|
|
||||||
|
def aggregate(self, posteriors: np.ndarray):
|
||||||
|
|
||||||
|
# print('aggregating')
|
||||||
|
Ptr = self.Ptr
|
||||||
|
Pte = posteriors
|
||||||
|
|
||||||
|
K,nD = Pte.shape
|
||||||
|
Kinv = (1/K)
|
||||||
|
h = self.bandwidth
|
||||||
|
n = Ptr.shape[1]
|
||||||
|
y = self.ytr
|
||||||
|
tr_tr_sums = self.tr_tr_sums
|
||||||
|
|
||||||
|
partC = 0.5*np.log(gram_matrix_mix_sum(h, Pte) * Kinv * Kinv)
|
||||||
|
|
||||||
|
tr_te_sums = np.zeros(shape=n, dtype=float)
|
||||||
|
for i in range(n):
|
||||||
|
tr_te_sums[i] = gram_matrix_mix_sum(h, Ptr[y==i], Pte) * self.counts_inv[i] * Kinv
|
||||||
|
|
||||||
|
def match(alpha):
|
||||||
|
partA = -np.log((alpha * tr_te_sums).sum())
|
||||||
|
alpha_l = alpha * self.counts_inv
|
||||||
|
partB = 0.5 * np.log((alpha_l[:,np.newaxis] * tr_tr_sums * alpha_l).sum())
|
||||||
|
return partA + partB + partC
|
||||||
|
|
||||||
|
# print('starting search')
|
||||||
|
|
||||||
|
# the initial point is set as the uniform distribution
|
||||||
|
uniform_distribution = np.full(fill_value=1 / n, shape=(n,))
|
||||||
|
|
||||||
|
# solutions are bounded to those contained in the unit-simplex
|
||||||
|
bounds = tuple((0, 1) for _ in range(n)) # values in [0,1]
|
||||||
|
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
|
||||||
|
r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
|
||||||
|
return r.x
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,210 @@
|
||||||
|
from distribution_matching.commons import (ADJUSTMENT_METHODS, BIN_METHODS, DISTR_MATCH_METHODS, MAX_LIKE_METHODS,
|
||||||
|
METHODS, FULL_METHOD_LIST)
|
||||||
|
import quapy as qp
|
||||||
|
from os import makedirs
|
||||||
|
import os
|
||||||
|
|
||||||
|
from tabular import Table
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
tables_path = '.'
|
||||||
|
# makedirs(tables_path, exist_ok=True)
|
||||||
|
|
||||||
|
MAXTONE = 35 # sets the intensity of the maximum color reached by the worst (red) and best (green) results
|
||||||
|
SHOW_STD = False
|
||||||
|
|
||||||
|
NUM_ADJUSTMENT_METHODS = len(ADJUSTMENT_METHODS)
|
||||||
|
NUM_MAXIMUM_LIKELIHOOD_METHODS = len(MAX_LIKE_METHODS)
|
||||||
|
NUM_DISTRIBUTION_MATCHING_METHODS = len(DISTR_MATCH_METHODS)
|
||||||
|
|
||||||
|
qp.environ['SAMPLE_SIZE'] = 100
|
||||||
|
|
||||||
|
nice_bench = {
|
||||||
|
'sanders': 'Sanders',
|
||||||
|
'semeval13': 'SemEval13',
|
||||||
|
'semeval14': 'SemEval14',
|
||||||
|
'semeval15': 'SemEval15',
|
||||||
|
'semeval16': 'SemEval16',
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def save_table(path, table):
|
||||||
|
print(f'saving results in {path}')
|
||||||
|
with open(path, 'wt') as foo:
|
||||||
|
foo.write(table)
|
||||||
|
|
||||||
|
def new_table(datasets, methods):
|
||||||
|
return Table(
|
||||||
|
benchmarks=datasets,
|
||||||
|
methods=methods,
|
||||||
|
ttest='wilcoxon',
|
||||||
|
prec_mean=5,
|
||||||
|
show_std=SHOW_STD,
|
||||||
|
prec_std=4,
|
||||||
|
clean_zero=(eval=='mae'),
|
||||||
|
average=True,
|
||||||
|
maxtone=MAXTONE
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def make_table(tabs, eval, benchmark_groups, benchmark_names, compact=False):
|
||||||
|
|
||||||
|
n_methods = len(METHODS)
|
||||||
|
assert n_methods == (NUM_ADJUSTMENT_METHODS+NUM_DISTRIBUTION_MATCHING_METHODS+NUM_MAXIMUM_LIKELIHOOD_METHODS), \
|
||||||
|
"Unexpected number of methods"
|
||||||
|
|
||||||
|
cline = "\cline{2-" + str(n_methods+ 1) + "}"
|
||||||
|
|
||||||
|
# write the latex table
|
||||||
|
tabular = """
|
||||||
|
\\begin{tabular}{|c|""" + ('c|' * NUM_ADJUSTMENT_METHODS) + ('c|' * NUM_DISTRIBUTION_MATCHING_METHODS) + ('c|' * NUM_MAXIMUM_LIKELIHOOD_METHODS) + """} """ + cline + """
|
||||||
|
\multicolumn{1}{c}{} &
|
||||||
|
\multicolumn{"""+str(NUM_ADJUSTMENT_METHODS)+"""}{|c}{Adjustment} &
|
||||||
|
\multicolumn{"""+str(NUM_DISTRIBUTION_MATCHING_METHODS)+"""}{|c|}{Distribution Matching} &
|
||||||
|
\multicolumn{"""+str(NUM_MAXIMUM_LIKELIHOOD_METHODS)+"""}{c|}{Maximum Likelihood} \\\\
|
||||||
|
\hline
|
||||||
|
"""
|
||||||
|
for i, (tab, group, name) in enumerate(zip(tabs, benchmark_groups, benchmark_names)):
|
||||||
|
tablines = tab.latexTabular(benchmark_replace=nice_bench, endl='\\\\'+ cline, aslines=True)
|
||||||
|
tablines[0] = tablines[0].replace('\multicolumn{1}{c|}{}', '\\textbf{'+name+'}')
|
||||||
|
if compact or len(tab.benchmarks)==1:
|
||||||
|
# if compact, keep the method names and the average; discard the rest
|
||||||
|
tabular += tablines[0] + '\n' + tablines[1 if len(tab.benchmarks)==1 else -1] + '\n'
|
||||||
|
else:
|
||||||
|
tabular += '\n'.join(tablines)
|
||||||
|
|
||||||
|
tabular += "\n" + "\\textit{Rank} & " + tab.getRankTable(prec_mean=0 if name.startswith('LeQua') else 1).latexAverage()
|
||||||
|
if i < (len(tabs) - 1):
|
||||||
|
tabular += "\\hline\n"
|
||||||
|
else:
|
||||||
|
tabular += "\n"
|
||||||
|
tabular += "\end{tabular}"
|
||||||
|
return tabular
|
||||||
|
|
||||||
|
|
||||||
|
def gen_tables_uci_multiclass(eval):
|
||||||
|
|
||||||
|
print('Generating table for UCI Multiclass Datasets', eval)
|
||||||
|
dir_results = f'../results/ucimulti/{eval}'
|
||||||
|
|
||||||
|
datasets = qp.datasets.UCI_MULTICLASS_DATASETS
|
||||||
|
|
||||||
|
tab = new_table(datasets, METHODS)
|
||||||
|
|
||||||
|
for dataset in datasets:
|
||||||
|
print(f'\t Dataset: {dataset}: ', end='')
|
||||||
|
for method in METHODS:
|
||||||
|
result_path = f'{dir_results}/{method}_{dataset}.dataframe'
|
||||||
|
if os.path.exists(result_path):
|
||||||
|
df = pd.read_csv(result_path)
|
||||||
|
print(f'{method}', end=' ')
|
||||||
|
tab.add(dataset, method, df[eval].values)
|
||||||
|
else:
|
||||||
|
print(f'MISSING-{method}', end=' ')
|
||||||
|
print()
|
||||||
|
|
||||||
|
return tab
|
||||||
|
|
||||||
|
|
||||||
|
def gen_tables_uci_bin(eval):
|
||||||
|
|
||||||
|
print('Generating table for UCI Datasets', eval)
|
||||||
|
dir_results = f'../results/binary/{eval}'
|
||||||
|
|
||||||
|
exclude = ['acute.a', 'acute.b', 'iris.1', 'balance.2']
|
||||||
|
datasets = [x for x in qp.datasets.UCI_DATASETS if x not in exclude]
|
||||||
|
|
||||||
|
tab = new_table(datasets, BIN_METHODS)
|
||||||
|
|
||||||
|
for dataset in datasets:
|
||||||
|
print(f'\t Dataset: {dataset}: ', end='')
|
||||||
|
for method in BIN_METHODS:
|
||||||
|
result_path = f'{dir_results}/{method}_{dataset}.dataframe'
|
||||||
|
if os.path.exists(result_path):
|
||||||
|
df = pd.read_csv(result_path)
|
||||||
|
print(f'{method}', end=' ')
|
||||||
|
tab.add(dataset, method, df[eval].values)
|
||||||
|
else:
|
||||||
|
print(f'MISSING-{method}', end=' ')
|
||||||
|
|
||||||
|
return tab
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def gen_tables_tweet(eval):
|
||||||
|
|
||||||
|
print('Generating table for Twitter', eval)
|
||||||
|
dir_results = f'../results/tweet/{eval}'
|
||||||
|
|
||||||
|
datasets = qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST
|
||||||
|
|
||||||
|
tab = new_table(datasets, METHODS)
|
||||||
|
|
||||||
|
for dataset in datasets:
|
||||||
|
print(f'\t Dataset: {dataset}: ', end='')
|
||||||
|
for method in METHODS:
|
||||||
|
result_path = f'{dir_results}/{method}_{dataset}.dataframe'
|
||||||
|
if os.path.exists(result_path):
|
||||||
|
df = pd.read_csv(result_path)
|
||||||
|
print(f'{method}', end=' ')
|
||||||
|
tab.add(dataset, method, df[eval].values)
|
||||||
|
else:
|
||||||
|
print(f'MISSING-{method}', end=' ')
|
||||||
|
print()
|
||||||
|
|
||||||
|
return tab
|
||||||
|
|
||||||
|
|
||||||
|
def gen_tables_lequa(Methods, task, eval):
|
||||||
|
# generating table for LeQua-T1A or Lequa-T1B; only one table with two rows, one for MAE, another for MRAE
|
||||||
|
|
||||||
|
tab = new_table([task], Methods)
|
||||||
|
|
||||||
|
print('Generating table for T1A@Lequa', eval, end='')
|
||||||
|
dir_results = f'../results/lequa/{task}/{eval}'
|
||||||
|
|
||||||
|
for method in Methods:
|
||||||
|
result_path = f'{dir_results}/{method}.dataframe'
|
||||||
|
if os.path.exists(result_path):
|
||||||
|
df = pd.read_csv(result_path)
|
||||||
|
print(f'{method}', end=' ')
|
||||||
|
tab.add(task, method, df[eval].values)
|
||||||
|
else:
|
||||||
|
print(f'MISSING-{method}', end=' ')
|
||||||
|
print()
|
||||||
|
|
||||||
|
return tab
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
os.makedirs('./latex', exist_ok=True)
|
||||||
|
|
||||||
|
for eval in ['mae', 'mrae']:
|
||||||
|
tabs = []
|
||||||
|
tabs.append(gen_tables_tweet(eval))
|
||||||
|
tabs.append(gen_tables_uci_multiclass(eval))
|
||||||
|
tabs.append(gen_tables_lequa(METHODS, 'T1B', eval))
|
||||||
|
|
||||||
|
names = ['Tweets', 'UCI-multi', 'LeQua']
|
||||||
|
table = make_table(tabs, eval, benchmark_groups=tabs, benchmark_names=names)
|
||||||
|
save_table(f'./latex/multiclass_{eval}.tex', table)
|
||||||
|
|
||||||
|
for eval in ['mae', 'mrae']:
|
||||||
|
tabs = []
|
||||||
|
tabs.append(gen_tables_uci_bin(eval))
|
||||||
|
|
||||||
|
# print uci-binary with all datasets for the appendix
|
||||||
|
table = make_table(tabs, eval, benchmark_groups=tabs, benchmark_names=['UCI-binary'])
|
||||||
|
save_table(f'./latex/ucibinary_{eval}.tex', table)
|
||||||
|
|
||||||
|
# print uci-bin compacted plus lequa-T1A for the main body
|
||||||
|
tabs.append(gen_tables_lequa(BIN_METHODS, 'T1A', eval))
|
||||||
|
table = make_table(tabs, eval, benchmark_groups=tabs, benchmark_names=['UCI-binary', 'LeQua'], compact=True)
|
||||||
|
save_table(f'./latex/binary_{eval}.tex', table)
|
||||||
|
|
||||||
|
print("[Tables Done] runing latex")
|
||||||
|
os.chdir('./latex/')
|
||||||
|
os.system('pdflatex tables_compact.tex')
|
||||||
|
os.system('rm tables_compact.aux tables_compact.bbl tables_compact.blg tables_compact.log tables_compact.out tables_compact.dvi')
|
||||||
|
|
|
@ -0,0 +1,107 @@
|
||||||
|
\documentclass{article}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
\usepackage[utf8]{inputenc} % allow utf-8 input
|
||||||
|
\usepackage[T1]{fontenc} % use 8-bit T1 fonts
|
||||||
|
\usepackage{hyperref} % hyperlinks
|
||||||
|
\usepackage{url} % simple URL typesetting
|
||||||
|
\usepackage{booktabs} % professional-quality tables
|
||||||
|
\usepackage{amsfonts} % blackboard math symbols
|
||||||
|
\usepackage{nicefrac} % compact symbols for 1/2, etc.
|
||||||
|
\usepackage{microtype} % microtypography
|
||||||
|
\usepackage{lipsum}
|
||||||
|
\usepackage{fancyhdr} % header
|
||||||
|
\usepackage{graphicx} % graphics
|
||||||
|
\graphicspath{{media/}} % organize your images and other figures under media/ folder
|
||||||
|
\usepackage{amsmath}
|
||||||
|
\usepackage{bm}
|
||||||
|
\usepackage{tabularx}
|
||||||
|
\usepackage{color}
|
||||||
|
\usepackage{colortbl}
|
||||||
|
\usepackage{xcolor}
|
||||||
|
\usepackage{lmodern}
|
||||||
|
|
||||||
|
\DeclareMathOperator*{\argmax}{arg\,max}
|
||||||
|
\DeclareMathOperator*{\argmin}{arg\,min}
|
||||||
|
|
||||||
|
\newif\ifdraft
|
||||||
|
\drafttrue
|
||||||
|
|
||||||
|
\newcommand{\juanjo}[1]{\ifdraft{\leavevmode\color{purple}{[JJ]:
|
||||||
|
{#1}}}\else{\vspace{0ex}}\fi}
|
||||||
|
|
||||||
|
\newcommand{\alex}[1]{\ifdraft{\leavevmode\color{violet}{[AM]:
|
||||||
|
{#1}}}\else{\vspace{0ex}}\fi}
|
||||||
|
|
||||||
|
\newcommand{\pablo}[1]{\ifdraft{\leavevmode\color{red}{[PG]:
|
||||||
|
{#1}}}\else{\vspace{0ex}}\fi}
|
||||||
|
|
||||||
|
|
||||||
|
\title{Tables}
|
||||||
|
|
||||||
|
|
||||||
|
\author{
|
||||||
|
Alejandro Moreo
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
\begin{document}
|
||||||
|
|
||||||
|
\maketitle
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
\begin{table}[h]
|
||||||
|
\centering
|
||||||
|
\caption{Multiclass AE}
|
||||||
|
\resizebox{\textwidth}{!}{%
|
||||||
|
\input{multiclass_mae}
|
||||||
|
}%
|
||||||
|
\end{table}
|
||||||
|
|
||||||
|
|
||||||
|
\begin{table}[h]
|
||||||
|
\centering
|
||||||
|
\caption{Multiclass RAE}
|
||||||
|
\resizebox{\textwidth}{!}{%
|
||||||
|
\input{multiclass_mrae}
|
||||||
|
}%
|
||||||
|
\end{table}
|
||||||
|
|
||||||
|
\begin{table}[h]
|
||||||
|
\centering
|
||||||
|
\caption{Binary MAE}
|
||||||
|
\resizebox{\textwidth}{!}{%
|
||||||
|
\input{binary_mae}
|
||||||
|
}%
|
||||||
|
\end{table}
|
||||||
|
|
||||||
|
\begin{table}[h]
|
||||||
|
\centering
|
||||||
|
\caption{Binary MRAE}
|
||||||
|
\resizebox{\textwidth}{!}{%
|
||||||
|
\input{binary_mrae}
|
||||||
|
}%
|
||||||
|
\end{table}
|
||||||
|
|
||||||
|
\begin{table}[h]
|
||||||
|
\centering
|
||||||
|
\caption{UCI binary full AE}
|
||||||
|
\resizebox{\textwidth}{!}{%
|
||||||
|
\input{ucibinary_mae}
|
||||||
|
}%
|
||||||
|
\end{table}
|
||||||
|
|
||||||
|
\begin{table}[h]
|
||||||
|
\centering
|
||||||
|
\caption{UCI binary full RAE}
|
||||||
|
\resizebox{\textwidth}{!}{%
|
||||||
|
\input{ucibinary_mrae}
|
||||||
|
}%
|
||||||
|
\end{table}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
\end{document}
|
||||||
|
|
|
@ -0,0 +1,348 @@
|
||||||
|
import numpy as np
|
||||||
|
import itertools
|
||||||
|
from scipy.stats import ttest_ind_from_stats, wilcoxon
|
||||||
|
|
||||||
|
|
||||||
|
class Table:
|
||||||
|
VALID_TESTS = [None, "wilcoxon", "ttest"]
|
||||||
|
|
||||||
|
def __init__(self, benchmarks, methods, lower_is_better=True, ttest='ttest', prec_mean=3,
|
||||||
|
clean_zero=False, show_std=False, prec_std=3, average=True, missing=None, missing_str='--', color=True, maxtone=50):
|
||||||
|
assert ttest in self.VALID_TESTS, f'unknown test, valid are {self.VALID_TESTS}'
|
||||||
|
|
||||||
|
self.benchmarks = np.asarray(benchmarks)
|
||||||
|
self.benchmark_index = {row:i for i, row in enumerate(benchmarks)}
|
||||||
|
|
||||||
|
self.methods = np.asarray(methods)
|
||||||
|
self.method_index = {col:j for j, col in enumerate(methods)}
|
||||||
|
|
||||||
|
self.map = {}
|
||||||
|
# keyed (#rows,#cols)-ndarrays holding computations from self.map['values']
|
||||||
|
self._addmap('values', dtype=object)
|
||||||
|
self.lower_is_better = lower_is_better
|
||||||
|
self.ttest = ttest
|
||||||
|
self.prec_mean = prec_mean
|
||||||
|
self.clean_zero = clean_zero
|
||||||
|
self.show_std = show_std
|
||||||
|
self.prec_std = prec_std
|
||||||
|
self.add_average = average
|
||||||
|
self.missing = missing
|
||||||
|
self.missing_str = missing_str
|
||||||
|
self.color = color
|
||||||
|
self.maxtone = maxtone
|
||||||
|
|
||||||
|
self.touch()
|
||||||
|
|
||||||
|
@property
|
||||||
|
def nbenchmarks(self):
|
||||||
|
return len(self.benchmarks)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def nmethods(self):
|
||||||
|
return len(self.methods)
|
||||||
|
|
||||||
|
def touch(self):
|
||||||
|
self._modif = True
|
||||||
|
|
||||||
|
def update(self):
|
||||||
|
if self._modif:
|
||||||
|
self.compute()
|
||||||
|
|
||||||
|
def _getfilled(self):
|
||||||
|
return np.argwhere(self.map['fill'])
|
||||||
|
|
||||||
|
@property
|
||||||
|
def values(self):
|
||||||
|
return self.map['values']
|
||||||
|
|
||||||
|
def _indexes(self):
|
||||||
|
return itertools.product(range(self.nbenchmarks), range(self.nmethods))
|
||||||
|
|
||||||
|
def _addmap(self, map, dtype, func=None):
|
||||||
|
self.map[map] = np.empty((self.nbenchmarks, self.nmethods), dtype=dtype)
|
||||||
|
if func is None:
|
||||||
|
return
|
||||||
|
m = self.map[map]
|
||||||
|
f = func
|
||||||
|
indexes = self._indexes() if map == 'fill' else self._getfilled()
|
||||||
|
for i, j in indexes:
|
||||||
|
m[i, j] = f(self.values[i, j])
|
||||||
|
|
||||||
|
def _addrank(self):
|
||||||
|
for i in range(self.nbenchmarks):
|
||||||
|
filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
|
||||||
|
col_means = [self.map['mean'][i,j] for j in filled_cols_idx]
|
||||||
|
ranked_cols_idx = filled_cols_idx[np.argsort(col_means)]
|
||||||
|
if not self.lower_is_better:
|
||||||
|
ranked_cols_idx = ranked_cols_idx[::-1]
|
||||||
|
self.map['rank'][i, ranked_cols_idx] = np.arange(1, len(filled_cols_idx)+1)
|
||||||
|
|
||||||
|
def _addcolor(self):
|
||||||
|
for i in range(self.nbenchmarks):
|
||||||
|
filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
|
||||||
|
if filled_cols_idx.size==0:
|
||||||
|
continue
|
||||||
|
col_means = [self.map['mean'][i,j] for j in filled_cols_idx]
|
||||||
|
#col_means = [self.map['rank'][i, j] for j in filled_cols_idx]
|
||||||
|
|
||||||
|
minval = min(col_means)
|
||||||
|
maxval = max(col_means)
|
||||||
|
|
||||||
|
for col_idx in filled_cols_idx:
|
||||||
|
val = self.map['mean'][i,col_idx]
|
||||||
|
norm = (maxval - minval)
|
||||||
|
if norm > 0:
|
||||||
|
normval = (val - minval) / norm
|
||||||
|
else:
|
||||||
|
normval = 0.5
|
||||||
|
|
||||||
|
if self.lower_is_better:
|
||||||
|
normval = 1 - normval
|
||||||
|
|
||||||
|
normval = np.clip(normval, 0,1)
|
||||||
|
|
||||||
|
self.map['color'][i, col_idx] = color_red2green_01(normval, self.maxtone)
|
||||||
|
|
||||||
|
def _run_ttest(self, row, col1, col2):
|
||||||
|
mean1 = self.map['mean'][row, col1]
|
||||||
|
std1 = self.map['std'][row, col1]
|
||||||
|
nobs1 = self.map['nobs'][row, col1]
|
||||||
|
mean2 = self.map['mean'][row, col2]
|
||||||
|
std2 = self.map['std'][row, col2]
|
||||||
|
nobs2 = self.map['nobs'][row, col2]
|
||||||
|
_, p_val = ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2)
|
||||||
|
return p_val
|
||||||
|
|
||||||
|
def _run_wilcoxon(self, row, col1, col2):
|
||||||
|
values1 = self.map['values'][row, col1]
|
||||||
|
values2 = self.map['values'][row, col2]
|
||||||
|
try:
|
||||||
|
_, p_val = wilcoxon(values1, values2)
|
||||||
|
except ValueError:
|
||||||
|
p_val = 0
|
||||||
|
return p_val
|
||||||
|
|
||||||
|
def _add_statistical_test(self):
|
||||||
|
if self.ttest is None:
|
||||||
|
return
|
||||||
|
self.some_similar = [False]*self.nmethods
|
||||||
|
for i in range(self.nbenchmarks):
|
||||||
|
filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
|
||||||
|
if len(filled_cols_idx) <= 1:
|
||||||
|
continue
|
||||||
|
col_means = [self.map['mean'][i,j] for j in filled_cols_idx]
|
||||||
|
best_pos = filled_cols_idx[np.argmin(col_means)]
|
||||||
|
|
||||||
|
for j in filled_cols_idx:
|
||||||
|
if j==best_pos:
|
||||||
|
continue
|
||||||
|
if self.ttest == 'ttest':
|
||||||
|
p_val = self._run_ttest(i, best_pos, j)
|
||||||
|
else:
|
||||||
|
p_val = self._run_wilcoxon(i, best_pos, j)
|
||||||
|
|
||||||
|
pval_outcome = pval_interpretation(p_val)
|
||||||
|
self.map['ttest'][i, j] = pval_outcome
|
||||||
|
if pval_outcome != 'Diff':
|
||||||
|
self.some_similar[j] = True
|
||||||
|
|
||||||
|
def compute(self):
|
||||||
|
self._addmap('fill', dtype=bool, func=lambda x: x is not None)
|
||||||
|
self._addmap('mean', dtype=float, func=np.mean)
|
||||||
|
self._addmap('std', dtype=float, func=np.std)
|
||||||
|
self._addmap('nobs', dtype=float, func=len)
|
||||||
|
self._addmap('rank', dtype=int, func=None)
|
||||||
|
self._addmap('color', dtype=object, func=None)
|
||||||
|
self._addmap('ttest', dtype=object, func=None)
|
||||||
|
self._addmap('latex', dtype=object, func=None)
|
||||||
|
self._addrank()
|
||||||
|
self._addcolor()
|
||||||
|
self._add_statistical_test()
|
||||||
|
if self.add_average:
|
||||||
|
self._addave()
|
||||||
|
self._modif = False
|
||||||
|
|
||||||
|
def _is_column_full(self, col):
|
||||||
|
return all(self.map['fill'][:, self.method_index[col]])
|
||||||
|
|
||||||
|
def _addave(self):
|
||||||
|
ave = Table(['ave'], self.methods,
|
||||||
|
lower_is_better=self.lower_is_better,
|
||||||
|
ttest=self.ttest,
|
||||||
|
average=False,
|
||||||
|
missing=self.missing,
|
||||||
|
missing_str=self.missing_str,
|
||||||
|
prec_mean=self.prec_mean,
|
||||||
|
prec_std=self.prec_std,
|
||||||
|
clean_zero=self.clean_zero,
|
||||||
|
show_std=self.show_std,
|
||||||
|
color=self.color,
|
||||||
|
maxtone=self.maxtone)
|
||||||
|
for col in self.methods:
|
||||||
|
values = None
|
||||||
|
if self._is_column_full(col):
|
||||||
|
if self.ttest == 'ttest':
|
||||||
|
# values = np.asarray(self.map['mean'][:, self.method_index[col]])
|
||||||
|
values = np.concatenate(self.values[:, self.method_index[col]])
|
||||||
|
else: # wilcoxon
|
||||||
|
# values = np.asarray(self.map['mean'][:, self.method_index[col]])
|
||||||
|
values = np.concatenate(self.values[:, self.method_index[col]])
|
||||||
|
ave.add('ave', col, values)
|
||||||
|
self.average = ave
|
||||||
|
|
||||||
|
def add(self, benchmark, method, values):
|
||||||
|
if values is not None:
|
||||||
|
values = np.asarray(values)
|
||||||
|
if values.ndim==0:
|
||||||
|
values = values.flatten()
|
||||||
|
rid, cid = self._coordinates(benchmark, method)
|
||||||
|
self.map['values'][rid, cid] = values
|
||||||
|
self.touch()
|
||||||
|
|
||||||
|
def get(self, benchmark, method, attr='mean'):
|
||||||
|
self.update()
|
||||||
|
assert attr in self.map, f'unknwon attribute {attr}'
|
||||||
|
rid, cid = self._coordinates(benchmark, method)
|
||||||
|
if self.map['fill'][rid, cid]:
|
||||||
|
v = self.map[attr][rid, cid]
|
||||||
|
if v is None or (isinstance(v,float) and np.isnan(v)):
|
||||||
|
return self.missing
|
||||||
|
return v
|
||||||
|
else:
|
||||||
|
return self.missing
|
||||||
|
|
||||||
|
def _coordinates(self, benchmark, method):
|
||||||
|
assert benchmark in self.benchmark_index, f'benchmark {benchmark} out of range'
|
||||||
|
assert method in self.method_index, f'method {method} out of range'
|
||||||
|
rid = self.benchmark_index[benchmark]
|
||||||
|
cid = self.method_index[method]
|
||||||
|
return rid, cid
|
||||||
|
|
||||||
|
def get_average(self, method, attr='mean'):
|
||||||
|
self.update()
|
||||||
|
if self.add_average:
|
||||||
|
return self.average.get('ave', method, attr=attr)
|
||||||
|
return None
|
||||||
|
|
||||||
|
def get_color(self, benchmark, method):
|
||||||
|
color = self.get(benchmark, method, attr='color')
|
||||||
|
if color is None:
|
||||||
|
return ''
|
||||||
|
return color
|
||||||
|
|
||||||
|
def latex(self, benchmark, method):
|
||||||
|
self.update()
|
||||||
|
i,j = self._coordinates(benchmark, method)
|
||||||
|
if self.map['fill'][i,j] == False:
|
||||||
|
return self.missing_str
|
||||||
|
|
||||||
|
mean = self.map['mean'][i,j]
|
||||||
|
l = f" {mean:.{self.prec_mean}f}"
|
||||||
|
if self.clean_zero:
|
||||||
|
l = l.replace(' 0.', '.')
|
||||||
|
|
||||||
|
isbest = self.map['rank'][i,j] == 1
|
||||||
|
if isbest:
|
||||||
|
l = "\\textbf{"+l.strip()+"}"
|
||||||
|
|
||||||
|
stat = '' if self.ttest is None else '^{\phantom{\ddag}}'
|
||||||
|
if self.ttest is not None and self.some_similar[j]:
|
||||||
|
test_label = self.map['ttest'][i,j]
|
||||||
|
if test_label == 'Sim':
|
||||||
|
stat = '^{\dag}'
|
||||||
|
elif test_label == 'Same':
|
||||||
|
stat = '^{\ddag}'
|
||||||
|
elif isbest or test_label == 'Diff':
|
||||||
|
stat = '^{\phantom{\ddag}}'
|
||||||
|
|
||||||
|
std = ''
|
||||||
|
if self.show_std:
|
||||||
|
std = self.map['std'][i,j]
|
||||||
|
std = f" {std:.{self.prec_std}f}"
|
||||||
|
if self.clean_zero:
|
||||||
|
std = std.replace(' 0.', '.')
|
||||||
|
std = f"\pm {std:{self.prec_std}}"
|
||||||
|
|
||||||
|
if stat!='' or std!='':
|
||||||
|
l = f'{l}${stat}{std}$'
|
||||||
|
|
||||||
|
if self.color:
|
||||||
|
l += ' ' + self.map['color'][i,j]
|
||||||
|
|
||||||
|
return l
|
||||||
|
|
||||||
|
def latexTabular(self, benchmark_replace={}, method_replace={}, aslines=False, endl='\\\\\hline'):
|
||||||
|
lines = []
|
||||||
|
l = '\multicolumn{1}{c|}{} & '
|
||||||
|
l += ' & '.join([method_replace.get(col, col) for col in self.methods])
|
||||||
|
l += ' \\\\\hline'
|
||||||
|
lines.append(l)
|
||||||
|
|
||||||
|
for row in self.benchmarks:
|
||||||
|
rowname = benchmark_replace.get(row, row)
|
||||||
|
l = rowname + ' & '
|
||||||
|
l += self.latexRow(row, endl=endl)
|
||||||
|
lines.append(l)
|
||||||
|
|
||||||
|
if self.add_average:
|
||||||
|
# l += '\hline\n'
|
||||||
|
l = '\hline \n \\textit{Average} & '
|
||||||
|
l += self.latexAverage(endl=endl)
|
||||||
|
lines.append(l)
|
||||||
|
if not aslines:
|
||||||
|
lines='\n'.join(lines)
|
||||||
|
return lines
|
||||||
|
|
||||||
|
def latexRow(self, benchmark, endl='\\\\\hline\n'):
|
||||||
|
s = [self.latex(benchmark, col) for col in self.methods]
|
||||||
|
s = ' & '.join(s)
|
||||||
|
s += ' ' + endl
|
||||||
|
return s
|
||||||
|
|
||||||
|
def latexAverage(self, endl='\\\\\hline\n'):
|
||||||
|
if self.add_average:
|
||||||
|
return self.average.latexRow('ave', endl=endl)
|
||||||
|
|
||||||
|
def getRankTable(self, prec_mean=0):
|
||||||
|
t = Table(benchmarks=self.benchmarks, methods=self.methods, prec_mean=prec_mean, average=True, maxtone=self.maxtone, ttest=None)
|
||||||
|
for rid, cid in self._getfilled():
|
||||||
|
row = self.benchmarks[rid]
|
||||||
|
col = self.methods[cid]
|
||||||
|
t.add(row, col, self.get(row, col, 'rank'))
|
||||||
|
t.compute()
|
||||||
|
return t
|
||||||
|
|
||||||
|
def dropMethods(self, methods):
|
||||||
|
drop_index = [self.method_index[m] for m in methods]
|
||||||
|
new_methods = np.delete(self.methods, drop_index)
|
||||||
|
new_index = {col:j for j, col in enumerate(new_methods)}
|
||||||
|
|
||||||
|
self.map['values'] = self.values[:,np.asarray([self.method_index[m] for m in new_methods], dtype=int)]
|
||||||
|
self.methods = new_methods
|
||||||
|
self.method_index = new_index
|
||||||
|
self.touch()
|
||||||
|
|
||||||
|
|
||||||
|
def pval_interpretation(p_val):
|
||||||
|
if 0.005 >= p_val:
|
||||||
|
return 'Diff'
|
||||||
|
elif 0.05 >= p_val > 0.005:
|
||||||
|
return 'Sim'
|
||||||
|
elif p_val > 0.05:
|
||||||
|
return 'Same'
|
||||||
|
|
||||||
|
|
||||||
|
def color_red2green_01(val, maxtone=50):
|
||||||
|
if np.isnan(val): return None
|
||||||
|
assert 0 <= val <= 1, f'val {val} out of range [0,1]'
|
||||||
|
|
||||||
|
|
||||||
|
# rescale to [-1,1]
|
||||||
|
val = val * 2 - 1
|
||||||
|
if val < 0:
|
||||||
|
color = 'red'
|
||||||
|
tone = maxtone * (-val)
|
||||||
|
else:
|
||||||
|
color = 'green'
|
||||||
|
tone = maxtone * val
|
||||||
|
return '\cellcolor{' + color + f'!{int(tone)}' + '}'
|
|
@ -0,0 +1,82 @@
|
||||||
|
import numpy as np
|
||||||
|
from scipy.stats import norm, uniform, expon
|
||||||
|
from scipy.special import rel_entr
|
||||||
|
|
||||||
|
|
||||||
|
def kld_approx(p, q, resolution=100):
|
||||||
|
steps = np.linspace(-5, 5, resolution)
|
||||||
|
integral = 0
|
||||||
|
for step_i1, step_i2 in zip(steps[:-1], steps[1:]):
|
||||||
|
base = step_i2-step_i1
|
||||||
|
middle = (step_i1+step_i2)/2
|
||||||
|
pi = p.pdf(middle)
|
||||||
|
qi = q.pdf(middle)
|
||||||
|
integral += (base * pi*np.log(pi/qi))
|
||||||
|
return integral
|
||||||
|
|
||||||
|
|
||||||
|
def integrate(f, resolution=10000):
|
||||||
|
steps = np.linspace(-10, 10, resolution)
|
||||||
|
integral = 0
|
||||||
|
for step_i1, step_i2 in zip(steps[:-1], steps[1:]):
|
||||||
|
base = step_i2 - step_i1
|
||||||
|
middle = (step_i1 + step_i2) / 2
|
||||||
|
integral += (base * f(middle))
|
||||||
|
return integral
|
||||||
|
|
||||||
|
|
||||||
|
def kl_analytic(m1, s1, m2, s2):
|
||||||
|
return np.log(s2/s1)+ (s1*s1 + (m1-m2)**2) / (2*s2*s2) - 0.5
|
||||||
|
|
||||||
|
|
||||||
|
def montecarlo(p, q, trials=1000000):
|
||||||
|
xs = p.rvs(trials)
|
||||||
|
ps = p.pdf(xs)
|
||||||
|
qs = q.pdf(xs)
|
||||||
|
return np.mean(np.log(ps/qs))
|
||||||
|
|
||||||
|
|
||||||
|
def montecarlo2(p, q, trials=100000):
|
||||||
|
#r = norm(-3, scale=5)
|
||||||
|
r = uniform(-10, 20)
|
||||||
|
# r = expon(loc=-6)
|
||||||
|
#r = p
|
||||||
|
xs = r.rvs(trials)
|
||||||
|
rs = r.pdf(xs)
|
||||||
|
print(rs)
|
||||||
|
ps = p.pdf(xs)+0.0000001
|
||||||
|
qs = q.pdf(xs)+0.0000001
|
||||||
|
return np.mean((ps/rs)*np.log(ps/qs))
|
||||||
|
|
||||||
|
|
||||||
|
def wrong(p, q, trials=100000):
|
||||||
|
xs = np.random.uniform(-10,10, trials)
|
||||||
|
ps = p.pdf(xs)
|
||||||
|
qs = q.pdf(xs)
|
||||||
|
return np.mean(ps*np.log(ps/qs))
|
||||||
|
|
||||||
|
|
||||||
|
p = norm(loc=0, scale=1)
|
||||||
|
q = norm(loc=1, scale=1)
|
||||||
|
|
||||||
|
|
||||||
|
integral_approx = kld_approx(p, q)
|
||||||
|
analytic_solution = kl_analytic(p.kwds['loc'], p.kwds['scale'], q.kwds['loc'], q.kwds['scale'])
|
||||||
|
montecarlo_approx = montecarlo(p, q)
|
||||||
|
montecarlo_approx2 = montecarlo2(p, q)
|
||||||
|
wrong_approx = wrong(p, q)
|
||||||
|
|
||||||
|
print(f'{analytic_solution=:.10f}')
|
||||||
|
print()
|
||||||
|
print(f'{integral_approx=:.10f}')
|
||||||
|
print(f'integra error = {np.abs(analytic_solution-integral_approx):.10f}')
|
||||||
|
print()
|
||||||
|
print(f'{montecarlo_approx=:.10f}')
|
||||||
|
print(f'montecarlo error = {np.abs(analytic_solution-montecarlo_approx):.10f}')
|
||||||
|
print()
|
||||||
|
print(f'{montecarlo_approx2=:.10f}')
|
||||||
|
print(f'montecarlo2 error = {np.abs(analytic_solution-montecarlo_approx2):.10f}')
|
||||||
|
print()
|
||||||
|
print(f'{wrong_approx=:.10f}')
|
||||||
|
print(f'wrong error = {np.abs(analytic_solution-wrong_approx):.10f}')
|
||||||
|
|
|
@ -0,0 +1,113 @@
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.neighbors import KernelDensity
|
||||||
|
from scipy.stats import norm, multivariate_normal, Covariance
|
||||||
|
from sklearn.metrics.pairwise import rbf_kernel
|
||||||
|
from sklearn.neighbors import KDTree
|
||||||
|
|
||||||
|
# def CauchySchwarzDivGaussMix(pi_i, mu_i, Lambda_i, tau_i, nu_i, Omega_i):
|
||||||
|
# Lambda_i_inv =
|
||||||
|
|
||||||
|
|
||||||
|
# def z()
|
||||||
|
|
||||||
|
|
||||||
|
nD=2
|
||||||
|
mu = [[0.1,0.1], [0.2, 0.3], [1, 1]] # centers
|
||||||
|
bandwidth = 0.10 # bandwidth and scale (standard deviation)
|
||||||
|
standard_deviation = bandwidth
|
||||||
|
variance = standard_deviation**2
|
||||||
|
|
||||||
|
mus = np.asarray(mu).reshape(-1,nD)
|
||||||
|
kde = KernelDensity(bandwidth=bandwidth).fit(mus)
|
||||||
|
|
||||||
|
x = [0.5,0.2]
|
||||||
|
|
||||||
|
print('with scikit-learn KDE')
|
||||||
|
prob = np.exp(kde.score_samples(np.asarray(x).reshape(-1,nD)))
|
||||||
|
print(prob)
|
||||||
|
|
||||||
|
# univariate
|
||||||
|
# N = norm(loc=mu, scale=scale)
|
||||||
|
# prob = N.pdf(x)
|
||||||
|
# print(prob)
|
||||||
|
|
||||||
|
# multivariate
|
||||||
|
|
||||||
|
print('with scipy multivariate normal')
|
||||||
|
npoints = mus.shape[0]
|
||||||
|
probs = sum(multivariate_normal(mean=mu_i, cov=variance).pdf(x) for mu_i in mu)
|
||||||
|
probs /= npoints
|
||||||
|
print(probs)
|
||||||
|
|
||||||
|
print('with scikit learn rbf_kernel')
|
||||||
|
x = np.asarray(x).reshape(-1,nD)
|
||||||
|
gamma = 1/(2*variance)
|
||||||
|
gram = rbf_kernel(mus, x, gamma=gamma)
|
||||||
|
const = 1/np.sqrt(((2*np.pi)**nD) * (variance**(nD)))
|
||||||
|
gram *= const
|
||||||
|
print(gram)
|
||||||
|
print(np.sum(gram)/npoints)
|
||||||
|
|
||||||
|
print('with stackoverflow answer')
|
||||||
|
from scipy.spatial.distance import pdist, cdist, squareform
|
||||||
|
import scipy
|
||||||
|
# this is an NxD matrix, where N is number of items and D its dimensionalites
|
||||||
|
pairwise_sq_dists = cdist(mus, x, 'euclidean')
|
||||||
|
print(pairwise_sq_dists)
|
||||||
|
K = np.exp(-pairwise_sq_dists / variance)
|
||||||
|
print(K)
|
||||||
|
print(np.sum(K)/npoints)
|
||||||
|
|
||||||
|
print("trying with scipy multivariate on more than one instance")
|
||||||
|
probs = sum(multivariate_normal(mean=x.reshape(-nD), cov=variance).pdf(mus))
|
||||||
|
probs /= npoints
|
||||||
|
print(probs)
|
||||||
|
|
||||||
|
import sys
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
# N1 = multivariate_normal(mean=mu[0], cov=variance)
|
||||||
|
# prob1 = N1.pdf(x)
|
||||||
|
# N2 = multivariate_normal(mean=mu[1], cov=variance)
|
||||||
|
# prob2 = N2.pdf(x)
|
||||||
|
# print(prob1+prob2)
|
||||||
|
|
||||||
|
cov = Covariance.from_diagonal([variance, variance])
|
||||||
|
print(cov.covariance)
|
||||||
|
precision_matrix = np.asarray([[1/variance, 0],[0, 1/variance]])
|
||||||
|
print(Covariance.from_precision(precision_matrix).covariance)
|
||||||
|
print(np.linalg.inv(precision_matrix))
|
||||||
|
print(np.linalg.inv(cov.covariance))
|
||||||
|
|
||||||
|
print('-'*100)
|
||||||
|
|
||||||
|
nD=2
|
||||||
|
mu = np.asarray([[0.1,0.5]]) # centers
|
||||||
|
bandwidth = 0.10 # bandwidth and scale (standard deviation)
|
||||||
|
standard_deviation = bandwidth
|
||||||
|
variance = standard_deviation**2
|
||||||
|
|
||||||
|
mus = np.asarray([mu]).reshape(-1,nD)
|
||||||
|
kde = KernelDensity(bandwidth=bandwidth).fit(mus)
|
||||||
|
|
||||||
|
x = np.asarray([0.5,0.2])
|
||||||
|
|
||||||
|
prob = np.exp(kde.score_samples(np.asarray(x).reshape(-1,nD)))
|
||||||
|
print(prob)
|
||||||
|
|
||||||
|
probs = sum(multivariate_normal(mean=mu_i, cov=variance).pdf(x) for mu_i in mu) / len(mu)
|
||||||
|
probs = sum(multivariate_normal(mean=[0,0], cov=1).pdf((x-mu_i)/bandwidth) for mu_i in mu) / len(mu)
|
||||||
|
probs = sum(multivariate_normal(mean=[0,0], cov=variance).pdf(x-mu_i) for mu_i in mu) / len(mu)
|
||||||
|
print(probs)
|
||||||
|
# N1 = multivariate_normal(mean=mu[0], cov=variance)
|
||||||
|
# prob1 = N1.pdf(x)
|
||||||
|
# N2 = multivariate_normal(mean=mu[1], cov=variance)
|
||||||
|
# prob2 = N2.pdf(x)
|
||||||
|
# print(prob1+prob2)
|
||||||
|
|
||||||
|
h=0.1
|
||||||
|
D=4
|
||||||
|
|
||||||
|
print(np.sqrt((4**2) * (5**2)))
|
||||||
|
print(4*5)
|
||||||
|
|
|
@ -0,0 +1,153 @@
|
||||||
|
import numpy as np
|
||||||
|
from scipy.stats import multivariate_normal
|
||||||
|
from scipy import optimize
|
||||||
|
|
||||||
|
|
||||||
|
def cauchy_schwarz_divergence_kde(L:list, Xte:np.ndarray, bandwidth:float, alpha:np.ndarray):
|
||||||
|
"""
|
||||||
|
|
||||||
|
:param L: a list of np.ndarray (instances x dimensions) with the Li being the instances of class i
|
||||||
|
:param Xte: an np.ndarray (instances x dimensions)
|
||||||
|
:param bandwidth: the bandwidth of the kernel
|
||||||
|
:param alpha: the mixture parameter
|
||||||
|
:return: the Cauchy-Schwarz divergence between the validation KDE mixture distribution (with mixture paramerter
|
||||||
|
alpha) and the test KDE distribution
|
||||||
|
"""
|
||||||
|
|
||||||
|
n = len(L) # number of classes
|
||||||
|
K, D = Xte.shape # number of test instances, and number of dimensions
|
||||||
|
Kinv = 1/K
|
||||||
|
|
||||||
|
# the lengths of each block
|
||||||
|
l = np.asarray([len(Li) for Li in L])
|
||||||
|
|
||||||
|
# contains the a_i / l_i
|
||||||
|
alpha_r = alpha / l
|
||||||
|
alpha2_r_sum = np.sum(alpha * alpha_r) # contains the sum_i a_i**2 / l_i
|
||||||
|
|
||||||
|
h = bandwidth
|
||||||
|
|
||||||
|
# we will only use the bandwidth (h) between two gaussians with covariance matrix a "scalar matrix" h**2
|
||||||
|
cov_mix_scalar = 2*h*h # corresponds to a bandwidth of sqrt(2)*h
|
||||||
|
|
||||||
|
# constant
|
||||||
|
C = ((2*np.pi)**(-D/2))*h**(-D)
|
||||||
|
|
||||||
|
Kernel = multivariate_normal(mean=np.zeros(D), cov=cov_mix_scalar)
|
||||||
|
K0 = Kernel.pdf(np.zeros(D))
|
||||||
|
|
||||||
|
|
||||||
|
def compute_block_E():
|
||||||
|
kernel_block_E = []
|
||||||
|
for i,Li in enumerate(L):
|
||||||
|
acc = 0
|
||||||
|
for x_ji in Li: #optimize...
|
||||||
|
for x_k in Xte: #optimize...
|
||||||
|
acc += Kernel.pdf(x_ji - x_k) #optimize...
|
||||||
|
kernel_block_E.append(acc)
|
||||||
|
return np.asarray(kernel_block_E)
|
||||||
|
|
||||||
|
|
||||||
|
def compute_block_F_hash():
|
||||||
|
# this can be computed entirely at training time
|
||||||
|
Khash = {}
|
||||||
|
for a in range(n):
|
||||||
|
for b in range(l[a]):
|
||||||
|
for i in range(n):
|
||||||
|
for j in range(l[i]): # this for, and index j, can be supressed and store the sum across j
|
||||||
|
Khash[(a,b,i,j)] = Kernel.pdf(L[i][j]-L[a][b])
|
||||||
|
return Khash
|
||||||
|
|
||||||
|
|
||||||
|
def compute_block_Ktest():
|
||||||
|
# this can be optimized in several ways, starting by computing only the lower diagonal triangle... and remove
|
||||||
|
# then the K0 which is not needed after that
|
||||||
|
acc = 0
|
||||||
|
for x_i in Xte:
|
||||||
|
for x_j in Xte:
|
||||||
|
acc += Kernel.pdf(x_i-x_j)
|
||||||
|
return acc
|
||||||
|
|
||||||
|
|
||||||
|
def compute_block_F():
|
||||||
|
F = 0
|
||||||
|
for a in range(n):
|
||||||
|
tmp_b = 0
|
||||||
|
for b in range(l[a]):
|
||||||
|
tmp_i = 0
|
||||||
|
for i in range(n):
|
||||||
|
tmp_j = 0
|
||||||
|
for j in range(l[i]):
|
||||||
|
tmp_j += Fh[(a, b, i, j)]
|
||||||
|
tmp_i += (alpha_r[i] * tmp_j)
|
||||||
|
tmp_b += tmp_i
|
||||||
|
F += (alpha_r[a] * tmp_b)
|
||||||
|
return F
|
||||||
|
|
||||||
|
|
||||||
|
E = compute_block_E()
|
||||||
|
Fh = compute_block_F_hash()
|
||||||
|
# Ktest = compute_block_Ktest()
|
||||||
|
F = compute_block_F()
|
||||||
|
|
||||||
|
C1 = K*Kinv*Kinv*C
|
||||||
|
C2 = 2 * np.sum([Kernel.pdf(Xte[k]-Xte[k_p]) for k in range(K) for k_p in range(k)])
|
||||||
|
|
||||||
|
partA = -np.log(Kinv * (alpha_r @ E))
|
||||||
|
partB = 0.5*np.log(C*alpha2_r_sum + F - (K0*alpha2_r_sum))
|
||||||
|
# partC = 0.5*np.log(Kinv) + 0.5*np.log(C + Kinv*Ktest - K0)
|
||||||
|
partC = 0.5*np.log(C1+C2)
|
||||||
|
|
||||||
|
Dcs = partA + partB + partC
|
||||||
|
|
||||||
|
return Dcs
|
||||||
|
|
||||||
|
|
||||||
|
L = [
|
||||||
|
np.asarray([
|
||||||
|
[-1,-1,-1]
|
||||||
|
]),
|
||||||
|
np.asarray([
|
||||||
|
[0,0,0],
|
||||||
|
]),
|
||||||
|
np.asarray([
|
||||||
|
[0,0,0.1],
|
||||||
|
[1,1,1],
|
||||||
|
[3,3,1],
|
||||||
|
]),
|
||||||
|
np.asarray([
|
||||||
|
[1,0,0]
|
||||||
|
]),
|
||||||
|
np.asarray([
|
||||||
|
[0,1,0]
|
||||||
|
])
|
||||||
|
]
|
||||||
|
Xte = np.asarray(
|
||||||
|
[[0,0,0],
|
||||||
|
[0,0,0],
|
||||||
|
[1,0,0],
|
||||||
|
[0,1,0]]
|
||||||
|
)
|
||||||
|
bandwidth=0.01
|
||||||
|
alpha=np.asarray([0, 2/4, 0, 1/4, 1/4])
|
||||||
|
|
||||||
|
div = cauchy_schwarz_divergence_kde(L, Xte, bandwidth, alpha)
|
||||||
|
print(div)
|
||||||
|
|
||||||
|
def divergence(alpha):
|
||||||
|
return cauchy_schwarz_divergence_kde(L, Xte, bandwidth, alpha)
|
||||||
|
|
||||||
|
|
||||||
|
# the initial point is set as the uniform distribution
|
||||||
|
n_classes = len(L)
|
||||||
|
uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
|
||||||
|
|
||||||
|
# solutions are bounded to those contained in the unit-simplex
|
||||||
|
bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1]
|
||||||
|
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
|
||||||
|
#print('searching for alpha')
|
||||||
|
r = optimize.minimize(divergence, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
|
||||||
|
sol = r.x
|
||||||
|
for x in sol:
|
||||||
|
print(f'{x:.4f}')
|
||||||
|
print(cauchy_schwarz_divergence_kde(L, Xte, bandwidth, sol))
|
|
@ -0,0 +1,19 @@
|
||||||
|
import pickle
|
||||||
|
import os
|
||||||
|
|
||||||
|
dataset = 'lequa/T1B'
|
||||||
|
for metric in ['mae', 'mrae']:
|
||||||
|
print('metric', metric)
|
||||||
|
for method in ['KDEy-DMhd4', 'KDEy-DMhd4+', 'KDEy-DMhd4++']:
|
||||||
|
|
||||||
|
path = f'/home/moreo/QuaPy/distribution_matching/results/{dataset}/{metric}/{method}.hyper.pkl'
|
||||||
|
if os.path.exists(path):
|
||||||
|
obj = pickle.load(open(path, 'rb'))
|
||||||
|
print(method, obj)
|
||||||
|
else:
|
||||||
|
print(f'path {path} does not exist')
|
||||||
|
|
||||||
|
print()
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,27 @@
|
||||||
|
from time import time
|
||||||
|
from sklearn.metrics.pairwise import rbf_kernel
|
||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
def rbf(X, Y):
|
||||||
|
return X @ Y.T
|
||||||
|
|
||||||
|
|
||||||
|
@torch.compile
|
||||||
|
def rbf_comp(X, Y):
|
||||||
|
return X @ Y.T
|
||||||
|
|
||||||
|
|
||||||
|
def measure_time(X, Y, func):
|
||||||
|
tinit = time()
|
||||||
|
func(X, Y)
|
||||||
|
tend = time()
|
||||||
|
print(f'took {tend-tinit:.3}s')
|
||||||
|
|
||||||
|
|
||||||
|
X = np.random.rand(1000, 100)
|
||||||
|
Y = np.random.rand(1000, 100)
|
||||||
|
|
||||||
|
measure_time(X, Y, rbf)
|
||||||
|
measure_time(X, Y, rbf_comp)
|
|
@ -0,0 +1,20 @@
|
||||||
|
import pickle
|
||||||
|
import os
|
||||||
|
|
||||||
|
import quapy as qp
|
||||||
|
from quapy.model_selection import GridSearchQ
|
||||||
|
from quapy.protocol import UPP
|
||||||
|
|
||||||
|
|
||||||
|
toprint = []
|
||||||
|
for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
|
||||||
|
|
||||||
|
data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
|
||||||
|
|
||||||
|
# model selection
|
||||||
|
train, test = data.train_test
|
||||||
|
toprint.append(f'{dataset}\t{len(train)}\t{len(test)}\t{data.n_classes}')
|
||||||
|
|
||||||
|
print()
|
||||||
|
for pr in toprint:
|
||||||
|
print(pr)
|
|
@ -0,0 +1,6 @@
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
report = pd.read_csv('../results_lequa_mrae/KDEy-MLE.dataframe')
|
||||||
|
|
||||||
|
means = report.mean()
|
||||||
|
print(f'KDEy-MLE\tLeQua-T1B\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
|
|
@ -0,0 +1,155 @@
|
||||||
|
import numpy as np
|
||||||
|
from scipy.stats import norm
|
||||||
|
|
||||||
|
|
||||||
|
EPS=1e-8
|
||||||
|
LARGE_NUM=1000000
|
||||||
|
TRIALS=LARGE_NUM
|
||||||
|
|
||||||
|
# calculamos: D(p||q)
|
||||||
|
# se asume:
|
||||||
|
# q = distribución de referencia (en nuestro caso: el mixture de KDEs en training)
|
||||||
|
# p = distribución (en nuestro caso: KDE del test)
|
||||||
|
# N = TRIALS el número de trials para montecarlo
|
||||||
|
# n = numero de clases, en este ejemplo n=3
|
||||||
|
|
||||||
|
|
||||||
|
# ejemplo de f-generator function: Squared Hellinger Distance
|
||||||
|
def hd2(u):
|
||||||
|
v = (np.sqrt(u)-1)
|
||||||
|
return v*v
|
||||||
|
|
||||||
|
|
||||||
|
# esta funcion estima la divergencia entre dos mixtures (univariates)
|
||||||
|
def integrate(p, q, f=hd2, resolution=TRIALS, epsilon=EPS):
|
||||||
|
xs = np.linspace(-50, 50, resolution)
|
||||||
|
ps = p.pdf(xs)+epsilon
|
||||||
|
qs = q.pdf(xs)+epsilon
|
||||||
|
rs = ps/qs
|
||||||
|
base = xs[1]-xs[0]
|
||||||
|
return np.sum(base * f(rs) * qs)
|
||||||
|
|
||||||
|
|
||||||
|
# esta es la aproximación de montecarlo según está en el artículo
|
||||||
|
# es decir, sampleando la q, y haciendo la media (1/N). En el artículo
|
||||||
|
# en realidad se "presamplean" N por cada clase y luego se eligen los que
|
||||||
|
# corresponda a cada clase. Aquí directamente sampleamos solo los
|
||||||
|
# que correspondan.
|
||||||
|
def montecarlo(p, q, f=hd2, trials=TRIALS, epsilon=EPS):
|
||||||
|
xs = q.rvs(trials)
|
||||||
|
ps = p.pdf(xs)+epsilon
|
||||||
|
qs = q.pdf(xs)+epsilon
|
||||||
|
N = trials
|
||||||
|
return (1/N)*np.sum(f(ps/qs))
|
||||||
|
|
||||||
|
|
||||||
|
# esta es la aproximación de montecarlo asumiendo que vayamos con todos los N*n
|
||||||
|
# puntos (es decir, si hubieramos sampleado de una "q" en la que todas las clases
|
||||||
|
# eran igualmente probables, 1/3) y luego reescalamos los pesos con "importance
|
||||||
|
# weighting" <-- esto no está en el artículo, pero me gusta más que lo que hay
|
||||||
|
# porque desaparece el paso de "seleccionar los que corresponden por clase"
|
||||||
|
def montecarlo_importancesampling(p, q, r, f=hd2, trials=TRIALS, epsilon=EPS):
|
||||||
|
xs = r.rvs(trials) # <- distribución de referencia (aquí: class weight uniforme)
|
||||||
|
rs = r.pdf(xs)
|
||||||
|
ps = p.pdf(xs)+epsilon
|
||||||
|
qs = q.pdf(xs)+epsilon
|
||||||
|
N = trials
|
||||||
|
importance_weight = qs/rs
|
||||||
|
return (1/N)*np.sum(f(ps/qs)*(importance_weight))
|
||||||
|
|
||||||
|
|
||||||
|
# he intentado implementar la variante que propne Juanjo pero creo que no la he
|
||||||
|
# entendido bien. Tal vez haya que reescalar el peso por 1/3 o algo así, pero no
|
||||||
|
# doy con la tecla...
|
||||||
|
def montecarlo_classweight(p, q, f=hd2, trials=TRIALS, epsilon=EPS):
|
||||||
|
xs_1 = q.rvs_fromclass(0, trials)
|
||||||
|
xs_2 = q.rvs_fromclass(1, trials)
|
||||||
|
xs_3 = q.rvs_fromclass(2, trials)
|
||||||
|
xs = np.concatenate([xs_1, xs_2, xs_3])
|
||||||
|
weights = np.asarray([[alpha_i]*trials for alpha_i in q.alphas]).flatten()
|
||||||
|
ps = p.pdf(xs)+epsilon
|
||||||
|
qs = q.pdf(xs)+epsilon
|
||||||
|
N = trials
|
||||||
|
n = q.n
|
||||||
|
return (1/(N))*np.sum(weights*f(ps/qs))
|
||||||
|
|
||||||
|
|
||||||
|
class Q:
|
||||||
|
"""
|
||||||
|
Esta clase es un mixture de gausianas.
|
||||||
|
|
||||||
|
:param locs: medias, una por clase
|
||||||
|
:param scales: stds, una por clase
|
||||||
|
:param alphas: peso, uno por clase
|
||||||
|
"""
|
||||||
|
def __init__(self, locs, scales, alphas):
|
||||||
|
self.qs = []
|
||||||
|
for loc, scale in zip(locs, scales):
|
||||||
|
self.qs.append(norm(loc=loc, scale=scale))
|
||||||
|
assert np.isclose(np.sum(alphas), 1), 'alphas do not sum up to 1!'
|
||||||
|
self.alphas = np.asarray(alphas) / np.sum(alphas)
|
||||||
|
|
||||||
|
def pdf(self, xs):
|
||||||
|
q_xs = np.vstack([
|
||||||
|
q_i.pdf(xs) * alpha_i for q_i, alpha_i in zip(self.qs, self.alphas)
|
||||||
|
])
|
||||||
|
v = q_xs.sum(axis=0)
|
||||||
|
if len(v)==1:
|
||||||
|
v = v[0]
|
||||||
|
return v
|
||||||
|
|
||||||
|
@property
|
||||||
|
def n(self):
|
||||||
|
return len(self.alphas)
|
||||||
|
|
||||||
|
def rvs_fromclass(self, inclass, trials):
|
||||||
|
return self.qs[inclass].rvs(trials)
|
||||||
|
|
||||||
|
def rvs(self, trials):
|
||||||
|
variates = []
|
||||||
|
added = 0
|
||||||
|
for i, (q_i, alpha_i) in enumerate(zip(self.qs, self.alphas)):
|
||||||
|
trials_i = int(trials*alpha_i) if (i < self.n-1) else trials-added
|
||||||
|
variates_i = q_i.rvs(trials_i)
|
||||||
|
variates.append(variates_i)
|
||||||
|
added += len(variates_i)
|
||||||
|
|
||||||
|
return np.concatenate(variates)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def locs(self):
|
||||||
|
return np.asarray([x.kwds['loc'] for x in self.qs])
|
||||||
|
|
||||||
|
@property
|
||||||
|
def scales(self):
|
||||||
|
return np.asarray([x.kwds['scale'] for x in self.qs])
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def change_priors(cls, q, alphas):
|
||||||
|
assert len(alphas)==len(q.alphas)
|
||||||
|
return Q(locs=q.locs, scales=q.scales, alphas=alphas)
|
||||||
|
|
||||||
|
# distribucion de test
|
||||||
|
p = Q(locs=[1, 1.5], scales=[1,1], alphas=[0.8, 0.2])
|
||||||
|
|
||||||
|
# distribucion de training (mixture por clase)
|
||||||
|
q = Q(locs=[0, 0.5, 1.2], scales=[1,1,1.5], alphas=[0.1, 0.2, 0.7])
|
||||||
|
|
||||||
|
# distribucion de referencia (mixture copia de q, con pesos de clase uniformes)
|
||||||
|
r = Q.change_priors(q, alphas=[1/3, 1/3, 1/3])
|
||||||
|
|
||||||
|
|
||||||
|
integral_approx = integrate(p, q)
|
||||||
|
montecarlo_approx = montecarlo(p, q)
|
||||||
|
montecarlo_approx2 = montecarlo_importancesampling(p, q, r)
|
||||||
|
montecarlo_approx3 = montecarlo_classweight(p, q)
|
||||||
|
|
||||||
|
print(f'{integral_approx=:.10f}')
|
||||||
|
print()
|
||||||
|
print(f'{montecarlo_approx=:.10f}, error={np.abs(integral_approx-montecarlo_approx):.10f}')
|
||||||
|
print()
|
||||||
|
print(f'{montecarlo_approx2=:.10f}, error={np.abs(integral_approx-montecarlo_approx2):.10f}')
|
||||||
|
print()
|
||||||
|
print(f'{montecarlo_approx3=:.10f}, error={np.abs(integral_approx-montecarlo_approx3):.10f}')
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,21 @@
|
||||||
|
1.- No se si sería más facil tomar r=uniforme, y no r=mixture model, simplifica mucho el sampling y tal vez incluso produzca menos error
|
||||||
|
2.- Por ahora tengo KLD y HD:
|
||||||
|
- para KLD no he entendido si tengo que añadir el -x + y
|
||||||
|
3.- Se puede poner la topsoe como una f-divergence?
|
||||||
|
La topsoe parece que es 2 veces la jensen-shannon divergence, o sea
|
||||||
|
topsoe(p,q) = kld(p|m) + kld(q|m), con m = (p+q)/2
|
||||||
|
4.- Se puede poner la Wasserstein como una f-divergence?
|
||||||
|
5.- En general, qué relación hay con las "distancias"?
|
||||||
|
|
||||||
|
|
||||||
|
2) implementar el auto
|
||||||
|
- optimización interna para likelihood [ninguno parece funcionar bien]
|
||||||
|
- de todo (e.g., todo el training)?
|
||||||
|
- independiente para cada conjunto etiquetado? (e.g., positivos, negativos, neutros, y test)
|
||||||
|
- optimización como un parámetro GridSearchQ
|
||||||
|
6) optimizar kernel? optimizar distancia?
|
||||||
|
10) Definir un clasificador que devuelva, para cada clase, una posterior como la likelihood en la class-conditional KDE dividida
|
||||||
|
por la likelihood en en todas las clases (como propone Juanjo) y meterlo en EMD. Hacer al contario: re-calibrar con
|
||||||
|
EMD y meterlo en KDEy
|
||||||
|
11) KDEx?
|
||||||
|
|
|
@ -0,0 +1,85 @@
|
||||||
|
import pickle
|
||||||
|
import os
|
||||||
|
import pandas as pd
|
||||||
|
from distribution_matching.commons import METHODS, new_method, show_results
|
||||||
|
|
||||||
|
import quapy as qp
|
||||||
|
from quapy.model_selection import GridSearchQ
|
||||||
|
from quapy.protocol import UPP
|
||||||
|
|
||||||
|
SEED=1
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
qp.environ['SAMPLE_SIZE'] = 100
|
||||||
|
qp.environ['N_JOBS'] = -1
|
||||||
|
n_bags_val = 250
|
||||||
|
n_bags_test = 1000
|
||||||
|
for optim in ['mae', 'mrae']:
|
||||||
|
|
||||||
|
result_dir = f'results/tweet/{optim}'
|
||||||
|
|
||||||
|
os.makedirs(result_dir, exist_ok=True)
|
||||||
|
|
||||||
|
for method in METHODS:
|
||||||
|
|
||||||
|
print('Init method', method)
|
||||||
|
|
||||||
|
global_result_path = f'{result_dir}/{method}'
|
||||||
|
|
||||||
|
if not os.path.exists(global_result_path+'.csv'):
|
||||||
|
with open(global_result_path+'.csv', 'wt') as csv:
|
||||||
|
csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')
|
||||||
|
|
||||||
|
with open(global_result_path+'.csv', 'at') as csv:
|
||||||
|
# four semeval dataset share the training, so it is useless to optimize hyperparameters four times;
|
||||||
|
# this variable controls that the mod sel has already been done, and skip this otherwise
|
||||||
|
semeval_trained = False
|
||||||
|
|
||||||
|
for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:
|
||||||
|
print('init', dataset)
|
||||||
|
|
||||||
|
local_result_path = global_result_path + '_' + dataset
|
||||||
|
if os.path.exists(local_result_path+'.dataframe'):
|
||||||
|
print(f'result file {local_result_path}.dataframe already exist; skipping')
|
||||||
|
continue
|
||||||
|
|
||||||
|
with qp.util.temp_seed(SEED):
|
||||||
|
|
||||||
|
is_semeval = dataset.startswith('semeval')
|
||||||
|
|
||||||
|
if not is_semeval or not semeval_trained:
|
||||||
|
|
||||||
|
param_grid, quantifier = new_method(method)
|
||||||
|
|
||||||
|
# model selection
|
||||||
|
data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=True)
|
||||||
|
|
||||||
|
protocol = UPP(data.test, repeats=n_bags_val)
|
||||||
|
modsel = GridSearchQ(quantifier, param_grid, protocol, refit=False, n_jobs=-1, verbose=1, error=optim)
|
||||||
|
|
||||||
|
modsel.fit(data.training)
|
||||||
|
print(f'best params {modsel.best_params_}')
|
||||||
|
print(f'best score {modsel.best_score_}')
|
||||||
|
pickle.dump(
|
||||||
|
(modsel.best_params_, modsel.best_score_,),
|
||||||
|
open(f'{local_result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||||
|
|
||||||
|
quantifier = modsel.best_model()
|
||||||
|
|
||||||
|
if is_semeval:
|
||||||
|
semeval_trained = True
|
||||||
|
|
||||||
|
else:
|
||||||
|
print(f'model selection for {dataset} already done; skipping')
|
||||||
|
|
||||||
|
data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=False)
|
||||||
|
quantifier.fit(data.training)
|
||||||
|
protocol = UPP(data.test, repeats=n_bags_test)
|
||||||
|
report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True)
|
||||||
|
report.to_csv(f'{local_result_path}.dataframe')
|
||||||
|
means = report.mean()
|
||||||
|
csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
|
||||||
|
csv.flush()
|
||||||
|
|
||||||
|
show_results(global_result_path)
|
|
@ -0,0 +1,57 @@
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
import os
|
||||||
|
|
||||||
|
import quapy as qp
|
||||||
|
from distribution_matching.commons import show_results
|
||||||
|
from quapy.method.aggregative import DMy
|
||||||
|
from distribution_matching.method.method_kdey import KDEy
|
||||||
|
from quapy.protocol import UPP
|
||||||
|
|
||||||
|
SEED=1
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
qp.environ['SAMPLE_SIZE'] = 100
|
||||||
|
qp.environ['N_JOBS'] = -1
|
||||||
|
n_bags_val = 250
|
||||||
|
n_bags_test = 1000
|
||||||
|
result_dir = f'results/tweet/sensibility'
|
||||||
|
|
||||||
|
os.makedirs(result_dir, exist_ok=True)
|
||||||
|
|
||||||
|
for method, param, grid in [
|
||||||
|
('KDEy-ML', 'Bandwidth', np.linspace(0.01, 0.2, 20)),
|
||||||
|
('DM-HD', 'nbins', list(range(2,10)) + list(range(10,34,2)))
|
||||||
|
]:
|
||||||
|
|
||||||
|
global_result_path = f'{result_dir}/{method}'
|
||||||
|
|
||||||
|
if not os.path.exists(global_result_path+'.csv'):
|
||||||
|
with open(global_result_path+'.csv', 'wt') as csv:
|
||||||
|
csv.write(f'Method\tDataset\t{param}\tMAE\tMRAE\tKLD\n')
|
||||||
|
|
||||||
|
with open(global_result_path+'.csv', 'at') as csv:
|
||||||
|
for val in grid:
|
||||||
|
for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:
|
||||||
|
print('init', dataset)
|
||||||
|
|
||||||
|
local_result_path = global_result_path + '_' + dataset + (f'_{val:.3f}' if isinstance(val, float) else f'{val}')
|
||||||
|
|
||||||
|
with qp.util.temp_seed(SEED):
|
||||||
|
|
||||||
|
data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=False)
|
||||||
|
|
||||||
|
if method == 'KDEy-ML':
|
||||||
|
quantifier = KDEy(LogisticRegression(n_jobs=-1), target='max_likelihood', val_split=10, bandwidth=val)
|
||||||
|
elif method == 'DM-HD':
|
||||||
|
quantifier = DMy(LogisticRegression(n_jobs=-1), val_split=10, nbins=val, divergence='HD', n_jobs=-1)
|
||||||
|
quantifier.fit(data.training)
|
||||||
|
protocol = UPP(data.test, repeats=n_bags_test)
|
||||||
|
report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True, n_jobs=-1)
|
||||||
|
report.to_csv(f'{local_result_path}.dataframe')
|
||||||
|
means = report.mean()
|
||||||
|
csv.write(f'{method}\t{data.name}\t{val}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
|
||||||
|
csv.flush()
|
||||||
|
|
||||||
|
show_results(global_result_path)
|
|
@ -0,0 +1,83 @@
|
||||||
|
import pickle
|
||||||
|
import os
|
||||||
|
from distribution_matching.commons import BIN_METHODS, new_method, show_results
|
||||||
|
|
||||||
|
import quapy as qp
|
||||||
|
from quapy.model_selection import GridSearchQ
|
||||||
|
from quapy.protocol import UPP
|
||||||
|
|
||||||
|
|
||||||
|
SEED = 1
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
qp.environ['SAMPLE_SIZE'] = 100
|
||||||
|
qp.environ['N_JOBS'] = -1
|
||||||
|
n_bags_val = 250
|
||||||
|
n_bags_test = 1000
|
||||||
|
for optim in ['mae', 'mrae']:
|
||||||
|
result_dir = f'results/binary/{optim}'
|
||||||
|
|
||||||
|
os.makedirs(result_dir, exist_ok=True)
|
||||||
|
|
||||||
|
for method in BIN_METHODS:
|
||||||
|
|
||||||
|
print('Init method', method)
|
||||||
|
|
||||||
|
global_result_path = f'{result_dir}/{method}'
|
||||||
|
|
||||||
|
if not os.path.exists(global_result_path + '.csv'):
|
||||||
|
with open(global_result_path + '.csv', 'wt') as csv:
|
||||||
|
csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')
|
||||||
|
|
||||||
|
with open(global_result_path + '.csv', 'at') as csv:
|
||||||
|
|
||||||
|
for dataset in qp.datasets.UCI_DATASETS:
|
||||||
|
if dataset in ['acute.a', 'acute.b', 'iris.1']: continue # , 'pageblocks.5', 'spambase', 'wdbc']: continue
|
||||||
|
|
||||||
|
print('init', dataset)
|
||||||
|
|
||||||
|
local_result_path = global_result_path + '_' + dataset
|
||||||
|
if os.path.exists(local_result_path + '.dataframe'):
|
||||||
|
print(f'result file {local_result_path}.dataframe already exist; skipping')
|
||||||
|
continue
|
||||||
|
|
||||||
|
with qp.util.temp_seed(SEED):
|
||||||
|
|
||||||
|
param_grid, quantifier = new_method(method, max_iter=3000)
|
||||||
|
|
||||||
|
data = qp.datasets.fetch_UCIDataset(dataset)
|
||||||
|
|
||||||
|
# model selection
|
||||||
|
train, test = data.train_test
|
||||||
|
train, val = train.split_stratified(random_state=SEED)
|
||||||
|
|
||||||
|
protocol = UPP(val, repeats=n_bags_val)
|
||||||
|
modsel = GridSearchQ(
|
||||||
|
quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=1, error=optim
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
modsel.fit(train)
|
||||||
|
|
||||||
|
print(f'best params {modsel.best_params_}')
|
||||||
|
print(f'best score {modsel.best_score_}')
|
||||||
|
pickle.dump(
|
||||||
|
(modsel.best_params_, modsel.best_score_,),
|
||||||
|
open(f'{local_result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||||
|
|
||||||
|
quantifier = modsel.best_model()
|
||||||
|
except:
|
||||||
|
print('something went wrong... trying to fit the default model')
|
||||||
|
quantifier.fit(train)
|
||||||
|
|
||||||
|
protocol = UPP(test, repeats=n_bags_test)
|
||||||
|
report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'],
|
||||||
|
verbose=True)
|
||||||
|
report.to_csv(f'{local_result_path}.dataframe')
|
||||||
|
means = report.mean()
|
||||||
|
csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
|
||||||
|
csv.flush()
|
||||||
|
|
||||||
|
show_results(global_result_path)
|
|
@ -0,0 +1,88 @@
|
||||||
|
import pickle
|
||||||
|
import os
|
||||||
|
from data.base import LabelledCollection
|
||||||
|
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
|
||||||
|
from distribution_matching.commons import METHODS, new_method, show_results
|
||||||
|
|
||||||
|
import quapy as qp
|
||||||
|
from quapy.model_selection import GridSearchQ
|
||||||
|
from quapy.protocol import UPP
|
||||||
|
|
||||||
|
|
||||||
|
SEED = 1
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
qp.environ['SAMPLE_SIZE'] = 500
|
||||||
|
qp.environ['N_JOBS'] = -1
|
||||||
|
n_bags_val = 250
|
||||||
|
n_bags_test = 1000
|
||||||
|
for optim in ['mae', 'mrae']:
|
||||||
|
result_dir = f'results/ucimulti/{optim}'
|
||||||
|
|
||||||
|
os.makedirs(result_dir, exist_ok=True)
|
||||||
|
|
||||||
|
for method in METHODS:
|
||||||
|
|
||||||
|
print('Init method', method)
|
||||||
|
|
||||||
|
global_result_path = f'{result_dir}/{method}'
|
||||||
|
|
||||||
|
if not os.path.exists(global_result_path + '.csv'):
|
||||||
|
with open(global_result_path + '.csv', 'wt') as csv:
|
||||||
|
csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')
|
||||||
|
|
||||||
|
with open(global_result_path + '.csv', 'at') as csv:
|
||||||
|
|
||||||
|
for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
|
||||||
|
|
||||||
|
print('init', dataset)
|
||||||
|
|
||||||
|
local_result_path = global_result_path + '_' + dataset
|
||||||
|
if os.path.exists(local_result_path + '.dataframe'):
|
||||||
|
print(f'result file {local_result_path}.dataframe already exist; skipping')
|
||||||
|
continue
|
||||||
|
|
||||||
|
with qp.util.temp_seed(SEED):
|
||||||
|
|
||||||
|
param_grid, quantifier = new_method(method, max_iter=3000)
|
||||||
|
|
||||||
|
data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
|
||||||
|
|
||||||
|
# model selection
|
||||||
|
train, test = data.train_test
|
||||||
|
train, val = train.split_stratified(random_state=SEED)
|
||||||
|
|
||||||
|
protocol = UPP(val, repeats=n_bags_val)
|
||||||
|
modsel = GridSearchQ(
|
||||||
|
quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=1, error=optim
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
modsel.fit(train)
|
||||||
|
|
||||||
|
print(f'best params {modsel.best_params_}')
|
||||||
|
print(f'best score {modsel.best_score_}')
|
||||||
|
pickle.dump(
|
||||||
|
(modsel.best_params_, modsel.best_score_,),
|
||||||
|
open(f'{local_result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||||
|
|
||||||
|
quantifier = modsel.best_model()
|
||||||
|
except:
|
||||||
|
print('something went wrong... trying to fit the default model')
|
||||||
|
quantifier.fit(train)
|
||||||
|
# quantifier = qp.method.aggregative.CC(LogisticRegression()).fit(train)
|
||||||
|
|
||||||
|
|
||||||
|
protocol = UPP(test, repeats=n_bags_test)
|
||||||
|
report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'],
|
||||||
|
verbose=True, n_jobs=-1)
|
||||||
|
report.to_csv(f'{local_result_path}.dataframe')
|
||||||
|
means = report.mean()
|
||||||
|
csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
|
||||||
|
csv.flush()
|
||||||
|
|
||||||
|
show_results(global_result_path)
|
|
@ -0,0 +1,63 @@
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
import os
|
||||||
|
import quapy as qp
|
||||||
|
from distribution_matching.commons import show_results
|
||||||
|
from distribution_matching.method.method_kdey import KDEy
|
||||||
|
from quapy.method.aggregative import DMy
|
||||||
|
from quapy.protocol import UPP
|
||||||
|
|
||||||
|
|
||||||
|
SEED=1
|
||||||
|
|
||||||
|
def task(val):
|
||||||
|
print('job-init', dataset, val)
|
||||||
|
|
||||||
|
with qp.util.temp_seed(SEED):
|
||||||
|
if method=='KDEy-ML':
|
||||||
|
quantifier = KDEy(LogisticRegression(), target='max_likelihood', val_split=10, bandwidth=val)
|
||||||
|
elif method == 'DM-HD':
|
||||||
|
quantifier = DMy(LogisticRegression(), val_split=10, nbins=val, divergence='HD')
|
||||||
|
|
||||||
|
quantifier.fit(data.data)
|
||||||
|
protocol = UPP(data.test, repeats=n_bags_test)
|
||||||
|
report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'],
|
||||||
|
verbose=True, n_jobs=-1)
|
||||||
|
return report
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
qp.environ['SAMPLE_SIZE'] = 500
|
||||||
|
qp.environ['N_JOBS'] = -1
|
||||||
|
n_bags_val = 250
|
||||||
|
n_bags_test = 1000
|
||||||
|
result_dir = f'results/ucimulti/sensibility'
|
||||||
|
|
||||||
|
os.makedirs(result_dir, exist_ok=True)
|
||||||
|
|
||||||
|
for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
|
||||||
|
|
||||||
|
data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
|
||||||
|
|
||||||
|
for method, param, grid in [
|
||||||
|
('KDEy-ML', 'Bandwidth', np.linspace(0.01, 0.2, 20)),
|
||||||
|
('DM-HD', 'nbins', list(range(2, 10)) + list(range(10, 34, 2)))
|
||||||
|
]:
|
||||||
|
|
||||||
|
global_result_path = f'{result_dir}/{method}'
|
||||||
|
|
||||||
|
if not os.path.exists(global_result_path+'.csv'):
|
||||||
|
with open(global_result_path+'.csv', 'wt') as csv:
|
||||||
|
csv.write(f'Method\tDataset\t{param}\tMAE\tMRAE\tKLD\n')
|
||||||
|
|
||||||
|
reports = qp.util.parallel(task, grid, n_jobs=-1)
|
||||||
|
with open(global_result_path + '.csv', 'at') as csv:
|
||||||
|
for val, report in zip(grid, reports):
|
||||||
|
means = report.mean()
|
||||||
|
local_result_path = global_result_path + '_' + dataset + (f'_{val:.3f}' if isinstance(val, float) else f'{val}')
|
||||||
|
report.to_csv(f'{local_result_path}.dataframe')
|
||||||
|
csv.write(f'{method}\t{dataset}\t{val}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
|
||||||
|
csv.flush()
|
||||||
|
|
||||||
|
show_results(global_result_path)
|
|
@ -116,7 +116,7 @@ def run(experiment):
|
||||||
model,
|
model,
|
||||||
protocol=APP(test, n_prevalences=21, repeats=100)
|
protocol=APP(test, n_prevalences=21, repeats=100)
|
||||||
)
|
)
|
||||||
test_true_prevalence = data.test.prevalence()
|
test_true_prevalence = data.mixture.prevalence()
|
||||||
|
|
||||||
evaluate_experiment(true_prevalences, estim_prevalences)
|
evaluate_experiment(true_prevalences, estim_prevalences)
|
||||||
save_results(dataset_name, model_name, run, optim_loss,
|
save_results(dataset_name, model_name, run, optim_loss,
|
||||||
|
|
|
@ -0,0 +1,244 @@
|
||||||
|
from scipy.sparse import csc_matrix, csr_matrix
|
||||||
|
from sklearn.base import BaseEstimator, TransformerMixin
|
||||||
|
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
|
||||||
|
import numpy as np
|
||||||
|
from joblib import Parallel, delayed
|
||||||
|
import sklearn
|
||||||
|
import math
|
||||||
|
from scipy.stats import t
|
||||||
|
|
||||||
|
|
||||||
|
class ContTable:
|
||||||
|
def __init__(self, tp=0, tn=0, fp=0, fn=0):
|
||||||
|
self.tp=tp
|
||||||
|
self.tn=tn
|
||||||
|
self.fp=fp
|
||||||
|
self.fn=fn
|
||||||
|
|
||||||
|
def get_d(self): return self.tp + self.tn + self.fp + self.fn
|
||||||
|
|
||||||
|
def get_c(self): return self.tp + self.fn
|
||||||
|
|
||||||
|
def get_not_c(self): return self.tn + self.fp
|
||||||
|
|
||||||
|
def get_f(self): return self.tp + self.fp
|
||||||
|
|
||||||
|
def get_not_f(self): return self.tn + self.fn
|
||||||
|
|
||||||
|
def p_c(self): return (1.0*self.get_c())/self.get_d()
|
||||||
|
|
||||||
|
def p_not_c(self): return 1.0-self.p_c()
|
||||||
|
|
||||||
|
def p_f(self): return (1.0*self.get_f())/self.get_d()
|
||||||
|
|
||||||
|
def p_not_f(self): return 1.0-self.p_f()
|
||||||
|
|
||||||
|
def p_tp(self): return (1.0*self.tp) / self.get_d()
|
||||||
|
|
||||||
|
def p_tn(self): return (1.0*self.tn) / self.get_d()
|
||||||
|
|
||||||
|
def p_fp(self): return (1.0*self.fp) / self.get_d()
|
||||||
|
|
||||||
|
def p_fn(self): return (1.0*self.fn) / self.get_d()
|
||||||
|
|
||||||
|
def tpr(self):
|
||||||
|
c = 1.0*self.get_c()
|
||||||
|
return self.tp / c if c > 0.0 else 0.0
|
||||||
|
|
||||||
|
def fpr(self):
|
||||||
|
_c = 1.0*self.get_not_c()
|
||||||
|
return self.fp / _c if _c > 0.0 else 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def __ig_factor(p_tc, p_t, p_c):
|
||||||
|
den = p_t * p_c
|
||||||
|
if den != 0.0 and p_tc != 0:
|
||||||
|
return p_tc * math.log(p_tc / den, 2)
|
||||||
|
else:
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def information_gain(cell):
|
||||||
|
return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + \
|
||||||
|
__ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) +\
|
||||||
|
__ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()) + \
|
||||||
|
__ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())
|
||||||
|
|
||||||
|
|
||||||
|
def squared_information_gain(cell):
|
||||||
|
return information_gain(cell)**2
|
||||||
|
|
||||||
|
def posneg_information_gain(cell):
|
||||||
|
ig = information_gain(cell)
|
||||||
|
if cell.tpr() < cell.fpr():
|
||||||
|
return -ig
|
||||||
|
else:
|
||||||
|
return ig
|
||||||
|
|
||||||
|
def pos_information_gain(cell):
|
||||||
|
if cell.tpr() < cell.fpr():
|
||||||
|
return 0
|
||||||
|
else:
|
||||||
|
return information_gain(cell)
|
||||||
|
|
||||||
|
def pointwise_mutual_information(cell):
|
||||||
|
return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c())
|
||||||
|
|
||||||
|
|
||||||
|
def gss(cell):
|
||||||
|
return cell.p_tp()*cell.p_tn() - cell.p_fp()*cell.p_fn()
|
||||||
|
|
||||||
|
|
||||||
|
def chi_square(cell):
|
||||||
|
den = cell.p_f() * cell.p_not_f() * cell.p_c() * cell.p_not_c()
|
||||||
|
if den==0.0: return 0.0
|
||||||
|
num = gss(cell)**2
|
||||||
|
return num / den
|
||||||
|
|
||||||
|
|
||||||
|
def conf_interval(xt, n):
|
||||||
|
if n>30:
|
||||||
|
z2 = 3.84145882069 # norm.ppf(0.5+0.95/2.0)**2
|
||||||
|
else:
|
||||||
|
z2 = t.ppf(0.5 + 0.95 / 2.0, df=max(n-1,1)) ** 2
|
||||||
|
p = (xt + 0.5 * z2) / (n + z2)
|
||||||
|
amplitude = 0.5 * z2 * math.sqrt((p * (1.0 - p)) / (n + z2))
|
||||||
|
return p, amplitude
|
||||||
|
|
||||||
|
|
||||||
|
def strength(minPosRelFreq, minPos, maxNeg):
|
||||||
|
if minPos > maxNeg:
|
||||||
|
return math.log(2.0 * minPosRelFreq, 2.0)
|
||||||
|
else:
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
|
||||||
|
#set cancel_features=True to allow some features to be weighted as 0 (as in the original article)
|
||||||
|
#however, for some extremely imbalanced dataset caused all documents to be 0
|
||||||
|
def conf_weight(cell, cancel_features=False):
|
||||||
|
c = cell.get_c()
|
||||||
|
not_c = cell.get_not_c()
|
||||||
|
tp = cell.tp
|
||||||
|
fp = cell.fp
|
||||||
|
|
||||||
|
pos_p, pos_amp = conf_interval(tp, c)
|
||||||
|
neg_p, neg_amp = conf_interval(fp, not_c)
|
||||||
|
|
||||||
|
min_pos = pos_p-pos_amp
|
||||||
|
max_neg = neg_p+neg_amp
|
||||||
|
den = (min_pos + max_neg)
|
||||||
|
minpos_relfreq = min_pos / (den if den != 0 else 1)
|
||||||
|
|
||||||
|
str_tplus = strength(minpos_relfreq, min_pos, max_neg);
|
||||||
|
|
||||||
|
if str_tplus == 0 and not cancel_features:
|
||||||
|
return 1e-20
|
||||||
|
|
||||||
|
return str_tplus;
|
||||||
|
|
||||||
|
def get_tsr_matrix(cell_matrix, tsr_score_funtion):
|
||||||
|
nC = len(cell_matrix)
|
||||||
|
nF = len(cell_matrix[0])
|
||||||
|
tsr_matrix = [[tsr_score_funtion(cell_matrix[c,f]) for f in range(nF)] for c in range(nC)]
|
||||||
|
return np.array(tsr_matrix)
|
||||||
|
|
||||||
|
|
||||||
|
def feature_label_contingency_table(positive_document_indexes, feature_document_indexes, nD):
|
||||||
|
tp_ = len(positive_document_indexes & feature_document_indexes)
|
||||||
|
fp_ = len(feature_document_indexes - positive_document_indexes)
|
||||||
|
fn_ = len(positive_document_indexes - feature_document_indexes)
|
||||||
|
tn_ = nD - (tp_ + fp_ + fn_)
|
||||||
|
return ContTable(tp=tp_, tn=tn_, fp=fp_, fn=fn_)
|
||||||
|
|
||||||
|
def category_tables(feature_sets, category_sets, c, nD, nF):
|
||||||
|
return [feature_label_contingency_table(category_sets[c], feature_sets[f], nD) for f in range(nF)]
|
||||||
|
|
||||||
|
def get_supervised_matrix(coocurrence_matrix, label_matrix, n_jobs=-1):
|
||||||
|
"""
|
||||||
|
Computes the nC x nF supervised matrix M where Mcf is the 4-cell contingency table for feature f and class c.
|
||||||
|
Efficiency O(nF x nC x log(S)) where S is the sparse factor
|
||||||
|
"""
|
||||||
|
|
||||||
|
nD, nF = coocurrence_matrix.shape
|
||||||
|
nD2, nC = label_matrix.shape
|
||||||
|
|
||||||
|
if nD != nD2:
|
||||||
|
raise ValueError('Number of rows in coocurrence matrix shape %s and label matrix shape %s is not consistent' %
|
||||||
|
(coocurrence_matrix.shape,label_matrix.shape))
|
||||||
|
|
||||||
|
def nonzero_set(matrix, col):
|
||||||
|
return set(matrix[:, col].nonzero()[0])
|
||||||
|
|
||||||
|
if isinstance(coocurrence_matrix, csr_matrix):
|
||||||
|
coocurrence_matrix = csc_matrix(coocurrence_matrix)
|
||||||
|
feature_sets = [nonzero_set(coocurrence_matrix, f) for f in range(nF)]
|
||||||
|
category_sets = [nonzero_set(label_matrix, c) for c in range(nC)]
|
||||||
|
cell_matrix = Parallel(n_jobs=n_jobs, backend="threading")(delayed(category_tables)(feature_sets, category_sets, c, nD, nF) for c in range(nC))
|
||||||
|
return np.array(cell_matrix)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class TSRweighting(BaseEstimator,TransformerMixin):
|
||||||
|
"""
|
||||||
|
Supervised Term Weighting function based on any Term Selection Reduction (TSR) function (e.g., information gain,
|
||||||
|
chi-square, etc.) or, more generally, on any function that could be computed on the 4-cell contingency table for
|
||||||
|
each category-feature pair.
|
||||||
|
The supervised_4cell_matrix (a CxF matrix containing the 4-cell contingency tables
|
||||||
|
for each category-feature pair) can be pre-computed (e.g., during the feature selection phase) and passed as an
|
||||||
|
argument.
|
||||||
|
When C>1, i.e., in multiclass scenarios, a global_policy is used in order to determine a single feature-score which
|
||||||
|
informs about its relevance. Accepted policies include "max" (takes the max score across categories), "ave" and "wave"
|
||||||
|
(take the average, or weighted average, across all categories -- weights correspond to the class prevalence), and "sum"
|
||||||
|
(which sums all category scores).
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, tsr_function, global_policy='max', supervised_4cell_matrix=None, sublinear_tf=True, norm='l2', min_df=3, n_jobs=-1):
|
||||||
|
if global_policy not in ['max', 'ave', 'wave', 'sum']: raise ValueError('Global policy should be in {"max", "ave", "wave", "sum"}')
|
||||||
|
self.tsr_function = tsr_function
|
||||||
|
self.global_policy = global_policy
|
||||||
|
self.supervised_4cell_matrix = supervised_4cell_matrix
|
||||||
|
self.sublinear_tf=sublinear_tf
|
||||||
|
self.norm=norm
|
||||||
|
self.min_df = min_df
|
||||||
|
self.n_jobs=n_jobs
|
||||||
|
|
||||||
|
def fit(self, X, y):
|
||||||
|
self.count_vectorizer = CountVectorizer(min_df=self.min_df)
|
||||||
|
X = self.count_vectorizer.fit_transform(X)
|
||||||
|
|
||||||
|
self.tf_vectorizer = TfidfTransformer(
|
||||||
|
norm=None, use_idf=False, smooth_idf=False, sublinear_tf=self.sublinear_tf).fit(X)
|
||||||
|
|
||||||
|
if len(y.shape) == 1:
|
||||||
|
y = np.expand_dims(y, axis=1)
|
||||||
|
|
||||||
|
nD, nC = y.shape
|
||||||
|
nF = len(self.tf_vectorizer.get_feature_names_out())
|
||||||
|
|
||||||
|
if self.supervised_4cell_matrix is None:
|
||||||
|
self.supervised_4cell_matrix = get_supervised_matrix(X, y, n_jobs=self.n_jobs)
|
||||||
|
else:
|
||||||
|
if self.supervised_4cell_matrix.shape != (nC, nF): raise ValueError("Shape of supervised information matrix is inconsistent with X and y")
|
||||||
|
tsr_matrix = get_tsr_matrix(self.supervised_4cell_matrix, self.tsr_function)
|
||||||
|
if self.global_policy == 'ave':
|
||||||
|
self.global_tsr_vector = np.average(tsr_matrix, axis=0)
|
||||||
|
elif self.global_policy == 'wave':
|
||||||
|
category_prevalences = [sum(y[:,c])*1.0/nD for c in range(nC)]
|
||||||
|
self.global_tsr_vector = np.average(tsr_matrix, axis=0, weights=category_prevalences)
|
||||||
|
elif self.global_policy == 'sum':
|
||||||
|
self.global_tsr_vector = np.sum(tsr_matrix, axis=0)
|
||||||
|
elif self.global_policy == 'max':
|
||||||
|
self.global_tsr_vector = np.amax(tsr_matrix, axis=0)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def fit_transform(self, X, y):
|
||||||
|
return self.fit(X,y).transform(X)
|
||||||
|
|
||||||
|
def transform(self, X):
|
||||||
|
if not hasattr(self, 'global_tsr_vector'): raise NameError('TSRweighting: transform method called before fit.')
|
||||||
|
X = self.count_vectorizer.transform(X)
|
||||||
|
tf_X = self.tf_vectorizer.transform(X).toarray()
|
||||||
|
weighted_X = np.multiply(tf_X, self.global_tsr_vector)
|
||||||
|
if self.norm is not None and self.norm!='none':
|
||||||
|
weighted_X = sklearn.preprocessing.normalize(weighted_X, norm=self.norm, axis=1, copy=False)
|
||||||
|
return csr_matrix(weighted_X)
|
|
@ -0,0 +1,62 @@
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
import quapy as qp
|
||||||
|
from method.aggregative import DistributionMatching
|
||||||
|
from distribution_matching.method.method_kdey import KDEy
|
||||||
|
from protocol import UPP
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
qp.environ['SAMPLE_SIZE'] = 100
|
||||||
|
qp.environ['N_JOBS'] = -1
|
||||||
|
method = 'KDE'
|
||||||
|
param = 0.1
|
||||||
|
div = 'topsoe'
|
||||||
|
method_identifier = f'{method}_{param}_{div}'
|
||||||
|
|
||||||
|
# generates tuples (dataset, method, method_name)
|
||||||
|
# (the dataset is needed for methods that process the dataset differently)
|
||||||
|
def gen_methods():
|
||||||
|
|
||||||
|
for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:
|
||||||
|
|
||||||
|
data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True)
|
||||||
|
|
||||||
|
if method == 'KDE':
|
||||||
|
kdey = KDEy(LogisticRegression(), divergence=div, bandwidth=param, engine='sklearn')
|
||||||
|
yield data, kdey, method_identifier
|
||||||
|
|
||||||
|
elif method == 'DM':
|
||||||
|
dm = DistributionMatching(LogisticRegression(), divergence=div, nbins=param)
|
||||||
|
yield data, dm, method_identifier
|
||||||
|
|
||||||
|
else:
|
||||||
|
raise NotImplementedError('unknown method')
|
||||||
|
|
||||||
|
os.makedirs('results', exist_ok=True)
|
||||||
|
result_path = f'results/{method_identifier}.csv'
|
||||||
|
|
||||||
|
if os.path.exists(result_path):
|
||||||
|
print('Result already exit. Nothing to do')
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
with open(result_path, 'wt') as csv:
|
||||||
|
csv.write(f'Method\tDataset\tMAE\tMRAE\n')
|
||||||
|
for data, quantifier, quant_name in gen_methods():
|
||||||
|
quantifier.fit(data.training)
|
||||||
|
protocol = UPP(data.mixture, repeats=100)
|
||||||
|
report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae'], verbose=True)
|
||||||
|
means = report.mean()
|
||||||
|
csv.write(f'{quant_name}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n')
|
||||||
|
csv.flush()
|
||||||
|
|
||||||
|
df = pd.read_csv(result_path, sep='\t')
|
||||||
|
|
||||||
|
pd.set_option('display.max_columns', None)
|
||||||
|
pd.set_option('display.max_rows', None)
|
||||||
|
pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"])
|
||||||
|
print(pv)
|
|
@ -0,0 +1,148 @@
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
|
||||||
|
import quapy as qp
|
||||||
|
from data import LabelledCollection
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from laboratory.custom_vectorizers import *
|
||||||
|
from protocol import APP
|
||||||
|
from quapy.method.aggregative import _get_divergence, HDy, DistributionMatching
|
||||||
|
from quapy.method.base import BaseQuantifier
|
||||||
|
from scipy import optimize
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: explore the bernoulli (term presence/absence) variant
|
||||||
|
# TODO: explore the multinomial (term frequency) variant
|
||||||
|
# TODO: explore the multinomial + length normalization variant
|
||||||
|
# TODO: consolidate the TSR-variant (e.g., using information gain) variant;
|
||||||
|
# - works better with the idf?
|
||||||
|
# - works better with length normalization?
|
||||||
|
# - etc
|
||||||
|
|
||||||
|
class DxS(BaseQuantifier):
|
||||||
|
def __init__(self, vectorizer=None, divergence='topsoe'):
|
||||||
|
self.vectorizer = vectorizer
|
||||||
|
self.divergence = divergence
|
||||||
|
|
||||||
|
# def __as_distribution(self, instances):
|
||||||
|
# return np.asarray(instances.sum(axis=0) / instances.sum()).flatten()
|
||||||
|
|
||||||
|
def __as_distribution(self, instances):
|
||||||
|
dist = instances.sum(axis=0) / instances.sum()
|
||||||
|
return np.asarray(dist).flatten()
|
||||||
|
|
||||||
|
def fit(self, data: LabelledCollection):
|
||||||
|
|
||||||
|
text_instances, labels = data.Xy
|
||||||
|
|
||||||
|
if self.vectorizer is not None:
|
||||||
|
text_instances = self.vectorizer.fit_transform(text_instances, y=labels)
|
||||||
|
|
||||||
|
distributions = []
|
||||||
|
for class_i in data.classes_:
|
||||||
|
distributions.append(self.__as_distribution(text_instances[labels == class_i]))
|
||||||
|
self.validation_distribution = np.asarray(distributions)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def quantify(self, text_instances):
|
||||||
|
if self.vectorizer is not None:
|
||||||
|
text_instances = self.vectorizer.transform(text_instances)
|
||||||
|
|
||||||
|
test_distribution = self.__as_distribution(text_instances)
|
||||||
|
divergence = _get_divergence(self.divergence)
|
||||||
|
n_classes, n_feats = self.validation_distribution.shape
|
||||||
|
|
||||||
|
def match(prev):
|
||||||
|
prev = np.expand_dims(prev, axis=0)
|
||||||
|
mixture_distribution = (prev @ self.validation_distribution).flatten()
|
||||||
|
return divergence(test_distribution, mixture_distribution)
|
||||||
|
|
||||||
|
# the initial point is set as the uniform distribution
|
||||||
|
uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
|
||||||
|
|
||||||
|
# solutions are bounded to those contained in the unit-simplex
|
||||||
|
bounds = tuple((0, 1) for x in range(n_classes)) # values in [0,1]
|
||||||
|
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
|
||||||
|
r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
|
||||||
|
return r.x
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
qp.environ['SAMPLE_SIZE'] = 250
|
||||||
|
qp.environ['N_JOBS'] = -1
|
||||||
|
min_df = 10
|
||||||
|
# dataset = 'imdb'
|
||||||
|
repeats = 10
|
||||||
|
error = 'mae'
|
||||||
|
|
||||||
|
div = 'topsoe'
|
||||||
|
|
||||||
|
# generates tuples (dataset, method, method_name)
|
||||||
|
# (the dataset is needed for methods that process the dataset differently)
|
||||||
|
def gen_methods():
|
||||||
|
|
||||||
|
for dataset in qp.datasets.REVIEWS_SENTIMENT_DATASETS:
|
||||||
|
|
||||||
|
data = qp.datasets.fetch_reviews(dataset, tfidf=False)
|
||||||
|
|
||||||
|
bernoulli_vectorizer = CountVectorizer(min_df=min_df, binary=True)
|
||||||
|
dxs = DxS(divergence=div, vectorizer=bernoulli_vectorizer)
|
||||||
|
yield data, dxs, 'DxS-Bernoulli'
|
||||||
|
|
||||||
|
multinomial_vectorizer = CountVectorizer(min_df=min_df, binary=False)
|
||||||
|
dxs = DxS(divergence=div, vectorizer=multinomial_vectorizer)
|
||||||
|
yield data, dxs, 'DxS-multinomial'
|
||||||
|
|
||||||
|
tf_vectorizer = TfidfVectorizer(sublinear_tf=False, use_idf=False, min_df=min_df, norm=None)
|
||||||
|
dxs = DxS(divergence=div, vectorizer=tf_vectorizer)
|
||||||
|
yield data, dxs, 'DxS-TF'
|
||||||
|
|
||||||
|
logtf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=False, min_df=min_df, norm=None)
|
||||||
|
dxs = DxS(divergence=div, vectorizer=logtf_vectorizer)
|
||||||
|
yield data, dxs, 'DxS-logTF'
|
||||||
|
|
||||||
|
tfidf_vectorizer = TfidfVectorizer(use_idf=True, min_df=min_df, norm=None)
|
||||||
|
dxs = DxS(divergence=div, vectorizer=tfidf_vectorizer)
|
||||||
|
yield data, dxs, 'DxS-TFIDF'
|
||||||
|
|
||||||
|
tfidf_vectorizer = TfidfVectorizer(use_idf=True, min_df=min_df, norm='l2')
|
||||||
|
dxs = DxS(divergence=div, vectorizer=tfidf_vectorizer)
|
||||||
|
yield data, dxs, 'DxS-TFIDF-l2'
|
||||||
|
|
||||||
|
tsr_vectorizer = TSRweighting(tsr_function=information_gain, min_df=min_df, norm='l2')
|
||||||
|
dxs = DxS(divergence=div, vectorizer=tsr_vectorizer)
|
||||||
|
yield data, dxs, 'DxS-TFTSR-l2'
|
||||||
|
|
||||||
|
data = qp.datasets.fetch_reviews(dataset, tfidf=True, min_df=min_df)
|
||||||
|
hdy = HDy(LogisticRegression())
|
||||||
|
yield data, hdy, 'HDy'
|
||||||
|
|
||||||
|
dm = DistributionMatching(LogisticRegression(), divergence=div, nbins=5)
|
||||||
|
yield data, dm, 'DM-5b'
|
||||||
|
|
||||||
|
dm = DistributionMatching(LogisticRegression(), divergence=div, nbins=10)
|
||||||
|
yield data, dm, 'DM-10b'
|
||||||
|
|
||||||
|
|
||||||
|
result_path = 'results.csv'
|
||||||
|
with open(result_path, 'wt') as csv:
|
||||||
|
csv.write(f'Method\tDataset\tMAE\tMRAE\n')
|
||||||
|
for data, quantifier, quant_name in gen_methods():
|
||||||
|
quantifier.fit(data.training)
|
||||||
|
report = qp.evaluation.evaluation_report(quantifier, APP(data.mixture, repeats=repeats), error_metrics=['mae', 'mrae'], verbose=True)
|
||||||
|
means = report.mean()
|
||||||
|
csv.write(f'{quant_name}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n')
|
||||||
|
|
||||||
|
df = pd.read_csv(result_path, sep='\t')
|
||||||
|
# print(df)
|
||||||
|
|
||||||
|
pv = df.pivot_table(index='Method', columns="Dataset", values=["MAE", "MRAE"])
|
||||||
|
print(pv)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,15 @@
|
||||||
|
Change Log 0.1.8
|
||||||
|
----------------
|
||||||
|
|
||||||
|
- A conceptual error in MedianSweep and MedianSweep2 has been solved. The error consisted on computing all TPRs and
|
||||||
|
FPRs and report the median; then, the adjustment then operated on these single values. Instead, the original
|
||||||
|
method proposed by G.Forman comes down to generating all prevalence predictions, for all TPRs and FPRs, and then
|
||||||
|
computing the median of it.
|
||||||
|
|
||||||
|
- qp.evaluation now runs in parallel <improve, remove or fix the ongoing error, put at the qp. level instead of
|
||||||
|
qp.evaluation because I don't like the qp.evaluation.evaluate thing>
|
||||||
|
|
||||||
|
|
||||||
Change Log 0.1.7
|
Change Log 0.1.7
|
||||||
----------------
|
----------------
|
||||||
|
|
||||||
|
|
|
@ -59,7 +59,7 @@ class RecalibratedProbabilisticClassifierBase(BaseEstimator, RecalibratedProbabi
|
||||||
elif isinstance(k, float):
|
elif isinstance(k, float):
|
||||||
if not (0 < k < 1):
|
if not (0 < k < 1):
|
||||||
raise ValueError('wrong value for val_split: the proportion of validation documents must be in (0,1)')
|
raise ValueError('wrong value for val_split: the proportion of validation documents must be in (0,1)')
|
||||||
return self.fit_cv(X, y)
|
return self.fit_tr_val(X, y)
|
||||||
|
|
||||||
def fit_cv(self, X, y):
|
def fit_cv(self, X, y):
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -176,7 +176,7 @@ class NeuralClassifierTrainer:
|
||||||
self.classes_ = train.classes_
|
self.classes_ = train.classes_
|
||||||
opt = self.trainer_hyperparams
|
opt = self.trainer_hyperparams
|
||||||
checkpoint = self.checkpointpath
|
checkpoint = self.checkpointpath
|
||||||
self.reset_net_params(self.vocab_size, train.n_classes)
|
self.reset_net_params(self.vocab_size, train.arange_classes)
|
||||||
|
|
||||||
train_generator = TorchDataset(train.instances, train.labels).asDataloader(
|
train_generator = TorchDataset(train.instances, train.labels).asDataloader(
|
||||||
opt['batch_size'], shuffle=True, pad_length=opt['padding_length'], device=opt['device'])
|
opt['batch_size'], shuffle=True, pad_length=opt['padding_length'], device=opt['device'])
|
||||||
|
|
|
@ -123,7 +123,7 @@ class LabelledCollection:
|
||||||
return self.uniform_sampling_index(size, random_state=random_state)
|
return self.uniform_sampling_index(size, random_state=random_state)
|
||||||
if len(prevs) == self.n_classes - 1:
|
if len(prevs) == self.n_classes - 1:
|
||||||
prevs = prevs + (1 - sum(prevs),)
|
prevs = prevs + (1 - sum(prevs),)
|
||||||
assert len(prevs) == self.n_classes, 'unexpected number of prevalences'
|
assert len(prevs) == self.n_classes, f'unexpected number of prevalences (found {len(prevs)}, expected {self.n_classes})'
|
||||||
assert sum(prevs) == 1, f'prevalences ({prevs}) wrong range (sum={sum(prevs)})'
|
assert sum(prevs) == 1, f'prevalences ({prevs}) wrong range (sum={sum(prevs)})'
|
||||||
|
|
||||||
# Decide how many instances should be taken for each class in order to satisfy the requested prevalence
|
# Decide how many instances should be taken for each class in order to satisfy the requested prevalence
|
||||||
|
@ -269,6 +269,7 @@ class LabelledCollection:
|
||||||
test = self.sampling_from_index(right)
|
test = self.sampling_from_index(right)
|
||||||
return training, test
|
return training, test
|
||||||
|
|
||||||
|
|
||||||
def __add__(self, other):
|
def __add__(self, other):
|
||||||
"""
|
"""
|
||||||
Returns a new :class:`LabelledCollection` as the union of this collection with another collection.
|
Returns a new :class:`LabelledCollection` as the union of this collection with another collection.
|
||||||
|
@ -319,7 +320,8 @@ class LabelledCollection:
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError('unsupported operation for collection types')
|
raise NotImplementedError('unsupported operation for collection types')
|
||||||
labels = np.concatenate([lc.labels for lc in args])
|
labels = np.concatenate([lc.labels for lc in args])
|
||||||
classes = np.unique(labels).sort()
|
classes = np.unique(labels)
|
||||||
|
classes.sort()
|
||||||
return LabelledCollection(instances, labels, classes=classes)
|
return LabelledCollection(instances, labels, classes=classes)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|
|
@ -1,3 +1,6 @@
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
def warn(*args, **kwargs):
|
def warn(*args, **kwargs):
|
||||||
pass
|
pass
|
||||||
import warnings
|
import warnings
|
||||||
|
@ -7,7 +10,8 @@ import zipfile
|
||||||
from os.path import join
|
from os.path import join
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import scipy
|
import scipy
|
||||||
|
import pickle
|
||||||
|
from ucimlrepo import fetch_ucirepo
|
||||||
from quapy.data.base import Dataset, LabelledCollection
|
from quapy.data.base import Dataset, LabelledCollection
|
||||||
from quapy.data.preprocessing import text2tfidf, reduce_columns
|
from quapy.data.preprocessing import text2tfidf, reduce_columns
|
||||||
from quapy.data.reader import *
|
from quapy.data.reader import *
|
||||||
|
@ -45,6 +49,12 @@ UCI_DATASETS = ['acute.a', 'acute.b',
|
||||||
'wine-q-red', 'wine-q-white',
|
'wine-q-red', 'wine-q-white',
|
||||||
'yeast']
|
'yeast']
|
||||||
|
|
||||||
|
UCI_MULTICLASS_DATASETS = ['dry-bean',
|
||||||
|
'wine-quality',
|
||||||
|
'academic-success',
|
||||||
|
'digits',
|
||||||
|
'letter']
|
||||||
|
|
||||||
LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B']
|
LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B']
|
||||||
|
|
||||||
_TXA_SAMPLE_SIZE = 250
|
_TXA_SAMPLE_SIZE = 250
|
||||||
|
@ -204,7 +214,7 @@ def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False
|
||||||
:return: a :class:`quapy.data.base.Dataset` instance
|
:return: a :class:`quapy.data.base.Dataset` instance
|
||||||
"""
|
"""
|
||||||
data = fetch_UCILabelledCollection(dataset_name, data_home, verbose)
|
data = fetch_UCILabelledCollection(dataset_name, data_home, verbose)
|
||||||
return Dataset(*data.split_stratified(1 - test_split, random_state=0))
|
return Dataset(*data.split_stratified(1 - test_split, random_state=0), name=dataset_name)
|
||||||
|
|
||||||
|
|
||||||
def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
|
def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
|
||||||
|
@ -548,11 +558,110 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
|
||||||
data.stats()
|
data.stats()
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
|
def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
|
||||||
df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)
|
df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
|
||||||
|
"""
|
||||||
|
Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`, as used in
|
||||||
|
`Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
|
||||||
|
Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
|
||||||
|
Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
|
||||||
|
and
|
||||||
|
`Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
|
||||||
|
Dynamic ensemble selection for quantification tasks.
|
||||||
|
Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
|
||||||
|
The datasets do not come with a predefined train-test split (see :meth:`fetch_UCILabelledCollection` for further
|
||||||
|
information on how to use these collections), and so a train-test split is generated at desired proportion.
|
||||||
|
The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_DATASETS`
|
||||||
|
|
||||||
|
:param dataset_name: a dataset name
|
||||||
|
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
|
||||||
|
~/quay_data/ directory)
|
||||||
|
:param test_split: proportion of documents to be included in the test set. The rest conforms the training set
|
||||||
|
:param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
|
||||||
|
:return: a :class:`quapy.data.base.Dataset` instance
|
||||||
|
"""
|
||||||
|
data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose)
|
||||||
|
return Dataset(*data.split_stratified(1 - test_split, random_state=0))
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
|
||||||
|
"""
|
||||||
|
Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`, as used in
|
||||||
|
`Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
|
||||||
|
Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
|
||||||
|
Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
|
||||||
|
and
|
||||||
|
`Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
|
||||||
|
Dynamic ensemble selection for quantification tasks.
|
||||||
|
Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
|
||||||
|
The datasets do not come with a predefined train-test split, and so Pérez-Gállego et al. adopted a 5FCVx2 evaluation
|
||||||
|
protocol, meaning that each collection was used to generate two rounds (hence the x2) of 5 fold cross validation.
|
||||||
|
This can be reproduced by using :meth:`quapy.data.base.Dataset.kFCV`, e.g.:
|
||||||
|
|
||||||
|
>>> import quapy as qp
|
||||||
|
>>> collection = qp.datasets.fetch_UCILabelledCollection("dry-bean")
|
||||||
|
>>> for data in qp.domains.Dataset.kFCV(collection, nfolds=5, nrepeats=2):
|
||||||
|
>>> ...
|
||||||
|
|
||||||
|
The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`
|
||||||
|
|
||||||
|
:param dataset_name: a dataset name
|
||||||
|
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
|
||||||
|
~/quay_data/ directory)
|
||||||
|
:param test_split: proportion of documents to be included in the test set. The rest conforms the training set
|
||||||
|
:param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
|
||||||
|
:return: a :class:`quapy.data.base.LabelledCollection` instance
|
||||||
|
"""
|
||||||
|
assert dataset_name in UCI_MULTICLASS_DATASETS, \
|
||||||
|
f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository (multiclass). ' \
|
||||||
|
f'Valid ones are {UCI_MULTICLASS_DATASETS}'
|
||||||
|
|
||||||
|
if data_home is None:
|
||||||
|
data_home = get_quapy_home()
|
||||||
|
|
||||||
|
identifiers = {"dry-bean": 602,
|
||||||
|
"wine-quality": 186,
|
||||||
|
"academic-success": 697,
|
||||||
|
"digits": 80,
|
||||||
|
"letter": 59}
|
||||||
|
|
||||||
|
full_names = {"dry-bean": "Dry Bean Dataset",
|
||||||
|
"wine-quality": "Wine Quality",
|
||||||
|
"academic-success": "Predict students' dropout and academic success",
|
||||||
|
"digits": "Optical Recognition of Handwritten Digits",
|
||||||
|
"letter": "Letter Recognition"
|
||||||
|
}
|
||||||
|
|
||||||
|
identifier = identifiers[dataset_name]
|
||||||
|
fullname = full_names[dataset_name]
|
||||||
|
|
||||||
|
print(f'Loading UCI Muticlass {dataset_name} ({fullname})')
|
||||||
|
|
||||||
|
file = join(data_home, 'uci_multiclass', dataset_name + '.pkl')
|
||||||
|
if os.path.exists(file):
|
||||||
|
with open(file, 'rb') as file:
|
||||||
|
data = pickle.load(file)
|
||||||
|
else:
|
||||||
|
data = fetch_ucirepo(id=identifier)
|
||||||
|
X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze()
|
||||||
|
classes = np.sort(np.unique(y))
|
||||||
|
y = np.searchsorted(classes, y)
|
||||||
|
data = LabelledCollection(X, y)
|
||||||
|
os.makedirs(os.path.dirname(file), exist_ok=True)
|
||||||
|
with open(file, 'wb') as file:
|
||||||
|
pickle.dump(data, file)
|
||||||
|
|
||||||
|
data.stats()
|
||||||
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
def _df_replace(df, col, repl={'yes': 1, 'no': 0}, astype=float):
|
||||||
|
df[col] = df[col].apply(lambda x: repl[x]).astype(astype, copy=False)
|
||||||
|
|
||||||
|
|
||||||
def fetch_lequa2022(task, data_home=None):
|
def fetch_lequa2022(task, data_home=None):
|
||||||
"""
|
"""
|
||||||
Loads the official datasets provided for the `LeQua <https://lequa2022.github.io/index>`_ competition.
|
Loads the official datasets provided for the `LeQua <https://lequa2022.github.io/index>`_ competition.
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
from copy import deepcopy
|
||||||
from typing import Union, Callable, Iterable
|
from typing import Union, Callable, Iterable
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
@ -11,7 +12,9 @@ def prediction(
|
||||||
model: BaseQuantifier,
|
model: BaseQuantifier,
|
||||||
protocol: AbstractProtocol,
|
protocol: AbstractProtocol,
|
||||||
aggr_speedup: Union[str, bool] = 'auto',
|
aggr_speedup: Union[str, bool] = 'auto',
|
||||||
verbose=False):
|
verbose=False,
|
||||||
|
verbose_error=None,
|
||||||
|
n_jobs=1):
|
||||||
"""
|
"""
|
||||||
Uses a quantification model to generate predictions for the samples generated via a specific protocol.
|
Uses a quantification model to generate predictions for the samples generated via a specific protocol.
|
||||||
This function is central to all evaluation processes, and is endowed with an optimization to speed-up the
|
This function is central to all evaluation processes, and is endowed with an optimization to speed-up the
|
||||||
|
@ -32,6 +35,11 @@ def prediction(
|
||||||
in the samples to be generated. Set to True or "auto" (default) for letting QuaPy decide whether it is
|
in the samples to be generated. Set to True or "auto" (default) for letting QuaPy decide whether it is
|
||||||
convenient or not. Set to False to deactivate.
|
convenient or not. Set to False to deactivate.
|
||||||
:param verbose: boolean, show or not information in stdout
|
:param verbose: boolean, show or not information in stdout
|
||||||
|
:param verbose_error: an evaluation function to be used to display intermediate results if verbose=True (default None)
|
||||||
|
:param n_jobs: number of parallel workers. Default is 1 so that, if not explicitly requested, the evaluation phase
|
||||||
|
is to be carried out in a single core. That is to say, this parameter will not inspect the environment variable
|
||||||
|
N_JOBS by default. This might be convenient in many situations, since parallelizing the evaluation entails
|
||||||
|
adding an overhead for cloning the objects within different threads that is often not worth the effort.
|
||||||
:return: a tuple `(true_prevs, estim_prevs)` in which each element in the tuple is an array of shape
|
:return: a tuple `(true_prevs, estim_prevs)` in which each element in the tuple is an array of shape
|
||||||
`(n_samples, n_classes)` containing the true, or predicted, prevalence values for each sample
|
`(n_samples, n_classes)` containing the true, or predicted, prevalence values for each sample
|
||||||
"""
|
"""
|
||||||
|
@ -61,16 +69,44 @@ def prediction(
|
||||||
if apply_optimization:
|
if apply_optimization:
|
||||||
pre_classified = model.classify(protocol.get_labelled_collection().instances)
|
pre_classified = model.classify(protocol.get_labelled_collection().instances)
|
||||||
protocol_with_predictions = protocol.on_preclassified_instances(pre_classified)
|
protocol_with_predictions = protocol.on_preclassified_instances(pre_classified)
|
||||||
return __prediction_helper(model.aggregate, protocol_with_predictions, verbose)
|
return __prediction_helper(model, protocol_with_predictions, True, verbose, verbose_error, n_jobs)
|
||||||
else:
|
else:
|
||||||
return __prediction_helper(model.quantify, protocol, verbose)
|
return __prediction_helper(model, protocol, False, verbose, verbose_error, n_jobs)
|
||||||
|
|
||||||
|
|
||||||
def __prediction_helper(quantification_fn, protocol: AbstractProtocol, verbose=False):
|
def __delayed_prediction(args):
|
||||||
|
quantifier, aggregate, sample_instances, sample_prev = args
|
||||||
|
quantifier = deepcopy(quantifier)
|
||||||
|
quant_fn = quantifier.aggregate if aggregate else quantifier.quantify
|
||||||
|
predicted = quant_fn(sample_instances)
|
||||||
|
return sample_prev, predicted
|
||||||
|
|
||||||
|
|
||||||
|
def __prediction_helper(quantifier, protocol: AbstractProtocol, aggregate: bool, verbose=False, verbose_error=None, n_jobs=1):
|
||||||
true_prevs, estim_prevs = [], []
|
true_prevs, estim_prevs = [], []
|
||||||
for sample_instances, sample_prev in tqdm(protocol(), total=protocol.total(), desc='predicting') if verbose else protocol():
|
ongoing_errors = []
|
||||||
estim_prevs.append(quantification_fn(sample_instances))
|
if verbose:
|
||||||
true_prevs.append(sample_prev)
|
pbar = tqdm(protocol(), total=protocol.total(), desc='predicting')
|
||||||
|
if n_jobs==1:
|
||||||
|
quant_fn = quantifier.aggregate if aggregate else quantifier.quantify
|
||||||
|
for sample_instances, sample_prev in pbar if verbose else protocol():
|
||||||
|
predicted = quant_fn(sample_instances)
|
||||||
|
estim_prevs.append(predicted)
|
||||||
|
true_prevs.append(sample_prev)
|
||||||
|
if verbose and verbose_error is not None:
|
||||||
|
err = verbose_error(sample_prev, predicted)
|
||||||
|
ongoing_errors.append(err)
|
||||||
|
pbar.set_description(f'predicting: ongoing error={np.mean(ongoing_errors):.5f}')
|
||||||
|
else:
|
||||||
|
if verbose:
|
||||||
|
print('parallelizing prediction')
|
||||||
|
outputs = qp.util.parallel(
|
||||||
|
__delayed_prediction,
|
||||||
|
((quantifier, aggregate, sample_X, sample_p) for (sample_X, sample_p) in (pbar if verbose else protocol())),
|
||||||
|
seed=qp.environ.get('_R_SEED', None),
|
||||||
|
n_jobs=n_jobs
|
||||||
|
)
|
||||||
|
true_prevs, estim_prevs = list(zip(*outputs))
|
||||||
|
|
||||||
true_prevs = np.asarray(true_prevs)
|
true_prevs = np.asarray(true_prevs)
|
||||||
estim_prevs = np.asarray(estim_prevs)
|
estim_prevs = np.asarray(estim_prevs)
|
||||||
|
@ -82,7 +118,7 @@ def evaluation_report(model: BaseQuantifier,
|
||||||
protocol: AbstractProtocol,
|
protocol: AbstractProtocol,
|
||||||
error_metrics: Iterable[Union[str,Callable]] = 'mae',
|
error_metrics: Iterable[Union[str,Callable]] = 'mae',
|
||||||
aggr_speedup: Union[str, bool] = 'auto',
|
aggr_speedup: Union[str, bool] = 'auto',
|
||||||
verbose=False):
|
verbose=False, verbose_error=None, n_jobs=1):
|
||||||
"""
|
"""
|
||||||
Generates a report (a pandas' DataFrame) containing information of the evaluation of the model as according
|
Generates a report (a pandas' DataFrame) containing information of the evaluation of the model as according
|
||||||
to a specific protocol and in terms of one or more evaluation metrics (errors).
|
to a specific protocol and in terms of one or more evaluation metrics (errors).
|
||||||
|
@ -100,12 +136,19 @@ def evaluation_report(model: BaseQuantifier,
|
||||||
in the samples to be generated. Set to True or "auto" (default) for letting QuaPy decide whether it is
|
in the samples to be generated. Set to True or "auto" (default) for letting QuaPy decide whether it is
|
||||||
convenient or not. Set to False to deactivate.
|
convenient or not. Set to False to deactivate.
|
||||||
:param verbose: boolean, show or not information in stdout
|
:param verbose: boolean, show or not information in stdout
|
||||||
|
:param verbose_error: an evaluation function to be used to display intermediate results if verbose=True (default None)
|
||||||
|
:param n_jobs: number of parallel workers. Default is 1 so that, if not explicitly requested, the evaluation phase
|
||||||
|
is to be carried out in a single core. That is to say, this parameter will not inspect the environment variable
|
||||||
|
N_JOBS by default. This might be convenient in many situations, since parallelizing the evaluation entails
|
||||||
|
adding an overhead for cloning the objects within different threads that is often not worth the effort.
|
||||||
:return: a pandas' DataFrame containing the columns 'true-prev' (the true prevalence of each sample),
|
:return: a pandas' DataFrame containing the columns 'true-prev' (the true prevalence of each sample),
|
||||||
'estim-prev' (the prevalence estimated by the model for each sample), and as many columns as error metrics
|
'estim-prev' (the prevalence estimated by the model for each sample), and as many columns as error metrics
|
||||||
have been indicated, each displaying the score in terms of that metric for every sample.
|
have been indicated, each displaying the score in terms of that metric for every sample.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
true_prevs, estim_prevs = prediction(model, protocol, aggr_speedup=aggr_speedup, verbose=verbose)
|
true_prevs, estim_prevs = prediction(
|
||||||
|
model, protocol, aggr_speedup=aggr_speedup, verbose=verbose, verbose_error=verbose_error, n_jobs=n_jobs
|
||||||
|
)
|
||||||
return _prevalence_report(true_prevs, estim_prevs, error_metrics)
|
return _prevalence_report(true_prevs, estim_prevs, error_metrics)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -90,11 +90,32 @@ def TopsoeDistance(P, Q, epsilon=1e-20):
|
||||||
|
|
||||||
:param P: real-valued array-like of shape `(k,)` representing a discrete distribution
|
:param P: real-valued array-like of shape `(k,)` representing a discrete distribution
|
||||||
:param Q: real-valued array-like of shape `(k,)` representing a discrete distribution
|
:param Q: real-valued array-like of shape `(k,)` representing a discrete distribution
|
||||||
|
:param epsilon: small value to smooth the distributions for numerical stability
|
||||||
:return: float
|
:return: float
|
||||||
"""
|
"""
|
||||||
return np.sum(P*np.log((2*P+epsilon)/(P+Q+epsilon)) + Q*np.log((2*Q+epsilon)/(P+Q+epsilon)))
|
return np.sum(P*np.log((2*P+epsilon)/(P+Q+epsilon)) + Q*np.log((2*Q+epsilon)/(P+Q+epsilon)))
|
||||||
|
|
||||||
|
|
||||||
|
def CauchySchwarz(P, Q, epsilon=1e-20):
|
||||||
|
"""
|
||||||
|
Cauchy-Schwarz divergence between two (discretized) distributions `P` and `Q`.
|
||||||
|
The Cauchy-Schwarz divergence for two discrete distributions of `k` bins is defined as:
|
||||||
|
|
||||||
|
.. math::
|
||||||
|
CS(P,Q) = \\frac{ \\sum_{i=1}^k p_i q_i }{
|
||||||
|
\\left( \\sum_{i=1}^k p^2_i \\right) \\left( \\sum_{i=1}^k q^2_i \\right)
|
||||||
|
}
|
||||||
|
|
||||||
|
:param P: real-valued array-like of shape `(k,)` representing a discrete distribution
|
||||||
|
:param Q: real-valued array-like of shape `(k,)` representing a discrete distribution
|
||||||
|
:param epsilon: small value to smooth the distributions for numerical stability
|
||||||
|
:return: float
|
||||||
|
"""
|
||||||
|
P += epsilon
|
||||||
|
Q += epsilon
|
||||||
|
return - np.log(sum(P * Q) / np.sqrt(sum(P ** 2) * sum(Q ** 2)))
|
||||||
|
|
||||||
|
|
||||||
def uniform_prevalence_sampling(n_classes, size=1):
|
def uniform_prevalence_sampling(n_classes, size=1):
|
||||||
"""
|
"""
|
||||||
Implements the `Kraemer algorithm <http://www.cs.cmu.edu/~nasmith/papers/smith+tromble.tr04.pdf>`_
|
Implements the `Kraemer algorithm <http://www.cs.cmu.edu/~nasmith/papers/smith+tromble.tr04.pdf>`_
|
||||||
|
@ -276,3 +297,24 @@ def check_prevalence_vector(p, raise_exception=False, toleranze=1e-08):
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def optim_minimize(loss, n_classes):
|
||||||
|
"""
|
||||||
|
Searches for the optimal prevalence values, i.e., an `n_classes`-dimensional vector of the (`n_classes`-1)-simplex
|
||||||
|
that yields the smallest lost. This optimization is carried out by means of a constrained search using scipy's
|
||||||
|
SLSQP routine.
|
||||||
|
|
||||||
|
:param loss: (callable) the function to minimize
|
||||||
|
:param n_classes: (int) the number of classes, i.e., the dimensionality of the prevalence vector
|
||||||
|
:return: (ndarray) the best prevalence vector found
|
||||||
|
"""
|
||||||
|
from scipy import optimize
|
||||||
|
|
||||||
|
# the initial point is set as the uniform distribution
|
||||||
|
uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
|
||||||
|
|
||||||
|
# solutions are bounded to those contained in the unit-simplex
|
||||||
|
bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1]
|
||||||
|
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
|
||||||
|
r = optimize.minimize(loss, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
|
||||||
|
return r.x
|
||||||
|
|
|
@ -604,10 +604,17 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
|
||||||
|
|
||||||
def _get_divergence(divergence: Union[str, Callable]):
|
def _get_divergence(divergence: Union[str, Callable]):
|
||||||
if isinstance(divergence, str):
|
if isinstance(divergence, str):
|
||||||
if divergence=='HD':
|
divergence = divergence.lower()
|
||||||
|
if divergence=='hd':
|
||||||
return F.HellingerDistance
|
return F.HellingerDistance
|
||||||
elif divergence=='topsoe':
|
elif divergence=='topsoe':
|
||||||
return F.TopsoeDistance
|
return F.TopsoeDistance
|
||||||
|
elif divergence.lower()=='cs':
|
||||||
|
return F.CauchySchwarz
|
||||||
|
elif divergence.lower()=='l2':
|
||||||
|
return lambda a,b: np.linalg.norm(a-b)
|
||||||
|
elif divergence.lower()=='l1':
|
||||||
|
return lambda a,b: np.linalg.norm(a-b, ord=1)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f'unknown divergence {divergence}')
|
raise ValueError(f'unknown divergence {divergence}')
|
||||||
elif callable(divergence):
|
elif callable(divergence):
|
||||||
|
@ -770,7 +777,9 @@ class DistributionMatching(AggregativeProbabilisticQuantifier):
|
||||||
"""
|
"""
|
||||||
Trains the classifier (if requested) and generates the validation distributions out of the training data.
|
Trains the classifier (if requested) and generates the validation distributions out of the training data.
|
||||||
The validation distributions have shape `(n, ch, nbins)`, with `n` the number of classes, `ch` the number of
|
The validation distributions have shape `(n, ch, nbins)`, with `n` the number of classes, `ch` the number of
|
||||||
channels, and `nbins` the number of bins. In particular, let `V` be the validation distributions; `di=V[i]`
|
channels (a channel is a description, in form of a histogram, of a specific class -- there are as many channels
|
||||||
|
as classes, although in the binary case one can use only one channel, since the other one is constrained),
|
||||||
|
and `nbins` the number of bins. In particular, let `V` be the validation distributions; `di=V[i]`
|
||||||
are the distributions obtained from training data labelled with class `i`; `dij = di[j]` is the discrete
|
are the distributions obtained from training data labelled with class `i`; `dij = di[j]` is the discrete
|
||||||
distribution of posterior probabilities `P(Y=j|X=x)` for training data labelled with class `i`, and `dij[k]`
|
distribution of posterior probabilities `P(Y=j|X=x)` for training data labelled with class `i`, and `dij[k]`
|
||||||
is the fraction of instances with a value in the `k`-th bin.
|
is the fraction of instances with a value in the `k`-th bin.
|
||||||
|
@ -819,7 +828,7 @@ class DistributionMatching(AggregativeProbabilisticQuantifier):
|
||||||
uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
|
uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
|
||||||
|
|
||||||
# solutions are bounded to those contained in the unit-simplex
|
# solutions are bounded to those contained in the unit-simplex
|
||||||
bounds = tuple((0, 1) for x in range(n_classes)) # values in [0,1]
|
bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1]
|
||||||
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
|
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
|
||||||
r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
|
r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
|
||||||
return r.x
|
return r.x
|
||||||
|
@ -1085,7 +1094,7 @@ class T50(ThresholdOptimization):
|
||||||
Threshold Optimization variant for :class:`ACC` as proposed by
|
Threshold Optimization variant for :class:`ACC` as proposed by
|
||||||
`Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
|
`Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
|
||||||
`Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_ that looks
|
`Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_ that looks
|
||||||
for the threshold that makes `tpr` cosest to 0.5.
|
for the threshold that makes `tpr` closest to 0.5.
|
||||||
The goal is to bring improved stability to the denominator of the adjustment.
|
The goal is to bring improved stability to the denominator of the adjustment.
|
||||||
|
|
||||||
:param classifier: a sklearn's Estimator that generates a classifier
|
:param classifier: a sklearn's Estimator that generates a classifier
|
||||||
|
@ -1173,7 +1182,7 @@ class MS(ThresholdOptimization):
|
||||||
super().__init__(classifier, val_split)
|
super().__init__(classifier, val_split)
|
||||||
|
|
||||||
def _condition(self, tpr, fpr) -> float:
|
def _condition(self, tpr, fpr) -> float:
|
||||||
pass
|
return True
|
||||||
|
|
||||||
def _optimize_threshold(self, y, probabilities):
|
def _optimize_threshold(self, y, probabilities):
|
||||||
tprs = []
|
tprs = []
|
||||||
|
@ -1184,9 +1193,26 @@ class MS(ThresholdOptimization):
|
||||||
TP, FP, FN, TN = self._compute_table(y, y_)
|
TP, FP, FN, TN = self._compute_table(y, y_)
|
||||||
tpr = self._compute_tpr(TP, FP)
|
tpr = self._compute_tpr(TP, FP)
|
||||||
fpr = self._compute_fpr(FP, TN)
|
fpr = self._compute_fpr(FP, TN)
|
||||||
tprs.append(tpr)
|
if self._condition(tpr, fpr):
|
||||||
fprs.append(fpr)
|
tprs.append(tpr)
|
||||||
return np.median(tprs), np.median(fprs)
|
fprs.append(fpr)
|
||||||
|
return tprs, fprs
|
||||||
|
|
||||||
|
def aggregate(self, classif_predictions):
|
||||||
|
prevs_estim = self.cc.aggregate(classif_predictions)
|
||||||
|
|
||||||
|
positive_prevs = []
|
||||||
|
for tpr, fpr in zip(self.tpr, self.fpr):
|
||||||
|
if tpr - fpr > 0:
|
||||||
|
acc = np.clip((prevs_estim[1] - fpr) / (tpr - fpr), 0, 1)
|
||||||
|
positive_prevs.append(acc)
|
||||||
|
|
||||||
|
if len(positive_prevs) > 0:
|
||||||
|
adjusted_positive_prev = np.median(positive_prevs)
|
||||||
|
adjusted_prevs_estim = np.array((1 - adjusted_positive_prev, adjusted_positive_prev))
|
||||||
|
return adjusted_prevs_estim
|
||||||
|
else:
|
||||||
|
return prevs_estim
|
||||||
|
|
||||||
|
|
||||||
class MS2(MS):
|
class MS2(MS):
|
||||||
|
@ -1209,19 +1235,9 @@ class MS2(MS):
|
||||||
def __init__(self, classifier: BaseEstimator, val_split=0.4):
|
def __init__(self, classifier: BaseEstimator, val_split=0.4):
|
||||||
super().__init__(classifier, val_split)
|
super().__init__(classifier, val_split)
|
||||||
|
|
||||||
def _optimize_threshold(self, y, probabilities):
|
def _condition(self, tpr, fpr) -> float:
|
||||||
tprs = [0, 1]
|
return (tpr - fpr) > 0.25
|
||||||
fprs = [0, 1]
|
|
||||||
candidate_thresholds = np.unique(probabilities[:, 1])
|
|
||||||
for candidate_threshold in candidate_thresholds:
|
|
||||||
y_ = [self.classes_[1] if p > candidate_threshold else self.classes_[0] for p in probabilities[:, 1]]
|
|
||||||
TP, FP, FN, TN = self._compute_table(y, y_)
|
|
||||||
tpr = self._compute_tpr(TP, FP)
|
|
||||||
fpr = self._compute_fpr(FP, TN)
|
|
||||||
if (tpr - fpr) > 0.25:
|
|
||||||
tprs.append(tpr)
|
|
||||||
fprs.append(fpr)
|
|
||||||
return np.median(tprs), np.median(fprs)
|
|
||||||
|
|
||||||
|
|
||||||
ClassifyAndCount = CC
|
ClassifyAndCount = CC
|
||||||
|
|
|
@ -32,13 +32,13 @@ class QuaNetTrainer(BaseQuantifier):
|
||||||
>>> qp.domains.preprocessing.index(dataset, min_df=5, inplace=True)
|
>>> qp.domains.preprocessing.index(dataset, min_df=5, inplace=True)
|
||||||
>>>
|
>>>
|
||||||
>>> # the text classifier is a CNN trained by NeuralClassifierTrainer
|
>>> # the text classifier is a CNN trained by NeuralClassifierTrainer
|
||||||
>>> cnn = CNNnet(dataset.vocabulary_size, dataset.n_classes)
|
>>> cnn = CNNnet(dataset.vocabulary_size, dataset.arange_classes)
|
||||||
>>> classifier = NeuralClassifierTrainer(cnn, device='cuda')
|
>>> classifier = NeuralClassifierTrainer(cnn, device='cuda')
|
||||||
>>>
|
>>>
|
||||||
>>> # train QuaNet (QuaNet is an alias to QuaNetTrainer)
|
>>> # train QuaNet (QuaNet is an alias to QuaNetTrainer)
|
||||||
>>> model = QuaNet(classifier, qp.environ['SAMPLE_SIZE'], device='cuda')
|
>>> model = QuaNet(classifier, qp.environ['SAMPLE_SIZE'], device='cuda')
|
||||||
>>> model.fit(dataset.training)
|
>>> model.fit(dataset.training)
|
||||||
>>> estim_prevalence = model.quantify(dataset.test.instances)
|
>>> estim_prevalence = model.quantify(dataset.mixture.instances)
|
||||||
|
|
||||||
:param classifier: an object implementing `fit` (i.e., that can be trained on labelled data),
|
:param classifier: an object implementing `fit` (i.e., that can be trained on labelled data),
|
||||||
`predict_proba` (i.e., that can generate posterior probabilities of unlabelled examples) and
|
`predict_proba` (i.e., that can generate posterior probabilities of unlabelled examples) and
|
||||||
|
|
|
@ -54,6 +54,7 @@ class GridSearchQ(BaseQuantifier):
|
||||||
self.__check_error(error)
|
self.__check_error(error)
|
||||||
assert isinstance(protocol, AbstractProtocol), 'unknown protocol'
|
assert isinstance(protocol, AbstractProtocol), 'unknown protocol'
|
||||||
|
|
||||||
|
|
||||||
def _sout(self, msg):
|
def _sout(self, msg):
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
print(f'[{self.__class__.__name__}:{self.model.__class__.__name__}]: {msg}')
|
print(f'[{self.__class__.__name__}:{self.model.__class__.__name__}]: {msg}')
|
||||||
|
@ -88,7 +89,7 @@ class GridSearchQ(BaseQuantifier):
|
||||||
|
|
||||||
hyper = [dict({k: val[i] for i, k in enumerate(params_keys)}) for val in itertools.product(*params_values)]
|
hyper = [dict({k: val[i] for i, k in enumerate(params_keys)}) for val in itertools.product(*params_values)]
|
||||||
self._sout(f'starting model selection with {self.n_jobs =}')
|
self._sout(f'starting model selection with {self.n_jobs =}')
|
||||||
#pass a seed to parallel so it is set in clild processes
|
# pass a seed to parallel so it is set in clild processes
|
||||||
scores = qp.util.parallel(
|
scores = qp.util.parallel(
|
||||||
self._delayed_eval,
|
self._delayed_eval,
|
||||||
((params, training) for params in hyper),
|
((params, training) for params in hyper),
|
||||||
|
@ -223,7 +224,7 @@ def cross_val_predict(quantifier: BaseQuantifier, data: LabelledCollection, nfol
|
||||||
for train, test in data.kFCV(nfolds=nfolds, random_state=random_state):
|
for train, test in data.kFCV(nfolds=nfolds, random_state=random_state):
|
||||||
quantifier.fit(train)
|
quantifier.fit(train)
|
||||||
fold_prev = quantifier.quantify(test.X)
|
fold_prev = quantifier.quantify(test.X)
|
||||||
rel_size = len(test.X)/len(data)
|
rel_size = len(test)/len(data)
|
||||||
total_prev += fold_prev*rel_size
|
total_prev += fold_prev*rel_size
|
||||||
|
|
||||||
return total_prev
|
return total_prev
|
||||||
|
|
|
@ -11,7 +11,7 @@ import quapy as qp
|
||||||
|
|
||||||
plt.rcParams['figure.figsize'] = [10, 6]
|
plt.rcParams['figure.figsize'] = [10, 6]
|
||||||
plt.rcParams['figure.dpi'] = 200
|
plt.rcParams['figure.dpi'] = 200
|
||||||
plt.rcParams['font.size'] = 18
|
plt.rcParams['font.size'] = 12
|
||||||
|
|
||||||
|
|
||||||
def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=None, show_std=True, legend=True,
|
def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=None, show_std=True, legend=True,
|
||||||
|
@ -259,7 +259,7 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
||||||
data = _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order)
|
data = _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order)
|
||||||
|
|
||||||
if method_order is None:
|
if method_order is None:
|
||||||
method_order = method_names
|
method_order = np.unique(method_names)
|
||||||
|
|
||||||
_set_colors(ax, n_methods=len(method_order))
|
_set_colors(ax, n_methods=len(method_order))
|
||||||
|
|
||||||
|
@ -330,8 +330,8 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
|
||||||
|
|
||||||
if show_legend:
|
if show_legend:
|
||||||
fig.legend(loc='lower center',
|
fig.legend(loc='lower center',
|
||||||
bbox_to_anchor=(1, 0.5),
|
bbox_to_anchor=(0.9, 0.2),
|
||||||
ncol=(len(method_names)+1)//2)
|
ncol=1)
|
||||||
|
|
||||||
_save_or_show(savepath)
|
_save_or_show(savepath)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue