Compare commits

37 Commits

Author SHA1 Message Date
Alejandro Moreo Fernandez 0e81117cfb quick test kdex 2023-12-17 20:26:53 +01:00
Alejandro Moreo Fernandez 9bdc7676d6 first push to github 2023-12-17 20:14:38 +01:00
Alejandro Moreo Fernandez 59500a5a42 refactoring 2023-12-11 16:43:45 +01:00
Alejandro Moreo Fernandez a776816063 cleaning project and refactoring kde methods 2023-12-06 19:24:42 +01:00
Alejandro Moreo Fernandez 29e23b5bf6 cleaning gitginore 2023-12-06 17:05:12 +01:00
Alejandro Moreo Fernandez b823745dd1 cleaning project 2023-12-06 16:55:06 +01:00
Alejandro Moreo Fernandez b7b052ef7d checking stat significance 2023-11-28 10:20:54 +01:00
Alejandro Moreo Fernandez 01ef81bf25 sampling better the KDEy-HD approach 2023-11-23 11:30:45 +01:00
Alejandro Moreo Fernandez dcfd61ae5b launching new KDEy-HD with importance sampling 2023-11-22 12:27:15 +01:00
Alejandro Moreo Fernandez ac88dc8961 understanding importance sampling 2023-11-21 10:26:13 +01:00
Alejandro Moreo Fernandez 61399d66f6 scripts for understanding the divergence approximations 2023-11-20 15:28:56 +01:00
Alejandro Moreo Fernandez d060ffa591 Merge branch 'lab' of gitea-s2i2s.isti.cnr.it:moreo/QuaPy into lab 2023-11-09 18:15:33 +01:00
Alejandro Moreo Fernandez 949bd3cca7 gitignore update 2023-11-07 17:33:54 +01:00
Alejandro Moreo Fernandez c0f9a50a14 plotting simplex and 3d-histograms 2023-11-07 17:28:32 +01:00
Alejandro Moreo Fernandez 50d0ed2e84 switching 2023-11-03 09:54:36 +01:00
Alejandro Moreo Fernandez 0f4008e18d switching to devel 2023-10-30 09:41:52 +01:00
Alejandro Moreo Fernandez 3243fd90f8 running final experiments, one dedicated DM for each divergence with fine-grained exploration of nbins 2023-10-23 11:32:35 +02:00
Alejandro Moreo Fernandez f08885dca3 renaming dataset labels as numeric values w/o gaps 2023-10-18 11:42:52 +02:00
Alejandro Moreo Fernandez cfd5b00b97 renaming binary -> ucibinary 2023-10-18 10:55:56 +02:00
Alejandro Moreo Fernandez 53b052edc7 add uci multiclass datasets 2023-10-18 10:52:04 +02:00
Alejandro Moreo Fernandez 1c258a2000 adding multiclass experiments 2023-10-18 10:48:50 +02:00
Alejandro Moreo Fernandez dca955819c everything working; need to clean prints though 2023-10-17 18:11:25 +02:00
Alejandro Moreo Fernandez 7593fde2e0 KDE with closed form working 2023-10-13 17:34:26 +02:00
Alejandro Moreo Fernandez 32d6aa58f6 understanding montecarlo sampling 2023-10-02 17:50:12 +02:00
Alejandro Moreo Fernandez 56dbe744df cleaning and refactoring, also trying to repair the montecarlo approximation 2023-09-28 17:47:39 +02:00
Alejandro Moreo Fernandez 9221b3f775 binary experiments 2023-09-11 12:48:32 +02:00
Alejandro Moreo Fernandez 7e2237abe8 stability 2023-09-05 16:57:43 +02:00
Alejandro Moreo Fernandez 2d6ac4af0d testing the sensibility of KDEy to the bandwidth 2023-09-04 12:05:25 +02:00
Alejandro Moreo Fernandez b1715b685c Merge branch 'lab' of gitea-s2i2s.isti.cnr.it:moreo/QuaPy into lab 2023-08-01 19:04:36 +02:00
Alejandro Moreo Fernandez f72a612011 everything working, last results 2023-08-01 19:04:23 +02:00
Alejandro Moreo Fernandez fe819a7a3c plots 2023-07-31 09:03:41 +02:00
Alejandro Moreo Fernandez 96758834f3 kde working with kfcv 2023-07-24 16:28:21 +02:00
Alejandro Moreo Fernandez 0ce6106ee1 simplex plot 2023-07-21 12:56:01 +02:00
Alejandro Moreo Fernandez d995990fba initial experiments and DIR method 2023-07-21 11:41:16 +02:00
Alejandro Moreo Fernandez 26de9d92eb distribution matching multiclass 2023-07-20 09:03:22 +02:00
Alejandro Moreo Fernandez 4a7ffe5b50 testing kde-y 2023-05-22 17:34:00 +02:00
Alejandro Moreo Fernandez ca25f1d601 lab + some experimental methods based on distribution matching 2023-05-04 17:26:17 +02:00
50 changed files with 4273 additions and 46 deletions

.gitignore

@@ -27,6 +27,40 @@ share/python-wheels/
.installed.cfg
*.egg
MANIFEST
.idea
.vscode
LeQua2022
MultiLabel/results_generales
MultiLabel/mlqtables
NewMethods/plots*
NewMethods/results*
NewMethods/tables*
NewMethods/latex*
Ordinal/data*
Ordinal/roberta*
Ordinal/tables*
Ordinal/results*
eDiscovery/plots*
eDiscovery/results*
examples/results*
poster-cikm*
slides-cikm*
slides-short-cikm*
quick_experiment/figures*
quick_experiment/figures*
svm_perf_quantification/*
TweetSentQuant/plots*
TweetSentQuant/results*
TweetSentQuant/tables*
TweetSentQuant/Tweet Sentiment Quantification_NPP
TweetSentQuant/checkpoint
TweetSentQuant/*.tex
checkpoint
*.png
*.zip
*.pkl
*.pickle
*.pdf
# PyInstaller
# Usually these files are written by a python script from a template
@@ -130,3 +164,32 @@ dmypy.json
.pyre/
*__pycache__*
*.dataframe
*.svg
cross_lingual.py
# results
*.csv
# other projects
*.tex
*.md
*.doctree
*.rst
MultiLabel/
NewMethods/
Ordinal/
Retrieval/
quick_experiment/
Transduction/
Transduction_office/
TweetSentQuant/
laboratory/
docs/
dataset_drifts.py
notes.txt
eDiscovery/
examples/experiment_Dirichlet_calibration.py
examples/experiments_dirichlet_cal.txt

@@ -0,0 +1,124 @@
import numpy as np
import pandas as pd
from distribution_matching.method.kdex import KDExML
from distribution_matching.method.method_kdey import KDEy
from distribution_matching.method.method_kdey_closed_efficient_correct import KDEyclosed_efficient_corr
from distribution_matching.method.kdey import KDEyCS, KDEyHD, KDEyML
from quapy.method.aggregative import EMQ, CC, PCC, DistributionMatching, PACC, HDy, OneVsAllAggregative, ACC
from distribution_matching.method.dirichlety import DIRy
from sklearn.linear_model import LogisticRegression
# set to True to get the full list of methods tested in the paper (reported in the appendix)
# set to False to get the reduced list (shown in the body of the paper)
FULL_METHOD_LIST = False

if FULL_METHOD_LIST:
    ADJUSTMENT_METHODS = ['ACC', 'PACC']
    DISTR_MATCH_METHODS = ['HDy-OvA', 'DM-T', 'DM-HD', 'KDEy-HD', 'DM-CS', 'KDEy-CS']
    MAX_LIKE_METHODS = ['DIR', 'EMQ', 'EMQ-BCTS', 'KDEy-ML']
else:
    ADJUSTMENT_METHODS = ['PACC']
    DISTR_MATCH_METHODS = ['DM-T', 'DM-HD', 'KDEy-HD', 'DM-CS', 'KDEy-CS']
    MAX_LIKE_METHODS = ['EMQ', 'KDEy-ML']

# list of methods to consider
METHODS = ADJUSTMENT_METHODS + DISTR_MATCH_METHODS + MAX_LIKE_METHODS
BIN_METHODS = [x.replace('-OvA', '') for x in METHODS]

# common hyperparameters
hyper_LR = {
    'classifier__C': np.logspace(-3,3,7),
    'classifier__class_weight': ['balanced', None]
}

hyper_kde = {
    'bandwidth': np.linspace(0.01, 0.2, 20)
}

nbins_range = [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 64]
# instantiates a new quantifier based on a string name
def new_method(method, **lr_kwargs):

    lr = LogisticRegression(**lr_kwargs)

    if method == 'CC':
        param_grid = hyper_LR
        quantifier = CC(lr)
    elif method == 'PCC':
        param_grid = hyper_LR
        quantifier = PCC(lr)
    elif method == 'ACC':
        param_grid = hyper_LR
        quantifier = ACC(lr)
    elif method == 'PACC':
        param_grid = hyper_LR
        quantifier = PACC(lr)
    elif method in ['KDEy-HD']:
        param_grid = {**hyper_kde, **hyper_LR}
        quantifier = KDEyHD(lr)
    elif method == 'KDEy-CS':
        param_grid = {**hyper_kde, **hyper_LR}
        quantifier = KDEyCS(lr)
    elif method == 'KDEy-ML':
        param_grid = {**hyper_kde, **hyper_LR}
        quantifier = KDEyML(lr)
    elif method == 'KDEx-ML':
        param_grid = {
            'bandwidth': np.linspace(0.001, 2, 501)
        }
        quantifier = KDExML()
    elif method == 'DIR':
        param_grid = hyper_LR
        quantifier = DIRy(lr)
    elif method == 'EMQ':
        param_grid = hyper_LR
        quantifier = EMQ(lr)
    elif method == 'EMQ-BCTS':
        method_params = {'exact_train_prev': [False], 'recalib': ['bcts']}
        param_grid = {**method_params, **hyper_LR}
        quantifier = EMQ(lr)
    elif method == 'HDy':
        param_grid = hyper_LR
        quantifier = HDy(lr)
    elif method == 'HDy-OvA':
        param_grid = {'binary_quantifier__' + key: val for key, val in hyper_LR.items()}
        quantifier = OneVsAllAggregative(HDy(lr))
    elif method == 'DM-T':
        method_params = {
            'nbins': nbins_range,
            'val_split': [10],
            'divergence': ['topsoe']
        }
        param_grid = {**method_params, **hyper_LR}
        quantifier = DistributionMatching(lr)
    elif method == 'DM-HD':
        method_params = {
            'nbins': nbins_range,
            'val_split': [10],
            'divergence': ['HD']
        }
        param_grid = {**method_params, **hyper_LR}
        quantifier = DistributionMatching(lr)
    elif method == 'DM-CS':
        method_params = {
            'nbins': nbins_range,
            'val_split': [10],
            'divergence': ['CS']
        }
        param_grid = {**method_params, **hyper_LR}
        quantifier = DistributionMatching(lr)
    else:
        raise NotImplementedError('unknown method', method)

    return param_grid, quantifier


def show_results(result_path):
    df = pd.read_csv(result_path + '.csv', sep='\t')
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"])
    print(pv)
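Note (not part of the commit): a minimal sketch of how new_method plugs into quapy's model selection, simply mirroring the calls used in the experiment scripts further below; the choice of method ('KDEy-ML') and task ('T1B') is arbitrary and purely illustrative.

import quapy as qp
from quapy.model_selection import GridSearchQ

qp.environ['SAMPLE_SIZE'] = qp.datasets.LEQUA2022_SAMPLE_SIZE['T1B']

param_grid, quantifier = new_method('KDEy-ML')   # hyperparameter grid + untrained quantifier
train, val_gen, test_gen = qp.datasets.fetch_lequa2022('T1B')

modsel = GridSearchQ(quantifier, param_grid, protocol=val_gen, refit=False, error='mae')
modsel.fit(train)
quantifier = modsel.best_model()

report = qp.evaluation.evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae', 'mrae', 'kld'])
print(report.mean())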

@@ -0,0 +1,73 @@
import math
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.neighbors import KernelDensity
import matplotlib.pyplot as plt
import numpy as np
from data import LabelledCollection
scale = 100
import quapy as qp
data = qp.datasets.fetch_twitter('wb', min_df=3, pickle=True, for_model_selection=False)
X, y = data.training.Xy
cls = LogisticRegression(C=0.0001, random_state=0)
posteriors = cross_val_predict(cls, X=X, y=y, method='predict_proba', n_jobs=-1, cv=3)
cls.fit(X, y)
Xte, yte = data.test.Xy
post_c1 = posteriors[y==0]
post_c2 = posteriors[y==1]
post_c3 = posteriors[y==2]
print(len(post_c1))
print(len(post_c2))
print(len(post_c3))
post_test = cls.predict_proba(Xte)
alpha = qp.functional.prevalence_from_labels(yte, classes=[0, 1, 2])
nbins = 20
plt.rcParams.update({'font.size': 7})
fig = plt.figure()
positions = np.asarray([2,1,0])
colors = ['r', 'g', 'b']
for i, post_set in enumerate([post_c1, post_c2, post_c3, post_test]):
    ax = fig.add_subplot(141+i, projection='3d')
    for post, c, z in zip(post_set.T, colors, positions):
        hist, bins = np.histogram(post, bins=nbins, density=True, range=[0,1])
        xs = (bins[:-1] + bins[1:])/2
        ax.bar(xs, hist, width=1/nbins, zs=z, zdir='y', color=c, ec=c, alpha=0.6)
    ax.yaxis.set_ticks(positions)
    ax.yaxis.set_ticklabels(['$y=1$', '$y=2$', '$y=3$'])
    ax.xaxis.set_ticks([])
    ax.xaxis.set_ticklabels([], minor=True)
    ax.zaxis.set_ticks([])
    ax.zaxis.set_ticklabels([], minor=True)
#plt.figure(figsize=(10,6))
#plt.show()
plt.savefig('./histograms.pdf')

@@ -0,0 +1,29 @@
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
"""
This script generates plots of sensitivity to the number of classes.
Plots results for MAE, MRAE, and KLD
The hyperparameters were set as:
quantifier.set_params(classifier__C=0.01, classifier__class_weight='balanced', bandwidth=0.2)
"""
methods = ['DM', 'KDEy-ML', 'EMQ']
optim = 'mae'
dfs = [pd.read_csv(f'../results/lequa/nclasses/{optim}/{method}.csv', sep='\t') for method in methods]
df = pd.concat(dfs)
for err in ['MAE', 'MRAE']:
    piv = df.pivot_table(index='nClasses', columns='Method', values=err)
    g = sns.lineplot(data=piv, markers=True, dashes=False)
    g.set(xlim=(1, 28))
    g.legend(loc="center left", bbox_to_anchor=(1, 0.5))
    g.set_ylabel(err)
    g.set_xticks(np.linspace(1, 28, 28))
    plt.xticks(rotation=90)
    plt.grid()
    plt.savefig(f'./nclasses_{err}.pdf', bbox_inches='tight')
    plt.clf()

@@ -0,0 +1,50 @@
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
"""
This script generates plots of sensitivity to the hyperparameter "bandwidth".
Plots results for MAE, MRAE, and KLD
The rest of the hyperparameters were set to their default values
"""
log_mrae = True
for method, param, xlim, xticks in [
        ('KDEy-ML', 'Bandwidth', (0.01, 0.2), np.linspace(0.01, 0.2, 20)),
        ('DM-HD', 'nbins', (2,32), list(range(2,10)) + list(range(10,34,2)))
]:
    for dataset in ['tweet', 'lequa', 'uciml']:
        if dataset == 'tweet':
            df = pd.read_csv(f'../results/tweet/sensibility/{method}.csv', sep='\t')
            ylim = (0.03, 0.21)
        elif dataset == 'lequa':
            df = pd.read_csv(f'../results/lequa/T1B/sensibility/{method}.csv', sep='\t')
            ylim = (0.0125, 0.03)
        elif dataset == 'uciml':
            ylim = (0, 0.23)
            df = pd.read_csv(f'../results/ucimulti/sensibility/{method}.csv', sep='\t')

        for err in ['MAE']:  # , 'MRAE']:
            piv = df.pivot_table(index=param, columns='Dataset', values=err)
            g = sns.lineplot(data=piv, markers=True, dashes=False)
            g.set(xlim=xlim)
            g.legend(loc="center left", bbox_to_anchor=(1, 0.5))
            if log_mrae and err == 'MRAE':
                plt.yscale('log')
                g.set_ylabel('log(' + err + ')')
            else:
                g.set_ylabel(err)
            g.set_ylim(ylim)
            g.set_xticks(xticks)
            plt.xticks(rotation=90)
            plt.grid()
            plt.savefig(f'./sensibility_{method}_{dataset}_{err}.pdf', bbox_inches='tight')
            plt.clf()

@@ -0,0 +1,154 @@
import math
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KernelDensity
from data import LabelledCollection
scale = 10
# with ternary (a matplotlib-based lib) the plots come out fine, but I cannot create contours (or I don't know how)
# with plotly the contours come out fine, but it is a bit of a hassle because it uses the browser...
def plot_simplex_(ax, density, title='', fontsize=9, points=None):
import ternary
tax = ternary.TernaryAxesSubplot(ax=ax, scale=scale)
tax.heatmapf(density, boundary=True, style="triangular", colorbar=False, cmap='viridis') #cmap='magma')
tax.boundary(linewidth=1.0)
corner_fontsize = 5*fontsize//6
tax.right_corner_label("$y=3$", fontsize=corner_fontsize)
tax.top_corner_label("$y=2$", fontsize=corner_fontsize)
tax.left_corner_label("$y=1$", fontsize=corner_fontsize)
if title:
tax.set_title(title, loc='center', y=-0.11, fontsize=fontsize)
if points is not None:
tax.scatter(points*scale, marker='o', color='w', alpha=0.25, zorder=10)
tax.get_axes().axis('off')
tax.clear_matplotlib_ticks()
return tax
def plot_simplex(ax, coord, kde_scores, title='', fontsize=11, points=None, savepath=None):
import plotly.figure_factory as ff
tax = ff.create_ternary_contour(coord.T, kde_scores, pole_labels=['y=1', 'y=2', 'y=3'],
interp_mode='cartesian',
ncontours=20,
colorscale='Viridis',
showscale=True,
title=title)
if savepath is None:
tax.show()
else:
tax.write_image(savepath)
return tax
from mpl_toolkits.axes_grid1 import make_axes_locatable
def plot_3class_problem(post_c1, post_c2, post_c3, post_test, alpha, bandwidth):
import ternary
post_c1 = np.flip(post_c1, axis=1)
post_c2 = np.flip(post_c2, axis=1)
post_c3 = np.flip(post_c3, axis=1)
post_test = np.flip(post_test, axis=1)
size_=10
fig = ternary.plt.figure(figsize=(4*size_, 1*size_))
fig.tight_layout()
ax1 = fig.add_subplot(1, 4, 1)
divider = make_axes_locatable(ax1)
ax2 = fig.add_subplot(1, 4, 2)
divider = make_axes_locatable(ax2)
ax3 = fig.add_subplot(1, 4, 3)
divider = make_axes_locatable(ax3)
ax4 = fig.add_subplot(1, 4, 4)
divider = make_axes_locatable(ax4)
kde1 = KernelDensity(bandwidth=bandwidth).fit(post_c1)
kde2 = KernelDensity(bandwidth=bandwidth).fit(post_c2)
kde3 = KernelDensity(bandwidth=bandwidth).fit(post_c3)
#post_c1 = np.concatenate([post_c1, np.eye(3, dtype=float)])
#post_c2 = np.concatenate([post_c2, np.eye(3, dtype=float)])
#post_c3 = np.concatenate([post_c3, np.eye(3, dtype=float)])
#plot_simplex_(ax1, lambda x:0, title='$f_1(\mathbf{x})=p(s(\mathbf{x})|y=1)$')
#plot_simplex_(ax2, lambda x:0, title='$f_1(\mathbf{x})=p(s(\mathbf{x})|y=1)$')
#plot_simplex_(ax3, lambda x:0, title='$f_1(\mathbf{x})=p(s(\mathbf{x})|y=1)$')
def density(kde):
def d(p):
return np.exp(kde([p])).item()
return d
plot_simplex_(ax1, density(kde1.score_samples), title='$p_1$')
plot_simplex_(ax2, density(kde2.score_samples), title='$p_2$')
plot_simplex_(ax3, density(kde3.score_samples), title='$p_3$')
#plot_simplex(ax1, post_c1, np.exp(kde1.score_samples(post_c1)), title='$f_1(\mathbf{x})=p(s(\mathbf{x})|y=1)$') #, savepath='figure/y1.png')
#plot_simplex(ax2, post_c2, np.exp(kde2.score_samples(post_c2)), title='$f_2(\mathbf{x})=p(s(\mathbf{x})|y=2)$') #, savepath='figure/y2.png')
#plot_simplex(ax3, post_c3, np.exp(kde3.score_samples(post_c3)), title='$f_3(\mathbf{x})=p(s(\mathbf{x})|y=3)$') #, savepath='figure/y3.png')
def mixture_(prevs, kdes):
def m(p):
total_density = 0
for prev, kde in zip(prevs, kdes):
log_density = kde.score_samples([p]).item()
density = np.exp(log_density)
density *= prev
total_density += density
#print(total_density)
return total_density
return m
title = '$p_{\mathbf{\\alpha}} = \sum_{i \in n} \\alpha_i p_i$'
plot_simplex_(ax4, mixture_(alpha, [kde1, kde2, kde3]), title=title, points=post_test)
#mixture(alpha, [kde1, kde2, kde3])
#post_test = np.concatenate([post_test, np.eye(3, dtype=float)])
#test_scores = sum(alphai*np.exp(kdei.score_samples(post_test)) for alphai, kdei in zip(alpha, [kde1,kde2,kde3]))
#plot_simplex(ax4, post_test, test_scores, title=title, points=post_test)
#ternary.plt.show()
ternary.plt.savefig('./simplex.png')
import quapy as qp
data = qp.datasets.fetch_twitter('wb', min_df=3, pickle=True, for_model_selection=False)
X, y = data.training.Xy
cls = LogisticRegression(C=0.0001, random_state=0)
Xtr, Xte, ytr, yte = train_test_split(X, y, train_size=0.7, stratify=y, random_state=0)
cls.fit(Xtr, ytr)
test = LabelledCollection(Xte, yte)
test = test.sampling(100, *[0.2, 0.1, 0.7])
Xte, yte = test.Xy
post_c1 = cls.predict_proba(Xte[yte==0])
post_c2 = cls.predict_proba(Xte[yte==1])
post_c3 = cls.predict_proba(Xte[yte==2])
post_test = cls.predict_proba(Xte)
print(post_test)
alpha = qp.functional.prevalence_from_labels(yte, classes=[0, 1, 2])
#post_c1 = np.random.dirichlet([10,3,1], 30)
#post_c2 = np.random.dirichlet([1,11,6], 30)
#post_c3 = np.random.dirichlet([1,5,20], 30)
#post_test = np.random.dirichlet([5,1,6], 100)
#alpha = [0.5, 0.3, 0.2]
print(f'test alpha {alpha}')
plot_3class_problem(post_c1, post_c2, post_c3, post_test, alpha, bandwidth=0.1)

@@ -0,0 +1,129 @@
import ternary
import math
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.neighbors import KernelDensity
import plotly.figure_factory as ff
from data import LabelledCollection
scale = 100
# with ternary (a matplotlib-based lib) the plots come out fine, but I cannot create contours (or I don't know how)
# with plotly the contours come out fine, but it is a bit of a hassle because it uses the browser...
def plot_simplex_(ax, density, title='', fontsize=30, points=None):
tax = ternary.TernaryAxesSubplot(ax=ax, scale=scale)
tax.heatmapf(density, boundary=True, style="triangular", colorbar=False, cmap='viridis') #cmap='magma')
tax.boundary(linewidth=1.0)
corner_fontsize = int(5*fontsize//6)
tax.right_corner_label("$y=3$", fontsize=corner_fontsize)
tax.top_corner_label("$y=2$", fontsize=corner_fontsize)
tax.left_corner_label("$y=1$", fontsize=corner_fontsize)
if title:
tax.set_title(title, loc='center', y=-0.11, fontsize=fontsize)
if points is not None:
tax.scatter(points*scale, marker='o', color='w', alpha=0.25, zorder=10, s=5*scale)
tax.get_axes().axis('off')
tax.clear_matplotlib_ticks()
return tax
from mpl_toolkits.axes_grid1 import make_axes_locatable
def plot_3class_problem(post_c1, post_c2, post_c3, post_test, alpha, bandwidth):
post_c1 = np.flip(post_c1, axis=1)
post_c2 = np.flip(post_c2, axis=1)
post_c3 = np.flip(post_c3, axis=1)
post_test = np.flip(post_test, axis=1)
size_=10
fig = ternary.plt.figure(figsize=(5*size_, 1*size_))
fig.tight_layout()
ax1 = fig.add_subplot(1, 4, 1)
divider = make_axes_locatable(ax1)
ax2 = fig.add_subplot(1, 4, 2)
divider = make_axes_locatable(ax2)
ax3 = fig.add_subplot(1, 4, 3)
divider = make_axes_locatable(ax3)
ax4 = fig.add_subplot(1, 4, 4)
divider = make_axes_locatable(ax4)
kde1 = KernelDensity(bandwidth=bandwidth).fit(post_c1)
kde2 = KernelDensity(bandwidth=bandwidth).fit(post_c2)
kde3 = KernelDensity(bandwidth=bandwidth).fit(post_c3)
#post_c1 = np.concatenate([post_c1, np.eye(3, dtype=float)])
#post_c2 = np.concatenate([post_c2, np.eye(3, dtype=float)])
#post_c3 = np.concatenate([post_c3, np.eye(3, dtype=float)])
#plot_simplex_(ax1, lambda x:0, title='$f_1(\mathbf{x})=p(s(\mathbf{x})|y=1)$')
#plot_simplex_(ax2, lambda x:0, title='$f_1(\mathbf{x})=p(s(\mathbf{x})|y=1)$')
#plot_simplex_(ax3, lambda x:0, title='$f_1(\mathbf{x})=p(s(\mathbf{x})|y=1)$')
def density(kde):
def d(p):
return np.exp(kde([p])).item()
return d
plot_simplex_(ax1, density(kde1.score_samples), title='$p_1$')
plot_simplex_(ax2, density(kde2.score_samples), title='$p_2$')
plot_simplex_(ax3, density(kde3.score_samples), title='$p_3$')
#plot_simplex(ax1, post_c1, np.exp(kde1.score_samples(post_c1)), title='$f_1(\mathbf{x})=p(s(\mathbf{x})|y=1)$') #, savepath='figure/y1.png')
#plot_simplex(ax2, post_c2, np.exp(kde2.score_samples(post_c2)), title='$f_2(\mathbf{x})=p(s(\mathbf{x})|y=2)$') #, savepath='figure/y2.png')
#plot_simplex(ax3, post_c3, np.exp(kde3.score_samples(post_c3)), title='$f_3(\mathbf{x})=p(s(\mathbf{x})|y=3)$') #, savepath='figure/y3.png')
def mixture_(prevs, kdes):
def m(p):
total_density = 0
for prev, kde in zip(prevs, kdes):
log_density = kde.score_samples([p]).item()
density = np.exp(log_density)
density *= prev
total_density += density
#print(total_density)
return total_density
return m
title = '$\mathbf{p}_{\mathbf{\\alpha}} = \sum_{i \in n} \\alpha_i p_i$'
plot_simplex_(ax4, mixture_(alpha, [kde1, kde2, kde3]), title=title, points=post_test)
#ternary.plt.show()
ternary.plt.savefig('./simplex.pdf')
import quapy as qp
data = qp.datasets.fetch_twitter('wb', min_df=3, pickle=True, for_model_selection=False)
Xtr, ytr = data.training.Xy
Xte, yte = data.test.sampling(150, *[0.5, 0.1, 0.4]).Xy
cls = LogisticRegression(C=0.0001, random_state=0)
draw_from_training = False
if draw_from_training:
post_tr = cross_val_predict(cls, Xtr, ytr, n_jobs=-1, method='predict_proba')
post_c1 = post_tr[ytr==0]
post_c2 = post_tr[ytr==1]
post_c3 = post_tr[ytr==2]
cls.fit(Xtr, ytr)
else:
cls.fit(Xtr, ytr)
post_te = cls.predict_proba(Xte)
post_c1 = post_te[yte == 0]
post_c2 = post_te[yte == 1]
post_c3 = post_te[yte == 2]
post_test = cls.predict_proba(Xte)
alpha = qp.functional.prevalence_from_labels(yte, classes=[0, 1, 2])
print(f'test alpha {alpha}')
plot_3class_problem(post_c1, post_c2, post_c3, post_test, alpha, bandwidth=0.1)

@@ -0,0 +1,65 @@
import pickle
import numpy as np
import os
import pandas as pd
from distribution_matching.commons import METHODS, BIN_METHODS, new_method, show_results
import quapy as qp
from quapy.model_selection import GridSearchQ
if __name__ == '__main__':
for task in ['T1A', 'T1B']:
qp.environ['SAMPLE_SIZE'] = qp.datasets.LEQUA2022_SAMPLE_SIZE[task]
qp.environ['N_JOBS'] = -1
for optim in ['mae', 'mrae']:
result_dir = f'results/lequa/{task}/{optim}'
os.makedirs(result_dir, exist_ok=True)
for method in (METHODS if task=='T1B' else BIN_METHODS):
print('Init method', method)
result_path = f'{result_dir}/{method}'
if os.path.exists(result_path+'.csv'):
print(f'file {result_path}.csv already exists; skipping')
continue
with open(result_path+'.csv', 'wt') as csv:
csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')
dataset = task
train, val_gen, test_gen = qp.datasets.fetch_lequa2022(dataset)
print(f'init {dataset} #instances: {len(train)}')
param_grid, quantifier = new_method(method)
if param_grid is not None:
modsel = GridSearchQ(quantifier, param_grid, protocol=val_gen, refit=False, n_jobs=-1, verbose=1, error=optim)
modsel.fit(train)
print(f'best params {modsel.best_params_}')
print(f'best score {modsel.best_score_}')
pickle.dump(
(modsel.best_params_, modsel.best_score_,),
open(f'{result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
quantifier = modsel.best_model()
else:
print('debug mode... skipping model selection')
quantifier.fit(train)
report = qp.evaluation.evaluation_report(
quantifier, protocol=test_gen, error_metrics=['mae', 'mrae', 'kld'],
verbose=True, verbose_error=optim[1:], n_jobs=-1
)
means = report.mean()
report.to_csv(result_path+'.dataframe')
csv.write(f'{method}\tLeQua-{task}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
csv.flush()
print(means)
show_results(result_path)

@@ -0,0 +1,56 @@
import numpy as np
from sklearn.linear_model import LogisticRegression
import os
import quapy as qp
from distribution_matching.commons import show_results
from distribution_matching.method.method_kdey import KDEy
from quapy.method.aggregative import DistributionMatching
SEED=1
def task(val):
print('job-init', val)
train, val_gen, test_gen = qp.datasets.fetch_lequa2022('T1B')
with qp.util.temp_seed(SEED):
if method=='KDEy-ML':
quantifier = KDEy(LogisticRegression(), target='max_likelihood', val_split=10, bandwidth=val)
elif method == 'DM-HD':
quantifier = DistributionMatching(LogisticRegression(), val_split=10, nbins=val, divergence='HD')
quantifier.fit(train)
report = qp.evaluation.evaluation_report(
quantifier, protocol=test_gen, error_metrics=['mae', 'mrae', 'kld'], verbose=True)
return report
if __name__ == '__main__':
qp.environ['SAMPLE_SIZE'] = qp.datasets.LEQUA2022_SAMPLE_SIZE['T1B']
qp.environ['N_JOBS'] = -1
result_dir = f'results/lequa/T1B/sensibility'
os.makedirs(result_dir, exist_ok=True)
for method, param, grid in [
('KDEy-ML', 'Bandwidth', np.linspace(0.01, 0.2, 20)),
('DM-HD', 'nbins', list(range(2, 10)) + list(range(10, 34, 2)))
]:
global_result_path = f'{result_dir}/{method}'
if not os.path.exists(global_result_path+'.csv'):
with open(global_result_path+'.csv', 'wt') as csv:
csv.write(f'Method\tDataset\t{param}\tMAE\tMRAE\tKLD\n')
reports = qp.util.parallel(task, grid, n_jobs=-1)
with open(global_result_path + '.csv', 'at') as csv:
for val, report in zip(grid, reports):
means = report.mean()
local_result_path = global_result_path + '_T1B' + (f'_{val:.3f}' if isinstance(val, float) else f'{val}')
report.to_csv(f'{local_result_path}.dataframe')
csv.write(f'{method}\tLeQua-T1B\t{val}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
csv.flush()
show_results(global_result_path)

@@ -0,0 +1,105 @@
import os
import sys
from typing import Union, Callable
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
import quapy as qp
from quapy.data import LabelledCollection
from quapy.protocol import APP, UPP
from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \
DistributionMatching, _get_divergence
import scipy
from scipy import optimize
from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional
import dirichlet
class DIRy(AggregativeProbabilisticQuantifier):
MAXITER = 100000
def __init__(self, classifier: BaseEstimator, val_split=0.4, n_jobs=None, target='max_likelihood'):
self.classifier = classifier
self.val_split = val_split
self.n_jobs = n_jobs
self.target = target
def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
if val_split is None:
val_split = self.val_split
self.classifier, y, posteriors, _, _ = cross_generate_predictions(
data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
)
self.val_parameters = []
for cat in range(data.n_classes):
dir_i = dirichlet.mle(posteriors[y == cat], maxiter=DIRy.MAXITER)
self.val_parameters.append(dir_i)
# print(cat)
# self.val_parameters = [dirichlet.mle(posteriors[y == cat], maxiter=DIRy.MAXITER) for cat in range(data.n_classes)]
return self
def val_pdf(self, prev):
"""
Returns a function that computes the mixture model with the given prev as mixture factor
:param prev: a prevalence vector, ndarray
:return: a function implementing the validation distribution with fixed mixture factor
"""
return lambda posteriors: sum(prev_i * dirichlet.pdf(parameters_i)(posteriors) for parameters_i, prev_i in zip(self.val_parameters, prev))
def aggregate(self, posteriors: np.ndarray):
if self.target == 'min_divergence':
raise NotImplementedError('not yet')
return self._target_divergence(posteriors)
elif self.target == 'max_likelihood':
return self._target_likelihood(posteriors)
else:
raise ValueError('unknown target')
def _target_divergence(self, posteriors):
test_density = self.get_kde(posteriors)
# val_test_posteriors = np.concatenate([self.val_posteriors, posteriors])
test_likelihood = self.pdf(test_density, posteriors)
divergence = _get_divergence(self.divergence)
n_classes = len(self.val_densities)
def match(prev):
val_pdf = self.val_pdf(prev)
val_likelihood = val_pdf(posteriors)
return divergence(val_likelihood, test_likelihood)
# the initial point is set as the uniform distribution
uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
# solutions are bounded to those contained in the unit-simplex
bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1]
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
return r.x
def _target_likelihood(self, posteriors, eps=0.000001):
n_classes = len(self.val_parameters)
def neg_loglikelihood(prev):
val_pdf = self.val_pdf(prev)
test_likelihood = val_pdf(posteriors)
test_loglikelihood = np.log(test_likelihood + eps)
return -np.sum(test_loglikelihood)
# the initial point is set as the uniform distribution
uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
# solutions are bounded to those contained in the unit-simplex
bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1]
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
r = optimize.minimize(neg_loglikelihood, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
return r.x
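Note (not part of the commit): as a reading aid, the objective minimized by _target_likelihood above is the negative log-likelihood of the test posteriors under the alpha-mixture of the class-wise Dirichlet densities fitted in fit; roughly,

$$ \hat{\boldsymbol\alpha} \;=\; \arg\min_{\boldsymbol\alpha \in \Delta^{n-1}} \; -\sum_{\mathbf{s} \in \text{sample}} \log\Big( \sum_{i=1}^{n} \alpha_i \, \mathrm{Dir}(\mathbf{s} \mid \hat{\boldsymbol\beta}_i) + \epsilon \Big), $$

where $\mathrm{Dir}(\cdot \mid \hat{\boldsymbol\beta}_i)$ is the Dirichlet fitted via dirichlet.mle to the validation posteriors of class $i$, $\epsilon$ is the smoothing constant eps, and the simplex constraint $\Delta^{n-1}$ is enforced through the SLSQP bounds and the equality constraint.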

@@ -0,0 +1,50 @@
from quapy.method.base import BaseQuantifier
import numpy as np
from distribution_matching.method.kdey import KDEBase
import quapy as qp
from quapy.data import LabelledCollection
import quapy.functional as F
from sklearn.preprocessing import StandardScaler


class KDExML(BaseQuantifier, KDEBase):

    def __init__(self, bandwidth=0.1, standardize=True):
        self._check_bandwidth(bandwidth)
        self.bandwidth = bandwidth
        self.standardize = standardize

    def fit(self, data: LabelledCollection):
        X, y = data.Xy
        if self.standardize:
            self.scaler = StandardScaler()
            X = self.scaler.fit_transform(X)
        self.mix_densities = self.get_mixture_components(X, y, data.n_classes, self.bandwidth)
        return self

    def quantify(self, X):
        """
        Searches for the mixture model parameter (the sought prevalence values) that maximizes the likelihood
        of the data (i.e., that minimizes the negative log-likelihood)
        :param X: instances in the sample
        :return: a vector of class prevalence estimates
        """
        epsilon = 1e-10
        n_classes = len(self.mix_densities)
        if self.standardize:
            X = self.scaler.transform(X)
        test_densities = [self.pdf(kde_i, X) for kde_i in self.mix_densities]

        def neg_loglikelihood(prev):
            test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
            test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
            return -np.sum(test_loglikelihood)

        return F.optim_minimize(neg_loglikelihood, n_classes)
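Note (not part of the commit): a minimal usage sketch of KDExML on synthetic one-dimensional data; the data, the bandwidth value and the expected output are illustrative assumptions, and LabelledCollection is used exactly as in the scripts above.

import numpy as np
from quapy.data import LabelledCollection

rng = np.random.RandomState(0)
# two classes with shifted Gaussian features; the test set has true prevalence [0.3, 0.7]
Xtr = np.vstack([rng.normal(0, 1, size=(500, 1)), rng.normal(2, 1, size=(500, 1))])
ytr = np.repeat([0, 1], 500)
Xte = np.vstack([rng.normal(0, 1, size=(300, 1)), rng.normal(2, 1, size=(700, 1))])

kdex = KDExML(bandwidth=0.1)
kdex.fit(LabelledCollection(Xtr, ytr))
print(kdex.quantify(Xte))  # prevalence estimate, expected to be close to [0.3, 0.7]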

@@ -0,0 +1,234 @@
from typing import Union
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.neighbors import KernelDensity
import quapy as qp
from quapy.data import LabelledCollection
from quapy.method.aggregative import AggregativeProbabilisticQuantifier, cross_generate_predictions
import quapy.functional as F
from sklearn.metrics.pairwise import rbf_kernel
class KDEBase:
BANDWIDTH_METHOD = ['scott', 'silverman']
@classmethod
def _check_bandwidth(cls, bandwidth):
assert bandwidth in KDEBase.BANDWIDTH_METHOD or isinstance(bandwidth, float), \
f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values'
if isinstance(bandwidth, float):
assert 0 < bandwidth < 1, "the bandwidth for KDEy should be in (0,1), since this method models the unit simplex"
def get_kde_function(self, X, bandwidth):
return KernelDensity(bandwidth=bandwidth).fit(X)
def pdf(self, kde, X):
return np.exp(kde.score_samples(X))
def get_mixture_components(self, X, y, n_classes, bandwidth):
return [self.get_kde_function(X[y == cat], bandwidth) for cat in range(n_classes)]
class KDEyML(AggregativeProbabilisticQuantifier, KDEBase):
def __init__(self, classifier: BaseEstimator, val_split=10, bandwidth=0.1, n_jobs=None, random_state=0):
self._check_bandwidth(bandwidth)
self.classifier = classifier
self.val_split = val_split
self.bandwidth = bandwidth
self.n_jobs = n_jobs
self.random_state=random_state
def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
if val_split is None:
val_split = self.val_split
self.classifier, y, posteriors, _, _ = cross_generate_predictions(
data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
)
self.mix_densities = self.get_mixture_components(posteriors, y, data.n_classes, self.bandwidth)
return self
def aggregate(self, posteriors: np.ndarray):
"""
Searches for the mixture model parameter (the sought prevalence values) that maximizes the likelihood
of the data (i.e., that minimizes the negative log-likelihood)
:param posteriors: instances in the sample converted into posterior probabilities
:return: a vector of class prevalence estimates
"""
np.random.RandomState(self.random_state)
epsilon = 1e-10
n_classes = len(self.mix_densities)
test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities]
def neg_loglikelihood(prev):
test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities))
test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
return -np.sum(test_loglikelihood)
return F.optim_minimize(neg_loglikelihood, n_classes)
class KDEyHD(AggregativeProbabilisticQuantifier, KDEBase):
def __init__(self, classifier: BaseEstimator, val_split=10, divergence: str='HD',
bandwidth=0.1, n_jobs=None, random_state=0, montecarlo_trials=10000):
self._check_bandwidth(bandwidth)
self.classifier = classifier
self.val_split = val_split
self.divergence = divergence
self.bandwidth = bandwidth
self.n_jobs = n_jobs
self.random_state=random_state
self.montecarlo_trials = montecarlo_trials
def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
if val_split is None:
val_split = self.val_split
self.classifier, y, posteriors, _, _ = cross_generate_predictions(
data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
)
self.mix_densities = self.get_mixture_components(posteriors, y, data.n_classes, self.bandwidth)
N = self.montecarlo_trials
rs = self.random_state
n = data.n_classes
self.reference_samples = np.vstack([kde_i.sample(N//n, random_state=rs) for kde_i in self.mix_densities])
self.reference_classwise_densities = np.asarray([self.pdf(kde_j, self.reference_samples) for kde_j in self.mix_densities])
self.reference_density = np.mean(self.reference_classwise_densities, axis=0) # equiv. to (uniform @ self.reference_classwise_densities)
return self
def aggregate(self, posteriors: np.ndarray):
# we retain all n*N examples (sampled from a mixture with uniform parameter), and then
# apply importance sampling (IS). In this version we compute D(p_alpha||q) with IS
n_classes = len(self.mix_densities)
test_kde = self.get_kde_function(posteriors, self.bandwidth)
test_densities = self.pdf(test_kde, self.reference_samples)
def f_squared_hellinger(u):
return (np.sqrt(u)-1)**2
# todo: this will fail when self.divergence is a callable, and is not the right place to do it anyway
if self.divergence.lower() == 'hd':
f = f_squared_hellinger
else:
raise ValueError('only squared HD is currently implemented')
epsilon = 1e-10
qs = test_densities + epsilon
rs = self.reference_density + epsilon
iw = qs/rs #importance weights
p_class = self.reference_classwise_densities + epsilon
fracs = p_class/qs
def divergence(prev):
# ps / qs = (prev @ p_class) / qs = prev @ (p_class / qs) = prev @ fracs
ps_div_qs = prev @ fracs
return np.mean( f(ps_div_qs) * iw )
return F.optim_minimize(divergence, n_classes)
class KDEyCS(AggregativeProbabilisticQuantifier):
def __init__(self, classifier: BaseEstimator, val_split=10, bandwidth=0.1, n_jobs=None, random_state=0):
KDEBase._check_bandwidth(bandwidth)
self.classifier = classifier
self.val_split = val_split
self.bandwidth = bandwidth
self.n_jobs = n_jobs
self.random_state=random_state
def gram_matrix_mix_sum(self, X, Y=None):
# this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
# to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y, where Sigma1 and Sigma2 are
# two "scalar matrices" (h^2)*I each, so Sigma1+Sigma2 has scalar 2(h^2) (h is the bandwidth)
h = self.bandwidth
variance = 2 * (h**2)
nD = X.shape[1]
gamma = 1/(2*variance)
norm_factor = 1/np.sqrt(((2*np.pi)**nD) * (variance**(nD)))
gram = norm_factor * rbf_kernel(X, Y, gamma=gamma)
return gram.sum()
def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
if val_split is None:
val_split = self.val_split
self.classifier, y, posteriors, _, _ = cross_generate_predictions(
data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
)
assert all(sorted(np.unique(y)) == np.arange(data.n_classes)), \
'label name gaps not allowed in current implementation'
n = data.n_classes
P = posteriors
# counts_inv keeps track of the relative weight of each datapoint within its class
# (i.e., the weight in its KDE model)
counts_inv = 1 / (data.counts())
# tr_tr_sums corresponds to symbol \overline{B} in the paper
tr_tr_sums = np.zeros(shape=(n,n), dtype=float)
for i in range(n):
for j in range(n):
if i > j:
tr_tr_sums[i,j] = tr_tr_sums[j,i]
else:
block = self.gram_matrix_mix_sum(P[y == i], P[y == j] if i!=j else None)
tr_tr_sums[i, j] = block
# keep track of these data structures for the test phase
self.Ptr = P
self.ytr = y
self.tr_tr_sums = tr_tr_sums
self.counts_inv = counts_inv
return self
def aggregate(self, posteriors: np.ndarray):
Ptr = self.Ptr
Pte = posteriors
y = self.ytr
tr_tr_sums = self.tr_tr_sums
M, nD = Pte.shape
Minv = (1/M) # t in the paper
n = Ptr.shape[1]
# becomes a constant that does not affect the optimization, no need to compute it
# partC = 0.5*np.log(self.gram_matrix_mix_sum(Pte) * Kinv * Kinv)
# tr_te_sums corresponds to \overline{a}*(1/Li)*(1/M) in the paper (note the constants
# are already aggregated to tr_te_sums, so these multiplications are not carried out
# at each iteration of the optimization phase)
tr_te_sums = np.zeros(shape=n, dtype=float)
for i in range(n):
tr_te_sums[i] = self.gram_matrix_mix_sum(Ptr[y==i], Pte)
def divergence(alpha):
# called \overline{r} in the paper
alpha_ratio = alpha * self.counts_inv
# recall that tr_te_sums already accounts for the constant terms (1/Li)*(1/M)
partA = -np.log((alpha_ratio @ tr_te_sums) * Minv)
partB = 0.5 * np.log(alpha_ratio @ tr_tr_sums @ alpha_ratio)
return partA + partB #+ partC
return F.optim_minimize(divergence, n)
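Note (not part of the commit): as a reading aid, the quantity minimized in KDEyCS.aggregate is the Cauchy-Schwarz divergence between the alpha-mixture of training class KDEs, $p_\alpha$, and the test KDE $q$,

$$ D_{CS}(p_\alpha \,\|\, q) \;=\; -\log \int p_\alpha(x)\, q(x)\, dx \;+\; \tfrac{1}{2}\log \int p_\alpha(x)^2\, dx \;+\; \tfrac{1}{2}\log \int q(x)^2\, dx, $$

whose three terms correspond to partA, partB and partC in the code: the cross term is built from tr_te_sums, the quadratic term is the form alpha_ratio @ tr_tr_sums @ alpha_ratio (with tr_tr_sums being the $\overline{B}$ matrix and alpha_ratio the $\overline{r}$ vector mentioned in the comments), and the last term does not depend on alpha and is therefore dropped from the optimization (the commented-out partC).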

@@ -0,0 +1,321 @@
from cgi import test
import os
import sys
from typing import Union, Callable
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
import quapy as qp
from quapy.data import LabelledCollection
from quapy.protocol import APP, UPP
from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \
DistributionMatching, _get_divergence
import quapy.functional as F
import scipy
from scipy import optimize
#from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional
# TODO: optimize the bandwidth automatically
# TODO: think of a MMD-y variant, i.e., a MMD variant that uses the points in the simplex and possibly any non-linear kernel
class KDEy(AggregativeProbabilisticQuantifier):
BANDWIDTH_METHOD = ['auto', 'scott', 'silverman']
ENGINE = ['scipy', 'sklearn', 'statsmodels']
TARGET = ['min_divergence', 'min_divergence_uniform', 'max_likelihood']
def __init__(self, classifier: BaseEstimator, val_split=0.4, divergence: Union[str, Callable]='L2',
bandwidth='scott', engine='sklearn', target='min_divergence', n_jobs=None, random_state=0, montecarlo_trials=1000):
assert bandwidth in KDEy.BANDWIDTH_METHOD or isinstance(bandwidth, float), \
f'unknown bandwidth_method, valid ones are {KDEy.BANDWIDTH_METHOD}'
assert engine in KDEy.ENGINE, f'unknown engine, valid ones are {KDEy.ENGINE}'
assert target in KDEy.TARGET, f'unknown target, valid ones are {KDEy.TARGET}'
assert target=='max_likelihood' or divergence in ['KLD', 'HD', 'JS'], \
'in this version I will only allow KLD or squared HD as a divergence'
self.classifier = classifier
self.val_split = val_split
self.divergence = divergence
self.bandwidth = bandwidth
self.engine = engine
self.target = target
self.n_jobs = n_jobs
self.random_state=random_state
self.montecarlo_trials = montecarlo_trials
def search_bandwidth_maxlikelihood(self, posteriors, labels):
grid = {'bandwidth': np.linspace(0.001, 0.2, 100)}
search = GridSearchCV(
KernelDensity(), param_grid=grid, n_jobs=-1, cv=50, verbose=1, refit=True
)
search.fit(posteriors, labels)
bandwidth = search.best_params_['bandwidth']
print(f'auto: bandwidth={bandwidth:.5f}')
return bandwidth
def get_kde_function(self, posteriors):
# if self.bandwidth == 'auto':
# print('adjusting bandwidth')
#
# if self.engine == 'sklearn':
# grid = {'bandwidth': np.linspace(0.001,0.2,41)}
# search = GridSearchCV(
# KernelDensity(), param_grid=grid, n_jobs=-1, cv=10, verbose=1, refit=True
# )
# search.fit(posteriors)
# print(search.best_score_)
# print(search.best_params_)
#
# import pandas as pd
# df = pd.DataFrame(search.cv_results_)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
# pd.set_option('expand_frame_repr', False)
# print(df)
# sys.exit(0)
#
# kde = search
#else:
if self.engine == 'scipy':
# scipy treats columns as datapoints, and needs the datapoints not to lie in a lower-dimensional subspace, which
# requires removing the last dimension (which is constrained)
posteriors = posteriors[:,:-1].T
kde = scipy.stats.gaussian_kde(posteriors)
kde.set_bandwidth(self.bandwidth)
elif self.engine == 'sklearn':
#print('fitting kde')
kde = KernelDensity(bandwidth=self.bandwidth).fit(posteriors)
#print('[fitting done]')
return kde
def pdf(self, kde, posteriors):
if self.engine == 'scipy':
return kde(posteriors[:, :-1].T)
elif self.engine == 'sklearn':
return np.exp(kde.score_samples(posteriors))
def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
"""
:param data: the training set
:param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
:param val_split: either a float in (0,1) indicating the proportion of training instances to use for
validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
to estimate the parameters
"""
if val_split is None:
val_split = self.val_split
self.classifier, y, posteriors, classes, class_count = cross_generate_predictions(
data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
)
if self.bandwidth == 'auto':
self.bandwidth = self.search_bandwidth_maxlikelihood(posteriors, y)
self.val_densities = [self.get_kde_function(posteriors[y == cat]) for cat in range(data.n_classes)]
self.val_posteriors = posteriors
if self.target == 'min_divergence_uniform':
self.samples = qp.functional.uniform_prevalence_sampling(n_classes=data.n_classes, size=self.montecarlo_trials)
self.sample_densities = [self.pdf(kde_i, self.samples) for kde_i in self.val_densities]
elif self.target == 'min_divergence':
N = self.montecarlo_trials
rs = self.random_state
n = data.n_classes
self.reference_samples = np.vstack([kde_i.sample(N//n, random_state=rs) for kde_i in self.val_densities])
self.reference_classwise_densities = np.asarray([self.pdf(kde_j, self.reference_samples) for kde_j in self.val_densities])
self.reference_density = np.mean(self.reference_classwise_densities, axis=0) # equiv. to (uniform @ self.reference_classwise_densities)
elif self.target == 'min_divergence_deprecated': # the version of the first draft, with n*N presampled, then alpha*N chosen for class
self.class_samples = [kde_i.sample(self.montecarlo_trials, random_state=self.random_state) for kde_i in self.val_densities]
self.class_sample_densities = {}
for ci, samples_i in enumerate(self.class_samples):
self.class_sample_densities[ci] = np.asarray([self.pdf(kde_j, samples_i) for kde_j in self.val_densities]).T
return self
def aggregate(self, posteriors: np.ndarray):
if self.target == 'min_divergence':
return self._target_divergence(posteriors)
elif self.target == 'min_divergence_uniform':
return self._target_divergence_uniform(posteriors)
elif self.target == 'max_likelihood':
return self._target_likelihood(posteriors)
else:
raise ValueError('unknown target')
# new version in which we retain all n*N examples (sampled from a mixture with uniform parameter), and then
# apply importance sampling (IS). In this version we compute D(p_alpha||q) with IS, and not D(q||p_alpha) as
# in the first draft
def _target_divergence(self, posteriors):
# in this variant we evaluate the divergence using a Montecarlo approach
n_classes = len(self.val_densities)
test_kde = self.get_kde_function(posteriors)
test_densities = self.pdf(test_kde, self.reference_samples)
def f_squared_hellinger(u):
return (np.sqrt(u)-1)**2
# todo: this will fail when self.divergence is a callable, and is not the right place to do it anyway
if self.divergence.lower() == 'hd':
f = f_squared_hellinger
else:
raise ValueError('only squared HD is currently implemented')
epsilon = 1e-10
qs = test_densities + epsilon
rs = self.reference_density + epsilon
iw = qs/rs #importance weights
p_class = self.reference_classwise_densities + epsilon
fracs = p_class/qs
def divergence(prev):
# ps / qs = (prev @ p_class) / qs = prev @ (p_class / qs) = prev @ fracs
ps_div_qs = prev @ fracs
return np.mean( f(ps_div_qs) * iw )
return F.optim_minimize(divergence, n_classes)
# new version in which we retain all n*N examples (sampled from a mixture with uniform parameter), and then
# apply importance sampling (IS). In this version we compute D(q||p_alpha) with IS, and not D(p_alpha||q) as
# in the reformulation proposed above
def _target_divergence_q__p(self, posteriors):
# in this variant we evaluate the divergence using a Montecarlo approach
n_classes = len(self.val_densities)
test_kde = self.get_kde_function(posteriors)
test_densities = self.pdf(test_kde, self.reference_samples)
def f_squared_hellinger(u):
return (np.sqrt(u)-1)**2
# todo: this will fail when self.divergence is a callable, and is not the right place to do it anyway
if self.divergence.lower() == 'hd':
f = f_squared_hellinger
else:
raise ValueError('only squared HD is currently implemented')
epsilon = 1e-10
qs = test_densities + epsilon
rs = self.reference_density + epsilon
p_class = self.reference_classwise_densities + epsilon
# D(q||p_a) = 1/N sum f(q/p_a) * (p_a / p_u)
def divergence(prev):
p_a = prev @ p_class
return np.mean( f(qs/p_a) * (p_a/rs) )
return F.optim_minimize(divergence, n_classes)
# the first version we explain in the draft, choosing alpha*N from a pool of N for each class and w/o importance sampling
def _target_divergence_deprecated(self, posteriors):
# in this variant we evaluate the divergence using a Montecarlo approach
n_classes = len(self.val_densities)
test_kde = self.get_kde_function(posteriors)
test_densities_per_class = [self.pdf(test_kde, samples_i) for samples_i in self.class_samples]
# divergence = _get_divergence(self.divergence)
def kld_monte(pi, qi, eps=1e-8):
# there is no pi in front of the log because the samples are already drawn according to pi
smooth_pi = pi+eps
smooth_qi = qi+eps
return np.mean(np.log(smooth_pi / smooth_qi))
def squared_hellinger(pi, qi, eps=1e-8):
smooth_pi = pi + eps
smooth_qi = qi + eps
return np.mean((np.sqrt(smooth_pi/smooth_qi)-1)**2)
# todo: this will fail when self.divergence is a callable, and is not the right place to do it anyway
if self.divergence.lower() == 'kld':
fdivergence = kld_monte
elif self.divergence.lower() == 'hd':
fdivergence = squared_hellinger
def dissimilarity(prev):
# choose the samples according to the prevalence vector
# e.g., prev = [0.5, 0.3, 0.2] will draw 50% from KDE_0, 30% from KDE_1, and 20% from KDE_2
# the points are already pre-sampled and the densities are pre-computed, so that now all that remains
# is to pick a proportional number of each from each class (same for test)
num_variates_per_class = np.round(prev * self.montecarlo_trials).astype(int)
sample_densities = np.vstack(
[self.class_sample_densities[ci][:num_i] for ci, num_i in enumerate(num_variates_per_class)]
)
#val_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, sample_densities.T))
val_likelihood = prev @ sample_densities.T
#test_likelihood = []
#for samples_i, num_i in zip(test_densities_per_class, num_variates_per_class):
# test_likelihood.append(samples_i[:num_i])
#test_likelihood = np.concatenate[test]
test_likelihood = np.concatenate(
[samples_i[:num_i] for samples_i, num_i in zip(test_densities_per_class, num_variates_per_class)]
)
# return fdivergence(val_likelihood, test_likelihood) # this is wrong, If I sample from the val distribution
# then I am computing D(Test||Val), so it should be E_{x ~ Val}[f(Test(x)/Val(x))]
return fdivergence(test_likelihood, val_likelihood)
return F.optim_minimize(dissimilarity, n_classes)
# this version explores the entire simplex and then applies importance sampling. We have not really tested it in depth, but
# it does not seem promising
def _target_divergence_uniform(self, posteriors):
# in this variant we evaluate the divergence using a Montecarlo approach
n_classes = len(self.val_densities)
test_kde = self.get_kde_function(posteriors)
test_likelihood = self.pdf(test_kde, self.samples)
def f_squared_hellinger(t):
return (np.sqrt(t) - 1)**2
def f_jensen_shannon(t):
return -(t+1)*np.log((t+1)/2) + t*np.log(t)
def fdivergence(pi, qi, f, eps=1e-10):
spi = pi+eps
sqi = qi+eps
return np.mean(f(spi/sqi)*sqi)
if self.divergence.lower() == 'hd':
f = f_squared_hellinger
elif self.divergence.lower() == 'js':
f = f_jensen_shannon
def dissimilarity(prev):
val_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, self.sample_densities))
return fdivergence(val_likelihood, test_likelihood, f)
return F.optim_minimize(dissimilarity, n_classes)
def _target_likelihood(self, posteriors, eps=0.000001):
"""
Searches for the mixture model parameter (the sought prevalence values) that maximizes the likelihood
of the data (i.e., that minimizes the negative log-likelihood)
:param posteriors: instances in the sample converted into posterior probabilities
:return: a vector of class prevalence estimates
"""
np.random.RandomState(self.random_state)
n_classes = len(self.val_densities)
test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.val_densities]
#return lambda posteriors: sum(prev_i * self.pdf(kde_i, posteriors) for kde_i, prev_i in zip(self.val_densities, prev))
def neg_loglikelihood(prev):
test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities))
test_loglikelihood = np.log(test_mixture_likelihood + eps)
return -np.sum(test_loglikelihood)
return F.optim_minimize(neg_loglikelihood, n_classes)
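Note (not part of the commit): as a reading aid, _target_divergence above estimates $D(p_\alpha \| q)$ by Monte Carlo with importance sampling. With $x_1,\dots,x_N$ drawn from the reference mixture $r$ (the uniform mixture of class KDEs built in fit), $q$ the test KDE, and $f(u)=(\sqrt{u}-1)^2$ for the squared Hellinger distance, the returned objective is

$$ D_f(p_\alpha \,\|\, q) \;=\; \int q(x)\, f\!\Big(\frac{p_\alpha(x)}{q(x)}\Big)\, dx \;\approx\; \frac{1}{N}\sum_{j=1}^{N} f\!\Big(\frac{p_\alpha(x_j)}{q(x_j)}\Big)\, \frac{q(x_j)}{r(x_j)}, $$

which is exactly mean(f(prev @ fracs) * iw) in the code, with fracs = p_class/qs and the importance weights iw = qs/rs.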

@@ -0,0 +1,174 @@
from cgi import test
import os
import sys
from typing import Union, Callable
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
from scipy.stats import multivariate_normal
import quapy as qp
from quapy.data import LabelledCollection
from quapy.protocol import APP, UPP
from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \
DistributionMatching, _get_divergence
import scipy
from scipy import optimize
from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional
from time import time
from sklearn.metrics.pairwise import rbf_kernel
def gram_matrix_mix(bandwidth, X, Y=None):
# this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
# to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y and Sigma1 and Sigma2 are
# two "scalar matrices" (h^2) I each, so Sigma1+Sigma2 has scalar 2(h^2) (h is the bandwidth)
variance = 2 * (bandwidth**2)
nD = X.shape[1]
gamma = 1/(2*variance)
gram = rbf_kernel(X, Y, gamma=gamma)
norm_factor = 1/np.sqrt(((2*np.pi)**nD) * (variance**(nD)))
gram *= norm_factor
print('GRAM SUM:', gram.sum())
return gram
def weighted_prod(pi, tau, G):
return pi[:,np.newaxis] * G * tau
def tril_weighted_prod(pi, G):
M = weighted_prod(pi, pi, G)
return np.triu(M,1)
class KDEyclosed(AggregativeProbabilisticQuantifier):
def __init__(self, classifier: BaseEstimator, val_split=0.4, bandwidth=0.1, n_jobs=None, random_state=0):
self.classifier = classifier
self.val_split = val_split
self.bandwidth = bandwidth
self.n_jobs = n_jobs
self.random_state=random_state
def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
"""
:param data: the training set
:param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
:param val_split: either a float in (0,1) indicating the proportion of training instances to use for
validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
to estimate the parameters
"""
# print('[fit] enter')
if val_split is None:
val_split = self.val_split
self.classifier, y, posteriors, classes, class_count = cross_generate_predictions(
data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
)
# from distribution_matching.binary_debug import HACK
# posteriors, y = HACK(posteriors, y)
# print('[fit] precomputing stuff')
n = data.n_classes
#L = [posteriors[y==i] for i in range(n)]
#l = [len(Li) for Li in L]
D = n
h = self.bandwidth
#cov_mix_scalar = 2 * h * h # corresponds to a bandwidth of sqrt(2)*h
#Kernel = multivariate_normal(mean=np.zeros(D), cov=cov_mix_scalar)
# print('[fit] classifier ready; precomputing gram')
self.gram_tr_tr = gram_matrix_mix(h, posteriors)
# li_inv keeps track of the relative weight of each datapoint within its class
# (i.e., the weight in its KDE model)
counts_inv = 1/(data.counts())
self.li_inv = counts_inv[y]
# Khash = {}
# for a in range(n):
# for b in range(l[a]):
# for i in range(n):
# Khash[(a,b,i)] = sum(Kernel.pdf(L[i][j] - L[a][b]) for j in range(l[i]))
# for j in range(l[i]): # this for, and index j, can be supressed and store the sum across j
# Khash[(a, b, i, j)] = Kernel.pdf(L[i][j] - L[a][b])
self.n = n
#self.L = L
#self.l = l
#self.Kernel = Kernel
#self.Khash = Khash
self.C = ((2 * np.pi) ** (-D / 2)) * h ** (-D)
print('C:', self.C)
self.Ptr = posteriors
self.ytr = y
assert all(sorted(np.unique(y)) == np.arange(data.n_classes)), 'label name gaps not allowed in current implementation'
# print('[fit] exit')
return self
def aggregate(self, posteriors: np.ndarray):
# print('[aggregate] enter')
Ptr = self.Ptr
Pte = posteriors
gram_te_te = gram_matrix_mix(self.bandwidth, Pte, Pte)
gram_tr_te = gram_matrix_mix(self.bandwidth, Ptr, Pte)
K = Pte.shape[0]
tau = np.full(shape=K, fill_value=1/K, dtype=float)
h = self.bandwidth
D = Ptr.shape[1]
C = self.C
partC = 0.5 * np.log( C/K + 2 * tril_weighted_prod(tau, gram_te_te).sum())
def match(alpha):
pi = alpha[self.ytr] * self.li_inv
partA = -np.log(weighted_prod(pi, tau, gram_tr_te).sum())
# print('gram_Tr_Tr sum', self.gram_tr_tr.sum())
# print('pretril', (np.triu(self.gram_tr_tr,1).sum()))
# print('tril', (2 * tril_weighted_prod(pi, self.gram_tr_tr).sum()))
# print('pi', pi.sum(), pi[:10])
# print('Cs', C*(pi**2).sum())
partB = 0.5 * np.log(C*(pi**2).sum() + 2*tril_weighted_prod(pi, self.gram_tr_tr).sum())
Dcs = partA + partB + partC
# print(f'{alpha=}\t{partA=}\t{partB=}\t{partC}')
# print()
return Dcs
# print('[aggregate] starts search')
# the initial point is set as the uniform distribution
uniform_distribution = np.full(fill_value=1 / self.n, shape=(self.n,))
# uniform_distribution = [0.2, 0.8]
# solutions are bounded to those contained in the unit-simplex
bounds = tuple((0, 1) for _ in range(self.n)) # values in [0,1]
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
# print('[aggregate] end')
return r.x
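Added note (not part of the original file): the objective evaluated by match corresponds to the Cauchy-Schwarz divergence between the alpha-weighted mixture of class-conditional training KDEs, p_alpha, and the KDE of the test posteriors, q, whose standard decomposition is

\[ D_{CS}(p_\alpha \,\|\, q) \;=\; -\log\!\int p_\alpha(x)\,q(x)\,dx \;+\; \tfrac{1}{2}\log\!\int p_\alpha(x)^2\,dx \;+\; \tfrac{1}{2}\log\!\int q(x)^2\,dx. \]

partA, partB and partC implement the three terms (up to the handling of the diagonal self-terms via C), and every integral admits a closed form because the product of two Gaussian kernels integrates to a Gaussian evaluated at the difference of their centers (see gram_matrix_mix).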

View File

@ -0,0 +1,145 @@
import os
import sys
from typing import Union, Callable
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
from scipy.stats import multivariate_normal
import quapy as qp
from quapy.data import LabelledCollection
from quapy.protocol import APP, UPP
from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \
DistributionMatching, _get_divergence
import scipy
from scipy import optimize
from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional
from time import time
from sklearn.metrics.pairwise import rbf_kernel
def gram_matrix_mix_sum(bandwidth, X, Y=None, reduce=True):
# this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
# so that it contains pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y, where Sigma1 and Sigma2 are
# two "scalar matrices" (h^2)*I each, so that Sigma1+Sigma2 is the scalar matrix 2(h^2)*I (h is the bandwidth)
variance = 2 * (bandwidth**2)
nRows,nD = X.shape
gamma = 1/(2*variance)
gram = rbf_kernel(X, Y, gamma=gamma)
norm_factor = 1/np.sqrt(((2*np.pi)**nD) * (variance**(nD)))
gram *= norm_factor
if Y is None:
# ignores the diagonal
aggr = (2 * np.triu(gram, 1)).sum()
else:
aggr = gram.sum()
return aggr
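# note (added comment): when Y is None, only the strictly upper triangle is summed (and doubled), so the
# kernel self-evaluations on the diagonal are excluded from the block sums; KDEyclosed_efficient accounts
# for those self-terms separately through self.tr_C and the C*Kinv term in partC, whereas the "_corr"
# variant further below includes the diagonal directly in the block sums and drops the separate constants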
class KDEyclosed_efficient(AggregativeProbabilisticQuantifier):
def __init__(self, classifier: BaseEstimator, val_split=0.4, bandwidth=0.1, n_jobs=None, random_state=0):
self.classifier = classifier
self.val_split = val_split
self.bandwidth = bandwidth
self.n_jobs = n_jobs
self.random_state=random_state
def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
"""
:param data: the training set
:param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
:param val_split: either a float in (0,1) indicating the proportion of training instances to use for
validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
to estimate the parameters
"""
# print('[fit] enter')
if val_split is None:
val_split = self.val_split
self.classifier, y, posteriors, classes, class_count = cross_generate_predictions(
data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
)
assert all(sorted(np.unique(y)) == np.arange(data.n_classes)), \
'label name gaps not allowed in current implementation'
n = data.n_classes
h = self.bandwidth
P = posteriors
counts_inv = 1 / (data.counts())
nD = P.shape[1]
C = ((2 * np.pi) ** (-nD / 2)) * (self.bandwidth ** (-nD))
tr_tr_sums = np.zeros(shape=(n,n), dtype=float)
self.tr_C = []
for i in range(n):
for j in range(n):
if i > j:
tr_tr_sums[i,j] = tr_tr_sums[j,i]
else:
if i == j:
tr_tr_sums[i, j] = gram_matrix_mix_sum(h, P[y == i])
self.tr_C.append(C * sum(y == i))
else:
block = gram_matrix_mix_sum(h, P[y == i], P[y == j])
tr_tr_sums[i, j] = block
self.tr_C = np.asarray(self.tr_C)
self.Ptr = posteriors
self.ytr = y
self.tr_tr_sums = tr_tr_sums
self.counts_inv = counts_inv
return self
def aggregate(self, posteriors: np.ndarray):
# print('[aggregate] enter')
Ptr = self.Ptr
Pte = posteriors
K,nD = Pte.shape
Kinv = (1/K)
h = self.bandwidth
n = Ptr.shape[1]
y = self.ytr
tr_tr_sums = self.tr_tr_sums
C = ((2 * np.pi) ** (-nD / 2)) * (self.bandwidth ** (-nD))
partC = 0.5*np.log(gram_matrix_mix_sum(h, Pte) * Kinv * Kinv + C*Kinv)
tr_te_sums = np.zeros(shape=n, dtype=float)
for i in range(n):
tr_te_sums[i] = gram_matrix_mix_sum(h, Ptr[y==i], Pte) * self.counts_inv[i] * Kinv
def match(alpha):
partA = -np.log((alpha * tr_te_sums).sum())
alpha_l = alpha * self.counts_inv
partB = 0.5 * np.log((alpha_l[:,np.newaxis] * tr_tr_sums * alpha_l).sum() + (self.tr_C*(alpha_l**2)).sum())
return partA + partB + partC
# print('[aggregate] starts search')
# the initial point is set as the uniform distribution
uniform_distribution = np.full(fill_value=1 / n, shape=(n,))
# uniform_distribution = [0.2, 0.8]
# solutions are bounded to those contained in the unit-simplex
bounds = tuple((0, 1) for _ in range(n)) # values in [0,1]
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
# print('[aggregate] end')
return r.x
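A minimal usage sketch (added for illustration, not part of the original file; the dataset name and hyperparameters are hypothetical choices):

from sklearn.linear_model import LogisticRegression
import quapy as qp

# illustrative: load a multiclass dataset and fit the quantifier defined above
data = qp.datasets.fetch_UCIMulticlassDataset('dry-bean')  # hypothetical dataset choice
train, test = data.train_test
quantifier = KDEyclosed_efficient(LogisticRegression(), val_split=10, bandwidth=0.1)
quantifier.fit(train)
# estimate the class prevalence values of the test sample
estim_prevalence = quantifier.quantify(test.instances)
print(estim_prevalence)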

View File

@ -0,0 +1,139 @@
import os
import sys
from typing import Union, Callable
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
from scipy.stats import multivariate_normal
import quapy as qp
from quapy.data import LabelledCollection
from quapy.protocol import APP, UPP
from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \
DistributionMatching, _get_divergence
import scipy
from scipy import optimize
from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional
from time import time
from sklearn.metrics.pairwise import rbf_kernel
def gram_matrix_mix_sum(bandwidth, X, Y=None, reduce=True):
# this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
# so that it contains pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y, where Sigma1 and Sigma2 are
# two "scalar matrices" (h^2)*I each, so that Sigma1+Sigma2 is the scalar matrix 2(h^2)*I (h is the bandwidth)
variance = 2 * (bandwidth**2)
nRows,nD = X.shape
gamma = 1/(2*variance)
norm_factor = 1/np.sqrt(((2*np.pi)**nD) * (variance**(nD)))
gram = norm_factor * rbf_kernel(X, Y, gamma=gamma)
return gram.sum()
class KDEyclosed_efficient_corr(AggregativeProbabilisticQuantifier):
def __init__(self, classifier: BaseEstimator, val_split=0.4, bandwidth=0.1, n_jobs=None, random_state=0):
self.classifier = classifier
self.val_split = val_split
self.bandwidth = bandwidth
self.n_jobs = n_jobs
self.random_state=random_state
def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
"""
:param data: the training set
:param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
:param val_split: either a float in (0,1) indicating the proportion of training instances to use for
validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
to estimate the parameters
"""
# print('[fit] enter')
if val_split is None:
val_split = self.val_split
self.classifier, y, posteriors, classes, class_count = cross_generate_predictions(
data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
)
print('training over')
assert all(sorted(np.unique(y)) == np.arange(data.n_classes)), \
'label name gaps not allowed in current implementation'
# print('[fit] precomputing stuff')
# from distribution_matching.binary_debug import HACK
# posteriors, y = HACK(posteriors, y)
n = data.n_classes
h = self.bandwidth
# print('[fit] classifier ready; precomputing gram')
P = posteriors
# li_inv keeps track of the relative weight of each datapoint within its class
# (i.e., the weight in its KDE model)
counts_inv = 1 / (data.counts())
tr_tr_sums = np.zeros(shape=(n,n), dtype=float)
for i in range(n):
for j in range(n):
if i > j:
tr_tr_sums[i,j] = tr_tr_sums[j,i]
else:
block = gram_matrix_mix_sum(h, P[y == i], P[y == j] if i!=j else None)
tr_tr_sums[i, j] = block
# compute class-class block-sums
self.Ptr = posteriors
self.ytr = y
self.tr_tr_sums = tr_tr_sums
self.counts_inv = counts_inv
print('fit over')
return self
def aggregate(self, posteriors: np.ndarray):
# print('aggregating')
Ptr = self.Ptr
Pte = posteriors
K,nD = Pte.shape
Kinv = (1/K)
h = self.bandwidth
n = Ptr.shape[1]
y = self.ytr
tr_tr_sums = self.tr_tr_sums
partC = 0.5*np.log(gram_matrix_mix_sum(h, Pte) * Kinv * Kinv)
tr_te_sums = np.zeros(shape=n, dtype=float)
for i in range(n):
tr_te_sums[i] = gram_matrix_mix_sum(h, Ptr[y==i], Pte) * self.counts_inv[i] * Kinv
def match(alpha):
partA = -np.log((alpha * tr_te_sums).sum())
alpha_l = alpha * self.counts_inv
partB = 0.5 * np.log((alpha_l[:,np.newaxis] * tr_tr_sums * alpha_l).sum())
return partA + partB + partC
# print('starting search')
# the initial point is set as the uniform distribution
uniform_distribution = np.full(fill_value=1 / n, shape=(n,))
# solutions are bounded to those contained in the unit-simplex
bounds = tuple((0, 1) for _ in range(n)) # values in [0,1]
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
return r.x
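Added note (a sketch of the quantities this corrected variant accumulates, where kappa(x,y) = N(x|y, 2h^2 I) denotes the convolved kernel computed by gram_matrix_mix_sum, S_ij is the sum of kappa over all pairs of training points of classes i and j, diagonal included, and S_i,te is the analogous sum between class i and the test points):

\[ \int q(x)^2\,dx = \frac{1}{K^2}\sum_{k,k'} \kappa(x_k, x_{k'}), \qquad \int p_\alpha(x)^2\,dx = \sum_{i,j}\frac{\alpha_i}{l_i}\frac{\alpha_j}{l_j}\,S_{ij}, \qquad \int p_\alpha(x)\,q(x)\,dx = \frac{1}{K}\sum_{i}\frac{\alpha_i}{l_i}\,S_{i,\mathrm{te}}, \]

which are exactly the quantities combined in partC, partB and partA above; unlike KDEyclosed_efficient, no separate constant term is needed because the diagonal self-evaluations are already contained in the block sums.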

View File

@ -0,0 +1,210 @@
from distribution_matching.commons import (ADJUSTMENT_METHODS, BIN_METHODS, DISTR_MATCH_METHODS, MAX_LIKE_METHODS,
METHODS, FULL_METHOD_LIST)
import quapy as qp
from os import makedirs
import os
from tabular import Table
import pandas as pd
tables_path = '.'
# makedirs(tables_path, exist_ok=True)
MAXTONE = 35 # sets the intensity of the maximum color reached by the worst (red) and best (green) results
SHOW_STD = False
NUM_ADJUSTMENT_METHODS = len(ADJUSTMENT_METHODS)
NUM_MAXIMUM_LIKELIHOOD_METHODS = len(MAX_LIKE_METHODS)
NUM_DISTRIBUTION_MATCHING_METHODS = len(DISTR_MATCH_METHODS)
qp.environ['SAMPLE_SIZE'] = 100
nice_bench = {
'sanders': 'Sanders',
'semeval13': 'SemEval13',
'semeval14': 'SemEval14',
'semeval15': 'SemEval15',
'semeval16': 'SemEval16',
}
def save_table(path, table):
print(f'saving results in {path}')
with open(path, 'wt') as foo:
foo.write(table)
def new_table(datasets, methods):
return Table(
benchmarks=datasets,
methods=methods,
ttest='wilcoxon',
prec_mean=5,
show_std=SHOW_STD,
prec_std=4,
clean_zero=(eval=='mae'),
average=True,
maxtone=MAXTONE
)
def make_table(tabs, eval, benchmark_groups, benchmark_names, compact=False):
n_methods = len(METHODS)
assert n_methods == (NUM_ADJUSTMENT_METHODS+NUM_DISTRIBUTION_MATCHING_METHODS+NUM_MAXIMUM_LIKELIHOOD_METHODS), \
"Unexpected number of methods"
cline = "\cline{2-" + str(n_methods+ 1) + "}"
# write the latex table
tabular = """
\\begin{tabular}{|c|""" + ('c|' * NUM_ADJUSTMENT_METHODS) + ('c|' * NUM_DISTRIBUTION_MATCHING_METHODS) + ('c|' * NUM_MAXIMUM_LIKELIHOOD_METHODS) + """} """ + cline + """
\multicolumn{1}{c}{} &
\multicolumn{"""+str(NUM_ADJUSTMENT_METHODS)+"""}{|c}{Adjustment} &
\multicolumn{"""+str(NUM_DISTRIBUTION_MATCHING_METHODS)+"""}{|c|}{Distribution Matching} &
\multicolumn{"""+str(NUM_MAXIMUM_LIKELIHOOD_METHODS)+"""}{c|}{Maximum Likelihood} \\\\
\hline
"""
for i, (tab, group, name) in enumerate(zip(tabs, benchmark_groups, benchmark_names)):
tablines = tab.latexTabular(benchmark_replace=nice_bench, endl='\\\\'+ cline, aslines=True)
tablines[0] = tablines[0].replace('\multicolumn{1}{c|}{}', '\\textbf{'+name+'}')
if compact or len(tab.benchmarks)==1:
# if compact, keep the method names and the average; discard the rest
tabular += tablines[0] + '\n' + tablines[1 if len(tab.benchmarks)==1 else -1] + '\n'
else:
tabular += '\n'.join(tablines)
tabular += "\n" + "\\textit{Rank} & " + tab.getRankTable(prec_mean=0 if name.startswith('LeQua') else 1).latexAverage()
if i < (len(tabs) - 1):
tabular += "\\hline\n"
else:
tabular += "\n"
tabular += "\end{tabular}"
return tabular
def gen_tables_uci_multiclass(eval):
print('Generating table for UCI Multiclass Datasets', eval)
dir_results = f'../results/ucimulti/{eval}'
datasets = qp.datasets.UCI_MULTICLASS_DATASETS
tab = new_table(datasets, METHODS)
for dataset in datasets:
print(f'\t Dataset: {dataset}: ', end='')
for method in METHODS:
result_path = f'{dir_results}/{method}_{dataset}.dataframe'
if os.path.exists(result_path):
df = pd.read_csv(result_path)
print(f'{method}', end=' ')
tab.add(dataset, method, df[eval].values)
else:
print(f'MISSING-{method}', end=' ')
print()
return tab
def gen_tables_uci_bin(eval):
print('Generating table for UCI Datasets', eval)
dir_results = f'../results/binary/{eval}'
exclude = ['acute.a', 'acute.b', 'iris.1', 'balance.2']
datasets = [x for x in qp.datasets.UCI_DATASETS if x not in exclude]
tab = new_table(datasets, BIN_METHODS)
for dataset in datasets:
print(f'\t Dataset: {dataset}: ', end='')
for method in BIN_METHODS:
result_path = f'{dir_results}/{method}_{dataset}.dataframe'
if os.path.exists(result_path):
df = pd.read_csv(result_path)
print(f'{method}', end=' ')
tab.add(dataset, method, df[eval].values)
else:
print(f'MISSING-{method}', end=' ')
return tab
def gen_tables_tweet(eval):
print('Generating table for Twitter', eval)
dir_results = f'../results/tweet/{eval}'
datasets = qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST
tab = new_table(datasets, METHODS)
for dataset in datasets:
print(f'\t Dataset: {dataset}: ', end='')
for method in METHODS:
result_path = f'{dir_results}/{method}_{dataset}.dataframe'
if os.path.exists(result_path):
df = pd.read_csv(result_path)
print(f'{method}', end=' ')
tab.add(dataset, method, df[eval].values)
else:
print(f'MISSING-{method}', end=' ')
print()
return tab
def gen_tables_lequa(Methods, task, eval):
# generating the table for LeQua-T1A or LeQua-T1B; only one table with two rows, one for MAE and another for MRAE
tab = new_table([task], Methods)
print(f'Generating table for {task}@LeQua', eval, end='')
dir_results = f'../results/lequa/{task}/{eval}'
for method in Methods:
result_path = f'{dir_results}/{method}.dataframe'
if os.path.exists(result_path):
df = pd.read_csv(result_path)
print(f'{method}', end=' ')
tab.add(task, method, df[eval].values)
else:
print(f'MISSING-{method}', end=' ')
print()
return tab
if __name__ == '__main__':
os.makedirs('./latex', exist_ok=True)
for eval in ['mae', 'mrae']:
tabs = []
tabs.append(gen_tables_tweet(eval))
tabs.append(gen_tables_uci_multiclass(eval))
tabs.append(gen_tables_lequa(METHODS, 'T1B', eval))
names = ['Tweets', 'UCI-multi', 'LeQua']
table = make_table(tabs, eval, benchmark_groups=tabs, benchmark_names=names)
save_table(f'./latex/multiclass_{eval}.tex', table)
for eval in ['mae', 'mrae']:
tabs = []
tabs.append(gen_tables_uci_bin(eval))
# print uci-binary with all datasets for the appendix
table = make_table(tabs, eval, benchmark_groups=tabs, benchmark_names=['UCI-binary'])
save_table(f'./latex/ucibinary_{eval}.tex', table)
# print uci-bin compacted plus lequa-T1A for the main body
tabs.append(gen_tables_lequa(BIN_METHODS, 'T1A', eval))
table = make_table(tabs, eval, benchmark_groups=tabs, benchmark_names=['UCI-binary', 'LeQua'], compact=True)
save_table(f'./latex/binary_{eval}.tex', table)
print("[Tables Done] runing latex")
os.chdir('./latex/')
os.system('pdflatex tables_compact.tex')
os.system('rm tables_compact.aux tables_compact.bbl tables_compact.blg tables_compact.log tables_compact.out tables_compact.dvi')

View File

@ -0,0 +1,107 @@
\documentclass{article}
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc} % use 8-bit T1 fonts
\usepackage{hyperref} % hyperlinks
\usepackage{url} % simple URL typesetting
\usepackage{booktabs} % professional-quality tables
\usepackage{amsfonts} % blackboard math symbols
\usepackage{nicefrac} % compact symbols for 1/2, etc.
\usepackage{microtype} % microtypography
\usepackage{lipsum}
\usepackage{fancyhdr} % header
\usepackage{graphicx} % graphics
\graphicspath{{media/}} % organize your images and other figures under media/ folder
\usepackage{amsmath}
\usepackage{bm}
\usepackage{tabularx}
\usepackage{color}
\usepackage{colortbl}
\usepackage{xcolor}
\usepackage{lmodern}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
\newif\ifdraft
\drafttrue
\newcommand{\juanjo}[1]{\ifdraft{\leavevmode\color{purple}{[JJ]:
{#1}}}\else{\vspace{0ex}}\fi}
\newcommand{\alex}[1]{\ifdraft{\leavevmode\color{violet}{[AM]:
{#1}}}\else{\vspace{0ex}}\fi}
\newcommand{\pablo}[1]{\ifdraft{\leavevmode\color{red}{[PG]:
{#1}}}\else{\vspace{0ex}}\fi}
\title{Tables}
\author{
Alejandro Moreo
}
\begin{document}
\maketitle
\begin{table}[h]
\centering
\caption{Multiclass AE}
\resizebox{\textwidth}{!}{%
\input{multiclass_mae}
}%
\end{table}
\begin{table}[h]
\centering
\caption{Multiclass RAE}
\resizebox{\textwidth}{!}{%
\input{multiclass_mrae}
}%
\end{table}
\begin{table}[h]
\centering
\caption{Binary MAE}
\resizebox{\textwidth}{!}{%
\input{binary_mae}
}%
\end{table}
\begin{table}[h]
\centering
\caption{Binary MRAE}
\resizebox{\textwidth}{!}{%
\input{binary_mrae}
}%
\end{table}
\begin{table}[h]
\centering
\caption{UCI binary full AE}
\resizebox{\textwidth}{!}{%
\input{ucibinary_mae}
}%
\end{table}
\begin{table}[h]
\centering
\caption{UCI binary full RAE}
\resizebox{\textwidth}{!}{%
\input{ucibinary_mrae}
}%
\end{table}
\end{document}

View File

@ -0,0 +1,348 @@
import numpy as np
import itertools
from scipy.stats import ttest_ind_from_stats, wilcoxon
class Table:
VALID_TESTS = [None, "wilcoxon", "ttest"]
def __init__(self, benchmarks, methods, lower_is_better=True, ttest='ttest', prec_mean=3,
clean_zero=False, show_std=False, prec_std=3, average=True, missing=None, missing_str='--', color=True, maxtone=50):
assert ttest in self.VALID_TESTS, f'unknown test, valid are {self.VALID_TESTS}'
self.benchmarks = np.asarray(benchmarks)
self.benchmark_index = {row:i for i, row in enumerate(benchmarks)}
self.methods = np.asarray(methods)
self.method_index = {col:j for j, col in enumerate(methods)}
self.map = {}
# keyed (#rows,#cols)-ndarrays holding computations from self.map['values']
self._addmap('values', dtype=object)
self.lower_is_better = lower_is_better
self.ttest = ttest
self.prec_mean = prec_mean
self.clean_zero = clean_zero
self.show_std = show_std
self.prec_std = prec_std
self.add_average = average
self.missing = missing
self.missing_str = missing_str
self.color = color
self.maxtone = maxtone
self.touch()
@property
def nbenchmarks(self):
return len(self.benchmarks)
@property
def nmethods(self):
return len(self.methods)
def touch(self):
self._modif = True
def update(self):
if self._modif:
self.compute()
def _getfilled(self):
return np.argwhere(self.map['fill'])
@property
def values(self):
return self.map['values']
def _indexes(self):
return itertools.product(range(self.nbenchmarks), range(self.nmethods))
def _addmap(self, map, dtype, func=None):
self.map[map] = np.empty((self.nbenchmarks, self.nmethods), dtype=dtype)
if func is None:
return
m = self.map[map]
f = func
indexes = self._indexes() if map == 'fill' else self._getfilled()
for i, j in indexes:
m[i, j] = f(self.values[i, j])
def _addrank(self):
for i in range(self.nbenchmarks):
filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
col_means = [self.map['mean'][i,j] for j in filled_cols_idx]
ranked_cols_idx = filled_cols_idx[np.argsort(col_means)]
if not self.lower_is_better:
ranked_cols_idx = ranked_cols_idx[::-1]
self.map['rank'][i, ranked_cols_idx] = np.arange(1, len(filled_cols_idx)+1)
def _addcolor(self):
for i in range(self.nbenchmarks):
filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
if filled_cols_idx.size==0:
continue
col_means = [self.map['mean'][i,j] for j in filled_cols_idx]
#col_means = [self.map['rank'][i, j] for j in filled_cols_idx]
minval = min(col_means)
maxval = max(col_means)
for col_idx in filled_cols_idx:
val = self.map['mean'][i,col_idx]
norm = (maxval - minval)
if norm > 0:
normval = (val - minval) / norm
else:
normval = 0.5
if self.lower_is_better:
normval = 1 - normval
normval = np.clip(normval, 0,1)
self.map['color'][i, col_idx] = color_red2green_01(normval, self.maxtone)
def _run_ttest(self, row, col1, col2):
mean1 = self.map['mean'][row, col1]
std1 = self.map['std'][row, col1]
nobs1 = self.map['nobs'][row, col1]
mean2 = self.map['mean'][row, col2]
std2 = self.map['std'][row, col2]
nobs2 = self.map['nobs'][row, col2]
_, p_val = ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2)
return p_val
def _run_wilcoxon(self, row, col1, col2):
values1 = self.map['values'][row, col1]
values2 = self.map['values'][row, col2]
try:
_, p_val = wilcoxon(values1, values2)
except ValueError:
p_val = 0
return p_val
def _add_statistical_test(self):
if self.ttest is None:
return
self.some_similar = [False]*self.nmethods
for i in range(self.nbenchmarks):
filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
if len(filled_cols_idx) <= 1:
continue
col_means = [self.map['mean'][i,j] for j in filled_cols_idx]
best_pos = filled_cols_idx[np.argmin(col_means)]
for j in filled_cols_idx:
if j==best_pos:
continue
if self.ttest == 'ttest':
p_val = self._run_ttest(i, best_pos, j)
else:
p_val = self._run_wilcoxon(i, best_pos, j)
pval_outcome = pval_interpretation(p_val)
self.map['ttest'][i, j] = pval_outcome
if pval_outcome != 'Diff':
self.some_similar[j] = True
def compute(self):
self._addmap('fill', dtype=bool, func=lambda x: x is not None)
self._addmap('mean', dtype=float, func=np.mean)
self._addmap('std', dtype=float, func=np.std)
self._addmap('nobs', dtype=float, func=len)
self._addmap('rank', dtype=int, func=None)
self._addmap('color', dtype=object, func=None)
self._addmap('ttest', dtype=object, func=None)
self._addmap('latex', dtype=object, func=None)
self._addrank()
self._addcolor()
self._add_statistical_test()
if self.add_average:
self._addave()
self._modif = False
def _is_column_full(self, col):
return all(self.map['fill'][:, self.method_index[col]])
def _addave(self):
ave = Table(['ave'], self.methods,
lower_is_better=self.lower_is_better,
ttest=self.ttest,
average=False,
missing=self.missing,
missing_str=self.missing_str,
prec_mean=self.prec_mean,
prec_std=self.prec_std,
clean_zero=self.clean_zero,
show_std=self.show_std,
color=self.color,
maxtone=self.maxtone)
for col in self.methods:
values = None
if self._is_column_full(col):
if self.ttest == 'ttest':
# values = np.asarray(self.map['mean'][:, self.method_index[col]])
values = np.concatenate(self.values[:, self.method_index[col]])
else: # wilcoxon
# values = np.asarray(self.map['mean'][:, self.method_index[col]])
values = np.concatenate(self.values[:, self.method_index[col]])
ave.add('ave', col, values)
self.average = ave
def add(self, benchmark, method, values):
if values is not None:
values = np.asarray(values)
if values.ndim==0:
values = values.flatten()
rid, cid = self._coordinates(benchmark, method)
self.map['values'][rid, cid] = values
self.touch()
def get(self, benchmark, method, attr='mean'):
self.update()
assert attr in self.map, f'unknown attribute {attr}'
rid, cid = self._coordinates(benchmark, method)
if self.map['fill'][rid, cid]:
v = self.map[attr][rid, cid]
if v is None or (isinstance(v,float) and np.isnan(v)):
return self.missing
return v
else:
return self.missing
def _coordinates(self, benchmark, method):
assert benchmark in self.benchmark_index, f'benchmark {benchmark} out of range'
assert method in self.method_index, f'method {method} out of range'
rid = self.benchmark_index[benchmark]
cid = self.method_index[method]
return rid, cid
def get_average(self, method, attr='mean'):
self.update()
if self.add_average:
return self.average.get('ave', method, attr=attr)
return None
def get_color(self, benchmark, method):
color = self.get(benchmark, method, attr='color')
if color is None:
return ''
return color
def latex(self, benchmark, method):
self.update()
i,j = self._coordinates(benchmark, method)
if self.map['fill'][i,j] == False:
return self.missing_str
mean = self.map['mean'][i,j]
l = f" {mean:.{self.prec_mean}f}"
if self.clean_zero:
l = l.replace(' 0.', '.')
isbest = self.map['rank'][i,j] == 1
if isbest:
l = "\\textbf{"+l.strip()+"}"
stat = '' if self.ttest is None else '^{\phantom{\ddag}}'
if self.ttest is not None and self.some_similar[j]:
test_label = self.map['ttest'][i,j]
if test_label == 'Sim':
stat = '^{\dag}'
elif test_label == 'Same':
stat = '^{\ddag}'
elif isbest or test_label == 'Diff':
stat = '^{\phantom{\ddag}}'
std = ''
if self.show_std:
std = self.map['std'][i,j]
std = f" {std:.{self.prec_std}f}"
if self.clean_zero:
std = std.replace(' 0.', '.')
std = f"\pm {std:{self.prec_std}}"
if stat!='' or std!='':
l = f'{l}${stat}{std}$'
if self.color:
l += ' ' + self.map['color'][i,j]
return l
def latexTabular(self, benchmark_replace={}, method_replace={}, aslines=False, endl='\\\\\hline'):
lines = []
l = '\multicolumn{1}{c|}{} & '
l += ' & '.join([method_replace.get(col, col) for col in self.methods])
l += ' \\\\\hline'
lines.append(l)
for row in self.benchmarks:
rowname = benchmark_replace.get(row, row)
l = rowname + ' & '
l += self.latexRow(row, endl=endl)
lines.append(l)
if self.add_average:
# l += '\hline\n'
l = '\hline \n \\textit{Average} & '
l += self.latexAverage(endl=endl)
lines.append(l)
if not aslines:
lines='\n'.join(lines)
return lines
def latexRow(self, benchmark, endl='\\\\\hline\n'):
s = [self.latex(benchmark, col) for col in self.methods]
s = ' & '.join(s)
s += ' ' + endl
return s
def latexAverage(self, endl='\\\\\hline\n'):
if self.add_average:
return self.average.latexRow('ave', endl=endl)
def getRankTable(self, prec_mean=0):
t = Table(benchmarks=self.benchmarks, methods=self.methods, prec_mean=prec_mean, average=True, maxtone=self.maxtone, ttest=None)
for rid, cid in self._getfilled():
row = self.benchmarks[rid]
col = self.methods[cid]
t.add(row, col, self.get(row, col, 'rank'))
t.compute()
return t
def dropMethods(self, methods):
drop_index = [self.method_index[m] for m in methods]
new_methods = np.delete(self.methods, drop_index)
new_index = {col:j for j, col in enumerate(new_methods)}
self.map['values'] = self.values[:,np.asarray([self.method_index[m] for m in new_methods], dtype=int)]
self.methods = new_methods
self.method_index = new_index
self.touch()
def pval_interpretation(p_val):
if 0.005 >= p_val:
return 'Diff'
elif 0.05 >= p_val > 0.005:
return 'Sim'
elif p_val > 0.05:
return 'Same'
def color_red2green_01(val, maxtone=50):
if np.isnan(val): return None
assert 0 <= val <= 1, f'val {val} out of range [0,1]'
# rescale to [-1,1]
val = val * 2 - 1
if val < 0:
color = 'red'
tone = maxtone * (-val)
else:
color = 'green'
tone = maxtone * val
return '\cellcolor{' + color + f'!{int(tone)}' + '}'
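A minimal, self-contained usage sketch of the Table class defined above (added for illustration; benchmark names, method names and error values are purely illustrative):

if __name__ == '__main__':
# toy example (illustrative values): two benchmarks, two methods, 50 random per-sample error scores each
rng = np.random.default_rng(0)
table = Table(benchmarks=['datasetA', 'datasetB'], methods=['method1', 'method2'], ttest='wilcoxon', prec_mean=4)
for benchmark in ['datasetA', 'datasetB']:
for method in ['method1', 'method2']:
table.add(benchmark, method, rng.random(50))
print(table.latexTabular())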

View File

@ -0,0 +1,82 @@
import numpy as np
from scipy.stats import norm, uniform, expon
from scipy.special import rel_entr
def kld_approx(p, q, resolution=100):
steps = np.linspace(-5, 5, resolution)
integral = 0
for step_i1, step_i2 in zip(steps[:-1], steps[1:]):
base = step_i2-step_i1
middle = (step_i1+step_i2)/2
pi = p.pdf(middle)
qi = q.pdf(middle)
integral += (base * pi*np.log(pi/qi))
return integral
def integrate(f, resolution=10000):
steps = np.linspace(-10, 10, resolution)
integral = 0
for step_i1, step_i2 in zip(steps[:-1], steps[1:]):
base = step_i2 - step_i1
middle = (step_i1 + step_i2) / 2
integral += (base * f(middle))
return integral
def kl_analytic(m1, s1, m2, s2):
return np.log(s2/s1)+ (s1*s1 + (m1-m2)**2) / (2*s2*s2) - 0.5
def montecarlo(p, q, trials=1000000):
xs = p.rvs(trials)
ps = p.pdf(xs)
qs = q.pdf(xs)
return np.mean(np.log(ps/qs))
def montecarlo2(p, q, trials=100000):
#r = norm(-3, scale=5)
r = uniform(-10, 20)
# r = expon(loc=-6)
#r = p
xs = r.rvs(trials)
rs = r.pdf(xs)
print(rs)
ps = p.pdf(xs)+0.0000001
qs = q.pdf(xs)+0.0000001
return np.mean((ps/rs)*np.log(ps/qs))
def wrong(p, q, trials=100000):
xs = np.random.uniform(-10,10, trials)
ps = p.pdf(xs)
qs = q.pdf(xs)
return np.mean(ps*np.log(ps/qs))
p = norm(loc=0, scale=1)
q = norm(loc=1, scale=1)
integral_approx = kld_approx(p, q)
analytic_solution = kl_analytic(p.kwds['loc'], p.kwds['scale'], q.kwds['loc'], q.kwds['scale'])
montecarlo_approx = montecarlo(p, q)
montecarlo_approx2 = montecarlo2(p, q)
wrong_approx = wrong(p, q)
print(f'{analytic_solution=:.10f}')
print()
print(f'{integral_approx=:.10f}')
print(f'integral error = {np.abs(analytic_solution-integral_approx):.10f}')
print()
print(f'{montecarlo_approx=:.10f}')
print(f'montecarlo error = {np.abs(analytic_solution-montecarlo_approx):.10f}')
print()
print(f'{montecarlo_approx2=:.10f}')
print(f'montecarlo2 error = {np.abs(analytic_solution-montecarlo_approx2):.10f}')
print()
print(f'{wrong_approx=:.10f}')
print(f'wrong error = {np.abs(analytic_solution-wrong_approx):.10f}')
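Added note (a sketch of the estimators compared above, with x_i denoting i.i.d. samples from the distribution indicated in each case):

\[ \mathrm{KL}(p\|q) = \mathbb{E}_{x\sim p}\!\left[\log\frac{p(x)}{q(x)}\right] \approx \frac{1}{N}\sum_{i=1}^{N}\log\frac{p(x_i)}{q(x_i)}, \quad x_i\sim p \quad \text{(montecarlo)} \]

\[ \mathrm{KL}(p\|q) = \mathbb{E}_{x\sim r}\!\left[\frac{p(x)}{r(x)}\log\frac{p(x)}{q(x)}\right] \approx \frac{1}{N}\sum_{i=1}^{N}\frac{p(x_i)}{r(x_i)}\log\frac{p(x_i)}{q(x_i)}, \quad x_i\sim r \quad \text{(montecarlo2, importance sampling)} \]

The wrong() estimator instead averages p(x) log(p(x)/q(x)) over uniform draws in [-10, 10], which approximates the integral only up to the missing factor given by the length of the sampling interval.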

View File

@ -0,0 +1,113 @@
import numpy as np
from sklearn.neighbors import KernelDensity
from scipy.stats import norm, multivariate_normal, Covariance
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.neighbors import KDTree
# def CauchySchwarzDivGaussMix(pi_i, mu_i, Lambda_i, tau_i, nu_i, Omega_i):
# Lambda_i_inv =
# def z()
nD=2
mu = [[0.1,0.1], [0.2, 0.3], [1, 1]] # centers
bandwidth = 0.10 # bandwidth and scale (standard deviation)
standard_deviation = bandwidth
variance = standard_deviation**2
mus = np.asarray(mu).reshape(-1,nD)
kde = KernelDensity(bandwidth=bandwidth).fit(mus)
x = [0.5,0.2]
print('with scikit-learn KDE')
prob = np.exp(kde.score_samples(np.asarray(x).reshape(-1,nD)))
print(prob)
# univariate
# N = norm(loc=mu, scale=scale)
# prob = N.pdf(x)
# print(prob)
# multivariate
print('with scipy multivariate normal')
npoints = mus.shape[0]
probs = sum(multivariate_normal(mean=mu_i, cov=variance).pdf(x) for mu_i in mu)
probs /= npoints
print(probs)
print('with scikit learn rbf_kernel')
x = np.asarray(x).reshape(-1,nD)
gamma = 1/(2*variance)
gram = rbf_kernel(mus, x, gamma=gamma)
const = 1/np.sqrt(((2*np.pi)**nD) * (variance**(nD)))
gram *= const
print(gram)
print(np.sum(gram)/npoints)
print('with stackoverflow answer')
from scipy.spatial.distance import pdist, cdist, squareform
import scipy
# this is an NxD matrix, where N is the number of items and D its dimensionality
pairwise_sq_dists = cdist(mus, x, 'euclidean')  # note: 'euclidean' returns non-squared distances
print(pairwise_sq_dists)
K = np.exp(-pairwise_sq_dists / variance)  # so this does not match the Gaussian kernel exp(-d^2/(2*variance)) used above
print(K)
print(np.sum(K)/npoints)
print("trying with scipy multivariate on more than one instance")
probs = sum(multivariate_normal(mean=x.reshape(-1), cov=variance).pdf(mus))
probs /= npoints
print(probs)
import sys
sys.exit(0)
# N1 = multivariate_normal(mean=mu[0], cov=variance)
# prob1 = N1.pdf(x)
# N2 = multivariate_normal(mean=mu[1], cov=variance)
# prob2 = N2.pdf(x)
# print(prob1+prob2)
cov = Covariance.from_diagonal([variance, variance])
print(cov.covariance)
precision_matrix = np.asarray([[1/variance, 0],[0, 1/variance]])
print(Covariance.from_precision(precision_matrix).covariance)
print(np.linalg.inv(precision_matrix))
print(np.linalg.inv(cov.covariance))
print('-'*100)
nD=2
mu = np.asarray([[0.1,0.5]]) # centers
bandwidth = 0.10 # bandwidth and scale (standard deviation)
standard_deviation = bandwidth
variance = standard_deviation**2
mus = np.asarray([mu]).reshape(-1,nD)
kde = KernelDensity(bandwidth=bandwidth).fit(mus)
x = np.asarray([0.5,0.2])
prob = np.exp(kde.score_samples(np.asarray(x).reshape(-1,nD)))
print(prob)
probs = sum(multivariate_normal(mean=mu_i, cov=variance).pdf(x) for mu_i in mu) / len(mu)
probs = sum(multivariate_normal(mean=[0,0], cov=1).pdf((x-mu_i)/bandwidth) for mu_i in mu) / len(mu)
probs = sum(multivariate_normal(mean=[0,0], cov=variance).pdf(x-mu_i) for mu_i in mu) / len(mu)
print(probs)
# N1 = multivariate_normal(mean=mu[0], cov=variance)
# prob1 = N1.pdf(x)
# N2 = multivariate_normal(mean=mu[1], cov=variance)
# prob2 = N2.pdf(x)
# print(prob1+prob2)
h=0.1
D=4
print(np.sqrt((4**2) * (5**2)))
print(4*5)
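Added note (a sketch of the density that all the checks above compute in different ways):

\[ \hat{p}(x) \;=\; \frac{1}{N}\sum_{i=1}^{N} N(x \,|\, \mu_i, h^2 I) \;=\; \frac{1}{N}\sum_{i=1}^{N} \frac{1}{(2\pi h^2)^{D/2}} \exp\!\left(-\frac{\|x-\mu_i\|^2}{2h^2}\right), \]

which is what KernelDensity.score_samples returns (in log space), what the sum of scipy multivariate_normal pdfs divided by N computes, and what rbf_kernel reproduces once evaluated with gamma = 1/(2*variance) and multiplied by the normalization constant.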

View File

@ -0,0 +1,153 @@
import numpy as np
from scipy.stats import multivariate_normal
from scipy import optimize
def cauchy_schwarz_divergence_kde(L:list, Xte:np.ndarray, bandwidth:float, alpha:np.ndarray):
"""
:param L: a list of np.ndarray (instances x dimensions) with the Li being the instances of class i
:param Xte: an np.ndarray (instances x dimensions)
:param bandwidth: the bandwidth of the kernel
:param alpha: the mixture parameter
:return: the Cauchy-Schwarz divergence between the validation KDE mixture distribution (with mixture parameter
alpha) and the test KDE distribution
"""
n = len(L) # number of classes
K, D = Xte.shape # number of test instances, and number of dimensions
Kinv = 1/K
# the lengths of each block
l = np.asarray([len(Li) for Li in L])
# contains the a_i / l_i
alpha_r = alpha / l
alpha2_r_sum = np.sum(alpha * alpha_r) # contains the sum_i a_i**2 / l_i
h = bandwidth
# we will only use the bandwidth (h) between two gaussians with covariance matrix a "scalar matrix" h**2
cov_mix_scalar = 2*h*h # corresponds to a bandwidth of sqrt(2)*h
# constant
C = ((2*np.pi)**(-D/2))*h**(-D)
Kernel = multivariate_normal(mean=np.zeros(D), cov=cov_mix_scalar)
K0 = Kernel.pdf(np.zeros(D))
def compute_block_E():
kernel_block_E = []
for i,Li in enumerate(L):
acc = 0
for x_ji in Li: #optimize...
for x_k in Xte: #optimize...
acc += Kernel.pdf(x_ji - x_k) #optimize...
kernel_block_E.append(acc)
return np.asarray(kernel_block_E)
def compute_block_F_hash():
# this can be computed entirely at training time
Khash = {}
for a in range(n):
for b in range(l[a]):
for i in range(n):
for j in range(l[i]):  # this loop, and index j, can be suppressed by storing the sum across j
Khash[(a,b,i,j)] = Kernel.pdf(L[i][j]-L[a][b])
return Khash
def compute_block_Ktest():
# this can be optimized in several ways, e.g., by computing only the lower triangular part... and then
# removing the K0 term, which is no longer needed after that
acc = 0
for x_i in Xte:
for x_j in Xte:
acc += Kernel.pdf(x_i-x_j)
return acc
def compute_block_F():
F = 0
for a in range(n):
tmp_b = 0
for b in range(l[a]):
tmp_i = 0
for i in range(n):
tmp_j = 0
for j in range(l[i]):
tmp_j += Fh[(a, b, i, j)]
tmp_i += (alpha_r[i] * tmp_j)
tmp_b += tmp_i
F += (alpha_r[a] * tmp_b)
return F
E = compute_block_E()
Fh = compute_block_F_hash()
# Ktest = compute_block_Ktest()
F = compute_block_F()
C1 = K*Kinv*Kinv*C
C2 = 2 * np.sum([Kernel.pdf(Xte[k]-Xte[k_p]) for k in range(K) for k_p in range(k)])
partA = -np.log(Kinv * (alpha_r @ E))
partB = 0.5*np.log(C*alpha2_r_sum + F - (K0*alpha2_r_sum))
# partC = 0.5*np.log(Kinv) + 0.5*np.log(C + Kinv*Ktest - K0)
partC = 0.5*np.log(C1+C2)
Dcs = partA + partB + partC
return Dcs
L = [
np.asarray([
[-1,-1,-1]
]),
np.asarray([
[0,0,0],
]),
np.asarray([
[0,0,0.1],
[1,1,1],
[3,3,1],
]),
np.asarray([
[1,0,0]
]),
np.asarray([
[0,1,0]
])
]
Xte = np.asarray(
[[0,0,0],
[0,0,0],
[1,0,0],
[0,1,0]]
)
bandwidth=0.01
alpha=np.asarray([0, 2/4, 0, 1/4, 1/4])
div = cauchy_schwarz_divergence_kde(L, Xte, bandwidth, alpha)
print(div)
def divergence(alpha):
return cauchy_schwarz_divergence_kde(L, Xte, bandwidth, alpha)
# the initial point is set as the uniform distribution
n_classes = len(L)
uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
# solutions are bounded to those contained in the unit-simplex
bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1]
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
#print('searching for alpha')
r = optimize.minimize(divergence, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
sol = r.x
for x in sol:
print(f'{x:.4f}')
print(cauchy_schwarz_divergence_kde(L, Xte, bandwidth, sol))

View File

@ -0,0 +1,19 @@
import pickle
import os
dataset = 'lequa/T1B'
for metric in ['mae', 'mrae']:
print('metric', metric)
for method in ['KDEy-DMhd4', 'KDEy-DMhd4+', 'KDEy-DMhd4++']:
path = f'/home/moreo/QuaPy/distribution_matching/results/{dataset}/{metric}/{method}.hyper.pkl'
if os.path.exists(path):
obj = pickle.load(open(path, 'rb'))
print(method, obj)
else:
print(f'path {path} does not exist')
print()

View File

@ -0,0 +1,27 @@
from time import time
from sklearn.metrics.pairwise import rbf_kernel
import torch
import numpy as np
def rbf(X, Y):
# placeholder "kernel" (a plain matrix product), used here only to compare timings
return X @ Y.T
@torch.compile
def rbf_comp(X, Y):
# the same placeholder, compiled with torch.compile
return X @ Y.T
def measure_time(X, Y, func):
tinit = time()
func(X, Y)
tend = time()
print(f'took {tend-tinit:.3}s')
X = np.random.rand(1000, 100)
Y = np.random.rand(1000, 100)
measure_time(X, Y, rbf)
measure_time(X, Y, rbf_comp)

View File

@ -0,0 +1,20 @@
import pickle
import os
import quapy as qp
from quapy.model_selection import GridSearchQ
from quapy.protocol import UPP
toprint = []
for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
# model selection
train, test = data.train_test
toprint.append(f'{dataset}\t{len(train)}\t{len(test)}\t{data.n_classes}')
print()
for pr in toprint:
print(pr)

View File

@ -0,0 +1,6 @@
import pandas as pd
report = pd.read_csv('../results_lequa_mrae/KDEy-MLE.dataframe')
means = report.mean()
print(f'KDEy-MLE\tLeQua-T1B\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')

View File

@ -0,0 +1,155 @@
import numpy as np
from scipy.stats import norm
EPS=1e-8
LARGE_NUM=1000000
TRIALS=LARGE_NUM
# we compute: D(p||q)
# assumptions:
# q = the reference distribution (in our case: the mixture of KDEs fit on training)
# p = the distribution (in our case: the KDE fit on the test set)
# N = TRIALS, the number of Monte Carlo trials
# n = the number of classes; in this example n=3
# example of f-generator function: the Squared Hellinger Distance
def hd2(u):
v = (np.sqrt(u)-1)
return v*v
# this function estimates the divergence between two (univariate) mixtures by numerical integration
def integrate(p, q, f=hd2, resolution=TRIALS, epsilon=EPS):
xs = np.linspace(-50, 50, resolution)
ps = p.pdf(xs)+epsilon
qs = q.pdf(xs)+epsilon
rs = ps/qs
base = xs[1]-xs[0]
return np.sum(base * f(rs) * qs)
# this is the Monte Carlo approximation as described in the paper,
# i.e., sampling from q and averaging (1/N). In the paper, N variates are
# actually "pre-sampled" per class and then the ones corresponding to each
# class are selected. Here we directly sample only the ones that correspond.
def montecarlo(p, q, f=hd2, trials=TRIALS, epsilon=EPS):
xs = q.rvs(trials)
ps = p.pdf(xs)+epsilon
qs = q.pdf(xs)+epsilon
N = trials
return (1/N)*np.sum(f(ps/qs))
# this is the Monte Carlo approximation assuming we use all N*n points (that is, as if we had
# sampled from a "q" in which all classes were equally likely, 1/3) and then rescale the weights
# with "importance weighting" <-- this is not in the paper, but I like it better than what is
# there, because the step of "selecting the ones that correspond to each class" disappears
def montecarlo_importancesampling(p, q, r, f=hd2, trials=TRIALS, epsilon=EPS):
xs = r.rvs(trials) # <- distribución de referencia (aquí: class weight uniforme)
rs = r.pdf(xs)
ps = p.pdf(xs)+epsilon
qs = q.pdf(xs)+epsilon
N = trials
importance_weight = qs/rs
return (1/N)*np.sum(f(ps/qs)*(importance_weight))
# I have tried to implement the variant that Juanjo proposes, but I think I have not understood
# it well. Perhaps the weight has to be rescaled by 1/3 or something like that, but I cannot
# quite get it right...
def montecarlo_classweight(p, q, f=hd2, trials=TRIALS, epsilon=EPS):
xs_1 = q.rvs_fromclass(0, trials)
xs_2 = q.rvs_fromclass(1, trials)
xs_3 = q.rvs_fromclass(2, trials)
xs = np.concatenate([xs_1, xs_2, xs_3])
weights = np.asarray([[alpha_i]*trials for alpha_i in q.alphas]).flatten()
ps = p.pdf(xs)+epsilon
qs = q.pdf(xs)+epsilon
N = trials
n = q.n
return (1/(N))*np.sum(weights*f(ps/qs))
class Q:
"""
This class is a mixture of Gaussians.
:param locs: means, one per class
:param scales: standard deviations, one per class
:param alphas: weights, one per class
"""
def __init__(self, locs, scales, alphas):
self.qs = []
for loc, scale in zip(locs, scales):
self.qs.append(norm(loc=loc, scale=scale))
assert np.isclose(np.sum(alphas), 1), 'alphas do not sum up to 1!'
self.alphas = np.asarray(alphas) / np.sum(alphas)
def pdf(self, xs):
q_xs = np.vstack([
q_i.pdf(xs) * alpha_i for q_i, alpha_i in zip(self.qs, self.alphas)
])
v = q_xs.sum(axis=0)
if len(v)==1:
v = v[0]
return v
@property
def n(self):
return len(self.alphas)
def rvs_fromclass(self, inclass, trials):
return self.qs[inclass].rvs(trials)
def rvs(self, trials):
variates = []
added = 0
for i, (q_i, alpha_i) in enumerate(zip(self.qs, self.alphas)):
trials_i = int(trials*alpha_i) if (i < self.n-1) else trials-added
variates_i = q_i.rvs(trials_i)
variates.append(variates_i)
added += len(variates_i)
return np.concatenate(variates)
@property
def locs(self):
return np.asarray([x.kwds['loc'] for x in self.qs])
@property
def scales(self):
return np.asarray([x.kwds['scale'] for x in self.qs])
@classmethod
def change_priors(cls, q, alphas):
assert len(alphas)==len(q.alphas)
return Q(locs=q.locs, scales=q.scales, alphas=alphas)
# test distribution
p = Q(locs=[1, 1.5], scales=[1,1], alphas=[0.8, 0.2])
# training distribution (one mixture component per class)
q = Q(locs=[0, 0.5, 1.2], scales=[1,1,1.5], alphas=[0.1, 0.2, 0.7])
# reference distribution (a copy of the q mixture, but with uniform class weights)
r = Q.change_priors(q, alphas=[1/3, 1/3, 1/3])
integral_approx = integrate(p, q)
montecarlo_approx = montecarlo(p, q)
montecarlo_approx2 = montecarlo_importancesampling(p, q, r)
montecarlo_approx3 = montecarlo_classweight(p, q)
print(f'{integral_approx=:.10f}')
print()
print(f'{montecarlo_approx=:.10f}, error={np.abs(integral_approx-montecarlo_approx):.10f}')
print()
print(f'{montecarlo_approx2=:.10f}, error={np.abs(integral_approx-montecarlo_approx2):.10f}')
print()
print(f'{montecarlo_approx3=:.10f}, error={np.abs(integral_approx-montecarlo_approx3):.10f}')
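Added note (a sketch of the f-divergence form being estimated above, with the squared Hellinger generator hd2 defined at the top of the file):

\[ D_f(p\|q) \;=\; \int f\!\left(\frac{p(x)}{q(x)}\right) q(x)\,dx \;\approx\; \frac{1}{N}\sum_{i=1}^{N} f\!\left(\frac{p(x_i)}{q(x_i)}\right), \quad x_i \sim q, \qquad f(u) = (\sqrt{u}-1)^2; \]

the importance-sampling variant draws x_i from the reference mixture r instead and re-weights each term by q(x_i)/r(x_i), as implemented in montecarlo_importancesampling.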

View File

@ -0,0 +1,21 @@
1.- I am not sure whether it would be easier to take r=uniform instead of r=mixture model; it simplifies the sampling a lot and might even produce less error
2.- For now I have KLD and HD:
- for KLD I have not understood whether I have to add the -x + y term
3.- Can the Topsoe distance be expressed as an f-divergence?
Topsoe seems to be 2 times the Jensen-Shannon divergence, that is,
topsoe(p,q) = kld(p|m) + kld(q|m), with m = (p+q)/2
4.- Can the Wasserstein distance be expressed as an f-divergence?
5.- In general, what is the relation with "distances"?
2) implement the automatic (bandwidth) selection:
- internal optimization for likelihood [none seems to work well]
- over everything (e.g., the whole training set)?
- independently for each labelled set? (e.g., positives, negatives, neutrals, and test)
- optimization as a GridSearchQ parameter
6) optimize the kernel? optimize the distance?
10) Define a classifier that returns, for each class, a posterior computed as the likelihood under the class-conditional KDE divided
by the likelihood under all classes (as Juanjo proposes) and plug it into EMD. Do the opposite: re-calibrate with
EMD and plug it into KDEy
11) KDEx?
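Added note on item 3 (a sketch, to be double-checked): the Topsoe distance does seem to admit an f-divergence form, since

\[ \mathrm{topsoe}(p,q) = \mathrm{KL}(p\|m) + \mathrm{KL}(q\|m) = \int q(x)\, f\!\left(\frac{p(x)}{q(x)}\right) dx, \qquad f(u) = u\log u - (1+u)\log\frac{1+u}{2}, \quad m=\frac{p+q}{2}, \]

i.e., twice the Jensen-Shannon divergence, as noted above.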

View File

@ -0,0 +1,85 @@
import pickle
import os
import pandas as pd
from distribution_matching.commons import METHODS, new_method, show_results
import quapy as qp
from quapy.model_selection import GridSearchQ
from quapy.protocol import UPP
SEED=1
if __name__ == '__main__':
qp.environ['SAMPLE_SIZE'] = 100
qp.environ['N_JOBS'] = -1
n_bags_val = 250
n_bags_test = 1000
for optim in ['mae', 'mrae']:
result_dir = f'results/tweet/{optim}'
os.makedirs(result_dir, exist_ok=True)
for method in METHODS:
print('Init method', method)
global_result_path = f'{result_dir}/{method}'
if not os.path.exists(global_result_path+'.csv'):
with open(global_result_path+'.csv', 'wt') as csv:
csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')
with open(global_result_path+'.csv', 'at') as csv:
# the four semeval datasets share the same training set, so optimizing hyperparameters four times is pointless;
# this flag records whether model selection has already been carried out, so that it can be skipped afterwards
semeval_trained = False
for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:
print('init', dataset)
local_result_path = global_result_path + '_' + dataset
if os.path.exists(local_result_path+'.dataframe'):
print(f'result file {local_result_path}.dataframe already exist; skipping')
continue
with qp.util.temp_seed(SEED):
is_semeval = dataset.startswith('semeval')
if not is_semeval or not semeval_trained:
param_grid, quantifier = new_method(method)
# model selection
data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=True)
protocol = UPP(data.test, repeats=n_bags_val)
modsel = GridSearchQ(quantifier, param_grid, protocol, refit=False, n_jobs=-1, verbose=1, error=optim)
modsel.fit(data.training)
print(f'best params {modsel.best_params_}')
print(f'best score {modsel.best_score_}')
pickle.dump(
(modsel.best_params_, modsel.best_score_,),
open(f'{local_result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
quantifier = modsel.best_model()
if is_semeval:
semeval_trained = True
else:
print(f'model selection for {dataset} already done; skipping')
data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=False)
quantifier.fit(data.training)
protocol = UPP(data.test, repeats=n_bags_test)
report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True)
report.to_csv(f'{local_result_path}.dataframe')
means = report.mean()
csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
csv.flush()
show_results(global_result_path)

View File

@ -0,0 +1,57 @@
import numpy as np
from sklearn.linear_model import LogisticRegression
import os
import quapy as qp
from distribution_matching.commons import show_results
from quapy.method.aggregative import DMy
from distribution_matching.method.method_kdey import KDEy
from quapy.protocol import UPP
SEED=1
if __name__ == '__main__':
qp.environ['SAMPLE_SIZE'] = 100
qp.environ['N_JOBS'] = -1
n_bags_val = 250
n_bags_test = 1000
result_dir = f'results/tweet/sensibility'
os.makedirs(result_dir, exist_ok=True)
for method, param, grid in [
('KDEy-ML', 'Bandwidth', np.linspace(0.01, 0.2, 20)),
('DM-HD', 'nbins', list(range(2,10)) + list(range(10,34,2)))
]:
global_result_path = f'{result_dir}/{method}'
if not os.path.exists(global_result_path+'.csv'):
with open(global_result_path+'.csv', 'wt') as csv:
csv.write(f'Method\tDataset\t{param}\tMAE\tMRAE\tKLD\n')
with open(global_result_path+'.csv', 'at') as csv:
for val in grid:
for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:
print('init', dataset)
local_result_path = global_result_path + '_' + dataset + (f'_{val:.3f}' if isinstance(val, float) else f'{val}')
with qp.util.temp_seed(SEED):
data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=False)
if method == 'KDEy-ML':
quantifier = KDEy(LogisticRegression(n_jobs=-1), target='max_likelihood', val_split=10, bandwidth=val)
elif method == 'DM-HD':
quantifier = DMy(LogisticRegression(n_jobs=-1), val_split=10, nbins=val, divergence='HD', n_jobs=-1)
quantifier.fit(data.training)
protocol = UPP(data.test, repeats=n_bags_test)
report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True, n_jobs=-1)
report.to_csv(f'{local_result_path}.dataframe')
means = report.mean()
csv.write(f'{method}\t{data.name}\t{val}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
csv.flush()
show_results(global_result_path)

View File

@ -0,0 +1,83 @@
import pickle
import os
from distribution_matching.commons import BIN_METHODS, new_method, show_results
import quapy as qp
from quapy.model_selection import GridSearchQ
from quapy.protocol import UPP
SEED = 1
if __name__ == '__main__':
qp.environ['SAMPLE_SIZE'] = 100
qp.environ['N_JOBS'] = -1
n_bags_val = 250
n_bags_test = 1000
for optim in ['mae', 'mrae']:
result_dir = f'results/binary/{optim}'
os.makedirs(result_dir, exist_ok=True)
for method in BIN_METHODS:
print('Init method', method)
global_result_path = f'{result_dir}/{method}'
if not os.path.exists(global_result_path + '.csv'):
with open(global_result_path + '.csv', 'wt') as csv:
csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')
with open(global_result_path + '.csv', 'at') as csv:
for dataset in qp.datasets.UCI_DATASETS:
if dataset in ['acute.a', 'acute.b', 'iris.1']: continue # , 'pageblocks.5', 'spambase', 'wdbc']: continue
print('init', dataset)
local_result_path = global_result_path + '_' + dataset
if os.path.exists(local_result_path + '.dataframe'):
print(f'result file {local_result_path}.dataframe already exist; skipping')
continue
with qp.util.temp_seed(SEED):
param_grid, quantifier = new_method(method, max_iter=3000)
data = qp.datasets.fetch_UCIDataset(dataset)
# model selection
train, test = data.train_test
train, val = train.split_stratified(random_state=SEED)
protocol = UPP(val, repeats=n_bags_val)
modsel = GridSearchQ(
quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=1, error=optim
)
try:
modsel.fit(train)
print(f'best params {modsel.best_params_}')
print(f'best score {modsel.best_score_}')
pickle.dump(
(modsel.best_params_, modsel.best_score_,),
open(f'{local_result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
quantifier = modsel.best_model()
except Exception:
print('something went wrong... trying to fit the default model')
quantifier.fit(train)
protocol = UPP(test, repeats=n_bags_test)
report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'],
verbose=True)
report.to_csv(f'{local_result_path}.dataframe')
means = report.mean()
csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
csv.flush()
show_results(global_result_path)

View File

@ -0,0 +1,88 @@
import pickle
import os
from data.base import LabelledCollection
from sklearn.linear_model import LogisticRegression
from distribution_matching.commons import METHODS, new_method, show_results
import quapy as qp
from quapy.model_selection import GridSearchQ
from quapy.protocol import UPP
SEED = 1
if __name__ == '__main__':
qp.environ['SAMPLE_SIZE'] = 500
qp.environ['N_JOBS'] = -1
n_bags_val = 250
n_bags_test = 1000
for optim in ['mae', 'mrae']:
result_dir = f'results/ucimulti/{optim}'
os.makedirs(result_dir, exist_ok=True)
for method in METHODS:
print('Init method', method)
global_result_path = f'{result_dir}/{method}'
if not os.path.exists(global_result_path + '.csv'):
with open(global_result_path + '.csv', 'wt') as csv:
csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')
with open(global_result_path + '.csv', 'at') as csv:
for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
print('init', dataset)
local_result_path = global_result_path + '_' + dataset
if os.path.exists(local_result_path + '.dataframe'):
print(f'result file {local_result_path}.dataframe already exist; skipping')
continue
with qp.util.temp_seed(SEED):
param_grid, quantifier = new_method(method, max_iter=3000)
data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
# model selection
train, test = data.train_test
train, val = train.split_stratified(random_state=SEED)
protocol = UPP(val, repeats=n_bags_val)
modsel = GridSearchQ(
quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=1, error=optim
)
try:
modsel.fit(train)
print(f'best params {modsel.best_params_}')
print(f'best score {modsel.best_score_}')
pickle.dump(
(modsel.best_params_, modsel.best_score_,),
open(f'{local_result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
quantifier = modsel.best_model()
except Exception:
print('something went wrong... trying to fit the default model')
quantifier.fit(train)
# quantifier = qp.method.aggregative.CC(LogisticRegression()).fit(train)
protocol = UPP(test, repeats=n_bags_test)
report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'],
verbose=True, n_jobs=-1)
report.to_csv(f'{local_result_path}.dataframe')
means = report.mean()
csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
csv.flush()
show_results(global_result_path)

View File

@ -0,0 +1,63 @@
import numpy as np
from sklearn.linear_model import LogisticRegression
import os
import quapy as qp
from distribution_matching.commons import show_results
from distribution_matching.method.method_kdey import KDEy
from quapy.method.aggregative import DMy
from quapy.protocol import UPP
SEED=1
def task(val):
print('job-init', dataset, val)
with qp.util.temp_seed(SEED):
if method=='KDEy-ML':
quantifier = KDEy(LogisticRegression(), target='max_likelihood', val_split=10, bandwidth=val)
elif method == 'DM-HD':
quantifier = DMy(LogisticRegression(), val_split=10, nbins=val, divergence='HD')
quantifier.fit(data.training)
protocol = UPP(data.test, repeats=n_bags_test)
report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'],
verbose=True, n_jobs=-1)
return report
if __name__ == '__main__':
qp.environ['SAMPLE_SIZE'] = 500
qp.environ['N_JOBS'] = -1
n_bags_val = 250
n_bags_test = 1000
result_dir = f'results/ucimulti/sensibility'
os.makedirs(result_dir, exist_ok=True)
for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
for method, param, grid in [
('KDEy-ML', 'Bandwidth', np.linspace(0.01, 0.2, 20)),
('DM-HD', 'nbins', list(range(2, 10)) + list(range(10, 34, 2)))
]:
global_result_path = f'{result_dir}/{method}'
if not os.path.exists(global_result_path+'.csv'):
with open(global_result_path+'.csv', 'wt') as csv:
csv.write(f'Method\tDataset\t{param}\tMAE\tMRAE\tKLD\n')
reports = qp.util.parallel(task, grid, n_jobs=-1)
with open(global_result_path + '.csv', 'at') as csv:
for val, report in zip(grid, reports):
means = report.mean()
local_result_path = global_result_path + '_' + dataset + (f'_{val:.3f}' if isinstance(val, float) else f'{val}')
report.to_csv(f'{local_result_path}.dataframe')
csv.write(f'{method}\t{dataset}\t{val}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
csv.flush()
show_results(global_result_path)

View File

@ -116,7 +116,7 @@ def run(experiment):
model,
protocol=APP(test, n_prevalences=21, repeats=100)
)
- test_true_prevalence = data.test.prevalence()
+ test_true_prevalence = data.mixture.prevalence()
evaluate_experiment(true_prevalences, estim_prevalences)
save_results(dataset_name, model_name, run, optim_loss,

View File

@@ -0,0 +1,244 @@
from scipy.sparse import csc_matrix, csr_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
import numpy as np
from joblib import Parallel, delayed
import sklearn
import math
from scipy.stats import t


class ContTable:
    def __init__(self, tp=0, tn=0, fp=0, fn=0):
        self.tp = tp
        self.tn = tn
        self.fp = fp
        self.fn = fn

    def get_d(self): return self.tp + self.tn + self.fp + self.fn

    def get_c(self): return self.tp + self.fn

    def get_not_c(self): return self.tn + self.fp

    def get_f(self): return self.tp + self.fp

    def get_not_f(self): return self.tn + self.fn

    def p_c(self): return (1.0*self.get_c())/self.get_d()

    def p_not_c(self): return 1.0-self.p_c()

    def p_f(self): return (1.0*self.get_f())/self.get_d()

    def p_not_f(self): return 1.0-self.p_f()

    def p_tp(self): return (1.0*self.tp) / self.get_d()

    def p_tn(self): return (1.0*self.tn) / self.get_d()

    def p_fp(self): return (1.0*self.fp) / self.get_d()

    def p_fn(self): return (1.0*self.fn) / self.get_d()

    def tpr(self):
        c = 1.0*self.get_c()
        return self.tp / c if c > 0.0 else 0.0

    def fpr(self):
        _c = 1.0*self.get_not_c()
        return self.fp / _c if _c > 0.0 else 0.0


def __ig_factor(p_tc, p_t, p_c):
    den = p_t * p_c
    if den != 0.0 and p_tc != 0:
        return p_tc * math.log(p_tc / den, 2)
    else:
        return 0.0


def information_gain(cell):
    return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + \
           __ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) + \
           __ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()) + \
           __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())


def squared_information_gain(cell):
    return information_gain(cell)**2


def posneg_information_gain(cell):
    ig = information_gain(cell)
    if cell.tpr() < cell.fpr():
        return -ig
    else:
        return ig


def pos_information_gain(cell):
    if cell.tpr() < cell.fpr():
        return 0
    else:
        return information_gain(cell)


def pointwise_mutual_information(cell):
    return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c())


def gss(cell):
    return cell.p_tp()*cell.p_tn() - cell.p_fp()*cell.p_fn()


def chi_square(cell):
    den = cell.p_f() * cell.p_not_f() * cell.p_c() * cell.p_not_c()
    if den == 0.0:
        return 0.0
    num = gss(cell)**2
    return num / den


def conf_interval(xt, n):
    if n > 30:
        z2 = 3.84145882069  # norm.ppf(0.5+0.95/2.0)**2
    else:
        z2 = t.ppf(0.5 + 0.95 / 2.0, df=max(n-1, 1)) ** 2
    p = (xt + 0.5 * z2) / (n + z2)
    amplitude = 0.5 * z2 * math.sqrt((p * (1.0 - p)) / (n + z2))
    return p, amplitude
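# Illustrative example (hypothetical numbers): conf_interval(xt=8, n=10) uses the t-based z2 (since n <= 30)
# and returns a midpoint p of roughly 0.70 with an amplitude of roughly 0.30, i.e., an approximate confidence
# interval of [0.40, 1.00] for the true relative frequency of the term within the class.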
def strength(minPosRelFreq, minPos, maxNeg):
    if minPos > maxNeg:
        return math.log(2.0 * minPosRelFreq, 2.0)
    else:
        return 0.0


# set cancel_features=True to allow some features to be weighted as 0 (as in the original article);
# however, for some extremely imbalanced datasets this caused all documents to be 0
def conf_weight(cell, cancel_features=False):
    c = cell.get_c()
    not_c = cell.get_not_c()
    tp = cell.tp
    fp = cell.fp

    pos_p, pos_amp = conf_interval(tp, c)
    neg_p, neg_amp = conf_interval(fp, not_c)

    min_pos = pos_p - pos_amp
    max_neg = neg_p + neg_amp
    den = (min_pos + max_neg)
    minpos_relfreq = min_pos / (den if den != 0 else 1)

    str_tplus = strength(minpos_relfreq, min_pos, max_neg)

    if str_tplus == 0 and not cancel_features:
        return 1e-20

    return str_tplus


def get_tsr_matrix(cell_matrix, tsr_score_function):
    nC = len(cell_matrix)
    nF = len(cell_matrix[0])
    tsr_matrix = [[tsr_score_function(cell_matrix[c, f]) for f in range(nF)] for c in range(nC)]
    return np.array(tsr_matrix)


def feature_label_contingency_table(positive_document_indexes, feature_document_indexes, nD):
    tp_ = len(positive_document_indexes & feature_document_indexes)
    fp_ = len(feature_document_indexes - positive_document_indexes)
    fn_ = len(positive_document_indexes - feature_document_indexes)
    tn_ = nD - (tp_ + fp_ + fn_)
    return ContTable(tp=tp_, tn=tn_, fp=fp_, fn=fn_)


def category_tables(feature_sets, category_sets, c, nD, nF):
    return [feature_label_contingency_table(category_sets[c], feature_sets[f], nD) for f in range(nF)]


def get_supervised_matrix(coocurrence_matrix, label_matrix, n_jobs=-1):
    """
    Computes the nC x nF supervised matrix M where Mcf is the 4-cell contingency table for feature f and class c.
    Efficiency O(nF x nC x log(S)) where S is the sparse factor
    """
    nD, nF = coocurrence_matrix.shape
    nD2, nC = label_matrix.shape
    if nD != nD2:
        raise ValueError('Number of rows in coocurrence matrix shape %s and label matrix shape %s is not consistent' %
                         (coocurrence_matrix.shape, label_matrix.shape))

    def nonzero_set(matrix, col):
        return set(matrix[:, col].nonzero()[0])

    if isinstance(coocurrence_matrix, csr_matrix):
        coocurrence_matrix = csc_matrix(coocurrence_matrix)
    feature_sets = [nonzero_set(coocurrence_matrix, f) for f in range(nF)]
    category_sets = [nonzero_set(label_matrix, c) for c in range(nC)]
    cell_matrix = Parallel(n_jobs=n_jobs, backend="threading")(
        delayed(category_tables)(feature_sets, category_sets, c, nD, nF) for c in range(nC))
    return np.array(cell_matrix)


class TSRweighting(BaseEstimator, TransformerMixin):
    """
    Supervised Term Weighting function based on any Term Selection Reduction (TSR) function (e.g., information gain,
    chi-square, etc.) or, more generally, on any function that could be computed on the 4-cell contingency table for
    each category-feature pair.
    The supervised_4cell_matrix (a CxF matrix containing the 4-cell contingency tables for each category-feature pair)
    can be pre-computed (e.g., during the feature selection phase) and passed as an argument.
    When C>1, i.e., in multiclass scenarios, a global_policy is used in order to determine a single feature-score that
    informs about its relevance. Accepted policies include "max" (takes the max score across categories), "ave" and
    "wave" (take the average, or weighted average, across all categories -- weights correspond to the class
    prevalence), and "sum" (which sums all category scores).
    """

    def __init__(self, tsr_function, global_policy='max', supervised_4cell_matrix=None, sublinear_tf=True, norm='l2', min_df=3, n_jobs=-1):
        if global_policy not in ['max', 'ave', 'wave', 'sum']:
            raise ValueError('Global policy should be in {"max", "ave", "wave", "sum"}')
        self.tsr_function = tsr_function
        self.global_policy = global_policy
        self.supervised_4cell_matrix = supervised_4cell_matrix
        self.sublinear_tf = sublinear_tf
        self.norm = norm
        self.min_df = min_df
        self.n_jobs = n_jobs

    def fit(self, X, y):
        self.count_vectorizer = CountVectorizer(min_df=self.min_df)
        X = self.count_vectorizer.fit_transform(X)

        self.tf_vectorizer = TfidfTransformer(
            norm=None, use_idf=False, smooth_idf=False, sublinear_tf=self.sublinear_tf).fit(X)

        if len(y.shape) == 1:
            y = np.expand_dims(y, axis=1)
        nD, nC = y.shape
        nF = len(self.tf_vectorizer.get_feature_names_out())

        if self.supervised_4cell_matrix is None:
            self.supervised_4cell_matrix = get_supervised_matrix(X, y, n_jobs=self.n_jobs)
        else:
            if self.supervised_4cell_matrix.shape != (nC, nF):
                raise ValueError("Shape of supervised information matrix is inconsistent with X and y")

        tsr_matrix = get_tsr_matrix(self.supervised_4cell_matrix, self.tsr_function)
        if self.global_policy == 'ave':
            self.global_tsr_vector = np.average(tsr_matrix, axis=0)
        elif self.global_policy == 'wave':
            category_prevalences = [sum(y[:, c])*1.0/nD for c in range(nC)]
            self.global_tsr_vector = np.average(tsr_matrix, axis=0, weights=category_prevalences)
        elif self.global_policy == 'sum':
            self.global_tsr_vector = np.sum(tsr_matrix, axis=0)
        elif self.global_policy == 'max':
            self.global_tsr_vector = np.amax(tsr_matrix, axis=0)
        return self

    def fit_transform(self, X, y):
        return self.fit(X, y).transform(X)

    def transform(self, X):
        if not hasattr(self, 'global_tsr_vector'):
            raise NameError('TSRweighting: transform method called before fit.')
        X = self.count_vectorizer.transform(X)
        tf_X = self.tf_vectorizer.transform(X).toarray()
        weighted_X = np.multiply(tf_X, self.global_tsr_vector)
        if self.norm is not None and self.norm != 'none':
            weighted_X = sklearn.preprocessing.normalize(weighted_X, norm=self.norm, axis=1, copy=False)
        return csr_matrix(weighted_X)
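A minimal usage sketch of the classes above (illustrative only; the toy corpus, labels and classifier are made up and not part of the original file):

if __name__ == '__main__':
    from sklearn.linear_model import LogisticRegression

    # toy corpus with binary sentiment labels (purely illustrative)
    texts = ["good film, great plot", "terrible film", "great acting", "bad plot, bad acting"]
    labels = np.array([1, 0, 1, 0])

    # weight terms by information gain (any other TSR function defined above works as well)
    tsr = TSRweighting(tsr_function=information_gain, global_policy='max', min_df=1)
    X = tsr.fit_transform(texts, labels)        # csr_matrix of shape (n_documents, n_terms)

    clf = LogisticRegression().fit(X, labels)
    print(clf.predict(tsr.transform(["great film"])))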

laboratory/main.py (new file)

@@ -0,0 +1,62 @@
from sklearn.linear_model import LogisticRegression
import os
import sys
import pandas as pd
import quapy as qp
from method.aggregative import DistributionMatching
from distribution_matching.method.method_kdey import KDEy
from protocol import UPP


if __name__ == '__main__':

    qp.environ['SAMPLE_SIZE'] = 100
    qp.environ['N_JOBS'] = -1
    method = 'KDE'
    param = 0.1
    div = 'topsoe'
    method_identifier = f'{method}_{param}_{div}'

    # generates tuples (dataset, method, method_name)
    # (the dataset is needed for methods that process the dataset differently)
    def gen_methods():
        for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:
            data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True)

            if method == 'KDE':
                kdey = KDEy(LogisticRegression(), divergence=div, bandwidth=param, engine='sklearn')
                yield data, kdey, method_identifier
            elif method == 'DM':
                dm = DistributionMatching(LogisticRegression(), divergence=div, nbins=param)
                yield data, dm, method_identifier
            else:
                raise NotImplementedError('unknown method')

    os.makedirs('results', exist_ok=True)
    result_path = f'results/{method_identifier}.csv'
    if os.path.exists(result_path):
        print('Result already exists. Nothing to do')
        sys.exit(0)

    with open(result_path, 'wt') as csv:
        csv.write(f'Method\tDataset\tMAE\tMRAE\n')

        for data, quantifier, quant_name in gen_methods():
            quantifier.fit(data.training)
            protocol = UPP(data.test, repeats=100)
            report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae'], verbose=True)
            means = report.mean()
            csv.write(f'{quant_name}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n')
            csv.flush()

    df = pd.read_csv(result_path, sep='\t')
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"])
    print(pv)

laboratory/method_dxs.py (new file)

@@ -0,0 +1,148 @@
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
import quapy as qp
from data import LabelledCollection
import numpy as np
from laboratory.custom_vectorizers import *
from protocol import APP
from quapy.method.aggregative import _get_divergence, HDy, DistributionMatching
from quapy.method.base import BaseQuantifier
from scipy import optimize
import pandas as pd


# TODO: explore the Bernoulli (term presence/absence) variant
# TODO: explore the multinomial (term frequency) variant
# TODO: explore the multinomial + length normalization variant
# TODO: consolidate the TSR variant (e.g., using information gain);
#       - does it work better with idf?
#       - does it work better with length normalization?
#       - etc.


class DxS(BaseQuantifier):
    def __init__(self, vectorizer=None, divergence='topsoe'):
        self.vectorizer = vectorizer
        self.divergence = divergence

    # def __as_distribution(self, instances):
    #     return np.asarray(instances.sum(axis=0) / instances.sum()).flatten()

    def __as_distribution(self, instances):
        dist = instances.sum(axis=0) / instances.sum()
        return np.asarray(dist).flatten()

    def fit(self, data: LabelledCollection):
        text_instances, labels = data.Xy

        if self.vectorizer is not None:
            text_instances = self.vectorizer.fit_transform(text_instances, y=labels)

        distributions = []
        for class_i in data.classes_:
            distributions.append(self.__as_distribution(text_instances[labels == class_i]))
        self.validation_distribution = np.asarray(distributions)
        return self

    def quantify(self, text_instances):
        if self.vectorizer is not None:
            text_instances = self.vectorizer.transform(text_instances)

        test_distribution = self.__as_distribution(text_instances)
        divergence = _get_divergence(self.divergence)
        n_classes, n_feats = self.validation_distribution.shape

        def match(prev):
            prev = np.expand_dims(prev, axis=0)
            mixture_distribution = (prev @ self.validation_distribution).flatten()
            return divergence(test_distribution, mixture_distribution)

        # the initial point is set as the uniform distribution
        uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))

        # solutions are bounded to those contained in the unit-simplex
        bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
        r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
        return r.x


if __name__ == '__main__':
    qp.environ['SAMPLE_SIZE'] = 250
    qp.environ['N_JOBS'] = -1
    min_df = 10
    # dataset = 'imdb'
    repeats = 10
    error = 'mae'
    div = 'topsoe'

    # generates tuples (dataset, method, method_name)
    # (the dataset is needed for methods that process the dataset differently)
    def gen_methods():
        for dataset in qp.datasets.REVIEWS_SENTIMENT_DATASETS:

            data = qp.datasets.fetch_reviews(dataset, tfidf=False)

            bernoulli_vectorizer = CountVectorizer(min_df=min_df, binary=True)
            dxs = DxS(divergence=div, vectorizer=bernoulli_vectorizer)
            yield data, dxs, 'DxS-Bernoulli'

            multinomial_vectorizer = CountVectorizer(min_df=min_df, binary=False)
            dxs = DxS(divergence=div, vectorizer=multinomial_vectorizer)
            yield data, dxs, 'DxS-multinomial'

            tf_vectorizer = TfidfVectorizer(sublinear_tf=False, use_idf=False, min_df=min_df, norm=None)
            dxs = DxS(divergence=div, vectorizer=tf_vectorizer)
            yield data, dxs, 'DxS-TF'

            logtf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=False, min_df=min_df, norm=None)
            dxs = DxS(divergence=div, vectorizer=logtf_vectorizer)
            yield data, dxs, 'DxS-logTF'

            tfidf_vectorizer = TfidfVectorizer(use_idf=True, min_df=min_df, norm=None)
            dxs = DxS(divergence=div, vectorizer=tfidf_vectorizer)
            yield data, dxs, 'DxS-TFIDF'

            tfidf_vectorizer = TfidfVectorizer(use_idf=True, min_df=min_df, norm='l2')
            dxs = DxS(divergence=div, vectorizer=tfidf_vectorizer)
            yield data, dxs, 'DxS-TFIDF-l2'

            tsr_vectorizer = TSRweighting(tsr_function=information_gain, min_df=min_df, norm='l2')
            dxs = DxS(divergence=div, vectorizer=tsr_vectorizer)
            yield data, dxs, 'DxS-TFTSR-l2'

            data = qp.datasets.fetch_reviews(dataset, tfidf=True, min_df=min_df)

            hdy = HDy(LogisticRegression())
            yield data, hdy, 'HDy'

            dm = DistributionMatching(LogisticRegression(), divergence=div, nbins=5)
            yield data, dm, 'DM-5b'

            dm = DistributionMatching(LogisticRegression(), divergence=div, nbins=10)
            yield data, dm, 'DM-10b'

    result_path = 'results.csv'
    with open(result_path, 'wt') as csv:
        csv.write(f'Method\tDataset\tMAE\tMRAE\n')

        for data, quantifier, quant_name in gen_methods():
            quantifier.fit(data.training)
            report = qp.evaluation.evaluation_report(quantifier, APP(data.test, repeats=repeats), error_metrics=['mae', 'mrae'], verbose=True)
            means = report.mean()
            csv.write(f'{quant_name}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n')

    df = pd.read_csv(result_path, sep='\t')
    # print(df)
    pv = df.pivot_table(index='Method', columns="Dataset", values=["MAE", "MRAE"])
    print(pv)


@@ -1,3 +1,15 @@
Change Log 0.1.8
----------------

- A conceptual error in MedianSweep and MedianSweep2 has been fixed. The error consisted in computing the median of
  all TPRs and all FPRs, and then applying the adjustment to these two single values. Instead, the original method
  proposed by G. Forman comes down to generating one prevalence prediction for every (TPR, FPR) pair, and then
  computing the median of all these predictions.
- qp.evaluation now runs in parallel <improve, remove or fix the ongoing error, put at the qp. level instead of
  qp.evaluation because I don't like the qp.evaluation.evaluate thing>

Change Log 0.1.7
----------------
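A small numeric sketch of the MedianSweep fix (hypothetical numbers, for illustration only): suppose three thresholds yield the (tpr, fpr) pairs (0.9, 0.2), (0.8, 0.1), (0.6, 0.3), and the observed positive rate is 0.5.

import numpy as np

tprs = np.array([0.9, 0.8, 0.6])
fprs = np.array([0.2, 0.1, 0.3])
observed = 0.5   # fraction of documents classified as positive

# previous (incorrect) behaviour: adjust once, using the median tpr and the median fpr
old = (observed - np.median(fprs)) / (np.median(tprs) - np.median(fprs))   # (0.5-0.2)/(0.8-0.2) = 0.50

# corrected behaviour: adjust for every (tpr, fpr) pair, then take the median of the predictions
new = np.median((observed - fprs) / (tprs - fprs))                         # median(0.429, 0.571, 0.667) = 0.571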


@@ -59,7 +59,7 @@ class RecalibratedProbabilisticClassifierBase(BaseEstimator, RecalibratedProbabi
        elif isinstance(k, float):
            if not (0 < k < 1):
                raise ValueError('wrong value for val_split: the proportion of validation documents must be in (0,1)')
-           return self.fit_cv(X, y)
+           return self.fit_tr_val(X, y)

    def fit_cv(self, X, y):
        """


@@ -176,7 +176,7 @@ class NeuralClassifierTrainer:
        self.classes_ = train.classes_
        opt = self.trainer_hyperparams
        checkpoint = self.checkpointpath
-       self.reset_net_params(self.vocab_size, train.n_classes)
+       self.reset_net_params(self.vocab_size, train.arange_classes)

        train_generator = TorchDataset(train.instances, train.labels).asDataloader(
            opt['batch_size'], shuffle=True, pad_length=opt['padding_length'], device=opt['device'])


@@ -123,7 +123,7 @@ class LabelledCollection:
            return self.uniform_sampling_index(size, random_state=random_state)
        if len(prevs) == self.n_classes - 1:
            prevs = prevs + (1 - sum(prevs),)
-       assert len(prevs) == self.n_classes, 'unexpected number of prevalences'
+       assert len(prevs) == self.n_classes, f'unexpected number of prevalences (found {len(prevs)}, expected {self.n_classes})'
        assert sum(prevs) == 1, f'prevalences ({prevs}) wrong range (sum={sum(prevs)})'

        # Decide how many instances should be taken for each class in order to satisfy the requested prevalence
@@ -269,6 +269,7 @@ class LabelledCollection:
        test = self.sampling_from_index(right)
        return training, test

    def __add__(self, other):
        """
        Returns a new :class:`LabelledCollection` as the union of this collection with another collection.
@@ -319,7 +320,8 @@ class LabelledCollection:
        else:
            raise NotImplementedError('unsupported operation for collection types')
        labels = np.concatenate([lc.labels for lc in args])
-       classes = np.unique(labels).sort()
+       classes = np.unique(labels)
+       classes.sort()
        return LabelledCollection(instances, labels, classes=classes)

    @property


@@ -1,3 +1,6 @@
+import numpy as np

def warn(*args, **kwargs):
    pass

import warnings
@@ -7,7 +10,8 @@ import zipfile
from os.path import join
import pandas as pd
import scipy
+import pickle
+from ucimlrepo import fetch_ucirepo

from quapy.data.base import Dataset, LabelledCollection
from quapy.data.preprocessing import text2tfidf, reduce_columns
from quapy.data.reader import *
@@ -45,6 +49,12 @@ UCI_DATASETS = ['acute.a', 'acute.b',
                'wine-q-red', 'wine-q-white',
                'yeast']

+UCI_MULTICLASS_DATASETS = ['dry-bean',
+                           'wine-quality',
+                           'academic-success',
+                           'digits',
+                           'letter']

LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B']

_TXA_SAMPLE_SIZE = 250
@@ -204,7 +214,7 @@ def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False
    :return: a :class:`quapy.data.base.Dataset` instance
    """
    data = fetch_UCILabelledCollection(dataset_name, data_home, verbose)
-   return Dataset(*data.split_stratified(1 - test_split, random_state=0))
+   return Dataset(*data.split_stratified(1 - test_split, random_state=0), name=dataset_name)


def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
@@ -548,11 +558,110 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
        data.stats()
    return data

def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
    df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)

+def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
+    """
+    Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`, as used in
+    `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
+    Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
+    Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
+    and
+    `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
+    Dynamic ensemble selection for quantification tasks.
+    Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
+    The datasets do not come with a predefined train-test split (see :meth:`fetch_UCIMulticlassLabelledCollection` for
+    further information on how to use these collections), and so a train-test split is generated at the desired
+    proportion.
+    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`
+
+    :param dataset_name: a dataset name
+    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
+        ~/quapy_data/ directory)
+    :param test_split: proportion of documents to be included in the test set. The rest conforms the training set
+    :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
+    :return: a :class:`quapy.data.base.Dataset` instance
+    """
+    data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose)
+    return Dataset(*data.split_stratified(1 - test_split, random_state=0))
+
+
+def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
+    """
+    Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`, as used in
+    `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
+    Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
+    Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
+    and
+    `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
+    Dynamic ensemble selection for quantification tasks.
+    Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
+    The datasets do not come with a predefined train-test split, and so Pérez-Gállego et al. adopted a 5FCVx2
+    evaluation protocol, meaning that each collection was used to generate two rounds (hence the x2) of 5-fold
+    cross validation. This can be reproduced by using :meth:`quapy.data.base.Dataset.kFCV`, e.g.:
+
+    >>> import quapy as qp
+    >>> collection = qp.datasets.fetch_UCIMulticlassLabelledCollection("dry-bean")
+    >>> for data in qp.data.Dataset.kFCV(collection, nfolds=5, nrepeats=2):
+    >>>     ...
+
+    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`
+
+    :param dataset_name: a dataset name
+    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
+        ~/quapy_data/ directory)
+    :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
+    :return: a :class:`quapy.data.base.LabelledCollection` instance
+    """
+    assert dataset_name in UCI_MULTICLASS_DATASETS, \
+        f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository (multiclass). ' \
+        f'Valid ones are {UCI_MULTICLASS_DATASETS}'
+
+    if data_home is None:
+        data_home = get_quapy_home()
+
+    identifiers = {"dry-bean": 602,
+                   "wine-quality": 186,
+                   "academic-success": 697,
+                   "digits": 80,
+                   "letter": 59}
+
+    full_names = {"dry-bean": "Dry Bean Dataset",
+                  "wine-quality": "Wine Quality",
+                  "academic-success": "Predict students' dropout and academic success",
+                  "digits": "Optical Recognition of Handwritten Digits",
+                  "letter": "Letter Recognition"
+                  }
+
+    identifier = identifiers[dataset_name]
+    fullname = full_names[dataset_name]
+
+    print(f'Loading UCI Multiclass {dataset_name} ({fullname})')
+
+    file = join(data_home, 'uci_multiclass', dataset_name + '.pkl')
+    if os.path.exists(file):
+        with open(file, 'rb') as file:
+            data = pickle.load(file)
+    else:
+        data = fetch_ucirepo(id=identifier)
+        X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze()
+        classes = np.sort(np.unique(y))
+        y = np.searchsorted(classes, y)
+        data = LabelledCollection(X, y)
+        os.makedirs(os.path.dirname(file), exist_ok=True)
+        with open(file, 'wb') as file:
+            pickle.dump(data, file)
+
+    data.stats()
+    return data
+
+
+def _df_replace(df, col, repl={'yes': 1, 'no': 0}, astype=float):
+    df[col] = df[col].apply(lambda x: repl[x]).astype(astype, copy=False)


def fetch_lequa2022(task, data_home=None):
    """
    Loads the official datasets provided for the `LeQua <https://lequa2022.github.io/index>`_ competition.
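A minimal usage sketch of the new multiclass loaders added above (illustrative only; assumes the ucimlrepo package is installed and an internet connection is available on first use):

import quapy as qp

# download (or load from the local cache) a multiclass UCI dataset and split it 70/30
data = qp.datasets.fetch_UCIMulticlassDataset('dry-bean', test_split=0.3)
train, test = data.training, data.test
print(train.n_classes, train.prevalence())

# or work directly with the unsplit collection
collection = qp.datasets.fetch_UCIMulticlassLabelledCollection('letter')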


@@ -1,3 +1,4 @@
+from copy import deepcopy
from typing import Union, Callable, Iterable
import numpy as np
from tqdm import tqdm
@@ -11,7 +12,9 @@ def prediction(
        model: BaseQuantifier,
        protocol: AbstractProtocol,
        aggr_speedup: Union[str, bool] = 'auto',
-       verbose=False):
+       verbose=False,
+       verbose_error=None,
+       n_jobs=1):
    """
    Uses a quantification model to generate predictions for the samples generated via a specific protocol.
    This function is central to all evaluation processes, and is endowed with an optimization to speed-up the
@@ -32,6 +35,11 @@ def prediction(
        in the samples to be generated. Set to True or "auto" (default) for letting QuaPy decide whether it is
        convenient or not. Set to False to deactivate.
    :param verbose: boolean, show or not information in stdout
+   :param verbose_error: an evaluation function used to display intermediate results if verbose=True (default None)
+   :param n_jobs: number of parallel workers. The default is 1, so that, unless explicitly requested, the evaluation
+       is carried out on a single core; that is, this parameter does not inspect the environment variable N_JOBS by
+       default. This is often convenient, since parallelizing the evaluation entails an overhead (cloning the
+       objects across workers) that is frequently not worth the effort.
    :return: a tuple `(true_prevs, estim_prevs)` in which each element in the tuple is an array of shape
        `(n_samples, n_classes)` containing the true, or predicted, prevalence values for each sample
    """
@@ -61,16 +69,44 @@ def prediction(
    if apply_optimization:
        pre_classified = model.classify(protocol.get_labelled_collection().instances)
        protocol_with_predictions = protocol.on_preclassified_instances(pre_classified)
-       return __prediction_helper(model.aggregate, protocol_with_predictions, verbose)
+       return __prediction_helper(model, protocol_with_predictions, True, verbose, verbose_error, n_jobs)
    else:
-       return __prediction_helper(model.quantify, protocol, verbose)
+       return __prediction_helper(model, protocol, False, verbose, verbose_error, n_jobs)


-def __prediction_helper(quantification_fn, protocol: AbstractProtocol, verbose=False):
+def __delayed_prediction(args):
+    quantifier, aggregate, sample_instances, sample_prev = args
+    quantifier = deepcopy(quantifier)
+    quant_fn = quantifier.aggregate if aggregate else quantifier.quantify
+    predicted = quant_fn(sample_instances)
+    return sample_prev, predicted
+
+
+def __prediction_helper(quantifier, protocol: AbstractProtocol, aggregate: bool, verbose=False, verbose_error=None, n_jobs=1):
    true_prevs, estim_prevs = [], []
-   for sample_instances, sample_prev in tqdm(protocol(), total=protocol.total(), desc='predicting') if verbose else protocol():
-       estim_prevs.append(quantification_fn(sample_instances))
-       true_prevs.append(sample_prev)
+   ongoing_errors = []
+   if verbose:
+       pbar = tqdm(protocol(), total=protocol.total(), desc='predicting')
+   if n_jobs == 1:
+       quant_fn = quantifier.aggregate if aggregate else quantifier.quantify
+       for sample_instances, sample_prev in pbar if verbose else protocol():
+           predicted = quant_fn(sample_instances)
+           estim_prevs.append(predicted)
+           true_prevs.append(sample_prev)
+           if verbose and verbose_error is not None:
+               err = verbose_error(sample_prev, predicted)
+               ongoing_errors.append(err)
+               pbar.set_description(f'predicting: ongoing error={np.mean(ongoing_errors):.5f}')
+   else:
+       if verbose:
+           print('parallelizing prediction')
+       outputs = qp.util.parallel(
+           __delayed_prediction,
+           ((quantifier, aggregate, sample_X, sample_p) for (sample_X, sample_p) in (pbar if verbose else protocol())),
+           seed=qp.environ.get('_R_SEED', None),
+           n_jobs=n_jobs
+       )
+       true_prevs, estim_prevs = list(zip(*outputs))

    true_prevs = np.asarray(true_prevs)
    estim_prevs = np.asarray(estim_prevs)
@@ -82,7 +118,7 @@ def evaluation_report(model: BaseQuantifier,
                      protocol: AbstractProtocol,
                      error_metrics: Iterable[Union[str,Callable]] = 'mae',
                      aggr_speedup: Union[str, bool] = 'auto',
-                     verbose=False):
+                     verbose=False, verbose_error=None, n_jobs=1):
    """
    Generates a report (a pandas' DataFrame) containing information of the evaluation of the model as according
    to a specific protocol and in terms of one or more evaluation metrics (errors).
@@ -100,12 +136,19 @@ def evaluation_report(model: BaseQuantifier,
        in the samples to be generated. Set to True or "auto" (default) for letting QuaPy decide whether it is
        convenient or not. Set to False to deactivate.
    :param verbose: boolean, show or not information in stdout
+   :param verbose_error: an evaluation function used to display intermediate results if verbose=True (default None)
+   :param n_jobs: number of parallel workers. The default is 1, so that, unless explicitly requested, the evaluation
+       is carried out on a single core; that is, this parameter does not inspect the environment variable N_JOBS by
+       default. This is often convenient, since parallelizing the evaluation entails an overhead (cloning the
+       objects across workers) that is frequently not worth the effort.
    :return: a pandas' DataFrame containing the columns 'true-prev' (the true prevalence of each sample),
        'estim-prev' (the prevalence estimated by the model for each sample), and as many columns as error metrics
        have been indicated, each displaying the score in terms of that metric for every sample.
    """
-   true_prevs, estim_prevs = prediction(model, protocol, aggr_speedup=aggr_speedup, verbose=verbose)
+   true_prevs, estim_prevs = prediction(
+       model, protocol, aggr_speedup=aggr_speedup, verbose=verbose, verbose_error=verbose_error, n_jobs=n_jobs
+   )
    return _prevalence_report(true_prevs, estim_prevs, error_metrics)
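A usage sketch of the two new evaluation parameters (illustrative only; `quantifier` is any previously fitted quantifier and `test_collection` a LabelledCollection, both assumed to exist):

import quapy as qp
from quapy.protocol import UPP

# single-core evaluation, showing the ongoing MAE in the progress-bar description
report = qp.evaluation.evaluation_report(
    quantifier,
    protocol=UPP(test_collection, repeats=100),
    error_metrics=['mae', 'mrae'],
    verbose=True,
    verbose_error=qp.error.mae   # only displayed in the single-core (n_jobs=1) path
)

# alternatively, spread the evaluation across 4 workers (no ongoing-error display)
report = qp.evaluation.evaluation_report(quantifier, UPP(test_collection, repeats=100), n_jobs=4)
print(report.mean())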


@@ -90,11 +90,32 @@ def TopsoeDistance(P, Q, epsilon=1e-20):
    :param P: real-valued array-like of shape `(k,)` representing a discrete distribution
    :param Q: real-valued array-like of shape `(k,)` representing a discrete distribution
+   :param epsilon: small value to smooth the distributions for numerical stability
    :return: float
    """
    return np.sum(P*np.log((2*P+epsilon)/(P+Q+epsilon)) + Q*np.log((2*Q+epsilon)/(P+Q+epsilon)))
+def CauchySchwarz(P, Q, epsilon=1e-20):
+    """
+    Cauchy-Schwarz divergence between two (discretized) distributions `P` and `Q`.
+    The Cauchy-Schwarz divergence for two discrete distributions of `k` bins is defined as:
+
+    .. math::
+        CS(P,Q) = -\\log \\frac{ \\sum_{i=1}^k p_i q_i }{
+            \\sqrt{ \\left( \\sum_{i=1}^k p^2_i \\right) \\left( \\sum_{i=1}^k q^2_i \\right) } }
+
+    :param P: real-valued array-like of shape `(k,)` representing a discrete distribution
+    :param Q: real-valued array-like of shape `(k,)` representing a discrete distribution
+    :param epsilon: small value to smooth the distributions for numerical stability
+    :return: float
+    """
+    P = P + epsilon
+    Q = Q + epsilon
+    return - np.log(sum(P * Q) / np.sqrt(sum(P ** 2) * sum(Q ** 2)))
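A quick numeric check of the new divergence (not part of the diff; illustrative only): the value is 0 when the two distributions are proportional and grows as they diverge.

import numpy as np

P = np.array([0.5, 0.3, 0.2])
print(CauchySchwarz(P, P))                            # ~0.0 (up to the epsilon smoothing)
print(CauchySchwarz(P, np.array([0.1, 0.1, 0.8])))    # ~0.74 > 0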
def uniform_prevalence_sampling(n_classes, size=1):
    """
    Implements the `Kraemer algorithm <http://www.cs.cmu.edu/~nasmith/papers/smith+tromble.tr04.pdf>`_
@@ -276,3 +297,24 @@ def check_prevalence_vector(p, raise_exception=False, toleranze=1e-08):
        return False
    return True
+def optim_minimize(loss, n_classes):
+    """
+    Searches for the optimal prevalence values, i.e., an `n_classes`-dimensional vector of the (`n_classes`-1)-simplex
+    that yields the smallest loss. This optimization is carried out by means of a constrained search using scipy's
+    SLSQP routine.
+
+    :param loss: (callable) the function to minimize
+    :param n_classes: (int) the number of classes, i.e., the dimensionality of the prevalence vector
+    :return: (ndarray) the best prevalence vector found
+    """
+    from scipy import optimize
+
+    # the initial point is set as the uniform distribution
+    uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
+
+    # solutions are bounded to those contained in the unit-simplex
+    bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
+    constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
+    r = optimize.minimize(loss, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
+    return r.x
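A usage sketch of optim_minimize (not part of the diff; the target vector is made up): minimizing the absolute error towards a known prevalence should recover that prevalence.

import numpy as np
import quapy.functional as F

target = np.array([0.2, 0.5, 0.3])
loss = lambda prev: np.abs(prev - target).mean()
print(F.optim_minimize(loss, n_classes=3))   # approximately [0.2, 0.5, 0.3]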


@@ -604,10 +604,17 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):

def _get_divergence(divergence: Union[str, Callable]):
    if isinstance(divergence, str):
-       if divergence=='HD':
+       divergence = divergence.lower()
+       if divergence=='hd':
            return F.HellingerDistance
        elif divergence=='topsoe':
            return F.TopsoeDistance
+       elif divergence.lower()=='cs':
+           return F.CauchySchwarz
+       elif divergence.lower()=='l2':
+           return lambda a, b: np.linalg.norm(a-b)
+       elif divergence.lower()=='l1':
+           return lambda a, b: np.linalg.norm(a-b, ord=1)
        else:
            raise ValueError(f'unknown divergence {divergence}')
    elif callable(divergence):
@@ -770,7 +777,9 @@ class DistributionMatching(AggregativeProbabilisticQuantifier):
        """
        Trains the classifier (if requested) and generates the validation distributions out of the training data.
        The validation distributions have shape `(n, ch, nbins)`, with `n` the number of classes, `ch` the number of
-       channels, and `nbins` the number of bins. In particular, let `V` be the validation distributions; `di=V[i]`
+       channels (a channel is a description, in form of a histogram, of a specific class -- there are as many channels
+       as classes, although in the binary case one can use only one channel, since the other one is constrained),
+       and `nbins` the number of bins. In particular, let `V` be the validation distributions; `di=V[i]`
        are the distributions obtained from training data labelled with class `i`; `dij = di[j]` is the discrete
        distribution of posterior probabilities `P(Y=j|X=x)` for training data labelled with class `i`, and `dij[k]`
        is the fraction of instances with a value in the `k`-th bin.
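An illustrative numpy sketch (not part of the diff; random data) of how the `(n, ch, nbins)` tensor is used at prediction time: the class-conditional channel histograms are mixed according to a candidate prevalence and compared against the test distribution.

import numpy as np

n_classes, nbins = 3, 4
V = np.random.dirichlet(np.ones(nbins), size=(n_classes, n_classes))   # shape (n, ch, nbins)
prev = np.array([0.2, 0.5, 0.3])                                       # candidate prevalence

# mixture of the class-conditional histograms under `prev`
mixture = np.tensordot(prev, V, axes=1)                                 # shape (ch, nbins)

test_dist = np.random.dirichlet(np.ones(nbins), size=n_classes)        # shape (ch, nbins)
divergence = np.abs(mixture - test_dist).sum()                          # any divergence over the channels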
@@ -819,7 +828,7 @@ class DistributionMatching(AggregativeProbabilisticQuantifier):
        uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))

        # solutions are bounded to those contained in the unit-simplex
-       bounds = tuple((0, 1) for x in range(n_classes))  # values in [0,1]
+       bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
        r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
        return r.x
@@ -1085,7 +1094,7 @@ class T50(ThresholdOptimization):
    Threshold Optimization variant for :class:`ACC` as proposed by
    `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
    `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_ that looks
-   for the threshold that makes `tpr` cosest to 0.5.
+   for the threshold that makes `tpr` closest to 0.5.
    The goal is to bring improved stability to the denominator of the adjustment.

    :param classifier: a sklearn's Estimator that generates a classifier
@@ -1173,7 +1182,7 @@ class MS(ThresholdOptimization):
        super().__init__(classifier, val_split)

    def _condition(self, tpr, fpr) -> float:
-       pass
+       return True

    def _optimize_threshold(self, y, probabilities):
        tprs = []
@@ -1184,9 +1193,26 @@ class MS(ThresholdOptimization):
            TP, FP, FN, TN = self._compute_table(y, y_)
            tpr = self._compute_tpr(TP, FP)
            fpr = self._compute_fpr(FP, TN)
-           tprs.append(tpr)
-           fprs.append(fpr)
-       return np.median(tprs), np.median(fprs)
+           if self._condition(tpr, fpr):
+               tprs.append(tpr)
+               fprs.append(fpr)
+       return tprs, fprs
+
+   def aggregate(self, classif_predictions):
+       prevs_estim = self.cc.aggregate(classif_predictions)
+       positive_prevs = []
+       for tpr, fpr in zip(self.tpr, self.fpr):
+           if tpr - fpr > 0:
+               acc = np.clip((prevs_estim[1] - fpr) / (tpr - fpr), 0, 1)
+               positive_prevs.append(acc)
+       if len(positive_prevs) > 0:
+           adjusted_positive_prev = np.median(positive_prevs)
+           adjusted_prevs_estim = np.array((1 - adjusted_positive_prev, adjusted_positive_prev))
+           return adjusted_prevs_estim
+       else:
+           return prevs_estim
class MS2(MS):
@@ -1209,19 +1235,9 @@ class MS2(MS):
    def __init__(self, classifier: BaseEstimator, val_split=0.4):
        super().__init__(classifier, val_split)

-   def _optimize_threshold(self, y, probabilities):
-       tprs = [0, 1]
-       fprs = [0, 1]
-       candidate_thresholds = np.unique(probabilities[:, 1])
-       for candidate_threshold in candidate_thresholds:
-           y_ = [self.classes_[1] if p > candidate_threshold else self.classes_[0] for p in probabilities[:, 1]]
-           TP, FP, FN, TN = self._compute_table(y, y_)
-           tpr = self._compute_tpr(TP, FP)
-           fpr = self._compute_fpr(FP, TN)
-           if (tpr - fpr) > 0.25:
-               tprs.append(tpr)
-               fprs.append(fpr)
-       return np.median(tprs), np.median(fprs)
+   def _condition(self, tpr, fpr) -> float:
+       return (tpr - fpr) > 0.25


ClassifyAndCount = CC
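A minimal usage sketch of the reworked median sweep (illustrative only; assumes a binary LabelledCollection split into `train` and `test`):

from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import MS2

ms2 = MS2(LogisticRegression(), val_split=0.4)
ms2.fit(train)
prev = ms2.quantify(test.instances)   # e.g., array([0.63, 0.37])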


@@ -32,13 +32,13 @@ class QuaNetTrainer(BaseQuantifier):
    >>> qp.domains.preprocessing.index(dataset, min_df=5, inplace=True)
    >>>
    >>> # the text classifier is a CNN trained by NeuralClassifierTrainer
-   >>> cnn = CNNnet(dataset.vocabulary_size, dataset.n_classes)
+   >>> cnn = CNNnet(dataset.vocabulary_size, dataset.arange_classes)
    >>> classifier = NeuralClassifierTrainer(cnn, device='cuda')
    >>>
    >>> # train QuaNet (QuaNet is an alias to QuaNetTrainer)
    >>> model = QuaNet(classifier, qp.environ['SAMPLE_SIZE'], device='cuda')
    >>> model.fit(dataset.training)
-   >>> estim_prevalence = model.quantify(dataset.test.instances)
+   >>> estim_prevalence = model.quantify(dataset.mixture.instances)

    :param classifier: an object implementing `fit` (i.e., that can be trained on labelled data),
        `predict_proba` (i.e., that can generate posterior probabilities of unlabelled examples) and


@@ -54,6 +54,7 @@ class GridSearchQ(BaseQuantifier):
        self.__check_error(error)
        assert isinstance(protocol, AbstractProtocol), 'unknown protocol'

    def _sout(self, msg):
        if self.verbose:
            print(f'[{self.__class__.__name__}:{self.model.__class__.__name__}]: {msg}')
@@ -88,7 +89,7 @@ class GridSearchQ(BaseQuantifier):
        hyper = [dict({k: val[i] for i, k in enumerate(params_keys)}) for val in itertools.product(*params_values)]
        self._sout(f'starting model selection with {self.n_jobs =}')
-       #pass a seed to parallel so it is set in clild processes
+       # pass a seed to parallel so it is set in child processes
        scores = qp.util.parallel(
            self._delayed_eval,
            ((params, training) for params in hyper),
@@ -223,7 +224,7 @@ def cross_val_predict(quantifier: BaseQuantifier, data: LabelledCollection, nfol
    for train, test in data.kFCV(nfolds=nfolds, random_state=random_state):
        quantifier.fit(train)
        fold_prev = quantifier.quantify(test.X)
-       rel_size = len(test.X)/len(data)
+       rel_size = len(test)/len(data)
        total_prev += fold_prev*rel_size

    return total_prev


@@ -11,7 +11,7 @@ import quapy as qp

plt.rcParams['figure.figsize'] = [10, 6]
plt.rcParams['figure.dpi'] = 200
-plt.rcParams['font.size'] = 18
+plt.rcParams['font.size'] = 12


def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=None, show_std=True, legend=True,
@@ -259,7 +259,7 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
    data = _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order)

    if method_order is None:
-       method_order = method_names
+       method_order = np.unique(method_names)

    _set_colors(ax, n_methods=len(method_order))

@@ -330,8 +330,8 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
    if show_legend:
        fig.legend(loc='lower center',
-                  bbox_to_anchor=(1, 0.5),
-                  ncol=(len(method_names)+1)//2)
+                  bbox_to_anchor=(0.9, 0.2),
+                  ncol=1)

    _save_or_show(savepath)