trying optimizing both prev and band at the same time, and per-class bandwidth

This commit is contained in:
Alejandro Moreo Fernandez 2024-09-22 22:47:07 +02:00
parent 9fb208fe4c
commit 5f9dad4644
10 changed files with 430 additions and 369 deletions

3
.gitmodules vendored Normal file
View File

@ -0,0 +1,3 @@
[submodule "result_table"]
path = result_table
url = gitea@gitea-s2i2s.isti.cnr.it:moreo/result_table.git

2
KDEy/constants.py Normal file
View File

@ -0,0 +1,2 @@
DEBUG = False

View File

@ -5,13 +5,14 @@ import numpy as np
from sklearn.linear_model import LogisticRegression
from os.path import join
import quapy as qp
from quapy.method.aggregative import KDEyML
from quapy.protocol import UPP
from kdey_devel import KDEyML
from utils import measuretime
from kdey_devel import KDEyMLauto
from utils import *
from constants import *
import quapy.functional as F
DEBUG = True
qp.environ["SAMPLE_SIZE"] = 100 if DEBUG else 500
val_repeats = 100 if DEBUG else 500
test_repeats = 100 if DEBUG else 500
@ -27,7 +28,7 @@ if DEBUG:
def datasets():
dataset_list = qp.datasets.UCI_MULTICLASS_DATASETS
dataset_list = qp.datasets.UCI_MULTICLASS_DATASETS[:4]
if DEBUG:
dataset_list = dataset_list[:4]
for dataset_name in dataset_list:
@ -58,11 +59,21 @@ def predict_b_modsel(dataset):
# kdey.qua
return modsel_choice
@measuretime
def predict_b_kdeymlauto(dataset):
# bandwidth chosen during model selection in validation
train, test = dataset.train_test
kdey = KDEyMLauto(random_state=0)
print(f'true-prevalence: {F.strprev(test.prevalence())}')
chosen_bandwidth, _ = kdey.chose_bandwidth(train, test.X)
auto_bandwidth = float(chosen_bandwidth)
return auto_bandwidth
def in_test_search(dataset, n_jobs=-1):
train, test = dataset.train_test
print(f"testing KDEy in {dataset.name}")
print(f"generating true tests scores using KDEy in {dataset.name}")
def experiment_job(bandwidth):
kdey = KDEyML(bandwidth=bandwidth, random_state=0)
@ -76,50 +87,8 @@ def in_test_search(dataset, n_jobs=-1):
return dataset_results, bandwidth_range
def plot_bandwidth(dataset_name, test_results, bandwidths, triplet_list_results):
import matplotlib.pyplot as plt
print("PLOT", dataset_name)
print(dataset_name)
plt.figure(figsize=(8, 6))
# show test results
plt.plot(bandwidths, test_results, marker='o')
for (method_name, method_choice, method_time) in triplet_list_results:
plt.axvline(x=method_choice, linestyle='--', label=method_name)
# Agregar etiquetas y título
plt.xlabel('Bandwidth')
plt.ylabel('MAE')
plt.title(dataset_name)
# Mostrar la leyenda
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
# Mostrar la gráfica
plt.grid(True)
plotdir = './plots'
if DEBUG:
plotdir = './plots_debug'
os.makedirs(plotdir, exist_ok=True)
plt.tight_layout()
plt.savefig(f'{plotdir}/{dataset_name}.png')
plt.close()
def error_table(dataset_name, test_results, bandwidth_range, triplet_list_results):
best_bandwidth = bandwidth_range[np.argmin(test_results)]
print(f'Method\tChoice\tAE\tTime')
for method_name, method_choice, took in triplet_list_results:
if method_choice in bandwidth_range:
index = np.where(bandwidth_range == method_choice)[0][0]
method_score = test_results[index]
else:
method_score = 1
error = np.abs(best_bandwidth-method_score)
print(f'{method_name}\t{method_choice}\t{error}\t{took:.3}s')
for dataset in datasets():
@ -138,6 +107,8 @@ for dataset in datasets():
triplet_list_results = []
modsel_choice, modsel_time = qp.util.pickled_resource(join(result_path, 'modsel.pkl'), predict_b_modsel, dataset)
triplet_list_results.append(('modsel', modsel_choice, modsel_time,))
auto_choice, auto_time = qp.util.pickled_resource(join(result_path, 'auto.pkl'), predict_b_kdeymlauto, dataset)
triplet_list_results.append(('auto', auto_choice, auto_time,))
print(f'Dataset = {dataset.name}')
print(modsel_choice)

View File

@ -1,358 +1,171 @@
from typing import Union
from typing import Union, Callable
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.neighbors import KernelDensity
import quapy as qp
from quapy.data import LabelledCollection
from quapy.method.aggregative import AggregativeSoftQuantifier
from quapy.method.aggregative import AggregativeSoftQuantifier, KDEyML
import quapy.functional as F
from sklearn.metrics.pairwise import rbf_kernel
from scipy import optimize
class KDEBase:
"""
Common ancestor for KDE-based methods. Implements some common routines.
"""
BANDWIDTH_METHOD = ['scott', 'silverman']
@classmethod
def _check_bandwidth(cls, bandwidth):
"""
Checks that the bandwidth parameter is correct
:param bandwidth: either a string (see BANDWIDTH_METHOD) or a float
:return: the bandwidth if the check is passed, or raises an exception for invalid values
"""
assert bandwidth in KDEBase.BANDWIDTH_METHOD or isinstance(bandwidth, float), \
f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values'
if isinstance(bandwidth, float):
assert 0 < bandwidth < 1, \
"the bandwith for KDEy should be in (0,1), since this method models the unit simplex"
return bandwidth
def get_kde_function(self, X, bandwidth):
"""
Wraps the KDE function from scikit-learn.
:param X: data for which the density function is to be estimated
:param bandwidth: the bandwidth of the kernel
:return: a scikit-learn's KernelDensity object
"""
return KernelDensity(bandwidth=bandwidth).fit(X)
def pdf(self, kde, X):
"""
Wraps the density evalution of scikit-learn's KDE. Scikit-learn returns log-scores (s), so this
function returns :math:`e^{s}`
:param kde: a previously fit KDE function
:param X: the data for which the density is to be estimated
:return: np.ndarray with the densities
"""
return np.exp(kde.score_samples(X))
def get_mixture_components(self, X, y, classes, bandwidth):
"""
Returns an array containing the mixture components, i.e., the KDE functions for each class.
:param X: the data containing the covariates
:param y: the class labels
:param n_classes: integer, the number of classes
:param bandwidth: float, the bandwidth of the kernel
:return: a list of KernelDensity objects, each fitted with the corresponding class-specific covariates
"""
class_cond_X = []
for cat in classes:
selX = X[y == cat]
if selX.size == 0:
selX = [F.uniform_prevalence(len(classes))]
class_cond_X.append(selX)
return [self.get_kde_function(X_cond_yi, bandwidth) for X_cond_yi in class_cond_X]
class KDEyML(AggregativeSoftQuantifier, KDEBase):
"""
Kernel Density Estimation model for quantification (KDEy) relying on the Kullback-Leibler divergence (KLD) as
the divergence measure to be minimized. This method was first proposed in the paper
`Kernel Density Estimation for Multiclass Quantification <https://arxiv.org/abs/2401.00490>`_, in which
the authors show that minimizing the distribution mathing criterion for KLD is akin to performing
maximum likelihood (ML).
The distribution matching optimization problem comes down to solving:
:math:`\\hat{\\alpha} = \\arg\\min_{\\alpha\\in\\Delta^{n-1}} \\mathcal{D}(\\boldsymbol{p}_{\\alpha}||q_{\\widetilde{U}})`
where :math:`p_{\\alpha}` is the mixture of class-specific KDEs with mixture parameter (hence class prevalence)
:math:`\\alpha` defined by
:math:`\\boldsymbol{p}_{\\alpha}(\\widetilde{x}) = \\sum_{i=1}^n \\alpha_i p_{\\widetilde{L}_i}(\\widetilde{x})`
where :math:`p_X(\\boldsymbol{x}) = \\frac{1}{|X|} \\sum_{x_i\\in X} K\\left(\\frac{x-x_i}{h}\\right)` is the
KDE function that uses the datapoints in X as the kernel centers.
In KDEy-ML, the divergence is taken to be the Kullback-Leibler Divergence. This is equivalent to solving:
:math:`\\hat{\\alpha} = \\arg\\min_{\\alpha\\in\\Delta^{n-1}} -
\\mathbb{E}_{q_{\\widetilde{U}}} \\left[ \\log \\boldsymbol{p}_{\\alpha}(\\widetilde{x}) \\right]`
which corresponds to the maximum likelihood estimate.
:param classifier: a sklearn's Estimator that generates a binary classifier.
:param val_split: specifies the data used for generating classifier predictions. This specification
can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to
be extracted from the training set; or as an integer (default 5), indicating that the predictions
are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
for `k`); or as a collection defining the specific set of data to use for validation.
Alternatively, this set can be specified at fit time by indicating the exact set of data
on which the predictions are to be generated.
:param bandwidth: float, the bandwidth of the Kernel
:param random_state: a seed to be set before fitting any base quantifier (default None)
"""
def __init__(self, classifier: BaseEstimator = None, val_split=5, bandwidth=0.1, random_state=None):
class KDEyMLauto(KDEyML):
def __init__(self, classifier: BaseEstimator = None, val_split=5, random_state=None, optim='two_steps'):
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.bandwidth = KDEBase._check_bandwidth(bandwidth)
self.bandwidth = None
self.random_state = random_state
self.optim = optim
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth)
return self
def chose_bandwidth(self, train, test_instances):
classif_predictions = self.classifier_fit_predict(train, fit_classifier=True, predict_on=self.val_split)
te_posteriors = self.classify(test_instances)
return self.transduce(classif_predictions, te_posteriors)
def aggregate(self, posteriors: np.ndarray):
"""
Searches for the mixture model parameter (the sought prevalence values) that maximizes the likelihood
of the data (i.e., that minimizes the negative log-likelihood)
def transduce(self, classif_predictions, te_posteriors):
tr_posteriors, tr_y = classif_predictions.Xy
classes = classif_predictions.classes_
n_classes = len(classes)
:param posteriors: instances in the sample converted into posterior probabilities
:return: a vector of class prevalence estimates
"""
current_bandwidth = 0.05
if self.optim == 'both_fine':
current_bandwidth = np.full(fill_value=current_bandwidth, shape=(n_classes,))
current_prevalence = np.full(fill_value=1 / n_classes, shape=(n_classes,))
iterations = 0
convergence = False
with qp.util.temp_seed(self.random_state):
epsilon = 1e-10
n_classes = len(self.mix_densities)
test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities]
def neg_loglikelihood(prev):
test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
return -np.sum(test_loglikelihood)
while not convergence:
previous_bandwidth = current_bandwidth
previous_prevalence = current_prevalence
return F.optim_minimize(neg_loglikelihood, n_classes)
iterations += 1
print(f'{iterations}:')
if self.optim == 'two_steps':
current_prevalence = self.optim_minimize_prevalence(current_bandwidth, current_prevalence, tr_posteriors, tr_y, te_posteriors, classes)
print(f'\testim-prev={F.strprev(current_prevalence)}')
class KDEyHD(AggregativeSoftQuantifier, KDEBase):
"""
Kernel Density Estimation model for quantification (KDEy) relying on the squared Hellinger Disntace (HD) as
the divergence measure to be minimized. This method was first proposed in the paper
`Kernel Density Estimation for Multiclass Quantification <https://arxiv.org/abs/2401.00490>`_, in which
the authors proposed a Monte Carlo approach for minimizing the divergence.
current_bandwidth = self.optim_minimize_bandwidth(current_bandwidth, current_prevalence, tr_posteriors, tr_y, te_posteriors, classes)
print(f'\tbandwidth={current_bandwidth}')
if np.isclose(previous_bandwidth, current_bandwidth, atol=0.0001) and all(
np.isclose(previous_prevalence, current_prevalence, atol=0.0001)):
convergence = True
elif self.optim == 'both':
current_prevalence, current_bandwidth = self.optim_minimize_both(current_bandwidth, current_prevalence, tr_posteriors, tr_y, te_posteriors, classes)
if np.isclose(previous_bandwidth, current_bandwidth, atol=0.0001) and all(np.isclose(previous_prevalence, current_prevalence, atol=0.0001)):
convergence = True
elif self.optim == 'both_fine':
current_prevalence, current_bandwidth = self.optim_minimize_both_fine(current_bandwidth, current_prevalence, tr_posteriors, tr_y,
te_posteriors, classes)
The distribution matching optimization problem comes down to solving:
if all(np.isclose(previous_bandwidth, current_bandwidth, atol=0.0001)) and all(np.isclose(previous_prevalence, current_prevalence, atol=0.0001)):
convergence = True
:math:`\\hat{\\alpha} = \\arg\\min_{\\alpha\\in\\Delta^{n-1}} \\mathcal{D}(\\boldsymbol{p}_{\\alpha}||q_{\\widetilde{U}})`
where :math:`p_{\\alpha}` is the mixture of class-specific KDEs with mixture parameter (hence class prevalence)
:math:`\\alpha` defined by
:math:`\\boldsymbol{p}_{\\alpha}(\\widetilde{x}) = \\sum_{i=1}^n \\alpha_i p_{\\widetilde{L}_i}(\\widetilde{x})`
where :math:`p_X(\\boldsymbol{x}) = \\frac{1}{|X|} \\sum_{x_i\\in X} K\\left(\\frac{x-x_i}{h}\\right)` is the
KDE function that uses the datapoints in X as the kernel centers.
In KDEy-HD, the divergence is taken to be the squared Hellinger Distance, an f-divergence with corresponding
f-generator function given by:
:math:`f(u)=(\\sqrt{u}-1)^2`
The authors proposed a Monte Carlo solution that relies on importance sampling:
:math:`\\hat{D}_f(p||q)= \\frac{1}{t} \\sum_{i=1}^t f\\left(\\frac{p(x_i)}{q(x_i)}\\right) \\frac{q(x_i)}{r(x_i)}`
where the datapoints (trials) :math:`x_1,\\ldots,x_t\\sim_{\\mathrm{iid}} r` with :math:`r` the
uniform distribution.
:param classifier: a sklearn's Estimator that generates a binary classifier.
:param val_split: specifies the data used for generating classifier predictions. This specification
can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to
be extracted from the training set; or as an integer (default 5), indicating that the predictions
are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
for `k`); or as a collection defining the specific set of data to use for validation.
Alternatively, this set can be specified at fit time by indicating the exact set of data
on which the predictions are to be generated.
:param bandwidth: float, the bandwidth of the Kernel
:param random_state: a seed to be set before fitting any base quantifier (default None)
:param montecarlo_trials: number of Monte Carlo trials (default 10000)
"""
def __init__(self, classifier: BaseEstimator = None, val_split=5, divergence: str = 'HD',
bandwidth=0.1, random_state=None, montecarlo_trials=10000):
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.divergence = divergence
self.bandwidth = KDEBase._check_bandwidth(bandwidth)
self.random_state = random_state
self.montecarlo_trials = montecarlo_trials
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth)
N = self.montecarlo_trials
rs = self.random_state
n = data.n_classes
self.reference_samples = np.vstack([kde_i.sample(N // n, random_state=rs) for kde_i in self.mix_densities])
self.reference_classwise_densities = np.asarray(
[self.pdf(kde_j, self.reference_samples) for kde_j in self.mix_densities])
self.reference_density = np.mean(self.reference_classwise_densities,
axis=0) # equiv. to (uniform @ self.reference_classwise_densities)
return self
def aggregate(self, posteriors: np.ndarray):
# we retain all n*N examples (sampled from a mixture with uniform parameter), and then
# apply importance sampling (IS). In this version we compute D(p_alpha||q) with IS
n_classes = len(self.mix_densities)
test_kde = self.get_kde_function(posteriors, self.bandwidth)
test_densities = self.pdf(test_kde, self.reference_samples)
def f_squared_hellinger(u):
return (np.sqrt(u) - 1) ** 2
# todo: this will fail when self.divergence is a callable, and is not the right place to do it anyway
if self.divergence.lower() == 'hd':
f = f_squared_hellinger
else:
raise ValueError('only squared HD is currently implemented')
self.bandwidth = current_bandwidth
print('bandwidth=', current_bandwidth)
print('prevalence=', current_prevalence)
return current_prevalence
def optim_minimize_prevalence(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes):
epsilon = 1e-10
qs = test_densities + epsilon
rs = self.reference_density + epsilon
iw = qs / rs # importance weights
p_class = self.reference_classwise_densities + epsilon
fracs = p_class / qs
mix_densities = self.get_mixture_components(tr_posteriors, tr_y, classes, current_bandwidth)
test_densities = [self.pdf(kde_i, te_posteriors) for kde_i in mix_densities]
def divergence(prev):
# ps / qs = (prev @ p_class) / qs = prev @ (p_class / qs) = prev @ fracs
ps_div_qs = prev @ fracs
return np.mean(f(ps_div_qs) * iw)
def neg_loglikelihood_prev(prev):
test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
return -np.sum(test_loglikelihood)
return F.optim_minimize(divergence, n_classes)
return optim_minimize(neg_loglikelihood_prev, current_prev)
def optim_minimize_bandwidth(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes):
epsilon = 1e-10
def neg_loglikelihood_bandwidth(bandwidth):
mix_densities = self.get_mixture_components(tr_posteriors, tr_y, classes, bandwidth[0])
test_densities = [self.pdf(kde_i, te_posteriors) for kde_i in mix_densities]
test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(current_prev, test_densities))
test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
return -np.sum(test_loglikelihood)
class KDEyCS(AggregativeSoftQuantifier):
"""
Kernel Density Estimation model for quantification (KDEy) relying on the Cauchy-Schwarz divergence (CS) as
the divergence measure to be minimized. This method was first proposed in the paper
`Kernel Density Estimation for Multiclass Quantification <https://arxiv.org/abs/2401.00490>`_, in which
the authors proposed a Monte Carlo approach for minimizing the divergence.
bounds = [(0.00001, 1)]
r = optimize.minimize(neg_loglikelihood_bandwidth, x0=[current_bandwidth], method='SLSQP', bounds=bounds)
print(f'iterations-bandwidth={r.nit}')
return r.x[0]
The distribution matching optimization problem comes down to solving:
def optim_minimize_both(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes):
epsilon = 1e-10
n_classes = len(current_prev)
def neg_loglikelihood_bandwidth(prevalence_bandwidth):
bandwidth = prevalence_bandwidth[-1]
prevalence = prevalence_bandwidth[:-1]
mix_densities = self.get_mixture_components(tr_posteriors, tr_y, classes, bandwidth)
test_densities = [self.pdf(kde_i, te_posteriors) for kde_i in mix_densities]
test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prevalence, test_densities))
test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
return -np.sum(test_loglikelihood)
:math:`\\hat{\\alpha} = \\arg\\min_{\\alpha\\in\\Delta^{n-1}} \\mathcal{D}(\\boldsymbol{p}_{\\alpha}||q_{\\widetilde{U}})`
bounds = [(0, 1) for _ in range(n_classes)] + [(0.00001, 1)]
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x[:n_classes])})
prevalence_bandwidth = np.append(current_prev, current_bandwidth)
r = optimize.minimize(neg_loglikelihood_bandwidth, x0=prevalence_bandwidth, method='SLSQP', bounds=bounds, constraints=constraints)
print(f'iterations-both={r.nit}')
prev_band = r.x
current_prevalence = prev_band[:-1]
current_bandwidth = prev_band[-1]
return current_prevalence, current_bandwidth
where :math:`p_{\\alpha}` is the mixture of class-specific KDEs with mixture parameter (hence class prevalence)
:math:`\\alpha` defined by
def optim_minimize_both_fine(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes):
epsilon = 1e-10
n_classes = len(current_bandwidth)
def neg_loglikelihood_bandwidth(prevalence_bandwidth):
prevalence = prevalence_bandwidth[:n_classes]
bandwidth = prevalence_bandwidth[n_classes:]
mix_densities = self.get_mixture_components(tr_posteriors, tr_y, classes, bandwidth)
test_densities = [self.pdf(kde_i, te_posteriors) for kde_i in mix_densities]
test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prevalence, test_densities))
test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
return -np.sum(test_loglikelihood)
:math:`\\boldsymbol{p}_{\\alpha}(\\widetilde{x}) = \\sum_{i=1}^n \\alpha_i p_{\\widetilde{L}_i}(\\widetilde{x})`
where :math:`p_X(\\boldsymbol{x}) = \\frac{1}{|X|} \\sum_{x_i\\in X} K\\left(\\frac{x-x_i}{h}\\right)` is the
KDE function that uses the datapoints in X as the kernel centers.
In KDEy-CS, the divergence is taken to be the Cauchy-Schwarz divergence given by:
:math:`\\mathcal{D}_{\\mathrm{CS}}(p||q)=-\\log\\left(\\frac{\\int p(x)q(x)dx}{\\sqrt{\\int p(x)^2dx \\int q(x)^2dx}}\\right)`
The authors showed that this distribution matching admits a closed-form solution
:param classifier: a sklearn's Estimator that generates a binary classifier.
:param val_split: specifies the data used for generating classifier predictions. This specification
can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to
be extracted from the training set; or as an integer (default 5), indicating that the predictions
are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
for `k`); or as a collection defining the specific set of data to use for validation.
Alternatively, this set can be specified at fit time by indicating the exact set of data
on which the predictions are to be generated.
:param bandwidth: float, the bandwidth of the Kernel
"""
def __init__(self, classifier: BaseEstimator = None, val_split=5, bandwidth=0.1):
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.bandwidth = KDEBase._check_bandwidth(bandwidth)
def gram_matrix_mix_sum(self, X, Y=None):
# this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
# to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y and Sigma1 and Sigma2 are
# two "scalar matrices" (h^2)*I each, so Sigma1+Sigma2 has scalar 2(h^2) (h is the bandwidth)
h = self.bandwidth
variance = 2 * (h ** 2)
nD = X.shape[1]
gamma = 1 / (2 * variance)
norm_factor = 1 / np.sqrt(((2 * np.pi) ** nD) * (variance ** (nD)))
gram = norm_factor * rbf_kernel(X, Y, gamma=gamma)
return gram.sum()
bounds = [(0, 1) for _ in range(n_classes)] + [(0.00001, 1) for _ in range(n_classes)]
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x[:n_classes])})
prevalence_bandwidth = np.concatenate((current_prev, current_bandwidth))
r = optimize.minimize(neg_loglikelihood_bandwidth, x0=prevalence_bandwidth, method='SLSQP', bounds=bounds, constraints=constraints)
print(f'iterations-both-fine={r.nit}')
prev_band = r.x
current_prevalence = prev_band[:n_classes]
current_bandwidth = prev_band[n_classes:]
return current_prevalence, current_bandwidth
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
P, y = classif_predictions.Xy
n = data.n_classes
assert all(sorted(np.unique(y)) == np.arange(n)), \
'label name gaps not allowed in current implementation'
# counts_inv keeps track of the relative weight of each datapoint within its class
# (i.e., the weight in its KDE model)
counts_inv = 1 / (data.counts())
# tr_tr_sums corresponds to symbol \overline{B} in the paper
tr_tr_sums = np.zeros(shape=(n, n), dtype=float)
for i in range(n):
for j in range(n):
if i > j:
tr_tr_sums[i, j] = tr_tr_sums[j, i]
else:
block = self.gram_matrix_mix_sum(P[y == i], P[y == j] if i != j else None)
tr_tr_sums[i, j] = block
# keep track of these data structures for the test phase
self.Ptr = P
self.ytr = y
self.tr_tr_sums = tr_tr_sums
self.counts_inv = counts_inv
self.classif_predictions = classif_predictions
return self
def aggregate(self, posteriors: np.ndarray):
Ptr = self.Ptr
Pte = posteriors
y = self.ytr
tr_tr_sums = self.tr_tr_sums
return self.transduce(self.classif_predictions, posteriors)
M, nD = Pte.shape
Minv = (1 / M) # t in the paper
n = Ptr.shape[1]
# becomes a constant that does not affect the optimization, no need to compute it
# partC = 0.5*np.log(self.gram_matrix_mix_sum(Pte) * Kinv * Kinv)
def optim_minimize(loss: Callable, init_prev: np.ndarray):
"""
Searches for the optimal prevalence values, i.e., an `n_classes`-dimensional vector of the (`n_classes`-1)-simplex
that yields the smallest lost. This optimization is carried out by means of a constrained search using scipy's
SLSQP routine.
# tr_te_sums corresponds to \overline{a}*(1/Li)*(1/M) in the paper (note the constants
# are already aggregated to tr_te_sums, so these multiplications are not carried out
# at each iteration of the optimization phase)
tr_te_sums = np.zeros(shape=n, dtype=float)
for i in range(n):
tr_te_sums[i] = self.gram_matrix_mix_sum(Ptr[y == i], Pte)
:param loss: (callable) the function to minimize
:return: (ndarray) the best prevalence vector found
"""
def divergence(alpha):
# called \overline{r} in the paper
alpha_ratio = alpha * self.counts_inv
# recall that tr_te_sums already accounts for the constant terms (1/Li)*(1/M)
partA = -np.log((alpha_ratio @ tr_te_sums) * Minv)
partB = 0.5 * np.log(alpha_ratio @ tr_tr_sums @ alpha_ratio)
return partA + partB # + partC
return F.optim_minimize(divergence, n)
n_classes = len(init_prev)
# solutions are bounded to those contained in the unit-simplex
bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1]
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
r = optimize.minimize(loss, x0=init_prev, method='SLSQP', bounds=bounds, constraints=constraints)
print(f'iterations-prevalence={r.nit}')
return r.x

View File

@ -0,0 +1,156 @@
import pickle
import os
from time import time
from collections import defaultdict
import numpy as np
from sklearn.linear_model import LogisticRegression
import quapy as qp
from KDEy.kdey_devel import KDEyMLauto
from quapy.method.aggregative import PACC, EMQ, KDEyML
from quapy.model_selection import GridSearchQ
from quapy.protocol import UPP
from pathlib import Path
SEED = 1
def newLR():
return LogisticRegression(max_iter=3000)
# typical hyperparameters explored for Logistic Regression
logreg_grid = {
'C': [1],
'class_weight': [None]
}
def wrap_hyper(classifier_hyper_grid: dict):
return {'classifier__' + k: v for k, v in classifier_hyper_grid.items()}
METHODS = [
('PACC', PACC(newLR()), wrap_hyper(logreg_grid)),
('EMQ', EMQ(newLR()), wrap_hyper(logreg_grid)),
('KDEy-ML', KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.linspace(0.01, 0.2, 20)}}),
]
"""
TKDEyML era primero bandwidth (init 0.05) y luego prevalence (init uniform)
TKDEyML2 era primero prevalence (init uniform) y luego bandwidth (init 0.05)
TKDEyML3 era primero prevalence (init uniform) y luego bandwidth (init 0.1)
TKDEyML4 es como ML2 pero max 5 iteraciones por optimización
"""
TRANSDUCTIVE_METHODS = [
#('TKDEy-ML', KDEyMLauto(newLR()), None),
('TKDEy-MLboth', KDEyMLauto(newLR(), optim='both'), None),
('TKDEy-MLbothfine', KDEyMLauto(newLR(), optim='both_fine'), None),
('TKDEy-ML2', KDEyMLauto(newLR()), None),
#('TKDEy-ML3', KDEyMLauto(newLR()), None),
#('TKDEy-ML4', KDEyMLauto(newLR()), None),
]
def show_results(result_path):
import pandas as pd
df = pd.read_csv(result_path + '.csv', sep='\t')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE", "t_train"], margins=True)
print(pv)
def load_timings(result_path):
import pandas as pd
timings = defaultdict(lambda: {})
if not Path(result_path + '.csv').exists():
return timings
df = pd.read_csv(result_path + '.csv', sep='\t')
return timings | df.pivot_table(index='Dataset', columns='Method', values='t_train').to_dict()
if __name__ == '__main__':
qp.environ['SAMPLE_SIZE'] = 500
qp.environ['N_JOBS'] = -1
n_bags_val = 25
n_bags_test = 100
result_dir = f'results_quantification/ucimulti'
os.makedirs(result_dir, exist_ok=True)
global_result_path = f'{result_dir}/allmethods'
timings = load_timings(global_result_path)
with open(global_result_path + '.csv', 'wt') as csv:
csv.write(f'Method\tDataset\tMAE\tMRAE\tt_train\n')
for method_name, quantifier, param_grid in METHODS + TRANSDUCTIVE_METHODS:
print('Init method', method_name)
with open(global_result_path + '.csv', 'at') as csv:
for dataset in qp.datasets.UCI_MULTICLASS_DATASETS[:4]:
print('init', dataset)
local_result_path = os.path.join(Path(global_result_path).parent,
method_name + '_' + dataset + '.dataframe')
if os.path.exists(local_result_path):
print(f'result file {local_result_path} already exist; skipping')
report = qp.util.load_report(local_result_path)
else:
with qp.util.temp_seed(SEED):
data = qp.datasets.fetch_UCIMulticlassDataset(dataset, verbose=True)
if not method_name.startswith("TKDEy-ML"):
# model selection
train, test = data.train_test
train, val = train.split_stratified(random_state=SEED)
protocol = UPP(val, repeats=n_bags_val)
modsel = GridSearchQ(
quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=1, error='mae'
)
t_init = time()
try:
modsel.fit(train)
print(f'best params {modsel.best_params_}')
print(f'best score {modsel.best_score_}')
quantifier = modsel.best_model()
except:
print('something went wrong... trying to fit the default model')
quantifier.fit(train)
timings[method_name][dataset] = time() - t_init
protocol = UPP(test, repeats=n_bags_test)
report = qp.evaluation.evaluation_report(
quantifier, protocol, error_metrics=['mae', 'mrae'], verbose=True
)
report.to_csv(local_result_path)
else:
# model selection
train, test = data.train_test
t_init = time()
quantifier.fit(train)
timings[method_name][dataset] = time() - t_init
protocol = UPP(test, repeats=n_bags_test)
report = qp.evaluation.evaluation_report(
quantifier, protocol, error_metrics=['mae', 'mrae'], verbose=True
)
report.to_csv(local_result_path)
means = report.mean(numeric_only=True)
csv.write(
f'{method_name}\t{dataset}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n')#\t{timings[method_name][dataset]:.3f}\n')
csv.flush()
show_results(global_result_path)

View File

@ -1,5 +1,10 @@
import time
from functools import wraps
import os
from os.path import join
from result_table.src.table import Table
import numpy as np
from constants import *
def measuretime(func):
@wraps(func)
@ -12,4 +17,65 @@ def measuretime(func):
return (*result, time_it_took)
else:
return result, time_it_took
return wrapper
return wrapper
def plot_bandwidth(dataset_name, test_results, bandwidths, triplet_list_results):
import matplotlib.pyplot as plt
print("PLOT", dataset_name)
print(dataset_name)
plt.figure(figsize=(8, 6))
# show test results
plt.plot(bandwidths, test_results, marker='o', color='k')
colors = plt.cm.tab10(np.linspace(0, 1, len(triplet_list_results)))
for i, (method_name, method_choice, method_time) in enumerate(triplet_list_results):
plt.axvline(x=method_choice, linestyle='--', label=method_name, color=colors[i])
# Agregar etiquetas y título
plt.xlabel('Bandwidth')
plt.ylabel('MAE')
plt.title(dataset_name)
# Mostrar la leyenda
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
# Mostrar la gráfica
plt.grid(True)
plotdir = './plots'
if DEBUG:
plotdir = './plots_debug'
os.makedirs(plotdir, exist_ok=True)
plt.tight_layout()
plt.savefig(f'{plotdir}/{dataset_name}.png')
plt.close()
def error_table(dataset_name, test_results, bandwidth_range, triplet_list_results):
best_bandwidth = bandwidth_range[np.argmin(test_results)]
best_score = np.min(test_results)
print(f'Method\tChoice\tAE\tTime')
table=Table(name=dataset_name)
table.format.with_mean=False
table.format.with_rank_mean = False
table.format.show_std = False
for method_name, method_choice, took in triplet_list_results:
if method_choice in bandwidth_range:
index = np.where(bandwidth_range == method_choice)[0][0]
method_score = test_results[index]
else:
method_score = 1
error = np.abs(best_score-method_score)
table.add(benchmark='Choice', method=method_name, v=method_choice)
table.add(benchmark='ScoreChoice', method=method_name, v=method_score)
table.add(benchmark='Best', method=method_name, v=best_bandwidth)
table.add(benchmark='ScoreBest', method=method_name, v=best_score)
table.add(benchmark='AE', method=method_name, v=error)
table.add(benchmark='Time', method=method_name, v=took)
outpath = './tables'
if DEBUG:
outpath = './tables_debug'
table.latexPDF(join(outpath, dataset_name+'.pdf'), transpose=True)

View File

@ -14,7 +14,7 @@ from . import model_selection
from . import classification
import os
__version__ = '0.1.9'
__version__ = '0.1.10'
environ = {
'SAMPLE_SIZE': None,

View File

@ -3,6 +3,7 @@ from contextlib import contextmanager
import zipfile
from os.path import join
import pandas as pd
import sklearn.datasets
from ucimlrepo import fetch_ucirepo
from quapy.data.base import Dataset, LabelledCollection
from quapy.data.preprocessing import text2tfidf, reduce_columns
@ -1004,3 +1005,49 @@ def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=No
return train, test_gen
else:
return train_gen, test_gen
def syntheticUniformLabelledCollection(n_samples, n_features, n_classes, n_clusters_per_class=1, **kwargs):
"""
Generates a synthetic labelled collection with uniform priors and
of `n_samples` instances, `n_features` features, and `n_classes` classes.
The underlying generator relies on the function
`sklearn.datasets.make_classification`. Other options can be specified using the `kwargs`;
see the `scikit-learn documentation
<https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html>`_
for a full list of optional parameters.
:param n_samples: number of instances
:param n_features: number of features
:param n_classes: number of classes
"""
X, y = sklearn.datasets.make_classification(
n_samples=n_samples,
n_features=n_features,
n_classes=n_classes,
n_clusters_per_class=n_clusters_per_class,
**kwargs
)
return LabelledCollection(X, y)
def syntheticUniformDataset(n_samples, n_features, n_classes, test_split=0.3, **kwargs):
"""
Generates a synthetic Dataset with approximately uniform priors and
of `n_samples` instances, `n_features` features, and `n_classes` classes.
The underlying generator relies on the function
`sklearn.datasets.make_classification`. Other options can be specified using the `kwargs`;
see the `scikit-learn documentation
<https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html>`_
for a full list of optional parameters.
:param n_samples: number of instances
:param n_features: number of features
:param n_classes: number of classes
:param test_split: proportion of test instances
"""
assert 0. < test_split < 1., "invalid proportion of test instances; the value must be in (0, 1)"
lc = syntheticUniformLabelledCollection(n_samples, n_features, n_classes, **kwargs)
training, test = lc.split_stratified(train_prop=1-test_split, random_state=kwargs.get('random_state', None))
dataset = Dataset(training=training, test=test, name=f'synthetic(nF={n_features},nC={n_classes})')
return dataset

View File

@ -66,11 +66,13 @@ class KDEBase:
"""
class_cond_X = []
for cat in classes:
selX = X[y==cat]
if selX.size==0:
selX = X[y == cat]
if selX.size == 0:
selX = [F.uniform_prevalence(len(classes))]
class_cond_X.append(selX)
return [self.get_kde_function(X_cond_yi, bandwidth) for X_cond_yi in class_cond_X]
if isinstance(bandwidth, float):
bandwidth = np.full(fill_value=bandwidth, shape=(len(classes),))
return [self.get_kde_function(X_cond_yi, band_i) for X_cond_yi, band_i in zip(class_cond_X, bandwidth)]
class KDEyML(AggregativeSoftQuantifier, KDEBase):
@ -188,7 +190,7 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase):
def __init__(self, classifier: BaseEstimator=None, val_split=5, divergence: str='HD',
bandwidth=0.1, random_state=None, montecarlo_trials=10000):
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.divergence = divergence
@ -218,7 +220,7 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase):
def f_squared_hellinger(u):
return (np.sqrt(u)-1)**2
# todo: this will fail when self.divergence is a callable, and is not the right place to do it anyway
if self.divergence.lower() == 'hd':
f = f_squared_hellinger
@ -283,7 +285,7 @@ class KDEyCS(AggregativeSoftQuantifier):
def gram_matrix_mix_sum(self, X, Y=None):
# this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
# to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y and Sigma1 and Sigma2 are
# to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y and Sigma1 and Sigma2 are
# two "scalar matrices" (h^2)*I each, so Sigma1+Sigma2 has scalar 2(h^2) (h is the bandwidth)
h = self.bandwidth
variance = 2 * (h**2)
@ -342,7 +344,7 @@ class KDEyCS(AggregativeSoftQuantifier):
# at each iteration of the optimization phase)
tr_te_sums = np.zeros(shape=n, dtype=float)
for i in range(n):
tr_te_sums[i] = self.gram_matrix_mix_sum(Ptr[y==i], Pte)
tr_te_sums[i] = self.gram_matrix_mix_sum(Ptr[y==i], Pte)
def divergence(alpha):
# called \overline{r} in the paper

1
result_table Submodule

@ -0,0 +1 @@
Subproject commit c223c9f1fe3c9708e8c5a5c56e438cdaaa857be4