Compare commits

...

19 Commits

Author SHA1 Message Date
Alejandro Moreo Fernandez d66cbb3c8d switching 2024-10-04 14:30:26 +02:00
Alejandro Moreo Fernandez 14ff3c9884 switching to kde 2024-09-27 10:18:20 +02:00
Alejandro Moreo Fernandez 641228bf62 Merge branch 'localstack' of gitea-s2i2s.isti.cnr.it:moreo/QuaPy into localstack 2024-09-26 16:06:01 +02:00
Alejandro Moreo Fernandez 4fa4540aab autolike+ fixed 2024-09-25 17:23:43 +02:00
Alejandro Moreo Fernandez 531badffc8 change in minimize -> minimize_scalar in TKDE2 2024-09-25 17:00:22 +02:00
Alejandro Moreo Fernandez da006ee89a reduction kdey 2024-09-25 13:34:34 +02:00
Alejandro Moreo Fernandez 4a3b18b3a3 debug + 2024-09-25 11:33:51 +02:00
Alejandro Moreo Fernandez ac4f81918e search + 2024-09-25 11:29:12 +02:00
Alejandro Moreo Fernandez 05d1967cd5 bugfix 2024-09-25 11:24:46 +02:00
Alejandro Moreo Fernandez 9020d7ff31 auto with optim instead of grid 2024-09-25 10:59:24 +02:00
Alejandro Moreo Fernandez 2aabfdc4c0 testing bandwidth selection as internal model selection with reduction 2024-09-24 17:10:04 +02:00
Alejandro Moreo Fernandez 84f5799219 another type of search for the bandwidth based in likelihood 2024-09-23 22:23:41 +02:00
Alejandro Moreo Fernandez 5f9dad4644 trying optimizing both prev and band at the same time, and per-class bandwidth 2024-09-22 22:47:07 +02:00
Alejandro Moreo Fernandez 9fb208fe4c switch 2024-09-18 10:33:58 +02:00
Alejandro Moreo Fernandez 6ce5eea4f2 switch 2024-09-17 10:57:46 +02:00
Alejandro Moreo Fernandez f30c6ceaa1 switch 2024-09-17 10:02:08 +02:00
Alejandro Moreo Fernandez faba2494b2 some plots 2024-09-16 17:50:34 +02:00
Alejandro Moreo Fernandez ede214aa54 switching 2024-09-16 15:06:29 +02:00
Alejandro Moreo Fernandez af2c4eaf01 first example 2024-09-16 13:21:18 +02:00
14 changed files with 1010 additions and 13 deletions

3
.gitmodules vendored Normal file
View File

@ -0,0 +1,3 @@
[submodule "result_table"]
path = result_table
url = gitea@gitea-s2i2s.isti.cnr.it:moreo/result_table.git

2
KDEy/constants.py Normal file
View File

@ -0,0 +1,2 @@
DEBUG = False

124
KDEy/experiments.py Normal file
View File

@ -0,0 +1,124 @@
import os
import pickle
import shutil
import numpy as np
from sklearn.linear_model import LogisticRegression
from os.path import join
import quapy as qp
from quapy.method.aggregative import KDEyML
from quapy.protocol import UPP
from kdey_devel import KDEyMLauto
from utils import *
from constants import *
import quapy.functional as F
# Experiment-wide settings. DEBUG (from constants) selects a smaller and faster configuration.
if DEBUG:
    qp.environ["SAMPLE_SIZE"] = 100
    val_repeats = 100
    test_repeats = 100
    # in debug mode a plain logistic regression is registered as the default classifier
    qp.environ["DEFAULT_CLS"] = LogisticRegression()
else:
    qp.environ["SAMPLE_SIZE"] = 500
    val_repeats = 500
    test_repeats = 500

test_results = dict()
val_choice = dict()

# candidate bandwidths explored both in validation and in test (coarser grid when debugging)
bandwidth_range = np.linspace(0.01, 0.20, 5 if DEBUG else 20)
def datasets():
    """
    Generator over the UCI multiclass datasets used in this experiment.

    Only the first four names of ``qp.datasets.UCI_MULTICLASS_DATASETS`` are fetched. When ``DEBUG`` is set,
    every fetched dataset is additionally passed through ``reduce(random_state=0)``.

    :return: yields the fetched datasets one at a time
    """
    names = qp.datasets.UCI_MULTICLASS_DATASETS[:4]
    # the debug slice is currently the same size as the regular one, so it changes nothing
    names = names[:4] if DEBUG else names
    for name in names:
        fetched = qp.datasets.fetch_UCIMulticlassDataset(name)
        yield fetched.reduce(random_state=0) if DEBUG else fetched
@measuretime
def predict_b_modsel(dataset):
    """
    Chooses the KDEyML bandwidth by grid-search model selection on a validation split of the training set.

    The training set is split (stratified, ``random_state=0``); every value in ``bandwidth_range`` is
    evaluated on UPP samples drawn from the validation part, without refitting the winner.

    :param dataset: a dataset exposing a ``training`` collection
    :return: the selected bandwidth as a float (the ``measuretime`` decorator appends the elapsed seconds)
    """
    train_tr, train_va = dataset.training.split_stratified(random_state=0)
    search = qp.model_selection.GridSearchQ(
        model=KDEyML(random_state=0),
        param_grid={'bandwidth': bandwidth_range},
        protocol=UPP(train_va, repeats=val_repeats),
        refit=False,
        n_jobs=-1,
        verbose=True,
    ).fit(train_tr)
    return float(search.best_params_['bandwidth'])
@measuretime
def predict_b_kdeymlauto(dataset):
    """
    Chooses the KDEy bandwidth with KDEyMLauto, i.e., by looking at the (unlabelled) test instances.

    :param dataset: a dataset exposing ``train_test``
    :return: the bandwidth found, as a float (the ``measuretime`` decorator appends the elapsed seconds)
    """
    train, test = dataset.train_test
    kdey = KDEyMLauto(random_state=0)
    print(f'true-prevalence: {F.strprev(test.prevalence())}')
    # chose_bandwidth only returns the estimated prevalence vector; the bandwidth it settled on is
    # stored in the quantifier, so unpacking its return value into (bandwidth, _) was wrong
    kdey.chose_bandwidth(train, test.X)
    auto_bandwidth = float(kdey.bandwidth)
    return auto_bandwidth
def in_test_search(dataset, n_jobs=-1):
    """
    Computes the test MAE that KDEyML obtains with every bandwidth in ``bandwidth_range``.

    :param dataset: a dataset exposing ``train_test`` and ``name``
    :param n_jobs: number of parallel workers handed to ``qp.util.parallel``
    :return: a tuple ``(scores, bandwidth_range)`` where ``scores`` is whatever ``qp.util.parallel``
        returns for the per-bandwidth jobs (one float MAE per job)
    """
    train, test = dataset.train_test
    print(f"generating true tests scores using KDEy in {dataset.name}")

    def run_single(bandwidth):
        # one independent job: fit KDEyML with this bandwidth and evaluate it on UPP samples of the test set
        quantifier = KDEyML(bandwidth=bandwidth, random_state=0)
        quantifier.fit(train)
        score = qp.evaluation.evaluate(
            quantifier, protocol=UPP(test, repeats=test_repeats), error_metric='mae', verbose=True
        )
        print(f'{bandwidth=}: {score:.5f}')
        return float(score)

    scores = qp.util.parallel(run_single, bandwidth_range, n_jobs=n_jobs)
    return scores, bandwidth_range
# Main experiment loop. For every dataset it obtains (computing, or loading from a pickle if present):
#   - the test MAE of KDEyML for every candidate bandwidth,
#   - the bandwidth chosen by model selection, and the time it took,
#   - the bandwidth chosen by KDEyMLauto, and the time it took,
# and then hands everything to the plotting / table helpers imported from utils.
for dataset in datasets():
    print('NAME', dataset.name)
    print(len(dataset.training))
    print(len(dataset.test))

    result_path = f'./results/{dataset.name}/'
    if DEBUG:
        # debug runs are kept in a separate folder
        result_path = result_path.replace('results', 'results_debug')
        # NOTE(review): nesting assumed — the debug folder is wiped so that debug runs never reuse
        # stale pickles; the flattened source does not show whether this guard is inside `if DEBUG`
        if os.path.exists(result_path):
            shutil.rmtree(result_path)

    dataset_results, bandwidth_range = qp.util.pickled_resource(join(result_path, 'test.pkl'), in_test_search, dataset)

    # each entry is (method name, chosen bandwidth, elapsed time)
    triplet_list_results = []

    # predict_b_modsel is wrapped by measuretime, hence the (choice, time) pair
    modsel_choice, modsel_time = qp.util.pickled_resource(join(result_path, 'modsel.pkl'), predict_b_modsel, dataset)
    triplet_list_results.append(('modsel', modsel_choice, modsel_time,))

    auto_choice, auto_time = qp.util.pickled_resource(join(result_path, 'auto.pkl'), predict_b_kdeymlauto, dataset)
    triplet_list_results.append(('auto', auto_choice, auto_time,))

    print(f'Dataset = {dataset.name}')
    print(modsel_choice)
    print(dataset_results)

    plot_bandwidth(dataset.name, dataset_results, bandwidth_range, triplet_list_results)
    error_table(dataset.name, dataset_results, bandwidth_range, triplet_list_results)
    # time_table(dataset.name, dataset_results, bandwidth_range, triplet_list_results)

386
KDEy/kdey_devel.py Normal file
View File

@ -0,0 +1,386 @@
from typing import Union, Callable
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.neighbors import KernelDensity
import quapy as qp
from quapy.protocol import UPP
from quapy.method._kdey import KDEBase
from quapy.data import LabelledCollection
from quapy.method.aggregative import AggregativeSoftQuantifier, KDEyML
import quapy.functional as F
from sklearn.metrics.pairwise import rbf_kernel
from scipy import optimize
from tqdm import tqdm
import quapy.functional as F
# small constant added to the mixture likelihood before taking the logarithm (avoids log(0))
epsilon = 1e-10
class KDEyMLauto(KDEyML):
    """
    Variant of KDEyML that takes no bandwidth hyperparameter: the bandwidth is chosen at aggregation time,
    from the posterior probabilities of the very sample to quantify, together with the class prevalence values.

    The search strategy is selected with `optim`:

    - 'two_steps': alternates an optimization of the prevalence (bandwidth fixed) with an optimization of the
      bandwidth (prevalence fixed) until both stop changing
    - 'both': optimizes the prevalence vector and one shared bandwidth in a single SLSQP problem, repeated
      until both stop changing
    - 'both_fine': like 'both', with one bandwidth per class
    - 'max_likelihood': chooses the bandwidth by grid search on a subsample, then optimizes the prevalence
    - 'max_likelihood2': like 'max_likelihood', with the bandwidth searched by SLSQP instead of a grid

    :param classifier: the classifier, resolved through `qp._get_classifier`
    :param val_split: passed as `predict_on` when generating the training posteriors
    :param random_state: seed applied (via `qp.util.temp_seed`) during the search
    :param optim: one of the strategies listed above
    """

    # strategies handled by the iterate-until-convergence loop of `transduce`
    _ITERATIVE_OPTIMS = ('two_steps', 'both', 'both_fine')
    # strategies that first choose the bandwidth by likelihood on a subsample
    _LIKELIHOOD_OPTIMS = ('max_likelihood', 'max_likelihood2')

    def __init__(self, classifier: BaseEstimator = None, val_split=5, random_state=None, optim='two_steps'):
        self.classifier = qp._get_classifier(classifier)
        self.val_split = val_split
        self.bandwidth = None
        self.random_state = random_state
        self.optim = optim

    # ------------------------------------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------------------------------------
    @staticmethod
    def _neg_loglikelihood(prev, test_densities):
        """Negative log-likelihood of the test points under the mixture of class densities weighted by `prev`."""
        test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
        test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
        return -np.sum(test_loglikelihood)

    def _class_test_densities(self, tr_posteriors, tr_y, te_posteriors, classes, bandwidth):
        """Fits the per-class KDEs with `bandwidth` and evaluates each of them on the test posteriors."""
        mix_densities = self.get_mixture_components(tr_posteriors, tr_y, classes, bandwidth)
        return [self.pdf(kde_i, te_posteriors) for kde_i in mix_densities]

    @staticmethod
    def _require_success(result):
        """Raises AssertionError (explicitly, so that it survives `python -O`) if the optimizer did not converge."""
        if not result.success:
            raise AssertionError(f'Process did not converge! {result.message}')

    # ------------------------------------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------------------------------------
    def chose_bandwidth(self, train, test_instances):
        """
        Fits the classifier on `train` and runs the transductive search on `test_instances`.

        :param train: labelled training collection
        :param test_instances: the (unlabelled) instances to quantify
        :return: the estimated prevalence vector; the bandwidth found is left in `self.bandwidth`
        """
        classif_predictions = self.classifier_fit_predict(train, fit_classifier=True, predict_on=self.val_split)
        te_posteriors = self.classify(test_instances)
        return self.transduce(classif_predictions, te_posteriors)

    def transduce(self, classif_predictions, te_posteriors):
        """
        Estimates the prevalence of `te_posteriors`, choosing the bandwidth on the way (see class docstring).

        :param classif_predictions: labelled collection holding the training posteriors and their labels
        :param te_posteriors: posterior probabilities of the sample to quantify
        :return: the estimated prevalence vector (the chosen bandwidth is stored in `self.bandwidth`)
        :raises ValueError: if `self.optim` is not a known strategy
        """
        valid_optims = self._ITERATIVE_OPTIMS + self._LIKELIHOOD_OPTIMS
        if self.optim not in valid_optims:
            # an unknown strategy used to fall through the loop untouched and return the uniform prior
            raise ValueError(f'unknown optim={self.optim!r}; valid values are {valid_optims}')

        tr_posteriors, tr_y = classif_predictions.Xy
        classes = classif_predictions.classes_
        n_classes = len(classes)

        # starting point: bandwidth 0.05 (one per class for 'both_fine') and uniform prevalence
        current_bandwidth = 0.05
        if self.optim == 'both_fine':
            current_bandwidth = np.full(fill_value=current_bandwidth, shape=(n_classes,))
        current_prevalence = np.full(fill_value=1 / n_classes, shape=(n_classes,))

        if self.optim in self._LIKELIHOOD_OPTIMS:
            current_prevalence, current_bandwidth = self.optim_minimize_like(
                tr_posteriors, tr_y, te_posteriors, classes, grid=(self.optim == 'max_likelihood')
            )
        else:
            iterations = 0
            convergence = False
            # NOTE(review): there is no cap on the number of iterations; termination relies on the tolerances below
            with qp.util.temp_seed(self.random_state):
                while not convergence:
                    previous_bandwidth = current_bandwidth
                    previous_prevalence = current_prevalence

                    iterations += 1
                    print(f'{iterations}:')

                    if self.optim == 'two_steps':
                        current_prevalence = self.optim_minimize_prevalence(current_bandwidth, current_prevalence, tr_posteriors, tr_y, te_posteriors, classes)
                        print(f'\testim-prev={F.strprev(current_prevalence)}')
                        current_bandwidth = self.optim_minimize_bandwidth(current_bandwidth, current_prevalence, tr_posteriors, tr_y, te_posteriors, classes)
                        print(f'\tbandwidth={current_bandwidth}')
                    elif self.optim == 'both':
                        current_prevalence, current_bandwidth = self.optim_minimize_both(current_bandwidth, current_prevalence, tr_posteriors, tr_y, te_posteriors, classes)
                    elif self.optim == 'both_fine':
                        current_prevalence, current_bandwidth = self.optim_minimize_both_fine(current_bandwidth, current_prevalence, tr_posteriors, tr_y, te_posteriors, classes)

                    # check convergence; np.all covers both the scalar and the per-class bandwidth
                    prev_convergence = np.all(np.isclose(previous_prevalence, current_prevalence, atol=0.01))
                    band_convergence = np.all(np.isclose(previous_bandwidth, current_bandwidth, atol=0.001))
                    convergence = band_convergence and prev_convergence

        self.bandwidth = current_bandwidth
        print('bandwidth=', current_bandwidth)
        print('prevalence=', current_prevalence)
        return current_prevalence

    def optim_minimize_prevalence(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes):
        """Maximum-likelihood prevalence for a fixed bandwidth, starting the search from `current_prev`."""
        test_densities = self._class_test_densities(tr_posteriors, tr_y, te_posteriors, classes, current_bandwidth)
        return optim_minimize(lambda prev: self._neg_loglikelihood(prev, test_densities), current_prev)

    def optim_minimize_bandwidth(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes):
        """
        Maximum-likelihood bandwidth in [1e-5, 0.2] for a fixed prevalence `current_prev`.

        `current_bandwidth` is accepted for signature symmetry; the bounded scalar search does not use it.
        """
        def neg_loglikelihood_bandwidth(bandwidth):
            test_densities = self._class_test_densities(tr_posteriors, tr_y, te_posteriors, classes, bandwidth)
            return self._neg_loglikelihood(current_prev, test_densities)

        # method made explicit: older scipy versions reject `bounds` with the default (Brent) method
        r = optimize.minimize_scalar(neg_loglikelihood_bandwidth, bounds=(0.00001, 0.2), method='bounded')
        self._require_success(r)
        return r.x

    def optim_minimize_both(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes):
        """
        Joint SLSQP search over the prevalence vector and one shared bandwidth.

        :return: tuple (prevalence, bandwidth)
        """
        n_classes = len(current_prev)

        def neg_loglikelihood_bandwidth(prevalence_bandwidth):
            # the optimized vector is [prev_1, ..., prev_n, bandwidth]
            bandwidth = prevalence_bandwidth[-1]
            prevalence = prevalence_bandwidth[:-1]
            test_densities = self._class_test_densities(tr_posteriors, tr_y, te_posteriors, classes, bandwidth)
            return self._neg_loglikelihood(prevalence, test_densities)

        bounds = [(0, 1) for _ in range(n_classes)] + [(0.00001, 0.2)]
        # only the prevalence part must sum up to 1
        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x[:n_classes])})
        prevalence_bandwidth = np.append(current_prev, current_bandwidth)
        r = optimize.minimize(neg_loglikelihood_bandwidth, x0=prevalence_bandwidth, method='SLSQP', bounds=bounds, constraints=constraints)
        print(f'iterations-both={r.nit}')
        self._require_success(r)
        prev_band = r.x
        return prev_band[:-1], prev_band[-1]

    def optim_minimize_both_fine(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes):
        """
        Joint SLSQP search over the prevalence vector and one bandwidth per class.

        :return: tuple (prevalence, bandwidths), both of length `n_classes`
        """
        n_classes = len(current_bandwidth)

        def neg_loglikelihood_bandwidth(prevalence_bandwidth):
            # the optimized vector is [prev_1, ..., prev_n, band_1, ..., band_n]
            prevalence = prevalence_bandwidth[:n_classes]
            bandwidth = prevalence_bandwidth[n_classes:]
            test_densities = self._class_test_densities(tr_posteriors, tr_y, te_posteriors, classes, bandwidth)
            return self._neg_loglikelihood(prevalence, test_densities)

        bounds = [(0, 1) for _ in range(n_classes)] + [(0.00001, 1) for _ in range(n_classes)]
        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x[:n_classes])})
        prevalence_bandwidth = np.concatenate((current_prev, current_bandwidth))
        r = optimize.minimize(neg_loglikelihood_bandwidth, x0=prevalence_bandwidth, method='SLSQP', bounds=bounds, constraints=constraints)
        print(f'iterations-both-fine={r.nit}')
        self._require_success(r)
        prev_band = r.x
        return prev_band[:n_classes], prev_band[n_classes:]

    def optim_minimize_like(self, tr_posteriors, tr_y, te_posteriors, classes, reduction=100, grid=True):
        """
        Chooses the bandwidth by likelihood on a subsample, then estimates the prevalence on the full data.

        :param reduction: number of test points subsampled; `reduction * n_classes` training points are sampled
        :param grid: if True the bandwidth is chosen by grid search, otherwise by SLSQP
        :return: tuple (prevalence, bandwidth)
        """
        n_classes = len(classes)

        # reduce samples to speed up computation; seeded so that `random_state` is honoured in this
        # branch too (the iterative strategies already run under temp_seed)
        with qp.util.temp_seed(self.random_state):
            posteriors_subsample = LabelledCollection(tr_posteriors, tr_y)
            posteriors_subsample = posteriors_subsample.sampling(reduction * n_classes)
            n_test = te_posteriors.shape[0]
            subsample_index = np.random.choice(np.arange(n_test), size=reduction)
        te_posterior_subsample = te_posteriors[subsample_index]

        if grid:
            _, best_band = self.choose_bandwidth_maxlikelihood_grid(*posteriors_subsample.Xy, te_posterior_subsample, classes)
        else:
            best_band = self.choose_bandwidth_maxlikelihood_search(*posteriors_subsample.Xy, te_posterior_subsample, classes)

        # final prevalence estimate: full training and test posteriors, chosen bandwidth
        test_densities = self._class_test_densities(tr_posteriors, tr_y, te_posteriors, classes, best_band)
        init_prev = np.full(fill_value=1 / n_classes, shape=(n_classes,))
        pred_prev = optim_minimize(lambda prev: self._neg_loglikelihood(prev, test_densities), init_prev)
        return pred_prev, best_band

    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
        """Only stores the training posteriors; all the work is deferred to `aggregate`."""
        self.classif_predictions = classif_predictions
        return self

    def aggregate(self, posteriors: np.ndarray):
        """Runs the transductive search on `posteriors` and returns the estimated prevalence."""
        return self.transduce(self.classif_predictions, posteriors)

    def choose_bandwidth_maxlikelihood_grid(self, tr_posteriors, tr_y, te_posteriors, classes):
        """
        Grid search over 50 log-spaced bandwidths in [1e-4, 10**0.5]; keeps the one with the smallest
        negative log-likelihood at its maximum-likelihood prevalence.

        :return: tuple (best prevalence, best bandwidth)
        """
        n_classes = len(classes)
        best_band = None
        best_like = None
        best_prev = None
        init_prev = np.full(fill_value=1 / n_classes, shape=(n_classes,))
        for bandwidth in np.logspace(-4, 0.5, 50):
            test_densities = self._class_test_densities(tr_posteriors, tr_y, te_posteriors, classes, bandwidth)
            pred_prev, neglikelihood = optim_minimize(
                lambda prev: self._neg_loglikelihood(prev, test_densities), init_prev, return_loss=True
            )
            if best_like is None or neglikelihood < best_like:
                best_like = neglikelihood
                best_band = bandwidth
                best_prev = pred_prev

        print(f'best-like={best_like:.4f}')
        print(f'best-band={best_band:.4f}')
        return best_prev, best_band

    def choose_bandwidth_maxlikelihood_search(self, tr_posteriors, tr_y, te_posteriors, classes):
        """
        SLSQP search of the bandwidth in [1e-4, 0.2] (starting at 0.001); every candidate is scored by the
        negative log-likelihood at its maximum-likelihood prevalence.

        :return: the best bandwidth found
        """
        n_classes = len(classes)
        init_prev = np.full(fill_value=1 / n_classes, shape=(n_classes,))

        def neglikelihood_band(bandwidth):
            # SLSQP hands over a 1-element array
            test_densities = self._class_test_densities(tr_posteriors, tr_y, te_posteriors, classes, bandwidth[0])
            _, neglikelihood = optim_minimize(
                lambda prev: self._neg_loglikelihood(prev, test_densities), init_prev, return_loss=True
            )
            return neglikelihood

        bounds = [(0.0001, 0.2)]
        r = optimize.minimize(neglikelihood_band, x0=[0.001], method='SLSQP', bounds=bounds)
        self._require_success(r)
        best_band = r.x[0]
        print(f'solved in nit={r.nit}')
        return best_band
def optim_minimize(loss: Callable, init_prev: np.ndarray, return_loss=False):
    """
    Searches for the optimal prevalence values, i.e., an `n_classes`-dimensional vector of the (`n_classes`-1)-simplex
    that yields the smallest loss. This optimization is carried out by means of a constrained search using scipy's
    SLSQP routine.

    :param loss: (callable) the function to minimize; it receives a prevalence vector and returns a scalar
    :param init_prev: (ndarray) the starting point of the search; its length fixes the number of classes
    :param return_loss: (bool) if True, the loss attained at the solution is returned as well
    :return: (ndarray) the best prevalence vector found, or the tuple (best prevalence vector, loss value)
        if `return_loss` is True
    :raises AssertionError: if SLSQP reports that it did not converge
    """
    n_classes = len(init_prev)

    # solutions are bounded to those contained in the unit-simplex
    bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
    constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
    r = optimize.minimize(loss, x0=init_prev, method='SLSQP', bounds=bounds, constraints=constraints)

    # raised explicitly (instead of `assert`) so that the check is not stripped under `python -O`;
    # the exception type is kept for callers relying on it
    if not r.success:
        raise AssertionError(f'Process did not converge! {r.message}')

    if return_loss:
        return r.x, r.fun
    return r.x
class KDEyMLauto2(KDEyML):
    """
    KDEyML whose bandwidth can be set automatically (`bandwidth='auto'`) during `aggregation_fit`, using only
    the training posteriors: these are split in two halves, the per-class KDEs are fitted on one half and
    every candidate bandwidth is scored on UPP samples drawn from the other half.
    """

    def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, random_state=None, reduction=100, max_reduced=500, target='likelihood'):
        """
        :param classifier: the classifier, resolved through `qp._get_classifier`
        :param val_split: stored as-is; not used directly by the code of this class
        :param bandwidth: either 'auto' or a value accepted by `KDEBase._check_bandwidth`
        :param random_state: seed for the split, the subsampling and the UPP protocol
        :param reduction: number of examples per class for automatically setting the bandwidth (also used as
            the size of the validation samples); None disables the reduction of the training half
        :param max_reduced: upper bound for the size of the reduced training half
        :param target: 'likelihood' (grid search scored by negative log-likelihood), 'likelihood+' (bounded
            scalar search scored by negative log-likelihood), or the name of a quantification error in
            `qp.error.QUANTIFICATION_ERROR_NAMES` (grid search scored by that error)
        """
        self.classifier = qp._get_classifier(classifier)
        self.val_split = val_split
        if bandwidth == 'auto':
            self.bandwidth = bandwidth
        else:
            self.bandwidth = KDEBase._check_bandwidth(bandwidth)
        self.reduction = reduction
        self.max_reduced = max_reduced
        self.random_state = random_state
        assert target in ['likelihood', 'likelihood+'] or target in qp.error.QUANTIFICATION_ERROR_NAMES, 'unknown target for auto'
        self.target = target

    @staticmethod
    def _neg_loglikelihood(prev, test_densities):
        """Negative log-likelihood of the test points under the mixture of class densities weighted by `prev`."""
        test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
        test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
        return -np.sum(test_loglikelihood)

    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
        """
        Resolves the bandwidth (automatically if requested) and fits the per-class KDEs on all the
        training posteriors.

        :return: self
        """
        if self.bandwidth == 'auto':
            self.auto_bandwidth_likelihood(classif_predictions)
        else:
            self.bandwidth_ = self.bandwidth
        self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth_)
        return self

    def auto_bandwidth_likelihood(self, classif_predictions: LabelledCollection):
        """
        Chooses the bandwidth according to `self.target` and stores it in `self.bandwidth_`.

        :param classif_predictions: labelled collection with the training posteriors
        """
        n_classes = classif_predictions.n_classes
        train, val = classif_predictions.split_stratified(train_prop=0.5, random_state=self.random_state)

        if self.reduction is not None:
            # reduce samples to speed up computation
            tr_length = min(self.reduction * n_classes, self.max_reduced)
            if len(train) > tr_length:
                # random_state is forwarded so that the subsample is reproducible like the rest of the procedure
                train = train.sampling(tr_length, random_state=self.random_state)

        init_prev = np.full(fill_value=1 / n_classes, shape=(n_classes,))
        repeats = 25
        prot = UPP(val, sample_size=self.reduction, repeats=repeats, random_state=self.random_state)

        def estimate_prevalence(test_densities):
            # maximum-likelihood prevalence of one validation sample; returns (prevalence, neg-log-likelihood)
            return optim_minimize(
                lambda prev_hat: self._neg_loglikelihood(prev_hat, test_densities), init_prev, return_loss=True
            )

        if self.target == 'likelihood+':
            def neg_loglikelihood_bandwidth(bandwidth):
                mix_densities = self.get_mixture_components(*train.Xy, train.classes_, bandwidth)
                loss_accum = 0
                for sample, _ in prot():
                    test_densities = [self.pdf(kde_i, sample) for kde_i in mix_densities]
                    _, neglikelihood = estimate_prevalence(test_densities)
                    loss_accum += neglikelihood
                return loss_accum

            # method made explicit: older scipy versions reject `bounds` with the default (Brent) method
            r = optimize.minimize_scalar(neg_loglikelihood_bandwidth, bounds=(0.00001, 0.2), method='bounded')
            best_band = r.x
            best_loss_value = r.fun
            nit = r.nit
        else:
            error_fn = None if self.target == 'likelihood' else qp.error.from_name(self.target)
            grid = np.logspace(-4, np.log10(0.2), 20)
            best_band = None
            best_loss_value = None
            for bandwidth in grid:
                mix_densities = self.get_mixture_components(*train.Xy, train.classes_, bandwidth)
                loss_accum = 0
                for (sample, prev) in tqdm(prot(), total=repeats):
                    test_densities = [self.pdf(kde_i, sample) for kde_i in mix_densities]
                    # the prevalence is always the maximum-likelihood estimate for this bandwidth; with an
                    # error target, the score is the error of that estimate w.r.t. the true prevalence.
                    # (Minimizing the error directly over the prevalence, as done before, reaches ~0 for
                    # every bandwidth and therefore cannot discriminate between bandwidths.)
                    pred_prev, neglikelihood = estimate_prevalence(test_densities)
                    if error_fn is None:
                        loss_val = neglikelihood
                    else:
                        loss_val = error_fn(prev, pred_prev)
                    loss_accum += loss_val

                if best_loss_value is None or loss_accum < best_loss_value:
                    best_loss_value = loss_accum
                    best_band = bandwidth
            # in the grid search the number of "iterations" is the number of candidates tried
            nit = len(grid)

        print(f'found bandwidth={best_band:.8f} after {nit=} iterations loss_val={best_loss_value:.5f})')
        self.bandwidth_ = best_band
class KDEyMLred(KDEyML):
    """
    KDEyML that fits its per-class KDEs on a reduced random subsample of the training posteriors.

    :param classifier: the classifier, resolved through `qp._get_classifier`
    :param val_split: stored as-is; not used directly by the code of this class
    :param bandwidth: a value accepted by `KDEBase._check_bandwidth`
    :param random_state: seed used when drawing the reduced subsample
    :param reduction: number of examples per class to retain; None disables the reduction
    :param max_reduced: upper bound for the total size of the reduced sample
    """

    def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, random_state=None, reduction=100, max_reduced=500):
        self.classifier = qp._get_classifier(classifier)
        self.val_split = val_split
        self.bandwidth = KDEBase._check_bandwidth(bandwidth)
        self.reduction = reduction
        self.max_reduced = max_reduced
        self.random_state = random_state

    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
        """
        Fits the per-class KDEs on at most `min(reduction * n_classes, max_reduced)` training posteriors.

        :return: self
        """
        if self.reduction is not None:
            n_classes = classif_predictions.n_classes
            tr_length = min(self.reduction * n_classes, self.max_reduced)
            if len(classif_predictions) > tr_length:
                # random_state was accepted by the constructor but never applied to the subsampling
                classif_predictions = classif_predictions.sampling(tr_length, random_state=self.random_state)
        self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth)
        return self

View File

@ -0,0 +1,163 @@
import pickle
import os
from time import time
from collections import defaultdict
import numpy as np
from sklearn.linear_model import LogisticRegression
import quapy as qp
from KDEy.kdey_devel import KDEyMLauto, KDEyMLauto2, KDEyMLred
from quapy.method.aggregative import PACC, EMQ, KDEyML
from quapy.model_selection import GridSearchQ
from quapy.protocol import UPP
from pathlib import Path
# seed used below when fetching the datasets and when splitting train/validation
SEED = 1


def newLR():
    """Returns a new, unfitted LogisticRegression with ``max_iter=3000``."""
    return LogisticRegression(max_iter=3000)
# typical hyperparameters explored for Logistic Regression:
# 7 log-spaced values of C in [1e-3, 1e3], with and without class balancing
logreg_grid = {
    'C': np.logspace(-3,3,7),
    'class_weight': [None, 'balanced']
}
def wrap_hyper(classifier_hyper_grid: dict, prefix: str = 'classifier'):
    """
    Prefixes every key of a hyperparameter grid with ``'<prefix>__'``, so that the grid addresses the
    parameters of a nested component instead of those of the wrapping object.

    :param classifier_hyper_grid: dict mapping hyperparameter names to their candidate values
    :param prefix: name of the nested component; defaults to 'classifier' (the previously hard-coded value)
    :return: a new dict with the same values and the prefixed keys; the input is not modified
    """
    return {f'{prefix}__{name}': values for name, values in classifier_hyper_grid.items()}
# Inductive methods to evaluate. Each entry is (method name, quantifier instance, hyperparameter grid);
# the grid is explored with GridSearchQ in the main block below.
METHODS = [
    ('PACC', PACC(newLR()), wrap_hyper(logreg_grid)),
    ('EMQ', EMQ(newLR()), wrap_hyper(logreg_grid)),
    # KDEy-ML additionally explores 20 log-spaced bandwidths in [1e-4, 0.2]
    ('KDEy-ML', KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.logspace(-4, np.log10(0.2), 20)}}),
    # ('KDEy-MLred', KDEyMLred(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.logspace(-4, np.log10(0.2), 20)}}),
    ('KDEy-ML-scott', KDEyML(newLR(), bandwidth='scott'), wrap_hyper(logreg_grid)),
    ('KDEy-ML-silver', KDEyML(newLR(), bandwidth='silverman'), wrap_hyper(logreg_grid)),
    # variants that set the bandwidth automatically, differing in the criterion (target) used
    ('KDEy-ML-autoLike', KDEyMLauto2(newLR(), bandwidth='auto', target='likelihood'), wrap_hyper(logreg_grid)),
    ('KDEy-ML-autoLike+', KDEyMLauto2(newLR(), bandwidth='auto', target='likelihood+'), wrap_hyper(logreg_grid)),
    ('KDEy-ML-autoAE', KDEyMLauto2(newLR(), bandwidth='auto', target='mae'), wrap_hyper(logreg_grid)),
    ('KDEy-ML-autoRAE', KDEyMLauto2(newLR(), bandwidth='auto', target='mrae'), wrap_hyper(logreg_grid)),
]


# The note below (in Spanish) records what the earlier transductive variants were:
#   TKDEyML  : first the bandwidth (init 0.05), then the prevalence (init uniform)
#   TKDEyML2 : first the prevalence (init uniform), then the bandwidth (init 0.05)
#   TKDEyML3 : first the prevalence (init uniform), then the bandwidth (init 0.1)
#   TKDEyML4 : like ML2, but with at most 5 iterations per optimization
"""
TKDEyML era primero bandwidth (init 0.05) y luego prevalence (init uniform)
TKDEyML2 era primero prevalence (init uniform) y luego bandwidth (init 0.05)
TKDEyML3 era primero prevalence (init uniform) y luego bandwidth (init 0.1)
TKDEyML4 es como ML2 pero max 5 iteraciones por optimización
"""
# Transductive methods (name, quantifier, grid). They are fitted without model selection in the main
# block below. All entries are currently commented out, so the list is empty.
TRANSDUCTIVE_METHODS = [
    #('TKDEy-ML', KDEyMLauto(newLR()), None),
    # ('TKDEy-MLboth', KDEyMLauto(newLR(), optim='both'), None),
    # ('TKDEy-MLbothfine', KDEyMLauto(newLR(), optim='both_fine'), None),
    # ('TKDEy-ML2', KDEyMLauto(newLR(), optim='two_steps'), None),
    # ('TKDEy-MLike', KDEyMLauto(newLR(), optim='max_likelihood'), None),
    # ('TKDEy-MLike2', KDEyMLauto(newLR(), optim='max_likelihood2'), None),
    #('TKDEy-ML3', KDEyMLauto(newLR()), None),
    #('TKDEy-ML4', KDEyMLauto(newLR()), None),
]
def show_results(result_path):
    """
    Prints, for every reported measure, a Dataset x Method pivot table (with margins) of the results
    stored in the tab-separated file ``result_path + '.csv'``.

    :param result_path: path of the results file, without the '.csv' extension
    """
    import pandas as pd

    results = pd.read_csv(result_path + '.csv', sep='\t')

    # show complete tables: no column/row truncation, and a wide maximum line width
    display_options = (
        ('display.max_columns', None),
        ('display.max_rows', None),
        ('display.width', 1000),
    )
    for option_name, option_value in display_options:
        pd.set_option(option_name, option_value)

    for measure in ("MAE", "MRAE", "KLD", "TR-TIME", "TE-TIME"):
        table = results.pivot_table(index='Dataset', columns="Method", values=[measure], margins=True)
        print(table)
if __name__ == '__main__':
    # Runs every method on every UCI multiclass dataset. Per (method, dataset) the evaluation report is
    # stored in its own file, and reused if it already exists; a summary line with the mean errors and
    # times is appended to a global tab-separated file, which is finally shown as pivot tables.

    qp.environ['SAMPLE_SIZE'] = 500
    qp.environ['N_JOBS'] = -1
    n_bags_val = 25
    n_bags_test = 100
    result_dir = 'results_quantification/ucimulti'

    os.makedirs(result_dir, exist_ok=True)

    # the global file is rewritten from scratch on every run (header first)
    global_result_path = f'{result_dir}/allmethods'
    with open(global_result_path + '.csv', 'wt') as csv:
        csv.write('Method\tDataset\tMAE\tMRAE\tKLD\tTR-TIME\tTE-TIME\n')

    # does not depend on the method nor on the dataset: computed once
    transductive_names = [name for (name, *_) in TRANSDUCTIVE_METHODS]

    for method_name, quantifier, param_grid in METHODS + TRANSDUCTIVE_METHODS:

        print('Init method', method_name)

        with open(global_result_path + '.csv', 'at') as csv:

            for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:

                print('init', dataset)

                # run_experiment(global_result_path, method_name, quantifier, param_grid, dataset)
                local_result_path = os.path.join(Path(global_result_path).parent, method_name + '_' + dataset + '.dataframe')

                if os.path.exists(local_result_path):
                    print(f'result file {local_result_path} already exist; skipping')
                    report = qp.util.load_report(local_result_path)

                else:
                    # NOTE(review): the flattened source does not show how much of this branch runs under
                    # the seed; training and testing are assumed to be inside the seeded block — confirm
                    with qp.util.temp_seed(SEED):

                        data = qp.datasets.fetch_UCIMulticlassDataset(dataset, verbose=True)
                        train, test = data.train_test

                        # `model` is what gets evaluated for this dataset. The loop variable `quantifier` is
                        # deliberately NOT rebound: it used to be overwritten with the best model of the
                        # previous dataset, so the "default model" fallback was not the default one anymore
                        model = quantifier

                        if method_name not in transductive_names:
                            if len(param_grid) == 0:
                                t_init = time()
                                model.fit(train)
                                train_time = time() - t_init
                            else:
                                # model selection (train)
                                train, val = train.split_stratified(random_state=SEED)
                                protocol = UPP(val, repeats=n_bags_val)
                                modsel = GridSearchQ(
                                    quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=1, error='mae'
                                )
                                t_init = time()
                                try:
                                    modsel.fit(train)

                                    print(f'best params {modsel.best_params_}')
                                    print(f'best score {modsel.best_score_}')

                                    model = modsel.best_model()
                                except Exception as e:
                                    # `except Exception` (not a bare except) lets Ctrl-C / SystemExit through;
                                    # the cause is reported instead of being silently dropped
                                    print(f'model selection failed: {e!r}')
                                    print('something went wrong... trying to fit the default model')
                                    model.fit(train)
                                train_time = time() - t_init
                        else:
                            # transductive
                            t_init = time()
                            model.fit(train)  # <-- nothing actually (projects the X into posteriors only)
                            train_time = time() - t_init

                        # test
                        t_init = time()
                        protocol = UPP(test, repeats=n_bags_test)
                        report = qp.evaluation.evaluation_report(
                            model, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True
                        )
                        test_time = time() - t_init

                    report['tr_time'] = train_time
                    report['te_time'] = test_time
                    report.to_csv(local_result_path)

                # summary line (computed both for fresh and for reloaded reports)
                means = report.mean(numeric_only=True)
                csv.write(f'{method_name}\t{dataset}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\t{means["tr_time"]:.3f}\t{means["te_time"]:.3f}\n')
                csv.flush()

    show_results(global_result_path)

View File

@ -0,0 +1,188 @@
import pickle
import os
from time import time
from collections import defaultdict
from tqdm import tqdm
import numpy as np
from sklearn.linear_model import LogisticRegression
import quapy as qp
from KDEy.kdey_devel import KDEyMLauto, optim_minimize
from method._kdey import KDEBase
from quapy.method.aggregative import PACC, EMQ, KDEyML
from quapy.model_selection import GridSearchQ
from quapy.protocol import UPP
from pathlib import Path
from quapy import functional as F
import matplotlib.pyplot as plt
SEED = 1


def newLR():
    """Returns a new, unfitted LogisticRegression with ``max_iter=1000``."""
    return LogisticRegression(max_iter=1000)#, C=1, class_weight='balanced')


# size of the test samples drawn by the UPP protocol below
SAMPLE_SIZE=150
qp.environ['SAMPLE_SIZE'] = SAMPLE_SIZE

# which error curves are drawn in the figures
show_ae = True
show_rae = True
show_mse = False
show_kld = True
# small constant added to the mixture likelihood before taking the logarithm (avoids log(0))
epsilon = 1e-10

# n_bags_test = 2
# DATASETS = [qp.datasets.UCI_MULTICLASS_DATASETS[21]]
DATASETS = qp.datasets.UCI_MULTICLASS_DATASETS
for i, dataset in enumerate(DATASETS):
def generate_data():
data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
n_classes = data.n_classes
print(f'{i=}')
print(f'{dataset=}')
print(f'{n_classes=}')
print(len(data.training))
print(len(data.test))
train, test = data.train_test
train_prev = train.prevalence()
test_prev = test.prevalence()
print(f'train-prev = {F.strprev(train_prev)}')
print(f'test-prev = {F.strprev(test_prev)}')
repeats = 10
prot = UPP(test, sample_size=SAMPLE_SIZE, repeats=repeats)
kde = KDEyMLauto(newLR())
kde.fit(train)
AE_error, RAE_error, MSE_error, KLD_error, LIKE_value = [], [], [], [], []
tr_posteriors, tr_y = kde.classif_predictions.Xy
for it, (sample, prev) in tqdm(enumerate(prot()), total=repeats):
te_posteriors = kde.classifier.predict_proba(sample)
classes = train.classes_
xaxis = []
ae_error = []
rae_error = []
mse_error = []
kld_error = []
likelihood_value = []
# for bandwidth in np.linspace(0.01, 0.2, 50):
for bandwidth in np.logspace(-5, 0.5, 50):
mix_densities = kde.get_mixture_components(tr_posteriors, tr_y, classes, bandwidth)
test_densities = [kde.pdf(kde_i, te_posteriors) for kde_i in mix_densities]
def neg_loglikelihood_prev(prev):
test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
return -np.sum(test_loglikelihood)
init_prev = np.full(fill_value=1 / n_classes, shape=(n_classes,))
pred_prev, likelihood = optim_minimize(neg_loglikelihood_prev, init_prev, return_loss=True)
xaxis.append(bandwidth)
ae_error.append(qp.error.ae(prev, pred_prev))
rae_error.append(qp.error.rae(prev, pred_prev))
mse_error.append(qp.error.mse(prev, pred_prev))
kld_error.append(qp.error.kld(prev, pred_prev))
likelihood_value.append(likelihood)
AE_error.append(ae_error)
RAE_error.append(rae_error)
MSE_error.append(mse_error)
KLD_error.append(kld_error)
LIKE_value.append(likelihood_value)
return xaxis, AE_error, RAE_error, MSE_error, KLD_error, LIKE_value
xaxis, AE_error, RAE_error, MSE_error, KLD_error, LIKE_value = qp.util.pickled_resource(
f'./plots/likelihood/pickles/{dataset}.pkl', generate_data)
for row in range(len(AE_error)):
# Crear la figura
# ----------------------------------------------------------------------------------------------------
fig, ax1 = plt.subplots(figsize=(8, 6))
# Pintar las series ae_error, rae_error, y kld_error en el primer eje Y
if show_ae:
ax1.plot(xaxis, AE_error[row], label='AE', marker='o', color='b')
if show_rae:
ax1.plot(xaxis, RAE_error[row], label='RAE', marker='s', color='g')
if show_kld:
ax1.plot(xaxis, KLD_error[row], label='KLD', marker='^', color='r')
if show_mse:
ax1.plot(xaxis, MSE_error[row], label='MSE', marker='^', color='c')
ax1.set_xscale('log')
# Configurar etiquetas para el primer eje Y
ax1.set_xlabel('Bandwidth')
ax1.set_ylabel('Error Value')
ax1.grid(True)
ax1.legend(loc='upper left')
# Crear un segundo eje Y que comparte el eje X
ax2 = ax1.twinx()
# Pintar likelihood_val en el segundo eje Y
ax2.plot(xaxis, LIKE_value[row], label='(neg)Likelihood', marker='x', color='purple')
# Configurar etiquetas para el segundo eje Y
ax2.set_ylabel('Likelihood Value')
ax2.legend(loc='upper right')
# Mostrar el gráfico
plt.title('Error Metrics vs Bandwidth')
# plt.show()
os.makedirs('./plots/likelihood/', exist_ok=True)
plt.savefig(f'./plots/likelihood/{dataset}-fig{row}.png')
plt.close()
# Crear la figura con las medias
# ----------------------------------------------------------------------------------------------------
fig, ax1 = plt.subplots(figsize=(8, 6))
def add_plot(ax, vals_error, name, color, marker, show):
    """
    Draws on `ax` the mean of `vals_error` (computed along axis 0) as a line, together with
    a shaded band spanning one standard deviation below and above the mean.
    The x-values are taken from `xaxis`, which is defined outside this function.

    :param ax: the axis on which to draw
    :param vals_error: the values to aggregate; converted to a numpy array
    :param name: label of the series (shown in the legend)
    :param color: color of both the line and the shaded band
    :param marker: marker of the line
    :param show: if False, nothing is drawn
    """
    if show:
        series = np.asarray(vals_error)
        center = series.mean(axis=0)
        spread = series.std(axis=0)
        lower, upper = center - spread, center + spread
        ax.plot(xaxis, center, label=name, marker=marker, color=color)
        ax.fill_between(xaxis, lower, upper, color=color, alpha=0.2)
# Averaged figure: each selected error metric is drawn through add_plot on the first Y axis
add_plot(ax1, AE_error, 'AE', color='b', marker='o', show=show_ae)
add_plot(ax1, RAE_error, 'RAE', color='g', marker='s', show=show_rae)
add_plot(ax1, KLD_error, 'KLD', color='r', marker='^', show=show_kld)
add_plot(ax1, MSE_error, 'MSE', color='c', marker='^', show=show_mse)
ax1.set_xscale('log')
# Set the labels for the first Y axis
ax1.set_xlabel('Bandwidth')
ax1.set_ylabel('Error Value')
ax1.grid(True)
ax1.legend(loc='upper left')
# Create a second Y axis that shares the X axis
ax2 = ax1.twinx()
# Plot the likelihood values on the second Y axis (always shown)
add_plot(ax2, LIKE_value, '(neg)Likelihood', color='purple', marker='x', show=True)
# Set the labels for the second Y axis
ax2.set_ylabel('Likelihood Value')
ax2.legend(loc='upper right')
# Set the title and save the figure to disk (interactive display is disabled)
plt.title('Error Metrics vs Bandwidth')
# plt.show()
os.makedirs('./plots/likelihood/', exist_ok=True)
plt.savefig(f'./plots/likelihood/{dataset}-figAve.png')
plt.close()

81
KDEy/utils.py Normal file
View File

@ -0,0 +1,81 @@
import time
from functools import wraps
import os
from os.path import join
from result_table.src.table import Table
import numpy as np
from constants import *
def measuretime(func):
    """
    Decorator that adds the elapsed time (in seconds) to the value returned by `func`.

    If `func` returns a tuple, the elapsed time is appended as an additional last element;
    otherwise, the wrapper returns the pair `(result, elapsed_time)`.

    :param func: the callable to be timed
    :return: a wrapper that accepts the same arguments as `func` and additionally returns
        the time it took to run
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is a monotonic, high-resolution clock; time.time() follows the system
        # clock and may jump (even backwards) if the clock is adjusted while `func` is running
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        time_it_took = time.perf_counter() - start_time
        if isinstance(result, tuple):
            return (*result, time_it_took)
        return result, time_it_took
    return wrapper
def plot_bandwidth(dataset_name, test_results, bandwidths, triplet_list_results):
    """
    Plots the test results as a function of the bandwidth and marks, with a vertical dashed line,
    the bandwidth chosen by each method. The figure is saved as `<dataset_name>.png` in `./plots`
    (or in `./plots_debug` if `DEBUG` is set).

    :param dataset_name: name of the dataset; used as the title of the plot and as the file name
    :param test_results: the values for the y-axis (labelled "MAE"), plotted against `bandwidths`
    :param bandwidths: the values for the x-axis
    :param triplet_list_results: list of triplets (method name, bandwidth chosen, time); the time
        is not used in the plot
    """
    import matplotlib.pyplot as plt

    print("PLOT", dataset_name)
    print(dataset_name)

    plt.figure(figsize=(8, 6))

    # reference curve with the test results
    plt.plot(bandwidths, test_results, marker='o', color='k')

    # one vertical line per method, each with its own color from the tab10 colormap
    n_methods = len(triplet_list_results)
    palette = plt.cm.tab10(np.linspace(0, 1, n_methods))
    for line_color, (method_name, method_choice, _) in zip(palette, triplet_list_results):
        plt.axvline(x=method_choice, linestyle='--', label=method_name, color=line_color)

    # axis labels and title
    plt.xlabel('Bandwidth')
    plt.ylabel('MAE')
    plt.title(dataset_name)

    # the legend is placed outside the axes, to the right
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plt.grid(True)

    # debug runs are stored in a separate folder
    plotdir = './plots_debug' if DEBUG else './plots'
    os.makedirs(plotdir, exist_ok=True)

    plt.tight_layout()
    plt.savefig(f'{plotdir}/{dataset_name}.png')
    plt.close()
def error_table(dataset_name, test_results, bandwidth_range, triplet_list_results):
    """
    Generates a table (exported via `Table.latexPDF`) that compares the bandwidth chosen by each
    method against the bandwidth with the lowest test result. The PDF is stored as
    `<dataset_name>.pdf` in `./tables` (or in `./tables_debug` if `DEBUG` is set).

    For each method, the table reports: the bandwidth chosen ('Choice') and its test result
    ('ScoreChoice'), the best bandwidth ('Best') and its test result ('ScoreBest'), the absolute
    difference between both results ('AE'), and the time reported for the method ('Time').

    :param dataset_name: name of the dataset; used as the name of the table and of the file
    :param test_results: the test results, one for each value in `bandwidth_range`
    :param bandwidth_range: the bandwidths explored (a list or a numpy array)
    :param triplet_list_results: list of triplets (method name, bandwidth chosen, time)
    """
    # a plain list would make `bandwidth_range == method_choice` a scalar comparison, which
    # breaks the lookup of the index below; make sure we always operate with an array
    bandwidth_range = np.asarray(bandwidth_range)

    best_bandwidth = bandwidth_range[np.argmin(test_results)]
    best_score = np.min(test_results)

    print('Method\tChoice\tAE\tTime')
    table = Table(name=dataset_name)
    table.format.with_mean = False
    table.format.with_rank_mean = False
    table.format.show_std = False

    for method_name, method_choice, took in triplet_list_results:
        # positions of the explored bandwidths that exactly match the choice of the method
        matches = np.where(bandwidth_range == method_choice)[0]
        if matches.size > 0:
            method_score = test_results[matches[0]]
        else:
            # the choice is not among the explored values, so there is no test result for it
            # NOTE(review): 1 is presumably meant as a worst-case placeholder -- confirm
            method_score = 1
        error = np.abs(best_score - method_score)
        table.add(benchmark='Choice', method=method_name, v=method_choice)
        table.add(benchmark='ScoreChoice', method=method_name, v=method_score)
        table.add(benchmark='Best', method=method_name, v=best_bandwidth)
        table.add(benchmark='ScoreBest', method=method_name, v=best_score)
        table.add(benchmark='AE', method=method_name, v=error)
        table.add(benchmark='Time', method=method_name, v=took)

    # debug runs are stored in a separate folder
    outpath = './tables_debug' if DEBUG else './tables'
    # make sure the output folder exists (same convention as in plot_bandwidth)
    os.makedirs(outpath, exist_ok=True)
    table.latexPDF(join(outpath, dataset_name+'.pdf'), transpose=True)

View File

@ -5,7 +5,7 @@ import torch.nn.functional as F
class DistributionRegressor(nn.Module): class DistributionRegressor(nn.Module):
def __init__(self, n_classes, hidden_dim=64): def __init__(self, n_classes, hidden_dim=256):
super(DistributionRegressor, self).__init__() super(DistributionRegressor, self).__init__()
self.fc1 = nn.Linear(n_classes, hidden_dim) self.fc1 = nn.Linear(n_classes, hidden_dim)
self.fc2 = nn.Linear(hidden_dim, n_classes) self.fc2 = nn.Linear(hidden_dim, n_classes)

View File

@ -6,7 +6,7 @@ from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR from sklearn.svm import SVR
from LocalStack._neural import DistributionRegressor from LocalStack._neural import DistributionRegressor
from data import LabelledCollection from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier from quapy.method.base import BaseQuantifier
from quapy.method.aggregative import AggregativeSoftQuantifier from quapy.method.aggregative import AggregativeSoftQuantifier
from tqdm import tqdm from tqdm import tqdm
@ -41,7 +41,7 @@ class LocalStackingQuantification(BaseQuantifier):
samples_pred_prevs = [] samples_pred_prevs = []
samples_distance = [] samples_distance = []
for i in range(self.n_samples_gen): for i in range(self.n_samples_gen):
sample_i = self.val_data.sampling(test_size, *pred_prevs) sample_i = self.val_data.sampling(test_size, *pred_prevs, random_state=self.random_state)
pred_prev_sample_i = self.surrogate_quantifier.quantify(sample_i.X) pred_prev_sample_i = self.surrogate_quantifier.quantify(sample_i.X)
err_dist = self.comparison_measure(pred_prevs, pred_prev_sample_i) err_dist = self.comparison_measure(pred_prevs, pred_prev_sample_i)
@ -53,7 +53,7 @@ class LocalStackingQuantification(BaseQuantifier):
samples_sel = np.asarray(samples)[ord_distances][:self.n_samples_sel] samples_sel = np.asarray(samples)[ord_distances][:self.n_samples_sel]
samples_pred_prevs_sel = np.asarray(samples_pred_prevs)[ord_distances][:self.n_samples_sel] samples_pred_prevs_sel = np.asarray(samples_pred_prevs)[ord_distances][:self.n_samples_sel]
reg = MultiOutputRegressor(SVR(C=1000)) reg = MultiOutputRegressor(SVR())
reg_X = samples_pred_prevs_sel reg_X = samples_pred_prevs_sel
reg_y = [s.prevalence() for s in samples_sel] reg_y = [s.prevalence() for s in samples_sel]
reg.fit(reg_X, reg_y) reg.fit(reg_X, reg_y)

View File

@ -14,7 +14,7 @@ from . import model_selection
from . import classification from . import classification
import os import os
__version__ = '0.1.9' __version__ = '0.1.10'
environ = { environ = {
'SAMPLE_SIZE': None, 'SAMPLE_SIZE': None,

View File

@ -502,7 +502,7 @@ class Dataset:
return len(self.vocabulary) return len(self.vocabulary)
@property @property
def train_test(self): def train_test(self) -> (LabelledCollection, LabelledCollection):
""" """
Alias to `self.training` and `self.test` Alias to `self.training` and `self.test`

View File

@ -3,6 +3,7 @@ from contextlib import contextmanager
import zipfile import zipfile
from os.path import join from os.path import join
import pandas as pd import pandas as pd
import sklearn.datasets
from ucimlrepo import fetch_ucirepo from ucimlrepo import fetch_ucirepo
from quapy.data.base import Dataset, LabelledCollection from quapy.data.base import Dataset, LabelledCollection
from quapy.data.preprocessing import text2tfidf, reduce_columns from quapy.data.preprocessing import text2tfidf, reduce_columns
@ -1004,3 +1005,49 @@ def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=No
return train, test_gen return train, test_gen
else: else:
return train_gen, test_gen return train_gen, test_gen
def syntheticUniformLabelledCollection(n_samples, n_features, n_classes, n_clusters_per_class=1, **kwargs):
    """
    Creates a synthetic labelled collection with uniform priors, consisting of
    `n_samples` instances, `n_features` features, and `n_classes` classes.
    The data is generated by means of the function
    `sklearn.datasets.make_classification`; any additional option can be passed via `kwargs`
    (see the `scikit-learn documentation
    <https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html>`_
    for the complete list of optional parameters).

    :param n_samples: number of instances
    :param n_features: number of features
    :param n_classes: number of classes
    :param n_clusters_per_class: number of clusters per class (default 1), passed on to the generator
    :param kwargs: additional keyword arguments passed on to `make_classification`
    :return: a :class:`LabelledCollection` with the generated instances and labels
    """
    generator_args = dict(
        n_samples=n_samples,
        n_features=n_features,
        n_classes=n_classes,
        n_clusters_per_class=n_clusters_per_class,
    )
    generator_args.update(kwargs)
    instances, labels = sklearn.datasets.make_classification(**generator_args)
    return LabelledCollection(instances, labels)
def syntheticUniformDataset(n_samples, n_features, n_classes, test_split=0.3, **kwargs):
    """
    Creates a synthetic Dataset with approximately uniform priors, consisting of
    `n_samples` instances, `n_features` features, and `n_classes` classes.
    The data is generated by means of the function
    `sklearn.datasets.make_classification`; any additional option can be passed via `kwargs`
    (see the `scikit-learn documentation
    <https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html>`_
    for the complete list of optional parameters). The generated collection is then
    split, in a stratified way, into a training set and a test set.

    :param n_samples: number of instances
    :param n_features: number of features
    :param n_classes: number of classes
    :param test_split: proportion of test instances, a value in (0, 1) (default 0.3)
    :param kwargs: additional keyword arguments passed on to the generator; if `random_state`
        is among them, it is also used for the train/test split
    :return: a :class:`Dataset` with the training and test sets
    """
    assert 0. < test_split < 1., "invalid proportion of test instances; the value must be in (0, 1)"
    seed = kwargs.get('random_state', None)
    collection = syntheticUniformLabelledCollection(n_samples, n_features, n_classes, **kwargs)
    train_part, test_part = collection.split_stratified(train_prop=1-test_split, random_state=seed)
    return Dataset(training=train_part, test=test_part, name=f'synthetic(nF={n_features},nC={n_classes})')

View File

@ -66,11 +66,13 @@ class KDEBase:
""" """
class_cond_X = [] class_cond_X = []
for cat in classes: for cat in classes:
selX = X[y==cat] selX = X[y == cat]
if selX.size==0: if selX.size == 0:
selX = [F.uniform_prevalence(len(classes))] selX = [F.uniform_prevalence(len(classes))]
class_cond_X.append(selX) class_cond_X.append(selX)
return [self.get_kde_function(X_cond_yi, bandwidth) for X_cond_yi in class_cond_X] if isinstance(bandwidth, float) or isinstance(bandwidth, str):
bandwidth = np.full(fill_value=bandwidth, shape=(len(classes),))
return [self.get_kde_function(X_cond_yi, band_i) for X_cond_yi, band_i in zip(class_cond_X, bandwidth)]
class KDEyML(AggregativeSoftQuantifier, KDEBase): class KDEyML(AggregativeSoftQuantifier, KDEBase):
@ -188,7 +190,7 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase):
def __init__(self, classifier: BaseEstimator=None, val_split=5, divergence: str='HD', def __init__(self, classifier: BaseEstimator=None, val_split=5, divergence: str='HD',
bandwidth=0.1, random_state=None, montecarlo_trials=10000): bandwidth=0.1, random_state=None, montecarlo_trials=10000):
self.classifier = qp._get_classifier(classifier) self.classifier = qp._get_classifier(classifier)
self.val_split = val_split self.val_split = val_split
self.divergence = divergence self.divergence = divergence
@ -218,7 +220,7 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase):
def f_squared_hellinger(u): def f_squared_hellinger(u):
return (np.sqrt(u)-1)**2 return (np.sqrt(u)-1)**2
# todo: this will fail when self.divergence is a callable, and is not the right place to do it anyway # todo: this will fail when self.divergence is a callable, and is not the right place to do it anyway
if self.divergence.lower() == 'hd': if self.divergence.lower() == 'hd':
f = f_squared_hellinger f = f_squared_hellinger
@ -283,7 +285,7 @@ class KDEyCS(AggregativeSoftQuantifier):
def gram_matrix_mix_sum(self, X, Y=None): def gram_matrix_mix_sum(self, X, Y=None):
# this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y)) # this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
# to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y and Sigma1 and Sigma2 are # to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y and Sigma1 and Sigma2 are
# two "scalar matrices" (h^2)*I each, so Sigma1+Sigma2 has scalar 2(h^2) (h is the bandwidth) # two "scalar matrices" (h^2)*I each, so Sigma1+Sigma2 has scalar 2(h^2) (h is the bandwidth)
h = self.bandwidth h = self.bandwidth
variance = 2 * (h**2) variance = 2 * (h**2)
@ -342,7 +344,7 @@ class KDEyCS(AggregativeSoftQuantifier):
# at each iteration of the optimization phase) # at each iteration of the optimization phase)
tr_te_sums = np.zeros(shape=n, dtype=float) tr_te_sums = np.zeros(shape=n, dtype=float)
for i in range(n): for i in range(n):
tr_te_sums[i] = self.gram_matrix_mix_sum(Ptr[y==i], Pte) tr_te_sums[i] = self.gram_matrix_mix_sum(Ptr[y==i], Pte)
def divergence(alpha): def divergence(alpha):
# called \overline{r} in the paper # called \overline{r} in the paper

1
result_table Submodule

@ -0,0 +1 @@
Subproject commit c223c9f1fe3c9708e8c5a5c56e438cdaaa857be4