Compare commits

...

4 Commits

10 changed files with 533 additions and 60 deletions

3
.gitmodules vendored Normal file
View File

@ -0,0 +1,3 @@
[submodule "result_table"]
path = result_table
url = gitea@gitea-s2i2s.isti.cnr.it:moreo/result_table.git

2
KDEy/constants.py Normal file
View File

@ -0,0 +1,2 @@
DEBUG = False

View File

@ -1,13 +1,17 @@
import os
import pickle
import shutil
import numpy as np
from sklearn.linear_model import LogisticRegression
from os.path import join
import quapy as qp
from quapy.protocol import UPP
from quapy.method.aggregative import KDEyML
from quapy.protocol import UPP
from kdey_devel import KDEyMLauto
from utils import *
from constants import *
import quapy.functional as F
DEBUG = False
qp.environ["SAMPLE_SIZE"] = 100 if DEBUG else 500
val_repeats = 100 if DEBUG else 500
@ -20,21 +24,24 @@ val_choice = {}
bandwidth_range = np.linspace(0.01, 0.20, 20)
if DEBUG:
bandwidth_range = np.linspace(0.01, 0.20, 10)
bandwidth_range = np.linspace(0.01, 0.20, 5)
def datasets():
for dataset_name in qp.datasets.UCI_MULTICLASS_DATASETS:
dataset_list = qp.datasets.UCI_MULTICLASS_DATASETS[:4]
if DEBUG:
dataset_list = dataset_list[:4]
for dataset_name in dataset_list:
dataset = qp.datasets.fetch_UCIMulticlassDataset(dataset_name)
if DEBUG:
dataset = dataset.reduce(random_state=0)
yield dataset
def experiment_dataset(dataset):
train, test = dataset.train_test
test_gen = UPP(test, repeats=test_repeats)
@measuretime
def predict_b_modsel(dataset):
# bandwidth chosen during model selection in validation
train = dataset.training
train_tr, train_va = train.split_stratified(random_state=0)
kdey = KDEyML(random_state=0)
modsel = qp.model_selection.GridSearchQ(
@ -47,66 +54,69 @@ def experiment_dataset(dataset):
).fit(train_tr)
chosen_bandwidth = modsel.best_params_['bandwidth']
modsel_choice = float(chosen_bandwidth)
# kdey.set_params(bandwidth=chosen_bandwidth)
# kdey.fit(train)
# kdey.qua
return modsel_choice
# results in test
print(f"testing KDEy in {dataset.name}")
dataset_results = []
for b in bandwidth_range:
kdey = KDEyML(bandwidth=b, random_state=0)
@measuretime
def predict_b_kdeymlauto(dataset):
# bandwidth chosen during model selection in validation
train, test = dataset.train_test
kdey = KDEyMLauto(random_state=0)
print(f'true-prevalence: {F.strprev(test.prevalence())}')
chosen_bandwidth, _ = kdey.chose_bandwidth(train, test.X)
auto_bandwidth = float(chosen_bandwidth)
return auto_bandwidth
def in_test_search(dataset, n_jobs=-1):
train, test = dataset.train_test
print(f"generating true tests scores using KDEy in {dataset.name}")
def experiment_job(bandwidth):
kdey = KDEyML(bandwidth=bandwidth, random_state=0)
kdey.fit(train)
test_gen = UPP(test, repeats=test_repeats)
mae = qp.evaluation.evaluate(kdey, protocol=test_gen, error_metric='mae', verbose=True)
print(f'bandwidth={b}: {mae:.5f}')
dataset_results.append((float(b), float(mae)))
print(f'{bandwidth=}: {mae:.5f}')
return float(mae)
return modsel_choice, dataset_results
dataset_results = qp.util.parallel(experiment_job, bandwidth_range, n_jobs=n_jobs)
return dataset_results, bandwidth_range
def plot_bandwidth(val_choice, test_results):
for dataset_name in val_choice.keys():
import matplotlib.pyplot as plt
bandwidths, results = zip(*test_results[dataset_name])
# Crear la gráfica
plt.figure(figsize=(8, 6))
# Graficar los puntos de datos
plt.plot(bandwidths, results, marker='o')
# Agregar la línea vertical en bandwidth_chosen
plt.axvline(x=val_choice[dataset_name], color='r', linestyle='--', label=f'Bandwidth elegido: {val_choice[dataset_name]}')
# Agregar etiquetas y título
plt.xlabel('Bandwidth')
plt.ylabel('Resultado')
plt.title('Gráfica de Bandwidth vs Resultado')
# Mostrar la leyenda
plt.legend()
# Mostrar la gráfica
plt.grid(True)
# plt.show()
os.makedirs('./plots', exist_ok=True)
plt.savefig(f'./plots/{dataset_name}.png')
for dataset in datasets():
if DEBUG:
result_path = f'./results/debug/{dataset.name}.pkl'
else:
result_path = f'./results/{dataset.name}.pkl'
print('NAME', dataset.name)
print(len(dataset.training))
print(len(dataset.test))
modsel_choice, dataset_results = qp.util.pickled_resource(result_path, experiment_dataset, dataset)
val_choice[dataset.name] = modsel_choice
test_results[dataset.name] = dataset_results
result_path = f'./results/{dataset.name}/'
if DEBUG:
result_path = result_path.replace('results', 'results_debug')
if os.path.exists(result_path):
shutil.rmtree(result_path)
dataset_results, bandwidth_range = qp.util.pickled_resource(join(result_path, 'test.pkl'), in_test_search, dataset)
triplet_list_results = []
modsel_choice, modsel_time = qp.util.pickled_resource(join(result_path, 'modsel.pkl'), predict_b_modsel, dataset)
triplet_list_results.append(('modsel', modsel_choice, modsel_time,))
auto_choice, auto_time = qp.util.pickled_resource(join(result_path, 'auto.pkl'), predict_b_kdeymlauto, dataset)
triplet_list_results.append(('auto', auto_choice, auto_time,))
print(f'Dataset = {dataset.name}')
print(modsel_choice)
print(dataset_results)
plot_bandwidth(val_choice, test_results)
plot_bandwidth(dataset.name, dataset_results, bandwidth_range, triplet_list_results)
error_table(dataset.name, dataset_results, bandwidth_range, triplet_list_results)
# time_table(dataset.name, dataset_results, bandwidth_range, triplet_list_results)

171
KDEy/kdey_devel.py Normal file
View File

@ -0,0 +1,171 @@
from typing import Union, Callable
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.neighbors import KernelDensity
import quapy as qp
from quapy.data import LabelledCollection
from quapy.method.aggregative import AggregativeSoftQuantifier, KDEyML
import quapy.functional as F
from sklearn.metrics.pairwise import rbf_kernel
from scipy import optimize
class KDEyMLauto(KDEyML):
    """
    Variant of :class:`KDEyML` in which the kernel bandwidth is not a fixed hyperparameter: it is searched,
    together with the class prevalence values, by minimizing the negative log-likelihood of the test
    posteriors under the mixture of class-conditional KDEs (see :meth:`transduce`).

    :param classifier: the classifier handed to `qp._get_classifier`
    :param val_split: passed as `predict_on` when generating the training posteriors in
        :meth:`chose_bandwidth`
    :param random_state: seed used (via `qp.util.temp_seed`) during the optimization
    :param optim: optimization strategy; one of

        * `'two_steps'`: alternates a prevalence-only search and a bandwidth-only search,
        * `'both'`: searches the prevalence vector and one shared bandwidth jointly,
        * `'both_fine'`: searches the prevalence vector and one bandwidth per class jointly.
    """

    # optimization strategies understood by `transduce`
    _OPTIM_MODES = ('two_steps', 'both', 'both_fine')

    def __init__(self, classifier: BaseEstimator = None, val_split=5, random_state=None, optim='two_steps'):
        self.classifier = qp._get_classifier(classifier)
        self.val_split = val_split
        self.bandwidth = None  # set by `transduce` once the search has converged
        self.random_state = random_state
        self.optim = optim

    def chose_bandwidth(self, train, test_instances):
        """
        Fits the classifier on `train`, classifies `test_instances`, and runs the bandwidth/prevalence
        search on the resulting posteriors.

        :param train: the training collection
        :param test_instances: the (unlabelled) test instances
        :return: a tuple `(bandwidth, prevalence)` with the bandwidth selected by the search (a vector
            with one value per class when `optim='both_fine'`) and the estimated prevalence vector
        """
        classif_predictions = self.classifier_fit_predict(train, fit_classifier=True, predict_on=self.val_split)
        te_posteriors = self.classify(test_instances)
        prevalence = self.transduce(classif_predictions, te_posteriors)
        # `transduce` only returns the prevalence estimate; the bandwidth it found is kept in
        # `self.bandwidth`, and is what the caller of this method is after
        return self.bandwidth, prevalence

    def transduce(self, classif_predictions, te_posteriors):
        """
        Iterates the optimization selected by `self.optim` until neither the bandwidth nor the prevalence
        changes by more than the tolerance between two consecutive iterations. The search starts from a
        bandwidth of 0.05 and from the uniform prevalence. The bandwidth found is stored in
        `self.bandwidth`.

        :param classif_predictions: collection with the posteriors (and true labels) of the training data
        :param te_posteriors: posterior probabilities of the test instances
        :return: the estimated prevalence vector
        :raises ValueError: if `self.optim` is not one of the supported strategies
        """
        if self.optim not in self._OPTIM_MODES:
            # without this check no branch below would ever flag convergence and the loop would never end
            raise ValueError(f'unknown optim={self.optim!r}; valid ones are {self._OPTIM_MODES}')

        tr_posteriors, tr_y = classif_predictions.Xy
        classes = classif_predictions.classes_
        n_classes = len(classes)

        current_bandwidth = 0.05
        if self.optim == 'both_fine':
            # one bandwidth per class
            current_bandwidth = np.full(fill_value=current_bandwidth, shape=(n_classes,))
        current_prevalence = np.full(fill_value=1 / n_classes, shape=(n_classes,))

        iterations = 0
        convergence = False
        with qp.util.temp_seed(self.random_state):
            while not convergence:
                previous_bandwidth = current_bandwidth
                previous_prevalence = current_prevalence

                iterations += 1
                print(f'{iterations}:')

                if self.optim == 'two_steps':
                    current_prevalence = self.optim_minimize_prevalence(
                        current_bandwidth, current_prevalence, tr_posteriors, tr_y, te_posteriors, classes)
                    print(f'\testim-prev={F.strprev(current_prevalence)}')
                    current_bandwidth = self.optim_minimize_bandwidth(
                        current_bandwidth, current_prevalence, tr_posteriors, tr_y, te_posteriors, classes)
                    print(f'\tbandwidth={current_bandwidth}')
                elif self.optim == 'both':
                    current_prevalence, current_bandwidth = self.optim_minimize_both(
                        current_bandwidth, current_prevalence, tr_posteriors, tr_y, te_posteriors, classes)
                else:  # 'both_fine'
                    current_prevalence, current_bandwidth = self.optim_minimize_both_fine(
                        current_bandwidth, current_prevalence, tr_posteriors, tr_y, te_posteriors, classes)

                # np.allclose handles both the scalar bandwidth and the per-class bandwidth vector
                convergence = (
                    np.allclose(previous_bandwidth, current_bandwidth, atol=0.0001)
                    and np.allclose(previous_prevalence, current_prevalence, atol=0.0001)
                )

        self.bandwidth = current_bandwidth
        print('bandwidth=', current_bandwidth)
        print('prevalence=', current_prevalence)
        return current_prevalence

    def _test_densities(self, bandwidth, tr_posteriors, tr_y, te_posteriors, classes):
        """Evaluates each class-conditional KDE (built with `bandwidth`) on the test posteriors."""
        mix_densities = self.get_mixture_components(tr_posteriors, tr_y, classes, bandwidth)
        return [self.pdf(kde_i, te_posteriors) for kde_i in mix_densities]

    @staticmethod
    def _neg_loglikelihood(prevalence, test_densities, epsilon=1e-10):
        """Negative log-likelihood of the test points under the `prevalence`-weighted mixture."""
        test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prevalence, test_densities))
        # epsilon keeps the logarithm finite when the mixture likelihood is 0
        test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
        return -np.sum(test_loglikelihood)

    def optim_minimize_prevalence(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes):
        """
        Searches for the prevalence vector that minimizes the negative log-likelihood while the bandwidth
        is kept fixed to `current_bandwidth`; the search starts at `current_prev`.

        :return: the prevalence vector found
        """
        # the bandwidth is fixed, so the densities are computed only once, outside the loss
        test_densities = self._test_densities(current_bandwidth, tr_posteriors, tr_y, te_posteriors, classes)

        def neg_loglikelihood_prev(prev):
            return self._neg_loglikelihood(prev, test_densities)

        return optim_minimize(neg_loglikelihood_prev, current_prev)

    def optim_minimize_bandwidth(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes):
        """
        Searches (SLSQP, bandwidth bounded to [1e-5, 1]) for the single bandwidth that minimizes the
        negative log-likelihood while the prevalence is kept fixed to `current_prev`; the search starts
        at `current_bandwidth`.

        :return: the bandwidth found
        """
        def neg_loglikelihood_bandwidth(bandwidth):
            # the optimizer works on a 1-element vector; the KDEs take the scalar
            test_densities = self._test_densities(bandwidth[0], tr_posteriors, tr_y, te_posteriors, classes)
            return self._neg_loglikelihood(current_prev, test_densities)

        bounds = [(0.00001, 1)]
        r = optimize.minimize(neg_loglikelihood_bandwidth, x0=[current_bandwidth], method='SLSQP', bounds=bounds)
        print(f'iterations-bandwidth={r.nit}')
        return r.x[0]

    def optim_minimize_both(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes):
        """
        Jointly searches (SLSQP) for the prevalence vector and one shared bandwidth. The optimized vector
        is laid out as `[prevalence_1, ..., prevalence_n, bandwidth]`; prevalence values are bounded to
        [0, 1] and constrained to sum up to 1, and the bandwidth is bounded to [1e-5, 1].

        :return: a tuple `(prevalence, bandwidth)`
        """
        n_classes = len(current_prev)

        def neg_loglikelihood_bandwidth(prevalence_bandwidth):
            bandwidth = prevalence_bandwidth[-1]
            prevalence = prevalence_bandwidth[:-1]
            test_densities = self._test_densities(bandwidth, tr_posteriors, tr_y, te_posteriors, classes)
            return self._neg_loglikelihood(prevalence, test_densities)

        bounds = [(0, 1) for _ in range(n_classes)] + [(0.00001, 1)]
        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x[:n_classes])})
        prevalence_bandwidth = np.append(current_prev, current_bandwidth)
        r = optimize.minimize(neg_loglikelihood_bandwidth, x0=prevalence_bandwidth, method='SLSQP', bounds=bounds, constraints=constraints)
        print(f'iterations-both={r.nit}')
        prev_band = r.x
        current_prevalence = prev_band[:-1]
        current_bandwidth = prev_band[-1]
        return current_prevalence, current_bandwidth

    def optim_minimize_both_fine(self, current_bandwidth, current_prev, tr_posteriors, tr_y, te_posteriors, classes):
        """
        Jointly searches (SLSQP) for the prevalence vector and one bandwidth per class. The optimized
        vector is laid out as `[prevalence_1, ..., prevalence_n, bandwidth_1, ..., bandwidth_n]`;
        prevalence values are bounded to [0, 1] and constrained to sum up to 1, and every bandwidth is
        bounded to [1e-5, 1].

        :return: a tuple `(prevalence, bandwidths)`
        """
        n_classes = len(current_bandwidth)

        def neg_loglikelihood_bandwidth(prevalence_bandwidth):
            prevalence = prevalence_bandwidth[:n_classes]
            bandwidth = prevalence_bandwidth[n_classes:]
            test_densities = self._test_densities(bandwidth, tr_posteriors, tr_y, te_posteriors, classes)
            return self._neg_loglikelihood(prevalence, test_densities)

        bounds = [(0, 1) for _ in range(n_classes)] + [(0.00001, 1) for _ in range(n_classes)]
        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x[:n_classes])})
        prevalence_bandwidth = np.concatenate((current_prev, current_bandwidth))
        r = optimize.minimize(neg_loglikelihood_bandwidth, x0=prevalence_bandwidth, method='SLSQP', bounds=bounds, constraints=constraints)
        print(f'iterations-both-fine={r.nit}')
        prev_band = r.x
        current_prevalence = prev_band[:n_classes]
        current_bandwidth = prev_band[n_classes:]
        return current_prevalence, current_bandwidth

    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
        """
        Only stores the training posteriors: no KDE is fitted here since the bandwidth is chosen at
        aggregation time, once the test posteriors are available.

        :param classif_predictions: collection with the posteriors (and true labels) of the training data
        :param data: not used by this method
        :return: self
        """
        self.classif_predictions = classif_predictions
        return self

    def aggregate(self, posteriors: np.ndarray):
        """
        Estimates the prevalence of a test sample by running the bandwidth/prevalence search on it.

        :param posteriors: posterior probabilities of the test instances
        :return: the estimated prevalence vector
        """
        return self.transduce(self.classif_predictions, posteriors)
def optim_minimize(loss: Callable, init_prev: np.ndarray):
    """
    Finds the prevalence vector, i.e., a point of the (`n_classes`-1)-simplex, for which `loss` is smallest.
    The constrained search is delegated to scipy's SLSQP routine and the number of iterations it took is
    printed.

    :param loss: (callable) the function to minimize; it receives a candidate prevalence vector
    :param init_prev: (ndarray) the prevalence vector the search starts from; its length fixes `n_classes`
    :return: (ndarray) the best prevalence vector found
    """
    dimensions = len(init_prev)

    def sums_up_to_one(x):
        # equality constraint: it is 0 only when the components add up to 1
        return 1 - sum(x)

    # every component is confined to [0,1]
    box = ((0, 1),) * dimensions
    simplex = {'type': 'eq', 'fun': sums_up_to_one}

    solution = optimize.minimize(loss, x0=init_prev, method='SLSQP', bounds=box, constraints=simplex)
    print(f'iterations-prevalence={solution.nit}')
    return solution.x

View File

@ -0,0 +1,156 @@
import pickle
import os
from time import time
from collections import defaultdict
import numpy as np
from sklearn.linear_model import LogisticRegression
import quapy as qp
from KDEy.kdey_devel import KDEyMLauto
from quapy.method.aggregative import PACC, EMQ, KDEyML
from quapy.model_selection import GridSearchQ
from quapy.protocol import UPP
from pathlib import Path
# seed shared by the experiments of this script: it is handed to `qp.util.temp_seed` and used as the
# `random_state` of the stratified train/validation split
SEED = 1
def newLR():
    """Builds a new, unfitted LogisticRegression that is allowed up to 3000 solver iterations."""
    classifier = LogisticRegression(max_iter=3000)
    return classifier
# hyperparameters explored for Logistic Regression; the grid is currently collapsed to one single
# configuration (C=1, no class weighting), so no search over the classifier actually takes place
logreg_grid = {
'C': [1],
'class_weight': [None]
}
def wrap_hyper(classifier_hyper_grid: dict):
    """
    Prefixes every hyperparameter name of a classifier grid with `classifier__`.

    :param classifier_hyper_grid: dictionary mapping hyperparameter names to the values to explore
    :return: a new dictionary with the same values stored under the prefixed names
    """
    wrapped = {}
    for name, values in classifier_hyper_grid.items():
        wrapped['classifier__' + name] = values
    return wrapped
# methods that undergo model selection; every entry is a triplet (method name, quantifier, hyperparameter
# grid). KDEy-ML additionally explores 20 evenly spaced bandwidth values in [0.01, 0.2]
METHODS = [
('PACC', PACC(newLR()), wrap_hyper(logreg_grid)),
('EMQ', EMQ(newLR()), wrap_hyper(logreg_grid)),
('KDEy-ML', KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.linspace(0.01, 0.2, 20)}}),
]
# English rendering of the note kept in the string below:
#   TKDEyML  optimized first the bandwidth (init 0.05) and then the prevalence (init uniform)
#   TKDEyML2 optimized first the prevalence (init uniform) and then the bandwidth (init 0.05)
#   TKDEyML3 optimized first the prevalence (init uniform) and then the bandwidth (init 0.1)
#   TKDEyML4 is like ML2 but with at most 5 iterations per optimization
"""
TKDEyML era primero bandwidth (init 0.05) y luego prevalence (init uniform)
TKDEyML2 era primero prevalence (init uniform) y luego bandwidth (init 0.05)
TKDEyML3 era primero prevalence (init uniform) y luego bandwidth (init 0.1)
TKDEyML4 es como ML2 pero max 5 iteraciones por optimización
"""
# methods that choose the bandwidth by themselves; entries follow the (name, quantifier, grid) layout,
# with no grid (None). The main script skips model selection for names starting with "TKDEy-ML"
TRANSDUCTIVE_METHODS = [
#('TKDEy-ML', KDEyMLauto(newLR()), None),
('TKDEy-MLboth', KDEyMLauto(newLR(), optim='both'), None),
('TKDEy-MLbothfine', KDEyMLauto(newLR(), optim='both_fine'), None),
('TKDEy-ML2', KDEyMLauto(newLR()), None),
#('TKDEy-ML3', KDEyMLauto(newLR()), None),
#('TKDEy-ML4', KDEyMLauto(newLR()), None),
]
def show_results(result_path):
    """
    Prints a summary of the global results file: average MAE, MRAE and training time for every
    dataset (rows) and method (columns), including the marginal means.

    :param result_path: path to the tab-separated results file, without the '.csv' extension
    """
    import pandas as pd

    results = pd.read_csv(result_path + '.csv', sep='\t')

    # lift the display limits so that the whole table gets printed
    for option in ('display.max_columns', 'display.max_rows'):
        pd.set_option(option, None)

    summary = results.pivot_table(
        index='Dataset',
        columns="Method",
        values=["MAE", "MRAE", "t_train"],
        margins=True,
    )
    print(summary)
def load_timings(result_path):
    """
    Recovers the training times stored in a previous global results file.

    :param result_path: path to the tab-separated results file, without the '.csv' extension
    :return: a defaultdict (missing methods map to an empty dict) indexed by method name and then by
        dataset name; it is empty if the results file does not exist
    """
    import pandas as pd

    csv_file = Path(result_path + '.csv')
    timings = defaultdict(lambda: {})
    if csv_file.exists():
        previous = pd.read_csv(result_path + '.csv', sep='\t')
        per_method = previous.pivot_table(index='Dataset', columns='Method', values='t_train').to_dict()
        timings = timings | per_method
    return timings
# Runs every method on the first four UCI multiclass datasets, caching one report per
# (method, dataset) pair and appending one summary row per pair to a global tab-separated file.
if __name__ == '__main__':

    qp.environ['SAMPLE_SIZE'] = 500
    qp.environ['N_JOBS'] = -1
    n_bags_val = 25
    n_bags_test = 100
    result_dir = 'results_quantification/ucimulti'

    os.makedirs(result_dir, exist_ok=True)

    global_result_path = f'{result_dir}/allmethods'
    # the timings of previous runs have to be read before the global file is truncated right below
    timings = load_timings(global_result_path)
    with open(global_result_path + '.csv', 'wt') as csv:
        csv.write('Method\tDataset\tMAE\tMRAE\tt_train\n')

    for method_name, quantifier, param_grid in METHODS + TRANSDUCTIVE_METHODS:

        print('Init method', method_name)

        with open(global_result_path + '.csv', 'at') as csv:

            for dataset in qp.datasets.UCI_MULTICLASS_DATASETS[:4]:
                print('init', dataset)

                local_result_path = os.path.join(Path(global_result_path).parent,
                                                 method_name + '_' + dataset + '.dataframe')

                if os.path.exists(local_result_path):
                    print(f'result file {local_result_path} already exist; skipping')
                    report = qp.util.load_report(local_result_path)
                else:
                    with qp.util.temp_seed(SEED):

                        data = qp.datasets.fetch_UCIMulticlassDataset(dataset, verbose=True)
                        train, test = data.train_test

                        if method_name.startswith("TKDEy-ML"):
                            # these methods choose the bandwidth by themselves: no model selection
                            t_init = time()
                            quantifier.fit(train)
                        else:
                            # model selection
                            train, val = train.split_stratified(random_state=SEED)

                            protocol = UPP(val, repeats=n_bags_val)
                            modsel = GridSearchQ(
                                quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=1, error='mae'
                            )

                            t_init = time()
                            try:
                                modsel.fit(train)

                                print(f'best params {modsel.best_params_}')
                                print(f'best score {modsel.best_score_}')

                                quantifier = modsel.best_model()
                            except Exception as e:
                                # best-effort fallback; `Exception` (instead of a bare except) lets
                                # KeyboardInterrupt/SystemExit through, and the cause gets reported
                                print(f'something went wrong ({e!r})... trying to fit the default model')
                                quantifier.fit(train)
                        timings[method_name][dataset] = time() - t_init

                        # evaluation in test (common to both kinds of methods)
                        protocol = UPP(test, repeats=n_bags_test)
                        report = qp.evaluation.evaluation_report(
                            quantifier, protocol, error_metrics=['mae', 'mrae'], verbose=True
                        )
                        report.to_csv(local_result_path)

                means = report.mean(numeric_only=True)
                # the header declares a t_train column, so every row carries one; when the report was
                # loaded from disk and no timing was recovered for it, 'nan' is written
                t_train = timings[method_name].get(dataset, float('nan'))
                csv.write(
                    f'{method_name}\t{dataset}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{t_train:.3f}\n')
                csv.flush()

    show_results(global_result_path)

81
KDEy/utils.py Normal file
View File

@ -0,0 +1,81 @@
import time
from functools import wraps
import os
from os.path import join
from result_table.src.table import Table
import numpy as np
from constants import *
def measuretime(func):
    """
    Decorator that appends the time (in seconds) a call took to whatever the decorated function returns.

    If the function returns a tuple, the elapsed time is added as its last element; otherwise, a pair
    `(result, elapsed_time)` is returned.

    :param func: the function to time
    :return: the wrapped function
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic and high-resolution, hence suited for measuring intervals
        # (time.time follows the wall clock, which can be adjusted while the function runs)
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        time_it_took = time.perf_counter() - start_time
        if isinstance(result, tuple):
            return (*result, time_it_took)
        return result, time_it_took
    return wrapper
def plot_bandwidth(dataset_name, test_results, bandwidths, triplet_list_results):
    """
    Saves a plot of the test error as a function of the bandwidth, with one dashed vertical line at the
    bandwidth chosen by each method.

    :param dataset_name: name of the dataset; used as the title and as the name of the png file
    :param test_results: the test scores (MAE) obtained for each value in `bandwidths`
    :param bandwidths: the bandwidth values explored in test
    :param triplet_list_results: list of triplets (method name, chosen bandwidth, time)
    """
    import matplotlib.pyplot as plt

    print("PLOT", dataset_name)
    print(dataset_name)

    plt.figure(figsize=(8, 6))

    # the curve with the test results
    plt.plot(bandwidths, test_results, marker='o', color='k')

    # one color per method for the lines marking their choices
    palette = plt.cm.tab10(np.linspace(0, 1, len(triplet_list_results)))
    for line_color, (method_name, method_choice, _method_time) in zip(palette, triplet_list_results):
        plt.axvline(x=method_choice, linestyle='--', label=method_name, color=line_color)

    # axis labels and title
    plt.xlabel('Bandwidth')
    plt.ylabel('MAE')
    plt.title(dataset_name)

    # legend placed outside the axes, at the right
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    plt.grid(True)

    # debug runs are kept apart from the real ones
    plotdir = './plots_debug' if DEBUG else './plots'
    os.makedirs(plotdir, exist_ok=True)

    plt.tight_layout()
    plt.savefig(f'{plotdir}/{dataset_name}.png')
    plt.close()
def error_table(dataset_name, test_results, bandwidth_range, triplet_list_results):
    """
    Generates a PDF table comparing, for each method, the bandwidth it chose against the bandwidth that
    attains the lowest score in test.

    :param dataset_name: name of the dataset; used as the name of the table and of the pdf file
    :param test_results: the test scores obtained for each value in `bandwidth_range`
    :param bandwidth_range: the bandwidth values explored in test
    :param triplet_list_results: list of triplets (method name, chosen bandwidth, time)
    """
    lowest_score = np.min(test_results)
    bandwidth_of_lowest = bandwidth_range[np.argmin(test_results)]

    print('Method\tChoice\tAE\tTime')

    table = Table(name=dataset_name)
    table.format.with_mean = False
    table.format.with_rank_mean = False
    table.format.show_std = False

    for method_name, method_choice, took in triplet_list_results:
        # a score is only known for choices that coincide with one of the explored values; any other
        # choice is assigned a score of 1
        method_score = 1
        if method_choice in bandwidth_range:
            position = np.where(bandwidth_range == method_choice)[0][0]
            method_score = test_results[position]
        error = np.abs(lowest_score - method_score)

        cells = (
            ('Choice', method_choice),
            ('ScoreChoice', method_score),
            ('Best', bandwidth_of_lowest),
            ('ScoreBest', lowest_score),
            ('AE', error),
            ('Time', took),
        )
        for benchmark, value in cells:
            table.add(benchmark=benchmark, method=method_name, v=value)

    # debug runs are kept apart from the real ones
    outpath = './tables_debug' if DEBUG else './tables'
    table.latexPDF(join(outpath, dataset_name+'.pdf'), transpose=True)

View File

@ -14,7 +14,7 @@ from . import model_selection
from . import classification
import os
__version__ = '0.1.9'
__version__ = '0.1.10'
environ = {
'SAMPLE_SIZE': None,

View File

@ -3,6 +3,7 @@ from contextlib import contextmanager
import zipfile
from os.path import join
import pandas as pd
import sklearn.datasets
from ucimlrepo import fetch_ucirepo
from quapy.data.base import Dataset, LabelledCollection
from quapy.data.preprocessing import text2tfidf, reduce_columns
@ -1004,3 +1005,49 @@ def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=No
return train, test_gen
else:
return train_gen, test_gen
def syntheticUniformLabelledCollection(n_samples, n_features, n_classes, n_clusters_per_class=1, **kwargs):
    """
    Creates a synthetic labelled collection of `n_samples` instances, `n_features` features, and
    `n_classes` classes, with uniform priors.
    The data are drawn by means of the function `sklearn.datasets.make_classification`; any further
    option can be handed to it through the `kwargs`; see the `scikit-learn documentation
    <https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html>`_
    for a full list of optional parameters.

    :param n_samples: number of instances
    :param n_features: number of features
    :param n_classes: number of classes
    :param n_clusters_per_class: number of clusters per class (default 1)
    :return: a :class:`LabelledCollection` with the generated instances and labels
    """
    generator = sklearn.datasets.make_classification
    instances, labels = generator(
        n_samples=n_samples,
        n_features=n_features,
        n_classes=n_classes,
        n_clusters_per_class=n_clusters_per_class,
        **kwargs
    )
    collection = LabelledCollection(instances, labels)
    return collection
def syntheticUniformDataset(n_samples, n_features, n_classes, test_split=0.3, **kwargs):
    """
    Creates a synthetic Dataset of `n_samples` instances, `n_features` features, and `n_classes`
    classes, with approximately uniform priors, and splits it (stratified) into training and test.
    The data are drawn by means of the function `sklearn.datasets.make_classification`; any further
    option can be handed to it through the `kwargs`; see the `scikit-learn documentation
    <https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html>`_
    for a full list of optional parameters. If `random_state` is among the `kwargs`, it also seeds
    the training/test split.

    :param n_samples: number of instances
    :param n_features: number of features
    :param n_classes: number of classes
    :param test_split: proportion of test instances
    :return: a :class:`Dataset` named after the number of features and classes
    """
    assert 0. < test_split < 1., "invalid proportion of test instances; the value must be in (0, 1)"

    collection = syntheticUniformLabelledCollection(n_samples, n_features, n_classes, **kwargs)

    split_seed = kwargs.get('random_state', None)
    training, test = collection.split_stratified(train_prop=1-test_split, random_state=split_seed)

    return Dataset(training=training, test=test, name=f'synthetic(nF={n_features},nC={n_classes})')

View File

@ -66,11 +66,13 @@ class KDEBase:
"""
class_cond_X = []
for cat in classes:
selX = X[y==cat]
if selX.size==0:
selX = X[y == cat]
if selX.size == 0:
selX = [F.uniform_prevalence(len(classes))]
class_cond_X.append(selX)
return [self.get_kde_function(X_cond_yi, bandwidth) for X_cond_yi in class_cond_X]
if isinstance(bandwidth, float):
bandwidth = np.full(fill_value=bandwidth, shape=(len(classes),))
return [self.get_kde_function(X_cond_yi, band_i) for X_cond_yi, band_i in zip(class_cond_X, bandwidth)]
class KDEyML(AggregativeSoftQuantifier, KDEBase):
@ -188,7 +190,7 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase):
def __init__(self, classifier: BaseEstimator=None, val_split=5, divergence: str='HD',
bandwidth=0.1, random_state=None, montecarlo_trials=10000):
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.divergence = divergence
@ -218,7 +220,7 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase):
def f_squared_hellinger(u):
return (np.sqrt(u)-1)**2
# todo: this will fail when self.divergence is a callable, and is not the right place to do it anyway
if self.divergence.lower() == 'hd':
f = f_squared_hellinger
@ -283,7 +285,7 @@ class KDEyCS(AggregativeSoftQuantifier):
def gram_matrix_mix_sum(self, X, Y=None):
# this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
# to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y and Sigma1 and Sigma2 are
# to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y and Sigma1 and Sigma2 are
# two "scalar matrices" (h^2)*I each, so Sigma1+Sigma2 has scalar 2(h^2) (h is the bandwidth)
h = self.bandwidth
variance = 2 * (h**2)
@ -342,7 +344,7 @@ class KDEyCS(AggregativeSoftQuantifier):
# at each iteration of the optimization phase)
tr_te_sums = np.zeros(shape=n, dtype=float)
for i in range(n):
tr_te_sums[i] = self.gram_matrix_mix_sum(Ptr[y==i], Pte)
tr_te_sums[i] = self.gram_matrix_mix_sum(Ptr[y==i], Pte)
def divergence(alpha):
# called \overline{r} in the paper

1
result_table Submodule

@ -0,0 +1 @@
Subproject commit c223c9f1fe3c9708e8c5a5c56e438cdaaa857be4