experiment

This commit is contained in:
Alejandro Moreo Fernandez 2025-09-27 17:41:12 +02:00
parent 636e33318f
commit c3fd92efde
4 changed files with 242 additions and 8 deletions

52
KDEyAitchison/commons.py Normal file
View File

@ -0,0 +1,52 @@
import numpy as np
import pandas as pd
from quapy.method.aggregative import EMQ, KDEyML
from sklearn.linear_model import LogisticRegression
METHODS = ['EMQ',
# 'KDEy-ML',
'KDEy-MLA'
]
# common hyperparameterss
hyper_LR = {
'classifier__C': np.logspace(-3, 3, 7),
'classifier__class_weight': ['balanced', None]
}
hyper_kde = {
'bandwidth': np.linspace(0.01, 0.2, 20)
}
hyper_kde_aitchison = {
'bandwidth': np.linspace(0.01, 2, 100)
}
# instances a new quantifier based on a string name
def new_method(method, **lr_kwargs):
lr = LogisticRegression(**lr_kwargs)
if method == 'KDEy-ML':
param_grid = {**hyper_kde, **hyper_LR}
quantifier = KDEyML(lr, kernel='gaussian')
elif method == 'KDEy-MLA':
param_grid = {**hyper_kde_aitchison, **hyper_LR}
quantifier = KDEyML(lr, kernel='aitchison')
elif method == 'EMQ':
param_grid = hyper_LR
quantifier = EMQ(lr)
else:
raise NotImplementedError('unknown method', method)
return param_grid, quantifier
def show_results(result_path):
df = pd.read_csv(result_path+'.csv', sep='\t')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"])
print(pv)

View File

@ -0,0 +1,34 @@
import pickle
import os
import sys
import pandas as pd
import quapy as qp
from quapy.model_selection import GridSearchQ
from quapy.protocol import UPP
from commons import METHODS, new_method, show_results
SEED = 1
if __name__ == '__main__':
print(qp.datasets.UCI_MULTICLASS_DATASETS)
for optim in ['mae']:
result_dir = f'results/ucimulti/{optim}'
for method in METHODS:
print()
global_result_path = f'{result_dir}/{method}'
print(f'Method\tDataset\tMAE\tMRAE\tKLD')
for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
print(dataset)
local_result_path = global_result_path + '_' + dataset
if os.path.exists(local_result_path + '.dataframe'):
report = pd.read_csv(local_result_path+'.dataframe')
print(f'{method}\t{dataset}\t{report["mae"].mean():.5f}')
else:
print(dataset, 'not found')

View File

@ -0,0 +1,94 @@
import pickle
import os
import sys
import pandas as pd
import quapy as qp
from quapy.model_selection import GridSearchQ
from quapy.protocol import UPP
from commons import METHODS, new_method, show_results
SEED = 1
if __name__ == '__main__':
qp.environ['SAMPLE_SIZE'] = 500
qp.environ['N_JOBS'] = -1
n_bags_val = 250
n_bags_test = 1000
for optim in ['mae']:
result_dir = f'results/ucimulti/{optim}'
os.makedirs(result_dir, exist_ok=True)
for method in METHODS:
print('Init method', method)
global_result_path = f'{result_dir}/{method}'
# show_results(global_result_path)
# sys.exit(0)
if not os.path.exists(global_result_path + '.csv'):
with open(global_result_path + '.csv', 'wt') as csv:
csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')
with open(global_result_path + '.csv', 'at') as csv:
for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
print('init', dataset)
local_result_path = global_result_path + '_' + dataset
if os.path.exists(local_result_path + '.dataframe'):
print(f'result file {local_result_path}.dataframe already exist; skipping')
report = pd.read_csv(local_result_path+'.dataframe')
print(report["mae"].mean())
# data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
# csv.write(f'{method}\t{data.name}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}\n')
continue
with qp.util.temp_seed(SEED):
param_grid, quantifier = new_method(method, max_iter=3000)
data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
# model selection
train, test = data.train_test
train, val = train.split_stratified(random_state=SEED)
protocol = UPP(val, repeats=n_bags_val)
modsel = GridSearchQ(
quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=True, error=optim
)
try:
modsel.fit(*train.Xy)
print(f'best params {modsel.best_params_}')
print(f'best score {modsel.best_score_}')
pickle.dump(
(modsel.best_params_, modsel.best_score_,),
open(f'{local_result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
quantifier = modsel.best_model()
except:
print('something went wrong... trying to fit the default model')
quantifier.fit(*train.Xy)
protocol = UPP(test, repeats=n_bags_test)
report = qp.evaluation.evaluation_report(
quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True
)
report.to_csv(f'{local_result_path}.dataframe')
print(f'{method}\t{data.name}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}\n')
csv.write(f'{method}\t{data.name}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}\n')
csv.flush()
show_results(global_result_path)

View File

@ -11,12 +11,28 @@ import quapy.functional as F
from sklearn.metrics.pairwise import rbf_kernel
# class KDE(KernelDensity):
#
# KERNELS = ['gaussian', 'aitchison']
#
# def __init__(self, bandwidth, kernel):
# assert kernel in KDE.KERNELS, f'unknown {kernel=}'
# self.bandwidth = bandwidth
# self.kernel = kernel
#
# def
class KDEBase:
"""
Common ancestor for KDE-based methods. Implements some common routines.
"""
BANDWIDTH_METHOD = ['scott', 'silverman']
KERNELS = ['gaussian', 'aitchison']
@classmethod
def _check_bandwidth(cls, bandwidth):
@ -30,31 +46,62 @@ class KDEBase:
f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values'
if isinstance(bandwidth, float):
assert 0 < bandwidth < 1, \
"the bandwith for KDEy should be in (0,1), since this method models the unit simplex"
"the bandwidth for KDEy should be in (0,1), since this method models the unit simplex"
return bandwidth
def get_kde_function(self, X, bandwidth):
@classmethod
def _check_kernel(cls, kernel):
"""
Checks that the kernel parameter is correct
:param kernel: str
:return: the validated kernel
"""
assert kernel in KDEBase.KERNELS, f'unknown {kernel=}'
return kernel
@classmethod
def clr_transform(cls, P, eps=1e-7):
"""
Centered-Log Ratio (CLR) transform.
P: array (n_samples, n_classes), every row is a point in the probability simplex
eps: smoothing, to avoid log(0)
"""
X_safe = np.clip(P, eps, None)
X_safe /= X_safe.sum(axis=1, keepdims=True) # renormalize
gm = np.exp(np.mean(np.log(X_safe), axis=1, keepdims=True))
return np.log(X_safe / gm)
def get_kde_function(self, X, bandwidth, kernel):
"""
Wraps the KDE function from scikit-learn.
:param X: data for which the density function is to be estimated
:param bandwidth: the bandwidth of the kernel
:param kernel: the kernel (valid ones are in KDEBase.KERNELS)
:return: a scikit-learn's KernelDensity object
"""
if kernel == 'aitchison':
X = KDEBase.clr_transform(X)
return KernelDensity(bandwidth=bandwidth).fit(X)
def pdf(self, kde, X):
def pdf(self, kde, X, kernel):
"""
Wraps the density evalution of scikit-learn's KDE. Scikit-learn returns log-scores (s), so this
function returns :math:`e^{s}`
:param kde: a previously fit KDE function
:param X: the data for which the density is to be estimated
:param kernel: the kernel (valid ones are in KDEBase.KERNELS)
:return: np.ndarray with the densities
"""
if kernel == 'aitchison':
X = KDEBase.clr_transform(X)
return np.exp(kde.score_samples(X))
def get_mixture_components(self, X, y, classes, bandwidth):
def get_mixture_components(self, X, y, classes, bandwidth, kernel):
"""
Returns an array containing the mixture components, i.e., the KDE functions for each class.
@ -62,6 +109,7 @@ class KDEBase:
:param y: the class labels
:param n_classes: integer, the number of classes
:param bandwidth: float, the bandwidth of the kernel
:param kernel: the kernel (valid ones are in KDEBase.KERNELS)
:return: a list of KernelDensity objects, each fitted with the corresponding class-specific covariates
"""
class_cond_X = []
@ -69,8 +117,12 @@ class KDEBase:
selX = X[y==cat]
if selX.size==0:
selX = [F.uniform_prevalence(len(classes))]
# if kernel == 'aitchison':
# this is already done within get_kde_function
# selX = KDEBase.clr_transform(selX)
class_cond_X.append(selX)
return [self.get_kde_function(X_cond_yi, bandwidth) for X_cond_yi in class_cond_X]
return [self.get_kde_function(X_cond_yi, bandwidth, kernel) for X_cond_yi in class_cond_X]
class KDEyML(AggregativeSoftQuantifier, KDEBase):
@ -109,17 +161,19 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
for `k`); or as a tuple (X,y) defining the specific set of data to use for validation.
:param bandwidth: float, the bandwidth of the Kernel
:param kernel: kernel of KDE, valid ones are in KDEBase.KERNELS
:param random_state: a seed to be set before fitting any base quantifier (default None)
"""
def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1,
def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1, kernel='gaussian',
random_state=None):
super().__init__(classifier, fit_classifier, val_split)
self.bandwidth = KDEBase._check_bandwidth(bandwidth)
self.kernel = self._check_kernel(kernel)
self.random_state=random_state
def aggregation_fit(self, classif_predictions, labels):
self.mix_densities = self.get_mixture_components(classif_predictions, labels, self.classes_, self.bandwidth)
self.mix_densities = self.get_mixture_components(classif_predictions, labels, self.classes_, self.bandwidth, self.kernel)
return self
def aggregate(self, posteriors: np.ndarray):
@ -133,7 +187,7 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
with qp.util.temp_seed(self.random_state):
epsilon = 1e-10
n_classes = len(self.mix_densities)
test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities]
test_densities = [self.pdf(kde_i, posteriors, self.kernel) for kde_i in self.mix_densities]
def neg_loglikelihood(prev):
test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities))