experiment
This commit is contained in:
parent
636e33318f
commit
c3fd92efde
|
|
@ -0,0 +1,52 @@
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from quapy.method.aggregative import EMQ, KDEyML
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
|
||||||
|
METHODS = ['EMQ',
|
||||||
|
# 'KDEy-ML',
|
||||||
|
'KDEy-MLA'
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
# common hyperparameterss
|
||||||
|
hyper_LR = {
|
||||||
|
'classifier__C': np.logspace(-3, 3, 7),
|
||||||
|
'classifier__class_weight': ['balanced', None]
|
||||||
|
}
|
||||||
|
|
||||||
|
hyper_kde = {
|
||||||
|
'bandwidth': np.linspace(0.01, 0.2, 20)
|
||||||
|
}
|
||||||
|
|
||||||
|
hyper_kde_aitchison = {
|
||||||
|
'bandwidth': np.linspace(0.01, 2, 100)
|
||||||
|
}
|
||||||
|
|
||||||
|
# instances a new quantifier based on a string name
|
||||||
|
def new_method(method, **lr_kwargs):
|
||||||
|
lr = LogisticRegression(**lr_kwargs)
|
||||||
|
|
||||||
|
if method == 'KDEy-ML':
|
||||||
|
param_grid = {**hyper_kde, **hyper_LR}
|
||||||
|
quantifier = KDEyML(lr, kernel='gaussian')
|
||||||
|
elif method == 'KDEy-MLA':
|
||||||
|
param_grid = {**hyper_kde_aitchison, **hyper_LR}
|
||||||
|
quantifier = KDEyML(lr, kernel='aitchison')
|
||||||
|
elif method == 'EMQ':
|
||||||
|
param_grid = hyper_LR
|
||||||
|
quantifier = EMQ(lr)
|
||||||
|
else:
|
||||||
|
raise NotImplementedError('unknown method', method)
|
||||||
|
|
||||||
|
return param_grid, quantifier
|
||||||
|
|
||||||
|
|
||||||
|
def show_results(result_path):
|
||||||
|
df = pd.read_csv(result_path+'.csv', sep='\t')
|
||||||
|
|
||||||
|
pd.set_option('display.max_columns', None)
|
||||||
|
pd.set_option('display.max_rows', None)
|
||||||
|
pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"])
|
||||||
|
print(pv)
|
||||||
|
|
@ -0,0 +1,34 @@
|
||||||
|
import pickle
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
import quapy as qp
|
||||||
|
from quapy.model_selection import GridSearchQ
|
||||||
|
from quapy.protocol import UPP
|
||||||
|
from commons import METHODS, new_method, show_results
|
||||||
|
|
||||||
|
|
||||||
|
SEED = 1
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
print(qp.datasets.UCI_MULTICLASS_DATASETS)
|
||||||
|
for optim in ['mae']:
|
||||||
|
result_dir = f'results/ucimulti/{optim}'
|
||||||
|
|
||||||
|
for method in METHODS:
|
||||||
|
print()
|
||||||
|
global_result_path = f'{result_dir}/{method}'
|
||||||
|
print(f'Method\tDataset\tMAE\tMRAE\tKLD')
|
||||||
|
for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
|
||||||
|
print(dataset)
|
||||||
|
local_result_path = global_result_path + '_' + dataset
|
||||||
|
if os.path.exists(local_result_path + '.dataframe'):
|
||||||
|
report = pd.read_csv(local_result_path+'.dataframe')
|
||||||
|
print(f'{method}\t{dataset}\t{report["mae"].mean():.5f}')
|
||||||
|
else:
|
||||||
|
print(dataset, 'not found')
|
||||||
|
|
||||||
|
|
@ -0,0 +1,94 @@
|
||||||
|
import pickle
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
import quapy as qp
|
||||||
|
from quapy.model_selection import GridSearchQ
|
||||||
|
from quapy.protocol import UPP
|
||||||
|
from commons import METHODS, new_method, show_results
|
||||||
|
|
||||||
|
|
||||||
|
SEED = 1
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
qp.environ['SAMPLE_SIZE'] = 500
|
||||||
|
qp.environ['N_JOBS'] = -1
|
||||||
|
n_bags_val = 250
|
||||||
|
n_bags_test = 1000
|
||||||
|
for optim in ['mae']:
|
||||||
|
result_dir = f'results/ucimulti/{optim}'
|
||||||
|
|
||||||
|
os.makedirs(result_dir, exist_ok=True)
|
||||||
|
|
||||||
|
for method in METHODS:
|
||||||
|
|
||||||
|
print('Init method', method)
|
||||||
|
|
||||||
|
global_result_path = f'{result_dir}/{method}'
|
||||||
|
# show_results(global_result_path)
|
||||||
|
# sys.exit(0)
|
||||||
|
|
||||||
|
if not os.path.exists(global_result_path + '.csv'):
|
||||||
|
with open(global_result_path + '.csv', 'wt') as csv:
|
||||||
|
csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')
|
||||||
|
|
||||||
|
with open(global_result_path + '.csv', 'at') as csv:
|
||||||
|
|
||||||
|
for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
|
||||||
|
|
||||||
|
print('init', dataset)
|
||||||
|
|
||||||
|
local_result_path = global_result_path + '_' + dataset
|
||||||
|
if os.path.exists(local_result_path + '.dataframe'):
|
||||||
|
print(f'result file {local_result_path}.dataframe already exist; skipping')
|
||||||
|
report = pd.read_csv(local_result_path+'.dataframe')
|
||||||
|
print(report["mae"].mean())
|
||||||
|
# data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
|
||||||
|
# csv.write(f'{method}\t{data.name}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}\n')
|
||||||
|
continue
|
||||||
|
|
||||||
|
with qp.util.temp_seed(SEED):
|
||||||
|
|
||||||
|
param_grid, quantifier = new_method(method, max_iter=3000)
|
||||||
|
|
||||||
|
data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
|
||||||
|
|
||||||
|
# model selection
|
||||||
|
train, test = data.train_test
|
||||||
|
train, val = train.split_stratified(random_state=SEED)
|
||||||
|
|
||||||
|
protocol = UPP(val, repeats=n_bags_val)
|
||||||
|
modsel = GridSearchQ(
|
||||||
|
quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=True, error=optim
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
modsel.fit(*train.Xy)
|
||||||
|
|
||||||
|
print(f'best params {modsel.best_params_}')
|
||||||
|
print(f'best score {modsel.best_score_}')
|
||||||
|
pickle.dump(
|
||||||
|
(modsel.best_params_, modsel.best_score_,),
|
||||||
|
open(f'{local_result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||||
|
|
||||||
|
quantifier = modsel.best_model()
|
||||||
|
except:
|
||||||
|
print('something went wrong... trying to fit the default model')
|
||||||
|
quantifier.fit(*train.Xy)
|
||||||
|
|
||||||
|
|
||||||
|
protocol = UPP(test, repeats=n_bags_test)
|
||||||
|
report = qp.evaluation.evaluation_report(
|
||||||
|
quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True
|
||||||
|
)
|
||||||
|
report.to_csv(f'{local_result_path}.dataframe')
|
||||||
|
print(f'{method}\t{data.name}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}\n')
|
||||||
|
csv.write(f'{method}\t{data.name}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}\n')
|
||||||
|
csv.flush()
|
||||||
|
|
||||||
|
show_results(global_result_path)
|
||||||
|
|
@ -11,12 +11,28 @@ import quapy.functional as F
|
||||||
from sklearn.metrics.pairwise import rbf_kernel
|
from sklearn.metrics.pairwise import rbf_kernel
|
||||||
|
|
||||||
|
|
||||||
|
# class KDE(KernelDensity):
|
||||||
|
#
|
||||||
|
# KERNELS = ['gaussian', 'aitchison']
|
||||||
|
#
|
||||||
|
# def __init__(self, bandwidth, kernel):
|
||||||
|
# assert kernel in KDE.KERNELS, f'unknown {kernel=}'
|
||||||
|
# self.bandwidth = bandwidth
|
||||||
|
# self.kernel = kernel
|
||||||
|
#
|
||||||
|
# def
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class KDEBase:
|
class KDEBase:
|
||||||
"""
|
"""
|
||||||
Common ancestor for KDE-based methods. Implements some common routines.
|
Common ancestor for KDE-based methods. Implements some common routines.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
BANDWIDTH_METHOD = ['scott', 'silverman']
|
BANDWIDTH_METHOD = ['scott', 'silverman']
|
||||||
|
KERNELS = ['gaussian', 'aitchison']
|
||||||
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _check_bandwidth(cls, bandwidth):
|
def _check_bandwidth(cls, bandwidth):
|
||||||
|
|
@ -30,31 +46,62 @@ class KDEBase:
|
||||||
f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values'
|
f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values'
|
||||||
if isinstance(bandwidth, float):
|
if isinstance(bandwidth, float):
|
||||||
assert 0 < bandwidth < 1, \
|
assert 0 < bandwidth < 1, \
|
||||||
"the bandwith for KDEy should be in (0,1), since this method models the unit simplex"
|
"the bandwidth for KDEy should be in (0,1), since this method models the unit simplex"
|
||||||
return bandwidth
|
return bandwidth
|
||||||
|
|
||||||
def get_kde_function(self, X, bandwidth):
|
@classmethod
|
||||||
|
def _check_kernel(cls, kernel):
|
||||||
|
"""
|
||||||
|
Checks that the kernel parameter is correct
|
||||||
|
|
||||||
|
:param kernel: str
|
||||||
|
:return: the validated kernel
|
||||||
|
"""
|
||||||
|
assert kernel in KDEBase.KERNELS, f'unknown {kernel=}'
|
||||||
|
return kernel
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def clr_transform(cls, P, eps=1e-7):
|
||||||
|
"""
|
||||||
|
Centered-Log Ratio (CLR) transform.
|
||||||
|
P: array (n_samples, n_classes), every row is a point in the probability simplex
|
||||||
|
eps: smoothing, to avoid log(0)
|
||||||
|
"""
|
||||||
|
X_safe = np.clip(P, eps, None)
|
||||||
|
X_safe /= X_safe.sum(axis=1, keepdims=True) # renormalize
|
||||||
|
gm = np.exp(np.mean(np.log(X_safe), axis=1, keepdims=True))
|
||||||
|
return np.log(X_safe / gm)
|
||||||
|
|
||||||
|
def get_kde_function(self, X, bandwidth, kernel):
|
||||||
"""
|
"""
|
||||||
Wraps the KDE function from scikit-learn.
|
Wraps the KDE function from scikit-learn.
|
||||||
|
|
||||||
:param X: data for which the density function is to be estimated
|
:param X: data for which the density function is to be estimated
|
||||||
:param bandwidth: the bandwidth of the kernel
|
:param bandwidth: the bandwidth of the kernel
|
||||||
|
:param kernel: the kernel (valid ones are in KDEBase.KERNELS)
|
||||||
:return: a scikit-learn's KernelDensity object
|
:return: a scikit-learn's KernelDensity object
|
||||||
"""
|
"""
|
||||||
|
if kernel == 'aitchison':
|
||||||
|
X = KDEBase.clr_transform(X)
|
||||||
|
|
||||||
return KernelDensity(bandwidth=bandwidth).fit(X)
|
return KernelDensity(bandwidth=bandwidth).fit(X)
|
||||||
|
|
||||||
def pdf(self, kde, X):
|
def pdf(self, kde, X, kernel):
|
||||||
"""
|
"""
|
||||||
Wraps the density evalution of scikit-learn's KDE. Scikit-learn returns log-scores (s), so this
|
Wraps the density evalution of scikit-learn's KDE. Scikit-learn returns log-scores (s), so this
|
||||||
function returns :math:`e^{s}`
|
function returns :math:`e^{s}`
|
||||||
|
|
||||||
:param kde: a previously fit KDE function
|
:param kde: a previously fit KDE function
|
||||||
:param X: the data for which the density is to be estimated
|
:param X: the data for which the density is to be estimated
|
||||||
|
:param kernel: the kernel (valid ones are in KDEBase.KERNELS)
|
||||||
:return: np.ndarray with the densities
|
:return: np.ndarray with the densities
|
||||||
"""
|
"""
|
||||||
|
if kernel == 'aitchison':
|
||||||
|
X = KDEBase.clr_transform(X)
|
||||||
|
|
||||||
return np.exp(kde.score_samples(X))
|
return np.exp(kde.score_samples(X))
|
||||||
|
|
||||||
def get_mixture_components(self, X, y, classes, bandwidth):
|
def get_mixture_components(self, X, y, classes, bandwidth, kernel):
|
||||||
"""
|
"""
|
||||||
Returns an array containing the mixture components, i.e., the KDE functions for each class.
|
Returns an array containing the mixture components, i.e., the KDE functions for each class.
|
||||||
|
|
||||||
|
|
@ -62,6 +109,7 @@ class KDEBase:
|
||||||
:param y: the class labels
|
:param y: the class labels
|
||||||
:param n_classes: integer, the number of classes
|
:param n_classes: integer, the number of classes
|
||||||
:param bandwidth: float, the bandwidth of the kernel
|
:param bandwidth: float, the bandwidth of the kernel
|
||||||
|
:param kernel: the kernel (valid ones are in KDEBase.KERNELS)
|
||||||
:return: a list of KernelDensity objects, each fitted with the corresponding class-specific covariates
|
:return: a list of KernelDensity objects, each fitted with the corresponding class-specific covariates
|
||||||
"""
|
"""
|
||||||
class_cond_X = []
|
class_cond_X = []
|
||||||
|
|
@ -69,8 +117,12 @@ class KDEBase:
|
||||||
selX = X[y==cat]
|
selX = X[y==cat]
|
||||||
if selX.size==0:
|
if selX.size==0:
|
||||||
selX = [F.uniform_prevalence(len(classes))]
|
selX = [F.uniform_prevalence(len(classes))]
|
||||||
|
|
||||||
|
# if kernel == 'aitchison':
|
||||||
|
# this is already done within get_kde_function
|
||||||
|
# selX = KDEBase.clr_transform(selX)
|
||||||
class_cond_X.append(selX)
|
class_cond_X.append(selX)
|
||||||
return [self.get_kde_function(X_cond_yi, bandwidth) for X_cond_yi in class_cond_X]
|
return [self.get_kde_function(X_cond_yi, bandwidth, kernel) for X_cond_yi in class_cond_X]
|
||||||
|
|
||||||
|
|
||||||
class KDEyML(AggregativeSoftQuantifier, KDEBase):
|
class KDEyML(AggregativeSoftQuantifier, KDEBase):
|
||||||
|
|
@ -109,17 +161,19 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
|
||||||
are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
|
are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
|
||||||
for `k`); or as a tuple (X,y) defining the specific set of data to use for validation.
|
for `k`); or as a tuple (X,y) defining the specific set of data to use for validation.
|
||||||
:param bandwidth: float, the bandwidth of the Kernel
|
:param bandwidth: float, the bandwidth of the Kernel
|
||||||
|
:param kernel: kernel of KDE, valid ones are in KDEBase.KERNELS
|
||||||
:param random_state: a seed to be set before fitting any base quantifier (default None)
|
:param random_state: a seed to be set before fitting any base quantifier (default None)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1,
|
def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1, kernel='gaussian',
|
||||||
random_state=None):
|
random_state=None):
|
||||||
super().__init__(classifier, fit_classifier, val_split)
|
super().__init__(classifier, fit_classifier, val_split)
|
||||||
self.bandwidth = KDEBase._check_bandwidth(bandwidth)
|
self.bandwidth = KDEBase._check_bandwidth(bandwidth)
|
||||||
|
self.kernel = self._check_kernel(kernel)
|
||||||
self.random_state=random_state
|
self.random_state=random_state
|
||||||
|
|
||||||
def aggregation_fit(self, classif_predictions, labels):
|
def aggregation_fit(self, classif_predictions, labels):
|
||||||
self.mix_densities = self.get_mixture_components(classif_predictions, labels, self.classes_, self.bandwidth)
|
self.mix_densities = self.get_mixture_components(classif_predictions, labels, self.classes_, self.bandwidth, self.kernel)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def aggregate(self, posteriors: np.ndarray):
|
def aggregate(self, posteriors: np.ndarray):
|
||||||
|
|
@ -133,7 +187,7 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
|
||||||
with qp.util.temp_seed(self.random_state):
|
with qp.util.temp_seed(self.random_state):
|
||||||
epsilon = 1e-10
|
epsilon = 1e-10
|
||||||
n_classes = len(self.mix_densities)
|
n_classes = len(self.mix_densities)
|
||||||
test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities]
|
test_densities = [self.pdf(kde_i, posteriors, self.kernel) for kde_i in self.mix_densities]
|
||||||
|
|
||||||
def neg_loglikelihood(prev):
|
def neg_loglikelihood(prev):
|
||||||
test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities))
|
test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities))
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue