experiment
This commit is contained in:
parent
636e33318f
commit
c3fd92efde
|
|
@ -0,0 +1,52 @@
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from quapy.method.aggregative import EMQ, KDEyML
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
|
||||
METHODS = ['EMQ',
|
||||
# 'KDEy-ML',
|
||||
'KDEy-MLA'
|
||||
]
|
||||
|
||||
|
||||
# common hyperparameterss
|
||||
hyper_LR = {
|
||||
'classifier__C': np.logspace(-3, 3, 7),
|
||||
'classifier__class_weight': ['balanced', None]
|
||||
}
|
||||
|
||||
hyper_kde = {
|
||||
'bandwidth': np.linspace(0.01, 0.2, 20)
|
||||
}
|
||||
|
||||
hyper_kde_aitchison = {
|
||||
'bandwidth': np.linspace(0.01, 2, 100)
|
||||
}
|
||||
|
||||
# instances a new quantifier based on a string name
|
||||
def new_method(method, **lr_kwargs):
|
||||
lr = LogisticRegression(**lr_kwargs)
|
||||
|
||||
if method == 'KDEy-ML':
|
||||
param_grid = {**hyper_kde, **hyper_LR}
|
||||
quantifier = KDEyML(lr, kernel='gaussian')
|
||||
elif method == 'KDEy-MLA':
|
||||
param_grid = {**hyper_kde_aitchison, **hyper_LR}
|
||||
quantifier = KDEyML(lr, kernel='aitchison')
|
||||
elif method == 'EMQ':
|
||||
param_grid = hyper_LR
|
||||
quantifier = EMQ(lr)
|
||||
else:
|
||||
raise NotImplementedError('unknown method', method)
|
||||
|
||||
return param_grid, quantifier
|
||||
|
||||
|
||||
def show_results(result_path):
|
||||
df = pd.read_csv(result_path+'.csv', sep='\t')
|
||||
|
||||
pd.set_option('display.max_columns', None)
|
||||
pd.set_option('display.max_rows', None)
|
||||
pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"])
|
||||
print(pv)
|
||||
|
|
@ -0,0 +1,34 @@
|
|||
import pickle
|
||||
import os
|
||||
import sys
|
||||
|
||||
import pandas as pd
|
||||
|
||||
import quapy as qp
|
||||
from quapy.model_selection import GridSearchQ
|
||||
from quapy.protocol import UPP
|
||||
from commons import METHODS, new_method, show_results
|
||||
|
||||
|
||||
SEED = 1
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
print(qp.datasets.UCI_MULTICLASS_DATASETS)
|
||||
for optim in ['mae']:
|
||||
result_dir = f'results/ucimulti/{optim}'
|
||||
|
||||
for method in METHODS:
|
||||
print()
|
||||
global_result_path = f'{result_dir}/{method}'
|
||||
print(f'Method\tDataset\tMAE\tMRAE\tKLD')
|
||||
for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
|
||||
print(dataset)
|
||||
local_result_path = global_result_path + '_' + dataset
|
||||
if os.path.exists(local_result_path + '.dataframe'):
|
||||
report = pd.read_csv(local_result_path+'.dataframe')
|
||||
print(f'{method}\t{dataset}\t{report["mae"].mean():.5f}')
|
||||
else:
|
||||
print(dataset, 'not found')
|
||||
|
||||
|
|
@ -0,0 +1,94 @@
|
|||
import pickle
|
||||
import os
|
||||
import sys
|
||||
|
||||
import pandas as pd
|
||||
|
||||
import quapy as qp
|
||||
from quapy.model_selection import GridSearchQ
|
||||
from quapy.protocol import UPP
|
||||
from commons import METHODS, new_method, show_results
|
||||
|
||||
|
||||
SEED = 1
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
qp.environ['SAMPLE_SIZE'] = 500
|
||||
qp.environ['N_JOBS'] = -1
|
||||
n_bags_val = 250
|
||||
n_bags_test = 1000
|
||||
for optim in ['mae']:
|
||||
result_dir = f'results/ucimulti/{optim}'
|
||||
|
||||
os.makedirs(result_dir, exist_ok=True)
|
||||
|
||||
for method in METHODS:
|
||||
|
||||
print('Init method', method)
|
||||
|
||||
global_result_path = f'{result_dir}/{method}'
|
||||
# show_results(global_result_path)
|
||||
# sys.exit(0)
|
||||
|
||||
if not os.path.exists(global_result_path + '.csv'):
|
||||
with open(global_result_path + '.csv', 'wt') as csv:
|
||||
csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')
|
||||
|
||||
with open(global_result_path + '.csv', 'at') as csv:
|
||||
|
||||
for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
|
||||
|
||||
print('init', dataset)
|
||||
|
||||
local_result_path = global_result_path + '_' + dataset
|
||||
if os.path.exists(local_result_path + '.dataframe'):
|
||||
print(f'result file {local_result_path}.dataframe already exist; skipping')
|
||||
report = pd.read_csv(local_result_path+'.dataframe')
|
||||
print(report["mae"].mean())
|
||||
# data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
|
||||
# csv.write(f'{method}\t{data.name}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}\n')
|
||||
continue
|
||||
|
||||
with qp.util.temp_seed(SEED):
|
||||
|
||||
param_grid, quantifier = new_method(method, max_iter=3000)
|
||||
|
||||
data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
|
||||
|
||||
# model selection
|
||||
train, test = data.train_test
|
||||
train, val = train.split_stratified(random_state=SEED)
|
||||
|
||||
protocol = UPP(val, repeats=n_bags_val)
|
||||
modsel = GridSearchQ(
|
||||
quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=True, error=optim
|
||||
)
|
||||
|
||||
try:
|
||||
modsel.fit(*train.Xy)
|
||||
|
||||
print(f'best params {modsel.best_params_}')
|
||||
print(f'best score {modsel.best_score_}')
|
||||
pickle.dump(
|
||||
(modsel.best_params_, modsel.best_score_,),
|
||||
open(f'{local_result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
quantifier = modsel.best_model()
|
||||
except:
|
||||
print('something went wrong... trying to fit the default model')
|
||||
quantifier.fit(*train.Xy)
|
||||
|
||||
|
||||
protocol = UPP(test, repeats=n_bags_test)
|
||||
report = qp.evaluation.evaluation_report(
|
||||
quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True
|
||||
)
|
||||
report.to_csv(f'{local_result_path}.dataframe')
|
||||
print(f'{method}\t{data.name}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}\n')
|
||||
csv.write(f'{method}\t{data.name}\t{report["mae"].mean():.5f}\t{report["mrae"].mean():.5f}\t{report["kld"].mean():.5f}\n')
|
||||
csv.flush()
|
||||
|
||||
show_results(global_result_path)
|
||||
|
|
@ -11,12 +11,28 @@ import quapy.functional as F
|
|||
from sklearn.metrics.pairwise import rbf_kernel
|
||||
|
||||
|
||||
# class KDE(KernelDensity):
|
||||
#
|
||||
# KERNELS = ['gaussian', 'aitchison']
|
||||
#
|
||||
# def __init__(self, bandwidth, kernel):
|
||||
# assert kernel in KDE.KERNELS, f'unknown {kernel=}'
|
||||
# self.bandwidth = bandwidth
|
||||
# self.kernel = kernel
|
||||
#
|
||||
# def
|
||||
|
||||
|
||||
|
||||
|
||||
class KDEBase:
|
||||
"""
|
||||
Common ancestor for KDE-based methods. Implements some common routines.
|
||||
"""
|
||||
|
||||
BANDWIDTH_METHOD = ['scott', 'silverman']
|
||||
KERNELS = ['gaussian', 'aitchison']
|
||||
|
||||
|
||||
@classmethod
|
||||
def _check_bandwidth(cls, bandwidth):
|
||||
|
|
@ -30,31 +46,62 @@ class KDEBase:
|
|||
f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values'
|
||||
if isinstance(bandwidth, float):
|
||||
assert 0 < bandwidth < 1, \
|
||||
"the bandwith for KDEy should be in (0,1), since this method models the unit simplex"
|
||||
"the bandwidth for KDEy should be in (0,1), since this method models the unit simplex"
|
||||
return bandwidth
|
||||
|
||||
def get_kde_function(self, X, bandwidth):
|
||||
@classmethod
|
||||
def _check_kernel(cls, kernel):
|
||||
"""
|
||||
Checks that the kernel parameter is correct
|
||||
|
||||
:param kernel: str
|
||||
:return: the validated kernel
|
||||
"""
|
||||
assert kernel in KDEBase.KERNELS, f'unknown {kernel=}'
|
||||
return kernel
|
||||
|
||||
@classmethod
|
||||
def clr_transform(cls, P, eps=1e-7):
|
||||
"""
|
||||
Centered-Log Ratio (CLR) transform.
|
||||
P: array (n_samples, n_classes), every row is a point in the probability simplex
|
||||
eps: smoothing, to avoid log(0)
|
||||
"""
|
||||
X_safe = np.clip(P, eps, None)
|
||||
X_safe /= X_safe.sum(axis=1, keepdims=True) # renormalize
|
||||
gm = np.exp(np.mean(np.log(X_safe), axis=1, keepdims=True))
|
||||
return np.log(X_safe / gm)
|
||||
|
||||
def get_kde_function(self, X, bandwidth, kernel):
|
||||
"""
|
||||
Wraps the KDE function from scikit-learn.
|
||||
|
||||
:param X: data for which the density function is to be estimated
|
||||
:param bandwidth: the bandwidth of the kernel
|
||||
:param kernel: the kernel (valid ones are in KDEBase.KERNELS)
|
||||
:return: a scikit-learn's KernelDensity object
|
||||
"""
|
||||
if kernel == 'aitchison':
|
||||
X = KDEBase.clr_transform(X)
|
||||
|
||||
return KernelDensity(bandwidth=bandwidth).fit(X)
|
||||
|
||||
def pdf(self, kde, X):
|
||||
def pdf(self, kde, X, kernel):
|
||||
"""
|
||||
Wraps the density evalution of scikit-learn's KDE. Scikit-learn returns log-scores (s), so this
|
||||
function returns :math:`e^{s}`
|
||||
|
||||
:param kde: a previously fit KDE function
|
||||
:param X: the data for which the density is to be estimated
|
||||
:param kernel: the kernel (valid ones are in KDEBase.KERNELS)
|
||||
:return: np.ndarray with the densities
|
||||
"""
|
||||
if kernel == 'aitchison':
|
||||
X = KDEBase.clr_transform(X)
|
||||
|
||||
return np.exp(kde.score_samples(X))
|
||||
|
||||
def get_mixture_components(self, X, y, classes, bandwidth):
|
||||
def get_mixture_components(self, X, y, classes, bandwidth, kernel):
|
||||
"""
|
||||
Returns an array containing the mixture components, i.e., the KDE functions for each class.
|
||||
|
||||
|
|
@ -62,6 +109,7 @@ class KDEBase:
|
|||
:param y: the class labels
|
||||
:param n_classes: integer, the number of classes
|
||||
:param bandwidth: float, the bandwidth of the kernel
|
||||
:param kernel: the kernel (valid ones are in KDEBase.KERNELS)
|
||||
:return: a list of KernelDensity objects, each fitted with the corresponding class-specific covariates
|
||||
"""
|
||||
class_cond_X = []
|
||||
|
|
@ -69,8 +117,12 @@ class KDEBase:
|
|||
selX = X[y==cat]
|
||||
if selX.size==0:
|
||||
selX = [F.uniform_prevalence(len(classes))]
|
||||
|
||||
# if kernel == 'aitchison':
|
||||
# this is already done within get_kde_function
|
||||
# selX = KDEBase.clr_transform(selX)
|
||||
class_cond_X.append(selX)
|
||||
return [self.get_kde_function(X_cond_yi, bandwidth) for X_cond_yi in class_cond_X]
|
||||
return [self.get_kde_function(X_cond_yi, bandwidth, kernel) for X_cond_yi in class_cond_X]
|
||||
|
||||
|
||||
class KDEyML(AggregativeSoftQuantifier, KDEBase):
|
||||
|
|
@ -109,17 +161,19 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
|
|||
are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
|
||||
for `k`); or as a tuple (X,y) defining the specific set of data to use for validation.
|
||||
:param bandwidth: float, the bandwidth of the Kernel
|
||||
:param kernel: kernel of KDE, valid ones are in KDEBase.KERNELS
|
||||
:param random_state: a seed to be set before fitting any base quantifier (default None)
|
||||
"""
|
||||
|
||||
def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1,
|
||||
def __init__(self, classifier: BaseEstimator=None, fit_classifier=True, val_split=5, bandwidth=0.1, kernel='gaussian',
|
||||
random_state=None):
|
||||
super().__init__(classifier, fit_classifier, val_split)
|
||||
self.bandwidth = KDEBase._check_bandwidth(bandwidth)
|
||||
self.kernel = self._check_kernel(kernel)
|
||||
self.random_state=random_state
|
||||
|
||||
def aggregation_fit(self, classif_predictions, labels):
|
||||
self.mix_densities = self.get_mixture_components(classif_predictions, labels, self.classes_, self.bandwidth)
|
||||
self.mix_densities = self.get_mixture_components(classif_predictions, labels, self.classes_, self.bandwidth, self.kernel)
|
||||
return self
|
||||
|
||||
def aggregate(self, posteriors: np.ndarray):
|
||||
|
|
@ -133,7 +187,7 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
|
|||
with qp.util.temp_seed(self.random_state):
|
||||
epsilon = 1e-10
|
||||
n_classes = len(self.mix_densities)
|
||||
test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities]
|
||||
test_densities = [self.pdf(kde_i, posteriors, self.kernel) for kde_i in self.mix_densities]
|
||||
|
||||
def neg_loglikelihood(prev):
|
||||
test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities))
|
||||
|
|
|
|||
Loading…
Reference in New Issue