forked from moreo/QuaPy

refactoring

This commit is contained in:
parent a776816063
commit 59500a5a42
@@ -1,23 +1,31 @@
 import numpy as np
 import pandas as pd
-from distribution_matching.method_kdey import KDEy
-from distribution_matching.method_kdey_closed import KDEyclosed
-from distribution_matching.method_kdey_closed_efficient_correct import KDEyclosed_efficient_corr
-from distribution_matching.methods_kdey import KDEyCS, KDEyHD, KDEyML
+from distribution_matching.method.kdex import KDExML
+from distribution_matching.method.method_kdey import KDEy
+from distribution_matching.method.method_kdey_closed_efficient_correct import KDEyclosed_efficient_corr
+from distribution_matching.method.kdey import KDEyCS, KDEyHD, KDEyML
 from quapy.method.aggregative import EMQ, CC, PCC, DistributionMatching, PACC, HDy, OneVsAllAggregative, ACC
-from distribution_matching.method_dirichlety import DIRy
+from distribution_matching.method.dirichlety import DIRy
 from sklearn.linear_model import LogisticRegression
-from distribution_matching.method_kdey_closed_efficient import KDEyclosed_efficient

-# the full list of methods tested in the paper (reported in the appendix)
-METHODS = ['ACC', 'PACC', 'HDy-OvA', 'DM-T', 'DM-HD', 'KDEy-HD', 'KDEy-HD2', 'DM-CS', 'KDEy-CS','KDEy-CS2', 'DIR', 'EMQ', 'EMQ-BCTS', 'KDEy-ML', 'KDEy-ML2']
+# set to True to get the full list of methods tested in the paper (reported in the appendix)
+# set to False to get the reduced list (shown in the body of the paper)
+FULL_METHOD_LIST = True

-# uncomment this other list for the methods shown in the body of the paper (the other methods are not comparable in performance)
-#METHODS = ['PACC', 'DM-T', 'DM-HD', 'KDEy-HD', 'DM-CS', 'KDEy-CS', 'EMQ', 'KDEy-ML']
+if FULL_METHOD_LIST:
+    ADJUSTMENT_METHODS = ['ACC', 'PACC']
+    DISTR_MATCH_METHODS = ['HDy-OvA', 'DM-T', 'DM-HD', 'KDEy-HD', 'DM-CS', 'KDEy-CS']
+    MAX_LIKE_METHODS = ['DIR', 'EMQ', 'EMQ-BCTS', 'KDEy-ML', 'KDEx-ML']
+else:
+    ADJUSTMENT_METHODS = ['PACC']
+    DISTR_MATCH_METHODS = ['DM-T', 'DM-HD', 'KDEy-HD', 'DM-CS', 'KDEy-CS']
+    MAX_LIKE_METHODS = ['EMQ', 'KDEy-ML', 'KDEx-ML']

+# list of methods to consider
+METHODS = ADJUSTMENT_METHODS + DISTR_MATCH_METHODS + MAX_LIKE_METHODS
 BIN_METHODS = [x.replace('-OvA', '') for x in METHODS]


 # common hyperparameters
 hyper_LR = {
     'classifier__C': np.logspace(-3,3,7),
     'classifier__class_weight': ['balanced', None]
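Note: the `classifier__` prefix in `hyper_LR` follows the scikit-learn nested-parameter convention, so these grid entries are routed to the `LogisticRegression` wrapped inside each quantifier. A minimal sketch of the mechanism (using `PACC` and the same `set_params` call that appears in the experiment scripts of this commit):

from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import PACC

# 'classifier__C' reaches the inner LogisticRegression via set_params,
# following the scikit-learn nested-parameter convention
quantifier = PACC(LogisticRegression())
quantifier.set_params(classifier__C=10, classifier__class_weight='balanced')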
@@ -29,8 +37,9 @@ hyper_kde = {

 nbins_range = [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 64]

-def new_method(method, **lr_kwargs):
+# instantiates a new quantifier based on a string name
+def new_method(method, **lr_kwargs):
     lr = LogisticRegression(**lr_kwargs)

     if method == 'CC':
@@ -46,23 +55,19 @@ def new_method(method, **lr_kwargs):
         param_grid = hyper_LR
         quantifier = PACC(lr)
     elif method in ['KDEy-HD']:
         param_grid = {**hyper_kde, **hyper_LR}
-        quantifier = KDEy(lr, target='min_divergence', divergence='HD', montecarlo_trials=10000, val_split=10)
-    elif method in ['KDEy-HD2']:
-        param_grid = {**hyper_kde, **hyper_LR}
         quantifier = KDEyHD(lr)
     elif method == 'KDEy-CS':
         param_grid = {**hyper_kde, **hyper_LR}
-        quantifier = KDEyclosed_efficient_corr(lr, val_split=10)
-    elif method == 'KDEy-CS2':
-        param_grid = {**hyper_kde, **hyper_LR}
         quantifier = KDEyCS(lr)
     elif method == 'KDEy-ML':
         param_grid = {**hyper_kde, **hyper_LR}
-        quantifier = KDEy(lr, target='max_likelihood', val_split=10)
-    elif method == 'KDEy-ML2':
-        param_grid = {**hyper_kde, **hyper_LR}
         quantifier = KDEyML(lr)
+    elif method == 'KDEx-ML':
+        param_grid = {
+            'bandwidth': np.linspace(0.001, 2, 501)
+        }
+        quantifier = KDExML()
     elif method == 'DIR':
         param_grid = hyper_LR
         quantifier = DIRy(lr)
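Note: `new_method` returns a `(param_grid, quantifier)` pair rather than a fitted model. A minimal usage sketch (assuming the refactored `distribution_matching.commons` above; the dataset and protocol mirror the experiment scripts in this commit, and the GridSearchQ keywords follow quapy's standard model-selection API):

import quapy as qp
from quapy.protocol import UPP
from distribution_matching.commons import new_method

qp.environ['SAMPLE_SIZE'] = 100
data = qp.datasets.fetch_twitter('gasp', min_df=3, pickle=True)
param_grid, quantifier = new_method('KDEy-ML')  # hyperparameter grid + untrained quantifier
train, val = data.training.split_stratified(random_state=0)
quantifier = qp.model_selection.GridSearchQ(
    quantifier, param_grid, protocol=UPP(val), error='mae').fit(train)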
@@ -1,74 +0,0 @@
-import pickle
-import numpy as np
-import os
-from os.path import join
-import pandas as pd
-from quapy.protocol import UPP
-from quapy.data import LabelledCollection
-from distribution_matching.commons import METHODS, new_method, show_results
-import quapy as qp
-
-
-SEED=1
-
-
-def extract_classes(data:LabelledCollection, classes):
-    X, y = data.Xy
-    counts = data.counts()
-    Xs, ys = [], []
-    for class_i in classes:
-        Xs.append(X[y==class_i])
-        ys.append([class_i]*counts[class_i])
-    Xs = np.concatenate(Xs)
-    ys = np.concatenate(ys)
-    return LabelledCollection(Xs, ys, classes=classes)
-
-def task(nclasses):
-    in_classes = np.arange(0, nclasses)
-    train = extract_classes(train_pool, classes=in_classes)
-    test = extract_classes(test_pool, classes=in_classes)
-    with qp.util.temp_seed(SEED):
-        hyper, quantifier = new_method(method)
-        quantifier.set_params(classifier__C=1, classifier__class_weight='balanced')
-        hyper = {h:v for h,v in hyper.items() if not h.startswith('classifier__')}
-        tr, va = train.split_stratified(random_state=SEED)
-        quantifier = qp.model_selection.GridSearchQ(quantifier, hyper, UPP(va), optim).fit(tr)
-        report = qp.evaluation.evaluation_report(quantifier, protocol=UPP(test), error_metrics=['mae', 'mrae', 'kld'], verbose=True)
-    return report
-
-
-# only the quantifier-dependent hyperparameters are explored; the classifier is a LR with default parameters
-if __name__ == '__main__':
-
-    qp.environ['SAMPLE_SIZE'] = qp.datasets.LEQUA2022_SAMPLE_SIZE['T1B']
-    qp.environ['N_JOBS'] = -1
-
-
-    for optim in ['mae']: #, 'mrae']:
-
-        result_dir = f'results/lequa/nclasses/{optim}'
-        os.makedirs(result_dir, exist_ok=True)
-
-        for method in ['DM', 'EMQ', 'KDEy-ML']: # 'KDEy-ML', 'KDEy-DMhd3']:
-
-            result_path = join(result_dir, f'{method}.csv')
-            if os.path.exists(result_path): continue
-
-            train_orig, _, _ = qp.datasets.fetch_lequa2022('T1B')
-
-            train_pool, test_pool = train_orig.split_stratified(0.5, random_state=SEED)
-            arange_classes = np.arange(2, train_orig.n_classes + 1)
-            reports = qp.util.parallel(task, arange_classes, n_jobs=-1)
-            with open(result_path, 'at') as csv:
-                csv.write(f'Method\tDataset\tnClasses\tMAE\tMRAE\tKLD\n')
-                for num_classes, report in zip(arange_classes, reports):
-                    means = report.mean()
-                    report_result_path = join(result_dir, f'{method}_{num_classes}')+'.dataframe'
-                    report.to_csv(report_result_path)
-                    csv.write(f'{method}\tLeQua-T1B\t{num_classes}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
-                    csv.flush()
-
-            means = report.mean()
-            print(means)
@@ -3,7 +3,7 @@ from sklearn.linear_model import LogisticRegression
 import os
 import quapy as qp
 from distribution_matching.commons import show_results
-from method_kdey import KDEy
+from distribution_matching.method.method_kdey import KDEy
 from quapy.method.aggregative import DistributionMatching

@@ -5,36 +5,35 @@ from sklearn.neighbors import KernelDensity

 import quapy as qp
 from quapy.data import LabelledCollection
-from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions
+from quapy.method.aggregative import AggregativeProbabilisticQuantifier, cross_generate_predictions
 import quapy.functional as F

 from scipy.stats import multivariate_normal
 from scipy import optimize
 from sklearn.metrics.pairwise import rbf_kernel


-class KDEyBase:
+class KDEBase:

     BANDWIDTH_METHOD = ['scott', 'silverman']

-    def _check_bandwidth(self, bandwidth):
-        assert bandwidth in KDEyBase.BANDWIDTH_METHOD or isinstance(bandwidth, float), \
-            f'invalid bandwidth, valid ones are {KDEyBase.BANDWIDTH_METHOD} or float values'
+    @classmethod
+    def _check_bandwidth(cls, bandwidth):
+        assert bandwidth in KDEBase.BANDWIDTH_METHOD or isinstance(bandwidth, float), \
+            f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values'
         if isinstance(bandwidth, float):
             assert 0 < bandwidth < 1, "the bandwidth for KDEy should be in (0,1), since this method models the unit simplex"

-    def get_kde_function(self, posteriors, bandwidth):
-        return KernelDensity(bandwidth=bandwidth).fit(posteriors)
+    def get_kde_function(self, X, bandwidth):
+        return KernelDensity(bandwidth=bandwidth).fit(X)

-    def pdf(self, kde, posteriors):
-        return np.exp(kde.score_samples(posteriors))
+    def pdf(self, kde, X):
+        return np.exp(kde.score_samples(X))

-    def get_mixture_components(self, posteriors, y, n_classes, bandwidth):
-        return [self.get_kde_function(posteriors[y == cat], bandwidth) for cat in range(n_classes)]
+    def get_mixture_components(self, X, y, n_classes, bandwidth):
+        return [self.get_kde_function(X[y == cat], bandwidth) for cat in range(n_classes)]


-class KDEyML(AggregativeProbabilisticQuantifier, KDEyBase):
+class KDEyML(AggregativeProbabilisticQuantifier, KDEBase):

     def __init__(self, classifier: BaseEstimator, val_split=10, bandwidth=0.1, n_jobs=None, random_state=0):
         self._check_bandwidth(bandwidth)

@@ -77,7 +76,7 @@ class KDEyML(AggregativeProbabilisticQuantifier, KDEyBase):
         return F.optim_minimize(neg_loglikelihood, n_classes)


-class KDEyHD(AggregativeProbabilisticQuantifier, KDEyBase):
+class KDEyHD(AggregativeProbabilisticQuantifier, KDEBase):

     def __init__(self, classifier: BaseEstimator, val_split=10, divergence: str='HD',
                  bandwidth=0.1, n_jobs=None, random_state=0, montecarlo_trials=10000):

@@ -145,7 +144,7 @@ class KDEyHD(AggregativeProbabilisticQuantifier, KDEyBase):
 class KDEyCS(AggregativeProbabilisticQuantifier):

     def __init__(self, classifier: BaseEstimator, val_split=10, bandwidth=0.1, n_jobs=None, random_state=0):
-        self._check_bandwidth(bandwidth)
+        KDEBase._check_bandwidth(bandwidth)
         self.classifier = classifier
         self.val_split = val_split
         self.bandwidth = bandwidth
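Note: after the rename, `KDEBase` is the piece shared by all KDEy variants: `get_mixture_components` fits one `KernelDensity` per class on that class's validation posteriors, so a prevalence vector alpha induces the mixture density p_alpha(x) = sum_i alpha_i p_i(x) against which the different divergences are computed. An illustrative sketch of that mixture (not the repository's code):

import numpy as np
from sklearn.neighbors import KernelDensity

def fit_class_kdes(posteriors, y, n_classes, bandwidth=0.1):
    # one KDE per class, as get_mixture_components does
    return [KernelDensity(bandwidth=bandwidth).fit(posteriors[y == c]) for c in range(n_classes)]

def mixture_pdf(kdes, alpha, X):
    # density of the alpha-weighted mixture evaluated at each row of X
    class_densities = np.vstack([np.exp(kde.score_samples(X)) for kde in kdes])  # (n_classes, n)
    return alpha @ class_densities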
@@ -1,41 +0,0 @@
-import sys
-from pathlib import Path
-import pandas as pd
-
-result_dir = 'results/results_tweet_mae_redohyper'
-#result_dir = 'results_lequa_mrae'
-
-dfs = []
-
-pathlist = Path(result_dir).rglob('*.csv')
-for path in pathlist:
-    path_in_str = str(path)
-
-    try:
-        df = pd.read_csv(path_in_str, sep='\t')
-        df = df[df.iloc[:, 0] != df.columns[0]]
-        if not df.empty:
-            dfs.append(df)
-    except Exception:
-        print('empty')
-
-df = pd.concat(dfs)
-
-for err in ['MAE', 'MRAE', 'KLD']:
-    print('-'*100)
-    print(err)
-    print('-'*100)
-    piv = df.pivot_table(index='Dataset', columns='Method', values=err)
-    piv.loc['mean'] = piv.mean()
-
-    pd.set_option('display.max_columns', None)
-    pd.set_option('display.max_rows', None)
-    pd.set_option('expand_frame_repr', False)
-    print(piv)
-    print()
-
-
-
-
-
-
@@ -1,4 +1,5 @@
-from distribution_matching.commons import BIN_METHODS, METHODS
+from distribution_matching.commons import (ADJUSTMENT_METHODS, BIN_METHODS, DISTR_MATCH_METHODS, MAX_LIKE_METHODS,
+                                           METHODS, FULL_METHOD_LIST)
 import quapy as qp
 from os import makedirs
 import os
@@ -12,10 +13,9 @@ tables_path = '.'
 MAXTONE = 35  # sets the intensity of the maximum color reached by the worst (red) and best (green) results
 SHOW_STD = False

-NUM_ADJUSTMENT_METHODS = 2 if 'ACC' in METHODS else 1
-NUM_MAXIMUM_LIKELIHOOD_METHODS = 4 if 'DIR' in METHODS else 3
-NUM_DISTRIBUTION_MATCHING_PAIRS = 2
-NUM_DISTRIBUTION_MATCHING_METHODS = NUM_DISTRIBUTION_MATCHING_PAIRS*2 + (2 if 'HDy-OvA' in METHODS else 1)
+NUM_ADJUSTMENT_METHODS = len(ADJUSTMENT_METHODS)
+NUM_MAXIMUM_LIKELIHOOD_METHODS = len(MAX_LIKE_METHODS)
+NUM_DISTRIBUTION_MATCHING_METHODS = len(DISTR_MATCH_METHODS)

 qp.environ['SAMPLE_SIZE'] = 100

@@ -27,21 +27,24 @@ nice_bench = {
     'semeval16': 'SemEval16',
 }

-nice_method={
-    'KDEy-MLE': 'KDEy-ML',
-    'KDEy-DMhd4': 'KDEy-HD',
-    'KDEy-closed++': 'KDEy-CS',
-    'EMQ-C': 'EMQ-BCTS'
-}
-
 def save_table(path, table):
     print(f'saving results in {path}')
     with open(path, 'wt') as foo:
         foo.write(table)


-def nicerm(key):
-    return '\mathrm{'+nice[key]+'}'
+def new_table(datasets, methods):
+    return Table(
+        benchmarks=datasets,
+        methods=methods,
+        ttest='wilcoxon',
+        prec_mean=5,
+        show_std=SHOW_STD,
+        prec_std=4,
+        clean_zero=(eval=='mae'),
+        average=True,
+        maxtone=MAXTONE
+    )


 def make_table(tabs, eval, benchmark_groups, benchmark_names, compact=False):
@@ -54,7 +57,7 @@ def make_table(tabs, eval, benchmark_groups, benchmark_names, compact=False):

     # write the latex table
     tabular = """
-    \\begin{tabular}{|c|""" + ('c|' * NUM_ADJUSTMENT_METHODS) + 'c|c' + ('|c|c' * (NUM_DISTRIBUTION_MATCHING_PAIRS)) + ('|c' * NUM_MAXIMUM_LIKELIHOOD_METHODS) + """|} """ + cline + """
+    \\begin{tabular}{|c|""" + ('c|' * NUM_ADJUSTMENT_METHODS) + ('c|' * NUM_DISTRIBUTION_MATCHING_METHODS) + ('c|' * NUM_MAXIMUM_LIKELIHOOD_METHODS) + """} """ + cline + """
     \multicolumn{1}{c}{} &
     \multicolumn{"""+str(NUM_ADJUSTMENT_METHODS)+"""}{|c}{Adjustment} &
     \multicolumn{"""+str(NUM_DISTRIBUTION_MATCHING_METHODS)+"""}{|c|}{Distribution Matching} &
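Note: with the full method list (2 adjustment, 6 distribution-matching, and 5 maximum-likelihood methods), the new one-liner yields a uniform column specification; a quick check of the string arithmetic:

# counts implied by FULL_METHOD_LIST = True
NUM_ADJ, NUM_DM, NUM_ML = 2, 6, 5
spec = '|c|' + ('c|' * NUM_ADJ) + ('c|' * NUM_DM) + ('c|' * NUM_ML)
print('\\begin{tabular}{' + spec + '}')  # \begin{tabular}{|c|c|c|c|c|c|c|c|c|c|c|c|c|c|}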
@@ -62,8 +65,7 @@ def make_table(tabs, eval, benchmark_groups, benchmark_names, compact=False):
     \hline
     """
     for i, (tab, group, name) in enumerate(zip(tabs, benchmark_groups, benchmark_names)):
-        tablines = tab.latexTabular(benchmark_replace=nice_bench, method_replace=nice_method, endl='\\\\'+ cline, aslines=True)
-        print(tablines)
+        tablines = tab.latexTabular(benchmark_replace=nice_bench, endl='\\\\'+ cline, aslines=True)
         tablines[0] = tablines[0].replace('\multicolumn{1}{c|}{}', '\\textbf{'+name+'}')
         if not compact:
             tabular += '\n'.join(tablines)
@@ -87,17 +89,7 @@ def gen_tables_uci_multiclass(eval):

     datasets = qp.datasets.UCI_MULTICLASS_DATASETS

-    tab = Table(
-        benchmarks=datasets,
-        methods=METHODS,
-        ttest='wilcoxon',
-        prec_mean=4,
-        show_std=SHOW_STD,
-        prec_std=4,
-        clean_zero=(eval=='mae'),
-        average=True,
-        maxtone=MAXTONE
-    )
+    tab = new_table(datasets, METHODS)

     for dataset in datasets:
         print(f'\t Dataset: {dataset}: ', end='')
@@ -122,17 +114,7 @@ def gen_tables_uci_bin(eval):
     exclude = ['acute.a', 'acute.b', 'iris.1', 'balance.2']
     datasets = [x for x in qp.datasets.UCI_DATASETS if x not in exclude]

-    tab = Table(
-        benchmarks=datasets,
-        methods=BIN_METHODS,
-        ttest='wilcoxon',
-        prec_mean=4,
-        show_std=SHOW_STD,
-        prec_std=4,
-        clean_zero=(eval=='mae'),
-        average=True,
-        maxtone=MAXTONE
-    )
+    tab = new_table(datasets, BIN_METHODS)

     for dataset in datasets:
         print(f'\t Dataset: {dataset}: ', end='')
@@ -156,17 +138,7 @@ def gen_tables_tweet(eval):

     datasets = qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST

-    tab = Table(
-        benchmarks=datasets,
-        methods=METHODS,
-        ttest='wilcoxon',
-        prec_mean=4,
-        show_std=SHOW_STD,
-        prec_std=4,
-        clean_zero=(eval=='mae'),
-        average=True,
-        maxtone=MAXTONE
-    )
+    tab = new_table(datasets, METHODS)

     for dataset in datasets:
         print(f'\t Dataset: {dataset}: ', end='')
@@ -185,19 +157,8 @@ def gen_tables_tweet(eval):

 def gen_tables_lequa(Methods, task, eval):
     # generating table for LeQua-T1A or LeQua-T1B; only one table with two rows, one for MAE, another for MRAE
     dataset_name = 'LeQua-'+task

-    tab = Table(
-        benchmarks=[f'Average'],
-        methods=Methods,
-        ttest='wilcoxon',
-        prec_mean=5,
-        show_std=SHOW_STD,
-        prec_std=4,
-        clean_zero=False,
-        average=False,
-        maxtone=MAXTONE
-    )
+    tab = new_table([f'Average'], Methods)

     print('Generating table for T1A@Lequa', eval, end='')
     dir_results = f'../results/lequa/{task}/{eval}'
@@ -65,7 +65,7 @@
 \centering
 \caption{Multiclass RAE}
 \resizebox{\textwidth}{!}{%
-    \input{multiclass_mae}
+    \input{multiclass_mrae}
 }%
 \end{table}

@@ -0,0 +1,57 @@
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+import os
+
+import quapy as qp
+from distribution_matching.commons import show_results
+from quapy.method.aggregative import DMy
+from distribution_matching.method.method_kdey import KDEy
+from quapy.protocol import UPP
+
+SEED=1
+
+if __name__ == '__main__':
+
+    qp.environ['SAMPLE_SIZE'] = 100
+    qp.environ['N_JOBS'] = -1
+    n_bags_val = 250
+    n_bags_test = 1000
+    result_dir = f'results/tweet/sensibility'
+
+    os.makedirs(result_dir, exist_ok=True)
+
+    for method, param, grid in [
+            ('KDEy-ML', 'Bandwidth', np.linspace(0.01, 0.2, 20)),
+            ('DM-HD', 'nbins', list(range(2,10)) + list(range(10,34,2)))
+    ]:
+
+        global_result_path = f'{result_dir}/{method}'
+
+        if not os.path.exists(global_result_path+'.csv'):
+            with open(global_result_path+'.csv', 'wt') as csv:
+                csv.write(f'Method\tDataset\t{param}\tMAE\tMRAE\tKLD\n')
+
+        with open(global_result_path+'.csv', 'at') as csv:
+            for val in grid:
+                for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:
+                    print('init', dataset)
+
+                    local_result_path = global_result_path + '_' + dataset + (f'_{val:.3f}' if isinstance(val, float) else f'{val}')
+
+                    with qp.util.temp_seed(SEED):
+
+                        data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=False)
+
+                        if method == 'KDEy-ML':
+                            quantifier = KDEy(LogisticRegression(n_jobs=-1), target='max_likelihood', val_split=10, bandwidth=val)
+                        elif method == 'DM-HD':
+                            quantifier = DMy(LogisticRegression(n_jobs=-1), val_split=10, nbins=val, divergence='HD', n_jobs=-1)
+
+                        quantifier.fit(data.training)
+                        protocol = UPP(data.test, repeats=n_bags_test)
+                        report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True, n_jobs=-1)
+                        report.to_csv(f'{local_result_path}.dataframe')
+                        means = report.mean()
+                        csv.write(f'{method}\t{data.name}\t{val}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
+                        csv.flush()
+
+    show_results(global_result_path)
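Note: the script writes one tab-separated row per (dataset, value) pair, so the sensitivity curves can be read straight off the CSV. A minimal sketch for inspecting the bandwidth sensitivity of KDEy-ML on one dataset ('gasp' is one of the Twitter benchmarks; the column names match the header written above):

import pandas as pd

df = pd.read_csv('results/tweet/sensibility/KDEy-ML.csv', sep='\t')
print(df[df.Dataset == 'gasp'].pivot_table(index='Bandwidth', values=['MAE', 'MRAE']))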
@@ -1,5 +1,6 @@
 import pickle
+import os
 from data.base import LabelledCollection

 from sklearn.linear_model import LogisticRegression

@@ -0,0 +1,63 @@
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+import os
+import quapy as qp
+from distribution_matching.commons import show_results
+from distribution_matching.method.method_kdey import KDEy
+from quapy.method.aggregative import DMy
+from quapy.protocol import UPP
+
+
+SEED=1
+
+def task(val):
+    print('job-init', dataset, val)
+
+    with qp.util.temp_seed(SEED):
+        if method=='KDEy-ML':
+            quantifier = KDEy(LogisticRegression(), target='max_likelihood', val_split=10, bandwidth=val)
+        elif method == 'DM-HD':
+            quantifier = DMy(LogisticRegression(), val_split=10, nbins=val, divergence='HD')
+
+        quantifier.fit(data.data)
+        protocol = UPP(data.test, repeats=n_bags_test)
+        report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'],
+                                                 verbose=True, n_jobs=-1)
+    return report
+
+
+if __name__ == '__main__':
+
+    qp.environ['SAMPLE_SIZE'] = 500
+    qp.environ['N_JOBS'] = -1
+    n_bags_val = 250
+    n_bags_test = 1000
+    result_dir = f'results/ucimulti/sensibility'
+
+    os.makedirs(result_dir, exist_ok=True)
+
+    for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
+
+        data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
+
+        for method, param, grid in [
+                ('KDEy-ML', 'Bandwidth', np.linspace(0.01, 0.2, 20)),
+                ('DM-HD', 'nbins', list(range(2, 10)) + list(range(10, 34, 2)))
+        ]:
+
+            global_result_path = f'{result_dir}/{method}'
+
+            if not os.path.exists(global_result_path+'.csv'):
+                with open(global_result_path+'.csv', 'wt') as csv:
+                    csv.write(f'Method\tDataset\t{param}\tMAE\tMRAE\tKLD\n')
+
+            reports = qp.util.parallel(task, grid, n_jobs=-1)
+            with open(global_result_path + '.csv', 'at') as csv:
+                for val, report in zip(grid, reports):
+                    means = report.mean()
+                    local_result_path = global_result_path + '_' + dataset + (f'_{val:.3f}' if isinstance(val, float) else f'{val}')
+                    report.to_csv(f'{local_result_path}.dataframe')
+                    csv.write(f'{method}\t{dataset}\t{val}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
+                    csv.flush()
+
+    show_results(global_result_path)
@@ -5,7 +5,7 @@ import pandas as pd

 import quapy as qp
 from method.aggregative import DistributionMatching
-from distribution_matching.method_kdey import KDEy
+from distribution_matching.method.method_kdey import KDEy
 from protocol import UPP

