More UCI datasets, improved plots (larger fonts), and an evaluation script that shows numerical results on the command line

This commit is contained in:
Alejandro Moreo Fernandez 2021-01-27 22:49:54 +01:00
parent e609c262b4
commit 1d89301089
7 changed files with 108 additions and 32 deletions

View File

@ -0,0 +1,28 @@
import quapy as qp
import settings
import os
import pickle
from glob import glob
import itertools
import pathlib
# Copy the sample size configured in the project settings into quapy's global environment.
qp.environ['SAMPLE_SIZE'] = settings.SAMPLE_SIZE
# Directory that is globbed for the pickled experiment results.
resultdir = './results'
# Default list of method patterns; '*' acts as a glob wildcard once interpolated into the file pattern.
# NOTE(review): the call at the bottom of this script passes its own list, so this default looks unused -- confirm.
methods = ['*']
def evaluate_results(methods, datasets, error_name):
    """Print the error of every stored experiment that matches the given patterns.

    For each (method, dataset) combination, result files named
    ``{resultdir}/{dataset}-{method}-{error_name}.pkl`` are located with
    ``glob`` (so names may contain wildcards such as ``*``). For each file the
    error between the stored true and estimated prevalences is computed, and
    one ``<file name>: <error>`` line (three decimals) is printed per file, in
    alphabetical order.

    :param methods: iterable of method names or glob patterns
    :param datasets: iterable of dataset names or glob patterns
    :param error_name: name of the error function, resolved through
        ``qp.error.from_name``; it is also the last component of the file name
    """
    error = qp.error.from_name(error_name)
    results_str = []
    for method, dataset in itertools.product(methods, datasets):
        for experiment in glob(f'{resultdir}/{dataset}-{method}-{error_name}.pkl'):
            # Open through a context manager so the handle is closed even if
            # unpickling fails (the file used to be left open).
            # NOTE: pickle must only be used on trusted result files.
            with open(experiment, 'rb') as fin:
                true_prevalences, estim_prevalences, tr_prev, te_prev, te_prev_estim, best_params = \
                    pickle.load(fin)
            result = error(true_prevalences, estim_prevalences)
            results_str.append(f'{pathlib.Path(experiment).name}: {result:.3f}')
    # Sort the report lines so the output order does not depend on glob order.
    for line in sorted(results_str):
        print(line)
# Report the 'mae' error for the result files of any dataset whose method name matches 'epacc*mae1k'.
evaluate_results(methods=['epacc*mae1k'], datasets=['*'], error_name='mae')

View File

@ -10,6 +10,7 @@ from os.path import join
# Copy the sample size configured in the project settings into quapy's global environment.
qp.environ['SAMPLE_SIZE'] = settings.SAMPLE_SIZE
# File extension appended to the save paths built by the plotting helpers below.
plotext='png'
# NOTE(review): presumably the directory the experiment results are read from -- confirm against gather_results.
resultdir = './results'
# Output directory passed as `path` to the plotting helpers at the bottom of the script.
plotdir = './plots'
@ -30,7 +31,7 @@ def gather_results(methods, error_name):
def plot_error_by_drift(methods, error_name, logscale=False, path=None):
print('plotting error by drift')
if path is not None:
path = join(path, f'error_by_drift_{error_name}.pdf')
path = join(path, f'error_by_drift_{error_name}.{plotext}')
method_names, true_prevs, estim_prevs, tr_prevs = gather_results(methods, error_name)
qp.plot.error_by_drift(
method_names,
@ -51,9 +52,9 @@ def diagonal_plot(methods, error_name, path=None):
if path is not None:
path = join(path, f'diag_{error_name}')
method_names, true_prevs, estim_prevs, tr_prevs = gather_results(methods, error_name)
qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=0, title='Negative', legend=False, show_std=False, savepath=path+'_neg.pdf')
qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title='Neutral', legend=False, show_std=False, savepath=path+'_neu.pdf')
qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=2, title='Positive', legend=True, show_std=False, savepath=path+'_pos.pdf')
qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=0, title='Negative', legend=False, show_std=False, savepath=f'{path}_neg.{plotext}')
qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title='Neutral', legend=False, show_std=False, savepath=f'{path}_neu.{plotext}')
qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=2, title='Positive', legend=True, show_std=False, savepath=f'{path}_pos.{plotext}')
def binary_bias_global(methods, error_name, path=None):
@ -61,9 +62,9 @@ def binary_bias_global(methods, error_name, path=None):
if path is not None:
path = join(path, f'globalbias_{error_name}')
method_names, true_prevs, estim_prevs, tr_prevs = gather_results(methods, error_name)
qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=0, title='Negative', savepath=path+'_neg.pdf')
qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title='Neutral', savepath=path+'_neu.pdf')
qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=2, title='Positive', savepath=path+'_pos.pdf')
qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=0, title='Negative', savepath=f'{path}_neg.{plotext}')
qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title='Neutral', savepath=f'{path}_neu.{plotext}')
qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=2, title='Positive', savepath=f'{path}_pos.{plotext}')
def binary_bias_bins(methods, error_name, path=None):
@ -71,24 +72,24 @@ def binary_bias_bins(methods, error_name, path=None):
if path is not None:
path = join(path, f'localbias_{error_name}')
method_names, true_prevs, estim_prevs, tr_prevs = gather_results(methods, error_name)
qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=0, title='Negative', legend=False, savepath=path+'_neg.pdf')
qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title='Neutral', legend=False, savepath=path+'_neu.pdf')
qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=2, title='Positive', legend=True, savepath=path+'_pos.pdf')
qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=0, title='Negative', legend=False, savepath=f'{path}_neg.{plotext}')
qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title='Neutral', legend=False, savepath=f'{path}_neu.{plotext}')
qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=2, title='Positive', legend=True, savepath=f'{path}_pos.{plotext}')
gao_seb_methods = ['cc', 'acc', 'pcc', 'pacc', 'sld', 'svmq', 'svmkld', 'svmnkld']
new_methods_ae = ['svmmae' , 'epaccmaeptr', 'epaccmaemae', 'hdy', 'quanet']
new_methods_rae = ['svmmrae' , 'epaccmraeptr', 'epaccmraemrae', 'hdy', 'quanet']
# plot_error_by_drift(gao_seb_methods+new_methods_ae, error_name='ae', path=plotdir)
# plot_error_by_drift(gao_seb_methods+new_methods_rae, error_name='rae', logscale=True, path=plotdir)
plot_error_by_drift(gao_seb_methods+new_methods_ae, error_name='ae', path=plotdir)
plot_error_by_drift(gao_seb_methods+new_methods_rae, error_name='rae', logscale=True, path=plotdir)
# diagonal_plot(gao_seb_methods+new_methods_ae, error_name='ae', path=plotdir)
# diagonal_plot(gao_seb_methods+new_methods_rae, error_name='rae', path=plotdir)
diagonal_plot(gao_seb_methods+new_methods_ae, error_name='ae', path=plotdir)
diagonal_plot(gao_seb_methods+new_methods_rae, error_name='rae', path=plotdir)
binary_bias_global(gao_seb_methods+new_methods_ae, error_name='ae', path=plotdir)
binary_bias_global(gao_seb_methods+new_methods_rae, error_name='rae', path=plotdir)
# binary_bias_bins(gao_seb_methods+new_methods_ae, error_name='ae', path=plotdir)
# binary_bias_bins(gao_seb_methods+new_methods_rae, error_name='rae', path=plotdir)
#binary_bias_bins(gao_seb_methods+new_methods_ae, error_name='ae', path=plotdir)
#binary_bias_bins(gao_seb_methods+new_methods_rae, error_name='rae', path=plotdir)

View File

@ -1,3 +1,5 @@
import numpy as np
nice = {
'mae':'AE',

View File

@ -10,6 +10,8 @@ from . import model_selection
from . import classification
from quapy.method.base import isprobabilistic, isaggregative
__version__ = '0.1'
environ = {
'SAMPLE_SIZE': None,
'UNK_TOKEN': '[UNK]',
@ -18,6 +20,5 @@ environ = {
'PAD_INDEX': 1,
}
def isbinary(x):
    """Return the value of the ``binary`` attribute of ``x``.

    :param x: any object exposing a ``binary`` attribute
    :return: whatever ``x.binary`` evaluates to
    """
    is_binary = x.binary
    return is_binary

View File

@ -148,7 +148,11 @@ UCI_DATASETS = ['acute.a', 'acute.b',
'pageblocks.5',
#'phoneme', # <-- I haven't found this one...
'semeion',
'sonar'] # ongoing...
'sonar',
'spambase',
'spectf',
'tictactoe',
'transfusion'] # ongoing...
def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3):
@ -180,8 +184,11 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
'mammographic': 'Mammographic Mass',
'pageblocks.5': 'Page Blocks Classification (5)',
'semeion': 'Semeion Handwritten Digit (8)',
'sonar': 'Sonar, Mines vs. Rocks'
'sonar': 'Sonar, Mines vs. Rocks',
'spambase': 'Spambase Data Set',
'spectf': 'SPECTF Heart Data',
'tictactoe': 'Tic-Tac-Toe Endgame Database',
'transfusion': 'Blood Transfusion Service Center Data Set '
}
# the identifier is an alias for the dataset group, it's part of the url data-folder, and is the name we use
@ -208,8 +215,11 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
'mammographic': 'mammographic-masses',
'pageblocks.5': 'page-blocks',
'semeion': 'semeion',
'sonar': 'undocumented/connectionist-bench/sonar'
'sonar': 'undocumented/connectionist-bench/sonar',
'spambase': 'spambase',
'spectf': 'spect',
'tictactoe': 'tic-tac-toe',
'transfusion': 'blood-transfusion'
}
# the filename is the name of the file within the data_folder indexed by the identifier
@ -219,7 +229,9 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
'statlog/german': 'german.data-numeric',
'mammographic-masses': 'mammographic_masses.data',
'page-blocks': 'page-blocks.data.Z',
'undocumented/connectionist-bench/sonar': 'sonar.all-data'
'undocumented/connectionist-bench/sonar': 'sonar.all-data',
'spect': ['SPECTF.train', 'SPECTF.test'],
'blood-transfusion': 'transfusion.data'
}
# the filename containing the dataset description (if any)
@ -228,7 +240,9 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
'00193': None,
'statlog/german': 'german.doc',
'mammographic-masses': 'mammographic_masses.names',
'undocumented/connectionist-bench/sonar': 'sonar.names'
'undocumented/connectionist-bench/sonar': 'sonar.names',
'spect': 'SPECTF.names',
'blood-transfusion': 'transfusion.names'
}
identifier = identifier_map[dataset_name]
@ -238,8 +252,9 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
URL = f'http://archive.ics.uci.edu/ml/machine-learning-databases/{identifier}'
data_dir = join(data_home, 'uci_datasets', identifier)
data_path = join(data_dir, filename)
download_file_if_not_exists(f'{URL}/{filename}', data_path)
if isinstance(filename, str): # filename could be a list of files, in which case it will be processed later
data_path = join(data_dir, filename)
download_file_if_not_exists(f'{URL}/{filename}', data_path)
if descfile:
try:
@ -368,11 +383,38 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
if identifier == 'undocumented/connectionist-bench/sonar':
df = pd.read_csv(data_path, header=None, sep=',')
print(df)
X = df.iloc[:, 0:60].astype(float).values
y = df[60].values
y = df[60].values
y = binarize(y, pos_class='R')
if identifier == 'spambase':
df = pd.read_csv(data_path, header=None, sep=',')
X = df.iloc[:, 0:57].astype(float).values
y = df[57].values
y = binarize(y, pos_class=1)
if identifier == 'spect':
    # For this identifier `filename` is a list of files (train and test parts):
    # each part is downloaded and parsed separately, then all parts are concatenated.
    dfs = []
    for file in filename:
        data_path = join(data_dir, file)
        # Request each part by its own name; interpolating `filename` (the whole
        # list) into the URL, as was done before, produces an invalid address.
        download_file_if_not_exists(f'{URL}/{file}', data_path)
        dfs.append(pd.read_csv(data_path, header=None, sep=','))
    df = pd.concat(dfs)
    X = df.iloc[:, 1:45].astype(float).values  # columns 1..44 are used as features
    y = df[0].values  # column 0 is used as the label
    y = binarize(y, pos_class=0)
if identifier == 'tic-tac-toe':
df = pd.read_csv(data_path, header=None, sep=',')
X = df.iloc[:, 0:9].replace('o',0).replace('b',1).replace('x',2).values
y = df[9].values
y = binarize(y, pos_class='negative')
if identifier == 'blood-transfusion':
df = pd.read_csv(data_path, sep=',')
X = df.iloc[:, 0:4].astype(float).values
y = df.iloc[:, 4].values
y = binarize(y, pos_class=1)
data = LabelledCollection(X, y)
data.stats()

View File

@ -5,9 +5,11 @@ import numpy as np
from matplotlib import cm
import quapy as qp
from matplotlib.font_manager import FontProperties
# Module-wide matplotlib defaults, set once at import time.
plt.rcParams['figure.figsize'] = [12, 8]  # default figure size (width, height), in inches
plt.rcParams['figure.dpi'] = 200  # default figure resolution
plt.rcParams['font.size'] = 16  # default font size for text in the plots
def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=None, show_std=True, legend=True, savepath=None):
@ -44,11 +46,11 @@ def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=No
def binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title=None, savepath=None):
method_names, true_prevs, estim_prevs = _merge(method_names, true_prevs, estim_prevs)
fig, ax = plt.subplots()
ax.grid()
method_names, true_prevs, estim_prevs = _merge(method_names, true_prevs, estim_prevs)
data, labels = [], []
for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs):
true_prev = true_prev[:,pos_class]

View File

@ -12,8 +12,8 @@ from classification.neural import NeuralClassifierTrainer, CNNnet
from method.meta import EPACC
from quapy.model_selection import GridSearchQ
# dataset = qp.datasets.fetch_UCIDataset('sonar', verbose=True)
# sys.exit(0)
dataset = qp.datasets.fetch_UCIDataset('transfusion', verbose=True)
sys.exit(0)
qp.environ['SAMPLE_SIZE'] = 500