From 1d893010893d719664b161c2f33c676f3e6a2aff Mon Sep 17 00:00:00 2001
From: Alex Moreo <alejandro.moreo@isti.cnr.it>
Date: Wed, 27 Jan 2021 22:49:54 +0100
Subject: [PATCH] more UCI datasets, plots improved (larger fonts), and
 evaluation script that prints numerical results on the command line

---
 TweetSentQuant/evaluate_results.py | 28 +++++++++++++
 TweetSentQuant/gen_plots.py        | 33 +++++++--------
 TweetSentQuant/util.py             |  2 +
 quapy/__init__.py                  |  3 +-
 quapy/data/datasets.py             | 64 +++++++++++++++++++++++++-----
 quapy/plot.py                      |  6 ++-
 test.py                            |  4 +-
 7 files changed, 108 insertions(+), 32 deletions(-)
 create mode 100644 TweetSentQuant/evaluate_results.py

diff --git a/TweetSentQuant/evaluate_results.py b/TweetSentQuant/evaluate_results.py
new file mode 100644
index 0000000..a8aba9d
--- /dev/null
+++ b/TweetSentQuant/evaluate_results.py
@@ -0,0 +1,28 @@
+import quapy as qp
+import settings
+import os
+import pickle
+from glob import glob
+import itertools
+import pathlib
+
+qp.environ['SAMPLE_SIZE'] = settings.SAMPLE_SIZE
+
+resultdir = './results'
+methods = ['*']
+
+def evaluate_results(methods, datasets, error_name):
+    """Print, sorted by file name, the error between true and estimated prevalences of each matching result file."""
+    results_str = []
+    error = qp.error.from_name(error_name)
+    for method, dataset in itertools.product(methods, datasets):
+        # method and dataset may be glob patterns, so a single pair can match several result files
+        for experiment in glob(f'{resultdir}/{dataset}-{method}-{error_name}.pkl'):
+            with open(experiment, 'rb') as fin:  # close the handle deterministically instead of leaking it
+                true_prevalences, estim_prevalences, tr_prev, te_prev, te_prev_estim, best_params = pickle.load(fin)
+            result = error(true_prevalences, estim_prevalences)
+            results_str.append(f'{pathlib.Path(experiment).name}: {result:.3f}')
+    for r in sorted(results_str):
+        print(r)
+
+evaluate_results(methods=['epacc*mae1k'], datasets=['*'], error_name='mae')
\ No newline at end of file
diff --git a/TweetSentQuant/gen_plots.py b/TweetSentQuant/gen_plots.py
index 62d63be..4952999 100644
--- a/TweetSentQuant/gen_plots.py
+++ b/TweetSentQuant/gen_plots.py
@@ -10,6 +10,7 @@ from os.path import join
 
 
 qp.environ['SAMPLE_SIZE'] = settings.SAMPLE_SIZE
+plotext='png'
 
 resultdir = './results'
 plotdir = './plots'
@@ -30,7 +31,7 @@ def gather_results(methods, error_name):
 def plot_error_by_drift(methods, error_name, logscale=False, path=None):
     print('plotting error by drift')
     if path is not None:
-        path = join(path, f'error_by_drift_{error_name}.pdf')
+        path = join(path, f'error_by_drift_{error_name}.{plotext}')
     method_names, true_prevs, estim_prevs, tr_prevs = gather_results(methods, error_name)
     qp.plot.error_by_drift(
         method_names,
@@ -51,9 +52,9 @@ def diagonal_plot(methods, error_name, path=None):
     if path is not None:
         path = join(path, f'diag_{error_name}')
     method_names, true_prevs, estim_prevs, tr_prevs = gather_results(methods, error_name)
-    qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=0, title='Negative', legend=False, show_std=False, savepath=path+'_neg.pdf')
-    qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title='Neutral',  legend=False, show_std=False, savepath=path+'_neu.pdf')
-    qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=2, title='Positive', legend=True, show_std=False, savepath=path+'_pos.pdf')
+    qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=0, title='Negative', legend=False, show_std=False, savepath=f'{path}_neg.{plotext}')
+    qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title='Neutral',  legend=False, show_std=False, savepath=f'{path}_neu.{plotext}')
+    qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=2, title='Positive', legend=True, show_std=False, savepath=f'{path}_pos.{plotext}')
 
 
 def binary_bias_global(methods, error_name, path=None):
@@ -61,9 +62,9 @@ def binary_bias_global(methods, error_name, path=None):
     if path is not None:
         path = join(path, f'globalbias_{error_name}')
     method_names, true_prevs, estim_prevs, tr_prevs = gather_results(methods, error_name)
-    qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=0, title='Negative', savepath=path+'_neg.pdf')
-    qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title='Neutral', savepath=path+'_neu.pdf')
-    qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=2, title='Positive', savepath=path+'_pos.pdf')
+    qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=0, title='Negative', savepath=f'{path}_neg.{plotext}')
+    qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title='Neutral', savepath=f'{path}_neu.{plotext}')
+    qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=2, title='Positive', savepath=f'{path}_pos.{plotext}')
 
 
 def binary_bias_bins(methods, error_name, path=None):
@@ -71,24 +72,24 @@ def binary_bias_bins(methods, error_name, path=None):
     if path is not None:
         path = join(path, f'localbias_{error_name}')
     method_names, true_prevs, estim_prevs, tr_prevs = gather_results(methods, error_name)
-    qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=0, title='Negative', legend=False, savepath=path+'_neg.pdf')
-    qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title='Neutral', legend=False, savepath=path+'_neu.pdf')
-    qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=2, title='Positive', legend=True, savepath=path+'_pos.pdf')
+    qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=0, title='Negative', legend=False, savepath=f'{path}_neg.{plotext}')
+    qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title='Neutral', legend=False, savepath=f'{path}_neu.{plotext}')
+    qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=2, title='Positive', legend=True, savepath=f'{path}_pos.{plotext}')
 
 
 gao_seb_methods = ['cc', 'acc', 'pcc', 'pacc', 'sld', 'svmq', 'svmkld', 'svmnkld']
 new_methods_ae = ['svmmae' , 'epaccmaeptr', 'epaccmaemae', 'hdy', 'quanet']
 new_methods_rae = ['svmmrae' , 'epaccmraeptr', 'epaccmraemrae', 'hdy', 'quanet']
 
-# plot_error_by_drift(gao_seb_methods+new_methods_ae, error_name='ae', path=plotdir)
-# plot_error_by_drift(gao_seb_methods+new_methods_rae, error_name='rae', logscale=True, path=plotdir)
+plot_error_by_drift(gao_seb_methods+new_methods_ae, error_name='ae', path=plotdir)
+plot_error_by_drift(gao_seb_methods+new_methods_rae, error_name='rae', logscale=True, path=plotdir)
 
-# diagonal_plot(gao_seb_methods+new_methods_ae, error_name='ae', path=plotdir)
-# diagonal_plot(gao_seb_methods+new_methods_rae, error_name='rae', path=plotdir)
+diagonal_plot(gao_seb_methods+new_methods_ae, error_name='ae', path=plotdir)
+diagonal_plot(gao_seb_methods+new_methods_rae, error_name='rae', path=plotdir)
 
 binary_bias_global(gao_seb_methods+new_methods_ae, error_name='ae', path=plotdir)
 binary_bias_global(gao_seb_methods+new_methods_rae, error_name='rae', path=plotdir)
 
-# binary_bias_bins(gao_seb_methods+new_methods_ae, error_name='ae', path=plotdir)
-# binary_bias_bins(gao_seb_methods+new_methods_rae, error_name='rae', path=plotdir)
+#binary_bias_bins(gao_seb_methods+new_methods_ae, error_name='ae', path=plotdir)
+#binary_bias_bins(gao_seb_methods+new_methods_rae, error_name='rae', path=plotdir)
 
diff --git a/TweetSentQuant/util.py b/TweetSentQuant/util.py
index 7705992..fef866e 100644
--- a/TweetSentQuant/util.py
+++ b/TweetSentQuant/util.py
@@ -1,3 +1,5 @@
+import numpy as np
+
 
 nice = {
     'mae':'AE',
diff --git a/quapy/__init__.py b/quapy/__init__.py
index f2cd0ac..00fceb1 100644
--- a/quapy/__init__.py
+++ b/quapy/__init__.py
@@ -10,6 +10,8 @@ from . import model_selection
 from . import classification
 from quapy.method.base import isprobabilistic, isaggregative
 
+__version__ = '0.1'
+
 environ = {
     'SAMPLE_SIZE': None,
     'UNK_TOKEN': '[UNK]',
@@ -18,6 +20,5 @@ environ = {
     'PAD_INDEX': 1,
 }
 
-
 def isbinary(x):
     return x.binary
\ No newline at end of file
diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py
index bdec1c2..00c4d7d 100644
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@@ -148,7 +148,11 @@ UCI_DATASETS = ['acute.a', 'acute.b',
                 'pageblocks.5',
                 #'phoneme', # <-- I haven't found this one...
                 'semeion',
-                'sonar'] # ongoing...
+                'sonar',
+                'spambase',
+                'spectf',
+                'tictactoe',
+                'transfusion'] # ongoing...
 
 def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3):
 
@@ -180,8 +184,11 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
         'mammographic': 'Mammographic Mass',
         'pageblocks.5': 'Page Blocks Classification (5)',
         'semeion': 'Semeion Handwritten Digit (8)',
-        'sonar': 'Sonar, Mines vs. Rocks'
-
+        'sonar': 'Sonar, Mines vs. Rocks',
+        'spambase': 'Spambase Data Set',
+        'spectf': 'SPECTF Heart Data',
+        'tictactoe': 'Tic-Tac-Toe Endgame Database',
+        'transfusion': 'Blood Transfusion Service Center Data Set '
     }
 
     # the identifier is an alias for the dataset group, it's part of the url data-folder, and is the name we use
@@ -208,8 +215,11 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
         'mammographic': 'mammographic-masses',
         'pageblocks.5': 'page-blocks',
         'semeion': 'semeion',
-        'sonar': 'undocumented/connectionist-bench/sonar'
-
+        'sonar': 'undocumented/connectionist-bench/sonar',
+        'spambase': 'spambase',
+        'spectf': 'spect',
+        'tictactoe': 'tic-tac-toe',
+        'transfusion': 'blood-transfusion'
     }
 
     # the filename is the name of the file within the data_folder indexed by the identifier
@@ -219,7 +229,9 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
         'statlog/german': 'german.data-numeric',
         'mammographic-masses': 'mammographic_masses.data',
         'page-blocks': 'page-blocks.data.Z',
-        'undocumented/connectionist-bench/sonar': 'sonar.all-data'
+        'undocumented/connectionist-bench/sonar': 'sonar.all-data',
+        'spect': ['SPECTF.train', 'SPECTF.test'],
+        'blood-transfusion': 'transfusion.data'
     }
 
     # the filename containing the dataset description (if any)
@@ -228,7 +240,9 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
         '00193': None,
         'statlog/german': 'german.doc',
         'mammographic-masses': 'mammographic_masses.names',
-        'undocumented/connectionist-bench/sonar': 'sonar.names'
+        'undocumented/connectionist-bench/sonar': 'sonar.names',
+        'spect': 'SPECTF.names',
+        'blood-transfusion': 'transfusion.names'
     }
 
     identifier = identifier_map[dataset_name]
@@ -238,8 +252,9 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
 
     URL = f'http://archive.ics.uci.edu/ml/machine-learning-databases/{identifier}'
     data_dir = join(data_home, 'uci_datasets', identifier)
-    data_path = join(data_dir, filename)
-    download_file_if_not_exists(f'{URL}/{filename}', data_path)
+    if isinstance(filename, str):  # filename could be a list of files, in which case it will be processed later
+        data_path = join(data_dir, filename)
+        download_file_if_not_exists(f'{URL}/{filename}', data_path)
 
     if descfile:
         try:
@@ -368,11 +383,38 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
 
     if identifier == 'undocumented/connectionist-bench/sonar':
         df = pd.read_csv(data_path, header=None, sep=',')
-        print(df)
         X = df.iloc[:, 0:60].astype(float).values
-        y = df[60].values 
+        y = df[60].values
         y = binarize(y, pos_class='R')
 
+    if identifier == 'spambase':
+        df = pd.read_csv(data_path, header=None, sep=',')
+        X = df.iloc[:, 0:57].astype(float).values
+        y = df[57].values
+        y = binarize(y, pos_class=1)
+
+    if identifier == 'spect':
+        dfs = []
+        for file in filename:  # for this dataset, filename is a list with one entry per data file
+            data_path = join(data_dir, file)
+            download_file_if_not_exists(f'{URL}/{file}', data_path)  # the URL must name this file, not the whole list
+            dfs.append(pd.read_csv(data_path, header=None, sep=','))
+        df = pd.concat(dfs)
+        X = df.iloc[:, 1:45].astype(float).values
+        y = df[0].values
+        y = binarize(y, pos_class=0)
+
+    if identifier == 'tic-tac-toe':
+        df = pd.read_csv(data_path, header=None, sep=',')
+        X = df.iloc[:, 0:9].replace('o',0).replace('b',1).replace('x',2).values
+        y = df[9].values
+        y = binarize(y, pos_class='negative')
+
+    if identifier == 'blood-transfusion':
+        df = pd.read_csv(data_path, sep=',')
+        X = df.iloc[:, 0:4].astype(float).values
+        y = df.iloc[:, 4].values
+        y = binarize(y, pos_class=1)
 
     data = LabelledCollection(X, y)
     data.stats()
diff --git a/quapy/plot.py b/quapy/plot.py
index ea757ec..270fb80 100644
--- a/quapy/plot.py
+++ b/quapy/plot.py
@@ -5,9 +5,11 @@ import numpy as np
 from matplotlib import cm
 
 import quapy as qp
+from matplotlib.font_manager import FontProperties
 
 plt.rcParams['figure.figsize'] = [12, 8]
 plt.rcParams['figure.dpi'] = 200
+plt.rcParams['font.size'] = 16
 
 
 def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=None, show_std=True, legend=True, savepath=None):
@@ -44,11 +46,11 @@ def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=No
 
 
 def binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title=None, savepath=None):
+    method_names, true_prevs, estim_prevs = _merge(method_names, true_prevs, estim_prevs)
+
     fig, ax = plt.subplots()
     ax.grid()
 
-    method_names, true_prevs, estim_prevs = _merge(method_names, true_prevs, estim_prevs)
-
     data, labels = [], []
     for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs):
         true_prev = true_prev[:,pos_class]
diff --git a/test.py b/test.py
index 9641fde..b301c54 100644
--- a/test.py
+++ b/test.py
@@ -12,8 +12,8 @@ from classification.neural import NeuralClassifierTrainer, CNNnet
 from method.meta import EPACC
 from quapy.model_selection import GridSearchQ
 
-# dataset = qp.datasets.fetch_UCIDataset('sonar', verbose=True)
-# sys.exit(0)
+dataset = qp.datasets.fetch_UCIDataset('transfusion', verbose=True)
+sys.exit(0)
 
 
 qp.environ['SAMPLE_SIZE'] = 500