From 41347b50f9f9b1292dc661126dc6b04fa64bd0f2 Mon Sep 17 00:00:00 2001
From: Alex Moreo <alejandro.moreo@isti.cnr.it>
Date: Mon, 11 Jan 2021 12:55:06 +0100
Subject: [PATCH] cleaning and adding some uci datasets

---
 quapy/data/datasets.py      |  43 ++++++++++++++-
 quapy/method/aggregative.py | 105 +++++++++++++++++++++++++++---------
 quapy/plot.py               |  15 +++---
 3 files changed, 128 insertions(+), 35 deletions(-)

diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py
index 0ef233e..b919959 100644
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@@ -120,7 +120,10 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom
 
 
 UCI_DATASETS = ['acute.a', 'acute.b',
-                'balance.1', 'balance.2', 'balance.3']
+                'balance.1', 'balance.2', 'balance.3',
+                'breast-cancer',
+                'cmc.1', 'cmc.2', 'cmc.3',
+                'ctg.1', 'ctg.2', 'ctg.3'] # ongoing...
 
 def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
 
@@ -136,6 +139,14 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
         'balance.1': 'balance-scale',
         'balance.2': 'balance-scale',
         'balance.3': 'balance-scale',
+        'breast-cancer': 'breast-cancer-wisconsin',
+        'cmc.1': 'cmc',
+        'cmc.2': 'cmc',
+        'cmc.3': 'cmc',
+        'ctg.1': 'ctg',
+        'ctg.2': 'ctg',
+        'ctg.3': 'ctg',
+
     }
 
     dataset_fullname = {
@@ -144,11 +155,20 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
         'balance.1': 'Balance Scale Weight & Distance Database (left)',
         'balance.2': 'Balance Scale Weight & Distance Database (balanced)',
         'balance.3': 'Balance Scale Weight & Distance Database (right)',
+        'breast-cancer':  'Breast Cancer Wisconsin (Original)',
+        'cmc.1': 'Contraceptive Method Choice (no use)',
+        'cmc.2': 'Contraceptive Method Choice (long term)',
+        'cmc.3': 'Contraceptive Method Choice (short term)',
+        'ctg.1': 'Cardiotocography Data Set (normal)',
+        'ctg.2': 'Cardiotocography Data Set (suspect)',
+        'ctg.3': 'Cardiotocography Data Set (pathologic)',
     }
 
     data_folder = {
         'acute': 'diagnosis',
         'balance-scale': 'balance-scale',
+        'breast-cancer-wisconsin': 'breast-cancer-wisconsin',
+        'cmc': 'cmc'
     }
 
     identifier = identifier_map[dataset_name]
@@ -183,8 +203,29 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
             y = binarize(df[0], pos_class='R')
         X = df.loc[:, 1:].astype(float).values
 
+    if identifier == 'breast-cancer-wisconsin':
+        df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, sep=',')
+        Xy = df.loc[:, 1:10]
+        Xy[Xy=='?']=np.nan
+        Xy = Xy.dropna(axis=0)
+        X = Xy.loc[:, 1:9]
+        X = X.astype(float).values
+        y = binarize(Xy[10], pos_class=4)
+
+    if identifier == 'cmc':
+        df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, sep=',')
+        X = df.loc[:, 0:8].astype(float).values
+        y = df[9].astype(int).values
+        if dataset_name == 'cmc.1':
+            y = binarize(y, pos_class=1)
+        elif dataset_name == 'cmc.2':
+            y = binarize(y, pos_class=2)
+        elif dataset_name == 'cmc.3':
+            y = binarize(y, pos_class=3)
+
     data = LabelledCollection(X, y)
     data.stats()
+    raise NotImplementedError()
     #print(df)
     #print(df.loc[:, 0:5].values)
     #print(y)
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index 2613d8e..5a04123 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -11,6 +11,8 @@ from sklearn.calibration import CalibratedClassifierCV
 from joblib import Parallel, delayed
 from abc import abstractmethod
 from typing import Union
+from sklearn.model_selection import StratifiedKFold
+from tqdm import tqdm
 
 
 # Abstract classes
@@ -115,8 +117,8 @@ def training_helper(learner,
                 train = data
                 unused = val_split
             else:
-                raise ValueError('train_val_split not understood; use either a float indicating the split proportion, '
-                                 'or a LabelledCollection indicating the validation split')
+                raise ValueError('param "val_split" not understood; use either a float indicating the split '
+                                 'proportion, or a LabelledCollection indicating the validation split')
         else:
             train, unused = data, None
         learner.fit(train.instances, train.labels)
@@ -159,23 +161,49 @@ class ACC(AggregativeQuantifier):
     def __init__(self, learner:BaseEstimator):
         self.learner = learner
 
-    def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3):
+    def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.3):
         """
         Trains a ACC quantifier
         :param data: the training set
         :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
         :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
          validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
-         indicating the validation set itself
+         indicating the validation set itself, or an int k indicating the number of folds of the stratified
+         k-fold cross-validation (kFCV) used to estimate the parameters
         :return: self
         """
-        self.learner, validation = training_helper(self.learner, data, fit_learner, val_split=val_split)
+        if isinstance(val_split, int):
+            # kFCV estimation of parameters
+            y, y_ = [], []
+            kfcv = StratifiedKFold(n_splits=val_split)
+            pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
+            for k, (training_idx, validation_idx) in enumerate(pbar):
+                pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
+                training = data.sampling_from_index(training_idx)
+                validation = data.sampling_from_index(validation_idx)
+                learner, val_data = training_helper(self.learner, training, fit_learner, val_split=validation)
+                y_.append(learner.predict(val_data.instances))
+                y.append(val_data.labels)
+
+            y = np.concatenate(y)
+            y_ = np.concatenate(y_)
+            class_count = data.counts()
+
+            # fit the learner on all data
+            self.learner.fit(*data.Xy)
+
+        else:
+            self.learner, val_data = training_helper(self.learner, data, fit_learner, val_split=val_split)
+            y_ = self.learner.predict(val_data.instances)
+            y = val_data.labels
+            class_count = val_data.counts()
+
         self.cc = CC(self.learner)
-        y_ = self.classify(validation.instances)
-        y  = validation.labels
+
         # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
         # document that belongs to yj ends up being classified as belonging to yi
-        self.Pte_cond_estim_ = confusion_matrix(y,y_).T / validation.counts()
+        self.Pte_cond_estim_ = confusion_matrix(y, y_).T / class_count
+
         return self
 
     def classify(self, data):
@@ -216,33 +244,53 @@ class PACC(AggregativeProbabilisticQuantifier):
     def __init__(self, learner:BaseEstimator):
         self.learner = learner
 
-    def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3):
+    def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.3):
         """
         Trains a PACC quantifier
         :param data: the training set
         :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
         :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
          validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
-         indicating the validation set itself
+         indicating the validation set itself, or an int k indicating the number of folds of the stratified
+         k-fold cross-validation (kFCV) used to estimate the parameters
         :return: self
         """
-        self.learner, validation = training_helper(
-            self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split
-        )
+        if isinstance(val_split, int):
+            # kFCV estimation of parameters
+            y, y_ = [], []
+            kfcv = StratifiedKFold(n_splits=val_split)
+            pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
+            for k, (training_idx, validation_idx) in enumerate(pbar):
+                pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
+                training = data.sampling_from_index(training_idx)
+                validation = data.sampling_from_index(validation_idx)
+                learner, val_data = training_helper(
+                    self.learner, training, fit_learner, ensure_probabilistic=True, val_split=validation)
+                y_.append(learner.predict_proba(val_data.instances))
+                y.append(val_data.labels)
+
+            y = np.concatenate(y)
+            y_ = np.vstack(y_)
+
+            # fit the learner on all data
+            self.learner.fit(*data.Xy)
+
+        else:
+            self.learner, val_data = training_helper(
+                self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
+            y_ = self.learner.predict_proba(val_data.instances)
+            y = val_data.labels
+
         self.pcc = PCC(self.learner)
-        y_ = self.soft_classify(validation.instances)
-        y  = validation.labels
+
+        # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), computed as the mean posterior
+        # probability that the learner assigns to class yi over the validation documents that belong to yj
         confusion = np.empty(shape=(data.n_classes, data.n_classes))
         for yi in range(data.n_classes):
             confusion[yi] = y_[y==yi].mean(axis=0)
 
         self.Pte_cond_estim_ = confusion.T
 
-        #y_ = self.classify(validation.instances)
-        #y = validation.labels
-        # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
-        # document that belongs to yj ends up being classified as belonging to yi
-        #self.Pte_cond_estim_ = confusion_matrix(y, y_).T / validation.counts()
         return self
 
     def aggregate(self, classif_posteriors):
@@ -261,7 +309,7 @@ class EMQ(AggregativeProbabilisticQuantifier):
     MAX_ITER = 1000
     EPSILON = 1e-4
 
-    def __init__(self, learner:BaseEstimator):
+    def __init__(self, learner: BaseEstimator):
         self.learner = learner
 
     def fit(self, data: LabelledCollection, fit_learner=True):
@@ -307,10 +355,10 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
     estimation based on the Hellinger distance. Information Sciences, 218:146–164.
     """
 
-    def __init__(self, learner:BaseEstimator):
+    def __init__(self, learner: BaseEstimator):
         self.learner = learner
 
-    def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3):
+    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection]=0.3):
         """
         Trains a HDy quantifier
         :param data: the training set
@@ -404,9 +452,9 @@ ClassifyAndCount = CC
 AdjustedClassifyAndCount = ACC
 ProbabilisticClassifyAndCount = PCC
 ProbabilisticAdjustedClassifyAndCount = PACC
-ExplicitLossMinimisation = ELM
 ExpectationMaximizationQuantifier = EMQ
 HellingerDistanceY = HDy
+ExplicitLossMinimisation = ELM
 
 
 class OneVsAll(AggregativeQuantifier):
@@ -436,6 +484,9 @@ class OneVsAll(AggregativeQuantifier):
         return self
 
     def classify(self, instances):
+        # returns a matrix of shape (n,m) with n the number of instances and m the number of classes. The entry
+        # (i,j) is a binary value indicating whether instance i belongs to class j. The binary classifications are
+        # independent of each other, meaning that an instance can end up being attributed to 0, 1, or more classes.
         classif_predictions_bin = self.__parallel(self._delayed_binary_classification, instances)
         return classif_predictions_bin.T
 
@@ -475,10 +526,12 @@ class OneVsAll(AggregativeQuantifier):
         return self.dict_binary_quantifiers[c].classify(X)
 
     def _delayed_binary_quantify(self, c, X):
-        return self.dict_binary_quantifiers[c].quantify(X)[1]  # the estimation for the positive class prevalence
+        # the estimate of the positive class prevalence
+        return self.dict_binary_quantifiers[c].quantify(X)[1]
 
     def _delayed_binary_aggregate(self, c, classif_predictions):
-        return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:,c])[1]  # the estimation for the positive class prevalence
+        # the estimate of the positive class prevalence
+        return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1]
 
     def _delayed_binary_fit(self, c, data):
         bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
diff --git a/quapy/plot.py b/quapy/plot.py
index 2b9375b..5164a59 100644
--- a/quapy/plot.py
+++ b/quapy/plot.py
@@ -55,15 +55,15 @@ def binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title
     save_or_show(savepath)
 
 
-def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=None, nbins=21, colormap=cm.tab10,
+def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=None, nbins=5, colormap=cm.tab10,
                      vertical_xticks=False, savepath=None):
     from pylab import boxplot, plot, setp
 
     fig, ax = plt.subplots()
     ax.grid()
 
-    bins = np.linspace(0, 1, nbins)
-    binwidth = 1/(nbins - 1)
+    bins = np.linspace(0, 1, nbins+1)
+    binwidth = 1/nbins
     data = {}
     for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs):
         true_prev = true_prev[:,pos_class]
@@ -110,7 +110,7 @@ def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=N
     # set_visible to False for all but the first element) after the legend has been placed
     hs=[ax.plot([0, 1], [0, 0], '-k', zorder=2)[0]]
     for colorid in range(len(method_names)):
-        h, = plot([1, 1], '-s', markerfacecolor=colormap.colors[colorid], color='k',
+        h, = plot([0, 0], '-s', markerfacecolor=colormap.colors[colorid], color='k',
                   mec=colormap.colors[colorid], linewidth=1.)
         hs.append(h)
     box = ax.get_position()
@@ -126,7 +126,7 @@ def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=N
     save_or_show(savepath)
 
 
-def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=21, error_name='ae', show_std=True,
+def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=20, error_name='ae', show_std=True,
                         title=f'Quantification error as a function of distribution shift',
                         savepath=None):
 
@@ -135,7 +135,6 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=21, e
 
     x_error = qp.error.ae
     y_error = getattr(qp.error, error_name)
-    ndims = tr_prevs[0].shape[-1]
 
     # join all data, and keep the order in which the methods appeared for the first time
     data = defaultdict(lambda:{'x':np.empty(shape=(0)), 'y':np.empty(shape=(0))})
@@ -152,8 +151,8 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=21, e
         if method not in method_order:
             method_order.append(method)
 
-    bins = np.linspace(0, 1, n_bins)
-    binwidth = 1 / (n_bins - 1)
+    bins = np.linspace(0, 1, n_bins+1)
+    binwidth = 1 / n_bins
     min_x, max_x = None, None
     for method in method_order:
         tr_test_drifts = data[method]['x']