Cleaning and adding some UCI datasets

2021-01-11 12:55:06 +01:00 · 2021-01-11 12:55:06 +01:00 · 41347b50f9
parent d1b449d2e9
commit 41347b50f9
3 changed files with 128 additions and 35 deletions
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@ -120,7 +120,10 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom


 UCI_DATASETS = ['acute.a', 'acute.b',
-                'balance.1', 'balance.2', 'balance.3']
+                'balance.1', 'balance.2', 'balance.3',
+                'breast-cancer',
+                'cmc.1', 'cmc.2', 'cmc.3',
+                'ctg.1', 'ctg.2', 'ctg.3'] # ongoing...

 def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):

@ -136,6 +139,14 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
        'balance.1': 'balance-scale',
        'balance.2': 'balance-scale',
        'balance.3': 'balance-scale',
+        'breast-cancer': 'breast-cancer-wisconsin',
+        'cmc.1': 'cmc',
+        'cmc.2': 'cmc',
+        'cmc.3': 'cmc',
+        'ctg.1': 'ctg',
+        'ctg.2': 'ctg',
+        'ctg.3': 'ctg',
+
    }

    dataset_fullname = {
@ -144,11 +155,20 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
        'balance.1': 'Balance Scale Weight & Distance Database (left)',
        'balance.2': 'Balance Scale Weight & Distance Database (balanced)',
        'balance.3': 'Balance Scale Weight & Distance Database (right)',
+        'breast-cancer':  'Breast Cancer Wisconsin (Original)',
+        'cmc.1': 'Contraceptive Method Choice (no use)',
+        'cmc.2': 'Contraceptive Method Choice (long term)',
+        'cmc.3': 'Contraceptive Method Choice (short term)',
+        'ctg.1': 'Cardiotocography Data Set (normal)',
+        'ctg.2': 'Cardiotocography Data Set (suspect)',
+        'ctg.3': 'Cardiotocography Data Set (pathologic)',
    }

    data_folder = {
        'acute': 'diagnosis',
        'balance-scale': 'balance-scale',
+        'breast-cancer-wisconsin': 'breast-cancer-wisconsin',
+        'cmc': 'cmc'
    }

    identifier = identifier_map[dataset_name]
@ -183,8 +203,29 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
            y = binarize(df[0], pos_class='R')
        X = df.loc[:, 1:].astype(float).values

+    if identifier == 'breast-cancer-wisconsin':
+        df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, sep=',')
+        Xy = df.loc[:, 1:10]
+        Xy[Xy=='?']=np.nan
+        Xy = Xy.dropna(axis=0)
+        X = Xy.loc[:, 1:9]
+        X = X.astype(float).values
+        y = binarize(Xy[10], pos_class=4)
+
+    if identifier == 'cmc':
+        df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, sep=',')
+        X = df.loc[:, 0:8].astype(float).values
+        y = df[9].astype(int).values
+        if dataset_name == 'cmc.1':
+            y = binarize(y, pos_class=1)
+        elif dataset_name == 'cmc.2':
+            y = binarize(y, pos_class=2)
+        elif dataset_name == 'cmc.3':
+            y = binarize(y, pos_class=3)
+
    data = LabelledCollection(X, y)
    data.stats()
+    raise NotImplementedError()
    #print(df)
    #print(df.loc[:, 0:5].values)
    #print(y)
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@ -11,6 +11,8 @@ from sklearn.calibration import CalibratedClassifierCV
 from joblib import Parallel, delayed
 from abc import abstractmethod
 from typing import Union
+from sklearn.model_selection import StratifiedKFold
+from tqdm import tqdm


 # Abstract classes
@ -115,8 +117,8 @@ def training_helper(learner,
                train = data
                unused = val_split
            else:
-                raise ValueError('train_val_split not understood; use either a float indicating the split proportion, '
-                                 'or a LabelledCollection indicating the validation split')
+                raise ValueError('param "val_split" not understood; use either a float indicating the split '
+                                 'proportion, or a LabelledCollection indicating the validation split')
        else:
            train, unused = data, None
        learner.fit(train.instances, train.labels)
@ -159,23 +161,49 @@ class ACC(AggregativeQuantifier):
    def __init__(self, learner:BaseEstimator):
        self.learner = learner

-    def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3):
+    def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.3):
        """
        Trains a ACC quantifier
        :param data: the training set
        :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
        :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
         validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
-         indicating the validation set itself
+         indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
+         to estimate the parameters
        :return: self
        """
-        self.learner, validation = training_helper(self.learner, data, fit_learner, val_split=val_split)
+        if isinstance(val_split, int):
+            # kFCV estimation of parameters
+            y, y_ = [], []
+            kfcv = StratifiedKFold(n_splits=val_split)
+            pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
+            for k, (training_idx, validation_idx) in enumerate(pbar):
+                pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
+                training = data.sampling_from_index(training_idx)
+                validation = data.sampling_from_index(validation_idx)
+                learner, val_data = training_helper(self.learner, training, fit_learner, val_split=validation)
+                y_.append(learner.predict(val_data.instances))
+                y.append(val_data.labels)
+
+            y = np.concatenate(y)
+            y_ = np.concatenate(y_)
+            class_count = data.counts()
+
+            # fit the learner on all data
+            self.learner.fit(*data.Xy)
+
+        else:
+            self.learner, val_data = training_helper(self.learner, data, fit_learner, val_split=val_split)
+            y_ = self.learner.predict(val_data.instances)
+            y = val_data.labels
+            class_count = val_data.counts()
+
        self.cc = CC(self.learner)
-        y_ = self.classify(validation.instances)
-        y  = validation.labels
+
        # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
        # document that belongs to yj ends up being classified as belonging to yi
-        self.Pte_cond_estim_ = confusion_matrix(y,y_).T / validation.counts()
+        self.Pte_cond_estim_ = confusion_matrix(y, y_).T / class_count
+
        return self

    def classify(self, data):
@ -216,33 +244,53 @@ class PACC(AggregativeProbabilisticQuantifier):
    def __init__(self, learner:BaseEstimator):
        self.learner = learner

-    def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3):
+    def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.3):
        """
        Trains a PACC quantifier
        :param data: the training set
        :param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
        :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
         validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
-         indicating the validation set itself
+         indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
+         to estimate the parameters
        :return: self
        """
-        self.learner, validation = training_helper(
-            self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split
-        )
+        if isinstance(val_split, int):
+            # kFCV estimation of parameters
+            y, y_ = [], []
+            kfcv = StratifiedKFold(n_splits=val_split)
+            pbar = tqdm(kfcv.split(*data.Xy), total=val_split)
+            for k, (training_idx, validation_idx) in enumerate(pbar):
+                pbar.set_description(f'{self.__class__.__name__} fitting fold {k}')
+                training = data.sampling_from_index(training_idx)
+                validation = data.sampling_from_index(validation_idx)
+                learner, val_data = training_helper(
+                    self.learner, training, fit_learner, ensure_probabilistic=True, val_split=validation)
+                y_.append(learner.predict_proba(val_data.instances))
+                y.append(val_data.labels)
+
+            y = np.concatenate(y)
+            y_ = np.vstack(y_)
+
+            # fit the learner on all data
+            self.learner.fit(*data.Xy)
+
+        else:
+            self.learner, val_data = training_helper(
+                self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
+            y_ = self.learner.predict_proba(val_data.instances)
+            y = val_data.labels
+
        self.pcc = PCC(self.learner)
-        y_ = self.soft_classify(validation.instances)
-        y  = validation.labels
+
+        # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
+        # document that belongs to yj ends up being classified as belonging to yi
        confusion = np.empty(shape=(data.n_classes, data.n_classes))
        for yi in range(data.n_classes):
            confusion[yi] = y_[y==yi].mean(axis=0)

        self.Pte_cond_estim_ = confusion.T

-        #y_ = self.classify(validation.instances)
-        #y = validation.labels
-        # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
-        # document that belongs to yj ends up being classified as belonging to yi
-        #self.Pte_cond_estim_ = confusion_matrix(y, y_).T / validation.counts()
        return self

    def aggregate(self, classif_posteriors):
@ -261,7 +309,7 @@ class EMQ(AggregativeProbabilisticQuantifier):
    MAX_ITER = 1000
    EPSILON = 1e-4

-    def __init__(self, learner:BaseEstimator):
+    def __init__(self, learner: BaseEstimator):
        self.learner = learner

    def fit(self, data: LabelledCollection, fit_learner=True):
@ -307,10 +355,10 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
    estimation based on the Hellinger distance. Information Sciences, 218:146–164.
    """

-    def __init__(self, learner:BaseEstimator):
+    def __init__(self, learner: BaseEstimator):
        self.learner = learner

-    def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3):
+    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection]=0.3):
        """
        Trains a HDy quantifier
        :param data: the training set
@ -404,9 +452,9 @@ ClassifyAndCount = CC
 AdjustedClassifyAndCount = ACC
 ProbabilisticClassifyAndCount = PCC
 ProbabilisticAdjustedClassifyAndCount = PACC
-ExplicitLossMinimisation = ELM
 ExpectationMaximizationQuantifier = EMQ
 HellingerDistanceY = HDy
+ExplicitLossMinimisation = ELM


 class OneVsAll(AggregativeQuantifier):
@ -436,6 +484,9 @@ class OneVsAll(AggregativeQuantifier):
        return self

    def classify(self, instances):
+        # returns a matrix of shape (n,m) with n the number of instances and m the number of classes. The entry
+        # (i,j) is a binary value indicating whether instance i belongs to class j. The binary classifications are
+        # independent of each other, meaning that an instance can end up being attributed to 0, 1, or more classes.
        classif_predictions_bin = self.__parallel(self._delayed_binary_classification, instances)
        return classif_predictions_bin.T

@ -475,10 +526,12 @@ class OneVsAll(AggregativeQuantifier):
        return self.dict_binary_quantifiers[c].classify(X)

    def _delayed_binary_quantify(self, c, X):
-        return self.dict_binary_quantifiers[c].quantify(X)[1]  # the estimation for the positive class prevalence
+        # the estimation for the positive class prevalence
+        return self.dict_binary_quantifiers[c].quantify(X)[1]

    def _delayed_binary_aggregate(self, c, classif_predictions):
-        return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:,c])[1]  # the estimation for the positive class prevalence
+        # the estimation for the positive class prevalence
+        return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1]

    def _delayed_binary_fit(self, c, data):
        bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
--- a/quapy/plot.py
+++ b/quapy/plot.py
@ -55,15 +55,15 @@ def binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title
    save_or_show(savepath)


-def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=None, nbins=21, colormap=cm.tab10,
+def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=None, nbins=5, colormap=cm.tab10,
                     vertical_xticks=False, savepath=None):
    from pylab import boxplot, plot, setp

    fig, ax = plt.subplots()
    ax.grid()

-    bins = np.linspace(0, 1, nbins)
-    binwidth = 1/(nbins - 1)
+    bins = np.linspace(0, 1, nbins+1)
+    binwidth = 1/nbins
    data = {}
    for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs):
        true_prev = true_prev[:,pos_class]
@ -110,7 +110,7 @@ def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=N
    # set_visible to False for all but the first element) after the legend has been placed
    hs=[ax.plot([0, 1], [0, 0], '-k', zorder=2)[0]]
    for colorid in range(len(method_names)):
-        h, = plot([1, 1], '-s', markerfacecolor=colormap.colors[colorid], color='k',
+        h, = plot([0, 0], '-s', markerfacecolor=colormap.colors[colorid], color='k',
                  mec=colormap.colors[colorid], linewidth=1.)
        hs.append(h)
    box = ax.get_position()
@ -126,7 +126,7 @@ def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=N
    save_or_show(savepath)


-def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=21, error_name='ae', show_std=True,
+def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=20, error_name='ae', show_std=True,
                        title=f'Quantification error as a function of distribution shift',
                        savepath=None):

@ -135,7 +135,6 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=21, e

    x_error = qp.error.ae
    y_error = getattr(qp.error, error_name)
-    ndims = tr_prevs[0].shape[-1]

    # join all data, and keep the order in which the methods appeared for the first time
    data = defaultdict(lambda:{'x':np.empty(shape=(0)), 'y':np.empty(shape=(0))})
@ -152,8 +151,8 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=21, e
        if method not in method_order:
            method_order.append(method)

-    bins = np.linspace(0, 1, n_bins)
-    binwidth = 1 / (n_bins - 1)
+    bins = np.linspace(0, 1, n_bins+1)
+    binwidth = 1 / n_bins
    min_x, max_x = None, None
    for method in method_order:
        tr_test_drifts = data[method]['x']