From d7192430e4f5215868e726ed49b283f5961f590f Mon Sep 17 00:00:00 2001
From: pglez82 <pglez82@gmail.com>
Date: Tue, 17 Oct 2023 18:24:33 +0200
Subject: [PATCH 1/6] uci multiclass datasets

---
 quapy/data/datasets.py | 103 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 103 insertions(+)

diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py
index 5a0dde1..dfbc14e 100644
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@@ -7,6 +7,8 @@ import zipfile
 from os.path import join
 import pandas as pd
 import scipy
+import pickle
+from ucimlrepo import fetch_ucirepo 
 
 from quapy.data.base import Dataset, LabelledCollection
 from quapy.data.preprocessing import text2tfidf, reduce_columns
@@ -45,6 +47,14 @@ UCI_DATASETS = ['acute.a', 'acute.b',
                 'wine-q-red', 'wine-q-white',
                 'yeast']
 
+UCI_MULTICLASS_DATASETS = ['dry-bean',
+                           'wine-quality',
+                           'academic-success',
+                           'digits',
+                           'letter']
+
+KAGGLE_MULTICLASS_DATASETS = ['human-activity']
+
 LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B']
 
 _TXA_SAMPLE_SIZE = 250
@@ -548,6 +558,99 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
     data.stats()
     return data
 
+def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
+    """
+    Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`, as used in
+    `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
+    Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
+    Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
+    and
+    `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
+    Dynamic ensemble selection for quantification tasks.
+    Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
+    The datasets do not come with a predefined train-test split (see :meth:`fetch_UCILabelledCollection` for further
+    information on how to use these collections), and so a train-test split is generated at desired proportion.
+    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_DATASETS`
+
+    :param dataset_name: a dataset name
+    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
+        ~/quay_data/ directory)
+    :param test_split: proportion of documents to be included in the test set. The rest conforms the training set
+    :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
+    :return: a :class:`quapy.data.base.Dataset` instance
+    """
+    data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose)
+    return Dataset(*data.split_stratified(1 - test_split, random_state=0))
+
+def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
+    """
+    Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`, as used in
+    `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
+    Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
+    Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
+    and
+    `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
+    Dynamic ensemble selection for quantification tasks.
+    Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
+    The datasets do not come with a predefined train-test split, and so Pérez-Gállego et al. adopted a 5FCVx2 evaluation
+    protocol, meaning that each collection was used to generate two rounds (hence the x2) of 5 fold cross validation.
+    This can be reproduced by using :meth:`quapy.data.base.Dataset.kFCV`, e.g.:
+
+    >>> import quapy as qp
+    >>> collection = qp.datasets.fetch_UCILabelledCollection("dry-bean")
+    >>> for data in qp.domains.Dataset.kFCV(collection, nfolds=5, nrepeats=2):
+    >>>     ...
+
+    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`
+
+    :param dataset_name: a dataset name
+    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
+        ~/quay_data/ directory)
+    :param test_split: proportion of documents to be included in the test set. The rest conforms the training set
+    :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
+    :return: a :class:`quapy.data.base.LabelledCollection` instance
+    """
+    assert dataset_name in UCI_MULTICLASS_DATASETS, \
+        f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository (multiclass). ' \
+        f'Valid ones are {UCI_MULTICLASS_DATASETS}'
+    
+    if data_home is None:
+        data_home = get_quapy_home()
+    
+    identifiers = {"dry-bean": 602,
+                   "wine-quality":186,
+                   "academic-success":697,
+                   "digits":80,
+                   "letter":59}
+    
+    full_names = {"dry-bean": "Dry Bean Dataset",
+                   "wine-quality":"Wine Quality",
+                   "academic-success":"Predict students' dropout and academic success",
+                   "digits":"Optical Recognition of Handwritten Digits",
+                   "letter":"Letter Recognition"
+    }
+    
+    identifier = identifiers[dataset_name]
+    fullname = full_names[dataset_name]
+
+    print(f'Loading UCI Muticlass {dataset_name} ({fullname})')
+
+    file = join(data_home,'uci_multiclass',dataset_name+'.pkl')
+    if os.path.exists(file):
+        with open(file, 'rb') as file:
+            data = pickle.load(file)
+    else:
+        data = fetch_ucirepo(id=identifier)
+        X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze()
+        data = LabelledCollection(X, y)
+        os.makedirs(os.path.dirname(file), exist_ok=True)
+        with open(file, 'wb') as file:
+            pickle.dump(data, file)
+
+
+    data.stats()
+    return data
+
 
 def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
     df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)

From 72fd21471dce4329cee641b8287b8718c34911ee Mon Sep 17 00:00:00 2001
From: pglez82 <pglez82@gmail.com>
Date: Tue, 17 Oct 2023 18:43:33 +0200
Subject: [PATCH 2/6] Remove KAGGLE_MULTICLASS_DATASETS; fix docstring example

---
 quapy/data/datasets.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py
index dfbc14e..d5835e4 100644
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@@ -53,8 +53,6 @@ UCI_MULTICLASS_DATASETS = ['dry-bean',
                            'digits',
                            'letter']
 
-KAGGLE_MULTICLASS_DATASETS = ['human-activity']
-
 LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B']
 
 _TXA_SAMPLE_SIZE = 250
@@ -597,7 +595,7 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=
     This can be reproduced by using :meth:`quapy.data.base.Dataset.kFCV`, e.g.:
 
     >>> import quapy as qp
-    >>> collection = qp.datasets.fetch_UCILabelledCollection("dry-bean")
+    >>> collection = qp.datasets.fetch_UCIMulticlassLabelledCollection("dry-bean")
     >>> for data in qp.domains.Dataset.kFCV(collection, nfolds=5, nrepeats=2):
     >>>     ...
 

From 239549eb4d15a252aca44d2d4165431ecf68ea9c Mon Sep 17 00:00:00 2001
From: pglez82 <pglez82@gmail.com>
Date: Tue, 17 Oct 2023 18:44:02 +0200
Subject: [PATCH 3/6] Point docstring at UCI_MULTICLASS_DATASETS

---
 quapy/data/datasets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py
index d5835e4..e6d88b2 100644
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@@ -568,7 +568,7 @@ def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, ver
     Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
     The datasets do not come with a predefined train-test split (see :meth:`fetch_UCILabelledCollection` for further
     information on how to use these collections), and so a train-test split is generated at desired proportion.
-    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_DATASETS`
+    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`
 
     :param dataset_name: a dataset name
     :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default

From a9f10f77f4244318d96b599a5ea6f509d81c611e Mon Sep 17 00:00:00 2001
From: pglez82 <pglez82@gmail.com>
Date: Tue, 17 Oct 2023 18:44:28 +0200
Subject: [PATCH 4/6] Fix docstring cross-reference to multiclass fetcher

---
 quapy/data/datasets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py
index e6d88b2..a5a66c7 100644
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@@ -566,7 +566,7 @@ def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, ver
     `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
     Dynamic ensemble selection for quantification tasks.
     Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
-    The datasets do not come with a predefined train-test split (see :meth:`fetch_UCILabelledCollection` for further
+    The datasets do not come with a predefined train-test split (see :meth:`fetch_UCIMulticlassLabelledCollection` for further
     information on how to use these collections), and so a train-test split is generated at desired proportion.
     The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`
 

From ffab2131a89476b0e569b2e96354e44c56dc87d4 Mon Sep 17 00:00:00 2001
From: pglez82 <pglez82@gmail.com>
Date: Wed, 18 Oct 2023 14:12:40 +0200
Subject: [PATCH 5/6] Use pickled_resource, honour verbose, encode labels

---
 quapy/data/datasets.py | 64 ++++++++++++++++++------------------------
 1 file changed, 27 insertions(+), 37 deletions(-)

diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py
index a5a66c7..84e989d 100644
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@@ -7,9 +7,10 @@ import zipfile
 from os.path import join
 import pandas as pd
 import scipy
-import pickle
+
 from ucimlrepo import fetch_ucirepo 
 
+from quapy.util import pickled_resource
 from quapy.data.base import Dataset, LabelledCollection
 from quapy.data.preprocessing import text2tfidf, reduce_columns
 from quapy.data.reader import *
@@ -558,23 +559,20 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
 
 def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
     """
-    Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`, as used in
-    `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
-    Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
-    Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
-    and
-    `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
-    Dynamic ensemble selection for quantification tasks.
-    Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
-    The datasets do not come with a predefined train-test split (see :meth:`fetch_UCIMulticlassLabelledCollection` for further
-    information on how to use these collections), and so a train-test split is generated at desired proportion.
+    Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`. 
+
+    The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria:
+    - The dataset has more than 1000 instances
+    - The dataset is suited for classification
+    - the dataset has more than two classes
+
     The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`
 
     :param dataset_name: a dataset name
     :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
         ~/quay_data/ directory)
     :param test_split: proportion of documents to be included in the test set. The rest conforms the training set
-    :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
+    :param verbose: set to True (default is False) to get information (stats) about the dataset
     :return: a :class:`quapy.data.base.Dataset` instance
     """
     data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose)
@@ -582,30 +580,23 @@ def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, ver
 
 def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
     """
-    Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`, as used in
-    `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
-    Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
-    Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
-    and
-    `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
-    Dynamic ensemble selection for quantification tasks.
-    Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
-    The datasets do not come with a predefined train-test split, and so Pérez-Gállego et al. adopted a 5FCVx2 evaluation
-    protocol, meaning that each collection was used to generate two rounds (hence the x2) of 5 fold cross validation.
-    This can be reproduced by using :meth:`quapy.data.base.Dataset.kFCV`, e.g.:
+    Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`. 
+    
+    It needs the library `ucimlrepo` for downloading the datasets from https://archive.ics.uci.edu/. 
 
     >>> import quapy as qp
-    >>> collection = qp.datasets.fetch_UCIMulticlassLabelledCollection("dry-bean")
-    >>> for data in qp.domains.Dataset.kFCV(collection, nfolds=5, nrepeats=2):
+    >>> dataset = qp.datasets.fetch_UCIMulticlassLabelledCollection("dry-bean")
     >>>     ...
 
     The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`
 
+    The datasets are downloaded only once and pickled to disk, saving time on subsequent calls.
+
     :param dataset_name: a dataset name
-    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
+    :param data_home: specify the quapy home directory where the dataset will be dumped (leave empty to use the default
         ~/quay_data/ directory)
     :param test_split: proportion of documents to be included in the test set. The rest conforms the training set
-    :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
+    :param verbose: set to True (default is False) to get information (stats) about the dataset
     :return: a :class:`quapy.data.base.LabelledCollection` instance
     """
     assert dataset_name in UCI_MULTICLASS_DATASETS, \
@@ -634,19 +625,18 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=
     print(f'Loading UCI Muticlass {dataset_name} ({fullname})')
 
     file = join(data_home,'uci_multiclass',dataset_name+'.pkl')
-    if os.path.exists(file):
-        with open(file, 'rb') as file:
-            data = pickle.load(file)
-    else:
-        data = fetch_ucirepo(id=identifier)
+    
+    def download(id):
+        data = fetch_ucirepo(id=id)
         X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze()
-        data = LabelledCollection(X, y)
-        os.makedirs(os.path.dirname(file), exist_ok=True)
-        with open(file, 'wb') as file:
-            pickle.dump(data, file)
+        classes = np.sort(np.unique(y))
+        y = np.searchsorted(classes, y)
+        return LabelledCollection(X,y)
 
+    data = pickled_resource(file, download, identifier)
 
-    data.stats()
+    if verbose:
+        data.stats()
     return data
 
 

From ea71559722cbb19f9d710536b3d6795a419efe50 Mon Sep 17 00:00:00 2001
From: Alex Moreo <alejandro.moreo@isti.cnr.it>
Date: Wed, 18 Oct 2023 17:50:46 +0200
Subject: [PATCH 6/6] Revise docstrings and formatting; print only if verbose

---
 quapy/data/datasets.py | 65 +++++++++++++++++++++++++++---------------
 1 file changed, 42 insertions(+), 23 deletions(-)

diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py
index 84e989d..9d34222 100644
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@@ -6,11 +6,9 @@ import os
 import zipfile
 from os.path import join
 import pandas as pd
-import scipy
 
 from ucimlrepo import fetch_ucirepo 
 
-from quapy.util import pickled_resource
 from quapy.data.base import Dataset, LabelledCollection
 from quapy.data.preprocessing import text2tfidf, reduce_columns
 from quapy.data.reader import *
@@ -557,17 +555,26 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
     data.stats()
     return data
 
+
 def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
     """
     Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`. 
 
     The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria:
-    - The dataset has more than 1000 instances
-    - The dataset is suited for classification
-    - the dataset has more than two classes
+    - It has more than 1000 instances
+    - It is suited for classification
+    - It has more than two classes
+    - It is available for Python import (requires ucimlrepo package)
+
+    >>> import quapy as qp
+    >>> dataset = qp.datasets.fetch_UCIMulticlassDataset("dry-bean")
+    >>> train, test = dataset.train_test
+    >>>     ...
 
     The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`
 
+    The datasets are downloaded only once and pickled to disk, saving time on subsequent calls.
+
     :param dataset_name: a dataset name
     :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
         ~/quay_data/ directory)
@@ -578,14 +585,20 @@ def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, ver
     data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose)
     return Dataset(*data.split_stratified(1 - test_split, random_state=0))
 
+
 def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
     """
-    Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`. 
-    
-    It needs the library `ucimlrepo` for downloading the datasets from https://archive.ics.uci.edu/. 
+    Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`.
 
+    The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria:
+    - It has more than 1000 instances
+    - It is suited for classification
+    - It has more than two classes
+    - It is available for Python import (requires ucimlrepo package)
+    
     >>> import quapy as qp
-    >>> dataset = qp.datasets.fetch_UCIMulticlassLabelledCollection("dry-bean")
+    >>> collection = qp.datasets.fetch_UCIMulticlassLabelledCollection("dry-bean")
+    >>> X, y = collection.Xy
     >>>     ...
 
     The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`
@@ -600,43 +613,49 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=
     :return: a :class:`quapy.data.base.LabelledCollection` instance
     """
     assert dataset_name in UCI_MULTICLASS_DATASETS, \
-        f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository (multiclass). ' \
+        f'Name {dataset_name} does not match any known dataset from the ' \
+        f'UCI Machine Learning datasets repository (multiclass). ' \
         f'Valid ones are {UCI_MULTICLASS_DATASETS}'
     
     if data_home is None:
         data_home = get_quapy_home()
     
-    identifiers = {"dry-bean": 602,
-                   "wine-quality":186,
-                   "academic-success":697,
-                   "digits":80,
-                   "letter":59}
+    identifiers = {
+        "dry-bean": 602,
+        "wine-quality": 186,
+        "academic-success": 697,
+        "digits": 80,
+        "letter": 59
+    }
     
-    full_names = {"dry-bean": "Dry Bean Dataset",
-                   "wine-quality":"Wine Quality",
-                   "academic-success":"Predict students' dropout and academic success",
-                   "digits":"Optical Recognition of Handwritten Digits",
-                   "letter":"Letter Recognition"
+    full_names = {
+        "dry-bean": "Dry Bean Dataset",
+        "wine-quality": "Wine Quality",
+        "academic-success": "Predict students' dropout and academic success",
+        "digits": "Optical Recognition of Handwritten Digits",
+        "letter": "Letter Recognition"
     }
     
     identifier = identifiers[dataset_name]
     fullname = full_names[dataset_name]
 
-    print(f'Loading UCI Muticlass {dataset_name} ({fullname})')
+    if verbose:
+        print(f'Loading UCI Muticlass {dataset_name} ({fullname})')
 
-    file = join(data_home,'uci_multiclass',dataset_name+'.pkl')
+    file = join(data_home, 'uci_multiclass', dataset_name+'.pkl')
     
     def download(id):
         data = fetch_ucirepo(id=id)
         X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze()
         classes = np.sort(np.unique(y))
         y = np.searchsorted(classes, y)
-        return LabelledCollection(X,y)
+        return LabelledCollection(X, y)
 
     data = pickled_resource(file, download, identifier)
 
     if verbose:
         data.stats()
+        
     return data