From d7192430e4f5215868e726ed49b283f5961f590f Mon Sep 17 00:00:00 2001 From: pglez82 Date: Tue, 17 Oct 2023 18:24:33 +0200 Subject: [PATCH 1/6] uci multiclass datasets --- quapy/data/datasets.py | 103 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index 5a0dde1..dfbc14e 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -7,6 +7,8 @@ import zipfile from os.path import join import pandas as pd import scipy +import pickle +from ucimlrepo import fetch_ucirepo from quapy.data.base import Dataset, LabelledCollection from quapy.data.preprocessing import text2tfidf, reduce_columns @@ -45,6 +47,14 @@ UCI_DATASETS = ['acute.a', 'acute.b', 'wine-q-red', 'wine-q-white', 'yeast'] +UCI_MULTICLASS_DATASETS = ['dry-bean', + 'wine-quality', + 'academic-success', + 'digits', + 'letter'] + +KAGGLE_MULTICLASS_DATASETS = ['human-activity'] + LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B'] _TXA_SAMPLE_SIZE = 250 @@ -548,6 +558,99 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> data.stats() return data +def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset: + """ + Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`, as used in + `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017). + Using ensembles for problems with characterizable changes in data distribution: A case study on quantification. + Information Fusion, 34, 87-100. `_ + and + `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019). + Dynamic ensemble selection for quantification tasks. + Information Fusion, 45, 1-15. `_. + The datasets do not come with a predefined train-test split (see :meth:`fetch_UCILabelledCollection` for further + information on how to use these collections), and so a train-test split is generated at desired proportion. + The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_DATASETS` + + :param dataset_name: a dataset name + :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default + ~/quay_data/ directory) + :param test_split: proportion of documents to be included in the test set. The rest conforms the training set + :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets + :return: a :class:`quapy.data.base.Dataset` instance + """ + data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose) + return Dataset(*data.split_stratified(1 - test_split, random_state=0)) + +def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection: + """ + Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`, as used in + `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017). + Using ensembles for problems with characterizable changes in data distribution: A case study on quantification. + Information Fusion, 34, 87-100. `_ + and + `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019). + Dynamic ensemble selection for quantification tasks. + Information Fusion, 45, 1-15. `_. + The datasets do not come with a predefined train-test split, and so Pérez-Gállego et al. adopted a 5FCVx2 evaluation + protocol, meaning that each collection was used to generate two rounds (hence the x2) of 5 fold cross validation. + This can be reproduced by using :meth:`quapy.data.base.Dataset.kFCV`, e.g.: + + >>> import quapy as qp + >>> collection = qp.datasets.fetch_UCILabelledCollection("dry-bean") + >>> for data in qp.domains.Dataset.kFCV(collection, nfolds=5, nrepeats=2): + >>> ... + + The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS` + + :param dataset_name: a dataset name + :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default + ~/quay_data/ directory) + :param test_split: proportion of documents to be included in the test set. The rest conforms the training set + :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets + :return: a :class:`quapy.data.base.LabelledCollection` instance + """ + assert dataset_name in UCI_MULTICLASS_DATASETS, \ + f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository (multiclass). ' \ + f'Valid ones are {UCI_MULTICLASS_DATASETS}' + + if data_home is None: + data_home = get_quapy_home() + + identifiers = {"dry-bean": 602, + "wine-quality":186, + "academic-success":697, + "digits":80, + "letter":59} + + full_names = {"dry-bean": "Dry Bean Dataset", + "wine-quality":"Wine Quality", + "academic-success":"Predict students' dropout and academic success", + "digits":"Optical Recognition of Handwritten Digits", + "letter":"Letter Recognition" + } + + identifier = identifiers[dataset_name] + fullname = full_names[dataset_name] + + print(f'Loading UCI Muticlass {dataset_name} ({fullname})') + + file = join(data_home,'uci_multiclass',dataset_name+'.pkl') + if os.path.exists(file): + with open(file, 'rb') as file: + data = pickle.load(file) + else: + data = fetch_ucirepo(id=identifier) + X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze() + data = LabelledCollection(X, y) + os.makedirs(os.path.dirname(file), exist_ok=True) + with open(file, 'wb') as file: + pickle.dump(data, file) + + + data.stats() + return data + def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float): df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False) From 72fd21471dce4329cee641b8287b8718c34911ee Mon Sep 17 00:00:00 2001 From: pglez82 Date: Tue, 17 Oct 2023 18:43:33 +0200 Subject: [PATCH 2/6] fixing mistakes --- quapy/data/datasets.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index dfbc14e..d5835e4 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -53,8 +53,6 @@ UCI_MULTICLASS_DATASETS = ['dry-bean', 'digits', 'letter'] -KAGGLE_MULTICLASS_DATASETS = ['human-activity'] - LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B'] _TXA_SAMPLE_SIZE = 250 @@ -597,7 +595,7 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose= This can be reproduced by using :meth:`quapy.data.base.Dataset.kFCV`, e.g.: >>> import quapy as qp - >>> collection = qp.datasets.fetch_UCILabelledCollection("dry-bean") + >>> collection = qp.datasets.fetch_UCIMulticlassLabelledCollection("dry-bean") >>> for data in qp.domains.Dataset.kFCV(collection, nfolds=5, nrepeats=2): >>> ... From 239549eb4d15a252aca44d2d4165431ecf68ea9c Mon Sep 17 00:00:00 2001 From: pglez82 Date: Tue, 17 Oct 2023 18:44:02 +0200 Subject: [PATCH 3/6] fixing mistakes --- quapy/data/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index d5835e4..e6d88b2 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -568,7 +568,7 @@ def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, ver Information Fusion, 45, 1-15. `_. The datasets do not come with a predefined train-test split (see :meth:`fetch_UCILabelledCollection` for further information on how to use these collections), and so a train-test split is generated at desired proportion. - The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_DATASETS` + The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS` :param dataset_name: a dataset name :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default From a9f10f77f4244318d96b599a5ea6f509d81c611e Mon Sep 17 00:00:00 2001 From: pglez82 Date: Tue, 17 Oct 2023 18:44:28 +0200 Subject: [PATCH 4/6] fixing mistakes --- quapy/data/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index e6d88b2..a5a66c7 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -566,7 +566,7 @@ def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, ver `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019). Dynamic ensemble selection for quantification tasks. Information Fusion, 45, 1-15. `_. - The datasets do not come with a predefined train-test split (see :meth:`fetch_UCILabelledCollection` for further + The datasets do not come with a predefined train-test split (see :meth:`fetch_UCIMulticlassLabelledCollection` for further information on how to use these collections), and so a train-test split is generated at desired proportion. The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS` From ffab2131a89476b0e569b2e96354e44c56dc87d4 Mon Sep 17 00:00:00 2001 From: pglez82 Date: Wed, 18 Oct 2023 14:12:40 +0200 Subject: [PATCH 5/6] fixing requests --- quapy/data/datasets.py | 64 ++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 37 deletions(-) diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index a5a66c7..84e989d 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -7,9 +7,10 @@ import zipfile from os.path import join import pandas as pd import scipy -import pickle + from ucimlrepo import fetch_ucirepo +from quapy.util import pickled_resource from quapy.data.base import Dataset, LabelledCollection from quapy.data.preprocessing import text2tfidf, reduce_columns from quapy.data.reader import * @@ -558,23 +559,20 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset: """ - Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`, as used in - `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017). - Using ensembles for problems with characterizable changes in data distribution: A case study on quantification. - Information Fusion, 34, 87-100. `_ - and - `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019). - Dynamic ensemble selection for quantification tasks. - Information Fusion, 45, 1-15. `_. - The datasets do not come with a predefined train-test split (see :meth:`fetch_UCIMulticlassLabelledCollection` for further - information on how to use these collections), and so a train-test split is generated at desired proportion. + Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`. + + The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria: + - The dataset has more than 1000 instances + - The dataset is suited for classification + - the dataset has more than two classes + The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS` :param dataset_name: a dataset name :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default ~/quay_data/ directory) :param test_split: proportion of documents to be included in the test set. The rest conforms the training set - :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets + :param verbose: set to True (default is False) to get information (stats) about the dataset :return: a :class:`quapy.data.base.Dataset` instance """ data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose) @@ -582,30 +580,23 @@ def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, ver def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection: """ - Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`, as used in - `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017). - Using ensembles for problems with characterizable changes in data distribution: A case study on quantification. - Information Fusion, 34, 87-100. `_ - and - `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019). - Dynamic ensemble selection for quantification tasks. - Information Fusion, 45, 1-15. `_. - The datasets do not come with a predefined train-test split, and so Pérez-Gállego et al. adopted a 5FCVx2 evaluation - protocol, meaning that each collection was used to generate two rounds (hence the x2) of 5 fold cross validation. - This can be reproduced by using :meth:`quapy.data.base.Dataset.kFCV`, e.g.: + Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`. + + It needs the library `ucimlrepo` for downloading the datasets from https://archive.ics.uci.edu/. >>> import quapy as qp - >>> collection = qp.datasets.fetch_UCIMulticlassLabelledCollection("dry-bean") - >>> for data in qp.domains.Dataset.kFCV(collection, nfolds=5, nrepeats=2): + >>> dataset = qp.datasets.fetch_UCIMulticlassLabelledCollection("dry-bean") >>> ... The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS` + The datasets are downloaded only once and pickled into disk, saving time for consecutive calls. + :param dataset_name: a dataset name - :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default + :param data_home: specify the quapy home directory where the dataset will be dumped (leave empty to use the default ~/quay_data/ directory) :param test_split: proportion of documents to be included in the test set. The rest conforms the training set - :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets + :param verbose: set to True (default is False) to get information (stats) about the dataset :return: a :class:`quapy.data.base.LabelledCollection` instance """ assert dataset_name in UCI_MULTICLASS_DATASETS, \ @@ -634,19 +625,18 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose= print(f'Loading UCI Muticlass {dataset_name} ({fullname})') file = join(data_home,'uci_multiclass',dataset_name+'.pkl') - if os.path.exists(file): - with open(file, 'rb') as file: - data = pickle.load(file) - else: - data = fetch_ucirepo(id=identifier) + + def download(id): + data = fetch_ucirepo(id=id) X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze() - data = LabelledCollection(X, y) - os.makedirs(os.path.dirname(file), exist_ok=True) - with open(file, 'wb') as file: - pickle.dump(data, file) + classes = np.sort(np.unique(y)) + y = np.searchsorted(classes, y) + return LabelledCollection(X,y) + data = pickled_resource(file, download, identifier) - data.stats() + if verbose: + data.stats() return data From ea71559722cbb19f9d710536b3d6795a419efe50 Mon Sep 17 00:00:00 2001 From: Alex Moreo Date: Wed, 18 Oct 2023 17:50:46 +0200 Subject: [PATCH 6/6] revised --- quapy/data/datasets.py | 65 +++++++++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 23 deletions(-) diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index 84e989d..9d34222 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -6,11 +6,9 @@ import os import zipfile from os.path import join import pandas as pd -import scipy from ucimlrepo import fetch_ucirepo -from quapy.util import pickled_resource from quapy.data.base import Dataset, LabelledCollection from quapy.data.preprocessing import text2tfidf, reduce_columns from quapy.data.reader import * @@ -557,17 +555,26 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> data.stats() return data + def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset: """ Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`. The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria: - - The dataset has more than 1000 instances - - The dataset is suited for classification - - the dataset has more than two classes + - It has more than 1000 instances + - It is suited for classification + - It has more than two classes + - It is available for Python import (requires ucimlrepo package) + + >>> import quapy as qp + >>> dataset = qp.datasets.fetch_UCIMulticlassDataset("dry-bean") + >>> train, test = dataset.train_test + >>> ... The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS` + The datasets are downloaded only once and pickled into disk, saving time for consecutive calls. + :param dataset_name: a dataset name :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default ~/quay_data/ directory) @@ -578,14 +585,20 @@ def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, ver data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose) return Dataset(*data.split_stratified(1 - test_split, random_state=0)) + def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection: """ - Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`. - - It needs the library `ucimlrepo` for downloading the datasets from https://archive.ics.uci.edu/. + Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`. + The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria: + - It has more than 1000 instances + - It is suited for classification + - It has more than two classes + - It is available for Python import (requires ucimlrepo package) + >>> import quapy as qp - >>> dataset = qp.datasets.fetch_UCIMulticlassLabelledCollection("dry-bean") + >>> collection = qp.datasets.fetch_UCIMulticlassLabelledCollection("dry-bean") + >>> X, y = collection.Xy >>> ... The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS` @@ -600,43 +613,49 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose= :return: a :class:`quapy.data.base.LabelledCollection` instance """ assert dataset_name in UCI_MULTICLASS_DATASETS, \ - f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository (multiclass). ' \ + f'Name {dataset_name} does not match any known dataset from the ' \ + f'UCI Machine Learning datasets repository (multiclass). ' \ f'Valid ones are {UCI_MULTICLASS_DATASETS}' if data_home is None: data_home = get_quapy_home() - identifiers = {"dry-bean": 602, - "wine-quality":186, - "academic-success":697, - "digits":80, - "letter":59} + identifiers = { + "dry-bean": 602, + "wine-quality": 186, + "academic-success": 697, + "digits": 80, + "letter": 59 + } - full_names = {"dry-bean": "Dry Bean Dataset", - "wine-quality":"Wine Quality", - "academic-success":"Predict students' dropout and academic success", - "digits":"Optical Recognition of Handwritten Digits", - "letter":"Letter Recognition" + full_names = { + "dry-bean": "Dry Bean Dataset", + "wine-quality": "Wine Quality", + "academic-success": "Predict students' dropout and academic success", + "digits": "Optical Recognition of Handwritten Digits", + "letter": "Letter Recognition" } identifier = identifiers[dataset_name] fullname = full_names[dataset_name] - print(f'Loading UCI Muticlass {dataset_name} ({fullname})') + if verbose: + print(f'Loading UCI Muticlass {dataset_name} ({fullname})') - file = join(data_home,'uci_multiclass',dataset_name+'.pkl') + file = join(data_home, 'uci_multiclass', dataset_name+'.pkl') def download(id): data = fetch_ucirepo(id=id) X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze() classes = np.sort(np.unique(y)) y = np.searchsorted(classes, y) - return LabelledCollection(X,y) + return LabelledCollection(X, y) data = pickled_resource(file, download, identifier) if verbose: data.stats() + return data