From d7192430e4f5215868e726ed49b283f5961f590f Mon Sep 17 00:00:00 2001 From: pglez82 Date: Tue, 17 Oct 2023 18:24:33 +0200 Subject: [PATCH] uci multiclass datasets --- quapy/data/datasets.py | 103 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index 5a0dde1..dfbc14e 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -7,6 +7,8 @@ import zipfile from os.path import join import pandas as pd import scipy +import pickle +from ucimlrepo import fetch_ucirepo from quapy.data.base import Dataset, LabelledCollection from quapy.data.preprocessing import text2tfidf, reduce_columns @@ -45,6 +47,14 @@ UCI_DATASETS = ['acute.a', 'acute.b', 'wine-q-red', 'wine-q-white', 'yeast'] +UCI_MULTICLASS_DATASETS = ['dry-bean', + 'wine-quality', + 'academic-success', + 'digits', + 'letter'] + +KAGGLE_MULTICLASS_DATASETS = ['human-activity'] + LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B'] _TXA_SAMPLE_SIZE = 250 @@ -548,6 +558,99 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> data.stats() return data +def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset: + """ + Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`, as used in + `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017). + Using ensembles for problems with characterizable changes in data distribution: A case study on quantification. + Information Fusion, 34, 87-100. `_ + and + `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019). + Dynamic ensemble selection for quantification tasks. + Information Fusion, 45, 1-15. `_. + The datasets do not come with a predefined train-test split (see :meth:`fetch_UCILabelledCollection` for further + information on how to use these collections), and so a train-test split is generated at desired proportion. + The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_DATASETS` + + :param dataset_name: a dataset name + :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default + ~/quay_data/ directory) + :param test_split: proportion of documents to be included in the test set. The rest conforms the training set + :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets + :return: a :class:`quapy.data.base.Dataset` instance + """ + data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose) + return Dataset(*data.split_stratified(1 - test_split, random_state=0)) + +def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection: + """ + Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`, as used in + `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017). + Using ensembles for problems with characterizable changes in data distribution: A case study on quantification. + Information Fusion, 34, 87-100. `_ + and + `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019). + Dynamic ensemble selection for quantification tasks. + Information Fusion, 45, 1-15. `_. + The datasets do not come with a predefined train-test split, and so Pérez-Gállego et al. adopted a 5FCVx2 evaluation + protocol, meaning that each collection was used to generate two rounds (hence the x2) of 5 fold cross validation. + This can be reproduced by using :meth:`quapy.data.base.Dataset.kFCV`, e.g.: + + >>> import quapy as qp + >>> collection = qp.datasets.fetch_UCILabelledCollection("dry-bean") + >>> for data in qp.domains.Dataset.kFCV(collection, nfolds=5, nrepeats=2): + >>> ... + + The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS` + + :param dataset_name: a dataset name + :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default + ~/quay_data/ directory) + :param test_split: proportion of documents to be included in the test set. The rest conforms the training set + :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets + :return: a :class:`quapy.data.base.LabelledCollection` instance + """ + assert dataset_name in UCI_MULTICLASS_DATASETS, \ + f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository (multiclass). ' \ + f'Valid ones are {UCI_MULTICLASS_DATASETS}' + + if data_home is None: + data_home = get_quapy_home() + + identifiers = {"dry-bean": 602, + "wine-quality":186, + "academic-success":697, + "digits":80, + "letter":59} + + full_names = {"dry-bean": "Dry Bean Dataset", + "wine-quality":"Wine Quality", + "academic-success":"Predict students' dropout and academic success", + "digits":"Optical Recognition of Handwritten Digits", + "letter":"Letter Recognition" + } + + identifier = identifiers[dataset_name] + fullname = full_names[dataset_name] + + print(f'Loading UCI Muticlass {dataset_name} ({fullname})') + + file = join(data_home,'uci_multiclass',dataset_name+'.pkl') + if os.path.exists(file): + with open(file, 'rb') as file: + data = pickle.load(file) + else: + data = fetch_ucirepo(id=identifier) + X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze() + data = LabelledCollection(X, y) + os.makedirs(os.path.dirname(file), exist_ok=True) + with open(file, 'wb') as file: + pickle.dump(data, file) + + + data.stats() + return data + def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float): df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)