diff --git a/README.md b/README.md
index e3bedbf..e48f87e 100644
--- a/README.md
+++ b/README.md
@@ -111,3 +111,7 @@ are provided:
 * [SVMperf](https://github.com/HLT-ISTI/QuaPy/wiki/ExplicitLossMinimization)
 * [Model Selection](https://github.com/HLT-ISTI/QuaPy/wiki/Model-Selection)
 * [Plotting](https://github.com/HLT-ISTI/QuaPy/wiki/Plotting)
+
+## Acknowledgments
+
+SoBigData++
diff --git a/SoBigData.png b/SoBigData.png
new file mode 100644
index 0000000..f279e7b
Binary files /dev/null and b/SoBigData.png differ
diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py
index f058c2c..b01dcec 100644
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@@ -6,8 +6,7 @@ import os
 import zipfile
 from os.path import join
 import pandas as pd
-import scipy
-import quapy
+from ucimlrepo import fetch_ucirepo
 from quapy.data.base import Dataset, LabelledCollection
 from quapy.data.preprocessing import text2tfidf, reduce_columns
 from quapy.data.reader import *
@@ -45,6 +44,12 @@ UCI_DATASETS = ['acute.a', 'acute.b',
                 'wine-q-red', 'wine-q-white',
                 'yeast']
 
+UCI_MULTICLASS_DATASETS = ['dry-bean',
+                           'wine-quality',
+                           'academic-success',
+                           'digits',
+                           'letter']
+
 LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B']
 
 _TXA_SAMPLE_SIZE = 250
@@ -549,6 +554,108 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
     return data
 
 
+def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
+    """
+    Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`.
+
+    The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria:
+    - It has more than 1000 instances
+    - It is suited for classification
+    - It has more than two classes
+    - It is available for Python import (requires the ucimlrepo package)
+
+    >>> import quapy as qp
+    >>> dataset = qp.datasets.fetch_UCIMulticlassDataset("dry-bean")
+    >>> train, test = dataset.train_test
+    >>> ...
+
+    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`
+
+    The datasets are downloaded only once and pickled to disk, saving time for subsequent calls.
+
+    :param dataset_name: a dataset name
+    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
+        ~/quapy_data/ directory)
+    :param test_split: proportion of documents to be included in the test set. The rest constitutes the training set
+    :param verbose: set to True (default is False) to get information (stats) about the dataset
+    :return: a :class:`quapy.data.base.Dataset` instance
+    """
+    data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose)
+    return Dataset(*data.split_stratified(1 - test_split, random_state=0))
+
+
+def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
+    """
+    Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`.
+
+    The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria:
+    - It has more than 1000 instances
+    - It is suited for classification
+    - It has more than two classes
+    - It is available for Python import (requires the ucimlrepo package)
+
+    >>> import quapy as qp
+    >>> collection = qp.datasets.fetch_UCIMulticlassLabelledCollection("dry-bean")
+    >>> X, y = collection.Xy
+    >>> ...
+
+    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`
+
+    The datasets are downloaded only once and pickled to disk, saving time for subsequent calls.
+
+    :param dataset_name: a dataset name
+    :param data_home: specify the quapy home directory where the dataset will be dumped (leave empty to use the default
+        ~/quapy_data/ directory)
+    :param verbose: set to True (default is False) to get information (stats) about the dataset
+    :return: a :class:`quapy.data.base.LabelledCollection` instance
+    """
+    assert dataset_name in UCI_MULTICLASS_DATASETS, \
+        f'Name {dataset_name} does not match any known dataset from the ' \
+        f'UCI Machine Learning datasets repository (multiclass). ' \
+        f'Valid ones are {UCI_MULTICLASS_DATASETS}'
+
+    if data_home is None:
+        data_home = get_quapy_home()
+
+    identifiers = {
+        "dry-bean": 602,
+        "wine-quality": 186,
+        "academic-success": 697,
+        "digits": 80,
+        "letter": 59
+    }
+
+    full_names = {
+        "dry-bean": "Dry Bean Dataset",
+        "wine-quality": "Wine Quality",
+        "academic-success": "Predict students' dropout and academic success",
+        "digits": "Optical Recognition of Handwritten Digits",
+        "letter": "Letter Recognition"
+    }
+
+    identifier = identifiers[dataset_name]
+    fullname = full_names[dataset_name]
+
+    if verbose:
+        print(f'Loading UCI Multiclass {dataset_name} ({fullname})')
+
+    file = join(data_home, 'uci_multiclass', dataset_name+'.pkl')
+
+    def download(id):
+        data = fetch_ucirepo(id=id)
+        X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze()
+        classes = np.sort(np.unique(y))
+        y = np.searchsorted(classes, y)
+        return LabelledCollection(X, y)
+
+    data = pickled_resource(file, download, identifier)
+
+    if verbose:
+        data.stats()
+
+    return data
+
+
 def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
     df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)
 
@@ -624,26 +731,3 @@ def fetch_lequa2022(task, data_home=None):
     return train, val_gen, test_gen
 
 
-def fetch_IFCB(data_home=None):
-
-    if data_home is None:
-        data_home = get_quapy_home()
-
-    URL_TRAINDEV=f'https://zenodo.org/records/10036244/files/IFCB.train.zip'
-    URL_TEST=f'https://zenodo.org/records/10036244/files/IFCB.test.zip'
-    URL_TEST_PREV=f'https://zenodo.org/records/10036244/files/IFCB.test_prevalences.zip'
-
-    ifcb_dir = join(data_home, 'ifcb')
-    os.makedirs(ifcb_dir, exist_ok=True)
-
-    def download_unzip_and_remove(unzipped_path, url):
-        tmp_path = join(ifcb_dir, 'tmp.zip')
-        download_file_if_not_exists(url, tmp_path)
-        with zipfile.ZipFile(tmp_path) as file:
-            file.extractall(unzipped_path)
-        os.remove(tmp_path)
-
-    if not os.path.exists(join(ifcb_dir, task)):
-        download_unzip_and_remove(ifcb_dir, URL_TRAINDEV)
-        download_unzip_and_remove(ifcb_dir, URL_TEST)
-        download_unzip_and_remove(ifcb_dir, URL_TEST_PREV)
\ No newline at end of file
diff --git a/quapy/model_selection.py b/quapy/model_selection.py
index 4a53bb6..f02d9dc 100644
--- a/quapy/model_selection.py
+++ b/quapy/model_selection.py
@@ -223,7 +223,7 @@ def cross_val_predict(quantifier: BaseQuantifier, data: LabelledCollection, nfol
     for train, test in data.kFCV(nfolds=nfolds, random_state=random_state):
         quantifier.fit(train)
         fold_prev = quantifier.quantify(test.X)
-        rel_size = len(test.X)/len(data)
+        rel_size = 1. * len(test) / len(data)
         total_prev += fold_prev*rel_size
 
     return total_prev
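Usage note (not part of the patch): the sketch below shows one way the new multiclass UCI loader and the corrected fold weighting in `cross_val_predict` could be exercised together. It assumes `quapy`, `scikit-learn`, and `ucimlrepo` are installed; `ACC` and `LogisticRegression` are merely illustrative choices of quantifier and classifier, not mandated by this patch.

```python
# Illustrative sketch only; assumes quapy, scikit-learn, and ucimlrepo are installed.
# ACC and LogisticRegression are example choices, not part of the patch itself.
import quapy as qp
from quapy.method.aggregative import ACC
from quapy.model_selection import cross_val_predict
from sklearn.linear_model import LogisticRegression

# downloads (first call only) and loads one of the new multiclass UCI collections
collection = qp.datasets.fetch_UCIMulticlassLabelledCollection('dry-bean')

# estimate the class prevalence of the collection via k-fold cross-validation;
# each fold's estimate is weighted by len(test)/len(data), as fixed in the patch
estimated = cross_val_predict(ACC(LogisticRegression(max_iter=1000)), collection, nfolds=5)

print('estimated prevalence:', estimated)
print('true prevalence     :', collection.prevalence())
```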