1
0
Fork 0
This commit is contained in:
Alejandro Moreo Fernandez 2023-10-18 17:50:46 +02:00
parent ffab2131a8
commit ea71559722
1 changed file with 42 additions and 23 deletions

View File

@ -6,11 +6,9 @@ import os
import zipfile import zipfile
from os.path import join from os.path import join
import pandas as pd import pandas as pd
import scipy
from ucimlrepo import fetch_ucirepo from ucimlrepo import fetch_ucirepo
from quapy.util import pickled_resource
from quapy.data.base import Dataset, LabelledCollection from quapy.data.base import Dataset, LabelledCollection
from quapy.data.preprocessing import text2tfidf, reduce_columns from quapy.data.preprocessing import text2tfidf, reduce_columns
from quapy.data.reader import * from quapy.data.reader import *
@ -557,17 +555,26 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
data.stats() data.stats()
return data return data
def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset: def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
""" """
Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`. Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`.
The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria: The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria:
- The dataset has more than 1000 instances - It has more than 1000 instances
- The dataset is suited for classification - It is suited for classification
- the dataset has more than two classes - It has more than two classes
- It is available for Python import (requires ucimlrepo package)
>>> import quapy as qp
>>> dataset = qp.datasets.fetch_UCIMulticlassDataset("dry-bean")
>>> train, test = dataset.train_test
>>> ...
The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS` The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`
The datasets are downloaded only once and pickled to disk, saving time on subsequent calls.
:param dataset_name: a dataset name :param dataset_name: a dataset name
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quapy_data/ directory) ~/quapy_data/ directory)
@ -578,14 +585,20 @@ def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, ver
data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose) data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose)
return Dataset(*data.split_stratified(1 - test_split, random_state=0)) return Dataset(*data.split_stratified(1 - test_split, random_state=0))
def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection: def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
""" """
Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`. Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`.
It needs the library `ucimlrepo` for downloading the datasets from https://archive.ics.uci.edu/. The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria:
- It has more than 1000 instances
- It is suited for classification
- It has more than two classes
- It is available for Python import (requires ucimlrepo package)
>>> import quapy as qp >>> import quapy as qp
>>> dataset = qp.datasets.fetch_UCIMulticlassLabelledCollection("dry-bean") >>> collection = qp.datasets.fetch_UCIMulticlassLabelledCollection("dry-bean")
>>> X, y = collection.Xy
>>> ... >>> ...
The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS` The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`
@ -600,43 +613,49 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=
:return: a :class:`quapy.data.base.LabelledCollection` instance :return: a :class:`quapy.data.base.LabelledCollection` instance
""" """
assert dataset_name in UCI_MULTICLASS_DATASETS, \ assert dataset_name in UCI_MULTICLASS_DATASETS, \
f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository (multiclass). ' \ f'Name {dataset_name} does not match any known dataset from the ' \
f'UCI Machine Learning datasets repository (multiclass). ' \
f'Valid ones are {UCI_MULTICLASS_DATASETS}' f'Valid ones are {UCI_MULTICLASS_DATASETS}'
if data_home is None: if data_home is None:
data_home = get_quapy_home() data_home = get_quapy_home()
identifiers = {"dry-bean": 602, identifiers = {
"wine-quality":186, "dry-bean": 602,
"academic-success":697, "wine-quality": 186,
"digits":80, "academic-success": 697,
"letter":59} "digits": 80,
"letter": 59
}
full_names = {"dry-bean": "Dry Bean Dataset", full_names = {
"wine-quality":"Wine Quality", "dry-bean": "Dry Bean Dataset",
"academic-success":"Predict students' dropout and academic success", "wine-quality": "Wine Quality",
"digits":"Optical Recognition of Handwritten Digits", "academic-success": "Predict students' dropout and academic success",
"letter":"Letter Recognition" "digits": "Optical Recognition of Handwritten Digits",
"letter": "Letter Recognition"
} }
identifier = identifiers[dataset_name] identifier = identifiers[dataset_name]
fullname = full_names[dataset_name] fullname = full_names[dataset_name]
if verbose:
print(f'Loading UCI Muticlass {dataset_name} ({fullname})') print(f'Loading UCI Muticlass {dataset_name} ({fullname})')
file = join(data_home,'uci_multiclass',dataset_name+'.pkl') file = join(data_home, 'uci_multiclass', dataset_name+'.pkl')
def download(id): def download(id):
data = fetch_ucirepo(id=id) data = fetch_ucirepo(id=id)
X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze() X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze()
classes = np.sort(np.unique(y)) classes = np.sort(np.unique(y))
y = np.searchsorted(classes, y) y = np.searchsorted(classes, y)
return LabelledCollection(X,y) return LabelledCollection(X, y)
data = pickled_resource(file, download, identifier) data = pickled_resource(file, download, identifier)
if verbose: if verbose:
data.stats() data.stats()
return data return data