2021-01-28 18:22:43 +01:00
|
|
|
|
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for :func:`warnings.warn`: accepts any arguments and ignores them."""
    return None


# From here on, anything routed through ``warnings.warn`` is silently discarded.
warnings.warn = warn
|
2020-12-14 18:36:19 +01:00
|
|
|
|
import os
|
2021-01-15 18:32:32 +01:00
|
|
|
|
import zipfile
|
2020-12-14 18:36:19 +01:00
|
|
|
|
from os.path import join
|
2021-01-06 14:58:29 +01:00
|
|
|
|
import pandas as pd
|
2023-11-08 10:00:25 +01:00
|
|
|
|
from ucimlrepo import fetch_ucirepo
|
2021-03-19 17:34:09 +01:00
|
|
|
|
from quapy.data.base import Dataset, LabelledCollection
|
2021-01-15 18:32:32 +01:00
|
|
|
|
from quapy.data.preprocessing import text2tfidf, reduce_columns
|
|
|
|
|
from quapy.data.reader import *
|
|
|
|
|
from quapy.util import download_file_if_not_exists, download_file, get_quapy_home, pickled_resource
|
2020-12-14 18:36:19 +01:00
|
|
|
|
|
2022-11-04 15:04:36 +01:00
|
|
|
|
|
2020-12-14 18:36:19 +01:00
|
|
|
|
# Names accepted by fetch_reviews
REVIEWS_SENTIMENT_DATASETS = [
    'hp',
    'kindle',
    'imdb',
]

# Names accepted by fetch_twitter that identify a test set
TWITTER_SENTIMENT_DATASETS_TEST = [
    'gasp', 'hcr', 'omd', 'sanders',
    'semeval13', 'semeval14', 'semeval15', 'semeval16',
    'sst', 'wa', 'wb',
]

# Names accepted by fetch_twitter that identify a training set
TWITTER_SENTIMENT_DATASETS_TRAIN = [
    'gasp', 'hcr', 'omd', 'sanders',
    'semeval', 'semeval16',
    'sst', 'wa', 'wb',
]
|
2024-02-07 18:31:34 +01:00
|
|
|
|
# Names accepted by fetch_UCIBinaryLabelledCollection / fetch_UCIBinaryDataset.
# NOTE: 'diabetes' (would sit between the 'ctg.*' and 'german' entries) and 'phoneme'
# (between 'pageblocks.5' and 'semeion') are left out: the original author could not find them.
UCI_BINARY_DATASETS = [
    'acute.a', 'acute.b',
    'balance.1', 'balance.2', 'balance.3',
    'breast-cancer',
    'cmc.1', 'cmc.2', 'cmc.3',
    'ctg.1', 'ctg.2', 'ctg.3',
    'german',
    'haberman',
    'ionosphere',
    'iris.1', 'iris.2', 'iris.3',
    'mammographic',
    'pageblocks.5',
    'semeion',
    'sonar',
    'spambase',
    'spectf',
    'tictactoe',
    'transfusion',
    'wdbc',
    'wine.1', 'wine.2', 'wine.3',
    'wine-q-red', 'wine-q-white',
    'yeast',
]
|
2020-12-14 18:36:19 +01:00
|
|
|
|
|
2023-10-17 18:24:33 +02:00
|
|
|
|
# Names accepted by fetch_UCIMulticlassDataset
UCI_MULTICLASS_DATASETS = [
    'dry-bean',
    'wine-quality',
    'academic-success',
    'digits',
    'letter',
]
|
|
|
|
|
|
2022-06-01 18:28:59 +02:00
|
|
|
|
# Task identifiers of LeQua 2022
LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B']

# The two sample sizes every entry of LEQUA2022_SAMPLE_SIZE maps to
_TXA_SAMPLE_SIZE = 250
_TXB_SAMPLE_SIZE = 1000

# Sample size per task identifier (plus the aliases 'TXA', 'TXB', 'binary' and 'multiclass')
LEQUA2022_SAMPLE_SIZE = {
    'TXA': _TXA_SAMPLE_SIZE, 'TXB': _TXB_SAMPLE_SIZE,
    'T1A': _TXA_SAMPLE_SIZE, 'T1B': _TXB_SAMPLE_SIZE,
    'T2A': _TXA_SAMPLE_SIZE, 'T2B': _TXB_SAMPLE_SIZE,
    'binary': _TXA_SAMPLE_SIZE,
    'multiclass': _TXB_SAMPLE_SIZE,
}
|
|
|
|
|
|
2020-12-14 18:36:19 +01:00
|
|
|
|
|
2021-05-05 17:12:44 +02:00
|
|
|
|
def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset:
    """
    Loads one of the Reviews datasets as a :class:`quapy.data.base.Dataset` instance, as used in
    `Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification."
    Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018. <https://dl.acm.org/doi/abs/10.1145/3269206.3269287>`_.
    The list of valid dataset names can be accessed in `quapy.data.datasets.REVIEWS_SENTIMENT_DATASETS`

    The train and test files are downloaded (only if not already present) under
    `<data_home>/reviews/<dataset_name>/`.

    :param dataset_name: the name of the dataset: valid ones are 'hp', 'kindle', 'imdb'
    :param tfidf: set to True to transform the raw documents into tfidf weighted matrices
    :param min_df: minimum number of documents that should contain a term in order for the term to be
        kept (ignored if tfidf==False)
    :param data_home: specify the quapy home directory where collections will be dumped (leave as None to use
        the directory returned by `get_quapy_home()`)
    :param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
        faster subsequent invocations
    :return: a :class:`quapy.data.base.Dataset` instance
    """
    assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \
        f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \
        f'Valid ones are {REVIEWS_SENTIMENT_DATASETS}'

    home = get_quapy_home() if data_home is None else data_home
    reviews_dir = join(home, 'reviews')
    os.makedirs(reviews_dir, exist_ok=True)

    # fetch the training file first and the test file second
    local_paths = []
    for split in ('train', 'test'):
        remote = f'https://zenodo.org/record/4117827/files/{dataset_name}_{split}.txt'
        local = join(reviews_dir, dataset_name, f'{split}.txt')
        download_file_if_not_exists(remote, local)
        local_paths.append(local)
    train_path, test_path = local_paths

    # a pickle path of None is passed through to pickled_resource when pickling is not requested
    pickle_path = join(reviews_dir, 'pickle', f'{dataset_name}.pkl') if pickle else None
    data = pickled_resource(pickle_path, Dataset.load, train_path, test_path, from_text)

    # column reduction is only applied on top of the tfidf representation
    if tfidf:
        text2tfidf(data, inplace=True)
        if min_df is not None:
            reduce_columns(data, min_df=min_df, inplace=True)

    data.name = dataset_name
    return data
|
|
|
|
|
|
|
|
|
|
|
2021-05-05 17:12:44 +02:00
|
|
|
|
def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False) -> Dataset:
    """
    Loads a Twitter dataset as a :class:`quapy.data.base.Dataset` instance, as used in:
    `Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
    Social Network Analysis and Mining 6(19), 1–22 (2016) <https://link.springer.com/content/pdf/10.1007/s13278-016-0327-z.pdf>`_
    Note that the datasets 'semeval13', 'semeval14', 'semeval15' share the same training set.
    The list of valid dataset names corresponding to training sets can be accessed in
    `quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN`, while the test sets can be accessed in
    `quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TEST`

    :param dataset_name: the name of the dataset: valid ones are 'gasp', 'hcr', 'omd', 'sanders', 'semeval13',
        'semeval14', 'semeval15', 'semeval16', 'sst', 'wa', 'wb' (plus 'semeval', which is only valid
        when `for_model_selection` is True)
    :param for_model_selection: if True, then returns the train split as the training set and the devel split
        as the test set; if False, then returns the train+devel split as the training set and the test set as the
        test set
    :param min_df: minimum number of documents that should contain a term in order for the term to be kept
    :param data_home: specify the quapy home directory where collections will be dumped (leave as None to use
        the directory returned by `get_quapy_home()`)
    :param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
        faster subsequent invocations
    :return: a :class:`quapy.data.base.Dataset` instance
    :raises ValueError: if `dataset_name` is 'semeval' and `for_model_selection` is not set
    """
    # the datasets sharing the common "semeval" training set are semeval13, semeval14 and semeval15
    # (see the branch below); the message used to name semeval14/15/16 by mistake
    assert dataset_name in TWITTER_SENTIMENT_DATASETS_TRAIN + TWITTER_SENTIMENT_DATASETS_TEST, \
        f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \
        f'Valid ones are {TWITTER_SENTIMENT_DATASETS_TRAIN} for model selection and ' \
        f'{TWITTER_SENTIMENT_DATASETS_TEST} for test (datasets "semeval13", "semeval14", "semeval15" share ' \
        f'a common training set "semeval")'
    if data_home is None:
        data_home = get_quapy_home()

    # the whole collection comes as one zip archive, which is downloaded and extracted only once
    URL = 'https://zenodo.org/record/4255764/files/tweet_sentiment_quantification_snam.zip'
    unzipped_path = join(data_home, 'tweet_sentiment_quantification_snam')
    if not os.path.exists(unzipped_path):
        downloaded_path = join(data_home, 'tweet_sentiment_quantification_snam.zip')
        download_file(URL, downloaded_path)
        with zipfile.ZipFile(downloaded_path) as file:
            file.extractall(data_home)
        os.remove(downloaded_path)

    if dataset_name in {'semeval13', 'semeval14', 'semeval15'}:
        trainset_name = 'semeval'
        testset_name = 'semeval' if for_model_selection else dataset_name
        print(f"the training and development sets for datasets 'semeval13', 'semeval14', 'semeval15' are common "
              f"(called 'semeval'); returning trainin-set='{trainset_name}' and test-set={testset_name}")
    else:
        # `not` (rather than `== False`) also rejects falsy non-bool values such as None
        if dataset_name == 'semeval' and not for_model_selection:
            raise ValueError('dataset "semeval" can only be used for model selection. '
                             'Use "semeval13", "semeval14", or "semeval15" for model evaluation.')
        trainset_name = testset_name = dataset_name

    if for_model_selection:
        train = join(unzipped_path, 'train', f'{trainset_name}.train.feature.txt')
        test = join(unzipped_path, 'test', f'{testset_name}.dev.feature.txt')
    else:
        train = join(unzipped_path, 'train', f'{trainset_name}.train+dev.feature.txt')
        if dataset_name == 'semeval16':  # there is a different test name in the case of semeval16 only
            test = join(unzipped_path, 'test', f'{testset_name}.dev-test.feature.txt')
        else:
            test = join(unzipped_path, 'test', f'{testset_name}.test.feature.txt')

    # a pickle path of None is passed through to pickled_resource when pickling is not requested
    pickle_path = None
    if pickle:
        mode = "train-dev" if for_model_selection else "train+dev-test"
        pickle_path = join(unzipped_path, 'pickle', f'{testset_name}.{mode}.pkl')
    data = pickled_resource(pickle_path, Dataset.load, train, test, from_sparse)

    if min_df is not None:
        reduce_columns(data, min_df=min_df, inplace=True)

    data.name = dataset_name

    return data
|
|
|
|
|
|
|
|
|
|
|
2024-02-07 18:31:34 +01:00
|
|
|
|
def fetch_UCIBinaryDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
    """
    Loads a UCI dataset as an instance of :class:`quapy.data.base.Dataset`, as used in
    `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
    Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
    Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
    and
    `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
    Dynamic ensemble selection for quantification tasks.
    Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
    The datasets do not come with a predefined train-test split (see :meth:`fetch_UCIBinaryLabelledCollection`
    for further information on how to use these collections), and so a stratified train-test split is
    generated at the desired proportion, using a fixed random state (0).
    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_BINARY_DATASETS`

    :param dataset_name: a dataset name
    :param data_home: the quapy home directory where collections will be dumped; it is forwarded as-is to
        :meth:`fetch_UCIBinaryLabelledCollection`
    :param test_split: proportion of documents to be included in the test set. The rest conforms the training set
    :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
    :return: a :class:`quapy.data.base.Dataset` instance
    """
    collection = fetch_UCIBinaryLabelledCollection(dataset_name, data_home, verbose)
    train_proportion = 1 - test_split
    splits = collection.split_stratified(train_proportion, random_state=0)
    return Dataset(*splits)
|
|
|
|
|
|
2020-12-14 18:36:19 +01:00
|
|
|
|
|
2024-02-07 18:31:34 +01:00
|
|
|
|
def fetch_UCIBinaryLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
    """
    Loads a UCI collection as an instance of :class:`quapy.data.base.LabelledCollection`, as used in
    `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
    Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
    Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
    and
    `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
    Dynamic ensemble selection for quantification tasks.
    Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
    The datasets do not come with a predefined train-test split, and so Pérez-Gállego et al. adopted a 5FCVx2 evaluation
    protocol, meaning that each collection was used to generate two rounds (hence the x2) of 5 fold cross validation.
    This can be reproduced by using :meth:`quapy.data.base.Dataset.kFCV`, e.g.:

    >>> import quapy as qp
    >>> collection = qp.datasets.fetch_UCIBinaryLabelledCollection("yeast")
    >>> for data in qp.data.base.Dataset.kFCV(collection, nfolds=5, nrepeats=2):
    >>>     ...

    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_BINARY_DATASETS`

    :param dataset_name: a dataset name
    :param data_home: specify the quapy home directory where collections will be dumped (leave as None to use
        the directory returned by `get_quapy_home()`)
    :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
    :return: a :class:`quapy.data.base.LabelledCollection` instance
    :raises FileNotFoundError: for 'pageblocks.5', if the downloaded `.Z` archive has not been manually
        decompressed yet
    """

    assert dataset_name in UCI_BINARY_DATASETS, \
        f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
        f'Valid ones are {UCI_BINARY_DATASETS}'
    if data_home is None:
        data_home = get_quapy_home()

    # human-readable names, only used for the verbose output
    dataset_fullname = {
        'acute.a': 'Acute Inflammations (urinary bladder)',
        'acute.b': 'Acute Inflammations (renal pelvis)',
        'balance.1': 'Balance Scale Weight & Distance Database (left)',
        'balance.2': 'Balance Scale Weight & Distance Database (balanced)',
        'balance.3': 'Balance Scale Weight & Distance Database (right)',
        'breast-cancer': 'Breast Cancer Wisconsin (Original)',
        'cmc.1': 'Contraceptive Method Choice (no use)',
        'cmc.2': 'Contraceptive Method Choice (long term)',
        'cmc.3': 'Contraceptive Method Choice (short term)',
        'ctg.1': 'Cardiotocography Data Set (normal)',
        'ctg.2': 'Cardiotocography Data Set (suspect)',
        'ctg.3': 'Cardiotocography Data Set (pathologic)',
        'german': 'Statlog German Credit Data',
        'haberman': "Haberman's Survival Data",
        'ionosphere': 'Johns Hopkins University Ionosphere DB',
        'iris.1': 'Iris Plants Database(setosa)',  # iris.1 is binarized on 'Iris-setosa' below
        'iris.2': 'Iris Plants Database(versicolour)',
        'iris.3': 'Iris Plants Database(virginica)',
        'mammographic': 'Mammographic Mass',
        'pageblocks.5': 'Page Blocks Classification (5)',
        'semeion': 'Semeion Handwritten Digit (8)',
        'sonar': 'Sonar, Mines vs. Rocks',
        'spambase': 'Spambase Data Set',
        'spectf': 'SPECTF Heart Data',
        'tictactoe': 'Tic-Tac-Toe Endgame Database',
        'transfusion': 'Blood Transfusion Service Center Data Set',
        'wdbc': 'Wisconsin Diagnostic Breast Cancer',
        'wine.1': 'Wine Recognition Data (1)',
        'wine.2': 'Wine Recognition Data (2)',
        'wine.3': 'Wine Recognition Data (3)',
        'wine-q-red': 'Wine Quality Red (6-10)',
        'wine-q-white': 'Wine Quality White (6-10)',
        'yeast': 'Yeast',
    }

    # the identifier is an alias for the dataset group, it's part of the url data-folder, and is the name we use
    # to download the raw dataset
    identifier_map = {
        'acute.a': 'acute',
        'acute.b': 'acute',
        'balance.1': 'balance-scale',
        'balance.2': 'balance-scale',
        'balance.3': 'balance-scale',
        'breast-cancer': 'breast-cancer-wisconsin',
        'cmc.1': 'cmc',
        'cmc.2': 'cmc',
        'cmc.3': 'cmc',
        'ctg.1': '00193',
        'ctg.2': '00193',
        'ctg.3': '00193',
        'german': 'statlog/german',
        'haberman': 'haberman',
        'ionosphere': 'ionosphere',
        'iris.1': 'iris',
        'iris.2': 'iris',
        'iris.3': 'iris',
        'mammographic': 'mammographic-masses',
        'pageblocks.5': 'page-blocks',
        'semeion': 'semeion',
        'sonar': 'undocumented/connectionist-bench/sonar',
        'spambase': 'spambase',
        'spectf': 'spect',
        'tictactoe': 'tic-tac-toe',
        'transfusion': 'blood-transfusion',
        'wdbc': 'breast-cancer-wisconsin',
        'wine-q-red': 'wine-quality',
        'wine-q-white': 'wine-quality',
        'wine.1': 'wine',
        'wine.2': 'wine',
        'wine.3': 'wine',
        'yeast': 'yeast',
    }

    # the filename is the name of the file within the data_folder indexed by the identifier
    # (identifiers missing here default to '<identifier>.data'; a list means several files)
    file_name = {
        'acute': 'diagnosis.data',
        '00193': 'CTG.xls',
        'statlog/german': 'german.data-numeric',
        'mammographic-masses': 'mammographic_masses.data',
        'page-blocks': 'page-blocks.data.Z',
        'undocumented/connectionist-bench/sonar': 'sonar.all-data',
        'spect': ['SPECTF.train', 'SPECTF.test'],
        'blood-transfusion': 'transfusion.data',
        'wine-quality': ['winequality-red.csv', 'winequality-white.csv'],
        'breast-cancer-wisconsin': 'breast-cancer-wisconsin.data' if dataset_name=='breast-cancer' else 'wdbc.data'
    }

    # the filename containing the dataset description (if any)
    # (identifiers missing here default to '<identifier>.names'; None means no description)
    desc_name = {
        'acute': 'diagnosis.names',
        '00193': None,
        'statlog/german': 'german.doc',
        'mammographic-masses': 'mammographic_masses.names',
        'undocumented/connectionist-bench/sonar': 'sonar.names',
        'spect': 'SPECTF.names',
        'blood-transfusion': 'transfusion.names',
        'wine-quality': 'winequality.names',
        'breast-cancer-wisconsin': 'breast-cancer-wisconsin.names' if dataset_name == 'breast-cancer' else 'wdbc.names'
    }

    identifier = identifier_map[dataset_name]
    filename = file_name.get(identifier, f'{identifier}.data')
    descfile = desc_name.get(identifier, f'{identifier}.names')
    fullname = dataset_fullname[dataset_name]

    URL = f'http://archive.ics.uci.edu/ml/machine-learning-databases/{identifier}'
    data_dir = join(data_home, 'uci_datasets', identifier)
    if isinstance(filename, str):  # filename could be a list of files, in which case it will be processed later
        data_path = join(data_dir, filename)
        download_file_if_not_exists(f'{URL}/{filename}', data_path)

    if descfile:
        # fetching/printing the description is best-effort: a failure here must not prevent loading the data
        try:
            desc_path = f'{data_dir}/{descfile}'
            download_file_if_not_exists(f'{URL}/{descfile}', desc_path)
            if verbose:
                # the context manager closes the file handle (it used to be left open)
                with open(desc_path, 'rt') as desc:
                    print(desc.read())
        except Exception:
            print('could not read the description file')
    elif verbose:
        print('no file description available')

    if verbose:
        print(f'Loading {dataset_name} ({fullname})')

    # exactly one of the following branches applies, since `identifier` is fixed at this point
    if identifier == 'acute':
        df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t')

        # the first column uses a decimal comma
        df[0] = df[0].apply(lambda x: float(x.replace(',', '.'))).astype(float, copy=False)
        # _df_replace is called only for its effect on df (its return value was already discarded)
        for col in range(1, 6):
            _df_replace(df, col)
        X = df.loc[:, 0:5].values
        if dataset_name == 'acute.a':
            y = binarize(df[6], pos_class='yes')
        elif dataset_name == 'acute.b':
            y = binarize(df[7], pos_class='yes')

    elif identifier == 'balance-scale':
        df = pd.read_csv(data_path, header=None, sep=',')
        if dataset_name == 'balance.1':
            y = binarize(df[0], pos_class='L')
        elif dataset_name == 'balance.2':
            y = binarize(df[0], pos_class='B')
        elif dataset_name == 'balance.3':
            y = binarize(df[0], pos_class='R')
        X = df.loc[:, 1:].astype(float).values

    elif identifier == 'breast-cancer-wisconsin' and dataset_name=='breast-cancer':
        df = pd.read_csv(data_path, header=None, sep=',')
        # work on an explicit copy so that the masked assignment below does not write into a slice of df
        Xy = df.loc[:, 1:10].copy()
        Xy[Xy=='?']=np.nan  # '?' marks a missing value; rows containing any are dropped
        Xy = Xy.dropna(axis=0)
        X = Xy.loc[:, 1:9]
        X = X.astype(float).values
        y = binarize(Xy[10], pos_class=2)

    elif identifier == 'breast-cancer-wisconsin' and dataset_name=='wdbc':
        df = pd.read_csv(data_path, header=None, sep=',')
        X = df.loc[:, 2:32].astype(float).values
        y = df[1].values
        y = binarize(y, pos_class='M')

    elif identifier == 'cmc':
        df = pd.read_csv(data_path, header=None, sep=',')
        X = df.loc[:, 0:8].astype(float).values
        y = df[9].astype(int).values
        if dataset_name == 'cmc.1':
            y = binarize(y, pos_class=1)
        elif dataset_name == 'cmc.2':
            y = binarize(y, pos_class=2)
        elif dataset_name == 'cmc.3':
            y = binarize(y, pos_class=3)

    elif identifier == '00193':
        df = pd.read_excel(data_path, sheet_name='Data', skipfooter=3)
        df = df[list(range(1,24))]  # select columns numbered (number 23 is the target label)
        # replaces the header with the first row
        new_header = df.iloc[0]  # grab the first row for the header
        df = df[1:]  # take the data less the header row
        df.columns = new_header  # set the header row as the df header
        X = df.iloc[:, 0:22].astype(float).values
        y = df['NSP'].astype(int).values
        if dataset_name == 'ctg.1':
            y = binarize(y, pos_class=1)  # 1==Normal
        elif dataset_name == 'ctg.2':
            y = binarize(y, pos_class=2)  # 2==Suspect
        elif dataset_name == 'ctg.3':
            y = binarize(y, pos_class=3)  # 3==Pathologic

    elif identifier == 'statlog/german':
        # sep=r'\s+' is the non-deprecated spelling of delim_whitespace=True
        df = pd.read_csv(data_path, header=None, sep=r'\s+')
        X = df.iloc[:, 0:24].astype(float).values
        y = df[24].astype(int).values
        y = binarize(y, pos_class=1)

    elif identifier == 'haberman':
        df = pd.read_csv(data_path, header=None)
        X = df.iloc[:, 0:3].astype(float).values
        y = df[3].astype(int).values
        y = binarize(y, pos_class=2)

    elif identifier == 'ionosphere':
        df = pd.read_csv(data_path, header=None)
        X = df.iloc[:, 0:34].astype(float).values
        y = df[34].values
        y = binarize(y, pos_class='b')

    elif identifier == 'iris':
        df = pd.read_csv(data_path, header=None)
        X = df.iloc[:, 0:4].astype(float).values
        y = df[4].values
        if dataset_name == 'iris.1':
            y = binarize(y, pos_class='Iris-setosa')  # 1==Setosa
        elif dataset_name == 'iris.2':
            y = binarize(y, pos_class='Iris-versicolor')  # 2==Versicolor
        elif dataset_name == 'iris.3':
            y = binarize(y, pos_class='Iris-virginica')  # 3==Virginica

    elif identifier == 'mammographic-masses':
        df = pd.read_csv(data_path, header=None, sep=',')
        df[df == '?'] = np.nan  # '?' marks a missing value; rows containing any are dropped
        Xy = df.dropna(axis=0)
        X = Xy.iloc[:, 0:5]
        X = X.astype(float).values
        y = binarize(Xy.iloc[:,5], pos_class=1)

    elif identifier == 'page-blocks':
        # strip only the trailing '.Z' extension (a plain str.replace would also alter any other
        # occurrence of '.Z' in the path, e.g., within data_home)
        data_path_ = data_path[:-len('.Z')] if data_path.endswith('.Z') else data_path
        if not os.path.exists(data_path_):
            raise FileNotFoundError(f'Warning: file {data_path_} does not exist. If this is the first time you '
                                    f'attempt to load this dataset, then you have to manually unzip the {data_path} '
                                    f'and name the extracted file {data_path_} (unfortunately, neither zipfile, nor '
                                    f'gzip can handle unix compressed files automatically -- there is a repo in GitHub '
                                    f'https://github.com/umeat/unlzw where the problem seems to be solved anyway).')
        df = pd.read_csv(data_path_, header=None, sep=r'\s+')
        X = df.iloc[:, 0:10].astype(float).values
        y = df[10].values
        y = binarize(y, pos_class=5)  # 5==block "graphic"

    elif identifier == 'semeion':
        df = pd.read_csv(data_path, header=None, sep=r'\s+')
        X = df.iloc[:, 0:256].astype(float).values
        y = df[263].values  # 263 stands for digit 8 (labels are one-hot vectors from col 256-266)
        y = binarize(y, pos_class=1)

    elif identifier == 'undocumented/connectionist-bench/sonar':
        df = pd.read_csv(data_path, header=None, sep=',')
        X = df.iloc[:, 0:60].astype(float).values
        y = df[60].values
        y = binarize(y, pos_class='R')

    elif identifier == 'spambase':
        df = pd.read_csv(data_path, header=None, sep=',')
        X = df.iloc[:, 0:57].astype(float).values
        y = df[57].values
        y = binarize(y, pos_class=1)

    elif identifier == 'spect':
        # here filename is a list: every file is downloaded and all of them are concatenated
        dfs = []
        for file in filename:
            data_path = join(data_dir, file)
            download_file_if_not_exists(f'{URL}/{file}', data_path)
            dfs.append(pd.read_csv(data_path, header=None, sep=','))
        df = pd.concat(dfs)
        X = df.iloc[:, 1:45].astype(float).values
        y = df[0].values
        y = binarize(y, pos_class=0)

    elif identifier == 'tic-tac-toe':
        df = pd.read_csv(data_path, header=None, sep=',')
        X = df.iloc[:, 0:9].replace('o',0).replace('b',1).replace('x',2).values
        y = df[9].values
        y = binarize(y, pos_class='negative')

    elif identifier == 'blood-transfusion':
        df = pd.read_csv(data_path, sep=',')
        X = df.iloc[:, 0:4].astype(float).values
        y = df.iloc[:, 4].values
        y = binarize(y, pos_class=1)

    elif identifier == 'wine':
        df = pd.read_csv(data_path, header=None, sep=',')
        X = df.iloc[:, 1:14].astype(float).values
        y = df[0].values
        if dataset_name == 'wine.1':
            y = binarize(y, pos_class=1)
        elif dataset_name == 'wine.2':
            y = binarize(y, pos_class=2)
        elif dataset_name == 'wine.3':
            y = binarize(y, pos_class=3)

    elif identifier == 'wine-quality':
        # here filename is a list: only the file of the requested variant (red or white) is downloaded
        filename = filename[0] if dataset_name=='wine-q-red' else filename[1]
        data_path = join(data_dir, filename)
        download_file_if_not_exists(f'{URL}/{filename}', data_path)
        df = pd.read_csv(data_path, sep=';')
        X = df.iloc[:, 0:11].astype(float).values
        y = df.iloc[:, 11].values > 5  # positive iff the quality score is above 5

    elif identifier == 'yeast':
        df = pd.read_csv(data_path, header=None, sep=r'\s+')
        X = df.iloc[:, 1:9].astype(float).values
        y = df.iloc[:, 9].values
        y = binarize(y, pos_class='NUC')

    data = LabelledCollection(X, y)
    if verbose:
        data.stats()
    return data
|
2021-01-06 14:58:29 +01:00
|
|
|
|
|
|
|
|
|
|
2023-10-17 18:24:33 +02:00
|
|
|
|
def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
    """
    Fetches a UCI multiclass dataset and returns it, already divided into a training and a test part,
    as an instance of :class:`quapy.data.base.Dataset`.

    The available datasets come from https://archive.ics.uci.edu/ and satisfy these criteria:

    - It has more than 1000 instances
    - It is suited for classification
    - It has more than two classes
    - It is available for Python import (requires ucimlrepo package)

    >>> import quapy as qp
    >>> dataset = qp.datasets.fetch_UCIMulticlassDataset("dry-bean")
    >>> train, test = dataset.train_test
    >>> ...

    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`

    The whole collection is obtained through :meth:`fetch_UCIMulticlassLabelledCollection` and then split
    in a stratified way using a fixed random state (0), so the partition is the same across calls.

    :param dataset_name: a dataset name
    :param data_home: the quapy home directory where collections will be dumped; it is forwarded as-is to
        :meth:`fetch_UCIMulticlassLabelledCollection` (leave it as None to use the default quapy home directory)
    :param test_split: proportion of instances to be included in the test set. The rest forms the training set
    :param verbose: set to True (default is False) to get information (stats) about the dataset
    :return: a :class:`quapy.data.base.Dataset` instance
    """
    collection = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose)
    # split_stratified receives the *training* proportion, i.e., the complement of test_split
    train_proportion = 1 - test_split
    partitions = collection.split_stratified(train_proportion, random_state=0)
    return Dataset(*partitions)
|
|
|
|
|
|
2023-10-18 17:50:46 +02:00
|
|
|
|
|
2023-10-17 18:24:33 +02:00
|
|
|
|
def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
    """
    Fetches a UCI multiclass dataset and returns all of its instances (no train/test split) as an
    instance of :class:`quapy.data.base.LabelledCollection`.

    The available datasets come from https://archive.ics.uci.edu/ and satisfy these criteria:

    - It has more than 1000 instances
    - It is suited for classification
    - It has more than two classes
    - It is available for Python import (requires ucimlrepo package)

    >>> import quapy as qp
    >>> collection = qp.datasets.fetch_UCIMulticlassLabelledCollection("dry-bean")
    >>> X, y = collection.Xy
    >>> ...

    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`

    The datasets are downloaded only once and pickled into disk (in the `uci_multiclass` subfolder of
    `data_home`), saving time for consecutive calls. The original class labels are replaced by their
    position in the sorted list of distinct labels, i.e., by integers starting at 0.

    :param dataset_name: a dataset name
    :param data_home: specify the quapy home directory where the dataset will be dumped (leave it as None to
        use the directory returned by `get_quapy_home()`)
    :param verbose: set to True (default is False) to get information (stats) about the dataset
    :return: a :class:`quapy.data.base.LabelledCollection` instance
    """
    assert dataset_name in UCI_MULTICLASS_DATASETS, (
        f'Name {dataset_name} does not match any known dataset from the '
        f'UCI Machine Learning datasets repository (multiclass). '
        f'Valid ones are {UCI_MULTICLASS_DATASETS}'
    )

    # dataset name -> (numeric id passed to fetch_ucirepo, human-readable name shown in verbose mode)
    catalogue = {
        "dry-bean": (602, "Dry Bean Dataset"),
        "wine-quality": (186, "Wine Quality"),
        "academic-success": (697, "Predict students' dropout and academic success"),
        "digits": (80, "Optical Recognition of Handwritten Digits"),
        "letter": (59, "Letter Recognition"),
    }
    identifier, fullname = catalogue[dataset_name]

    if data_home is None:
        data_home = get_quapy_home()

    if verbose:
        print(f'Loading UCI Muticlass {dataset_name} ({fullname})')

    pickle_path = join(data_home, 'uci_multiclass', dataset_name+'.pkl')

    def _download_collection(uci_id):
        # retrieves the dataset from the UCI repository and wraps it as a LabelledCollection
        repo_entry = fetch_ucirepo(id=uci_id)
        payload = repo_entry['data']
        X = payload['features'].to_numpy()
        raw_labels = payload['targets'].to_numpy().squeeze()
        # np.unique already returns the distinct labels in sorted order, so the position of each
        # label in that array serves as its integer class index
        classes = np.unique(raw_labels)
        y = np.searchsorted(classes, raw_labels)
        return LabelledCollection(X, y)

    # pickled_resource receives the pickle path, the generating function, and the argument for it
    collection = pickled_resource(pickle_path, _download_collection, identifier)

    if verbose:
        collection.stats()

    return collection
|
|
|
|
|
|
2021-01-06 14:58:29 +01:00
|
|
|
|
|
2021-12-06 18:25:47 +01:00
|
|
|
|
def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
|
2022-06-01 18:28:59 +02:00
|
|
|
|
df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def fetch_lequa2022(task, data_home=None):
    """
    Loads the official datasets provided for the `LeQua <https://lequa2022.github.io/index>`_ competition.
    In brief, there are 4 tasks (T1A, T1B, T2A, T2B) having to do with text quantification
    problems. Tasks T1A and T1B provide documents in vector form, while T2A and T2B provide raw documents instead.
    Tasks T1A and T2A are binary sentiment quantification problems, while T1B and T2B are multiclass quantification
    problems consisting of estimating the class prevalence values of 28 different merchandise products.
    We refer to the `Esuli, A., Moreo, A., Sebastiani, F., & Sperduti, G. (2022).
    A Detailed Overview of LeQua@ CLEF 2022: Learning to Quantify.
    <https://ceur-ws.org/Vol-3180/paper-146.pdf>`_ for a detailed description
    on the tasks and datasets.

    The datasets are downloaded only once, and stored for fast reuse (in the `lequa2022` subfolder of
    `data_home`). The download is triggered only if the folder of the requested task does not exist yet.

    See `lequa2022_experiments.py` provided in the example folder, that can serve as a guide on how to use these
    datasets.

    :param task: a string representing the task name; valid ones are T1A, T1B, T2A, and T2B
    :param data_home: specify the quapy home directory where collections will be dumped (leave it as None to use
        the directory returned by `get_quapy_home()`)
    :return: a tuple `(train, val_gen, test_gen)` where `train` is an instance of
        :class:`quapy.data.base.LabelledCollection`, `val_gen` and `test_gen` are instances of
        :class:`quapy.data._lequa2022.SamplesFromDir`, a subclass of :class:`quapy.protocol.AbstractProtocol`,
        that return a series of samples stored in a directory which are labelled by prevalence.
    """

    from quapy.data._lequa2022 import load_raw_documents, load_vector_documents, SamplesFromDir

    assert task in LEQUA2022_TASKS, \
        f'Unknown task {task}. Valid ones are {LEQUA2022_TASKS}'
    if data_home is None:
        data_home = get_quapy_home()

    URL_TRAINDEV=f'https://zenodo.org/record/6546188/files/{task}.train_dev.zip'
    URL_TEST=f'https://zenodo.org/record/6546188/files/{task}.test.zip'
    URL_TEST_PREV=f'https://zenodo.org/record/6546188/files/{task}.test_prevalences.zip'

    lequa_dir = join(data_home, 'lequa2022')
    os.makedirs(lequa_dir, exist_ok=True)

    def download_unzip_and_remove(unzipped_path, url):
        # downloads the zip located at url into a temporary file, extracts it in unzipped_path, and
        # deletes the temporary file
        tmp_path = join(lequa_dir, task + '_tmp.zip')
        try:
            download_file_if_not_exists(url, tmp_path)
            with zipfile.ZipFile(tmp_path) as file:
                file.extractall(unzipped_path)
        finally:
            # The temporary path is shared by the three archives, and the download is skipped whenever
            # the file already exists; a file left behind by a failed download or extraction would thus
            # be taken for the next archive. Removing it unconditionally prevents this.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    if not os.path.exists(join(lequa_dir, task)):
        download_unzip_and_remove(lequa_dir, URL_TRAINDEV)
        download_unzip_and_remove(lequa_dir, URL_TEST)
        download_unzip_and_remove(lequa_dir, URL_TEST_PREV)

    # tasks T1* come in vector form, while tasks T2* come as raw documents
    if task in ['T1A', 'T1B']:
        load_fn = load_vector_documents
    elif task in ['T2A', 'T2B']:
        load_fn = load_raw_documents

    tr_path = join(lequa_dir, task, 'public', 'training_data.txt')
    train = LabelledCollection.load(tr_path, loader_func=load_fn)

    val_samples_path = join(lequa_dir, task, 'public', 'dev_samples')
    val_true_prev_path = join(lequa_dir, task, 'public', 'dev_prevalences.txt')
    val_gen = SamplesFromDir(val_samples_path, val_true_prev_path, load_fn=load_fn)

    test_samples_path = join(lequa_dir, task, 'public', 'test_samples')
    test_true_prev_path = join(lequa_dir, task, 'public', 'test_prevalences.txt')
    test_gen = SamplesFromDir(test_samples_path, test_true_prev_path, load_fn=load_fn)

    return train, val_gen, test_gen
|
|
|
|
|
|
2024-02-08 14:33:22 +01:00
|
|
|
|
|
2024-02-07 18:45:42 +01:00
|
|
|
|
def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=None):
    """
    Loads the IFCB dataset for quantification from `Zenodo <https://zenodo.org/records/10036244>`_ (for more
    information on this dataset, please follow the zenodo link).
    This dataset is based on the data available publicly at
    `WHOI-Plankton repo <https://github.com/hsosik/WHOI-Plankton>`_.
    The scripts for the processing are available at `P. González's repo <https://github.com/pglez82/IFCB_Zenodo>`_.
    Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms.

    The datasets are downloaded only once, and stored for fast reuse (in the `ifcb` subfolder of `data_home`).
    Each of the three resources (training samples, test samples, test prevalences) is downloaded only if
    it is not found on disk.

    :param single_sample_train: a boolean. If true, it will return the train dataset as a
        :class:`quapy.data.base.LabelledCollection` (all examples together).
        If false, a generator of training samples will be returned. Each example in the training set has an
        individual label.
    :param for_model_selection: if True, then returns a split 30% of the training set (86 out of 286 samples) to
        be used for model selection; if False, then returns the full training set as training set and the test
        set as the test set
    :param data_home: specify the quapy home directory where collections will be dumped (leave it as None to use
        the directory returned by `get_quapy_home()`)
    :return: a tuple `(train, test_gen)` where `train` is an instance of
        :class:`quapy.data.base.LabelledCollection`, if `single_sample_train` is true or
        :class:`quapy.data._ifcb.IFCBTrainSamplesFromDir`, i.e. a sampling protocol that returns a series of samples
        labelled example by example. test_gen will be a :class:`quapy.data._ifcb.IFCBTestSamples`,
        i.e., a sampling protocol that returns a series of samples labelled by prevalence.
    """

    from quapy.data._ifcb import IFCBTrainSamplesFromDir, IFCBTestSamples, get_sample_list, generate_modelselection_split

    if data_home is None:
        data_home = get_quapy_home()

    URL_TRAIN = 'https://zenodo.org/records/10036244/files/IFCB.train.zip'
    URL_TEST = 'https://zenodo.org/records/10036244/files/IFCB.test.zip'
    URL_TEST_PREV = 'https://zenodo.org/records/10036244/files/IFCB.test_prevalences.zip'

    ifcb_dir = join(data_home, 'ifcb')
    os.makedirs(ifcb_dir, exist_ok=True)

    def download_unzip_and_remove(unzipped_path, url):
        # downloads the zip located at url into a temporary file, extracts it in unzipped_path, and
        # deletes the temporary file
        tmp_path = join(ifcb_dir, 'ifcb_tmp.zip')
        try:
            download_file_if_not_exists(url, tmp_path)
            with zipfile.ZipFile(tmp_path) as file:
                file.extractall(unzipped_path)
        finally:
            # The temporary path is shared by the three archives, and the download is skipped whenever
            # the file already exists; a file left behind by a failed download or extraction would thus
            # be taken for the next archive. Removing it unconditionally prevents this.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    if not os.path.exists(os.path.join(ifcb_dir,'train')):
        download_unzip_and_remove(ifcb_dir, URL_TRAIN)
    if not os.path.exists(os.path.join(ifcb_dir,'test')):
        download_unzip_and_remove(ifcb_dir, URL_TEST)
    if not os.path.exists(os.path.join(ifcb_dir,'test_prevalences.csv')):
        download_unzip_and_remove(ifcb_dir, URL_TEST_PREV)

    # Load test prevalences and classes
    test_true_prev_path = join(ifcb_dir, 'test_prevalences.csv')
    test_true_prev = pd.read_csv(test_true_prev_path)
    # the class names are the column headers, except the first column
    # (presumably the sample identifier -- TODO confirm against the csv)
    classes = test_true_prev.columns[1:]

    # Load train and test samples
    train_samples_path = join(ifcb_dir,'train')
    test_samples_path = join(ifcb_dir,'test')

    if for_model_selection:
        # In this case, return 70% of training data as the training set and 30% as the test set
        samples = get_sample_list(train_samples_path)
        train, test = generate_modelselection_split(samples, split=0.3)
        train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes, samples=train)

        # Test prevalence is computed from class labels
        test_gen = IFCBTestSamples(path_dir=train_samples_path, test_prevalences=None, samples=test, classes=classes)
    else:
        # In this case, we use all training samples as the training set and the test samples as the test set
        train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes)
        test_gen = IFCBTestSamples(path_dir=test_samples_path, test_prevalences=test_true_prev)

    # In the case the user wants it, join all the train samples in one LabelledCollection
    if single_sample_train:
        train = LabelledCollection.join(*train_gen())
        return train, test_gen
    else:
        return train_gen, test_gen
|