QuaPy/quapy/data/datasets.py

826 lines
38 KiB
Python
Raw Normal View History

def warn(*args, **kwargs):
pass
import warnings
warnings.warn = warn
import os
2021-01-15 18:32:32 +01:00
import zipfile
from os.path import join
import pandas as pd
2023-11-08 10:00:25 +01:00
from ucimlrepo import fetch_ucirepo
2021-03-19 17:34:09 +01:00
from quapy.data.base import Dataset, LabelledCollection
2021-01-15 18:32:32 +01:00
from quapy.data.preprocessing import text2tfidf, reduce_columns
from quapy.data.reader import *
from quapy.util import download_file_if_not_exists, download_file, get_quapy_home, pickled_resource
REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb']
TWITTER_SENTIMENT_DATASETS_TEST = ['gasp', 'hcr', 'omd', 'sanders',
'semeval13', 'semeval14', 'semeval15', 'semeval16',
'sst', 'wa', 'wb']
TWITTER_SENTIMENT_DATASETS_TRAIN = ['gasp', 'hcr', 'omd', 'sanders',
'semeval', 'semeval16',
'sst', 'wa', 'wb']
2024-02-07 18:31:34 +01:00
UCI_BINARY_DATASETS = ['acute.a', 'acute.b',
'balance.1', 'balance.2', 'balance.3',
'breast-cancer',
'cmc.1', 'cmc.2', 'cmc.3',
'ctg.1', 'ctg.2', 'ctg.3',
2024-02-07 18:31:34 +01:00
#'diabetes', # <-- I haven't found this one...
'german',
'haberman',
'ionosphere',
'iris.1', 'iris.2', 'iris.3',
'mammographic',
2024-02-07 18:31:34 +01:00
'pageblocks.5',
#'phoneme', # <-- I haven't found this one...
'semeion',
'sonar',
'spambase',
'spectf',
'tictactoe',
'transfusion',
'wdbc',
'wine.1', 'wine.2', 'wine.3',
'wine-q-red', 'wine-q-white',
'yeast']
2023-10-17 18:24:33 +02:00
UCI_MULTICLASS_DATASETS = ['dry-bean',
'wine-quality',
'academic-success',
'digits',
'letter']
2022-06-01 18:28:59 +02:00
LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B']
_TXA_SAMPLE_SIZE = 250
_TXB_SAMPLE_SIZE = 1000
LEQUA2022_SAMPLE_SIZE = {
'TXA': _TXA_SAMPLE_SIZE,
'TXB': _TXB_SAMPLE_SIZE,
'T1A': _TXA_SAMPLE_SIZE,
'T1B': _TXB_SAMPLE_SIZE,
'T2A': _TXA_SAMPLE_SIZE,
'T2B': _TXB_SAMPLE_SIZE,
'binary': _TXA_SAMPLE_SIZE,
'multiclass': _TXB_SAMPLE_SIZE
}
2021-05-05 17:12:44 +02:00
def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset:
"""
2021-12-06 18:25:47 +01:00
Loads a Reviews dataset as a Dataset instance, as used in
`Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification."
Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018. <https://dl.acm.org/doi/abs/10.1145/3269206.3269287>`_.
The list of valid dataset names can be accessed in `quapy.data.datasets.REVIEWS_SENTIMENT_DATASETS`
:param dataset_name: the name of the dataset: valid ones are 'hp', 'kindle', 'imdb'
:param tfidf: set to True to transform the raw documents into tfidf weighted matrices
:param min_df: minimun number of documents that should contain a term in order for the term to be
2021-12-06 18:25:47 +01:00
kept (ignored if tfidf==False)
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
2021-12-06 18:25:47 +01:00
~/quay_data/ directory)
:param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
2021-12-06 18:25:47 +01:00
faster subsequent invokations
:return: a :class:`quapy.data.base.Dataset` instance
"""
assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \
f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \
f'Valid ones are {REVIEWS_SENTIMENT_DATASETS}'
if data_home is None:
data_home = get_quapy_home()
URL_TRAIN = f'https://zenodo.org/record/4117827/files/{dataset_name}_train.txt'
URL_TEST = f'https://zenodo.org/record/4117827/files/{dataset_name}_test.txt'
os.makedirs(join(data_home, 'reviews'), exist_ok=True)
train_path = join(data_home, 'reviews', dataset_name, 'train.txt')
test_path = join(data_home, 'reviews', dataset_name, 'test.txt')
download_file_if_not_exists(URL_TRAIN, train_path)
download_file_if_not_exists(URL_TEST, test_path)
pickle_path = None
if pickle:
pickle_path = join(data_home, 'reviews', 'pickle', f'{dataset_name}.pkl')
data = pickled_resource(pickle_path, Dataset.load, train_path, test_path, from_text)
if tfidf:
text2tfidf(data, inplace=True)
if min_df is not None:
reduce_columns(data, min_df=min_df, inplace=True)
2021-01-11 18:31:12 +01:00
data.name = dataset_name
return data
2021-05-05 17:12:44 +02:00
def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False) -> Dataset:
"""
2021-12-06 18:25:47 +01:00
Loads a Twitter dataset as a :class:`quapy.data.base.Dataset` instance, as used in:
`Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
Social Network Analysis and Mining6(19), 122 (2016) <https://link.springer.com/content/pdf/10.1007/s13278-016-0327-z.pdf>`_
Note that the datasets 'semeval13', 'semeval14', 'semeval15' share the same training set.
The list of valid dataset names corresponding to training sets can be accessed in
`quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN`, while the test sets can be accessed in
`quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TEST`
:param dataset_name: the name of the dataset: valid ones are 'gasp', 'hcr', 'omd', 'sanders', 'semeval13',
2021-12-06 18:25:47 +01:00
'semeval14', 'semeval15', 'semeval16', 'sst', 'wa', 'wb'
:param for_model_selection: if True, then returns the train split as the training set and the devel split
2021-12-06 18:25:47 +01:00
as the test set; if False, then returns the train+devel split as the training set and the test set as the
test set
:param min_df: minimun number of documents that should contain a term in order for the term to be kept
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
2021-12-06 18:25:47 +01:00
~/quay_data/ directory)
:param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
2021-12-06 18:25:47 +01:00
faster subsequent invokations
:return: a :class:`quapy.data.base.Dataset` instance
"""
assert dataset_name in TWITTER_SENTIMENT_DATASETS_TRAIN + TWITTER_SENTIMENT_DATASETS_TEST, \
f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \
f'Valid ones are {TWITTER_SENTIMENT_DATASETS_TRAIN} for model selection and ' \
f'{TWITTER_SENTIMENT_DATASETS_TEST} for test (datasets "semeval14", "semeval15", "semeval16" share ' \
f'a common training set "semeval")'
if data_home is None:
data_home = get_quapy_home()
URL = 'https://zenodo.org/record/4255764/files/tweet_sentiment_quantification_snam.zip'
unzipped_path = join(data_home, 'tweet_sentiment_quantification_snam')
if not os.path.exists(unzipped_path):
downloaded_path = join(data_home, 'tweet_sentiment_quantification_snam.zip')
download_file(URL, downloaded_path)
with zipfile.ZipFile(downloaded_path) as file:
file.extractall(data_home)
os.remove(downloaded_path)
if dataset_name in {'semeval13', 'semeval14', 'semeval15'}:
trainset_name = 'semeval'
testset_name = 'semeval' if for_model_selection else dataset_name
print(f"the training and development sets for datasets 'semeval13', 'semeval14', 'semeval15' are common "
f"(called 'semeval'); returning trainin-set='{trainset_name}' and test-set={testset_name}")
else:
if dataset_name == 'semeval' and for_model_selection==False:
raise ValueError('dataset "semeval" can only be used for model selection. '
'Use "semeval13", "semeval14", or "semeval15" for model evaluation.')
trainset_name = testset_name = dataset_name
if for_model_selection:
train = join(unzipped_path, 'train', f'{trainset_name}.train.feature.txt')
test = join(unzipped_path, 'test', f'{testset_name}.dev.feature.txt')
else:
train = join(unzipped_path, 'train', f'{trainset_name}.train+dev.feature.txt')
if dataset_name == 'semeval16': # there is a different test name in the case of semeval16 only
test = join(unzipped_path, 'test', f'{testset_name}.dev-test.feature.txt')
else:
test = join(unzipped_path, 'test', f'{testset_name}.test.feature.txt')
pickle_path = None
if pickle:
mode = "train-dev" if for_model_selection else "train+dev-test"
pickle_path = join(unzipped_path, 'pickle', f'{testset_name}.{mode}.pkl')
data = pickled_resource(pickle_path, Dataset.load, train, test, from_sparse)
if min_df is not None:
reduce_columns(data, min_df=min_df, inplace=True)
2021-01-11 18:31:12 +01:00
data.name = dataset_name
return data
2024-02-07 18:31:34 +01:00
def fetch_UCIBinaryDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
2021-12-06 18:25:47 +01:00
"""
Loads a UCI dataset as an instance of :class:`quapy.data.base.Dataset`, as used in
`Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
and
`Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
Dynamic ensemble selection for quantification tasks.
Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
The datasets do not come with a predefined train-test split (see :meth:`fetch_UCILabelledCollection` for further
information on how to use these collections), and so a train-test split is generated at desired proportion.
The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_DATASETS`
:param dataset_name: a dataset name
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)
:param test_split: proportion of documents to be included in the test set. The rest conforms the training set
:param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
:return: a :class:`quapy.data.base.Dataset` instance
"""
2024-02-07 18:31:34 +01:00
data = fetch_UCIBinaryLabelledCollection(dataset_name, data_home, verbose)
return Dataset(*data.split_stratified(1 - test_split, random_state=0))
2024-02-07 18:31:34 +01:00
def fetch_UCIBinaryLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
2021-12-06 18:25:47 +01:00
"""
Loads a UCI collection as an instance of :class:`quapy.data.base.LabelledCollection`, as used in
`Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
and
`Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
Dynamic ensemble selection for quantification tasks.
Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
The datasets do not come with a predefined train-test split, and so Pérez-Gállego et al. adopted a 5FCVx2 evaluation
protocol, meaning that each collection was used to generate two rounds (hence the x2) of 5 fold cross validation.
This can be reproduced by using :meth:`quapy.data.base.Dataset.kFCV`, e.g.:
>>> import quapy as qp
2024-02-07 18:31:34 +01:00
>>> collection = qp.datasets.fetch_UCIBinaryLabelledCollection("yeast")
>>> for data in qp.train.Dataset.kFCV(collection, nfolds=5, nrepeats=2):
2021-12-06 18:25:47 +01:00
>>> ...
The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_DATASETS`
:param dataset_name: a dataset name
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)
:param test_split: proportion of documents to be included in the test set. The rest conforms the training set
:param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
:return: a :class:`quapy.data.base.LabelledCollection` instance
2021-12-06 18:25:47 +01:00
"""
2024-02-07 18:31:34 +01:00
assert dataset_name in UCI_BINARY_DATASETS, \
f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
2024-02-07 18:31:34 +01:00
f'Valid ones are {UCI_BINARY_DATASETS}'
if data_home is None:
data_home = get_quapy_home()
dataset_fullname = {
'acute.a': 'Acute Inflammations (urinary bladder)',
'acute.b': 'Acute Inflammations (renal pelvis)',
'balance.1': 'Balance Scale Weight & Distance Database (left)',
'balance.2': 'Balance Scale Weight & Distance Database (balanced)',
'balance.3': 'Balance Scale Weight & Distance Database (right)',
2021-01-11 12:55:06 +01:00
'breast-cancer': 'Breast Cancer Wisconsin (Original)',
'cmc.1': 'Contraceptive Method Choice (no use)',
'cmc.2': 'Contraceptive Method Choice (long term)',
'cmc.3': 'Contraceptive Method Choice (short term)',
'ctg.1': 'Cardiotocography Data Set (normal)',
'ctg.2': 'Cardiotocography Data Set (suspect)',
'ctg.3': 'Cardiotocography Data Set (pathologic)',
'german': 'Statlog German Credit Data',
2021-01-25 18:38:56 +01:00
'haberman': "Haberman's Survival Data",
'ionosphere': 'Johns Hopkins University Ionosphere DB',
'iris.1': 'Iris Plants Database(x)',
'iris.2': 'Iris Plants Database(versicolour)',
'iris.3': 'Iris Plants Database(virginica)',
'mammographic': 'Mammographic Mass',
'pageblocks.5': 'Page Blocks Classification (5)',
'semeion': 'Semeion Handwritten Digit (8)',
'sonar': 'Sonar, Mines vs. Rocks',
'spambase': 'Spambase Data Set',
'spectf': 'SPECTF Heart Data',
'tictactoe': 'Tic-Tac-Toe Endgame Database',
'transfusion': 'Blood Transfusion Service Center Data Set',
'wdbc': 'Wisconsin Diagnostic Breast Cancer',
'wine.1': 'Wine Recognition Data (1)',
'wine.2': 'Wine Recognition Data (2)',
'wine.3': 'Wine Recognition Data (3)',
'wine-q-red': 'Wine Quality Red (6-10)',
'wine-q-white': 'Wine Quality White (6-10)',
'yeast': 'Yeast',
}
# the identifier is an alias for the dataset group, it's part of the url data-folder, and is the name we use
# to download the raw dataset
identifier_map = {
'acute.a': 'acute',
'acute.b': 'acute',
'balance.1': 'balance-scale',
'balance.2': 'balance-scale',
'balance.3': 'balance-scale',
'breast-cancer': 'breast-cancer-wisconsin',
'cmc.1': 'cmc',
'cmc.2': 'cmc',
'cmc.3': 'cmc',
'ctg.1': '00193',
'ctg.2': '00193',
'ctg.3': '00193',
2021-01-25 18:38:56 +01:00
'german': 'statlog/german',
'haberman': 'haberman',
'ionosphere': 'ionosphere',
'iris.1': 'iris',
'iris.2': 'iris',
'iris.3': 'iris',
'mammographic': 'mammographic-masses',
'pageblocks.5': 'page-blocks',
'semeion': 'semeion',
'sonar': 'undocumented/connectionist-bench/sonar',
'spambase': 'spambase',
'spectf': 'spect',
'tictactoe': 'tic-tac-toe',
'transfusion': 'blood-transfusion',
'wdbc': 'breast-cancer-wisconsin',
'wine-q-red': 'wine-quality',
'wine-q-white': 'wine-quality',
'wine.1': 'wine',
'wine.2': 'wine',
'wine.3': 'wine',
'yeast': 'yeast',
}
# the filename is the name of the file within the data_folder indexed by the identifier
file_name = {
'acute': 'diagnosis.data',
'00193': 'CTG.xls',
2021-01-25 18:38:56 +01:00
'statlog/german': 'german.data-numeric',
'mammographic-masses': 'mammographic_masses.data',
'page-blocks': 'page-blocks.data.Z',
'undocumented/connectionist-bench/sonar': 'sonar.all-data',
'spect': ['SPECTF.train', 'SPECTF.test'],
'blood-transfusion': 'transfusion.data',
'wine-quality': ['winequality-red.csv', 'winequality-white.csv'],
'breast-cancer-wisconsin': 'breast-cancer-wisconsin.data' if dataset_name=='breast-cancer' else 'wdbc.data'
}
# the filename containing the dataset description (if any)
desc_name = {
'acute': 'diagnosis.names',
'00193': None,
2021-01-25 18:38:56 +01:00
'statlog/german': 'german.doc',
'mammographic-masses': 'mammographic_masses.names',
'undocumented/connectionist-bench/sonar': 'sonar.names',
'spect': 'SPECTF.names',
'blood-transfusion': 'transfusion.names',
'wine-quality': 'winequality.names',
'breast-cancer-wisconsin': 'breast-cancer-wisconsin.names' if dataset_name == 'breast-cancer' else 'wdbc.names'
}
identifier = identifier_map[dataset_name]
2021-01-25 18:38:56 +01:00
filename = file_name.get(identifier, f'{identifier}.data')
descfile = desc_name.get(identifier, f'{identifier}.names')
fullname = dataset_fullname[dataset_name]
URL = f'http://archive.ics.uci.edu/ml/machine-learning-databases/{identifier}'
data_dir = join(data_home, 'uci_datasets', identifier)
if isinstance(filename, str): # filename could be a list of files, in which case it will be processed later
data_path = join(data_dir, filename)
download_file_if_not_exists(f'{URL}/{filename}', data_path)
if descfile:
2021-01-25 18:38:56 +01:00
try:
download_file_if_not_exists(f'{URL}/{descfile}', f'{data_dir}/{descfile}')
if verbose:
print(open(f'{data_dir}/{descfile}', 'rt').read())
except Exception:
print('could not read the description file')
elif verbose:
print('no file description available')
if verbose:
print(f'Loading {dataset_name} ({fullname})')
if identifier == 'acute':
df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t')
df[0] = df[0].apply(lambda x: float(x.replace(',', '.'))).astype(float, copy=False)
2021-12-06 18:25:47 +01:00
[_df_replace(df, col) for col in range(1, 6)]
X = df.loc[:, 0:5].values
if dataset_name == 'acute.a':
y = binarize(df[6], pos_class='yes')
elif dataset_name == 'acute.b':
y = binarize(df[7], pos_class='yes')
if identifier == 'balance-scale':
df = pd.read_csv(data_path, header=None, sep=',')
if dataset_name == 'balance.1':
y = binarize(df[0], pos_class='L')
elif dataset_name == 'balance.2':
y = binarize(df[0], pos_class='B')
elif dataset_name == 'balance.3':
y = binarize(df[0], pos_class='R')
X = df.loc[:, 1:].astype(float).values
if identifier == 'breast-cancer-wisconsin' and dataset_name=='breast-cancer':
df = pd.read_csv(data_path, header=None, sep=',')
2021-01-11 12:55:06 +01:00
Xy = df.loc[:, 1:10]
Xy[Xy=='?']=np.nan
Xy = Xy.dropna(axis=0)
X = Xy.loc[:, 1:9]
X = X.astype(float).values
y = binarize(Xy[10], pos_class=2)
if identifier == 'breast-cancer-wisconsin' and dataset_name=='wdbc':
df = pd.read_csv(data_path, header=None, sep=',')
X = df.loc[:, 2:32].astype(float).values
y = df[1].values
y = binarize(y, pos_class='M')
2021-01-11 12:55:06 +01:00
if identifier == 'cmc':
df = pd.read_csv(data_path, header=None, sep=',')
2021-01-11 12:55:06 +01:00
X = df.loc[:, 0:8].astype(float).values
y = df[9].astype(int).values
if dataset_name == 'cmc.1':
y = binarize(y, pos_class=1)
elif dataset_name == 'cmc.2':
y = binarize(y, pos_class=2)
elif dataset_name == 'cmc.3':
y = binarize(y, pos_class=3)
if identifier == '00193':
df = pd.read_excel(data_path, sheet_name='Data', skipfooter=3)
df = df[list(range(1,24))] # select columns numbered (number 23 is the target label)
# replaces the header with the first row
new_header = df.iloc[0] # grab the first row for the header
df = df[1:] # take the data less the header row
df.columns = new_header # set the header row as the df header
X = df.iloc[:, 0:22].astype(float).values
y = df['NSP'].astype(int).values
2021-01-25 18:38:56 +01:00
if dataset_name == 'ctg.1':
y = binarize(y, pos_class=1) # 1==Normal
elif dataset_name == 'ctg.2':
2021-01-25 18:38:56 +01:00
y = binarize(y, pos_class=2) # 2==Suspect
elif dataset_name == 'ctg.3':
2021-01-25 18:38:56 +01:00
y = binarize(y, pos_class=3) # 3==Pathologic
if identifier == 'statlog/german':
df = pd.read_csv(data_path, header=None, delim_whitespace=True)
X = df.iloc[:, 0:24].astype(float).values
y = df[24].astype(int).values
y = binarize(y, pos_class=1)
2021-01-25 18:38:56 +01:00
if identifier == 'haberman':
df = pd.read_csv(data_path, header=None)
X = df.iloc[:, 0:3].astype(float).values
y = df[3].astype(int).values
y = binarize(y, pos_class=2)
if identifier == 'ionosphere':
df = pd.read_csv(data_path, header=None)
X = df.iloc[:, 0:34].astype(float).values
y = df[34].values
y = binarize(y, pos_class='b')
if identifier == 'iris':
df = pd.read_csv(data_path, header=None)
X = df.iloc[:, 0:4].astype(float).values
y = df[4].values
if dataset_name == 'iris.1':
y = binarize(y, pos_class='Iris-setosa') # 1==Setosa
elif dataset_name == 'iris.2':
y = binarize(y, pos_class='Iris-versicolor') # 2==Versicolor
elif dataset_name == 'iris.3':
y = binarize(y, pos_class='Iris-virginica') # 3==Virginica
if identifier == 'mammographic-masses':
df = pd.read_csv(data_path, header=None, sep=',')
df[df == '?'] = np.nan
Xy = df.dropna(axis=0)
2021-01-25 18:38:56 +01:00
X = Xy.iloc[:, 0:5]
X = X.astype(float).values
y = binarize(Xy.iloc[:,5], pos_class=1)
if identifier == 'page-blocks':
data_path_ = data_path.replace('.Z', '')
if not os.path.exists(data_path_):
raise FileNotFoundError(f'Warning: file {data_path_} does not exist. If this is the first time you '
f'attempt to load this dataset, then you have to manually unzip the {data_path} '
f'and name the extracted file {data_path_} (unfortunately, neither zipfile, nor '
f'gzip can handle unix compressed files automatically -- there is a repo in GitHub '
f'https://github.com/umeat/unlzw where the problem seems to be solved anyway).')
df = pd.read_csv(data_path_, header=None, delim_whitespace=True)
X = df.iloc[:, 0:10].astype(float).values
y = df[10].values
y = binarize(y, pos_class=5) # 5==block "graphic"
if identifier == 'semeion':
df = pd.read_csv(data_path, header=None, delim_whitespace=True )
X = df.iloc[:, 0:256].astype(float).values
y = df[263].values # 263 stands for digit 8 (labels are one-hot vectors from col 256-266)
y = binarize(y, pos_class=1)
if identifier == 'undocumented/connectionist-bench/sonar':
df = pd.read_csv(data_path, header=None, sep=',')
X = df.iloc[:, 0:60].astype(float).values
y = df[60].values
2021-01-25 18:38:56 +01:00
y = binarize(y, pos_class='R')
if identifier == 'spambase':
df = pd.read_csv(data_path, header=None, sep=',')
X = df.iloc[:, 0:57].astype(float).values
y = df[57].values
y = binarize(y, pos_class=1)
if identifier == 'spect':
dfs = []
for file in filename:
data_path = join(data_dir, file)
download_file_if_not_exists(f'{URL}/{file}', data_path)
dfs.append(pd.read_csv(data_path, header=None, sep=','))
df = pd.concat(dfs)
X = df.iloc[:, 1:45].astype(float).values
y = df[0].values
y = binarize(y, pos_class=0)
if identifier == 'tic-tac-toe':
df = pd.read_csv(data_path, header=None, sep=',')
X = df.iloc[:, 0:9].replace('o',0).replace('b',1).replace('x',2).values
y = df[9].values
y = binarize(y, pos_class='negative')
if identifier == 'blood-transfusion':
df = pd.read_csv(data_path, sep=',')
X = df.iloc[:, 0:4].astype(float).values
y = df.iloc[:, 4].values
y = binarize(y, pos_class=1)
2021-01-25 18:38:56 +01:00
if identifier == 'wine':
df = pd.read_csv(data_path, header=None, sep=',')
X = df.iloc[:, 1:14].astype(float).values
y = df[0].values
if dataset_name == 'wine.1':
y = binarize(y, pos_class=1)
elif dataset_name == 'wine.2':
y = binarize(y, pos_class=2)
elif dataset_name == 'wine.3':
y = binarize(y, pos_class=3)
if identifier == 'wine-quality':
filename = filename[0] if dataset_name=='wine-q-red' else filename[1]
data_path = join(data_dir, filename)
download_file_if_not_exists(f'{URL}/{filename}', data_path)
df = pd.read_csv(data_path, sep=';')
X = df.iloc[:, 0:11].astype(float).values
y = df.iloc[:, 11].values > 5
if identifier == 'yeast':
df = pd.read_csv(data_path, header=None, delim_whitespace=True)
X = df.iloc[:, 1:9].astype(float).values
y = df.iloc[:, 9].values
y = binarize(y, pos_class='NUC')
data = LabelledCollection(X, y)
if verbose:
data.stats()
return data
2023-10-17 18:24:33 +02:00
def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
"""
2023-10-18 14:12:40 +02:00
Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`.
The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria:
2023-10-18 17:50:46 +02:00
- It has more than 1000 instances
- It is suited for classification
- It has more than two classes
- It is available for Python import (requires ucimlrepo package)
>>> import quapy as qp
>>> dataset = qp.datasets.fetch_UCIMulticlassDataset("dry-bean")
>>> train, test = dataset.train_test
>>> ...
2023-10-18 14:12:40 +02:00
2023-10-17 18:44:02 +02:00
The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`
2023-10-17 18:24:33 +02:00
2023-10-18 17:50:46 +02:00
The datasets are downloaded only once and pickled into disk, saving time for consecutive calls.
2023-10-17 18:24:33 +02:00
:param dataset_name: a dataset name
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)
:param test_split: proportion of documents to be included in the test set. The rest conforms the training set
2023-10-18 14:12:40 +02:00
:param verbose: set to True (default is False) to get information (stats) about the dataset
2023-10-17 18:24:33 +02:00
:return: a :class:`quapy.data.base.Dataset` instance
"""
data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose)
return Dataset(*data.split_stratified(1 - test_split, random_state=0))
2023-10-18 17:50:46 +02:00
2023-10-17 18:24:33 +02:00
def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
"""
2023-10-18 17:50:46 +02:00
Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`.
2023-10-17 18:24:33 +02:00
2023-10-18 17:50:46 +02:00
The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria:
- It has more than 1000 instances
- It is suited for classification
- It has more than two classes
- It is available for Python import (requires ucimlrepo package)
2023-10-17 18:24:33 +02:00
>>> import quapy as qp
2023-10-18 17:50:46 +02:00
>>> collection = qp.datasets.fetch_UCIMulticlassLabelledCollection("dry-bean")
>>> X, y = collection.Xy
2023-10-17 18:24:33 +02:00
>>> ...
The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`
2023-10-18 14:12:40 +02:00
The datasets are downloaded only once and pickled into disk, saving time for consecutive calls.
2023-10-17 18:24:33 +02:00
:param dataset_name: a dataset name
2023-10-18 14:12:40 +02:00
:param data_home: specify the quapy home directory where the dataset will be dumped (leave empty to use the default
2023-10-17 18:24:33 +02:00
~/quay_data/ directory)
:param test_split: proportion of documents to be included in the test set. The rest conforms the training set
2023-10-18 14:12:40 +02:00
:param verbose: set to True (default is False) to get information (stats) about the dataset
2023-10-17 18:24:33 +02:00
:return: a :class:`quapy.data.base.LabelledCollection` instance
"""
assert dataset_name in UCI_MULTICLASS_DATASETS, \
2023-10-18 17:50:46 +02:00
f'Name {dataset_name} does not match any known dataset from the ' \
f'UCI Machine Learning datasets repository (multiclass). ' \
2023-10-17 18:24:33 +02:00
f'Valid ones are {UCI_MULTICLASS_DATASETS}'
if data_home is None:
data_home = get_quapy_home()
2023-10-18 17:50:46 +02:00
identifiers = {
"dry-bean": 602,
"wine-quality": 186,
"academic-success": 697,
"digits": 80,
"letter": 59
}
2023-10-17 18:24:33 +02:00
2023-10-18 17:50:46 +02:00
full_names = {
"dry-bean": "Dry Bean Dataset",
"wine-quality": "Wine Quality",
"academic-success": "Predict students' dropout and academic success",
"digits": "Optical Recognition of Handwritten Digits",
"letter": "Letter Recognition"
2023-10-17 18:24:33 +02:00
}
identifier = identifiers[dataset_name]
fullname = full_names[dataset_name]
2023-10-18 17:50:46 +02:00
if verbose:
print(f'Loading UCI Muticlass {dataset_name} ({fullname})')
2023-10-17 18:24:33 +02:00
2023-10-18 17:50:46 +02:00
file = join(data_home, 'uci_multiclass', dataset_name+'.pkl')
2023-10-18 14:12:40 +02:00
def download(id):
data = fetch_ucirepo(id=id)
2023-10-17 18:24:33 +02:00
X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze()
2023-10-18 14:12:40 +02:00
classes = np.sort(np.unique(y))
y = np.searchsorted(classes, y)
2023-10-18 17:50:46 +02:00
return LabelledCollection(X, y)
2023-10-17 18:24:33 +02:00
2023-10-18 14:12:40 +02:00
data = pickled_resource(file, download, identifier)
2023-10-17 18:24:33 +02:00
2023-10-18 14:12:40 +02:00
if verbose:
data.stats()
2023-10-18 17:50:46 +02:00
2023-10-17 18:24:33 +02:00
return data
2021-12-06 18:25:47 +01:00
def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
2022-06-01 18:28:59 +02:00
df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)
def fetch_lequa2022(task, data_home=None):
"""
Loads the official datasets provided for the `LeQua <https://lequa2022.github.io/index>`_ competition.
In brief, there are 4 tasks (T1A, T1B, T2A, T2B) having to do with text quantification
problems. Tasks T1A and T1B provide documents in vector form, while T2A and T2B provide raw documents instead.
Tasks T1A and T2A are binary sentiment quantification problems, while T2A and T2B are multiclass quantification
problems consisting of estimating the class prevalence values of 28 different merchandise products.
We refer to the `Esuli, A., Moreo, A., Sebastiani, F., & Sperduti, G. (2022).
A Detailed Overview of LeQua@ CLEF 2022: Learning to Quantify.
<https://ceur-ws.org/Vol-3180/paper-146.pdf>`_ for a detailed description
on the tasks and datasets.
The datasets are downloaded only once, and stored for fast reuse.
See `lequa2022_experiments.py` provided in the example folder, that can serve as a guide on how to use these
datasets.
:param task: a string representing the task name; valid ones are T1A, T1B, T2A, and T2B
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)
:return: a tuple `(train, val_gen, test_gen)` where `train` is an instance of
:class:`quapy.data.base.LabelledCollection`, `val_gen` and `test_gen` are instances of
2023-10-30 09:47:01 +01:00
:class:`quapy.data._lequa2022.SamplesFromDir`, a subclass of :class:`quapy.protocol.AbstractProtocol`,
that return a series of samples stored in a directory which are labelled by prevalence.
2022-06-01 18:28:59 +02:00
"""
2022-06-01 18:28:59 +02:00
from quapy.data._lequa2022 import load_raw_documents, load_vector_documents, SamplesFromDir
assert task in LEQUA2022_TASKS, \
f'Unknown task {task}. Valid ones are {LEQUA2022_TASKS}'
if data_home is None:
data_home = get_quapy_home()
URL_TRAINDEV=f'https://zenodo.org/record/6546188/files/{task}.train_dev.zip'
URL_TEST=f'https://zenodo.org/record/6546188/files/{task}.test.zip'
URL_TEST_PREV=f'https://zenodo.org/record/6546188/files/{task}.test_prevalences.zip'
lequa_dir = join(data_home, 'lequa2022')
os.makedirs(lequa_dir, exist_ok=True)
def download_unzip_and_remove(unzipped_path, url):
tmp_path = join(lequa_dir, task + '_tmp.zip')
download_file_if_not_exists(url, tmp_path)
with zipfile.ZipFile(tmp_path) as file:
file.extractall(unzipped_path)
os.remove(tmp_path)
if not os.path.exists(join(lequa_dir, task)):
download_unzip_and_remove(lequa_dir, URL_TRAINDEV)
download_unzip_and_remove(lequa_dir, URL_TEST)
download_unzip_and_remove(lequa_dir, URL_TEST_PREV)
if task in ['T1A', 'T1B']:
load_fn = load_vector_documents
elif task in ['T2A', 'T2B']:
load_fn = load_raw_documents
tr_path = join(lequa_dir, task, 'public', 'training_data.txt')
train = LabelledCollection.load(tr_path, loader_func=load_fn)
val_samples_path = join(lequa_dir, task, 'public', 'dev_samples')
val_true_prev_path = join(lequa_dir, task, 'public', 'dev_prevalences.txt')
val_gen = SamplesFromDir(val_samples_path, val_true_prev_path, load_fn=load_fn)
test_samples_path = join(lequa_dir, task, 'public', 'test_samples')
2022-06-01 18:28:59 +02:00
test_true_prev_path = join(lequa_dir, task, 'public', 'test_prevalences.txt')
2022-06-15 14:36:02 +02:00
test_gen = SamplesFromDir(test_samples_path, test_true_prev_path, load_fn=load_fn)
2022-06-01 18:28:59 +02:00
return train, val_gen, test_gen
2024-02-08 14:33:22 +01:00
2024-02-07 18:45:42 +01:00
def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=None):
2023-11-08 11:07:47 +01:00
"""
2024-02-08 14:33:22 +01:00
Loads the IFCB dataset for quantification from `Zenodo <https://zenodo.org/records/10036244>`_ (for more
information on this dataset, please follow the zenodo link).
This dataset is based on the data available publicly at
`WHOI-Plankton repo <https://github.com/hsosik/WHOI-Plankton>`_.
The scripts for the processing are available at `P. González's repo <https://github.com/pglez82/IFCB_Zenodo>`_.
2023-11-08 11:07:47 +01:00
Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms.
The datasets are downloaded only once, and stored for fast reuse.
2024-02-07 18:45:42 +01:00
:param single_sample_train: a boolean. If true, it will return the train dataset as a
2023-11-08 11:07:47 +01:00
:class:`quapy.data.base.LabelledCollection` (all examples together).
2024-02-07 18:45:42 +01:00
If false, a generator of training samples will be returned. Each example in the training set has an individual label.
:param for_model_selection: if True, then returns a split 30% of the training set (86 out of 286 samples) to be used for model selection;
if False, then returns the full training set as training set and the test set as the test set
2023-11-08 11:07:47 +01:00
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)
:return: a tuple `(train, test_gen)` where `train` is an instance of
2024-02-07 18:45:42 +01:00
:class:`quapy.data.base.LabelledCollection`, if `single_sample_train` is true or
:class:`quapy.data._ifcb.IFCBTrainSamplesFromDir`, i.e. a sampling protocol that returns a series of samples
labelled example by example. test_gen will be a :class:`quapy.data._ifcb.IFCBTestSamples`,
2023-11-08 11:07:47 +01:00
i.e., a sampling protocol that returns a series of samples labelled by prevalence.
"""
2024-02-07 18:45:42 +01:00
from quapy.data._ifcb import IFCBTrainSamplesFromDir, IFCBTestSamples, get_sample_list, generate_modelselection_split
2023-11-08 11:07:47 +01:00
if data_home is None:
data_home = get_quapy_home()
URL_TRAIN=f'https://zenodo.org/records/10036244/files/IFCB.train.zip'
URL_TEST=f'https://zenodo.org/records/10036244/files/IFCB.test.zip'
URL_TEST_PREV=f'https://zenodo.org/records/10036244/files/IFCB.test_prevalences.zip'
ifcb_dir = join(data_home, 'ifcb')
os.makedirs(ifcb_dir, exist_ok=True)
def download_unzip_and_remove(unzipped_path, url):
tmp_path = join(ifcb_dir, 'ifcb_tmp.zip')
download_file_if_not_exists(url, tmp_path)
with zipfile.ZipFile(tmp_path) as file:
file.extractall(unzipped_path)
os.remove(tmp_path)
if not os.path.exists(os.path.join(ifcb_dir,'train')):
download_unzip_and_remove(ifcb_dir, URL_TRAIN)
if not os.path.exists(os.path.join(ifcb_dir,'test')):
download_unzip_and_remove(ifcb_dir, URL_TEST)
if not os.path.exists(os.path.join(ifcb_dir,'test_prevalences.csv')):
download_unzip_and_remove(ifcb_dir, URL_TEST_PREV)
# Load test prevalences and classes
test_true_prev_path = join(ifcb_dir, 'test_prevalences.csv')
test_true_prev = pd.read_csv(test_true_prev_path)
classes = test_true_prev.columns[1:]
2024-02-07 18:45:42 +01:00
#Load train and test samples
2023-11-08 11:07:47 +01:00
train_samples_path = join(ifcb_dir,'train')
test_samples_path = join(ifcb_dir,'test')
2024-02-07 18:45:42 +01:00
if for_model_selection:
# In this case, return 70% of training data as the training set and 30% as the test set
samples = get_sample_list(train_samples_path)
train, test = generate_modelselection_split(samples, split=0.3)
train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes, samples=train)
# Test prevalence is computed from class labels
test_gen = IFCBTestSamples(path_dir=train_samples_path, test_prevalences=None, samples=test, classes=classes)
else:
# In this case, we use all training samples as the training set and the test samples as the test set
train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes)
test_gen = IFCBTestSamples(path_dir=test_samples_path, test_prevalences=test_true_prev)
2023-11-08 11:07:47 +01:00
# In the case the user wants it, join all the train samples in one LabelledCollection
if single_sample_train:
2024-02-07 18:45:42 +01:00
X, y = [], []
2023-11-08 11:07:47 +01:00
for X_, y_ in train_gen():
X.append(X_)
y.append(y_)
X = np.vstack(X)
y = np.concatenate(y)
2024-02-07 18:45:42 +01:00
train = LabelledCollection(X, y, classes = classes)
2023-11-08 11:07:47 +01:00
return train, test_gen
2024-02-07 18:45:42 +01:00
2023-11-08 11:07:47 +01:00
else:
return train_gen, test_gen