Added UCI multiclass datasets; added filter for min instances per class to UCI multiclass datasets
This commit is contained in:
parent
75af15ae4a
commit
1a7a658191
|
@ -14,41 +14,76 @@ from quapy.util import download_file_if_not_exists, download_file, get_quapy_hom
|
||||||
|
|
||||||
|
|
||||||
REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb']
|
REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb']
|
||||||
TWITTER_SENTIMENT_DATASETS_TEST = ['gasp', 'hcr', 'omd', 'sanders',
|
TWITTER_SENTIMENT_DATASETS_TEST = [
|
||||||
'semeval13', 'semeval14', 'semeval15', 'semeval16',
|
'gasp', 'hcr', 'omd', 'sanders',
|
||||||
'sst', 'wa', 'wb']
|
'semeval13', 'semeval14', 'semeval15', 'semeval16',
|
||||||
TWITTER_SENTIMENT_DATASETS_TRAIN = ['gasp', 'hcr', 'omd', 'sanders',
|
'sst', 'wa', 'wb',
|
||||||
'semeval', 'semeval16',
|
]
|
||||||
'sst', 'wa', 'wb']
|
TWITTER_SENTIMENT_DATASETS_TRAIN = [
|
||||||
UCI_BINARY_DATASETS = ['acute.a', 'acute.b',
|
'gasp', 'hcr', 'omd', 'sanders',
|
||||||
'balance.1', 'balance.2', 'balance.3',
|
'semeval', 'semeval16',
|
||||||
'breast-cancer',
|
'sst', 'wa', 'wb',
|
||||||
'cmc.1', 'cmc.2', 'cmc.3',
|
]
|
||||||
'ctg.1', 'ctg.2', 'ctg.3',
|
UCI_BINARY_DATASETS = [
|
||||||
#'diabetes', # <-- I haven't found this one...
|
'acute.a', 'acute.b',
|
||||||
'german',
|
'balance.1', 'balance.2', 'balance.3',
|
||||||
'haberman',
|
'breast-cancer',
|
||||||
'ionosphere',
|
'cmc.1', 'cmc.2', 'cmc.3',
|
||||||
'iris.1', 'iris.2', 'iris.3',
|
'ctg.1', 'ctg.2', 'ctg.3',
|
||||||
'mammographic',
|
#'diabetes', # <-- I haven't found this one...
|
||||||
'pageblocks.5',
|
'german',
|
||||||
#'phoneme', # <-- I haven't found this one...
|
'haberman',
|
||||||
'semeion',
|
'ionosphere',
|
||||||
'sonar',
|
'iris.1', 'iris.2', 'iris.3',
|
||||||
'spambase',
|
'mammographic',
|
||||||
'spectf',
|
'pageblocks.5',
|
||||||
'tictactoe',
|
#'phoneme', # <-- I haven't found this one...
|
||||||
'transfusion',
|
'semeion',
|
||||||
'wdbc',
|
'sonar',
|
||||||
'wine.1', 'wine.2', 'wine.3',
|
'spambase',
|
||||||
'wine-q-red', 'wine-q-white',
|
'spectf',
|
||||||
'yeast']
|
'tictactoe',
|
||||||
|
'transfusion',
|
||||||
|
'wdbc',
|
||||||
|
'wine.1', 'wine.2', 'wine.3',
|
||||||
|
'wine-q-red',
|
||||||
|
'wine-q-white',
|
||||||
|
'yeast',
|
||||||
|
]
|
||||||
|
|
||||||
UCI_MULTICLASS_DATASETS = ['dry-bean',
|
UCI_MULTICLASS_DATASETS = [
|
||||||
'wine-quality',
|
'dry-bean',
|
||||||
'academic-success',
|
'wine-quality',
|
||||||
'digits',
|
'academic-success',
|
||||||
'letter']
|
'digits',
|
||||||
|
'letter',
|
||||||
|
'abalone',
|
||||||
|
'obesity',
|
||||||
|
'covertype',
|
||||||
|
'nursery',
|
||||||
|
'diabetes',
|
||||||
|
'yeast',
|
||||||
|
'hand_digits',
|
||||||
|
'satellite',
|
||||||
|
'shuttle',
|
||||||
|
'cmc',
|
||||||
|
'isolet',
|
||||||
|
'waveform.v1',
|
||||||
|
'molecular',
|
||||||
|
'poker_hand',
|
||||||
|
'connect-4',
|
||||||
|
'cardiotocography',
|
||||||
|
'mhr',
|
||||||
|
'chess2',
|
||||||
|
'page_block',
|
||||||
|
'room',
|
||||||
|
'phishing2',
|
||||||
|
'rt-iot22',
|
||||||
|
'support2',
|
||||||
|
'image_seg',
|
||||||
|
'steel_plates',
|
||||||
|
'hcv',
|
||||||
|
]
|
||||||
|
|
||||||
LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B']
|
LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B']
|
||||||
|
|
||||||
|
@ -586,7 +621,7 @@ def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, ver
|
||||||
return Dataset(*data.split_stratified(1 - test_split, random_state=0))
|
return Dataset(*data.split_stratified(1 - test_split, random_state=0))
|
||||||
|
|
||||||
|
|
||||||
def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
|
def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False, min_ipc=100) -> LabelledCollection:
|
||||||
"""
|
"""
|
||||||
Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`.
|
Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`.
|
||||||
|
|
||||||
|
@ -610,6 +645,8 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=
|
||||||
~/quapy_data/ directory)
|
~/quapy_data/ directory)
|
||||||
:param test_split: proportion of documents to be included in the test set. The rest conforms the training set
|
:param test_split: proportion of documents to be included in the test set. The rest conforms the training set
|
||||||
:param verbose: set to True (default is False) to get information (stats) about the dataset
|
:param verbose: set to True (default is False) to get information (stats) about the dataset
|
||||||
|
:param min_ipc: minimum number of instances per class. Classes with fewer instances than min_ipc are discarded
|
||||||
|
(default is 100)
|
||||||
:return: a :class:`quapy.data.base.LabelledCollection` instance
|
:return: a :class:`quapy.data.base.LabelledCollection` instance
|
||||||
"""
|
"""
|
||||||
assert dataset_name in UCI_MULTICLASS_DATASETS, \
|
assert dataset_name in UCI_MULTICLASS_DATASETS, \
|
||||||
|
@ -621,19 +658,71 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=
|
||||||
data_home = get_quapy_home()
|
data_home = get_quapy_home()
|
||||||
|
|
||||||
identifiers = {
|
identifiers = {
|
||||||
"dry-bean": 602,
|
'dry-bean': 602,
|
||||||
"wine-quality": 186,
|
'wine-quality': 186,
|
||||||
"academic-success": 697,
|
'academic-success': 697,
|
||||||
"digits": 80,
|
'digits': 80,
|
||||||
"letter": 59
|
'letter': 59,
|
||||||
|
'abalone': 1,
|
||||||
|
'obesity': 544,
|
||||||
|
'covertype': 31,
|
||||||
|
'nursery': 76,
|
||||||
|
'diabetes': 296,
|
||||||
|
'yeast': 110,
|
||||||
|
'hand_digits': 81,
|
||||||
|
'satellite': 146,
|
||||||
|
'shuttle': 148,
|
||||||
|
'cmc': 30,
|
||||||
|
'isolet': 54,
|
||||||
|
'waveform.v1': 107,
|
||||||
|
'molecular': 69,
|
||||||
|
'poker_hand': 158,
|
||||||
|
'connect-4': 26,
|
||||||
|
'cardiotocography': 193,
|
||||||
|
'mhr': 863,
|
||||||
|
'chess2': 23,
|
||||||
|
'page_block': 78,
|
||||||
|
'room': 864,
|
||||||
|
'phishing2': 379,
|
||||||
|
'rt-iot22': 942,
|
||||||
|
'support2': 880,
|
||||||
|
'image_seg': 147,
|
||||||
|
'steel_plates': 198,
|
||||||
|
'hcv': 503,
|
||||||
}
|
}
|
||||||
|
|
||||||
full_names = {
|
full_names = {
|
||||||
"dry-bean": "Dry Bean Dataset",
|
'dry-bean': 'Dry Bean Dataset',
|
||||||
"wine-quality": "Wine Quality",
|
'wine-quality': 'Wine Quality',
|
||||||
"academic-success": "Predict students' dropout and academic success",
|
'academic-success': 'Predict students\' dropout and academic success',
|
||||||
"digits": "Optical Recognition of Handwritten Digits",
|
'digits': 'Optical Recognition of Handwritten Digits',
|
||||||
"letter": "Letter Recognition"
|
'letter': 'Letter Recognition',
|
||||||
|
'abalone': 'Abalone',
|
||||||
|
'obesity': 'Estimation of Obesity Levels Based On Eating Habits and Physical Condition',
|
||||||
|
'covertype': 'Covertype',
|
||||||
|
'nursery': 'Nursery',
|
||||||
|
'diabetes': 'Diabetes 130-US Hospitals for Years 1999-2008',
|
||||||
|
'yeast': 'Yeast',
|
||||||
|
'hand_digits': 'Pen-Based Recognition of Handwritten Digits',
|
||||||
|
'satellite': 'Statlog Landsat Satellite',
|
||||||
|
'shuttle': 'Statlog Shuttle',
|
||||||
|
'cmc': 'Contraceptive Method Choice',
|
||||||
|
'isolet': 'ISOLET',
|
||||||
|
'waveform.v1': 'Waveform Database Generator (Version 1)',
|
||||||
|
'molecular': 'Molecular Biology (Splice-junction Gene Sequences)',
|
||||||
|
'poker_hand': 'Poker Hand',
|
||||||
|
'connect-4': 'Connect-4',
|
||||||
|
'cardiotocography': 'Cardiotocography',
|
||||||
|
'mhr': 'Maternal Health Risk',
|
||||||
|
'chess2': 'Chess (King-Rook vs. King)',
|
||||||
|
'page_block': 'Page Blocks Classification',
|
||||||
|
'room': 'Room Occupancy Estimation',
|
||||||
|
'phishing2': 'Website Phishing',
|
||||||
|
'rt-iot22': 'RT-IoT2022',
|
||||||
|
'support2': 'SUPPORT2',
|
||||||
|
'image_seg': 'Statlog (Image Segmentation)',
|
||||||
|
'steel_plates': 'Steel Plates Faults',
|
||||||
|
'hcv': 'Hepatitis C Virus (HCV) for Egyptian patients',
|
||||||
}
|
}
|
||||||
|
|
||||||
identifier = identifiers[dataset_name]
|
identifier = identifiers[dataset_name]
|
||||||
|
@ -644,14 +733,36 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=
|
||||||
|
|
||||||
file = join(data_home, 'uci_multiclass', dataset_name+'.pkl')
|
file = join(data_home, 'uci_multiclass', dataset_name+'.pkl')
|
||||||
|
|
||||||
def download(id):
|
def download(id, name):
|
||||||
data = fetch_ucirepo(id=id)
|
data = fetch_ucirepo(id=id)
|
||||||
X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze()
|
X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze()
|
||||||
|
# classes represented as arrays are transformed to tuples to treat them as single objects
|
||||||
|
if name == 'support2':
|
||||||
|
y[:, 2] = np.fromiter((str(elm) for elm in y[:, 2]), dtype='object')
|
||||||
|
if y.ndim > 1:
|
||||||
|
y = np.fromiter((tuple(elm) for elm in y), dtype='object')
|
||||||
classes = np.sort(np.unique(y))
|
classes = np.sort(np.unique(y))
|
||||||
y = np.searchsorted(classes, y)
|
y = np.searchsorted(classes, y)
|
||||||
return LabelledCollection(X, y)
|
return LabelledCollection(X, y)
|
||||||
|
|
||||||
data = pickled_resource(file, download, identifier)
|
def filter_classes(data: LabelledCollection, min_ipc):
|
||||||
|
classes = data.classes_
|
||||||
|
# restrict classes to only those with at least min_ipc instances
|
||||||
|
classes = classes[data.counts() >= min_ipc]
|
||||||
|
# filter X and y keeping only datapoints belonging to valid classes
|
||||||
|
filter_idx = np.in1d(data.y, classes)
|
||||||
|
X, y = data.X[filter_idx], data.y[filter_idx]
|
||||||
|
# map classes to range(len(classes))
|
||||||
|
y = np.searchsorted(classes, y)
|
||||||
|
return LabelledCollection(X, y)
|
||||||
|
|
||||||
|
data = pickled_resource(file, download, identifier, dataset_name)
|
||||||
|
data = filter_classes(data, min_ipc)
|
||||||
|
if data.n_classes <= 2:
|
||||||
|
raise ValueError(
|
||||||
|
f'Dataset {dataset_name} has too few valid classes to be multiclass with {min_ipc=}. '
|
||||||
|
'Try a lower value for min_ipc.'
|
||||||
|
)
|
||||||
|
|
||||||
if verbose:
|
if verbose:
|
||||||
data.stats()
|
data.stats()
|
||||||
|
|
Loading…
Reference in New Issue