ucimulti datasets removed, cleaning

This commit is contained in:
Lorenzo Volpi 2024-04-29 17:36:13 +02:00
parent 93dd6cb1c1
commit 19524f9aa8
1 changed file with 8 additions and 41 deletions

View File

@@ -59,28 +59,22 @@ UCI_MULTICLASS_DATASETS = [
'letter', 'letter',
'abalone', 'abalone',
'obesity', 'obesity',
# 'covertype', --> very slow, skipped
'nursery', 'nursery',
# 'diabetes', --> very slow, skipped
'yeast', 'yeast',
'hand_digits', 'hand_digits',
'satellite', 'satellite',
'shuttle', 'shuttle',
'cmc', 'cmc',
'isolet', 'isolet',
'waveform.v1', 'waveform-v1',
'molecular', 'molecular',
# 'poker_hand', --> very slow, skipped 'poker_hand',
'connect-4', 'connect-4',
# 'cardiotocography', --> multiple labels, skipped
'mhr', 'mhr',
'chess', 'chess',
'page_block', 'page_block',
# 'room', --> very slow, skipped 'phishing',
'phishing2',
# 'rt-iot22', --> very slow, skipped
'image_seg', 'image_seg',
# 'steel_plates', --> multiple labels, skipped
'hcv', 'hcv',
] ]
@@ -682,28 +676,22 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas
'letter': 59, 'letter': 59,
'abalone': 1, 'abalone': 1,
'obesity': 544, 'obesity': 544,
'covertype': 31,
'nursery': 76, 'nursery': 76,
'diabetes': 296,
'yeast': 110, 'yeast': 110,
'hand_digits': 81, 'hand_digits': 81,
'satellite': 146, 'satellite': 146,
'shuttle': 148, 'shuttle': 148,
'cmc': 30, 'cmc': 30,
'isolet': 54, 'isolet': 54,
'waveform.v1': 107, 'waveform-v1': 107,
'molecular': 69, 'molecular': 69,
'poker_hand': 158, 'poker_hand': 158,
'connect-4': 26, 'connect-4': 26,
'cardiotocography': 193,
'mhr': 863, 'mhr': 863,
'chess': 23, 'chess': 23,
'page_block': 78, 'page_block': 78,
'room': 864, 'phishing': 379,
'phishing2': 379,
'rt-iot22': 942,
'image_seg': 147, 'image_seg': 147,
'steel_plates': 198,
'hcv': 503, 'hcv': 503,
} }
@@ -715,28 +703,22 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas
'letter': 'Letter Recognition', 'letter': 'Letter Recognition',
'abalone': 'Abalone', 'abalone': 'Abalone',
'obesity': 'Estimation of Obesity Levels Based On Eating Habits and Physical Condition', 'obesity': 'Estimation of Obesity Levels Based On Eating Habits and Physical Condition',
'covertype': 'Covertype',
'nursery': 'Nursery', 'nursery': 'Nursery',
'diabetes': 'Diabetes 130-US Hospitals for Years 1999-2008',
'yeast': 'Yeast', 'yeast': 'Yeast',
'hand_digits': 'Pen-Based Recognition of Handwritten Digits', 'hand_digits': 'Pen-Based Recognition of Handwritten Digits',
'satellite': 'Statlog Landsat Satellite', 'satellite': 'Statlog Landsat Satellite',
'shuttle': 'Statlog Shuttle', 'shuttle': 'Statlog Shuttle',
'cmc': 'Contraceptive Method Choice', 'cmc': 'Contraceptive Method Choice',
'isolet': 'ISOLET', 'isolet': 'ISOLET',
'waveform.v1': 'Waveform Database Generator (Version 1)', 'waveform-v1': 'Waveform Database Generator (Version 1)',
'molecular': 'Molecular Biology (Splice-junction Gene Sequences)', 'molecular': 'Molecular Biology (Splice-junction Gene Sequences)',
'poker_hand': 'Poker Hand', 'poker_hand': 'Poker Hand',
'connect-4': 'Connect-4', 'connect-4': 'Connect-4',
'cardiotocography': 'Cardiotocography',
'mhr': 'Maternal Health Risk', 'mhr': 'Maternal Health Risk',
'chess': 'Chess (King-Rook vs. King)', 'chess': 'Chess (King-Rook vs. King)',
'page_block': 'Page Blocks Classification', 'page_block': 'Page Blocks Classification',
'room': 'Room Occupancy Estimation', 'phishing': 'Website Phishing',
'phishing2': 'Website Phishing',
'rt-iot22': 'RT-IoT2022',
'image_seg': 'Statlog (Image Segmentation)', 'image_seg': 'Statlog (Image Segmentation)',
'steel_plates': 'Steel Plates Faults',
'hcv': 'Hepatitis C Virus (HCV) for Egyptian patients', 'hcv': 'Hepatitis C Virus (HCV) for Egyptian patients',
} }
@@ -750,26 +732,11 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas
def download(id, name): def download(id, name):
df = fetch_ucirepo(id=id) df = fetch_ucirepo(id=id)
df.data.features = pd.get_dummies(df.data.features, drop_first=True) df.data.features = pd.get_dummies(df.data.features, drop_first=True)
X, y = df.data.features.to_numpy(), df.data.targets.to_numpy().squeeze() X, y = df.data.features.to_numpy(), df.data.targets.to_numpy().squeeze()
with open(f"var/{name}_Xy.txt", "w") as f: assert y.ndim == 1, 'more than one y'
for row in X:
f.write(str(row) + "\n")
f.write("\n\n")
if y.ndim > 1:
unique_y = np.unique(np.fromiter((tuple(elm) for elm in y), dtype='object'))
else:
unique_y = np.unique(y)
f.write(str(unique_y) + "\n\n")
for row in y:
f.write(str(row) + "\n")
if y.ndim > 1:
raise ValueError('more than one y')
classes = np.sort(np.unique(y)) classes = np.sort(np.unique(y))
y = np.searchsorted(classes, y) y = np.searchsorted(classes, y)