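Remove slow and multi-label UCI multiclass datasets, rename 'waveform.v1' to 'waveform-v1' and 'phishing2' to 'phishing', and clean up the debug file dump in download()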
This commit is contained in:
parent
93dd6cb1c1
commit
19524f9aa8
|
@ -59,28 +59,22 @@ UCI_MULTICLASS_DATASETS = [
|
||||||
'letter',
|
'letter',
|
||||||
'abalone',
|
'abalone',
|
||||||
'obesity',
|
'obesity',
|
||||||
# 'covertype', --> very slow, skipped
|
|
||||||
'nursery',
|
'nursery',
|
||||||
# 'diabetes', --> very slow, skipped
|
|
||||||
'yeast',
|
'yeast',
|
||||||
'hand_digits',
|
'hand_digits',
|
||||||
'satellite',
|
'satellite',
|
||||||
'shuttle',
|
'shuttle',
|
||||||
'cmc',
|
'cmc',
|
||||||
'isolet',
|
'isolet',
|
||||||
'waveform.v1',
|
'waveform-v1',
|
||||||
'molecular',
|
'molecular',
|
||||||
# 'poker_hand', --> very slow, skipped
|
'poker_hand',
|
||||||
'connect-4',
|
'connect-4',
|
||||||
# 'cardiotocography', --> multiple labels, skipped
|
|
||||||
'mhr',
|
'mhr',
|
||||||
'chess',
|
'chess',
|
||||||
'page_block',
|
'page_block',
|
||||||
# 'room', --> very slow, skipped
|
'phishing',
|
||||||
'phishing2',
|
|
||||||
# 'rt-iot22', --> very slow, skipped
|
|
||||||
'image_seg',
|
'image_seg',
|
||||||
# 'steel_plates', --> multiple labels, skipped
|
|
||||||
'hcv',
|
'hcv',
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -682,28 +676,22 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas
|
||||||
'letter': 59,
|
'letter': 59,
|
||||||
'abalone': 1,
|
'abalone': 1,
|
||||||
'obesity': 544,
|
'obesity': 544,
|
||||||
'covertype': 31,
|
|
||||||
'nursery': 76,
|
'nursery': 76,
|
||||||
'diabetes': 296,
|
|
||||||
'yeast': 110,
|
'yeast': 110,
|
||||||
'hand_digits': 81,
|
'hand_digits': 81,
|
||||||
'satellite': 146,
|
'satellite': 146,
|
||||||
'shuttle': 148,
|
'shuttle': 148,
|
||||||
'cmc': 30,
|
'cmc': 30,
|
||||||
'isolet': 54,
|
'isolet': 54,
|
||||||
'waveform.v1': 107,
|
'waveform-v1': 107,
|
||||||
'molecular': 69,
|
'molecular': 69,
|
||||||
'poker_hand': 158,
|
'poker_hand': 158,
|
||||||
'connect-4': 26,
|
'connect-4': 26,
|
||||||
'cardiotocography': 193,
|
|
||||||
'mhr': 863,
|
'mhr': 863,
|
||||||
'chess': 23,
|
'chess': 23,
|
||||||
'page_block': 78,
|
'page_block': 78,
|
||||||
'room': 864,
|
'phishing': 379,
|
||||||
'phishing2': 379,
|
|
||||||
'rt-iot22': 942,
|
|
||||||
'image_seg': 147,
|
'image_seg': 147,
|
||||||
'steel_plates': 198,
|
|
||||||
'hcv': 503,
|
'hcv': 503,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -715,28 +703,22 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas
|
||||||
'letter': 'Letter Recognition',
|
'letter': 'Letter Recognition',
|
||||||
'abalone': 'Abalone',
|
'abalone': 'Abalone',
|
||||||
'obesity': 'Estimation of Obesity Levels Based On Eating Habits and Physical Condition',
|
'obesity': 'Estimation of Obesity Levels Based On Eating Habits and Physical Condition',
|
||||||
'covertype': 'Covertype',
|
|
||||||
'nursery': 'Nursery',
|
'nursery': 'Nursery',
|
||||||
'diabetes': 'Diabetes 130-US Hospitals for Years 1999-2008',
|
|
||||||
'yeast': 'Yeast',
|
'yeast': 'Yeast',
|
||||||
'hand_digits': 'Pen-Based Recognition of Handwritten Digits',
|
'hand_digits': 'Pen-Based Recognition of Handwritten Digits',
|
||||||
'satellite': 'Statlog Landsat Satellite',
|
'satellite': 'Statlog Landsat Satellite',
|
||||||
'shuttle': 'Statlog Shuttle',
|
'shuttle': 'Statlog Shuttle',
|
||||||
'cmc': 'Contraceptive Method Choice',
|
'cmc': 'Contraceptive Method Choice',
|
||||||
'isolet': 'ISOLET',
|
'isolet': 'ISOLET',
|
||||||
'waveform.v1': 'Waveform Database Generator (Version 1)',
|
'waveform-v1': 'Waveform Database Generator (Version 1)',
|
||||||
'molecular': 'Molecular Biology (Splice-junction Gene Sequences)',
|
'molecular': 'Molecular Biology (Splice-junction Gene Sequences)',
|
||||||
'poker_hand': 'Poker Hand',
|
'poker_hand': 'Poker Hand',
|
||||||
'connect-4': 'Connect-4',
|
'connect-4': 'Connect-4',
|
||||||
'cardiotocography': 'Cardiotocography',
|
|
||||||
'mhr': 'Maternal Health Risk',
|
'mhr': 'Maternal Health Risk',
|
||||||
'chess': 'Chess (King-Rook vs. King)',
|
'chess': 'Chess (King-Rook vs. King)',
|
||||||
'page_block': 'Page Blocks Classification',
|
'page_block': 'Page Blocks Classification',
|
||||||
'room': 'Room Occupancy Estimation',
|
'phishing': 'Website Phishing',
|
||||||
'phishing2': 'Website Phishing',
|
|
||||||
'rt-iot22': 'RT-IoT2022',
|
|
||||||
'image_seg': 'Statlog (Image Segmentation)',
|
'image_seg': 'Statlog (Image Segmentation)',
|
||||||
'steel_plates': 'Steel Plates Faults',
|
|
||||||
'hcv': 'Hepatitis C Virus (HCV) for Egyptian patients',
|
'hcv': 'Hepatitis C Virus (HCV) for Egyptian patients',
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -750,26 +732,11 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas
|
||||||
|
|
||||||
def download(id, name):
|
def download(id, name):
|
||||||
df = fetch_ucirepo(id=id)
|
df = fetch_ucirepo(id=id)
|
||||||
|
|
||||||
|
|
||||||
df.data.features = pd.get_dummies(df.data.features, drop_first=True)
|
df.data.features = pd.get_dummies(df.data.features, drop_first=True)
|
||||||
|
|
||||||
X, y = df.data.features.to_numpy(), df.data.targets.to_numpy().squeeze()
|
X, y = df.data.features.to_numpy(), df.data.targets.to_numpy().squeeze()
|
||||||
|
|
||||||
with open(f"var/{name}_Xy.txt", "w") as f:
|
assert y.ndim == 1, 'more than one y'
|
||||||
for row in X:
|
|
||||||
f.write(str(row) + "\n")
|
|
||||||
f.write("\n\n")
|
|
||||||
if y.ndim > 1:
|
|
||||||
unique_y = np.unique(np.fromiter((tuple(elm) for elm in y), dtype='object'))
|
|
||||||
else:
|
|
||||||
unique_y = np.unique(y)
|
|
||||||
f.write(str(unique_y) + "\n\n")
|
|
||||||
for row in y:
|
|
||||||
f.write(str(row) + "\n")
|
|
||||||
|
|
||||||
if y.ndim > 1:
|
|
||||||
raise ValueError('more than one y')
|
|
||||||
|
|
||||||
classes = np.sort(np.unique(y))
|
classes = np.sort(np.unique(y))
|
||||||
y = np.searchsorted(classes, y)
|
y = np.searchsorted(classes, y)
|
||||||
|
|
Loading…
Reference in New Issue