ucimulti datasets removed, cleaning

2024-04-29 17:36:13 +02:00 · 2024-04-29 17:36:13 +02:00 · 19524f9aa8
parent 93dd6cb1c1
commit 19524f9aa8
1 changed file with 8 additions and 41 deletions
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@@ -59,28 +59,22 @@ UCI_MULTICLASS_DATASETS = [
    'letter',
    'abalone',
    'obesity',
    # 'covertype', --> very slow, skipped
    'nursery',
    # 'diabetes', --> very slow, skipped
    'yeast',
    'hand_digits',
    'satellite',
    'shuttle',
    'cmc',
    'isolet',
-    'waveform.v1',
+    'waveform-v1',
    'molecular',
-    # 'poker_hand', --> very slow, skipped
+    'poker_hand',
    'connect-4',
    # 'cardiotocography', --> multiple labels, skipped
    'mhr',
    'chess',
    'page_block',
-    # 'room', --> very slow, skipped
+    'phishing',
    'phishing2',
    # 'rt-iot22', --> very slow, skipped
    'image_seg',
    # 'steel_plates', --> multiple labels, skipped
    'hcv',
 ]
@@ -682,28 +676,22 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas
        'letter': 59,
        'abalone': 1,
        'obesity': 544,
        'covertype': 31,
        'nursery': 76,
        'diabetes': 296,
        'yeast': 110,
        'hand_digits': 81,
        'satellite': 146,
        'shuttle': 148,
        'cmc': 30,
        'isolet': 54,
-        'waveform.v1': 107,
+        'waveform-v1': 107,
        'molecular': 69,
        'poker_hand': 158,
        'connect-4': 26,
        'cardiotocography': 193,
        'mhr': 863,
        'chess': 23,
        'page_block': 78,
-        'room': 864,
+        'phishing': 379,
        'phishing2': 379,
        'rt-iot22': 942,
        'image_seg': 147,
        'steel_plates': 198,
        'hcv': 503,
    }
@@ -715,28 +703,22 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas
        'letter': 'Letter Recognition',
        'abalone': 'Abalone',
        'obesity': 'Estimation of Obesity Levels Based On Eating Habits and Physical Condition',
        'covertype': 'Covertype',
        'nursery': 'Nursery',
        'diabetes': 'Diabetes 130-US Hospitals for Years 1999-2008',
        'yeast': 'Yeast',
        'hand_digits': 'Pen-Based Recognition of Handwritten Digits',
        'satellite': 'Statlog Landsat Satellite',
        'shuttle': 'Statlog Shuttle',
        'cmc': 'Contraceptive Method Choice',
        'isolet': 'ISOLET',
-        'waveform.v1': 'Waveform Database Generator (Version 1)',
+        'waveform-v1': 'Waveform Database Generator (Version 1)',
        'molecular': 'Molecular Biology (Splice-junction Gene Sequences)',
        'poker_hand': 'Poker Hand',
        'connect-4': 'Connect-4',
        'cardiotocography': 'Cardiotocography',
        'mhr': 'Maternal Health Risk',
        'chess': 'Chess (King-Rook vs. King)',
        'page_block': 'Page Blocks Classification',
-        'room': 'Room Occupancy Estimation',
+        'phishing': 'Website Phishing',
        'phishing2': 'Website Phishing',
        'rt-iot22': 'RT-IoT2022',
        'image_seg': 'Statlog (Image Segmentation)',
        'steel_plates': 'Steel Plates Faults',
        'hcv': 'Hepatitis C Virus (HCV) for Egyptian patients',
    }
@@ -750,26 +732,11 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas
    def download(id, name):
        df = fetch_ucirepo(id=id)
        df.data.features = pd.get_dummies(df.data.features, drop_first=True)
        X, y = df.data.features.to_numpy(), df.data.targets.to_numpy().squeeze()
-        with open(f"var/{name}_Xy.txt", "w") as f:
+        assert y.ndim == 1, 'more than one y'
-            for row in X:
-                f.write(str(row) + "\n")
-            f.write("\n\n")
-            if y.ndim > 1:
-                unique_y = np.unique(np.fromiter((tuple(elm) for elm in y), dtype='object'))
-            else:
-                unique_y = np.unique(y)
-            f.write(str(unique_y) + "\n\n")
-            for row in y:
-                f.write(str(row) + "\n")
-        if y.ndim > 1:
-            raise ValueError('more than one y')
        classes = np.sort(np.unique(y))
        y = np.searchsorted(classes, y)