Merge branch 'lorenzovolpi-cv_len_fix' into devel

2023-11-08 10:00:44 +01:00 · 2023-11-08 10:00:44 +01:00 · cc5ab8ad70
parent 15777b0fab 3d4ffcea62
commit cc5ab8ad70
4 changed files with 115 additions and 26 deletions
--- a/README.md
+++ b/README.md
@ -111,3 +111,7 @@ are provided:
 * [SVMperf](https://github.com/HLT-ISTI/QuaPy/wiki/ExplicitLossMinimization)
 * [Model Selection](https://github.com/HLT-ISTI/QuaPy/wiki/Model-Selection)
 * [Plotting](https://github.com/HLT-ISTI/QuaPy/wiki/Plotting)
+
+## Acknowledgments:
+
+<img src="SoBigData.png" alt="SoBigData++" width="250"/>
--- a/SoBigData.png
+++ b/SoBigData.png
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@ -6,8 +6,7 @@ import os
 import zipfile
 from os.path import join
 import pandas as pd
-import scipy
-import quapy
+from ucimlrepo import fetch_ucirepo
 from quapy.data.base import Dataset, LabelledCollection
 from quapy.data.preprocessing import text2tfidf, reduce_columns
 from quapy.data.reader import *
@ -45,6 +44,12 @@ UCI_DATASETS = ['acute.a', 'acute.b',
                'wine-q-red', 'wine-q-white',
                'yeast']

+UCI_MULTICLASS_DATASETS = ['dry-bean',
+                           'wine-quality',
+                           'academic-success',
+                           'digits',
+                           'letter']
+
 LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B']

 _TXA_SAMPLE_SIZE = 250
@ -549,6 +554,109 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
    return data


+def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
+    """
+    Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`. 
+
+    The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria:
+    - It has more than 1000 instances
+    - It is suited for classification
+    - It has more than two classes
+    - It is available for Python import (requires ucimlrepo package)
+
+    >>> import quapy as qp
+    >>> dataset = qp.datasets.fetch_UCIMulticlassDataset("dry-bean")
+    >>> train, test = dataset.train_test
+    >>>     ...
+
+    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`
+
+    The datasets are downloaded only once and pickled into disk, saving time for consecutive calls.
+
+    :param dataset_name: a dataset name
+    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
+        ~/quay_data/ directory)
+    :param test_split: proportion of documents to be included in the test set. The rest conforms the training set
+    :param verbose: set to True (default is False) to get information (stats) about the dataset
+    :return: a :class:`quapy.data.base.Dataset` instance
+    """
+    data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose)
+    return Dataset(*data.split_stratified(1 - test_split, random_state=0))
+
+
+def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
+    """
+    Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`.
+
+    The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria:
+    - It has more than 1000 instances
+    - It is suited for classification
+    - It has more than two classes
+    - It is available for Python import (requires ucimlrepo package)
+    
+    >>> import quapy as qp
+    >>> collection = qp.datasets.fetch_UCIMulticlassLabelledCollection("dry-bean")
+    >>> X, y = collection.Xy
+    >>>     ...
+
+    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`
+
+    The datasets are downloaded only once and pickled into disk, saving time for consecutive calls.
+
+    :param dataset_name: a dataset name
+    :param data_home: specify the quapy home directory where the dataset will be dumped (leave empty to use the default
+        ~/quay_data/ directory)
+    :param test_split: proportion of documents to be included in the test set. The rest conforms the training set
+    :param verbose: set to True (default is False) to get information (stats) about the dataset
+    :return: a :class:`quapy.data.base.LabelledCollection` instance
+    """
+    assert dataset_name in UCI_MULTICLASS_DATASETS, \
+        f'Name {dataset_name} does not match any known dataset from the ' \
+        f'UCI Machine Learning datasets repository (multiclass). ' \
+        f'Valid ones are {UCI_MULTICLASS_DATASETS}'
+    
+    if data_home is None:
+        data_home = get_quapy_home()
+    
+    identifiers = {
+        "dry-bean": 602,
+        "wine-quality": 186,
+        "academic-success": 697,
+        "digits": 80,
+        "letter": 59
+    }
+    
+    full_names = {
+        "dry-bean": "Dry Bean Dataset",
+        "wine-quality": "Wine Quality",
+        "academic-success": "Predict students' dropout and academic success",
+        "digits": "Optical Recognition of Handwritten Digits",
+        "letter": "Letter Recognition"
+    }
+    
+    identifier = identifiers[dataset_name]
+    fullname = full_names[dataset_name]
+
+    if verbose:
+        print(f'Loading UCI Muticlass {dataset_name} ({fullname})')
+
+    file = join(data_home, 'uci_multiclass', dataset_name+'.pkl')
+    
+    def download(id):
+        data = fetch_ucirepo(id=id)
+        X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze()
+        classes = np.sort(np.unique(y))
+        y = np.searchsorted(classes, y)
+        return LabelledCollection(X, y)
+
+    data = pickled_resource(file, download, identifier)
+
+    if verbose:
+        data.stats()
+        
+    return data
+
+
 def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
    df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)

@ -624,26 +732,3 @@ def fetch_lequa2022(task, data_home=None):

    return train, val_gen, test_gen

-def fetch_IFCB(data_home=None):
-
-    if data_home is None:
-        data_home = get_quapy_home()
-
-    URL_TRAINDEV=f'https://zenodo.org/records/10036244/files/IFCB.train.zip'
-    URL_TEST=f'https://zenodo.org/records/10036244/files/IFCB.test.zip'
-    URL_TEST_PREV=f'https://zenodo.org/records/10036244/files/IFCB.test_prevalences.zip'
-
-    ifcb_dir = join(data_home, 'ifcb')
-    os.makedirs(ifcb_dir, exist_ok=True)
-
-    def download_unzip_and_remove(unzipped_path, url):
-        tmp_path = join(ifcb_dir, 'tmp.zip')
-        download_file_if_not_exists(url, tmp_path)
-        with zipfile.ZipFile(tmp_path) as file:
-            file.extractall(unzipped_path)
-        os.remove(tmp_path)
-
-    if not os.path.exists(join(ifcb_dir, task)):
-        download_unzip_and_remove(ifcb_dir, URL_TRAINDEV)
-        download_unzip_and_remove(ifcb_dir, URL_TEST)
-        download_unzip_and_remove(ifcb_dir, URL_TEST_PREV)
--- a/quapy/model_selection.py
+++ b/quapy/model_selection.py
@ -223,7 +223,7 @@ def cross_val_predict(quantifier: BaseQuantifier, data: LabelledCollection, nfol
    for train, test in data.kFCV(nfolds=nfolds, random_state=random_state):
        quantifier.fit(train)
        fold_prev = quantifier.quantify(test.X)
-        rel_size = len(test.X)/len(data)
+        rel_size = 1. * len(test) / len(data)
        total_prev += fold_prev*rel_size

    return total_prev