QuaPy/quapy/data/datasets.py

def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    pass
import warnings
# Install the no-op above in place of warnings.warn, so that any later call made through
# ``warnings.warn`` is silently discarded.
# NOTE(review): presumably placed before the remaining imports so that warnings raised while
# importing them are silenced too -- confirm this global side effect is intended.
warnings.warn = warn
import os
import zipfile
from os.path import join
import pandas as pd
from ucimlrepo import fetch_ucirepo
from quapy.data.base import Dataset, LabelledCollection
from quapy.data.preprocessing import text2tfidf, reduce_columns
from quapy.data.reader import *
from quapy.util import download_file_if_not_exists, download_file, get_quapy_home, pickled_resource


# Dataset names accepted by fetch_reviews.
REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb']

# Names of the Twitter sentiment test sets accepted by fetch_twitter.
TWITTER_SENTIMENT_DATASETS_TEST = [
    'gasp', 'hcr', 'omd', 'sanders',
    'semeval13', 'semeval14', 'semeval15', 'semeval16',
    'sst', 'wa', 'wb',
]

# Names of the Twitter sentiment training sets accepted by fetch_twitter
# ('semeval' is the training set shared by 'semeval13', 'semeval14' and 'semeval15').
TWITTER_SENTIMENT_DATASETS_TRAIN = [
    'gasp', 'hcr', 'omd', 'sanders',
    'semeval', 'semeval16',
    'sst', 'wa', 'wb',
]
# Dataset names accepted by fetch_UCIBinaryLabelledCollection / fetch_UCIBinaryDataset.
# 'diabetes' and 'phoneme' are deliberately left out: they have not been found.
UCI_BINARY_DATASETS = [
    'acute.a', 'acute.b',
    'balance.1', 'balance.2', 'balance.3',
    'breast-cancer',
    'cmc.1', 'cmc.2', 'cmc.3',
    'ctg.1', 'ctg.2', 'ctg.3',
    'german',
    'haberman',
    'ionosphere',
    'iris.1', 'iris.2', 'iris.3',
    'mammographic',
    'pageblocks.5',
    'semeion',
    'sonar',
    'spambase',
    'spectf',
    'tictactoe',
    'transfusion',
    'wdbc',
    'wine.1', 'wine.2', 'wine.3',
    'wine-q-red', 'wine-q-white',
    'yeast',
]

# Dataset names accepted by fetch_UCIMulticlassLabelledCollection / fetch_UCIMulticlassDataset.
UCI_MULTICLASS_DATASETS = [
    'dry-bean',
    'wine-quality',
    'academic-success',
    'digits',
    'letter',
]

# Task names accepted by fetch_lequa2022.
LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B']

# Sample sizes shared by the "A" tasks and by the "B" tasks, respectively.
_TXA_SAMPLE_SIZE = 250
_TXB_SAMPLE_SIZE = 1000

# Sample size indexed by task name; 'TXA'/'TXB' and 'binary'/'multiclass' are aliases
# for the "A" and "B" sample sizes.
LEQUA2022_SAMPLE_SIZE = {
    'TXA': _TXA_SAMPLE_SIZE,
    'TXB': _TXB_SAMPLE_SIZE,
    'T1A': _TXA_SAMPLE_SIZE,
    'T1B': _TXB_SAMPLE_SIZE,
    'T2A': _TXA_SAMPLE_SIZE,
    'T2B': _TXB_SAMPLE_SIZE,
    'binary': _TXA_SAMPLE_SIZE,
    'multiclass': _TXB_SAMPLE_SIZE,
}


def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset:
    """
    Loads a Reviews dataset as a Dataset instance, as used in
    `Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification."
    Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018. <https://dl.acm.org/doi/abs/10.1145/3269206.3269287>`_.
    The list of valid dataset names can be accessed in `quapy.data.datasets.REVIEWS_SENTIMENT_DATASETS`

    The train and test files are downloaded (only if not already present) into
    `<data_home>/reviews/<dataset_name>/`.

    :param dataset_name: the name of the dataset: valid ones are 'hp', 'kindle', 'imdb'
    :param tfidf: set to True to transform the raw documents into tfidf weighted matrices
    :param min_df: minimum number of documents that should contain a term in order for the term to be
        kept (ignored if tfidf==False)
    :param data_home: specify the quapy home directory where collections will be dumped (leave as None to use
        the directory returned by `get_quapy_home()`)
    :param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
        faster subsequent invocations
    :return: a :class:`quapy.data.base.Dataset` instance
    """
    assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \
        f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \
        f'Valid ones are {REVIEWS_SENTIMENT_DATASETS}'

    home = get_quapy_home() if data_home is None else data_home
    reviews_dir = join(home, 'reviews')
    os.makedirs(reviews_dir, exist_ok=True)

    # fetch the train split first and then the test split, each into its own local file
    local_files = {}
    for split in ('train', 'test'):
        remote = f'https://zenodo.org/record/4117827/files/{dataset_name}_{split}.txt'
        local = join(reviews_dir, dataset_name, f'{split}.txt')
        download_file_if_not_exists(remote, local)
        local_files[split] = local

    # a None pickle path is handed over when pickling has not been requested
    pickle_path = join(reviews_dir, 'pickle', f'{dataset_name}.pkl') if pickle else None
    data = pickled_resource(pickle_path, Dataset.load, local_files['train'], local_files['test'], from_text)

    # the column reduction is only applied on top of the tfidf representation
    if tfidf:
        text2tfidf(data, inplace=True)
        if min_df is not None:
            reduce_columns(data, min_df=min_df, inplace=True)

    data.name = dataset_name
    return data


def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False) -> Dataset:
    """
    Loads a Twitter dataset as a :class:`quapy.data.base.Dataset` instance, as used in:
    `Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
    Social Network Analysis and Mining6(19), 1–22 (2016) <https://link.springer.com/content/pdf/10.1007/s13278-016-0327-z.pdf>`_
    Note that the datasets 'semeval13', 'semeval14', 'semeval15' share the same training set.
    The list of valid dataset names corresponding to training sets can be accessed in
    `quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN`, while the test sets can be accessed in
    `quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TEST`

    The zip archive is downloaded and extracted into `data_home` only if the extracted folder does not exist yet.

    :param dataset_name: the name of the dataset: valid ones are 'gasp', 'hcr', 'omd', 'sanders', 'semeval13',
        'semeval14', 'semeval15', 'semeval16', 'sst', 'wa', 'wb', plus 'semeval' (the shared training set, which
        is only accepted when `for_model_selection` is True)
    :param for_model_selection: if True, then returns the train split as the training set and the devel split
        as the test set; if False, then returns the train+devel split as the training set and the test set as the
        test set
    :param min_df: minimum number of documents that should contain a term in order for the term to be kept
    :param data_home: specify the quapy home directory where collections will be dumped (leave as None to use
        the directory returned by `get_quapy_home()`)
    :param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
        faster subsequent invocations
    :return: a :class:`quapy.data.base.Dataset` instance
    :raises ValueError: if `dataset_name` is 'semeval' and `for_model_selection` is not set
    """
    assert dataset_name in TWITTER_SENTIMENT_DATASETS_TRAIN + TWITTER_SENTIMENT_DATASETS_TEST, \
        f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \
        f'Valid ones are {TWITTER_SENTIMENT_DATASETS_TRAIN} for model selection and ' \
        f'{TWITTER_SENTIMENT_DATASETS_TEST} for test (datasets "semeval13", "semeval14", "semeval15" share ' \
        f'a common training set "semeval")'
    if data_home is None:
        data_home = get_quapy_home()

    URL = 'https://zenodo.org/record/4255764/files/tweet_sentiment_quantification_snam.zip'
    unzipped_path = join(data_home, 'tweet_sentiment_quantification_snam')
    if not os.path.exists(unzipped_path):
        # download the archive, extract it next to the target folder, and discard the zip file
        downloaded_path = join(data_home, 'tweet_sentiment_quantification_snam.zip')
        download_file(URL, downloaded_path)
        with zipfile.ZipFile(downloaded_path) as file:
            file.extractall(data_home)
        os.remove(downloaded_path)

    if dataset_name in {'semeval13', 'semeval14', 'semeval15'}:
        # these three datasets share the train and development files, which are named 'semeval'
        trainset_name = 'semeval'
        testset_name  = 'semeval' if for_model_selection else dataset_name
        print(f"the training and development sets for datasets 'semeval13', 'semeval14', 'semeval15' are common "
              f"(called 'semeval'); returning trainin-set='{trainset_name}' and test-set={testset_name}")
    else:
        # tested by truthiness so that the guard agrees with the `if for_model_selection:` branch below
        if dataset_name == 'semeval' and not for_model_selection:
            raise ValueError('dataset "semeval" can only be used for model selection. '
                             'Use "semeval13", "semeval14", or "semeval15" for model evaluation.')
        trainset_name = testset_name = dataset_name

    if for_model_selection:
        train = join(unzipped_path, 'train', f'{trainset_name}.train.feature.txt')
        test  = join(unzipped_path, 'test', f'{testset_name}.dev.feature.txt')
    else:
        train = join(unzipped_path, 'train', f'{trainset_name}.train+dev.feature.txt')
        if dataset_name == 'semeval16':  # there is a different test name in the case of semeval16 only
            test = join(unzipped_path, 'test', f'{testset_name}.dev-test.feature.txt')
        else:
            test = join(unzipped_path, 'test', f'{testset_name}.test.feature.txt')

    pickle_path = None
    if pickle:
        # the evaluation mode is part of the pickle name, so both modes can be stored side by side
        mode = "train-dev" if for_model_selection else "train+dev-test"
        pickle_path = join(unzipped_path, 'pickle', f'{testset_name}.{mode}.pkl')
    data = pickled_resource(pickle_path, Dataset.load, train, test, from_sparse)

    if min_df is not None:
        reduce_columns(data, min_df=min_df, inplace=True)

    data.name = dataset_name

    return data


def fetch_UCIBinaryDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
    """
    Loads a UCI dataset as an instance of :class:`quapy.data.base.Dataset`, as used in
    `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
    Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
    Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
    and
    `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
    Dynamic ensemble selection for quantification tasks.
    Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
    The datasets do not come with a predefined train-test split (see :meth:`fetch_UCIBinaryLabelledCollection` for
    further information on how to use these collections), and so a stratified train-test split is generated at the
    desired proportion, always with `random_state=0`.
    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_BINARY_DATASETS`

    :param dataset_name: a dataset name
    :param data_home: specify the quapy home directory where collections will be dumped (leave as None to use
        the directory returned by `get_quapy_home()`)
    :param test_split: proportion of documents to be included in the test set. The rest conforms the training set
    :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
    :return: a :class:`quapy.data.base.Dataset` instance
    """
    collection = fetch_UCIBinaryLabelledCollection(dataset_name, data_home, verbose)
    # split_stratified is given the proportion of the training part
    train_prop = 1 - test_split
    parts = collection.split_stratified(train_prop, random_state=0)
    return Dataset(*parts)


def fetch_UCIBinaryLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
    """
    Loads a UCI collection as an instance of :class:`quapy.data.base.LabelledCollection`, as used in
    `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
    Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
    Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
    and
    `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
    Dynamic ensemble selection for quantification tasks.
    Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
    The datasets do not come with a predefined train-test split, and so Pérez-Gállego et al. adopted a 5FCVx2 evaluation
    protocol, meaning that each collection was used to generate two rounds (hence the x2) of 5 fold cross validation.
    This can be reproduced by using :meth:`quapy.data.base.Dataset.kFCV`, e.g.:

    >>> import quapy as qp
    >>> collection = qp.datasets.fetch_UCIBinaryLabelledCollection("yeast")
    >>> for data in qp.data.base.Dataset.kFCV(collection, nfolds=5, nrepeats=2):
    >>>     ...

    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_BINARY_DATASETS`

    The raw data file (and the description file, if any) is downloaded, only if not already present, into
    `<data_home>/uci_datasets/<identifier>/`; the file is then parsed with a dataset-specific routine that
    produces the feature matrix and the binary labels.

    :param dataset_name: a dataset name
    :param data_home: specify the quapy home directory where collections will be dumped (leave as None to use
        the directory returned by `get_quapy_home()`)
    :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
    :return: a :class:`quapy.data.base.LabelledCollection` instance
    :raises FileNotFoundError: for 'pageblocks.5', if the manually uncompressed data file is not found
    """

    assert dataset_name in UCI_BINARY_DATASETS, \
        f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
        f'Valid ones are {UCI_BINARY_DATASETS}'
    if data_home is None:
        data_home = get_quapy_home()

    dataset_fullname = {
        'acute.a': 'Acute Inflammations (urinary bladder)',
        'acute.b': 'Acute Inflammations (renal pelvis)',
        'balance.1': 'Balance Scale Weight & Distance Database (left)',
        'balance.2': 'Balance Scale Weight & Distance Database (balanced)',
        'balance.3': 'Balance Scale Weight & Distance Database (right)',
        'breast-cancer':  'Breast Cancer Wisconsin (Original)',
        'cmc.1': 'Contraceptive Method Choice (no use)',
        'cmc.2': 'Contraceptive Method Choice (long term)',
        'cmc.3': 'Contraceptive Method Choice (short term)',
        'ctg.1': 'Cardiotocography Data Set (normal)',
        'ctg.2': 'Cardiotocography Data Set (suspect)',
        'ctg.3': 'Cardiotocography Data Set (pathologic)',
        'german': 'Statlog German Credit Data',
        'haberman': "Haberman's Survival Data",
        'ionosphere': 'Johns Hopkins University Ionosphere DB',
        'iris.1': 'Iris Plants Database(x)',
        'iris.2': 'Iris Plants Database(versicolour)',
        'iris.3': 'Iris Plants Database(virginica)',
        'mammographic': 'Mammographic Mass',
        'pageblocks.5': 'Page Blocks Classification (5)',
        'semeion': 'Semeion Handwritten Digit (8)',
        'sonar': 'Sonar, Mines vs. Rocks',
        'spambase': 'Spambase Data Set',
        'spectf': 'SPECTF Heart Data',
        'tictactoe': 'Tic-Tac-Toe Endgame Database',
        'transfusion': 'Blood Transfusion Service Center Data Set',
        'wdbc': 'Wisconsin Diagnostic Breast Cancer',
        'wine.1': 'Wine Recognition Data (1)',
        'wine.2': 'Wine Recognition Data (2)',
        'wine.3': 'Wine Recognition Data (3)',
        'wine-q-red': 'Wine Quality Red (6-10)',
        'wine-q-white': 'Wine Quality White (6-10)',
        'yeast': 'Yeast',
    }

    # the identifier is an alias for the dataset group, it's part of the url data-folder, and is the name we use
    # to download the raw dataset
    identifier_map = {
        'acute.a': 'acute',
        'acute.b': 'acute',
        'balance.1': 'balance-scale',
        'balance.2': 'balance-scale',
        'balance.3': 'balance-scale',
        'breast-cancer': 'breast-cancer-wisconsin',
        'cmc.1': 'cmc',
        'cmc.2': 'cmc',
        'cmc.3': 'cmc',
        'ctg.1': '00193',
        'ctg.2': '00193',
        'ctg.3': '00193',
        'german': 'statlog/german',
        'haberman': 'haberman',
        'ionosphere': 'ionosphere',
        'iris.1': 'iris',
        'iris.2': 'iris',
        'iris.3': 'iris',
        'mammographic': 'mammographic-masses',
        'pageblocks.5': 'page-blocks',
        'semeion': 'semeion',
        'sonar': 'undocumented/connectionist-bench/sonar',
        'spambase': 'spambase',
        'spectf': 'spect',
        'tictactoe': 'tic-tac-toe',
        'transfusion': 'blood-transfusion',
        'wdbc': 'breast-cancer-wisconsin',
        'wine-q-red': 'wine-quality',
        'wine-q-white': 'wine-quality',
        'wine.1': 'wine',
        'wine.2': 'wine',
        'wine.3': 'wine',
        'yeast': 'yeast',
    }

    # the filename is the name of the file within the data_folder indexed by the identifier
    # (identifiers missing here fall back to '<identifier>.data')
    file_name = {
        'acute': 'diagnosis.data',
        '00193': 'CTG.xls',
        'statlog/german': 'german.data-numeric',
        'mammographic-masses': 'mammographic_masses.data',
        'page-blocks': 'page-blocks.data.Z',
        'undocumented/connectionist-bench/sonar': 'sonar.all-data',
        'spect': ['SPECTF.train', 'SPECTF.test'],
        'blood-transfusion': 'transfusion.data',
        'wine-quality': ['winequality-red.csv', 'winequality-white.csv'],
        'breast-cancer-wisconsin': 'breast-cancer-wisconsin.data' if dataset_name=='breast-cancer' else 'wdbc.data'
    }

    # the filename containing the dataset description (if any)
    # (identifiers missing here fall back to '<identifier>.names')
    desc_name = {
        'acute': 'diagnosis.names',
        '00193': None,
        'statlog/german': 'german.doc',
        'mammographic-masses': 'mammographic_masses.names',
        'undocumented/connectionist-bench/sonar': 'sonar.names',
        'spect': 'SPECTF.names',
        'blood-transfusion': 'transfusion.names',
        'wine-quality': 'winequality.names',
        'breast-cancer-wisconsin': 'breast-cancer-wisconsin.names' if dataset_name == 'breast-cancer' else 'wdbc.names'
    }

    identifier = identifier_map[dataset_name]
    filename = file_name.get(identifier, f'{identifier}.data')
    descfile = desc_name.get(identifier, f'{identifier}.names')
    fullname = dataset_fullname[dataset_name]

    URL = f'http://archive.ics.uci.edu/ml/machine-learning-databases/{identifier}'
    data_dir = join(data_home, 'uci_datasets', identifier)
    if isinstance(filename, str):  # filename could be a list of files, in which case it will be processed later
        data_path = join(data_dir, filename)
        download_file_if_not_exists(f'{URL}/{filename}', data_path)

    if descfile:
        # the description is optional: any failure while fetching or reading it is reported and ignored
        try:
            download_file_if_not_exists(f'{URL}/{descfile}', f'{data_dir}/{descfile}')
            if verbose:
                # the context manager guarantees that the file handle gets closed
                with open(f'{data_dir}/{descfile}', 'rt') as desc:
                    print(desc.read())
        except Exception:
            print('could not read the description file')
    elif verbose:
        print('no file description available')

    if verbose:
        print(f'Loading {dataset_name} ({fullname})')
    if identifier == 'acute':
        df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t')

        # column 0 uses a decimal comma; columns 1 to 5 contain yes/no values
        df[0] = df[0].apply(lambda x: float(x.replace(',', '.'))).astype(float, copy=False)
        for col in range(1, 6):
            _df_replace(df, col)
        X = df.loc[:, 0:5].values
        if dataset_name == 'acute.a':
            y = binarize(df[6], pos_class='yes')
        elif dataset_name == 'acute.b':
            y = binarize(df[7], pos_class='yes')

    if identifier == 'balance-scale':
        df = pd.read_csv(data_path, header=None, sep=',')
        if dataset_name == 'balance.1':
            y = binarize(df[0], pos_class='L')
        elif dataset_name == 'balance.2':
            y = binarize(df[0], pos_class='B')
        elif dataset_name == 'balance.3':
            y = binarize(df[0], pos_class='R')
        X = df.loc[:, 1:].astype(float).values

    if identifier == 'breast-cancer-wisconsin' and dataset_name=='breast-cancer':
        df = pd.read_csv(data_path, header=None, sep=',')
        # work on an explicit copy, so that the assignment below does not target a view of df
        Xy = df.loc[:, 1:10].copy()
        # rows with missing values (marked as '?') are discarded
        Xy[Xy=='?']=np.nan
        Xy = Xy.dropna(axis=0)
        X = Xy.loc[:, 1:9]
        X = X.astype(float).values
        y = binarize(Xy[10], pos_class=2)

    if identifier == 'breast-cancer-wisconsin' and dataset_name=='wdbc':
        df = pd.read_csv(data_path, header=None, sep=',')
        X = df.loc[:, 2:32].astype(float).values
        y = df[1].values
        y = binarize(y, pos_class='M')

    if identifier == 'cmc':
        df = pd.read_csv(data_path, header=None, sep=',')
        X = df.loc[:, 0:8].astype(float).values
        y = df[9].astype(int).values
        if dataset_name == 'cmc.1':
            y = binarize(y, pos_class=1)
        elif dataset_name == 'cmc.2':
            y = binarize(y, pos_class=2)
        elif dataset_name == 'cmc.3':
            y = binarize(y, pos_class=3)

    if identifier == '00193':
        df = pd.read_excel(data_path, sheet_name='Data', skipfooter=3)
        df = df[list(range(1,24))] # select columns numbered (number 23 is the target label)
        # replaces the header with the first row
        new_header = df.iloc[0]  # grab the first row for the header
        df = df[1:]  # take the data less the header row
        df.columns = new_header  # set the header row as the df header
        X = df.iloc[:, 0:22].astype(float).values
        y = df['NSP'].astype(int).values
        if dataset_name == 'ctg.1':
            y = binarize(y, pos_class=1)  # 1==Normal
        elif dataset_name == 'ctg.2':
            y = binarize(y, pos_class=2)  # 2==Suspect
        elif dataset_name == 'ctg.3':
            y = binarize(y, pos_class=3)  # 3==Pathologic

    if identifier == 'statlog/german':
        # sep=r'\s+' is the pandas-documented replacement for the deprecated delim_whitespace=True
        df = pd.read_csv(data_path, header=None, sep=r'\s+')
        X = df.iloc[:, 0:24].astype(float).values
        y = df[24].astype(int).values
        y = binarize(y, pos_class=1)

    if identifier == 'haberman':
        df = pd.read_csv(data_path, header=None)
        X = df.iloc[:, 0:3].astype(float).values
        y = df[3].astype(int).values
        y = binarize(y, pos_class=2)

    if identifier == 'ionosphere':
        df = pd.read_csv(data_path, header=None)
        X = df.iloc[:, 0:34].astype(float).values
        y = df[34].values
        y = binarize(y, pos_class='b')

    if identifier == 'iris':
        df = pd.read_csv(data_path, header=None)
        X = df.iloc[:, 0:4].astype(float).values
        y = df[4].values
        if dataset_name == 'iris.1':
            y = binarize(y, pos_class='Iris-setosa')  # 1==Setosa
        elif dataset_name == 'iris.2':
            y = binarize(y, pos_class='Iris-versicolor')  # 2==Versicolor
        elif dataset_name == 'iris.3':
            y = binarize(y, pos_class='Iris-virginica')  # 3==Virginica

    if identifier == 'mammographic-masses':
        df = pd.read_csv(data_path, header=None, sep=',')
        # rows with missing values (marked as '?') are discarded
        df[df == '?'] = np.nan
        Xy = df.dropna(axis=0)
        X = Xy.iloc[:, 0:5]
        X = X.astype(float).values
        y = binarize(Xy.iloc[:,5], pos_class=1)

    if identifier == 'page-blocks':
        # the downloaded file is unix-compressed (.Z); the uncompressed version is expected alongside it
        data_path_ = data_path.replace('.Z', '')
        if not os.path.exists(data_path_):
            raise FileNotFoundError(f'Warning: file {data_path_} does not exist. If this is the first time you '
                                    f'attempt to load this dataset, then you have to manually unzip the {data_path} '
                                    f'and name the extracted file {data_path_} (unfortunately, neither zipfile, nor '
                                    f'gzip can handle unix compressed files automatically -- there is a repo in GitHub '
                                    f'https://github.com/umeat/unlzw where the problem seems to be solved anyway).')
        df = pd.read_csv(data_path_, header=None, sep=r'\s+')
        X = df.iloc[:, 0:10].astype(float).values
        y = df[10].values
        y = binarize(y, pos_class=5)  # 5==block "graphic"

    if identifier == 'semeion':
        df = pd.read_csv(data_path, header=None, sep=r'\s+')
        X = df.iloc[:, 0:256].astype(float).values
        y = df[263].values  # 263 stands for digit 8 (labels are one-hot vectors from col 256-266)
        # NOTE(review): if the one-hot block starts at column 256 with digit 0, column 263 would
        # correspond to digit 7 rather than 8 -- confirm against the dataset description
        y = binarize(y, pos_class=1)

    if identifier == 'undocumented/connectionist-bench/sonar':
        df = pd.read_csv(data_path, header=None, sep=',')
        X = df.iloc[:, 0:60].astype(float).values
        y = df[60].values
        y = binarize(y, pos_class='R')

    if identifier == 'spambase':
        df = pd.read_csv(data_path, header=None, sep=',')
        X = df.iloc[:, 0:57].astype(float).values
        y = df[57].values
        y = binarize(y, pos_class=1)

    if identifier == 'spect':
        # the collection is the concatenation of the train and test files
        dfs = []
        for file in filename:
            data_path = join(data_dir, file)
            download_file_if_not_exists(f'{URL}/{file}', data_path)
            dfs.append(pd.read_csv(data_path, header=None, sep=','))
        df = pd.concat(dfs)
        X = df.iloc[:, 1:45].astype(float).values
        y = df[0].values
        y = binarize(y, pos_class=0)

    if identifier == 'tic-tac-toe':
        df = pd.read_csv(data_path, header=None, sep=',')
        X = df.iloc[:, 0:9].replace('o',0).replace('b',1).replace('x',2).values
        y = df[9].values
        y = binarize(y, pos_class='negative')

    if identifier == 'blood-transfusion':
        df = pd.read_csv(data_path, sep=',')
        X = df.iloc[:, 0:4].astype(float).values
        y = df.iloc[:, 4].values
        y = binarize(y, pos_class=1)

    if identifier == 'wine':
        df = pd.read_csv(data_path, header=None, sep=',')
        X = df.iloc[:, 1:14].astype(float).values
        y = df[0].values
        if dataset_name == 'wine.1':
            y = binarize(y, pos_class=1)
        elif dataset_name == 'wine.2':
            y = binarize(y, pos_class=2)
        elif dataset_name == 'wine.3':
            y = binarize(y, pos_class=3)

    if identifier == 'wine-quality':
        # only the file of the requested wine colour is downloaded
        filename = filename[0] if dataset_name=='wine-q-red' else filename[1]
        data_path = join(data_dir, filename)
        download_file_if_not_exists(f'{URL}/{filename}', data_path)
        df = pd.read_csv(data_path, sep=';')
        X = df.iloc[:, 0:11].astype(float).values
        y = df.iloc[:, 11].values > 5  # the positive class gathers the quality scores above 5

    if identifier == 'yeast':
        df = pd.read_csv(data_path, header=None, sep=r'\s+')
        X = df.iloc[:, 1:9].astype(float).values
        y = df.iloc[:, 9].values
        y = binarize(y, pos_class='NUC')

    data = LabelledCollection(X, y)
    if verbose:
        data.stats()
    return data


def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
    """
    Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`.

    The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria:
    - It has more than 1000 instances
    - It is suited for classification
    - It has more than two classes
    - It is available for Python import (requires ucimlrepo package)

    >>> import quapy as qp
    >>> dataset = qp.datasets.fetch_UCIMulticlassDataset("dry-bean")
    >>> train, test = dataset.train_test
    >>>     ...

    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`

    The datasets are downloaded only once and pickled into disk, saving time for consecutive calls.
    The train-test split is stratified and is always generated with `random_state=0`.

    :param dataset_name: a dataset name
    :param data_home: specify the quapy home directory where collections will be dumped (leave as None to use
        the directory returned by `get_quapy_home()`)
    :param test_split: proportion of documents to be included in the test set. The rest conforms the training set
    :param verbose: set to True (default is False) to get information (stats) about the dataset
    :return: a :class:`quapy.data.base.Dataset` instance
    """
    collection = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose)
    # split_stratified is given the proportion of the training part
    train_prop = 1 - test_split
    parts = collection.split_stratified(train_prop, random_state=0)
    return Dataset(*parts)


def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
    """
    Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`.

    The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria:
    - It has more than 1000 instances
    - It is suited for classification
    - It has more than two classes
    - It is available for Python import (requires ucimlrepo package)

    >>> import quapy as qp
    >>> collection = qp.datasets.fetch_UCIMulticlassLabelledCollection("dry-bean")
    >>> X, y = collection.Xy
    >>>     ...

    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`

    The datasets are downloaded only once and pickled into disk (in `<data_home>/uci_multiclass/<dataset_name>.pkl`),
    saving time for consecutive calls. The original class labels are re-encoded as their position in the sorted
    array of unique labels.

    :param dataset_name: a dataset name
    :param data_home: specify the quapy home directory where the dataset will be dumped (leave as None to use
        the directory returned by `get_quapy_home()`)
    :param verbose: set to True (default is False) to get information (stats) about the dataset
    :return: a :class:`quapy.data.base.LabelledCollection` instance
    """
    assert dataset_name in UCI_MULTICLASS_DATASETS, \
        f'Name {dataset_name} does not match any known dataset from the ' \
        f'UCI Machine Learning datasets repository (multiclass). ' \
        f'Valid ones are {UCI_MULTICLASS_DATASETS}'

    home = get_quapy_home() if data_home is None else data_home

    # dataset name -> (id passed to fetch_ucirepo, full name of the dataset)
    catalogue = {
        "dry-bean": (602, "Dry Bean Dataset"),
        "wine-quality": (186, "Wine Quality"),
        "academic-success": (697, "Predict students' dropout and academic success"),
        "digits": (80, "Optical Recognition of Handwritten Digits"),
        "letter": (59, "Letter Recognition"),
    }
    uci_id, fullname = catalogue[dataset_name]

    if verbose:
        print(f'Loading UCI Muticlass {dataset_name} ({fullname})')

    pickle_file = join(home, 'uci_multiclass', f'{dataset_name}.pkl')

    def download(repo_id):
        # fetches the dataset from the UCI repository and maps its labels to their sorted position
        payload = fetch_ucirepo(id=repo_id)['data']
        X = payload['features'].to_numpy()
        labels = payload['targets'].to_numpy().squeeze()
        classes = np.sort(np.unique(labels))
        return LabelledCollection(X, np.searchsorted(classes, labels))

    collection = pickled_resource(pickle_file, download, uci_id)

    if verbose:
        collection.stats()

    return collection


def _df_replace(df, col, repl=None, astype=float):
    """
    Replaces, in place, every value of a DataFrame column by its mapped value, and casts the column.

    :param df: the DataFrame to modify; its column `col` is overwritten
    :param col: label of the column to transform
    :param repl: dict mapping each original value to its replacement (leave as None to use
        {'yes': 1, 'no': 0}); a column value that is not a key of the mapping raises KeyError
    :param astype: dtype the transformed column is cast to (default is float)
    :return: None
    """
    # a None sentinel avoids sharing one mutable default dict across calls
    if repl is None:
        repl = {'yes': 1, 'no': 0}
    # the cast result is assigned back to the column, so no copy flag is needed
    df[col] = df[col].apply(lambda x: repl[x]).astype(astype)


def fetch_lequa2022(task, data_home=None):
    """
    Loads the official datasets provided for the `LeQua <https://lequa2022.github.io/index>`_ competition.
    In brief, there are 4 tasks (T1A, T1B, T2A, T2B) having to do with text quantification
    problems. Tasks T1A and T1B provide documents in vector form, while T2A and T2B provide raw documents instead.
    Tasks T1A and T2A are binary sentiment quantification problems, while T1B and T2B are multiclass quantification
    problems consisting of estimating the class prevalence values of 28 different merchandise products.
    We refer to the `Esuli, A., Moreo, A., Sebastiani, F., & Sperduti, G. (2022).
    A Detailed Overview of LeQua@ CLEF 2022: Learning to Quantify.
    <https://ceur-ws.org/Vol-3180/paper-146.pdf>`_ for a detailed description
    on the tasks and datasets.

    The datasets are downloaded only once, and stored for fast reuse.

    See `lequa2022_experiments.py` provided in the example folder, that can serve as a guide on how to use these
    datasets.


    :param task: a string representing the task name; valid ones are T1A, T1B, T2A, and T2B
    :param data_home: specify the quapy home directory where collections will be dumped (leave it as None to use
        the directory returned by `quapy.util.get_quapy_home`)
    :return: a tuple `(train, val_gen, test_gen)` where `train` is an instance of
        :class:`quapy.data.base.LabelledCollection`, `val_gen` and `test_gen` are instances of
        :class:`quapy.data._lequa2022.SamplesFromDir`, a subclass of :class:`quapy.protocol.AbstractProtocol`,
        that return a series of samples stored in a directory which are labelled by prevalence.
    """

    from quapy.data._lequa2022 import load_raw_documents, load_vector_documents, SamplesFromDir

    assert task in LEQUA2022_TASKS, \
        f'Unknown task {task}. Valid ones are {LEQUA2022_TASKS}'
    if data_home is None:
        data_home = get_quapy_home()

    URL_TRAINDEV=f'https://zenodo.org/record/6546188/files/{task}.train_dev.zip'
    URL_TEST=f'https://zenodo.org/record/6546188/files/{task}.test.zip'
    URL_TEST_PREV=f'https://zenodo.org/record/6546188/files/{task}.test_prevalences.zip'

    lequa_dir = join(data_home, 'lequa2022')
    os.makedirs(lequa_dir, exist_ok=True)

    def download_unzip_and_remove(unzipped_path, url):
        # The three archives of a task share the same temporary file name, and the download helper skips the
        # download whenever that file already exists. The temporary file is therefore always removed, also when
        # the download or the extraction fails, so that a later call never reuses a stale or partial archive.
        tmp_path = join(lequa_dir, task + '_tmp.zip')
        try:
            download_file_if_not_exists(url, tmp_path)
            with zipfile.ZipFile(tmp_path) as file:
                file.extractall(unzipped_path)
        finally:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    # the presence of the task directory is taken as evidence that all three archives were already extracted
    if not os.path.exists(join(lequa_dir, task)):
        download_unzip_and_remove(lequa_dir, URL_TRAINDEV)
        download_unzip_and_remove(lequa_dir, URL_TEST)
        download_unzip_and_remove(lequa_dir, URL_TEST_PREV)

    # T1* tasks come in vector form, T2* tasks come as raw documents
    if task in ['T1A', 'T1B']:
        load_fn = load_vector_documents
    elif task in ['T2A', 'T2B']:
        load_fn = load_raw_documents

    public_dir = join(lequa_dir, task, 'public')

    tr_path = join(public_dir, 'training_data.txt')
    train = LabelledCollection.load(tr_path, loader_func=load_fn)

    val_samples_path = join(public_dir, 'dev_samples')
    val_true_prev_path = join(public_dir, 'dev_prevalences.txt')
    val_gen = SamplesFromDir(val_samples_path, val_true_prev_path, load_fn=load_fn)

    test_samples_path = join(public_dir, 'test_samples')
    test_true_prev_path = join(public_dir, 'test_prevalences.txt')
    test_gen = SamplesFromDir(test_samples_path, test_true_prev_path, load_fn=load_fn)

    return train, val_gen, test_gen


def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=None):
    """
    Loads the IFCB dataset for quantification from `Zenodo <https://zenodo.org/records/10036244>`_ (for more
    information on this dataset, please follow the zenodo link).
    This dataset is based on the data available publicly at
    `WHOI-Plankton repo <https://github.com/hsosik/WHOI-Plankton>`_.
    The scripts for the processing are available at `P. González's repo <https://github.com/pglez82/IFCB_Zenodo>`_.
    Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms.

    The datasets are downloaded only once, and stored for fast reuse.

    :param single_sample_train: a boolean. If true, it will return the train dataset as a
        :class:`quapy.data.base.LabelledCollection` (all examples together).
        If false, a generator of training samples will be returned. Each example in the training set has an individual label.
    :param for_model_selection: if True, then returns a split 30% of the training set (86 out of 286 samples) to be used for model selection;
        if False, then returns the full training set as training set and the test set as the test set
    :param data_home: specify the quapy home directory where collections will be dumped (leave it as None to use
        the directory returned by `quapy.util.get_quapy_home`)
    :return: a tuple `(train, test_gen)` where `train` is an instance of
        :class:`quapy.data.base.LabelledCollection`, if `single_sample_train` is true or
        :class:`quapy.data._ifcb.IFCBTrainSamplesFromDir`, i.e. a sampling protocol that returns a series of samples
        labelled example by example. test_gen will be a :class:`quapy.data._ifcb.IFCBTestSamples`,
        i.e., a sampling protocol that returns a series of samples labelled by prevalence.
    """

    from quapy.data._ifcb import IFCBTrainSamplesFromDir, IFCBTestSamples, get_sample_list, generate_modelselection_split

    if data_home is None:
        data_home = get_quapy_home()

    URL_TRAIN = 'https://zenodo.org/records/10036244/files/IFCB.train.zip'
    URL_TEST = 'https://zenodo.org/records/10036244/files/IFCB.test.zip'
    URL_TEST_PREV = 'https://zenodo.org/records/10036244/files/IFCB.test_prevalences.zip'

    ifcb_dir = join(data_home, 'ifcb')
    os.makedirs(ifcb_dir, exist_ok=True)

    def download_unzip_and_remove(unzipped_path, url):
        # All archives share the same temporary file name, and the download helper skips the download whenever
        # that file already exists. The temporary file is therefore always removed, also when the download or
        # the extraction fails, so that a later call never extracts a stale, partial, or unrelated archive.
        tmp_path = join(ifcb_dir, 'ifcb_tmp.zip')
        try:
            download_file_if_not_exists(url, tmp_path)
            with zipfile.ZipFile(tmp_path) as file:
                file.extractall(unzipped_path)
        finally:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    # each archive is fetched independently, and only if the artifact it provides is still missing
    train_samples_path = join(ifcb_dir, 'train')
    test_samples_path = join(ifcb_dir, 'test')
    test_true_prev_path = join(ifcb_dir, 'test_prevalences.csv')

    if not os.path.exists(train_samples_path):
        download_unzip_and_remove(ifcb_dir, URL_TRAIN)
    if not os.path.exists(test_samples_path):
        download_unzip_and_remove(ifcb_dir, URL_TEST)
    if not os.path.exists(test_true_prev_path):
        download_unzip_and_remove(ifcb_dir, URL_TEST_PREV)

    # Load test prevalences and classes
    test_true_prev = pd.read_csv(test_true_prev_path)
    # every column but the first one is treated as a class name
    # NOTE(review): the first column presumably holds the sample identifier -- confirm against the csv
    classes = test_true_prev.columns[1:]

    if for_model_selection:
        # In this case, return 70% of training data as the training set and 30% as the test set
        samples = get_sample_list(train_samples_path)
        train_samples, heldout_samples = generate_modelselection_split(samples, split=0.3)
        train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes, samples=train_samples)

        # Test prevalence is computed from class labels (the held-out samples live in the training directory)
        test_gen = IFCBTestSamples(
            path_dir=train_samples_path, test_prevalences=None, samples=heldout_samples, classes=classes
        )
    else:
        # In this case, we use all training samples as the training set and the test samples as the test set
        train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes)
        test_gen = IFCBTestSamples(path_dir=test_samples_path, test_prevalences=test_true_prev)

    # In the case the user wants it, join all the train samples in one LabelledCollection
    if single_sample_train:
        train = LabelledCollection.join(*train_gen())
        return train, test_gen

    return train_gen, test_gen
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								def warn(*args, **kwargs):
 								    pass
 								import warnings
 								warnings.warn = warn
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								import os
-												import fixes

											
										
										
											2021-01-15 18:32:32 +01:00
+								import zipfile
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								from os.path import join
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
+								import pandas as pd
-												merging cross-val fix

											
										
										
											2023-11-08 10:00:25 +01:00
+								from ucimlrepo import fetch_ucirepo
-												imports fix

											
										
										
											2021-03-19 17:34:09 +01:00
+								from quapy.data.base import Dataset, LabelledCollection
-												import fixes

											
										
										
											2021-01-15 18:32:32 +01:00
+								from quapy.data.preprocessing import text2tfidf, reduce_columns
 								from quapy.data.reader import *
 								from quapy.util import download_file_if_not_exists, download_file, get_quapy_home, pickled_resource
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
-												full example of training, model selection, and evaluation using the lequa2022 dataset with the new protocols

											
										
										
											2022-11-04 15:04:36 +01:00
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb']
-												testing quapy via replicating Tweet Quantification experiments

											
										
										
											2021-01-12 17:39:00 +01:00
+								TWITTER_SENTIMENT_DATASETS_TEST = ['gasp', 'hcr', 'omd', 'sanders',
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
+								                              'semeval13', 'semeval14', 'semeval15', 'semeval16',
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								                              'sst', 'wa', 'wb']
-												testing quapy via replicating Tweet Quantification experiments

											
										
										
											2021-01-12 17:39:00 +01:00
+								TWITTER_SENTIMENT_DATASETS_TRAIN = ['gasp', 'hcr', 'omd', 'sanders',
 								                                 'semeval', 'semeval16',
 								                                 'sst', 'wa', 'wb']
-												fixing sphinx doc

											
										
										
											2024-02-07 18:31:34 +01:00
+								UCI_BINARY_DATASETS = ['acute.a', 'acute.b',
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								                'balance.1', 'balance.2', 'balance.3',
 								                'breast-cancer',
 								                'cmc.1', 'cmc.2', 'cmc.3',
 								                'ctg.1', 'ctg.2', 'ctg.3',
-												fixing sphinx doc

											
										
										
											2024-02-07 18:31:34 +01:00
+								                       #'diabetes', # <-- I haven't found this one...
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								                'german',
 								                'haberman',
 								                'ionosphere',
 								                'iris.1', 'iris.2', 'iris.3',
 								                'mammographic',
-												fixing sphinx doc

											
										
										
											2024-02-07 18:31:34 +01:00
+								                       'pageblocks.5',
 								                       #'phoneme', # <-- I haven't found this one...
 								                       'semeion',
 								                       'sonar',
 								                       'spambase',
 								                       'spectf',
 								                       'tictactoe',
 								                       'transfusion',
 								                       'wdbc',
 								                       'wine.1', 'wine.2', 'wine.3',
 								                       'wine-q-red', 'wine-q-white',
 								                       'yeast']
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
-												uci multiclass datasets

											
										
										
											2023-10-17 18:24:33 +02:00
+								UCI_MULTICLASS_DATASETS = ['dry-bean',
 								                           'wine-quality',
 								                           'academic-success',
 								                           'digits',
 								                           'letter']
-												lequa as dataset

											
										
										
											2022-06-01 18:28:59 +02:00
+								LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B']
-												full example of training, model selection, and evaluation using the lequa2022 dataset with the new protocols

											
										
										
											2022-11-04 15:04:36 +01:00
+								_TXA_SAMPLE_SIZE = 250
 								_TXB_SAMPLE_SIZE = 1000
 								LEQUA2022_SAMPLE_SIZE = {
 								    'TXA': _TXA_SAMPLE_SIZE,
 								    'TXB': _TXB_SAMPLE_SIZE,
 								    'T1A': _TXA_SAMPLE_SIZE,
 								    'T1B': _TXB_SAMPLE_SIZE,
 								    'T2A': _TXA_SAMPLE_SIZE,
 								    'T2B': _TXB_SAMPLE_SIZE,
 								    'binary': _TXA_SAMPLE_SIZE,
 								    'multiclass': _TXB_SAMPLE_SIZE
 								}
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
-												Bug fixes on use of classes_. Tests.

											
										
										
											2021-05-05 17:12:44 +02:00
+								def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset:
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								    """
-												updating the documentation

											
										
										
											2021-12-06 18:25:47 +01:00
+								    Loads a Reviews dataset as a Dataset instance, as used in
 								    `Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification."
 								    Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018. <https://dl.acm.org/doi/abs/10.1145/3269206.3269287>`_.
 								    The list of valid dataset names can be accessed in `quapy.data.datasets.REVIEWS_SENTIMENT_DATASETS`
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								    :param dataset_name: the name of the dataset: valid ones are 'hp', 'kindle', 'imdb'
 								    :param tfidf: set to True to transform the raw documents into tfidf weighted matrices
 								    :param min_df: minimun number of documents that should contain a term in order for the term to be
-												updating the documentation

											
										
										
											2021-12-06 18:25:47 +01:00
+								        kept (ignored if tfidf==False)
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
-												updating the documentation

											
										
										
											2021-12-06 18:25:47 +01:00
+								        ~/quay_data/ directory)
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								    :param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
-												updating the documentation

											
										
										
											2021-12-06 18:25:47 +01:00
+								        faster subsequent invokations
 								    :return: a :class:`quapy.data.base.Dataset` instance
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								    """
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								    assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \
 								        f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \
 								        f'Valid ones are {REVIEWS_SENTIMENT_DATASETS}'
 								    if data_home is None:
 								        data_home = get_quapy_home()
 								    URL_TRAIN = f'https://zenodo.org/record/4117827/files/{dataset_name}_train.txt'
 								    URL_TEST = f'https://zenodo.org/record/4117827/files/{dataset_name}_test.txt'
 								    os.makedirs(join(data_home, 'reviews'), exist_ok=True)
 								    train_path = join(data_home, 'reviews', dataset_name, 'train.txt')
 								    test_path = join(data_home, 'reviews', dataset_name, 'test.txt')
 								    download_file_if_not_exists(URL_TRAIN, train_path)
 								    download_file_if_not_exists(URL_TEST, test_path)
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								    pickle_path = None
 								    if pickle:
 								        pickle_path = join(data_home, 'reviews', 'pickle', f'{dataset_name}.pkl')
 								    data = pickled_resource(pickle_path, Dataset.load, train_path, test_path, from_text)
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
 								    if tfidf:
 								        text2tfidf(data, inplace=True)
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								        if min_df is not None:
 								            reduce_columns(data, min_df=min_df, inplace=True)
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
-												adding tweet sent quant experiments

											
										
										
											2021-01-11 18:31:12 +01:00
+								    data.name = dataset_name
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								    return data
-												Bug fixes on use of classes_. Tests.

											
										
										
											2021-05-05 17:12:44 +02:00
+								def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False) -> Dataset:
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								    """
-												updating the documentation

											
										
										
											2021-12-06 18:25:47 +01:00
+								    Loads a Twitter dataset as a :class:`quapy.data.base.Dataset` instance, as used in:
 								    `Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
 								    Social Network Analysis and Mining6(19), 1–22 (2016) <https://link.springer.com/content/pdf/10.1007/s13278-016-0327-z.pdf>`_
 								    Note that the datasets 'semeval13', 'semeval14', 'semeval15' share the same training set.
 								    The list of valid dataset names corresponding to training sets can be accessed in
 								    `quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN`, while the test sets can be accessed in
 								    `quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TEST`
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
 								    :param dataset_name: the name of the dataset: valid ones are 'gasp', 'hcr', 'omd', 'sanders', 'semeval13',
-												updating the documentation

											
										
										
											2021-12-06 18:25:47 +01:00
+								        'semeval14', 'semeval15', 'semeval16', 'sst', 'wa', 'wb'
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								    :param for_model_selection: if True, then returns the train split as the training set and the devel split
-												updating the documentation

											
										
										
											2021-12-06 18:25:47 +01:00
+								        as the test set; if False, then returns the train+devel split as the training set and the test set as the
 								        test set
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								    :param min_df: minimun number of documents that should contain a term in order for the term to be kept
 								    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
-												updating the documentation

											
										
										
											2021-12-06 18:25:47 +01:00
+								        ~/quay_data/ directory)
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								    :param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
-												updating the documentation

											
										
										
											2021-12-06 18:25:47 +01:00
+								        faster subsequent invokations
 								    :return: a :class:`quapy.data.base.Dataset` instance
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								    """
-												testing quapy via replicating Tweet Quantification experiments

											
										
										
											2021-01-12 17:39:00 +01:00
+								    assert dataset_name in TWITTER_SENTIMENT_DATASETS_TRAIN + TWITTER_SENTIMENT_DATASETS_TEST, \
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								        f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \
-												testing quapy via replicating Tweet Quantification experiments

											
										
										
											2021-01-12 17:39:00 +01:00
+								        f'Valid ones are {TWITTER_SENTIMENT_DATASETS_TRAIN} for model selection and ' \
 								        f'{TWITTER_SENTIMENT_DATASETS_TEST} for test (datasets "semeval14", "semeval15", "semeval16" share ' \
 								        f'a common training set "semeval")'
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								    if data_home is None:
 								        data_home = get_quapy_home()
 								    URL = 'https://zenodo.org/record/4255764/files/tweet_sentiment_quantification_snam.zip'
 								    unzipped_path = join(data_home, 'tweet_sentiment_quantification_snam')
 								    if not os.path.exists(unzipped_path):
 								        downloaded_path = join(data_home, 'tweet_sentiment_quantification_snam.zip')
 								        download_file(URL, downloaded_path)
 								        with zipfile.ZipFile(downloaded_path) as file:
 								            file.extractall(data_home)
 								        os.remove(downloaded_path)
 								    if dataset_name in {'semeval13', 'semeval14', 'semeval15'}:
 								        trainset_name = 'semeval'
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								        testset_name  = 'semeval' if for_model_selection else dataset_name
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								        print(f"the training and development sets for datasets 'semeval13', 'semeval14', 'semeval15' are common "
 								              f"(called 'semeval'); returning trainin-set='{trainset_name}' and test-set={testset_name}")
 								    else:
-												testing quapy via replicating Tweet Quantification experiments

											
										
										
											2021-01-12 17:39:00 +01:00
+								        if dataset_name == 'semeval' and for_model_selection==False:
 								            raise ValueError('dataset "semeval" can only be used for model selection. '
 								                             'Use "semeval13", "semeval14", or "semeval15" for model evaluation.')
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								        trainset_name = testset_name = dataset_name
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								    if for_model_selection:
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								        train = join(unzipped_path, 'train', f'{trainset_name}.train.feature.txt')
 								        test  = join(unzipped_path, 'test', f'{testset_name}.dev.feature.txt')
 								    else:
 								        train = join(unzipped_path, 'train', f'{trainset_name}.train+dev.feature.txt')
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								        if dataset_name == 'semeval16':  # there is a different test name in the case of semeval16 only
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								            test = join(unzipped_path, 'test', f'{testset_name}.dev-test.feature.txt')
 								        else:
 								            test = join(unzipped_path, 'test', f'{testset_name}.test.feature.txt')
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								    pickle_path = None
 								    if pickle:
 								        mode = "train-dev" if for_model_selection else "train+dev-test"
 								        pickle_path = join(unzipped_path, 'pickle', f'{testset_name}.{mode}.pkl')
 								    data = pickled_resource(pickle_path, Dataset.load, train, test, from_sparse)
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
 								    if min_df is not None:
 								        reduce_columns(data, min_df=min_df, inplace=True)
-												adding tweet sent quant experiments

											
										
										
											2021-01-11 18:31:12 +01:00
+								    data.name = dataset_name
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								    return data
-												fixing sphinx doc

											
										
										
											2024-02-07 18:31:34 +01:00
+								def fetch_UCIBinaryDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
-												updating the documentation

											
										
										
											2021-12-06 18:25:47 +01:00
+								    """
 								    Loads a UCI dataset as an instance of :class:`quapy.data.base.Dataset`, as used in
 								    `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
 								    Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
 								    Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
 								    and
 								    `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
 								    Dynamic ensemble selection for quantification tasks.
 								    Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
 								    The datasets do not come with a predefined train-test split (see :meth:`fetch_UCILabelledCollection` for further
 								    information on how to use these collections), and so a train-test split is generated at desired proportion.
 								    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_DATASETS`
 								    :param dataset_name: a dataset name
 								    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
 								        ~/quay_data/ directory)
 								    :param test_split: proportion of documents to be included in the test set. The rest conforms the training set
 								    :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
 								    :return: a :class:`quapy.data.base.Dataset` instance
 								    """
-												fixing sphinx doc

											
										
										
											2024-02-07 18:31:34 +01:00
+								    data = fetch_UCIBinaryLabelledCollection(dataset_name, data_home, verbose)
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								    return Dataset(*data.split_stratified(1 - test_split, random_state=0))
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
-												fixing sphinx doc

											
										
										
											2024-02-07 18:31:34 +01:00
+								def fetch_UCIBinaryLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
-												updating the documentation

											
										
										
											2021-12-06 18:25:47 +01:00
+								    """
 								    Loads a UCI collection as an instance of :class:`quapy.data.base.LabelledCollection`, as used in
 								    `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
 								    Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
 								    Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
 								    and
 								    `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
 								    Dynamic ensemble selection for quantification tasks.
 								    Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
 								    The datasets do not come with a predefined train-test split, and so Pérez-Gállego et al. adopted a 5FCVx2 evaluation
 								    protocol, meaning that each collection was used to generate two rounds (hence the x2) of 5 fold cross validation.
 								    This can be reproduced by using :meth:`quapy.data.base.Dataset.kFCV`, e.g.:
 								    >>> import quapy as qp
-												fixing sphinx doc

											
										
										
											2024-02-07 18:31:34 +01:00
+								    >>> collection = qp.datasets.fetch_UCIBinaryLabelledCollection("yeast")
 								    >>> for data in qp.data.Dataset.kFCV(collection, nfolds=5, nrepeats=2):
-												updating the documentation

											
										
										
											2021-12-06 18:25:47 +01:00
+								    >>>     ...
 								    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_BINARY_DATASETS`
 								    :param dataset_name: a dataset name
 								    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
 								        ~/quapy_data/ directory)
 								    :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
-												adding uci_experiments to examples folder

											
										
										
											2023-03-23 15:46:03 +01:00
+								    :return: a :class:`quapy.data.base.LabelledCollection` instance
-												updating the documentation

											
										
										
											2021-12-06 18:25:47 +01:00
+								    """
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
-												fixing sphinx doc

											
										
										
											2024-02-07 18:31:34 +01:00
+								    assert dataset_name in UCI_BINARY_DATASETS, \
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
+								        f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
-												fixing sphinx doc

											
										
										
											2024-02-07 18:31:34 +01:00
+								        f'Valid ones are {UCI_BINARY_DATASETS}'
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
+								    if data_home is None:
 								        data_home = get_quapy_home()
 								    dataset_fullname = {
 								        'acute.a': 'Acute Inflammations (urinary bladder)',
 								        'acute.b': 'Acute Inflammations (renal pelvis)',
 								        'balance.1': 'Balance Scale Weight & Distance Database (left)',
 								        'balance.2': 'Balance Scale Weight & Distance Database (balanced)',
 								        'balance.3': 'Balance Scale Weight & Distance Database (right)',
-												cleaning and adding some uci datasets

											
										
										
											2021-01-11 12:55:06 +01:00
+								        'breast-cancer':  'Breast Cancer Wisconsin (Original)',
 								        'cmc.1': 'Contraceptive Method Choice (no use)',
 								        'cmc.2': 'Contraceptive Method Choice (long term)',
 								        'cmc.3': 'Contraceptive Method Choice (short term)',
 								        'ctg.1': 'Cardiotocography Data Set (normal)',
 								        'ctg.2': 'Cardiotocography Data Set (suspect)',
 								        'ctg.3': 'Cardiotocography Data Set (pathologic)',
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
+								        'german': 'Statlog German Credit Data',
-												adding uci ml datasets

											
										
										
											2021-01-25 18:38:56 +01:00
+								        'haberman': "Haberman's Survival Data",
 								        'ionosphere': 'Johns Hopkins University Ionosphere DB',
 								        'iris.1': 'Iris Plants Database(x)',
 								        'iris.2': 'Iris Plants Database(versicolour)',
 								        'iris.3': 'Iris Plants Database(virginica)',
 								        'mammographic': 'Mammographic Mass',
 								        'pageblocks.5': 'Page Blocks Classification (5)',
 								        'semeion': 'Semeion Handwritten Digit (8)',
-												more uci datasets, plots improved (higher fonts), and evaluation script that shows numerical results in command line

											
										
										
											2021-01-27 22:49:54 +01:00
+								        'sonar': 'Sonar, Mines vs. Rocks',
 								        'spambase': 'Spambase Data Set',
 								        'spectf': 'SPECTF Heart Data',
 								        'tictactoe': 'Tic-Tac-Toe Endgame Database',
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								        'transfusion': 'Blood Transfusion Service Center Data Set',
 								        'wdbc': 'Wisconsin Diagnostic Breast Cancer',
 								        'wine.1': 'Wine Recognition Data (1)',
 								        'wine.2': 'Wine Recognition Data (2)',
 								        'wine.3': 'Wine Recognition Data (3)',
 								        'wine-q-red': 'Wine Quality Red (6-10)',
 								        'wine-q-white': 'Wine Quality White (6-10)',
 								        'yeast': 'Yeast',
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
+								    }
 								    # the identifier is an alias for the dataset group, it's part of the url data-folder, and is the name we use
 								    # to download the raw dataset
 								    identifier_map = {
 								        'acute.a': 'acute',
 								        'acute.b': 'acute',
 								        'balance.1': 'balance-scale',
 								        'balance.2': 'balance-scale',
 								        'balance.3': 'balance-scale',
 								        'breast-cancer': 'breast-cancer-wisconsin',
 								        'cmc.1': 'cmc',
 								        'cmc.2': 'cmc',
 								        'cmc.3': 'cmc',
 								        'ctg.1': '00193',
 								        'ctg.2': '00193',
 								        'ctg.3': '00193',
-												adding uci ml datasets

											
										
										
											2021-01-25 18:38:56 +01:00
+								        'german': 'statlog/german',
 								        'haberman': 'haberman',
 								        'ionosphere': 'ionosphere',
 								        'iris.1': 'iris',
 								        'iris.2': 'iris',
 								        'iris.3': 'iris',
 								        'mammographic': 'mammographic-masses',
 								        'pageblocks.5': 'page-blocks',
 								        'semeion': 'semeion',
-												more uci datasets, plots improved (higher fonts), and evaluation script that shows numerical results in command line

											
										
										
											2021-01-27 22:49:54 +01:00
+								        'sonar': 'undocumented/connectionist-bench/sonar',
 								        'spambase': 'spambase',
 								        'spectf': 'spect',
 								        'tictactoe': 'tic-tac-toe',
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								        'transfusion': 'blood-transfusion',
 								        'wdbc': 'breast-cancer-wisconsin',
 								        'wine-q-red': 'wine-quality',
 								        'wine-q-white': 'wine-quality',
 								        'wine.1': 'wine',
 								        'wine.2': 'wine',
 								        'wine.3': 'wine',
 								        'yeast': 'yeast',
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
+								    }
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
+								    # the filename is the name of the file within the data_folder indexed by the identifier
 								    file_name = {
 								        'acute': 'diagnosis.data',
 								        '00193': 'CTG.xls',
-												adding uci ml datasets

											
										
										
											2021-01-25 18:38:56 +01:00
+								        'statlog/german': 'german.data-numeric',
 								        'mammographic-masses': 'mammographic_masses.data',
 								        'page-blocks': 'page-blocks.data.Z',
-												more uci datasets, plots improved (higher fonts), and evaluation script that shows numerical results in command line

											
										
										
											2021-01-27 22:49:54 +01:00
+								        'undocumented/connectionist-bench/sonar': 'sonar.all-data',
 								        'spect': ['SPECTF.train', 'SPECTF.test'],
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								        'blood-transfusion': 'transfusion.data',
 								        'wine-quality': ['winequality-red.csv', 'winequality-white.csv'],
 								        'breast-cancer-wisconsin': 'breast-cancer-wisconsin.data' if dataset_name=='breast-cancer' else 'wdbc.data'
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
+								    }
 								    # the filename containing the dataset description (if any)
 								    desc_name = {
 								        'acute': 'diagnosis.names',
 								        '00193': None,
-												adding uci ml datasets

											
										
										
											2021-01-25 18:38:56 +01:00
+								        'statlog/german': 'german.doc',
 								        'mammographic-masses': 'mammographic_masses.names',
-												more uci datasets, plots improved (higher fonts), and evaluation script that shows numerical results in command line

											
										
										
											2021-01-27 22:49:54 +01:00
+								        'undocumented/connectionist-bench/sonar': 'sonar.names',
 								        'spect': 'SPECTF.names',
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								        'blood-transfusion': 'transfusion.names',
 								        'wine-quality': 'winequality.names',
 								        'breast-cancer-wisconsin': 'breast-cancer-wisconsin.names' if dataset_name == 'breast-cancer' else 'wdbc.names'
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
+								    }
 								    identifier = identifier_map[dataset_name]
-												adding uci ml datasets

											
										
										
											2021-01-25 18:38:56 +01:00
+								    filename = file_name.get(identifier, f'{identifier}.data')
 								    descfile = desc_name.get(identifier, f'{identifier}.names')
 								    fullname = dataset_fullname[dataset_name]
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
+								    URL = f'http://archive.ics.uci.edu/ml/machine-learning-databases/{identifier}'
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
+								    data_dir = join(data_home, 'uci_datasets', identifier)
-												more uci datasets, plots improved (higher fonts), and evaluation script that shows numerical results in command line

											
										
										
											2021-01-27 22:49:54 +01:00
+								    if isinstance(filename, str):  # filename could be a list of files, in which case it will be processed later
 								        data_path = join(data_dir, filename)
 								        download_file_if_not_exists(f'{URL}/{filename}', data_path)
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
 								    if descfile:
-												adding uci ml datasets

											
										
										
											2021-01-25 18:38:56 +01:00
+								        try:
 								            download_file_if_not_exists(f'{URL}/{descfile}', f'{data_dir}/{descfile}')
 								            if verbose:
 								                print(open(f'{data_dir}/{descfile}', 'rt').read())
 								        except Exception:
 								            print('could not read the description file')
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
+								    elif verbose:
 								        print('no file description available')
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
-												added HDx and an example comparing HDy vs HDx

											
										
										
											2023-11-08 15:34:17 +01:00
+								    if verbose:
 								        print(f'Loading {dataset_name} ({fullname})')
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
+								    if identifier == 'acute':
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
+								        df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t')
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
 								        df[0] = df[0].apply(lambda x: float(x.replace(',', '.'))).astype(float, copy=False)
-												updating the documentation

											
										
										
											2021-12-06 18:25:47 +01:00
+								        [_df_replace(df, col) for col in range(1, 6)]
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								        X = df.loc[:, 0:5].values
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
+								        if dataset_name == 'acute.a':
 								            y = binarize(df[6], pos_class='yes')
 								        elif dataset_name == 'acute.b':
 								            y = binarize(df[7], pos_class='yes')
 								    if identifier == 'balance-scale':
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
+								        df = pd.read_csv(data_path, header=None, sep=',')
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
+								        if dataset_name == 'balance.1':
 								            y = binarize(df[0], pos_class='L')
 								        elif dataset_name == 'balance.2':
 								            y = binarize(df[0], pos_class='B')
 								        elif dataset_name == 'balance.3':
 								            y = binarize(df[0], pos_class='R')
 								        X = df.loc[:, 1:].astype(float).values
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								    if identifier == 'breast-cancer-wisconsin' and dataset_name=='breast-cancer':
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
+								        df = pd.read_csv(data_path, header=None, sep=',')
-												cleaning and adding some uci datasets

											
										
										
											2021-01-11 12:55:06 +01:00
+								        Xy = df.loc[:, 1:10]
 								        Xy[Xy=='?']=np.nan
 								        Xy = Xy.dropna(axis=0)
 								        X = Xy.loc[:, 1:9]
 								        X = X.astype(float).values
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								        y = binarize(Xy[10], pos_class=2)
 								    if identifier == 'breast-cancer-wisconsin' and dataset_name=='wdbc':
 								        df = pd.read_csv(data_path, header=None, sep=',')
 								        X = df.loc[:, 2:32].astype(float).values
 								        y = df[1].values
 								        y = binarize(y, pos_class='M')
-												cleaning and adding some uci datasets

											
										
										
											2021-01-11 12:55:06 +01:00
 								    if identifier == 'cmc':
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
+								        df = pd.read_csv(data_path, header=None, sep=',')
-												cleaning and adding some uci datasets

											
										
										
											2021-01-11 12:55:06 +01:00
+								        X = df.loc[:, 0:8].astype(float).values
 								        y = df[9].astype(int).values
 								        if dataset_name == 'cmc.1':
 								            y = binarize(y, pos_class=1)
 								        elif dataset_name == 'cmc.2':
 								            y = binarize(y, pos_class=2)
 								        elif dataset_name == 'cmc.3':
 								            y = binarize(y, pos_class=3)
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
+								    if identifier == '00193':
 								        df = pd.read_excel(data_path, sheet_name='Data', skipfooter=3)
 								        df = df[list(range(1,24))] # select columns numbered (number 23 is the target label)
 								        # replaces the header with the first row
 								        new_header = df.iloc[0]  # grab the first row for the header
 								        df = df[1:]  # take the data less the header row
 								        df.columns = new_header  # set the header row as the df header
 								        X = df.iloc[:, 0:22].astype(float).values
 								        y = df['NSP'].astype(int).values
-												adding uci ml datasets

											
										
										
											2021-01-25 18:38:56 +01:00
+								        if dataset_name == 'ctg.1':
 								            y = binarize(y, pos_class=1)  # 1==Normal
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
+								        elif dataset_name == 'ctg.2':
-												adding uci ml datasets

											
										
										
											2021-01-25 18:38:56 +01:00
+								            y = binarize(y, pos_class=2)  # 2==Suspect
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
+								        elif dataset_name == 'ctg.3':
-												adding uci ml datasets

											
										
										
											2021-01-25 18:38:56 +01:00
+								            y = binarize(y, pos_class=3)  # 3==Pathologic
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
+								    if identifier == 'statlog/german':
 								        df = pd.read_csv(data_path, header=None, delim_whitespace=True)
 								        X = df.iloc[:, 0:24].astype(float).values
 								        y = df[24].astype(int).values
 								        y = binarize(y, pos_class=1)
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
-												adding uci ml datasets

											
										
										
											2021-01-25 18:38:56 +01:00
+								    if identifier == 'haberman':
 								        df = pd.read_csv(data_path, header=None)
 								        X = df.iloc[:, 0:3].astype(float).values
 								        y = df[3].astype(int).values
 								        y = binarize(y, pos_class=2)
 								    if identifier == 'ionosphere':
 								        df = pd.read_csv(data_path, header=None)
 								        X = df.iloc[:, 0:34].astype(float).values
 								        y = df[34].values
 								        y = binarize(y, pos_class='b')
 								    if identifier == 'iris':
 								        df = pd.read_csv(data_path, header=None)
 								        X = df.iloc[:, 0:4].astype(float).values
 								        y = df[4].values
 								        if dataset_name == 'iris.1':
 								            y = binarize(y, pos_class='Iris-setosa')  # 1==Setosa
 								        elif dataset_name == 'iris.2':
 								            y = binarize(y, pos_class='Iris-versicolor')  # 2==Versicolor
 								        elif dataset_name == 'iris.3':
 								            y = binarize(y, pos_class='Iris-virginica')  # 3==Virginica
 								    if identifier == 'mammographic-masses':
 								        df = pd.read_csv(data_path, header=None, sep=',')
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								        df[df == '?'] = np.nan
 								        Xy = df.dropna(axis=0)
-												adding uci ml datasets

											
										
										
											2021-01-25 18:38:56 +01:00
+								        X = Xy.iloc[:, 0:5]
 								        X = X.astype(float).values
 								        y = binarize(Xy.iloc[:,5], pos_class=1)
 								    if identifier == 'page-blocks':
 								        data_path_ = data_path.replace('.Z', '')
 								        if not os.path.exists(data_path_):
 								            raise FileNotFoundError(f'Warning: file {data_path_} does not exist. If this is the first time you '
 								                                    f'attempt to load this dataset, then you have to manually unzip the {data_path} '
 								                                    f'and name the extracted file {data_path_} (unfortunately, neither zipfile, nor '
 								                                    f'gzip can handle unix compressed files automatically -- there is a repo in GitHub '
 								                                    f'https://github.com/umeat/unlzw where the problem seems to be solved anyway).')
 								        df = pd.read_csv(data_path_, header=None, delim_whitespace=True)
 								        X = df.iloc[:, 0:10].astype(float).values
 								        y = df[10].values
 								        y = binarize(y, pos_class=5)  # 5==block "graphic"
 								    if identifier == 'semeion':
 								        df = pd.read_csv(data_path, header=None, delim_whitespace=True )
 								        X = df.iloc[:, 0:256].astype(float).values
 								        y = df[263].values  # 263 stands for digit 8 (labels are one-hot vectors from col 256-266)
 								        y = binarize(y, pos_class=1)
 								    if identifier == 'undocumented/connectionist-bench/sonar':
 								        df = pd.read_csv(data_path, header=None, sep=',')
 								        X = df.iloc[:, 0:60].astype(float).values
-												more uci datasets, plots improved (higher fonts), and evaluation script that shows numerical results in command line

											
										
										
											2021-01-27 22:49:54 +01:00
+								        y = df[60].values
-												adding uci ml datasets

											
										
										
											2021-01-25 18:38:56 +01:00
+								        y = binarize(y, pos_class='R')
-												more uci datasets, plots improved (higher fonts), and evaluation script that shows numerical results in command line

											
										
										
											2021-01-27 22:49:54 +01:00
+								    if identifier == 'spambase':
 								        df = pd.read_csv(data_path, header=None, sep=',')
 								        X = df.iloc[:, 0:57].astype(float).values
 								        y = df[57].values
 								        y = binarize(y, pos_class=1)
 								    if identifier == 'spect':
 								        dfs = []
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								        for file in filename:
-												more uci datasets, plots improved (higher fonts), and evaluation script that shows numerical results in command line

											
										
										
											2021-01-27 22:49:54 +01:00
+								            data_path = join(data_dir, file)
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								            download_file_if_not_exists(f'{URL}/{file}', data_path)
-												more uci datasets, plots improved (higher fonts), and evaluation script that shows numerical results in command line

											
										
										
											2021-01-27 22:49:54 +01:00
+								            dfs.append(pd.read_csv(data_path, header=None, sep=','))
 								        df = pd.concat(dfs)
 								        X = df.iloc[:, 1:45].astype(float).values
 								        y = df[0].values
 								        y = binarize(y, pos_class=0)
 								    if identifier == 'tic-tac-toe':
 								        df = pd.read_csv(data_path, header=None, sep=',')
 								        X = df.iloc[:, 0:9].replace('o',0).replace('b',1).replace('x',2).values
 								        y = df[9].values
 								        y = binarize(y, pos_class='negative')
 								    if identifier == 'blood-transfusion':
 								        df = pd.read_csv(data_path, sep=',')
 								        X = df.iloc[:, 0:4].astype(float).values
 								        y = df.iloc[:, 4].values
 								        y = binarize(y, pos_class=1)
-												adding uci ml datasets

											
										
										
											2021-01-25 18:38:56 +01:00
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								    if identifier == 'wine':
 								        df = pd.read_csv(data_path, header=None, sep=',')
 								        X = df.iloc[:, 1:14].astype(float).values
 								        y = df[0].values
 								        if dataset_name == 'wine.1':
 								            y = binarize(y, pos_class=1)
 								        elif dataset_name == 'wine.2':
 								            y = binarize(y, pos_class=2)
 								        elif dataset_name == 'wine.3':
 								            y = binarize(y, pos_class=3)
 								    if identifier == 'wine-quality':
 								        filename = filename[0] if dataset_name=='wine-q-red' else filename[1]
 								        data_path = join(data_dir, filename)
 								        download_file_if_not_exists(f'{URL}/{filename}', data_path)
 								        df = pd.read_csv(data_path, sep=';')
 								        X = df.iloc[:, 0:11].astype(float).values
 								        y = df.iloc[:, 11].values > 5
 								    if identifier == 'yeast':
 								        df = pd.read_csv(data_path, header=None, delim_whitespace=True)
 								        X = df.iloc[:, 1:9].astype(float).values
 								        y = df.iloc[:, 9].values
 								        y = binarize(y, pos_class='NUC')
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
+								    data = LabelledCollection(X, y)
-												added HDx and an example comparing HDy vs HDx

											
										
										
											2023-11-08 15:34:17 +01:00
+								    if verbose:
 								        data.stats()
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								    return data
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
-												uci multiclass datasets

											
										
										
											2023-10-17 18:24:33 +02:00
def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
    """
    Fetches a UCI multiclass dataset and returns it already divided into a training and a test part, as an
    instance of :class:`quapy.data.base.Dataset`.

    The full collection is obtained via :meth:`fetch_UCIMulticlassLabelledCollection` and is then partitioned
    with a stratified split that uses a fixed random seed (`random_state=0`).

    The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria:

    - It has more than 1000 instances
    - It is suited for classification
    - It has more than two classes
    - It is available for Python import (requires ucimlrepo package)

    >>> import quapy as qp
    >>> dataset = qp.datasets.fetch_UCIMulticlassDataset("dry-bean")
    >>> train, test = dataset.train_test

    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`

    :param dataset_name: a dataset name
    :param data_home: the quapy home directory where collections are dumped; it is forwarded untouched to
        :meth:`fetch_UCIMulticlassLabelledCollection`, which falls back to the default quapy home when None
    :param test_split: proportion of instances to be included in the test set; the remaining `1 - test_split`
        conforms the training set
    :param verbose: set to True (default is False) to get information (stats) about the dataset
    :return: a :class:`quapy.data.base.Dataset` instance
    """
    collection = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose)
    train_fraction = 1 - test_split
    parts = collection.split_stratified(train_fraction, random_state=0)
    return Dataset(*parts)
-												revised

											
										
										
											2023-10-18 17:50:46 +02:00
-												uci multiclass datasets

											
										
										
											2023-10-17 18:24:33 +02:00
def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
    """
    Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`.

    The list of available datasets is taken from https://archive.ics.uci.edu/, following these criteria:

    - It has more than 1000 instances
    - It is suited for classification
    - It has more than two classes
    - It is available for Python import (requires ucimlrepo package)

    >>> import quapy as qp
    >>> collection = qp.datasets.fetch_UCIMulticlassLabelledCollection("dry-bean")
    >>> X, y = collection.Xy

    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`

    The datasets are downloaded only once and pickled into disk (under `<data_home>/uci_multiclass/`),
    saving time for consecutive calls.

    :param dataset_name: a dataset name
    :param data_home: specify the quapy home directory where the dataset will be dumped (leave as None to use the
        default directory, i.e., the one returned by `get_quapy_home()`)
    :param verbose: set to True (default is False) to get information (stats) about the dataset
    :return: a :class:`quapy.data.base.LabelledCollection` instance
    :raises AssertionError: if `dataset_name` is not listed in `UCI_MULTICLASS_DATASETS`
    """
    assert dataset_name in UCI_MULTICLASS_DATASETS, \
        f'Name {dataset_name} does not match any known dataset from the ' \
        f'UCI Machine Learning datasets repository (multiclass). ' \
        f'Valid ones are {UCI_MULTICLASS_DATASETS}'

    if data_home is None:
        data_home = get_quapy_home()

    # numeric ids passed to fetch_ucirepo for each dataset name
    identifiers = {
        "dry-bean": 602,
        "wine-quality": 186,
        "academic-success": 697,
        "digits": 80,
        "letter": 59
    }

    # human-readable names, only used for the verbose message
    full_names = {
        "dry-bean": "Dry Bean Dataset",
        "wine-quality": "Wine Quality",
        "academic-success": "Predict students' dropout and academic success",
        "digits": "Optical Recognition of Handwritten Digits",
        "letter": "Letter Recognition"
    }

    identifier = identifiers[dataset_name]
    fullname = full_names[dataset_name]

    if verbose:
        print(f'Loading UCI Multiclass {dataset_name} ({fullname})')

    file = join(data_home, 'uci_multiclass', dataset_name+'.pkl')

    def download(uci_id):
        # fetch features and targets, and re-encode the labels as consecutive integers
        data = fetch_ucirepo(id=uci_id)
        X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze()
        # np.unique already returns the labels sorted, so searchsorted maps every label to its rank
        classes = np.unique(y)
        y = np.searchsorted(classes, y)
        return LabelledCollection(X, y)

    # the collection is generated by `download` only if the pickle does not exist yet
    data = pickled_resource(file, download, identifier)

    if verbose:
        data.stats()

    return data
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
-												updating the documentation

											
										
										
											2021-12-06 18:25:47 +01:00
def _df_replace(df, col, repl=None, astype=float):
    """
    Recodes, in place, the values of one column of a dataframe according to a mapping, and casts the recoded
    column to a given type.

    :param df: a pandas dataframe; the column `col` is overwritten in this very object
    :param col: the name of the column to recode
    :param repl: a dict mapping every original value onto its replacement; None (default) stands for
        `{'yes': 1, 'no': 0}`. A value of the column that is not a key of the mapping raises a KeyError
    :param astype: the type the recoded column is cast to (default is float)
    :return: None
    """
    # the default mapping is created per call instead of being a shared mutable default argument
    if repl is None:
        repl = {'yes': 1, 'no': 0}
    df[col] = df[col].apply(lambda x: repl[x]).astype(astype, copy=False)
 								def fetch_lequa2022(task, data_home=None):
 								    """
 								    Loads the official datasets provided for the `LeQua <https://lequa2022.github.io/index>`_ competition.

 								    In brief, there are 4 tasks (T1A, T1B, T2A, T2B) having to do with text quantification
 								    problems. Tasks T1A and T1B provide documents in vector form, while T2A and T2B provide raw documents instead.
 								    Tasks T1A and T2A are binary sentiment quantification problems, while T1B and T2B are multiclass quantification
 								    problems consisting of estimating the class prevalence values of 28 different merchandise products.
 								    We refer to the `Esuli, A., Moreo, A., Sebastiani, F., & Sperduti, G. (2022).
 								    A Detailed Overview of LeQua@ CLEF 2022: Learning to Quantify.
 								    <https://ceur-ws.org/Vol-3180/paper-146.pdf>`_ for a detailed description
 								    on the tasks and datasets.

 								    The three archives of the task (train/dev, test, and test prevalences) are downloaded and extracted only
 								    when the folder of the task does not exist yet, so that later calls reuse the files stored on disk.

 								    See `lequa2022_experiments.py` provided in the example folder, that can serve as a guide on how to use these
 								    datasets.

 								    :param task: a string representing the task name; valid ones are T1A, T1B, T2A, and T2B
 								    :param data_home: specify the quapy home directory where collections will be dumped (leave as None to use
 								        the default directory, i.e., the one returned by `get_quapy_home()`)
 								    :return: a tuple `(train, val_gen, test_gen)` where `train` is an instance of
 								        :class:`quapy.data.base.LabelledCollection`, `val_gen` and `test_gen` are instances of
 								        :class:`quapy.data._lequa2022.SamplesFromDir`, a subclass of :class:`quapy.protocol.AbstractProtocol`,
 								        that return a series of samples stored in a directory which are labelled by prevalence.
 								    """
 								    from quapy.data._lequa2022 import load_raw_documents, load_vector_documents, SamplesFromDir

 								    assert task in LEQUA2022_TASKS, \
 								        f'Unknown task {task}. Valid ones are {LEQUA2022_TASKS}'

 								    if data_home is None:
 								        data_home = get_quapy_home()

 								    # archives of the task, listed in the order in which they are fetched
 								    archive_urls = (
 								        f'https://zenodo.org/record/6546188/files/{task}.train_dev.zip',
 								        f'https://zenodo.org/record/6546188/files/{task}.test.zip',
 								        f'https://zenodo.org/record/6546188/files/{task}.test_prevalences.zip',
 								    )

 								    lequa_dir = join(data_home, 'lequa2022')
 								    os.makedirs(lequa_dir, exist_ok=True)

 								    def fetch_and_extract(url):
 								        # downloads the archive to a temporary zip, extracts it into lequa_dir, and deletes the zip
 								        zip_path = join(lequa_dir, task + '_tmp.zip')
 								        download_file_if_not_exists(url, zip_path)
 								        with zipfile.ZipFile(zip_path) as archive:
 								            archive.extractall(lequa_dir)
 								        os.remove(zip_path)

 								    task_dir = join(lequa_dir, task)
 								    if not os.path.exists(task_dir):
 								        for url in archive_urls:
 								            fetch_and_extract(url)

 								    # vector-form tasks and raw-document tasks require different loaders
 								    if task in ['T1A', 'T1B']:
 								        load_fn = load_vector_documents
 								    elif task in ['T2A', 'T2B']:
 								        load_fn = load_raw_documents

 								    public_dir = join(task_dir, 'public')

 								    train = LabelledCollection.load(join(public_dir, 'training_data.txt'), loader_func=load_fn)

 								    val_gen = SamplesFromDir(
 								        join(public_dir, 'dev_samples'),
 								        join(public_dir, 'dev_prevalences.txt'),
 								        load_fn=load_fn
 								    )

 								    test_gen = SamplesFromDir(
 								        join(public_dir, 'test_samples'),
 								        join(public_dir, 'test_prevalences.txt'),
 								        load_fn=load_fn
 								    )

 								    return train, val_gen, test_gen
-												testing IFCB dataset

											
										
										
											2024-02-08 14:33:22 +01:00
-												merged

											
										
										
											2024-02-07 18:45:42 +01:00
def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=None):
    """
    Loads the IFCB dataset for quantification from `Zenodo <https://zenodo.org/records/10036244>`_ (for more
    information on this dataset, please follow the zenodo link).
    This dataset is based on the data available publicly at
    `WHOI-Plankton repo <https://github.com/hsosik/WHOI-Plankton>`_.
    The scripts for the processing are available at `P. González's repo <https://github.com/pglez82/IFCB_Zenodo>`_.
    Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms.

    Each of the three resources (training samples, test samples, and test prevalences) is downloaded and
    extracted only if it is not found on disk, so the data is stored for fast reuse.

    :param single_sample_train: a boolean. If true, it will return the train dataset as a
        :class:`quapy.data.base.LabelledCollection` (all examples together).
        If false, a generator of training samples will be returned. Each example in the training set has an
        individual label.
    :param for_model_selection: if True, then returns a split 30% of the training set (86 out of 286 samples) to be
        used for model selection; if False, then returns the full training set as training set and the test set as
        the test set
    :param data_home: specify the quapy home directory where collections will be dumped (leave as None to use
        the default directory, i.e., the one returned by `get_quapy_home()`)
    :return: a tuple `(train, test_gen)` where `train` is an instance of
        :class:`quapy.data.base.LabelledCollection`, if `single_sample_train` is true or
        :class:`quapy.data._ifcb.IFCBTrainSamplesFromDir`, i.e. a sampling protocol that returns a series of samples
        labelled example by example. test_gen will be a :class:`quapy.data._ifcb.IFCBTestSamples`,
        i.e., a sampling protocol that returns a series of samples labelled by prevalence.
    """
    from quapy.data._ifcb import IFCBTrainSamplesFromDir, IFCBTestSamples, get_sample_list, generate_modelselection_split

    if data_home is None:
        data_home = get_quapy_home()

    ifcb_dir = join(data_home, 'ifcb')
    os.makedirs(ifcb_dir, exist_ok=True)

    train_samples_path = join(ifcb_dir, 'train')
    test_samples_path = join(ifcb_dir, 'test')
    test_true_prev_path = join(ifcb_dir, 'test_prevalences.csv')

    def fetch_and_extract(url):
        # downloads the archive to a temporary zip, extracts it into ifcb_dir, and deletes the zip
        zip_path = join(ifcb_dir, 'ifcb_tmp.zip')
        download_file_if_not_exists(url, zip_path)
        with zipfile.ZipFile(zip_path) as archive:
            archive.extractall(ifcb_dir)
        os.remove(zip_path)

    # (path expected on disk, archive that provides it), checked in this order
    resources = (
        (train_samples_path, 'https://zenodo.org/records/10036244/files/IFCB.train.zip'),
        (test_samples_path, 'https://zenodo.org/records/10036244/files/IFCB.test.zip'),
        (test_true_prev_path, 'https://zenodo.org/records/10036244/files/IFCB.test_prevalences.zip'),
    )
    for expected_path, url in resources:
        if not os.path.exists(expected_path):
            fetch_and_extract(url)

    # the class names are the columns of the prevalence file, except for the first column
    test_true_prev = pd.read_csv(test_true_prev_path)
    classes = test_true_prev.columns[1:]

    if for_model_selection:
        # both generators read from the training directory: one part of the training samples is kept for
        # training and the held-out part (split=0.3) plays the role of the test set
        samples = get_sample_list(train_samples_path)
        kept, held_out = generate_modelselection_split(samples, split=0.3)
        train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes, samples=kept)
        # no prevalence file is given here (test_prevalences=None)
        test_gen = IFCBTestSamples(path_dir=train_samples_path, test_prevalences=None, samples=held_out, classes=classes)
    else:
        # all training samples form the training set and the test samples form the test set
        train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes)
        test_gen = IFCBTestSamples(path_dir=test_samples_path, test_prevalences=test_true_prev)

    if not single_sample_train:
        return train_gen, test_gen

    # the user wants one single training set: join all the training samples in one LabelledCollection
    train = LabelledCollection.join(*train_gen())
    return train, test_gen