QuaPy/quapy/data/datasets.py

def warn(*args, **kwargs):
    """Accept any arguments and do nothing (silent stand-in assigned to ``warnings.warn`` below)."""
    return None
import warnings
warnings.warn = warn
import os
import zipfile
from os.path import join
import pandas as pd
import scipy

from quapy.data.base import Dataset, LabelledCollection
from quapy.data.preprocessing import text2tfidf, reduce_columns
from quapy.data.reader import *
from quapy.util import download_file_if_not_exists, download_file, get_quapy_home, pickled_resource


# Dataset names accepted by fetch_reviews
REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb']
# Twitter sentiment dataset names that can be requested as test sets in fetch_twitter
TWITTER_SENTIMENT_DATASETS_TEST = ['gasp', 'hcr', 'omd', 'sanders',
                              'semeval13', 'semeval14', 'semeval15', 'semeval16',
                              'sst', 'wa', 'wb']
# Twitter sentiment dataset names that have a training set; 'semeval' is the training set
# that fetch_twitter uses for 'semeval13', 'semeval14', and 'semeval15'
TWITTER_SENTIMENT_DATASETS_TRAIN = ['gasp', 'hcr', 'omd', 'sanders',
                                 'semeval', 'semeval16',
                                 'sst', 'wa', 'wb']
# Dataset names accepted by fetch_UCILabelledCollection and fetch_UCIDataset; a suffix such as
# '.1' or '.a' selects which class of the underlying collection is treated as the positive one
UCI_DATASETS = ['acute.a', 'acute.b',
                'balance.1', 'balance.2', 'balance.3',
                'breast-cancer',
                'cmc.1', 'cmc.2', 'cmc.3',
                'ctg.1', 'ctg.2', 'ctg.3',
                #'diabetes', # <-- I haven't found this one...
                'german',
                'haberman',
                'ionosphere',
                'iris.1', 'iris.2', 'iris.3',
                'mammographic',
                'pageblocks.5',
                #'phoneme', # <-- I haven't found this one...
                'semeion',
                'sonar',
                'spambase',
                'spectf',
                'tictactoe',
                'transfusion',
                'wdbc',
                'wine.1', 'wine.2', 'wine.3',
                'wine-q-red', 'wine-q-white',
                'yeast']

# Task identifiers accepted by fetch_lequa2022
LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B']

# Sample sizes shared by the "A" tasks and by the "B" tasks, respectively
_TXA_SAMPLE_SIZE = 250
_TXB_SAMPLE_SIZE = 1000

# Sample size indexed by task name; the aliases 'binary' and 'multiclass' map to the
# "A" and "B" sample sizes, respectively
LEQUA2022_SAMPLE_SIZE = dict(
    TXA=_TXA_SAMPLE_SIZE,
    TXB=_TXB_SAMPLE_SIZE,
    T1A=_TXA_SAMPLE_SIZE,
    T1B=_TXB_SAMPLE_SIZE,
    T2A=_TXA_SAMPLE_SIZE,
    T2B=_TXB_SAMPLE_SIZE,
    binary=_TXA_SAMPLE_SIZE,
    multiclass=_TXB_SAMPLE_SIZE,
)


def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset:
    """
    Fetches one of the Reviews datasets and returns it as a :class:`quapy.data.base.Dataset`. These are the
    datasets used in
    `Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification."
    Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018. <https://dl.acm.org/doi/abs/10.1145/3269206.3269287>`_.
    Valid dataset names are listed in `quapy.data.datasets.REVIEWS_SENTIMENT_DATASETS`.

    The train and test files are downloaded (only if not already present) into
    `<data_home>/reviews/<dataset_name>/`.

    :param dataset_name: the name of the dataset: valid ones are 'hp', 'kindle', 'imdb'
    :param tfidf: set to True to transform the raw documents into tfidf weighted matrices
    :param min_df: minimum number of documents that should contain a term in order for the term to be
        kept (ignored if tfidf==False)
    :param data_home: the quapy home directory where collections will be dumped (leave as None to use the
        directory returned by `get_quapy_home()`)
    :param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
        faster subsequent invocations
    :return: a :class:`quapy.data.base.Dataset` instance whose `name` is set to `dataset_name`
    """
    assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \
        f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \
        f'Valid ones are {REVIEWS_SENTIMENT_DATASETS}'
    home = get_quapy_home() if data_home is None else data_home

    reviews_dir = join(home, 'reviews')
    os.makedirs(reviews_dir, exist_ok=True)

    # retrieve both splits (train first, then test) unless they are already on disk
    local_paths = []
    for split in ('train', 'test'):
        remote = f'https://zenodo.org/record/4117827/files/{dataset_name}_{split}.txt'
        local = join(reviews_dir, dataset_name, f'{split}.txt')
        download_file_if_not_exists(remote, local)
        local_paths.append(local)
    train_path, test_path = local_paths

    # a None path is what gets handed to pickled_resource when pickling was not requested
    cache_file = join(reviews_dir, 'pickle', f'{dataset_name}.pkl') if pickle else None
    data = pickled_resource(cache_file, Dataset.load, train_path, test_path, from_text)

    if tfidf:
        text2tfidf(data, inplace=True)
        # column reduction is only applied on top of the tfidf representation
        if min_df is not None:
            reduce_columns(data, min_df=min_df, inplace=True)

    data.name = dataset_name
    return data


def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False) -> Dataset:
    """
    Loads a Twitter dataset as a :class:`quapy.data.base.Dataset` instance, as used in:
    `Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
    Social Network Analysis and Mining6(19), 1–22 (2016) <https://link.springer.com/content/pdf/10.1007/s13278-016-0327-z.pdf>`_
    Note that the datasets 'semeval13', 'semeval14', 'semeval15' share the same training set (called 'semeval').
    The list of valid dataset names corresponding to training sets can be accessed in
    `quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN`, while the test sets can be accessed in
    `quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TEST`

    The collection archive is downloaded and unzipped into `data_home` the first time it is needed.

    :param dataset_name: the name of the dataset: valid ones are 'gasp', 'hcr', 'omd', 'sanders', 'semeval13',
        'semeval14', 'semeval15', 'semeval16', 'sst', 'wa', 'wb' (the name 'semeval' is also accepted, but only
        when `for_model_selection` is True)
    :param for_model_selection: if True, then returns the train split as the training set and the devel split
        as the test set; if False, then returns the train+devel split as the training set and the test set as the
        test set
    :param min_df: minimum number of documents that should contain a term in order for the term to be kept
    :param data_home: the quapy home directory where collections will be dumped (leave as None to use the
        directory returned by `get_quapy_home()`)
    :param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
        faster subsequent invocations
    :return: a :class:`quapy.data.base.Dataset` instance whose `name` is set to `dataset_name`
    :raises ValueError: if `dataset_name` is 'semeval' and `for_model_selection` is not set
    """
    assert dataset_name in TWITTER_SENTIMENT_DATASETS_TRAIN + TWITTER_SENTIMENT_DATASETS_TEST, \
        f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \
        f'Valid ones are {TWITTER_SENTIMENT_DATASETS_TRAIN} for model selection and ' \
        f'{TWITTER_SENTIMENT_DATASETS_TEST} for test (datasets "semeval13", "semeval14", "semeval15" share ' \
        f'a common training set "semeval")'
    if data_home is None:
        data_home = get_quapy_home()

    URL = 'https://zenodo.org/record/4255764/files/tweet_sentiment_quantification_snam.zip'
    unzipped_path = join(data_home, 'tweet_sentiment_quantification_snam')
    if not os.path.exists(unzipped_path):
        # first use: download the archive, extract it, and discard the zip file
        downloaded_path = join(data_home, 'tweet_sentiment_quantification_snam.zip')
        download_file(URL, downloaded_path)
        with zipfile.ZipFile(downloaded_path) as file:
            file.extractall(data_home)
        os.remove(downloaded_path)

    if dataset_name in {'semeval13', 'semeval14', 'semeval15'}:
        trainset_name = 'semeval'
        testset_name  = 'semeval' if for_model_selection else dataset_name
        print(f"the training and development sets for datasets 'semeval13', 'semeval14', 'semeval15' are common "
              f"(called 'semeval'); returning trainin-set='{trainset_name}' and test-set={testset_name}")
    else:
        # truthiness test, consistent with the `if for_model_selection:` branch below
        if dataset_name == 'semeval' and not for_model_selection:
            raise ValueError('dataset "semeval" can only be used for model selection. '
                             'Use "semeval13", "semeval14", or "semeval15" for model evaluation.')
        trainset_name = testset_name = dataset_name

    if for_model_selection:
        train = join(unzipped_path, 'train', f'{trainset_name}.train.feature.txt')
        test  = join(unzipped_path, 'test', f'{testset_name}.dev.feature.txt')
    else:
        train = join(unzipped_path, 'train', f'{trainset_name}.train+dev.feature.txt')
        if dataset_name == 'semeval16':  # there is a different test name in the case of semeval16 only
            test = join(unzipped_path, 'test', f'{testset_name}.dev-test.feature.txt')
        else:
            test = join(unzipped_path, 'test', f'{testset_name}.test.feature.txt')

    pickle_path = None
    if pickle:
        # the evaluation mode is part of the pickle name, so both modes can be cached side by side
        mode = "train-dev" if for_model_selection else "train+dev-test"
        pickle_path = join(unzipped_path, 'pickle', f'{testset_name}.{mode}.pkl')
    data = pickled_resource(pickle_path, Dataset.load, train, test, from_sparse)

    if min_df is not None:
        reduce_columns(data, min_df=min_df, inplace=True)

    data.name = dataset_name

    return data


def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
    """
    Returns a UCI dataset, already partitioned into training and test, as an instance of
    :class:`quapy.data.base.Dataset`. These are the datasets used in
    `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
    Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
    Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
    and
    `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
    Dynamic ensemble selection for quantification tasks.
    Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
    The datasets do not come with a predefined train-test split (see :meth:`fetch_UCILabelledCollection` for further
    information on how to use these collections), and so a stratified train-test split is generated at the desired
    proportion, using a fixed random state (0).
    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_DATASETS`

    :param dataset_name: a dataset name
    :param data_home: the quapy home directory where collections will be dumped (leave as None to use the
        directory returned by `get_quapy_home()`)
    :param test_split: proportion of instances to be included in the test set. The rest conforms the training set
    :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
    :return: a :class:`quapy.data.base.Dataset` instance
    """
    collection = fetch_UCILabelledCollection(dataset_name, data_home=data_home, verbose=verbose)
    train_fraction = 1 - test_split
    partitions = collection.split_stratified(train_fraction, random_state=0)
    return Dataset(*partitions)


def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
    """
    Loads a UCI collection as an instance of :class:`quapy.data.base.LabelledCollection`, as used in
    `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
    Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
    Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
    and
    `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
    Dynamic ensemble selection for quantification tasks.
    Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
    The datasets do not come with a predefined train-test split, and so Pérez-Gállego et al. adopted a 5FCVx2 evaluation
    protocol, meaning that each collection was used to generate two rounds (hence the x2) of 5 fold cross validation.
    This can be reproduced by using :meth:`quapy.data.base.Dataset.kFCV`, e.g.:

    >>> import quapy as qp
    >>> collection = qp.datasets.fetch_UCILabelledCollection("yeast")
    >>> for data in qp.data.base.Dataset.kFCV(collection, nfolds=5, nrepeats=2):
    >>>     ...

    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_DATASETS`

    The raw files are downloaded (only if not already present) into `<data_home>/uci_datasets/<identifier>/`,
    and every collection is returned as a binary problem (the labels are binarized with respect to one class).

    :param dataset_name: a dataset name
    :param data_home: the quapy home directory where collections will be dumped (leave as None to use the
        directory returned by `get_quapy_home()`)
    :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
    :return: a :class:`quapy.data.base.LabelledCollection` instance
    :raises FileNotFoundError: for 'pageblocks.5', if the downloaded `.Z` file has not been manually uncompressed
    """

    assert dataset_name in UCI_DATASETS, \
        f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
        f'Valid ones are {UCI_DATASETS}'
    if data_home is None:
        data_home = get_quapy_home()

    # human-readable names, only used for the "Loading ..." message
    dataset_fullname = {
        'acute.a': 'Acute Inflammations (urinary bladder)',
        'acute.b': 'Acute Inflammations (renal pelvis)',
        'balance.1': 'Balance Scale Weight & Distance Database (left)',
        'balance.2': 'Balance Scale Weight & Distance Database (balanced)',
        'balance.3': 'Balance Scale Weight & Distance Database (right)',
        'breast-cancer':  'Breast Cancer Wisconsin (Original)',
        'cmc.1': 'Contraceptive Method Choice (no use)',
        'cmc.2': 'Contraceptive Method Choice (long term)',
        'cmc.3': 'Contraceptive Method Choice (short term)',
        'ctg.1': 'Cardiotocography Data Set (normal)',
        'ctg.2': 'Cardiotocography Data Set (suspect)',
        'ctg.3': 'Cardiotocography Data Set (pathologic)',
        'german': 'Statlog German Credit Data',
        'haberman': "Haberman's Survival Data",
        'ionosphere': 'Johns Hopkins University Ionosphere DB',
        'iris.1': 'Iris Plants Database(setosa)',
        'iris.2': 'Iris Plants Database(versicolour)',
        'iris.3': 'Iris Plants Database(virginica)',
        'mammographic': 'Mammographic Mass',
        'pageblocks.5': 'Page Blocks Classification (5)',
        'semeion': 'Semeion Handwritten Digit (8)',
        'sonar': 'Sonar, Mines vs. Rocks',
        'spambase': 'Spambase Data Set',
        'spectf': 'SPECTF Heart Data',
        'tictactoe': 'Tic-Tac-Toe Endgame Database',
        'transfusion': 'Blood Transfusion Service Center Data Set',
        'wdbc': 'Wisconsin Diagnostic Breast Cancer',
        'wine.1': 'Wine Recognition Data (1)',
        'wine.2': 'Wine Recognition Data (2)',
        'wine.3': 'Wine Recognition Data (3)',
        'wine-q-red': 'Wine Quality Red (6-10)',
        'wine-q-white': 'Wine Quality White (6-10)',
        'yeast': 'Yeast',
    }

    # the identifier is an alias for the dataset group, it's part of the url data-folder, and is the name we use
    # to download the raw dataset
    identifier_map = {
        'acute.a': 'acute',
        'acute.b': 'acute',
        'balance.1': 'balance-scale',
        'balance.2': 'balance-scale',
        'balance.3': 'balance-scale',
        'breast-cancer': 'breast-cancer-wisconsin',
        'cmc.1': 'cmc',
        'cmc.2': 'cmc',
        'cmc.3': 'cmc',
        'ctg.1': '00193',
        'ctg.2': '00193',
        'ctg.3': '00193',
        'german': 'statlog/german',
        'haberman': 'haberman',
        'ionosphere': 'ionosphere',
        'iris.1': 'iris',
        'iris.2': 'iris',
        'iris.3': 'iris',
        'mammographic': 'mammographic-masses',
        'pageblocks.5': 'page-blocks',
        'semeion': 'semeion',
        'sonar': 'undocumented/connectionist-bench/sonar',
        'spambase': 'spambase',
        'spectf': 'spect',
        'tictactoe': 'tic-tac-toe',
        'transfusion': 'blood-transfusion',
        'wdbc': 'breast-cancer-wisconsin',
        'wine-q-red': 'wine-quality',
        'wine-q-white': 'wine-quality',
        'wine.1': 'wine',
        'wine.2': 'wine',
        'wine.3': 'wine',
        'yeast': 'yeast',
    }

    # the filename is the name of the file within the data_folder indexed by the identifier
    # (identifiers not listed here default to '<identifier>.data')
    file_name = {
        'acute': 'diagnosis.data',
        '00193': 'CTG.xls',
        'statlog/german': 'german.data-numeric',
        'mammographic-masses': 'mammographic_masses.data',
        'page-blocks': 'page-blocks.data.Z',
        'undocumented/connectionist-bench/sonar': 'sonar.all-data',
        'spect': ['SPECTF.train', 'SPECTF.test'],
        'blood-transfusion': 'transfusion.data',
        'wine-quality': ['winequality-red.csv', 'winequality-white.csv'],
        'breast-cancer-wisconsin': 'breast-cancer-wisconsin.data' if dataset_name=='breast-cancer' else 'wdbc.data'
    }

    # the filename containing the dataset description (if any)
    # (identifiers not listed here default to '<identifier>.names')
    desc_name = {
        'acute': 'diagnosis.names',
        '00193': None,
        'statlog/german': 'german.doc',
        'mammographic-masses': 'mammographic_masses.names',
        'undocumented/connectionist-bench/sonar': 'sonar.names',
        'spect': 'SPECTF.names',
        'blood-transfusion': 'transfusion.names',
        'wine-quality': 'winequality.names',
        'breast-cancer-wisconsin': 'breast-cancer-wisconsin.names' if dataset_name == 'breast-cancer' else 'wdbc.names'
    }

    identifier = identifier_map[dataset_name]
    filename = file_name.get(identifier, f'{identifier}.data')
    descfile = desc_name.get(identifier, f'{identifier}.names')
    fullname = dataset_fullname[dataset_name]

    URL = f'http://archive.ics.uci.edu/ml/machine-learning-databases/{identifier}'
    data_dir = join(data_home, 'uci_datasets', identifier)
    if isinstance(filename, str):  # filename could be a list of files, in which case it will be processed later
        data_path = join(data_dir, filename)
        download_file_if_not_exists(f'{URL}/{filename}', data_path)

    if descfile:
        # the description is optional: any failure here is reported but does not abort the loading
        try:
            download_file_if_not_exists(f'{URL}/{descfile}', f'{data_dir}/{descfile}')
            if verbose:
                with open(f'{data_dir}/{descfile}', 'rt') as desc:
                    print(desc.read())
        except Exception:
            print('could not read the description file')
    elif verbose:
        print('no file description available')

    print(f'Loading {dataset_name} ({fullname})')
    if identifier == 'acute':
        df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t')

        # column 0 uses a decimal comma; columns 1-5 are yes/no answers
        df[0] = df[0].apply(lambda x: float(x.replace(',', '.'))).astype(float, copy=False)
        for col in range(1, 6):
            _df_replace(df, col)
        X = df.loc[:, 0:5].values
        if dataset_name == 'acute.a':
            y = binarize(df[6], pos_class='yes')
        elif dataset_name == 'acute.b':
            y = binarize(df[7], pos_class='yes')

    if identifier == 'balance-scale':
        df = pd.read_csv(data_path, header=None, sep=',')
        if dataset_name == 'balance.1':
            y = binarize(df[0], pos_class='L')
        elif dataset_name == 'balance.2':
            y = binarize(df[0], pos_class='B')
        elif dataset_name == 'balance.3':
            y = binarize(df[0], pos_class='R')
        X = df.loc[:, 1:].astype(float).values

    if identifier == 'breast-cancer-wisconsin' and dataset_name=='breast-cancer':
        df = pd.read_csv(data_path, header=None, sep=',')
        Xy = df.loc[:, 1:10]
        # rows containing missing values (marked as '?') are discarded
        Xy[Xy=='?']=np.nan
        Xy = Xy.dropna(axis=0)
        X = Xy.loc[:, 1:9]
        X = X.astype(float).values
        y = binarize(Xy[10], pos_class=2)

    if identifier == 'breast-cancer-wisconsin' and dataset_name=='wdbc':
        df = pd.read_csv(data_path, header=None, sep=',')
        X = df.loc[:, 2:32].astype(float).values
        y = df[1].values
        y = binarize(y, pos_class='M')

    if identifier == 'cmc':
        df = pd.read_csv(data_path, header=None, sep=',')
        X = df.loc[:, 0:8].astype(float).values
        y = df[9].astype(int).values
        if dataset_name == 'cmc.1':
            y = binarize(y, pos_class=1)
        elif dataset_name == 'cmc.2':
            y = binarize(y, pos_class=2)
        elif dataset_name == 'cmc.3':
            y = binarize(y, pos_class=3)

    if identifier == '00193':
        df = pd.read_excel(data_path, sheet_name='Data', skipfooter=3)
        df = df[list(range(1,24))] # select columns numbered (number 23 is the target label)
        # replaces the header with the first row
        new_header = df.iloc[0]  # grab the first row for the header
        df = df[1:]  # take the data less the header row
        df.columns = new_header  # set the header row as the df header
        X = df.iloc[:, 0:22].astype(float).values
        y = df['NSP'].astype(int).values
        if dataset_name == 'ctg.1':
            y = binarize(y, pos_class=1)  # 1==Normal
        elif dataset_name == 'ctg.2':
            y = binarize(y, pos_class=2)  # 2==Suspect
        elif dataset_name == 'ctg.3':
            y = binarize(y, pos_class=3)  # 3==Pathologic

    if identifier == 'statlog/german':
        # sep=r'\s+' is the documented equivalent of the deprecated delim_whitespace=True
        df = pd.read_csv(data_path, header=None, sep=r'\s+')
        X = df.iloc[:, 0:24].astype(float).values
        y = df[24].astype(int).values
        y = binarize(y, pos_class=1)

    if identifier == 'haberman':
        df = pd.read_csv(data_path, header=None)
        X = df.iloc[:, 0:3].astype(float).values
        y = df[3].astype(int).values
        y = binarize(y, pos_class=2)

    if identifier == 'ionosphere':
        df = pd.read_csv(data_path, header=None)
        X = df.iloc[:, 0:34].astype(float).values
        y = df[34].values
        y = binarize(y, pos_class='b')

    if identifier == 'iris':
        df = pd.read_csv(data_path, header=None)
        X = df.iloc[:, 0:4].astype(float).values
        y = df[4].values
        if dataset_name == 'iris.1':
            y = binarize(y, pos_class='Iris-setosa')  # 1==Setosa
        elif dataset_name == 'iris.2':
            y = binarize(y, pos_class='Iris-versicolor')  # 2==Versicolor
        elif dataset_name == 'iris.3':
            y = binarize(y, pos_class='Iris-virginica')  # 3==Virginica

    if identifier == 'mammographic-masses':
        df = pd.read_csv(data_path, header=None, sep=',')
        # rows containing missing values (marked as '?') are discarded
        df[df == '?'] = np.nan
        Xy = df.dropna(axis=0)
        X = Xy.iloc[:, 0:5]
        X = X.astype(float).values
        y = binarize(Xy.iloc[:,5], pos_class=1)

    if identifier == 'page-blocks':
        data_path_ = data_path.replace('.Z', '')
        if not os.path.exists(data_path_):
            raise FileNotFoundError(f'Warning: file {data_path_} does not exist. If this is the first time you '
                                    f'attempt to load this dataset, then you have to manually unzip the {data_path} '
                                    f'and name the extracted file {data_path_} (unfortunately, neither zipfile, nor '
                                    f'gzip can handle unix compressed files automatically -- there is a repo in GitHub '
                                    f'https://github.com/umeat/unlzw where the problem seems to be solved anyway).')
        df = pd.read_csv(data_path_, header=None, sep=r'\s+')
        X = df.iloc[:, 0:10].astype(float).values
        y = df[10].values
        y = binarize(y, pos_class=5)  # 5==block "graphic"

    if identifier == 'semeion':
        df = pd.read_csv(data_path, header=None, sep=r'\s+')
        X = df.iloc[:, 0:256].astype(float).values
        y = df[263].values  # 263 stands for digit 8 (labels are one-hot vectors from col 256-266)
        y = binarize(y, pos_class=1)

    if identifier == 'undocumented/connectionist-bench/sonar':
        df = pd.read_csv(data_path, header=None, sep=',')
        X = df.iloc[:, 0:60].astype(float).values
        y = df[60].values
        y = binarize(y, pos_class='R')

    if identifier == 'spambase':
        df = pd.read_csv(data_path, header=None, sep=',')
        X = df.iloc[:, 0:57].astype(float).values
        y = df[57].values
        y = binarize(y, pos_class=1)

    if identifier == 'spect':
        # this collection comes in two files (train and test) that are concatenated
        dfs = []
        for file in filename:
            data_path = join(data_dir, file)
            download_file_if_not_exists(f'{URL}/{file}', data_path)
            dfs.append(pd.read_csv(data_path, header=None, sep=','))
        df = pd.concat(dfs)
        X = df.iloc[:, 1:45].astype(float).values
        y = df[0].values
        y = binarize(y, pos_class=0)

    if identifier == 'tic-tac-toe':
        df = pd.read_csv(data_path, header=None, sep=',')
        X = df.iloc[:, 0:9].replace('o',0).replace('b',1).replace('x',2).values
        y = df[9].values
        y = binarize(y, pos_class='negative')

    if identifier == 'blood-transfusion':
        df = pd.read_csv(data_path, sep=',')
        X = df.iloc[:, 0:4].astype(float).values
        y = df.iloc[:, 4].values
        y = binarize(y, pos_class=1)

    if identifier == 'wine':
        df = pd.read_csv(data_path, header=None, sep=',')
        X = df.iloc[:, 1:14].astype(float).values
        y = df[0].values
        if dataset_name == 'wine.1':
            y = binarize(y, pos_class=1)
        elif dataset_name == 'wine.2':
            y = binarize(y, pos_class=2)
        elif dataset_name == 'wine.3':
            y = binarize(y, pos_class=3)

    if identifier == 'wine-quality':
        # only the file that corresponds to the requested variant (red or white) is downloaded
        filename = filename[0] if dataset_name=='wine-q-red' else filename[1]
        data_path = join(data_dir, filename)
        download_file_if_not_exists(f'{URL}/{filename}', data_path)
        df = pd.read_csv(data_path, sep=';')
        X = df.iloc[:, 0:11].astype(float).values
        y = df.iloc[:, 11].values > 5

    if identifier == 'yeast':
        df = pd.read_csv(data_path, header=None, sep=r'\s+')
        X = df.iloc[:, 1:9].astype(float).values
        y = df.iloc[:, 9].values
        y = binarize(y, pos_class='NUC')

    data = LabelledCollection(X, y)
    data.stats()
    return data


def _df_replace(df, col, repl=None, astype=float):
    """
    Replaces, in place, every value of column `col` of `df` by its image in `repl`, and casts the column.

    :param df: the dataframe to modify (in place)
    :param col: the label of the column to transform
    :param repl: a dictionary mapping every value found in the column to its replacement; if None (default),
        the mapping {'yes': 1, 'no': 0} is used
    :param astype: the type the transformed column is cast to (default float)
    :raises KeyError: if the column contains a value that is not a key of `repl`
    """
    if repl is None:
        # the default is created at call time, so that no mutable object is shared across calls
        repl = {'yes': 1, 'no': 0}
    df[col] = df[col].apply(lambda x: repl[x]).astype(astype, copy=False)


def fetch_lequa2022(task, data_home=None):
    """
    Fetches the official datasets of the `LeQua <https://lequa2022.github.io/index>`_ competition.
    The competition consists of 4 text quantification tasks (T1A, T1B, T2A, T2B). In tasks T1A and T1B the
    documents are provided in vector form, while in tasks T2A and T2B the documents are provided in raw form.
    Tasks T1A and T2A are binary sentiment quantification problems, while T1B and T2B are multiclass quantification
    problems consisting of estimating the class prevalence values of 28 different merchandise products.
    We refer to the `Esuli, A., Moreo, A., Sebastiani, F., & Sperduti, G. (2022).
    A Detailed Overview of LeQua@ CLEF 2022: Learning to Quantify.
    <https://ceur-ws.org/Vol-3180/paper-146.pdf>`_ for a detailed description
    on the tasks and datasets.

    The datasets are downloaded only once (into `<data_home>/lequa2022/`), and stored for fast reuse.

    See `lequa2022_experiments.py` provided in the example folder, that can serve as a guide on how to use these
    datasets.

    :param task: a string representing the task name; valid ones are T1A, T1B, T2A, and T2B
    :param data_home: the quapy home directory where collections will be dumped (leave as None to use the
        directory returned by `get_quapy_home()`)
    :return: a tuple `(train, val_gen, test_gen)` where `train` is an instance of
        :class:`quapy.data.base.LabelledCollection`, `val_gen` and `test_gen` are instances of
        `SamplesFromDir` (imported from `quapy.data._lequa2022`), i.e., are sampling protocols that return a
        series of samples labelled by prevalence.
    """

    from quapy.data._lequa2022 import load_raw_documents, load_vector_documents, SamplesFromDir

    assert task in LEQUA2022_TASKS, \
        f'Unknown task {task}. Valid ones are {LEQUA2022_TASKS}'
    if data_home is None:
        data_home = get_quapy_home()

    lequa_dir = join(data_home, 'lequa2022')
    os.makedirs(lequa_dir, exist_ok=True)
    task_dir = join(lequa_dir, task)

    def download_unzip_and_remove(unzipped_path, url):
        # the archive is stored in a temporary file that is deleted once extracted
        archive = join(lequa_dir, task + '_tmp.zip')
        download_file_if_not_exists(url, archive)
        with zipfile.ZipFile(archive) as zipped:
            zipped.extractall(unzipped_path)
        os.remove(archive)

    # the presence of the task folder is taken as the sign that the data is already available
    if not os.path.exists(task_dir):
        for part in ('train_dev', 'test', 'test_prevalences'):
            download_unzip_and_remove(lequa_dir, f'https://zenodo.org/record/6546188/files/{task}.{part}.zip')

    # vector documents for the T1 tasks, raw documents for the T2 tasks
    if task in ['T1A', 'T1B']:
        load_fn = load_vector_documents
    elif task in ['T2A', 'T2B']:
        load_fn = load_raw_documents

    public_dir = join(task_dir, 'public')

    train = LabelledCollection.load(join(public_dir, 'training_data.txt'), loader_func=load_fn)

    val_gen = SamplesFromDir(
        join(public_dir, 'dev_samples'),
        join(public_dir, 'dev_prevalences.txt'),
        load_fn=load_fn
    )

    test_gen = SamplesFromDir(
        join(public_dir, 'test_samples'),
        join(public_dir, 'test_prevalences.txt'),
        load_fn=load_fn
    )

    return train, val_gen, test_gen

def fetch_IFCB(data_home=None):
    """
    Downloads and unpacks the three IFCB archives (train, test, and test prevalences) hosted in Zenodo
    (record 10036244) into the `ifcb` sub-directory of the quapy home directory.

    The archives are fetched only once: after the three of them have been extracted, an empty marker file
    is written in the `ifcb` directory, and subsequent invocations return immediately if the marker exists.

    :param data_home: specify the quapy home directory where the archives will be unpacked (leave empty to use
        the directory returned by `get_quapy_home()`)
    :return: None; at present this function only takes care of making the data available on disk
    """

    if data_home is None:
        data_home = get_quapy_home()

    URL_TRAINDEV = 'https://zenodo.org/records/10036244/files/IFCB.train.zip'
    URL_TEST = 'https://zenodo.org/records/10036244/files/IFCB.test.zip'
    URL_TEST_PREV = 'https://zenodo.org/records/10036244/files/IFCB.test_prevalences.zip'

    ifcb_dir = join(data_home, 'ifcb')
    os.makedirs(ifcb_dir, exist_ok=True)

    def download_unzip_and_remove(unzipped_path, url):
        # every archive goes through the same temporary file, so it must always be removed (even if the
        # download or the extraction fails); otherwise `download_file_if_not_exists` would silently reuse
        # a stale or truncated zip for the next url
        tmp_path = join(ifcb_dir, 'tmp.zip')
        try:
            download_file_if_not_exists(url, tmp_path)
            with zipfile.ZipFile(tmp_path) as file:
                file.extractall(unzipped_path)
        finally:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    # The original guard tested `join(ifcb_dir, task)`, but `task` is not defined in this function (it was
    # inherited from `fetch_lequa2022`) and raised a NameError on every call. A marker file is used instead
    # since it does not depend on the internal layout of the archives and is only written once all three
    # archives have been successfully extracted.
    completed_marker = join(ifcb_dir, '.download_complete')
    if not os.path.exists(completed_marker):
        for url in (URL_TRAINDEV, URL_TEST, URL_TEST_PREV):
            download_unzip_and_remove(ifcb_dir, url)
        with open(completed_marker, 'w'):
            pass
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								def warn(*args, **kwargs):
 								    pass
 								import warnings
 								warnings.warn = warn
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								import os
-												import fixes

											
										
										
											2021-01-15 18:32:32 +01:00
+								import zipfile
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								from os.path import join
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
+								import pandas as pd
-												more examples, one-vs-all fixed

											
										
										
											2023-02-09 19:39:16 +01:00
+								import scipy
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
-												imports fix

											
										
										
											2021-03-19 17:34:09 +01:00
+								from quapy.data.base import Dataset, LabelledCollection
-												import fixes

											
										
										
											2021-01-15 18:32:32 +01:00
+								from quapy.data.preprocessing import text2tfidf, reduce_columns
 								from quapy.data.reader import *
 								from quapy.util import download_file_if_not_exists, download_file, get_quapy_home, pickled_resource
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
-												full example of training, model selection, and evaluation using the lequa2022 dataset with the new protocols

											
										
										
											2022-11-04 15:04:36 +01:00
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb']
-												testing quapy via replicating Tweet Quantification experiments

											
										
										
											2021-01-12 17:39:00 +01:00
+								TWITTER_SENTIMENT_DATASETS_TEST = ['gasp', 'hcr', 'omd', 'sanders',
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
+								                              'semeval13', 'semeval14', 'semeval15', 'semeval16',
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								                              'sst', 'wa', 'wb']
-												testing quapy via replicating Tweet Quantification experiments

											
										
										
											2021-01-12 17:39:00 +01:00
+								TWITTER_SENTIMENT_DATASETS_TRAIN = ['gasp', 'hcr', 'omd', 'sanders',
 								                                 'semeval', 'semeval16',
 								                                 'sst', 'wa', 'wb']
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								UCI_DATASETS = ['acute.a', 'acute.b',
 								                'balance.1', 'balance.2', 'balance.3',
 								                'breast-cancer',
 								                'cmc.1', 'cmc.2', 'cmc.3',
 								                'ctg.1', 'ctg.2', 'ctg.3',
 								                #'diabetes', # <-- I haven't found this one...
 								                'german',
 								                'haberman',
 								                'ionosphere',
 								                'iris.1', 'iris.2', 'iris.3',
 								                'mammographic',
 								                'pageblocks.5',
 								                #'phoneme', # <-- I haven't found this one...
 								                'semeion',
 								                'sonar',
 								                'spambase',
 								                'spectf',
 								                'tictactoe',
 								                'transfusion',
 								                'wdbc',
 								                'wine.1', 'wine.2', 'wine.3',
 								                'wine-q-red', 'wine-q-white',
 								                'yeast']
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
-												lequa as dataset

											
										
										
											2022-06-01 18:28:59 +02:00
+								LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B']
-												full example of training, model selection, and evaluation using the lequa2022 dataset with the new protocols

											
										
										
											2022-11-04 15:04:36 +01:00
+								_TXA_SAMPLE_SIZE = 250
 								_TXB_SAMPLE_SIZE = 1000
 								LEQUA2022_SAMPLE_SIZE = {
 								    'TXA': _TXA_SAMPLE_SIZE,
 								    'TXB': _TXB_SAMPLE_SIZE,
 								    'T1A': _TXA_SAMPLE_SIZE,
 								    'T1B': _TXB_SAMPLE_SIZE,
 								    'T2A': _TXA_SAMPLE_SIZE,
 								    'T2B': _TXB_SAMPLE_SIZE,
 								    'binary': _TXA_SAMPLE_SIZE,
 								    'multiclass': _TXB_SAMPLE_SIZE
 								}
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
-												Bug fixes on use of classes_. Tests.

											
										
										
											2021-05-05 17:12:44 +02:00
+								def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset:
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								    """
-												updating the documentation

											
										
										
											2021-12-06 18:25:47 +01:00
+								    Loads a Reviews dataset as a Dataset instance, as used in
 								    `Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification."
 								    Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018. <https://dl.acm.org/doi/abs/10.1145/3269206.3269287>`_.
 								    The list of valid dataset names can be accessed in `quapy.data.datasets.REVIEWS_SENTIMENT_DATASETS`
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								    :param dataset_name: the name of the dataset: valid ones are 'hp', 'kindle', 'imdb'
 								    :param tfidf: set to True to transform the raw documents into tfidf weighted matrices
 								    :param min_df: minimum number of documents that should contain a term in order for the term to be
-												updating the documentation

											
										
										
											2021-12-06 18:25:47 +01:00
+								        kept (ignored if tfidf==False)
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
-												updating the documentation

											
										
										
											2021-12-06 18:25:47 +01:00
+								        ~/quay_data/ directory)
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								    :param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
-												updating the documentation

											
										
										
											2021-12-06 18:25:47 +01:00
+								        faster subsequent invocations
 								    :return: a :class:`quapy.data.base.Dataset` instance
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								    """
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								    assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \
 								        f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \
 								        f'Valid ones are {REVIEWS_SENTIMENT_DATASETS}'
 								    if data_home is None:
 								        data_home = get_quapy_home()
 								    URL_TRAIN = f'https://zenodo.org/record/4117827/files/{dataset_name}_train.txt'
 								    URL_TEST = f'https://zenodo.org/record/4117827/files/{dataset_name}_test.txt'
 								    os.makedirs(join(data_home, 'reviews'), exist_ok=True)
 								    train_path = join(data_home, 'reviews', dataset_name, 'train.txt')
 								    test_path = join(data_home, 'reviews', dataset_name, 'test.txt')
 								    download_file_if_not_exists(URL_TRAIN, train_path)
 								    download_file_if_not_exists(URL_TEST, test_path)
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								    pickle_path = None
 								    if pickle:
 								        pickle_path = join(data_home, 'reviews', 'pickle', f'{dataset_name}.pkl')
 								    data = pickled_resource(pickle_path, Dataset.load, train_path, test_path, from_text)
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
 								    if tfidf:
 								        text2tfidf(data, inplace=True)
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								        if min_df is not None:
 								            reduce_columns(data, min_df=min_df, inplace=True)
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
-												adding tweet sent quant experiments

											
										
										
											2021-01-11 18:31:12 +01:00
+								    data.name = dataset_name
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								    return data
-												Bug fixes on use of classes_. Tests.

											
										
										
											2021-05-05 17:12:44 +02:00
+								def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False) -> Dataset:
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								    """
-												updating the documentation

											
										
										
											2021-12-06 18:25:47 +01:00
+								    Loads a Twitter dataset as a :class:`quapy.data.base.Dataset` instance, as used in:
 								    `Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
 								    Social Network Analysis and Mining6(19), 1–22 (2016) <https://link.springer.com/content/pdf/10.1007/s13278-016-0327-z.pdf>`_
 								    Note that the datasets 'semeval13', 'semeval14', 'semeval15' share the same training set.
 								    The list of valid dataset names corresponding to training sets can be accessed in
 								    `quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN`, while the test sets can be accessed in
 								    `quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TEST`
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
 								    :param dataset_name: the name of the dataset: valid ones are 'gasp', 'hcr', 'omd', 'sanders', 'semeval13',
-												updating the documentation

											
										
										
											2021-12-06 18:25:47 +01:00
+								        'semeval14', 'semeval15', 'semeval16', 'sst', 'wa', 'wb'
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								    :param for_model_selection: if True, then returns the train split as the training set and the devel split
-												updating the documentation

											
										
										
											2021-12-06 18:25:47 +01:00
+								        as the test set; if False, then returns the train+devel split as the training set and the test set as the
 								        test set
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								    :param min_df: minimum number of documents that should contain a term in order for the term to be kept
 								    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
-												updating the documentation

											
										
										
											2021-12-06 18:25:47 +01:00
+								        ~/quay_data/ directory)
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								    :param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
-												updating the documentation

											
										
										
											2021-12-06 18:25:47 +01:00
+								        faster subsequent invocations
 								    :return: a :class:`quapy.data.base.Dataset` instance
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								    """
-												testing quapy via replicating Tweet Quantification experiments

											
										
										
											2021-01-12 17:39:00 +01:00
+								    assert dataset_name in TWITTER_SENTIMENT_DATASETS_TRAIN + TWITTER_SENTIMENT_DATASETS_TEST, \
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								        f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \
-												testing quapy via replicating Tweet Quantification experiments

											
										
										
											2021-01-12 17:39:00 +01:00
+								        f'Valid ones are {TWITTER_SENTIMENT_DATASETS_TRAIN} for model selection and ' \
 								        f'{TWITTER_SENTIMENT_DATASETS_TEST} for test (datasets "semeval14", "semeval15", "semeval16" share ' \
 								        f'a common training set "semeval")'
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								    if data_home is None:
 								        data_home = get_quapy_home()
 								    URL = 'https://zenodo.org/record/4255764/files/tweet_sentiment_quantification_snam.zip'
 								    unzipped_path = join(data_home, 'tweet_sentiment_quantification_snam')
 								    if not os.path.exists(unzipped_path):
 								        downloaded_path = join(data_home, 'tweet_sentiment_quantification_snam.zip')
 								        download_file(URL, downloaded_path)
 								        with zipfile.ZipFile(downloaded_path) as file:
 								            file.extractall(data_home)
 								        os.remove(downloaded_path)
 								    if dataset_name in {'semeval13', 'semeval14', 'semeval15'}:
 								        trainset_name = 'semeval'
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								        testset_name  = 'semeval' if for_model_selection else dataset_name
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								        print(f"the training and development sets for datasets 'semeval13', 'semeval14', 'semeval15' are common "
 								              f"(called 'semeval'); returning trainin-set='{trainset_name}' and test-set={testset_name}")
 								    else:
-												testing quapy via replicating Tweet Quantification experiments

											
										
										
											2021-01-12 17:39:00 +01:00
+								        if dataset_name == 'semeval' and for_model_selection==False:
 								            raise ValueError('dataset "semeval" can only be used for model selection. '
 								                             'Use "semeval13", "semeval14", or "semeval15" for model evaluation.')
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								        trainset_name = testset_name = dataset_name
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								    if for_model_selection:
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								        train = join(unzipped_path, 'train', f'{trainset_name}.train.feature.txt')
 								        test  = join(unzipped_path, 'test', f'{testset_name}.dev.feature.txt')
 								    else:
 								        train = join(unzipped_path, 'train', f'{trainset_name}.train+dev.feature.txt')
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								        if dataset_name == 'semeval16':  # there is a different test name in the case of semeval16 only
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								            test = join(unzipped_path, 'test', f'{testset_name}.dev-test.feature.txt')
 								        else:
 								            test = join(unzipped_path, 'test', f'{testset_name}.test.feature.txt')
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								    pickle_path = None
 								    if pickle:
 								        mode = "train-dev" if for_model_selection else "train+dev-test"
 								        pickle_path = join(unzipped_path, 'pickle', f'{testset_name}.{mode}.pkl')
 								    data = pickled_resource(pickle_path, Dataset.load, train, test, from_sparse)
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
 								    if min_df is not None:
 								        reduce_columns(data, min_df=min_df, inplace=True)
-												adding tweet sent quant experiments

											
										
										
											2021-01-11 18:31:12 +01:00
+								    data.name = dataset_name
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								    return data
-												Bug fixes on use of classes_. Tests.

											
										
										
											2021-05-05 17:12:44 +02:00
+								def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
-												updating the documentation

											
										
										
											2021-12-06 18:25:47 +01:00
+								    """
 								    Loads a UCI dataset as an instance of :class:`quapy.data.base.Dataset`, as used in
 								    `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
 								    Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
 								    Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
 								    and
 								    `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
 								    Dynamic ensemble selection for quantification tasks.
 								    Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
 								    The datasets do not come with a predefined train-test split (see :meth:`fetch_UCILabelledCollection` for further
 								    information on how to use these collections), and so a train-test split is generated at desired proportion.
 								    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_DATASETS`
 								    :param dataset_name: a dataset name
 								    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
 								        ~/quay_data/ directory)
 								    :param test_split: proportion of documents to be included in the test set. The rest conforms the training set
 								    :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
 								    :return: a :class:`quapy.data.base.Dataset` instance
 								    """
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								    data = fetch_UCILabelledCollection(dataset_name, data_home, verbose)
 								    return Dataset(*data.split_stratified(1 - test_split, random_state=0))
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
-												adding uci_experiments to examples folder

											
										
										
											2023-03-23 15:46:03 +01:00
+								def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
-												updating the documentation

											
										
										
											2021-12-06 18:25:47 +01:00
+								    """
 								    Loads a UCI collection as an instance of :class:`quapy.data.base.LabelledCollection`, as used in
 								    `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
 								    Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
 								    Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
 								    and
 								    `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
 								    Dynamic ensemble selection for quantification tasks.
 								    Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
 								    The datasets do not come with a predefined train-test split, and so Pérez-Gállego et al. adopted a 5FCVx2 evaluation
 								    protocol, meaning that each collection was used to generate two rounds (hence the x2) of 5 fold cross validation.
 								    This can be reproduced by using :meth:`quapy.data.base.Dataset.kFCV`, e.g.:
 								    >>> import quapy as qp
 								    >>> collection = qp.datasets.fetch_UCILabelledCollection("yeast")
-												adding uci_experiments to examples folder

											
										
										
											2023-03-23 15:46:03 +01:00
+								    >>> for data in qp.domains.Dataset.kFCV(collection, nfolds=5, nrepeats=2):
-												updating the documentation

											
										
										
											2021-12-06 18:25:47 +01:00
+								    >>>     ...
 								    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_DATASETS`
 								    :param dataset_name: a dataset name
 								    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
 								        ~/quay_data/ directory)
 								    :param test_split: proportion of documents to be included in the test set. The rest conforms the training set
 								    :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
-												adding uci_experiments to examples folder

											
										
										
											2023-03-23 15:46:03 +01:00
+								    :return: a :class:`quapy.data.base.LabelledCollection` instance
-												updating the documentation

											
										
										
											2021-12-06 18:25:47 +01:00
+								    """
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
 								    assert dataset_name in UCI_DATASETS, \
 								        f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
 								        f'Valid ones are {UCI_DATASETS}'
 								    if data_home is None:
 								        data_home = get_quapy_home()
 								    dataset_fullname = {
 								        'acute.a': 'Acute Inflammations (urinary bladder)',
 								        'acute.b': 'Acute Inflammations (renal pelvis)',
 								        'balance.1': 'Balance Scale Weight & Distance Database (left)',
 								        'balance.2': 'Balance Scale Weight & Distance Database (balanced)',
 								        'balance.3': 'Balance Scale Weight & Distance Database (right)',
-												cleaning and adding some uci datasets

											
										
										
											2021-01-11 12:55:06 +01:00
+								        'breast-cancer':  'Breast Cancer Wisconsin (Original)',
 								        'cmc.1': 'Contraceptive Method Choice (no use)',
 								        'cmc.2': 'Contraceptive Method Choice (long term)',
 								        'cmc.3': 'Contraceptive Method Choice (short term)',
 								        'ctg.1': 'Cardiotocography Data Set (normal)',
 								        'ctg.2': 'Cardiotocography Data Set (suspect)',
 								        'ctg.3': 'Cardiotocography Data Set (pathologic)',
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
+								        'german': 'Statlog German Credit Data',
-												adding uci ml datasets

											
										
										
											2021-01-25 18:38:56 +01:00
+								        'haberman': "Haberman's Survival Data",
 								        'ionosphere': 'Johns Hopkins University Ionosphere DB',
 								        'iris.1': 'Iris Plants Database(x)',
 								        'iris.2': 'Iris Plants Database(versicolour)',
 								        'iris.3': 'Iris Plants Database(virginica)',
 								        'mammographic': 'Mammographic Mass',
 								        'pageblocks.5': 'Page Blocks Classification (5)',
 								        'semeion': 'Semeion Handwritten Digit (8)',
-												more uci datasets, plots improved (higher fonts), and evaluation script that shows numerical results in command line

											
										
										
											2021-01-27 22:49:54 +01:00
+								        'sonar': 'Sonar, Mines vs. Rocks',
 								        'spambase': 'Spambase Data Set',
 								        'spectf': 'SPECTF Heart Data',
 								        'tictactoe': 'Tic-Tac-Toe Endgame Database',
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								        'transfusion': 'Blood Transfusion Service Center Data Set',
 								        'wdbc': 'Wisconsin Diagnostic Breast Cancer',
 								        'wine.1': 'Wine Recognition Data (1)',
 								        'wine.2': 'Wine Recognition Data (2)',
 								        'wine.3': 'Wine Recognition Data (3)',
 								        'wine-q-red': 'Wine Quality Red (6-10)',
 								        'wine-q-white': 'Wine Quality White (6-10)',
 								        'yeast': 'Yeast',
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
+								    }
 								    # the identifier is an alias for the dataset group, it's part of the url data-folder, and is the name we use
 								    # to download the raw dataset
 								    identifier_map = {
 								        'acute.a': 'acute',
 								        'acute.b': 'acute',
 								        'balance.1': 'balance-scale',
 								        'balance.2': 'balance-scale',
 								        'balance.3': 'balance-scale',
 								        'breast-cancer': 'breast-cancer-wisconsin',
 								        'cmc.1': 'cmc',
 								        'cmc.2': 'cmc',
 								        'cmc.3': 'cmc',
 								        'ctg.1': '00193',
 								        'ctg.2': '00193',
 								        'ctg.3': '00193',
-												adding uci ml datasets

											
										
										
											2021-01-25 18:38:56 +01:00
+								        'german': 'statlog/german',
 								        'haberman': 'haberman',
 								        'ionosphere': 'ionosphere',
 								        'iris.1': 'iris',
 								        'iris.2': 'iris',
 								        'iris.3': 'iris',
 								        'mammographic': 'mammographic-masses',
 								        'pageblocks.5': 'page-blocks',
 								        'semeion': 'semeion',
-												more uci datasets, plots improved (higher fonts), and evaluation script that shows numerical results in command line

											
										
										
											2021-01-27 22:49:54 +01:00
+								        'sonar': 'undocumented/connectionist-bench/sonar',
 								        'spambase': 'spambase',
 								        'spectf': 'spect',
 								        'tictactoe': 'tic-tac-toe',
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								        'transfusion': 'blood-transfusion',
 								        'wdbc': 'breast-cancer-wisconsin',
 								        'wine-q-red': 'wine-quality',
 								        'wine-q-white': 'wine-quality',
 								        'wine.1': 'wine',
 								        'wine.2': 'wine',
 								        'wine.3': 'wine',
 								        'yeast': 'yeast',
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
+								    }
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
+								    # the filename is the name of the file within the data_folder indexed by the identifier
 								    file_name = {
 								        'acute': 'diagnosis.data',
 								        '00193': 'CTG.xls',
-												adding uci ml datasets

											
										
										
											2021-01-25 18:38:56 +01:00
+								        'statlog/german': 'german.data-numeric',
 								        'mammographic-masses': 'mammographic_masses.data',
 								        'page-blocks': 'page-blocks.data.Z',
-												more uci datasets, plots improved (higher fonts), and evaluation script that shows numerical results in command line

											
										
										
											2021-01-27 22:49:54 +01:00
+								        'undocumented/connectionist-bench/sonar': 'sonar.all-data',
 								        'spect': ['SPECTF.train', 'SPECTF.test'],
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								        'blood-transfusion': 'transfusion.data',
 								        'wine-quality': ['winequality-red.csv', 'winequality-white.csv'],
 								        'breast-cancer-wisconsin': 'breast-cancer-wisconsin.data' if dataset_name=='breast-cancer' else 'wdbc.data'
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
+								    }
 								    # the filename containing the dataset description (if any)
 								    desc_name = {
 								        'acute': 'diagnosis.names',
 								        '00193': None,
-												adding uci ml datasets

											
										
										
											2021-01-25 18:38:56 +01:00
+								        'statlog/german': 'german.doc',
 								        'mammographic-masses': 'mammographic_masses.names',
-												more uci datasets, plots improved (higher fonts), and evaluation script that shows numerical results in command line

											
										
										
											2021-01-27 22:49:54 +01:00
+								        'undocumented/connectionist-bench/sonar': 'sonar.names',
 								        'spect': 'SPECTF.names',
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								        'blood-transfusion': 'transfusion.names',
 								        'wine-quality': 'winequality.names',
 								        'breast-cancer-wisconsin': 'breast-cancer-wisconsin.names' if dataset_name == 'breast-cancer' else 'wdbc.names'
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
+								    }
 								    identifier = identifier_map[dataset_name]
-												adding uci ml datasets

											
										
										
											2021-01-25 18:38:56 +01:00
+								    filename = file_name.get(identifier, f'{identifier}.data')
 								    descfile = desc_name.get(identifier, f'{identifier}.names')
 								    fullname = dataset_fullname[dataset_name]
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
+								    URL = f'http://archive.ics.uci.edu/ml/machine-learning-databases/{identifier}'
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
+								    data_dir = join(data_home, 'uci_datasets', identifier)
-												more uci datasets, plots improved (higher fonts), and evaluation script that shows numerical results in command line

											
										
										
											2021-01-27 22:49:54 +01:00
+								    if isinstance(filename, str):  # filename could be a list of files, in which case it will be processed later
 								        data_path = join(data_dir, filename)
 								        download_file_if_not_exists(f'{URL}/{filename}', data_path)
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
 								    if descfile:
-												adding uci ml datasets

											
										
										
											2021-01-25 18:38:56 +01:00
+								        try:
 								            download_file_if_not_exists(f'{URL}/{descfile}', f'{data_dir}/{descfile}')
 								            if verbose:
 								                print(open(f'{data_dir}/{descfile}', 'rt').read())
 								        except Exception:
 								            print('could not read the description file')
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
+								    elif verbose:
 								        print('no file description available')
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
-												adding uci ml datasets

											
										
										
											2021-01-25 18:38:56 +01:00
+								    print(f'Loading {dataset_name} ({fullname})')
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
+								    if identifier == 'acute':
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
+								        df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t')
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
 								        df[0] = df[0].apply(lambda x: float(x.replace(',', '.'))).astype(float, copy=False)
-												updating the documentation

											
										
										
											2021-12-06 18:25:47 +01:00
+								        [_df_replace(df, col) for col in range(1, 6)]
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								        X = df.loc[:, 0:5].values
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
+								        if dataset_name == 'acute.a':
 								            y = binarize(df[6], pos_class='yes')
 								        elif dataset_name == 'acute.b':
 								            y = binarize(df[7], pos_class='yes')
 								    if identifier == 'balance-scale':
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
+								        df = pd.read_csv(data_path, header=None, sep=',')
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
+								        if dataset_name == 'balance.1':
 								            y = binarize(df[0], pos_class='L')
 								        elif dataset_name == 'balance.2':
 								            y = binarize(df[0], pos_class='B')
 								        elif dataset_name == 'balance.3':
 								            y = binarize(df[0], pos_class='R')
 								        X = df.loc[:, 1:].astype(float).values
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								    if identifier == 'breast-cancer-wisconsin' and dataset_name=='breast-cancer':
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
+								        df = pd.read_csv(data_path, header=None, sep=',')
-												cleaning and adding some uci datasets

											
										
										
											2021-01-11 12:55:06 +01:00
+								        Xy = df.loc[:, 1:10]
 								        Xy[Xy=='?']=np.nan
 								        Xy = Xy.dropna(axis=0)
 								        X = Xy.loc[:, 1:9]
 								        X = X.astype(float).values
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								        y = binarize(Xy[10], pos_class=2)
 								    if identifier == 'breast-cancer-wisconsin' and dataset_name=='wdbc':
 								        df = pd.read_csv(data_path, header=None, sep=',')
 								        X = df.loc[:, 2:32].astype(float).values
 								        y = df[1].values
 								        y = binarize(y, pos_class='M')
-												cleaning and adding some uci datasets

											
										
										
											2021-01-11 12:55:06 +01:00
 								    if identifier == 'cmc':
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
+								        df = pd.read_csv(data_path, header=None, sep=',')
-												cleaning and adding some uci datasets

											
										
										
											2021-01-11 12:55:06 +01:00
+								        X = df.loc[:, 0:8].astype(float).values
 								        y = df[9].astype(int).values
 								        if dataset_name == 'cmc.1':
 								            y = binarize(y, pos_class=1)
 								        elif dataset_name == 'cmc.2':
 								            y = binarize(y, pos_class=2)
 								        elif dataset_name == 'cmc.3':
 								            y = binarize(y, pos_class=3)
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
+								    if identifier == '00193':
 								        df = pd.read_excel(data_path, sheet_name='Data', skipfooter=3)
 								        df = df[list(range(1,24))] # select columns numbered (number 23 is the target label)
 								        # replaces the header with the first row
 								        new_header = df.iloc[0]  # grab the first row for the header
 								        df = df[1:]  # take the data less the header row
 								        df.columns = new_header  # set the header row as the df header
 								        X = df.iloc[:, 0:22].astype(float).values
 								        y = df['NSP'].astype(int).values
-												adding uci ml datasets

											
										
										
											2021-01-25 18:38:56 +01:00
+								        if dataset_name == 'ctg.1':
 								            y = binarize(y, pos_class=1)  # 1==Normal
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
+								        elif dataset_name == 'ctg.2':
-												adding uci ml datasets

											
										
										
											2021-01-25 18:38:56 +01:00
+								            y = binarize(y, pos_class=2)  # 2==Suspect
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
+								        elif dataset_name == 'ctg.3':
-												adding uci ml datasets

											
										
										
											2021-01-25 18:38:56 +01:00
+								            y = binarize(y, pos_class=3)  # 3==Pathologic
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
+								    if identifier == 'statlog/german':
 								        df = pd.read_csv(data_path, header=None, delim_whitespace=True)
 								        X = df.iloc[:, 0:24].astype(float).values
 								        y = df[24].astype(int).values
 								        y = binarize(y, pos_class=1)
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
-												adding uci ml datasets

											
										
										
											2021-01-25 18:38:56 +01:00
+								    if identifier == 'haberman':
 								        df = pd.read_csv(data_path, header=None)
 								        X = df.iloc[:, 0:3].astype(float).values
 								        y = df[3].astype(int).values
 								        y = binarize(y, pos_class=2)
 								    if identifier == 'ionosphere':
 								        df = pd.read_csv(data_path, header=None)
 								        X = df.iloc[:, 0:34].astype(float).values
 								        y = df[34].values
 								        y = binarize(y, pos_class='b')
 								    if identifier == 'iris':
 								        df = pd.read_csv(data_path, header=None)
 								        X = df.iloc[:, 0:4].astype(float).values
 								        y = df[4].values
 								        if dataset_name == 'iris.1':
 								            y = binarize(y, pos_class='Iris-setosa')  # 1==Setosa
 								        elif dataset_name == 'iris.2':
 								            y = binarize(y, pos_class='Iris-versicolor')  # 2==Versicolor
 								        elif dataset_name == 'iris.3':
 								            y = binarize(y, pos_class='Iris-virginica')  # 3==Virginica
 								    if identifier == 'mammographic-masses':
 								        df = pd.read_csv(data_path, header=None, sep=',')
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								        df[df == '?'] = np.nan
 								        Xy = df.dropna(axis=0)
-												adding uci ml datasets

											
										
										
											2021-01-25 18:38:56 +01:00
+								        X = Xy.iloc[:, 0:5]
 								        X = X.astype(float).values
 								        y = binarize(Xy.iloc[:,5], pos_class=1)
 								    if identifier == 'page-blocks':
 								        data_path_ = data_path.replace('.Z', '')
 								        if not os.path.exists(data_path_):
 								            raise FileNotFoundError(f'Warning: file {data_path_} does not exist. If this is the first time you '
 								                                    f'attempt to load this dataset, then you have to manually unzip the {data_path} '
 								                                    f'and name the extracted file {data_path_} (unfortunately, neither zipfile, nor '
 								                                    f'gzip can handle unix compressed files automatically -- there is a repo in GitHub '
 								                                    f'https://github.com/umeat/unlzw where the problem seems to be solved anyway).')
 								        df = pd.read_csv(data_path_, header=None, delim_whitespace=True)
 								        X = df.iloc[:, 0:10].astype(float).values
 								        y = df[10].values
 								        y = binarize(y, pos_class=5)  # 5==block "graphic"
 								    if identifier == 'semeion':
 								        df = pd.read_csv(data_path, header=None, delim_whitespace=True )
 								        X = df.iloc[:, 0:256].astype(float).values
 								        y = df[263].values  # 263 stands for digit 8 (labels are one-hot vectors from col 256-266)
 								        y = binarize(y, pos_class=1)
 								    if identifier == 'undocumented/connectionist-bench/sonar':
 								        df = pd.read_csv(data_path, header=None, sep=',')
 								        X = df.iloc[:, 0:60].astype(float).values
-												more uci datasets, plots improved (higher fonts), and evaluation script that shows numerical results in command line

											
										
										
											2021-01-27 22:49:54 +01:00
+								        y = df[60].values
-												adding uci ml datasets

											
										
										
											2021-01-25 18:38:56 +01:00
+								        y = binarize(y, pos_class='R')
-												more uci datasets, plots improved (higher fonts), and evaluation script that shows numerical results in command line

											
										
										
											2021-01-27 22:49:54 +01:00
+								    if identifier == 'spambase':
 								        df = pd.read_csv(data_path, header=None, sep=',')
 								        X = df.iloc[:, 0:57].astype(float).values
 								        y = df[57].values
 								        y = binarize(y, pos_class=1)
 								    if identifier == 'spect':
 								        dfs = []
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								        for file in filename:
-												more uci datasets, plots improved (higher fonts), and evaluation script that shows numerical results in command line

											
										
										
											2021-01-27 22:49:54 +01:00
+								            data_path = join(data_dir, file)
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								            download_file_if_not_exists(f'{URL}/{file}', data_path)
-												more uci datasets, plots improved (higher fonts), and evaluation script that shows numerical results in command line

											
										
										
											2021-01-27 22:49:54 +01:00
+								            dfs.append(pd.read_csv(data_path, header=None, sep=','))
 								        df = pd.concat(dfs)
 								        X = df.iloc[:, 1:45].astype(float).values
 								        y = df[0].values
 								        y = binarize(y, pos_class=0)
 								    if identifier == 'tic-tac-toe':
 								        df = pd.read_csv(data_path, header=None, sep=',')
 								        X = df.iloc[:, 0:9].replace('o',0).replace('b',1).replace('x',2).values
 								        y = df[9].values
 								        y = binarize(y, pos_class='negative')
 								    if identifier == 'blood-transfusion':
 								        df = pd.read_csv(data_path, sep=',')
 								        X = df.iloc[:, 0:4].astype(float).values
 								        y = df.iloc[:, 4].values
 								        y = binarize(y, pos_class=1)
-												adding uci ml datasets

											
										
										
											2021-01-25 18:38:56 +01:00
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								    if identifier == 'wine':
 								        df = pd.read_csv(data_path, header=None, sep=',')
 								        X = df.iloc[:, 1:14].astype(float).values
 								        y = df[0].values
 								        if dataset_name == 'wine.1':
 								            y = binarize(y, pos_class=1)
 								        elif dataset_name == 'wine.2':
 								            y = binarize(y, pos_class=2)
 								        elif dataset_name == 'wine.3':
 								            y = binarize(y, pos_class=3)
 								    if identifier == 'wine-quality':
 								        filename = filename[0] if dataset_name=='wine-q-red' else filename[1]
 								        data_path = join(data_dir, filename)
 								        download_file_if_not_exists(f'{URL}/{filename}', data_path)
 								        df = pd.read_csv(data_path, sep=';')
 								        X = df.iloc[:, 0:11].astype(float).values
 								        y = df.iloc[:, 11].values > 5
 								    if identifier == 'yeast':
 								        df = pd.read_csv(data_path, header=None, delim_whitespace=True)
 								        X = df.iloc[:, 1:9].astype(float).values
 								        y = df.iloc[:, 9].values
 								        y = binarize(y, pos_class='NUC')
-												refactor: methods requiring a val_split can now declare a default value in the __init__ method that will be used in case the fit method is called without specifying the val_split, which now is by default None in the fit, i.e., by default takes the value of the init, that is generally set to 0.4; some uci datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size

											
										
										
											2021-01-22 18:01:51 +01:00
+								    data = LabelledCollection(X, y)
 								    data.stats()
-												all uci datasets from Pérez-Gállego added, quantification report added

											
										
										
											2021-01-28 18:22:43 +01:00
+								    return data
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
-												updating the documentation

											
										
										
											2021-12-06 18:25:47 +01:00
+								def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
-												lequa as dataset

											
										
										
											2022-06-01 18:28:59 +02:00
+								    df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)
 								def fetch_lequa2022(task, data_home=None):
 								    """
-												adding documentation and adding one new example

											
										
										
											2023-02-08 19:06:53 +01:00
+								    Loads the official datasets provided for the `LeQua <https://lequa2022.github.io/index>`_ competition.
 								    In brief, there are 4 tasks (T1A, T1B, T2A, T2B) having to do with text quantification
 								    problems. Tasks T1A and T1B provide documents in vector form, while T2A and T2B provide raw documents instead.
 								    Tasks T1A and T2A are binary sentiment quantification problems, while T2A and T2B are multiclass quantification
 								    problems consisting of estimating the class prevalence values of 28 different merchandise products.
 								    We refer to the `Esuli, A., Moreo, A., Sebastiani, F., & Sperduti, G. (2022).
 								    A Detailed Overview of LeQua@ CLEF 2022: Learning to Quantify.
 								    <https://ceur-ws.org/Vol-3180/paper-146.pdf>`_ for a detailed description
 								    on the tasks and datasets.
 								    The datasets are downloaded only once, and stored for fast reuse.
 								    See `lequa2022_experiments.py` provided in the example folder, that can serve as a guide on how to use these
 								    datasets.
 								    :param task: a string representing the task name; valid ones are T1A, T1B, T2A, and T2B
 								    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
 								        ~/quay_data/ directory)
 								    :return: a tuple `(train, val_gen, test_gen)` where `train` is an instance of
 								        :class:`quapy.data.base.LabelledCollection`, `val_gen` and `test_gen` are instances of
 								        :class:`quapy.protocol.SamplesFromDir`, i.e., are sampling protocols that return a series of samples
 								        labelled by prevalence.
-												lequa as dataset

											
										
										
											2022-06-01 18:28:59 +02:00
+								    """
-												adding documentation and adding one new example

											
										
										
											2023-02-08 19:06:53 +01:00
-												lequa as dataset

											
										
										
											2022-06-01 18:28:59 +02:00
+								    from quapy.data._lequa2022 import load_raw_documents, load_vector_documents, SamplesFromDir
 								    assert task in LEQUA2022_TASKS, \
 								        f'Unknown task {task}. Valid ones are {LEQUA2022_TASKS}'
 								    if data_home is None:
 								        data_home = get_quapy_home()
 								    URL_TRAINDEV=f'https://zenodo.org/record/6546188/files/{task}.train_dev.zip'
 								    URL_TEST=f'https://zenodo.org/record/6546188/files/{task}.test.zip'
 								    URL_TEST_PREV=f'https://zenodo.org/record/6546188/files/{task}.test_prevalences.zip'
 								    lequa_dir = join(data_home, 'lequa2022')
 								    os.makedirs(lequa_dir, exist_ok=True)
 								    def download_unzip_and_remove(unzipped_path, url):
 								        tmp_path = join(lequa_dir, task + '_tmp.zip')
 								        download_file_if_not_exists(url, tmp_path)
 								        with zipfile.ZipFile(tmp_path) as file:
 								            file.extractall(unzipped_path)
 								        os.remove(tmp_path)
 								    if not os.path.exists(join(lequa_dir, task)):
 								        download_unzip_and_remove(lequa_dir, URL_TRAINDEV)
 								        download_unzip_and_remove(lequa_dir, URL_TEST)
 								        download_unzip_and_remove(lequa_dir, URL_TEST_PREV)
 								    if task in ['T1A', 'T1B']:
 								        load_fn = load_vector_documents
 								    elif task in ['T2A', 'T2B']:
 								        load_fn = load_raw_documents
 								    tr_path = join(lequa_dir, task, 'public', 'training_data.txt')
 								    train = LabelledCollection.load(tr_path, loader_func=load_fn)
 								    val_samples_path = join(lequa_dir, task, 'public', 'dev_samples')
 								    val_true_prev_path = join(lequa_dir, task, 'public', 'dev_prevalences.txt')
 								    val_gen = SamplesFromDir(val_samples_path, val_true_prev_path, load_fn=load_fn)
-												full example of training, model selection, and evaluation using the lequa2022 dataset with the new protocols

											
										
										
											2022-11-04 15:04:36 +01:00
+								    test_samples_path = join(lequa_dir, task, 'public', 'test_samples')
-												lequa as dataset

											
										
										
											2022-06-01 18:28:59 +02:00
+								    test_true_prev_path = join(lequa_dir, task, 'public', 'test_prevalences.txt')
-												pathfix in lequa2022 datasets

											
										
										
											2022-06-15 14:36:02 +02:00
+								    test_gen = SamplesFromDir(test_samples_path, test_true_prev_path, load_fn=load_fn)
-												lequa as dataset

											
										
										
											2022-06-01 18:28:59 +02:00
 								    return train, val_gen, test_gen
-												adding plankton

											
										
										
											2023-10-24 11:54:19 +02:00
+								def fetch_IFCB(data_home=None):
 								    if data_home is None:
 								        data_home = get_quapy_home()
 								    URL_TRAINDEV=f'https://zenodo.org/records/10036244/files/IFCB.train.zip'
 								    URL_TEST=f'https://zenodo.org/records/10036244/files/IFCB.test.zip'
 								    URL_TEST_PREV=f'https://zenodo.org/records/10036244/files/IFCB.test_prevalences.zip'
 								    ifcb_dir = join(data_home, 'ifcb')
 								    os.makedirs(ifcb_dir, exist_ok=True)
 								    def download_unzip_and_remove(unzipped_path, url):
 								        tmp_path = join(ifcb_dir, 'tmp.zip')
 								        download_file_if_not_exists(url, tmp_path)
 								        with zipfile.ZipFile(tmp_path) as file:
 								            file.extractall(unzipped_path)
 								        os.remove(tmp_path)
 								    if not os.path.exists(join(ifcb_dir, task)):
 								        download_unzip_and_remove(ifcb_dir, URL_TRAINDEV)
 								        download_unzip_and_remove(ifcb_dir, URL_TEST)
 								        download_unzip_and_remove(ifcb_dir, URL_TEST_PREV)