QuaPy/quapy/data/datasets.py

import os
import zipfile
from os.path import join

import pandas as pd

from data.base import Dataset, LabelledCollection
from quapy.data.preprocessing import text2tfidf, reduce_columns
from quapy.data.reader import *
from quapy.util import download_file_if_not_exists, download_file, get_quapy_home, pickled_resource

# Dataset names accepted by fetch_reviews (checked by the assert at the top of that function).
REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb']
# Twitter dataset names that fetch_twitter reports as valid "for test" in its validation message.
TWITTER_SENTIMENT_DATASETS_TEST = ['gasp', 'hcr', 'omd', 'sanders',
                              'semeval13', 'semeval14', 'semeval15', 'semeval16',
                              'sst', 'wa', 'wb']
# Twitter dataset names that fetch_twitter reports as valid "for model selection"; 'semeval' is the
# train/dev set that fetch_twitter substitutes for 'semeval13', 'semeval14' and 'semeval15'.
TWITTER_SENTIMENT_DATASETS_TRAIN = ['gasp', 'hcr', 'omd', 'sanders',
                                 'semeval', 'semeval16',
                                 'sst', 'wa', 'wb']


def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False):
    """
    Load a Reviews dataset as a Dataset instance, as used in:
    Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification."
    Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018.

    The train and test files are downloaded (only if not already present) into
    <data_home>/reviews/<dataset_name>/.

    :param dataset_name: the name of the dataset: valid ones are 'hp', 'kindle', 'imdb'
    :param tfidf: set to True to transform the raw documents into tfidf weighted matrices
    :param min_df: minimum number of documents that should contain a term in order for the term to be
    kept (ignored if tfidf==False)
    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the
    default directory returned by get_quapy_home())
    :param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
    faster subsequent invocations
    :return: a Dataset instance, with its `name` attribute set to dataset_name
    """
    assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \
        f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \
        f'Valid ones are {REVIEWS_SENTIMENT_DATASETS}'
    if data_home is None:
        data_home = get_quapy_home()

    URL_TRAIN = f'https://zenodo.org/record/4117827/files/{dataset_name}_train.txt'
    URL_TEST = f'https://zenodo.org/record/4117827/files/{dataset_name}_test.txt'

    # each dataset is stored in its own sub-folder; create that folder (and not only its 'reviews'
    # parent) so that the destination directory of the downloads below is guaranteed to exist
    dataset_dir = join(data_home, 'reviews', dataset_name)
    os.makedirs(dataset_dir, exist_ok=True)
    train_path = join(dataset_dir, 'train.txt')
    test_path = join(dataset_dir, 'test.txt')
    download_file_if_not_exists(URL_TRAIN, train_path)
    download_file_if_not_exists(URL_TEST, test_path)

    # pickle_path stays None when pickling is not requested
    pickle_path = None
    if pickle:
        pickle_path = join(data_home, 'reviews', 'pickle', f'{dataset_name}.pkl')
    data = pickled_resource(pickle_path, Dataset.load, train_path, test_path, from_text)

    if tfidf:
        text2tfidf(data, inplace=True)
        # column reduction is only applied to the tfidf representation
        if min_df is not None:
            reduce_columns(data, min_df=min_df, inplace=True)

    data.name = dataset_name

    return data


def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False):
    """
    Load a Twitter dataset as a Dataset instance, as used in:
    Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
    Social Network Analysis and Mining6(19), 1–22 (2016)
    The datasets 'semeval13', 'semeval14', 'semeval15' share the same training set.

    :param dataset_name: the name of the dataset: valid ones are 'gasp', 'hcr', 'omd', 'sanders', 'semeval13',
    'semeval14', 'semeval15', 'semeval16', 'sst', 'wa', 'wb'; the name 'semeval' is also accepted, but only
    when for_model_selection is True
    :param for_model_selection: if True, then returns the train split as the training set and the devel split
    as the test set; if False, then returns the train+devel split as the training set and the test set as the
    test set
    :param min_df: minimum number of documents that should contain a term in order for the term to be kept
    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the
    default directory returned by get_quapy_home())
    :param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
    faster subsequent invocations
    :return: a Dataset instance, with its `name` attribute set to dataset_name
    :raises ValueError: if dataset_name is 'semeval' and for_model_selection is not set
    """
    assert dataset_name in TWITTER_SENTIMENT_DATASETS_TRAIN + TWITTER_SENTIMENT_DATASETS_TEST, \
        f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \
        f'Valid ones are {TWITTER_SENTIMENT_DATASETS_TRAIN} for model selection and ' \
        f'{TWITTER_SENTIMENT_DATASETS_TEST} for test (datasets "semeval13", "semeval14", "semeval15" share ' \
        f'a common training set "semeval")'
    if data_home is None:
        data_home = get_quapy_home()

    # the whole collection is distributed as one zip archive; it is downloaded and extracted only
    # if the extracted folder is not already present, and the archive is removed afterwards
    URL = 'https://zenodo.org/record/4255764/files/tweet_sentiment_quantification_snam.zip'
    unzipped_path = join(data_home, 'tweet_sentiment_quantification_snam')
    if not os.path.exists(unzipped_path):
        downloaded_path = join(data_home, 'tweet_sentiment_quantification_snam.zip')
        download_file(URL, downloaded_path)
        with zipfile.ZipFile(downloaded_path) as archive:
            archive.extractall(data_home)
        os.remove(downloaded_path)

    if dataset_name in {'semeval13', 'semeval14', 'semeval15'}:
        # these three datasets have their own test set but share the 'semeval' train/dev files
        trainset_name = 'semeval'
        testset_name  = 'semeval' if for_model_selection else dataset_name
        print(f"the training and development sets for datasets 'semeval13', 'semeval14', 'semeval15' are common "
              f"(called 'semeval'); returning trainin-set='{trainset_name}' and test-set={testset_name}")
    else:
        # truthiness test, consistent with the `if for_model_selection:` branch below
        if dataset_name == 'semeval' and not for_model_selection:
            raise ValueError('dataset "semeval" can only be used for model selection. '
                             'Use "semeval13", "semeval14", or "semeval15" for model evaluation.')
        trainset_name = testset_name = dataset_name

    if for_model_selection:
        train = join(unzipped_path, 'train', f'{trainset_name}.train.feature.txt')
        test  = join(unzipped_path, 'test', f'{testset_name}.dev.feature.txt')
    else:
        train = join(unzipped_path, 'train', f'{trainset_name}.train+dev.feature.txt')
        if dataset_name == 'semeval16':  # there is a different test name in the case of semeval16 only
            test = join(unzipped_path, 'test', f'{testset_name}.dev-test.feature.txt')
        else:
            test = join(unzipped_path, 'test', f'{testset_name}.test.feature.txt')

    # pickle_path stays None when pickling is not requested; the file name encodes both the test set
    # and the split mode, so the two modes of the same dataset do not share a pickle
    pickle_path = None
    if pickle:
        mode = "train-dev" if for_model_selection else "train+dev-test"
        pickle_path = join(unzipped_path, 'pickle', f'{testset_name}.{mode}.pkl')
    data = pickled_resource(pickle_path, Dataset.load, train, test, from_sparse)

    if min_df is not None:
        reduce_columns(data, min_df=min_df, inplace=True)

    data.name = dataset_name

    return data


# Dataset names accepted by fetch_UCIDataset (checked by the assert at the top of that function).
# Several names map to the same underlying UCI dataset (see identifier_map in fetch_UCIDataset) and
# differ in which column/class is treated as the positive one.
# NOTE(review): the 'ctg.*' names are listed here, but fetch_UCIDataset has no 'ctg' entry in its
# data_folder map and no loading branch for it -- presumably still work in progress; TODO confirm.
UCI_DATASETS = ['acute.a', 'acute.b',
                'balance.1', 'balance.2', 'balance.3',
                'breast-cancer',
                'cmc.1', 'cmc.2', 'cmc.3',
                'ctg.1', 'ctg.2', 'ctg.3'] # ongoing...

def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
    """
    Download (only if not already present) and parse one of the datasets of the UCI Machine Learning
    repository listed in UCI_DATASETS, binarizing its labels with respect to the positive class implied
    by dataset_name.

    NOTE: this loader is work in progress. After building the LabelledCollection and calling its stats()
    method, it unconditionally raises NotImplementedError, so nothing is returned to the caller yet.

    :param dataset_name: one of the names in UCI_DATASETS
    :param data_home: directory under which the files are stored, in 'uci_datasets/<identifier>' (leave empty
    to use the default directory returned by get_quapy_home())
    :param verbose: set to True to print the content of the '.names' description file of the dataset
    :raises NotImplementedError: always; for dataset names that have no loader yet (currently the 'ctg.*'
    ones) it is raised before anything is downloaded
    """
    assert dataset_name in UCI_DATASETS, \
        f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
        f'Valid ones are {UCI_DATASETS}'
    if data_home is None:
        data_home = get_quapy_home()

    # dataset name -> folder of the dataset in the UCI repository
    identifier_map = {
        'acute.a': 'acute',
        'acute.b': 'acute',
        'balance.1': 'balance-scale',
        'balance.2': 'balance-scale',
        'balance.3': 'balance-scale',
        'breast-cancer': 'breast-cancer-wisconsin',
        'cmc.1': 'cmc',
        'cmc.2': 'cmc',
        'cmc.3': 'cmc',
        'ctg.1': 'ctg',
        'ctg.2': 'ctg',
        'ctg.3': 'ctg',

    }

    # dataset name -> human-readable description (only used in the "Loading ..." message)
    dataset_fullname = {
        'acute.a': 'Acute Inflammations (urinary bladder)',
        'acute.b': 'Acute Inflammations (renal pelvis)',
        'balance.1': 'Balance Scale Weight & Distance Database (left)',
        'balance.2': 'Balance Scale Weight & Distance Database (balanced)',
        'balance.3': 'Balance Scale Weight & Distance Database (right)',
        'breast-cancer':  'Breast Cancer Wisconsin (Original)',
        'cmc.1': 'Contraceptive Method Choice (no use)',
        'cmc.2': 'Contraceptive Method Choice (long term)',
        'cmc.3': 'Contraceptive Method Choice (short term)',
        'ctg.1': 'Cardiotocography Data Set (normal)',
        'ctg.2': 'Cardiotocography Data Set (suspect)',
        'ctg.3': 'Cardiotocography Data Set (pathologic)',
    }

    # repository folder -> base name of the remote '.data' and '.names' files
    data_folder = {
        'acute': 'diagnosis',
        'balance-scale': 'balance-scale',
        'breast-cancer-wisconsin': 'breast-cancer-wisconsin',
        'cmc': 'cmc'
    }

    identifier = identifier_map[dataset_name]
    if identifier not in data_folder:
        # the name is listed in UCI_DATASETS but neither its remote file name nor its parser is
        # available; fail explicitly instead of with a bare KeyError on data_folder
        raise NotImplementedError(f'the loader for dataset {dataset_name} is not implemented yet')

    URL = f'http://archive.ics.uci.edu/ml/machine-learning-databases/{identifier}'
    data_path = join(data_home, 'uci_datasets', identifier)
    data_file = f'{data_path}/{identifier}.data'
    names_file = f'{data_path}/{identifier}.names'
    download_file_if_not_exists(f'{URL}/{data_folder[identifier]}.data', data_file)
    download_file_if_not_exists(f'{URL}/{data_folder[identifier]}.names', names_file)

    if verbose:
        # context manager so that the description file is closed after being printed
        with open(names_file, 'rt') as names:
            print(names.read())

    print(f'Loading {dataset_name} ({dataset_fullname[dataset_name]})')
    if identifier == 'acute':
        df = pd.read_csv(data_file, header=None, encoding='utf-16', sep='\t')
        # columns 6 and 7 hold the two 'yes'/'no' targets; the dataset name selects one of them
        if dataset_name == 'acute.a':
            y = binarize(df[6], pos_class='yes')
        elif dataset_name == 'acute.b':
            y = binarize(df[7], pos_class='yes')

        # column 0 uses a decimal comma; it is parsed and min-max scaled w.r.t. the range [35, 42]
        mintemp, maxtemp = 35, 42
        df[0] = df[0].apply(lambda x: (float(x.replace(',', '.')) - mintemp) / (maxtemp - mintemp)).astype(float, copy=False)
        # columns 1..5 are 'yes'/'no' features, converted in place to 1/0
        for col in range(1, 6):
            df_replace(df, col)
        X = df.loc[:, 0:5].values

    elif identifier == 'balance-scale':
        df = pd.read_csv(data_file, header=None, sep=',')
        # column 0 is the class label; the dataset name selects the positive class
        if dataset_name == 'balance.1':
            y = binarize(df[0], pos_class='L')
        elif dataset_name == 'balance.2':
            y = binarize(df[0], pos_class='B')
        elif dataset_name == 'balance.3':
            y = binarize(df[0], pos_class='R')
        X = df.loc[:, 1:].astype(float).values

    elif identifier == 'breast-cancer-wisconsin':
        df = pd.read_csv(data_file, header=None, sep=',')
        # column 0 is skipped; '?' marks a missing value and rows containing one are dropped.
        # replace() returns a new frame, so no value is assigned into a slice of df
        Xy = df.loc[:, 1:10].replace('?', np.nan).dropna(axis=0)
        X = Xy.loc[:, 1:9].astype(float).values
        y = binarize(Xy[10], pos_class=4)

    elif identifier == 'cmc':
        df = pd.read_csv(data_file, header=None, sep=',')
        X = df.loc[:, 0:8].astype(float).values
        y = df[9].astype(int).values
        # column 9 is the class label; the dataset name selects the positive class
        if dataset_name == 'cmc.1':
            y = binarize(y, pos_class=1)
        elif dataset_name == 'cmc.2':
            y = binarize(y, pos_class=2)
        elif dataset_name == 'cmc.3':
            y = binarize(y, pos_class=3)

    data = LabelledCollection(X, y)
    data.stats()
    # work in progress: the collection is built but not yet returned
    raise NotImplementedError()
    #print(df)
    #print(df.loc[:, 0:5].values)
    #print(y)

#    X = __read_csv(f'{data_path}/{identifier}.data', separator='\t')
#    print(X)

    #X, y = from_csv(f'{data_path}/{dataset_name}.data')
    #y, classnames = reindex_labels(y)


#def __read_csv(path, separator=','):
#    x = []
#    for instance in tqdm(open(path, 'rt', encoding='utf-16').readlines(), desc=f'reading {path}'):
#        x.append(instance.strip().split(separator))
#    return x

def df_replace(df, col, repl=None, astype=float):
    """
    Overwrite column `col` of `df` with the values obtained by looking up each of its entries in `repl`,
    cast to `astype`. The DataFrame is modified in place.

    :param df: the pandas DataFrame to modify
    :param col: label of the column to convert
    :param repl: dict mapping every original value to its replacement (a value with no entry raises
    KeyError); leave empty to use {'yes': 1, 'no': 0}
    :param astype: dtype the converted column is cast to
    :return: None
    """
    # the default mapping is created per call instead of being a shared mutable default argument
    if repl is None:
        repl = {'yes': 1, 'no': 0}
    df[col] = df[col].apply(lambda x: repl[x]).astype(astype, copy=False)
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								import os
-												import fixes

											
										
										
											2021-01-15 18:32:32 +01:00
+								import zipfile
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								from os.path import join
-												import fixes

											
										
										
											2021-01-15 18:32:32 +01:00
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
+								import pandas as pd
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
-												import fixes

											
										
										
											2021-01-15 18:32:32 +01:00
+								from data.base import Dataset, LabelledCollection
 								from quapy.data.preprocessing import text2tfidf, reduce_columns
 								from quapy.data.reader import *
 								from quapy.util import download_file_if_not_exists, download_file, get_quapy_home, pickled_resource
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
 								REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb']
-												testing quapy via replicating Tweet Quantification experiments

											
										
										
											2021-01-12 17:39:00 +01:00
+								TWITTER_SENTIMENT_DATASETS_TEST = ['gasp', 'hcr', 'omd', 'sanders',
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
+								                              'semeval13', 'semeval14', 'semeval15', 'semeval16',
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								                              'sst', 'wa', 'wb']
-												testing quapy via replicating Tweet Quantification experiments

											
										
										
											2021-01-12 17:39:00 +01:00
+								TWITTER_SENTIMENT_DATASETS_TRAIN = ['gasp', 'hcr', 'omd', 'sanders',
 								                                 'semeval', 'semeval16',
 								                                 'sst', 'wa', 'wb']
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False):
 								    """
 								    Load a Reviews dataset as a Dataset instance, as used in:
 								    Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification."
 								    Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018.
 								    :param dataset_name: the name of the dataset: valid ones are 'hp', 'kindle', 'imdb'
 								    :param tfidf: set to True to transform the raw documents into tfidf weighted matrices
 								    :param min_df: minimun number of documents that should contain a term in order for the term to be
 								    kept (ignored if tfidf==False)
 								    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
 								    ~/quay_data/ directory)
 								    :param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
 								    faster subsequent invokations
 								    :return: a Dataset instance
 								    """
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								    assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \
 								        f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \
 								        f'Valid ones are {REVIEWS_SENTIMENT_DATASETS}'
 								    if data_home is None:
 								        data_home = get_quapy_home()
 								    URL_TRAIN = f'https://zenodo.org/record/4117827/files/{dataset_name}_train.txt'
 								    URL_TEST = f'https://zenodo.org/record/4117827/files/{dataset_name}_test.txt'
 								    os.makedirs(join(data_home, 'reviews'), exist_ok=True)
 								    train_path = join(data_home, 'reviews', dataset_name, 'train.txt')
 								    test_path = join(data_home, 'reviews', dataset_name, 'test.txt')
 								    download_file_if_not_exists(URL_TRAIN, train_path)
 								    download_file_if_not_exists(URL_TEST, test_path)
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								    pickle_path = None
 								    if pickle:
 								        pickle_path = join(data_home, 'reviews', 'pickle', f'{dataset_name}.pkl')
 								    data = pickled_resource(pickle_path, Dataset.load, train_path, test_path, from_text)
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
 								    if tfidf:
 								        text2tfidf(data, inplace=True)
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								        if min_df is not None:
 								            reduce_columns(data, min_df=min_df, inplace=True)
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
-												adding tweet sent quant experiments

											
										
										
											2021-01-11 18:31:12 +01:00
+								    data.name = dataset_name
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								    return data
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False):
 								    """
 								    Load a Twitter dataset as a Dataset instance, as used in:
 								    Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
 								    Social Network Analysis and Mining6(19), 1–22 (2016)
-												testing quapy via replicating Tweet Quantification experiments

											
										
										
											2021-01-12 17:39:00 +01:00
+								    The datasets 'semeval13', 'semeval14', 'semeval15' share the same training set.
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
 								    :param dataset_name: the name of the dataset: valid ones are 'gasp', 'hcr', 'omd', 'sanders', 'semeval13',
 								    'semeval14', 'semeval15', 'semeval16', 'sst', 'wa', 'wb'
 								    :param for_model_selection: if True, then returns the train split as the training set and the devel split
 								    as the test set; if False, then returns the train+devel split as the training set and the test set as the
 								    test set
 								    :param min_df: minimun number of documents that should contain a term in order for the term to be kept
 								    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
 								    ~/quay_data/ directory)
 								    :param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
 								    faster subsequent invokations
 								    :return: a Dataset instance
 								    """
-												testing quapy via replicating Tweet Quantification experiments

											
										
										
											2021-01-12 17:39:00 +01:00
+								    assert dataset_name in TWITTER_SENTIMENT_DATASETS_TRAIN + TWITTER_SENTIMENT_DATASETS_TEST, \
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								        f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \
-												testing quapy via replicating Tweet Quantification experiments

											
										
										
											2021-01-12 17:39:00 +01:00
+								        f'Valid ones are {TWITTER_SENTIMENT_DATASETS_TRAIN} for model selection and ' \
 								        f'{TWITTER_SENTIMENT_DATASETS_TEST} for test (datasets "semeval14", "semeval15", "semeval16" share ' \
 								        f'a common training set "semeval")'
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								    if data_home is None:
 								        data_home = get_quapy_home()
 								    URL = 'https://zenodo.org/record/4255764/files/tweet_sentiment_quantification_snam.zip'
 								    unzipped_path = join(data_home, 'tweet_sentiment_quantification_snam')
 								    if not os.path.exists(unzipped_path):
 								        downloaded_path = join(data_home, 'tweet_sentiment_quantification_snam.zip')
 								        download_file(URL, downloaded_path)
 								        with zipfile.ZipFile(downloaded_path) as file:
 								            file.extractall(data_home)
 								        os.remove(downloaded_path)
 								    if dataset_name in {'semeval13', 'semeval14', 'semeval15'}:
 								        trainset_name = 'semeval'
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								        testset_name  = 'semeval' if for_model_selection else dataset_name
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								        print(f"the training and development sets for datasets 'semeval13', 'semeval14', 'semeval15' are common "
 								              f"(called 'semeval'); returning trainin-set='{trainset_name}' and test-set={testset_name}")
 								    else:
-												testing quapy via replicating Tweet Quantification experiments

											
										
										
											2021-01-12 17:39:00 +01:00
+								        if dataset_name == 'semeval' and for_model_selection==False:
 								            raise ValueError('dataset "semeval" can only be used for model selection. '
 								                             'Use "semeval13", "semeval14", or "semeval15" for model evaluation.')
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								        trainset_name = testset_name = dataset_name
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								    if for_model_selection:
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								        train = join(unzipped_path, 'train', f'{trainset_name}.train.feature.txt')
 								        test  = join(unzipped_path, 'test', f'{testset_name}.dev.feature.txt')
 								    else:
 								        train = join(unzipped_path, 'train', f'{trainset_name}.train+dev.feature.txt')
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								        if dataset_name == 'semeval16':  # there is a different test name in the case of semeval16 only
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								            test = join(unzipped_path, 'test', f'{testset_name}.dev-test.feature.txt')
 								        else:
 								            test = join(unzipped_path, 'test', f'{testset_name}.test.feature.txt')
-												added model selection for quantification

											
										
										
											2020-12-22 17:43:23 +01:00
+								    pickle_path = None
 								    if pickle:
 								        mode = "train-dev" if for_model_selection else "train+dev-test"
 								        pickle_path = join(unzipped_path, 'pickle', f'{testset_name}.{mode}.pkl')
 								    data = pickled_resource(pickle_path, Dataset.load, train, test, from_sparse)
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
 								    if min_df is not None:
 								        reduce_columns(data, min_df=min_df, inplace=True)
-												adding tweet sent quant experiments

											
										
										
											2021-01-11 18:31:12 +01:00
+								    data.name = dataset_name
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
+								    return data
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
+								UCI_DATASETS = ['acute.a', 'acute.b',
-												cleaning and adding some uci datasets

											
										
										
											2021-01-11 12:55:06 +01:00
+								                'balance.1', 'balance.2', 'balance.3',
 								                'breast-cancer',
 								                'cmc.1', 'cmc.2', 'cmc.3',
 								                'ctg.1', 'ctg.2', 'ctg.3'] # ongoing...
-												dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added

											
										
										
											2020-12-14 18:36:19 +01:00
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
+								def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
 								    assert dataset_name in UCI_DATASETS, \
 								        f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
 								        f'Valid ones are {UCI_DATASETS}'
 								    if data_home is None:
 								        data_home = get_quapy_home()
 								    identifier_map = {
 								        'acute.a': 'acute',
 								        'acute.b': 'acute',
 								        'balance.1': 'balance-scale',
 								        'balance.2': 'balance-scale',
 								        'balance.3': 'balance-scale',
-												cleaning and adding some uci datasets

											
										
										
											2021-01-11 12:55:06 +01:00
+								        'breast-cancer': 'breast-cancer-wisconsin',
 								        'cmc.1': 'cmc',
 								        'cmc.2': 'cmc',
 								        'cmc.3': 'cmc',
 								        'ctg.1': 'ctg',
 								        'ctg.2': 'ctg',
 								        'ctg.3': 'ctg',
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
+								    }
 								    dataset_fullname = {
 								        'acute.a': 'Acute Inflammations (urinary bladder)',
 								        'acute.b': 'Acute Inflammations (renal pelvis)',
 								        'balance.1': 'Balance Scale Weight & Distance Database (left)',
 								        'balance.2': 'Balance Scale Weight & Distance Database (balanced)',
 								        'balance.3': 'Balance Scale Weight & Distance Database (right)',
-												cleaning and adding some uci datasets

											
										
										
											2021-01-11 12:55:06 +01:00
+								        'breast-cancer':  'Breast Cancer Wisconsin (Original)',
 								        'cmc.1': 'Contraceptive Method Choice (no use)',
 								        'cmc.2': 'Contraceptive Method Choice (long term)',
 								        'cmc.3': 'Contraceptive Method Choice (short term)',
 								        'ctg.1': 'Cardiotocography Data Set (normal)',
 								        'ctg.2': 'Cardiotocography Data Set (suspect)',
 								        'ctg.3': 'Cardiotocography Data Set (pathologic)',
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
+								    }
 								    data_folder = {
 								        'acute': 'diagnosis',
 								        'balance-scale': 'balance-scale',
-												cleaning and adding some uci datasets

											
										
										
											2021-01-11 12:55:06 +01:00
+								        'breast-cancer-wisconsin': 'breast-cancer-wisconsin',
 								        'cmc': 'cmc'
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
+								    }
 								    identifier = identifier_map[dataset_name]
 								    URL = f'http://archive.ics.uci.edu/ml/machine-learning-databases/{identifier}'
 								    data_path = join(data_home, 'uci_datasets', identifier)
 								    download_file_if_not_exists(f'{URL}/{data_folder[identifier]}.data', f'{data_path}/{identifier}.data')
 								    download_file_if_not_exists(f'{URL}/{data_folder[identifier]}.names', f'{data_path}/{identifier}.names')
 								    if verbose:
 								        print(open(f'{data_path}/{identifier}.names', 'rt').read())
 								    print(f'Loading {dataset_name} ({dataset_fullname[dataset_name]})')
 								    if identifier == 'acute':
 								        df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, encoding='utf-16', sep='\t')
 								        if dataset_name == 'acute.a':
 								            y = binarize(df[6], pos_class='yes')
 								        elif dataset_name == 'acute.b':
 								            y = binarize(df[7], pos_class='yes')
 								        mintemp, maxtemp = 35, 42
 								        df[0] = df[0].apply(lambda x:(float(x.replace(',','.'))-mintemp)/(maxtemp-mintemp)).astype(float, copy=False)
 								        [df_replace(df, col) for col in range(1, 6)]
 								        X = df.loc[:, 0:5].values
 								    if identifier == 'balance-scale':
 								        df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, sep=',')
 								        if dataset_name == 'balance.1':
 								            y = binarize(df[0], pos_class='L')
 								        elif dataset_name == 'balance.2':
 								            y = binarize(df[0], pos_class='B')
 								        elif dataset_name == 'balance.3':
 								            y = binarize(df[0], pos_class='R')
 								        X = df.loc[:, 1:].astype(float).values
-												cleaning and adding some uci datasets

											
										
										
											2021-01-11 12:55:06 +01:00
+								    if identifier == 'breast-cancer-wisconsin':
 								        df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, sep=',')
 								        Xy = df.loc[:, 1:10]
 								        Xy[Xy=='?']=np.nan
 								        Xy = Xy.dropna(axis=0)
 								        X = Xy.loc[:, 1:9]
 								        X = X.astype(float).values
 								        y = binarize(Xy[10], pos_class=4)
 								    if identifier == 'cmc':
 								        df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, sep=',')
 								        X = df.loc[:, 0:8].astype(float).values
 								        y = df[9].astype(int).values
 								        if dataset_name == 'cmc.1':
 								            y = binarize(y, pos_class=1)
 								        elif dataset_name == 'cmc.2':
 								            y = binarize(y, pos_class=2)
 								        elif dataset_name == 'cmc.3':
 								            y = binarize(y, pos_class=3)
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
+								    data = LabelledCollection(X, y)
 								    data.stats()
-												cleaning and adding some uci datasets

											
										
										
											2021-01-11 12:55:06 +01:00
+								    raise NotImplementedError()
-												added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used)

											
										
										
											2021-01-06 14:58:29 +01:00
+								    #print(df)
 								    #print(df.loc[:, 0:5].values)
 								    #print(y)
 								#    X = __read_csv(f'{data_path}/{identifier}.data', separator='\t')
 								#    print(X)
 								    #X, y = from_csv(f'{data_path}/{dataset_name}.data')
 								    #y, classnames = reindex_labels(y)
 								#def __read_csv(path, separator=','):
 								#    x = []
 								#    for instance in tqdm(open(path, 'rt', encoding='utf-16').readlines(), desc=f'reading {path}'):
 								#        x.append(instance.strip().split(separator))
 								#    return x
 								def df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
 								    df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)