QuaPy/quapy/data/datasets.py

import zipfile
from util import download_file_if_not_exists, download_file, get_quapy_home
import os
from os.path import join
from data.base import Dataset
from data.reader import from_text, from_sparse
from data.preprocessing import text2tfidf, reduce_columns


# Dataset names accepted by fetch_reviews.
REVIEWS_SENTIMENT_DATASETS = [
    'hp',
    'kindle',
    'imdb',
]
# Dataset names accepted by fetch_twitter.
TWITTER_SENTIMENT_DATASETS = [
    'gasp', 'hcr', 'omd', 'sanders',
    'semeval13', 'semeval14', 'semeval15', 'semeval16',
    'sst', 'wa', 'wb',
]


def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None):
    """Download (if not already present) and load a sentiment-reviews dataset.

    The train and test files are fetched from Zenodo record 4117827 and stored as
    ``<data_home>/reviews/<dataset_name>/train.txt`` and ``.../test.txt``.

    :param dataset_name: one of the names listed in REVIEWS_SENTIMENT_DATASETS
    :param tfidf: if True, ``text2tfidf`` is applied in place to the loaded data
    :param min_df: if not None, it is forwarded to ``reduce_columns``, which is
        applied in place to the loaded data
    :param data_home: root directory for the downloaded files; when None, the
        value returned by ``get_quapy_home()`` is used
    :return: the object returned by ``Dataset.load`` after the optional steps
    :raises AssertionError: if ``dataset_name`` is not a known reviews dataset
    """
    assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \
        f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \
        f'Valid ones are {REVIEWS_SENTIMENT_DATASETS}'
    if data_home is None:
        data_home = get_quapy_home()

    URL_TRAIN = f'https://zenodo.org/record/4117827/files/{dataset_name}_train.txt'
    URL_TEST = f'https://zenodo.org/record/4117827/files/{dataset_name}_test.txt'

    # Create the per-dataset directory itself (not only its 'reviews' parent):
    # the files are written inside it, and this way the function does not depend
    # on the download helper creating missing parent directories.
    dataset_dir = join(data_home, 'reviews', dataset_name)
    os.makedirs(dataset_dir, exist_ok=True)
    train_path = join(dataset_dir, 'train.txt')
    test_path = join(dataset_dir, 'test.txt')
    download_file_if_not_exists(URL_TRAIN, train_path)
    download_file_if_not_exists(URL_TEST, test_path)

    data = Dataset.load(train_path, test_path, from_text)

    if tfidf:
        text2tfidf(data, inplace=True)

    if min_df is not None:
        reduce_columns(data, min_df=min_df, inplace=True)

    return data


def fetch_twitter(dataset_name, model_selection=False, min_df=None, data_home=None):
    """Download (if not already present) and load a twitter sentiment dataset.

    All datasets come from a single zip archive (Zenodo record 4255764), which is
    extracted under ``data_home`` the first time any of them is requested.

    File selection:
      * ``model_selection=True``: train on ``<name>.train.feature.txt`` and test
        on ``<name>.dev.feature.txt``.
      * ``model_selection=False``: train on ``<name>.train+dev.feature.txt`` and
        test on ``<name>.test.feature.txt`` (``.dev-test.`` for 'semeval16').
      * 'semeval13', 'semeval14' and 'semeval15' use the common 'semeval' files
        for training (and for the dev set when ``model_selection`` is True); a
        message is printed to report this.

    :param dataset_name: one of the names listed in TWITTER_SENTIMENT_DATASETS
    :param model_selection: whether to return the train/dev split instead of the
        train+dev/test split
    :param min_df: if not None, it is forwarded to ``reduce_columns``, which is
        applied in place to the loaded data
    :param data_home: root directory for the downloaded files; when None, the
        value returned by ``get_quapy_home()`` is used
    :return: the object returned by ``Dataset.load`` after the optional step
    :raises AssertionError: if ``dataset_name`` is not a known twitter dataset
    """
    assert dataset_name in TWITTER_SENTIMENT_DATASETS, \
        f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \
        f'Valid ones are {TWITTER_SENTIMENT_DATASETS}'
    if data_home is None:
        data_home = get_quapy_home()

    URL = 'https://zenodo.org/record/4255764/files/tweet_sentiment_quantification_snam.zip'
    unzipped_path = join(data_home, 'tweet_sentiment_quantification_snam')
    if not os.path.exists(unzipped_path):
        # A caller-supplied data_home may not exist yet.
        os.makedirs(data_home, exist_ok=True)
        downloaded_path = join(data_home, 'tweet_sentiment_quantification_snam.zip')
        download_file(URL, downloaded_path)
        try:
            with zipfile.ZipFile(downloaded_path) as archive:
                archive.extractall(data_home)
        finally:
            # Do not leave the archive behind, even if extraction failed.
            os.remove(downloaded_path)

    if dataset_name in {'semeval13', 'semeval14', 'semeval15'}:
        trainset_name = 'semeval'
        testset_name = 'semeval' if model_selection else dataset_name
        print(f"the training and development sets for datasets 'semeval13', 'semeval14', 'semeval15' are common "
              f"(called 'semeval'); returning training-set='{trainset_name}' and test-set={testset_name}")
    else:
        trainset_name = testset_name = dataset_name

    if model_selection:
        train = join(unzipped_path, 'train', f'{trainset_name}.train.feature.txt')
        test = join(unzipped_path, 'test', f'{testset_name}.dev.feature.txt')
    else:
        train = join(unzipped_path, 'train', f'{trainset_name}.train+dev.feature.txt')
        # The held-out file of 'semeval16' carries a 'dev-test' infix instead of 'test'.
        if dataset_name == 'semeval16':
            test = join(unzipped_path, 'test', f'{testset_name}.dev-test.feature.txt')
        else:
            test = join(unzipped_path, 'test', f'{testset_name}.test.feature.txt')

    data = Dataset.load(train, test, from_sparse)

    if min_df is not None:
        reduce_columns(data, min_df=min_df, inplace=True)

    return data
dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added 2020-12-14 18:36:19 +01:00			`import zipfile`
uniform sampling added if *prevs is empty 2020-12-17 18:17:17 +01:00			`from util import download_file_if_not_exists, download_file, get_quapy_home`
dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added 2020-12-14 18:36:19 +01:00			`import os`
			`from os.path import join`
uniform sampling added if *prevs is empty 2020-12-17 18:17:17 +01:00			`from data.base import Dataset`
dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added 2020-12-14 18:36:19 +01:00			`from data.reader import from_text, from_sparse`
			`from data.preprocessing import text2tfidf, reduce_columns`


			`REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb']`
			`TWITTER_SENTIMENT_DATASETS = ['gasp', 'hcr', 'omd', 'sanders', 'semeval13', 'semeval14', 'semeval15', 'semeval16',`
			`'sst', 'wa', 'wb']`


			`def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None):`
			`assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \`
			`f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \`
			`f'Valid ones are {REVIEWS_SENTIMENT_DATASETS}'`
			`if data_home is None:`
			`data_home = get_quapy_home()`

			`URL_TRAIN = f'https://zenodo.org/record/4117827/files/{dataset_name}_train.txt'`
			`URL_TEST = f'https://zenodo.org/record/4117827/files/{dataset_name}_test.txt'`
			`os.makedirs(join(data_home, 'reviews'), exist_ok=True)`
			`train_path = join(data_home, 'reviews', dataset_name, 'train.txt')`
			`test_path = join(data_home, 'reviews', dataset_name, 'test.txt')`
			`download_file_if_not_exists(URL_TRAIN, train_path)`
			`download_file_if_not_exists(URL_TEST, test_path)`

			`data = Dataset.load(train_path, test_path, from_text)`

			`if tfidf:`
			`text2tfidf(data, inplace=True)`

			`if min_df is not None:`
			`reduce_columns(data, min_df=min_df, inplace=True)`

			`return data`


			`def fetch_twitter(dataset_name, model_selection=False, min_df=None, data_home=None):`
			`assert dataset_name in TWITTER_SENTIMENT_DATASETS, \`
			`f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \`
			`f'Valid ones are {TWITTER_SENTIMENT_DATASETS}'`
			`if data_home is None:`
			`data_home = get_quapy_home()`

			`URL = 'https://zenodo.org/record/4255764/files/tweet_sentiment_quantification_snam.zip'`
			`unzipped_path = join(data_home, 'tweet_sentiment_quantification_snam')`
			`if not os.path.exists(unzipped_path):`
			`downloaded_path = join(data_home, 'tweet_sentiment_quantification_snam.zip')`
			`download_file(URL, downloaded_path)`
			`with zipfile.ZipFile(downloaded_path) as file:`
			`file.extractall(data_home)`
			`os.remove(downloaded_path)`

			`if dataset_name in {'semeval13', 'semeval14', 'semeval15'}:`
			`trainset_name = 'semeval'`
			`testset_name = 'semeval' if model_selection else dataset_name`
			`print(f"the training and development sets for datasets 'semeval13', 'semeval14', 'semeval15' are common "`
			`f"(called 'semeval'); returning trainin-set='{trainset_name}' and test-set={testset_name}")`
			`else:`
			`trainset_name = testset_name = dataset_name`

			`if model_selection:`
			`train = join(unzipped_path, 'train', f'{trainset_name}.train.feature.txt')`
			`test = join(unzipped_path, 'test', f'{testset_name}.dev.feature.txt')`
			`else:`
			`train = join(unzipped_path, 'train', f'{trainset_name}.train+dev.feature.txt')`
			`if dataset_name == 'semeval16':`
			`test = join(unzipped_path, 'test', f'{testset_name}.dev-test.feature.txt')`
			`else:`
			`test = join(unzipped_path, 'test', f'{testset_name}.test.feature.txt')`

			`data = Dataset.load(train, test, from_sparse)`

			`if min_df is not None:`
			`reduce_columns(data, min_df=min_df, inplace=True)`

			`return data`