From 649d412389531f614e6bb8c770d8803b9a90e96a Mon Sep 17 00:00:00 2001
From: Alex Moreo
Date: Mon, 14 Dec 2020 18:36:19 +0100
Subject: [PATCH] dataset fetch for polarity reviews (hp, kindle, imdb) and
 twitter sentiment (11 datasets) added

---
 quapy/data/__init__.py |  1 +
 quapy/data/datasets.py | 83 ++++++++++++++++++++++++++++++++++++++++++
 quapy/data/reader.py   |  1 +
 quapy/utils/util.py    | 28 ++++++++++++++
 test.py                | 23 +++++-------
 5 files changed, 122 insertions(+), 14 deletions(-)
 create mode 100644 quapy/data/datasets.py

diff --git a/quapy/data/__init__.py b/quapy/data/__init__.py
index e44efa4..9c119ab 100644
--- a/quapy/data/__init__.py
+++ b/quapy/data/__init__.py
@@ -1,5 +1,6 @@
 from .base import *
 from .reader import *
 from . import preprocessing
+from . import datasets
 
diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py
new file mode 100644
index 0000000..2c25de9
--- /dev/null
+++ b/quapy/data/datasets.py
@@ -0,0 +1,83 @@
+import zipfile
+from utils.util import download_file_if_not_exists, download_file, get_quapy_home
+import os
+from os.path import join
+from data.base import Dataset, LabelledCollection
+from data.reader import from_text, from_sparse
+from data.preprocessing import text2tfidf, reduce_columns
+
+
+REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb']
+TWITTER_SENTIMENT_DATASETS = ['gasp', 'hcr', 'omd', 'sanders', 'semeval13', 'semeval14', 'semeval15', 'semeval16',
+                              'sst', 'wa', 'wb']
+
+
+def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None):
+    assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \
+        f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \
+        f'Valid ones are {REVIEWS_SENTIMENT_DATASETS}'
+    if data_home is None:
+        data_home = get_quapy_home()
+
+    URL_TRAIN = f'https://zenodo.org/record/4117827/files/{dataset_name}_train.txt'
+    URL_TEST = f'https://zenodo.org/record/4117827/files/{dataset_name}_test.txt'
+    os.makedirs(join(data_home, 'reviews'), exist_ok=True)
+    train_path = join(data_home, 'reviews', dataset_name, 'train.txt')
+    test_path = join(data_home, 'reviews', dataset_name, 'test.txt')
+    download_file_if_not_exists(URL_TRAIN, train_path)
+    download_file_if_not_exists(URL_TEST, test_path)
+
+    data = Dataset.load(train_path, test_path, from_text)
+
+    if tfidf:
+        text2tfidf(data, inplace=True)
+
+    if min_df is not None:
+        reduce_columns(data, min_df=min_df, inplace=True)
+
+    return data
+
+
+def fetch_twitter(dataset_name, model_selection=False, min_df=None, data_home=None):
+    assert dataset_name in TWITTER_SENTIMENT_DATASETS, \
+        f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \
+        f'Valid ones are {TWITTER_SENTIMENT_DATASETS}'
+    if data_home is None:
+        data_home = get_quapy_home()
+
+    URL = 'https://zenodo.org/record/4255764/files/tweet_sentiment_quantification_snam.zip'
+    unzipped_path = join(data_home, 'tweet_sentiment_quantification_snam')
+    if not os.path.exists(unzipped_path):
+        downloaded_path = join(data_home, 'tweet_sentiment_quantification_snam.zip')
+        download_file(URL, downloaded_path)
+        with zipfile.ZipFile(downloaded_path) as file:
+            file.extractall(data_home)
+        os.remove(downloaded_path)
+
+    if dataset_name in {'semeval13', 'semeval14', 'semeval15'}:
+        trainset_name = 'semeval'
+        testset_name = 'semeval' if model_selection else dataset_name
+        print(f"the training and development sets for datasets 'semeval13', 'semeval14', 'semeval15' are common "
+              f"(called 'semeval'); returning training-set='{trainset_name}' and test-set='{testset_name}'")
+    else:
+        trainset_name = testset_name = dataset_name
+
+    if model_selection:
+        train = join(unzipped_path, 'train', f'{trainset_name}.train.feature.txt')
+        test = join(unzipped_path, 'test', f'{testset_name}.dev.feature.txt')
+    else:
+        train = join(unzipped_path, 'train', f'{trainset_name}.train+dev.feature.txt')
+        if dataset_name == 'semeval16':
+            test = join(unzipped_path, 'test', f'{testset_name}.dev-test.feature.txt')
+        else:
+            test = join(unzipped_path, 'test', f'{testset_name}.test.feature.txt')
+
+    data = Dataset.load(train, test, from_sparse)
+
+    if min_df is not None:
+        reduce_columns(data, min_df=min_df, inplace=True)
+
+    return data
+
+
+
diff --git a/quapy/data/reader.py b/quapy/data/reader.py
index e160d15..84550c6 100644
--- a/quapy/data/reader.py
+++ b/quapy/data/reader.py
@@ -54,3 +54,4 @@ def from_sparse(path):
     X = X.tocsr()
     y = np.asarray(all_labels) + 1
     return X, y
+
diff --git a/quapy/utils/util.py b/quapy/utils/util.py
index 583cb1a..921ab1b 100644
--- a/quapy/utils/util.py
+++ b/quapy/utils/util.py
@@ -3,6 +3,10 @@ import multiprocessing
 from joblib import Parallel, delayed
 import contextlib
 import numpy as np
+import urllib.request
+import os
+from pathlib import Path
+
 
@@ -33,3 +37,27 @@ def temp_seed(seed):
     finally:
         np.random.set_state(state)
 
+
+def download_file(url, archive_filename):
+    def progress(blocknum, bs, size):
+        total_sz_mb = '%.2f MB' % (size / 1e6)
+        current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
+        print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='')
+    print("Downloading %s" % url)
+    urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress)
+    print("")
+
+
+def download_file_if_not_exists(url, archive_path):
+    if os.path.exists(archive_path):
+        return
+    create_if_not_exist(os.path.dirname(archive_path))
+    download_file(url, archive_path)
+
+
+def create_if_not_exist(path):
+    os.makedirs(path, exist_ok=True)
+
+
+def get_quapy_home():
+    return os.path.join(str(Path.home()), 'quapy_data')
\ No newline at end of file
diff --git a/test.py b/test.py
index 24fef54..85d8bb6 100644
--- a/test.py
+++ b/test.py
@@ -2,28 +2,23 @@ from sklearn.linear_model import LogisticRegression
 from sklearn.svm import LinearSVC
 import quapy as qp
 import quapy.functional as F
+import sys
 
+#qp.datasets.fetch_reviews('hp')
+#qp.datasets.fetch_twitter('sst')
+
+#sys.exit()
 
 SAMPLE_SIZE=500
 binary = False
 svmperf_home = './svm_perf_quantification'
 
 if binary:
-    # load a textual binary dataset and create a tfidf bag of words
-    train_path = './datasets/reviews/kindle/train.txt'
-    test_path = './datasets/reviews/kindle/test.txt'
-    dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_text)
-    qp.preprocessing.text2tfidf(dataset, inplace=True)
-    qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)
+    dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
 
 else:
-    # load a sparse matrix ternary dataset
-    train_path = './datasets/twitter/train/sst.train+dev.feature.txt'
-    test_path = './datasets/twitter/test/sst.test.feature.txt'
-    dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse)
+    dataset = qp.datasets.fetch_twitter('semeval13', model_selection=False, min_df=10)
     dataset.training = dataset.training.sampling(SAMPLE_SIZE, 0.2, 0.5, 0.3)
-    qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)
-
 print(dataset.training.instances.shape)
 print('dataset loaded')
 
@@ -63,8 +58,8 @@ print(f'mae={error:.3f}')
 max_evaluations = 5000
 n_prevpoints = F.get_nprevpoints_approximation(combinations_budget=max_evaluations, n_classes=dataset.n_classes)
 n_evaluations = F.num_prevalence_combinations(n_prevpoints, dataset.n_classes)
-print(f'the prevalence interval [0,1] will be split in {n_prevpoints} prevalence points for each class, so that '
-      f'the requested maximum number of sample evaluations ({max_evaluations}) is not exceeded. '
+print(f'the prevalence interval [0,1] will be split into {n_prevpoints} prevalence points for each class, so that\n'
+      f'the requested maximum number of sample evaluations ({max_evaluations}) is not exceeded.\n'
       f'For the {dataset.n_classes} classes this dataset has, this will yield a total of {n_evaluations} evaluations.')
 true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE, n_prevpoints)
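
Usage note (not part of the diff itself): a minimal sketch of how the two fetchers introduced by this patch can be called, assuming the patched quapy package is importable and exposes the data module as qp.datasets, as in the modified test.py above. Dataset names, keyword arguments, and the ~/quapy_data cache location come from quapy/data/datasets.py and quapy/utils/util.py; the variable names and print statements are illustrative only.

    import quapy as qp

    # binary review datasets ('hp', 'kindle', 'imdb'), downloaded from Zenodo
    # into ~/quapy_data on first use; tfidf/min_df trigger the preprocessing steps
    reviews = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)

    # ternary twitter datasets; model_selection=True returns the train/dev split
    # (for tuning), model_selection=False returns the train+dev/test split
    tweets = qp.datasets.fetch_twitter('semeval13', model_selection=True, min_df=10)

    print(reviews.training.instances.shape, reviews.n_classes)
    print(tweets.training.instances.shape, tweets.n_classes)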