From 649d412389531f614e6bb8c770d8803b9a90e96a Mon Sep 17 00:00:00 2001
From: Alex Moreo
Date: Mon, 14 Dec 2020 18:36:19 +0100
Subject: [PATCH] dataset fetch for polarity reviews (hp, kindle, imdb) and
 twitter sentiment (11 datasets) added

---
 quapy/data/__init__.py |  1 +
 quapy/data/datasets.py | 83 ++++++++++++++++++++++++++++++++++++++++++
 quapy/data/reader.py   |  1 +
 quapy/utils/util.py    | 28 ++++++++++++++
 test.py                | 23 +++++-------
 5 files changed, 122 insertions(+), 14 deletions(-)
 create mode 100644 quapy/data/datasets.py

diff --git a/quapy/data/__init__.py b/quapy/data/__init__.py
index e44efa4..9c119ab 100644
--- a/quapy/data/__init__.py
+++ b/quapy/data/__init__.py
@@ -1,5 +1,6 @@
 from .base import *
 from .reader import *
 from . import preprocessing
+from . import datasets
 
diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py
new file mode 100644
index 0000000..2c25de9
--- /dev/null
+++ b/quapy/data/datasets.py
@@ -0,0 +1,83 @@
+import zipfile
+from utils.util import download_file_if_not_exists, download_file, get_quapy_home
+import os
+from os.path import join
+from data.base import Dataset, LabelledCollection
+from data.reader import from_text, from_sparse
+from data.preprocessing import text2tfidf, reduce_columns
+
+
+REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb']
+TWITTER_SENTIMENT_DATASETS = ['gasp', 'hcr', 'omd', 'sanders', 'semeval13', 'semeval14', 'semeval15', 'semeval16',
+                              'sst', 'wa', 'wb']
+
+
+def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None):
+    assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \
+        f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \
+        f'Valid ones are {REVIEWS_SENTIMENT_DATASETS}'
+    if data_home is None:
+        data_home = get_quapy_home()
+
+    URL_TRAIN = f'https://zenodo.org/record/4117827/files/{dataset_name}_train.txt'
+    URL_TEST = f'https://zenodo.org/record/4117827/files/{dataset_name}_test.txt'
+    os.makedirs(join(data_home, 'reviews'), exist_ok=True)
+    train_path = join(data_home, 'reviews', dataset_name, 'train.txt')
+    test_path = join(data_home, 'reviews', dataset_name, 'test.txt')
+    download_file_if_not_exists(URL_TRAIN, train_path)
+    download_file_if_not_exists(URL_TEST, test_path)
+
+    data = Dataset.load(train_path, test_path, from_text)
+
+    if tfidf:
+        text2tfidf(data, inplace=True)
+
+    if min_df is not None:
+        reduce_columns(data, min_df=min_df, inplace=True)
+
+    return data
+
+
+def fetch_twitter(dataset_name, model_selection=False, min_df=None, data_home=None):
+    assert dataset_name in TWITTER_SENTIMENT_DATASETS, \
+        f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \
+        f'Valid ones are {TWITTER_SENTIMENT_DATASETS}'
+    if data_home is None:
+        data_home = get_quapy_home()
+
+    URL = 'https://zenodo.org/record/4255764/files/tweet_sentiment_quantification_snam.zip'
+    unzipped_path = join(data_home, 'tweet_sentiment_quantification_snam')
+    if not os.path.exists(unzipped_path):
+        downloaded_path = join(data_home, 'tweet_sentiment_quantification_snam.zip')
+        download_file(URL, downloaded_path)
+        with zipfile.ZipFile(downloaded_path) as file:
+            file.extractall(data_home)
+        os.remove(downloaded_path)
+
+    if dataset_name in {'semeval13', 'semeval14', 'semeval15'}:
+        trainset_name = 'semeval'
+        testset_name = 'semeval' if model_selection else dataset_name
+        print(f"the training and development sets for datasets 'semeval13', 'semeval14', 'semeval15' are common "
+              f"(called 'semeval'); returning training-set='{trainset_name}' and test-set='{testset_name}'")
+    else:
+        trainset_name = testset_name = dataset_name
+
+    if model_selection:
+        train = join(unzipped_path, 'train', f'{trainset_name}.train.feature.txt')
+        test = join(unzipped_path, 'test', f'{testset_name}.dev.feature.txt')
+    else:
+        train = join(unzipped_path, 'train', f'{trainset_name}.train+dev.feature.txt')
+        if dataset_name == 'semeval16':
+            test = join(unzipped_path, 'test', f'{testset_name}.dev-test.feature.txt')
+        else:
+            test = join(unzipped_path, 'test', f'{testset_name}.test.feature.txt')
+
+    data = Dataset.load(train, test, from_sparse)
+
+    if min_df is not None:
+        reduce_columns(data, min_df=min_df, inplace=True)
+
+    return data
+
+
+
diff --git a/quapy/data/reader.py b/quapy/data/reader.py
index e160d15..84550c6 100644
--- a/quapy/data/reader.py
+++ b/quapy/data/reader.py
@@ -54,3 +54,4 @@ def from_sparse(path):
     X = X.tocsr()
     y = np.asarray(all_labels) + 1
     return X, y
+
diff --git a/quapy/utils/util.py b/quapy/utils/util.py
index 583cb1a..921ab1b 100644
--- a/quapy/utils/util.py
+++ b/quapy/utils/util.py
@@ -3,6 +3,10 @@ import multiprocessing
 from joblib import Parallel, delayed
 import contextlib
 import numpy as np
+import urllib.request
+import os
+from pathlib import Path
+
 
@@ -33,3 +37,27 @@ def temp_seed(seed):
     finally:
         np.random.set_state(state)
 
+
+def download_file(url, archive_filename):
+    def progress(blocknum, bs, size):
+        total_sz_mb = '%.2f MB' % (size / 1e6)
+        current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
+        print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='')
+    print("Downloading %s" % url)
+    urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress)
+    print("")
+
+
+def download_file_if_not_exists(url, archive_path):
+    if os.path.exists(archive_path):
+        return
+    create_if_not_exist(os.path.dirname(archive_path))
+    download_file(url, archive_path)
+
+
+def create_if_not_exist(path):
+    os.makedirs(path, exist_ok=True)
+
+
+def get_quapy_home():
+    return os.path.join(str(Path.home()), 'quapy_data')
\ No newline at end of file
diff --git a/test.py b/test.py
index 24fef54..85d8bb6 100644
--- a/test.py
+++ b/test.py
@@ -2,28 +2,23 @@ from sklearn.linear_model import LogisticRegression
 from sklearn.svm import LinearSVC
 import quapy as qp
 import quapy.functional as F
+import sys
 
+#qp.datasets.fetch_reviews('hp')
+#qp.datasets.fetch_twitter('sst')
+
+#sys.exit()
 
 SAMPLE_SIZE=500
 binary = False
 svmperf_home = './svm_perf_quantification'
 
 if binary:
-    # load a textual binary dataset and create a tfidf bag of words
-    train_path = './datasets/reviews/kindle/train.txt'
-    test_path = './datasets/reviews/kindle/test.txt'
-    dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_text)
-    qp.preprocessing.text2tfidf(dataset, inplace=True)
-    qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)
+    dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
 
 else:
-    # load a sparse matrix ternary dataset
-    train_path = './datasets/twitter/train/sst.train+dev.feature.txt'
-    test_path = './datasets/twitter/test/sst.test.feature.txt'
-    dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse)
+    dataset = qp.datasets.fetch_twitter('semeval13', model_selection=False, min_df=10)
     dataset.training = dataset.training.sampling(SAMPLE_SIZE, 0.2, 0.5, 0.3)
-    qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)
-
 print(dataset.training.instances.shape)
 print('dataset loaded')
 
@@ -63,8 +58,8 @@ print(f'mae={error:.3f}')
 max_evaluations = 5000
 n_prevpoints = F.get_nprevpoints_approximation(combinations_budget=max_evaluations, n_classes=dataset.n_classes)
 n_evaluations = F.num_prevalence_combinations(n_prevpoints, dataset.n_classes)
-print(f'the prevalence interval [0,1] will be split in {n_prevpoints} prevalence points for each class, so that '
-      f'the requested maximum number of sample evaluations ({max_evaluations}) is not exceeded. '
+print(f'the prevalence interval [0,1] will be split into {n_prevpoints} prevalence points for each class, so that\n'
+      f'the requested maximum number of sample evaluations ({max_evaluations}) is not exceeded.\n'
       f'For the {dataset.n_classes} classes this dataset has, this will yield a total of {n_evaluations} evaluations.')
 true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE, n_prevpoints)
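
Usage note (not part of the diff itself): a minimal sketch of how the two fetchers introduced by this patch can be called, assuming the patched quapy package is importable and exposes the data module as qp.datasets, as in the modified test.py above. Dataset names, keyword arguments, and the ~/quapy_data cache location come from quapy/data/datasets.py and quapy/utils/util.py; the variable names and print statements are illustrative only.

    import quapy as qp

    # binary review datasets ('hp', 'kindle', 'imdb'), downloaded from Zenodo
    # into ~/quapy_data on first use; tfidf/min_df trigger the preprocessing steps
    reviews = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)

    # ternary twitter datasets; model_selection=True returns the train/dev split
    # (for tuning), model_selection=False returns the train+dev/test split
    tweets = qp.datasets.fetch_twitter('semeval13', model_selection=True, min_df=10)

    print(reviews.training.instances.shape, reviews.n_classes)
    print(tweets.training.instances.shape, tweets.n_classes)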