
Added dataset fetch functions for polarity reviews (hp, kindle, imdb) and Twitter sentiment (11 datasets)

Alejandro Moreo Fernandez 2020-12-14 18:36:19 +01:00
parent c8a1a70c8a
commit 649d412389
5 changed files with 122 additions and 14 deletions

quapy/data/__init__.py

@@ -1,5 +1,6 @@
from .base import *
from .reader import *
from . import preprocessing
+from . import datasets

quapy/data/datasets.py (new file, 83 lines)

@@ -0,0 +1,83 @@
import zipfile
from utils.util import download_file_if_not_exists, download_file, get_quapy_home
import os
from os.path import join
from data.base import Dataset, LabelledCollection
from data.reader import from_text, from_sparse
from data.preprocessing import text2tfidf, reduce_columns


REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb']
TWITTER_SENTIMENT_DATASETS = ['gasp', 'hcr', 'omd', 'sanders', 'semeval13', 'semeval14', 'semeval15', 'semeval16',
                              'sst', 'wa', 'wb']

def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None):
    assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \
        f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \
        f'Valid ones are {REVIEWS_SENTIMENT_DATASETS}'
    if data_home is None:
        data_home = get_quapy_home()

    URL_TRAIN = f'https://zenodo.org/record/4117827/files/{dataset_name}_train.txt'
    URL_TEST = f'https://zenodo.org/record/4117827/files/{dataset_name}_test.txt'
    os.makedirs(join(data_home, 'reviews'), exist_ok=True)
    train_path = join(data_home, 'reviews', dataset_name, 'train.txt')
    test_path = join(data_home, 'reviews', dataset_name, 'test.txt')
    download_file_if_not_exists(URL_TRAIN, train_path)
    download_file_if_not_exists(URL_TEST, test_path)

    data = Dataset.load(train_path, test_path, from_text)

    if tfidf:
        text2tfidf(data, inplace=True)

    if min_df is not None:
        reduce_columns(data, min_df=min_df, inplace=True)

    return data

def fetch_twitter(dataset_name, model_selection=False, min_df=None, data_home=None):
    assert dataset_name in TWITTER_SENTIMENT_DATASETS, \
        f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \
        f'Valid ones are {TWITTER_SENTIMENT_DATASETS}'
    if data_home is None:
        data_home = get_quapy_home()

    URL = 'https://zenodo.org/record/4255764/files/tweet_sentiment_quantification_snam.zip'
    unzipped_path = join(data_home, 'tweet_sentiment_quantification_snam')
    if not os.path.exists(unzipped_path):
        downloaded_path = join(data_home, 'tweet_sentiment_quantification_snam.zip')
        download_file(URL, downloaded_path)
        with zipfile.ZipFile(downloaded_path) as file:
            file.extractall(data_home)
        os.remove(downloaded_path)

    if dataset_name in {'semeval13', 'semeval14', 'semeval15'}:
        trainset_name = 'semeval'
        testset_name = 'semeval' if model_selection else dataset_name
print(f"the training and development sets for datasets 'semeval13', 'semeval14', 'semeval15' are common "
f"(called 'semeval'); returning trainin-set='{trainset_name}' and test-set={testset_name}")
    else:
        trainset_name = testset_name = dataset_name

    if model_selection:
        train = join(unzipped_path, 'train', f'{trainset_name}.train.feature.txt')
        test = join(unzipped_path, 'test', f'{testset_name}.dev.feature.txt')
    else:
        train = join(unzipped_path, 'train', f'{trainset_name}.train+dev.feature.txt')
        if dataset_name == 'semeval16':
            test = join(unzipped_path, 'test', f'{testset_name}.dev-test.feature.txt')
        else:
            test = join(unzipped_path, 'test', f'{testset_name}.test.feature.txt')

    data = Dataset.load(train, test, from_sparse)

    if min_df is not None:
        reduce_columns(data, min_df=min_df, inplace=True)

    return data
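Taken together with the one-line change to quapy/data/__init__.py above, these fetchers become reachable as qp.datasets.fetch_reviews and qp.datasets.fetch_twitter. A minimal usage sketch, not part of the commit, mirroring the calls that test.py switches to below (it assumes network access to the Zenodo records and caches everything under ~/quapy_data by default):

import quapy as qp

# binary review datasets ('hp', 'kindle', 'imdb'): raw text, optionally tfidf-weighted and pruned by min_df
reviews = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)

# ternary tweet datasets (11 names): pre-vectorized sparse features; model_selection=False returns train+dev / test
tweets = qp.datasets.fetch_twitter('semeval13', model_selection=False, min_df=10)

print(reviews.training.instances.shape)
print(tweets.training.instances.shape)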

quapy/data/reader.py

@@ -54,3 +54,4 @@ def from_sparse(path):
    X = X.tocsr()
    y = np.asarray(all_labels) + 1
    return X, y

quapy/utils/util.py

@@ -3,6 +3,10 @@ import multiprocessing
from joblib import Parallel, delayed
import contextlib
import numpy as np
+import urllib.request
+import os
+from pathlib import Path
@@ -33,3 +37,27 @@ def temp_seed(seed):
    finally:
        np.random.set_state(state)

+
+def download_file(url, archive_filename):
+    def progress(blocknum, bs, size):
+        total_sz_mb = '%.2f MB' % (size / 1e6)
+        current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
+        print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='')
+    print("Downloading %s" % url)
+    urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress)
+    print("")
+
+
+def download_file_if_not_exists(url, archive_path):
+    if os.path.exists(archive_path):
+        return
+    create_if_not_exist(os.path.dirname(archive_path))
+    download_file(url, archive_path)
+
+
+def create_if_not_exist(path):
+    os.makedirs(path, exist_ok=True)
+
+
+def get_quapy_home():
+    return os.path.join(str(Path.home()), 'quapy_data')
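As a small sketch of how these helpers compose, not part of the commit, the review fetcher above ends up doing roughly the following for the 'hp' training split (import path as used in quapy/data/datasets.py):

import os
from utils.util import download_file_if_not_exists, get_quapy_home

home = get_quapy_home()                                   # resolves to ~/quapy_data
train_path = os.path.join(home, 'reviews', 'hp', 'train.txt')
# creates the parent directory if needed and skips the download when the file is already cached
download_file_if_not_exists('https://zenodo.org/record/4117827/files/hp_train.txt', train_path)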

test.py (23 changed lines)

@@ -2,28 +2,23 @@ from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import quapy as qp
import quapy.functional as F
+import sys
+
+#qp.datasets.fetch_reviews('hp')
+#qp.datasets.fetch_twitter('sst')
+#sys.exit()

SAMPLE_SIZE=500
binary = False
svmperf_home = './svm_perf_quantification'

if binary:
-    # load a textual binary dataset and create a tfidf bag of words
-    train_path = './datasets/reviews/kindle/train.txt'
-    test_path = './datasets/reviews/kindle/test.txt'
-    dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_text)
-    qp.preprocessing.text2tfidf(dataset, inplace=True)
-    qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)
+    dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
else:
-    # load a sparse matrix ternary dataset
-    train_path = './datasets/twitter/train/sst.train+dev.feature.txt'
-    test_path = './datasets/twitter/test/sst.test.feature.txt'
-    dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse)
+    dataset = qp.datasets.fetch_twitter('semeval13', model_selection=False, min_df=10)
    dataset.training = dataset.training.sampling(SAMPLE_SIZE, 0.2, 0.5, 0.3)
-    qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)

+print(dataset.training.instances.shape)
print('dataset loaded')
@@ -63,8 +58,8 @@ print(f'mae={error:.3f}')
max_evaluations = 5000
n_prevpoints = F.get_nprevpoints_approximation(combinations_budget=max_evaluations, n_classes=dataset.n_classes)
n_evaluations = F.num_prevalence_combinations(n_prevpoints, dataset.n_classes)
-print(f'the prevalence interval [0,1] will be split in {n_prevpoints} prevalence points for each class, so that '
-      f'the requested maximum number of sample evaluations ({max_evaluations}) is not exceeded. '
+print(f'the prevalence interval [0,1] will be split in {n_prevpoints} prevalence points for each class, so that\n'
+      f'the requested maximum number of sample evaluations ({max_evaluations}) is not exceeded.\n'
      f'For the {dataset.n_classes} classes this dataset has, this will yield a total of {n_evaluations} evaluations.')
true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE, n_prevpoints)