forked from moreo/QuaPy
Added dataset fetch functions for polarity reviews (hp, kindle, imdb) and Twitter sentiment (11 datasets)
parent c8a1a70c8a
commit 649d412389
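The two fetchers introduced by this commit mirror the calls exercised in test.py below; a minimal usage sketch (illustrative, not part of the diff; it assumes QuaPy is importable as quapy and exposes the new module as quapy.datasets, as test.py does):

import quapy as qp

# binary polarity reviews: raw text turned into a tfidf bag-of-words, rare terms dropped
reviews = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)

# ternary twitter sentiment: pre-vectorized sparse features, downloaded and cached on first use
tweets = qp.datasets.fetch_twitter('semeval13', model_selection=False, min_df=10)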
@@ -1,5 +1,6 @@
 from .base import *
 from .reader import *
 from . import preprocessing
+from . import datasets
@@ -0,0 +1,83 @@
+import zipfile
+from utils.util import download_file_if_not_exists, download_file, get_quapy_home
+import os
+from os.path import join
+from data.base import Dataset, LabelledCollection
+from data.reader import from_text, from_sparse
+from data.preprocessing import text2tfidf, reduce_columns
+
+
+REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb']
+TWITTER_SENTIMENT_DATASETS = ['gasp', 'hcr', 'omd', 'sanders', 'semeval13', 'semeval14', 'semeval15', 'semeval16',
+                              'sst', 'wa', 'wb']
+
+
+def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None):
+    assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \
+        f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \
+        f'Valid ones are {REVIEWS_SENTIMENT_DATASETS}'
+    if data_home is None:
+        data_home = get_quapy_home()
+
+    URL_TRAIN = f'https://zenodo.org/record/4117827/files/{dataset_name}_train.txt'
+    URL_TEST = f'https://zenodo.org/record/4117827/files/{dataset_name}_test.txt'
+    os.makedirs(join(data_home, 'reviews'), exist_ok=True)
+    train_path = join(data_home, 'reviews', dataset_name, 'train.txt')
+    test_path = join(data_home, 'reviews', dataset_name, 'test.txt')
+    download_file_if_not_exists(URL_TRAIN, train_path)
+    download_file_if_not_exists(URL_TEST, test_path)
+
+    data = Dataset.load(train_path, test_path, from_text)
+
+    if tfidf:
+        text2tfidf(data, inplace=True)
+
+    if min_df is not None:
+        reduce_columns(data, min_df=min_df, inplace=True)
+
+    return data
+
+
+def fetch_twitter(dataset_name, model_selection=False, min_df=None, data_home=None):
+    assert dataset_name in TWITTER_SENTIMENT_DATASETS, \
+        f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \
+        f'Valid ones are {TWITTER_SENTIMENT_DATASETS}'
+    if data_home is None:
+        data_home = get_quapy_home()
+
+    URL = 'https://zenodo.org/record/4255764/files/tweet_sentiment_quantification_snam.zip'
+    unzipped_path = join(data_home, 'tweet_sentiment_quantification_snam')
+    if not os.path.exists(unzipped_path):
+        downloaded_path = join(data_home, 'tweet_sentiment_quantification_snam.zip')
+        download_file(URL, downloaded_path)
+        with zipfile.ZipFile(downloaded_path) as file:
+            file.extractall(data_home)
+        os.remove(downloaded_path)
+
+    if dataset_name in {'semeval13', 'semeval14', 'semeval15'}:
+        trainset_name = 'semeval'
+        testset_name = 'semeval' if model_selection else dataset_name
+        print(f"the training and development sets for datasets 'semeval13', 'semeval14', 'semeval15' are common "
+              f"(called 'semeval'); returning training-set='{trainset_name}' and test-set='{testset_name}'")
+    else:
+        trainset_name = testset_name = dataset_name
+
+    if model_selection:
+        train = join(unzipped_path, 'train', f'{trainset_name}.train.feature.txt')
+        test = join(unzipped_path, 'test', f'{testset_name}.dev.feature.txt')
+    else:
+        train = join(unzipped_path, 'train', f'{trainset_name}.train+dev.feature.txt')
+        if dataset_name == 'semeval16':
+            test = join(unzipped_path, 'test', f'{testset_name}.dev-test.feature.txt')
+        else:
+            test = join(unzipped_path, 'test', f'{testset_name}.test.feature.txt')
+
+    data = Dataset.load(train, test, from_sparse)
+
+    if min_df is not None:
+        reduce_columns(data, min_df=min_df, inplace=True)
+
+    return data
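How model_selection maps onto the files loaded by fetch_twitter above, sketched with the 'hcr' dataset (illustrative only; the train/dev/test split follows the file names the function builds):

import quapy as qp

# model-selection round: train on the 'train' part, evaluate on the held-out 'dev' part
devel = qp.datasets.fetch_twitter('hcr', model_selection=True, min_df=5)

# final evaluation: retrain on 'train+dev' and evaluate on the 'test' part
# (for 'semeval16' the test file is named '.dev-test' instead of '.test')
final = qp.datasets.fetch_twitter('hcr', model_selection=False, min_df=5)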
@@ -54,3 +54,4 @@ def from_sparse(path):
     X = X.tocsr()
     y = np.asarray(all_labels) + 1
     return X, y
+
@@ -3,6 +3,10 @@ import multiprocessing
 from joblib import Parallel, delayed
 import contextlib
 import numpy as np
+import urllib
+import os
+from pathlib import Path
@@ -33,3 +37,27 @@ def temp_seed(seed):
     finally:
         np.random.set_state(state)
+
+
+def download_file(url, archive_filename):
+    def progress(blocknum, bs, size):
+        total_sz_mb = '%.2f MB' % (size / 1e6)
+        current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
+        print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='')
+    print("Downloading %s" % url)
+    urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress)
+    print("")
+
+
+def download_file_if_not_exists(url, archive_path):
+    if os.path.exists(archive_path):
+        return
+    create_if_not_exist(os.path.dirname(archive_path))
+    download_file(url, archive_path)
+
+
+def create_if_not_exist(path):
+    os.makedirs(path, exist_ok=True)
+
+
+def get_quapy_home():
+    return os.path.join(str(Path.home()), 'quapy_data')
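A small sketch of how the new utility helpers compose (the URL follows the pattern used by fetch_reviews above and the import path matches the one in the new datasets module; illustrative only):

from os.path import join
from utils.util import download_file_if_not_exists, get_quapy_home

# cache the imdb training split under ~/quapy_data/reviews/imdb, downloading it only once;
# download_file_if_not_exists creates the parent directory and reports download progress
target = join(get_quapy_home(), 'reviews', 'imdb', 'train.txt')
download_file_if_not_exists('https://zenodo.org/record/4117827/files/imdb_train.txt', target)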
test.py
@@ -2,28 +2,23 @@ from sklearn.linear_model import LogisticRegression
 from sklearn.svm import LinearSVC
 import quapy as qp
 import quapy.functional as F
+import sys
+
+#qp.datasets.fetch_reviews('hp')
+#qp.datasets.fetch_twitter('sst')
+
+#sys.exit()
 
 SAMPLE_SIZE=500
 binary = False
 svmperf_home = './svm_perf_quantification'
 
 if binary:
-    # load a textual binary dataset and create a tfidf bag of words
-    train_path = './datasets/reviews/kindle/train.txt'
-    test_path = './datasets/reviews/kindle/test.txt'
-    dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_text)
-    qp.preprocessing.text2tfidf(dataset, inplace=True)
-    qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)
+    dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
 
 else:
-    # load a sparse matrix ternary dataset
-    train_path = './datasets/twitter/train/sst.train+dev.feature.txt'
-    test_path = './datasets/twitter/test/sst.test.feature.txt'
-    dataset = qp.Dataset.load(train_path, test_path, qp.reader.from_sparse)
+    dataset = qp.datasets.fetch_twitter('semeval13', model_selection=False, min_df=10)
     dataset.training = dataset.training.sampling(SAMPLE_SIZE, 0.2, 0.5, 0.3)
-    qp.preprocessing.reduce_columns(dataset, min_df=10, inplace=True)
-    print(dataset.training.instances.shape)
 
 print('dataset loaded')
@@ -63,8 +58,8 @@ print(f'mae={error:.3f}')
 max_evaluations = 5000
 n_prevpoints = F.get_nprevpoints_approximation(combinations_budget=max_evaluations, n_classes=dataset.n_classes)
 n_evaluations = F.num_prevalence_combinations(n_prevpoints, dataset.n_classes)
-print(f'the prevalence interval [0,1] will be split in {n_prevpoints} prevalence points for each class, so that '
-      f'the requested maximum number of sample evaluations ({max_evaluations}) is not exceeded. '
+print(f'the prevalence interval [0,1] will be split in {n_prevpoints} prevalence points for each class, so that\n'
+      f'the requested maximum number of sample evaluations ({max_evaluations}) is not exceeded.\n'
       f'For the {dataset.n_classes} classes this dataset has, this will yield a total of {n_evaluations} evaluations.')
 
 true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE, n_prevpoints)
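For intuition on the evaluation budget handled in the last hunk: assuming the grid enumerates every prevalence vector whose components are multiples of 1/(n_prevpoints-1) and sum to 1, the count follows a stars-and-bars formula; the stand-alone check below reproduces that reasoning (an assumption about what get_nprevpoints_approximation and num_prevalence_combinations compute, not QuaPy's own code):

from math import comb

def n_combinations(n_prevpoints, n_classes):
    # prevalence vectors on the grid that sum to 1 (stars and bars), under the assumption above
    return comb(n_prevpoints + n_classes - 2, n_classes - 1)

budget, n_classes = 5000, 3
# largest grid resolution whose total number of combinations stays within the budget
n_prevpoints = max(p for p in range(1, budget + 1) if n_combinations(p, n_classes) <= budget)
print(n_prevpoints, n_combinations(n_prevpoints, n_classes))  # prints 99 4950 for this budget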