# Forked from moreo/QuaPy (84 lines, 3.4 KiB, Python).
import zipfile
|
|
from util import download_file_if_not_exists, download_file, get_quapy_home
|
|
import os
|
|
from os.path import join
|
|
from data.base import Dataset
|
|
from data.reader import from_text, from_sparse
|
|
from data.preprocessing import text2tfidf, reduce_columns
|
|
|
|
|
|
# Dataset names accepted by fetch_reviews.
REVIEWS_SENTIMENT_DATASETS = [
    'hp',
    'kindle',
    'imdb',
]

# Dataset names accepted by fetch_twitter.
TWITTER_SENTIMENT_DATASETS = [
    'gasp',
    'hcr',
    'omd',
    'sanders',
    'semeval13',
    'semeval14',
    'semeval15',
    'semeval16',
    'sst',
    'wa',
    'wb',
]
|
|
|
|
|
|
def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None):
    """Download (when missing) and load one of the sentiment-review datasets.

    The train and test files are fetched from Zenodo and stored under
    ``<data_home>/reviews/<dataset_name>/`` as ``train.txt`` and ``test.txt``.

    :param dataset_name: name of the dataset; must be one of
        ``REVIEWS_SENTIMENT_DATASETS``
    :param tfidf: if True, ``text2tfidf`` is applied in place to the loaded data
    :param min_df: if not None, ``reduce_columns`` is applied in place with this
        ``min_df`` value (after the optional tfidf step)
    :param data_home: root directory for the downloaded files; defaults to the
        value returned by ``get_quapy_home()``
    :return: the object returned by ``Dataset.load`` for the train/test files
    :raises AssertionError: if ``dataset_name`` is not a known reviews dataset
    """
    assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \
        f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \
        f'Valid ones are {REVIEWS_SENTIMENT_DATASETS}'
    if data_home is None:
        data_home = get_quapy_home()

    url_train = f'https://zenodo.org/record/4117827/files/{dataset_name}_train.txt'
    url_test = f'https://zenodo.org/record/4117827/files/{dataset_name}_test.txt'

    # Create the directory the files are actually written to (not just its
    # 'reviews' parent), so the download does not depend on the helper
    # creating missing parent directories.
    dataset_dir = join(data_home, 'reviews', dataset_name)
    os.makedirs(dataset_dir, exist_ok=True)
    train_path = join(dataset_dir, 'train.txt')
    test_path = join(dataset_dir, 'test.txt')
    download_file_if_not_exists(url_train, train_path)
    download_file_if_not_exists(url_test, test_path)

    data = Dataset.load(train_path, test_path, from_text)

    if tfidf:
        text2tfidf(data, inplace=True)

    if min_df is not None:
        reduce_columns(data, min_df=min_df, inplace=True)

    return data
|
|
|
|
|
|
def fetch_twitter(dataset_name, model_selection=False, min_df=None, data_home=None):
    """Download (when missing) and load one of the Twitter sentiment datasets.

    All datasets come from a single Zenodo archive that is downloaded and
    extracted into ``data_home`` the first time it is needed.

    For 'semeval13', 'semeval14' and 'semeval15' the training file is the
    shared 'semeval' one; with ``model_selection=True`` the shared 'semeval'
    development file is used as the test set as well.

    :param dataset_name: name of the dataset; must be one of
        ``TWITTER_SENTIMENT_DATASETS``
    :param model_selection: if True, load the ``train`` file for training and
        the ``dev`` file for testing; otherwise load ``train+dev`` for training
        and the test file (``dev-test`` for 'semeval16', ``test`` for the rest)
    :param min_df: if not None, ``reduce_columns`` is applied in place with this
        ``min_df`` value
    :param data_home: root directory for the downloaded files; defaults to the
        value returned by ``get_quapy_home()``
    :return: the object returned by ``Dataset.load`` for the selected files
    :raises AssertionError: if ``dataset_name`` is not a known twitter dataset
    """
    assert dataset_name in TWITTER_SENTIMENT_DATASETS, \
        f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \
        f'Valid ones are {TWITTER_SENTIMENT_DATASETS}'
    if data_home is None:
        data_home = get_quapy_home()

    URL = 'https://zenodo.org/record/4255764/files/tweet_sentiment_quantification_snam.zip'
    unzipped_path = join(data_home, 'tweet_sentiment_quantification_snam')
    if not os.path.exists(unzipped_path):
        # A caller-supplied data_home may not exist yet.
        os.makedirs(data_home, exist_ok=True)
        downloaded_path = join(data_home, 'tweet_sentiment_quantification_snam.zip')
        download_file(URL, downloaded_path)
        try:
            with zipfile.ZipFile(downloaded_path) as file:
                file.extractall(data_home)
        finally:
            # Do not leave the archive behind, even when extraction fails.
            if os.path.exists(downloaded_path):
                os.remove(downloaded_path)

    if dataset_name in {'semeval13', 'semeval14', 'semeval15'}:
        trainset_name = 'semeval'
        testset_name = 'semeval' if model_selection else dataset_name
        print(f"the training and development sets for datasets 'semeval13', 'semeval14', 'semeval15' are common "
              f"(called 'semeval'); returning trainin-set='{trainset_name}' and test-set={testset_name}")
    else:
        trainset_name = testset_name = dataset_name

    if model_selection:
        train = join(unzipped_path, 'train', f'{trainset_name}.train.feature.txt')
        test = join(unzipped_path, 'test', f'{testset_name}.dev.feature.txt')
    else:
        train = join(unzipped_path, 'train', f'{trainset_name}.train+dev.feature.txt')
        if dataset_name == 'semeval16':
            # semeval16 is the only dataset whose test file is named 'dev-test'
            test = join(unzipped_path, 'test', f'{testset_name}.dev-test.feature.txt')
        else:
            test = join(unzipped_path, 'test', f'{testset_name}.test.feature.txt')

    data = Dataset.load(train, test, from_sparse)

    if min_df is not None:
        reduce_columns(data, min_df=min_df, inplace=True)

    return data
|
|
|
|
|
|
|