2021-04-29 16:07:39 +02:00
|
|
|
import pytest
|
|
|
|
|
|
|
|
from quapy.data.datasets import REVIEWS_SENTIMENT_DATASETS, TWITTER_SENTIMENT_DATASETS_TEST, \
|
2024-02-07 18:31:34 +01:00
|
|
|
TWITTER_SENTIMENT_DATASETS_TRAIN, UCI_BINARY_DATASETS, LEQUA2022_TASKS, UCI_MULTICLASS_DATASETS,\
|
|
|
|
fetch_reviews, fetch_twitter, fetch_UCIBinaryDataset, fetch_lequa2022, fetch_UCIMulticlassLabelledCollection
|
2021-04-29 16:07:39 +02:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize('dataset_name', REVIEWS_SENTIMENT_DATASETS)
def test_fetch_reviews(dataset_name):
    """Fetch one reviews dataset and print the stats of both of its splits.

    Parametrized over every name in ``REVIEWS_SENTIMENT_DATASETS``. The test
    contains no assertions: it passes when fetching the dataset and calling
    ``stats()`` on the training and test splits do not raise.
    """
    reviews = fetch_reviews(dataset_name)
    print(f'Dataset {dataset_name}')

    # Print each header right before its split's stats() call, training first.
    for header, split_name in (('Training set stats', 'training'),
                               ('Test set stats', 'test')):
        print(header)
        getattr(reviews, split_name).stats()
|
2021-04-29 16:07:39 +02:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize('dataset_name', TWITTER_SENTIMENT_DATASETS_TEST + TWITTER_SENTIMENT_DATASETS_TRAIN)
def test_fetch_twitter(dataset_name):
    """Fetch one Twitter sentiment dataset and print the stats of its splits.

    Parametrized over the concatenation of ``TWITTER_SENTIMENT_DATASETS_TEST``
    and ``TWITTER_SENTIMENT_DATASETS_TRAIN``. When fetching ``'semeval'`` is
    rejected with the "can only be used for model selection" ``ValueError``,
    the fetch is retried with ``for_model_selection=True``.

    Raises:
        ValueError: any ``ValueError`` other than the expected ``'semeval'``
            one is re-raised, so the real failure is reported instead of an
            unrelated unbound-variable error further down.
    """
    try:
        dataset = fetch_twitter(dataset_name)
    except ValueError as ve:
        # str(ve) rather than ve.args[0]: also safe when args is empty or the
        # first argument is not a string.
        model_selection_only = dataset_name == 'semeval' and str(ve).startswith(
            'dataset "semeval" can only be used for model selection.')
        if not model_selection_only:
            raise
        dataset = fetch_twitter(dataset_name, for_model_selection=True)

    print(f'Dataset {dataset_name}')
    print('Training set stats')
    dataset.training.stats()
    print('Test set stats')
    # Mirrors test_fetch_reviews: the header is followed by the test split's stats.
    dataset.test.stats()
|
2021-04-29 16:07:39 +02:00
|
|
|
|
2021-04-30 17:22:58 +02:00
|
|
|
|
2024-02-07 18:31:34 +01:00
|
|
|
@pytest.mark.parametrize('dataset_name', UCI_BINARY_DATASETS)
def test_fetch_UCIDataset(dataset_name):
    """Fetch one UCI binary dataset and print the stats of its splits.

    Parametrized over every name in ``UCI_BINARY_DATASETS``. The
    ``'pageblocks.5'`` dataset is skipped when the fetcher reports, through a
    ``FileNotFoundError``, that the data has to be prepared by hand first.

    Raises:
        FileNotFoundError: any ``FileNotFoundError`` other than the expected
            ``'pageblocks.5'`` one is re-raised rather than silently swallowed.
    """
    try:
        dataset = fetch_UCIBinaryDataset(dataset_name)
    except FileNotFoundError as fnfe:
        # Substring test instead of `.find(...) > 0`, which would miss a
        # message that *starts* with the marker text (index 0).
        needs_manual_setup = dataset_name == 'pageblocks.5' and (
            'If this is the first time you attempt to load this dataset' in str(fnfe))
        if not needs_manual_setup:
            raise
        # Report the case as skipped, not as a pass.
        pytest.skip('The pageblocks.5 dataset requires some hand processing to be usable, skipping this test.')

    print(f'Dataset {dataset_name}')
    print('Training set stats')
    dataset.training.stats()
    print('Test set stats')
    # Mirrors test_fetch_reviews: the header is followed by the test split's stats.
    dataset.test.stats()
|
2022-06-01 18:28:59 +02:00
|
|
|
|
|
|
|
|
2024-01-29 09:43:29 +01:00
|
|
|
@pytest.mark.parametrize('dataset_name', UCI_MULTICLASS_DATASETS)
def test_fetch_UCIMultiDataset(dataset_name):
    """Fetch one UCI multiclass collection and print its stats.

    Parametrized over every name in ``UCI_MULTICLASS_DATASETS``. The test has
    no assertions: it passes when the fetch and the ``stats()`` call succeed.
    """
    collection = fetch_UCIMulticlassLabelledCollection(dataset_name)

    # Two headers in one call; the output is the same two lines as before.
    print(f'Dataset {dataset_name}', 'Training set stats', sep='\n')
    collection.stats()

    # NOTE(review): nothing is printed after this header. The fetched object
    # is used as a single collection here, so there may be no test split to
    # report. Confirm whether this header should stay.
    print('Test set stats')
|
|
|
|
|
|
|
|
|
2022-06-01 18:28:59 +02:00
|
|
|
@pytest.mark.parametrize('dataset_name', LEQUA2022_TASKS)
def test_fetch_lequa2022(dataset_name):
    """Fetch one LeQua 2022 task and print a summary of its three parts.

    Parametrized over every name in ``LEQUA2022_TASKS``. ``fetch_lequa2022``
    is unpacked into a training part and two generators (validation, test).
    The test prints what ``stats()`` returns for the training part and what
    ``total()`` returns for each generator.
    """
    training, val_gen, test_gen = fetch_lequa2022(dataset_name)

    training_summary = training.stats()
    print(training_summary)

    # Validation first, then test; total() runs just before each line is printed.
    for label, generator in (('Val:', val_gen), ('Test:', test_gen)):
        print(label, generator.total())
|