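"""Smoke tests for the dataset fetchers in quapy.data.datasets.

Each test loads a dataset (or a sample generator), prints its stats, and
checks that a simple quantifier can be fit on it and returns a valid
prevalence vector. Tests over large datasets are skipped when the
environment variable QUAPY_TESTS_OMIT_LARGE_DATASETS is set.

Run directly (python test_datasets.py) or via unittest discovery
(python -m unittest).
"""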
import os
import unittest

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

import quapy.functional as F
from quapy.method.aggregative import PCC
from quapy.data.datasets import *


class TestDatasets(unittest.TestCase):

    def new_quantifier(self):
        # a deliberately cheap quantifier (PCC over a weakly-regularized,
        # iteration-capped logistic regression): these tests exercise the
        # dataset loaders, not the learning algorithm
        return PCC(LogisticRegression(C=0.001, max_iter=100))

    def _check_dataset(self, dataset):
        # fit on the training split and check that the prevalence estimated
        # on the test split is a valid probability distribution
        q = self.new_quantifier()
        print(f'testing method {q} in {dataset.name}...', end='')
        q.fit(dataset.training)
        estim_prevalences = q.quantify(dataset.test.instances)
        self.assertTrue(F.check_prevalence_vector(estim_prevalences))
        print('[done]')

    def _check_samples(self, gen, q, max_samples_test=5, vectorizer=None):
        # quantify up to max_samples_test samples drawn from the generator,
        # vectorizing raw text instances first when a vectorizer is given
        for X, p in gen():  # p is the true prevalence (unused here)
            if vectorizer is not None:
                X = vectorizer.transform(X)
            estim_prevalences = q.quantify(X)
            self.assertTrue(F.check_prevalence_vector(estim_prevalences))
            max_samples_test -= 1
            if max_samples_test == 0:
                break

    def test_reviews(self):
        for dataset_name in REVIEWS_SENTIMENT_DATASETS:
            print(f'loading dataset {dataset_name}...', end='')
            dataset = fetch_reviews(dataset_name, tfidf=True, min_df=10)
            dataset.stats()
            dataset.reduce()
            print('[done]')
            self._check_dataset(dataset)

    def test_twitter(self):
        for dataset_name in TWITTER_SENTIMENT_DATASETS_TEST:
            print(f'loading dataset {dataset_name}...', end='')
            dataset = fetch_twitter(dataset_name, min_df=10)
            dataset.stats()
            dataset.reduce()
            print('[done]')
            self._check_dataset(dataset)

    def test_UCIBinaryDataset(self):
        for dataset_name in UCI_BINARY_DATASETS:
            try:
                print(f'loading dataset {dataset_name}...', end='')
                dataset = fetch_UCIBinaryDataset(dataset_name)
                dataset.stats()
                dataset.reduce()
                print('[done]')
                self._check_dataset(dataset)
            except FileNotFoundError as fnfe:
                # pageblocks.5 requires manual preprocessing before it can be
                # loaded; skip it on a fresh checkout
                first_time_msg = 'If this is the first time you attempt to load this dataset'
                if dataset_name == 'pageblocks.5' and first_time_msg in fnfe.args[0]:
                    print('The pageblocks.5 dataset requires some hand processing to be usable; skipping this test.')
                    continue
                # any other missing file is a genuine failure
                raise

    def test_UCIMultiDataset(self):
        for dataset_name in UCI_MULTICLASS_DATASETS:
            print(f'loading dataset {dataset_name}...', end='')
            dataset = fetch_UCIMulticlassDataset(dataset_name)
            dataset.stats()
            # subsample both splits to 100 instances at uniform prevalence
            # to keep the test fast
            n_classes = dataset.n_classes
            uniform_prev = F.uniform_prevalence(n_classes)
            dataset.training = dataset.training.sampling(100, *uniform_prev)
            dataset.test = dataset.test.sampling(100, *uniform_prev)
            print('[done]')
            self._check_dataset(dataset)

    def test_lequa2022(self):
        if os.environ.get('QUAPY_TESTS_OMIT_LARGE_DATASETS'):
            print("omitting test_lequa2022 because QUAPY_TESTS_OMIT_LARGE_DATASETS is set")
            return

        # vector tasks: instances come as numeric vectors, ready to use
        for dataset_name in LEQUA2022_VECTOR_TASKS:
            print(f'loading dataset {dataset_name}...', end='')
            train, gen_val, gen_test = fetch_lequa2022(dataset_name)
            train.stats()
            n_classes = train.n_classes
            train = train.sampling(100, *F.uniform_prevalence(n_classes))
            q = self.new_quantifier()
            q.fit(train)
            self._check_samples(gen_val, q, max_samples_test=5)
            self._check_samples(gen_test, q, max_samples_test=5)

        # text tasks: instances are raw documents, so fit a tfidf vectorizer
        # on the training sample and reuse it for the generated samples
        for dataset_name in LEQUA2022_TEXT_TASKS:
            print(f'loading dataset {dataset_name}...', end='')
            train, gen_val, gen_test = fetch_lequa2022(dataset_name)
            train.stats()
            n_classes = train.n_classes
            train = train.sampling(100, *F.uniform_prevalence(n_classes))
            tfidf = TfidfVectorizer()
            train.instances = tfidf.fit_transform(train.instances)
            q = self.new_quantifier()
            q.fit(train)
            self._check_samples(gen_val, q, max_samples_test=5, vectorizer=tfidf)
            self._check_samples(gen_test, q, max_samples_test=5, vectorizer=tfidf)

    def test_IFCB(self):
        if os.environ.get('QUAPY_TESTS_OMIT_LARGE_DATASETS'):
            print("omitting test_IFCB because QUAPY_TESTS_OMIT_LARGE_DATASETS is set")
            return

        print('loading dataset IFCB.')
        for mod_sel in [False, True]:
            # check both the model-selection and the evaluation protocols
            train, gen = fetch_IFCB(single_sample_train=True, for_model_selection=mod_sel)
            train.stats()
            n_classes = train.n_classes
            train = train.sampling(100, *F.uniform_prevalence(n_classes))
            q = self.new_quantifier()
            q.fit(train)
            self._check_samples(gen, q, max_samples_test=5)


if __name__ == '__main__':
    unittest.main()
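
# NOTE: the fetchers above download and cache their datasets on first use
# (see the pageblocks.5 handling in test_UCIBinaryDataset), so a first run
# needs network access and may take a while.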