1
0
Fork 0

lequa as dataset

This commit is contained in:
Alejandro Moreo Fernandez 2022-06-01 18:28:59 +02:00
parent eba6fd8123
commit 45642ad778
11 changed files with 163 additions and 51 deletions

View File

@ -9,9 +9,19 @@
- ACC, PACC, Forman's threshold variants have been parallelized. - ACC, PACC, Forman's threshold variants have been parallelized.
- Exploration of hyperparameters in Model selection can now be run in parallel (it was a n_jobs argument in
QuaPy 0.1.6 but only the evaluation part for one specific hyperparameter was run in parallel).
- The prediction function has been refactored, so it applies the optimization for aggregative quantifiers (that
consists in pre-classifying all instances, and then only invoking aggregate on the samples) only in cases in
which the total number of classifications would be smaller than the number of classifications with the standard
procedure. The user can now specify "force", "auto", True or False, in order to actively decide for applying it
or not.
Things to fix: Things to fix:
- clean functions like binary, aggregative, probabilistic, etc; those should be resolved via isinstance() - clean functions like binary, aggregative, probabilistic, etc; those should be resolved via isinstance():
this is not working; I don't know how to make the isinstance work. Looks like there is some problem with the
path of the imported class wrt the path of the class that arrives from another module...
- clean classes_ and n_classes from methods (maybe not from aggregative ones, but those have to be used only - clean classes_ and n_classes from methods (maybe not from aggregative ones, but those have to be used only
internally and not imposed in any abstract class) internally and not imposed in any abstract class)
- optimize "qp.evaluation.prediction" for aggregative methods (pre-classification) - optimize "qp.evaluation.prediction" for aggregative methods (pre-classification)
@ -33,6 +43,10 @@ Things to fix:
stuff). stuff).
- Check method def __parallel(self, func, *args, **kwargs) in aggregative.OneVsAll - Check method def __parallel(self, func, *args, **kwargs) in aggregative.OneVsAll
New features:
- Add LeQua2022 to datasets (everything automatic, and with proper protocols "gen")
- Add an "experimental room", with scripts to quickly test new ideas and see results.
# 0.1.7 # 0.1.7
# change the LabelledCollection API (removing protocol-related samplings) # change the LabelledCollection API (removing protocol-related samplings)
# need to change the two references to the above in the wiki / doc, and code examples... # need to change the two references to the above in the wiki / doc, and code examples...

View File

@ -43,6 +43,8 @@ UCI_DATASETS = ['acute.a', 'acute.b',
'wine-q-red', 'wine-q-white', 'wine-q-red', 'wine-q-white',
'yeast'] 'yeast']
LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B']
def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset: def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset:
""" """
@ -532,4 +534,53 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float): def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False) df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)
def fetch_lequa2022(task, data_home=None):
    """
    Downloads (if not already present) and loads the LeQua 2022 data of one task.

    The three archives of the task (train/dev, test, and test prevalences) are fetched from Zenodo and
    unzipped under `<data_home>/lequa2022`. The download is skipped when the folder
    `<data_home>/lequa2022/<task>` already exists.

    :param task: name of the task; must be one of `LEQUA2022_TASKS`
    :param data_home: root folder for the data; if None, the value returned by `get_quapy_home()` is used
    :return: a tuple `(train, val_gen, test_gen)`, where `train` is the collection loaded from the training
        file, and `val_gen` and `test_gen` are `SamplesFromDir` instances built on the development samples
        and on the test samples, respectively, each paired with its file of true prevalences
    """
    from quapy.data._lequa2022 import load_raw_documents, load_vector_documents, SamplesFromDir

    assert task in LEQUA2022_TASKS, \
        f'Unknown task {task}. Valid ones are {LEQUA2022_TASKS}'
    if data_home is None:
        data_home = get_quapy_home()

    URL_TRAINDEV=f'https://zenodo.org/record/6546188/files/{task}.train_dev.zip'
    URL_TEST=f'https://zenodo.org/record/6546188/files/{task}.test.zip'
    URL_TEST_PREV=f'https://zenodo.org/record/6546188/files/{task}.test_prevalences.zip'

    lequa_dir = join(data_home, 'lequa2022')
    os.makedirs(lequa_dir, exist_ok=True)

    def download_unzip_and_remove(unzipped_path, url):
        tmp_path = join(lequa_dir, task + '_tmp.zip')
        download_file_if_not_exists(url, tmp_path)
        try:
            with zipfile.ZipFile(tmp_path) as file:
                file.extractall(unzipped_path)
        finally:
            # always drop the temporary archive: a leftover file would be reused (instead of downloaded
            # again) by download_file_if_not_exists in the next call
            os.remove(tmp_path)

    if not os.path.exists(join(lequa_dir, task)):
        download_unzip_and_remove(lequa_dir, URL_TRAINDEV)
        download_unzip_and_remove(lequa_dir, URL_TEST)
        download_unzip_and_remove(lequa_dir, URL_TEST_PREV)

    # T1* tasks are loaded with the vector-documents loader, T2* tasks with the raw-documents loader
    if task in ['T1A', 'T1B']:
        load_fn = load_vector_documents
    elif task in ['T2A', 'T2B']:
        load_fn = load_raw_documents

    public_dir = join(lequa_dir, task, 'public')

    tr_path = join(public_dir, 'training_data.txt')
    train = LabelledCollection.load(tr_path, loader_func=load_fn)

    val_samples_path = join(public_dir, 'dev_samples')
    val_true_prev_path = join(public_dir, 'dev_prevalences.txt')
    val_gen = SamplesFromDir(val_samples_path, val_true_prev_path, load_fn=load_fn)

    # the test generator must read the test samples and the test prevalences (not the dev ones)
    test_samples_path = join(public_dir, 'test_samples')
    test_true_prev_path = join(public_dir, 'test_prevalences.txt')
    test_gen = SamplesFromDir(test_samples_path, test_true_prev_path, load_fn=load_fn)

    return train, val_gen, test_gen

View File

@ -1,13 +1,9 @@
from typing import Union, Callable, Iterable from typing import Union, Callable, Iterable
import numpy as np import numpy as np
from tqdm import tqdm from tqdm import tqdm
import inspect
import quapy as qp import quapy as qp
from quapy.protocol import AbstractProtocol, OnLabelledCollectionProtocol from quapy.protocol import AbstractProtocol, OnLabelledCollectionProtocol
from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier from quapy.method.base import BaseQuantifier
from quapy.util import temp_seed
import quapy.functional as F
import pandas as pd import pandas as pd
@ -22,7 +18,7 @@ def prediction(model: BaseQuantifier, protocol: AbstractProtocol, aggr_speedup='
# checks whether the prediction can be made more efficiently; this check consists in verifying if the model is # checks whether the prediction can be made more efficiently; this check consists in verifying if the model is
# of type aggregative, if the protocol is based on LabelledCollection, and if the total number of documents to # of type aggregative, if the protocol is based on LabelledCollection, and if the total number of documents to
# classify using the protocol would exceed the number of test documents in the original collection # classify using the protocol would exceed the number of test documents in the original collection
from method.aggregative import AggregativeQuantifier from quapy.method.aggregative import AggregativeQuantifier
if isinstance(model, AggregativeQuantifier) and isinstance(protocol, OnLabelledCollectionProtocol): if isinstance(model, AggregativeQuantifier) and isinstance(protocol, OnLabelledCollectionProtocol):
if aggr_speedup == 'force': if aggr_speedup == 'force':
apply_optimization = True apply_optimization = True
@ -45,9 +41,9 @@ def prediction(model: BaseQuantifier, protocol: AbstractProtocol, aggr_speedup='
def __prediction_helper(quantification_fn, protocol: AbstractProtocol, verbose=False): def __prediction_helper(quantification_fn, protocol: AbstractProtocol, verbose=False):
true_prevs, estim_prevs = [], [] true_prevs, estim_prevs = [], []
for sample in tqdm(protocol(), total=protocol.total()) if verbose else protocol(): for sample_instances, sample_prev in tqdm(protocol(), total=protocol.total()) if verbose else protocol():
estim_prevs.append(quantification_fn(sample.instances)) estim_prevs.append(quantification_fn(sample_instances))
true_prevs.append(sample.prevalence()) true_prevs.append(sample_prev)
true_prevs = np.asarray(true_prevs) true_prevs = np.asarray(true_prevs)
estim_prevs = np.asarray(estim_prevs) estim_prevs = np.asarray(estim_prevs)

View File

@ -9,7 +9,6 @@ from tqdm import tqdm
import quapy as qp import quapy as qp
from quapy import functional as F from quapy import functional as F
from quapy.data import LabelledCollection from quapy.data import LabelledCollection
from quapy.evaluation import evaluate
from quapy.model_selection import GridSearchQ from quapy.model_selection import GridSearchQ
try: try:
@ -176,6 +175,7 @@ class Ensemble(BaseQuantifier):
For each model in the ensemble, the performance is measured in terms of _error_name_ on the quantification of For each model in the ensemble, the performance is measured in terms of _error_name_ on the quantification of
the samples used for training the rest of the models in the ensemble. the samples used for training the rest of the models in the ensemble.
""" """
from quapy.evaluation import evaluate
error = qp.error.from_name(error_name) error = qp.error.from_name(error_name)
tests = [m[3] for m in self.ensemble] tests = [m[3] for m in self.ensemble]
scores = [] scores = []

View File

@ -81,6 +81,8 @@ class GridSearchQ(BaseQuantifier):
self.param_scores_ = {} self.param_scores_ = {}
self.best_score_ = None self.best_score_ = None
tinit = time()
hyper = [dict({k: values[i] for i, k in enumerate(params_keys)}) for values in itertools.product(*params_values)] hyper = [dict({k: values[i] for i, k in enumerate(params_keys)}) for values in itertools.product(*params_values)]
scores = qp.util.parallel(self._delayed_eval, ((params, training) for params in hyper), n_jobs=n_jobs) scores = qp.util.parallel(self._delayed_eval, ((params, training) for params in hyper), n_jobs=n_jobs)
@ -94,10 +96,13 @@ class GridSearchQ(BaseQuantifier):
else: else:
self.param_scores_[str(params)] = 'timeout' self.param_scores_[str(params)] = 'timeout'
tend = time()-tinit
if self.best_score_ is None: if self.best_score_ is None:
raise TimeoutError('all jobs took more than the timeout time to end') raise TimeoutError('all jobs took more than the timeout time to end')
self._sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f})') self._sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f}) '
f'[took {tend:.4f}s]')
if self.refit: if self.refit:
if isinstance(protocol, OnLabelledCollectionProtocol): if isinstance(protocol, OnLabelledCollectionProtocol):

View File

@ -1,14 +1,11 @@
from copy import deepcopy from copy import deepcopy
import quapy as qp import quapy as qp
import numpy as np import numpy as np
import itertools import itertools
from collections.abc import Generator
from contextlib import ExitStack from contextlib import ExitStack
from abc import ABCMeta, abstractmethod from abc import ABCMeta, abstractmethod
from quapy.data import LabelledCollection from quapy.data import LabelledCollection
import quapy.functional as F import quapy.functional as F
from tqdm import tqdm
from os.path import exists from os.path import exists
from glob import glob from glob import glob
@ -87,10 +84,14 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
if self.random_seed is not None: if self.random_seed is not None:
stack.enter_context(qp.util.temp_seed(self.random_seed)) stack.enter_context(qp.util.temp_seed(self.random_seed))
for params in self.samples_parameters(): for params in self.samples_parameters():
yield self.sample(params) yield self.collator_fn(self.sample(params))
def set_collator(self, collator_fn):
    """
    Registers the callable that is applied to every generated sample before it is yielded.

    :param collator_fn: a callable taking one sample as its only argument
    """
    self.collator_fn = collator_fn
class OnLabelledCollectionProtocol: class OnLabelledCollectionProtocol:
def get_labelled_collection(self): def get_labelled_collection(self):
return self.data return self.data
@ -106,31 +107,6 @@ class OnLabelledCollectionProtocol:
return new.on_preclassified_instances(pre_classifications, in_place=True) return new.on_preclassified_instances(pre_classifications, in_place=True)
class LoadSamplesFromDirectory(AbstractProtocol):
def __init__(self, folder_path, loader_fn, classes=None, **loader_kwargs):
assert exists(folder_path), f'folder {folder_path} does not exist'
assert callable(loader_fn), f'the passed load_fn does not seem to be callable'
self.folder_path = folder_path
self.loader_fn = loader_fn
self.classes = classes
self.loader_kwargs = loader_kwargs
self._list_files = None
def __call__(self):
for file in self.list_files:
yield LabelledCollection.load(file, loader_func=self.loader_fn, classes=self.classes, **self.loader_kwargs)
@property
def list_files(self):
if self._list_files is None:
self._list_files = sorted(glob(self.folder_path, '*'))
return self._list_files
def total(self):
return len(self.list_files)
class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol): class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
""" """
Implementation of the artificial prevalence protocol (APP). Implementation of the artificial prevalence protocol (APP).
@ -154,6 +130,7 @@ class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
self.sample_size = sample_size self.sample_size = sample_size
self.n_prevalences = n_prevalences self.n_prevalences = n_prevalences
self.repeats = repeats self.repeats = repeats
self.set_collator(collator_fn=lambda x: (x.instances, x.prevalence()))
def prevalence_grid(self): def prevalence_grid(self):
""" """
@ -210,6 +187,7 @@ class NPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
self.sample_size = sample_size self.sample_size = sample_size
self.repeats = repeats self.repeats = repeats
self.random_seed = random_seed self.random_seed = random_seed
self.set_collator(collator_fn=lambda x: (x.instances, x.prevalence()))
def samples_parameters(self): def samples_parameters(self):
indexes = [] indexes = []
@ -246,6 +224,7 @@ class USimplexPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol)
self.sample_size = sample_size self.sample_size = sample_size
self.repeats = repeats self.repeats = repeats
self.random_seed = random_seed self.random_seed = random_seed
self.set_collator(collator_fn=lambda x: (x.instances, x.prevalence()))
def samples_parameters(self): def samples_parameters(self):
indexes = [] indexes = []
@ -261,6 +240,31 @@ class USimplexPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol)
return self.repeats return self.repeats
# class LoadSamplesFromDirectory(AbstractProtocol):
#
# def __init__(self, folder_path, loader_fn, classes=None, **loader_kwargs):
# assert exists(folder_path), f'folder {folder_path} does not exist'
# assert callable(loader_fn), f'the passed load_fn does not seem to be callable'
# self.folder_path = folder_path
# self.loader_fn = loader_fn
# self.classes = classes
# self.loader_kwargs = loader_kwargs
# self._list_files = None
#
# def __call__(self):
# for file in self.list_files:
# yield LabelledCollection.load(file, loader_func=self.loader_fn, classes=self.classes, **self.loader_kwargs)
#
# @property
# def list_files(self):
# if self._list_files is None:
# self._list_files = sorted(glob(self.folder_path, '*'))
# return self._list_files
#
# def total(self):
# return len(self.list_files)
class CovariateShiftPP(AbstractStochasticSeededProtocol): class CovariateShiftPP(AbstractStochasticSeededProtocol):
""" """
Generates mixtures of two domains (A and B) at controlled rates, but preserving the original class prevalence. Generates mixtures of two domains (A and B) at controlled rates, but preserving the original class prevalence.

View File

@ -1,7 +1,8 @@
import pytest import pytest
from quapy.data.datasets import REVIEWS_SENTIMENT_DATASETS, TWITTER_SENTIMENT_DATASETS_TEST, \ from quapy.data.datasets import REVIEWS_SENTIMENT_DATASETS, TWITTER_SENTIMENT_DATASETS_TEST, \
TWITTER_SENTIMENT_DATASETS_TRAIN, UCI_DATASETS, fetch_reviews, fetch_twitter, fetch_UCIDataset TWITTER_SENTIMENT_DATASETS_TRAIN, UCI_DATASETS, LEQUA2022_TASKS, \
fetch_reviews, fetch_twitter, fetch_UCIDataset, fetch_lequa2022
@pytest.mark.parametrize('dataset_name', REVIEWS_SENTIMENT_DATASETS) @pytest.mark.parametrize('dataset_name', REVIEWS_SENTIMENT_DATASETS)
@ -41,3 +42,13 @@ def test_fetch_UCIDataset(dataset_name):
print('Training set stats') print('Training set stats')
dataset.training.stats() dataset.training.stats()
print('Test set stats') print('Test set stats')
@pytest.mark.parametrize('dataset_name', LEQUA2022_TASKS)
def test_fetch_lequa2022(dataset_name):
    """Checks that every LeQua 2022 task can be fetched and yields the expected three components."""
    # fetch_lequa2022 returns a (training collection, validation generator, test generator) tuple,
    # not a Dataset; unpacking fails if the shape of the returned value ever changes
    train, val_gen, test_gen = fetch_lequa2022(dataset_name)
    print(f'Dataset {dataset_name}')

View File

@ -2,8 +2,8 @@ import unittest
import quapy as qp import quapy as qp
from sklearn.linear_model import LogisticRegression from sklearn.linear_model import LogisticRegression
from time import time from time import time
from method.aggregative import EMQ from quapy.method.aggregative import EMQ
from method.base import BaseQuantifier from quapy.method.base import BaseQuantifier
class EvalTestCase(unittest.TestCase): class EvalTestCase(unittest.TestCase):
@ -12,7 +12,7 @@ class EvalTestCase(unittest.TestCase):
data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True) data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True)
train, test = data.training, data.test train, test = data.training, data.test
protocol = qp.protocol.APP(test, sample_size=1000, n_prevalences=21, repeats=1, random_seed=1) protocol = qp.protocol.APP(test, sample_size=1000, n_prevalences=11, repeats=1, random_seed=1)
class SlowLR(LogisticRegression): class SlowLR(LogisticRegression):
def predict_proba(self, X): def predict_proba(self, X):
@ -23,7 +23,7 @@ class EvalTestCase(unittest.TestCase):
emq = EMQ(SlowLR()).fit(train) emq = EMQ(SlowLR()).fit(train)
tinit = time() tinit = time()
score = qp.evaluation.evaluate(emq, protocol, error_metric='mae', verbose=True) score = qp.evaluation.evaluate(emq, protocol, error_metric='mae', verbose=True, aggr_speedup='force')
tend_optim = time()-tinit tend_optim = time()-tinit
print(f'evaluation (with optimization) took {tend_optim}s [MAE={score:.4f}]') print(f'evaluation (with optimization) took {tend_optim}s [MAE={score:.4f}]')
@ -50,7 +50,7 @@ class EvalTestCase(unittest.TestCase):
tend_no_optim = time() - tinit tend_no_optim = time() - tinit
print(f'evaluation (w/o optimization) took {tend_no_optim}s [MAE={score:.4f}]') print(f'evaluation (w/o optimization) took {tend_no_optim}s [MAE={score:.4f}]')
self.assertEqual(tend_no_optim>tend_optim, True) self.assertEqual(tend_no_optim>(tend_optim/2), True)
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -8,6 +8,7 @@ import quapy as qp
from method.aggregative import PACC from method.aggregative import PACC
from model_selection import GridSearchQ from model_selection import GridSearchQ
from protocol import APP from protocol import APP
import time
class ModselTestCase(unittest.TestCase): class ModselTestCase(unittest.TestCase):
@ -18,7 +19,6 @@ class ModselTestCase(unittest.TestCase):
data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10) data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10)
training, validation = data.training.split_stratified(0.7, random_state=1) training, validation = data.training.split_stratified(0.7, random_state=1)
# test = data.test
param_grid = {'C': np.logspace(-3,3,7)} param_grid = {'C': np.logspace(-3,3,7)}
app = APP(validation, sample_size=100, random_seed=1) app = APP(validation, sample_size=100, random_seed=1)
@ -50,6 +50,37 @@ class ModselTestCase(unittest.TestCase):
self.assertEqual(q.best_params_['C'], 10.0) self.assertEqual(q.best_params_['C'], 10.0)
self.assertEqual(q.best_model().get_params()['C'], 10.0) self.assertEqual(q.best_model().get_params()['C'], 10.0)
def test_modsel_parallel_speedup(self):
    """
    Checks that running the grid search with n_jobs=-1 takes less than half the time it takes
    with n_jobs=1, using a classifier whose fit is artificially slowed down by one second.
    """
    class SlowLR(LogisticRegression):
        def fit(self, X, y, sample_weight=None):
            time.sleep(1)
            return super(SlowLR, self).fit(X, y, sample_weight)

    q = PACC(SlowLR(random_state=1, max_iter=5000))

    data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10)
    training, validation = data.training.split_stratified(0.7, random_state=1)

    param_grid = {'C': np.logspace(-3, 3, 7)}
    app = APP(validation, sample_size=100, random_seed=1)

    def timed_grid_search(n_jobs):
        # returns the wall-clock time (in seconds) of one complete model selection
        tinit = time.time()
        GridSearchQ(
            q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=n_jobs, verbose=True
        ).fit(training)
        return time.time() - tinit

    tend_nooptim = timed_grid_search(n_jobs=1)
    tend_optim = timed_grid_search(n_jobs=-1)

    print(f'parallel training took {tend_optim:.4f}s')
    print(f'sequential training took {tend_nooptim:.4f}s')
    # assertLess reports both timings on failure, unlike assertEqual(<bool>, True)
    self.assertLess(tend_optim, 0.5 * tend_nooptim)
def test_modsel_timeout(self): def test_modsel_timeout(self):
class SlowLR(LogisticRegression): class SlowLR(LogisticRegression):

View File

@ -1,7 +1,7 @@
import unittest import unittest
import numpy as np import numpy as np
from data import LabelledCollection from quapy.data import LabelledCollection
from protocol import APP, NPP, USimplexPP, CovariateShiftPP, AbstractStochasticSeededProtocol from quapy.protocol import APP, NPP, USimplexPP, CovariateShiftPP, AbstractStochasticSeededProtocol
def mock_labelled_collection(prefix=''): def mock_labelled_collection(prefix=''):
@ -134,6 +134,5 @@ class TestProtocols(unittest.TestCase):
print('done') print('done')
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View File

@ -46,6 +46,7 @@ def parallel(func, args, n_jobs):
that takes the `quapy.environ` variable as input silently that takes the `quapy.environ` variable as input silently
""" """
print('n_jobs',n_jobs)
def func_dec(environ, *args): def func_dec(environ, *args):
qp.environ = environ qp.environ = environ
return func(*args) return func(*args)