forked from moreo/QuaPy
lequa as dataset
This commit is contained in: parent eba6fd8123, commit 45642ad778
@@ -9,9 +9,19 @@
 - ACC, PACC, Forman's threshold variants have been parallelized.
+- Exploration of hyperparameters in model selection can now be run in parallel (there was an n_jobs argument in
+  QuaPy 0.1.6, but only the evaluation part for one specific hyperparameter combination was run in parallel).
+- The prediction function has been refactored, so that it applies the optimization for aggregative quantifiers
+  (which consists in pre-classifying all instances and then only invoking aggregate on the samples) only in cases
+  in which the total number of classifications would be smaller than the number of classifications with the
+  standard procedure. The user can now specify "force", "auto", True, or False, in order to actively decide
+  whether to apply it or not.

 Things to fix:
-- clean functions like binary, aggregative, probabilistic, etc; those should be resolved via isinstance()
+- clean functions like binary, aggregative, probabilistic, etc; those should be resolved via isinstance():
+  this is not working; I don't know how to make the isinstance work. It looks like there is some problem with the
+  path of the imported class wrt the path of the class that arrives from another module...
 - clean classes_ and n_classes from methods (maybe not from aggregative ones, but those have to be used only
   internally and not imposed in any abstract class)
 - optimize "qp.evaluation.prediction" for aggregative methods (pre-classification)

@@ -33,6 +43,10 @@ Things to fix:
   stuff).
 - Check method def __parallel(self, func, *args, **kwargs) in aggregative.OneVsAll

+New features:
+- Add LeQua2022 to datasets (everything automatic, and with proper "gen" protocols).
+- Add an "experimental room", with scripts to quickly test new ideas and see results.

 # 0.1.7
 # change the LabelledCollection API (removing protocol-related samplings)
 # need to change the two references to the above in the wiki / doc, and code examples...
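A usage sketch for the new aggr_speedup switch described above (drawn from the tests added later in this commit; the dataset, learner, and parameter values are illustrative):

    import quapy as qp
    from quapy.method.aggregative import PACC
    from sklearn.linear_model import LogisticRegression

    # sketch: evaluate a fitted aggregative quantifier over an APP protocol, forcing the
    # pre-classification speedup ('auto', True, or False can be passed instead of 'force')
    data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True)
    model = PACC(LogisticRegression()).fit(data.training)
    protocol = qp.protocol.APP(data.test, sample_size=1000, n_prevalences=11, repeats=1, random_seed=1)
    mae = qp.evaluation.evaluate(model, protocol, error_metric='mae', aggr_speedup='force')
    print(f'MAE={mae:.4f}')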
@@ -43,6 +43,8 @@ UCI_DATASETS = ['acute.a', 'acute.b',
                 'wine-q-red', 'wine-q-white',
                 'yeast']

+LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B']
+

 def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset:
     """
@@ -532,4 +534,53 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->


 def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
     df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)


+def fetch_lequa2022(task, data_home=None):
+    """
+    Loads the data of one of the LeQua 2022 tasks (T1A, T1B, T2A, T2B): the training set plus the
+    sample generators for the validation (dev) and test samples.
+    """
+    from quapy.data._lequa2022 import load_raw_documents, load_vector_documents, SamplesFromDir
+
+    assert task in LEQUA2022_TASKS, \
+        f'Unknown task {task}. Valid ones are {LEQUA2022_TASKS}'
+    if data_home is None:
+        data_home = get_quapy_home()
+
+    URL_TRAINDEV = f'https://zenodo.org/record/6546188/files/{task}.train_dev.zip'
+    URL_TEST = f'https://zenodo.org/record/6546188/files/{task}.test.zip'
+    URL_TEST_PREV = f'https://zenodo.org/record/6546188/files/{task}.test_prevalences.zip'
+
+    lequa_dir = join(data_home, 'lequa2022')
+    os.makedirs(lequa_dir, exist_ok=True)
+
+    def download_unzip_and_remove(unzipped_path, url):
+        tmp_path = join(lequa_dir, task + '_tmp.zip')
+        download_file_if_not_exists(url, tmp_path)
+        with zipfile.ZipFile(tmp_path) as file:
+            file.extractall(unzipped_path)
+        os.remove(tmp_path)
+
+    if not os.path.exists(join(lequa_dir, task)):
+        download_unzip_and_remove(lequa_dir, URL_TRAINDEV)
+        download_unzip_and_remove(lequa_dir, URL_TEST)
+        download_unzip_and_remove(lequa_dir, URL_TEST_PREV)
+
+    if task in ['T1A', 'T1B']:
+        load_fn = load_vector_documents
+    elif task in ['T2A', 'T2B']:
+        load_fn = load_raw_documents
+
+    tr_path = join(lequa_dir, task, 'public', 'training_data.txt')
+    train = LabelledCollection.load(tr_path, loader_func=load_fn)
+
+    val_samples_path = join(lequa_dir, task, 'public', 'dev_samples')
+    val_true_prev_path = join(lequa_dir, task, 'public', 'dev_prevalences.txt')
+    val_gen = SamplesFromDir(val_samples_path, val_true_prev_path, load_fn=load_fn)
+
+    test_samples_path = join(lequa_dir, task, 'public', 'test_samples')
+    test_true_prev_path = join(lequa_dir, task, 'public', 'test_prevalences.txt')
+    test_gen = SamplesFromDir(test_samples_path, test_true_prev_path, load_fn=load_fn)
+
+    return train, val_gen, test_gen
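A brief usage sketch for the new loader (the import path mirrors the test added below; 'T1A' is one of LEQUA2022_TASKS):

    from quapy.data.datasets import fetch_lequa2022

    # sketch: download the task data on first use and load it
    train, val_gen, test_gen = fetch_lequa2022('T1A')
    train.stats()  # the training set is a LabelledCollection
    # val_gen and test_gen are sample generators (SamplesFromDir) meant to be fed to the
    # evaluation routines, in the spirit of the "gen" protocols mentioned in the changelog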
@@ -1,13 +1,9 @@
 from typing import Union, Callable, Iterable
 import numpy as np
 from tqdm import tqdm
-import inspect
 import quapy as qp
 from quapy.protocol import AbstractProtocol, OnLabelledCollectionProtocol
-from quapy.data import LabelledCollection
 from quapy.method.base import BaseQuantifier
-from quapy.util import temp_seed
-import quapy.functional as F
 import pandas as pd


@@ -22,7 +18,7 @@ def prediction(model: BaseQuantifier, protocol: AbstractProtocol, aggr_speedup='
     # checks whether the prediction can be made more efficiently; this check consists in verifying if the model is
     # of type aggregative, if the protocol is based on LabelledCollection, and if the total number of documents to
     # classify using the protocol would exceed the number of test documents in the original collection
-    from method.aggregative import AggregativeQuantifier
+    from quapy.method.aggregative import AggregativeQuantifier
     if isinstance(model, AggregativeQuantifier) and isinstance(protocol, OnLabelledCollectionProtocol):
         if aggr_speedup == 'force':
             apply_optimization = True

@@ -45,9 +41,9 @@ def prediction(model: BaseQuantifier, protocol: AbstractProtocol, aggr_speedup='

 def __prediction_helper(quantification_fn, protocol: AbstractProtocol, verbose=False):
     true_prevs, estim_prevs = [], []
-    for sample in tqdm(protocol(), total=protocol.total()) if verbose else protocol():
-        estim_prevs.append(quantification_fn(sample.instances))
-        true_prevs.append(sample.prevalence())
+    for sample_instances, sample_prev in tqdm(protocol(), total=protocol.total()) if verbose else protocol():
+        estim_prevs.append(quantification_fn(sample_instances))
+        true_prevs.append(sample_prev)

     true_prevs = np.asarray(true_prevs)
     estim_prevs = np.asarray(estim_prevs)
@@ -9,7 +9,6 @@ from tqdm import tqdm
 import quapy as qp
 from quapy import functional as F
 from quapy.data import LabelledCollection
-from quapy.evaluation import evaluate
 from quapy.model_selection import GridSearchQ

 try:

@@ -176,6 +175,7 @@ class Ensemble(BaseQuantifier):
         For each model in the ensemble, the performance is measured in terms of _error_name_ on the quantification of
         the samples used for training the rest of the models in the ensemble.
         """
+        from quapy.evaluation import evaluate
         error = qp.error.from_name(error_name)
         tests = [m[3] for m in self.ensemble]
         scores = []
@@ -81,6 +81,8 @@ class GridSearchQ(BaseQuantifier):
         self.param_scores_ = {}
         self.best_score_ = None

+        tinit = time()
+
         hyper = [dict({k: values[i] for i, k in enumerate(params_keys)}) for values in itertools.product(*params_values)]
         scores = qp.util.parallel(self._delayed_eval, ((params, training) for params in hyper), n_jobs=n_jobs)

@@ -94,10 +96,13 @@ class GridSearchQ(BaseQuantifier):
             else:
                 self.param_scores_[str(params)] = 'timeout'

+        tend = time()-tinit
+
         if self.best_score_ is None:
             raise TimeoutError('all jobs took more than the timeout time to end')

-        self._sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f})')
+        self._sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f}) '
+                   f'[took {tend:.4f}s]')

         if self.refit:
             if isinstance(protocol, OnLabelledCollectionProtocol):
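A usage sketch for the parallel exploration of hyperparameters mentioned in the changelog (mirroring the new test_modsel_parallel_speedup test further down; import paths and values are illustrative): n_jobs now controls how many hyperparameter configurations are evaluated concurrently.

    import numpy as np
    import quapy as qp
    from quapy.method.aggregative import PACC
    from quapy.model_selection import GridSearchQ
    from quapy.protocol import APP
    from sklearn.linear_model import LogisticRegression

    # sketch: explore C over an APP validation protocol; n_jobs=-1 uses all cores, n_jobs=1 is sequential
    data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10)
    training, validation = data.training.split_stratified(0.7, random_state=1)
    app = APP(validation, sample_size=100, random_seed=1)
    q = GridSearchQ(PACC(LogisticRegression()), {'C': np.logspace(-3, 3, 7)},
                    protocol=app, error='mae', refit=False, timeout=-1, n_jobs=-1, verbose=True).fit(training)
    print(q.best_params_, q.best_score_)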
@@ -1,14 +1,11 @@
 from copy import deepcopy

 import quapy as qp
 import numpy as np
 import itertools
-from collections.abc import Generator
 from contextlib import ExitStack
 from abc import ABCMeta, abstractmethod
 from quapy.data import LabelledCollection
 import quapy.functional as F
-from tqdm import tqdm
 from os.path import exists
 from glob import glob
@@ -87,10 +84,14 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
             if self.random_seed is not None:
                 stack.enter_context(qp.util.temp_seed(self.random_seed))
             for params in self.samples_parameters():
-                yield self.sample(params)
+                yield self.collator_fn(self.sample(params))
+
+    def set_collator(self, collator_fn):
+        self.collator_fn = collator_fn


 class OnLabelledCollectionProtocol:

     def get_labelled_collection(self):
         return self.data
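A minimal sketch of what the collator hook enables (the collator installed by APP, NPP, and USimplexPP below returns (instances, prevalence) pairs; dataset and parameter values are illustrative):

    import quapy as qp
    from quapy.protocol import APP

    # sketch: with the (instances, prevalence) collator in place, a protocol can be iterated
    # directly, which is the pattern now used by evaluation.__prediction_helper
    data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True)
    protocol = APP(data.test, sample_size=500, n_prevalences=11, repeats=1, random_seed=1)
    for sample_instances, sample_prev in protocol():
        print(sample_instances.shape, sample_prev)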
@@ -106,31 +107,6 @@ class OnLabelledCollectionProtocol:
         return new.on_preclassified_instances(pre_classifications, in_place=True)


-class LoadSamplesFromDirectory(AbstractProtocol):
-
-    def __init__(self, folder_path, loader_fn, classes=None, **loader_kwargs):
-        assert exists(folder_path), f'folder {folder_path} does not exist'
-        assert callable(loader_fn), f'the passed load_fn does not seem to be callable'
-        self.folder_path = folder_path
-        self.loader_fn = loader_fn
-        self.classes = classes
-        self.loader_kwargs = loader_kwargs
-        self._list_files = None
-
-    def __call__(self):
-        for file in self.list_files:
-            yield LabelledCollection.load(file, loader_func=self.loader_fn, classes=self.classes, **self.loader_kwargs)
-
-    @property
-    def list_files(self):
-        if self._list_files is None:
-            self._list_files = sorted(glob(self.folder_path, '*'))
-        return self._list_files
-
-    def total(self):
-        return len(self.list_files)


 class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
     """
     Implementation of the artificial prevalence protocol (APP).
@@ -154,6 +130,7 @@ class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
         self.sample_size = sample_size
         self.n_prevalences = n_prevalences
         self.repeats = repeats
+        self.set_collator(collator_fn=lambda x: (x.instances, x.prevalence()))

     def prevalence_grid(self):
         """

@@ -210,6 +187,7 @@ class NPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
         self.sample_size = sample_size
         self.repeats = repeats
         self.random_seed = random_seed
+        self.set_collator(collator_fn=lambda x: (x.instances, x.prevalence()))

     def samples_parameters(self):
         indexes = []

@@ -246,6 +224,7 @@ class USimplexPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
         self.sample_size = sample_size
         self.repeats = repeats
         self.random_seed = random_seed
+        self.set_collator(collator_fn=lambda x: (x.instances, x.prevalence()))

     def samples_parameters(self):
         indexes = []
@@ -261,6 +240,31 @@ class USimplexPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
         return self.repeats


+# class LoadSamplesFromDirectory(AbstractProtocol):
+#
+#     def __init__(self, folder_path, loader_fn, classes=None, **loader_kwargs):
+#         assert exists(folder_path), f'folder {folder_path} does not exist'
+#         assert callable(loader_fn), f'the passed load_fn does not seem to be callable'
+#         self.folder_path = folder_path
+#         self.loader_fn = loader_fn
+#         self.classes = classes
+#         self.loader_kwargs = loader_kwargs
+#         self._list_files = None
+#
+#     def __call__(self):
+#         for file in self.list_files:
+#             yield LabelledCollection.load(file, loader_func=self.loader_fn, classes=self.classes, **self.loader_kwargs)
+#
+#     @property
+#     def list_files(self):
+#         if self._list_files is None:
+#             self._list_files = sorted(glob(self.folder_path, '*'))
+#         return self._list_files
+#
+#     def total(self):
+#         return len(self.list_files)


 class CovariateShiftPP(AbstractStochasticSeededProtocol):
     """
     Generates mixtures of two domains (A and B) at controlled rates, but preserving the original class prevalence.
@@ -1,7 +1,8 @@
 import pytest

 from quapy.data.datasets import REVIEWS_SENTIMENT_DATASETS, TWITTER_SENTIMENT_DATASETS_TEST, \
-    TWITTER_SENTIMENT_DATASETS_TRAIN, UCI_DATASETS, fetch_reviews, fetch_twitter, fetch_UCIDataset
+    TWITTER_SENTIMENT_DATASETS_TRAIN, UCI_DATASETS, LEQUA2022_TASKS, \
+    fetch_reviews, fetch_twitter, fetch_UCIDataset, fetch_lequa2022


 @pytest.mark.parametrize('dataset_name', REVIEWS_SENTIMENT_DATASETS)

@@ -41,3 +42,13 @@ def test_fetch_UCIDataset(dataset_name):
     print('Training set stats')
     dataset.training.stats()
     print('Test set stats')
+
+
+@pytest.mark.parametrize('dataset_name', LEQUA2022_TASKS)
+def test_fetch_lequa2022(dataset_name):
+    fetch_lequa2022(dataset_name)
+    # dataset = fetch_lequa2022(dataset_name)
+    # print(f'Dataset {dataset_name}')
+    # print('Training set stats')
+    # dataset.training.stats()
+    # print('Test set stats')
@@ -2,8 +2,8 @@ import unittest
 import quapy as qp
 from sklearn.linear_model import LogisticRegression
 from time import time
-from method.aggregative import EMQ
-from method.base import BaseQuantifier
+from quapy.method.aggregative import EMQ
+from quapy.method.base import BaseQuantifier


 class EvalTestCase(unittest.TestCase):

@@ -12,7 +12,7 @@ class EvalTestCase(unittest.TestCase):
         data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True)
         train, test = data.training, data.test

-        protocol = qp.protocol.APP(test, sample_size=1000, n_prevalences=21, repeats=1, random_seed=1)
+        protocol = qp.protocol.APP(test, sample_size=1000, n_prevalences=11, repeats=1, random_seed=1)

         class SlowLR(LogisticRegression):
             def predict_proba(self, X):

@@ -23,7 +23,7 @@ class EvalTestCase(unittest.TestCase):
         emq = EMQ(SlowLR()).fit(train)

         tinit = time()
-        score = qp.evaluation.evaluate(emq, protocol, error_metric='mae', verbose=True)
+        score = qp.evaluation.evaluate(emq, protocol, error_metric='mae', verbose=True, aggr_speedup='force')
         tend_optim = time()-tinit
         print(f'evaluation (with optimization) took {tend_optim}s [MAE={score:.4f}]')

@@ -50,7 +50,7 @@ class EvalTestCase(unittest.TestCase):
         tend_no_optim = time() - tinit
         print(f'evaluation (w/o optimization) took {tend_no_optim}s [MAE={score:.4f}]')

-        self.assertEqual(tend_no_optim>tend_optim, True)
+        self.assertEqual(tend_no_optim>(tend_optim/2), True)


 if __name__ == '__main__':
@@ -8,6 +8,7 @@ import quapy as qp
 from method.aggregative import PACC
 from model_selection import GridSearchQ
 from protocol import APP
+import time


 class ModselTestCase(unittest.TestCase):

@@ -18,7 +19,6 @@ class ModselTestCase(unittest.TestCase):

         data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10)
         training, validation = data.training.split_stratified(0.7, random_state=1)
-        # test = data.test

         param_grid = {'C': np.logspace(-3,3,7)}
         app = APP(validation, sample_size=100, random_seed=1)

@@ -50,6 +50,37 @@ class ModselTestCase(unittest.TestCase):
         self.assertEqual(q.best_params_['C'], 10.0)
         self.assertEqual(q.best_model().get_params()['C'], 10.0)

+    def test_modsel_parallel_speedup(self):
+        class SlowLR(LogisticRegression):
+            def fit(self, X, y, sample_weight=None):
+                time.sleep(1)
+                return super(SlowLR, self).fit(X, y, sample_weight)
+
+        q = PACC(SlowLR(random_state=1, max_iter=5000))
+
+        data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10)
+        training, validation = data.training.split_stratified(0.7, random_state=1)
+
+        param_grid = {'C': np.logspace(-3, 3, 7)}
+        app = APP(validation, sample_size=100, random_seed=1)
+
+        tinit = time.time()
+        GridSearchQ(
+            q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=1, verbose=True
+        ).fit(training)
+        tend_nooptim = time.time()-tinit
+
+        tinit = time.time()
+        GridSearchQ(
+            q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=-1, verbose=True
+        ).fit(training)
+        tend_optim = time.time() - tinit
+
+        print(f'parallel training took {tend_optim:.4f}s')
+        print(f'sequential training took {tend_nooptim:.4f}s')
+
+        self.assertEqual(tend_optim < (0.5*tend_nooptim), True)
+
     def test_modsel_timeout(self):

         class SlowLR(LogisticRegression):
@@ -1,7 +1,7 @@
 import unittest
 import numpy as np
-from data import LabelledCollection
-from protocol import APP, NPP, USimplexPP, CovariateShiftPP, AbstractStochasticSeededProtocol
+from quapy.data import LabelledCollection
+from quapy.protocol import APP, NPP, USimplexPP, CovariateShiftPP, AbstractStochasticSeededProtocol


 def mock_labelled_collection(prefix=''):

@@ -134,6 +134,5 @@ class TestProtocols(unittest.TestCase):
         print('done')

-
 if __name__ == '__main__':
     unittest.main()
@@ -46,6 +46,7 @@ def parallel(func, args, n_jobs):
     that takes the `quapy.environ` variable as input silently
     """
+    print('n_jobs',n_jobs)
     def func_dec(environ, *args):
         qp.environ = environ
         return func(*args)
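For reference, this helper is called elsewhere in the commit (see GridSearchQ above) with an iterable of argument tuples, one per job; a small self-contained sketch of that calling convention (toy function and values, assuming each tuple is unpacked as positional arguments, as the GridSearchQ call suggests):

    import quapy as qp

    def add(a, b):
        # toy worker; in GridSearchQ the worker is self._delayed_eval, called with (params, training)
        return a + b

    results = qp.util.parallel(add, ((i, i + 1) for i in range(4)), n_jobs=2)
    print(results)  # expected: [1, 3, 5, 7]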