lequa as dataset

2022-06-01 18:28:59 +02:00 · 2022-06-01 18:28:59 +02:00 · 45642ad778
parent eba6fd8123
commit 45642ad778
11 changed files with 163 additions and 51 deletions
--- a/quapy/CHANGE_LOG.txt
+++ b/quapy/CHANGE_LOG.txt
@ -9,9 +9,19 @@
 - ACC, PACC, Forman's threshold variants have been parallelized.
 - Exploration of hyperparameters in Model selection can now be run in parallel (it was a n_jobs argument in
    QuaPy 0.1.6 but only the evaluation part for one specific hyperparameter was run in parallel).
 - The prediction function has been refactored, so it applies the optimization for aggregative quantifiers (that
    consists in pre-classifying all instances, and then only invoking aggregate on the samples) only in cases in
    which the total number of classifications would be smaller than the number of classifications with the standard
    procedure. The user can now specify "force", "auto", True of False, in order to actively decide for applying it
    or not.
 Things to fix:
- clean functions like binary, aggregative, probabilistic, etc; those should be resolved via isinstance()
+- clean functions like binary, aggregative, probabilistic, etc; those should be resolved via isinstance():
    this is not working; I don't know how to make the isinstance work. Looks like there is some problem with the
    path of the imported class wrt the path of the class that arrives from another module...
 - clean classes_ and n_classes from methods (maybe not from aggregative ones, but those have to be used only
    internally and not imposed in any abstract class)
 - optimize "qp.evaluation.prediction" for aggregative methods (pre-classification)
@ -33,6 +43,10 @@ Things to fix:
    stuff).
 - Check method  def __parallel(self, func, *args, **kwargs) in aggregative.OneVsAll
 New features:
 - Add LeQua2022 to datasets (everything automatic, and with proper protocols "gen")
 - Add an "experimental room", with scripts to quickly test new ideas and see results.
 # 0.1.7
 # change the LabelledCollection API (removing protocol-related samplings)
 # need to change the two references to the above in the wiki / doc, and code examples...
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@ -43,6 +43,8 @@ UCI_DATASETS = ['acute.a', 'acute.b',
                'wine-q-red', 'wine-q-white',
                'yeast']
 LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B']
 def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset:
    """
@ -532,4 +534,53 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
 def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
-    df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)
+    df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)
 def fetch_lequa2022(task, data_home=None):
    """
    """
    from quapy.data._lequa2022 import load_raw_documents, load_vector_documents, SamplesFromDir
    assert task in LEQUA2022_TASKS, \
        f'Unknown task {task}. Valid ones are {LEQUA2022_TASKS}'
    if data_home is None:
        data_home = get_quapy_home()
    URL_TRAINDEV=f'https://zenodo.org/record/6546188/files/{task}.train_dev.zip'
    URL_TEST=f'https://zenodo.org/record/6546188/files/{task}.test.zip'
    URL_TEST_PREV=f'https://zenodo.org/record/6546188/files/{task}.test_prevalences.zip'
    lequa_dir = join(data_home, 'lequa2022')
    os.makedirs(lequa_dir, exist_ok=True)
    def download_unzip_and_remove(unzipped_path, url):
        tmp_path = join(lequa_dir, task + '_tmp.zip')
        download_file_if_not_exists(url, tmp_path)
        with zipfile.ZipFile(tmp_path) as file:
            file.extractall(unzipped_path)
        os.remove(tmp_path)
    if not os.path.exists(join(lequa_dir, task)):
        download_unzip_and_remove(lequa_dir, URL_TRAINDEV)
        download_unzip_and_remove(lequa_dir, URL_TEST)
        download_unzip_and_remove(lequa_dir, URL_TEST_PREV)
    if task in ['T1A', 'T1B']:
        load_fn = load_vector_documents
    elif task in ['T2A', 'T2B']:
        load_fn = load_raw_documents
    tr_path = join(lequa_dir, task, 'public', 'training_data.txt')
    train = LabelledCollection.load(tr_path, loader_func=load_fn)
    val_samples_path = join(lequa_dir, task, 'public', 'dev_samples')
    val_true_prev_path = join(lequa_dir, task, 'public', 'dev_prevalences.txt')
    val_gen = SamplesFromDir(val_samples_path, val_true_prev_path, load_fn=load_fn)
    test_samples_path = join(lequa_dir, task, 'public', 'dev_samples')
    test_true_prev_path = join(lequa_dir, task, 'public', 'test_prevalences.txt')
    test_gen = SamplesFromDir(val_samples_path, val_true_prev_path, load_fn=load_fn)
    return train, val_gen, test_gen
--- a/quapy/evaluation.py
+++ b/quapy/evaluation.py
@ -1,13 +1,9 @@
 from typing import Union, Callable, Iterable
 import numpy as np
 from tqdm import tqdm
 import inspect
 import quapy as qp
 from quapy.protocol import AbstractProtocol, OnLabelledCollectionProtocol
 from quapy.data import LabelledCollection
 from quapy.method.base import BaseQuantifier
 from quapy.util import temp_seed
 import quapy.functional as F
 import pandas as pd
@ -22,7 +18,7 @@ def prediction(model: BaseQuantifier, protocol: AbstractProtocol, aggr_speedup='
        # checks whether the prediction can be made more efficiently; this check consists in verifying if the model is
        # of type aggregative, if the protocol is based on LabelledCollection, and if the total number of documents to
        # classify using the protocol would exceed the number of test documents in the original collection
-        from method.aggregative import AggregativeQuantifier
+        from quapy.method.aggregative import AggregativeQuantifier
        if isinstance(model, AggregativeQuantifier) and isinstance(protocol, OnLabelledCollectionProtocol):
            if aggr_speedup == 'force':
                apply_optimization = True
@ -45,9 +41,9 @@ def prediction(model: BaseQuantifier, protocol: AbstractProtocol, aggr_speedup='
 def __prediction_helper(quantification_fn, protocol: AbstractProtocol, verbose=False):
    true_prevs, estim_prevs = [], []
-    for sample in tqdm(protocol(), total=protocol.total()) if verbose else protocol():
+    for sample_instances, sample_prev in tqdm(protocol(), total=protocol.total()) if verbose else protocol():
-        estim_prevs.append(quantification_fn(sample.instances))
+        estim_prevs.append(quantification_fn(sample_instances))
-        true_prevs.append(sample.prevalence())
+        true_prevs.append(sample_prev)
    true_prevs = np.asarray(true_prevs)
    estim_prevs = np.asarray(estim_prevs)
--- a/quapy/method/meta.py
+++ b/quapy/method/meta.py
@ -9,7 +9,6 @@ from tqdm import tqdm
 import quapy as qp
 from quapy import functional as F
 from quapy.data import LabelledCollection
 from quapy.evaluation import evaluate
 from quapy.model_selection import GridSearchQ
 try:
@ -176,6 +175,7 @@ class Ensemble(BaseQuantifier):
        For each model in the ensemble, the performance is measured in terms of _error_name_ on the quantification of
        the samples used for training the rest of the models in the ensemble.
        """
        from quapy.evaluation import evaluate
        error = qp.error.from_name(error_name)
        tests = [m[3] for m in self.ensemble]
        scores = []
--- a/quapy/model_selection.py
+++ b/quapy/model_selection.py
@ -81,6 +81,8 @@ class GridSearchQ(BaseQuantifier):
        self.param_scores_ = {}
        self.best_score_ = None
        tinit = time()
        hyper = [dict({k: values[i] for i, k in enumerate(params_keys)}) for values in itertools.product(*params_values)]
        scores = qp.util.parallel(self._delayed_eval, ((params, training) for params in hyper), n_jobs=n_jobs)
@ -94,10 +96,13 @@ class GridSearchQ(BaseQuantifier):
            else:
                self.param_scores_[str(params)] = 'timeout'
        tend = time()-tinit
        if self.best_score_ is None:
            raise TimeoutError('all jobs took more than the timeout time to end')
-        self._sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f})')
+        self._sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f}) '
                   f'[took {tend:.4f}s]')
        if self.refit:
            if isinstance(protocol, OnLabelledCollectionProtocol):
--- a/quapy/protocol.py
+++ b/quapy/protocol.py
@ -1,14 +1,11 @@
 from copy import deepcopy
 import quapy as qp
 import numpy as np
 import itertools
 from collections.abc import Generator
 from contextlib import ExitStack
 from abc import ABCMeta, abstractmethod
 from quapy.data import LabelledCollection
 import quapy.functional as F
 from tqdm import tqdm
 from os.path import exists
 from glob import glob
@ -87,10 +84,14 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
            if self.random_seed is not None:
                stack.enter_context(qp.util.temp_seed(self.random_seed))
            for params in self.samples_parameters():
-                yield self.sample(params)
+                yield self.collator_fn(self.sample(params))
    def set_collator(self, collator_fn):
        self.collator_fn = collator_fn
 class OnLabelledCollectionProtocol:
    def get_labelled_collection(self):
        return self.data
@ -106,31 +107,6 @@ class OnLabelledCollectionProtocol:
            return new.on_preclassified_instances(pre_classifications, in_place=True)
 class LoadSamplesFromDirectory(AbstractProtocol):
    def __init__(self, folder_path, loader_fn, classes=None, **loader_kwargs):
        assert exists(folder_path), f'folder {folder_path} does not exist'
        assert callable(loader_fn), f'the passed load_fn does not seem to be callable'
        self.folder_path = folder_path
        self.loader_fn = loader_fn
        self.classes = classes
        self.loader_kwargs = loader_kwargs
        self._list_files = None
    def __call__(self):
        for file in self.list_files:
            yield LabelledCollection.load(file, loader_func=self.loader_fn, classes=self.classes, **self.loader_kwargs)
    @property
    def list_files(self):
        if self._list_files is None:
            self._list_files = sorted(glob(self.folder_path, '*'))
        return self._list_files
    def total(self):
        return len(self.list_files)
 class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
    """
    Implementation of the artificial prevalence protocol (APP).
@ -154,6 +130,7 @@ class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
        self.sample_size = sample_size
        self.n_prevalences = n_prevalences
        self.repeats = repeats
        self.set_collator(collator_fn=lambda x: (x.instances, x.prevalence()))
    def prevalence_grid(self):
        """
@ -210,6 +187,7 @@ class NPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
        self.sample_size = sample_size
        self.repeats = repeats
        self.random_seed = random_seed
        self.set_collator(collator_fn=lambda x: (x.instances, x.prevalence()))
    def samples_parameters(self):
        indexes = []
@ -246,6 +224,7 @@ class USimplexPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol)
        self.sample_size = sample_size
        self.repeats = repeats
        self.random_seed = random_seed
        self.set_collator(collator_fn=lambda x: (x.instances, x.prevalence()))
    def samples_parameters(self):
        indexes = []
@ -261,6 +240,31 @@ class USimplexPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol)
        return self.repeats
 # class LoadSamplesFromDirectory(AbstractProtocol):
 #
 #     def __init__(self, folder_path, loader_fn, classes=None, **loader_kwargs):
 #         assert exists(folder_path), f'folder {folder_path} does not exist'
 #         assert callable(loader_fn), f'the passed load_fn does not seem to be callable'
 #         self.folder_path = folder_path
 #         self.loader_fn = loader_fn
 #         self.classes = classes
 #         self.loader_kwargs = loader_kwargs
 #         self._list_files = None
 #
 #     def __call__(self):
 #         for file in self.list_files:
 #             yield LabelledCollection.load(file, loader_func=self.loader_fn, classes=self.classes, **self.loader_kwargs)
 #
 #     @property
 #     def list_files(self):
 #         if self._list_files is None:
 #             self._list_files = sorted(glob(self.folder_path, '*'))
 #         return self._list_files
 #
 #     def total(self):
 #         return len(self.list_files)
 class CovariateShiftPP(AbstractStochasticSeededProtocol):
    """
    Generates mixtures of two domains (A and B) at controlled rates, but preserving the original class prevalence.
--- a/quapy/tests/test_datasets.py
+++ b/quapy/tests/test_datasets.py
@ -1,7 +1,8 @@
 import pytest
 from quapy.data.datasets import REVIEWS_SENTIMENT_DATASETS, TWITTER_SENTIMENT_DATASETS_TEST, \
-    TWITTER_SENTIMENT_DATASETS_TRAIN, UCI_DATASETS, fetch_reviews, fetch_twitter, fetch_UCIDataset
+    TWITTER_SENTIMENT_DATASETS_TRAIN, UCI_DATASETS, LEQUA2022_TASKS, \
    fetch_reviews, fetch_twitter, fetch_UCIDataset, fetch_lequa2022
@pytest.mark.parametrize('dataset_name', REVIEWS_SENTIMENT_DATASETS)
@ -41,3 +42,13 @@ def test_fetch_UCIDataset(dataset_name):
    print('Training set stats')
    dataset.training.stats()
    print('Test set stats')
@pytest.mark.parametrize('dataset_name', LEQUA2022_TASKS)
 def test_fetch_lequa2022(dataset_name):
    fetch_lequa2022(dataset_name)
    # dataset = fetch_lequa2022(dataset_name)
    # print(f'Dataset {dataset_name}')
    # print('Training set stats')
    # dataset.training.stats()
    # print('Test set stats')
--- a/quapy/tests/test_evaluation.py
+++ b/quapy/tests/test_evaluation.py
@ -2,8 +2,8 @@ import unittest
 import quapy as qp
 from sklearn.linear_model import LogisticRegression
 from time import time
-from method.aggregative import EMQ
+from quapy.method.aggregative import EMQ
-from method.base import BaseQuantifier
+from quapy.method.base import BaseQuantifier
 class EvalTestCase(unittest.TestCase):
@ -12,7 +12,7 @@ class EvalTestCase(unittest.TestCase):
        data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True)
        train, test = data.training, data.test
-        protocol = qp.protocol.APP(test, sample_size=1000, n_prevalences=21, repeats=1, random_seed=1)
+        protocol = qp.protocol.APP(test, sample_size=1000, n_prevalences=11, repeats=1, random_seed=1)
        class SlowLR(LogisticRegression):
            def predict_proba(self, X):
@ -23,7 +23,7 @@ class EvalTestCase(unittest.TestCase):
        emq = EMQ(SlowLR()).fit(train)
        tinit = time()
-        score = qp.evaluation.evaluate(emq, protocol, error_metric='mae', verbose=True)
+        score = qp.evaluation.evaluate(emq, protocol, error_metric='mae', verbose=True, aggr_speedup='force')
        tend_optim = time()-tinit
        print(f'evaluation (with optimization) took {tend_optim}s [MAE={score:.4f}]')
@ -50,7 +50,7 @@ class EvalTestCase(unittest.TestCase):
        tend_no_optim = time() - tinit
        print(f'evaluation (w/o optimization) took {tend_no_optim}s [MAE={score:.4f}]')
-        self.assertEqual(tend_no_optim>tend_optim, True)
+        self.assertEqual(tend_no_optim>(tend_optim/2), True)
 if __name__ == '__main__':
--- a/quapy/tests/test_modsel.py
+++ b/quapy/tests/test_modsel.py
@ -8,6 +8,7 @@ import quapy as qp
 from method.aggregative import PACC
 from model_selection import GridSearchQ
 from protocol import APP
 import time
 class ModselTestCase(unittest.TestCase):
@ -18,7 +19,6 @@ class ModselTestCase(unittest.TestCase):
        data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10)
        training, validation = data.training.split_stratified(0.7, random_state=1)
        # test = data.test
        param_grid = {'C': np.logspace(-3,3,7)}
        app = APP(validation, sample_size=100, random_seed=1)
@ -50,6 +50,37 @@ class ModselTestCase(unittest.TestCase):
        self.assertEqual(q.best_params_['C'], 10.0)
        self.assertEqual(q.best_model().get_params()['C'], 10.0)
    def test_modsel_parallel_speedup(self):
        class SlowLR(LogisticRegression):
            def fit(self, X, y, sample_weight=None):
                time.sleep(1)
                return super(SlowLR, self).fit(X, y, sample_weight)
        q = PACC(SlowLR(random_state=1, max_iter=5000))
        data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10)
        training, validation = data.training.split_stratified(0.7, random_state=1)
        param_grid = {'C': np.logspace(-3, 3, 7)}
        app = APP(validation, sample_size=100, random_seed=1)
        tinit = time.time()
        GridSearchQ(
            q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=1, verbose=True
        ).fit(training)
        tend_nooptim = time.time()-tinit
        tinit = time.time()
        GridSearchQ(
            q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=-1, verbose=True
        ).fit(training)
        tend_optim = time.time() - tinit
        print(f'parallel training took {tend_optim:.4f}s')
        print(f'sequential training took {tend_nooptim:.4f}s')
        self.assertEqual(tend_optim < (0.5*tend_nooptim), True)
    def test_modsel_timeout(self):
        class SlowLR(LogisticRegression):
--- a/quapy/tests/test_protocols.py
+++ b/quapy/tests/test_protocols.py
@ -1,7 +1,7 @@
 import unittest
 import numpy as np
-from data import LabelledCollection
+from quapy.data import LabelledCollection
-from protocol import APP, NPP, USimplexPP, CovariateShiftPP, AbstractStochasticSeededProtocol
+from quapy.protocol import APP, NPP, USimplexPP, CovariateShiftPP, AbstractStochasticSeededProtocol
 def mock_labelled_collection(prefix=''):
@ -134,6 +134,5 @@ class TestProtocols(unittest.TestCase):
            print('done')
 if __name__ == '__main__':
    unittest.main()
--- a/quapy/util.py
+++ b/quapy/util.py
@ -46,6 +46,7 @@ def parallel(func, args, n_jobs):
    that takes the `quapy.environ` variable as input silently
    """
    print('n_jobs',n_jobs)
    def func_dec(environ, *args):
        qp.environ = environ
        return func(*args)