lequa as dataset

2022-06-01 18:28:59 +02:00 · 2022-06-01 18:28:59 +02:00 · 45642ad778
parent eba6fd8123
commit 45642ad778
11 changed files with 163 additions and 51 deletions
--- a/quapy/CHANGE_LOG.txt
+++ b/quapy/CHANGE_LOG.txt
@ -9,9 +9,19 @@

 - ACC, PACC, Forman's threshold variants have been parallelized.

+- Exploration of hyperparameters in Model selection can now be run in parallel (it was a n_jobs argument in
+    QuaPy 0.1.6 but only the evaluation part for one specific hyperparameter was run in parallel).
+
+- The prediction function has been refactored, so it applies the optimization for aggregative quantifiers (that
+    consists in pre-classifying all instances, and then only invoking aggregate on the samples) only in cases in
+    which the total number of classifications would be smaller than the number of classifications with the standard
+    procedure. The user can now specify "force", "auto", True of False, in order to actively decide for applying it
+    or not.

 Things to fix:
- clean functions like binary, aggregative, probabilistic, etc; those should be resolved via isinstance()
+- clean functions like binary, aggregative, probabilistic, etc; those should be resolved via isinstance():
+    this is not working; I don't know how to make the isinstance work. Looks like there is some problem with the
+    path of the imported class wrt the path of the class that arrives from another module...
 - clean classes_ and n_classes from methods (maybe not from aggregative ones, but those have to be used only
    internally and not imposed in any abstract class)
 - optimize "qp.evaluation.prediction" for aggregative methods (pre-classification)
@ -33,6 +43,10 @@ Things to fix:
    stuff).
 - Check method  def __parallel(self, func, *args, **kwargs) in aggregative.OneVsAll

+New features:
+- Add LeQua2022 to datasets (everything automatic, and with proper protocols "gen")
+- Add an "experimental room", with scripts to quickly test new ideas and see results.
+
 # 0.1.7
 # change the LabelledCollection API (removing protocol-related samplings)
 # need to change the two references to the above in the wiki / doc, and code examples...
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@ -43,6 +43,8 @@ UCI_DATASETS = ['acute.a', 'acute.b',
                'wine-q-red', 'wine-q-white',
                'yeast']

+LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B']
+

 def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset:
    """
@ -532,4 +534,53 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->


 def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
-    df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)
+    df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)
+
+
+def fetch_lequa2022(task, data_home=None):
+    """
+    """
+    from quapy.data._lequa2022 import load_raw_documents, load_vector_documents, SamplesFromDir
+
+    assert task in LEQUA2022_TASKS, \
+        f'Unknown task {task}. Valid ones are {LEQUA2022_TASKS}'
+    if data_home is None:
+        data_home = get_quapy_home()
+
+    URL_TRAINDEV=f'https://zenodo.org/record/6546188/files/{task}.train_dev.zip'
+    URL_TEST=f'https://zenodo.org/record/6546188/files/{task}.test.zip'
+    URL_TEST_PREV=f'https://zenodo.org/record/6546188/files/{task}.test_prevalences.zip'
+
+    lequa_dir = join(data_home, 'lequa2022')
+    os.makedirs(lequa_dir, exist_ok=True)
+
+    def download_unzip_and_remove(unzipped_path, url):
+        tmp_path = join(lequa_dir, task + '_tmp.zip')
+        download_file_if_not_exists(url, tmp_path)
+        with zipfile.ZipFile(tmp_path) as file:
+            file.extractall(unzipped_path)
+        os.remove(tmp_path)
+
+    if not os.path.exists(join(lequa_dir, task)):
+        download_unzip_and_remove(lequa_dir, URL_TRAINDEV)
+        download_unzip_and_remove(lequa_dir, URL_TEST)
+        download_unzip_and_remove(lequa_dir, URL_TEST_PREV)
+
+    if task in ['T1A', 'T1B']:
+        load_fn = load_vector_documents
+    elif task in ['T2A', 'T2B']:
+        load_fn = load_raw_documents
+
+    tr_path = join(lequa_dir, task, 'public', 'training_data.txt')
+    train = LabelledCollection.load(tr_path, loader_func=load_fn)
+
+    val_samples_path = join(lequa_dir, task, 'public', 'dev_samples')
+    val_true_prev_path = join(lequa_dir, task, 'public', 'dev_prevalences.txt')
+    val_gen = SamplesFromDir(val_samples_path, val_true_prev_path, load_fn=load_fn)
+
+    test_samples_path = join(lequa_dir, task, 'public', 'dev_samples')
+    test_true_prev_path = join(lequa_dir, task, 'public', 'test_prevalences.txt')
+    test_gen = SamplesFromDir(val_samples_path, val_true_prev_path, load_fn=load_fn)
+
+    return train, val_gen, test_gen
+
--- a/quapy/evaluation.py
+++ b/quapy/evaluation.py
@ -1,13 +1,9 @@
 from typing import Union, Callable, Iterable
 import numpy as np
 from tqdm import tqdm
-import inspect
 import quapy as qp
 from quapy.protocol import AbstractProtocol, OnLabelledCollectionProtocol
-from quapy.data import LabelledCollection
 from quapy.method.base import BaseQuantifier
-from quapy.util import temp_seed
-import quapy.functional as F
 import pandas as pd


@ -22,7 +18,7 @@ def prediction(model: BaseQuantifier, protocol: AbstractProtocol, aggr_speedup='
        # checks whether the prediction can be made more efficiently; this check consists in verifying if the model is
        # of type aggregative, if the protocol is based on LabelledCollection, and if the total number of documents to
        # classify using the protocol would exceed the number of test documents in the original collection
-        from method.aggregative import AggregativeQuantifier
+        from quapy.method.aggregative import AggregativeQuantifier
        if isinstance(model, AggregativeQuantifier) and isinstance(protocol, OnLabelledCollectionProtocol):
            if aggr_speedup == 'force':
                apply_optimization = True
@ -45,9 +41,9 @@ def prediction(model: BaseQuantifier, protocol: AbstractProtocol, aggr_speedup='

 def __prediction_helper(quantification_fn, protocol: AbstractProtocol, verbose=False):
    true_prevs, estim_prevs = [], []
-    for sample in tqdm(protocol(), total=protocol.total()) if verbose else protocol():
-        estim_prevs.append(quantification_fn(sample.instances))
-        true_prevs.append(sample.prevalence())
+    for sample_instances, sample_prev in tqdm(protocol(), total=protocol.total()) if verbose else protocol():
+        estim_prevs.append(quantification_fn(sample_instances))
+        true_prevs.append(sample_prev)

    true_prevs = np.asarray(true_prevs)
    estim_prevs = np.asarray(estim_prevs)
--- a/quapy/method/meta.py
+++ b/quapy/method/meta.py
@ -9,7 +9,6 @@ from tqdm import tqdm
 import quapy as qp
 from quapy import functional as F
 from quapy.data import LabelledCollection
-from quapy.evaluation import evaluate
 from quapy.model_selection import GridSearchQ

 try:
@ -176,6 +175,7 @@ class Ensemble(BaseQuantifier):
        For each model in the ensemble, the performance is measured in terms of _error_name_ on the quantification of
        the samples used for training the rest of the models in the ensemble.
        """
+        from quapy.evaluation import evaluate
        error = qp.error.from_name(error_name)
        tests = [m[3] for m in self.ensemble]
        scores = []
--- a/quapy/model_selection.py
+++ b/quapy/model_selection.py
@ -81,6 +81,8 @@ class GridSearchQ(BaseQuantifier):
        self.param_scores_ = {}
        self.best_score_ = None

+        tinit = time()
+
        hyper = [dict({k: values[i] for i, k in enumerate(params_keys)}) for values in itertools.product(*params_values)]
        scores = qp.util.parallel(self._delayed_eval, ((params, training) for params in hyper), n_jobs=n_jobs)

@ -94,10 +96,13 @@ class GridSearchQ(BaseQuantifier):
            else:
                self.param_scores_[str(params)] = 'timeout'

+        tend = time()-tinit
+
        if self.best_score_ is None:
            raise TimeoutError('all jobs took more than the timeout time to end')

-        self._sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f})')
+        self._sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f}) '
+                   f'[took {tend:.4f}s]')

        if self.refit:
            if isinstance(protocol, OnLabelledCollectionProtocol):
--- a/quapy/protocol.py
+++ b/quapy/protocol.py
@ -1,14 +1,11 @@
 from copy import deepcopy
-
 import quapy as qp
 import numpy as np
 import itertools
-from collections.abc import Generator
 from contextlib import ExitStack
 from abc import ABCMeta, abstractmethod
 from quapy.data import LabelledCollection
 import quapy.functional as F
-from tqdm import tqdm
 from os.path import exists
 from glob import glob

@ -87,10 +84,14 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
            if self.random_seed is not None:
                stack.enter_context(qp.util.temp_seed(self.random_seed))
            for params in self.samples_parameters():
-                yield self.sample(params)
+                yield self.collator_fn(self.sample(params))
+
+    def set_collator(self, collator_fn):
+        self.collator_fn = collator_fn


 class OnLabelledCollectionProtocol:
+
    def get_labelled_collection(self):
        return self.data

@ -106,31 +107,6 @@ class OnLabelledCollectionProtocol:
            return new.on_preclassified_instances(pre_classifications, in_place=True)


-class LoadSamplesFromDirectory(AbstractProtocol):
-
-    def __init__(self, folder_path, loader_fn, classes=None, **loader_kwargs):
-        assert exists(folder_path), f'folder {folder_path} does not exist'
-        assert callable(loader_fn), f'the passed load_fn does not seem to be callable'
-        self.folder_path = folder_path
-        self.loader_fn = loader_fn
-        self.classes = classes
-        self.loader_kwargs = loader_kwargs
-        self._list_files = None
-
-    def __call__(self):
-        for file in self.list_files:
-            yield LabelledCollection.load(file, loader_func=self.loader_fn, classes=self.classes, **self.loader_kwargs)
-
-    @property
-    def list_files(self):
-        if self._list_files is None:
-            self._list_files = sorted(glob(self.folder_path, '*'))
-        return self._list_files
-
-    def total(self):
-        return len(self.list_files)
-
-
 class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
    """
    Implementation of the artificial prevalence protocol (APP).
@ -154,6 +130,7 @@ class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
        self.sample_size = sample_size
        self.n_prevalences = n_prevalences
        self.repeats = repeats
+        self.set_collator(collator_fn=lambda x: (x.instances, x.prevalence()))

    def prevalence_grid(self):
        """
@ -210,6 +187,7 @@ class NPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
        self.sample_size = sample_size
        self.repeats = repeats
        self.random_seed = random_seed
+        self.set_collator(collator_fn=lambda x: (x.instances, x.prevalence()))

    def samples_parameters(self):
        indexes = []
@ -246,6 +224,7 @@ class USimplexPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol)
        self.sample_size = sample_size
        self.repeats = repeats
        self.random_seed = random_seed
+        self.set_collator(collator_fn=lambda x: (x.instances, x.prevalence()))

    def samples_parameters(self):
        indexes = []
@ -261,6 +240,31 @@ class USimplexPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol)
        return self.repeats


+# class LoadSamplesFromDirectory(AbstractProtocol):
+#
+#     def __init__(self, folder_path, loader_fn, classes=None, **loader_kwargs):
+#         assert exists(folder_path), f'folder {folder_path} does not exist'
+#         assert callable(loader_fn), f'the passed load_fn does not seem to be callable'
+#         self.folder_path = folder_path
+#         self.loader_fn = loader_fn
+#         self.classes = classes
+#         self.loader_kwargs = loader_kwargs
+#         self._list_files = None
+#
+#     def __call__(self):
+#         for file in self.list_files:
+#             yield LabelledCollection.load(file, loader_func=self.loader_fn, classes=self.classes, **self.loader_kwargs)
+#
+#     @property
+#     def list_files(self):
+#         if self._list_files is None:
+#             self._list_files = sorted(glob(self.folder_path, '*'))
+#         return self._list_files
+#
+#     def total(self):
+#         return len(self.list_files)
+
+
 class CovariateShiftPP(AbstractStochasticSeededProtocol):
    """
    Generates mixtures of two domains (A and B) at controlled rates, but preserving the original class prevalence.
--- a/quapy/tests/test_datasets.py
+++ b/quapy/tests/test_datasets.py
@ -1,7 +1,8 @@
 import pytest

 from quapy.data.datasets import REVIEWS_SENTIMENT_DATASETS, TWITTER_SENTIMENT_DATASETS_TEST, \
-    TWITTER_SENTIMENT_DATASETS_TRAIN, UCI_DATASETS, fetch_reviews, fetch_twitter, fetch_UCIDataset
+    TWITTER_SENTIMENT_DATASETS_TRAIN, UCI_DATASETS, LEQUA2022_TASKS, \
+    fetch_reviews, fetch_twitter, fetch_UCIDataset, fetch_lequa2022


@pytest.mark.parametrize('dataset_name', REVIEWS_SENTIMENT_DATASETS)
@ -41,3 +42,13 @@ def test_fetch_UCIDataset(dataset_name):
    print('Training set stats')
    dataset.training.stats()
    print('Test set stats')
+
+
+@pytest.mark.parametrize('dataset_name', LEQUA2022_TASKS)
+def test_fetch_lequa2022(dataset_name):
+    fetch_lequa2022(dataset_name)
+    # dataset = fetch_lequa2022(dataset_name)
+    # print(f'Dataset {dataset_name}')
+    # print('Training set stats')
+    # dataset.training.stats()
+    # print('Test set stats')
--- a/quapy/tests/test_evaluation.py
+++ b/quapy/tests/test_evaluation.py
@ -2,8 +2,8 @@ import unittest
 import quapy as qp
 from sklearn.linear_model import LogisticRegression
 from time import time
-from method.aggregative import EMQ
-from method.base import BaseQuantifier
+from quapy.method.aggregative import EMQ
+from quapy.method.base import BaseQuantifier


 class EvalTestCase(unittest.TestCase):
@ -12,7 +12,7 @@ class EvalTestCase(unittest.TestCase):
        data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True)
        train, test = data.training, data.test

-        protocol = qp.protocol.APP(test, sample_size=1000, n_prevalences=21, repeats=1, random_seed=1)
+        protocol = qp.protocol.APP(test, sample_size=1000, n_prevalences=11, repeats=1, random_seed=1)

        class SlowLR(LogisticRegression):
            def predict_proba(self, X):
@ -23,7 +23,7 @@ class EvalTestCase(unittest.TestCase):
        emq = EMQ(SlowLR()).fit(train)

        tinit = time()
-        score = qp.evaluation.evaluate(emq, protocol, error_metric='mae', verbose=True)
+        score = qp.evaluation.evaluate(emq, protocol, error_metric='mae', verbose=True, aggr_speedup='force')
        tend_optim = time()-tinit
        print(f'evaluation (with optimization) took {tend_optim}s [MAE={score:.4f}]')

@ -50,7 +50,7 @@ class EvalTestCase(unittest.TestCase):
        tend_no_optim = time() - tinit
        print(f'evaluation (w/o optimization) took {tend_no_optim}s [MAE={score:.4f}]')

-        self.assertEqual(tend_no_optim>tend_optim, True)
+        self.assertEqual(tend_no_optim>(tend_optim/2), True)


 if __name__ == '__main__':
--- a/quapy/tests/test_modsel.py
+++ b/quapy/tests/test_modsel.py
@ -8,6 +8,7 @@ import quapy as qp
 from method.aggregative import PACC
 from model_selection import GridSearchQ
 from protocol import APP
+import time


 class ModselTestCase(unittest.TestCase):
@ -18,7 +19,6 @@ class ModselTestCase(unittest.TestCase):

        data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10)
        training, validation = data.training.split_stratified(0.7, random_state=1)
-        # test = data.test

        param_grid = {'C': np.logspace(-3,3,7)}
        app = APP(validation, sample_size=100, random_seed=1)
@ -50,6 +50,37 @@ class ModselTestCase(unittest.TestCase):
        self.assertEqual(q.best_params_['C'], 10.0)
        self.assertEqual(q.best_model().get_params()['C'], 10.0)

+    def test_modsel_parallel_speedup(self):
+        class SlowLR(LogisticRegression):
+            def fit(self, X, y, sample_weight=None):
+                time.sleep(1)
+                return super(SlowLR, self).fit(X, y, sample_weight)
+
+        q = PACC(SlowLR(random_state=1, max_iter=5000))
+
+        data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10)
+        training, validation = data.training.split_stratified(0.7, random_state=1)
+
+        param_grid = {'C': np.logspace(-3, 3, 7)}
+        app = APP(validation, sample_size=100, random_seed=1)
+
+        tinit = time.time()
+        GridSearchQ(
+            q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=1, verbose=True
+        ).fit(training)
+        tend_nooptim = time.time()-tinit
+
+        tinit = time.time()
+        GridSearchQ(
+            q, param_grid, protocol=app, error='mae', refit=False, timeout=-1, n_jobs=-1, verbose=True
+        ).fit(training)
+        tend_optim = time.time() - tinit
+
+        print(f'parallel training took {tend_optim:.4f}s')
+        print(f'sequential training took {tend_nooptim:.4f}s')
+
+        self.assertEqual(tend_optim < (0.5*tend_nooptim), True)
+
    def test_modsel_timeout(self):

        class SlowLR(LogisticRegression):
--- a/quapy/tests/test_protocols.py
+++ b/quapy/tests/test_protocols.py
@ -1,7 +1,7 @@
 import unittest
 import numpy as np
-from data import LabelledCollection
-from protocol import APP, NPP, USimplexPP, CovariateShiftPP, AbstractStochasticSeededProtocol
+from quapy.data import LabelledCollection
+from quapy.protocol import APP, NPP, USimplexPP, CovariateShiftPP, AbstractStochasticSeededProtocol


 def mock_labelled_collection(prefix=''):
@ -134,6 +134,5 @@ class TestProtocols(unittest.TestCase):
            print('done')


-
 if __name__ == '__main__':
    unittest.main()
--- a/quapy/util.py
+++ b/quapy/util.py
@ -46,6 +46,7 @@ def parallel(func, args, n_jobs):

    that takes the `quapy.environ` variable as input silently
    """
+    print('n_jobs',n_jobs)
    def func_dec(environ, *args):
        qp.environ = environ
        return func(*args)