From 69b8327fe925276d9225e1785588a74c6e4fa567 Mon Sep 17 00:00:00 2001 From: Mirko Bunse Date: Wed, 17 Apr 2024 11:44:23 +0200 Subject: [PATCH 1/5] Remove an erroneous import in the unit tests and add extra test dependencies. --- quapy/tests/test_modsel.py | 1 - setup.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/quapy/tests/test_modsel.py b/quapy/tests/test_modsel.py index fe416c7..bf3e6f2 100644 --- a/quapy/tests/test_modsel.py +++ b/quapy/tests/test_modsel.py @@ -4,7 +4,6 @@ import numpy as np from sklearn.linear_model import LogisticRegression import quapy as qp -import util from quapy.method.aggregative import PACC from quapy.model_selection import GridSearchQ from quapy.protocol import APP diff --git a/setup.py b/setup.py index 1f6c6fb..9f7df5c 100644 --- a/setup.py +++ b/setup.py @@ -125,6 +125,7 @@ setup( # projects. extras_require={ # Optional 'bayes': ['jax', 'jaxlib', 'numpyro'], + 'tests': ['certifi'], }, # If there are data files included in your packages that need to be From 31a697559cc740a7cdd888bbb6dbb06d01738adb Mon Sep 17 00:00:00 2001 From: Mirko Bunse Date: Wed, 17 Apr 2024 11:47:55 +0200 Subject: [PATCH 2/5] Unittest on GitHub Actions --- .github/workflows/ci.yml | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..c7b0809 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,31 @@ +name: CI + +on: + pull_request: + push: + branches: + - main + - devel + +jobs: + + # take out unit tests + test: + name: Unit tests (Python ${{ matrix.python-version }}) + runs-on: ubuntu-latest + strategy: + matrix: + python-version: + - "3.11" + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip 
install --upgrade pip setuptools wheel + python -m pip install -e .[tests] + - name: Test with unittest + run: python -m unittest From f3e543152cd7f131d3a538a3e4a87b30bb13019e Mon Sep 17 00:00:00 2001 From: Mirko Bunse Date: Wed, 17 Apr 2024 12:28:42 +0200 Subject: [PATCH 3/5] CI needs to install the bayes extra dependency --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c7b0809..09cd522 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,6 +26,6 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip setuptools wheel - python -m pip install -e .[tests] + python -m pip install -e .[bayes,tests] - name: Test with unittest run: python -m unittest From 72b43bd2f8fd3ae95cedcd4bacd0fd2d4cbe83bf Mon Sep 17 00:00:00 2001 From: Mirko Bunse Date: Wed, 17 Apr 2024 13:46:59 +0200 Subject: [PATCH 4/5] Omit large datasets (LeQua, IFCB) during CI to avoid exhausting the memory of GitHub Actions runners --- .github/workflows/ci.yml | 2 ++ quapy/tests/test_datasets.py | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 09cd522..1ba6d09 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,6 +17,8 @@ jobs: matrix: python-version: - "3.11" + env: + QUAPY_TESTS_OMIT_LARGE_DATASETS: True steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} diff --git a/quapy/tests/test_datasets.py b/quapy/tests/test_datasets.py index daa9207..a8587b2 100644 --- a/quapy/tests/test_datasets.py +++ b/quapy/tests/test_datasets.py @@ -1,3 +1,4 @@ +import os import unittest from sklearn.feature_extraction.text import TfidfVectorizer @@ -77,6 +78,9 @@ class TestDatasets(unittest.TestCase): self._check_dataset(dataset) def test_lequa2022(self): + if os.environ.get('QUAPY_TESTS_OMIT_LARGE_DATASETS'): + print("omitting test_lequa2022 because 
QUAPY_TESTS_OMIT_LARGE_DATASETS is set") + return for dataset_name in LEQUA2022_VECTOR_TASKS: print(f'loading dataset {dataset_name}...', end='') @@ -104,6 +108,10 @@ class TestDatasets(unittest.TestCase): def test_IFCB(self): + if os.environ.get('QUAPY_TESTS_OMIT_LARGE_DATASETS'): + print("omitting test_IFCB because QUAPY_TESTS_OMIT_LARGE_DATASETS is set") + return + print(f'loading dataset IFCB.') for mod_sel in [False, True]: train, gen = fetch_IFCB(single_sample_train=True, for_model_selection=mod_sel) From a64620c377c291f8ed706dc059e58034c7c125e3 Mon Sep 17 00:00:00 2001 From: Mirko Bunse Date: Wed, 17 Apr 2024 14:46:37 +0200 Subject: [PATCH 5/5] Dataset.reduce() allows fixing the random_state to make unit tests reproducible. This is required to ensure that the expected hyper-parameters are always chosen, independent of randomness --- quapy/data/base.py | 14 +++++++++++--- quapy/tests/test_modsel.py | 6 +++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/quapy/data/base.py b/quapy/data/base.py index e52230e..ceb7402 100644 --- a/quapy/data/base.py +++ b/quapy/data/base.py @@ -549,7 +549,7 @@ class Dataset: yield Dataset(train, test, name=f'fold {(i % nfolds) + 1}/{nfolds} (round={(i // nfolds) + 1})') - def reduce(self, n_train=100, n_test=100): + def reduce(self, n_train=100, n_test=100, random_state=None): """ Reduce the number of instances in place for quick experiments. Preserves the prevalence of each set. 
@@ -557,6 +557,14 @@ class Dataset: :param n_test: number of test documents to keep (default 100) :return: self """ - self.training = self.training.sampling(n_train, *self.training.prevalence()) - self.test = self.test.sampling(n_test, *self.test.prevalence()) + self.training = self.training.sampling( + n_train, + *self.training.prevalence(), + random_state = random_state + ) + self.test = self.test.sampling( + n_test, + *self.test.prevalence(), + random_state = random_state + ) return self \ No newline at end of file diff --git a/quapy/tests/test_modsel.py b/quapy/tests/test_modsel.py index bf3e6f2..64b0ff4 100644 --- a/quapy/tests/test_modsel.py +++ b/quapy/tests/test_modsel.py @@ -19,7 +19,7 @@ class ModselTestCase(unittest.TestCase): q = PACC(LogisticRegression(random_state=1, max_iter=5000)) - data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce() + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce(random_state=1) training, validation = data.training.split_stratified(0.7, random_state=1) param_grid = {'classifier__C': [0.000001, 10.]} @@ -41,7 +41,7 @@ class ModselTestCase(unittest.TestCase): q = PACC(LogisticRegression(random_state=1, max_iter=5000)) - data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce(n_train=500) + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce(n_train=500, random_state=1) training, validation = data.training.split_stratified(0.7, random_state=1) param_grid = {'classifier__C': np.logspace(-3,3,7)} @@ -79,7 +79,7 @@ class ModselTestCase(unittest.TestCase): q = PACC(SlowLR()) - data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce() + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce(random_state=1) training, validation = data.training.split_stratified(0.7, random_state=1) param_grid = {'classifier__C': np.logspace(-1,1,3)}