diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..1ba6d09 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,33 @@ +name: CI + +on: + pull_request: + push: + branches: + - main + - devel + +jobs: + + # run unit tests + test: + name: Unit tests (Python ${{ matrix.python-version }}) + runs-on: ubuntu-latest + strategy: + matrix: + python-version: + - "3.11" + env: + QUAPY_TESTS_OMIT_LARGE_DATASETS: True + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip setuptools wheel + python -m pip install -e .[bayes,tests] + - name: Test with unittest + run: python -m unittest diff --git a/quapy/data/base.py b/quapy/data/base.py index e52230e..ceb7402 100644 --- a/quapy/data/base.py +++ b/quapy/data/base.py @@ -549,7 +549,7 @@ class Dataset: yield Dataset(train, test, name=f'fold {(i % nfolds) + 1}/{nfolds} (round={(i // nfolds) + 1})') - def reduce(self, n_train=100, n_test=100): + def reduce(self, n_train=100, n_test=100, random_state=None): """ Reduce the number of instances in place for quick experiments. Preserves the prevalence of each set. 
@@ -557,6 +557,14 @@ class Dataset: :param n_test: number of test documents to keep (default 100) :return: self """ - self.training = self.training.sampling(n_train, *self.training.prevalence()) - self.test = self.test.sampling(n_test, *self.test.prevalence()) + self.training = self.training.sampling( + n_train, + *self.training.prevalence(), + random_state=random_state + ) + self.test = self.test.sampling( + n_test, + *self.test.prevalence(), + random_state=random_state + ) return self \ No newline at end of file diff --git a/quapy/tests/test_datasets.py b/quapy/tests/test_datasets.py index daa9207..a8587b2 100644 --- a/quapy/tests/test_datasets.py +++ b/quapy/tests/test_datasets.py @@ -1,3 +1,4 @@ +import os import unittest from sklearn.feature_extraction.text import TfidfVectorizer @@ -77,6 +78,9 @@ class TestDatasets(unittest.TestCase): self._check_dataset(dataset) def test_lequa2022(self): + if os.environ.get('QUAPY_TESTS_OMIT_LARGE_DATASETS'): + print("omitting test_lequa2022 because QUAPY_TESTS_OMIT_LARGE_DATASETS is set") + return for dataset_name in LEQUA2022_VECTOR_TASKS: print(f'loading dataset {dataset_name}...', end='') @@ -104,6 +108,10 @@ class TestDatasets(unittest.TestCase): def test_IFCB(self): + if os.environ.get('QUAPY_TESTS_OMIT_LARGE_DATASETS'): + print("omitting test_IFCB because QUAPY_TESTS_OMIT_LARGE_DATASETS is set") + return + print(f'loading dataset IFCB.') for mod_sel in [False, True]: train, gen = fetch_IFCB(single_sample_train=True, for_model_selection=mod_sel) diff --git a/quapy/tests/test_modsel.py b/quapy/tests/test_modsel.py index fe416c7..64b0ff4 100644 --- a/quapy/tests/test_modsel.py +++ b/quapy/tests/test_modsel.py @@ -4,7 +4,6 @@ import numpy as np from sklearn.linear_model import LogisticRegression import quapy as qp -import util from quapy.method.aggregative import PACC from quapy.model_selection import GridSearchQ from quapy.protocol import APP @@ -20,7 +19,7 @@ class ModselTestCase(unittest.TestCase): q = 
PACC(LogisticRegression(random_state=1, max_iter=5000)) - data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce() + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce(random_state=1) training, validation = data.training.split_stratified(0.7, random_state=1) param_grid = {'classifier__C': [0.000001, 10.]} @@ -42,7 +41,7 @@ class ModselTestCase(unittest.TestCase): q = PACC(LogisticRegression(random_state=1, max_iter=5000)) - data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce(n_train=500) + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce(n_train=500, random_state=1) training, validation = data.training.split_stratified(0.7, random_state=1) param_grid = {'classifier__C': np.logspace(-3,3,7)} @@ -80,7 +79,7 @@ class ModselTestCase(unittest.TestCase): q = PACC(SlowLR()) - data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce() + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce(random_state=1) training, validation = data.training.split_stratified(0.7, random_state=1) param_grid = {'classifier__C': np.logspace(-1,1,3)} diff --git a/setup.py b/setup.py index 1f6c6fb..9f7df5c 100644 --- a/setup.py +++ b/setup.py @@ -125,6 +125,7 @@ setup( # projects. extras_require={ # Optional 'bayes': ['jax', 'jaxlib', 'numpyro'], + 'tests': ['certifi'], }, # If there are data files included in your packages that need to be