diff --git a/quapy/data/base.py b/quapy/data/base.py index e52230e..ceb7402 100644 --- a/quapy/data/base.py +++ b/quapy/data/base.py @@ -549,7 +549,7 @@ class Dataset: yield Dataset(train, test, name=f'fold {(i % nfolds) + 1}/{nfolds} (round={(i // nfolds) + 1})') - def reduce(self, n_train=100, n_test=100): + def reduce(self, n_train=100, n_test=100, random_state=None): """ Reduce the number of instances in place for quick experiments. Preserves the prevalence of each set. @@ -557,6 +557,14 @@ class Dataset: :param n_test: number of test documents to keep (default 100) :return: self """ - self.training = self.training.sampling(n_train, *self.training.prevalence()) - self.test = self.test.sampling(n_test, *self.test.prevalence()) + self.training = self.training.sampling( + n_train, + *self.training.prevalence(), + random_state = random_state + ) + self.test = self.test.sampling( + n_test, + *self.test.prevalence(), + random_state = random_state + ) return self \ No newline at end of file diff --git a/quapy/tests/test_modsel.py b/quapy/tests/test_modsel.py index bf3e6f2..64b0ff4 100644 --- a/quapy/tests/test_modsel.py +++ b/quapy/tests/test_modsel.py @@ -19,7 +19,7 @@ class ModselTestCase(unittest.TestCase): q = PACC(LogisticRegression(random_state=1, max_iter=5000)) - data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce() + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce(random_state=1) training, validation = data.training.split_stratified(0.7, random_state=1) param_grid = {'classifier__C': [0.000001, 10.]} @@ -41,7 +41,7 @@ class ModselTestCase(unittest.TestCase): q = PACC(LogisticRegression(random_state=1, max_iter=5000)) - data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce(n_train=500) + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce(n_train=500, random_state=1) training, validation = data.training.split_stratified(0.7, random_state=1) param_grid = {'classifier__C': np.logspace(-3,3,7)} @@ -79,7 +79,7 @@ class ModselTestCase(unittest.TestCase): q = PACC(SlowLR()) - data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce() + data = qp.datasets.fetch_reviews('imdb', tfidf=True, min_df=10).reduce(random_state=1) training, validation = data.training.split_stratified(0.7, random_state=1) param_grid = {'classifier__C': np.logspace(-1,1,3)}