fixing random_state in base and in protocols

2022-06-21 10:27:06 +02:00 · 2022-06-21 10:27:06 +02:00 · f4a2a94ba5
parent c0c37f0a17
commit f4a2a94ba5
6 changed files with 47 additions and 42 deletions
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@ -2,7 +2,7 @@ import numpy as np
 from scipy.sparse import issparse
 from scipy.sparse import vstack
 from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
-
+from numpy.random import RandomState
 from quapy.functional import strprev


@ -146,16 +146,21 @@ class LabelledCollection:

        return indexes_sample

-    def uniform_sampling_index(self, size):
+    def uniform_sampling_index(self, size, random_state=None):
        """
        Returns an index to be used to extract a uniform sample of desired size. The sampling is drawn
        with replacement if the requested size is greater than the number of instances, or without replacement
        otherwise.

        :param size: integer, the size of the uniform sample
+        :param random_state: if specified, guarantees reproducibility of the sampling.
        :return: a np.ndarray of shape `(size)` with the indexes
        """
-        return np.random.choice(len(self), size, replace=size > len(self))
+        if random_state is not None:
+            ng = RandomState(seed=random_state)
+        else:
+            ng = np.random
+        return ng.choice(len(self), size, replace=size > len(self))

    def sampling(self, size, *prevs, shuffle=True):
        """
@ -174,16 +179,17 @@ class LabelledCollection:
        prev_index = self.sampling_index(size, *prevs, shuffle=shuffle)
        return self.sampling_from_index(prev_index)

-    def uniform_sampling(self, size):
+    def uniform_sampling(self, size, random_state=None):
        """
        Returns a uniform sample (an instance of :class:`LabelledCollection`) of desired size. The sampling is drawn
        with replacement if the requested size is greater than the number of instances, or without replacement
        otherwise.

        :param size: integer, the requested size
+        :param random_state: if specified, guarantees reproducibility of the sampling.
        :return: an instance of :class:`LabelledCollection` with length == `size`
        """
-        unif_index = self.uniform_sampling_index(size)
+        unif_index = self.uniform_sampling_index(size, random_state=random_state)
        return self.sampling_from_index(unif_index)

    def sampling_from_index(self, index):
--- a/quapy/protocol.py
+++ b/quapy/protocol.py
@ -40,22 +40,22 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
    needed for extracting the samples, and :meth:`sample` that, given some parameters as input,
    deterministically generates a sample.

-    :param seed: the seed for allowing to replicate any sequence of samples. Default is None, meaning that
+    :param random_state: the seed that allows replicating any sequence of samples. Default is None, meaning that
        the sequence will be different every time the protocol is called.
    """

-    _random_seed = -1  # means "not set"
+    _random_state = -1  # means "not set"

-    def __init__(self, seed=None):
-        self.random_seed = seed
+    def __init__(self, random_state=None):
+        self.random_state = random_state

    @property
-    def random_seed(self):
-        return self._random_seed
+    def random_state(self):
+        return self._random_state

-    @random_seed.setter
-    def random_seed(self, seed):
-        self._random_seed = seed
+    @random_state.setter
+    def random_state(self, random_state):
+        self._random_state = random_state

    @abstractmethod
    def samples_parameters(self):
@ -78,11 +78,11 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):

    def __call__(self):
        with ExitStack() as stack:
-            if self.random_seed == -1:
+            if self.random_state == -1:
                raise ValueError('The random seed has never been initialized. '
                                 'Set it to None not to impose replicability.')
-            if self.random_seed is not None:
-                stack.enter_context(qp.util.temp_seed(self.random_seed))
+            if self.random_state is not None:
+                stack.enter_context(qp.util.temp_seed(self.random_state))
            for params in self.samples_parameters():
                yield self.collator(self.sample(params))

@ -132,11 +132,11 @@ class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
    :param n_prevalences: the number of equidistant prevalence points to extract from the [0,1] interval for the
        grid (default is 21)
    :param repeats: number of copies for each valid prevalence vector (default is 10)
-    :param random_seed: allows replicating samples across runs (default None)
+    :param random_state: allows replicating samples across runs (default None)
    """

-    def __init__(self, data:LabelledCollection, sample_size, n_prevalences=21, repeats=10, random_seed=None, return_type='sample_prev'):
-        super(APP, self).__init__(random_seed)
+    def __init__(self, data:LabelledCollection, sample_size, n_prevalences=21, repeats=10, random_state=None, return_type='sample_prev'):
+        super(APP, self).__init__(random_state)
        self.data = data
        self.sample_size = sample_size
        self.n_prevalences = n_prevalences
@ -189,15 +189,15 @@ class NPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
    :param data: a `LabelledCollection` from which the samples will be drawn
    :param sample_size: integer, the number of instances in each sample
    :param repeats: the number of samples to generate. Default is 100.
-    :param random_seed: allows replicating samples across runs (default None)
+    :param random_state: allows replicating samples across runs (default None)
    """

-    def __init__(self, data:LabelledCollection, sample_size, repeats=100, random_seed=None, return_type='sample_prev'):
-        super(NPP, self).__init__(random_seed)
+    def __init__(self, data:LabelledCollection, sample_size, repeats=100, random_state=None, return_type='sample_prev'):
+        super(NPP, self).__init__(random_state)
        self.data = data
        self.sample_size = sample_size
        self.repeats = repeats
-        self.random_seed = random_seed
+        self.random_state = random_state
        self.collator = OnLabelledCollectionProtocol.get_collator(return_type)

    def samples_parameters(self):
@ -226,15 +226,15 @@ class USimplexPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol)
    :param data: a `LabelledCollection` from which the samples will be drawn
    :param sample_size: integer, the number of instances in each sample
    :param repeats: the number of samples to generate. Default is 100.
-    :param random_seed: allows replicating samples across runs (default None)
+    :param random_state: allows replicating samples across runs (default None)
    """

-    def __init__(self, data: LabelledCollection, sample_size, repeats=100, random_seed=None, return_type='sample_prev'):
-        super(USimplexPP, self).__init__(random_seed)
+    def __init__(self, data: LabelledCollection, sample_size, repeats=100, random_state=None, return_type='sample_prev'):
+        super(USimplexPP, self).__init__(random_state)
        self.data = data
        self.sample_size = sample_size
        self.repeats = repeats
-        self.random_seed = random_seed
+        self.random_state = random_state
        self.collator = OnLabelledCollectionProtocol.get_collator(return_type)

    def samples_parameters(self):
@ -290,7 +290,7 @@ class CovariateShiftPP(AbstractStochasticSeededProtocol):
    :param mixture_points: an integer indicating the number of points to take from a linear scale (e.g., 21 will
        generate the mixture points [1, 0.95, 0.9, ..., 0]), or the array of mixture values itself, i.e.,
        the specific points to use
-    :param random_seed:
+    :param random_state: allows replicating samples across runs (default None)
    """

    def __init__(
@ -301,9 +301,9 @@ class CovariateShiftPP(AbstractStochasticSeededProtocol):
            repeats=1,
            prevalence=None,
            mixture_points=11,
-            random_seed=None,
+            random_state=None,
            return_type='sample_prev'):
-        super(CovariateShiftPP, self).__init__(random_seed)
+        super(CovariateShiftPP, self).__init__(random_state)
        self.A = domainA
        self.B = domainB
        self.sample_size = sample_size
@ -322,7 +322,7 @@ class CovariateShiftPP(AbstractStochasticSeededProtocol):
            self.mixture_points = np.asarray(mixture_points)
            assert all(np.logical_and(self.mixture_points >= 0, self.mixture_points<=1)), \
                'mixture_model datatype not understood (expected int or a sequence of real values in [0,1])'
-        self.random_seed = random_seed
+        self.random_state = random_state
        self.collator = OnLabelledCollectionProtocol.get_collator(return_type)

    def samples_parameters(self):
--- a/quapy/tests/test_evaluation.py
+++ b/quapy/tests/test_evaluation.py
@ -12,7 +12,7 @@ class EvalTestCase(unittest.TestCase):
        data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True)
        train, test = data.training, data.test

-        protocol = qp.protocol.APP(test, sample_size=1000, n_prevalences=11, repeats=1, random_seed=1)
+        protocol = qp.protocol.APP(test, sample_size=1000, n_prevalences=11, repeats=1, random_state=1)

        class SlowLR(LogisticRegression):
            def predict_proba(self, X):
--- a/quapy/tests/test_modsel.py
+++ b/quapy/tests/test_modsel.py
@ -21,7 +21,7 @@ class ModselTestCase(unittest.TestCase):
        training, validation = data.training.split_stratified(0.7, random_state=1)

        param_grid = {'C': np.logspace(-3,3,7)}
-        app = APP(validation, sample_size=100, random_seed=1)
+        app = APP(validation, sample_size=100, random_state=1)
        q = GridSearchQ(
            q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, verbose=True
        ).fit(training)
@ -40,7 +40,7 @@ class ModselTestCase(unittest.TestCase):
        # test = data.test

        param_grid = {'C': np.logspace(-3,3,7)}
-        app = APP(validation, sample_size=100, random_seed=1)
+        app = APP(validation, sample_size=100, random_state=1)
        q = GridSearchQ(
            q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, n_jobs=-1, verbose=True
        ).fit(training)
@ -62,7 +62,7 @@ class ModselTestCase(unittest.TestCase):
        training, validation = data.training.split_stratified(0.7, random_state=1)

        param_grid = {'C': np.logspace(-3, 3, 7)}
-        app = APP(validation, sample_size=100, random_seed=1)
+        app = APP(validation, sample_size=100, random_state=1)

        tinit = time.time()
        GridSearchQ(
@ -96,7 +96,7 @@ class ModselTestCase(unittest.TestCase):
        # test = data.test

        param_grid = {'C': np.logspace(-3,3,7)}
-        app = APP(validation, sample_size=100, random_seed=1)
+        app = APP(validation, sample_size=100, random_state=1)
        q = GridSearchQ(
            q, param_grid, protocol=app, error='mae', refit=True, timeout=3, n_jobs=-1, verbose=True
        )
--- a/quapy/tests/test_protocols.py
+++ b/quapy/tests/test_protocols.py
@ -21,7 +21,7 @@ class TestProtocols(unittest.TestCase):

    def test_app_replicate(self):
        data = mock_labelled_collection()
-        p = APP(data, sample_size=5, n_prevalences=11, random_seed=42)
+        p = APP(data, sample_size=5, n_prevalences=11, random_state=42)

        samples1 = samples_to_str(p)
        samples2 = samples_to_str(p)
@ -57,7 +57,7 @@ class TestProtocols(unittest.TestCase):

    def test_npp_replicate(self):
        data = mock_labelled_collection()
-        p = NPP(data, sample_size=5, repeats=5, random_seed=42)
+        p = NPP(data, sample_size=5, repeats=5, random_state=42)

        samples1 = samples_to_str(p)
        samples2 = samples_to_str(p)
@ -75,7 +75,7 @@ class TestProtocols(unittest.TestCase):

    def test_kraemer_replicate(self):
        data = mock_labelled_collection()
-        p = USimplexPP(data, sample_size=5, repeats=10, random_seed=42)
+        p = USimplexPP(data, sample_size=5, repeats=10, random_state=42)

        samples1 = samples_to_str(p)
        samples2 = samples_to_str(p)
@ -94,7 +94,7 @@ class TestProtocols(unittest.TestCase):
    def test_covariate_shift_replicate(self):
        dataA = mock_labelled_collection('domA')
        dataB = mock_labelled_collection('domB')
-        p = CovariateShiftPP(dataA, dataB, sample_size=10, mixture_points=11, random_seed=1)
+        p = CovariateShiftPP(dataA, dataB, sample_size=10, mixture_points=11, random_state=1)

        samples1 = samples_to_str(p)
        samples2 = samples_to_str(p)
--- a/quapy/util.py
+++ b/quapy/util.py
@ -50,7 +50,6 @@ def parallel(func, args, n_jobs):
    def func_dec(environ, *args):
        qp.environ = environ.copy()
        qp.environ['N_JOBS'] = 1
-        print(f'setting n_jobs from {environ["N_JOBS"]} to 1')
        return func(*args)
    return Parallel(n_jobs=n_jobs)(
        delayed(func_dec)(qp.environ, args_i) for args_i in args