1
0
Fork 0

fixing random_state in base and in protocols

This commit is contained in:
Alejandro Moreo Fernandez 2022-06-21 10:27:06 +02:00
parent c0c37f0a17
commit f4a2a94ba5
6 changed files with 47 additions and 42 deletions

View File

@ -2,7 +2,7 @@ import numpy as np
from scipy.sparse import issparse
from scipy.sparse import vstack
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from numpy.random import RandomState
from quapy.functional import strprev
@ -146,16 +146,21 @@ class LabelledCollection:
return indexes_sample
def uniform_sampling_index(self, size):
def uniform_sampling_index(self, size, random_state=None):
"""
Returns an index to be used to extract a uniform sample of desired size. The sampling is drawn
with replacement if the requested size is greater than the number of instances, or without replacement
otherwise.
:param size: integer, the size of the uniform sample
:param random_state: if specified, guarantees reproducibility of the sampling.
:return: a np.ndarray of shape `(size)` with the indexes
"""
return np.random.choice(len(self), size, replace=size > len(self))
if random_state is not None:
ng = RandomState(seed=random_state)
else:
ng = np.random
return ng.choice(len(self), size, replace=size > len(self))
def sampling(self, size, *prevs, shuffle=True):
"""
@ -174,16 +179,17 @@ class LabelledCollection:
prev_index = self.sampling_index(size, *prevs, shuffle=shuffle)
return self.sampling_from_index(prev_index)
def uniform_sampling(self, size):
def uniform_sampling(self, size, random_state=None):
"""
Returns a uniform sample (an instance of :class:`LabelledCollection`) of desired size. The sampling is drawn
with replacement if the requested size is greater than the number of instances, or without replacement
otherwise.
:param size: integer, the requested size
:param random_state: if specified, guarantees reproducibility of the sampling.
:return: an instance of :class:`LabelledCollection` with length == `size`
"""
unif_index = self.uniform_sampling_index(size)
unif_index = self.uniform_sampling_index(size, random_state=random_state)
return self.sampling_from_index(unif_index)
def sampling_from_index(self, index):

View File

@ -40,22 +40,22 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
needed for extracting the samples, and :meth:`sample` that, given some parameters as input,
deterministically generates a sample.
:param seed: the seed for allowing to replicate any sequence of samples. Default is None, meaning that
:param random_state: the seed that allows replicating any sequence of samples. Default is None, meaning that
the sequence will be different every time the protocol is called.
"""
_random_seed = -1 # means "not set"
_random_state = -1 # means "not set"
def __init__(self, seed=None):
self.random_seed = seed
def __init__(self, random_state=None):
self.random_state = random_state
@property
def random_seed(self):
return self._random_seed
def random_state(self):
return self._random_state
@random_seed.setter
def random_seed(self, seed):
self._random_seed = seed
@random_state.setter
def random_state(self, random_state):
self._random_state = random_state
@abstractmethod
def samples_parameters(self):
@ -78,11 +78,11 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
def __call__(self):
with ExitStack() as stack:
if self.random_seed == -1:
if self.random_state == -1:
raise ValueError('The random seed has never been initialized. '
'Set it to None not to impose replicability.')
if self.random_seed is not None:
stack.enter_context(qp.util.temp_seed(self.random_seed))
if self.random_state is not None:
stack.enter_context(qp.util.temp_seed(self.random_state))
for params in self.samples_parameters():
yield self.collator(self.sample(params))
@ -132,11 +132,11 @@ class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
:param n_prevalences: the number of equidistant prevalence points to extract from the [0,1] interval for the
grid (default is 21)
:param repeats: number of copies for each valid prevalence vector (default is 10)
:param random_seed: allows replicating samples across runs (default None)
:param random_state: allows replicating samples across runs (default None)
"""
def __init__(self, data:LabelledCollection, sample_size, n_prevalences=21, repeats=10, random_seed=None, return_type='sample_prev'):
super(APP, self).__init__(random_seed)
def __init__(self, data:LabelledCollection, sample_size, n_prevalences=21, repeats=10, random_state=None, return_type='sample_prev'):
super(APP, self).__init__(random_state)
self.data = data
self.sample_size = sample_size
self.n_prevalences = n_prevalences
@ -189,15 +189,15 @@ class NPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):
:param data: a `LabelledCollection` from which the samples will be drawn
:param sample_size: integer, the number of instances in each sample
:param repeats: the number of samples to generate. Default is 100.
:param random_seed: allows replicating samples across runs (default None)
:param random_state: allows replicating samples across runs (default None)
"""
def __init__(self, data:LabelledCollection, sample_size, repeats=100, random_seed=None, return_type='sample_prev'):
super(NPP, self).__init__(random_seed)
def __init__(self, data:LabelledCollection, sample_size, repeats=100, random_state=None, return_type='sample_prev'):
super(NPP, self).__init__(random_state)
self.data = data
self.sample_size = sample_size
self.repeats = repeats
self.random_seed = random_seed
self.random_state = random_state
self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
def samples_parameters(self):
@ -226,15 +226,15 @@ class USimplexPP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol)
:param data: a `LabelledCollection` from which the samples will be drawn
:param sample_size: integer, the number of instances in each sample
:param repeats: the number of samples to generate. Default is 100.
:param random_seed: allows replicating samples across runs (default None)
:param random_state: allows replicating samples across runs (default None)
"""
def __init__(self, data: LabelledCollection, sample_size, repeats=100, random_seed=None, return_type='sample_prev'):
super(USimplexPP, self).__init__(random_seed)
def __init__(self, data: LabelledCollection, sample_size, repeats=100, random_state=None, return_type='sample_prev'):
super(USimplexPP, self).__init__(random_state)
self.data = data
self.sample_size = sample_size
self.repeats = repeats
self.random_seed = random_seed
self.random_state = random_state
self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
def samples_parameters(self):
@ -290,7 +290,7 @@ class CovariateShiftPP(AbstractStochasticSeededProtocol):
:param mixture_points: an integer indicating the number of points to take from a linear scale (e.g., 21 will
generate the mixture points [1, 0.95, 0.9, ..., 0]), or the array of mixture values itself.
the specific points
:param random_seed:
:param random_state: allows replicating samples across runs (default None)
"""
def __init__(
@ -301,9 +301,9 @@ class CovariateShiftPP(AbstractStochasticSeededProtocol):
repeats=1,
prevalence=None,
mixture_points=11,
random_seed=None,
random_state=None,
return_type='sample_prev'):
super(CovariateShiftPP, self).__init__(random_seed)
super(CovariateShiftPP, self).__init__(random_state)
self.A = domainA
self.B = domainB
self.sample_size = sample_size
@ -322,7 +322,7 @@ class CovariateShiftPP(AbstractStochasticSeededProtocol):
self.mixture_points = np.asarray(mixture_points)
assert all(np.logical_and(self.mixture_points >= 0, self.mixture_points<=1)), \
'mixture_model datatype not understood (expected int or a sequence of real values in [0,1])'
self.random_seed = random_seed
self.random_state = random_state
self.collator = OnLabelledCollectionProtocol.get_collator(return_type)
def samples_parameters(self):

View File

@ -12,7 +12,7 @@ class EvalTestCase(unittest.TestCase):
data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=10, pickle=True)
train, test = data.training, data.test
protocol = qp.protocol.APP(test, sample_size=1000, n_prevalences=11, repeats=1, random_seed=1)
protocol = qp.protocol.APP(test, sample_size=1000, n_prevalences=11, repeats=1, random_state=1)
class SlowLR(LogisticRegression):
def predict_proba(self, X):

View File

@ -21,7 +21,7 @@ class ModselTestCase(unittest.TestCase):
training, validation = data.training.split_stratified(0.7, random_state=1)
param_grid = {'C': np.logspace(-3,3,7)}
app = APP(validation, sample_size=100, random_seed=1)
app = APP(validation, sample_size=100, random_state=1)
q = GridSearchQ(
q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, verbose=True
).fit(training)
@ -40,7 +40,7 @@ class ModselTestCase(unittest.TestCase):
# test = data.test
param_grid = {'C': np.logspace(-3,3,7)}
app = APP(validation, sample_size=100, random_seed=1)
app = APP(validation, sample_size=100, random_state=1)
q = GridSearchQ(
q, param_grid, protocol=app, error='mae', refit=True, timeout=-1, n_jobs=-1, verbose=True
).fit(training)
@ -62,7 +62,7 @@ class ModselTestCase(unittest.TestCase):
training, validation = data.training.split_stratified(0.7, random_state=1)
param_grid = {'C': np.logspace(-3, 3, 7)}
app = APP(validation, sample_size=100, random_seed=1)
app = APP(validation, sample_size=100, random_state=1)
tinit = time.time()
GridSearchQ(
@ -96,7 +96,7 @@ class ModselTestCase(unittest.TestCase):
# test = data.test
param_grid = {'C': np.logspace(-3,3,7)}
app = APP(validation, sample_size=100, random_seed=1)
app = APP(validation, sample_size=100, random_state=1)
q = GridSearchQ(
q, param_grid, protocol=app, error='mae', refit=True, timeout=3, n_jobs=-1, verbose=True
)

View File

@ -21,7 +21,7 @@ class TestProtocols(unittest.TestCase):
def test_app_replicate(self):
data = mock_labelled_collection()
p = APP(data, sample_size=5, n_prevalences=11, random_seed=42)
p = APP(data, sample_size=5, n_prevalences=11, random_state=42)
samples1 = samples_to_str(p)
samples2 = samples_to_str(p)
@ -57,7 +57,7 @@ class TestProtocols(unittest.TestCase):
def test_npp_replicate(self):
data = mock_labelled_collection()
p = NPP(data, sample_size=5, repeats=5, random_seed=42)
p = NPP(data, sample_size=5, repeats=5, random_state=42)
samples1 = samples_to_str(p)
samples2 = samples_to_str(p)
@ -75,7 +75,7 @@ class TestProtocols(unittest.TestCase):
def test_kraemer_replicate(self):
data = mock_labelled_collection()
p = USimplexPP(data, sample_size=5, repeats=10, random_seed=42)
p = USimplexPP(data, sample_size=5, repeats=10, random_state=42)
samples1 = samples_to_str(p)
samples2 = samples_to_str(p)
@ -94,7 +94,7 @@ class TestProtocols(unittest.TestCase):
def test_covariate_shift_replicate(self):
dataA = mock_labelled_collection('domA')
dataB = mock_labelled_collection('domB')
p = CovariateShiftPP(dataA, dataB, sample_size=10, mixture_points=11, random_seed=1)
p = CovariateShiftPP(dataA, dataB, sample_size=10, mixture_points=11, random_state=1)
samples1 = samples_to_str(p)
samples2 = samples_to_str(p)

View File

@ -50,7 +50,6 @@ def parallel(func, args, n_jobs):
def func_dec(environ, *args):
qp.environ = environ.copy()
qp.environ['N_JOBS'] = 1
print(f'setting n_jobs from {environ["N_JOBS"]} to 1')
return func(*args)
return Parallel(n_jobs=n_jobs)(
delayed(func_dec)(qp.environ, args_i) for args_i in args