1
0
Fork 0

first commit protocols

This commit is contained in:
Alejandro Moreo Fernandez 2022-05-20 16:48:46 +02:00
parent be7a126c94
commit b453c8fcbc
3 changed files with 444 additions and 0 deletions

View File

@ -239,3 +239,24 @@ def get_nprevpoints_approximation(combinations_budget:int, n_classes:int, n_repe
else: else:
n_prevpoints += 1 n_prevpoints += 1
def check_prevalence_vector(p, raise_exception=False, toleranze=1e-08):
"""
Checks that p is a valid prevalence vector, i.e., that it contains values in [0,1] and that the values sum up to 1.
:param p: the prevalence vector to check
:return: True if `p` is valid, False otherwise
"""
p = np.asarray(p)
if not all(p>=0):
if raise_exception:
raise ValueError('the prevalence vector contains negative numbers')
return False
if not all(p<=1):
if raise_exception:
raise ValueError('the prevalence vector contains values >1')
return False
if not np.isclose(p.sum(), 1, atol=toleranze):
if raise_exception:
raise ValueError('the prevalence vector does not sum up to 1')
return False
return True

244
quapy/newprotocol.py Normal file
View File

@ -0,0 +1,244 @@
import itertools
from collections.abc import Generator
from contextlib import ExitStack
from abc import ABCMeta, abstractmethod
from quapy.data import LabelledCollection
import quapy.functional as F
# 0.1.7
# change the LabelledCollection API (removing protocol-related samplings)
# need to change the two references to the above in the wiki / doc, and code examples...
# removed artificial_prevalence_sampling from functional
# class AbstractProtocol(metaclass=ABCMeta):
# def __call__(self):
# for g in self.gen():
# yield g
#
# @abstractmethod
# def gen(self):
# ...
class AbstractStochasticProtocol(metaclass=ABCMeta):
def __init__(self, seed=None):
self.random_seed = seed
@property
def random_seed(self):
return self._random_seed
@random_seed.setter
def random_seed(self, seed):
self._random_seed = seed
@abstractmethod
def samples_parameters(self):
"""
This function has to return all the necessary parameters to replicate the samples
:return: a list of parameters, each of which serves to deterministically generate a sample
"""
...
@abstractmethod
def sample(self, params):
"""
Extract one sample determined by the given parameters
:param params: all the necessary parameters to generate a sample
:return: one sample (the same sample has to be generated for the same parameters)
"""
...
def __call__(self):
with ExitStack() as stack:
if self.random_seed is not None:
stack.enter_context(qp.util.temp_seed(self.random_seed))
for params in self.samples_parameters():
yield self.sample(params)
class APP(AbstractStochasticProtocol):
"""
Implementation of the artificial prevalence protocol (APP).
The APP consists of exploring a grid of prevalence values containing `n_prevalences` points (e.g.,
[0, 0.05, 0.1, 0.15, ..., 1], if `n_prevalences=21`), and generating all valid combinations of
prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], ...,
[1, 0, 0] prevalence values of size `sample_size` will be yielded). The number of samples for each valid
combination of prevalence values is indicated by `repeats`.
:param sample_size: integer, number of instances in each sample
:param n_prevalences: the number of equidistant prevalence points to extract from the [0,1] interval for the
grid (default is 21)
:param repeats: number of copies for each valid prevalence vector (default is 1)
:param random_seed: allows replicating samples across runs (default None)
"""
def __init__(self, data:LabelledCollection, sample_size, n_prevalences=21, repeats=1, random_seed=None):
super(APP, self).__init__(random_seed)
self.data = data
self.sample_size = sample_size
self.n_prevalences = n_prevalences
self.repeats = repeats
def prevalence_grid(self, dimensions):
"""
Generates vectors of prevalence values from an exhaustive grid of prevalence values. The
number of prevalence values explored for each dimension depends on `n_prevalences`, so that, if, for example,
`n_prevalences=11` then the prevalence values of the grid are taken from [0, 0.1, 0.2, ..., 0.9, 1]. Only
valid prevalence distributions are returned, i.e., vectors of prevalence values that sum up to 1. For each
valid vector of prevalence values, `repeat` copies are returned. The vector of prevalence values can be
implicit (by setting `return_constrained_dim=False`), meaning that the last dimension (which is constrained
to 1 - sum of the rest) is not returned (note that, quite obviously, in this case the vector does not sum up to
1). Note that this method is deterministic, i.e., there is no random sampling anywhere.
:param dimensions: the number of classes
:return: a `np.ndarray` of shape `(n, dimensions)` if `return_constrained_dim=True` or of shape
`(n, dimensions-1)` if `return_constrained_dim=False`, where `n` is the number of valid combinations found
in the grid multiplied by `repeat`
"""
s = np.linspace(0., 1., self.n_prevalences, endpoint=True)
s = [s] * (dimensions - 1)
prevs = [p for p in itertools.product(*s, repeat=1) if sum(p) <= 1]
prevs = np.asarray(prevs).reshape(len(prevs), -1)
if self.repeats > 1:
prevs = np.repeat(prevs, self.repeats, axis=0)
return prevs
def samples_parameters(self):
indexes = []
for prevs in self.prevalence_grid(dimensions=self.data.n_classes):
index = data.sampling_index(self.sample_size, *prevs)
indexes.append(index)
return indexes
def sample(self, index):
return self.data.sampling_from_index(index)
class NPP(AbstractStochasticProtocol):
"""
A generator of samples that implements the natural prevalence protocol (NPP). The NPP consists of drawing
samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.
:param sample_size: integer, the number of instances in each sample
:param repeats: the number of samples to generate
"""
def __init__(self, data:LabelledCollection, sample_size, repeats=1, random_seed=None):
super(NPP, self).__init__(random_seed)
self.data = data
self.sample_size = sample_size
self.repeats = repeats
self.random_seed = random_seed
def samples_parameters(self):
indexes = []
for _ in range(self.repeats):
index = data.uniform_sampling_index(self.sample_size)
indexes.append(index)
return indexes
def sample(self, index):
return self.data.sampling_from_index(index)
class USimplexPP(AbstractStochasticProtocol):
def __init__(self, data: LabelledCollection, sample_size, repeats=1, random_seed=None):
super(USimplexPP, self).__init__(random_seed)
self.data = data
self.sample_size = sample_size
self.repeats = repeats
self.random_seed = random_seed
def samples_parameters(self):
indexes = []
for prevs in F.uniform_simplex_sampling(n_classes=data.n_classes, size=self.repeats):
index = data.sampling_index(self.sample_size, *prevs)
indexes.append(index)
return indexes
def sample(self, index):
return self.data.sampling_from_index(index)
class CovariateShift(AbstractStochasticProtocol):
"""
Generates mixtures of two domains (A and B) at controlled rates, but preserving the original class prevalence.
:param domainA:
:param domainB:
:param sample_size:
:param repeats:
:param prevalence: the prevalence to preserv along the mixtures. If specified, should be an array containing
one prevalence value (positive float) for each class and summing up to one. If not specified, the prevalence
will be taken from the domain A (default).
:param mixture_points: an integer indicating the number of points to take from a linear scale (e.g., 21 will
generate the mixture points [1, 0.95, 0.9, ..., 0]), or the array of mixture values itself.
the specific points
:param random_seed:
"""
def __init__(
self,
domainA: LabelledCollection,
domainB: LabelledCollection,
sample_size,
repeats=1,
prevalence=None,
mixture_points=11,
random_seed=None):
super(CovariateShift, self).__init__(random_seed)
self.data = data
self.sample_size = sample_size
self.repeats = repeats
if prevalence is None:
self.prevalence = domainA.prevalence()
else:
self.prevalence = np.asarray(prevalence)
assert len(self.prevalence) == domainA.n_classes, \
f'wrong shape for the vector prevalence (expected {domainA.n_classes})'
assert F.check_prevalence_vector(self.prevalence), \
f'the prevalence vector is not valid (either it contains values outside [0,1] or does not sum up to 1)'
assert isinstance(mixture_points, int) or
self.random_seed = random_seed
def samples_parameters(self):
indexes = []
for _ in range(self.repeats):
index = data.uniform_sampling_index(self.sample_size)
indexes.append(index)
return indexes
def sample(self, index):
return self.data.sampling_from_index(index)
if __name__=='__main__':
import numpy as np
import quapy as qp
y = [0]*25 + [1]*25 + [2]*25 + [3]*25
X = [str(i)+'-'+str(yi) for i, yi in enumerate(y)]
data = LabelledCollection(X, y, classes_=sorted(np.unique(y)))
# p=CounterExample(1, 8, 10, 5)
# p = APP(data, sample_size=10, n_prevalences=11, random_seed=42)
# p = NPP(data, sample_size=10, repeats=10, random_seed=42)
# p = NPP(data, sample_size=10, repeats=10)
p = USimplexPP(data, sample_size=10, repeats=10)
for _ in range(2):
print('init generator', p.__class__.__name__)
for i in p():
# print(i)
print(i.instances, i.labels, i.prevalence())
print('done')

179
quapy/protocol.py Normal file
View File

@ -0,0 +1,179 @@
import itertools
from collections.abc import Generator
from contextlib import ExitStack
from abc import ABCMeta, abstractmethod
from quapy.data import LabelledCollection
import quapy.functional as F
# 0.1.7
# change the LabelledCollection API (removing protocol-related samplings)
# need to change the two references to the above in the wiki / doc, and code examples...
# removed artificial_prevalence_sampling from functional
class NewAbstractProtocol(metaclass=Generator):
@abstractmethod
def send(self, value):
"""Send a value into the generator.
Return next yielded value or raise StopIteration.
"""
raise StopIteration
@abstractmethod
def throw(self, typ, val=None, tb=None):
"""Raise an exception in the generator.
Return next yielded value or raise StopIteration.
"""
if val is None:
if tb is None:
raise typ
val = typ()
if tb is not None:
val = val.with_traceback(tb)
raise val
class AbstractProtocol(metaclass=ABCMeta):
"""
Abstract class for sampling protocols.
A sampling protocol defines how to generate samples out of some dataset.
"""
def __call__(self):
"""
A generator that yields one sample at each iteration
:return: yield one sample (instance of :class:`quapy.data.LabelledCollection`) at each iteration
"""
for index in self.indexes(data):
yield data.sampling_from_index(index)
def indexes(self, data: LabelledCollection):
"""
A generator that yields one sample index at each iteration.
(This function is mainly a generic decorator that sets, if requested, the local random seed; the real
sampling is implemented by :meth:`_indexes`.)
:param data: the set of data from which samples' indexes are to be drawn
:return: one sample index (instance of `np.ndarray`) at each iteration
"""
with ExitStack() as stack:
if self.random_seed is not None:
stack.enter_context(qp.util.temp_seed(self.random_seed))
for index in self._indexes(data):
yield index
@abstractmethod
def _indexes(self, data: LabelledCollection):
...
class APP(AbstractProtocol):
"""
Implementation of the artificial prevalence protocol (APP).
The APP consists of exploring a grid of prevalence values containing `n_prevalences` points (e.g.,
[0, 0.05, 0.1, 0.15, ..., 1], if `n_prevalences=21`), and generating all valid combinations of
prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], ...,
[1, 0, 0] prevalence values of size `sample_size` will be yielded). The number of samples for each valid
combination of prevalence values is indicated by `repeats`.
:param sample_size: integer, number of instances in each sample
:param n_prevalences: the number of equidistant prevalence points to extract from the [0,1] interval for the
grid (default is 21)
:param repeats: number of copies for each valid prevalence vector (default is 1)
:param random_seed: allows replicating samples across runs (default None)
"""
def __init__(self, data:LabelledCollection, sample_size, n_prevalences=21, repeats=1, random_seed=None):
self.data = data
self.sample_size = sample_size
self.n_prevalences = n_prevalences
self.repeats = repeats
self.random_seed = random_seed
def _indexes(self, data: LabelledCollection):
for prevs in self.prevalence_grid(dimensions=data.n_classes):
yield data.sampling_index(self.sample_size, *prevs)
def prevalence_grid(self, dimensions, return_constrained_dim=False):
"""
Generates vectors of prevalence values from an exhaustive grid of prevalence values. The
number of prevalence values explored for each dimension depends on `n_prevalences`, so that, if, for example,
`n_prevalences=11` then the prevalence values of the grid are taken from [0, 0.1, 0.2, ..., 0.9, 1]. Only
valid prevalence distributions are returned, i.e., vectors of prevalence values that sum up to 1. For each
valid vector of prevalence values, `repeat` copies are returned. The vector of prevalence values can be
implicit (by setting `return_constrained_dim=False`), meaning that the last dimension (which is constrained
to 1 - sum of the rest) is not returned (note that, quite obviously, in this case the vector does not sum up to
1). Note that this method is deterministic, i.e., there is no random sampling anywhere.
:param dimensions: the number of classes
:param return_constrained_dim: set to True to return all dimensions, or to False (default) for ommitting the
constrained dimension
:return: a `np.ndarray` of shape `(n, dimensions)` if `return_constrained_dim=True` or of shape
`(n, dimensions-1)` if `return_constrained_dim=False`, where `n` is the number of valid combinations found
in the grid multiplied by `repeat`
"""
s = np.linspace(0., 1., self.n_prevalences, endpoint=True)
s = [s] * (dimensions - 1)
prevs = [p for p in itertools.product(*s, repeat=1) if sum(p) <= 1]
if return_constrained_dim:
prevs = [p + (1 - sum(p),) for p in prevs]
prevs = np.asarray(prevs).reshape(len(prevs), -1)
if self.repeats > 1:
prevs = np.repeat(prevs, self.repeats, axis=0)
return prevs
class NPP(AbstractProtocol):
"""
A generator of samples that implements the natural prevalence protocol (NPP). The NPP consists of drawing
samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.
:param sample_size: integer, the number of instances in each sample
:param repeats: the number of samples to generate
"""
def __init__(self, sample_size, repeats=1, random_seed=None):
self.sample_size = sample_size
self.repeats = repeats
self.random_seed = random_seed
def _indexes(self, data: LabelledCollection):
for _ in range(self.repeats):
yield data.uniform_sampling_index(self.sample_size)
class USimplexPP(AbstractProtocol):
def __init__(self, sample_size, repeats=1, random_seed=None):
self.sample_size = sample_size
self.repeats = repeats
self.random_seed = random_seed
def _indexes(self, data: LabelledCollection):
for prevs in F.uniform_simplex_sampling(n_classes=data.n_classes, size=self.repeats):
yield data.sampling_index(self.sample_size, *prevs)
if __name__=='__main__':
import numpy as np
import quapy as qp
y = [0]*25 + [1]*25 + [2]*25 + [3]*25
X = [str(i)+'-'+str(yi) for i, yi in enumerate(y)]
data = LabelledCollection(X, y, classes_=sorted(np.unique(y)))
# p = APP(10, n_prevalences=11, random_seed=42)
# p = NPP(10, repeats=10, random_seed=42)
p = USimplexPP(10, repeats=10, random_seed=42)
for i in p(data):
print(i.instances, i.classes, i.prevalence())
print('done')