1
0
Fork 0
QuaPy/quapy/data/base.py

490 lines
24 KiB
Python
Raw Normal View History

2020-12-03 16:24:21 +01:00
import numpy as np
2020-12-03 16:36:54 +01:00
from scipy.sparse import issparse
2021-01-15 18:32:32 +01:00
from scipy.sparse import vstack
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
2021-05-05 17:12:44 +02:00
from quapy.functional import artificial_prevalence_sampling, strprev
2020-12-03 16:24:21 +01:00
class LabelledCollection:
    """
    A LabelledCollection is a set of objects each with a label associated to it. This class implements many sampling
    routines.

    :param instances: array-like (np.ndarray, list, or csr_matrix are supported)
    :param labels: array-like with the same length of instances
    :param classes_: optional, list of classes from which labels are taken. If not specified, the classes are inferred
        from the labels. The classes must be indicated in cases in which some of the labels might have no examples
        (i.e., a prevalence of 0)
    :raises ValueError: if `classes_` is specified and some label is not included in it
    """

    def __init__(self, instances, labels, classes_=None):
        if issparse(instances):
            self.instances = instances
        elif isinstance(instances, list) and len(instances) > 0 and isinstance(instances[0], str):
            # lists of strings occupy too much as ndarrays (although python-objects add a heavy overload)
            self.instances = np.asarray(instances, dtype=object)
        else:
            self.instances = np.asarray(instances)
        self.labels = np.asarray(labels)
        n_docs = len(self)
        if classes_ is None:
            # np.unique already returns the distinct values in sorted order
            self.classes_ = np.unique(self.labels)
        else:
            self.classes_ = np.unique(np.asarray(classes_))
            if len(set(self.labels).difference(set(classes_))) > 0:
                raise ValueError(f'labels ({set(self.labels)}) contain values not included in classes_ ({set(classes_)})')
        # maps every class to the positions of the instances labelled with it (empty for classes without examples)
        self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_}

    @classmethod
    def load(cls, path: str, loader_func: callable, classes=None, **loader_kwargs):
        """
        Loads a labelled set of data and convert it into a :class:`LabelledCollection` instance. The function in charge
        of reading the instances must be specified. This function can be a custom one, or any of the reading functions
        defined in :mod:`quapy.data.reader` module.

        :param path: string, the path to the file containing the labelled instances
        :param loader_func: a custom function that implements the data loader and returns a tuple with instances and
            labels
        :param classes: array-like, the classes according to which the instances are labelled
        :param loader_kwargs: any argument that the `loader_func` function needs in order to read the instances, i.e.,
            these arguments are used to call `loader_func(path, **loader_kwargs)`
        :return: a :class:`LabelledCollection` object
        """
        return LabelledCollection(*loader_func(path, **loader_kwargs), classes)

    def __len__(self):
        """
        Returns the length of this collection (number of labelled instances)

        :return: integer
        """
        return self.instances.shape[0]

    def prevalence(self):
        """
        Returns the prevalence, or relative frequency, of the classes of interest.

        :return: a np.ndarray of shape `(n_classes)` with the relative frequencies of each class, in the same order
            as listed by `self.classes_`
        """
        return self.counts() / len(self)

    def counts(self):
        """
        Returns the number of instances for each of the classes of interest.

        :return: a np.ndarray of shape `(n_classes)` with the number of instances of each class, in the same order
            as listed by `self.classes_`
        """
        return np.asarray([len(self.index[class_]) for class_ in self.classes_])

    @property
    def n_classes(self):
        """
        The number of classes

        :return: integer
        """
        return len(self.classes_)

    @property
    def binary(self):
        """
        Returns True if the number of classes is 2

        :return: boolean
        """
        return self.n_classes == 2

    def sampling_index(self, size, *prevs, shuffle=True):
        """
        Returns an index to be used to extract a random sample of desired size and desired prevalence values. If the
        prevalence values are not specified, then returns the index of a uniform sampling.
        For each class, the sampling is drawn with replacement only if the number of instances requested for the
        class exceeds the number of instances available for it, and without replacement otherwise.

        :param size: integer, the requested size
        :param prevs: the prevalence for each class; the prevalence value for the last class can be left empty since
            it is constrained. E.g., for binary collections, only the prevalence `p` for the first class (as listed in
            `self.classes_`) can be specified, while the other class takes prevalence value `1-p`
        :param shuffle: if set to True (default), shuffles the index before returning it (not applied when no
            prevalence is indicated, since the uniform index is returned directly)
        :return: a np.ndarray of shape `(size)` with the indexes
        """
        if len(prevs) == 0:  # no prevalence was indicated; returns an index for uniform sampling
            return self.uniform_sampling_index(size)
        if len(prevs) == self.n_classes - 1:
            prevs = prevs + (1 - sum(prevs),)
        assert len(prevs) == self.n_classes, 'unexpected number of prevalences'
        # compare with a tolerance: a sum of floats that should add up to 1 may be off by a rounding error
        assert np.isclose(sum(prevs), 1), f'prevalences ({prevs}) wrong range (sum={sum(prevs)})'

        taken = 0
        indexes_sample = []
        for i, class_ in enumerate(self.classes_):
            if i == self.n_classes - 1:
                # the last class absorbs whatever is left, so that the sample has exactly `size` elements
                n_requested = size - taken
            else:
                n_requested = int(size * prevs[i])
            n_candidates = len(self.index[class_])
            if n_requested > 0:
                chosen = np.random.choice(n_candidates, size=n_requested, replace=(n_requested > n_candidates))
                index_sample = self.index[class_][chosen]
            else:
                index_sample = []
            indexes_sample.append(index_sample)
            taken += n_requested

        # empty lists are concatenated as float arrays, hence the explicit cast
        indexes_sample = np.concatenate(indexes_sample).astype(int)
        if shuffle:
            indexes_sample = np.random.permutation(indexes_sample)
        return indexes_sample

    def uniform_sampling_index(self, size):
        """
        Returns an index to be used to extract a uniform sample of desired size. The sampling is drawn
        with replacement if the requested size is greater than the number of instances, or without replacement
        otherwise.

        :param size: integer, the size of the uniform sample
        :return: a np.ndarray of shape `(size)` with the indexes
        """
        return np.random.choice(len(self), size, replace=size > len(self))

    def sampling(self, size, *prevs, shuffle=True):
        """
        Return a random sample (an instance of :class:`LabelledCollection`) of desired size and desired prevalence
        values. For each class, the sampling is drawn with replacement only if the number of instances requested for
        the class exceeds the number of instances available for it, and without replacement otherwise.

        :param size: integer, the requested size
        :param prevs: the prevalence for each class; the prevalence value for the last class can be left empty since
            it is constrained. E.g., for binary collections, only the prevalence `p` for the first class (as listed in
            `self.classes_`) can be specified, while the other class takes prevalence value `1-p`
        :param shuffle: if set to True (default), shuffles the index before returning it
        :return: an instance of :class:`LabelledCollection` with length == `size` and prevalence close to `prevs` (or
            prevalence == `prevs` if the exact prevalence values can be met as proportions of instances)
        """
        prev_index = self.sampling_index(size, *prevs, shuffle=shuffle)
        return self.sampling_from_index(prev_index)

    def uniform_sampling(self, size):
        """
        Returns a uniform sample (an instance of :class:`LabelledCollection`) of desired size. The sampling is drawn
        with replacement if the requested size is greater than the number of instances, or without replacement
        otherwise.

        :param size: integer, the requested size
        :return: an instance of :class:`LabelledCollection` with length == `size`
        """
        unif_index = self.uniform_sampling_index(size)
        return self.sampling_from_index(unif_index)

    def sampling_from_index(self, index):
        """
        Returns an instance of :class:`LabelledCollection` whose elements are sampled from this collection using the
        index. The classes of this collection are kept in the returned one.

        :param index: np.ndarray
        :return: an instance of :class:`LabelledCollection`
        """
        documents = self.instances[index]
        labels = self.labels[index]
        return LabelledCollection(documents, labels, classes_=self.classes_)

    def split_stratified(self, train_prop=0.6, random_state=None):
        """
        Returns two instances of :class:`LabelledCollection` split with stratification from this collection, at desired
        proportion. Both parts keep the classes of this collection.

        :param train_prop: the proportion of elements to include in the left-most returned collection (typically used
            as the training collection). The rest of elements are included in the right-most returned collection
            (typically used as a test collection).
        :param random_state: if specified, guarantees reproducibility of the split.
        :return: two instances of :class:`LabelledCollection`, the first one with `train_prop` elements, and the
            second one with `1-train_prop` elements
        """
        tr_docs, te_docs, tr_labels, te_labels = train_test_split(
            self.instances, self.labels, train_size=train_prop, stratify=self.labels, random_state=random_state
        )
        # propagate the classes so that classes declared with no examples are not lost in the split
        training = LabelledCollection(tr_docs, tr_labels, classes_=self.classes_)
        test = LabelledCollection(te_docs, te_labels, classes_=self.classes_)
        return training, test

    def artificial_sampling_generator(self, sample_size, n_prevalences=101, repeats=1):
        """
        A generator of samples that implements the artificial prevalence protocol (APP).
        The APP consists of exploring a grid of prevalence values containing `n_prevalences` points (e.g.,
        [0, 0.05, 0.1, 0.15, ..., 1], if `n_prevalences=21`), and generating all valid combinations of
        prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], ...,
        [1, 0, 0] prevalence values of size `sample_size` will be yielded). The number of samples for each valid
        combination of prevalence values is indicated by `repeats`.

        :param sample_size: the number of instances in each sample
        :param n_prevalences: the number of prevalence points to be taken from the [0,1] interval (including the
            limits {0,1}). E.g., if `n_prevalences=11`, then the prevalence points to take are [0, 0.1, 0.2, ..., 1]
        :param repeats: the number of samples to generate for each valid combination of prevalence values (default 1)
        :return: yield samples generated at artificially controlled prevalence values
        """
        dimensions = self.n_classes
        for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
            yield self.sampling(sample_size, *prevs)

    def artificial_sampling_index_generator(self, sample_size, n_prevalences=101, repeats=1):
        """
        A generator of sample indexes implementing the artificial prevalence protocol (APP).
        The APP consists of exploring
        a grid of prevalence values (e.g., [0, 0.05, 0.1, 0.15, ..., 1]), and generating all valid combinations of
        prevalence values for all classes (e.g., for 3 classes, samples with [0, 0, 1], [0, 0.05, 0.95], ...,
        [1, 0, 0] prevalence values of size `sample_size` will be yielded). The number of sample indexes for each valid
        combination of prevalence values is indicated by `repeats`

        :param sample_size: the number of instances in each sample (i.e., length of each index)
        :param n_prevalences: the number of prevalence points to be taken from the [0,1] interval (including the
            limits {0,1}). E.g., if `n_prevalences=11`, then the prevalence points to take are [0, 0.1, 0.2, ..., 1]
        :param repeats: the number of samples to generate for each valid combination of prevalence values (default 1)
        :return: yield the indexes that generate the samples according to APP
        """
        dimensions = self.n_classes
        for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
            yield self.sampling_index(sample_size, *prevs)

    def natural_sampling_generator(self, sample_size, repeats=100):
        """
        A generator of samples that implements the natural prevalence protocol (NPP). The NPP consists of drawing
        samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.

        :param sample_size: integer, the number of instances in each sample
        :param repeats: the number of samples to generate
        :return: yield instances of :class:`LabelledCollection`
        """
        for _ in range(repeats):
            yield self.uniform_sampling(sample_size)

    def natural_sampling_index_generator(self, sample_size, repeats=100):
        """
        A generator of sample indexes according to the natural prevalence protocol (NPP). The NPP consists of drawing
        samples uniformly at random, therefore approximately preserving the natural prevalence of the collection.

        :param sample_size: integer, the number of instances in each sample (i.e., the length of each index)
        :param repeats: the number of indexes to generate
        :return: yield `repeats` instances of np.ndarray with shape `(sample_size,)`
        """
        for _ in range(repeats):
            yield self.uniform_sampling_index(sample_size)

    def __add__(self, other):
        """
        Returns a new :class:`LabelledCollection` as the union of this collection with another collection. The
        classes of the returned collection are the union of the classes of both collections.

        :param other: another :class:`LabelledCollection`; if None, this collection itself is returned
        :return: a :class:`LabelledCollection` representing the union of both collections
        :raises NotImplementedError: if the instances of both collections cannot be joined (e.g., sparse with dense)
        """
        if other is None:
            return self
        elif issparse(self.instances) and issparse(other.instances):
            join_instances = vstack([self.instances, other.instances])
        elif isinstance(self.instances, list) and isinstance(other.instances, list):
            join_instances = self.instances + other.instances
        elif isinstance(self.instances, np.ndarray) and isinstance(other.instances, np.ndarray):
            join_instances = np.concatenate([self.instances, other.instances])
        else:
            raise NotImplementedError('unsupported operation for collection types')
        labels = np.concatenate([self.labels, other.labels])
        # keep every declared class, including those that have no examples in either collection
        classes = np.union1d(self.classes_, other.classes_)
        return LabelledCollection(join_instances, labels, classes_=classes)

    @property
    def Xy(self):
        """
        Gets the instances and labels. This is useful when working with `sklearn` estimators, e.g.:

        >>> svm = LinearSVC().fit(*my_collection.Xy)

        :return: a tuple `(instances, labels)` from this collection
        """
        return self.instances, self.labels

    def stats(self, show=True):
        """
        Returns (and eventually prints) a dictionary with some stats of this collection. E.g.,:

        >>> data = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
        >>> data.training.stats()
        >>> #instances=3821, type=<class 'scipy.sparse.csr.csr_matrix'>, #features=4403, #classes=[0 1], prevs=[0.081, 0.919]

        :param show: if set to True (default), prints the stats in standard output
        :return: a dictionary containing some stats of this collection. Keys are `instances` (the number of
            instances), `type` (the type representing the instances), `features` (the number of features, or `'?'`
            if it cannot be determined from the instances), `classes` (the classes of the collection), `prevs` (the
            prevalence values for each class, formatted as a string)
        """
        ninstances = len(self)
        # the type is taken from the first instance
        instance_type = type(self.instances[0])
        if instance_type is list:
            nfeats = len(self.instances[0])
        elif instance_type is np.ndarray or issparse(self.instances):
            nfeats = self.instances.shape[1]
        else:
            nfeats = '?'
        stats_ = {'instances': ninstances,
                  'type': instance_type,
                  'features': nfeats,
                  'classes': self.classes_,
                  'prevs': strprev(self.prevalence())}
        if show:
            print(f'#instances={stats_["instances"]}, type={stats_["type"]}, #features={stats_["features"]}, '
                  f'#classes={stats_["classes"]}, prevs={stats_["prevs"]}')
        return stats_

    def kFCV(self, nfolds=5, nrepeats=1, random_state=0):
        """
        Generator of stratified folds to be used in k-fold cross validation.

        :param nfolds: integer (default 5), the number of folds to generate
        :param nrepeats: integer (default 1), the number of rounds of k-fold cross validation to run
        :param random_state: integer (default 0), guarantees that the folds generated are reproducible
        :return: yields `nfolds * nrepeats` folds for k-fold cross validation
        """
        kf = RepeatedStratifiedKFold(n_splits=nfolds, n_repeats=nrepeats, random_state=random_state)
        for train_index, test_index in kf.split(*self.Xy):
            train = self.sampling_from_index(train_index)
            test = self.sampling_from_index(test_index)
            yield train, test
2020-12-03 16:24:21 +01:00
2021-05-05 17:12:44 +02:00
2020-12-03 16:24:21 +01:00
class Dataset:
    """
    Abstraction of training and test :class:`LabelledCollection` objects.

    :param training: a :class:`LabelledCollection` instance
    :param test: a :class:`LabelledCollection` instance
    :param vocabulary: if indicated, is a dictionary of the terms used in this textual dataset
    :param name: a string representing the name of the dataset
    """

    def __init__(self, training: 'LabelledCollection', test: 'LabelledCollection', vocabulary: dict = None, name=''):
        assert set(training.classes_) == set(test.classes_), 'incompatible labels in training and test collections'
        self.training = training
        self.test = test
        self.vocabulary = vocabulary
        self.name = name

    @classmethod
    def SplitStratified(cls, collection: 'LabelledCollection', train_size=0.6):
        """
        Generates a :class:`Dataset` from a stratified split of a :class:`LabelledCollection` instance.
        See :meth:`LabelledCollection.split_stratified`

        :param collection: :class:`LabelledCollection`
        :param train_size: the proportion of training documents (the rest conforms the test split)
        :return: an instance of :class:`Dataset`
        """
        return Dataset(*collection.split_stratified(train_prop=train_size))

    @property
    def classes_(self):
        """
        The classes according to which the training collection is labelled

        :return: The classes according to which the training collection is labelled
        """
        return self.training.classes_

    @property
    def n_classes(self):
        """
        The number of classes according to which the training collection is labelled

        :return: integer
        """
        return self.training.n_classes

    @property
    def binary(self):
        """
        Returns True if the training collection is labelled according to two classes

        :return: boolean
        """
        return self.training.binary

    @classmethod
    def load(cls, train_path, test_path, loader_func: callable, classes=None, **loader_kwargs):
        """
        Loads a training and a test labelled set of data and convert it into a :class:`Dataset` instance.
        The function in charge of reading the instances must be specified. This function can be a custom one, or any of
        the reading functions defined in :mod:`quapy.data.reader` module.

        :param train_path: string, the path to the file containing the training instances
        :param test_path: string, the path to the file containing the test instances
        :param loader_func: a custom function that implements the data loader and returns a tuple with instances and
            labels
        :param classes: array-like, the classes according to which the instances are labelled
        :param loader_kwargs: any argument that the `loader_func` function needs in order to read the instances.
            See :meth:`LabelledCollection.load` for further details.
        :return: a :class:`Dataset` object
        """
        training = LabelledCollection.load(train_path, loader_func, classes, **loader_kwargs)
        test = LabelledCollection.load(test_path, loader_func, classes, **loader_kwargs)
        return Dataset(training, test)

    @property
    def vocabulary_size(self):
        """
        If the dataset is textual, and the vocabulary was indicated, returns the size of the vocabulary

        :return: integer
        """
        return len(self.vocabulary)

    def stats(self, show=True):
        """
        Returns (and eventually prints) a dictionary with some stats of this dataset. E.g.,:

        >>> data = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
        >>> data.stats()
        >>> Dataset=kindle #tr-instances=3821, #te-instances=21591, type=<class 'scipy.sparse.csr.csr_matrix'>, #features=4403, #classes=[0 1], tr-prevs=[0.081, 0.919], te-prevs=[0.063, 0.937]

        :param show: if set to True (default), prints the stats in standard output
        :return: a dictionary containing some stats of this collection for the training and test collections. The keys
            are `train` and `test`, and point to the dictionaries of stats returned by the `stats` method of the
            training and test collections, respectively
        """
        tr_stats = self.training.stats(show=False)
        te_stats = self.test.stats(show=False)
        if show:
            # type, features and classes are reported from the training collection only
            print(f'Dataset={self.name} #tr-instances={tr_stats["instances"]}, #te-instances={te_stats["instances"]}, '
                  f'type={tr_stats["type"]}, #features={tr_stats["features"]}, #classes={tr_stats["classes"]}, '
                  f'tr-prevs={tr_stats["prevs"]}, te-prevs={te_stats["prevs"]}')
        return {'train': tr_stats, 'test': te_stats}

    @classmethod
    def kFCV(cls, data: 'LabelledCollection', nfolds=5, nrepeats=1, random_state=0):
        """
        Generator of stratified folds to be used in k-fold cross validation. This function is only a wrapper around
        :meth:`LabelledCollection.kFCV` that returns :class:`Dataset` instances made of training and test folds.

        :param data: the :class:`LabelledCollection` from which the folds are generated
        :param nfolds: integer (default 5), the number of folds to generate
        :param nrepeats: integer (default 1), the number of rounds of k-fold cross validation to run
        :param random_state: integer (default 0), guarantees that the folds generated are reproducible
        :return: yields `nfolds * nrepeats` folds for k-fold cross validation as instances of :class:`Dataset`
        """
        for i, (train, test) in enumerate(data.kFCV(nfolds=nfolds, nrepeats=nrepeats, random_state=random_state)):
            yield Dataset(train, test, name=f'fold {(i % nfolds) + 1}/{nfolds} (round={(i // nfolds) + 1})')
def isbinary(data):
    """
    Tells whether `data` is labelled according to exactly two classes.

    :param data: a :class:`Dataset` or a :class:`LabelledCollection` object
    :return: the value of `data.binary` if `data` is a :class:`Dataset` or a :class:`LabelledCollection`;
        False for any other kind of object
    """
    if not isinstance(data, (Dataset, LabelledCollection)):
        return False
    return data.binary