more examples, one-vs-all fixed

This commit is contained in:
Alejandro Moreo Fernandez 2023-02-09 19:39:16 +01:00
parent 2485117f05
commit e28abfc362
10 changed files with 224 additions and 121 deletions

View File

@ -0,0 +1,53 @@
import quapy as qp
from quapy.method.aggregative import MS2, OneVsAllAggregative, OneVsAllGeneric
from quapy.method.base import getOneVsAll
from quapy.model_selection import GridSearchQ
from quapy.protocol import USimplexPP
from sklearn.linear_model import LogisticRegression
import numpy as np
"""
In this example, we will create a quantifier for tweet sentiment analysis considering three classes: negative, neutral,
and positive. We will use a one-vs-all approach using a binary quantifier for demonstration purposes.
"""
qp.environ['SAMPLE_SIZE'] = 100
"""
Any binary quantifier can be turned into a single-label quantifier by means of getOneVsAll function.
This function returns an instance of OneVsAll quantifier. Actually, it either returns the subclass OneVsAllGeneric
when the quantifier is an instance of BaseQuantifier, and it returns OneVsAllAggregative when the quantifier is
an instance of AggregativeQuantifier. Although OneVsAllGeneric works in all cases, using OneVsAllAggregative has
some additional advantages (namely, all the advantages that AggregativeQuantifiers enjoy, i.e., faster predictions
during evaluation).
"""
quantifier = getOneVsAll(MS2(LogisticRegression()), parallel_backend="loky")
print(f'the quantifier is an instance of {quantifier.__class__.__name__}')
# load a ternary dataset
train_modsel, val = qp.datasets.fetch_twitter('hcr', for_model_selection=True, pickle=True).train_test
"""
model selection: for this example, we are relying on the USimplexPP protocol, i.e., a variant of the
artificial-prevalence protocol that generates random samples (100 in this case) for randomly picked priors
from the unit simplex. The priors are sampled using the Kraemer algorithm. Note this is in contrast to the
standard APP protocol, that instead explores a prefixed grid of prevalence values.
"""
param_grid = {
'binary_quantifier__classifier__C': np.logspace(-2,2,5), # classifier-dependent hyperparameter
'binary_quantifier__classifier__class_weight': ['balanced', None] # classifier-dependent hyperparameter
}
print('starting model selection')
gs = GridSearchQ(quantifier, param_grid, protocol=USimplexPP(val), n_jobs=-1, verbose=True, refit=False)
quantifier = gs.fit(train_modsel).best_model()
print('training on the whole training set')
train, test = qp.datasets.fetch_twitter('hcr', for_model_selection=False, pickle=True).train_test
quantifier.fit(train)
# evaluation
mae = qp.evaluation.evaluate(quantifier, protocol=USimplexPP(test), error_metric='mae')
print(f'MAE = {mae:.4f}')

View File

@ -83,7 +83,6 @@ Things to fix:
- update unit tests - update unit tests
- update Wikis... - update Wikis...
- Resolve the OneVsAll thing (it is in base.py and in aggregative.py) - Resolve the OneVsAll thing (it is in base.py and in aggregative.py)
- Add a proper log?
- improve plots - improve plots
- documentation of protocols is incomplete - documentation of protocols is incomplete

View File

@ -65,7 +65,7 @@ class RecalibratedProbabilisticClassifierBase(BaseEstimator, RecalibratedProbabi
""" """
Fits the calibration in a cross-validation manner, i.e., it generates posterior probabilities for all Fits the calibration in a cross-validation manner, i.e., it generates posterior probabilities for all
training instances via cross-validation, and then retrains the classifier on all training instances. training instances via cross-validation, and then retrains the classifier on all training instances.
The posterior probabilities thus generated are used for calibrating the outpus of the classifier. The posterior probabilities thus generated are used for calibrating the outputs of the classifier.
:param X: array-like of shape `(n_samples, n_features)` with the data instances :param X: array-like of shape `(n_samples, n_features)` with the data instances
:param y: array-like of shape `(n_samples,)` with the class labels :param y: array-like of shape `(n_samples,)` with the class labels

View File

@ -6,21 +6,22 @@ from scipy.sparse import vstack
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from numpy.random import RandomState from numpy.random import RandomState
from quapy.functional import strprev from quapy.functional import strprev
from quapy.util import temp_seed
class LabelledCollection: class LabelledCollection:
""" """
A LabelledCollection is a set of objects each with a label associated to it. This class implements many sampling A LabelledCollection is a set of objects each with a label attached to each of them.
routines. This class implements several sampling routines and other utilities.
:param instances: array-like (np.ndarray, list, or csr_matrix are supported) :param instances: array-like (np.ndarray, list, or csr_matrix are supported)
:param labels: array-like with the same length of instances :param labels: array-like with the same length of instances
:param classes_: optional, list of classes from which labels are taken. If not specified, the classes are inferred :param classes: optional, list of classes from which labels are taken. If not specified, the classes are inferred
from the labels. The classes must be indicated in cases in which some of the labels might have no examples from the labels. The classes must be indicated in cases in which some of the labels might have no examples
(i.e., a prevalence of 0) (i.e., a prevalence of 0)
""" """
def __init__(self, instances, labels, classes_=None): def __init__(self, instances, labels, classes=None):
if issparse(instances): if issparse(instances):
self.instances = instances self.instances = instances
elif isinstance(instances, list) and len(instances) > 0 and isinstance(instances[0], str): elif isinstance(instances, list) and len(instances) > 0 and isinstance(instances[0], str):
@ -30,14 +31,14 @@ class LabelledCollection:
self.instances = np.asarray(instances) self.instances = np.asarray(instances)
self.labels = np.asarray(labels) self.labels = np.asarray(labels)
n_docs = len(self) n_docs = len(self)
if classes_ is None: if classes is None:
self.classes_ = np.unique(self.labels) self.classes_ = np.unique(self.labels)
self.classes_.sort() self.classes_.sort()
else: else:
self.classes_ = np.unique(np.asarray(classes_)) self.classes_ = np.unique(np.asarray(classes))
self.classes_.sort() self.classes_.sort()
if len(set(self.labels).difference(set(classes_))) > 0: if len(set(self.labels).difference(set(classes))) > 0:
raise ValueError(f'labels ({set(self.labels)}) contain values not included in classes_ ({set(classes_)})') raise ValueError(f'labels ({set(self.labels)}) contain values not included in classes_ ({set(classes)})')
self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_} self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_}
@classmethod @classmethod
@ -101,7 +102,7 @@ class LabelledCollection:
""" """
return self.n_classes == 2 return self.n_classes == 2
def sampling_index(self, size, *prevs, shuffle=True): def sampling_index(self, size, *prevs, shuffle=True, random_state=None):
""" """
Returns an index to be used to extract a random sample of desired size and desired prevalence values. If the Returns an index to be used to extract a random sample of desired size and desired prevalence values. If the
prevalence values are not specified, then returns the index of a uniform sampling. prevalence values are not specified, then returns the index of a uniform sampling.
@ -113,10 +114,11 @@ class LabelledCollection:
it is constrained. E.g., for binary collections, only the prevalence `p` for the first class (as listed in it is constrained. E.g., for binary collections, only the prevalence `p` for the first class (as listed in
`self.classes_` can be specified, while the other class takes prevalence value `1-p` `self.classes_` can be specified, while the other class takes prevalence value `1-p`
:param shuffle: if set to True (default), shuffles the index before returning it :param shuffle: if set to True (default), shuffles the index before returning it
:param random_state: seed for reproducing sampling
:return: a np.ndarray of shape `(size)` with the indexes :return: a np.ndarray of shape `(size)` with the indexes
""" """
if len(prevs) == 0: # no prevalence was indicated; returns an index for uniform sampling if len(prevs) == 0: # no prevalence was indicated; returns an index for uniform sampling
return self.uniform_sampling_index(size) return self.uniform_sampling_index(size, random_state=random_state)
if len(prevs) == self.n_classes - 1: if len(prevs) == self.n_classes - 1:
prevs = prevs + (1 - sum(prevs),) prevs = prevs + (1 - sum(prevs),)
assert len(prevs) == self.n_classes, 'unexpected number of prevalences' assert len(prevs) == self.n_classes, 'unexpected number of prevalences'
@ -129,6 +131,7 @@ class LabelledCollection:
# (This aims at avoiding the remainder to be placed in a class for which the prevalence requested is 0.) # (This aims at avoiding the remainder to be placed in a class for which the prevalence requested is 0.)
n_requests = {class_: int(size * prevs[i]) for i, class_ in enumerate(self.classes_)} n_requests = {class_: int(size * prevs[i]) for i, class_ in enumerate(self.classes_)}
remainder = size - sum(n_requests.values()) remainder = size - sum(n_requests.values())
with temp_seed(random_state):
for rand_class in np.random.choice(self.classes_, size=remainder, p=prevs): for rand_class in np.random.choice(self.classes_, size=remainder, p=prevs):
n_requests[rand_class] += 1 n_requests[rand_class] += 1
@ -164,7 +167,7 @@ class LabelledCollection:
ng = np.random ng = np.random
return ng.choice(len(self), size, replace=size > len(self)) return ng.choice(len(self), size, replace=size > len(self))
def sampling(self, size, *prevs, shuffle=True): def sampling(self, size, *prevs, shuffle=True, random_state=None):
""" """
Return a random sample (an instance of :class:`LabelledCollection`) of desired size and desired prevalence Return a random sample (an instance of :class:`LabelledCollection`) of desired size and desired prevalence
values. For each class, the sampling is drawn without replacement if the requested prevalence is larger than values. For each class, the sampling is drawn without replacement if the requested prevalence is larger than
@ -175,10 +178,11 @@ class LabelledCollection:
it is constrained. E.g., for binary collections, only the prevalence `p` for the first class (as listed in it is constrained. E.g., for binary collections, only the prevalence `p` for the first class (as listed in
`self.classes_` can be specified, while the other class takes prevalence value `1-p` `self.classes_` can be specified, while the other class takes prevalence value `1-p`
:param shuffle: if set to True (default), shuffles the index before returning it :param shuffle: if set to True (default), shuffles the index before returning it
:param random_state: seed for reproducing sampling
:return: an instance of :class:`LabelledCollection` with length == `size` and prevalence close to `prevs` (or :return: an instance of :class:`LabelledCollection` with length == `size` and prevalence close to `prevs` (or
prevalence == `prevs` if the exact prevalence values can be met as proportions of instances) prevalence == `prevs` if the exact prevalence values can be met as proportions of instances)
""" """
prev_index = self.sampling_index(size, *prevs, shuffle=shuffle) prev_index = self.sampling_index(size, *prevs, shuffle=shuffle, random_state=random_state)
return self.sampling_from_index(prev_index) return self.sampling_from_index(prev_index)
def uniform_sampling(self, size, random_state=None): def uniform_sampling(self, size, random_state=None):
@ -204,7 +208,7 @@ class LabelledCollection:
""" """
documents = self.instances[index] documents = self.instances[index]
labels = self.labels[index] labels = self.labels[index]
return LabelledCollection(documents, labels, classes_=self.classes_) return LabelledCollection(documents, labels, classes=self.classes_)
def split_stratified(self, train_prop=0.6, random_state=None): def split_stratified(self, train_prop=0.6, random_state=None):
""" """
@ -221,11 +225,10 @@ class LabelledCollection:
tr_docs, te_docs, tr_labels, te_labels = train_test_split( tr_docs, te_docs, tr_labels, te_labels = train_test_split(
self.instances, self.labels, train_size=train_prop, stratify=self.labels, random_state=random_state self.instances, self.labels, train_size=train_prop, stratify=self.labels, random_state=random_state
) )
training = LabelledCollection(tr_docs, tr_labels, classes_=self.classes_) training = LabelledCollection(tr_docs, tr_labels, classes=self.classes_)
test = LabelledCollection(te_docs, te_labels, classes_=self.classes_) test = LabelledCollection(te_docs, te_labels, classes=self.classes_)
return training, test return training, test
def split_random(self, train_prop=0.6, random_state=None): def split_random(self, train_prop=0.6, random_state=None):
""" """
Returns two instances of :class:`LabelledCollection` split randomly from this collection, at desired Returns two instances of :class:`LabelledCollection` split randomly from this collection, at desired
@ -261,20 +264,33 @@ class LabelledCollection:
:return: a :class:`LabelledCollection` representing the union of both collections :return: a :class:`LabelledCollection` representing the union of both collections
""" """
if not all(np.sort(self.classes_)==np.sort(other.classes_)): if not all(np.sort(self.classes_)==np.sort(other.classes_)):
raise NotImplementedError('unsupported operation for collections on different classes') raise NotImplementedError(f'unsupported operation for collections on different classes; '
f'expected {self.classes_}, found {other.classes_}')
return LabelledCollection.mix(self, other)
if other is None: @classmethod
return self def mix(cls, a:'LabelledCollection', b:'LabelledCollection'):
elif issparse(self.instances) and issparse(other.instances): """
join_instances = vstack([self.instances, other.instances]) Returns a new :class:`LabelledCollection` as the union of this collection with another collection.
elif isinstance(self.instances, list) and isinstance(other.instances, list):
join_instances = self.instances + other.instances :param a: instance of :class:`LabelledCollection`
elif isinstance(self.instances, np.ndarray) and isinstance(other.instances, np.ndarray): :param b: instance of :class:`LabelledCollection`
join_instances = np.concatenate([self.instances, other.instances]) :return: a :class:`LabelledCollection` representing the union of both collections
"""
if a is None: return b
if b is None: return a
elif issparse(a.instances) and issparse(b.instances):
join_instances = vstack([a.instances, b.instances])
elif isinstance(a.instances, list) and isinstance(b.instances, list):
join_instances = a.instances + b.instances
elif isinstance(a.instances, np.ndarray) and isinstance(b.instances, np.ndarray):
join_instances = np.concatenate([a.instances, b.instances])
else: else:
raise NotImplementedError('unsupported operation for collection types') raise NotImplementedError('unsupported operation for collection types')
labels = np.concatenate([self.labels, other.labels]) labels = np.concatenate([a.labels, b.labels])
return LabelledCollection(join_instances, labels, classes_=self.classes_) classes = np.unique(np.concatenate([a.classes_, b.classes_])).sort()
return LabelledCollection(join_instances, labels, classes=classes)
@property @property
def Xy(self): def Xy(self):
@ -291,7 +307,7 @@ class LabelledCollection:
def Xp(self): def Xp(self):
""" """
Gets the instances and the true prevalence. This is useful when implementing evaluation protocols from Gets the instances and the true prevalence. This is useful when implementing evaluation protocols from
a `LabelledCollection` object. a :class:`LabelledCollection` object.
:return: a tuple `(instances, prevalence)` from this collection :return: a tuple `(instances, prevalence)` from this collection
""" """
@ -357,7 +373,7 @@ class LabelledCollection:
f'#classes={stats_["classes"]}, prevs={stats_["prevs"]}') f'#classes={stats_["classes"]}, prevs={stats_["prevs"]}')
return stats_ return stats_
def kFCV(self, nfolds=5, nrepeats=1, random_state=0): def kFCV(self, nfolds=5, nrepeats=1, random_state=None):
""" """
Generator of stratified folds to be used in k-fold cross validation. Generator of stratified folds to be used in k-fold cross validation.

View File

@ -6,6 +6,7 @@ import os
import zipfile import zipfile
from os.path import join from os.path import join
import pandas as pd import pandas as pd
import scipy
from quapy.data.base import Dataset, LabelledCollection from quapy.data.base import Dataset, LabelledCollection
from quapy.data.preprocessing import text2tfidf, reduce_columns from quapy.data.preprocessing import text2tfidf, reduce_columns

View File

@ -14,7 +14,7 @@ import quapy.functional as F
from classification.calibration import NBVSCalibration, BCTSCalibration, TSCalibration, VSCalibration from classification.calibration import NBVSCalibration, BCTSCalibration, TSCalibration, VSCalibration
from quapy.classification.svmperf import SVMperf from quapy.classification.svmperf import SVMperf
from quapy.data import LabelledCollection from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier, BinaryQuantifier from quapy.method.base import BaseQuantifier, BinaryQuantifier, OneVsAllGeneric
# Abstract classes # Abstract classes
@ -1246,7 +1246,7 @@ MedianSweep = MS
MedianSweep2 = MS2 MedianSweep2 = MS2
class OneVsAll(AggregativeQuantifier): class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier):
""" """
Allows any binary quantifier to perform quantification on single-label datasets. Allows any binary quantifier to perform quantification on single-label datasets.
The method maintains one binary quantifier for each class, and then l1-normalizes the outputs so that the The method maintains one binary quantifier for each class, and then l1-normalizes the outputs so that the
@ -1257,25 +1257,19 @@ class OneVsAll(AggregativeQuantifier):
:param binary_quantifier: a quantifier (binary) that will be employed to work on multiclass model in a :param binary_quantifier: a quantifier (binary) that will be employed to work on multiclass model in a
one-vs-all manner one-vs-all manner
:param n_jobs: number of parallel workers :param n_jobs: number of parallel workers
:param parallel_backend: the parallel backend for joblib (default "loky"); this is helpful for some quantifiers
(e.g., ELM-based ones) that cannot be run with multiprocessing, since the temp dir they create during fit will
is removed and no longer available at predict time.
""" """
def __init__(self, binary_quantifier, n_jobs=None): def __init__(self, binary_quantifier, n_jobs=None, parallel_backend='loky'):
assert isinstance(self.binary_quantifier, BaseQuantifier), \ assert isinstance(binary_quantifier, BaseQuantifier), \
f'{self.binary_quantifier} does not seem to be a Quantifier' f'{self.binary_quantifier} does not seem to be a Quantifier'
assert isinstance(self.binary_quantifier, AggregativeQuantifier), \ assert isinstance(binary_quantifier, AggregativeQuantifier), \
f'{self.binary_quantifier} does not seem to be of type Aggregative' f'{self.binary_quantifier} does not seem to be of type Aggregative'
self.binary_quantifier = binary_quantifier self.binary_quantifier = binary_quantifier
self.n_jobs = qp._get_njobs(n_jobs) self.n_jobs = qp._get_njobs(n_jobs)
self.parallel_backend = parallel_backend
def fit(self, data: LabelledCollection, fit_classifier=True):
assert not data.binary, \
f'{self.__class__.__name__} expect non-binary data'
assert fit_classifier == True, \
'fit_classifier must be True'
self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_}
self.__parallel(self._delayed_binary_fit, data)
return self
def classify(self, instances): def classify(self, instances):
""" """
@ -1292,35 +1286,16 @@ class OneVsAll(AggregativeQuantifier):
:return: `np.ndarray` :return: `np.ndarray`
""" """
classif_predictions = self.__parallel(self._delayed_binary_classification, instances) classif_predictions = self._parallel(self._delayed_binary_classification, instances)
if isinstance(self.binary_quantifier, AggregativeProbabilisticQuantifier): if isinstance(self.binary_quantifier, AggregativeProbabilisticQuantifier):
return np.swapaxes(classif_predictions, 0, 1) return np.swapaxes(classif_predictions, 0, 1)
else: else:
return classif_predictions.T return classif_predictions.T
def aggregate(self, classif_predictions): def aggregate(self, classif_predictions):
prevalences = self.__parallel(self._delayed_binary_aggregate, classif_predictions) prevalences = self._parallel(self._delayed_binary_aggregate, classif_predictions)
return F.normalize_prevalence(prevalences) return F.normalize_prevalence(prevalences)
def __parallel(self, func, *args, **kwargs):
return np.asarray(
# some quantifiers (in particular, ELM-based ones) cannot be run with multiprocess, since the temp dir they
# create during the fit will be removed and be no longer available for the predict...
Parallel(n_jobs=self.n_jobs, backend='threading')(
delayed(func)(c, *args, **kwargs) for c in self.classes_
)
)
@property
def classes_(self):
return sorted(self.dict_binary_quantifiers.keys())
def set_params(self, **parameters):
self.binary_quantifier.set_params(**parameters)
def get_params(self, deep=True):
return self.binary_quantifier.get_params()
def _delayed_binary_classification(self, c, X): def _delayed_binary_classification(self, c, X):
return self.dict_binary_quantifiers[c].classify(X) return self.dict_binary_quantifiers[c].classify(X)
@ -1328,7 +1303,3 @@ class OneVsAll(AggregativeQuantifier):
# the estimation for the positive class prevalence # the estimation for the positive class prevalence
return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1] return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1]
def _delayed_binary_fit(self, c, data):
bindata = LabelledCollection(data.instances, data.labels == c, classes_=[False, True])
self.dict_binary_quantifiers[c].fit(bindata)

View File

@ -1,10 +1,12 @@
from abc import ABCMeta, abstractmethod from abc import ABCMeta, abstractmethod
from copy import deepcopy from copy import deepcopy
from joblib import Parallel, delayed
from sklearn.base import BaseEstimator from sklearn.base import BaseEstimator
import quapy as qp import quapy as qp
from quapy.data import LabelledCollection from quapy.data import LabelledCollection
import numpy as np
# Base Quantifier abstract class # Base Quantifier abstract class
@ -48,50 +50,61 @@ class BinaryQuantifier(BaseQuantifier):
f'Use the class OneVsAll to enable {quantifier_name} work on single-label data.' f'Use the class OneVsAll to enable {quantifier_name} work on single-label data.'
class OneVsAllGeneric: class OneVsAll:
pass
def getOneVsAll(binary_quantifier, n_jobs=None, parallel_backend='loky'):
assert isinstance(binary_quantifier, BaseQuantifier), \
f'{binary_quantifier} does not seem to be a Quantifier'
if isinstance(binary_quantifier, qp.method.aggregative.AggregativeQuantifier):
return qp.method.aggregative.OneVsAllAggregative(binary_quantifier, n_jobs, parallel_backend)
else:
return OneVsAllGeneric(binary_quantifier, n_jobs, parallel_backend)
class OneVsAllGeneric(OneVsAll,BaseQuantifier):
""" """
Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary Allows any binary quantifier to perform quantification on single-label datasets. The method maintains one binary
quantifier for each class, and then l1-normalizes the outputs so that the class prevelence values sum up to 1. quantifier for each class, and then l1-normalizes the outputs so that the class prevelence values sum up to 1.
""" """
def __init__(self, binary_quantifier, n_jobs=None): def __init__(self, binary_quantifier, n_jobs=None, parallel_backend='loky'):
assert isinstance(binary_quantifier, BaseQuantifier), \ assert isinstance(binary_quantifier, BaseQuantifier), \
f'{binary_quantifier} does not seem to be a Quantifier' f'{binary_quantifier} does not seem to be a Quantifier'
if isinstance(binary_quantifier, qp.method.aggregative.AggregativeQuantifier):
print('[warning] the quantifier seems to be an instance of qp.method.aggregative.AggregativeQuantifier; '
f'you might prefer instantiating {qp.method.aggregative.OneVsAllAggregative.__name__}')
self.binary_quantifier = binary_quantifier self.binary_quantifier = binary_quantifier
self.n_jobs = qp._get_njobs(n_jobs) self.n_jobs = qp._get_njobs(n_jobs)
self.parallel_backend = parallel_backend
def fit(self, data: LabelledCollection, **kwargs): def fit(self, data: LabelledCollection, fit_classifier=True):
assert not data.binary, \ assert not data.binary, f'{self.__class__.__name__} expect non-binary data'
f'{self.__class__.__name__} expect non-binary data' assert fit_classifier == True, 'fit_classifier must be True'
self.class_quatifier = {c: deepcopy(self.binary_quantifier) for c in data.classes_}
Parallel(n_jobs=self.n_jobs, backend='threading')( self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_}
delayed(self._delayed_binary_fit)(c, self.class_quatifier, data, **kwargs) for c in data.classes_ self._parallel(self._delayed_binary_fit, data)
)
return self return self
def quantify(self, X, *args): def _parallel(self, func, *args, **kwargs):
prevalences = np.asarray( return np.asarray(
Parallel(n_jobs=self.n_jobs, backend='threading')( Parallel(n_jobs=self.n_jobs, backend=self.parallel_backend)(
delayed(self._delayed_binary_predict)(c, self.class_quatifier, X) for c in self.classes delayed(func)(c, *args, **kwargs) for c in self.classes_
) )
) )
return F.normalize_prevalence(prevalences)
def quantify(self, instances):
prevalences = self._parallel(self._delayed_binary_predict, instances)
return qp.functional.normalize_prevalence(prevalences)
@property @property
def classes(self): def classes_(self):
return sorted(self.class_quatifier.keys()) return sorted(self.dict_binary_quantifiers.keys())
def set_params(self, **parameters):
self.binary_quantifier.set_params(**parameters)
def get_params(self, deep=True):
return self.binary_quantifier.get_params()
def _delayed_binary_predict(self, c, quantifiers, X):
return quantifiers[c].quantify(X)[:, 1] # the mean is the estimation for the positive class prevalence
def _delayed_binary_fit(self, c, quantifiers, data, **kwargs):
bindata = LabelledCollection(data.instances, data.labels == c, n_classes=2)
quantifiers[c].fit(bindata, **kwargs)
def _delayed_binary_predict(self, c, X):
return self.dict_binary_quantifiers[c].quantify(X)[1]
def _delayed_binary_fit(self, c, data):
bindata = LabelledCollection(data.instances, data.labels == c, classes=[False, True])
self.dict_binary_quantifiers[c].fit(bindata)

View File

@ -7,7 +7,7 @@ from quapy.protocol import APP, NPP, USimplexPP, DomainMixer, AbstractStochastic
def mock_labelled_collection(prefix=''): def mock_labelled_collection(prefix=''):
y = [0] * 250 + [1] * 250 + [2] * 250 + [3] * 250 y = [0] * 250 + [1] * 250 + [2] * 250 + [3] * 250
X = [prefix + str(i) + '-' + str(yi) for i, yi in enumerate(y)] X = [prefix + str(i) + '-' + str(yi) for i, yi in enumerate(y)]
return LabelledCollection(X, y, classes_=sorted(np.unique(y))) return LabelledCollection(X, y, classes=sorted(np.unique(y)))
def samples_to_str(protocol): def samples_to_str(protocol):

View File

@ -1,13 +1,14 @@
import unittest import unittest
import quapy as qp import quapy as qp
from quapy.data import LabelledCollection
from quapy.functional import strprev from quapy.functional import strprev
from sklearn.linear_model import LogisticRegression from sklearn.linear_model import LogisticRegression
from method.aggregative import PACC from quapy.method.aggregative import PACC
class MyTestCase(unittest.TestCase): class MyTestCase(unittest.TestCase):
def test_replicability(self): def test_prediction_replicability(self):
dataset = qp.datasets.fetch_UCIDataset('yeast') dataset = qp.datasets.fetch_UCIDataset('yeast')
@ -25,6 +26,53 @@ class MyTestCase(unittest.TestCase):
self.assertEqual(str_prev1, str_prev2) # add assertion here self.assertEqual(str_prev1, str_prev2) # add assertion here
def test_samping_replicability(self):
import numpy as np
def equal_collections(c1, c2, value=True):
self.assertEqual(np.all(c1.X == c2.X), value)
self.assertEqual(np.all(c1.y == c2.y), value)
if value:
self.assertEqual(np.all(c1.classes_ == c2.classes_), value)
X = list(map(str, range(100)))
y = np.random.randint(0, 2, 100)
data = LabelledCollection(instances=X, labels=y)
sample1 = data.sampling(50)
sample2 = data.sampling(50)
equal_collections(sample1, sample2, False)
sample1 = data.sampling(50, random_state=0)
sample2 = data.sampling(50, random_state=0)
equal_collections(sample1, sample2, True)
sample1 = data.sampling(50, *[0.7, 0.3], random_state=0)
sample2 = data.sampling(50, *[0.7, 0.3], random_state=0)
equal_collections(sample1, sample2, True)
with qp.util.temp_seed(0):
sample1 = data.sampling(50, *[0.7, 0.3])
with qp.util.temp_seed(0):
sample2 = data.sampling(50, *[0.7, 0.3])
equal_collections(sample1, sample2, True)
sample1 = data.sampling(50, *[0.7, 0.3], random_state=0)
sample2 = data.sampling(50, *[0.7, 0.3], random_state=0)
equal_collections(sample1, sample2, True)
sample1_tr, sample1_te = data.split_stratified(train_prop=0.7, random_state=0)
sample2_tr, sample2_te = data.split_stratified(train_prop=0.7, random_state=0)
equal_collections(sample1_tr, sample2_tr, True)
equal_collections(sample1_te, sample2_te, True)
with qp.util.temp_seed(0):
sample1_tr, sample1_te = data.split_stratified(train_prop=0.7)
with qp.util.temp_seed(0):
sample2_tr, sample2_te = data.split_stratified(train_prop=0.7)
equal_collections(sample1_tr, sample2_tr, True)
equal_collections(sample1_te, sample2_te, True)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View File

@ -73,6 +73,7 @@ def temp_seed(random_state):
:param random_state: the seed to set within the "with" context :param random_state: the seed to set within the "with" context
""" """
if random_state is not None:
state = np.random.get_state() state = np.random.get_state()
#save the seed just in case is needed (for instance for setting the seed to child processes) #save the seed just in case is needed (for instance for setting the seed to child processes)
qp.environ['_R_SEED'] = random_state qp.environ['_R_SEED'] = random_state
@ -80,6 +81,7 @@ def temp_seed(random_state):
try: try:
yield yield
finally: finally:
if random_state is not None:
np.random.set_state(state) np.random.set_state(state)