bugfix in protocols, return_type='index' not working

This commit is contained in:
Alejandro Moreo Fernandez 2024-11-19 16:00:03 +01:00
parent 24c28edfd9
commit e6ae1e7d77
5 changed files with 108 additions and 10 deletions

View File

@ -0,0 +1,3 @@
- Test the return_type="index" in protocols and finish the "distributing_samples.py" example
- Add EDy (an implementation is available at quantificationlib)
-

View File

@ -33,10 +33,8 @@ import quapy.functional as F # <- this module has some functional utilities, li
print(f'training prevalence = {F.strprev(train.prevalence())}')
# let us train one quantifier, for example, PACC using a sklearn's Logistic Regressor as the underlying classifier
# classifier = LogisticRegression()
# pacc = qp.method.aggregative.PACC(classifier)
pacc = qp.method.aggregative.PACC()
classifier = LogisticRegression()
pacc = qp.method.aggregative.PACC(classifier)
print(f'training {pacc}')
pacc.fit(train)

View File

@ -0,0 +1,38 @@
"""
Imagine we want to generate many samples out of a collection, and to distribute them so that others can run their
own experiments on the very same test samples. One naive solution would be to apply a given protocol to
our collection (say the artificial prevalence protocol on the 'academic-success' UCI dataset), store all those samples
on disk, and make them available online. Distributing many such samples is undesirable.
In this example, we generate the indexes that allow anyone to regenerate the samples out of the original collection.
"""
import quapy as qp
from quapy.method.aggregative import PACC
from quapy.protocol import UPP

data = qp.datasets.fetch_UCIMulticlassDataset('academic-success')
train, test = data.train_test

# let us train a quantifier to check whether we can actually replicate the results
quantifier = PACC()
quantifier.fit(train)

# let us simulate our experimental results
protocol = UPP(test, sample_size=100, repeats=100, random_state=0)
our_mae = qp.evaluation.evaluate(quantifier, protocol=protocol, error_metric='mae')
print(f'We have obtained a MAE={our_mae:.3f}')

# let us distribute the indexes; we specify that we want the indexes, not the samples
protocol = UPP(test, sample_size=100, repeats=100, random_state=0, return_type='index')
# The protocol has to be *called*: it is the call that applies random_state and honours return_type.
# Invoking samples_parameters() directly would bypass both, and the indexes would not be guaranteed
# to correspond to the samples evaluated above.
indexes = list(protocol())

# Imagine we distribute the indexes; now we show how to replicate our experiments.
from quapy.protocol import ProtocolFromIndex

data = qp.datasets.fetch_UCIMulticlassDataset('academic-success')
train, test = data.train_test
protocol = ProtocolFromIndex(data=test, indexes=indexes)
their_mae = qp.evaluation.evaluate(quantifier, protocol=protocol, error_metric='mae')
# report the replicated score (not ours), so that both values can actually be compared
print(f'Another lab obtains a MAE={their_mae:.3f}')

View File

@ -298,6 +298,31 @@ def nmd(prevs, prevs_hat):
return (1./(n-1))*np.mean(match_distance(prevs, prevs_hat))
def bias_binary(prevs, prevs_hat):
    """
    Computes the (positive) bias in a binary problem. The bias is simply the difference between the
    predicted positive value and the true positive value, so that a positive such value indicates the
    prediction has positive bias (i.e., it tends to overestimate) the true value, and negative otherwise.
    :math:`bias(p,\\hat{p})=\\hat{p}_1-p_1`,

    :param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values
    :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted
        prevalence values
    :return: binary bias, i.e., one signed value per sample (the last axis is consumed)
    :raises AssertionError: if the last axis of `prevs` or of `prevs_hat` does not have exactly 2 entries
    """
    # accept any array-like (lists, tuples), as the documentation promises
    prevs = np.asarray(prevs)
    prevs_hat = np.asarray(prevs_hat)
    # both inputs have to be binary: each one is checked on its own
    assert prevs.shape[-1] == 2 and prevs_hat.shape[-1] == 2, \
        'bias_binary can only be applied to binary problems'
    # column 1 holds the prevalence of the positive class
    return prevs_hat[..., 1] - prevs[..., 1]
def mean_bias_binary(prevs, prevs_hat):
    """
    Averages the (positive) bias of a binary problem, as computed by :meth:`bias_binary`, into a single value.

    :param prevs: array-like of shape `(n_samples, n_classes,)` with the true prevalence values
    :param prevs_hat: array-like of shape `(n_samples, n_classes,)` with the predicted prevalence values
    :return: mean binary bias
    """
    signed_bias = bias_binary(prevs, prevs_hat)
    return np.mean(signed_bias)
def md(prevs, prevs_hat, ERROR_TOL=1E-3):
"""
Computes the Match Distance, under the assumption that the cost in mistaking class i with class i+1 is 1 in
@ -338,8 +363,8 @@ def __check_eps(eps=None):
# Sets grouping the evaluation functions by kind
CLASSIFICATION_ERROR = {f1e, acce}
# NOTE(review): QUANTIFICATION_ERROR and QUANTIFICATION_ERROR_SINGLE are assigned twice in a row; only the
# second assignment of each (the one that also contains the binary-bias functions) takes effect. The first
# pair looks like a leftover from the previous revision -- confirm and remove.
QUANTIFICATION_ERROR = {mae, mnae, mrae, mnrae, mse, mkld, mnkld}
QUANTIFICATION_ERROR_SINGLE = {ae, nae, rae, nrae, se, kld, nkld}
QUANTIFICATION_ERROR = {mae, mnae, mrae, mnrae, mse, mkld, mnkld, mean_bias_binary}
QUANTIFICATION_ERROR_SINGLE = {ae, nae, rae, nrae, se, kld, nkld, bias_binary}
# presumably the measures that rely on a smoothing factor (eps) -- TODO confirm
QUANTIFICATION_ERROR_SMOOTH = {kld, nkld, rae, nrae, mkld, mnkld, mrae}
# the same groups, but holding the function names (`__name__`) instead of the functions themselves
CLASSIFICATION_ERROR_NAMES = {func.__name__ for func in CLASSIFICATION_ERROR}
QUANTIFICATION_ERROR_NAMES = {func.__name__ for func in QUANTIFICATION_ERROR}

View File

@ -1,4 +1,6 @@
from copy import deepcopy
from typing import Iterable
import quapy as qp
import numpy as np
import itertools
@ -62,6 +64,36 @@ class IterateProtocol(AbstractProtocol):
return len(self.samples)
class ProtocolFromIndex(AbstractProtocol):
    """
    A protocol from a list of indexes: every element of `indexes` is used to draw one sample from `data`,
    which allows regenerating a fixed set of samples out of the original collection.

    :param data: a :class:`quapy.data.base.LabelledCollection`
    :param indexes: a list of indexes (one index per sample); iterables that do not define a length
        (e.g., generators) are also accepted, and are converted to a list
    """

    def __init__(self, data: LabelledCollection, indexes: Iterable):
        self.data = data
        # total() relies on len() and the protocol may be called more than once, so a one-shot iterable
        # is materialized here; anything that already has a length is stored untouched
        self.indexes = indexes if hasattr(indexes, '__len__') else list(indexes)

    def __call__(self):
        """
        Yields one sample at a time extracted using the indexes

        :return: yields a tuple `(sample, prev)` at a time, where `sample` is a set of instances
            and in which `prev` is an `nd.array` with the class prevalence values
        """
        for index in self.indexes:
            yield self.data.sampling_from_index(index).Xp

    def total(self):
        """
        Returns the number of samples in this protocol

        :return: int
        """
        return len(self.indexes)
class AbstractStochasticSeededProtocol(AbstractProtocol):
"""
An `AbstractStochasticSeededProtocol` is a protocol that generates, via any random procedure (e.g.,
@ -124,9 +156,9 @@ class AbstractStochasticSeededProtocol(AbstractProtocol):
if self.random_state is not None:
stack.enter_context(qp.util.temp_seed(self.random_state))
for params in self.samples_parameters():
yield self.collator(self.sample(params))
yield self.collator(self.sample(params), params)
def collator(self, sample, *args):
def collator(self, sample, params):
"""
The collator prepares the sample to accommodate the desired output format before returning the output.
This collator simply returns the sample as it is. Classes inheriting from this abstract class can
@ -191,9 +223,11 @@ class OnLabelledCollectionProtocol:
assert return_type in cls.RETURN_TYPES, \
f'unknown return type passed as argument; valid ones are {cls.RETURN_TYPES}'
if return_type=='sample_prev':
return lambda lc:lc.Xp
return lambda lc,params:lc.Xp
elif return_type=='labelled_collection':
return lambda lc:lc
return lambda lc,params:lc
elif return_type=='index':
return lambda lc,params:params
class APP(AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol):