
added MedianEstimator quantifier

This commit is contained in:
Alejandro Moreo Fernandez 2023-11-09 14:20:41 +01:00
parent 66ad7295df
commit daca2bd1cb
5 changed files with 159 additions and 17 deletions

View File

@ -1,13 +1,18 @@
Change Log 0.1.8
----------------
- Added HDx and DistributionMatchingX to non-aggregative quantifiers (see also the new example "comparing_HDy_HDx.py")
- New UCI multiclass datasets added (thanks to Pablo González). The 5 UCI multiclass datasets are those corresponding
to the following criteria:
- >1000 instances
- >2 classes
- classification datasets
- Python API available
- Added NAE, NRAE - New IFCB (plankton) dataset added. See fetch_IFCB.
- Added new evaluation measures NAE, NRAE
- Added new meta method "MedianEstimator": an ensemble of binary base quantifiers that receives as input a dictionary
of hyperparameters, explores all combinations exhaustively (fitting the base quantifier and generating predictions
for each), and returns, as the prevalence estimate, the median across all predictions. A short usage sketch follows.
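A minimal usage sketch, mirroring the new unit tests added in this commit (the dataset, base quantifier, and
nbins grid are illustrative choices, not prescribed by the method):

import quapy as qp
from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import DistributionMatching
from quapy.method.meta import MedianEstimator

# binary sentiment dataset; any binary training/test pair works
train, test = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=10).train_test

# one DistributionMatching model is fitted per value of nbins; the reported prevalence
# is the element-wise median of their estimates
quantifier = MedianEstimator(DistributionMatching(LogisticRegression()),
                             param_grid={'nbins': [4, 8, 16]},
                             random_state=0, n_jobs=-1)
quantifier.fit(train)
estim_prevalence = quantifier.quantify(test.instances)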
Change Log 0.1.7
----------------

View File

@ -11,7 +11,7 @@ from . import util
from . import model_selection
from . import classification
__version__ = '0.1.7' __version__ = '0.1.8'
environ = {
'SAMPLE_SIZE': None,

View File

@ -1,7 +1,7 @@
from . import aggregative
from . import base
from . import meta from . import aggregative
from . import non_aggregative
from . import meta
AGGREGATIVE_METHODS = {
aggregative.CC,

View File

@ -1,3 +1,4 @@
import itertools
from copy import deepcopy
from typing import Union
import numpy as np
@ -10,13 +11,14 @@ import quapy as qp
from quapy import functional as F
from quapy.data import LabelledCollection
from quapy.model_selection import GridSearchQ
from quapy.method.base import BaseQuantifier, BinaryQuantifier
from quapy.method.aggregative import CC, ACC, PACC, HDy, EMQ
try:
from . import neural
except ModuleNotFoundError:
neural = None
from .base import BaseQuantifier
from quapy.method.aggregative import CC, ACC, PACC, HDy, EMQ
if neural:
QuaNet = neural.QuaNetTrainer
@ -24,6 +26,67 @@ else:
QuaNet = "QuaNet is not available due to missing torch package" QuaNet = "QuaNet is not available due to missing torch package"
class MedianEstimator(BinaryQuantifier):
"""
This method is a meta-quantifier that returns, as the estimated class prevalence values, the median of the
estimations returned by differently (hyper)parameterized base quantifiers.
The median of unit-vectors is only guaranteed to be a unit-vector for n=2 dimensions,
i.e., in cases of binary quantification.
:param base_quantifier: the base, binary quantifier
:param param_grid: the grid of hyperparameters over which the median will be computed
:param random_state: a seed to be set before fitting any base quantifier (default None)
:param n_jobs: number of parallel workers
def __init__(self, base_quantifier: BinaryQuantifier, param_grid: dict, random_state=None, n_jobs=None):
self.base_quantifier = base_quantifier
self.param_grid = param_grid
self.random_state = random_state
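# resolve the number of parallel workers via quapy's helper (quapy's environment default is used when None)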
self.n_jobs = qp._get_njobs(n_jobs)
def get_params(self, deep=True):
return self.base_quantifier.get_params(deep)
def set_params(self, **params):
self.base_quantifier.set_params(**params)
def _delayed_fit(self, args):
with qp.util.temp_seed(self.random_state):
params, training = args
model = deepcopy(self.base_quantifier)
model.set_params(**params)
model.fit(training)
return model
def fit(self, training: LabelledCollection):
self._check_binary(training, self.__class__.__name__)
params_keys = list(self.param_grid.keys())
params_values = list(self.param_grid.values())
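# expand the grid into a list of dicts, one per combination of hyperparameter values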
hyper = [dict({k: val[i] for i, k in enumerate(params_keys)}) for val in itertools.product(*params_values)]
self.models = qp.util.parallel(
self._delayed_fit,
((params, training) for params in hyper),
seed=qp.environ.get('_R_SEED', None),
n_jobs=self.n_jobs
)
return self
def _delayed_predict(self, args):
model, instances = args
return model.quantify(instances)
def quantify(self, instances):
prev_preds = qp.util.parallel(
self._delayed_predict,
((model, instances) for model in self.models),
seed=qp.environ.get('_R_SEED', None),
n_jobs=self.n_jobs
)
prev_preds = np.asarray(prev_preds)
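# element-wise median across the estimates produced by the differently parameterized models (axis 0 indexes models)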
return np.median(prev_preds, axis=0)
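# Illustrative check (hypothetical numbers) of the binary restriction noted in the docstring: for more than
# two classes, the element-wise median of valid prevalence vectors need not sum to 1, whereas for two classes
# it always does:
#   >>> np.median(np.asarray([[0.6, 0.3, 0.1], [0.2, 0.5, 0.3], [0.1, 0.2, 0.7]]), axis=0)
#   array([0.2, 0.3, 0.3])   # sums to 0.8, not a valid prevalence vector
#   >>> np.median(np.asarray([[0.6, 0.4], [0.2, 0.8], [0.1, 0.9]]), axis=0)
#   array([0.2, 0.8])        # still a valid prevalence vector, since median(1 - p) == 1 - median(p)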
class Ensemble(BaseQuantifier):
VALID_POLICIES = {'ave', 'ptr', 'ds'} | qp.error.QUANTIFICATION_ERROR_NAMES

View File

@ -1,14 +1,17 @@
import numpy import numpy as np
import pytest
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import quapy as qp
from quapy.model_selection import GridSearchQ
from quapy.method.base import BinaryQuantifier
from quapy.data import Dataset, LabelledCollection
from quapy.method import AGGREGATIVE_METHODS, NON_AGGREGATIVE_METHODS
from quapy.method.aggregative import ACC, PACC, HDy
from quapy.method.meta import Ensemble
from quapy.protocol import APP
from quapy.method.aggregative import DistributionMatching
from quapy.method.meta import MedianEstimator
datasets = [pytest.param(qp.datasets.fetch_twitter('hcr', pickle=True), id='hcr'),
pytest.param(qp.datasets.fetch_UCIDataset('ionosphere'), id='ionosphere')]
@ -36,7 +39,7 @@ def test_aggregative_methods(dataset: Dataset, aggregative_method, learner):
true_prevalences = dataset.test.prevalence()
error = qp.error.mae(true_prevalences, estim_prevalences)
assert type(error) == numpy.float64 assert type(error) == np.float64
@pytest.mark.parametrize('dataset', datasets)
@ -55,7 +58,7 @@ def test_non_aggregative_methods(dataset: Dataset, non_aggregative_method):
true_prevalences = dataset.test.prevalence()
error = qp.error.mae(true_prevalences, estim_prevalences)
assert type(error) == numpy.float64 assert type(error) == np.float64
@pytest.mark.parametrize('base_method', AGGREGATIVE_METHODS)
@ -80,7 +83,7 @@ def test_ensemble_method(base_method, learner, dataset: Dataset, policy):
true_prevalences = dataset.test.prevalence()
error = qp.error.mae(true_prevalences, estim_prevalences)
assert type(error) == numpy.float64 assert type(error) == np.float64
def test_quanet_method():
@ -119,7 +122,7 @@ def test_quanet_method():
true_prevalences = dataset.test.prevalence()
error = qp.error.mae(true_prevalences, estim_prevalences)
assert type(error) == numpy.float64 assert type(error) == np.float64
def test_str_label_names():
@ -130,32 +133,103 @@ def test_str_label_names():
dataset.test.sampling(1000, 0.25, 0.75))
qp.data.preprocessing.text2tfidf(dataset, min_df=5, inplace=True)
numpy.random.seed(0) np.random.seed(0)
model.fit(dataset.training)
int_estim_prevalences = model.quantify(dataset.test.instances)
true_prevalences = dataset.test.prevalence()
error = qp.error.mae(true_prevalences, int_estim_prevalences)
assert type(error) == numpy.float64 assert type(error) == np.float64
dataset_str = Dataset(LabelledCollection(dataset.training.instances,
['one' if label == 1 else 'zero' for label in dataset.training.labels]),
LabelledCollection(dataset.test.instances,
['one' if label == 1 else 'zero' for label in dataset.test.labels]))
assert all(dataset_str.training.classes_ == dataset_str.test.classes_), 'wrong indexation'
numpy.random.seed(0) np.random.seed(0)
model.fit(dataset_str.training)
str_estim_prevalences = model.quantify(dataset_str.test.instances)
true_prevalences = dataset_str.test.prevalence()
error = qp.error.mae(true_prevalences, str_estim_prevalences)
assert type(error) == numpy.float64 assert type(error) == np.float64
print(true_prevalences)
print(int_estim_prevalences)
print(str_estim_prevalences)
numpy.testing.assert_almost_equal(int_estim_prevalences[1], np.testing.assert_almost_equal(int_estim_prevalences[1],
str_estim_prevalences[list(model.classes_).index('one')])
# helper: fits the quantifier on train, evaluates it over APP samples drawn from test, and returns the MAE
# together with the per-sample prevalence estimates
def __fit_test(quantifier, train, test):
quantifier.fit(train)
test_samples = APP(test)
true_prevs, estim_prevs = qp.evaluation.prediction(quantifier, test_samples)
return qp.error.mae(true_prevs, estim_prevs), estim_prevs
def test_median_meta():
"""
This test compares the performance of the MedianQuantifier with respect to computing the median of the predictions
of a differently parameterized quantifier. We use the DistributionMatching base quantifier and the median is
computed across different values of nbins
"""
qp.environ['SAMPLE_SIZE'] = 100
# grid of values
nbins_grid = list(range(2, 11))
dataset = 'kindle'
train, test = qp.datasets.fetch_reviews(dataset, tfidf=True, min_df=10).train_test
prevs = []
errors = []
for nbins in nbins_grid:
with qp.util.temp_seed(0):
q = DistributionMatching(LogisticRegression(), nbins=nbins)
mae, estim_prevs = __fit_test(q, train, test)
prevs.append(estim_prevs)
errors.append(mae)
print(f'{dataset} DistributionMatching(nbins={nbins}) got MAE {mae:.4f}')
prevs = np.asarray(prevs)
mae = np.mean(errors)
print(f'\tMAE={mae:.4f}')
q = DistributionMatching(LogisticRegression())
q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1)
median_mae, prev = __fit_test(q, train, test)
print(f'\tMAE={median_mae:.4f}')
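# the meta-quantifier's estimates should coincide with the median computed externally over the nbins grid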
np.testing.assert_almost_equal(np.median(prevs, axis=0), prev)
assert median_mae < mae, 'the median-based quantifier provided a higher error...'
def test_median_meta_modsel():
"""
This test checks the MedianEstimator in combination with model selection (GridSearchQ over the classifier's hyperparameters)
"""
qp.environ['SAMPLE_SIZE'] = 100
dataset = 'kindle'
train, test = qp.datasets.fetch_reviews(dataset, tfidf=True, min_df=10).train_test
train, val = train.split_stratified(random_state=0)
nbins_grid = [2, 4, 5, 10, 15]
q = DistributionMatching(LogisticRegression())
q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1)
median_mae, _ = __fit_test(q, train, test)
print(f'\tMAE={median_mae:.4f}')
q = DistributionMatching(LogisticRegression())
lr_params = {'classifier__C': np.logspace(-1, 1, 3)}
q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1)
q = GridSearchQ(q, param_grid=lr_params, protocol=APP(val), n_jobs=-1)
optimized_median_ave, _ = __fit_test(q, train, test)
print(f'\tMAE={optimized_median_ave:.4f}')
assert optimized_median_ave < median_mae, "the optimized method yielded worse performance..."