working on SCMQ, MCSQ, MCMQ ensembles

This commit is contained in:
Alejandro Moreo Fernandez 2024-12-02 17:39:06 +01:00
parent c8235ddb2a
commit 8876d311e7
6 changed files with 236 additions and 29 deletions

View File

@ -0,0 +1,83 @@
import quapy as qp
import numpy as np
from quapy.protocol import UPP
from quapy.method.aggregative import KDEyML
import quapy.functional as F
from time import time
"""
Let see one example:
"""
# load some data
qp.environ['SAMPLE_SIZE'] = 100
data = qp.datasets.fetch_UCIMulticlassDataset('molecular')
training, test = data.train_test
training, validation = training.split_stratified(train_prop=0.7, random_state=0)
protocol = UPP(validation)
hyper_C = np.logspace(-3, 3, 7)
model = KDEyML()
with qp.util.temp_seed(0):
param_grid = {
'classifier__C': hyper_C,
'bandwidth': np.linspace(0.01, 0.20, 20) # [0.01, 0.02, 0.03, ..., 0.20]
}
model = qp.model_selection.GridSearchQ(
model=model,
param_grid=param_grid,
protocol=protocol,
error='mae', # the error to optimize is the MAE (a quantification-oriented loss)
refit=False, # retrain on the whole labelled set once done
n_jobs=-1,
verbose=True # show information as the process goes on
).fit(training)
best_params = model.best_params_
took = model.fit_time_
model = model.best_model_
print(f'model selection ended: best hyper-parameters={best_params}')
# evaluation in terms of MAE
# we use the same evaluation protocol (APP) on the test set
mae_score = qp.evaluation.evaluate(model, protocol=UPP(test), error_metric='mae')
print(f'MAE={mae_score:.5f}')
print(f'model selection took {took:.1f}s')
model = KDEyML(bandwidth='auto')
with qp.util.temp_seed(0):
param_grid = {
'classifier__C': hyper_C,
}
model = qp.model_selection.GridSearchQ(
model=model,
param_grid=param_grid,
protocol=protocol,
error='mae', # the error to optimize is the MAE (a quantification-oriented loss)
refit=False, # retrain on the whole labelled set once done
n_jobs=-1,
verbose=True # show information as the process goes on
).fit(training)
best_params = model.best_params_
took = model.fit_time_
model = model.best_model_
bandwidth = model.bandwidth_val
print(f'model selection ended: best hyper-parameters={best_params} ({bandwidth=})')
# evaluation in terms of MAE
# we use the same evaluation protocol (APP) on the test set
mae_score = qp.evaluation.evaluate(model, protocol=UPP(test), error_metric='mae')
print(f'MAE={mae_score:.5f}')
print(f'model selection took {took:.1f}s')

View File

@ -1,10 +1,16 @@
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from statsmodels.sandbox.distributions.genpareto import quant
import quapy as qp
from quapy.protocol import UPP
from quapy.method.aggregative import PACC, DMy, EMQ, KDEyML
from quapy.method.meta import SCMQ
from quapy.method.meta import SCMQ, MCMQ, MCSQ
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
qp.environ["SAMPLE_SIZE"]=100
@ -32,5 +38,19 @@ scmq = SCMQ(classifier, quantifiers)
train_and_test_model(scmq, train, test)
for quantifier in quantifiers:
train_and_test_model(quantifier, train, test)
# for quantifier in quantifiers:
# train_and_test_model(quantifier, train, test)
classifiers = [
LogisticRegression(),
KNeighborsClassifier(),
# MultinomialNB()
]
mcmq = MCMQ(classifiers, quantifiers)
train_and_test_model(mcmq, train, test)
mcsq = MCSQ(classifiers, PACC())
train_and_test_model(mcsq, train, test)

View File

@ -416,7 +416,7 @@ def argmin_prevalence(loss: Callable,
raise NotImplementedError()
def optim_minimize(loss: Callable, n_classes: int):
def optim_minimize(loss: Callable, n_classes: int, return_loss=False):
"""
Searches for the optimal prevalence values, i.e., an `n_classes`-dimensional vector of the (`n_classes`-1)-simplex
that yields the smallest lost. This optimization is carried out by means of a constrained search using scipy's
@ -424,17 +424,23 @@ def optim_minimize(loss: Callable, n_classes: int):
:param loss: (callable) the function to minimize
:param n_classes: (int) the number of classes, i.e., the dimensionality of the prevalence vector
:return: (ndarray) the best prevalence vector found
:param return_loss: bool, if True, returns also the value of the loss (default is False).
:return: (ndarray) the best prevalence vector found or a tuple which also contains the value of the loss
if return_loss=True
"""
from scipy import optimize
# the initial point is set as the uniform distribution
uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
uniform_distribution = uniform_prevalence(n_classes=n_classes)
# solutions are bounded to those contained in the unit-simplex
bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1]
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
r = optimize.minimize(loss, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
if return_loss:
return r.x, r.fun
else:
return r.x

View File

@ -1,5 +1,8 @@
from typing import Union
import numpy as np
from scipy.optimize import optimize, minimize_scalar
from quapy.protocol import UPP
from sklearn.base import BaseEstimator
from sklearn.neighbors import KernelDensity
@ -111,16 +114,70 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
:param random_state: a seed to be set before fitting any base quantifier (default None)
"""
def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, random_state=None):
def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, auto_reduction=500, auto_repeats=25, random_state=None):
self.classifier = qp._get_classifier(classifier)
self.val_split = val_split
self.bandwidth = bandwidth
if bandwidth!='auto':
self.bandwidth = KDEBase._check_bandwidth(bandwidth)
assert auto_reduction is None or (isinstance(auto_reduction, int) and auto_reduction>0), \
(f'param {auto_reduction=} should either be None (no reduction) or a positive integer '
f'(number of training instances).')
self.auto_reduction = auto_reduction
self.auto_repeats = auto_repeats
self.random_state=random_state
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth)
if self.bandwidth == 'auto':
self.bandwidth_val = self.auto_bandwidth_likelihood(classif_predictions)
else:
self.bandwidth_val = self.bandwidth
self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth_val)
return self
def auto_bandwidth_likelihood(self, classif_predictions: LabelledCollection):
train, val = classif_predictions.split_stratified(train_prop=0.5, random_state=self.random_state)
n_classes = classif_predictions.n_classes
epsilon = 1e-8
repeats = self.auto_repeats
auto_reduction = self.auto_reduction
if auto_reduction is None:
auto_reduction = len(classif_predictions)
else:
# reduce samples to speed up computation
train = train.sampling(auto_reduction)
prot = UPP(val, sample_size=auto_reduction, repeats=repeats, random_state=self.random_state)
def eval_bandwidth_nll(bandwidth):
mix_densities = self.get_mixture_components(*train.Xy, train.classes_, bandwidth)
loss_accum = 0
for (sample, prevtrue) in prot():
test_densities = [self.pdf(kde_i, sample) for kde_i in mix_densities]
def neg_loglikelihood_prev(prev):
test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
nll = -np.sum(test_loglikelihood)
return nll
pred_prev, neglikelihood = F.optim_minimize(neg_loglikelihood_prev, n_classes=n_classes, return_loss=True)
loss_accum += neglikelihood
return loss_accum
r = minimize_scalar(eval_bandwidth_nll, bounds=(0.0001, 0.2), options={'xatol': 0.005})
best_band = r.x
best_loss_value = r.fun
nit = r.nit
# print(f'[{self.__class__.__name__}:autobandwidth] '
# f'found bandwidth={best_band:.8f} after {nit=} iterations loss_val={best_loss_value:.5f})')
return best_band
def aggregate(self, posteriors: np.ndarray):
"""
Searches for the mixture model parameter (the sought prevalence values) that maximizes the likelihood

View File

@ -693,14 +693,26 @@ def EEMQ(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
return ensembleFactory(classifier, EMQ, param_grid, optim, param_mod_sel, **kwargs)
def merge(prev_predictions, merge_fun):
prev_predictions = np.asarray(prev_predictions)
if merge_fun == 'median':
prevalences = np.median(prev_predictions, axis=0)
prevalences = F.normalize_prevalence(prevalences, method='l1')
elif merge_fun == 'mean':
prevalences = np.mean(prev_predictions, axis=0)
else:
raise NotImplementedError(f'merge function {merge_fun} not implemented!')
return prevalences
class SCMQ(AggregativeSoftQuantifier):
MERGE_FUNCTIONS = ['median']
MERGE_FUNCTIONS = ['median', 'mean']
def __init__(self, classifier, quantifiers: List[AggregativeSoftQuantifier], merge_fun='median', val_split=5):
self.classifier = classifier
self.quantifiers = quantifiers
assert merge_fun in self.MERGE_FUNCTIONS, f'unknwon {merge_fun=}, valid ones are {self.MERGE_FUNCTIONS}'
self.quantifiers = [deepcopy(q) for q in quantifiers]
assert merge_fun in self.MERGE_FUNCTIONS, f'unknown {merge_fun=}, valid ones are {self.MERGE_FUNCTIONS}'
self.merge_fun = merge_fun
self.val_split = val_split
@ -715,22 +727,51 @@ class SCMQ(AggregativeSoftQuantifier):
for quantifier_i in self.quantifiers:
prevalence_i = quantifier_i.aggregate(classif_predictions)
prev_predictions.append(prevalence_i)
return self.merge(prev_predictions)
def merge(self, prev_predictions):
prev_predictions = np.asarray(prev_predictions)
if self.merge_fun == 'median':
prevalences = np.median(prev_predictions, axis=0)
prevalences = F.normalize_prevalence(prevalences, method='l1')
elif self.merge_fun == 'mean':
prevalences = np.mean(prev_predictions, axis=0)
else:
raise NotImplementedError(f'merge function {self.merge_fun} not implemented!')
return prevalences
return merge(prev_predictions, merge_fun=self.merge_fun)
class MCSQ(BaseQuantifier):
def __init__(self, classifiers, quantifier: AggregativeSoftQuantifier, merge_fun='median', val_split=5):
self.merge_fun = merge_fun
self.val_split = val_split
self.mcsqs = []
for classifier in classifiers:
quantifier = deepcopy(quantifier)
quantifier.classifier = classifier
self.mcsqs.append(quantifier)
def fit(self, data: LabelledCollection):
for q in self.mcsqs:
q.fit(data, val_split=self.val_split)
return self
def quantify(self, instances):
prev_predictions = []
for q in self.mcsqs:
prevalence_i = q.quantify(instances)
prev_predictions.append(prevalence_i)
return merge(prev_predictions, merge_fun=self.merge_fun)
class MCMQ(BaseQuantifier):
def __init__(self, classifiers, quantifiers: List[AggregativeSoftQuantifier], merge_fun='median', val_split=5):
self.merge_fun = merge_fun
self.scmqs = []
for classifier in classifiers:
self.scmqs.append(SCMQ(classifier, quantifiers, val_split=val_split))
def fit(self, data: LabelledCollection):
for q in self.scmqs:
q.fit(data)
return self
def quantify(self, instances):
prev_predictions = []
for q in self.scmqs:
prevalence_i = q.quantify(instances)
prev_predictions.append(prevalence_i)
return merge(prev_predictions, merge_fun=self.merge_fun)

View File

@ -248,13 +248,13 @@ class GridSearchQ(BaseQuantifier):
self.param_scores_[str(params)] = status.status
self.error_collector.append(status)
tend = time()-tinit
self.fit_time_ = time()-tinit
if self.best_score_ is None:
raise ValueError('no combination of hyperparameters seemed to work')
self._sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f}) '
f'[took {tend:.4f}s]')
f'[took {self.fit_time_:.4f}s]')
no_errors = len(self.error_collector)
if no_errors>0: