working on SCMQ, MCSQ, MCMQ ensembles
This commit is contained in:
parent c8235ddb2a
commit 8876d311e7
@@ -0,0 +1,83 @@
+import quapy as qp
+import numpy as np
+from quapy.protocol import UPP
+from quapy.method.aggregative import KDEyML
+import quapy.functional as F
+from time import time
+
+"""
+Let's see one example:
+"""
+
+# load some data
+qp.environ['SAMPLE_SIZE'] = 100
+data = qp.datasets.fetch_UCIMulticlassDataset('molecular')
+training, test = data.train_test
+training, validation = training.split_stratified(train_prop=0.7, random_state=0)
+protocol = UPP(validation)
+
+hyper_C = np.logspace(-3, 3, 7)
+
+model = KDEyML()
+
+with qp.util.temp_seed(0):
+
+    param_grid = {
+        'classifier__C': hyper_C,
+        'bandwidth': np.linspace(0.01, 0.20, 20)  # [0.01, 0.02, 0.03, ..., 0.20]
+    }
+
+    model = qp.model_selection.GridSearchQ(
+        model=model,
+        param_grid=param_grid,
+        protocol=protocol,
+        error='mae',  # the error to optimize is the MAE (a quantification-oriented loss)
+        refit=False,  # do not retrain on the whole labelled set once done
+        n_jobs=-1,
+        verbose=True  # show information as the process goes on
+    ).fit(training)
+
+    best_params = model.best_params_
+    took = model.fit_time_
+    model = model.best_model_
+    print(f'model selection ended: best hyper-parameters={best_params}')
+
+    # evaluation in terms of MAE
+    # we use the same evaluation protocol (UPP) on the test set
+    mae_score = qp.evaluation.evaluate(model, protocol=UPP(test), error_metric='mae')
+
+    print(f'MAE={mae_score:.5f}')
+    print(f'model selection took {took:.1f}s')
+
+
+model = KDEyML(bandwidth='auto')
+
+with qp.util.temp_seed(0):
+
+    param_grid = {
+        'classifier__C': hyper_C,
+    }
+
+    model = qp.model_selection.GridSearchQ(
+        model=model,
+        param_grid=param_grid,
+        protocol=protocol,
+        error='mae',  # the error to optimize is the MAE (a quantification-oriented loss)
+        refit=False,  # do not retrain on the whole labelled set once done
+        n_jobs=-1,
+        verbose=True  # show information as the process goes on
+    ).fit(training)
+
+    best_params = model.best_params_
+    took = model.fit_time_
+    model = model.best_model_
+    bandwidth = model.bandwidth_val
+    print(f'model selection ended: best hyper-parameters={best_params} ({bandwidth=})')
+
+    # evaluation in terms of MAE
+    # we use the same evaluation protocol (UPP) on the test set
+    mae_score = qp.evaluation.evaluate(model, protocol=UPP(test), error_metric='mae')
+
+    print(f'MAE={mae_score:.5f}')
+    print(f'model selection took {took:.1f}s')
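For context: both model selection and evaluation in this new example rely on the UPP protocol, which repeatedly draws samples from a labelled pool at prevalence vectors chosen uniformly at random from the simplex. A minimal sketch of how a protocol instance is consumed (the iteration pattern mirrors the one used inside this commit's KDEy code):

    # iterating a UPP protocol yields (sample, true_prevalence) pairs;
    # the sample size defaults to qp.environ['SAMPLE_SIZE']
    prot = UPP(validation, repeats=5, random_state=0)
    for sample, true_prev in prot():
        print(true_prev)  # an n_classes-dimensional vector on the simplex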
@@ -1,10 +1,16 @@
+from sklearn.exceptions import ConvergenceWarning
 from sklearn.linear_model import LogisticRegression
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.neighbors import KNeighborsClassifier
 from statsmodels.sandbox.distributions.genpareto import quant

 import quapy as qp
 from quapy.protocol import UPP
 from quapy.method.aggregative import PACC, DMy, EMQ, KDEyML
-from quapy.method.meta import SCMQ
+from quapy.method.meta import SCMQ, MCMQ, MCSQ
+import warnings
+warnings.filterwarnings("ignore", category=DeprecationWarning)
+warnings.filterwarnings("ignore", category=ConvergenceWarning)

 qp.environ["SAMPLE_SIZE"]=100
@@ -32,5 +38,19 @@ scmq = SCMQ(classifier, quantifiers)

 train_and_test_model(scmq, train, test)

-for quantifier in quantifiers:
-    train_and_test_model(quantifier, train, test)
+# for quantifier in quantifiers:
+#     train_and_test_model(quantifier, train, test)
+
+classifiers = [
+    LogisticRegression(),
+    KNeighborsClassifier(),
+    # MultinomialNB()
+]
+
+mcmq = MCMQ(classifiers, quantifiers)
+
+train_and_test_model(mcmq, train, test)
+
+mcsq = MCSQ(classifiers, PACC())
+
+train_and_test_model(mcsq, train, test)
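The helper train_and_test_model is defined earlier in this script and falls outside the hunk. A hypothetical sketch of what such a helper might look like, using only calls that appear elsewhere in this commit (the body is an assumption, not the author's code):

    def train_and_test_model(model, train, test):
        # hypothetical helper: fit the quantifier, then report its test
        # error under the UPP evaluation protocol
        model.fit(train)
        mae = qp.evaluation.evaluate(model, protocol=UPP(test), error_metric='mae')
        print(f'{model.__class__.__name__}: MAE={mae:.4f}')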
@@ -416,7 +416,7 @@ def argmin_prevalence(loss: Callable,
     raise NotImplementedError()


-def optim_minimize(loss: Callable, n_classes: int):
+def optim_minimize(loss: Callable, n_classes: int, return_loss=False):
    """
    Searches for the optimal prevalence values, i.e., an `n_classes`-dimensional vector of the (`n_classes`-1)-simplex
    that yields the smallest loss. This optimization is carried out by means of a constrained search using scipy's
@@ -424,18 +424,24 @@ def optim_minimize(loss: Callable, n_classes: int):
    :param loss: (callable) the function to minimize
    :param n_classes: (int) the number of classes, i.e., the dimensionality of the prevalence vector
-    :return: (ndarray) the best prevalence vector found
+    :param return_loss: bool, if True, returns also the value of the loss (default is False)
+    :return: (ndarray) the best prevalence vector found, or a tuple which also contains the value of the loss
+        if return_loss=True
    """
    from scipy import optimize

    # the initial point is set as the uniform distribution
-    uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
+    uniform_distribution = uniform_prevalence(n_classes=n_classes)

    # solutions are bounded to those contained in the unit-simplex
    bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
    constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
    r = optimize.minimize(loss, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
-    return r.x
+    if return_loss:
+        return r.x, r.fun
+    else:
+        return r.x


 def linear_search(loss: Callable, n_classes: int):
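A usage sketch of the new return_loss flag; the loss function here is an arbitrary toy example, not taken from the library:

    import numpy as np
    import quapy.functional as F

    # toy loss: squared distance to a fixed target prevalence vector
    target = np.asarray([0.2, 0.3, 0.5])
    loss = lambda prev: np.sum((prev - target) ** 2)

    prev = F.optim_minimize(loss, n_classes=3)  # behaves as before
    prev, loss_val = F.optim_minimize(loss, n_classes=3, return_loss=True)  # new: also the attained loss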
@@ -1,5 +1,8 @@
 from typing import Union
 import numpy as np
+from scipy.optimize import optimize, minimize_scalar
+
+from quapy.protocol import UPP
 from sklearn.base import BaseEstimator
 from sklearn.neighbors import KernelDensity
@@ -111,16 +114,70 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
     :param random_state: a seed to be set before fitting any base quantifier (default None)
     """

-    def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, random_state=None):
+    def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, auto_reduction=500, auto_repeats=25, random_state=None):
         self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split
-        self.bandwidth = KDEBase._check_bandwidth(bandwidth)
+        self.bandwidth = bandwidth
+        if bandwidth!='auto':
+            self.bandwidth = KDEBase._check_bandwidth(bandwidth)
+
+        assert auto_reduction is None or (isinstance(auto_reduction, int) and auto_reduction>0), \
+            (f'param {auto_reduction=} should either be None (no reduction) or a positive integer '
+             f'(number of training instances).')
+
+        self.auto_reduction = auto_reduction
+        self.auto_repeats = auto_repeats
         self.random_state=random_state

     def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
-        self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth)
+        if self.bandwidth == 'auto':
+            self.bandwidth_val = self.auto_bandwidth_likelihood(classif_predictions)
+        else:
+            self.bandwidth_val = self.bandwidth
+        self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth_val)
         return self

+    def auto_bandwidth_likelihood(self, classif_predictions: LabelledCollection):
+        train, val = classif_predictions.split_stratified(train_prop=0.5, random_state=self.random_state)
+        n_classes = classif_predictions.n_classes
+        epsilon = 1e-8
+        repeats = self.auto_repeats
+
+        auto_reduction = self.auto_reduction
+        if auto_reduction is None:
+            auto_reduction = len(classif_predictions)
+        else:
+            # reduce samples to speed up computation
+            train = train.sampling(auto_reduction)
+
+        prot = UPP(val, sample_size=auto_reduction, repeats=repeats, random_state=self.random_state)
+
+        def eval_bandwidth_nll(bandwidth):
+            mix_densities = self.get_mixture_components(*train.Xy, train.classes_, bandwidth)
+            loss_accum = 0
+            for (sample, prevtrue) in prot():
+                test_densities = [self.pdf(kde_i, sample) for kde_i in mix_densities]
+
+                def neg_loglikelihood_prev(prev):
+                    test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
+                    test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
+                    nll = -np.sum(test_loglikelihood)
+                    return nll
+
+                pred_prev, neglikelihood = F.optim_minimize(neg_loglikelihood_prev, n_classes=n_classes, return_loss=True)
+                loss_accum += neglikelihood
+            return loss_accum
+
+        r = minimize_scalar(eval_bandwidth_nll, bounds=(0.0001, 0.2), options={'xatol': 0.005})
+        best_band = r.x
+        best_loss_value = r.fun
+        nit = r.nit
+
+        # print(f'[{self.__class__.__name__}:autobandwidth] '
+        #       f'found bandwidth={best_band:.8f} after {nit=} iterations loss_val={best_loss_value:.5f})')
+
+        return best_band
+
     def aggregate(self, posteriors: np.ndarray):
         """
         Searches for the mixture model parameter (the sought prevalence values) that maximizes the likelihood
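In summary, bandwidth='auto' splits the cross-validated classifier predictions into two halves, fits the per-class KDEs on one half for each candidate bandwidth, scores the accumulated negative log-likelihood of UPP-drawn samples from the other half, and lets minimize_scalar search the bandwidth within (0.0001, 0.2). A usage sketch, with training data as in the example script above:

    model = KDEyML(bandwidth='auto', auto_reduction=500, auto_repeats=25)
    model.fit(training)
    print(model.bandwidth_val)  # the bandwidth selected at fit time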
@@ -693,14 +693,26 @@ def EEMQ(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
     return ensembleFactory(classifier, EMQ, param_grid, optim, param_mod_sel, **kwargs)


+def merge(prev_predictions, merge_fun):
+    prev_predictions = np.asarray(prev_predictions)
+    if merge_fun == 'median':
+        prevalences = np.median(prev_predictions, axis=0)
+        prevalences = F.normalize_prevalence(prevalences, method='l1')
+    elif merge_fun == 'mean':
+        prevalences = np.mean(prev_predictions, axis=0)
+    else:
+        raise NotImplementedError(f'merge function {merge_fun} not implemented!')
+    return prevalences
+
+
 class SCMQ(AggregativeSoftQuantifier):

-    MERGE_FUNCTIONS = ['median']
+    MERGE_FUNCTIONS = ['median', 'mean']

     def __init__(self, classifier, quantifiers: List[AggregativeSoftQuantifier], merge_fun='median', val_split=5):
         self.classifier = classifier
-        self.quantifiers = quantifiers
-        assert merge_fun in self.MERGE_FUNCTIONS, f'unknwon {merge_fun=}, valid ones are {self.MERGE_FUNCTIONS}'
+        self.quantifiers = [deepcopy(q) for q in quantifiers]
+        assert merge_fun in self.MERGE_FUNCTIONS, f'unknown {merge_fun=}, valid ones are {self.MERGE_FUNCTIONS}'
         self.merge_fun = merge_fun
         self.val_split = val_split
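One subtlety the module-level merge preserves from the old method: the component-wise median of prevalence vectors need not sum to 1, hence the L1 re-normalization after 'median', whereas the mean of vectors on the simplex always does. A quick check with made-up prevalence vectors:

    import numpy as np
    preds = np.asarray([[0.6, 0.2, 0.2],
                        [0.2, 0.6, 0.2],
                        [0.2, 0.2, 0.6]])
    print(np.median(preds, axis=0).sum())  # 0.6 -> needs re-normalization
    print(np.mean(preds, axis=0).sum())    # 1.0 exactly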
@@ -715,22 +727,51 @@ class SCMQ(AggregativeSoftQuantifier):
         for quantifier_i in self.quantifiers:
             prevalence_i = quantifier_i.aggregate(classif_predictions)
             prev_predictions.append(prevalence_i)
-        return self.merge(prev_predictions)
+        return merge(prev_predictions, merge_fun=self.merge_fun)

-    def merge(self, prev_predictions):
-        prev_predictions = np.asarray(prev_predictions)
-        if self.merge_fun == 'median':
-            prevalences = np.median(prev_predictions, axis=0)
-            prevalences = F.normalize_prevalence(prevalences, method='l1')
-        elif self.merge_fun == 'mean':
-            prevalences = np.mean(prev_predictions, axis=0)
-        else:
-            raise NotImplementedError(f'merge function {self.merge_fun} not implemented!')
-        return prevalences
+
+class MCSQ(BaseQuantifier):
+    def __init__(self, classifiers, quantifier: AggregativeSoftQuantifier, merge_fun='median', val_split=5):
+        self.merge_fun = merge_fun
+        self.val_split = val_split
+        self.mcsqs = []
+        for classifier in classifiers:
+            quantifier = deepcopy(quantifier)
+            quantifier.classifier = classifier
+            self.mcsqs.append(quantifier)
+
+    def fit(self, data: LabelledCollection):
+        for q in self.mcsqs:
+            q.fit(data, val_split=self.val_split)
+        return self
+
+    def quantify(self, instances):
+        prev_predictions = []
+        for q in self.mcsqs:
+            prevalence_i = q.quantify(instances)
+            prev_predictions.append(prevalence_i)
+        return merge(prev_predictions, merge_fun=self.merge_fun)
+
+
+class MCMQ(BaseQuantifier):
+    def __init__(self, classifiers, quantifiers: List[AggregativeSoftQuantifier], merge_fun='median', val_split=5):
+        self.merge_fun = merge_fun
+        self.scmqs = []
+        for classifier in classifiers:
+            self.scmqs.append(SCMQ(classifier, quantifiers, val_split=val_split))
+
+    def fit(self, data: LabelledCollection):
+        for q in self.scmqs:
+            q.fit(data)
+        return self
+
+    def quantify(self, instances):
+        prev_predictions = []
+        for q in self.scmqs:
+            prevalence_i = q.quantify(instances)
+            prev_predictions.append(prevalence_i)
+        return merge(prev_predictions, merge_fun=self.merge_fun)
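A usage sketch of the three ensemble flavours this commit works on, with constructor arguments as in the driver script above: SCMQ shares a single classifier across many quantifiers, MCSQ replicates a single quantifier over many classifiers, and MCMQ combines many classifiers, each wrapped in an SCMQ over many quantifiers.

    quantifiers = [PACC(), EMQ(), KDEyML()]
    classifiers = [LogisticRegression(), KNeighborsClassifier()]

    scmq = SCMQ(LogisticRegression(), quantifiers, merge_fun='median').fit(train)
    mcsq = MCSQ(classifiers, PACC(), merge_fun='median').fit(train)
    mcmq = MCMQ(classifiers, quantifiers, merge_fun='mean').fit(train)

    estim_prev = mcmq.quantify(test.X)  # a single, merged prevalence vector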
@@ -248,13 +248,13 @@ class GridSearchQ(BaseQuantifier):
             self.param_scores_[str(params)] = status.status
             self.error_collector.append(status)

-        tend = time()-tinit
+        self.fit_time_ = time()-tinit

         if self.best_score_ is None:
             raise ValueError('no combination of hyperparameters seemed to work')

         self._sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f}) '
-                   f'[took {tend:.4f}s]')
+                   f'[took {self.fit_time_:.4f}s]')

         no_errors = len(self.error_collector)
         if no_errors>0: