working on SCMQ, MCSQ, MCMQ ensembles
parent c8235ddb2a · commit 8876d311e7
@ -0,0 +1,83 @@
import quapy as qp
import numpy as np
from quapy.protocol import UPP
from quapy.method.aggregative import KDEyML
import quapy.functional as F
from time import time

"""
Let's see one example:
"""

# load some data
qp.environ['SAMPLE_SIZE'] = 100
data = qp.datasets.fetch_UCIMulticlassDataset('molecular')
training, test = data.train_test
training, validation = training.split_stratified(train_prop=0.7, random_state=0)
protocol = UPP(validation)

hyper_C = np.logspace(-3, 3, 7)

model = KDEyML()

with qp.util.temp_seed(0):

    param_grid = {
        'classifier__C': hyper_C,
        'bandwidth': np.linspace(0.01, 0.20, 20)  # [0.01, 0.02, 0.03, ..., 0.20]
    }

    model = qp.model_selection.GridSearchQ(
        model=model,
        param_grid=param_grid,
        protocol=protocol,
        error='mae',  # the error to optimize is the MAE (a quantification-oriented loss)
        refit=False,  # do not retrain on the whole labelled set once model selection is done
        n_jobs=-1,
        verbose=True  # show information as the process goes on
    ).fit(training)

best_params = model.best_params_
took = model.fit_time_
model = model.best_model_
print(f'model selection ended: best hyper-parameters={best_params}')

# evaluation in terms of MAE
# we use the same evaluation protocol (UPP) on the test set
mae_score = qp.evaluation.evaluate(model, protocol=UPP(test), error_metric='mae')

print(f'MAE={mae_score:.5f}')
print(f'model selection took {took:.1f}s')

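# The same experiment, now letting KDEyML choose the bandwidth automatically (bandwidth='auto'),
# so the grid search only explores the classifier's regularization strength C.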
model = KDEyML(bandwidth='auto')

with qp.util.temp_seed(0):

    param_grid = {
        'classifier__C': hyper_C,
    }

    model = qp.model_selection.GridSearchQ(
        model=model,
        param_grid=param_grid,
        protocol=protocol,
        error='mae',  # the error to optimize is the MAE (a quantification-oriented loss)
        refit=False,  # do not retrain on the whole labelled set once model selection is done
        n_jobs=-1,
        verbose=True  # show information as the process goes on
    ).fit(training)

best_params = model.best_params_
took = model.fit_time_
model = model.best_model_
bandwidth = model.bandwidth_val
print(f'model selection ended: best hyper-parameters={best_params} ({bandwidth=})')

# evaluation in terms of MAE
# we use the same evaluation protocol (UPP) on the test set
mae_score = qp.evaluation.evaluate(model, protocol=UPP(test), error_metric='mae')

print(f'MAE={mae_score:.5f}')
print(f'model selection took {took:.1f}s')
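# Note: the first run cross-validates the bandwidth explicitly, whereas the second relies on the
# automatic bandwidth selection of KDEyML and only tunes C; both are evaluated with UPP on the test set.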
@ -1,10 +1,16 @@
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

import quapy as qp
from quapy.protocol import UPP
from quapy.method.aggregative import PACC, DMy, EMQ, KDEyML
-from quapy.method.meta import SCMQ
+from quapy.method.meta import SCMQ, MCMQ, MCSQ
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

qp.environ["SAMPLE_SIZE"] = 100
@ -32,5 +38,19 @@ scmq = SCMQ(classifier, quantifiers)
train_and_test_model(scmq, train, test)

-for quantifier in quantifiers:
-    train_and_test_model(quantifier, train, test)
+# for quantifier in quantifiers:
+#     train_and_test_model(quantifier, train, test)

classifiers = [
    LogisticRegression(),
    KNeighborsClassifier(),
    # MultinomialNB()
]

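# MCMQ combines multiple classifiers, each paired with all the quantifiers;
# MCSQ combines multiple classifiers that share a single quantifier (PACC here)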
mcmq = MCMQ(classifiers, quantifiers)

train_and_test_model(mcmq, train, test)

mcsq = MCSQ(classifiers, PACC())

train_and_test_model(mcsq, train, test)
@ -416,7 +416,7 @@ def argmin_prevalence(loss: Callable,
        raise NotImplementedError()


-def optim_minimize(loss: Callable, n_classes: int):
+def optim_minimize(loss: Callable, n_classes: int, return_loss=False):
    """
    Searches for the optimal prevalence values, i.e., an `n_classes`-dimensional vector of the (`n_classes`-1)-simplex
    that yields the smallest loss. This optimization is carried out by means of a constrained search using scipy's
@ -424,17 +424,23 @@ def optim_minimize(loss: Callable, n_classes: int):
    :param loss: (callable) the function to minimize
    :param n_classes: (int) the number of classes, i.e., the dimensionality of the prevalence vector
-    :return: (ndarray) the best prevalence vector found
+    :param return_loss: bool, if True, returns also the value of the loss (default is False).
+    :return: (ndarray) the best prevalence vector found or a tuple which also contains the value of the loss
+        if return_loss=True
    """
    from scipy import optimize

    # the initial point is set as the uniform distribution
-    uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
+    uniform_distribution = uniform_prevalence(n_classes=n_classes)

    # solutions are bounded to those contained in the unit-simplex
    bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
    constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
    r = optimize.minimize(loss, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)

    if return_loss:
        return r.x, r.fun
    else:
        return r.x

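A minimal usage sketch of the extended optim_minimize (illustrative only; the toy squared-error loss and the 3-class target are invented for the example):

    import numpy as np
    import quapy.functional as F

    target = np.asarray([0.2, 0.3, 0.5])
    loss = lambda prev: np.sum((prev - target) ** 2)          # any callable defined over a prevalence vector
    prev_hat = F.optim_minimize(loss, n_classes=3)            # ~[0.2, 0.3, 0.5]
    prev_hat, loss_val = F.optim_minimize(loss, n_classes=3, return_loss=True)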
@ -1,5 +1,8 @@
from typing import Union
import numpy as np
from scipy.optimize import optimize, minimize_scalar

from quapy.protocol import UPP
from sklearn.base import BaseEstimator
from sklearn.neighbors import KernelDensity

@ -111,16 +114,70 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
    :param random_state: a seed to be set before fitting any base quantifier (default None)
    :param auto_reduction: maximum number of training instances used when bandwidth='auto' (None to use all; default 500)
    :param auto_repeats: number of validation samples generated when bandwidth='auto' (default 25)
    """

-    def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, random_state=None):
+    def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, auto_reduction=500, auto_repeats=25, random_state=None):
        self.classifier = qp._get_classifier(classifier)
        self.val_split = val_split
        self.bandwidth = bandwidth
        if bandwidth != 'auto':
            self.bandwidth = KDEBase._check_bandwidth(bandwidth)

        assert auto_reduction is None or (isinstance(auto_reduction, int) and auto_reduction > 0), \
            (f'param {auto_reduction=} should either be None (no reduction) or a positive integer '
             f'(number of training instances).')

        self.auto_reduction = auto_reduction
        self.auto_repeats = auto_repeats
        self.random_state = random_state

    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
-        self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth)
+        if self.bandwidth == 'auto':
+            self.bandwidth_val = self.auto_bandwidth_likelihood(classif_predictions)
+        else:
+            self.bandwidth_val = self.bandwidth
+        self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth_val)
        return self

    def auto_bandwidth_likelihood(self, classif_predictions: LabelledCollection):
        train, val = classif_predictions.split_stratified(train_prop=0.5, random_state=self.random_state)
        n_classes = classif_predictions.n_classes
        epsilon = 1e-8
        repeats = self.auto_repeats

        auto_reduction = self.auto_reduction
        if auto_reduction is None:
            auto_reduction = len(classif_predictions)
        else:
            # reduce samples to speed up computation
            train = train.sampling(auto_reduction)

        prot = UPP(val, sample_size=auto_reduction, repeats=repeats, random_state=self.random_state)

        def eval_bandwidth_nll(bandwidth):
            mix_densities = self.get_mixture_components(*train.Xy, train.classes_, bandwidth)
            loss_accum = 0
            for (sample, prevtrue) in prot():
                test_densities = [self.pdf(kde_i, sample) for kde_i in mix_densities]

                def neg_loglikelihood_prev(prev):
                    test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
                    test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
                    nll = -np.sum(test_loglikelihood)
                    return nll

                pred_prev, neglikelihood = F.optim_minimize(neg_loglikelihood_prev, n_classes=n_classes, return_loss=True)
                loss_accum += neglikelihood
            return loss_accum

        r = minimize_scalar(eval_bandwidth_nll, bounds=(0.0001, 0.2), options={'xatol': 0.005})
        best_band = r.x
        best_loss_value = r.fun
        nit = r.nit

        # print(f'[{self.__class__.__name__}:autobandwidth] '
        #       f'found bandwidth={best_band:.8f} after {nit=} iterations loss_val={best_loss_value:.5f})')

        return best_band

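    # Illustrative usage sketch (not part of this commit): with bandwidth='auto', the bandwidth is
    # selected inside aggregation_fit by minimizing the validation NLL computed above, e.g.:
    #   model = KDEyML(LogisticRegression(), bandwidth='auto', auto_reduction=500, auto_repeats=25)
    #   model.fit(training)
    #   print(model.bandwidth_val)   # the bandwidth value actually used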
    def aggregate(self, posteriors: np.ndarray):
        """
        Searches for the mixture model parameter (the sought prevalence values) that maximizes the likelihood
@ -693,14 +693,26 @@ def EEMQ(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
    return ensembleFactory(classifier, EMQ, param_grid, optim, param_mod_sel, **kwargs)


def merge(prev_predictions, merge_fun):
    prev_predictions = np.asarray(prev_predictions)
    if merge_fun == 'median':
        prevalences = np.median(prev_predictions, axis=0)
        prevalences = F.normalize_prevalence(prevalences, method='l1')
    elif merge_fun == 'mean':
        prevalences = np.mean(prev_predictions, axis=0)
    else:
        raise NotImplementedError(f'merge function {merge_fun} not implemented!')
    return prevalences

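# Illustrative example (not part of this commit): merging three binary prevalence estimates
#   merge([[0.2, 0.8], [0.3, 0.7], [0.4, 0.6]], merge_fun='mean')    -> [0.3, 0.7]
#   merge([[0.2, 0.8], [0.3, 0.7], [0.4, 0.6]], merge_fun='median')  -> [0.3, 0.7] (after L1 normalization)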

class SCMQ(AggregativeSoftQuantifier):

-    MERGE_FUNCTIONS = ['median']
+    MERGE_FUNCTIONS = ['median', 'mean']

    def __init__(self, classifier, quantifiers: List[AggregativeSoftQuantifier], merge_fun='median', val_split=5):
        self.classifier = classifier
-        self.quantifiers = quantifiers
-        assert merge_fun in self.MERGE_FUNCTIONS, f'unknwon {merge_fun=}, valid ones are {self.MERGE_FUNCTIONS}'
+        self.quantifiers = [deepcopy(q) for q in quantifiers]
+        assert merge_fun in self.MERGE_FUNCTIONS, f'unknown {merge_fun=}, valid ones are {self.MERGE_FUNCTIONS}'
        self.merge_fun = merge_fun
        self.val_split = val_split

@ -715,22 +727,51 @@ class SCMQ(AggregativeSoftQuantifier):
        for quantifier_i in self.quantifiers:
            prevalence_i = quantifier_i.aggregate(classif_predictions)
            prev_predictions.append(prevalence_i)
-        return self.merge(prev_predictions)
-
-    def merge(self, prev_predictions):
-        prev_predictions = np.asarray(prev_predictions)
-        if self.merge_fun == 'median':
-            prevalences = np.median(prev_predictions, axis=0)
-            prevalences = F.normalize_prevalence(prevalences, method='l1')
-        elif self.merge_fun == 'mean':
-            prevalences = np.mean(prev_predictions, axis=0)
-        else:
-            raise NotImplementedError(f'merge function {self.merge_fun} not implemented!')
-        return prevalences
+        return merge(prev_predictions, merge_fun=self.merge_fun)


class MCSQ(BaseQuantifier):
    def __init__(self, classifiers, quantifier: AggregativeSoftQuantifier, merge_fun='median', val_split=5):
        self.merge_fun = merge_fun
        self.val_split = val_split
        self.mcsqs = []
        for classifier in classifiers:
            quantifier = deepcopy(quantifier)
            quantifier.classifier = classifier
            self.mcsqs.append(quantifier)

    def fit(self, data: LabelledCollection):
        for q in self.mcsqs:
            q.fit(data, val_split=self.val_split)
        return self

    def quantify(self, instances):
        prev_predictions = []
        for q in self.mcsqs:
            prevalence_i = q.quantify(instances)
            prev_predictions.append(prevalence_i)
        return merge(prev_predictions, merge_fun=self.merge_fun)


class MCMQ(BaseQuantifier):
    def __init__(self, classifiers, quantifiers: List[AggregativeSoftQuantifier], merge_fun='median', val_split=5):
        self.merge_fun = merge_fun
        self.scmqs = []
        for classifier in classifiers:
            self.scmqs.append(SCMQ(classifier, quantifiers, val_split=val_split))

    def fit(self, data: LabelledCollection):
        for q in self.scmqs:
            q.fit(data)
        return self

    def quantify(self, instances):
        prev_predictions = []
        for q in self.scmqs:
            prevalence_i = q.quantify(instances)
            prev_predictions.append(prevalence_i)
        return merge(prev_predictions, merge_fun=self.merge_fun)

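A minimal usage sketch of the three ensembles (illustrative only; `train`, `test`, the classifiers and the quantifiers are placeholders mirroring the example script above):

    lr, knn = LogisticRegression(), KNeighborsClassifier()
    quantifiers = [PACC(), EMQ(), KDEyML()]
    scmq = SCMQ(lr, quantifiers).fit(train)         # single classifier, multiple quantifiers
    mcsq = MCSQ([lr, knn], PACC()).fit(train)       # multiple classifiers, single quantifier
    mcmq = MCMQ([lr, knn], quantifiers).fit(train)  # multiple classifiers, multiple quantifiers
    estim_prev = mcmq.quantify(test.instances)      # median-merged prevalence estimate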
@ -248,13 +248,13 @@ class GridSearchQ(BaseQuantifier):
            self.param_scores_[str(params)] = status.status
            self.error_collector.append(status)

-        tend = time()-tinit
+        self.fit_time_ = time()-tinit

        if self.best_score_ is None:
            raise ValueError('no combination of hyperparameters seemed to work')

        self._sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f}) '
-                   f'[took {tend:.4f}s]')
+                   f'[took {self.fit_time_:.4f}s]')

        no_errors = len(self.error_collector)
        if no_errors > 0:
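With this change the elapsed model-selection time is stored on the fitted object, so callers can read it back after fit (an illustrative call, reusing the names from the example script above):

    model = qp.model_selection.GridSearchQ(model=model, param_grid=param_grid, protocol=protocol, error='mae').fit(training)
    print(f'model selection took {model.fit_time_:.1f}s')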