working on SCMQ, MCSQ, MCMQ ensembles
This commit is contained in:
parent c8235ddb2a
commit 8876d311e7
@@ -0,0 +1,83 @@
+import quapy as qp
+import numpy as np
+from quapy.protocol import UPP
+from quapy.method.aggregative import KDEyML
+import quapy.functional as F
+from time import time
+
+"""
+Let's see one example:
+"""
+
+# load some data
+qp.environ['SAMPLE_SIZE'] = 100
+data = qp.datasets.fetch_UCIMulticlassDataset('molecular')
+training, test = data.train_test
+training, validation = training.split_stratified(train_prop=0.7, random_state=0)
+protocol = UPP(validation)
+
+hyper_C = np.logspace(-3, 3, 7)
+
+model = KDEyML()
+
+with qp.util.temp_seed(0):
+
+    param_grid = {
+        'classifier__C': hyper_C,
+        'bandwidth': np.linspace(0.01, 0.20, 20)  # [0.01, 0.02, 0.03, ..., 0.20]
+    }
+
+    model = qp.model_selection.GridSearchQ(
+        model=model,
+        param_grid=param_grid,
+        protocol=protocol,
+        error='mae',  # the error to optimize is the MAE (a quantification-oriented loss)
+        refit=False,  # do not retrain on the whole labelled set once done
+        n_jobs=-1,
+        verbose=True  # show information as the process goes on
+    ).fit(training)
+
+    best_params = model.best_params_
+    took = model.fit_time_
+    model = model.best_model_
+    print(f'model selection ended: best hyper-parameters={best_params}')
+
+    # evaluation in terms of MAE
+    # we use the same evaluation protocol (UPP) on the test set
+    mae_score = qp.evaluation.evaluate(model, protocol=UPP(test), error_metric='mae')
+
+    print(f'MAE={mae_score:.5f}')
+    print(f'model selection took {took:.1f}s')
+
+
+model = KDEyML(bandwidth='auto')
+
+with qp.util.temp_seed(0):
+
+    param_grid = {
+        'classifier__C': hyper_C,
+    }
+
+    model = qp.model_selection.GridSearchQ(
+        model=model,
+        param_grid=param_grid,
+        protocol=protocol,
+        error='mae',  # the error to optimize is the MAE (a quantification-oriented loss)
+        refit=False,  # do not retrain on the whole labelled set once done
+        n_jobs=-1,
+        verbose=True  # show information as the process goes on
+    ).fit(training)
+
+    best_params = model.best_params_
+    took = model.fit_time_
+    model = model.best_model_
+    bandwidth = model.bandwidth_val
+    print(f'model selection ended: best hyper-parameters={best_params} ({bandwidth=})')
+
+    # evaluation in terms of MAE
+    # we use the same evaluation protocol (UPP) on the test set
+    mae_score = qp.evaluation.evaluate(model, protocol=UPP(test), error_metric='mae')
+
+    print(f'MAE={mae_score:.5f}')
+    print(f'model selection took {took:.1f}s')
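For context: both model selection and evaluation in this new example rely on the UPP protocol, which repeatedly draws samples from a labelled pool at prevalence vectors chosen uniformly at random from the simplex. A minimal sketch of how a protocol instance is consumed (the iteration pattern mirrors the one used inside this commit's KDEy code):

    # iterating a UPP protocol yields (sample, true_prevalence) pairs;
    # the sample size defaults to qp.environ['SAMPLE_SIZE']
    prot = UPP(validation, repeats=5, random_state=0)
    for sample, true_prev in prot():
        print(true_prev)  # an n_classes-dimensional vector on the simplex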
@@ -1,10 +1,16 @@
+from sklearn.exceptions import ConvergenceWarning
 from sklearn.linear_model import LogisticRegression
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.neighbors import KNeighborsClassifier
 from statsmodels.sandbox.distributions.genpareto import quant

 import quapy as qp
 from quapy.protocol import UPP
 from quapy.method.aggregative import PACC, DMy, EMQ, KDEyML
-from quapy.method.meta import SCMQ
+from quapy.method.meta import SCMQ, MCMQ, MCSQ
+import warnings
+warnings.filterwarnings("ignore", category=DeprecationWarning)
+warnings.filterwarnings("ignore", category=ConvergenceWarning)

 qp.environ["SAMPLE_SIZE"]=100
@@ -32,5 +38,19 @@ scmq = SCMQ(classifier, quantifiers)

 train_and_test_model(scmq, train, test)

-for quantifier in quantifiers:
-    train_and_test_model(quantifier, train, test)
+# for quantifier in quantifiers:
+#     train_and_test_model(quantifier, train, test)
+
+classifiers = [
+    LogisticRegression(),
+    KNeighborsClassifier(),
+    # MultinomialNB()
+]
+
+mcmq = MCMQ(classifiers, quantifiers)
+
+train_and_test_model(mcmq, train, test)
+
+mcsq = MCSQ(classifiers, PACC())
+
+train_and_test_model(mcsq, train, test)
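The helper train_and_test_model is defined earlier in this script and falls outside the hunk. A hypothetical sketch of what such a helper might look like, using only calls that appear elsewhere in this commit (the body is an assumption, not the author's code):

    def train_and_test_model(model, train, test):
        # hypothetical helper: fit the quantifier, then report its test
        # error under the UPP evaluation protocol
        model.fit(train)
        mae = qp.evaluation.evaluate(model, protocol=UPP(test), error_metric='mae')
        print(f'{model.__class__.__name__}: MAE={mae:.4f}')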
@@ -416,7 +416,7 @@ def argmin_prevalence(loss: Callable,
     raise NotImplementedError()


-def optim_minimize(loss: Callable, n_classes: int):
+def optim_minimize(loss: Callable, n_classes: int, return_loss=False):
    """
    Searches for the optimal prevalence values, i.e., an `n_classes`-dimensional vector of the (`n_classes`-1)-simplex
    that yields the smallest loss. This optimization is carried out by means of a constrained search using scipy's
@@ -424,18 +424,24 @@ def optim_minimize(loss: Callable, n_classes: int):
    :param loss: (callable) the function to minimize
    :param n_classes: (int) the number of classes, i.e., the dimensionality of the prevalence vector
-    :return: (ndarray) the best prevalence vector found
+    :param return_loss: bool, if True, returns also the value of the loss (default is False)
+    :return: (ndarray) the best prevalence vector found, or a tuple which also contains the value of the loss
+        if return_loss=True
    """
    from scipy import optimize

    # the initial point is set as the uniform distribution
-    uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
+    uniform_distribution = uniform_prevalence(n_classes=n_classes)

    # solutions are bounded to those contained in the unit-simplex
    bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
    constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
    r = optimize.minimize(loss, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
-    return r.x
+    if return_loss:
+        return r.x, r.fun
+    else:
+        return r.x


 def linear_search(loss: Callable, n_classes: int):
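A usage sketch of the new return_loss flag; the loss function here is an arbitrary toy example, not taken from the library:

    import numpy as np
    import quapy.functional as F

    # toy loss: squared distance to a fixed target prevalence vector
    target = np.asarray([0.2, 0.3, 0.5])
    loss = lambda prev: np.sum((prev - target) ** 2)

    prev = F.optim_minimize(loss, n_classes=3)  # behaves as before
    prev, loss_val = F.optim_minimize(loss, n_classes=3, return_loss=True)  # new: also the attained loss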
@@ -1,5 +1,8 @@
 from typing import Union
 import numpy as np
+from scipy.optimize import optimize, minimize_scalar
+
+from quapy.protocol import UPP
 from sklearn.base import BaseEstimator
 from sklearn.neighbors import KernelDensity
@@ -111,16 +114,70 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
     :param random_state: a seed to be set before fitting any base quantifier (default None)
     """

-    def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, random_state=None):
+    def __init__(self, classifier: BaseEstimator=None, val_split=5, bandwidth=0.1, auto_reduction=500, auto_repeats=25, random_state=None):
         self.classifier = qp._get_classifier(classifier)
         self.val_split = val_split
-        self.bandwidth = KDEBase._check_bandwidth(bandwidth)
+        self.bandwidth = bandwidth
+        if bandwidth!='auto':
+            self.bandwidth = KDEBase._check_bandwidth(bandwidth)
+
+        assert auto_reduction is None or (isinstance(auto_reduction, int) and auto_reduction>0), \
+            (f'param {auto_reduction=} should either be None (no reduction) or a positive integer '
+             f'(number of training instances).')
+
+        self.auto_reduction = auto_reduction
+        self.auto_repeats = auto_repeats
         self.random_state=random_state

     def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
-        self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth)
+        if self.bandwidth == 'auto':
+            self.bandwidth_val = self.auto_bandwidth_likelihood(classif_predictions)
+        else:
+            self.bandwidth_val = self.bandwidth
+        self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth_val)
         return self

+    def auto_bandwidth_likelihood(self, classif_predictions: LabelledCollection):
+        train, val = classif_predictions.split_stratified(train_prop=0.5, random_state=self.random_state)
+        n_classes = classif_predictions.n_classes
+        epsilon = 1e-8
+        repeats = self.auto_repeats
+
+        auto_reduction = self.auto_reduction
+        if auto_reduction is None:
+            auto_reduction = len(classif_predictions)
+        else:
+            # reduce samples to speed up computation
+            train = train.sampling(auto_reduction)
+
+        prot = UPP(val, sample_size=auto_reduction, repeats=repeats, random_state=self.random_state)
+
+        def eval_bandwidth_nll(bandwidth):
+            mix_densities = self.get_mixture_components(*train.Xy, train.classes_, bandwidth)
+            loss_accum = 0
+            for (sample, prevtrue) in prot():
+                test_densities = [self.pdf(kde_i, sample) for kde_i in mix_densities]
+
+                def neg_loglikelihood_prev(prev):
+                    test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
+                    test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
+                    nll = -np.sum(test_loglikelihood)
+                    return nll
+
+                pred_prev, neglikelihood = F.optim_minimize(neg_loglikelihood_prev, n_classes=n_classes, return_loss=True)
+                loss_accum += neglikelihood
+            return loss_accum
+
+        r = minimize_scalar(eval_bandwidth_nll, bounds=(0.0001, 0.2), options={'xatol': 0.005})
+        best_band = r.x
+        best_loss_value = r.fun
+        nit = r.nit
+
+        # print(f'[{self.__class__.__name__}:autobandwidth] '
+        #       f'found bandwidth={best_band:.8f} after {nit=} iterations loss_val={best_loss_value:.5f})')
+
+        return best_band
+
     def aggregate(self, posteriors: np.ndarray):
         """
         Searches for the mixture model parameter (the sought prevalence values) that maximizes the likelihood
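In summary, bandwidth='auto' splits the cross-validated classifier predictions into two halves, fits the per-class KDEs on one half for each candidate bandwidth, scores the accumulated negative log-likelihood of UPP-drawn samples from the other half, and lets minimize_scalar search the bandwidth within (0.0001, 0.2). A usage sketch, with training data as in the example script above:

    model = KDEyML(bandwidth='auto', auto_reduction=500, auto_repeats=25)
    model.fit(training)
    print(model.bandwidth_val)  # the bandwidth selected at fit time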
@@ -693,14 +693,26 @@ def EEMQ(classifier, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
     return ensembleFactory(classifier, EMQ, param_grid, optim, param_mod_sel, **kwargs)


+def merge(prev_predictions, merge_fun):
+    prev_predictions = np.asarray(prev_predictions)
+    if merge_fun == 'median':
+        prevalences = np.median(prev_predictions, axis=0)
+        prevalences = F.normalize_prevalence(prevalences, method='l1')
+    elif merge_fun == 'mean':
+        prevalences = np.mean(prev_predictions, axis=0)
+    else:
+        raise NotImplementedError(f'merge function {merge_fun} not implemented!')
+    return prevalences
+
+
 class SCMQ(AggregativeSoftQuantifier):

-    MERGE_FUNCTIONS = ['median']
+    MERGE_FUNCTIONS = ['median', 'mean']

     def __init__(self, classifier, quantifiers: List[AggregativeSoftQuantifier], merge_fun='median', val_split=5):
         self.classifier = classifier
-        self.quantifiers = quantifiers
-        assert merge_fun in self.MERGE_FUNCTIONS, f'unknwon {merge_fun=}, valid ones are {self.MERGE_FUNCTIONS}'
+        self.quantifiers = [deepcopy(q) for q in quantifiers]
+        assert merge_fun in self.MERGE_FUNCTIONS, f'unknown {merge_fun=}, valid ones are {self.MERGE_FUNCTIONS}'
         self.merge_fun = merge_fun
         self.val_split = val_split
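One subtlety the module-level merge preserves from the old method: the component-wise median of prevalence vectors need not sum to 1, hence the L1 re-normalization after 'median', whereas the mean of vectors on the simplex always does. A quick check with made-up prevalence vectors:

    import numpy as np
    preds = np.asarray([[0.6, 0.2, 0.2],
                        [0.2, 0.6, 0.2],
                        [0.2, 0.2, 0.6]])
    print(np.median(preds, axis=0).sum())  # 0.6 -> needs re-normalization
    print(np.mean(preds, axis=0).sum())    # 1.0 exactly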
@@ -715,22 +727,51 @@ class SCMQ(AggregativeSoftQuantifier):
         for quantifier_i in self.quantifiers:
             prevalence_i = quantifier_i.aggregate(classif_predictions)
             prev_predictions.append(prevalence_i)
-        return self.merge(prev_predictions)
+        return merge(prev_predictions, merge_fun=self.merge_fun)

-    def merge(self, prev_predictions):
-        prev_predictions = np.asarray(prev_predictions)
-        if self.merge_fun == 'median':
-            prevalences = np.median(prev_predictions, axis=0)
-            prevalences = F.normalize_prevalence(prevalences, method='l1')
-        elif self.merge_fun == 'mean':
-            prevalences = np.mean(prev_predictions, axis=0)
-        else:
-            raise NotImplementedError(f'merge function {self.merge_fun} not implemented!')
-        return prevalences
+
+class MCSQ(BaseQuantifier):
+    def __init__(self, classifiers, quantifier: AggregativeSoftQuantifier, merge_fun='median', val_split=5):
+        self.merge_fun = merge_fun
+        self.val_split = val_split
+        self.mcsqs = []
+        for classifier in classifiers:
+            quantifier = deepcopy(quantifier)
+            quantifier.classifier = classifier
+            self.mcsqs.append(quantifier)
+
+    def fit(self, data: LabelledCollection):
+        for q in self.mcsqs:
+            q.fit(data, val_split=self.val_split)
+        return self
+
+    def quantify(self, instances):
+        prev_predictions = []
+        for q in self.mcsqs:
+            prevalence_i = q.quantify(instances)
+            prev_predictions.append(prevalence_i)
+        return merge(prev_predictions, merge_fun=self.merge_fun)
+
+
+class MCMQ(BaseQuantifier):
+    def __init__(self, classifiers, quantifiers: List[AggregativeSoftQuantifier], merge_fun='median', val_split=5):
+        self.merge_fun = merge_fun
+        self.scmqs = []
+        for classifier in classifiers:
+            self.scmqs.append(SCMQ(classifier, quantifiers, val_split=val_split))
+
+    def fit(self, data: LabelledCollection):
+        for q in self.scmqs:
+            q.fit(data)
+        return self
+
+    def quantify(self, instances):
+        prev_predictions = []
+        for q in self.scmqs:
+            prevalence_i = q.quantify(instances)
+            prev_predictions.append(prevalence_i)
+        return merge(prev_predictions, merge_fun=self.merge_fun)
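A usage sketch of the three ensemble flavours this commit works on, with constructor arguments as in the driver script above: SCMQ shares a single classifier across many quantifiers, MCSQ replicates a single quantifier over many classifiers, and MCMQ combines many classifiers, each wrapped in an SCMQ over many quantifiers.

    quantifiers = [PACC(), EMQ(), KDEyML()]
    classifiers = [LogisticRegression(), KNeighborsClassifier()]

    scmq = SCMQ(LogisticRegression(), quantifiers, merge_fun='median').fit(train)
    mcsq = MCSQ(classifiers, PACC(), merge_fun='median').fit(train)
    mcmq = MCMQ(classifiers, quantifiers, merge_fun='mean').fit(train)

    estim_prev = mcmq.quantify(test.X)  # a single, merged prevalence vector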
@@ -248,13 +248,13 @@ class GridSearchQ(BaseQuantifier):
             self.param_scores_[str(params)] = status.status
             self.error_collector.append(status)

-        tend = time()-tinit
+        self.fit_time_ = time()-tinit

         if self.best_score_ is None:
             raise ValueError('no combination of hyperparameters seemed to work')

         self._sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f}) '
-                   f'[took {tend:.4f}s]')
+                   f'[took {self.fit_time_:.4f}s]')

         no_errors = len(self.error_collector)
         if no_errors>0: