forked from moreo/QuaPy
added DMx and DMy, each with a classmethod that returns HDx and HDy, respectively

parent daca2bd1cb
commit 29db15ae25
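For orientation (not part of the diff): the commit renames DistributionMatching to DMy, adds a covariate-space counterpart DMx, and exposes the classic HDy/HDx methods as classmethods of those two classes. A minimal usage sketch, assuming a LabelledCollection named `train` is available:

from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import DMy
from quapy.method.non_aggregative import DMx

hdy = DMy.HDy(LogisticRegression(), val_split=0.4, n_jobs=-1)   # HDy as a preset of DMy
hdx = DMx.HDx(n_jobs=-1)                                        # HDx as a preset of DMx
hdy.fit(train)   # both behave as ordinary quantifiers
hdx.fit(train)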
@@ -6,7 +6,7 @@ from tqdm import tqdm
 import quapy as qp
 from quapy.protocol import APP
 from quapy.method.aggregative import HDy
-from quapy.method.non_aggregative import HDx
+from quapy.method.non_aggregative import DMx


 """
@@ -42,7 +42,7 @@ for dataset_name in tqdm(qp.datasets.UCI_DATASETS, total=len(qp.datasets.UCI_DAT

     # HDx............................................
     tinit = time()
-    hdx = HDx().fit(train)
+    hdx = DMx.HDx(n_jobs=-1).fit(train)
     t_hdx_train = time() - tinit

     tinit = time()
@@ -12,7 +12,7 @@ quantifiers = [
     ('ACC', qp.method.aggregative.ACC(newLR())),
     ('PCC', qp.method.aggregative.PCC(newLR())),
     ('PACC', qp.method.aggregative.PACC(newLR())),
-    ('HDy', qp.method.aggregative.DistributionMatching(newLR())),
+    ('HDy', qp.method.aggregative.DMy(newLR())),
     ('EMQ', qp.method.aggregative.EMQ(newLR()))
 ]

@@ -1,6 +1,6 @@
 import quapy as qp
 from quapy.protocol import APP
-from quapy.method.aggregative import DistributionMatching
+from quapy.method.aggregative import DMy
 from sklearn.linear_model import LogisticRegression
 import numpy as np

@@ -8,7 +8,7 @@ import numpy as np
 In this example, we show how to perform model selection on a DistributionMatching quantifier.
 """

-model = DistributionMatching(LogisticRegression())
+model = DMy(LogisticRegression())

 qp.environ['SAMPLE_SIZE'] = 100
 qp.environ['N_JOBS'] = -1
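The model-selection example above now instantiates DMy directly; the rest of that file is not shown in this diff. A hedged sketch of what tuning such a quantifier with GridSearchQ typically looks like (the import path for GridSearchQ and the names `train` and `val` are assumptions, not part of the commit):

import numpy as np
import quapy as qp
from quapy.protocol import APP
from quapy.method.aggregative import DMy
from quapy.model_selection import GridSearchQ
from sklearn.linear_model import LogisticRegression

model = DMy(LogisticRegression())
param_grid = {'nbins': [4, 8, 16, 32], 'classifier__C': np.logspace(-1, 1, 3)}
model = GridSearchQ(model, param_grid=param_grid, protocol=APP(val), n_jobs=-1).fit(train)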
@@ -291,3 +291,57 @@ def get_divergence(divergence: Union[str, Callable]):
         return divergence
     else:
         raise ValueError(f'argument "divergence" not understood; use a str or a callable function')
+
+
+def argmin_prevalence(loss, n_classes, method='optim_minimize'):
+    if method == 'optim_minimize':
+        return optim_minimize(loss, n_classes)
+    elif method == 'linear_search':
+        return linear_search(loss, n_classes)
+    elif method == 'ternary_search':
+        raise NotImplementedError()
+    else:
+        raise NotImplementedError()
+
+
+def optim_minimize(loss, n_classes):
+    """
+    Searches for the optimal prevalence values, i.e., an `n_classes`-dimensional vector of the (`n_classes`-1)-simplex
+    that yields the smallest loss. This optimization is carried out by means of a constrained search using scipy's
+    SLSQP routine.
+
+    :param loss: (callable) the function to minimize
+    :param n_classes: (int) the number of classes, i.e., the dimensionality of the prevalence vector
+    :return: (ndarray) the best prevalence vector found
+    """
+    from scipy import optimize
+
+    # the initial point is set as the uniform distribution
+    uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
+
+    # solutions are bounded to those contained in the unit-simplex
+    bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
+    constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
+    r = optimize.minimize(loss, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
+    return r.x
+
+
+def linear_search(loss, n_classes):
+    """
+    Performs a linear search for the best prevalence value in binary problems. The search is carried out by exploring
+    the range [0,1] stepping by 0.01. This search is inefficient, and is added only for completeness (some of the
+    early methods in quantification literature used it, e.g., HDy). A more powerful alternative is `optim_minimize`.
+
+    :param loss: (callable) the function to minimize
+    :param n_classes: (int) the number of classes, i.e., the dimensionality of the prevalence vector
+    :return: (ndarray) the best prevalence vector found
+    """
+    assert n_classes == 2, 'linear search is only available for binary problems'
+
+    prev_selected, min_score = None, None
+    for prev in prevalence_linspace(n_prevalences=100, repeats=1, smooth_limits_epsilon=0.0):
+        score = loss(np.asarray([1 - prev, prev]))
+        if min_score is None or score < min_score:
+            prev_selected, min_score = prev, score
+    return np.asarray([1 - prev_selected, prev_selected])
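A toy illustration (not part of the commit) of how the new search helpers can be called; the loss here simply measures the distance to a fixed target prevalence:

import numpy as np
import quapy.functional as F

target = np.asarray([0.2, 0.8])
loss = lambda prev: np.abs(prev - target).sum()

F.argmin_prevalence(loss, n_classes=2, method='optim_minimize')   # SLSQP over the simplex, approx. [0.2, 0.8]
F.argmin_prevalence(loss, n_classes=2, method='linear_search')    # 0.01-step grid search, binary only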
@@ -568,10 +568,11 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
         self.Pxy0 = Px[validation.labels == self.classifier.classes_[0]]
         # pre-compute the histogram for positive and negative examples
         self.bins = np.linspace(10, 110, 11, dtype=int)  # [10, 20, 30, ..., 100, 110]
-        self.Pxy1_density = {bins: np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)[0] for bins in
-                             self.bins}
-        self.Pxy0_density = {bins: np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)[0] for bins in
-                             self.bins}
+        def hist(P, bins):
+            h = np.histogram(P, bins=bins, range=(0, 1), density=True)[0]
+            return h / h.sum()
+        self.Pxy1_density = {bins: hist(self.Pxy1, bins) for bins in self.bins}
+        self.Pxy0_density = {bins: hist(self.Pxy0, bins) for bins in self.bins}
         return self

     def aggregate(self, classif_posteriors):
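A detail the diff does not spell out: with density=True, np.histogram returns densities whose values sum to nbins over the (0, 1) range (they integrate to 1, but do not sum to 1), so the new h / h.sum() step is what turns each histogram into a proper probability mass function. A quick check (not part of the commit):

import numpy as np

P = np.random.rand(1000)                                     # stand-in for posterior probabilities
h = np.histogram(P, bins=10, range=(0, 1), density=True)[0]
print(h.sum())                                               # ~10 (= nbins), not 1
print((h / h.sum()).sum())                                   # 1.0, a proper distribution as HDy expects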
@@ -712,7 +713,7 @@ class SMM(AggregativeProbabilisticQuantifier, BinaryQuantifier):
         return np.asarray([1 - class1_prev, class1_prev])


-class DistributionMatching(AggregativeProbabilisticQuantifier):
+class DMy(AggregativeProbabilisticQuantifier):
     """
     Generic Distribution Matching quantifier for binary or multiclass quantification based on the space of posterior
     probabilities. This implementation takes the number of bins, the divergence, and the possibility to work on CDF
@@ -733,14 +734,24 @@ class DistributionMatching(AggregativeProbabilisticQuantifier):
     :param n_jobs: number of parallel workers (default None)
     """

-    def __init__(self, classifier, val_split=0.4, nbins=8, divergence: Union[str, Callable]='HD', cdf=False, n_jobs=None):
+    def __init__(self, classifier, val_split=0.4, nbins=8, divergence: Union[str, Callable]='HD',
+                 cdf=False, search='optim_minimize', n_jobs=None):
         self.classifier = classifier
         self.val_split = val_split
         self.nbins = nbins
         self.divergence = divergence
         self.cdf = cdf
+        self.search = search
         self.n_jobs = n_jobs

+    @classmethod
+    def HDy(cls, classifier, val_split=0.4, n_jobs=None):
+        from quapy.method.meta import MedianEstimator
+
+        hdy = DMy(classifier=classifier, val_split=val_split, search='linear_search', divergence='HD')
+        hdy = MedianEstimator(hdy, param_grid={'nbins': np.linspace(10, 110, 11).astype(int)}, n_jobs=n_jobs)
+        return hdy
+
     def __get_distributions(self, posteriors):
         histograms = []
         post_dims = posteriors.shape[1]
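What the new DMy.HDy classmethod amounts to, spelled out (this mirrors the code above; MedianEstimator fits one DMy per value of nbins and reports the median of the resulting prevalence estimates):

import numpy as np
from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import DMy
from quapy.method.meta import MedianEstimator

base = DMy(LogisticRegression(), val_split=0.4, divergence='HD', search='linear_search')
hdy = MedianEstimator(base, param_grid={'nbins': np.linspace(10, 110, 11).astype(int)}, n_jobs=-1)
# equivalent to: hdy = DMy.HDy(LogisticRegression(), val_split=0.4, n_jobs=-1)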
@@ -794,26 +805,20 @@ class DistributionMatching(AggregativeProbabilisticQuantifier):
         `n` channels (proper distributions of binned posterior probabilities), on which the divergence is computed
         independently. The matching is computed as an average of the divergence across all channels.

-        :param instances: instances in the sample
+        :param posteriors: posterior probabilities of the instances in the sample
         :return: a vector of class prevalence estimates
         """
         test_distribution = self.__get_distributions(posteriors)
         divergence = get_divergence(self.divergence)
         n_classes, n_channels, nbins = self.validation_distribution.shape
-        def match(prev):
+        def loss(prev):
             prev = np.expand_dims(prev, axis=0)
             mixture_distribution = (prev @ self.validation_distribution.reshape(n_classes,-1)).reshape(n_channels, -1)
             divs = [divergence(test_distribution[ch], mixture_distribution[ch]) for ch in range(n_channels)]
             return np.mean(divs)

-        # the initial point is set as the uniform distribution
-        uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
-
-        # solutions are bounded to those contained in the unit-simplex
-        bounds = tuple((0, 1) for x in range(n_classes))  # values in [0,1]
-        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
-        r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
-        return r.x
+        return F.argmin_prevalence(loss, n_classes, method=self.search)


 def newELM(svmperf_base=None, loss='01', C=1):
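The matching that `loss` computes, shown in isolation (toy shapes, not part of the commit; 3 classes, 3 channels, 8 bins):

import numpy as np

n_classes, n_channels, nbins = 3, 3, 8
validation_distribution = np.random.dirichlet(np.ones(nbins), size=(n_classes, n_channels))
prev = np.asarray([[0.5, 0.3, 0.2]])     # candidate prevalence vector, as a row

# mixture of the class-conditional distributions, recovered channel by channel
mixture = (prev @ validation_distribution.reshape(n_classes, -1)).reshape(n_channels, -1)
print(mixture.shape)                     # (3, 8): one mixed distribution per channel
print(mixture.sum(axis=1))               # each channel still sums to 1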
@@ -1215,17 +1220,6 @@ class MS2(MS):
         return np.median(tprs), np.median(fprs)


-ClassifyAndCount = CC
-AdjustedClassifyAndCount = ACC
-ProbabilisticClassifyAndCount = PCC
-ProbabilisticAdjustedClassifyAndCount = PACC
-ExpectationMaximizationQuantifier = EMQ
-SLD = EMQ
-HellingerDistanceY = HDy
-MedianSweep = MS
-MedianSweep2 = MS2
-
-
 class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier):
     """
     Allows any binary quantifier to perform quantification on single-label datasets.
@@ -1283,3 +1277,18 @@ class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier):
         # the estimation for the positive class prevalence
         return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1]

+
+#---------------------------------------------------------------
+# aliases
+#---------------------------------------------------------------
+
+ClassifyAndCount = CC
+AdjustedClassifyAndCount = ACC
+ProbabilisticClassifyAndCount = PCC
+ProbabilisticAdjustedClassifyAndCount = PACC
+ExpectationMaximizationQuantifier = EMQ
+DistributionMatchingY = DMy
+SLD = EMQ
+HellingerDistanceY = HDy
+MedianSweep = MS
+MedianSweep2 = MS2
@@ -1,7 +1,5 @@
 from typing import Union, Callable

 import numpy as np
-from scipy import optimize
-
 from functional import get_divergence
 from quapy.data import LabelledCollection
@@ -41,81 +39,7 @@ class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
         return self.estimated_prevalence


-class HDx(BinaryQuantifier):
-    """
-    `Hellinger Distance x <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ (HDx).
-    HDx is a method for training binary quantifiers, that models quantification as the problem of
-    minimizing the average divergence (in terms of the Hellinger Distance) across the feature-specific normalized
-    histograms of two representations, one for the unlabelled examples, and another generated from the training
-    examples as a mixture model of the class-specific representations. The parameters of the mixture thus represent
-    the estimates of the class prevalence values. The method computes all matchings for nbins in [10, 20, ..., 110]
-    and reports the mean of the median. The best prevalence is searched via linear search, from 0 to 1 steppy by 0.01.
-    """
-
-    def __init__(self):
-        self.feat_ranges = None
-
-    def covariate_histograms(self, X, nbins):
-        assert self.feat_ranges is not None, 'quantify called before fit'
-
-        histograms = []
-        for col_idx in range(self.nfeats):
-            feature = X[:,col_idx]
-            feat_range = self.feat_ranges[col_idx]
-            histograms.append(np.histogram(feature, bins=nbins, range=feat_range, density=True)[0])
-
-        return np.vstack(histograms).T
-
-    def fit(self, data: LabelledCollection):
-        """
-        Trains a HDx quantifier.
-
-        :param data: the training set
-        :return: self
-        """
-
-        self._check_binary(data, self.__class__.__name__)
-        X, y = data.Xy
-
-        self.nfeats = X.shape[1]
-        self.feat_ranges = _get_features_range(X)
-
-        # pre-compute the representation for positive and negative examples
-        self.bins = np.linspace(10, 110, 11, dtype=int)  # [10, 20, 30, ..., 100, 110]
-        self.H0 = {bins: self.covariate_histograms(X[y == 0], bins) for bins in self.bins}
-        self.H1 = {bins: self.covariate_histograms(X[y == 1], bins) for bins in self.bins}
-        return self
-
-    def quantify(self, X):
-        # "In this work, the number of bins b used in HDx and HDy was chosen from 10 to 110 in steps of 10,
-        # and the final estimated a priori probability was taken as the median of these 11 estimates."
-        # (González-Castro, et al., 2013).
-
-        assert X.shape[1] == self.nfeats, f'wrong shape in quantify; expected {self.nfeats}, found {X.shape[1]}'
-
-        prev_estimations = []
-        for nbins in self.bins:
-            Ht = self.covariate_histograms(X, nbins=nbins)
-            H0 = self.H0[nbins]
-            H1 = self.H1[nbins]
-
-            # the authors proposed to search for the prevalence yielding the best matching as a linear search
-            # at small steps (modern implementations resort to an optimization procedure)
-            prev_selected, min_dist = None, None
-            for prev in F.prevalence_linspace(n_prevalences=100, repeats=1, smooth_limits_epsilon=0.0):
-                Hx = prev * H1 + (1 - prev) * H0
-                hdx = np.mean([F.HellingerDistance(Hx[:,col], Ht[:,col]) for col in range(self.nfeats)])
-
-                if prev_selected is None or hdx < min_dist:
-                    prev_selected, min_dist = prev, hdx
-            prev_estimations.append(prev_selected)
-
-        class1_prev = np.median(prev_estimations)
-        return np.asarray([1 - class1_prev, class1_prev])
-
-
-class DistributionMatchingX(BaseQuantifier):
+class DMx(BaseQuantifier):
     """
     Generic Distribution Matching quantifier for binary or multiclass quantification based on the space of covariates.
     This implementation takes the number of bins, the divergence, and the possibility to work on CDF as hyperparameters.
@@ -128,22 +52,51 @@ class DistributionMatchingX(BaseQuantifier):
     :param n_jobs: number of parallel workers (default None)
     """

-    def __init__(self, nbins=8, divergence: Union[str, Callable]='HD', cdf=False, n_jobs=None):
+    def __init__(self, nbins=8, divergence: Union[str, Callable]='HD', cdf=False, search='optim_minimize', n_jobs=None):
         self.nbins = nbins
         self.divergence = divergence
         self.cdf = cdf
+        self.search = search
         self.n_jobs = n_jobs

+    @classmethod
+    def HDx(cls, n_jobs=None):
+        """
+        `Hellinger Distance x <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ (HDx).
+        HDx is a method for training binary quantifiers, that models quantification as the problem of
+        minimizing the average divergence (in terms of the Hellinger Distance) across the feature-specific normalized
+        histograms of two representations, one for the unlabelled examples, and another generated from the training
+        examples as a mixture model of the class-specific representations. The parameters of the mixture thus represent
+        the estimates of the class prevalence values.
+
+        The method computes all matchings for nbins in [10, 20, ..., 110] and reports the mean of the median.
+        The best prevalence is searched via linear search, from 0 to 1 stepping by 0.01.
+
+        :param n_jobs: number of parallel workers
+        :return: an instance of this class set up to mimic the performance of the HDx as originally proposed by
+            González-Castro, Alaiz-Rodríguez, Alegre (2013)
+        """
+        from quapy.method.meta import MedianEstimator
+
+        dmx = DMx(divergence='HD', cdf=False, search='linear_search')
+        nbins = {'nbins': np.linspace(10, 110, 11, dtype=int)}
+        hdx = MedianEstimator(base_quantifier=dmx, param_grid=nbins, n_jobs=n_jobs)
+        return hdx
+
     def __get_distributions(self, X):

         histograms = []
         for feat_idx in range(self.nfeats):
-            hist = np.histogram(X[:, feat_idx], bins=self.nbins, range=self.feat_ranges[feat_idx])[0]
-            normhist = hist / hist.sum()
-            histograms.append(normhist)
+            feature = X[:, feat_idx]
+            feat_range = self.feat_ranges[feat_idx]
+            hist = np.histogram(feature, bins=self.nbins, range=feat_range)[0]
+            norm_hist = hist / hist.sum()
+            histograms.append(norm_hist)
         distributions = np.vstack(histograms)

         if self.cdf:
             distributions = np.cumsum(distributions, axis=1)

         return distributions

     def fit(self, data: LabelledCollection):
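Standalone sketch of what the refactored __get_distributions computes: one normalized histogram per feature, optionally accumulated into a CDF (toy data, not part of the commit):

import numpy as np

X = np.random.randn(500, 4)                       # toy sample: 500 instances, 4 features
nbins, cdf = 8, False
feat_ranges = [(col.min(), col.max()) for col in X.T]

histograms = []
for feat_idx in range(X.shape[1]):
    hist = np.histogram(X[:, feat_idx], bins=nbins, range=feat_ranges[feat_idx])[0]
    histograms.append(hist / hist.sum())          # proper distribution per feature
distributions = np.vstack(histograms)             # shape (n_feats, nbins)
if cdf:
    distributions = np.cumsum(distributions, axis=1)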
@@ -184,20 +137,14 @@ class DistributionMatchingX(BaseQuantifier):
         test_distribution = self.__get_distributions(instances)
         divergence = get_divergence(self.divergence)
         n_classes, n_feats, nbins = self.validation_distribution.shape
-        def match(prev):
+        def loss(prev):
             prev = np.expand_dims(prev, axis=0)
             mixture_distribution = (prev @ self.validation_distribution.reshape(n_classes,-1)).reshape(n_feats, -1)
             divs = [divergence(test_distribution[feat], mixture_distribution[feat]) for feat in range(n_feats)]
             return np.mean(divs)

-        # the initial point is set as the uniform distribution
-        uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
-
-        # solutions are bounded to those contained in the unit-simplex
-        bounds = tuple((0, 1) for x in range(n_classes))  # values in [0,1]
-        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
-        r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
-        return r.x
+        return F.argmin_prevalence(loss, n_classes, method=self.search)


 def _get_features_range(X):
@@ -206,4 +153,11 @@ def _get_features_range(X):
     for col_idx in range(ncols):
         feature = X[:,col_idx]
         feat_ranges.append((np.min(feature), np.max(feature)))
     return feat_ranges
+
+
+#---------------------------------------------------------------
+# aliases
+#---------------------------------------------------------------
+
+DistributionMatchingX = DMx
@@ -10,7 +10,7 @@ from quapy.data import Dataset, LabelledCollection
 from quapy.method import AGGREGATIVE_METHODS, NON_AGGREGATIVE_METHODS
 from quapy.method.meta import Ensemble
 from quapy.protocol import APP
-from quapy.method.aggregative import DistributionMatching
+from quapy.method.aggregative import DMy
 from quapy.method.meta import MedianEstimator

 datasets = [pytest.param(qp.datasets.fetch_twitter('hcr', pickle=True), id='hcr'),
@@ -189,7 +189,7 @@ def test_median_meta():
     errors = []
     for nbins in nbins_grid:
         with qp.util.temp_seed(0):
-            q = DistributionMatching(LogisticRegression(), nbins=nbins)
+            q = DMy(LogisticRegression(), nbins=nbins)
             mae, estim_prevs = __fit_test(q, train, test)
             prevs.append(estim_prevs)
             errors.append(mae)
@@ -198,7 +198,7 @@ def test_median_meta():
     mae = np.mean(errors)
     print(f'\tMAE={mae:.4f}')

-    q = DistributionMatching(LogisticRegression())
+    q = DMy(LogisticRegression())
     q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1)
     median_mae, prev = __fit_test(q, train, test)
     print(f'\tMAE={median_mae:.4f}')
@@ -220,12 +220,12 @@ def test_median_meta_modsel():

     nbins_grid = [2, 4, 5, 10, 15]

-    q = DistributionMatching(LogisticRegression())
+    q = DMy(LogisticRegression())
     q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1)
     median_mae, _ = __fit_test(q, train, test)
     print(f'\tMAE={median_mae:.4f}')

-    q = DistributionMatching(LogisticRegression())
+    q = DMy(LogisticRegression())
     lr_params = {'classifier__C': np.logspace(-1, 1, 3)}
     q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1)
     q = GridSearchQ(q, param_grid=lr_params, protocol=APP(val), n_jobs=-1)