added HDx and an example comparing HDy vs HDx

2023-11-08 15:34:17 +01:00 · 2023-11-08 15:34:17 +01:00 · 76cf784844
parent 8a6579428b
commit 76cf784844
5 changed files with 166 additions and 6 deletions
--- a/examples/comparing_HDy_HDx.py
+++ b/examples/comparing_HDy_HDx.py
@ -0,0 +1,74 @@
+from sklearn.linear_model import LogisticRegression
+from time import time
+import pandas as pd
+from tqdm import tqdm
+
+import quapy as qp
+from quapy.protocol import APP
+from quapy.method.aggregative import HDy
+from quapy.method.non_aggregative import HDx
+
+
+"""
+This example is meant to experimentally compare HDy and HDx. 
+The implementations of these methods adhere to the original design of the methods; in particular, this means that
+the number of bins is not a hyperparameter, but is something that the method explores internally (returning the
+median of the estimates as the final prevalence prediction), and the prevalence is not searched through any
+numerical optimization procedure, but simply as a linear search between 0 and 1 in steps of 0.01.
+See `the original paper <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ for further details.
+"""
+
+qp.environ['SAMPLE_SIZE']=100
+
+
+df = pd.DataFrame(columns=('method', 'dataset', 'MAE', 'MRAE', 'tr-time', 'te-time'))
+
+
+for dataset_name in tqdm(qp.datasets.UCI_DATASETS, total=len(qp.datasets.UCI_DATASETS)):
+    if dataset_name in ['acute.a', 'acute.b', 'balance.2', 'iris.1']: continue
+
+    collection = qp.datasets.fetch_UCILabelledCollection(dataset_name, verbose=False)
+    train, test = collection.split_stratified()
+
+    # HDy............................................
+    tinit = time()
+    hdy = HDy(LogisticRegression()).fit(train)
+    t_hdy_train = time()-tinit
+
+    tinit = time()
+    hdy_report = qp.evaluation.evaluation_report(hdy, APP(test), error_metrics=['mae', 'mrae']).mean()
+    t_hdy_test = time() - tinit
+    df.loc[len(df)] = ['HDy', dataset_name, hdy_report['mae'], hdy_report['mrae'], t_hdy_train, t_hdy_test]
+
+    # HDx............................................
+    tinit = time()
+    hdx = HDx().fit(train)
+    t_hdx_train = time() - tinit
+
+    tinit = time()
+    hdx_report = qp.evaluation.evaluation_report(hdx, APP(test), error_metrics=['mae', 'mrae']).mean()
+    t_hdx_test = time() - tinit
+    df.loc[len(df)] = ['HDx', dataset_name, hdx_report['mae'], hdx_report['mrae'], t_hdx_train, t_hdx_test]
+
+# evaluation reports
+
+print('\n'*3)
+print('='*80)
+print('Comparison in terms of performance')
+print('='*80)
+pv = df.pivot_table(index='dataset', columns='method', values=['MAE', 'MRAE'])
+print(pv)
+print('\nAveraged values:')
+print(pv.mean())
+
+print('\n'*3)
+print('='*80)
+print('Comparison in terms of efficiency')
+print('='*80)
+pv = df.pivot_table(index='dataset', columns='method', values=['tr-time', 'te-time'])
+print(pv)
+print('\nAveraged values:')
+print(pv.mean())
+
+
+
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@ -369,7 +369,8 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
    elif verbose:
        print('no file description available')

-    print(f'Loading {dataset_name} ({fullname})')
+    if verbose:
+        print(f'Loading {dataset_name} ({fullname})')
    if identifier == 'acute':
        df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t')

@ -550,7 +551,8 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
        y = binarize(y, pos_class='NUC')

    data = LabelledCollection(X, y)
-    data.stats()
+    if verbose:
+        data.stats()
    return data


--- a/quapy/functional.py
+++ b/quapy/functional.py
@ -64,7 +64,7 @@ def prevalence_from_probabilities(posteriors, binarize: bool = False):
        return prevalences


-def HellingerDistance(P, Q):
+def HellingerDistance(P, Q) -> float:
    """
    Computes the Hellingher Distance (HD) between (discretized) distributions `P` and `Q`.
    The HD for two discrete distributions of `k` bins is defined as:
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@ -530,7 +530,7 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
    """
    `Hellinger Distance y <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ (HDy).
    HDy is a probabilistic method for training binary quantifiers, that models quantification as the problem of
-    minimizing the divergence (in terms of the Hellinger Distance) between two cumulative distributions of posterior
+    minimizing the divergence (in terms of the Hellinger Distance) between two distributions of posterior
    probabilities returned by the classifier. One of the distributions is generated from the unlabelled examples and
    the other is generated from a validation set. This latter distribution is defined as a mixture of the
    class-conditional distributions of the posterior probabilities returned for the positive and negative validation
@ -590,6 +590,9 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):

            Px_test, _ = np.histogram(Px, bins=bins, range=(0, 1), density=True)

+            # the authors proposed to search for the prevalence yielding the best matching as a linear search
+            # at small steps (modern implementations resort to an optimization procedure,
+            # see class DistributionMatching)
            prev_selected, min_dist = None, None
            for prev in F.prevalence_linspace(n_prevalences=100, repeats=1, smooth_limits_epsilon=0.0):
                Px_train = prev * Pxy1_density + (1 - prev) * Pxy0_density
--- a/quapy/method/non_aggregative.py
+++ b/quapy/method/non_aggregative.py
@ -1,6 +1,7 @@
+import numpy as np
 from quapy.data import LabelledCollection
-from .base import BaseQuantifier
-
+from quapy.method.base import BaseQuantifier, BinaryQuantifier
+import quapy.functional as F

 class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
    """
@ -33,3 +34,83 @@ class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
        """
        return self.estimated_prevalence

+
+class HDx(BinaryQuantifier):
+    """
+    `Hellinger Distance x <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ (HDx).
+    HDx is a method for training binary quantifiers, that models quantification as the problem of
+    minimizing the average divergence (in terms of the Hellinger Distance) across the feature-specific normalized
+    histograms of two representations, one for the unlabelled examples, and another generated from the training
+    examples as a mixture model of the class-specific representations. The parameters of the mixture thus represent
+    the estimates of the class prevalence values. The method computes all matchings for nbins in [10, 20, ..., 110]
+    and reports the median of these estimates. The best prevalence is searched via linear search, from 0 to 1 in steps of 0.01.
+    """
+
+    def __init__(self):
+        self.feat_ranges = None
+
+    def get_features_range(self, X):
+        feat_ranges = []
+        ncols = X.shape[1]
+        for col_idx in range(ncols):
+            feature = X[:,col_idx]
+            feat_ranges.append((np.min(feature), np.max(feature)))
+        return feat_ranges
+
+    def covariate_histograms(self, X, nbins):
+        assert self.feat_ranges is not None, 'quantify called before fit'
+
+        histograms = []
+        for col_idx in range(self.ncols):
+            feature = X[:,col_idx]
+            feat_range = self.feat_ranges[col_idx]
+            histograms.append(np.histogram(feature, bins=nbins, range=feat_range, density=True)[0])
+
+        return np.vstack(histograms).T
+
+    def fit(self, data: LabelledCollection):
+        """
+        Trains a HDx quantifier.
+
+        :param data: the training set
+        :return: self
+        """
+
+        self._check_binary(data, self.__class__.__name__)
+        X, y = data.Xy
+
+        self.ncols = X.shape[1]
+        self.feat_ranges = self.get_features_range(X)
+
+        # pre-compute the representation for positive and negative examples
+        self.bins = np.linspace(10, 110, 11, dtype=int)  # [10, 20, 30, ..., 100, 110]
+        self.H0 = {bins:self.covariate_histograms(X[y == 0], bins) for bins in self.bins}
+        self.H1 = {bins:self.covariate_histograms(X[y == 1], bins) for bins in self.bins}
+        return self
+
+    def quantify(self, X):
+        # "In this work, the number of bins b used in HDx and HDy was chosen from 10 to 110 in steps of 10,
+        # and the final estimated a priori probability was taken as the median of these 11 estimates."
+        # (González-Castro, et al., 2013).
+
+        assert X.shape[1] == self.ncols, f'wrong shape in quantify; expected {self.ncols}, found {X.shape[1]}'
+
+        prev_estimations = []
+        for nbins in self.bins:
+            Ht = self.covariate_histograms(X, nbins=nbins)
+            H0 = self.H0[nbins]
+            H1 = self.H1[nbins]
+
+            # the authors proposed to search for the prevalence yielding the best matching as a linear search
+            # at small steps (modern implementations resort to an optimization procedure)
+            prev_selected, min_dist = None, None
+            for prev in F.prevalence_linspace(n_prevalences=100, repeats=1, smooth_limits_epsilon=0.0):
+                Hx = prev * H1 + (1 - prev) * H0
+                hdx = np.mean([F.HellingerDistance(Hx[:,col], Ht[:,col]) for col in range(self.ncols)])
+
+                if prev_selected is None or hdx < min_dist:
+                    prev_selected, min_dist = prev, hdx
+            prev_estimations.append(prev_selected)
+
+        class1_prev = np.median(prev_estimations)
+        return np.asarray([1 - class1_prev, class1_prev])