QuaPy/quapy/method/non_aggregative.py

116 lines
4.9 KiB
Python
Raw Normal View History

import numpy as np
2021-01-15 18:32:32 +01:00
from quapy.data import LabelledCollection
from quapy.method.base import BaseQuantifier, BinaryQuantifier
import quapy.functional as F
2020-12-03 18:12:28 +01:00
class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
    """
    The `Maximum Likelihood Prevalence Estimation` (MLPE) method is a lazy method that assumes there is no prior
    probability shift between training and test instances (put another way, that the i.i.d. assumption holds).
    The estimation of class prevalence values for any test sample is always (i.e., irrespective of the test sample
    itself) the class prevalence seen during training. This method is considered to be a lower-bound quantifier that
    any quantification method should beat.
    """

    def __init__(self):
        # NOTE(review): `_classes_` is initialised here but never assigned in this class as shown;
        # confirm whether `fit` is expected to record the training classes.
        self._classes_ = None
        # training prevalence; remains None until `fit` has been called
        self.estimated_prevalence = None

    def fit(self, data: LabelledCollection):
        """
        Computes the training prevalence and stores it.

        :param data: the training sample
        :return: self
        """
        self.estimated_prevalence = data.prevalence()
        return self

    def quantify(self, instances):
        """
        Ignores the input instances and returns, as the class prevalence estimates, the training prevalence.

        :param instances: array-like (ignored)
        :return: the class prevalence seen during training
        :raises AssertionError: if called before `fit`
        """
        # fail with an explicit message rather than an AttributeError on a missing attribute
        assert self.estimated_prevalence is not None, 'quantify called before fit'
        return self.estimated_prevalence
class HDx(BinaryQuantifier):
    """
    `Hellinger Distance x <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ (HDx).
    HDx is a method for training binary quantifiers, that models quantification as the problem of
    minimizing the average divergence (in terms of the Hellinger Distance) across the feature-specific normalized
    histograms of two representations, one for the unlabelled examples, and another generated from the training
    examples as a mixture model of the class-specific representations. The parameters of the mixture thus represent
    the estimates of the class prevalence values. The method computes all matchings for nbins in [10, 20, ..., 110]
    and reports the median of the resulting estimates. The best prevalence for each number of bins is searched
    via linear search, from 0 to 1 in steps of 0.01.
    """

    def __init__(self):
        # All of the following are set by `fit`; None means "not fitted yet".
        self.feat_ranges = None  # list of (min, max) pairs, one per feature column
        self.ncols = None        # number of feature columns seen in training
        self.bins = None         # the numbers of bins for which histograms are computed
        self.H0 = None           # {nbins: histograms of the training examples with label 0}
        self.H1 = None           # {nbins: histograms of the training examples with label 1}

    def get_features_range(self, X):
        """
        Computes the range of values of every column of `X`.

        :param X: a two-dimensional array of covariates, indexable as `X[:, col]`
        :return: a list with one `(min, max)` tuple per column
        """
        return [(np.min(X[:, col_idx]), np.max(X[:, col_idx])) for col_idx in range(X.shape[1])]

    def covariate_histograms(self, X, nbins):
        """
        Computes one histogram per feature, using the feature ranges recorded during `fit`.

        :param X: a two-dimensional array of covariates with (at least) `self.ncols` columns
        :param nbins: the number of bins of each histogram
        :return: an array of shape `(nbins, self.ncols)` in which column `j` is the histogram of feature `j`
        :raises AssertionError: if called before `fit`
        """
        assert self.feat_ranges is not None, 'quantify called before fit'
        # NOTE(review): `density=True` yields density values (integrating to 1 over the range), not bin
        # masses summing to 1; confirm this is the normalisation `F.HellingerDistance` expects.
        histograms = [
            np.histogram(X[:, col_idx], bins=nbins, range=self.feat_ranges[col_idx], density=True)[0]
            for col_idx in range(self.ncols)
        ]
        return np.vstack(histograms).T

    def fit(self, data: LabelledCollection):
        """
        Trains a HDx quantifier.

        :param data: the training set
        :return: self
        """
        self._check_binary(data, self.__class__.__name__)
        X, y = data.Xy
        self.ncols = X.shape[1]
        self.feat_ranges = self.get_features_range(X)

        # pre-compute the representation for positive and negative examples
        self.bins = np.linspace(10, 110, 11, dtype=int)  # [10, 20, 30, ..., 100, 110]
        self.H0 = {bins: self.covariate_histograms(X[y == 0], bins) for bins in self.bins}
        self.H1 = {bins: self.covariate_histograms(X[y == 1], bins) for bins in self.bins}
        return self

    def quantify(self, X):
        """
        Estimates the class prevalence values of the sample `X`.

        :param X: a two-dimensional array of covariates with the same number of columns seen in `fit`
        :return: an array `[1 - p, p]`, where `p` is the estimated prevalence of class 1
        :raises AssertionError: if called before `fit`, or if the number of columns of `X` is not the
            one seen during training
        """
        # "In this work, the number of bins b used in HDx and HDy was chosen from 10 to 110 in steps of 10,
        # and the final estimated a priori probability was taken as the median of these 11 estimates."
        # (González-Castro, et al., 2013).

        # check the fit state first, so that an unfitted quantifier reports the intended message
        # instead of failing on the shape comparison below
        assert self.feat_ranges is not None, 'quantify called before fit'
        assert X.shape[1] == self.ncols, f'wrong shape in quantify; expected {self.ncols}, found {X.shape[1]}'

        prev_estimations = []
        for nbins in self.bins:
            Ht = self.covariate_histograms(X, nbins=nbins)
            H0 = self.H0[nbins]
            H1 = self.H1[nbins]

            # the authors proposed to search for the prevalence yielding the best matching as a linear search
            # at small steps (modern implementations resort to an optimization procedure)
            # 101 evenly spaced candidates give the documented step of 0.01 over [0, 1]; this assumes that
            # `F.prevalence_linspace` includes both endpoints (100 points would give a step of 1/99)
            prev_selected, min_dist = None, None
            for prev in F.prevalence_linspace(n_prevalences=101, repeats=1, smooth_limits_epsilon=0.0):
                # mixture of the class-specific histograms for the candidate prevalence
                Hx = prev * H1 + (1 - prev) * H0
                # average, across features, of the distance between the mixture and the test histograms
                hdx = np.mean([F.HellingerDistance(Hx[:, col], Ht[:, col]) for col in range(self.ncols)])
                # the strict comparison keeps the first candidate in case of ties
                if prev_selected is None or hdx < min_dist:
                    prev_selected, min_dist = prev, hdx
            prev_estimations.append(prev_selected)

        class1_prev = np.median(prev_estimations)
        return np.asarray([1 - class1_prev, class1_prev])