forked from moreo/QuaPy
added DMx and DMy, each with a classmethod that returns HDx and HDy, respectively

parent daca2bd1cb
commit 29db15ae25
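For orientation (not part of the diff): the commit renames DistributionMatching to DMy, adds a covariate-space counterpart DMx, and exposes the classic HDy/HDx methods as classmethods of those two classes. A minimal usage sketch, assuming a LabelledCollection named `train` is available:

from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import DMy
from quapy.method.non_aggregative import DMx

hdy = DMy.HDy(LogisticRegression(), val_split=0.4, n_jobs=-1)   # HDy as a preset of DMy
hdx = DMx.HDx(n_jobs=-1)                                        # HDx as a preset of DMx
hdy.fit(train)   # both behave as ordinary quantifiers
hdx.fit(train)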
@@ -6,7 +6,7 @@ from tqdm import tqdm
 import quapy as qp
 from quapy.protocol import APP
 from quapy.method.aggregative import HDy
-from quapy.method.non_aggregative import HDx
+from quapy.method.non_aggregative import DMx


 """
@@ -42,7 +42,7 @@ for dataset_name in tqdm(qp.datasets.UCI_DATASETS, total=len(qp.datasets.UCI_DAT

     # HDx............................................
     tinit = time()
-    hdx = HDx().fit(train)
+    hdx = DMx.HDx(n_jobs=-1).fit(train)
     t_hdx_train = time() - tinit

     tinit = time()
@@ -12,7 +12,7 @@ quantifiers = [
     ('ACC', qp.method.aggregative.ACC(newLR())),
     ('PCC', qp.method.aggregative.PCC(newLR())),
     ('PACC', qp.method.aggregative.PACC(newLR())),
-    ('HDy', qp.method.aggregative.DistributionMatching(newLR())),
+    ('HDy', qp.method.aggregative.DMy(newLR())),
     ('EMQ', qp.method.aggregative.EMQ(newLR()))
 ]

@@ -1,6 +1,6 @@
 import quapy as qp
 from quapy.protocol import APP
-from quapy.method.aggregative import DistributionMatching
+from quapy.method.aggregative import DMy
 from sklearn.linear_model import LogisticRegression
 import numpy as np

@@ -8,7 +8,7 @@ import numpy as np
 In this example, we show how to perform model selection on a DistributionMatching quantifier.
 """

-model = DistributionMatching(LogisticRegression())
+model = DMy(LogisticRegression())

 qp.environ['SAMPLE_SIZE'] = 100
 qp.environ['N_JOBS'] = -1
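The model-selection example above now instantiates DMy directly; the rest of that file is not shown in this diff. A hedged sketch of what tuning such a quantifier with GridSearchQ typically looks like (the import path for GridSearchQ and the names `train` and `val` are assumptions, not part of the commit):

import numpy as np
import quapy as qp
from quapy.protocol import APP
from quapy.method.aggregative import DMy
from quapy.model_selection import GridSearchQ
from sklearn.linear_model import LogisticRegression

model = DMy(LogisticRegression())
param_grid = {'nbins': [4, 8, 16, 32], 'classifier__C': np.logspace(-1, 1, 3)}
model = GridSearchQ(model, param_grid=param_grid, protocol=APP(val), n_jobs=-1).fit(train)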
@@ -291,3 +291,57 @@ def get_divergence(divergence: Union[str, Callable]):
         return divergence
     else:
         raise ValueError(f'argument "divergence" not understood; use a str or a callable function')
+
+
+def argmin_prevalence(loss, n_classes, method='optim_minimize'):
+    if method == 'optim_minimize':
+        return optim_minimize(loss, n_classes)
+    elif method == 'linear_search':
+        return linear_search(loss, n_classes)
+    elif method == 'ternary_search':
+        raise NotImplementedError()
+    else:
+        raise NotImplementedError()
+
+
+def optim_minimize(loss, n_classes):
+    """
+    Searches for the optimal prevalence values, i.e., an `n_classes`-dimensional vector of the (`n_classes`-1)-simplex
+    that yields the smallest loss. This optimization is carried out by means of a constrained search using scipy's
+    SLSQP routine.
+
+    :param loss: (callable) the function to minimize
+    :param n_classes: (int) the number of classes, i.e., the dimensionality of the prevalence vector
+    :return: (ndarray) the best prevalence vector found
+    """
+    from scipy import optimize
+
+    # the initial point is set as the uniform distribution
+    uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
+
+    # solutions are bounded to those contained in the unit-simplex
+    bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
+    constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
+    r = optimize.minimize(loss, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
+    return r.x
+
+
+def linear_search(loss, n_classes):
+    """
+    Performs a linear search for the best prevalence value in binary problems. The search is carried out by exploring
+    the range [0,1] stepping by 0.01. This search is inefficient, and is added only for completeness (some of the
+    early methods in quantification literature used it, e.g., HDy). A more powerful alternative is `optim_minimize`.
+
+    :param loss: (callable) the function to minimize
+    :param n_classes: (int) the number of classes, i.e., the dimensionality of the prevalence vector
+    :return: (ndarray) the best prevalence vector found
+    """
+    assert n_classes == 2, 'linear search is only available for binary problems'
+
+    prev_selected, min_score = None, None
+    for prev in prevalence_linspace(n_prevalences=100, repeats=1, smooth_limits_epsilon=0.0):
+        score = loss(np.asarray([1 - prev, prev]))
+        if min_score is None or score < min_score:
+            prev_selected, min_score = prev, score
+    return np.asarray([1 - prev_selected, prev_selected])
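A toy illustration (not part of the commit) of how the new search helpers can be called; the loss here simply measures the distance to a fixed target prevalence:

import numpy as np
import quapy.functional as F

target = np.asarray([0.2, 0.8])
loss = lambda prev: np.abs(prev - target).sum()

F.argmin_prevalence(loss, n_classes=2, method='optim_minimize')   # SLSQP over the simplex, approx. [0.2, 0.8]
F.argmin_prevalence(loss, n_classes=2, method='linear_search')    # 0.01-step grid search, binary only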
@@ -568,10 +568,11 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
         self.Pxy0 = Px[validation.labels == self.classifier.classes_[0]]
         # pre-compute the histogram for positive and negative examples
         self.bins = np.linspace(10, 110, 11, dtype=int)  # [10, 20, 30, ..., 100, 110]
-        self.Pxy1_density = {bins: np.histogram(self.Pxy1, bins=bins, range=(0, 1), density=True)[0] for bins in
-                             self.bins}
-        self.Pxy0_density = {bins: np.histogram(self.Pxy0, bins=bins, range=(0, 1), density=True)[0] for bins in
-                             self.bins}
+        def hist(P, bins):
+            h = np.histogram(P, bins=bins, range=(0, 1), density=True)[0]
+            return h / h.sum()
+        self.Pxy1_density = {bins: hist(self.Pxy1, bins) for bins in self.bins}
+        self.Pxy0_density = {bins: hist(self.Pxy0, bins) for bins in self.bins}
         return self

     def aggregate(self, classif_posteriors):
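A detail the diff does not spell out: with density=True, np.histogram returns densities whose values sum to nbins over the (0, 1) range (they integrate to 1, but do not sum to 1), so the new h / h.sum() step is what turns each histogram into a proper probability mass function. A quick check (not part of the commit):

import numpy as np

P = np.random.rand(1000)                                     # stand-in for posterior probabilities
h = np.histogram(P, bins=10, range=(0, 1), density=True)[0]
print(h.sum())                                               # ~10 (= nbins), not 1
print((h / h.sum()).sum())                                   # 1.0, a proper distribution as HDy expects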
@@ -712,7 +713,7 @@ class SMM(AggregativeProbabilisticQuantifier, BinaryQuantifier):
         return np.asarray([1 - class1_prev, class1_prev])


-class DistributionMatching(AggregativeProbabilisticQuantifier):
+class DMy(AggregativeProbabilisticQuantifier):
     """
     Generic Distribution Matching quantifier for binary or multiclass quantification based on the space of posterior
     probabilities. This implementation takes the number of bins, the divergence, and the possibility to work on CDF
@@ -733,14 +734,24 @@ class DistributionMatching(AggregativeProbabilisticQuantifier):
     :param n_jobs: number of parallel workers (default None)
     """

-    def __init__(self, classifier, val_split=0.4, nbins=8, divergence: Union[str, Callable]='HD', cdf=False, n_jobs=None):
+    def __init__(self, classifier, val_split=0.4, nbins=8, divergence: Union[str, Callable]='HD',
+                 cdf=False, search='optim_minimize', n_jobs=None):
         self.classifier = classifier
         self.val_split = val_split
         self.nbins = nbins
         self.divergence = divergence
         self.cdf = cdf
+        self.search = search
         self.n_jobs = n_jobs

+    @classmethod
+    def HDy(cls, classifier, val_split=0.4, n_jobs=None):
+        from quapy.method.meta import MedianEstimator
+
+        hdy = DMy(classifier=classifier, val_split=val_split, search='linear_search', divergence='HD')
+        hdy = MedianEstimator(hdy, param_grid={'nbins': np.linspace(10, 110, 11).astype(int)}, n_jobs=n_jobs)
+        return hdy
+
     def __get_distributions(self, posteriors):
         histograms = []
         post_dims = posteriors.shape[1]
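What the new DMy.HDy classmethod amounts to, spelled out (this mirrors the code above; MedianEstimator fits one DMy per value of nbins and reports the median of the resulting prevalence estimates):

import numpy as np
from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import DMy
from quapy.method.meta import MedianEstimator

base = DMy(LogisticRegression(), val_split=0.4, divergence='HD', search='linear_search')
hdy = MedianEstimator(base, param_grid={'nbins': np.linspace(10, 110, 11).astype(int)}, n_jobs=-1)
# equivalent to: hdy = DMy.HDy(LogisticRegression(), val_split=0.4, n_jobs=-1)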
@@ -794,26 +805,20 @@ class DistributionMatching(AggregativeProbabilisticQuantifier):
         `n` channels (proper distributions of binned posterior probabilities), on which the divergence is computed
         independently. The matching is computed as an average of the divergence across all channels.

-        :param instances: instances in the sample
+        :param posteriors: posterior probabilities of the instances in the sample
         :return: a vector of class prevalence estimates
         """
         test_distribution = self.__get_distributions(posteriors)
         divergence = get_divergence(self.divergence)
         n_classes, n_channels, nbins = self.validation_distribution.shape
-        def match(prev):
+        def loss(prev):
             prev = np.expand_dims(prev, axis=0)
             mixture_distribution = (prev @ self.validation_distribution.reshape(n_classes,-1)).reshape(n_channels, -1)
             divs = [divergence(test_distribution[ch], mixture_distribution[ch]) for ch in range(n_channels)]
             return np.mean(divs)

-        # the initial point is set as the uniform distribution
-        uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
-
-        # solutions are bounded to those contained in the unit-simplex
-        bounds = tuple((0, 1) for x in range(n_classes))  # values in [0,1]
-        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
-        r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
-        return r.x
+        return F.argmin_prevalence(loss, n_classes, method=self.search)


 def newELM(svmperf_base=None, loss='01', C=1):
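The matching that `loss` computes, shown in isolation (toy shapes, not part of the commit; 3 classes, 3 channels, 8 bins):

import numpy as np

n_classes, n_channels, nbins = 3, 3, 8
validation_distribution = np.random.dirichlet(np.ones(nbins), size=(n_classes, n_channels))
prev = np.asarray([[0.5, 0.3, 0.2]])     # candidate prevalence vector, as a row

# mixture of the class-conditional distributions, recovered channel by channel
mixture = (prev @ validation_distribution.reshape(n_classes, -1)).reshape(n_channels, -1)
print(mixture.shape)                     # (3, 8): one mixed distribution per channel
print(mixture.sum(axis=1))               # each channel still sums to 1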
@@ -1215,17 +1220,6 @@ class MS2(MS):
         return np.median(tprs), np.median(fprs)


-ClassifyAndCount = CC
-AdjustedClassifyAndCount = ACC
-ProbabilisticClassifyAndCount = PCC
-ProbabilisticAdjustedClassifyAndCount = PACC
-ExpectationMaximizationQuantifier = EMQ
-SLD = EMQ
-HellingerDistanceY = HDy
-MedianSweep = MS
-MedianSweep2 = MS2
-
-
 class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier):
     """
     Allows any binary quantifier to perform quantification on single-label datasets.
@@ -1283,3 +1277,18 @@ class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier):
         # the estimation for the positive class prevalence
         return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1]

+
+#---------------------------------------------------------------
+# aliases
+#---------------------------------------------------------------
+
+ClassifyAndCount = CC
+AdjustedClassifyAndCount = ACC
+ProbabilisticClassifyAndCount = PCC
+ProbabilisticAdjustedClassifyAndCount = PACC
+ExpectationMaximizationQuantifier = EMQ
+DistributionMatchingY = DMy
+SLD = EMQ
+HellingerDistanceY = HDy
+MedianSweep = MS
+MedianSweep2 = MS2
@@ -1,7 +1,5 @@
 from typing import Union, Callable

 import numpy as np
-from scipy import optimize
-
 from functional import get_divergence
 from quapy.data import LabelledCollection
@@ -41,81 +39,7 @@ class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
         return self.estimated_prevalence


-class HDx(BinaryQuantifier):
-    """
-    `Hellinger Distance x <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ (HDx).
-    HDx is a method for training binary quantifiers, that models quantification as the problem of
-    minimizing the average divergence (in terms of the Hellinger Distance) across the feature-specific normalized
-    histograms of two representations, one for the unlabelled examples, and another generated from the training
-    examples as a mixture model of the class-specific representations. The parameters of the mixture thus represent
-    the estimates of the class prevalence values. The method computes all matchings for nbins in [10, 20, ..., 110]
-    and reports the mean of the median. The best prevalence is searched via linear search, from 0 to 1 steppy by 0.01.
-    """
-
-    def __init__(self):
-        self.feat_ranges = None
-
-    def covariate_histograms(self, X, nbins):
-        assert self.feat_ranges is not None, 'quantify called before fit'
-
-        histograms = []
-        for col_idx in range(self.nfeats):
-            feature = X[:,col_idx]
-            feat_range = self.feat_ranges[col_idx]
-            histograms.append(np.histogram(feature, bins=nbins, range=feat_range, density=True)[0])
-
-        return np.vstack(histograms).T
-
-    def fit(self, data: LabelledCollection):
-        """
-        Trains a HDx quantifier.
-
-        :param data: the training set
-        :return: self
-        """
-
-        self._check_binary(data, self.__class__.__name__)
-        X, y = data.Xy
-
-        self.nfeats = X.shape[1]
-        self.feat_ranges = _get_features_range(X)
-
-        # pre-compute the representation for positive and negative examples
-        self.bins = np.linspace(10, 110, 11, dtype=int)  # [10, 20, 30, ..., 100, 110]
-        self.H0 = {bins: self.covariate_histograms(X[y == 0], bins) for bins in self.bins}
-        self.H1 = {bins: self.covariate_histograms(X[y == 1], bins) for bins in self.bins}
-        return self
-
-    def quantify(self, X):
-        # "In this work, the number of bins b used in HDx and HDy was chosen from 10 to 110 in steps of 10,
-        # and the final estimated a priori probability was taken as the median of these 11 estimates."
-        # (González-Castro, et al., 2013).
-
-        assert X.shape[1] == self.nfeats, f'wrong shape in quantify; expected {self.nfeats}, found {X.shape[1]}'
-
-        prev_estimations = []
-        for nbins in self.bins:
-            Ht = self.covariate_histograms(X, nbins=nbins)
-            H0 = self.H0[nbins]
-            H1 = self.H1[nbins]
-
-            # the authors proposed to search for the prevalence yielding the best matching as a linear search
-            # at small steps (modern implementations resort to an optimization procedure)
-            prev_selected, min_dist = None, None
-            for prev in F.prevalence_linspace(n_prevalences=100, repeats=1, smooth_limits_epsilon=0.0):
-                Hx = prev * H1 + (1 - prev) * H0
-                hdx = np.mean([F.HellingerDistance(Hx[:,col], Ht[:,col]) for col in range(self.nfeats)])
-
-                if prev_selected is None or hdx < min_dist:
-                    prev_selected, min_dist = prev, hdx
-            prev_estimations.append(prev_selected)
-
-        class1_prev = np.median(prev_estimations)
-        return np.asarray([1 - class1_prev, class1_prev])
-
-
-class DistributionMatchingX(BaseQuantifier):
+class DMx(BaseQuantifier):
     """
     Generic Distribution Matching quantifier for binary or multiclass quantification based on the space of covariates.
     This implementation takes the number of bins, the divergence, and the possibility to work on CDF as hyperparameters.
@@ -128,22 +52,51 @@ class DistributionMatchingX(BaseQuantifier):
     :param n_jobs: number of parallel workers (default None)
     """

-    def __init__(self, nbins=8, divergence: Union[str, Callable]='HD', cdf=False, n_jobs=None):
+    def __init__(self, nbins=8, divergence: Union[str, Callable]='HD', cdf=False, search='optim_minimize', n_jobs=None):
         self.nbins = nbins
         self.divergence = divergence
         self.cdf = cdf
+        self.search = search
         self.n_jobs = n_jobs

+    @classmethod
+    def HDx(cls, n_jobs=None):
+        """
+        `Hellinger Distance x <https://www.sciencedirect.com/science/article/pii/S0020025512004069>`_ (HDx).
+        HDx is a method for training binary quantifiers, that models quantification as the problem of
+        minimizing the average divergence (in terms of the Hellinger Distance) across the feature-specific normalized
+        histograms of two representations, one for the unlabelled examples, and another generated from the training
+        examples as a mixture model of the class-specific representations. The parameters of the mixture thus represent
+        the estimates of the class prevalence values.
+
+        The method computes all matchings for nbins in [10, 20, ..., 110] and reports the mean of the median.
+        The best prevalence is searched via linear search, from 0 to 1 stepping by 0.01.
+
+        :param n_jobs: number of parallel workers
+        :return: an instance of this class set up to mimic the performance of the HDx as originally proposed by
+            González-Castro, Alaiz-Rodríguez, Alegre (2013)
+        """
+        from quapy.method.meta import MedianEstimator
+
+        dmx = DMx(divergence='HD', cdf=False, search='linear_search')
+        nbins = {'nbins': np.linspace(10, 110, 11, dtype=int)}
+        hdx = MedianEstimator(base_quantifier=dmx, param_grid=nbins, n_jobs=n_jobs)
+        return hdx
+
     def __get_distributions(self, X):

         histograms = []
         for feat_idx in range(self.nfeats):
-            hist = np.histogram(X[:, feat_idx], bins=self.nbins, range=self.feat_ranges[feat_idx])[0]
-            normhist = hist / hist.sum()
-            histograms.append(normhist)
+            feature = X[:, feat_idx]
+            feat_range = self.feat_ranges[feat_idx]
+            hist = np.histogram(feature, bins=self.nbins, range=feat_range)[0]
+            norm_hist = hist / hist.sum()
+            histograms.append(norm_hist)
         distributions = np.vstack(histograms)

         if self.cdf:
             distributions = np.cumsum(distributions, axis=1)

         return distributions

     def fit(self, data: LabelledCollection):
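Standalone sketch of what the refactored __get_distributions computes: one normalized histogram per feature, optionally accumulated into a CDF (toy data, not part of the commit):

import numpy as np

X = np.random.randn(500, 4)                       # toy sample: 500 instances, 4 features
nbins, cdf = 8, False
feat_ranges = [(col.min(), col.max()) for col in X.T]

histograms = []
for feat_idx in range(X.shape[1]):
    hist = np.histogram(X[:, feat_idx], bins=nbins, range=feat_ranges[feat_idx])[0]
    histograms.append(hist / hist.sum())          # proper distribution per feature
distributions = np.vstack(histograms)             # shape (n_feats, nbins)
if cdf:
    distributions = np.cumsum(distributions, axis=1)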
@@ -184,20 +137,14 @@ class DistributionMatchingX(BaseQuantifier):
         test_distribution = self.__get_distributions(instances)
         divergence = get_divergence(self.divergence)
         n_classes, n_feats, nbins = self.validation_distribution.shape
-        def match(prev):
+        def loss(prev):
             prev = np.expand_dims(prev, axis=0)
             mixture_distribution = (prev @ self.validation_distribution.reshape(n_classes,-1)).reshape(n_feats, -1)
             divs = [divergence(test_distribution[feat], mixture_distribution[feat]) for feat in range(n_feats)]
             return np.mean(divs)

-        # the initial point is set as the uniform distribution
-        uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
-
-        # solutions are bounded to those contained in the unit-simplex
-        bounds = tuple((0, 1) for x in range(n_classes))  # values in [0,1]
-        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
-        r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
-        return r.x
+        return F.argmin_prevalence(loss, n_classes, method=self.search)


 def _get_features_range(X):
@@ -206,4 +153,11 @@ def _get_features_range(X):
     for col_idx in range(ncols):
         feature = X[:,col_idx]
         feat_ranges.append((np.min(feature), np.max(feature)))
     return feat_ranges
+
+
+#---------------------------------------------------------------
+# aliases
+#---------------------------------------------------------------
+
+DistributionMatchingX = DMx
@@ -10,7 +10,7 @@ from quapy.data import Dataset, LabelledCollection
 from quapy.method import AGGREGATIVE_METHODS, NON_AGGREGATIVE_METHODS
 from quapy.method.meta import Ensemble
 from quapy.protocol import APP
-from quapy.method.aggregative import DistributionMatching
+from quapy.method.aggregative import DMy
 from quapy.method.meta import MedianEstimator

 datasets = [pytest.param(qp.datasets.fetch_twitter('hcr', pickle=True), id='hcr'),
@@ -189,7 +189,7 @@ def test_median_meta():
     errors = []
     for nbins in nbins_grid:
         with qp.util.temp_seed(0):
-            q = DistributionMatching(LogisticRegression(), nbins=nbins)
+            q = DMy(LogisticRegression(), nbins=nbins)
             mae, estim_prevs = __fit_test(q, train, test)
             prevs.append(estim_prevs)
             errors.append(mae)
@@ -198,7 +198,7 @@ def test_median_meta():
     mae = np.mean(errors)
     print(f'\tMAE={mae:.4f}')

-    q = DistributionMatching(LogisticRegression())
+    q = DMy(LogisticRegression())
     q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1)
     median_mae, prev = __fit_test(q, train, test)
     print(f'\tMAE={median_mae:.4f}')
@@ -220,12 +220,12 @@ def test_median_meta_modsel():

     nbins_grid = [2, 4, 5, 10, 15]

-    q = DistributionMatching(LogisticRegression())
+    q = DMy(LogisticRegression())
     q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1)
     median_mae, _ = __fit_test(q, train, test)
     print(f'\tMAE={median_mae:.4f}')

-    q = DistributionMatching(LogisticRegression())
+    q = DMy(LogisticRegression())
     lr_params = {'classifier__C': np.logspace(-1, 1, 3)}
     q = MedianEstimator(q, param_grid={'nbins': nbins_grid}, random_state=0, n_jobs=-1)
     q = GridSearchQ(q, param_grid=lr_params, protocol=APP(val), n_jobs=-1)