forked from moreo/QuaPy
plot functionality added
This commit is contained in:
parent 5894d46b31
commit d1b449d2e9
@@ -0,0 +1,48 @@
from sklearn.model_selection import GridSearchCV
import numpy as np
import quapy as qp
from sklearn.linear_model import LogisticRegression

sample_size = 500
qp.environ['SAMPLE_SIZE'] = sample_size


def gen_data():

    data = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)

    models = [
        qp.method.aggregative.CC,
        qp.method.aggregative.ACC,
        qp.method.aggregative.PCC,
        qp.method.aggregative.PACC,
        qp.method.aggregative.HDy,
        qp.method.aggregative.EMQ,
        qp.method.meta.ECC,
        qp.method.meta.EACC,
        qp.method.meta.EHDy,
    ]

    method_names, true_prevs, estim_prevs, tr_prevs = [], [], [], []
    for Quantifier in models:
        print(f'training {Quantifier.__name__}')
        lr = LogisticRegression(max_iter=1000, class_weight='balanced')
        # lr = GridSearchCV(lr, param_grid={'C': np.logspace(-3, 3, 7)}, n_jobs=-1)
        model = Quantifier(lr).fit(data.training)
        true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(
            model, data.test, sample_size, n_repetitions=20, n_prevpoints=11)

        method_names.append(Quantifier.__name__)
        true_prevs.append(true_prev)
        estim_prevs.append(estim_prev)
        tr_prevs.append(data.training.prevalence())

    return method_names, true_prevs, estim_prevs, tr_prevs


method_names, true_prevs, estim_prevs, tr_prevs = qp.util.pickled_resource('./plots/plot_data.pkl', gen_data)

qp.plot.error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=11, savepath='./plots/err_drift.png')
qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, savepath='./plots/bin_diag.png')
qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, savepath='./plots/bin_bias.png')
qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, nbins=11, savepath='./plots/bin_bias_bin.png')
@@ -4,6 +4,8 @@ from . import functional
 from . import method
 from . import data
 from . import evaluation
+from . import plot
+from . import util
 from method.aggregative import isaggregative, isprobabilistic


@@ -17,4 +19,4 @@ environ = {


 def isbinary(x):
-    return data.isbinary(x) or method.aggregative.isbinary(x)
+    return data.isbinary(x) or method.isbinary(x)
@@ -30,9 +30,9 @@ def artificial_sampling_prediction(
     :param random_seed: allows to replicate the samplings. The seed is local to the method and does not affect
         any other random process.
     :param verbose: if True, shows a progress bar
-    :return: two ndarrays of [m,n] with m the number of samples (n_prevpoints*n_repetitions) and n the
+    :return: two ndarrays of shape (m,n) with m the number of samples (n_prevpoints*n_repetitions) and n the
         number of classes. The first one contains the true prevalences for the samples generated while the second one
-        containing the the prevalences estimations
+        contains the prevalence estimations
     """

    with temp_seed(random_seed):
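For reference, the shapes the corrected docstring describes line up with how the example script consumes them; a small sketch, assuming a fitted quantifier `model` and the binary `data` from the script above (exact sizes depend on n_prevpoints and n_repetitions):

import quapy as qp

# with n_prevpoints=11 and n_repetitions=20 on a binary dataset, m = 11 * 20 = 220 samples
true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(
    model, data.test, 500, n_repetitions=20, n_prevpoints=11)
print(true_prev.shape, estim_prev.shape)  # both (m, n), e.g. (220, 2)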
@@ -5,13 +5,13 @@ from . import meta


 AGGREGATIVE_METHODS = {
-    aggregative.ClassifyAndCount,
-    aggregative.AdjustedClassifyAndCount,
-    aggregative.ProbabilisticClassifyAndCount,
-    aggregative.ProbabilisticAdjustedClassifyAndCount,
-    aggregative.ExplicitLossMinimisation,
-    aggregative.ExpectationMaximizationQuantifier,
-    aggregative.HellingerDistanceY
+    aggregative.CC,
+    aggregative.ACC,
+    aggregative.PCC,
+    aggregative.PACC,
+    aggregative.ELM,
+    aggregative.EMQ,
+    aggregative.HDy
 }

 NON_AGGREGATIVE_METHODS = {
@@ -1,5 +1,6 @@
 import numpy as np
 from copy import deepcopy
+from sklearn.base import BaseEstimator, clone
 import functional as F
 import error
 from method.base import BaseQuantifier, BinaryQuantifier
@@ -130,13 +131,13 @@ def training_helper(learner,

 # Methods
 # ------------------------------------
-class ClassifyAndCount(AggregativeQuantifier):
+class CC(AggregativeQuantifier):
     """
     The most basic Quantification method. One that simply classifies all instances and counts how many have been
     attributed to each of the classes in order to compute class prevalence estimates.
     """

-    def __init__(self, learner):
+    def __init__(self, learner:BaseEstimator):
         self.learner = learner

     def fit(self, data: LabelledCollection, fit_learner=True):
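As a reminder of what the renamed CC computes: classify-and-count just turns predicted labels into a prevalence vector. A minimal standalone sketch of the idea (not the library code), assuming integer labels 0..n_classes-1:

import numpy as np

def classify_and_count(classifier, instances, n_classes):
    # predict a label per instance and report the fraction assigned to each class
    predictions = classifier.predict(instances)
    counts = np.bincount(predictions, minlength=n_classes)
    return counts / counts.sum()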
@@ -153,9 +154,9 @@ class ClassifyAndCount(AggregativeQuantifier):
         return F.prevalence_from_labels(classif_predictions, self.n_classes)


-class AdjustedClassifyAndCount(AggregativeQuantifier):
+class ACC(AggregativeQuantifier):

-    def __init__(self, learner):
+    def __init__(self, learner:BaseEstimator):
         self.learner = learner

     def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3):
@@ -169,7 +170,7 @@ class AdjustedClassifyAndCount(AggregativeQuantifier):
         :return: self
         """
         self.learner, validation = training_helper(self.learner, data, fit_learner, val_split=val_split)
-        self.cc = ClassifyAndCount(self.learner)
+        self.cc = CC(self.learner)
         y_ = self.classify(validation.instances)
         y = validation.labels
         # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
|
||||||
|
|
||||||
def aggregate(self, classif_predictions):
|
def aggregate(self, classif_predictions):
|
||||||
prevs_estim = self.cc.aggregate(classif_predictions)
|
prevs_estim = self.cc.aggregate(classif_predictions)
|
||||||
return AdjustedClassifyAndCount.solve_adjustment(self.Pte_cond_estim_, prevs_estim)
|
return ACC.solve_adjustment(self.Pte_cond_estim_, prevs_estim)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def solve_adjustment(cls, PteCondEstim, prevs_estim):
|
def solve_adjustment(cls, PteCondEstim, prevs_estim):
|
||||||
|
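The adjustment referenced here corrects CC's estimate with the misclassification rates measured on the validation split: CC's output is roughly M @ true_prevalence, with M[i, j] ≈ P(predicted=i | true=j), so solve_adjustment inverts that relation. A rough sketch of the idea (the library's actual handling of degenerate solutions may differ):

import numpy as np

def solve_adjustment_sketch(misclass_matrix, prevs_estim):
    # misclass_matrix[i, j] ~ P(predicted=i | true=j), estimated on held-out data
    adjusted, *_ = np.linalg.lstsq(misclass_matrix, prevs_estim, rcond=None)
    adjusted = np.clip(adjusted, 0, None)   # clip negatives and re-normalize
    return adjusted / adjusted.sum()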
@@ -198,8 +199,8 @@ class AdjustedClassifyAndCount(AggregativeQuantifier):
         return adjusted_prevs


-class ProbabilisticClassifyAndCount(AggregativeProbabilisticQuantifier):
-    def __init__(self, learner):
+class PCC(AggregativeProbabilisticQuantifier):
+    def __init__(self, learner:BaseEstimator):
         self.learner = learner

     def fit(self, data : LabelledCollection, fit_learner=True):
@@ -210,9 +211,9 @@ class ProbabilisticClassifyAndCount(AggregativeProbabilisticQuantifier):
         return F.prevalence_from_probabilities(classif_posteriors, binarize=False)


-class ProbabilisticAdjustedClassifyAndCount(AggregativeProbabilisticQuantifier):
+class PACC(AggregativeProbabilisticQuantifier):

-    def __init__(self, learner):
+    def __init__(self, learner:BaseEstimator):
         self.learner = learner

     def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3):
@@ -228,7 +229,7 @@ class ProbabilisticAdjustedClassifyAndCount(AggregativeProbabilisticQuantifier):
         self.learner, validation = training_helper(
             self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split
         )
-        self.pcc = ProbabilisticClassifyAndCount(self.learner)
+        self.pcc = PCC(self.learner)
         y_ = self.soft_classify(validation.instances)
         y = validation.labels
         confusion = np.empty(shape=(data.n_classes, data.n_classes))
@@ -246,7 +247,7 @@ class ProbabilisticAdjustedClassifyAndCount(AggregativeProbabilisticQuantifier):

     def aggregate(self, classif_posteriors):
         prevs_estim = self.pcc.aggregate(classif_posteriors)
-        return AdjustedClassifyAndCount.solve_adjustment(self.Pte_cond_estim_, prevs_estim)
+        return ACC.solve_adjustment(self.Pte_cond_estim_, prevs_estim)

     def classify(self, data):
         return self.pcc.classify(data)
@@ -255,12 +256,12 @@ class ProbabilisticAdjustedClassifyAndCount(AggregativeProbabilisticQuantifier):
         return self.pcc.posterior_probabilities(data)


-class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
+class EMQ(AggregativeProbabilisticQuantifier):

     MAX_ITER = 1000
     EPSILON = 1e-4

-    def __init__(self, learner):
+    def __init__(self, learner:BaseEstimator):
         self.learner = learner

     def fit(self, data: LabelledCollection, fit_learner=True):
@@ -279,7 +280,7 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):

         s, converged = 0, False
         qs_prev_ = None
-        while not converged and s < ExpectationMaximizationQuantifier.MAX_ITER:
+        while not converged and s < EMQ.MAX_ITER:
             # E-step: ps is Ps(y=+1|xi)
             ps_unnormalized = (qs / Ptr) * Px
             ps = ps_unnormalized / ps_unnormalized.sum(axis=1).reshape(-1,1)
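For orientation, the EM loop renamed to EMQ re-weights the classifier's posteriors by the ratio between the current and the training priors until the estimated prevalence stabilizes. A condensed sketch of the idea (the convergence criterion here is an assumption, not necessarily the library's):

import numpy as np

def emq_sketch(posteriors, train_prev, max_iter=1000, epsilon=1e-4):
    qs = np.copy(train_prev)                      # current prevalence estimate
    for _ in range(max_iter):
        # E-step: re-weight posteriors by the ratio of current to training priors
        ps = (qs / train_prev) * posteriors
        ps /= ps.sum(axis=1, keepdims=True)
        # M-step: the new prevalence estimate is the mean of the re-weighted posteriors
        qs_new = ps.mean(axis=0)
        if np.abs(qs_new - qs).sum() < epsilon:
            return qs_new
        qs = qs_new
    return qs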
@@ -299,14 +300,14 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
         return qs


-class HellingerDistanceY(AggregativeProbabilisticQuantifier, BinaryQuantifier):
+class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
     """
     Implementation of the method based on the Hellinger Distance y (HDy) proposed by
     González-Castro, V., Alaiz-Rodríguez, R., and Alegre, E. (2013). Class distribution
     estimation based on the Hellinger distance. Information Sciences, 218:146–164.
     """

-    def __init__(self, learner):
+    def __init__(self, learner:BaseEstimator):
         self.learner = learner

     def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3):
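The HDy idea, for the binary case: build histograms of the positive-class posterior on positive and negative validation examples, then pick the prevalence whose mixture of those histograms is closest, in Hellinger distance, to the test histogram. A rough sketch under assumed binning and search grid (the library's exact choices may differ):

import numpy as np

def hdy_sketch(val_pos_post, val_neg_post, test_post, n_bins=10):
    bins = np.linspace(0, 1, n_bins + 1)
    h_pos = np.histogram(val_pos_post, bins=bins)[0] / len(val_pos_post)
    h_neg = np.histogram(val_neg_post, bins=bins)[0] / len(val_neg_post)
    h_test = np.histogram(test_post, bins=bins)[0] / len(test_post)

    def hellinger(p, q):
        return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)) / np.sqrt(2)

    # search the positive prevalence whose histogram mixture best matches the test histogram
    candidates = np.linspace(0, 1, 101)
    distances = [hellinger(p * h_pos + (1 - p) * h_neg, h_test) for p in candidates]
    pos_prev = candidates[int(np.argmin(distances))]
    return np.asarray([1 - pos_prev, pos_prev])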
@@ -353,7 +354,7 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier, BinaryQuantifier):
         return np.asarray([1-pos_class_prev, pos_class_prev])


-class ExplicitLossMinimisation(AggregativeQuantifier, BinaryQuantifier):
+class ELM(AggregativeQuantifier, BinaryQuantifier):

     def __init__(self, svmperf_base, loss, **kwargs):
         self.svmperf_base = svmperf_base
@@ -374,38 +375,38 @@ class ExplicitLossMinimisation(AggregativeQuantifier, BinaryQuantifier):



-class SVMQ(ExplicitLossMinimisation):
+class SVMQ(ELM):
     def __init__(self, svmperf_base, **kwargs):
         super(SVMQ, self).__init__(svmperf_base, loss='q', **kwargs)


-class SVMKLD(ExplicitLossMinimisation):
+class SVMKLD(ELM):
     def __init__(self, svmperf_base, **kwargs):
         super(SVMKLD, self).__init__(svmperf_base, loss='kld', **kwargs)


-class SVMNKLD(ExplicitLossMinimisation):
+class SVMNKLD(ELM):
     def __init__(self, svmperf_base, **kwargs):
         super(SVMNKLD, self).__init__(svmperf_base, loss='nkld', **kwargs)


-class SVMAE(ExplicitLossMinimisation):
+class SVMAE(ELM):
     def __init__(self, svmperf_base, **kwargs):
         super(SVMAE, self).__init__(svmperf_base, loss='mae', **kwargs)


-class SVMRAE(ExplicitLossMinimisation):
+class SVMRAE(ELM):
     def __init__(self, svmperf_base, **kwargs):
         super(SVMRAE, self).__init__(svmperf_base, loss='mrae', **kwargs)


-CC = ClassifyAndCount
-ACC = AdjustedClassifyAndCount
-PCC = ProbabilisticClassifyAndCount
-PACC = ProbabilisticAdjustedClassifyAndCount
-ELM = ExplicitLossMinimisation
-EMQ = ExpectationMaximizationQuantifier
-HDy = HellingerDistanceY
+ClassifyAndCount = CC
+AdjustedClassifyAndCount = ACC
+ProbabilisticClassifyAndCount = PCC
+ProbabilisticAdjustedClassifyAndCount = PACC
+ExplicitLossMinimisation = ELM
+ExpectationMaximizationQuantifier = EMQ
+HellingerDistanceY = HDy


 class OneVsAll(AggregativeQuantifier):
@@ -414,7 +415,7 @@ class OneVsAll(AggregativeQuantifier):
     quantifier for each class, and then l1-normalizes the outputs so that the class prevalences sum up to 1.
     This variant was used, along with the ExplicitLossMinimization quantifier in
     Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
-    Social Network Analysis and Mining6(19), 1–22 (2016)
+    Social Network Analysis and Mining 6(19), 1–22 (2016)
     """

     def __init__(self, binary_quantifier, n_jobs=-1):
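The l1-normalization mentioned in the docstring is just a rescaling of the per-class binary estimates so they form a distribution; a one-line sketch of that step:

import numpy as np

def l1_normalize(binary_prevalence_estimates):
    # binary_prevalence_estimates[c] is the positive prevalence estimated by the quantifier devoted to class c
    estimates = np.asarray(binary_prevalence_estimates, dtype=float)
    return estimates / estimates.sum()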
@@ -484,15 +485,14 @@ class OneVsAll(AggregativeQuantifier):
         self.dict_binary_quantifiers[c].fit(bindata)


-def isaggregative(model):
+def isaggregative(model:BaseQuantifier):
     return isinstance(model, AggregativeQuantifier)


-def isprobabilistic(model):
+def isprobabilistic(model:BaseQuantifier):
     return isinstance(model, AggregativeProbabilisticQuantifier)


-def isbinary(model):
-    return isinstance(model, BinaryQuantifier)

@@ -20,12 +20,24 @@ class BaseQuantifier(metaclass=ABCMeta):
     @abstractmethod
     def get_params(self, deep=True): ...

+    @property
+    def binary(self):
+        return False
+

 class BinaryQuantifier(BaseQuantifier):
     def _check_binary(self, data: LabelledCollection, quantifier_name):
         assert data.binary, f'{quantifier_name} works only on problems of binary classification. ' \
                             f'Use the class OneVsAll to enable {quantifier_name} work on single-label data.'

+    @property
+    def binary(self):
+        return True
+
+
+def isbinary(model:BaseQuantifier):
+    return model.binary
+

 # class OneVsAll(AggregativeQuantifier):
 #     """
@@ -76,11 +76,11 @@ class QuaNetTrainer(BaseQuantifier):
         self.tr_prev = data.prevalence()

         self.quantifiers = {
-            'cc': ClassifyAndCount(self.learner).fit(data, fit_learner=False),
-            'acc': AdjustedClassifyAndCount(self.learner).fit(data, fit_learner=False),
-            'pcc': ProbabilisticClassifyAndCount(self.learner).fit(data, fit_learner=False),
-            'pacc': ProbabilisticAdjustedClassifyAndCount(self.learner).fit(data, fit_learner=False),
-            'emq': ExpectationMaximizationQuantifier(self.learner).fit(data, fit_learner=False),
+            'cc': CC(self.learner).fit(data, fit_learner=False),
+            'acc': ACC(self.learner).fit(data, fit_learner=False),
+            'pcc': PCC(self.learner).fit(data, fit_learner=False),
+            'pacc': PACC(self.learner).fit(data, fit_learner=False),
+            'emq': EMQ(self.learner).fit(data, fit_learner=False),
         }

         self.status = {
@@ -0,0 +1,202 @@
from collections import defaultdict
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import quapy as qp


plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 200


def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=None, savepath=None):
    fig, ax = plt.subplots()
    ax.set_aspect('equal')
    ax.grid()
    ax.plot([0, 1], [0, 1], '--k', label='ideal', zorder=1)

    for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs):
        true_prev = true_prev[:, pos_class]
        estim_prev = estim_prev[:, pos_class]

        x_ticks = np.unique(true_prev)
        x_ticks.sort()
        y_ave = np.asarray([estim_prev[true_prev == x].mean() for x in x_ticks])
        y_std = np.asarray([estim_prev[true_prev == x].std() for x in x_ticks])

        ax.errorbar(x_ticks, y_ave, fmt='-', marker='o', label=method, markersize=3, zorder=2)
        ax.fill_between(x_ticks, y_ave - y_std, y_ave + y_std, alpha=0.25)

    ax.set(xlabel='true prevalence', ylabel='estimated prevalence', title=title)
    ax.set_ylim(0, 1)
    ax.set_xlim(0, 1)

    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    save_or_show(savepath)


def binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title=None, savepath=None):
    fig, ax = plt.subplots()
    ax.grid()

    data, labels = [], []
    for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs):
        true_prev = true_prev[:, pos_class]
        estim_prev = estim_prev[:, pos_class]
        data.append(estim_prev - true_prev)
        labels.append(method)

    ax.boxplot(data, labels=labels, patch_artist=False, showmeans=True)
    ax.set(ylabel='error bias', title=title)

    save_or_show(savepath)


def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=None, nbins=21, colormap=cm.tab10,
                     vertical_xticks=False, savepath=None):
    from pylab import boxplot, plot, setp

    fig, ax = plt.subplots()
    ax.grid()

    bins = np.linspace(0, 1, nbins)
    binwidth = 1 / (nbins - 1)
    data = {}
    for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs):
        true_prev = true_prev[:, pos_class]
        estim_prev = estim_prev[:, pos_class]

        data[method] = []
        inds = np.digitize(true_prev, bins, right=True)
        for ind in range(len(bins)):
            selected = inds == ind
            data[method].append(estim_prev[selected] - true_prev[selected])

    nmethods = len(method_names)
    boxwidth = binwidth / (nmethods + 1)
    for i, bin in enumerate(bins[:-1]):
        boxdata = [data[method][i] for method in method_names]
        positions = [bin + (i * boxwidth) + boxwidth for i, _ in enumerate(method_names)]
        box = boxplot(boxdata, showmeans=False, positions=positions, widths=boxwidth, sym='+', patch_artist=True)
        for boxid in range(len(method_names)):
            c = colormap.colors[boxid]
            setp(box['fliers'][boxid], color=c, marker='+', markersize=3., markeredgecolor=c)
            setp(box['boxes'][boxid], color=c)
            setp(box['medians'][boxid], color='k')

    major_xticks_positions, minor_xticks_positions = [], []
    major_xticks_labels, minor_xticks_labels = [], []
    for i, b in enumerate(bins[:-1]):
        major_xticks_positions.append(b)
        minor_xticks_positions.append(b + binwidth / 2)
        major_xticks_labels.append('')
        minor_xticks_labels.append(f'[{bins[i]:.2f}-{bins[i + 1]:.2f})')
    ax.set_xticks(major_xticks_positions)
    ax.set_xticks(minor_xticks_positions, minor=True)
    ax.set_xticklabels(major_xticks_labels)
    ax.set_xticklabels(minor_xticks_labels, minor=True, rotation='vertical' if vertical_xticks else 'horizontal')

    if vertical_xticks:
        # Pad margins so that markers don't get clipped by the axes
        plt.margins(0.2)
        # Tweak spacing to prevent clipping of tick-labels
        plt.subplots_adjust(bottom=0.15)

    # adds the legend to the list hs, initialized with the "ideal" quantifier (one that has 0 bias across all bins,
    # i.e., a line from (0,0) to (1,0)). The other elements are simply labelled dot-plots that are to be removed
    # (setting set_visible to False for all but the first element) after the legend has been placed
    hs = [ax.plot([0, 1], [0, 0], '-k', zorder=2)[0]]
    for colorid in range(len(method_names)):
        h, = plot([1, 1], '-s', markerfacecolor=colormap.colors[colorid], color='k',
                  mec=colormap.colors[colorid], linewidth=1.)
        hs.append(h)
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
    ax.legend(hs, ['ideal'] + method_names, loc='center left', bbox_to_anchor=(1, 0.5))
    [h.set_visible(False) for h in hs[1:]]

    # x-axis and y-axis labels and limits
    ax.set(xlabel='prevalence', ylabel='error bias', title=title)
    # ax.set_ylim(-1, 1)
    ax.set_xlim(0, 1)

    save_or_show(savepath)


def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=21, error_name='ae', show_std=True,
                   title=f'Quantification error as a function of distribution shift',
                   savepath=None):

    fig, ax = plt.subplots()
    ax.grid()

    x_error = qp.error.ae
    y_error = getattr(qp.error, error_name)
    ndims = tr_prevs[0].shape[-1]

    # join all data, and keep the order in which the methods appeared for the first time
    data = defaultdict(lambda: {'x': np.empty(shape=(0)), 'y': np.empty(shape=(0))})
    method_order = []
    for method, test_prevs_i, estim_prevs_i, tr_prev_i in zip(method_names, true_prevs, estim_prevs, tr_prevs):
        tr_prev_i = np.repeat(tr_prev_i.reshape(1, -1), repeats=test_prevs_i.shape[0], axis=0)

        tr_test_drifts = x_error(test_prevs_i, tr_prev_i)
        data[method]['x'] = np.concatenate([data[method]['x'], tr_test_drifts])

        method_drifts = y_error(test_prevs_i, estim_prevs_i)
        data[method]['y'] = np.concatenate([data[method]['y'], method_drifts])

        if method not in method_order:
            method_order.append(method)

    bins = np.linspace(0, 1, n_bins)
    binwidth = 1 / (n_bins - 1)
    min_x, max_x = None, None
    for method in method_order:
        tr_test_drifts = data[method]['x']
        method_drifts = data[method]['y']

        inds = np.digitize(tr_test_drifts, bins, right=True)
        xs, ys, ystds = [], [], []
        for ind in range(len(bins)):
            selected = inds == ind
            if selected.sum() > 0:
                xs.append(ind * binwidth)
                ys.append(np.mean(method_drifts[selected]))
                ystds.append(np.std(method_drifts[selected]))

        xs = np.asarray(xs)
        ys = np.asarray(ys)
        ystds = np.asarray(ystds)

        min_x_method, max_x_method = xs.min(), xs.max()
        min_x = min_x_method if min_x is None or min_x_method < min_x else min_x
        max_x = max_x_method if max_x is None or max_x_method > max_x else max_x

        ax.errorbar(xs, ys, fmt='-', marker='o', label=method, markersize=3, zorder=2)
        if show_std:
            ax.fill_between(xs, ys - ystds, ys + ystds, alpha=0.25)

    ax.set(xlabel=f'Distribution shift between training set and test sample',
           ylabel=f'{error_name.upper()} (true distribution, predicted distribution)',
           title=title)
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    ax.set_xlim(min_x, max_x)

    save_or_show(savepath)


def save_or_show(savepath):
    # if savepath is specified, then saves the plot in that path; otherwise the plot is shown
    if savepath is not None:
        qp.util.create_parent_dir(savepath)
        # plt.tight_layout()
        plt.savefig(savepath)
    else:
        plt.show()
@@ -64,6 +64,10 @@ def get_quapy_home():
     return home


+def create_parent_dir(path):
+    os.makedirs(Path(path).parent, exist_ok=True)
+
+
 def pickled_resource(pickle_path:str, generation_func:callable, *args):
     if pickle_path is None:
         return generation_func(*args)
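The example script relies on pickled_resource to cache the slow gen_data() call; the behavior is a compute-or-load pattern roughly along these lines (a sketch of the idea, not the exact library code):

import os
import pickle
from pathlib import Path

def pickled_resource_sketch(pickle_path, generation_func, *args):
    if pickle_path is None:
        return generation_func(*args)
    if os.path.exists(pickle_path):
        # a cached result exists: load and return it
        return pickle.load(open(pickle_path, 'rb'))
    # otherwise compute the resource, cache it for the next run, and return it
    resource = generation_func(*args)
    os.makedirs(Path(pickle_path).parent, exist_ok=True)
    pickle.dump(resource, open(pickle_path, 'wb'), pickle.HIGHEST_PROTOCOL)
    return resource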