plot functionality added

Alejandro Moreo Fernandez 2021-01-07 17:58:48 +01:00
parent 5894d46b31
commit d1b449d2e9
9 changed files with 318 additions and 50 deletions

plot_example.py Normal file

@@ -0,0 +1,48 @@
from sklearn.model_selection import GridSearchCV
import numpy as np
import quapy as qp
from sklearn.linear_model import LogisticRegression
sample_size = 500
qp.environ['SAMPLE_SIZE'] = sample_size
def gen_data():
data = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
models = [
qp.method.aggregative.CC,
qp.method.aggregative.ACC,
qp.method.aggregative.PCC,
qp.method.aggregative.PACC,
qp.method.aggregative.HDy,
qp.method.aggregative.EMQ,
qp.method.meta.ECC,
qp.method.meta.EACC,
qp.method.meta.EHDy,
]
method_names, true_prevs, estim_prevs, tr_prevs = [], [], [], []
for Quantifier in models:
print(f'training {Quantifier.__name__}')
lr = LogisticRegression(max_iter=1000, class_weight='balanced')
# lr = GridSearchCV(lr, param_grid={'C':np.logspace(-3,3,7)}, n_jobs=-1)
model = Quantifier(lr).fit(data.training)
true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(
model, data.test, sample_size, n_repetitions=20, n_prevpoints=11)
method_names.append(Quantifier.__name__)
true_prevs.append(true_prev)
estim_prevs.append(estim_prev)
tr_prevs.append(data.training.prevalence())
return method_names, true_prevs, estim_prevs, tr_prevs
method_names, true_prevs, estim_prevs, tr_prevs = qp.util.pickled_resource('./plots/plot_data.pkl', gen_data)
qp.plot.error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=11, savepath='./plots/err_drift.png')
qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, savepath='./plots/bin_diag.png')
qp.plot.binary_bias_global(method_names, true_prevs, estim_prevs, savepath='./plots/bin_bias.png')
qp.plot.binary_bias_bins(method_names, true_prevs, estim_prevs, nbins=11, savepath='./plots/bin_bias_bin.png')

quapy/__init__.py

@@ -4,6 +4,8 @@ from . import functional
from . import method
from . import data
from . import evaluation
from . import plot
from . import util
from method.aggregative import isaggregative, isprobabilistic
@@ -17,4 +19,4 @@ environ = {
def isbinary(x):
return data.isbinary(x) or method.aggregative.isbinary(x)
return data.isbinary(x) or method.isbinary(x)

quapy/evaluation.py

@@ -30,9 +30,9 @@ def artificial_sampling_prediction(
:param random_seed: allows replicating the samplings. The seed is local to the method and does not affect
any other random process.
:param verbose: if True, shows a progress bar
:return: two ndarrays of [m,n] with m the number of samples (n_prevpoints*n_repetitions) and n the
:return: two ndarrays of shape (m,n) with m the number of samples (n_prevpoints*n_repetitions) and n the
number of classes. The first one contains the true prevalences for the samples generated while the second one
containing the the prevalences estimations
contains the prevalence estimations
"""
with temp_seed(random_seed):
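For reference, the two arrays returned by artificial_sampling_prediction pair up row by row; a minimal sketch of how they can be consumed (synthetic data stands in for real predictions; this snippet is not part of the commit):

import numpy as np

m, n = 11 * 20, 2                                # n_prevpoints * n_repetitions samples, n classes
rng = np.random.default_rng(0)
true_prevs = rng.dirichlet(np.ones(n), size=m)   # stand-in for the true prevalences
estim_prevs = rng.dirichlet(np.ones(n), size=m)  # stand-in for the estimated prevalences

# rows correspond one-to-one, so per-sample errors reduce over axis 1
ae_per_sample = np.abs(true_prevs - estim_prevs).mean(axis=1)
print(f'MAE over {m} samples: {ae_per_sample.mean():.4f}')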

quapy/method/__init__.py

@@ -5,13 +5,13 @@ from . import meta
AGGREGATIVE_METHODS = {
aggregative.ClassifyAndCount,
aggregative.AdjustedClassifyAndCount,
aggregative.ProbabilisticClassifyAndCount,
aggregative.ProbabilisticAdjustedClassifyAndCount,
aggregative.ExplicitLossMinimisation,
aggregative.ExpectationMaximizationQuantifier,
aggregative.HellingerDistanceY
aggregative.CC,
aggregative.ACC,
aggregative.PCC,
aggregative.PACC,
aggregative.ELM,
aggregative.EMQ,
aggregative.HDy
}
NON_AGGREGATIVE_METHODS = {

quapy/method/aggregative.py

@@ -1,5 +1,6 @@
import numpy as np
from copy import deepcopy
from sklearn.base import BaseEstimator, clone
import functional as F
import error
from method.base import BaseQuantifier, BinaryQuantifier
@@ -130,13 +131,13 @@ def training_helper(learner,
# Methods
# ------------------------------------
class ClassifyAndCount(AggregativeQuantifier):
class CC(AggregativeQuantifier):
"""
The most basic Quantification method. One that simply classifies all instances and counts how many have been
attributed to each of the classes in order to compute class prevalence estimates.
"""
def __init__(self, learner):
def __init__(self, learner:BaseEstimator):
self.learner = learner
def fit(self, data: LabelledCollection, fit_learner=True):
@@ -153,9 +154,9 @@ class ClassifyAndCount(AggregativeQuantifier):
return F.prevalence_from_labels(classif_predictions, self.n_classes)
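The docstring above says it all: classify-and-count reduces to a normalized count of predicted labels. A tiny illustration (not part of the commit):

import numpy as np

y_pred = np.array([0, 1, 1, 0, 1])                           # classifier decisions on a sample
prevalence = np.bincount(y_pred, minlength=2) / len(y_pred)  # normalized label counts
print(prevalence)                                            # [0.4 0.6]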
class AdjustedClassifyAndCount(AggregativeQuantifier):
class ACC(AggregativeQuantifier):
def __init__(self, learner):
def __init__(self, learner:BaseEstimator):
self.learner = learner
def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3):
@@ -169,7 +170,7 @@ class AdjustedClassifyAndCount(AggregativeQuantifier):
:return: self
"""
self.learner, validation = training_helper(self.learner, data, fit_learner, val_split=val_split)
self.cc = ClassifyAndCount(self.learner)
self.cc = CC(self.learner)
y_ = self.classify(validation.instances)
y = validation.labels
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
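The matrix estimated here is what solve_adjustment later inverts. A minimal sketch of the adjustment idea (illustrative only; it assumes a direct linear solve followed by clipping and renormalization):

import numpy as np

# M[i, j] ~ P(classifier predicts i | true class is j), estimated on the validation split
M = np.array([[0.9, 0.2],
              [0.1, 0.8]])
prevs_estim = np.array([0.55, 0.45])        # raw classify-and-count estimate
adjusted = np.linalg.solve(M, prevs_estim)  # undo the misclassification process
adjusted = np.clip(adjusted, 0, 1)
adjusted /= adjusted.sum()                  # project back onto the simplex
print(adjusted)                             # [0.5 0.5]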
@@ -182,7 +183,7 @@ class AdjustedClassifyAndCount(AggregativeQuantifier):
def aggregate(self, classif_predictions):
prevs_estim = self.cc.aggregate(classif_predictions)
return AdjustedClassifyAndCount.solve_adjustment(self.Pte_cond_estim_, prevs_estim)
return ACC.solve_adjustment(self.Pte_cond_estim_, prevs_estim)
@classmethod
def solve_adjustment(cls, PteCondEstim, prevs_estim):
@@ -198,8 +199,8 @@ class AdjustedClassifyAndCount(AggregativeQuantifier):
return adjusted_prevs
class ProbabilisticClassifyAndCount(AggregativeProbabilisticQuantifier):
def __init__(self, learner):
class PCC(AggregativeProbabilisticQuantifier):
def __init__(self, learner:BaseEstimator):
self.learner = learner
def fit(self, data : LabelledCollection, fit_learner=True):
@@ -210,9 +211,9 @@ class ProbabilisticClassifyAndCount(AggregativeProbabilisticQuantifier):
return F.prevalence_from_probabilities(classif_posteriors, binarize=False)
class ProbabilisticAdjustedClassifyAndCount(AggregativeProbabilisticQuantifier):
class PACC(AggregativeProbabilisticQuantifier):
def __init__(self, learner):
def __init__(self, learner:BaseEstimator):
self.learner = learner
def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3):
@@ -228,7 +229,7 @@ class ProbabilisticAdjustedClassifyAndCount(AggregativeProbabilisticQuantifier):
self.learner, validation = training_helper(
self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split
)
self.pcc = ProbabilisticClassifyAndCount(self.learner)
self.pcc = PCC(self.learner)
y_ = self.soft_classify(validation.instances)
y = validation.labels
confusion = np.empty(shape=(data.n_classes, data.n_classes))
@@ -246,7 +247,7 @@ class ProbabilisticAdjustedClassifyAndCount(AggregativeProbabilisticQuantifier):
def aggregate(self, classif_posteriors):
prevs_estim = self.pcc.aggregate(classif_posteriors)
return AdjustedClassifyAndCount.solve_adjustment(self.Pte_cond_estim_, prevs_estim)
return ACC.solve_adjustment(self.Pte_cond_estim_, prevs_estim)
def classify(self, data):
return self.pcc.classify(data)
@@ -255,12 +256,12 @@ class ProbabilisticAdjustedClassifyAndCount(AggregativeProbabilisticQuantifier):
return self.pcc.posterior_probabilities(data)
class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
class EMQ(AggregativeProbabilisticQuantifier):
MAX_ITER = 1000
EPSILON = 1e-4
def __init__(self, learner):
def __init__(self, learner:BaseEstimator):
self.learner = learner
def fit(self, data: LabelledCollection, fit_learner=True):
@@ -279,7 +280,7 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
s, converged = 0, False
qs_prev_ = None
while not converged and s < ExpectationMaximizationQuantifier.MAX_ITER:
while not converged and s < EMQ.MAX_ITER:
# E-step: ps is Ps(y=+1|xi)
ps_unnormalized = (qs / Ptr) * Px
ps = ps_unnormalized / ps_unnormalized.sum(axis=1).reshape(-1,1)
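The E-step above reweights the test posteriors by qs/Ptr; together with the M-step it forms a simple fixed-point iteration. A self-contained sketch of the whole loop (illustrative, not the commit's code):

import numpy as np

rng = np.random.default_rng(0)
Ptr = np.array([0.5, 0.5])                # training prevalence
Px = rng.dirichlet(np.ones(2), size=50)   # stand-in for the posteriors P(y|x) on 50 test items
qs = Ptr.copy()
for _ in range(1000):                     # MAX_ITER-style cap
    ps = (qs / Ptr) * Px                  # E-step: reweight the posteriors
    ps /= ps.sum(axis=1, keepdims=True)
    qs_new = ps.mean(axis=0)              # M-step: re-estimate the prevalence
    if np.abs(qs_new - qs).max() < 1e-4:  # EPSILON-style stopping criterion
        qs = qs_new
        break
    qs = qs_new
print(qs)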
@@ -299,14 +300,14 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
return qs
class HellingerDistanceY(AggregativeProbabilisticQuantifier, BinaryQuantifier):
class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
"""
Implementation of the method based on the Hellinger Distance y (HDy) proposed by
González-Castro, V., Alaiz-Rodríguez, R., and Alegre, E. (2013). Class distribution
estimation based on the Hellinger distance. Information Sciences, 218:146–164.
"""
def __init__(self, learner):
def __init__(self, learner:BaseEstimator):
self.learner = learner
def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3):
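HDy searches for the mixture of the validation score distributions (one per class) that is closest, in Hellinger distance, to the test score distribution. The distance itself is simple; an illustrative sketch (not the commit's code):

import numpy as np

def hellinger(p, q):
    # Hellinger distance between two discrete distributions (histograms)
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)) / np.sqrt(2)

p = np.array([0.2, 0.5, 0.3])
q = np.array([0.3, 0.4, 0.3])
print(hellinger(p, q))  # ~0.09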
@@ -353,7 +354,7 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier, BinaryQuantifier):
return np.asarray([1-pos_class_prev, pos_class_prev])
class ExplicitLossMinimisation(AggregativeQuantifier, BinaryQuantifier):
class ELM(AggregativeQuantifier, BinaryQuantifier):
def __init__(self, svmperf_base, loss, **kwargs):
self.svmperf_base = svmperf_base
@@ -374,38 +375,38 @@ class ExplicitLossMinimisation(AggregativeQuantifier, BinaryQuantifier):
class SVMQ(ExplicitLossMinimisation):
class SVMQ(ELM):
def __init__(self, svmperf_base, **kwargs):
super(SVMQ, self).__init__(svmperf_base, loss='q', **kwargs)
class SVMKLD(ExplicitLossMinimisation):
class SVMKLD(ELM):
def __init__(self, svmperf_base, **kwargs):
super(SVMKLD, self).__init__(svmperf_base, loss='kld', **kwargs)
class SVMNKLD(ExplicitLossMinimisation):
class SVMNKLD(ELM):
def __init__(self, svmperf_base, **kwargs):
super(SVMNKLD, self).__init__(svmperf_base, loss='nkld', **kwargs)
class SVMAE(ExplicitLossMinimisation):
class SVMAE(ELM):
def __init__(self, svmperf_base, **kwargs):
super(SVMAE, self).__init__(svmperf_base, loss='mae', **kwargs)
class SVMRAE(ExplicitLossMinimisation):
class SVMRAE(ELM):
def __init__(self, svmperf_base, **kwargs):
super(SVMRAE, self).__init__(svmperf_base, loss='mrae', **kwargs)
CC = ClassifyAndCount
ACC = AdjustedClassifyAndCount
PCC = ProbabilisticClassifyAndCount
PACC = ProbabilisticAdjustedClassifyAndCount
ELM = ExplicitLossMinimisation
EMQ = ExpectationMaximizationQuantifier
HDy = HellingerDistanceY
ClassifyAndCount = CC
AdjustedClassifyAndCount = ACC
ProbabilisticClassifyAndCount = PCC
ProbabilisticAdjustedClassifyAndCount = PACC
ExplicitLossMinimisation = ELM
ExpectationMaximizationQuantifier = EMQ
HellingerDistanceY = HDy
class OneVsAll(AggregativeQuantifier):
@@ -414,7 +415,7 @@ class OneVsAll(AggregativeQuantifier):
quantifier for each class, and then l1-normalizes the outputs so that the class prevalences sum up to 1.
This variant was used, along with the ExplicitLossMinimisation quantifier, in
Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
Social Network Analysis and Mining6(19), 1–22 (2016)
Social Network Analysis and Mining 6(19), 1–22 (2016)
"""
def __init__(self, binary_quantifier, n_jobs=-1):
@@ -484,15 +485,14 @@ class OneVsAll(AggregativeQuantifier):
self.dict_binary_quantifiers[c].fit(bindata)
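The l1-normalization mentioned in the OneVsAll docstring above is the step that turns the per-class binary estimates into a proper distribution; a tiny illustration (not part of the commit):

import numpy as np

raw = np.array([0.5, 0.3, 0.4])  # positive prevalence estimated by each binary quantifier
prevalence = raw / raw.sum()     # l1-normalize so the estimates sum to 1
print(prevalence)                # approximately [0.417 0.25 0.333]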
def isaggregative(model):
def isaggregative(model:BaseQuantifier):
return isinstance(model, AggregativeQuantifier)
def isprobabilistic(model):
def isprobabilistic(model:BaseQuantifier):
return isinstance(model, AggregativeProbabilisticQuantifier)
def isbinary(model):
return isinstance(model, BinaryQuantifier)

quapy/method/base.py

@@ -20,12 +20,24 @@ class BaseQuantifier(metaclass=ABCMeta):
@abstractmethod
def get_params(self, deep=True): ...
@property
def binary(self):
return False
class BinaryQuantifier(BaseQuantifier):
def _check_binary(self, data: LabelledCollection, quantifier_name):
assert data.binary, f'{quantifier_name} works only on problems of binary classification. ' \
f'Use the class OneVsAll to enable {quantifier_name} to work on single-label data.'
@property
def binary(self):
return True
def isbinary(model:BaseQuantifier):
return model.binary
# class OneVsAll(AggregativeQuantifier):
# """

quapy/method/neural.py

@@ -76,11 +76,11 @@ class QuaNetTrainer(BaseQuantifier):
self.tr_prev = data.prevalence()
self.quantifiers = {
'cc': ClassifyAndCount(self.learner).fit(data, fit_learner=False),
'acc': AdjustedClassifyAndCount(self.learner).fit(data, fit_learner=False),
'pcc': ProbabilisticClassifyAndCount(self.learner).fit(data, fit_learner=False),
'pacc': ProbabilisticAdjustedClassifyAndCount(self.learner).fit(data, fit_learner=False),
'emq': ExpectationMaximizationQuantifier(self.learner).fit(data, fit_learner=False),
'cc': CC(self.learner).fit(data, fit_learner=False),
'acc': ACC(self.learner).fit(data, fit_learner=False),
'pcc': PCC(self.learner).fit(data, fit_learner=False),
'pacc': PACC(self.learner).fit(data, fit_learner=False),
'emq': EMQ(self.learner).fit(data, fit_learner=False),
}
self.status = {

quapy/plot.py Normal file

@@ -0,0 +1,202 @@
from collections import defaultdict
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import quapy as qp
plt.rcParams['figure.figsize'] = [12, 8]
plt.rcParams['figure.dpi'] = 200
def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=None, savepath=None):
fig, ax = plt.subplots()
ax.set_aspect('equal')
ax.grid()
ax.plot([0, 1], [0, 1], '--k', label='ideal', zorder=1)
for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs):
true_prev = true_prev[:,pos_class]
estim_prev = estim_prev[:,pos_class]
x_ticks = np.unique(true_prev)
x_ticks.sort()
y_ave = np.asarray([estim_prev[true_prev == x].mean() for x in x_ticks])
y_std = np.asarray([estim_prev[true_prev == x].std() for x in x_ticks])
ax.errorbar(x_ticks, y_ave, fmt='-', marker='o', label=method, markersize=3, zorder=2)
ax.fill_between(x_ticks, y_ave - y_std, y_ave + y_std, alpha=0.25)
ax.set(xlabel='true prevalence', ylabel='estimated prevalence', title=title)
ax.set_ylim(0, 1)
ax.set_xlim(0, 1)
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
save_or_show(savepath)
def binary_bias_global(method_names, true_prevs, estim_prevs, pos_class=1, title=None, savepath=None):
fig, ax = plt.subplots()
ax.grid()
data, labels = [], []
for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs):
true_prev = true_prev[:,pos_class]
estim_prev = estim_prev[:,pos_class]
data.append(estim_prev-true_prev)
labels.append(method)
ax.boxplot(data, labels=labels, patch_artist=False, showmeans=True)
ax.set(ylabel='error bias', title=title)
save_or_show(savepath)
def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=None, nbins=21, colormap=cm.tab10,
vertical_xticks=False, savepath=None):
from pylab import boxplot, plot, setp
fig, ax = plt.subplots()
ax.grid()
bins = np.linspace(0, 1, nbins)
binwidth = 1/(nbins - 1)
data = {}
for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs):
true_prev = true_prev[:,pos_class]
estim_prev = estim_prev[:,pos_class]
data[method] = []
inds = np.digitize(true_prev, bins, right=True)
for ind in range(len(bins)):
selected = inds==ind
data[method].append(estim_prev[selected] - true_prev[selected])
nmethods = len(method_names)
boxwidth = binwidth/(nmethods+1)
for i,bin in enumerate(bins[:-1]):
boxdata = [data[method][i] for method in method_names]
positions = [bin+(i*boxwidth)+boxwidth for i,_ in enumerate(method_names)]
box = boxplot(boxdata, showmeans=False, positions=positions, widths = boxwidth, sym='+', patch_artist=True)
for boxid in range(len(method_names)):
c = colormap.colors[boxid]
setp(box['fliers'][boxid], color=c, marker='+', markersize=3., markeredgecolor=c)
setp(box['boxes'][boxid], color=c)
setp(box['medians'][boxid], color='k')
major_xticks_positions, minor_xticks_positions = [], []
major_xticks_labels, minor_xticks_labels = [], []
for i,b in enumerate(bins[:-1]):
major_xticks_positions.append(b)
minor_xticks_positions.append(b + binwidth / 2)
major_xticks_labels.append('')
minor_xticks_labels.append(f'[{bins[i]:.2f}-{bins[i + 1]:.2f})')
ax.set_xticks(major_xticks_positions)
ax.set_xticks(minor_xticks_positions, minor=True)
ax.set_xticklabels(major_xticks_labels)
ax.set_xticklabels(minor_xticks_labels, minor=True, rotation='vertical' if vertical_xticks else 'horizontal')
if vertical_xticks:
# Pad margins so that markers don't get clipped by the axes
plt.margins(0.2)
# Tweak spacing to prevent clipping of tick-labels
plt.subplots_adjust(bottom=0.15)
# adds the legend to the list hs, initialized with the "ideal" quantifier (one that has 0 bias across all bins,
# i.e., a line from (0,0) to (1,0)). The other elements are simply labelled dot-plots that are removed (by setting
# set_visible to False for all but the first element) once the legend has been placed
hs=[ax.plot([0, 1], [0, 0], '-k', zorder=2)[0]]
for colorid in range(len(method_names)):
h, = plot([1, 1], '-s', markerfacecolor=colormap.colors[colorid], color='k',
mec=colormap.colors[colorid], linewidth=1.)
hs.append(h)
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
ax.legend(hs, ['ideal']+method_names, loc='center left', bbox_to_anchor=(1, 0.5))
[h.set_visible(False) for h in hs[1:]]
# x-axis and y-axis labels and limits
ax.set(xlabel='prevalence', ylabel='error bias', title=title)
# ax.set_ylim(-1, 1)
ax.set_xlim(0, 1)
save_or_show(savepath)
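Both this function and error_by_drift below rely on the same np.digitize idiom to group samples into prevalence bins; a small worked example (illustrative, not part of the commit):

import numpy as np

bins = np.linspace(0, 1, 11)                # 11 edges -> 10 intervals of width 0.1
vals = np.array([0.0, 0.05, 0.1, 0.95, 1.0])
inds = np.digitize(vals, bins, right=True)  # index of the interval each value falls in
print(inds)                                 # [ 0  1  1 10 10]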
def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, n_bins=21, error_name='ae', show_std=True,
title=f'Quantification error as a function of distribution shift',
savepath=None):
fig, ax = plt.subplots()
ax.grid()
x_error = qp.error.ae
y_error = getattr(qp.error, error_name)
ndims = tr_prevs[0].shape[-1]
# join all data, and keep the order in which the methods appeared for the first time
data = defaultdict(lambda:{'x':np.empty(shape=(0)), 'y':np.empty(shape=(0))})
method_order = []
for method, test_prevs_i, estim_prevs_i, tr_prev_i in zip(method_names, true_prevs, estim_prevs, tr_prevs):
tr_prev_i = np.repeat(tr_prev_i.reshape(1,-1), repeats=test_prevs_i.shape[0], axis=0)
tr_test_drifts = x_error(test_prevs_i, tr_prev_i)
data[method]['x'] = np.concatenate([data[method]['x'], tr_test_drifts])
method_drifts = y_error(test_prevs_i, estim_prevs_i)
data[method]['y'] = np.concatenate([data[method]['y'], method_drifts])
if method not in method_order:
method_order.append(method)
bins = np.linspace(0, 1, n_bins)
binwidth = 1 / (n_bins - 1)
min_x, max_x = None, None
for method in method_order:
tr_test_drifts = data[method]['x']
method_drifts = data[method]['y']
inds = np.digitize(tr_test_drifts, bins, right=True)
xs, ys, ystds = [], [], []
for ind in range(len(bins)):
selected = inds==ind
if selected.sum() > 0:
xs.append(ind*binwidth)
ys.append(np.mean(method_drifts[selected]))
ystds.append(np.std(method_drifts[selected]))
xs = np.asarray(xs)
ys = np.asarray(ys)
ystds = np.asarray(ystds)
min_x_method, max_x_method = xs.min(), xs.max()
min_x = min_x_method if min_x is None or min_x_method < min_x else min_x
max_x = max_x_method if max_x is None or max_x_method > max_x else max_x
ax.errorbar(xs, ys, fmt='-', marker='o', label=method, markersize=3, zorder=2)
if show_std:
ax.fill_between(xs, ys-ystds, ys+ystds, alpha=0.25)
ax.set(xlabel=f'Distribution shift between training set and test sample',
ylabel=f'{error_name.upper()} (true distribution, predicted distribution)',
title=title)
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
ax.set_xlim(min_x, max_x)
save_or_show(savepath)
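The x-axis "drift" plotted here is simply the absolute error between the training prevalence and each test-sample prevalence; for instance (illustrative):

import numpy as np

tr_prev = np.array([0.7, 0.3])
test_prev = np.array([0.4, 0.6])
drift = np.abs(tr_prev - test_prev).mean()  # qp.error.ae-style absolute error
print(drift)                                # ~0.3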
def save_or_show(savepath):
# if savepath is specified, then saves the plot in that path; otherwise the plot is shown
if savepath is not None:
qp.util.create_parent_dir(savepath)
# plt.tight_layout()
plt.savefig(savepath)
else:
plt.show()

quapy/util.py

@@ -64,6 +64,10 @@ def get_quapy_home():
return home
def create_parent_dir(path):
os.makedirs(Path(path).parent, exist_ok=True)
def pickled_resource(pickle_path:str, generation_func:callable, *args):
if pickle_path is None:
return generation_func(*args)
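The hunk is truncated at this point; presumably the function continues by loading the pickle when it exists and generating-and-caching it otherwise. A hedged sketch of that caching pattern (an assumed continuation, not necessarily the commit's exact code):

import os
import pickle
from pathlib import Path

def pickled_resource_sketch(pickle_path, generation_func, *args):
    if pickle_path is None:
        return generation_func(*args)         # no path given: just compute
    if os.path.exists(pickle_path):
        with open(pickle_path, 'rb') as fin:  # cached copy available: load it
            return pickle.load(fin)
    instance = generation_func(*args)         # otherwise compute, cache, and return
    os.makedirs(Path(pickle_path).parent, exist_ok=True)
    with open(pickle_path, 'wb') as fout:
        pickle.dump(instance, fout, pickle.HIGHEST_PROTOCOL)
    return instance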