
more fgsld

Alejandro Moreo Fernandez 2021-03-11 19:00:40 +01:00
parent 0d8c6aeba6
commit 8381bce3a8
9 changed files with 88 additions and 69 deletions

View File

@@ -32,4 +32,4 @@ def evaluate_results(methods, datasets, error_name):
print(f'Ave: {np.mean(all):.3f}')
evaluate_results(methods=['epacc*mae1k'], datasets=['*'], error_name='mae')
evaluate_results(methods=['*'], datasets=['*'], error_name='mae')
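Note: the '*' arguments suggest glob-style pattern matching of method and dataset names against the stored results. A minimal sketch of such a selection step, assuming results keyed by (method, dataset) pairs (the helper and data layout are illustrative, not this repository's actual code):

    from fnmatch import fnmatch

    def select_results(results, methods, datasets):
        # results: dict mapping (method, dataset) -> error score (assumed layout)
        return [score for (m, d), score in results.items()
                if any(fnmatch(m, pat) for pat in methods)
                and any(fnmatch(d, pat) for pat in datasets)]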

View File

@@ -1,5 +1,6 @@
from sklearn.linear_model import LogisticRegression
import quapy as qp
from NewMethods.fgsld.fgsld_quantifiers import FakeFGLSD
from classification.methods import PCALR
from method.meta import QuaNet
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
@@ -36,8 +37,10 @@ def experimental_models():
svmperf_params = {'C': __C_range}
#yield 'paccsld', PACCSLD(newLR()), lr_params
# yield 'hdysld', OneVsAll(HDySLD(newLR())), lr_params # <-- promising!
yield 'PACC(5)', PACC(newLR(), val_split=5), {}
yield 'PACC(10)', PACC(newLR(), val_split=10), {}
#yield 'PACC(5)', PACC(newLR(), val_split=5), {}
#yield 'PACC(10)', PACC(newLR(), val_split=10), {}
yield 'FGSLD(3)', FakeFGLSD(newLR(), nbins=3, isomerous=False, recompute_bins=True), {}
yield 'FGSLD(5)', FakeFGLSD(newLR(), nbins=5, isomerous=False, recompute_bins=True), {}
@@ -209,7 +212,7 @@ if __name__ == '__main__':
print(f'Result folder: {args.results}')
np.random.seed(0)
optim_losses = ['mae', 'mrae']
optim_losses = ['mae']
datasets = qp.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN
qp.util.parallel(run, itertools.product(optim_losses, datasets, experimental_models()), n_jobs=settings.N_JOBS)
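Note: qp.util.parallel fans out one job per element of the Cartesian product, i.e. per (loss, dataset, (name, model, params)) combination. A small illustration of the shapes being consumed, assuming run unpacks such a triple (names below are placeholders):

    import itertools

    losses = ['mae']
    datasets = ['dataset-a', 'dataset-b']                  # placeholder names
    models = [('FGSLD(3)', None, {}), ('SLD', None, {})]   # (name, quantifier, param grid)

    for job in itertools.product(losses, datasets, models):
        loss, dataset, (name, model, params) = job         # one such tuple per parallel job
        print(loss, dataset, name)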

View File

@@ -5,7 +5,7 @@ from collections import namedtuple
from sklearn.metrics import brier_score_loss
from sklearn.preprocessing import MultiLabelBinarizer
from metrics import smoothmacroF1, isometric_brier_decomposition, isomerous_brier_decomposition
from NewMethods.fgsld.metrics import smoothmacroF1, isometric_brier_decomposition, isomerous_brier_decomposition
History = namedtuple('History', ('posteriors', 'priors', 'y', 'iteration', 'stopping_criterium'))
MeasureSingleHistory = namedtuple('MeasureSingleHistory', (
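Note: History is a per-iteration snapshot of the EM state. A minimal usage sketch with placeholder values (field order follows the definition above):

    from collections import namedtuple

    History = namedtuple('History', ('posteriors', 'priors', 'y', 'iteration', 'stopping_criterium'))
    snapshot = History(posteriors=None, priors=None, y=None, iteration=0, stopping_criterium=1e-6)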

View File

@@ -1,13 +1,15 @@
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from fgsld.fgsld_quantifiers import FakeFGLSD
from fgsld_quantifiers import FakeFGLSD
from method.aggregative import EMQ, CC
import quapy as qp
import numpy as np
qp.environ['SAMPLE_SIZE'] = 500
dataset = qp.datasets.fetch_reviews('kindle')
dataset = qp.datasets.fetch_reviews('hp')
qp.data.preprocessing.text2tfidf(dataset, min_df=5, inplace=True)
training = dataset.training
@@ -15,22 +17,22 @@ test = dataset.test
cls = CalibratedClassifierCV(LinearSVC())
#cls = LogisticRegression()
method_names, true_prevs, estim_prevs, tr_prevs = [], [], [], []
for model, model_name in [
(CC(cls), 'CC'),
# (FakeFGLSD(cls, nbins=5, isomerous=False, recompute_bins=False), 'FGSLD-isometric-stat-5'),
(FakeFGLSD(cls, nbins=5, isomerous=True, recompute_bins=True), 'FGSLD-isometric-dyn-5'),
# (FakeFGLSD(cls, nbins=5, isomerous=True, recompute_bins=False), 'FGSLD-isomerous-stat-5'),
# (FakeFGLSD(cls, nbins=10, isomerous=True, recompute_bins=True), 'FGSLD-isomerous-dyn-10'),
#(FakeFGLSD(cls, nbins=5, isomerous=False), 'FGSLD-5'),
#(FakeFGLSD(cls, nbins=10, isomerous=False), 'FGSLD-10'),
#(FakeFGLSD(cls, nbins=50, isomerous=False), 'FGSLD-50'),
#(FakeFGLSD(cls, nbins=100, isomerous=False), 'FGSLD-100'),
# (FakeFGLSD(cls, nbins=1, isomerous=False), 'FGSLD-1'),
#(FakeFGLSD(cls, nbins=10, isomerous=True), 'FGSLD-10-ISO'),
# (FakeFGLSD(cls, nbins=50, isomerous=False), 'FGSLD-50'),
# (FakeFGLSD(cls, nbins=20, isomerous=False, recompute_bins=True), 'FGSLD-isometric-dyn-20'),
(FakeFGLSD(cls, nbins=11, isomerous=False, recompute_bins=True), 'FGSLD-isometric-dyn-11'),
#(FakeFGLSD(cls, nbins=8, isomerous=False, recompute_bins=True), 'FGSLD-isometric-dyn-8'),
#(FakeFGLSD(cls, nbins=6, isomerous=False, recompute_bins=True), 'FGSLD-isometric-dyn-6'),
(FakeFGLSD(cls, nbins=5, isomerous=False, recompute_bins=True), 'FGSLD-isometric-dyn-5'),
#(FakeFGLSD(cls, nbins=4, isomerous=False, recompute_bins=True), 'FGSLD-isometric-dyn-4'),
(FakeFGLSD(cls, nbins=3, isomerous=False, recompute_bins=True), 'FGSLD-isometric-dyn-3'),
# (FakeFGLSD(cls, nbins=1, isomerous=False, recompute_bins=True), 'FGSLD-isometric-dyn-1'),
# (FakeFGLSD(cls, nbins=3, isomerous=False, recompute_bins=False), 'FGSLD-isometric-sta-3'),
(EMQ(cls), 'SLD'),
]:
print('running ', model_name)
@@ -42,6 +44,8 @@ for model, model_name in [
true_prevs.append(true_prev)
estim_prevs.append(estim_prev)
tr_prevs.append(training.prevalence())
#if hasattr(model, 'iterations'):
# print(f'iterations ave={np.mean(model.iterations):.3f}, min={np.min(model.iterations):.3f}, max={np.max(model.iterations):.3f}')
qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, train_prev=tr_prevs[0], savepath='./plot_fglsd.png')
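Note: each (model, model_name) pair is fit on the training set and evaluated over test samples, and binary_diagonal then plots estimated vs. true prevalences for all methods in one figure. The elided loop body presumably follows QuaPy's artificial-sampling protocol; a hedged sketch of one iteration (the evaluation call and its arguments are assumptions, not shown in this diff):

    model.fit(training)
    true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(
        model, test, qp.environ['SAMPLE_SIZE'])  # assumed API
    method_names.append(model_name)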

View File

@@ -14,6 +14,7 @@ class FakeFGLSD(BaseQuantifier):
self.nbins = nbins
self.isomerous = isomerous
self.recompute_bins = recompute_bins
self.iterations=[]
def fit(self, data: LabelledCollection):
self.Xtr, self.ytr = data.Xy
@@ -24,6 +25,7 @@ class FakeFGLSD(BaseQuantifier):
tr_priors = F.prevalence_from_labels(self.ytr, n_classes=2)
fgsld = FineGrainedSLD(self.Xtr, instances, self.ytr, tr_priors, self.learner, n_bins=self.nbins)
priors, posteriors = fgsld.run(self.isomerous, compute_bins_at_every_iter=self.recompute_bins)
self.iterations.append(fgsld.iterations)
return priors
def get_params(self, deep=True):
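Note: the new iterations list is what the commented-out reporting in the experiment script above would consume, e.g.:

    if hasattr(model, 'iterations'):
        print(f'iterations: ave={np.mean(model.iterations):.1f}, '
              f'min={np.min(model.iterations)}, max={np.max(model.iterations)}')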

View File

@@ -1,9 +1,9 @@
import numpy as np
from metrics import isomerous_bins, isometric_bins
from em import History, get_measures_single_history
from NewMethods.fgsld.metrics import isomerous_bins, isometric_bins
from NewMethods.fgsld.em import History, get_measures_single_history
from sklearn.model_selection import cross_val_predict
import math
from scipy.special import softmax
class FineGrainedSLD:
def __init__(self, x_tr, x_te, y_tr, tr_priors, clf, n_bins=10):
@@ -16,7 +16,7 @@ class FineGrainedSLD:
self.history: [History] = []
self.multi_class = False
def run(self, isomerous_binning, epsilon=1e-6, compute_bins_at_every_iter=True, return_posteriors_hist=False):
def run(self, isomerous_binning, epsilon=1e-6, compute_bins_at_every_iter=True):
"""
Run the FGSLD algorithm.
@@ -26,22 +26,18 @@ class FineGrainedSLD:
:param return_posteriors_hist: whether to return posteriors at every iteration or not.
:return: If `return_posteriors_hist` is true, the returned posteriors will be a list of numpy arrays, else a single numpy array with posteriors at last iteration.
"""
smoothing_tr = 1 / (2 * self.tr_preds.shape[0])
smoothing_te = 1 / (2 * self.te_preds.shape[0])
smoothing_tr = 1e-9 # 1 / (2 * self.tr_preds.shape[0])
smoothing_te = 1e-9 # 1 / (2 * self.te_preds.shape[0])
s = 0
tr_bin_priors = np.zeros((self.n_bins, self.tr_preds.shape[1]), dtype=float)
te_bin_priors = np.zeros((self.n_bins, self.te_preds.shape[1]), dtype=float)
tr_bins = self.__create_bins(training=True, isomerous_binning=isomerous_binning)
te_bins = self.__create_bins(training=False, isomerous_binning=isomerous_binning)
self.__compute_bins_priors(tr_bin_priors, self.tr_preds, tr_bins, smoothing_tr)
te_preds_cp = self.te_preds.copy()
val = 2 * epsilon
if return_posteriors_hist:
posteriors_hist = [self.te_preds.copy()]
while not val < epsilon and s < 1000:
assert np.all(np.around(self.te_preds.sum(axis=1), 4) == 1), f"Probabilities do not sum to 1:\ns={s}, " \
f"probs={self.te_preds.sum(axis=1)}"
if compute_bins_at_every_iter:
if compute_bins_at_every_iter or s==0:
te_bins = self.__create_bins(training=False, isomerous_binning=isomerous_binning)
if s == 0:
@@ -50,34 +46,47 @@
te_bin_priors_prev = te_bin_priors.copy()
self.__compute_bins_priors(te_bin_priors, self.te_preds, te_bins, smoothing_te)
te_preds_cp = self.te_preds.copy()
for label_idx, bins in te_bins.items():
for i, bin_ in enumerate(bins):
if bin_.shape[0] == 0:
continue
te = te_bin_priors[i][label_idx]
tr = tr_bin_priors[i][label_idx]
# local_min = (math.floor(tr * 10) / 10)
alpha = 1
beta = 0.1
local_te = te_bin_priors[i][label_idx]
global_te = self.te_preds[:,label_idx].mean()
te = local_te*alpha + global_te*(1-alpha)
local_tr = tr_bin_priors[i][label_idx]
global_tr = self.tr_priors[label_idx]
tr = local_tr*beta + global_tr*(1-beta)
#local_min = (math.floor(tr * self.n_bins) / self.n_bins)
# local_max = local_min + .1
# trans = lambda l: min(max((l - local_min) / 1, 0), 1)
trans = lambda l: l
self.te_preds[:, label_idx][bin_] = (te_preds_cp[:, label_idx][bin_]) * \
(trans(te) / trans(tr))
assert not isomerous_binning, 'not tested'
#trans = lambda l: l - local_min
# trans = lambda l: l
# ratio = (trans(te) / trans(tr))
#ratio = np.clip(ratio, 0.1, 2)
#ratio = ratio**3
#self.te_preds[:, label_idx][bin_] = (te_preds_cp[:, label_idx][bin_]) * ratio
old_posterior = te_preds_cp[:, label_idx][bin_]
lr = 1
#self.te_preds[:, label_idx][bin_] = np.clip(old_posterior + (te-tr)*lr, 0, None)
self.te_preds[:, label_idx][bin_] = np.clip(old_posterior + (te - tr) * lr, 0, None)
#self.te_preds[:, label_idx][bin_] = (te_preds_cp[:, label_idx][bin_]) * ratio
# Normalization step
self.te_preds = (self.te_preds / self.te_preds.sum(axis=1, keepdims=True))
#self.te_preds = softmax(self.te_preds, axis=1)
val = 0
for label_idx in range(te_bin_priors.shape[1]):
temp = max(abs((te_bin_priors[:, label_idx] / te_bin_priors_prev[:, label_idx]) - 1))
if temp > val:
val = temp
val = np.max(np.abs(te_bin_priors / te_bin_priors_prev - 1))  # max relative change in the bin priors
s += 1
if return_posteriors_hist:
posteriors_hist.append(self.te_preds.copy())
if return_posteriors_hist:
return self.te_preds.mean(axis=0), posteriors_hist
return self.te_preds.mean(axis=0), self.te_preds
self.iterations = s
priors = self.te_preds.mean(axis=0)
posteriors = self.te_preds
return priors, posteriors
def __compute_bins_priors(self, bin_priors_placeholder, posteriors, bins, smoothing):
for label_idx, bins in bins.items():
@@ -85,23 +94,10 @@ class FineGrainedSLD:
if bin_.shape[0] == 0:
bin_priors_placeholder[i, label_idx] = smoothing
continue
numerator = posteriors[:, label_idx][bin_].mean()
numerator = posteriors[bin_, label_idx].mean()
bin_prior = (numerator + smoothing) / (1 + self.n_bins * smoothing) # normalize priors
bin_priors_placeholder[i, label_idx] = bin_prior
def __find_bin_idx(self, label_bins: [np.array], idx: int or list):
if hasattr(idx, '__len__'):
idxs = np.zeros(len(idx), dtype=np.int)
for i, bin_ in enumerate(label_bins):
for j, id_ in enumerate(idx):
if id_ in bin_:
idxs[j] = i
return idxs
else:
for i, bin_ in enumerate(label_bins):
if idx in bin_:
return i
def __create_bins(self, training: bool, isomerous_binning: bool):
bins = {}
preds = self.tr_preds if training else self.te_preds
@@ -111,6 +107,6 @@ class FineGrainedSLD:
else:
intervals = np.linspace(0., 1., num=self.n_bins, endpoint=False)
for label_idx in range(preds.shape[1]):
bins_ = isometric_bins(label_idx, preds, intervals, 0.1)
bins_ = isometric_bins(label_idx, preds, intervals)
bins[label_idx] = [bins_[i] for i in intervals]
return bins
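Note: the key change in this file replaces the multiplicative bin-wise correction with an additive, clipped update on blended priors. A self-contained sketch of that update for a single bin, mirroring the diff (alpha, beta and lr as above; the numbers are illustrative, and this is the experimental update being tried out, not a finished algorithm):

    import numpy as np

    alpha, beta, lr = 1.0, 0.1, 1.0
    local_te, global_te = 0.30, 0.45  # bin-level and global test prior (illustrative)
    local_tr, global_tr = 0.20, 0.50  # bin-level and global training prior (illustrative)

    te = local_te * alpha + global_te * (1 - alpha)  # alpha=1: purely local test prior
    tr = local_tr * beta + global_tr * (1 - beta)    # beta=0.1: mostly the global training prior

    old_posterior = np.array([0.22, 0.18, 0.25])     # posteriors of the items falling in this bin
    new_posterior = np.clip(old_posterior + (te - tr) * lr, 0, None)
    # rows are afterwards renormalized so that each item's posteriors sum to 1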

View File

@@ -73,10 +73,21 @@ def brier_decomposition(bins, true_labels, predicted_labels, class_=1):
return calibration_score / (labels_len * len(bins)), refinement_score / (labels_len * len(bins))
def isometric_bins(label_index, predicted_labels, bin_intervals, step):
#def isometric_bins(label_index, predicted_labels, bin_intervals, step):
# predicted_class_label = predicted_labels[:, label_index]
# return {interv: np.where(np.logical_and(interv <= predicted_class_label, predicted_class_label < interv + step))[0]
# for interv in bin_intervals}
def isometric_bins(label_index, predicted_labels, bin_intervals):
def next_intv(i):
return bin_intervals[i + 1] if (i + 1) < len(bin_intervals) else 1.
predicted_class_label = predicted_labels[:, label_index]
return {interv: np.where(np.logical_and(interv <= predicted_class_label, predicted_class_label < interv + step))[0]
for interv in bin_intervals}
return {
    interv: np.where(np.logical_and(interv <= predicted_class_label, predicted_class_label < next_intv(i)))[0]
    for i, interv in enumerate(bin_intervals)
}
def isomerous_bins(label_index, predicted_labels, n):
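Note: with the fixed step removed, each isometric bin now runs from one linspace edge to the next, and the last bin closes at 1.0 whatever the number of bins. A worked example with five bins (values follow directly from np.linspace):

    import numpy as np

    intervals = np.linspace(0., 1., num=5, endpoint=False)  # [0.0, 0.2, 0.4, 0.6, 0.8]
    # bin i covers [intervals[i], intervals[i+1]); the last bin covers [0.8, 1.0)
    # the old fixed step=0.1 would instead have produced gapped bins for num<10:
    # [0.0, 0.1), [0.2, 0.3), ... leaving part of the [0, 1] range unbinned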

Binary file not shown (image: 163 KiB before, 238 KiB after)

View File

@@ -352,6 +352,7 @@ class EMQ(AggregativeProbabilisticQuantifier):
@classmethod
def EM(cls, tr_prev, posterior_probabilities, epsilon=EPSILON):
#print('training-priors', tr_prev)
Px = posterior_probabilities
Ptr = np.copy(tr_prev)
qs = np.copy(Ptr) # qs (the running estimate) is initialized as the training prevalence
@@ -359,11 +360,14 @@
s, converged = 0, False
qs_prev_ = None
while not converged and s < EMQ.MAX_ITER:
# E-step: ps is Ps(y=+1|xi)
#print('iter: ', s)
# E-step: ps is Ps(y|xi)
ps_unnormalized = (qs / Ptr) * Px
ps = ps_unnormalized / ps_unnormalized.sum(axis=1).reshape(-1,1)
ps = ps_unnormalized / ps_unnormalized.sum(axis=1, keepdims=True)
#print(f'\tratio=', qs / Ptr)
#print(f'\torigin_posteriors ', Px)
# M-step: qs_pos is Ps+1(y=+1)
# M-step: qs is updated as the mean of the reweighted posteriors
qs = ps.mean(axis=0)
if qs_prev_ is not None and qp.error.mae(qs, qs_prev_) < epsilon and s>10:
@ -373,7 +377,6 @@ class EMQ(AggregativeProbabilisticQuantifier):
s += 1
if not converged:
#raise UserWarning('the method has reached the maximum number of iterations; it might have not converged')
print('[warning] the method has reached the maximum number of iterations; it might have not converged')
return qs, ps
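Note: the loop above is the standard EM-based prior adjustment (SLD): the E-step rescales each posterior by the ratio of the current to the training prevalence and renormalizes rows; the M-step averages the posteriors into the new prevalence estimate. A self-contained sketch of one pass, independent of the class above:

    import numpy as np

    def em_step(Px, qs, Ptr):
        # Px: classifier posteriors (n x c); qs: current estimate; Ptr: training prevalence
        ps = (qs / Ptr) * Px                        # E-step: reweight the posteriors
        ps = ps / ps.sum(axis=1, keepdims=True)     # renormalize each row to sum to 1
        return ps.mean(axis=0), ps                  # M-step: new prevalence estimate

    Px = np.array([[0.7, 0.3], [0.4, 0.6], [0.9, 0.1]])  # illustrative posteriors
    qs, ps = em_step(Px, qs=np.array([0.5, 0.5]), Ptr=np.array([0.5, 0.5]))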