Merge branch 'master' of gitea-s2i2s.isti.cnr.it:moreo/QuaPy
This commit is contained in:
commit
f3b505eb4e
|
@ -0,0 +1,47 @@
|
||||||
|
from sklearn.calibration import CalibratedClassifierCV
|
||||||
|
from sklearn.svm import LinearSVC
|
||||||
|
from fgsld.fgsld_quantifiers import FakeFGLSD
|
||||||
|
from method.aggregative import EMQ, CC
|
||||||
|
import quapy as qp
|
||||||
|
|
||||||
|
|
||||||
|
# Size of every sample drawn from the test set during evaluation.
qp.environ['SAMPLE_SIZE'] = 500

# Fetch the 'kindle' reviews dataset and turn its documents into tf-idf vectors (in place).
dataset = qp.datasets.fetch_reviews('kindle')
qp.data.preprocessing.text2tfidf(dataset, min_df=5, inplace=True)

training = dataset.training
test = dataset.test

# Classifier instance handed to every quantifier below.
cls = CalibratedClassifierCV(LinearSVC())

# Parallel lists, one entry per evaluated quantifier; consumed by the plot at the end.
method_names, true_prevs, estim_prevs, tr_prevs = [], [], [], []

# (quantifier, display name) pairs to evaluate; commented entries are alternative
# configurations that are currently switched off.
# NOTE(review): the active FakeFGLSD entry is labelled "isometric" but is built with
# isomerous=True -- confirm whether the label or the flag is the intended one.
candidates = [
    (CC(cls), 'CC'),
    # (FakeFGLSD(cls, nbins=5, isomerous=False, recompute_bins=False), 'FGSLD-isometric-stat-5'),
    (FakeFGLSD(cls, nbins=5, isomerous=True, recompute_bins=True), 'FGSLD-isometric-dyn-5'),
    # (FakeFGLSD(cls, nbins=5, isomerous=True, recompute_bins=False), 'FGSLD-isomerous-stat-5'),
    # (FakeFGLSD(cls, nbins=10, isomerous=True, recompute_bins=True), 'FGSLD-isomerous-dyn-10'),
    # (FakeFGLSD(cls, nbins=5, isomerous=False), 'FGSLD-5'),
    # (FakeFGLSD(cls, nbins=10, isomerous=False), 'FGSLD-10'),
    # (FakeFGLSD(cls, nbins=50, isomerous=False), 'FGSLD-50'),
    # (FakeFGLSD(cls, nbins=100, isomerous=False), 'FGSLD-100'),
    # (FakeFGLSD(cls, nbins=1, isomerous=False), 'FGSLD-1'),
    # (FakeFGLSD(cls, nbins=10, isomerous=True), 'FGSLD-10-ISO'),
    # (FakeFGLSD(cls, nbins=50, isomerous=False), 'FGSLD-50'),
    (EMQ(cls), 'SLD'),
]

for model, model_name in candidates:
    print('running ', model_name)
    model.fit(training)
    true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(
        model,
        test,
        qp.environ['SAMPLE_SIZE'],
        n_repetitions=5,
        n_prevpoints=11,
        n_jobs=-1,
    )
    method_names.append(model_name)
    true_prevs.append(true_prev)
    estim_prevs.append(estim_prev)
    tr_prevs.append(training.prevalence())

# Every model is fitted on the same training set, so the first stored prevalence is used.
qp.plot.binary_diagonal(
    method_names,
    true_prevs,
    estim_prevs,
    train_prev=tr_prevs[0],
    savepath='./plot_fglsd.png',
)
|
|
@ -0,0 +1,116 @@
|
||||||
|
import numpy as np
|
||||||
|
from metrics import isomerous_bins, isometric_bins
|
||||||
|
from em import History, get_measures_single_history
|
||||||
|
from sklearn.model_selection import cross_val_predict
|
||||||
|
import math
|
||||||
|
|
||||||
|
|
||||||
|
class FineGrainedSLD:
    """Iterative, bin-wise re-scaling of a classifier's test posteriors (FGSLD).

    The constructor stores the classifier's posteriors on the test items and its
    cross-validated posteriors on the training items. `run` then repeatedly
    multiplies each test posterior by the ratio between the test and the training
    "bin prior" of the bin the item falls into, and re-normalizes every row.
    """

    def __init__(self, x_tr, x_te, y_tr, tr_priors, clf, n_bins=10):
        """Compute and store the posteriors the algorithm works on.

        :param x_tr: training covariates, passed to `cross_val_predict`.
        :param x_te: test covariates, passed to `clf.predict_proba`.
        :param y_tr: training labels.
        :param tr_priors: training priors (stored, not used by the methods of this class).
        :param clf: classifier exposing `predict_proba`; `predict_proba(x_te)` is called
            on it directly here.
        :param n_bins: number of bins per class.
        """
        self.y_tr = y_tr
        self.clf = clf
        self.tr_priors = tr_priors
        self.te_preds = clf.predict_proba(x_te)
        self.tr_preds = cross_val_predict(clf, x_tr, y_tr, method='predict_proba', n_jobs=10)
        self.n_bins = n_bins
        self.history = []  # meant to hold History records; never filled by this class
        self.multi_class = False

    def run(self, isomerous_binning, epsilon=1e-6, compute_bins_at_every_iter=True, return_posteriors_hist=False,
            max_iter=1000):
        """
        Run the FGSLD algorithm.

        :param isomerous_binning: whether to use isomerous or isometric binning.
        :param epsilon: stopping condition; iteration ends once the largest relative change of any
            bin prior with respect to the previous iteration falls below this value.
        :param compute_bins_at_every_iter: whether FGSLD should recompute the posterior bins at every iteration or not.
        :param return_posteriors_hist: whether to return posteriors at every iteration or not.
        :param max_iter: maximum number of iterations (default 1000).
        :return: a tuple `(prevalences, posteriors)` where `prevalences` is the column-wise mean of the
            final test posteriors. If `return_posteriors_hist` is true, `posteriors` is a list of numpy
            arrays (the initial posteriors followed by those of every iteration), else a single numpy
            array with the posteriors at the last iteration.
        """
        smoothing_tr = 1 / (2 * self.tr_preds.shape[0])
        smoothing_te = 1 / (2 * self.te_preds.shape[0])
        s = 0
        # builtin float instead of the np.float alias, which was removed in NumPy 1.24
        tr_bin_priors = np.zeros((self.n_bins, self.tr_preds.shape[1]), dtype=float)
        te_bin_priors = np.zeros((self.n_bins, self.te_preds.shape[1]), dtype=float)
        tr_bins = self.__create_bins(training=True, isomerous_binning=isomerous_binning)
        te_bins = self.__create_bins(training=False, isomerous_binning=isomerous_binning)
        self.__compute_bins_priors(tr_bin_priors, self.tr_preds, tr_bins, smoothing_tr)

        val = 2 * epsilon  # guarantees that the loop is entered at least once
        if return_posteriors_hist:
            posteriors_hist = [self.te_preds.copy()]
        while not val < epsilon and s < max_iter:
            assert np.all(np.around(self.te_preds.sum(axis=1), 4) == 1), f"Probabilities do not sum to 1:\ns={s}, " \
                                                                         f"probs={self.te_preds.sum(axis=1)}"
            if compute_bins_at_every_iter:
                te_bins = self.__create_bins(training=False, isomerous_binning=isomerous_binning)

            # at the first iteration the training bin priors act as the "previous" estimate
            if s == 0:
                te_bin_priors_prev = tr_bin_priors.copy()
            else:
                te_bin_priors_prev = te_bin_priors.copy()
            self.__compute_bins_priors(te_bin_priors, self.te_preds, te_bins, smoothing_te)

            # read from a snapshot so that every update uses the posteriors of the previous iteration
            te_preds_cp = self.te_preds.copy()
            for label_idx, label_bins in te_bins.items():
                for i, bin_ in enumerate(label_bins):
                    if bin_.shape[0] == 0:
                        continue
                    te = te_bin_priors[i][label_idx]
                    tr = tr_bin_priors[i][label_idx]
                    self.te_preds[:, label_idx][bin_] = te_preds_cp[:, label_idx][bin_] * (te / tr)

            # Normalization step
            self.te_preds = (self.te_preds / self.te_preds.sum(axis=1, keepdims=True))

            # largest relative change of any bin prior w.r.t. the previous iteration
            val = 0
            for label_idx in range(te_bin_priors.shape[1]):
                temp = max(abs((te_bin_priors[:, label_idx] / te_bin_priors_prev[:, label_idx]) - 1))
                if temp > val:
                    val = temp
            s += 1
            if return_posteriors_hist:
                posteriors_hist.append(self.te_preds.copy())
        if return_posteriors_hist:
            return self.te_preds.mean(axis=0), posteriors_hist
        return self.te_preds.mean(axis=0), self.te_preds

    def __compute_bins_priors(self, bin_priors_placeholder, posteriors, bins, smoothing):
        """Fill `bin_priors_placeholder[i, label]` in place with the smoothed mean posterior of bin `i`.

        :param bin_priors_placeholder: array indexed as [bin, label] that receives the result.
        :param posteriors: array of posteriors indexed as [item, label].
        :param bins: dict mapping a label index to a list of index arrays, one per bin.
        :param smoothing: additive smoothing term; empty bins receive exactly this value.
        """
        for label_idx, label_bins in bins.items():
            for i, bin_ in enumerate(label_bins):
                if bin_.shape[0] == 0:
                    bin_priors_placeholder[i, label_idx] = smoothing
                    continue
                numerator = posteriors[:, label_idx][bin_].mean()
                bin_prior = (numerator + smoothing) / (1 + self.n_bins * smoothing)  # normalize priors
                bin_priors_placeholder[i, label_idx] = bin_prior

    def __find_bin_idx(self, label_bins: list, idx):
        """Return the position in `label_bins` of the bin containing `idx`.

        :param label_bins: list of index arrays, one per bin.
        :param idx: a single item index, or a sized collection of item indexes.
        :return: for a collection, an integer array with one bin position per index (0 for
            indexes found in no bin); for a single index, the position of the first bin that
            contains it, or None if no bin does.
        """
        if hasattr(idx, '__len__'):
            # builtin int instead of the np.int alias, which was removed in NumPy 1.24
            idxs = np.zeros(len(idx), dtype=int)
            for i, bin_ in enumerate(label_bins):
                for j, id_ in enumerate(idx):
                    if id_ in bin_:
                        idxs[j] = i
            return idxs
        for i, bin_ in enumerate(label_bins):
            if idx in bin_:
                return i
        return None

    def __create_bins(self, training: bool, isomerous_binning: bool):
        """Bin the training (`training=True`) or test posteriors, separately for each label.

        :return: dict mapping each label index to its list of bins.
        """
        bins = {}
        preds = self.tr_preds if training else self.te_preds
        if isomerous_binning:
            for label_idx in range(preds.shape[1]):
                bins[label_idx] = isomerous_bins(label_idx, preds, self.n_bins)
        else:
            intervals = np.linspace(0., 1., num=self.n_bins, endpoint=False)
            for label_idx in range(preds.shape[1]):
                # NOTE(review): the constant 0.1 looks like a bin width that only matches
                # n_bins=10 -- confirm against the signature of isometric_bins.
                bins_ = isometric_bins(label_idx, preds, intervals, 0.1)
                bins[label_idx] = [bins_[i] for i in intervals]
        return bins
|
Binary file not shown.
After Width: | Height: | Size: 163 KiB |
|
@ -0,0 +1,161 @@
|
||||||
|
import quapy as qp
|
||||||
|
import numpy as np
|
||||||
|
from os import makedirs
|
||||||
|
import sys, os
|
||||||
|
import pickle
|
||||||
|
from experiments import result_path
|
||||||
|
from tabular import Table
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
# Directory that receives the generated LaTeX tables; created at import time if missing.
tables_path = './tables'
makedirs(tables_path, exist_ok=True)

MAXTONE = 50  # sets the intensity of the maximum color reached by the worst (red) and best (green) results

sample_size = 100
qp.environ['SAMPLE_SIZE'] = sample_size

# Display names, grouped by what the key refers to.
_measure_names = {
    'mae': 'AE',
    'mrae': 'RAE',
    'ae': 'AE',
    'rae': 'RAE',
}
_method_names = {
    'svmkld': 'SVM(KLD)',
    'svmnkld': 'SVM(NKLD)',
    'svmq': 'SVM(Q)',
    'svmae': 'SVM(AE)',
    'svmnae': 'SVM(NAE)',
    'svmmae': 'SVM(AE)',
    'svmmrae': 'SVM(RAE)',
    'quanet': 'QuaNet',
    'hdy': 'HDy',
    'hdysld': 'HDy-SLD',
    'dys': 'DyS',
    'svmperf': '',
}
_dataset_names = {
    'sanders': 'Sanders',
    'semeval13': 'SemEval13',
    'semeval14': 'SemEval14',
    'semeval15': 'SemEval15',
    'semeval16': 'SemEval16',
    'Average': 'Average',
}

# Lookup table from internal identifiers to the names printed in the tables.
nice = {**_measure_names, **_method_names, **_dataset_names}
|
||||||
|
|
||||||
|
def save_table(path, table):
    """Write the string `table` to the text file `path`, reporting the destination on stdout."""
    print(f'saving results in {path}')
    with open(path, 'wt') as out_file:
        out_file.write(table)
|
||||||
|
|
||||||
|
|
||||||
|
def experiment_errors(path, dataset, method, loss):
    """Load the stored predictions of one experiment and score them with `loss`.

    :param path: directory containing the result files.
    :param dataset: dataset name, used to locate the result file.
    :param method: method name, used to locate the result file.
    :param loss: name of an error function in `qp.error` (e.g. 'ae'); the result file is
        looked up under the same name prefixed with 'm' unless it already starts with 'm'.
    :return: the value of `qp.error.<loss>(true_prevs, estim_prevs)`, or None when the
        result file does not exist.
    """
    path = result_path(path, dataset, method, 'm'+loss if not loss.startswith('m') else loss)
    if not os.path.exists(path):
        return None
    # NOTE: pickle can execute arbitrary code while loading; only point this at trusted result files.
    with open(path, 'rb') as result_file:  # closes the handle, which the bare open() call leaked
        true_prevs, estim_prevs, _, _, _, _ = pickle.load(result_file)
    err_fn = getattr(qp.error, loss)
    return err_fn(true_prevs, estim_prevs)
|
||||||
|
|
||||||
|
def nicerm(key):
    """Return the display name of `key` (looked up in `nice`) wrapped in a LaTeX \\mathrm{} command.

    :raises KeyError: if `key` has no entry in `nice`.
    """
    # raw string: '\m' is not a valid escape sequence and triggers a SyntaxWarning on recent Pythons
    return r'\mathrm{' + nice[key] + '}'
|
||||||
|
|
||||||
|
|
||||||
|
def tabularx_header(nold_methods, nnew_methods, nexp_methods):
    """Return the opening lines of a LaTeX tabularx with three groups of 'Y' columns.

    :param nold_methods: number of columns in the first group (captioned as the methods
        tested in Gao & Sebastiani).
    :param nnew_methods: number of columns in the second group (no caption).
    :param nexp_methods: number of columns in the third group (no caption).
    :return: the header text, starting and ending with a newline.
    """
    column_spec = '|c||' + ('Y|' * nold_methods) + '|' + ('Y|' * nnew_methods) + '|' + ('Y|' * nexp_methods)
    # raw strings keep the LaTeX backslashes literal without relying on invalid escape sequences
    lines = [
        '',
        r'\begin{tabularx}{\textwidth}{' + column_spec + r'} \hline',
        r'& \multicolumn{' + str(nold_methods) + r'}{c||}{Methods tested in~\cite{Gao:2016uq}} &',
        r'\multicolumn{' + str(nnew_methods) + r'}{c|}{} &',
        r'\multicolumn{' + str(nexp_methods) + r'}{c|}{}\\ \hline',
        '',
    ]
    return '\n'.join(lines)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Generate tables for Tweeter Sentiment Quantification')
    parser.add_argument('results', metavar='RESULT_PATH', type=str,
                        help='path to the directory containing the results of the methods tested in Gao & Sebastiani')
    parser.add_argument('newresults', metavar='RESULT_PATH', type=str,
                        help='path to the directory containing the results for the experimental methods')
    args = parser.parse_args()

    datasets = qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST
    evaluation_measures = [qp.error.ae, qp.error.rae]
    gao_seb_methods = ['cc', 'acc', 'pcc', 'pacc', 'sld', 'svmq', 'svmkld', 'svmnkld']
    new_methods = ['hdy']  # methods added to the Gao & Sebastiani methods
    experimental_methods = ['hdysld']  # experimental

    row_end = r'\\\hline' + '\n'  # LaTeX line break followed by a horizontal rule

    for eval_func in evaluation_measures:

        # Tables evaluation scores for AE and RAE (two tables)
        # ----------------------------------------------------

        eval_name = eval_func.__name__

        added_methods = ['svmm' + eval_name] + new_methods
        methods = gao_seb_methods + added_methods + experimental_methods
        nold_methods = len(gao_seb_methods)
        nnew_methods = len(added_methods)
        nexp_methods = len(experimental_methods)

        # fill data table
        table = Table(benchmarks=datasets, methods=methods)
        for dataset in datasets:
            for method in methods:
                # experimental methods live in their own results directory
                path = args.newresults if method in experimental_methods else args.results
                table.add(dataset, method, experiment_errors(path, dataset, method, eval_name))

        # write the latex table
        tabular = tabularx_header(nold_methods, nnew_methods, nexp_methods)
        rowreplace = {dataset: nice.get(dataset, dataset.upper()) for dataset in datasets}
        colreplace = {
            method: r'\side{' + nice.get(method, method.upper()) + '$^{' + nicerm(eval_name) + '}$} '
            for method in methods
        }

        tabular += table.latexTabular(benchmark_replace=rowreplace, method_replace=colreplace)
        tabular += '\n' + r'\end{tabularx}'

        save_table(f'{tables_path}/tab_results_{eval_name}.new.tex', tabular)

        # Tables ranks for AE and RAE (two tables)
        # ----------------------------------------------------
        # fill the data table
        ranktable = Table(benchmarks=datasets, methods=methods, missing='--')
        for dataset in datasets:
            for method in methods:
                ranktable.add(dataset, method, values=table.get(dataset, method, 'rank'))

        # write the latex table
        tabular = tabularx_header(nold_methods, nnew_methods, nexp_methods)
        for method in methods:
            tabular += r' & \side{' + nice.get(method, method.upper()) + '$^{' + nicerm(eval_name) + '}$} '
        tabular += row_end

        for dataset in datasets:
            tabular += nice.get(dataset, dataset.upper()) + ' '
            for method in methods:
                newrank = ranktable.get(dataset, method)
                if newrank != '--':  # '--' is the placeholder configured for missing cells
                    newrank = f'{int(newrank)}'
                color = ranktable.get_color(dataset, method)
                if color == '--':
                    color = ''
                tabular += ' & ' + f'{newrank}' + color
            tabular += row_end
        tabular += r'\hline' + '\n'

        tabular += 'Average '
        for method in methods:
            newrank = ranktable.get_average(method)
            if newrank != '--':
                newrank = f'{newrank:.1f}'
            color = ranktable.get_average(method, 'color')
            if color == '--':
                color = ''
            tabular += ' & ' + f'{newrank}' + color
        tabular += row_end
        tabular += r'\end{tabularx}'

        save_table(f'{tables_path}/tab_rank_{eval_name}.new.tex', tabular)

    print("[Done]")
|
|
@ -0,0 +1,5 @@
|
||||||
|
import multiprocessing
|
||||||
|
|
||||||
|
# Default number of parallel jobs.
# NOTE(review): a negative value presumably follows the joblib convention -- confirm with the callers.
N_JOBS = -2  # multiprocessing.cpu_count()

# Number of parallel jobs for ensembles.
ENSEMBLE_N_JOBS = 1

# Default sample size.
SAMPLE_SIZE = 100
|
|
@ -83,21 +83,21 @@ def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=N
|
||||||
binwidth = 1/nbins
|
binwidth = 1/nbins
|
||||||
data = {}
|
data = {}
|
||||||
for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs):
|
for method, true_prev, estim_prev in zip(method_names, true_prevs, estim_prevs):
|
||||||
true_prev = true_prev[:,pos_class]
|
true_prev = true_prev[:, pos_class]
|
||||||
estim_prev = estim_prev[:,pos_class]
|
estim_prev = estim_prev[:, pos_class]
|
||||||
|
|
||||||
data[method] = []
|
data[method] = []
|
||||||
inds = np.digitize(true_prev, bins, right=True)
|
inds = np.digitize(true_prev, bins[1:], right=True)
|
||||||
for ind in range(len(bins)):
|
for ind in range(len(bins)):
|
||||||
selected = inds==ind
|
selected = inds==ind
|
||||||
data[method].append(estim_prev[selected] - true_prev[selected])
|
data[method].append(estim_prev[selected] - true_prev[selected])
|
||||||
|
|
||||||
nmethods = len(method_names)
|
nmethods = len(method_names)
|
||||||
boxwidth = binwidth/(nmethods+4)
|
boxwidth = binwidth/(nmethods+4)
|
||||||
for i,bin in enumerate(bins[:-1]):
|
for i,bin in enumerate(bins):
|
||||||
boxdata = [data[method][i] for method in method_names]
|
boxdata = [data[method][i] for method in method_names]
|
||||||
positions = [bin+(i*boxwidth)+2*boxwidth for i,_ in enumerate(method_names)]
|
positions = [bin+(i*boxwidth)+2*boxwidth for i,_ in enumerate(method_names)]
|
||||||
box = boxplot(boxdata, showmeans=False, positions=positions, widths = boxwidth, sym='+', patch_artist=True)
|
box = boxplot(boxdata, showmeans=False, positions=positions, widths=boxwidth, sym='+', patch_artist=True)
|
||||||
for boxid in range(len(method_names)):
|
for boxid in range(len(method_names)):
|
||||||
c = colormap.colors[boxid%len(colormap.colors)]
|
c = colormap.colors[boxid%len(colormap.colors)]
|
||||||
setp(box['fliers'][boxid], color=c, marker='+', markersize=3., markeredgecolor=c)
|
setp(box['fliers'][boxid], color=c, marker='+', markersize=3., markeredgecolor=c)
|
||||||
|
@ -110,7 +110,7 @@ def binary_bias_bins(method_names, true_prevs, estim_prevs, pos_class=1, title=N
|
||||||
major_xticks_positions.append(b)
|
major_xticks_positions.append(b)
|
||||||
minor_xticks_positions.append(b + binwidth / 2)
|
minor_xticks_positions.append(b + binwidth / 2)
|
||||||
major_xticks_labels.append('')
|
major_xticks_labels.append('')
|
||||||
minor_xticks_labels.append(f'[{bins[i]:.2f}-{bins[i + 1]:.2f})')
|
minor_xticks_labels.append(f'[{bins[i]:.2f}-{bins[i + 1]:.2f}' + (')' if i < len(bins)-2 else ']'))
|
||||||
ax.set_xticks(major_xticks_positions)
|
ax.set_xticks(major_xticks_positions)
|
||||||
ax.set_xticks(minor_xticks_positions, minor=True)
|
ax.set_xticks(minor_xticks_positions, minor=True)
|
||||||
ax.set_xticklabels(major_xticks_labels)
|
ax.set_xticklabels(major_xticks_labels)
|
||||||
|
|
Loading…
Reference in New Issue