quick test kdex

first push to github
refactoring
2023-12-17 20:26:53 +01:00 · 2023-12-17 20:14:38 +01:00 · 2023-12-11 16:43:45 +01:00 · 2023-12-06 19:24:42 +01:00 · 2023-12-06 17:05:12 +01:00 · 2023-12-06 16:55:06 +01:00
50 changed files with 4273 additions and 46 deletions
--- a/.gitignore
+++ b/.gitignore
@ -27,6 +27,40 @@ share/python-wheels/
 .installed.cfg
 *.egg
 MANIFEST
 .idea
 .vscode
 LeQua2022
 MultiLabel/results_generales
 MultiLabel/mlqtables
 NewMethods/plots*
 NewMethods/results*
 NewMethods/tables*
 NewMethods/latex*
 Ordinal/data*
 Ordinal/roberta*
 Ordinal/tables*
 Ordinal/results*
 eDiscovery/plots*
 eDiscovery/results*
 examples/results*
 poster-cikm*
 slides-cikm*
 slides-short-cikm*
 quick_experiment/figures*
 quick_experiment/figures*
 svm_perf_quantification/*
 TweetSentQuant/plots*
 TweetSentQuant/results*
 TweetSentQuant/tables*
 TweetSentQuant/Tweet Sentiment Quantification_NPP
 TweetSentQuant/checkpoint
 TweetSentQuant/*.tex
 checkpoint
 *.png
 *.zip
 *.pkl
 *.pickle
 *.pdf
 # PyInstaller
 #  Usually these files are written by a python script from a template
@ -130,3 +164,32 @@ dmypy.json
 .pyre/
 *__pycache__*
 *.dataframe
 *.svg
 cross_lingual.py
 # results
 *.csv
 # other projects
 *.tex
 *.md
 *.doctree
 *.rst
 MultiLabel/
 NewMethods/
 Ordinal/
 Retrieval/
 quick_experiment/
 Transduction/
 Transduction_office/
 TweetSentQuant/
 laboratory/
 docs/
 dataset_drifts.py
 notes.txt
 eDiscovery/
 examples/experiment_Dirichlet_calibration.py
 examples/experiments_dirichlet_cal.txt
--- a/distribution_matching/commons.py
+++ b/distribution_matching/commons.py
@ -0,0 +1,124 @@
 import numpy as np
 import pandas as pd
 from distribution_matching.method.kdex import KDExML
 from distribution_matching.method.method_kdey import KDEy
 from distribution_matching.method.method_kdey_closed_efficient_correct import KDEyclosed_efficient_corr
 from distribution_matching.method.kdey import KDEyCS, KDEyHD, KDEyML
 from quapy.method.aggregative import EMQ, CC, PCC, DistributionMatching, PACC, HDy, OneVsAllAggregative, ACC
 from distribution_matching.method.dirichlety import DIRy
 from sklearn.linear_model import LogisticRegression
 # set to True to get the full list of methods tested in the paper (reported in the appendix)
 # set to False to get the reduced list (shown in the body of the paper)
 FULL_METHOD_LIST = False
 if FULL_METHOD_LIST:
    ADJUSTMENT_METHODS = ['ACC', 'PACC']
    DISTR_MATCH_METHODS = ['HDy-OvA', 'DM-T', 'DM-HD', 'KDEy-HD',  'DM-CS', 'KDEy-CS']
    MAX_LIKE_METHODS = ['DIR', 'EMQ', 'EMQ-BCTS', 'KDEy-ML']
 else:
    ADJUSTMENT_METHODS = ['PACC']
    DISTR_MATCH_METHODS = ['DM-T', 'DM-HD', 'KDEy-HD',  'DM-CS', 'KDEy-CS']
    MAX_LIKE_METHODS = ['EMQ', 'KDEy-ML']
 # list of methods to consider
 METHODS  = ADJUSTMENT_METHODS + DISTR_MATCH_METHODS + MAX_LIKE_METHODS
 BIN_METHODS = [x.replace('-OvA', '') for x in METHODS]
 # common hyperparameterss
 hyper_LR = {
    'classifier__C': np.logspace(-3,3,7),
    'classifier__class_weight': ['balanced', None]
 }
 hyper_kde = {
    'bandwidth': np.linspace(0.01, 0.2, 20)
 }
 nbins_range = [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 64]
 # instances a new quantifier based on a string name
 def new_method(method, **lr_kwargs):
    lr = LogisticRegression(**lr_kwargs)
    if method == 'CC':
        param_grid = hyper_LR
        quantifier = CC(lr)
    elif method == 'PCC':
        param_grid = hyper_LR
        quantifier = PCC(lr)
    elif method == 'ACC':
        param_grid = hyper_LR
        quantifier = ACC(lr)
    elif method == 'PACC':
        param_grid = hyper_LR
        quantifier = PACC(lr)
    elif method in ['KDEy-HD']:
        param_grid = {**hyper_kde, **hyper_LR}
        quantifier = KDEyHD(lr)
    elif method == 'KDEy-CS':
        param_grid = {**hyper_kde, **hyper_LR}
        quantifier = KDEyCS(lr)
    elif method == 'KDEy-ML':
        param_grid = {**hyper_kde, **hyper_LR}
        quantifier = KDEyML(lr)
    elif method == 'KDEx-ML':
        param_grid = {
            'bandwidth': np.linspace(0.001, 2, 501)
        }
        quantifier = KDExML()
    elif method == 'DIR':
        param_grid = hyper_LR
        quantifier = DIRy(lr)
    elif method == 'EMQ':
        param_grid = hyper_LR
        quantifier = EMQ(lr)
    elif method == 'EMQ-BCTS':
        method_params = {'exact_train_prev': [False], 'recalib': ['bcts']}
        param_grid = {**method_params, **hyper_LR}
        quantifier = EMQ(lr)
    elif method == 'HDy':
        param_grid = hyper_LR
        quantifier = HDy(lr)
    elif method == 'HDy-OvA':
        param_grid = {'binary_quantifier__' + key: val for key, val in hyper_LR.items()}
        quantifier = OneVsAllAggregative(HDy(lr))
    elif method == 'DM-T':
        method_params = {
            'nbins': nbins_range,
            'val_split': [10],
            'divergence': ['topsoe']
        }
        param_grid = {**method_params, **hyper_LR}
        quantifier = DistributionMatching(lr)
    elif method == 'DM-HD':
        method_params = {
            'nbins': nbins_range,
            'val_split': [10],
            'divergence': ['HD']
        }
        param_grid = {**method_params, **hyper_LR}
        quantifier = DistributionMatching(lr)
    elif method == 'DM-CS':
        method_params = {
            'nbins': nbins_range,
            'val_split': [10],
            'divergence': ['CS']
        }
        param_grid = {**method_params, **hyper_LR}
        quantifier = DistributionMatching(lr)
    else:
        raise NotImplementedError('unknown method', method)
    return param_grid, quantifier
 def show_results(result_path):
    df = pd.read_csv(result_path+'.csv', sep='\t')
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"])
    print(pv)
--- a/distribution_matching/figures/histograms_density_plot.py
+++ b/distribution_matching/figures/histograms_density_plot.py
@ -0,0 +1,73 @@
 import math
 import numpy as np
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import train_test_split, cross_val_predict
 from sklearn.neighbors import KernelDensity
 import matplotlib.pyplot as plt
 import numpy as np
 from data import LabelledCollection
 scale = 100
 import quapy as qp
 data = qp.datasets.fetch_twitter('wb', min_df=3, pickle=True, for_model_selection=False)
 X, y = data.training.Xy
 cls = LogisticRegression(C=0.0001, random_state=0)
 posteriors = cross_val_predict(cls, X=X, y=y, method='predict_proba', n_jobs=-1, cv=3)
 cls.fit(X, y)
 Xte, yte = data.test.Xy
 post_c1 = posteriors[y==0]
 post_c2 = posteriors[y==1]
 post_c3 = posteriors[y==2]
 print(len(post_c1))
 print(len(post_c2))
 print(len(post_c3))
 post_test = cls.predict_proba(Xte)
 alpha = qp.functional.prevalence_from_labels(yte, classes=[0, 1, 2])
 nbins = 20
 plt.rcParams.update({'font.size': 7})
 fig = plt.figure()
 positions = np.asarray([2,1,0])
 colors = ['r', 'g', 'b']
 for i, post_set in enumerate([post_c1, post_c2, post_c3, post_test]):
    ax = fig.add_subplot(141+i, projection='3d')
    for post, c, z in zip(post_set.T, colors, positions):
        hist, bins = np.histogram(post, bins=nbins, density=True, range=[0,1])
        xs = (bins[:-1] + bins[1:])/2
        ax.bar(xs, hist, width=1/nbins, zs=z, zdir='y', color=c, ec=c, alpha=0.6)
    ax.yaxis.set_ticks(positions)
    ax.yaxis.set_ticklabels(['$y=1$', '$y=2$', '$y=3$'])
    ax.xaxis.set_ticks([])
    ax.xaxis.set_ticklabels([], minor=True)
    ax.zaxis.set_ticks([])
    ax.zaxis.set_ticklabels([], minor=True)
 #plt.figure(figsize=(10,6))
 #plt.show()
 plt.savefig('./histograms.pdf')
--- a/distribution_matching/figures/nclasses_plot.py
+++ b/distribution_matching/figures/nclasses_plot.py
@ -0,0 +1,29 @@
 import numpy as np
 import pandas as pd
 import seaborn as sns
 import matplotlib.pyplot as plt
 """
 This script generates plots of sensibility for the number of classes
 Plots results for MAE, MRAE, and KLD
 The hyperparameters were set as: 
    quantifier.set_params(classifier__C=0.01, classifier__class_weight='balanced', bandwidth=0.2)
 """
 methods = ['DM', 'KDEy-ML', 'EMQ']
 optim = 'mae'
 dfs = [pd.read_csv(f'../results/lequa/nclasses/{optim}/{method}.csv', sep='\t') for method in methods]
 df = pd.concat(dfs)
 for err in ['MAE', 'MRAE']:
    piv = df.pivot_table(index='nClasses', columns='Method', values=err)
    g = sns.lineplot(data=piv, markers=True, dashes=False)
    g.set(xlim=(1, 28))
    g.legend(loc="center left", bbox_to_anchor=(1, 0.5))
    g.set_ylabel(err)
    g.set_xticks(np.linspace(1, 28, 28))
    plt.xticks(rotation=90)
    plt.grid()
    plt.savefig(f'./nclasses_{err}.pdf', bbox_inches='tight')
    plt.clf()
--- a/distribution_matching/figures/sensibility_plot.py
+++ b/distribution_matching/figures/sensibility_plot.py
@ -0,0 +1,50 @@
 import numpy as np
 import pandas as pd
 import seaborn as sns
 import matplotlib.pyplot as plt
 """
 This script generates plots of sensibility for the hyperparameter "bandwidth".
 Plots results for MAE, MRAE, and KLD
 The rest of hyperparameters were set to their default values
 """
 log_mrae = True
 for method, param, xlim, xticks in [
    ('KDEy-ML', 'Bandwidth', (0.01, 0.2), np.linspace(0.01, 0.2, 20)),
    ('DM-HD', 'nbins', (2,32), list(range(2,10)) + list(range(10,34,2)))
 ]:
    for dataset in ['tweet', 'lequa', 'uciml']:
        if dataset == 'tweet':
            df = pd.read_csv(f'../results/tweet/sensibility/{method}.csv', sep='\t')
            ylim = (0.03, 0.21)
        elif dataset == 'lequa':
            df = pd.read_csv(f'../results/lequa/T1B/sensibility/{method}.csv', sep='\t')
            ylim = (0.0125, 0.03)
        elif dataset == 'uciml':
            ylim = (0, 0.23)
            df = pd.read_csv(f'../results/ucimulti/sensibility/{method}.csv', sep='\t')
        for err in ['MAE']: #, 'MRAE']:
            piv = df.pivot_table(index=param, columns='Dataset', values=err)
            g = sns.lineplot(data=piv, markers=True, dashes=False)
            g.set(xlim=xlim)
            g.legend(loc="center left", bbox_to_anchor=(1, 0.5))
            if log_mrae and err=='MRAE':
                plt.yscale('log')
                g.set_ylabel('log('+err+')')
            else:
                g.set_ylabel(err)
            g.set_ylim(ylim)
            g.set_xticks(xticks)
            plt.xticks(rotation=90)
            plt.grid()
            plt.savefig(f'./sensibility_{method}_{dataset}_{err}.pdf', bbox_inches='tight')
            plt.clf()
--- a/distribution_matching/figures/simplex_density_plot.py
+++ b/distribution_matching/figures/simplex_density_plot.py
@ -0,0 +1,154 @@
 import math
 import numpy as np
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import train_test_split
 from sklearn.neighbors import KernelDensity
 from data import LabelledCollection
 scale = 10
 # con ternary (una lib de matplotlib) salen bien pero no puedo crear contornos, o no se
 # con plotly salen los contornos bien, pero es un poco un jaleo porque utiliza el navegador...
 def plot_simplex_(ax, density, title='', fontsize=9, points=None):
    import ternary
    tax = ternary.TernaryAxesSubplot(ax=ax, scale=scale)
    tax.heatmapf(density, boundary=True, style="triangular", colorbar=False, cmap='viridis') #cmap='magma')
    tax.boundary(linewidth=1.0)
    corner_fontsize = 5*fontsize//6
    tax.right_corner_label("$y=3$", fontsize=corner_fontsize)
    tax.top_corner_label("$y=2$", fontsize=corner_fontsize)
    tax.left_corner_label("$y=1$", fontsize=corner_fontsize)
    if title:
        tax.set_title(title, loc='center', y=-0.11, fontsize=fontsize)
    if points is not None:
        tax.scatter(points*scale, marker='o', color='w', alpha=0.25, zorder=10)
    tax.get_axes().axis('off')
    tax.clear_matplotlib_ticks()
    return tax
 def plot_simplex(ax, coord, kde_scores, title='', fontsize=11, points=None, savepath=None):
    import plotly.figure_factory as ff
    tax = ff.create_ternary_contour(coord.T, kde_scores, pole_labels=['y=1', 'y=2', 'y=3'],
                                interp_mode='cartesian',
                                ncontours=20,
                                colorscale='Viridis',
                                showscale=True,
                                title=title)
    if savepath is None:
        tax.show()
    else:
        tax.write_image(savepath)
    return tax
 from mpl_toolkits.axes_grid1 import make_axes_locatable
 def plot_3class_problem(post_c1, post_c2, post_c3, post_test, alpha, bandwidth):
    import ternary
    post_c1 = np.flip(post_c1, axis=1)
    post_c2 = np.flip(post_c2, axis=1)
    post_c3 = np.flip(post_c3, axis=1)
    post_test = np.flip(post_test, axis=1)
    size_=10
    fig = ternary.plt.figure(figsize=(4*size_, 1*size_))
    fig.tight_layout()
    ax1 = fig.add_subplot(1, 4, 1)
    divider = make_axes_locatable(ax1)
    ax2 = fig.add_subplot(1, 4, 2)
    divider = make_axes_locatable(ax2)
    ax3 = fig.add_subplot(1, 4, 3)
    divider = make_axes_locatable(ax3)
    ax4 = fig.add_subplot(1, 4, 4)
    divider = make_axes_locatable(ax4)
    kde1 = KernelDensity(bandwidth=bandwidth).fit(post_c1)
    kde2 = KernelDensity(bandwidth=bandwidth).fit(post_c2)
    kde3 = KernelDensity(bandwidth=bandwidth).fit(post_c3)
    #post_c1 = np.concatenate([post_c1, np.eye(3, dtype=float)])
    #post_c2 = np.concatenate([post_c2, np.eye(3, dtype=float)])
    #post_c3 = np.concatenate([post_c3, np.eye(3, dtype=float)])
    #plot_simplex_(ax1, lambda x:0, title='$f_1(\mathbf{x})=p(s(\mathbf{x})|y=1)$')
    #plot_simplex_(ax2, lambda x:0, title='$f_1(\mathbf{x})=p(s(\mathbf{x})|y=1)$')
    #plot_simplex_(ax3, lambda x:0, title='$f_1(\mathbf{x})=p(s(\mathbf{x})|y=1)$')
    def density(kde):
        def d(p):
            return np.exp(kde([p])).item()
        return d
    plot_simplex_(ax1, density(kde1.score_samples), title='$p_1$')
    plot_simplex_(ax2, density(kde2.score_samples), title='$p_2$')
    plot_simplex_(ax3, density(kde3.score_samples), title='$p_3$')
    #plot_simplex(ax1, post_c1, np.exp(kde1.score_samples(post_c1)), title='$f_1(\mathbf{x})=p(s(\mathbf{x})|y=1)$') #, savepath='figure/y1.png')
    #plot_simplex(ax2, post_c2, np.exp(kde2.score_samples(post_c2)), title='$f_2(\mathbf{x})=p(s(\mathbf{x})|y=2)$') #, savepath='figure/y2.png')
    #plot_simplex(ax3, post_c3, np.exp(kde3.score_samples(post_c3)), title='$f_3(\mathbf{x})=p(s(\mathbf{x})|y=3)$') #, savepath='figure/y3.png')
    def mixture_(prevs, kdes):
        def m(p):
            total_density = 0
            for prev, kde in zip(prevs, kdes):
                log_density = kde.score_samples([p]).item()
                density = np.exp(log_density)
                density *= prev
                total_density += density
            #print(total_density)
            return total_density
        return m
    title = '$p_{\mathbf{\\alpha}} = \sum_{i \in n} \\alpha_i p_i$'
    plot_simplex_(ax4, mixture_(alpha, [kde1, kde2, kde3]), title=title, points=post_test)
    #mixture(alpha, [kde1, kde2, kde3])
    #post_test = np.concatenate([post_test, np.eye(3, dtype=float)])
    #test_scores = sum(alphai*np.exp(kdei.score_samples(post_test)) for alphai, kdei in zip(alpha, [kde1,kde2,kde3]))
    #plot_simplex(ax4, post_test, test_scores, title=title, points=post_test)
    #ternary.plt.show()
    ternary.plt.savefig('./simplex.png')
 import quapy as qp
 data = qp.datasets.fetch_twitter('wb', min_df=3, pickle=True, for_model_selection=False)
 X, y = data.training.Xy
 cls = LogisticRegression(C=0.0001, random_state=0)
 Xtr, Xte, ytr, yte = train_test_split(X, y, train_size=0.7, stratify=y, random_state=0)
 cls.fit(Xtr, ytr)
 test = LabelledCollection(Xte, yte)
 test = test.sampling(100, *[0.2, 0.1, 0.7])
 Xte, yte = test.Xy
 post_c1 = cls.predict_proba(Xte[yte==0])
 post_c2 = cls.predict_proba(Xte[yte==1])
 post_c3 = cls.predict_proba(Xte[yte==2])
 post_test = cls.predict_proba(Xte)
 print(post_test)
 alpha = qp.functional.prevalence_from_labels(yte, classes=[0, 1, 2])
 #post_c1 = np.random.dirichlet([10,3,1], 30)
 #post_c2 = np.random.dirichlet([1,11,6], 30)
 #post_c3 = np.random.dirichlet([1,5,20], 30)
 #post_test = np.random.dirichlet([5,1,6], 100)
 #alpha = [0.5, 0.3, 0.2]
 print(f'test alpha {alpha}')
 plot_3class_problem(post_c1, post_c2, post_c3, post_test, alpha, bandwidth=0.1)
--- a/distribution_matching/figures/simplex_density_plot2.py
+++ b/distribution_matching/figures/simplex_density_plot2.py
@ -0,0 +1,129 @@
 import ternary
 import math
 import numpy as np
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import train_test_split, cross_val_predict
 from sklearn.neighbors import KernelDensity
 import plotly.figure_factory as ff
 from data import LabelledCollection
 scale = 100
 # con ternary (una lib de matplotlib) salen bien pero no puedo crear contornos, o no se
 # con plotly salen los contornos bien, pero es un poco un jaleo porque utiliza el navegador...
 def plot_simplex_(ax, density, title='', fontsize=30, points=None):
    tax = ternary.TernaryAxesSubplot(ax=ax, scale=scale)
    tax.heatmapf(density, boundary=True, style="triangular", colorbar=False, cmap='viridis') #cmap='magma')
    tax.boundary(linewidth=1.0)
    corner_fontsize = int(5*fontsize//6)
    tax.right_corner_label("$y=3$", fontsize=corner_fontsize)
    tax.top_corner_label("$y=2$", fontsize=corner_fontsize)
    tax.left_corner_label("$y=1$", fontsize=corner_fontsize)
    if title:
        tax.set_title(title, loc='center', y=-0.11, fontsize=fontsize)
    if points is not None:
        tax.scatter(points*scale, marker='o', color='w', alpha=0.25, zorder=10, s=5*scale)
    tax.get_axes().axis('off')
    tax.clear_matplotlib_ticks()
    return tax
 from mpl_toolkits.axes_grid1 import make_axes_locatable
 def plot_3class_problem(post_c1, post_c2, post_c3, post_test, alpha, bandwidth):
    post_c1 = np.flip(post_c1, axis=1)
    post_c2 = np.flip(post_c2, axis=1)
    post_c3 = np.flip(post_c3, axis=1)
    post_test = np.flip(post_test, axis=1)
    size_=10
    fig = ternary.plt.figure(figsize=(5*size_, 1*size_))
    fig.tight_layout()
    ax1 = fig.add_subplot(1, 4, 1)
    divider = make_axes_locatable(ax1)
    ax2 = fig.add_subplot(1, 4, 2)
    divider = make_axes_locatable(ax2)
    ax3 = fig.add_subplot(1, 4, 3)
    divider = make_axes_locatable(ax3)
    ax4 = fig.add_subplot(1, 4, 4)
    divider = make_axes_locatable(ax4)
    kde1 = KernelDensity(bandwidth=bandwidth).fit(post_c1)
    kde2 = KernelDensity(bandwidth=bandwidth).fit(post_c2)
    kde3 = KernelDensity(bandwidth=bandwidth).fit(post_c3)
    #post_c1 = np.concatenate([post_c1, np.eye(3, dtype=float)])
    #post_c2 = np.concatenate([post_c2, np.eye(3, dtype=float)])
    #post_c3 = np.concatenate([post_c3, np.eye(3, dtype=float)])
    #plot_simplex_(ax1, lambda x:0, title='$f_1(\mathbf{x})=p(s(\mathbf{x})|y=1)$')
    #plot_simplex_(ax2, lambda x:0, title='$f_1(\mathbf{x})=p(s(\mathbf{x})|y=1)$')
    #plot_simplex_(ax3, lambda x:0, title='$f_1(\mathbf{x})=p(s(\mathbf{x})|y=1)$')
    def density(kde):
        def d(p):
            return np.exp(kde([p])).item()
        return d
    plot_simplex_(ax1, density(kde1.score_samples), title='$p_1$')
    plot_simplex_(ax2, density(kde2.score_samples), title='$p_2$')
    plot_simplex_(ax3, density(kde3.score_samples), title='$p_3$')
    #plot_simplex(ax1, post_c1, np.exp(kde1.score_samples(post_c1)), title='$f_1(\mathbf{x})=p(s(\mathbf{x})|y=1)$') #, savepath='figure/y1.png')
    #plot_simplex(ax2, post_c2, np.exp(kde2.score_samples(post_c2)), title='$f_2(\mathbf{x})=p(s(\mathbf{x})|y=2)$') #, savepath='figure/y2.png')
    #plot_simplex(ax3, post_c3, np.exp(kde3.score_samples(post_c3)), title='$f_3(\mathbf{x})=p(s(\mathbf{x})|y=3)$') #, savepath='figure/y3.png')
    def mixture_(prevs, kdes):
        def m(p):
            total_density = 0
            for prev, kde in zip(prevs, kdes):
                log_density = kde.score_samples([p]).item()
                density = np.exp(log_density)
                density *= prev
                total_density += density
            #print(total_density)
            return total_density
        return m
    title = '$\mathbf{p}_{\mathbf{\\alpha}} = \sum_{i \in n} \\alpha_i p_i$'
    plot_simplex_(ax4, mixture_(alpha, [kde1, kde2, kde3]), title=title, points=post_test)
    #ternary.plt.show()
    ternary.plt.savefig('./simplex.pdf')
 import quapy as qp
 data = qp.datasets.fetch_twitter('wb', min_df=3, pickle=True, for_model_selection=False)
 Xtr, ytr = data.training.Xy
 Xte, yte = data.test.sampling(150, *[0.5, 0.1, 0.4]).Xy
 cls = LogisticRegression(C=0.0001, random_state=0)
 draw_from_training = False
 if draw_from_training:
    post_tr = cross_val_predict(cls, Xtr, ytr, n_jobs=-1, method='predict_proba')
    post_c1 = post_tr[ytr==0]
    post_c2 = post_tr[ytr==1]
    post_c3 = post_tr[ytr==2]
    cls.fit(Xtr, ytr)
 else:
    cls.fit(Xtr, ytr)
    post_te = cls.predict_proba(Xte)
    post_c1 = post_te[yte == 0]
    post_c2 = post_te[yte == 1]
    post_c3 = post_te[yte == 2]
 post_test = cls.predict_proba(Xte)
 alpha = qp.functional.prevalence_from_labels(yte, classes=[0, 1, 2])
 print(f'test alpha {alpha}')
 plot_3class_problem(post_c1, post_c2, post_c3, post_test, alpha, bandwidth=0.1)
--- a/distribution_matching/lequa_experiments.py
+++ b/distribution_matching/lequa_experiments.py
@ -0,0 +1,65 @@
 import pickle
 import numpy as np
 import os
 import pandas as pd
 from distribution_matching.commons import METHODS, BIN_METHODS, new_method, show_results
 import quapy as qp
 from quapy.model_selection import GridSearchQ
 if __name__ == '__main__':
    for task in ['T1A', 'T1B']:
        qp.environ['SAMPLE_SIZE'] = qp.datasets.LEQUA2022_SAMPLE_SIZE[task]
        qp.environ['N_JOBS'] = -1
        for optim in ['mae', 'mrae']:
            result_dir = f'results/lequa/{task}/{optim}'
            os.makedirs(result_dir, exist_ok=True)
            for method in (METHODS if task=='T1B' else BIN_METHODS):
                print('Init method', method)
                result_path = f'{result_dir}/{method}'
                if os.path.exists(result_path+'.csv'):
                    print(f'file {result_path}.csv already exist; skipping')
                    continue
                with open(result_path+'.csv', 'wt') as csv:
                    csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')
                    dataset = task
                    train, val_gen, test_gen = qp.datasets.fetch_lequa2022(dataset)
                    print(f'init {dataset} #instances: {len(train)}')
                    param_grid, quantifier = new_method(method)
                    if param_grid is not None:
                        modsel = GridSearchQ(quantifier, param_grid, protocol=val_gen, refit=False, n_jobs=-1, verbose=1, error=optim)
                        modsel.fit(train)
                        print(f'best params {modsel.best_params_}')
                        print(f'best score {modsel.best_score_}')
                        pickle.dump(
                            (modsel.best_params_, modsel.best_score_,),
                            open(f'{result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
                        quantifier = modsel.best_model()
                    else:
                        print('debug mode... skipping model selection')
                        quantifier.fit(train)
                    report = qp.evaluation.evaluation_report(
                        quantifier, protocol=test_gen, error_metrics=['mae', 'mrae', 'kld'],
                        verbose=True, verbose_error=optim[1:], n_jobs=-1
                    )
                    means = report.mean()
                    report.to_csv(result_path+'.dataframe')
                    csv.write(f'{method}\tLeQua-{task}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
                    csv.flush()
                    print(means)
        show_results(result_path)
--- a/distribution_matching/lequa_sensibility_analysis.py
+++ b/distribution_matching/lequa_sensibility_analysis.py
@ -0,0 +1,56 @@
 import numpy as np
 from sklearn.linear_model import LogisticRegression
 import os
 import quapy as qp
 from distribution_matching.commons import show_results
 from distribution_matching.method.method_kdey import KDEy
 from quapy.method.aggregative import DistributionMatching
 SEED=1
 def task(val):
    print('job-init', val)
    train, val_gen, test_gen = qp.datasets.fetch_lequa2022('T1B')
    with qp.util.temp_seed(SEED):
        if method=='KDEy-ML':
            quantifier = KDEy(LogisticRegression(), target='max_likelihood', val_split=10, bandwidth=val)
        elif method == 'DM-HD':
            quantifier = DistributionMatching(LogisticRegression(), val_split=10, nbins=val, divergence='HD')
        quantifier.fit(train)
        report = qp.evaluation.evaluation_report(
            quantifier, protocol=test_gen, error_metrics=['mae', 'mrae', 'kld'], verbose=True)
        return report
 if __name__ == '__main__':
    qp.environ['SAMPLE_SIZE'] = qp.datasets.LEQUA2022_SAMPLE_SIZE['T1B']
    qp.environ['N_JOBS'] = -1
    result_dir = f'results/lequa/T1B/sensibility'
    os.makedirs(result_dir, exist_ok=True)
    for method, param, grid in [
        ('KDEy-ML', 'Bandwidth', np.linspace(0.01, 0.2, 20)),
        ('DM-HD', 'nbins', list(range(2, 10)) + list(range(10, 34, 2)))
    ]:
        global_result_path = f'{result_dir}/{method}'
        if not os.path.exists(global_result_path+'.csv'):
            with open(global_result_path+'.csv', 'wt') as csv:
                csv.write(f'Method\tDataset\t{param}\tMAE\tMRAE\tKLD\n')
        reports = qp.util.parallel(task, grid, n_jobs=-1)
        with open(global_result_path + '.csv', 'at') as csv:
            for val, report in zip(grid, reports):
                means = report.mean()
                local_result_path = global_result_path + '_T1B' + (f'_{val:.3f}' if isinstance(val, float) else f'{val}')
                report.to_csv(f'{local_result_path}.dataframe')
                csv.write(f'{method}\tLeQua-T1B\t{val}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
                csv.flush()
        show_results(global_result_path)
--- a/distribution_matching/method/dirichlety.py
+++ b/distribution_matching/method/dirichlety.py
@ -0,0 +1,105 @@
 import os
 import sys
 from typing import Union, Callable
 import numpy as np
 from sklearn.base import BaseEstimator
 from sklearn.linear_model import LogisticRegression
 import pandas as pd
 from sklearn.model_selection import GridSearchCV
 from sklearn.neighbors import KernelDensity
 import quapy as qp
 from quapy.data import LabelledCollection
 from quapy.protocol import APP, UPP
 from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \
    DistributionMatching, _get_divergence
 import scipy
 from scipy import optimize
 from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional
 import dirichlet
 class DIRy(AggregativeProbabilisticQuantifier):
    MAXITER = 100000
    def __init__(self, classifier: BaseEstimator, val_split=0.4, n_jobs=None, target='max_likelihood'):
        self.classifier = classifier
        self.val_split = val_split
        self.n_jobs = n_jobs
        self.target = target
    def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
        if val_split is None:
            val_split = self.val_split
        self.classifier, y, posteriors, _, _ = cross_generate_predictions(
            data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
        )
        self.val_parameters = []
        for cat in range(data.n_classes):
            dir_i = dirichlet.mle(posteriors[y == cat], maxiter=DIRy.MAXITER)
            self.val_parameters.append(dir_i)
            # print(cat)
        # self.val_parameters = [dirichlet.mle(posteriors[y == cat], maxiter=DIRy.MAXITER) for cat in range(data.n_classes)]
        return self
    def val_pdf(self, prev):
        """
        Returns a function that computes the mixture model with the given prev as mixture factor
        :param prev: a prevalence vector, ndarray
        :return: a function implementing the validation distribution with fixed mixture factor
        """
        return lambda posteriors: sum(prev_i * dirichlet.pdf(parameters_i)(posteriors) for parameters_i, prev_i in zip(self.val_parameters, prev))
    def aggregate(self, posteriors: np.ndarray):
        if self.target == 'min_divergence':
            raise NotImplementedError('not yet')
            return self._target_divergence(posteriors)
        elif self.target == 'max_likelihood':
            return self._target_likelihood(posteriors)
        else:
            raise ValueError('unknown target')
    def _target_divergence(self, posteriors):
        test_density = self.get_kde(posteriors)
        # val_test_posteriors = np.concatenate([self.val_posteriors, posteriors])
        test_likelihood = self.pdf(test_density, posteriors)
        divergence = _get_divergence(self.divergence)
        n_classes = len(self.val_densities)
        def match(prev):
            val_pdf = self.val_pdf(prev)
            val_likelihood = val_pdf(posteriors)
            return divergence(val_likelihood, test_likelihood)
        # the initial point is set as the uniform distribution
        uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
        # solutions are bounded to those contained in the unit-simplex
        bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
        r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
        return r.x
    def _target_likelihood(self, posteriors, eps=0.000001):
        n_classes = len(self.val_parameters)
        def neg_loglikelihood(prev):
            val_pdf = self.val_pdf(prev)
            test_likelihood = val_pdf(posteriors)
            test_loglikelihood = np.log(test_likelihood + eps)
            return -np.sum(test_loglikelihood)
        # the initial point is set as the uniform distribution
        uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
        # solutions are bounded to those contained in the unit-simplex
        bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
        r = optimize.minimize(neg_loglikelihood, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
        return r.x
--- a/distribution_matching/method/kdex.py
+++ b/distribution_matching/method/kdex.py
@ -0,0 +1,50 @@
 from quapy.method.base import BaseQuantifier
 import numpy as np
 from distribution_matching.method.kdey import KDEBase
 import quapy as qp
 from quapy.data import LabelledCollection
 import quapy.functional as F
 from sklearn.preprocessing import StandardScaler
 class KDExML(BaseQuantifier, KDEBase):
    def __init__(self, bandwidth=0.1, standardize=True):
        self._check_bandwidth(bandwidth)
        self.bandwidth = bandwidth
        self.standardize = standardize
    def fit(self, data: LabelledCollection):
        X, y = data.Xy
        if self.standardize:
            self.scaler = StandardScaler()
            X = self.scaler.fit_transform(X)
        self.mix_densities = self.get_mixture_components(X, y, data.n_classes, self.bandwidth)
        return self
    def quantify(self, X):
        """
        Searches for the mixture model parameter (the sought prevalence values) that maximizes the likelihood
        of the data (i.e., that minimizes the negative log-likelihood)
        :param X: instances in the sample 
        :return: a vector of class prevalence estimates
        """
        epsilon = 1e-10
        n_classes = len(self.mix_densities)
        if self.standardize:
            X = self.scaler.transform(X)
        test_densities = [self.pdf(kde_i, X) for kde_i in self.mix_densities]
        def neg_loglikelihood(prev):
            test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities))
            test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
            return  -np.sum(test_loglikelihood)
        return F.optim_minimize(neg_loglikelihood, n_classes)
--- a/distribution_matching/method/kdey.py
+++ b/distribution_matching/method/kdey.py
@ -0,0 +1,234 @@
 from typing import Union
 import numpy as np
 from sklearn.base import BaseEstimator
 from sklearn.neighbors import KernelDensity
 import quapy as qp
 from quapy.data import LabelledCollection
 from quapy.method.aggregative import AggregativeProbabilisticQuantifier, cross_generate_predictions
 import quapy.functional as F
 from sklearn.metrics.pairwise import rbf_kernel
 class KDEBase:
    BANDWIDTH_METHOD = ['scott', 'silverman']
    @classmethod
    def _check_bandwidth(cls, bandwidth):
        assert bandwidth in KDEBase.BANDWIDTH_METHOD or isinstance(bandwidth, float), \
            f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values'
        if isinstance(bandwidth, float):
            assert 0 < bandwidth < 1,  "the bandwith for KDEy should be in (0,1), since this method models the unit simplex"
    def get_kde_function(self, X, bandwidth):
        return KernelDensity(bandwidth=bandwidth).fit(X)
    def pdf(self, kde, X):
        return np.exp(kde.score_samples(X))
    def get_mixture_components(self, X, y, n_classes, bandwidth):
        return [self.get_kde_function(X[y == cat], bandwidth) for cat in range(n_classes)]
 class KDEyML(AggregativeProbabilisticQuantifier, KDEBase):
    def __init__(self, classifier: BaseEstimator, val_split=10, bandwidth=0.1, n_jobs=None, random_state=0):
        self._check_bandwidth(bandwidth)
        self.classifier = classifier
        self.val_split = val_split
        self.bandwidth = bandwidth
        self.n_jobs = n_jobs
        self.random_state=random_state
    def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
        if val_split is None:
            val_split = self.val_split
        self.classifier, y, posteriors, _, _ = cross_generate_predictions(
            data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
        )
        self.mix_densities = self.get_mixture_components(posteriors, y, data.n_classes, self.bandwidth)
        return self
    def aggregate(self, posteriors: np.ndarray):
        """
        Searches for the mixture model parameter (the sought prevalence values) that maximizes the likelihood
        of the data (i.e., that minimizes the negative log-likelihood)
        :param posteriors: instances in the sample converted into posterior probabilities
        :return: a vector of class prevalence estimates
        """
        np.random.RandomState(self.random_state)
        epsilon = 1e-10
        n_classes = len(self.mix_densities)
        test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities]
        def neg_loglikelihood(prev):
            test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities))
            test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
            return  -np.sum(test_loglikelihood)
        return F.optim_minimize(neg_loglikelihood, n_classes)
 class KDEyHD(AggregativeProbabilisticQuantifier, KDEBase):
    def __init__(self, classifier: BaseEstimator, val_split=10, divergence: str='HD',
                 bandwidth=0.1, n_jobs=None, random_state=0, montecarlo_trials=10000):
        self._check_bandwidth(bandwidth)
        self.classifier = classifier
        self.val_split = val_split
        self.divergence = divergence
        self.bandwidth = bandwidth
        self.n_jobs = n_jobs
        self.random_state=random_state
        self.montecarlo_trials = montecarlo_trials
    def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
        if val_split is None:
            val_split = self.val_split
        self.classifier, y, posteriors, _, _ = cross_generate_predictions(
            data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
        )
        self.mix_densities = self.get_mixture_components(posteriors, y, data.n_classes, self.bandwidth)
        N = self.montecarlo_trials
        rs = self.random_state
        n = data.n_classes
        self.reference_samples = np.vstack([kde_i.sample(N//n, random_state=rs) for kde_i in self.mix_densities])
        self.reference_classwise_densities = np.asarray([self.pdf(kde_j, self.reference_samples) for kde_j in self.mix_densities])
        self.reference_density = np.mean(self.reference_classwise_densities, axis=0)  # equiv. to (uniform @ self.reference_classwise_densities)
        return self
    def aggregate(self, posteriors: np.ndarray):
        # we retain all n*N examples (sampled from a mixture with uniform parameter), and then
        # apply importance sampling (IS). In this version we compute D(p_alpha||q) with IS
        n_classes = len(self.mix_densities)
        test_kde = self.get_kde_function(posteriors, self.bandwidth)
        test_densities = self.pdf(test_kde, self.reference_samples)
        def f_squared_hellinger(u):
            return (np.sqrt(u)-1)**2
        # todo: this will fail when self.divergence is a callable, and is not the right place to do it anyway
        if self.divergence.lower() == 'hd':
            f = f_squared_hellinger
        else:
            raise ValueError('only squared HD is currently implemented')
        epsilon = 1e-10
        qs = test_densities + epsilon
        rs = self.reference_density + epsilon
        iw = qs/rs  #importance weights
        p_class = self.reference_classwise_densities + epsilon
        fracs = p_class/qs
        def divergence(prev):
            # ps / qs = (prev @ p_class) / qs = prev @ (p_class / qs) = prev @ fracs
            ps_div_qs = prev @ fracs
            return np.mean( f(ps_div_qs) * iw )
        return F.optim_minimize(divergence, n_classes)
 class KDEyCS(AggregativeProbabilisticQuantifier):
    def __init__(self, classifier: BaseEstimator, val_split=10, bandwidth=0.1, n_jobs=None, random_state=0):
        KDEBase._check_bandwidth(bandwidth)
        self.classifier = classifier
        self.val_split = val_split
        self.bandwidth = bandwidth
        self.n_jobs = n_jobs
        self.random_state=random_state
    def gram_matrix_mix_sum(self, X, Y=None):
        # this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
        # to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y and Sigma1 and Sigma2 are 
        # two "scalar matrices" (h^2)*I each, so Sigma1+Sigma2 has scalar 2(h^2) (h is the bandwidth)
        h = self.bandwidth
        variance = 2 * (h**2)
        nD = X.shape[1]
        gamma = 1/(2*variance)
        norm_factor = 1/np.sqrt(((2*np.pi)**nD) * (variance**(nD)))
        gram = norm_factor * rbf_kernel(X, Y, gamma=gamma)
        return gram.sum()
    def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
        if val_split is None:
            val_split = self.val_split
        self.classifier, y, posteriors, _, _ = cross_generate_predictions(
            data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
        )
        assert all(sorted(np.unique(y)) == np.arange(data.n_classes)), \
            'label name gaps not allowed in current implementation'
        n = data.n_classes
        P = posteriors
        # counts_inv keeps track of the relative weight of each datapoint within its class
        # (i.e., the weight in its KDE model)
        counts_inv = 1 / (data.counts())
        # tr_tr_sums corresponds to symbol \overline{B} in the paper
        tr_tr_sums = np.zeros(shape=(n,n), dtype=float)
        for i in range(n):
            for j in range(n):
                if i > j:
                    tr_tr_sums[i,j] = tr_tr_sums[j,i]
                else:
                    block = self.gram_matrix_mix_sum(P[y == i], P[y == j] if i!=j else None)
                    tr_tr_sums[i, j] = block
        # keep track of these data structures for the test phase
        self.Ptr = P
        self.ytr = y
        self.tr_tr_sums = tr_tr_sums
        self.counts_inv = counts_inv
        return self
    def aggregate(self, posteriors: np.ndarray):
        Ptr = self.Ptr
        Pte = posteriors
        y = self.ytr
        tr_tr_sums = self.tr_tr_sums
        M, nD = Pte.shape
        Minv = (1/M) # t in the paper
        n = Ptr.shape[1]
        # becomes a constant that does not affect the optimization, no need to compute it
        # partC = 0.5*np.log(self.gram_matrix_mix_sum(Pte) * Kinv * Kinv)
        # tr_te_sums corresponds to \overline{a}*(1/Li)*(1/M) in the paper (note the constants
        # are already aggregated to tr_te_sums, so these multiplications are not carried out
        # at each iteration of the optimization phase)
        tr_te_sums = np.zeros(shape=n, dtype=float)
        for i in range(n):
            tr_te_sums[i] = self.gram_matrix_mix_sum(Ptr[y==i], Pte) 
        def divergence(alpha):
            # called \overline{r} in the paper
            alpha_ratio = alpha * self.counts_inv
            # recal that tr_te_sums already accounts for the constant terms (1/Li)*(1/M)
            partA = -np.log((alpha_ratio @ tr_te_sums) * Minv)
            partB = 0.5 * np.log(alpha_ratio @ tr_tr_sums @ alpha_ratio)
            return partA + partB #+ partC
        return F.optim_minimize(divergence, n)
--- a/distribution_matching/method/method_kdey.py
+++ b/distribution_matching/method/method_kdey.py
@ -0,0 +1,321 @@
 from cgi import test
 import os
 import sys
 from typing import Union, Callable
 import numpy as np
 from sklearn.base import BaseEstimator
 from sklearn.linear_model import LogisticRegression
 import pandas as pd
 from sklearn.model_selection import GridSearchCV
 from sklearn.neighbors import KernelDensity
 import quapy as qp
 from quapy.data import LabelledCollection
 from quapy.protocol import APP, UPP
 from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \
    DistributionMatching, _get_divergence
 import quapy.functional as F
 import scipy
 from scipy import optimize
 #from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional
 # TODO: optimize the bandwidth automatically
 # TODO: think of a MMD-y variant, i.e., a MMD variant that uses the points in the simplex and possibly any non-linear kernel
 class KDEy(AggregativeProbabilisticQuantifier):
    BANDWIDTH_METHOD = ['auto', 'scott', 'silverman']
    ENGINE = ['scipy', 'sklearn', 'statsmodels']
    TARGET = ['min_divergence', 'min_divergence_uniform', 'max_likelihood']
    def __init__(self, classifier: BaseEstimator, val_split=0.4, divergence: Union[str, Callable]='L2',
                 bandwidth='scott', engine='sklearn', target='min_divergence', n_jobs=None, random_state=0, montecarlo_trials=1000):
        assert bandwidth in KDEy.BANDWIDTH_METHOD or isinstance(bandwidth, float), \
            f'unknown bandwidth_method, valid ones are {KDEy.BANDWIDTH_METHOD}'
        assert engine in KDEy.ENGINE, f'unknown engine, valid ones are {KDEy.ENGINE}'
        assert target in KDEy.TARGET, f'unknown target, valid ones are {KDEy.TARGET}'
        assert target=='max_likelihood' or divergence in ['KLD', 'HD', 'JS'], \
            'in this version I will only allow KLD or squared HD as a divergence'
        self.classifier = classifier
        self.val_split = val_split
        self.divergence = divergence
        self.bandwidth = bandwidth
        self.engine = engine
        self.target = target
        self.n_jobs = n_jobs
        self.random_state=random_state
        self.montecarlo_trials = montecarlo_trials
    def search_bandwidth_maxlikelihood(self, posteriors, labels):
        grid = {'bandwidth': np.linspace(0.001, 0.2, 100)}
        search = GridSearchCV(
            KernelDensity(), param_grid=grid, n_jobs=-1, cv=50, verbose=1, refit=True
        )
        search.fit(posteriors, labels)
        bandwidth = search.best_params_['bandwidth']
        print(f'auto: bandwidth={bandwidth:.5f}')
        return bandwidth
    def get_kde_function(self, posteriors):
        # if self.bandwidth == 'auto':
        #     print('adjusting bandwidth')
        #
        #     if self.engine == 'sklearn':
        #         grid = {'bandwidth': np.linspace(0.001,0.2,41)}
        #         search = GridSearchCV(
        #             KernelDensity(), param_grid=grid, n_jobs=-1, cv=10, verbose=1, refit=True
        #         )
        #         search.fit(posteriors)
        #         print(search.best_score_)
        #         print(search.best_params_)
        #
        #         import pandas as pd
        #         df = pd.DataFrame(search.cv_results_)
        #         pd.set_option('display.max_columns', None)
        #         pd.set_option('display.max_rows', None)
        #         pd.set_option('expand_frame_repr', False)
        #         print(df)
        #         sys.exit(0)
        #
        #         kde = search
        #else:
        if self.engine == 'scipy':
            # scipy treats columns as datapoints, and need the datapoints not to lie in a lower-dimensional subspace, which
            # requires removing the last dimension which is constrained
            posteriors = posteriors[:,:-1].T
            kde = scipy.stats.gaussian_kde(posteriors)
            kde.set_bandwidth(self.bandwidth)
        elif self.engine == 'sklearn':
            #print('fitting kde')
            kde = KernelDensity(bandwidth=self.bandwidth).fit(posteriors)
            #print('[fitting done]')
        return kde
    def pdf(self, kde, posteriors):
        if self.engine == 'scipy':
            return kde(posteriors[:, :-1].T)
        elif self.engine == 'sklearn':
            return np.exp(kde.score_samples(posteriors))
    def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
        """
        :param data: the training set
        :param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
        :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
         validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
         indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
         to estimate the parameters
        """
        if val_split is None:
            val_split = self.val_split
        self.classifier, y, posteriors, classes, class_count = cross_generate_predictions(
            data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
        )
        if self.bandwidth == 'auto':
            self.bandwidth = self.search_bandwidth_maxlikelihood(posteriors, y)
        self.val_densities = [self.get_kde_function(posteriors[y == cat]) for cat in range(data.n_classes)]
        self.val_posteriors = posteriors
        if self.target == 'min_divergence_uniform':
            self.samples = qp.functional.uniform_prevalence_sampling(n_classes=data.n_classes, size=self.montecarlo_trials)
            self.sample_densities = [self.pdf(kde_i, self.samples) for kde_i in self.val_densities]
        elif self.target == 'min_divergence':
            N = self.montecarlo_trials
            rs = self.random_state
            n = data.n_classes
            self.reference_samples = np.vstack([kde_i.sample(N//n, random_state=rs) for kde_i in self.val_densities])
            self.reference_classwise_densities = np.asarray([self.pdf(kde_j, self.reference_samples) for kde_j in self.val_densities])
            self.reference_density = np.mean(self.reference_classwise_densities, axis=0)  # equiv. to (uniform @ self.reference_classwise_densities)
        elif self.target == 'min_divergence_deprecated':  # the version of the first draft, with n*N presampled, then alpha*N chosen for class
            self.class_samples = [kde_i.sample(self.montecarlo_trials, random_state=self.random_state) for kde_i in self.val_densities]
            self.class_sample_densities = {}
            for ci, samples_i in enumerate(self.class_samples):
                self.class_sample_densities[ci] = np.asarray([self.pdf(kde_j, samples_i) for kde_j in self.val_densities]).T
        return self
    def aggregate(self, posteriors: np.ndarray):
        if self.target == 'min_divergence':
            return self._target_divergence(posteriors)
        elif self.target == 'min_divergence_uniform':
            return self._target_divergence_uniform(posteriors)
        elif self.target == 'max_likelihood':
            return self._target_likelihood(posteriors)
        else:
            raise ValueError('unknown target')
    # new version in which we retain all n*N examples (sampled from a mixture with uniform parameter), and then
    # apply importance sampling (IS). In this version we compute D(p_alpha||q) with IS, and not D(q||p_alpha) as
    # in the first draft
    def _target_divergence(self, posteriors):
        # in this variant we evaluate the divergence using a Montecarlo approach
        n_classes = len(self.val_densities)
        test_kde = self.get_kde_function(posteriors)
        test_densities = self.pdf(test_kde, self.reference_samples)
        def f_squared_hellinger(u):
            return (np.sqrt(u)-1)**2
        # todo: this will fail when self.divergence is a callable, and is not the right place to do it anyway
        if self.divergence.lower() == 'hd':
            f = f_squared_hellinger
        else:
            raise ValueError('only squared HD is currently implemented')
        epsilon = 1e-10
        qs = test_densities + epsilon
        rs = self.reference_density + epsilon
        iw = qs/rs  #importance weights
        p_class = self.reference_classwise_densities + epsilon
        fracs = p_class/qs
        def divergence(prev):
            # ps / qs = (prev @ p_class) / qs = prev @ (p_class / qs) = prev @ fracs
            ps_div_qs = prev @ fracs
            return np.mean( f(ps_div_qs) * iw )
        return F.optim_minimize(divergence, n_classes)
    # new version in which we retain all n*N examples (sampled from a mixture with uniform parameter), and then
    # apply importance sampling (IS). In this version we compute D(q||p_alpha) with IS, and not D(p_alpha||q) as
    # in the reformulation proposed above
    def _target_divergence_q__p(self, posteriors):
        # in this variant we evaluate the divergence using a Montecarlo approach
        n_classes = len(self.val_densities)
        test_kde = self.get_kde_function(posteriors)
        test_densities = self.pdf(test_kde, self.reference_samples)
        def f_squared_hellinger(u):
            return (np.sqrt(u)-1)**2
        # todo: this will fail when self.divergence is a callable, and is not the right place to do it anyway
        if self.divergence.lower() == 'hd':
            f = f_squared_hellinger
        else:
            raise ValueError('only squared HD is currently implemented')
        epsilon = 1e-10
        qs = test_densities + epsilon
        rs = self.reference_density + epsilon
        p_class = self.reference_classwise_densities + epsilon
        # D(q||p_a) = 1/N sum f(q/p_a) * (p_a / p_u)
        def divergence(prev):
            p_a = prev @ p_class
            return np.mean( f(qs/p_a) * (p_a/rs) )
        return F.optim_minimize(divergence, n_classes)
    # the first version we explain in the draft, choosing alpha*N from a pool of N for each class and w/o importance sampling
    def _target_divergence_deprecated(self, posteriors):
        # in this variant we evaluate the divergence using a Montecarlo approach
        n_classes = len(self.val_densities)
        test_kde = self.get_kde_function(posteriors)
        test_densities_per_class = [self.pdf(test_kde, samples_i) for samples_i in self.class_samples]
        # divergence = _get_divergence(self.divergence)
        def kld_monte(pi, qi, eps=1e-8):
            # there is no pi in front of the log because the samples are already drawn according to pi
            smooth_pi = pi+eps
            smooth_qi = qi+eps
            return np.mean(np.log(smooth_pi / smooth_qi))
        def squared_hellinger(pi, qi, eps=1e-8):
            smooth_pi = pi + eps
            smooth_qi = qi + eps
            return np.mean((np.sqrt(smooth_pi/smooth_qi)-1)**2)
        # todo: this will fail when self.divergence is a callable, and is not the right place to do it anyway
        if self.divergence.lower() == 'kld':
            fdivergence = kld_monte
        elif self.divergence.lower() == 'hd':
            fdivergence = squared_hellinger
        def dissimilarity(prev):
            # choose the samples according to the prevalence vector
            # e.g., prev = [0.5, 0.3, 0.2] will draw 50% from KDE_0, 30% from KDE_1, and 20% from KDE_2
            #       the points are already pre-sampled and de densities are pre-computed, so that now all that remains
            #       is to pick a proportional number of each from each class (same for test)
            num_variates_per_class = np.round(prev * self.montecarlo_trials).astype(int)
            sample_densities = np.vstack(
                [self.class_sample_densities[ci][:num_i] for ci, num_i in enumerate(num_variates_per_class)]
            )
            #val_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, sample_densities.T))
            val_likelihood = prev @ sample_densities.T
            #test_likelihood = []
            #for samples_i, num_i in zip(test_densities_per_class, num_variates_per_class):
            #    test_likelihood.append(samples_i[:num_i])
            #test_likelihood = np.concatenate[test]
            test_likelihood = np.concatenate(
                [samples_i[:num_i] for samples_i, num_i in zip(test_densities_per_class, num_variates_per_class)]
            )
            # return fdivergence(val_likelihood, test_likelihood)  # this is wrong, If I sample from the val distribution
            # then I am computing D(Test||Val), so it should be E_{x ~ Val}[f(Test(x)/Val(x))]
            return fdivergence(test_likelihood, val_likelihood)
        return F.optim_minimize(dissimilarity, n_classes)
    # this version explores the entire simplex, and then applies importance sampling. We have not really tested it in deep but
    # seems not to be promising
    def _target_divergence_uniform(self, posteriors):
        # in this variant we evaluate the divergence using a Montecarlo approach
        n_classes = len(self.val_densities)
        test_kde = self.get_kde_function(posteriors)
        test_likelihood = self.pdf(test_kde, self.samples)
        def f_squared_hellinger(t):
            return (np.sqrt(t) - 1)**2
        def f_jensen_shannon(t):
            return -(t+1)*np.log((t+1)/2) + t*np.log(t)
        def fdivergence(pi, qi, f, eps=1e-10):
            spi = pi+eps
            sqi = qi+eps
            return np.mean(f(spi/sqi)*sqi)
        if self.divergence.lower() == 'hd':
            f = f_squared_hellinger
        elif self.divergence.lower() == 'js':
            f = f_jensen_shannon
        def dissimilarity(prev):
            val_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, self.sample_densities))
            return fdivergence(val_likelihood, test_likelihood, f)
        return F.optim_minimize(dissimilarity, n_classes)
    def _target_likelihood(self, posteriors, eps=0.000001):
        """
        Searches for the mixture model parameter (the sought prevalence values) that yields a validation distribution
        (the mixture) that best matches the test distribution, in terms of the divergence measure of choice.
        :param instances: instances in the sample
        :return: a vector of class prevalence estimates
        """
        np.random.RandomState(self.random_state)
        n_classes = len(self.val_densities)
        test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.val_densities]
        #return lambda posteriors: sum(prev_i * self.pdf(kde_i, posteriors) for kde_i, prev_i in zip(self.val_densities, prev))
        def neg_loglikelihood(prev):
            test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip (prev, test_densities))
            test_loglikelihood = np.log(test_mixture_likelihood + eps)
            return  -np.sum(test_loglikelihood)
        return F.optim_minimize(neg_loglikelihood, n_classes)
--- a/distribution_matching/method/method_kdey_closed.py
+++ b/distribution_matching/method/method_kdey_closed.py
@ -0,0 +1,174 @@
 from cgi import test
 import os
 import sys
 from typing import Union, Callable
 import numpy as np
 from sklearn.base import BaseEstimator
 from sklearn.linear_model import LogisticRegression
 import pandas as pd
 from sklearn.model_selection import GridSearchCV
 from sklearn.neighbors import KernelDensity
 from scipy.stats import multivariate_normal
 import quapy as qp
 from quapy.data import LabelledCollection
 from quapy.protocol import APP, UPP
 from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \
    DistributionMatching, _get_divergence
 import scipy
 from scipy import optimize
 from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional
 from time import time
 from sklearn.metrics.pairwise import rbf_kernel
 def gram_matrix_mix(bandwidth, X, Y=None):
    # this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
    # to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y and Sigma1 and Sigma2 are 
    # two "scalar matrices" (h^2) I each, so Sigma1+Sigma2 has scalar 2(h^2) (h is the bandwidth)
    variance = 2 * (bandwidth**2)
    nD = X.shape[1]
    gamma = 1/(2*variance)
    gram = rbf_kernel(X, Y, gamma=gamma)
    norm_factor = 1/np.sqrt(((2*np.pi)**nD) * (variance**(nD)))
    gram *= norm_factor
    print('GRAM SUM:', gram.sum())
    return gram
 def weighted_prod(pi, tau, G):
    return pi[:,np.newaxis] * G * tau
 def tril_weighted_prod(pi, G):
    M = weighted_prod(pi, pi, G)
    return np.triu(M,1)
 class KDEyclosed(AggregativeProbabilisticQuantifier):
    def __init__(self, classifier: BaseEstimator, val_split=0.4, bandwidth=0.1, n_jobs=None, random_state=0):
        self.classifier = classifier
        self.val_split = val_split
        self.bandwidth = bandwidth
        self.n_jobs = n_jobs
        self.random_state=random_state
    def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
        """
        :param data: the training set
        :param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
        :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
         validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
         indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
         to estimate the parameters
        """
        # print('[fit] enter')
        if val_split is None:
            val_split = self.val_split
        self.classifier, y, posteriors, classes, class_count = cross_generate_predictions(
            data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
        )
        # from distribution_matching.binary_debug import HACK
        # posteriors, y = HACK(posteriors, y)
        # print('[fit] precomputing stuff')
        n = data.n_classes
        #L = [posteriors[y==i] for i in range(n)]
        #l = [len(Li) for Li in L]
        D = n
        h = self.bandwidth
        #cov_mix_scalar = 2 * h * h  # corresponds to a bandwidth of sqrt(2)*h
        #Kernel = multivariate_normal(mean=np.zeros(D), cov=cov_mix_scalar)
        # print('[fit] classifier ready; precomputing gram')
        self.gram_tr_tr = gram_matrix_mix(h, posteriors)
        # li_inv keeps track of the relative weight of each datapoint within its class 
        # (i.e., the weight in its KDE model)
        counts_inv = 1/(data.counts())
        self.li_inv = counts_inv[y]
 #        Khash = {}
 #        for a in range(n):
 #            for b in range(l[a]):
 #                for i in range(n):
 #                    Khash[(a,b,i)] = sum(Kernel.pdf(L[i][j] - L[a][b]) for j in range(l[i]))
                    # for j in range(l[i]):  # this for, and index j, can be supressed and store the sum across j
                    #     Khash[(a, b, i, j)] = Kernel.pdf(L[i][j] - L[a][b])
        self.n = n
        #self.L = L
        #self.l = l
        #self.Kernel = Kernel
        #self.Khash = Khash
        self.C = ((2 * np.pi) ** (-D / 2)) * h ** (-D)
        print('C:', self.C)
        self.Ptr = posteriors
        self.ytr = y
        assert all(sorted(np.unique(y)) == np.arange(data.n_classes)), 'label name gaps not allowed in current implementation'
        # print('[fit] exit')
        return self
    def aggregate(self, posteriors: np.ndarray):
        # print('[aggregate] enter')
        Ptr = self.Ptr
        Pte = posteriors
        gram_te_te = gram_matrix_mix(self.bandwidth, Pte, Pte)
        gram_tr_te = gram_matrix_mix(self.bandwidth, Ptr, Pte)
        K = Pte.shape[0]
        tau = np.full(shape=K, fill_value=1/K, dtype=float)
        h = self.bandwidth
        D = Ptr.shape[1]
        C = self.C
        partC = 0.5 * np.log( C/K + 2 * tril_weighted_prod(tau, gram_te_te).sum())
        def match(alpha):
            pi = alpha[self.ytr] * self.li_inv
            partA = -np.log(weighted_prod(pi, tau, gram_tr_te).sum())
            # print('gram_Tr_Tr sum', self.gram_tr_tr.sum())
            # print('pretril', (np.triu(self.gram_tr_tr,1).sum()))
            # print('tril', (2 * tril_weighted_prod(pi, self.gram_tr_tr).sum()))
            # print('pi', pi.sum(), pi[:10])
            # print('Cs', C*(pi**2).sum())
            partB = 0.5 * np.log(C*(pi**2).sum() + 2*tril_weighted_prod(pi, self.gram_tr_tr).sum())
            Dcs = partA + partB + partC
            # print(f'{alpha=}\t{partA=}\t{partB=}\t{partC}')
            # print()
            return Dcs
        # print('[aggregate] starts search')
        # the initial point is set as the uniform distribution
        uniform_distribution = np.full(fill_value=1 / self.n, shape=(self.n,))
        # uniform_distribution = [0.2, 0.8]
        # solutions are bounded to those contained in the unit-simplex
        bounds = tuple((0, 1) for _ in range(self.n))  # values in [0,1]
        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
        r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
        # print('[aggregate] end')
        return r.x
--- a/distribution_matching/method/method_kdey_closed_efficient.py
+++ b/distribution_matching/method/method_kdey_closed_efficient.py
@ -0,0 +1,145 @@
 from cgi import test
 import os
 import sys
 from typing import Union, Callable
 import numpy as np
 from sklearn.base import BaseEstimator
 from sklearn.linear_model import LogisticRegression
 import pandas as pd
 from sklearn.model_selection import GridSearchCV
 from sklearn.neighbors import KernelDensity
 from scipy.stats import multivariate_normal
 import quapy as qp
 from quapy.data import LabelledCollection
 from quapy.protocol import APP, UPP
 from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \
    DistributionMatching, _get_divergence
 import scipy
 from scipy import optimize
 from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional
 from time import time
 from sklearn.metrics.pairwise import rbf_kernel
 def gram_matrix_mix_sum(bandwidth, X, Y=None, reduce=True):
    # this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
    # to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y and Sigma1 and Sigma2 are 
    # two "scalar matrices" (h^2) I each, so Sigma1+Sigma2 has scalar 2(h^2) (h is the bandwidth)
    variance = 2 * (bandwidth**2)
    nRows,nD = X.shape
    gamma = 1/(2*variance)
    gram = rbf_kernel(X, Y, gamma=gamma)
    norm_factor = 1/np.sqrt(((2*np.pi)**nD) * (variance**(nD)))
    gram *= norm_factor
    if Y is None:
        # ignores the diagonal
        aggr = (2 * np.triu(gram, 1)).sum()
    else:
        aggr = gram.sum()
    return aggr
 class KDEyclosed_efficient(AggregativeProbabilisticQuantifier):
    def __init__(self, classifier: BaseEstimator, val_split=0.4, bandwidth=0.1, n_jobs=None, random_state=0):
        self.classifier = classifier
        self.val_split = val_split
        self.bandwidth = bandwidth
        self.n_jobs = n_jobs
        self.random_state=random_state
    def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
        """
        :param data: the training set
        :param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
        :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
         validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
         indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
         to estimate the parameters
        """
        # print('[fit] enter')
        if val_split is None:
            val_split = self.val_split
        self.classifier, y, posteriors, classes, class_count = cross_generate_predictions(
            data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
        )
        assert all(sorted(np.unique(y)) == np.arange(data.n_classes)), \
            'label name gaps not allowed in current implementation'
        n = data.n_classes
        h = self.bandwidth
        P = posteriors
        counts_inv = 1 / (data.counts())
        nD = P.shape[1]
        C = ((2 * np.pi) ** (-nD / 2)) * (self.bandwidth ** (-nD))
        tr_tr_sums = np.zeros(shape=(n,n), dtype=float)
        self.tr_C = []
        for i in range(n):
            for j in range(n):
                if i > j:
                    tr_tr_sums[i,j] = tr_tr_sums[j,i]
                else:
                    if i == j:
                        tr_tr_sums[i, j] = gram_matrix_mix_sum(h, P[y == i])
                        self.tr_C.append(C * sum(y == i))
                    else:
                        block = gram_matrix_mix_sum(h, P[y == i], P[y == j])
                        tr_tr_sums[i, j] = block
        self.tr_C = np.asarray(self.tr_C)
        self.Ptr = posteriors
        self.ytr = y
        self.tr_tr_sums = tr_tr_sums
        self.counts_inv = counts_inv
        return self
    def aggregate(self, posteriors: np.ndarray):
        # print('[aggregate] enter')
        Ptr = self.Ptr
        Pte = posteriors
        K,nD = Pte.shape
        Kinv = (1/K)
        h = self.bandwidth
        n = Ptr.shape[1]
        y = self.ytr
        tr_tr_sums = self.tr_tr_sums
        C = ((2 * np.pi) ** (-nD / 2)) * (self.bandwidth ** (-nD))
        partC = 0.5*np.log(gram_matrix_mix_sum(h, Pte) * Kinv * Kinv + C*Kinv)
        tr_te_sums = np.zeros(shape=n, dtype=float)
        for i in range(n):
            tr_te_sums[i] = gram_matrix_mix_sum(h, Ptr[y==i], Pte) * self.counts_inv[i] * Kinv
        def match(alpha):
            partA = -np.log((alpha * tr_te_sums).sum())
            alpha_l = alpha * self.counts_inv
            partB = 0.5 * np.log((alpha_l[:,np.newaxis] * tr_tr_sums * alpha_l).sum() + (self.tr_C*(alpha_l**2)).sum())
            return partA + partB + partC
        # print('[aggregate] starts search')
        # the initial point is set as the uniform distribution
        uniform_distribution = np.full(fill_value=1 / n, shape=(n,))
        # uniform_distribution = [0.2, 0.8]
        # solutions are bounded to those contained in the unit-simplex
        bounds = tuple((0, 1) for _ in range(n))  # values in [0,1]
        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
        r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
        # print('[aggregate] end')
        return r.x
--- a/distribution_matching/method/method_kdey_closed_efficient_correct.py
+++ b/distribution_matching/method/method_kdey_closed_efficient_correct.py
@ -0,0 +1,139 @@
 import os
 import sys
 from typing import Union, Callable
 import numpy as np
 from sklearn.base import BaseEstimator
 from sklearn.linear_model import LogisticRegression
 import pandas as pd
 from sklearn.model_selection import GridSearchCV
 from sklearn.neighbors import KernelDensity
 from scipy.stats import multivariate_normal
 import quapy as qp
 from quapy.data import LabelledCollection
 from quapy.protocol import APP, UPP
 from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \
    DistributionMatching, _get_divergence
 import scipy
 from scipy import optimize
 from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional
 from time import time
 from sklearn.metrics.pairwise import rbf_kernel
 def gram_matrix_mix_sum(bandwidth, X, Y=None, reduce=True):
    # this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
    # to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y and Sigma1 and Sigma2 are 
    # two "scalar matrices" (h^2) I each, so Sigma1+Sigma2 has scalar 2(h^2) (h is the bandwidth)
    variance = 2 * (bandwidth**2)
    nRows,nD = X.shape
    gamma = 1/(2*variance)
    norm_factor = 1/np.sqrt(((2*np.pi)**nD) * (variance**(nD)))
    gram = norm_factor * rbf_kernel(X, Y, gamma=gamma)
    return gram.sum()
 class KDEyclosed_efficient_corr(AggregativeProbabilisticQuantifier):
    def __init__(self, classifier: BaseEstimator, val_split=0.4, bandwidth=0.1, n_jobs=None, random_state=0):
        self.classifier = classifier
        self.val_split = val_split
        self.bandwidth = bandwidth
        self.n_jobs = n_jobs
        self.random_state=random_state
    def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
        """
        :param data: the training set
        :param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
        :param val_split: either a float in (0,1) indicating the proportion of training instances to use for
         validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
         indicating the validation set itself, or an int indicating the number k of folds to be used in kFCV
         to estimate the parameters
        """
        # print('[fit] enter')
        if val_split is None:
            val_split = self.val_split
        self.classifier, y, posteriors, classes, class_count = cross_generate_predictions(
            data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
        )
        print('training over')
        assert all(sorted(np.unique(y)) == np.arange(data.n_classes)), \
            'label name gaps not allowed in current implementation'
        # print('[fit] precomputing stuff')
        # from distribution_matching.binary_debug import HACK
        # posteriors, y = HACK(posteriors, y)
        n = data.n_classes
        h = self.bandwidth
        # print('[fit] classifier ready; precomputing gram')
        P = posteriors
        # li_inv keeps track of the relative weight of each datapoint within its class
        # (i.e., the weight in its KDE model)
        counts_inv = 1 / (data.counts())
        tr_tr_sums = np.zeros(shape=(n,n), dtype=float)
        for i in range(n):
            for j in range(n):
                if i > j:
                    tr_tr_sums[i,j] = tr_tr_sums[j,i]
                else:
                    block = gram_matrix_mix_sum(h, P[y == i], P[y == j] if i!=j else None)
                    tr_tr_sums[i, j] = block
        # compute class-class block-sums
        self.Ptr = posteriors
        self.ytr = y
        self.tr_tr_sums = tr_tr_sums
        self.counts_inv = counts_inv
        print('fit over')
        return self
    def aggregate(self, posteriors: np.ndarray):
        # print('aggregating')
        Ptr = self.Ptr
        Pte = posteriors
        K,nD = Pte.shape
        Kinv = (1/K)
        h = self.bandwidth
        n = Ptr.shape[1]
        y = self.ytr
        tr_tr_sums = self.tr_tr_sums
        partC = 0.5*np.log(gram_matrix_mix_sum(h, Pte) * Kinv * Kinv)
        tr_te_sums = np.zeros(shape=n, dtype=float)
        for i in range(n):
            tr_te_sums[i] = gram_matrix_mix_sum(h, Ptr[y==i], Pte) * self.counts_inv[i] * Kinv
        def match(alpha):
            partA = -np.log((alpha * tr_te_sums).sum())
            alpha_l = alpha * self.counts_inv
            partB = 0.5 * np.log((alpha_l[:,np.newaxis] * tr_tr_sums * alpha_l).sum())
            return partA + partB + partC
        # print('starting search')
        # the initial point is set as the uniform distribution
        uniform_distribution = np.full(fill_value=1 / n, shape=(n,))
        # solutions are bounded to those contained in the unit-simplex
        bounds = tuple((0, 1) for _ in range(n))  # values in [0,1]
        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
        r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
        return r.x
--- a/distribution_matching/tables/gen_tables_compact.py
+++ b/distribution_matching/tables/gen_tables_compact.py
@ -0,0 +1,210 @@
 from distribution_matching.commons import (ADJUSTMENT_METHODS, BIN_METHODS, DISTR_MATCH_METHODS, MAX_LIKE_METHODS,
                                           METHODS, FULL_METHOD_LIST)
 import quapy as qp
 from os import makedirs
 import os
 from tabular import Table
 import pandas as pd
 tables_path = '.'
 # makedirs(tables_path, exist_ok=True)
 MAXTONE = 35  # sets the intensity of the maximum color reached by the worst (red) and best (green) results
 SHOW_STD = False
 NUM_ADJUSTMENT_METHODS = len(ADJUSTMENT_METHODS)
 NUM_MAXIMUM_LIKELIHOOD_METHODS = len(MAX_LIKE_METHODS)
 NUM_DISTRIBUTION_MATCHING_METHODS = len(DISTR_MATCH_METHODS)
 qp.environ['SAMPLE_SIZE'] = 100
 nice_bench = {
    'sanders': 'Sanders',
    'semeval13': 'SemEval13',
    'semeval14': 'SemEval14',
    'semeval15': 'SemEval15',
    'semeval16': 'SemEval16',
 }
 def save_table(path, table):
    print(f'saving results in {path}')
    with open(path, 'wt') as foo:
        foo.write(table)
 def new_table(datasets, methods):
    return Table(
        benchmarks=datasets,
        methods=methods,
        ttest='wilcoxon',
        prec_mean=5,
        show_std=SHOW_STD,
        prec_std=4,
        clean_zero=(eval=='mae'),
        average=True,
        maxtone=MAXTONE
    )
 def make_table(tabs, eval, benchmark_groups, benchmark_names, compact=False):
    n_methods = len(METHODS)
    assert n_methods == (NUM_ADJUSTMENT_METHODS+NUM_DISTRIBUTION_MATCHING_METHODS+NUM_MAXIMUM_LIKELIHOOD_METHODS), \
        "Unexpected number of methods"
    cline = "\cline{2-" + str(n_methods+ 1) + "}"
    # write the latex table
    tabular = """
            \\begin{tabular}{|c|""" + ('c|' * NUM_ADJUSTMENT_METHODS) + ('c|' * NUM_DISTRIBUTION_MATCHING_METHODS) +  ('c|' * NUM_MAXIMUM_LIKELIHOOD_METHODS) + """} """ + cline + """           
            \multicolumn{1}{c}{} & 
            \multicolumn{"""+str(NUM_ADJUSTMENT_METHODS)+"""}{|c}{Adjustment} & 
            \multicolumn{"""+str(NUM_DISTRIBUTION_MATCHING_METHODS)+"""}{|c|}{Distribution Matching} & 
            \multicolumn{"""+str(NUM_MAXIMUM_LIKELIHOOD_METHODS)+"""}{c|}{Maximum Likelihood} \\\\
            \hline               
            """
    for i, (tab, group, name) in enumerate(zip(tabs, benchmark_groups, benchmark_names)):
        tablines = tab.latexTabular(benchmark_replace=nice_bench, endl='\\\\'+ cline, aslines=True)
        tablines[0] = tablines[0].replace('\multicolumn{1}{c|}{}', '\\textbf{'+name+'}')
        if compact or len(tab.benchmarks)==1:
            # if compact, keep the method names and the average; discard the rest
            tabular += tablines[0] + '\n' + tablines[1 if len(tab.benchmarks)==1 else -1] + '\n'
        else:
            tabular += '\n'.join(tablines)
        tabular += "\n" + "\\textit{Rank} & " + tab.getRankTable(prec_mean=0 if name.startswith('LeQua') else 1).latexAverage()
        if i < (len(tabs) - 1):
            tabular += "\\hline\n"
        else:
            tabular += "\n"
    tabular += "\end{tabular}"
    return tabular
 def gen_tables_uci_multiclass(eval):
    print('Generating table for UCI Multiclass Datasets', eval)
    dir_results = f'../results/ucimulti/{eval}'
    datasets = qp.datasets.UCI_MULTICLASS_DATASETS
    tab =  new_table(datasets, METHODS)
    for dataset in datasets:
        print(f'\t Dataset: {dataset}: ', end='')
        for method in METHODS:
            result_path = f'{dir_results}/{method}_{dataset}.dataframe'
            if os.path.exists(result_path):
                df = pd.read_csv(result_path)
                print(f'{method}', end=' ')
                tab.add(dataset, method, df[eval].values)
            else:
                print(f'MISSING-{method}', end=' ')
        print()
    return tab
 def gen_tables_uci_bin(eval):
    print('Generating table for UCI Datasets', eval)
    dir_results = f'../results/binary/{eval}'
    exclude = ['acute.a', 'acute.b', 'iris.1', 'balance.2']
    datasets = [x for x in qp.datasets.UCI_DATASETS if x not in exclude]
    tab =  new_table(datasets, BIN_METHODS)
    for dataset in datasets:
        print(f'\t Dataset: {dataset}: ', end='')
        for method in BIN_METHODS:
            result_path = f'{dir_results}/{method}_{dataset}.dataframe'
            if os.path.exists(result_path):
                df = pd.read_csv(result_path)
                print(f'{method}', end=' ')
                tab.add(dataset, method, df[eval].values)
            else:
                print(f'MISSING-{method}', end=' ')
    return tab
 def gen_tables_tweet(eval):
    print('Generating table for Twitter', eval)
    dir_results = f'../results/tweet/{eval}'
    datasets = qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST
    tab =  new_table(datasets, METHODS)
    for dataset in datasets:
        print(f'\t Dataset: {dataset}: ', end='')
        for method in METHODS:
            result_path = f'{dir_results}/{method}_{dataset}.dataframe'
            if os.path.exists(result_path):
                df = pd.read_csv(result_path)
                print(f'{method}', end=' ')
                tab.add(dataset, method, df[eval].values)
            else:
                print(f'MISSING-{method}', end=' ')
        print()
    return tab
 def gen_tables_lequa(Methods, task, eval):
    # generating table for LeQua-T1A or Lequa-T1B; only one table with two rows, one for MAE, another for MRAE
    tab = new_table([task], Methods)
    print('Generating table for T1A@Lequa', eval, end='')
    dir_results = f'../results/lequa/{task}/{eval}'
    for method in Methods:
        result_path = f'{dir_results}/{method}.dataframe'
        if os.path.exists(result_path):
            df = pd.read_csv(result_path)
            print(f'{method}', end=' ')
            tab.add(task, method, df[eval].values)
        else:
            print(f'MISSING-{method}', end=' ')
    print()
    return tab
 if __name__ == '__main__':
    os.makedirs('./latex', exist_ok=True)
    for eval in ['mae', 'mrae']:
        tabs = []
        tabs.append(gen_tables_tweet(eval))
        tabs.append(gen_tables_uci_multiclass(eval))
        tabs.append(gen_tables_lequa(METHODS, 'T1B', eval))
        names = ['Tweets', 'UCI-multi', 'LeQua']
        table = make_table(tabs, eval, benchmark_groups=tabs, benchmark_names=names)
        save_table(f'./latex/multiclass_{eval}.tex', table)
    for eval in ['mae', 'mrae']:
        tabs = []
        tabs.append(gen_tables_uci_bin(eval))
        # print uci-binary with all datasets for the appendix
        table = make_table(tabs, eval, benchmark_groups=tabs, benchmark_names=['UCI-binary'])
        save_table(f'./latex/ucibinary_{eval}.tex', table)
        # print uci-bin compacted plus lequa-T1A for the main body
        tabs.append(gen_tables_lequa(BIN_METHODS, 'T1A', eval))
        table = make_table(tabs, eval, benchmark_groups=tabs, benchmark_names=['UCI-binary', 'LeQua'], compact=True)
        save_table(f'./latex/binary_{eval}.tex', table)
    print("[Tables Done] runing latex")
    os.chdir('./latex/')
    os.system('pdflatex tables_compact.tex')
    os.system('rm tables_compact.aux tables_compact.bbl tables_compact.blg tables_compact.log tables_compact.out tables_compact.dvi')
--- a/distribution_matching/tables/latex/tables_compact.tex
+++ b/distribution_matching/tables/latex/tables_compact.tex
@ -0,0 +1,107 @@
 \documentclass{article}
 \usepackage[utf8]{inputenc} % allow utf-8 input
 \usepackage[T1]{fontenc}    % use 8-bit T1 fonts
 \usepackage{hyperref}       % hyperlinks
 \usepackage{url}            % simple URL typesetting
 \usepackage{booktabs}       % professional-quality tables
 \usepackage{amsfonts}       % blackboard math symbols
 \usepackage{nicefrac}       % compact symbols for 1/2, etc.
 \usepackage{microtype}      % microtypography
 \usepackage{lipsum}
 \usepackage{fancyhdr}       % header
 \usepackage{graphicx}       % graphics
 \graphicspath{{media/}}     % organize your images and other figures under media/ folder
 \usepackage{amsmath}
 \usepackage{bm}
 \usepackage{tabularx}
 \usepackage{color}
 \usepackage{colortbl}
 \usepackage{xcolor}
 \usepackage{lmodern}
 \DeclareMathOperator*{\argmax}{arg\,max}
 \DeclareMathOperator*{\argmin}{arg\,min}
 \newif\ifdraft
 \drafttrue
 \newcommand{\juanjo}[1]{\ifdraft{\leavevmode\color{purple}{[JJ]:
 {#1}}}\else{\vspace{0ex}}\fi}
 \newcommand{\alex}[1]{\ifdraft{\leavevmode\color{violet}{[AM]:
 {#1}}}\else{\vspace{0ex}}\fi}
 \newcommand{\pablo}[1]{\ifdraft{\leavevmode\color{red}{[PG]:
 {#1}}}\else{\vspace{0ex}}\fi}
 \title{Tables} 
 \author{
  Alejandro Moreo 
 }
 \begin{document}
 \maketitle
 \begin{table}[h]
 \centering
 \caption{Multiclass AE}
 \resizebox{\textwidth}{!}{%
 \input{multiclass_mae}
 }%
 \end{table}
 \begin{table}[h]
 \centering
 \caption{Multiclass RAE}
 \resizebox{\textwidth}{!}{%
 \input{multiclass_mrae}
 }%
 \end{table}
 \begin{table}[h]
 \centering
 \caption{Binary MAE}
 \resizebox{\textwidth}{!}{%
 \input{binary_mae}
 }%
 \end{table}
 \begin{table}[h]
  \centering
  \caption{Binary MRAE}
 \resizebox{\textwidth}{!}{%
 \input{binary_mrae}
 }%
 \end{table}
 \begin{table}[h]
 \centering
 \caption{UCI binary full AE}
 \resizebox{\textwidth}{!}{%
 \input{ucibinary_mae}
 }%
 \end{table}
 \begin{table}[h]
 \centering
 \caption{UCI binary full RAE}
 \resizebox{\textwidth}{!}{%
 \input{ucibinary_mrae}
 }%
 \end{table}
 \end{document}
--- a/distribution_matching/tables/tabular.py
+++ b/distribution_matching/tables/tabular.py
@ -0,0 +1,348 @@
 import numpy as np
 import itertools
 from scipy.stats import ttest_ind_from_stats, wilcoxon
 class Table:
    VALID_TESTS = [None, "wilcoxon", "ttest"]
    def __init__(self, benchmarks, methods, lower_is_better=True, ttest='ttest', prec_mean=3,
                 clean_zero=False, show_std=False, prec_std=3, average=True, missing=None, missing_str='--', color=True, maxtone=50):
        assert ttest in self.VALID_TESTS, f'unknown test, valid are {self.VALID_TESTS}'
        self.benchmarks = np.asarray(benchmarks)
        self.benchmark_index = {row:i for i, row in enumerate(benchmarks)}
        self.methods = np.asarray(methods)
        self.method_index = {col:j for j, col in enumerate(methods)}
        self.map = {}  
        # keyed (#rows,#cols)-ndarrays holding computations from self.map['values']
        self._addmap('values', dtype=object)
        self.lower_is_better = lower_is_better
        self.ttest = ttest
        self.prec_mean = prec_mean
        self.clean_zero = clean_zero
        self.show_std = show_std
        self.prec_std = prec_std
        self.add_average = average
        self.missing = missing
        self.missing_str = missing_str
        self.color = color
        self.maxtone = maxtone
        self.touch()
    @property
    def nbenchmarks(self):
        return len(self.benchmarks)
    @property
    def nmethods(self):
        return len(self.methods)
    def touch(self):
        self._modif = True
    def update(self):
        if self._modif:
            self.compute()
    def _getfilled(self):
        return np.argwhere(self.map['fill'])
    @property
    def values(self):
        return self.map['values']
    def _indexes(self):
        return itertools.product(range(self.nbenchmarks), range(self.nmethods))
    def _addmap(self, map, dtype, func=None):
        self.map[map] = np.empty((self.nbenchmarks, self.nmethods), dtype=dtype)
        if func is None:
            return
        m = self.map[map]
        f = func
        indexes = self._indexes() if map == 'fill' else self._getfilled()
        for i, j in indexes:
            m[i, j] = f(self.values[i, j])
    def _addrank(self):
        for i in range(self.nbenchmarks):
            filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
            col_means = [self.map['mean'][i,j] for j in filled_cols_idx]
            ranked_cols_idx = filled_cols_idx[np.argsort(col_means)]
            if not self.lower_is_better:
                ranked_cols_idx = ranked_cols_idx[::-1]
            self.map['rank'][i, ranked_cols_idx] = np.arange(1, len(filled_cols_idx)+1)
    def _addcolor(self):
        for i in range(self.nbenchmarks):
            filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
            if filled_cols_idx.size==0:
                continue
            col_means = [self.map['mean'][i,j] for j in filled_cols_idx]
            #col_means = [self.map['rank'][i, j] for j in filled_cols_idx]
            minval = min(col_means)
            maxval = max(col_means)
            for col_idx in filled_cols_idx:
                val = self.map['mean'][i,col_idx]
                norm = (maxval - minval)
                if norm > 0:
                    normval = (val - minval) / norm
                else:
                    normval = 0.5
                if self.lower_is_better:
                    normval = 1 - normval
                normval = np.clip(normval, 0,1)
                self.map['color'][i, col_idx] = color_red2green_01(normval, self.maxtone)
    def _run_ttest(self, row, col1, col2):
        mean1 = self.map['mean'][row, col1]
        std1 = self.map['std'][row, col1]
        nobs1 = self.map['nobs'][row, col1]
        mean2 = self.map['mean'][row, col2]
        std2 = self.map['std'][row, col2]
        nobs2 = self.map['nobs'][row, col2]
        _, p_val = ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2)
        return p_val
    def _run_wilcoxon(self, row, col1, col2):
        values1 = self.map['values'][row, col1]
        values2 = self.map['values'][row, col2]
        try:
            _, p_val = wilcoxon(values1, values2)
        except ValueError:
            p_val = 0
        return p_val
    def _add_statistical_test(self):
        if self.ttest is None:
            return
        self.some_similar = [False]*self.nmethods
        for i in range(self.nbenchmarks):
            filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
            if len(filled_cols_idx) <= 1:
                continue
            col_means = [self.map['mean'][i,j] for j in filled_cols_idx]
            best_pos = filled_cols_idx[np.argmin(col_means)]
            for j in filled_cols_idx:
                if j==best_pos:
                    continue
                if self.ttest == 'ttest':
                    p_val = self._run_ttest(i, best_pos, j)
                else:
                    p_val = self._run_wilcoxon(i, best_pos, j)
                pval_outcome = pval_interpretation(p_val)
                self.map['ttest'][i, j] = pval_outcome
                if pval_outcome != 'Diff':
                    self.some_similar[j] = True
    def compute(self):
        self._addmap('fill', dtype=bool, func=lambda x: x is not None)
        self._addmap('mean', dtype=float, func=np.mean)
        self._addmap('std', dtype=float, func=np.std)
        self._addmap('nobs', dtype=float, func=len)
        self._addmap('rank', dtype=int, func=None)
        self._addmap('color', dtype=object, func=None)
        self._addmap('ttest', dtype=object, func=None)
        self._addmap('latex', dtype=object, func=None)
        self._addrank()
        self._addcolor()
        self._add_statistical_test()
        if self.add_average:
            self._addave()
        self._modif = False
    def _is_column_full(self, col):
        return all(self.map['fill'][:, self.method_index[col]])
    def _addave(self):
        ave = Table(['ave'], self.methods,
                    lower_is_better=self.lower_is_better,
                    ttest=self.ttest,
                    average=False,
                    missing=self.missing,
                    missing_str=self.missing_str,
                    prec_mean=self.prec_mean,
                    prec_std=self.prec_std,
                    clean_zero=self.clean_zero,
                    show_std=self.show_std,
                    color=self.color,
                    maxtone=self.maxtone)
        for col in self.methods:
            values = None
            if self._is_column_full(col):
                if self.ttest == 'ttest':
                    # values = np.asarray(self.map['mean'][:, self.method_index[col]])
                    values = np.concatenate(self.values[:, self.method_index[col]])
                else:  # wilcoxon
                    # values = np.asarray(self.map['mean'][:, self.method_index[col]])
                    values = np.concatenate(self.values[:, self.method_index[col]])
            ave.add('ave', col, values)
        self.average = ave
    def add(self, benchmark, method, values):
        if values is not None:
            values = np.asarray(values)
            if values.ndim==0:
                values = values.flatten()
        rid, cid = self._coordinates(benchmark, method)
        self.map['values'][rid, cid] = values
        self.touch()
    def get(self, benchmark, method, attr='mean'):
        self.update()
        assert attr in self.map, f'unknwon attribute {attr}'
        rid, cid = self._coordinates(benchmark, method)
        if self.map['fill'][rid, cid]:
            v = self.map[attr][rid, cid]
            if v is None or (isinstance(v,float) and np.isnan(v)):
                return self.missing
            return v
        else:
            return self.missing
    def _coordinates(self, benchmark, method):
        assert benchmark in self.benchmark_index, f'benchmark {benchmark} out of range'
        assert method in self.method_index, f'method {method} out of range'
        rid = self.benchmark_index[benchmark]
        cid = self.method_index[method]
        return rid, cid
    def get_average(self, method, attr='mean'):
        self.update()
        if self.add_average:
            return self.average.get('ave', method, attr=attr)
        return None
    def get_color(self, benchmark, method):
        color = self.get(benchmark, method, attr='color')
        if color is None:
            return ''
        return color
    def latex(self, benchmark, method):
        self.update()
        i,j = self._coordinates(benchmark, method)
        if self.map['fill'][i,j] == False:
            return self.missing_str
        mean = self.map['mean'][i,j]
        l = f" {mean:.{self.prec_mean}f}"
        if self.clean_zero:
            l = l.replace(' 0.', '.')
        isbest = self.map['rank'][i,j] == 1
        if isbest:
            l = "\\textbf{"+l.strip()+"}"
        stat = '' if self.ttest is None else '^{\phantom{\ddag}}'
        if self.ttest is not None and self.some_similar[j]:
            test_label = self.map['ttest'][i,j]
            if test_label == 'Sim':
                stat = '^{\dag}'
            elif test_label == 'Same':
                stat = '^{\ddag}'
            elif isbest or test_label == 'Diff':
                stat = '^{\phantom{\ddag}}'
        std = ''
        if self.show_std:
            std = self.map['std'][i,j]
            std = f" {std:.{self.prec_std}f}"
            if self.clean_zero:
                std = std.replace(' 0.', '.')
            std = f"\pm {std:{self.prec_std}}"
        if stat!='' or std!='':
            l = f'{l}${stat}{std}$'
        if self.color:
            l += ' ' + self.map['color'][i,j]
        return l
    def latexTabular(self, benchmark_replace={}, method_replace={}, aslines=False, endl='\\\\\hline'):
        lines = []
        l = '\multicolumn{1}{c|}{} & '
        l += ' & '.join([method_replace.get(col, col) for col in self.methods])
        l += ' \\\\\hline'
        lines.append(l)
        for row in self.benchmarks:
            rowname = benchmark_replace.get(row, row)
            l = rowname + ' & '
            l += self.latexRow(row, endl=endl)
            lines.append(l)
        if self.add_average:
            # l += '\hline\n'
            l = '\hline \n \\textit{Average} & '
            l += self.latexAverage(endl=endl)
            lines.append(l)
        if not aslines:
            lines='\n'.join(lines)
        return lines
    def latexRow(self, benchmark, endl='\\\\\hline\n'):
        s = [self.latex(benchmark, col) for col in self.methods]
        s = ' & '.join(s)
        s += ' ' + endl
        return s
    def latexAverage(self, endl='\\\\\hline\n'):
        if self.add_average:
            return self.average.latexRow('ave', endl=endl)
    def getRankTable(self, prec_mean=0):
        t = Table(benchmarks=self.benchmarks, methods=self.methods, prec_mean=prec_mean, average=True, maxtone=self.maxtone, ttest=None)
        for rid, cid in self._getfilled():
            row = self.benchmarks[rid]
            col = self.methods[cid]
            t.add(row, col, self.get(row, col, 'rank'))
        t.compute()
        return t
    def dropMethods(self, methods):
        drop_index = [self.method_index[m] for m in methods]
        new_methods = np.delete(self.methods, drop_index)
        new_index = {col:j for j, col in enumerate(new_methods)}
        self.map['values'] = self.values[:,np.asarray([self.method_index[m] for m in new_methods], dtype=int)]
        self.methods = new_methods
        self.method_index = new_index
        self.touch()
 def pval_interpretation(p_val):
    if 0.005 >= p_val:
        return 'Diff'
    elif 0.05 >= p_val > 0.005:
        return 'Sim'
    elif p_val > 0.05:
        return 'Same'
 def color_red2green_01(val, maxtone=50):
    if np.isnan(val): return None
    assert 0 <= val <= 1, f'val {val} out of range [0,1]'
    # rescale to [-1,1]
    val = val * 2 - 1
    if val < 0:
        color = 'red'
        tone = maxtone * (-val)
    else:
        color = 'green'
        tone = maxtone * val
    return '\cellcolor{' + color + f'!{int(tone)}' + '}'
--- a/distribution_matching/tmp/approximating_divergence_montecarlo.py
+++ b/distribution_matching/tmp/approximating_divergence_montecarlo.py
@ -0,0 +1,82 @@
 import numpy as np
 from scipy.stats import norm, uniform, expon
 from scipy.special import rel_entr
 def kld_approx(p, q, resolution=100):
    steps = np.linspace(-5, 5, resolution)
    integral = 0
    for step_i1, step_i2 in zip(steps[:-1], steps[1:]):
        base = step_i2-step_i1
        middle = (step_i1+step_i2)/2
        pi = p.pdf(middle)
        qi = q.pdf(middle)
        integral += (base * pi*np.log(pi/qi))
    return integral
 def integrate(f, resolution=10000):
    steps = np.linspace(-10, 10, resolution)
    integral = 0
    for step_i1, step_i2 in zip(steps[:-1], steps[1:]):
        base = step_i2 - step_i1
        middle = (step_i1 + step_i2) / 2
        integral += (base * f(middle))
    return integral
 def kl_analytic(m1, s1, m2, s2):
    return np.log(s2/s1)+ (s1*s1 + (m1-m2)**2) / (2*s2*s2) - 0.5
 def montecarlo(p, q, trials=1000000):
    xs = p.rvs(trials)
    ps = p.pdf(xs)
    qs = q.pdf(xs)
    return np.mean(np.log(ps/qs))
 def montecarlo2(p, q, trials=100000):
    #r = norm(-3, scale=5)
    r = uniform(-10, 20)
    # r = expon(loc=-6)
    #r = p
    xs = r.rvs(trials)
    rs = r.pdf(xs)
    print(rs)
    ps = p.pdf(xs)+0.0000001
    qs = q.pdf(xs)+0.0000001
    return np.mean((ps/rs)*np.log(ps/qs))
 def wrong(p, q, trials=100000):
    xs = np.random.uniform(-10,10, trials)
    ps = p.pdf(xs)
    qs = q.pdf(xs)
    return np.mean(ps*np.log(ps/qs))
 p = norm(loc=0, scale=1)
 q = norm(loc=1, scale=1)
 integral_approx = kld_approx(p, q)
 analytic_solution = kl_analytic(p.kwds['loc'], p.kwds['scale'], q.kwds['loc'], q.kwds['scale'])
 montecarlo_approx = montecarlo(p, q)
 montecarlo_approx2 = montecarlo2(p, q)
 wrong_approx = wrong(p, q)
 print(f'{analytic_solution=:.10f}')
 print()
 print(f'{integral_approx=:.10f}')
 print(f'integra error = {np.abs(analytic_solution-integral_approx):.10f}')
 print()
 print(f'{montecarlo_approx=:.10f}')
 print(f'montecarlo error = {np.abs(analytic_solution-montecarlo_approx):.10f}')
 print()
 print(f'{montecarlo_approx2=:.10f}')
 print(f'montecarlo2 error = {np.abs(analytic_solution-montecarlo_approx2):.10f}')
 print()
 print(f'{wrong_approx=:.10f}')
 print(f'wrong error = {np.abs(analytic_solution-wrong_approx):.10f}')
--- a/distribution_matching/tmp/cauchy_schwarz_div_gauss_mix.py
+++ b/distribution_matching/tmp/cauchy_schwarz_div_gauss_mix.py
@ -0,0 +1,113 @@
 import numpy as np
 from sklearn.neighbors import KernelDensity
 from scipy.stats import norm, multivariate_normal, Covariance
 from sklearn.metrics.pairwise import rbf_kernel
 from sklearn.neighbors import KDTree
 # def CauchySchwarzDivGaussMix(pi_i, mu_i, Lambda_i, tau_i, nu_i, Omega_i):
 #     Lambda_i_inv =
 # def z()
 nD=2
 mu = [[0.1,0.1], [0.2, 0.3], [1, 1]]      # centers
 bandwidth = 0.10   # bandwidth and scale (standard deviation)
 standard_deviation = bandwidth
 variance = standard_deviation**2
 mus = np.asarray(mu).reshape(-1,nD)
 kde = KernelDensity(bandwidth=bandwidth).fit(mus)
 x = [0.5,0.2]
 print('with scikit-learn KDE')
 prob = np.exp(kde.score_samples(np.asarray(x).reshape(-1,nD)))
 print(prob)
 # univariate
 # N = norm(loc=mu, scale=scale)
 # prob = N.pdf(x)
 # print(prob)
 # multivariate
 print('with scipy multivariate normal')
 npoints = mus.shape[0]
 probs = sum(multivariate_normal(mean=mu_i, cov=variance).pdf(x) for mu_i in mu)
 probs /= npoints
 print(probs)
 print('with scikit learn rbf_kernel')
 x = np.asarray(x).reshape(-1,nD)
 gamma = 1/(2*variance)
 gram = rbf_kernel(mus, x, gamma=gamma)
 const = 1/np.sqrt(((2*np.pi)**nD) * (variance**(nD)))
 gram *= const
 print(gram)
 print(np.sum(gram)/npoints)
 print('with stackoverflow answer')
 from scipy.spatial.distance import pdist, cdist, squareform
 import scipy
  # this is an NxD matrix, where N is number of items and D its dimensionalites
 pairwise_sq_dists = cdist(mus, x, 'euclidean')
 print(pairwise_sq_dists)
 K = np.exp(-pairwise_sq_dists / variance)
 print(K)
 print(np.sum(K)/npoints)
 print("trying with scipy multivariate on more than one instance")
 probs = sum(multivariate_normal(mean=x.reshape(-nD), cov=variance).pdf(mus))
 probs /= npoints
 print(probs)
 import sys
 sys.exit(0)
 # N1 = multivariate_normal(mean=mu[0], cov=variance)
 # prob1 = N1.pdf(x)
 # N2 = multivariate_normal(mean=mu[1], cov=variance)
 # prob2 = N2.pdf(x)
 # print(prob1+prob2)
 cov = Covariance.from_diagonal([variance, variance])
 print(cov.covariance)
 precision_matrix = np.asarray([[1/variance, 0],[0, 1/variance]])
 print(Covariance.from_precision(precision_matrix).covariance)
 print(np.linalg.inv(precision_matrix))
 print(np.linalg.inv(cov.covariance))
 print('-'*100)
 nD=2
 mu = np.asarray([[0.1,0.5]])      # centers
 bandwidth = 0.10   # bandwidth and scale (standard deviation)
 standard_deviation = bandwidth
 variance = standard_deviation**2
 mus = np.asarray([mu]).reshape(-1,nD)
 kde = KernelDensity(bandwidth=bandwidth).fit(mus)
 x = np.asarray([0.5,0.2])
 prob = np.exp(kde.score_samples(np.asarray(x).reshape(-1,nD)))
 print(prob)
 probs = sum(multivariate_normal(mean=mu_i, cov=variance).pdf(x) for mu_i in mu) / len(mu)
 probs = sum(multivariate_normal(mean=[0,0], cov=1).pdf((x-mu_i)/bandwidth) for mu_i in mu) / len(mu)
 probs = sum(multivariate_normal(mean=[0,0], cov=variance).pdf(x-mu_i) for mu_i in mu) / len(mu)
 print(probs)
 # N1 = multivariate_normal(mean=mu[0], cov=variance)
 # prob1 = N1.pdf(x)
 # N2 = multivariate_normal(mean=mu[1], cov=variance)
 # prob2 = N2.pdf(x)
 # print(prob1+prob2)
 h=0.1
 D=4
 print(np.sqrt((4**2) * (5**2)))
 print(4*5)
--- a/distribution_matching/tmp/cauchy_schwarz_div_kde.py
+++ b/distribution_matching/tmp/cauchy_schwarz_div_kde.py
@ -0,0 +1,153 @@
 import numpy as np
 from scipy.stats import multivariate_normal
 from scipy import optimize
 def cauchy_schwarz_divergence_kde(L:list, Xte:np.ndarray, bandwidth:float, alpha:np.ndarray):
    """
    :param L: a list of np.ndarray (instances x dimensions) with the Li being the instances of class i
    :param Xte: an np.ndarray (instances x dimensions)
    :param bandwidth: the bandwidth of the kernel
    :param alpha: the mixture parameter
    :return: the Cauchy-Schwarz divergence between the validation KDE mixture distribution (with mixture paramerter
        alpha) and the test KDE distribution
    """
    n = len(L) # number of classes
    K, D = Xte.shape # number of test instances, and number of dimensions
    Kinv = 1/K
    # the lengths of each block
    l = np.asarray([len(Li) for Li in L])
    # contains the a_i / l_i
    alpha_r = alpha / l
    alpha2_r_sum = np.sum(alpha * alpha_r)  # contains the sum_i a_i**2 / l_i
    h = bandwidth
    # we will only use the bandwidth (h) between two gaussians with covariance matrix a "scalar matrix" h**2
    cov_mix_scalar = 2*h*h   # corresponds to a bandwidth of sqrt(2)*h
    # constant
    C = ((2*np.pi)**(-D/2))*h**(-D)
    Kernel = multivariate_normal(mean=np.zeros(D), cov=cov_mix_scalar)
    K0 = Kernel.pdf(np.zeros(D))
    def compute_block_E():
        kernel_block_E = []
        for i,Li in enumerate(L):
            acc = 0
            for x_ji in Li: #optimize...
                for x_k in Xte: #optimize...
                    acc += Kernel.pdf(x_ji - x_k) #optimize...
            kernel_block_E.append(acc)
        return np.asarray(kernel_block_E)
    def compute_block_F_hash():
        # this can be computed entirely at training time
        Khash = {}
        for a in range(n):
            for b in range(l[a]):
                for i in range(n):
                    for j in range(l[i]): # this for, and index j, can be supressed and store the sum across j
                        Khash[(a,b,i,j)] = Kernel.pdf(L[i][j]-L[a][b])
        return Khash
    def compute_block_Ktest():
        # this can be optimized in several ways, starting by computing only the lower diagonal triangle... and remove
        # then the K0 which is not needed after that
        acc = 0
        for x_i in Xte:
            for x_j in Xte:
                acc += Kernel.pdf(x_i-x_j)
        return acc
    def compute_block_F():
        F = 0
        for a in range(n):
            tmp_b = 0
            for b in range(l[a]):
                tmp_i = 0
                for i in range(n):
                    tmp_j = 0
                    for j in range(l[i]):
                        tmp_j += Fh[(a, b, i, j)]
                    tmp_i += (alpha_r[i] * tmp_j)
                tmp_b += tmp_i
            F += (alpha_r[a] * tmp_b)
        return F
    E = compute_block_E()
    Fh = compute_block_F_hash()
    # Ktest = compute_block_Ktest()
    F = compute_block_F()
    C1 = K*Kinv*Kinv*C
    C2 = 2 * np.sum([Kernel.pdf(Xte[k]-Xte[k_p]) for k in range(K) for k_p in range(k)])
    partA = -np.log(Kinv * (alpha_r @ E))
    partB = 0.5*np.log(C*alpha2_r_sum + F - (K0*alpha2_r_sum))
    # partC = 0.5*np.log(Kinv) + 0.5*np.log(C + Kinv*Ktest - K0)
    partC = 0.5*np.log(C1+C2)
    Dcs = partA + partB + partC
    return Dcs
 L = [
    np.asarray([
        [-1,-1,-1]
    ]),
    np.asarray([
        [0,0,0],
    ]),
    np.asarray([
        [0,0,0.1],
        [1,1,1],
        [3,3,1],
    ]),
    np.asarray([
        [1,0,0]
    ]),
    np.asarray([
        [0,1,0]
    ])
 ]
 Xte = np.asarray(
    [[0,0,0],
     [0,0,0],
     [1,0,0],
     [0,1,0]]
 )
 bandwidth=0.01
 alpha=np.asarray([0, 2/4, 0, 1/4, 1/4])
 div = cauchy_schwarz_divergence_kde(L, Xte, bandwidth, alpha)
 print(div)
 def divergence(alpha):
    return cauchy_schwarz_divergence_kde(L, Xte, bandwidth, alpha)
 # the initial point is set as the uniform distribution
 n_classes = len(L)
 uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
 # solutions are bounded to those contained in the unit-simplex
 bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
 constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
 #print('searching for alpha')
 r = optimize.minimize(divergence, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
 sol = r.x
 for x in sol:
    print(f'{x:.4f}')
 print(cauchy_schwarz_divergence_kde(L, Xte, bandwidth, sol))
--- a/distribution_matching/tmp/check_hyperparams.py
+++ b/distribution_matching/tmp/check_hyperparams.py
@ -0,0 +1,19 @@
 import pickle
 import os
 dataset = 'lequa/T1B'
 for metric in ['mae', 'mrae']:
    print('metric', metric)
    for method in ['KDEy-DMhd4', 'KDEy-DMhd4+', 'KDEy-DMhd4++']:
        path = f'/home/moreo/QuaPy/distribution_matching/results/{dataset}/{metric}/{method}.hyper.pkl'
        if os.path.exists(path):
            obj = pickle.load(open(path, 'rb'))
            print(method, obj)
        else:
            print(f'path {path} does not exist')
    print()
--- a/distribution_matching/tmp/compile_test.py
+++ b/distribution_matching/tmp/compile_test.py
@ -0,0 +1,27 @@
 from time import time
 from sklearn.metrics.pairwise import rbf_kernel
 import torch
 import numpy as np
 def rbf(X, Y):
    return X @ Y.T
@torch.compile
 def rbf_comp(X, Y):
    return X @ Y.T
 def measure_time(X, Y, func):
    tinit = time()
    func(X, Y)
    tend = time()
    print(f'took {tend-tinit:.3}s')
 X = np.random.rand(1000, 100)
 Y = np.random.rand(1000, 100)
 measure_time(X, Y, rbf)
 measure_time(X, Y, rbf_comp)
--- a/distribution_matching/tmp/show_datasets_stats.py
+++ b/distribution_matching/tmp/show_datasets_stats.py
@ -0,0 +1,20 @@
 import pickle
 import os
 import quapy as qp
 from quapy.model_selection import GridSearchQ
 from quapy.protocol import UPP
 toprint = []
 for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
    data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
    # model selection
    train, test = data.train_test
    toprint.append(f'{dataset}\t{len(train)}\t{len(test)}\t{data.n_classes}')
 print()
 for pr in toprint:
    print(pr)
--- a/distribution_matching/tmp/tmp.py
+++ b/distribution_matching/tmp/tmp.py
@ -0,0 +1,6 @@
 import pandas as pd
 report = pd.read_csv('../results_lequa_mrae/KDEy-MLE.dataframe')
 means = report.mean()
 print(f'KDEy-MLE\tLeQua-T1B\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
--- a/distribution_matching/tmp/understanding_divergence_montecarlo.py
+++ b/distribution_matching/tmp/understanding_divergence_montecarlo.py
@ -0,0 +1,155 @@
 import numpy as np
 from scipy.stats import norm
 EPS=1e-8
 LARGE_NUM=1000000
 TRIALS=LARGE_NUM
 # calculamos: D(p||q)
 # se asume:
 # q = distribución de referencia (en nuestro caso: el mixture de KDEs en training)
 # p = distribución (en nuestro caso: KDE del test)
 # N = TRIALS el número de trials para montecarlo
 # n = numero de clases, en este ejemplo n=3
 # ejemplo de f-generator function: Squared Hellinger Distance
 def hd2(u):
    v = (np.sqrt(u)-1)
    return v*v
 # esta funcion estima la divergencia entre dos mixtures (univariates)
 def integrate(p, q, f=hd2, resolution=TRIALS, epsilon=EPS):
    xs = np.linspace(-50, 50, resolution)
    ps = p.pdf(xs)+epsilon
    qs = q.pdf(xs)+epsilon
    rs = ps/qs
    base = xs[1]-xs[0]
    return np.sum(base * f(rs) * qs)
 # esta es la aproximación de montecarlo según está en el artículo
 # es decir, sampleando la q, y haciendo la media (1/N). En el artículo
 # en realidad se "presamplean" N por cada clase y luego se eligen los que
 # corresponda a cada clase. Aquí directamente sampleamos solo los
 # que correspondan.
 def montecarlo(p, q, f=hd2, trials=TRIALS, epsilon=EPS):
    xs = q.rvs(trials)
    ps = p.pdf(xs)+epsilon
    qs = q.pdf(xs)+epsilon
    N = trials
    return (1/N)*np.sum(f(ps/qs))
 # esta es la aproximación de montecarlo asumiendo que vayamos con todos los N*n
 # puntos (es decir, si hubieramos sampleado de una "q" en la que todas las clases
 # eran igualmente probables, 1/3) y luego reescalamos los pesos con "importance
 # weighting" <-- esto no está en el artículo, pero me gusta más que lo que hay
 # porque desaparece el paso de "seleccionar los que corresponden por clase"
 def montecarlo_importancesampling(p, q, r, f=hd2, trials=TRIALS, epsilon=EPS):
    xs = r.rvs(trials)  # <- distribución de referencia (aquí: class weight uniforme)
    rs = r.pdf(xs)
    ps = p.pdf(xs)+epsilon
    qs = q.pdf(xs)+epsilon
    N = trials
    importance_weight = qs/rs
    return (1/N)*np.sum(f(ps/qs)*(importance_weight))
 # he intentado implementar la variante que propne Juanjo pero creo que no la he
 # entendido bien. Tal vez haya que reescalar el peso por 1/3 o algo así, pero no
 # doy con la tecla...
 def montecarlo_classweight(p, q, f=hd2, trials=TRIALS, epsilon=EPS):
    xs_1 = q.rvs_fromclass(0, trials)
    xs_2 = q.rvs_fromclass(1, trials)
    xs_3 = q.rvs_fromclass(2, trials)
    xs = np.concatenate([xs_1, xs_2, xs_3])
    weights = np.asarray([[alpha_i]*trials for alpha_i in q.alphas]).flatten()
    ps = p.pdf(xs)+epsilon
    qs = q.pdf(xs)+epsilon
    N = trials
    n = q.n
    return (1/(N))*np.sum(weights*f(ps/qs))
 class Q:
    """
    Esta clase es un mixture de gausianas.
    :param locs: medias, una por clase
    :param scales: stds, una por clase
    :param alphas: peso, uno por clase
    """
    def __init__(self, locs, scales, alphas):
        self.qs = []
        for loc, scale in zip(locs, scales):
            self.qs.append(norm(loc=loc, scale=scale))
        assert np.isclose(np.sum(alphas), 1), 'alphas do not sum up to 1!'
        self.alphas = np.asarray(alphas) / np.sum(alphas)
    def pdf(self, xs):
        q_xs = np.vstack([
            q_i.pdf(xs) * alpha_i for q_i, alpha_i in zip(self.qs, self.alphas)
        ])
        v = q_xs.sum(axis=0)
        if len(v)==1:
            v = v[0]
        return v
    @property
    def n(self):
        return len(self.alphas)
    def rvs_fromclass(self, inclass, trials):
        return self.qs[inclass].rvs(trials)
    def rvs(self, trials):
        variates = []
        added = 0
        for i, (q_i, alpha_i) in enumerate(zip(self.qs, self.alphas)):
            trials_i = int(trials*alpha_i) if (i < self.n-1) else trials-added
            variates_i = q_i.rvs(trials_i)
            variates.append(variates_i)
            added += len(variates_i)
        return np.concatenate(variates)
    @property
    def locs(self):
        return np.asarray([x.kwds['loc'] for x in self.qs])
    @property
    def scales(self):
        return np.asarray([x.kwds['scale'] for x in self.qs])
    @classmethod
    def change_priors(cls, q, alphas):
        assert len(alphas)==len(q.alphas)
        return Q(locs=q.locs, scales=q.scales, alphas=alphas)
 # distribucion de test
 p = Q(locs=[1, 1.5], scales=[1,1], alphas=[0.8, 0.2])
 # distribucion de training (mixture por clase)
 q = Q(locs=[0, 0.5, 1.2], scales=[1,1,1.5], alphas=[0.1, 0.2, 0.7])
 # distribucion de referencia (mixture copia de q, con pesos de clase uniformes)
 r = Q.change_priors(q, alphas=[1/3, 1/3, 1/3])
 integral_approx = integrate(p, q)
 montecarlo_approx = montecarlo(p, q)
 montecarlo_approx2 = montecarlo_importancesampling(p, q, r)
 montecarlo_approx3 = montecarlo_classweight(p, q)
 print(f'{integral_approx=:.10f}')
 print()
 print(f'{montecarlo_approx=:.10f}, error={np.abs(integral_approx-montecarlo_approx):.10f}')
 print()
 print(f'{montecarlo_approx2=:.10f}, error={np.abs(integral_approx-montecarlo_approx2):.10f}')
 print()
 print(f'{montecarlo_approx3=:.10f}, error={np.abs(integral_approx-montecarlo_approx3):.10f}')
--- a/distribution_matching/todo.txt
+++ b/distribution_matching/todo.txt
@ -0,0 +1,21 @@
 1.- No se si sería más facil tomar r=uniforme, y no r=mixture model, simplifica mucho el sampling y tal vez incluso produzca menos error
 2.- Por ahora tengo KLD y HD:
    - para KLD no he entendido si tengo que añadir el -x + y
 3.- Se puede poner la topsoe como una f-divergence?
    La topsoe parece que es 2 veces la jensen-shannon divergence, o sea
    topsoe(p,q) = kld(p|m) + kld(q|m), con m = (p+q)/2
 4.- Se puede poner la Wasserstein como una f-divergence?
 5.- En general, qué relación hay con las "distancias"?
 2) implementar el auto
    - optimización interna para likelihood [ninguno parece funcionar bien]
        - de todo (e.g., todo el training)?
        - independiente para cada conjunto etiquetado? (e.g., positivos, negativos, neutros, y test)
    - optimización como un parámetro GridSearchQ
 6) optimizar kernel? optimizar distancia?
 10) Definir un clasificador que devuelva, para cada clase, una posterior como la likelihood en la class-conditional KDE dividida
    por la likelihood en en todas las clases (como propone Juanjo) y meterlo en EMD. Hacer al contario: re-calibrar con
    EMD y meterlo en KDEy
 11) KDEx?
--- a/distribution_matching/tweets_experiments.py
+++ b/distribution_matching/tweets_experiments.py
@ -0,0 +1,85 @@
 import pickle
 import os
 import pandas as pd
 from distribution_matching.commons import METHODS, new_method, show_results
 import quapy as qp
 from quapy.model_selection import GridSearchQ
 from quapy.protocol import UPP
 SEED=1
 if __name__ == '__main__':
    qp.environ['SAMPLE_SIZE'] = 100
    qp.environ['N_JOBS'] = -1
    n_bags_val = 250
    n_bags_test = 1000
    for optim in ['mae', 'mrae']:
        result_dir = f'results/tweet/{optim}'
        os.makedirs(result_dir, exist_ok=True)
        for method in METHODS:
            print('Init method', method)
            global_result_path = f'{result_dir}/{method}'
            if not os.path.exists(global_result_path+'.csv'):
                with open(global_result_path+'.csv', 'wt') as csv:
                    csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')
            with open(global_result_path+'.csv', 'at') as csv:
                # four semeval dataset share the training, so it is useless to optimize hyperparameters four times;
                # this variable controls that the mod sel has already been done, and skip this otherwise
                semeval_trained = False
                for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:
                    print('init', dataset)
                    local_result_path = global_result_path + '_' + dataset
                    if os.path.exists(local_result_path+'.dataframe'):
                        print(f'result file {local_result_path}.dataframe already exist; skipping')
                        continue
                    with qp.util.temp_seed(SEED):
                        is_semeval = dataset.startswith('semeval')
                        if not is_semeval or not semeval_trained:
                            param_grid, quantifier = new_method(method)
                            # model selection
                            data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=True)
                            protocol = UPP(data.test, repeats=n_bags_val)
                            modsel = GridSearchQ(quantifier, param_grid, protocol, refit=False, n_jobs=-1, verbose=1, error=optim)
                            modsel.fit(data.training)
                            print(f'best params {modsel.best_params_}')
                            print(f'best score {modsel.best_score_}')
                            pickle.dump(
                                (modsel.best_params_, modsel.best_score_,),
                                open(f'{local_result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
                            quantifier = modsel.best_model()
                            if is_semeval:
                                semeval_trained = True
                        else:
                            print(f'model selection for {dataset} already done; skipping')
                        data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=False)
                        quantifier.fit(data.training)
                        protocol = UPP(data.test, repeats=n_bags_test)
                        report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True)
                        report.to_csv(f'{local_result_path}.dataframe')
                        means = report.mean()
                        csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
                        csv.flush()
            show_results(global_result_path)
--- a/distribution_matching/tweets_sensibility_analysis.py
+++ b/distribution_matching/tweets_sensibility_analysis.py
@ -0,0 +1,57 @@
 import numpy as np
 from sklearn.linear_model import LogisticRegression
 import os
 import quapy as qp
 from distribution_matching.commons import show_results
 from quapy.method.aggregative import DMy
 from distribution_matching.method.method_kdey import KDEy
 from quapy.protocol import UPP
 SEED=1
 if __name__ == '__main__':
    qp.environ['SAMPLE_SIZE'] = 100
    qp.environ['N_JOBS'] = -1
    n_bags_val = 250
    n_bags_test = 1000
    result_dir = f'results/tweet/sensibility'
    os.makedirs(result_dir, exist_ok=True)
    for method, param, grid in [
        ('KDEy-ML', 'Bandwidth', np.linspace(0.01, 0.2, 20)),
        ('DM-HD', 'nbins', list(range(2,10)) + list(range(10,34,2)))
    ]:
        global_result_path = f'{result_dir}/{method}'
        if not os.path.exists(global_result_path+'.csv'):
            with open(global_result_path+'.csv', 'wt') as csv:
                csv.write(f'Method\tDataset\t{param}\tMAE\tMRAE\tKLD\n')
        with open(global_result_path+'.csv', 'at') as csv:
            for val in grid:
                for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:
                    print('init', dataset)
                    local_result_path = global_result_path + '_' + dataset + (f'_{val:.3f}' if isinstance(val, float) else f'{val}')
                    with qp.util.temp_seed(SEED):
                        data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=False)
                        if method == 'KDEy-ML':
                            quantifier = KDEy(LogisticRegression(n_jobs=-1), target='max_likelihood', val_split=10, bandwidth=val)
                        elif method == 'DM-HD':
                            quantifier = DMy(LogisticRegression(n_jobs=-1), val_split=10, nbins=val, divergence='HD', n_jobs=-1)
                        quantifier.fit(data.training)
                        protocol = UPP(data.test, repeats=n_bags_test)
                        report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True, n_jobs=-1)
                        report.to_csv(f'{local_result_path}.dataframe')
                        means = report.mean()
                        csv.write(f'{method}\t{data.name}\t{val}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
                        csv.flush()
        show_results(global_result_path)
--- a/distribution_matching/ucibinary_experiments.py
+++ b/distribution_matching/ucibinary_experiments.py
@ -0,0 +1,83 @@
 import pickle
 import os
 from distribution_matching.commons import BIN_METHODS, new_method, show_results
 import quapy as qp
 from quapy.model_selection import GridSearchQ
 from quapy.protocol import UPP
 SEED = 1
 if __name__ == '__main__':
    qp.environ['SAMPLE_SIZE'] = 100
    qp.environ['N_JOBS'] = -1
    n_bags_val = 250
    n_bags_test = 1000
    for optim in ['mae', 'mrae']:
        result_dir = f'results/binary/{optim}'
        os.makedirs(result_dir, exist_ok=True)
        for method in BIN_METHODS:
            print('Init method', method)
            global_result_path = f'{result_dir}/{method}'
            if not os.path.exists(global_result_path + '.csv'):
                with open(global_result_path + '.csv', 'wt') as csv:
                    csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')
            with open(global_result_path + '.csv', 'at') as csv:
                for dataset in qp.datasets.UCI_DATASETS:
                    if dataset in ['acute.a', 'acute.b', 'iris.1']: continue # , 'pageblocks.5', 'spambase', 'wdbc']: continue
                    print('init', dataset)
                    local_result_path = global_result_path + '_' + dataset
                    if os.path.exists(local_result_path + '.dataframe'):
                        print(f'result file {local_result_path}.dataframe already exist; skipping')
                        continue
                    with qp.util.temp_seed(SEED):
                        param_grid, quantifier = new_method(method, max_iter=3000)
                        data = qp.datasets.fetch_UCIDataset(dataset)
                        # model selection
                        train, test = data.train_test
                        train, val = train.split_stratified(random_state=SEED)
                        protocol = UPP(val, repeats=n_bags_val)
                        modsel = GridSearchQ(
                            quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=1, error=optim
                        )
                        try:
                            modsel.fit(train)
                            print(f'best params {modsel.best_params_}')
                            print(f'best score {modsel.best_score_}')
                            pickle.dump(
                                (modsel.best_params_, modsel.best_score_,),
                                open(f'{local_result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
                            quantifier = modsel.best_model()
                        except:
                            print('something went wrong... trying to fit the default model')
                            quantifier.fit(train)
                        protocol = UPP(test, repeats=n_bags_test)
                        report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'],
                                                                verbose=True)
                        report.to_csv(f'{local_result_path}.dataframe')
                        means = report.mean()
                        csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
                        csv.flush()
        show_results(global_result_path)
--- a/distribution_matching/ucimulti_experiments.py
+++ b/distribution_matching/ucimulti_experiments.py
@ -0,0 +1,88 @@
 import pickle
 import os
 from data.base import LabelledCollection
 from sklearn.linear_model import LogisticRegression
 from distribution_matching.commons import METHODS, new_method, show_results
 import quapy as qp
 from quapy.model_selection import GridSearchQ
 from quapy.protocol import UPP
 SEED = 1
 if __name__ == '__main__':
    qp.environ['SAMPLE_SIZE'] = 500
    qp.environ['N_JOBS'] = -1
    n_bags_val = 250
    n_bags_test = 1000
    for optim in ['mae', 'mrae']:
        result_dir = f'results/ucimulti/{optim}'
        os.makedirs(result_dir, exist_ok=True)
        for method in METHODS:
            print('Init method', method)
            global_result_path = f'{result_dir}/{method}'
            if not os.path.exists(global_result_path + '.csv'):
                with open(global_result_path + '.csv', 'wt') as csv:
                    csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')
            with open(global_result_path + '.csv', 'at') as csv:
                for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
                    print('init', dataset)
                    local_result_path = global_result_path + '_' + dataset
                    if os.path.exists(local_result_path + '.dataframe'):
                        print(f'result file {local_result_path}.dataframe already exist; skipping')
                        continue
                    with qp.util.temp_seed(SEED):
                        param_grid, quantifier = new_method(method, max_iter=3000)
                        data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
                        # model selection
                        train, test = data.train_test
                        train, val = train.split_stratified(random_state=SEED)
                        protocol = UPP(val, repeats=n_bags_val)
                        modsel = GridSearchQ(
                            quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=1, error=optim
                        )
                        try:
                            modsel.fit(train)
                            print(f'best params {modsel.best_params_}')
                            print(f'best score {modsel.best_score_}')
                            pickle.dump(
                                (modsel.best_params_, modsel.best_score_,),
                                open(f'{local_result_path}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
                            quantifier = modsel.best_model()
                        except:
                            print('something went wrong... trying to fit the default model')
                            quantifier.fit(train)
                            # quantifier = qp.method.aggregative.CC(LogisticRegression()).fit(train)
                        protocol = UPP(test, repeats=n_bags_test)
                        report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'],
                                                                verbose=True, n_jobs=-1)
                        report.to_csv(f'{local_result_path}.dataframe')
                        means = report.mean()
                        csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
                        csv.flush()
        show_results(global_result_path)
--- a/distribution_matching/ucimulti_sensibility_analysis.py
+++ b/distribution_matching/ucimulti_sensibility_analysis.py
@ -0,0 +1,63 @@
 import numpy as np
 from sklearn.linear_model import LogisticRegression
 import os
 import quapy as qp
 from distribution_matching.commons import show_results
 from distribution_matching.method.method_kdey import KDEy
 from quapy.method.aggregative import DMy
 from quapy.protocol import UPP
 SEED=1
 def task(val):
    print('job-init', dataset, val)
    with qp.util.temp_seed(SEED):
        if method=='KDEy-ML':
            quantifier = KDEy(LogisticRegression(), target='max_likelihood', val_split=10, bandwidth=val)
        elif method == 'DM-HD':
            quantifier = DMy(LogisticRegression(), val_split=10, nbins=val, divergence='HD')
        quantifier.fit(data.data)
        protocol = UPP(data.test, repeats=n_bags_test)
        report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'],
                                                 verbose=True, n_jobs=-1)
        return report
 if __name__ == '__main__':
    qp.environ['SAMPLE_SIZE'] = 500
    qp.environ['N_JOBS'] = -1
    n_bags_val = 250
    n_bags_test = 1000
    result_dir = f'results/ucimulti/sensibility'
    os.makedirs(result_dir, exist_ok=True)
    for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
        data = qp.datasets.fetch_UCIMulticlassDataset(dataset)
        for method, param, grid in [
            ('KDEy-ML', 'Bandwidth', np.linspace(0.01, 0.2, 20)),
            ('DM-HD', 'nbins', list(range(2, 10)) + list(range(10, 34, 2)))
        ]:
            global_result_path = f'{result_dir}/{method}'
            if not os.path.exists(global_result_path+'.csv'):
                with open(global_result_path+'.csv', 'wt') as csv:
                    csv.write(f'Method\tDataset\t{param}\tMAE\tMRAE\tKLD\n')
            reports = qp.util.parallel(task, grid, n_jobs=-1)
            with open(global_result_path + '.csv', 'at') as csv:
                for val, report in zip(grid, reports):
                    means = report.mean()
                    local_result_path = global_result_path + '_' + dataset + (f'_{val:.3f}' if isinstance(val, float) else f'{val}')
                    report.to_csv(f'{local_result_path}.dataframe')
                    csv.write(f'{method}\t{dataset}\t{val}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
                    csv.flush()
            show_results(global_result_path)
--- a/examples/uci_experiments.py
+++ b/examples/uci_experiments.py
@ -116,7 +116,7 @@ def run(experiment):
            model,
            protocol=APP(test, n_prevalences=21, repeats=100)
        )
-        test_true_prevalence = data.test.prevalence()
+        test_true_prevalence = data.mixture.prevalence()
        evaluate_experiment(true_prevalences, estim_prevalences)
        save_results(dataset_name, model_name, run, optim_loss,
--- a/laboratory/custom_vectorizers.py
+++ b/laboratory/custom_vectorizers.py
@ -0,0 +1,244 @@
 from scipy.sparse import csc_matrix, csr_matrix
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
 import numpy as np
 from joblib import Parallel, delayed
 import sklearn
 import math
 from scipy.stats import t
 class ContTable:
    def __init__(self, tp=0, tn=0, fp=0, fn=0):
        self.tp=tp
        self.tn=tn
        self.fp=fp
        self.fn=fn
    def get_d(self): return self.tp + self.tn + self.fp + self.fn
    def get_c(self): return self.tp + self.fn
    def get_not_c(self): return self.tn + self.fp
    def get_f(self): return self.tp + self.fp
    def get_not_f(self): return self.tn + self.fn
    def p_c(self): return (1.0*self.get_c())/self.get_d()
    def p_not_c(self): return 1.0-self.p_c()
    def p_f(self): return (1.0*self.get_f())/self.get_d()
    def p_not_f(self): return 1.0-self.p_f()
    def p_tp(self): return (1.0*self.tp) / self.get_d()
    def p_tn(self): return (1.0*self.tn) / self.get_d()
    def p_fp(self): return (1.0*self.fp) / self.get_d()
    def p_fn(self): return (1.0*self.fn) / self.get_d()
    def tpr(self):
        c = 1.0*self.get_c()
        return self.tp / c if c > 0.0 else 0.0
    def fpr(self):
        _c = 1.0*self.get_not_c()
        return self.fp / _c if _c > 0.0 else 0.0
 def __ig_factor(p_tc, p_t, p_c):
    den = p_t * p_c
    if den != 0.0 and p_tc != 0:
        return p_tc * math.log(p_tc / den, 2)
    else:
        return 0.0
 def information_gain(cell):
    return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + \
           __ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) +\
           __ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()) + \
           __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())
 def squared_information_gain(cell):
    return information_gain(cell)**2
 def posneg_information_gain(cell):
    ig = information_gain(cell)
    if cell.tpr() < cell.fpr():
        return -ig
    else:
        return ig
 def pos_information_gain(cell):
    if cell.tpr() < cell.fpr():
        return 0
    else:
        return information_gain(cell)
 def pointwise_mutual_information(cell):
    return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c())
 def gss(cell):
    return cell.p_tp()*cell.p_tn() - cell.p_fp()*cell.p_fn()
 def chi_square(cell):
    den = cell.p_f() * cell.p_not_f() * cell.p_c() * cell.p_not_c()
    if den==0.0: return 0.0
    num = gss(cell)**2
    return num / den
 def conf_interval(xt, n):
    if n>30:
        z2 = 3.84145882069 # norm.ppf(0.5+0.95/2.0)**2
    else:
        z2 = t.ppf(0.5 + 0.95 / 2.0, df=max(n-1,1)) ** 2
    p = (xt + 0.5 * z2) / (n + z2)
    amplitude = 0.5 * z2 * math.sqrt((p * (1.0 - p)) / (n + z2))
    return p, amplitude
 def strength(minPosRelFreq, minPos, maxNeg):
    if minPos > maxNeg:
        return math.log(2.0 * minPosRelFreq, 2.0)
    else:
        return 0.0
 #set cancel_features=True to allow some features to be weighted as 0 (as in the original article)
 #however, for some extremely imbalanced dataset caused all documents to be 0
 def conf_weight(cell, cancel_features=False):
    c = cell.get_c()
    not_c = cell.get_not_c()
    tp = cell.tp
    fp = cell.fp
    pos_p, pos_amp = conf_interval(tp, c)
    neg_p, neg_amp = conf_interval(fp, not_c)
    min_pos = pos_p-pos_amp
    max_neg = neg_p+neg_amp
    den = (min_pos + max_neg)
    minpos_relfreq = min_pos / (den if den != 0 else 1)
    str_tplus = strength(minpos_relfreq, min_pos, max_neg);
    if str_tplus == 0 and not cancel_features:
        return 1e-20
    return str_tplus;
 def get_tsr_matrix(cell_matrix, tsr_score_funtion):
    nC = len(cell_matrix)
    nF = len(cell_matrix[0])
    tsr_matrix = [[tsr_score_funtion(cell_matrix[c,f]) for f in range(nF)] for c in range(nC)]
    return np.array(tsr_matrix)
 def feature_label_contingency_table(positive_document_indexes, feature_document_indexes, nD):
    tp_ = len(positive_document_indexes & feature_document_indexes)
    fp_ = len(feature_document_indexes - positive_document_indexes)
    fn_ = len(positive_document_indexes - feature_document_indexes)
    tn_ = nD - (tp_ + fp_ + fn_)
    return ContTable(tp=tp_, tn=tn_, fp=fp_, fn=fn_)
 def category_tables(feature_sets, category_sets, c, nD, nF):
    return [feature_label_contingency_table(category_sets[c], feature_sets[f], nD) for f in range(nF)]
 def get_supervised_matrix(coocurrence_matrix, label_matrix, n_jobs=-1):
    """
    Computes the nC x nF supervised matrix M where Mcf is the 4-cell contingency table for feature f and class c.
    Efficiency O(nF x nC x log(S)) where S is the sparse factor
    """
    nD, nF = coocurrence_matrix.shape
    nD2, nC = label_matrix.shape
    if nD != nD2:
        raise ValueError('Number of rows in coocurrence matrix shape %s and label matrix shape %s is not consistent' %
                         (coocurrence_matrix.shape,label_matrix.shape))
    def nonzero_set(matrix, col):
        return set(matrix[:, col].nonzero()[0])
    if isinstance(coocurrence_matrix, csr_matrix):
        coocurrence_matrix = csc_matrix(coocurrence_matrix)
    feature_sets = [nonzero_set(coocurrence_matrix, f) for f in range(nF)]
    category_sets = [nonzero_set(label_matrix, c) for c in range(nC)]
    cell_matrix = Parallel(n_jobs=n_jobs, backend="threading")(delayed(category_tables)(feature_sets, category_sets, c, nD, nF) for c in range(nC))
    return np.array(cell_matrix)
 class TSRweighting(BaseEstimator,TransformerMixin):
    """
    Supervised Term Weighting function based on any Term Selection Reduction (TSR) function (e.g., information gain,
    chi-square, etc.) or, more generally, on any function that could be computed on the 4-cell contingency table for
    each category-feature pair.
    The supervised_4cell_matrix (a CxF matrix containing the 4-cell contingency tables
    for each category-feature pair) can be pre-computed (e.g., during the feature selection phase) and passed as an
    argument.
    When C>1, i.e., in multiclass scenarios, a global_policy is used in order to determine a single feature-score which
    informs about its relevance. Accepted policies include "max" (takes the max score across categories), "ave" and "wave"
    (take the average, or weighted average, across all categories -- weights correspond to the class prevalence), and "sum"
    (which sums all category scores).
    """
    def __init__(self, tsr_function, global_policy='max', supervised_4cell_matrix=None, sublinear_tf=True, norm='l2', min_df=3, n_jobs=-1):
        if global_policy not in ['max', 'ave', 'wave', 'sum']: raise ValueError('Global policy should be in {"max", "ave", "wave", "sum"}')
        self.tsr_function = tsr_function
        self.global_policy = global_policy
        self.supervised_4cell_matrix = supervised_4cell_matrix
        self.sublinear_tf=sublinear_tf
        self.norm=norm
        self.min_df = min_df
        self.n_jobs=n_jobs
    def fit(self, X, y):
        self.count_vectorizer = CountVectorizer(min_df=self.min_df)
        X = self.count_vectorizer.fit_transform(X)
        self.tf_vectorizer = TfidfTransformer(
            norm=None, use_idf=False, smooth_idf=False, sublinear_tf=self.sublinear_tf).fit(X)
        if len(y.shape) == 1:
            y = np.expand_dims(y, axis=1)
        nD, nC = y.shape
        nF = len(self.tf_vectorizer.get_feature_names_out())
        if self.supervised_4cell_matrix is None:
            self.supervised_4cell_matrix = get_supervised_matrix(X, y, n_jobs=self.n_jobs)
        else:
            if self.supervised_4cell_matrix.shape != (nC, nF): raise ValueError("Shape of supervised information matrix is inconsistent with X and y")
        tsr_matrix = get_tsr_matrix(self.supervised_4cell_matrix, self.tsr_function)
        if self.global_policy == 'ave':
            self.global_tsr_vector = np.average(tsr_matrix, axis=0)
        elif self.global_policy == 'wave':
            category_prevalences = [sum(y[:,c])*1.0/nD for c in range(nC)]
            self.global_tsr_vector = np.average(tsr_matrix, axis=0, weights=category_prevalences)
        elif self.global_policy == 'sum':
            self.global_tsr_vector = np.sum(tsr_matrix, axis=0)
        elif self.global_policy == 'max':
            self.global_tsr_vector = np.amax(tsr_matrix, axis=0)
        return self
    def fit_transform(self, X, y):
        return self.fit(X,y).transform(X)
    def transform(self, X):
        if not hasattr(self, 'global_tsr_vector'): raise NameError('TSRweighting: transform method called before fit.')
        X = self.count_vectorizer.transform(X)
        tf_X = self.tf_vectorizer.transform(X).toarray()
        weighted_X = np.multiply(tf_X, self.global_tsr_vector)
        if self.norm is not None and self.norm!='none':
            weighted_X = sklearn.preprocessing.normalize(weighted_X, norm=self.norm, axis=1, copy=False)
        return csr_matrix(weighted_X)
--- a/laboratory/dataset_adapter.py
+++ b/laboratory/dataset_adapter.py
--- a/laboratory/main.py
+++ b/laboratory/main.py
--- a/laboratory/main_tweets_auto.py
+++ b/laboratory/main_tweets_auto.py
@ -0,0 +1,62 @@
 from sklearn.linear_model import LogisticRegression
 import os
 import sys
 import pandas as pd
 import quapy as qp
 from method.aggregative import DistributionMatching
 from distribution_matching.method.method_kdey import KDEy
 from protocol import UPP
 if __name__ == '__main__':
    qp.environ['SAMPLE_SIZE'] = 100
    qp.environ['N_JOBS'] = -1
    method = 'KDE'
    param = 0.1
    div = 'topsoe'
    method_identifier = f'{method}_{param}_{div}'
    # generates tuples (dataset, method, method_name)
    # (the dataset is needed for methods that process the dataset differently)
    def gen_methods():
        for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:
            data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True)
            if method == 'KDE':
                kdey = KDEy(LogisticRegression(), divergence=div, bandwidth=param, engine='sklearn')
                yield data, kdey, method_identifier
            elif method == 'DM':
                dm = DistributionMatching(LogisticRegression(), divergence=div, nbins=param)
                yield data, dm, method_identifier
            else:
                raise NotImplementedError('unknown method')
    os.makedirs('results', exist_ok=True)
    result_path = f'results/{method_identifier}.csv'
    if os.path.exists(result_path):
        print('Result already exit. Nothing to do')
        sys.exit(0)
    with open(result_path, 'wt') as csv:
        csv.write(f'Method\tDataset\tMAE\tMRAE\n')
        for data, quantifier, quant_name in gen_methods():
            quantifier.fit(data.training)
            protocol = UPP(data.mixture, repeats=100)
            report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae'], verbose=True)
            means = report.mean()
            csv.write(f'{quant_name}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n')
            csv.flush()
    df = pd.read_csv(result_path, sep='\t')
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"])
    print(pv)
--- a/laboratory/method_dxs.py
+++ b/laboratory/method_dxs.py
@ -0,0 +1,148 @@
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 from sklearn.linear_model import LogisticRegression
 import quapy as qp
 from data import LabelledCollection
 import numpy as np
 from laboratory.custom_vectorizers import *
 from protocol import APP
 from quapy.method.aggregative import _get_divergence, HDy, DistributionMatching
 from quapy.method.base import BaseQuantifier
 from scipy import optimize
 import pandas as pd
 # TODO: explore the bernoulli (term presence/absence) variant
 # TODO: explore the multinomial (term frequency) variant
 # TODO: explore the multinomial + length normalization variant
 # TODO: consolidate the TSR-variant (e.g., using information gain) variant;
 #   - works better with the idf?
 #   - works better with length normalization?
 #   - etc
 class DxS(BaseQuantifier):
    def __init__(self, vectorizer=None, divergence='topsoe'):
        self.vectorizer = vectorizer
        self.divergence = divergence
    # def __as_distribution(self, instances):
    #     return np.asarray(instances.sum(axis=0) / instances.sum()).flatten()
    def __as_distribution(self, instances):
        dist = instances.sum(axis=0) / instances.sum()
        return np.asarray(dist).flatten()
    def fit(self, data: LabelledCollection):
        text_instances, labels = data.Xy
        if self.vectorizer is not None:
            text_instances = self.vectorizer.fit_transform(text_instances, y=labels)
        distributions = []
        for class_i in data.classes_:
            distributions.append(self.__as_distribution(text_instances[labels == class_i]))
        self.validation_distribution = np.asarray(distributions)
        return self
    def quantify(self, text_instances):
        if self.vectorizer is not None:
            text_instances = self.vectorizer.transform(text_instances)
        test_distribution = self.__as_distribution(text_instances)
        divergence = _get_divergence(self.divergence)
        n_classes, n_feats = self.validation_distribution.shape
        def match(prev):
            prev = np.expand_dims(prev, axis=0)
            mixture_distribution = (prev @ self.validation_distribution).flatten()
            return divergence(test_distribution, mixture_distribution)
        # the initial point is set as the uniform distribution
        uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
        # solutions are bounded to those contained in the unit-simplex
        bounds = tuple((0, 1) for x in range(n_classes))  # values in [0,1]
        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
        r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
        return r.x
 if __name__ == '__main__':
    qp.environ['SAMPLE_SIZE'] = 250
    qp.environ['N_JOBS'] = -1
    min_df = 10
    # dataset = 'imdb'
    repeats = 10
    error = 'mae'
    div = 'topsoe'
    # generates tuples (dataset, method, method_name)
    # (the dataset is needed for methods that process the dataset differently)
    def gen_methods():
        for dataset in qp.datasets.REVIEWS_SENTIMENT_DATASETS:
            data = qp.datasets.fetch_reviews(dataset, tfidf=False)
            bernoulli_vectorizer = CountVectorizer(min_df=min_df, binary=True)
            dxs = DxS(divergence=div, vectorizer=bernoulli_vectorizer)
            yield data, dxs, 'DxS-Bernoulli'
            multinomial_vectorizer = CountVectorizer(min_df=min_df, binary=False)
            dxs = DxS(divergence=div, vectorizer=multinomial_vectorizer)
            yield data, dxs, 'DxS-multinomial'
            tf_vectorizer = TfidfVectorizer(sublinear_tf=False, use_idf=False, min_df=min_df, norm=None)
            dxs = DxS(divergence=div, vectorizer=tf_vectorizer)
            yield data, dxs, 'DxS-TF'
            logtf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=False, min_df=min_df, norm=None)
            dxs = DxS(divergence=div, vectorizer=logtf_vectorizer)
            yield data, dxs, 'DxS-logTF'
            tfidf_vectorizer = TfidfVectorizer(use_idf=True, min_df=min_df, norm=None)
            dxs = DxS(divergence=div, vectorizer=tfidf_vectorizer)
            yield data, dxs, 'DxS-TFIDF'
            tfidf_vectorizer = TfidfVectorizer(use_idf=True, min_df=min_df, norm='l2')
            dxs = DxS(divergence=div, vectorizer=tfidf_vectorizer)
            yield data, dxs, 'DxS-TFIDF-l2'
            tsr_vectorizer = TSRweighting(tsr_function=information_gain, min_df=min_df, norm='l2')
            dxs = DxS(divergence=div, vectorizer=tsr_vectorizer)
            yield data, dxs, 'DxS-TFTSR-l2'
            data = qp.datasets.fetch_reviews(dataset, tfidf=True, min_df=min_df)
            hdy = HDy(LogisticRegression())
            yield data, hdy, 'HDy'
            dm = DistributionMatching(LogisticRegression(), divergence=div, nbins=5)
            yield data, dm, 'DM-5b'
            dm = DistributionMatching(LogisticRegression(), divergence=div, nbins=10)
            yield data, dm, 'DM-10b'
    result_path = 'results.csv'
    with open(result_path, 'wt') as csv:
        csv.write(f'Method\tDataset\tMAE\tMRAE\n')
        for data, quantifier, quant_name in gen_methods():
            quantifier.fit(data.training)
            report = qp.evaluation.evaluation_report(quantifier, APP(data.mixture, repeats=repeats), error_metrics=['mae', 'mrae'], verbose=True)
            means = report.mean()
            csv.write(f'{quant_name}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n')
    df = pd.read_csv(result_path, sep='\t')
    # print(df)
    pv = df.pivot_table(index='Method', columns="Dataset", values=["MAE", "MRAE"])
    print(pv)
--- a/quapy/CHANGE_LOG.txt
+++ b/quapy/CHANGE_LOG.txt
@ -1,3 +1,15 @@
 Change Log 0.1.8
 ----------------
 - A conceptual error in MedianSweep and MedianSweep2 has been solved. The error consisted on computing all TPRs and
    FPRs and report the median; then, the adjustment then operated on these single values. Instead, the original
    method proposed by G.Forman comes down to generating all prevalence predictions, for all TPRs and FPRs, and then
    computing the median of it.
 - qp.evaluation now runs in parallel <improve, remove or fix the ongoing error, put at the qp. level instead of
    qp.evaluation because I don't like the qp.evaluation.evaluate thing>
 Change Log 0.1.7
 ----------------
--- a/quapy/classification/calibration.py
+++ b/quapy/classification/calibration.py
@ -59,7 +59,7 @@ class RecalibratedProbabilisticClassifierBase(BaseEstimator, RecalibratedProbabi
        elif isinstance(k, float):
            if not (0 < k < 1):
                raise ValueError('wrong value for val_split: the proportion of validation documents must be in (0,1)')
-            return self.fit_cv(X, y)
+            return self.fit_tr_val(X, y)
    def fit_cv(self, X, y):
        """
--- a/quapy/classification/neural.py
+++ b/quapy/classification/neural.py
@ -176,7 +176,7 @@ class NeuralClassifierTrainer:
        self.classes_ = train.classes_
        opt = self.trainer_hyperparams
        checkpoint = self.checkpointpath
-        self.reset_net_params(self.vocab_size, train.n_classes)
+        self.reset_net_params(self.vocab_size, train.arange_classes)
        train_generator = TorchDataset(train.instances, train.labels).asDataloader(
            opt['batch_size'], shuffle=True, pad_length=opt['padding_length'], device=opt['device'])
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@ -123,7 +123,7 @@ class LabelledCollection:
            return self.uniform_sampling_index(size, random_state=random_state)
        if len(prevs) == self.n_classes - 1:
            prevs = prevs + (1 - sum(prevs),)
-        assert len(prevs) == self.n_classes, 'unexpected number of prevalences'
+        assert len(prevs) == self.n_classes, f'unexpected number of prevalences (found {len(prevs)}, expected {self.n_classes})'
        assert sum(prevs) == 1, f'prevalences ({prevs}) wrong range (sum={sum(prevs)})'
        # Decide how many instances should be taken for each class in order to satisfy the requested prevalence
@ -269,6 +269,7 @@ class LabelledCollection:
        test = self.sampling_from_index(right)
        return training, test
    def __add__(self, other):
        """
        Returns a new :class:`LabelledCollection` as the union of this collection with another collection.
@ -319,7 +320,8 @@ class LabelledCollection:
        else:
            raise NotImplementedError('unsupported operation for collection types')
        labels = np.concatenate([lc.labels for lc in args])
-        classes = np.unique(labels).sort()
+        classes = np.unique(labels)
        classes.sort()
        return LabelledCollection(instances, labels, classes=classes)
    @property
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@ -1,3 +1,6 @@
 import numpy as np
 def warn(*args, **kwargs):
    pass
 import warnings
@ -7,7 +10,8 @@ import zipfile
 from os.path import join
 import pandas as pd
 import scipy
-
+import pickle
 from ucimlrepo import fetch_ucirepo
 from quapy.data.base import Dataset, LabelledCollection
 from quapy.data.preprocessing import text2tfidf, reduce_columns
 from quapy.data.reader import *
@ -45,6 +49,12 @@ UCI_DATASETS = ['acute.a', 'acute.b',
                'wine-q-red', 'wine-q-white',
                'yeast']
 UCI_MULTICLASS_DATASETS = ['dry-bean',
                           'wine-quality',
                           'academic-success',
                           'digits',
                           'letter']
 LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B']
 _TXA_SAMPLE_SIZE = 250
@ -204,7 +214,7 @@ def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False
    :return: a :class:`quapy.data.base.Dataset` instance
    """
    data = fetch_UCILabelledCollection(dataset_name, data_home, verbose)
-    return Dataset(*data.split_stratified(1 - test_split, random_state=0))
+    return Dataset(*data.split_stratified(1 - test_split, random_state=0), name=dataset_name)
 def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
@ -548,11 +558,110 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
    data.stats()
    return data
 def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
    df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)
 def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
    """
    Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`, as used in
    `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
    Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
    Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
    and
    `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
    Dynamic ensemble selection for quantification tasks.
    Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
    The datasets do not come with a predefined train-test split (see :meth:`fetch_UCILabelledCollection` for further
    information on how to use these collections), and so a train-test split is generated at desired proportion.
    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_DATASETS`
    :param dataset_name: a dataset name
    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
        ~/quay_data/ directory)
    :param test_split: proportion of documents to be included in the test set. The rest conforms the training set
    :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
    :return: a :class:`quapy.data.base.Dataset` instance
    """
    data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose)
    return Dataset(*data.split_stratified(1 - test_split, random_state=0))
 def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
    """
    Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`, as used in
    `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
    Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
    Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
    and
    `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
    Dynamic ensemble selection for quantification tasks.
    Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
    The datasets do not come with a predefined train-test split, and so Pérez-Gállego et al. adopted a 5FCVx2 evaluation
    protocol, meaning that each collection was used to generate two rounds (hence the x2) of 5 fold cross validation.
    This can be reproduced by using :meth:`quapy.data.base.Dataset.kFCV`, e.g.:
    >>> import quapy as qp
    >>> collection = qp.datasets.fetch_UCILabelledCollection("dry-bean")
    >>> for data in qp.domains.Dataset.kFCV(collection, nfolds=5, nrepeats=2):
    >>>     ...
    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`
    :param dataset_name: a dataset name
    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
        ~/quay_data/ directory)
    :param test_split: proportion of documents to be included in the test set. The rest conforms the training set
    :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
    :return: a :class:`quapy.data.base.LabelledCollection` instance
    """
    assert dataset_name in UCI_MULTICLASS_DATASETS, \
        f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository (multiclass). ' \
        f'Valid ones are {UCI_MULTICLASS_DATASETS}'
    if data_home is None:
        data_home = get_quapy_home()
    identifiers = {"dry-bean": 602,
                   "wine-quality": 186,
                   "academic-success": 697,
                   "digits": 80,
                   "letter": 59}
    full_names = {"dry-bean": "Dry Bean Dataset",
                  "wine-quality": "Wine Quality",
                  "academic-success": "Predict students' dropout and academic success",
                  "digits": "Optical Recognition of Handwritten Digits",
                  "letter": "Letter Recognition"
                  }
    identifier = identifiers[dataset_name]
    fullname = full_names[dataset_name]
    print(f'Loading UCI Muticlass {dataset_name} ({fullname})')
    file = join(data_home, 'uci_multiclass', dataset_name + '.pkl')
    if os.path.exists(file):
        with open(file, 'rb') as file:
            data = pickle.load(file)
    else:
        data = fetch_ucirepo(id=identifier)
        X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze()
        classes = np.sort(np.unique(y))
        y = np.searchsorted(classes, y)
        data = LabelledCollection(X, y)
        os.makedirs(os.path.dirname(file), exist_ok=True)
        with open(file, 'wb') as file:
            pickle.dump(data, file)
    data.stats()
    return data
 def _df_replace(df, col, repl={'yes': 1, 'no': 0}, astype=float):
    df[col] = df[col].apply(lambda x: repl[x]).astype(astype, copy=False)
 def fetch_lequa2022(task, data_home=None):
    """
    Loads the official datasets provided for the `LeQua <https://lequa2022.github.io/index>`_ competition.
--- a/quapy/evaluation.py
+++ b/quapy/evaluation.py
@ -1,3 +1,4 @@
 from copy import deepcopy
 from typing import Union, Callable, Iterable
 import numpy as np
 from tqdm import tqdm
@ -11,7 +12,9 @@ def prediction(
        model: BaseQuantifier,
        protocol: AbstractProtocol,
        aggr_speedup: Union[str, bool] = 'auto',
-        verbose=False):
+        verbose=False,
        verbose_error=None,
        n_jobs=1):
    """
    Uses a quantification model to generate predictions for the samples generated via a specific protocol.
    This function is central to all evaluation processes, and is endowed with an optimization to speed-up the
@ -32,6 +35,11 @@ def prediction(
        in the samples to be generated. Set to True or "auto" (default) for letting QuaPy decide whether it is
        convenient or not. Set to False to deactivate.
    :param verbose: boolean, show or not information in stdout
    :param verbose_error: an evaluation function to be used to display intermediate results if verbose=True (default None)
    :param n_jobs: number of parallel workers. Default is 1 so that, if not explicitly requested, the evaluation phase
        is to be carried out in a single core. That is to say, this parameter will not inspect the environment variable
        N_JOBS by default. This might be convenient in many situations, since parallelizing the evaluation entails
        adding an overhead for cloning the objects within different threads that is often not worth the effort.
    :return: a tuple `(true_prevs, estim_prevs)` in which each element in the tuple is an array of shape
        `(n_samples, n_classes)` containing the true, or predicted, prevalence values for each sample
    """
@ -61,16 +69,44 @@ def prediction(
    if apply_optimization:
        pre_classified = model.classify(protocol.get_labelled_collection().instances)
        protocol_with_predictions = protocol.on_preclassified_instances(pre_classified)
-        return __prediction_helper(model.aggregate, protocol_with_predictions, verbose)
+        return __prediction_helper(model, protocol_with_predictions, True, verbose, verbose_error, n_jobs)
    else:
-        return __prediction_helper(model.quantify, protocol, verbose)
+        return __prediction_helper(model, protocol, False, verbose, verbose_error, n_jobs)
-def __prediction_helper(quantification_fn, protocol: AbstractProtocol, verbose=False):
+def __delayed_prediction(args):
    quantifier, aggregate, sample_instances, sample_prev = args
    quantifier = deepcopy(quantifier)
    quant_fn = quantifier.aggregate if aggregate else quantifier.quantify
    predicted = quant_fn(sample_instances)
    return sample_prev, predicted
 def __prediction_helper(quantifier, protocol: AbstractProtocol, aggregate: bool, verbose=False, verbose_error=None, n_jobs=1):
    true_prevs, estim_prevs = [], []
-    for sample_instances, sample_prev in tqdm(protocol(), total=protocol.total(), desc='predicting') if verbose else protocol():
+    ongoing_errors = []
-        estim_prevs.append(quantification_fn(sample_instances))
+    if verbose:
-        true_prevs.append(sample_prev)
+        pbar =  tqdm(protocol(), total=protocol.total(), desc='predicting')
    if n_jobs==1:
        quant_fn = quantifier.aggregate if aggregate else quantifier.quantify
        for sample_instances, sample_prev in pbar if verbose else protocol():
            predicted = quant_fn(sample_instances)
            estim_prevs.append(predicted)
            true_prevs.append(sample_prev)
            if verbose and verbose_error is not None:
                err = verbose_error(sample_prev, predicted)
                ongoing_errors.append(err)
                pbar.set_description(f'predicting: ongoing error={np.mean(ongoing_errors):.5f}')
    else:
        if verbose:
            print('parallelizing prediction')
        outputs = qp.util.parallel(
            __delayed_prediction,
            ((quantifier, aggregate, sample_X, sample_p) for (sample_X, sample_p) in (pbar if verbose else protocol())),
            seed=qp.environ.get('_R_SEED', None),
            n_jobs=n_jobs
        )
        true_prevs, estim_prevs = list(zip(*outputs))
    true_prevs = np.asarray(true_prevs)
    estim_prevs = np.asarray(estim_prevs)
@ -82,7 +118,7 @@ def evaluation_report(model: BaseQuantifier,
                      protocol: AbstractProtocol,
                      error_metrics: Iterable[Union[str,Callable]] = 'mae',
                      aggr_speedup: Union[str, bool] = 'auto',
-                      verbose=False):
+                      verbose=False, verbose_error=None, n_jobs=1):
    """
    Generates a report (a pandas' DataFrame) containing information of the evaluation of the model as according
    to a specific protocol and in terms of one or more evaluation metrics (errors).
@ -100,12 +136,19 @@ def evaluation_report(model: BaseQuantifier,
        in the samples to be generated. Set to True or "auto" (default) for letting QuaPy decide whether it is
        convenient or not. Set to False to deactivate.
    :param verbose: boolean, show or not information in stdout
    :param verbose_error: an evaluation function to be used to display intermediate results if verbose=True (default None)
    :param n_jobs: number of parallel workers. Default is 1 so that, if not explicitly requested, the evaluation phase
        is to be carried out in a single core. That is to say, this parameter will not inspect the environment variable
        N_JOBS by default. This might be convenient in many situations, since parallelizing the evaluation entails
        adding an overhead for cloning the objects within different threads that is often not worth the effort.
    :return: a pandas' DataFrame containing the columns 'true-prev' (the true prevalence of each sample),
        'estim-prev' (the prevalence estimated by the model for each sample), and as many columns as error metrics
        have been indicated, each displaying the score in terms of that metric for every sample.
    """
-    true_prevs, estim_prevs = prediction(model, protocol, aggr_speedup=aggr_speedup, verbose=verbose)
+    true_prevs, estim_prevs = prediction(
        model, protocol, aggr_speedup=aggr_speedup, verbose=verbose, verbose_error=verbose_error, n_jobs=n_jobs
    )
    return _prevalence_report(true_prevs, estim_prevs, error_metrics)
--- a/quapy/functional.py
+++ b/quapy/functional.py
@ -90,11 +90,32 @@ def TopsoeDistance(P, Q, epsilon=1e-20):
    :param P: real-valued array-like of shape `(k,)` representing a discrete distribution
    :param Q: real-valued array-like of shape `(k,)` representing a discrete distribution
    :param epsilon: small value to smooth the distributions for numerical stability
    :return: float
    """
    return np.sum(P*np.log((2*P+epsilon)/(P+Q+epsilon)) + Q*np.log((2*Q+epsilon)/(P+Q+epsilon)))
 def CauchySchwarz(P, Q, epsilon=1e-20):
    """
    Cauchy-Schwarz divergence between two (discretized) distributions `P` and `Q`.
    The Cauchy-Schwarz divergence for two discrete distributions of `k` bins is defined as:
    .. math::
        CS(P,Q) = \\frac{ \\sum_{i=1}^k  p_i q_i }{
            \\left( \\sum_{i=1}^k  p^2_i \\right) \\left( \\sum_{i=1}^k  q^2_i \\right)
        }
    :param P: real-valued array-like of shape `(k,)` representing a discrete distribution
    :param Q: real-valued array-like of shape `(k,)` representing a discrete distribution
    :param epsilon: small value to smooth the distributions for numerical stability
    :return: float
    """
    P += epsilon
    Q += epsilon
    return - np.log(sum(P * Q) / np.sqrt(sum(P ** 2) * sum(Q ** 2)))
 def uniform_prevalence_sampling(n_classes, size=1):
    """
    Implements the `Kraemer algorithm <http://www.cs.cmu.edu/~nasmith/papers/smith+tromble.tr04.pdf>`_
@ -276,3 +297,24 @@ def check_prevalence_vector(p, raise_exception=False, toleranze=1e-08):
        return False
    return True
 def optim_minimize(loss, n_classes):
    """
    Searches for the optimal prevalence values, i.e., an `n_classes`-dimensional vector of the (`n_classes`-1)-simplex
    that yields the smallest lost. This optimization is carried out by means of a constrained search using scipy's
    SLSQP routine.
    :param loss: (callable) the function to minimize
    :param n_classes: (int) the number of classes, i.e., the dimensionality of the prevalence vector
    :return: (ndarray) the best prevalence vector found
    """
    from scipy import optimize
    # the initial point is set as the uniform distribution
    uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
    # solutions are bounded to those contained in the unit-simplex
    bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
    constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
    r = optimize.minimize(loss, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
    return r.x
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@ -604,10 +604,17 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
 def _get_divergence(divergence: Union[str, Callable]):
    if isinstance(divergence, str):
-        if divergence=='HD':
+        divergence = divergence.lower()
        if divergence=='hd':
            return F.HellingerDistance
        elif divergence=='topsoe':
            return F.TopsoeDistance
        elif divergence.lower()=='cs':
            return F.CauchySchwarz
        elif divergence.lower()=='l2':
            return lambda a,b: np.linalg.norm(a-b)
        elif divergence.lower()=='l1':
            return lambda a,b: np.linalg.norm(a-b, ord=1)
        else:
            raise ValueError(f'unknown divergence {divergence}')
    elif callable(divergence):
@ -770,7 +777,9 @@ class DistributionMatching(AggregativeProbabilisticQuantifier):
        """
        Trains the classifier (if requested) and generates the validation distributions out of the training data.
        The validation distributions have shape `(n, ch, nbins)`, with `n` the number of classes, `ch` the number of
-        channels, and `nbins` the number of bins. In particular, let `V` be the validation distributions; `di=V[i]`
+        channels (a channel is a description, in form of a histogram, of a specific class -- there are as many channels
        as classes, although in the binary case one can use only one channel, since the other one is constrained),
        and `nbins` the number of bins. In particular, let `V` be the validation distributions; `di=V[i]`
        are the distributions obtained from training data labelled with class `i`; `dij = di[j]` is the discrete
        distribution of posterior probabilities `P(Y=j|X=x)` for training data labelled with class `i`, and `dij[k]`
        is the fraction of instances with a value in the `k`-th bin.
@ -819,7 +828,7 @@ class DistributionMatching(AggregativeProbabilisticQuantifier):
        uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
        # solutions are bounded to those contained in the unit-simplex
-        bounds = tuple((0, 1) for x in range(n_classes))  # values in [0,1]
+        bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
        r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
        return r.x
@ -1085,7 +1094,7 @@ class T50(ThresholdOptimization):
    Threshold Optimization variant for :class:`ACC` as proposed by
    `Forman 2006 <https://dl.acm.org/doi/abs/10.1145/1150402.1150423>`_ and
    `Forman 2008 <https://link.springer.com/article/10.1007/s10618-008-0097-y>`_ that looks
-    for the threshold that makes `tpr` cosest to 0.5.
+    for the threshold that makes `tpr` closest to 0.5.
    The goal is to bring improved stability to the denominator of the adjustment.
    :param classifier: a sklearn's Estimator that generates a classifier
@ -1173,7 +1182,7 @@ class MS(ThresholdOptimization):
        super().__init__(classifier, val_split)
    def _condition(self, tpr, fpr) -> float:
-        pass
+        return True
    def _optimize_threshold(self, y, probabilities):
        tprs = []
@ -1184,9 +1193,26 @@ class MS(ThresholdOptimization):
            TP, FP, FN, TN = self._compute_table(y, y_)
            tpr = self._compute_tpr(TP, FP)
            fpr = self._compute_fpr(FP, TN)
-            tprs.append(tpr)
+            if self._condition(tpr, fpr):
-            fprs.append(fpr)
+                tprs.append(tpr)
-        return np.median(tprs), np.median(fprs)
+                fprs.append(fpr)
        return tprs, fprs
    def aggregate(self, classif_predictions):
        prevs_estim = self.cc.aggregate(classif_predictions)
        positive_prevs = []
        for tpr, fpr in zip(self.tpr, self.fpr):
            if tpr - fpr > 0:
                acc = np.clip((prevs_estim[1] - fpr) / (tpr - fpr), 0, 1)
                positive_prevs.append(acc)
        if len(positive_prevs) > 0:
            adjusted_positive_prev = np.median(positive_prevs)
            adjusted_prevs_estim = np.array((1 - adjusted_positive_prev, adjusted_positive_prev))
            return adjusted_prevs_estim
        else:
            return prevs_estim
 class MS2(MS):
@ -1209,19 +1235,9 @@ class MS2(MS):
    def __init__(self, classifier: BaseEstimator, val_split=0.4):
        super().__init__(classifier, val_split)
-    def _optimize_threshold(self, y, probabilities):
+    def _condition(self, tpr, fpr) -> float:
-        tprs = [0, 1]
+        return (tpr - fpr) > 0.25
-        fprs = [0, 1]
+
        candidate_thresholds = np.unique(probabilities[:, 1])
        for candidate_threshold in candidate_thresholds:
            y_ = [self.classes_[1] if p > candidate_threshold else self.classes_[0] for p in probabilities[:, 1]]
            TP, FP, FN, TN = self._compute_table(y, y_)
            tpr = self._compute_tpr(TP, FP)
            fpr = self._compute_fpr(FP, TN)
            if (tpr - fpr) > 0.25:
                tprs.append(tpr)
                fprs.append(fpr)
        return np.median(tprs), np.median(fprs)
 ClassifyAndCount = CC
--- a/quapy/method/neural.py
+++ b/quapy/method/neural.py
@ -32,13 +32,13 @@ class QuaNetTrainer(BaseQuantifier):
    >>> qp.domains.preprocessing.index(dataset, min_df=5, inplace=True)
    >>>
    >>> # the text classifier is a CNN trained by NeuralClassifierTrainer
-    >>> cnn = CNNnet(dataset.vocabulary_size, dataset.n_classes)
+    >>> cnn = CNNnet(dataset.vocabulary_size, dataset.arange_classes)
    >>> classifier = NeuralClassifierTrainer(cnn, device='cuda')
    >>>
    >>> # train QuaNet (QuaNet is an alias to QuaNetTrainer)
    >>> model = QuaNet(classifier, qp.environ['SAMPLE_SIZE'], device='cuda')
    >>> model.fit(dataset.training)
-    >>> estim_prevalence = model.quantify(dataset.test.instances)
+    >>> estim_prevalence = model.quantify(dataset.mixture.instances)
    :param classifier: an object implementing `fit` (i.e., that can be trained on labelled data),
        `predict_proba` (i.e., that can generate posterior probabilities of unlabelled examples) and
--- a/quapy/model_selection.py
+++ b/quapy/model_selection.py
@ -54,6 +54,7 @@ class GridSearchQ(BaseQuantifier):
        self.__check_error(error)
        assert isinstance(protocol, AbstractProtocol), 'unknown protocol'
    def _sout(self, msg):
        if self.verbose:
            print(f'[{self.__class__.__name__}:{self.model.__class__.__name__}]: {msg}')
@ -88,7 +89,7 @@ class GridSearchQ(BaseQuantifier):
        hyper = [dict({k: val[i] for i, k in enumerate(params_keys)}) for val in itertools.product(*params_values)]
        self._sout(f'starting model selection with {self.n_jobs =}')
-        #pass a seed to parallel so it is set in clild processes
+        # pass a seed to parallel so it is set in clild processes
        scores = qp.util.parallel(
            self._delayed_eval,
            ((params, training) for params in hyper),
@ -223,7 +224,7 @@ def cross_val_predict(quantifier: BaseQuantifier, data: LabelledCollection, nfol
    for train, test in data.kFCV(nfolds=nfolds, random_state=random_state):
        quantifier.fit(train)
        fold_prev = quantifier.quantify(test.X)
-        rel_size = len(test.X)/len(data)
+        rel_size = len(test)/len(data)
        total_prev += fold_prev*rel_size
    return total_prev
--- a/quapy/plot.py
+++ b/quapy/plot.py
@ -11,7 +11,7 @@ import quapy as qp
 plt.rcParams['figure.figsize'] = [10, 6]
 plt.rcParams['figure.dpi'] = 200
-plt.rcParams['font.size'] = 18
+plt.rcParams['font.size'] = 12
 def binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title=None, show_std=True, legend=True,
@ -259,7 +259,7 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
    data = _join_data_by_drift(method_names, true_prevs, estim_prevs, tr_prevs, x_error, y_error, method_order)
    if method_order is None:
-        method_order = method_names
+        method_order = np.unique(method_names)
    _set_colors(ax, n_methods=len(method_order))
@ -330,8 +330,8 @@ def error_by_drift(method_names, true_prevs, estim_prevs, tr_prevs,
    if show_legend:
        fig.legend(loc='lower center',
-                  bbox_to_anchor=(1, 0.5),
+                  bbox_to_anchor=(0.9, 0.2),
-                  ncol=(len(method_names)+1)//2)
+                  ncol=1)
    _save_or_show(savepath)
Author	SHA1	Message	Date
Alejandro Moreo Fernandez	0e81117cfb	quick test kdex	2023-12-17 20:26:53 +01:00
Alejandro Moreo Fernandez	9bdc7676d6	first push to github	2023-12-17 20:14:38 +01:00
Alejandro Moreo Fernandez	59500a5a42	refactoring	2023-12-11 16:43:45 +01:00
Alejandro Moreo Fernandez	a776816063	cleaning project and refactoring kde methods	2023-12-06 19:24:42 +01:00
Alejandro Moreo Fernandez	29e23b5bf6	cleaning gitginore	2023-12-06 17:05:12 +01:00
Alejandro Moreo Fernandez	b823745dd1	cleaning project	2023-12-06 16:55:06 +01:00
Alejandro Moreo Fernandez	b7b052ef7d	checking stat significance	2023-11-28 10:20:54 +01:00
Alejandro Moreo Fernandez	01ef81bf25	sampling better the KDEy-HD approach	2023-11-23 11:30:45 +01:00
Alejandro Moreo Fernandez	dcfd61ae5b	launching new KDEy-HD with importance sampling	2023-11-22 12:27:15 +01:00
Alejandro Moreo Fernandez	ac88dc8961	understanding importance sampling	2023-11-21 10:26:13 +01:00
Alejandro Moreo Fernandez	61399d66f6	scripts for understanding the divergence approximations	2023-11-20 15:28:56 +01:00
Alejandro Moreo Fernandez	d060ffa591	Merge branch 'lab' of gitea-s2i2s.isti.cnr.it:moreo/QuaPy into lab	2023-11-09 18:15:33 +01:00
Alejandro Moreo Fernandez	949bd3cca7	gitignore update	2023-11-07 17:33:54 +01:00
Alejandro Moreo Fernandez	c0f9a50a14	plotting simplex and 3d-histograms	2023-11-07 17:28:32 +01:00
Alejandro Moreo Fernandez	50d0ed2e84	switching	2023-11-03 09:54:36 +01:00
Alejandro Moreo Fernandez	0f4008e18d	switching to devel	2023-10-30 09:41:52 +01:00
Alejandro Moreo Fernandez	3243fd90f8	running final experiments, one dedicated DM for each divergence with fine-grained exploration of nbins	2023-10-23 11:32:35 +02:00
Alejandro Moreo Fernandez	f08885dca3	renaming dataset labels as numeric values w/o gaps	2023-10-18 11:42:52 +02:00
Alejandro Moreo Fernandez	cfd5b00b97	renaming binary -> ucibinary	2023-10-18 10:55:56 +02:00
Alejandro Moreo Fernandez	53b052edc7	add uci multiclass datasets	2023-10-18 10:52:04 +02:00
Alejandro Moreo Fernandez	1c258a2000	adding multiclass experiments	2023-10-18 10:48:50 +02:00
Alejandro Moreo Fernandez	dca955819c	everything working; need to clean prints though	2023-10-17 18:11:25 +02:00
Alejandro Moreo Fernandez	7593fde2e0	KDE with closed form working	2023-10-13 17:34:26 +02:00
Alejandro Moreo Fernandez	32d6aa58f6	understanding montecarlo sampling	2023-10-02 17:50:12 +02:00
Alejandro Moreo Fernandez	56dbe744df	cleaning and refactoring, also trying to repair the montecarlo approximation	2023-09-28 17:47:39 +02:00
Alejandro Moreo Fernandez	9221b3f775	binary experiments	2023-09-11 12:48:32 +02:00
Alejandro Moreo Fernandez	7e2237abe8	stability	2023-09-05 16:57:43 +02:00
Alejandro Moreo Fernandez	2d6ac4af0d	testing the sensibility of KDEy to the bandwidth	2023-09-04 12:05:25 +02:00
Alejandro Moreo Fernandez	b1715b685c	Merge branch 'lab' of gitea-s2i2s.isti.cnr.it:moreo/QuaPy into lab	2023-08-01 19:04:36 +02:00
Alejandro Moreo Fernandez	f72a612011	everything working, last results	2023-08-01 19:04:23 +02:00
Alejandro Moreo Fernandez	fe819a7a3c	plots	2023-07-31 09:03:41 +02:00
Alejandro Moreo Fernandez	96758834f3	kde working with kfcv	2023-07-24 16:28:21 +02:00
Alejandro Moreo Fernandez	0ce6106ee1	simplex plot	2023-07-21 12:56:01 +02:00
Alejandro Moreo Fernandez	d995990fba	initial experiments and DIR method	2023-07-21 11:41:16 +02:00
Alejandro Moreo Fernandez	26de9d92eb	distribution matching multiclass	2023-07-20 09:03:22 +02:00
Alejandro Moreo Fernandez	4a7ffe5b50	testing kde-y	2023-05-22 17:34:00 +02:00
Alejandro Moreo Fernandez	ca25f1d601	lab + some experimental methods based on distribution matching	2023-05-04 17:26:17 +02:00