Merge branch 'lab' of gitea-s2i2s.isti.cnr.it:moreo/QuaPy into lab

2023-11-09 18:15:33 +01:00 · 2023-11-09 18:15:33 +01:00 · d060ffa591
parent 50d0ed2e84 949bd3cca7
commit d060ffa591
4 changed files with 212 additions and 7 deletions
--- a/.gitignore
+++ b/.gitignore
@ -165,3 +165,4 @@ dmypy.json

 *__pycache__*
 *.dataframe
+*.svg
--- a/distribution_matching/figures/histograms_density_plot.py
+++ b/distribution_matching/figures/histograms_density_plot.py
@ -0,0 +1,73 @@
+
+import math
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split, cross_val_predict
+from sklearn.neighbors import KernelDensity
+import matplotlib.pyplot as plt
+import numpy as np
+
+from data import LabelledCollection
+
+scale = 100
+
+
+import quapy as qp
+
+data = qp.datasets.fetch_twitter('wb', min_df=3, pickle=True, for_model_selection=False)
+
+X, y = data.training.Xy
+
+cls = LogisticRegression(C=0.0001, random_state=0)
+
+
+posteriors = cross_val_predict(cls, X=X, y=y, method='predict_proba', n_jobs=-1, cv=3)
+
+cls.fit(X, y)
+
+Xte, yte = data.test.Xy
+
+post_c1 = posteriors[y==0]
+post_c2 = posteriors[y==1]
+post_c3 = posteriors[y==2]
+
+
+print(len(post_c1))
+print(len(post_c2))
+print(len(post_c3))
+
+post_test = cls.predict_proba(Xte)
+
+alpha = qp.functional.prevalence_from_labels(yte, classes=[0, 1, 2])
+
+
+nbins = 20
+
+plt.rcParams.update({'font.size': 7})
+
+fig = plt.figure()
+positions = np.asarray([2,1,0])
+colors = ['r', 'g', 'b']
+
+for i, post_set in enumerate([post_c1, post_c2, post_c3, post_test]):
+    ax = fig.add_subplot(141+i, projection='3d')
+    for post, c, z in zip(post_set.T, colors, positions):
+
+        hist, bins = np.histogram(post, bins=nbins, density=True)
+        xs = (bins[:-1] + bins[1:])/2
+
+        ax.bar(xs, hist, width=1/nbins, zs=z, zdir='y', color=c, ec=c, alpha=0.6)
+
+    ax.yaxis.set_ticks(positions)
+    ax.yaxis.set_ticklabels(['$y=1$', '$y=2$', '$y=3$'])
+    ax.xaxis.set_ticks([])
+    ax.xaxis.set_ticklabels([], minor=True)
+    ax.zaxis.set_ticks([])
+    ax.zaxis.set_ticklabels([], minor=True)
+
+
+#plt.figure(figsize=(10,6))
+#plt.show()
+plt.savefig('./histograms.pdf')
+
+
--- a/distribution_matching/figures/simplex_density_plot.py
+++ b/distribution_matching/figures/simplex_density_plot.py
@ -6,7 +6,7 @@ from sklearn.neighbors import KernelDensity

 from data import LabelledCollection

-scale = 200
+scale = 10


 # con ternary (una lib de matplotlib) salen bien pero no puedo crear contornos, o no se
@ -56,7 +56,8 @@ def plot_3class_problem(post_c1, post_c2, post_c3, post_test, alpha, bandwidth):
    post_c3 = np.flip(post_c3, axis=1)
    post_test = np.flip(post_test, axis=1)

-    fig = ternary.plt.figure(figsize=(26, 3))
+    size_=10
+    fig = ternary.plt.figure(figsize=(4*size_, 1*size_))
    fig.tight_layout()
    ax1 = fig.add_subplot(1, 4, 1)
    divider = make_axes_locatable(ax1)
@ -83,9 +84,9 @@ def plot_3class_problem(post_c1, post_c2, post_c3, post_test, alpha, bandwidth):
            return np.exp(kde([p])).item()
        return d

-    plot_simplex_(ax1, density(kde1.score_samples), title='$f_1(\mathbf{x})=p(s(\mathbf{x})|y=1)$')
-    plot_simplex_(ax2, density(kde2.score_samples), title='$f_2(\mathbf{x})=p(s(\mathbf{x})|y=2)$')
-    plot_simplex_(ax3, density(kde3.score_samples), title='$f_3(\mathbf{x})=p(s(\mathbf{x})|y=3)$')
+    plot_simplex_(ax1, density(kde1.score_samples), title='$p_1$')
+    plot_simplex_(ax2, density(kde2.score_samples), title='$p_2$')
+    plot_simplex_(ax3, density(kde3.score_samples), title='$p_3$')
    #plot_simplex(ax1, post_c1, np.exp(kde1.score_samples(post_c1)), title='$f_1(\mathbf{x})=p(s(\mathbf{x})|y=1)$') #, savepath='figure/y1.png')
    #plot_simplex(ax2, post_c2, np.exp(kde2.score_samples(post_c2)), title='$f_2(\mathbf{x})=p(s(\mathbf{x})|y=2)$') #, savepath='figure/y2.png')
    #plot_simplex(ax3, post_c3, np.exp(kde3.score_samples(post_c3)), title='$f_3(\mathbf{x})=p(s(\mathbf{x})|y=3)$') #, savepath='figure/y3.png')
@ -102,7 +103,7 @@ def plot_3class_problem(post_c1, post_c2, post_c3, post_test, alpha, bandwidth):
            return total_density
        return m

-    title = '$\sum_{i \in \mathcal{Y}} \\alpha_i f_i(\mathbf{x})$'
+    title = '$p_{\mathbf{\\alpha}} = \sum_{i \in n} \\alpha_i p_i$'

    plot_simplex_(ax4, mixture_(alpha, [kde1, kde2, kde3]), title=title, points=post_test)
    #mixture(alpha, [kde1, kde2, kde3])
@ -111,7 +112,8 @@ def plot_3class_problem(post_c1, post_c2, post_c3, post_test, alpha, bandwidth):
    #test_scores = sum(alphai*np.exp(kdei.score_samples(post_test)) for alphai, kdei in zip(alpha, [kde1,kde2,kde3]))
    #plot_simplex(ax4, post_test, test_scores, title=title, points=post_test)

-    ternary.plt.show()
+    #ternary.plt.show()
+    ternary.plt.savefig('./simplex.png')


 import quapy as qp
--- a/distribution_matching/figures/simplex_density_plot2.py
+++ b/distribution_matching/figures/simplex_density_plot2.py
@ -0,0 +1,129 @@
+import ternary
+import math
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split, cross_val_predict
+from sklearn.neighbors import KernelDensity
+import plotly.figure_factory as ff
+
+from data import LabelledCollection
+
+scale = 100
+
+
+# con ternary (una lib de matplotlib) salen bien pero no puedo crear contornos, o no se
+# con plotly salen los contornos bien, pero es un poco un jaleo porque utiliza el navegador...
+
+def plot_simplex_(ax, density, title='', fontsize=30, points=None):
+
+    tax = ternary.TernaryAxesSubplot(ax=ax, scale=scale)
+    tax.heatmapf(density, boundary=True, style="triangular", colorbar=False, cmap='viridis') #cmap='magma')
+    tax.boundary(linewidth=1.0)
+    corner_fontsize = int(5*fontsize//6)
+    tax.right_corner_label("$y=3$", fontsize=corner_fontsize)
+    tax.top_corner_label("$y=2$", fontsize=corner_fontsize)
+    tax.left_corner_label("$y=1$", fontsize=corner_fontsize)
+    if title:
+        tax.set_title(title, loc='center', y=-0.11, fontsize=fontsize)
+    if points is not None:
+        tax.scatter(points*scale, marker='o', color='w', alpha=0.25, zorder=10, s=5*scale)
+    tax.get_axes().axis('off')
+    tax.clear_matplotlib_ticks()
+
+    return tax
+
+
+
+from mpl_toolkits.axes_grid1 import make_axes_locatable
+def plot_3class_problem(post_c1, post_c2, post_c3, post_test, alpha, bandwidth):
+    post_c1 = np.flip(post_c1, axis=1)
+    post_c2 = np.flip(post_c2, axis=1)
+    post_c3 = np.flip(post_c3, axis=1)
+    post_test = np.flip(post_test, axis=1)
+
+    size_=10
+    fig = ternary.plt.figure(figsize=(5*size_, 1*size_))
+    fig.tight_layout()
+    ax1 = fig.add_subplot(1, 4, 1)
+    divider = make_axes_locatable(ax1)
+    ax2 = fig.add_subplot(1, 4, 2)
+    divider = make_axes_locatable(ax2)
+    ax3 = fig.add_subplot(1, 4, 3)
+    divider = make_axes_locatable(ax3)
+    ax4 = fig.add_subplot(1, 4, 4)
+    divider = make_axes_locatable(ax4)
+
+    kde1 = KernelDensity(bandwidth=bandwidth).fit(post_c1)
+    kde2 = KernelDensity(bandwidth=bandwidth).fit(post_c2)
+    kde3 = KernelDensity(bandwidth=bandwidth).fit(post_c3)
+
+    #post_c1 = np.concatenate([post_c1, np.eye(3, dtype=float)])
+    #post_c2 = np.concatenate([post_c2, np.eye(3, dtype=float)])
+    #post_c3 = np.concatenate([post_c3, np.eye(3, dtype=float)])
+
+    #plot_simplex_(ax1, lambda x:0, title='$f_1(\mathbf{x})=p(s(\mathbf{x})|y=1)$')
+    #plot_simplex_(ax2, lambda x:0, title='$f_1(\mathbf{x})=p(s(\mathbf{x})|y=1)$')
+    #plot_simplex_(ax3, lambda x:0, title='$f_1(\mathbf{x})=p(s(\mathbf{x})|y=1)$')
+    def density(kde):
+        def d(p):
+            return np.exp(kde([p])).item()
+        return d
+
+    plot_simplex_(ax1, density(kde1.score_samples), title='$p_1$')
+    plot_simplex_(ax2, density(kde2.score_samples), title='$p_2$')
+    plot_simplex_(ax3, density(kde3.score_samples), title='$p_3$')
+    #plot_simplex(ax1, post_c1, np.exp(kde1.score_samples(post_c1)), title='$f_1(\mathbf{x})=p(s(\mathbf{x})|y=1)$') #, savepath='figure/y1.png')
+    #plot_simplex(ax2, post_c2, np.exp(kde2.score_samples(post_c2)), title='$f_2(\mathbf{x})=p(s(\mathbf{x})|y=2)$') #, savepath='figure/y2.png')
+    #plot_simplex(ax3, post_c3, np.exp(kde3.score_samples(post_c3)), title='$f_3(\mathbf{x})=p(s(\mathbf{x})|y=3)$') #, savepath='figure/y3.png')
+
+    def mixture_(prevs, kdes):
+        def m(p):
+            total_density = 0
+            for prev, kde in zip(prevs, kdes):
+                log_density = kde.score_samples([p]).item()
+                density = np.exp(log_density)
+                density *= prev
+                total_density += density
+            #print(total_density)
+            return total_density
+        return m
+
+    title = '$\mathbf{p}_{\mathbf{\\alpha}} = \sum_{i \in n} \\alpha_i p_i$'
+
+    plot_simplex_(ax4, mixture_(alpha, [kde1, kde2, kde3]), title=title, points=post_test)
+
+    #ternary.plt.show()
+    ternary.plt.savefig('./simplex.pdf')
+
+
+import quapy as qp
+
+
+data = qp.datasets.fetch_twitter('wb', min_df=3, pickle=True, for_model_selection=False)
+
+Xtr, ytr = data.training.Xy
+Xte, yte = data.test.sampling(150, *[0.5, 0.1, 0.4]).Xy
+
+cls = LogisticRegression(C=0.0001, random_state=0)
+
+draw_from_training = False
+if draw_from_training:
+    post_tr = cross_val_predict(cls, Xtr, ytr, n_jobs=-1, method='predict_proba')
+    post_c1 = post_tr[ytr==0]
+    post_c2 = post_tr[ytr==1]
+    post_c3 = post_tr[ytr==2]
+    cls.fit(Xtr, ytr)
+else:
+    cls.fit(Xtr, ytr)
+    post_te = cls.predict_proba(Xte)
+    post_c1 = post_te[yte == 0]
+    post_c2 = post_te[yte == 1]
+    post_c3 = post_te[yte == 2]
+
+post_test = cls.predict_proba(Xte)
+
+alpha = qp.functional.prevalence_from_labels(yte, classes=[0, 1, 2])
+
+print(f'test alpha {alpha}')
+plot_3class_problem(post_c1, post_c2, post_c3, post_test, alpha, bandwidth=0.1)
+