adding scripts for plots (only local)

deleting scripts folder so it can be downloaded
documenting the regressor
2024-11-13 17:16:18 +01:00 · 2024-05-30 11:42:48 +02:00 · 2024-05-29 11:23:17 +02:00 · 2024-05-29 11:12:43 +02:00 · 2024-05-28 10:24:55 +02:00 · 2024-05-28 10:24:13 +02:00
10 changed files with 6437 additions and 12 deletions
--- a/CHANGE_LOG.txt
+++ b/CHANGE_LOG.txt
@ -1,4 +1,4 @@
-Change Log 0.1.8
+Change Log 0.1.8g
 ----------------

 - Added Kernel Density Estimation methods (KDEyML, KDEyCS, KDEyHD) as proposed in the paper:
--- a/LeQua2024/T4_domain_prevalence/dev_domain_prevalences.txt
+++ b/LeQua2024/T4_domain_prevalence/dev_domain_prevalences.txt
--- a/LeQua2024/T4_domain_prevalence/test_domain_prevalences.txt
+++ b/LeQua2024/T4_domain_prevalence/test_domain_prevalences.txt
--- a/LeQua2024/baselines.py
+++ b/LeQua2024/baselines.py
@ -3,9 +3,12 @@ import pickle
 import os
 import sys
 from os.path import join
+
+import numpy as np
 from sklearn.linear_model import LogisticRegression as LR

 from scripts.constants import SAMPLE_SIZE
+from scripts.evaluate import normalized_match_distance
 from LeQua2024._lequa2024 import LEQUA2024_TASKS, fetch_lequa2024, LEQUA2024_ZENODO
 from quapy.method.aggregative import *
 from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
@ -35,11 +38,18 @@ def wrap_params(cls_params:dict, prefix:str):
 def baselines():

    q_params = wrap_params(lr_params, 'classifier')
+    kde_params = {**q_params, 'bandwidth': np.linspace(0.01, 0.20, 20)}
+    dm_params = {**q_params, 'nbins': [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 64]}

    yield CC(new_cls()), "CC", q_params
    yield ACC(new_cls()), "ACC", q_params
    yield PCC(new_cls()), "PCC", q_params
    yield PACC(new_cls()), "PACC", q_params
+    yield SLD(new_cls()), "SLD", q_params
+    #yield KDEyML(new_cls()), "KDEy-ML", kde_params
+    #yield KDEyHD(new_cls()), "KDEy-HD", kde_params
+    # yield KDEyCS(new_cls()), "KDEy-CS", kde_params
+    #yield DMy(new_cls()), "DMy", dm_params


 def main(args):
@ -77,7 +87,7 @@ def main(args):
                quantifier,
                param_grid,
                protocol=gen_val,
-                error=qp.error.mrae,
+                error=normalized_match_distance if args.task=='T3' else qp.error.mrae,
                refit=False,
                verbose=True,
                n_jobs=-1
--- a/LeQua2024/predict.py
+++ b/LeQua2024/predict.py
@ -7,6 +7,7 @@ from tqdm import tqdm
 from scripts.data import gen_load_samples
 from glob import glob
 from scripts import constants
+from regressor import KDEyRegressor, RegressionToSimplex

 """
 LeQua2024 prediction script 
--- a/LeQua2024/regressor.py
+++ b/LeQua2024/regressor.py
@ -0,0 +1,133 @@
+import pickle
+
+import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.model_selection import GridSearchCV
+from sklearn.multioutput import MultiOutputRegressor
+from sklearn.pipeline import Pipeline
+from sklearn.svm import SVR
+
+from LeQua2024._lequa2024 import fetch_lequa2024
+from quapy.data import LabelledCollection
+from quapy.protocol import AbstractProtocol
+from quapy.method.base import BaseQuantifier
+import quapy.functional as F
+from tqdm import tqdm
+from scripts.evaluate import normalized_match_distance, match_distance
+
+
+def projection_simplex_sort(unnormalized_arr) -> np.ndarray:
+    """Return the projection of a point onto the probability simplex.
+
+    [This code is taken from the devel branch, that will correspond to the future QuaPy 0.1.9]
+
+    Adapted from Mathieu Blondel's BSD-licensed
+    `implementation <https://gist.github.com/mblondel/6f3b7aaad90606b98f71>`_
+    (see function `projection_simplex_sort` in their repo), which accompanies the paper
+
+    Mathieu Blondel, Akinori Fujino, and Naonori Ueda.
+    Large-scale Multiclass Support Vector Machine Training via Euclidean Projection onto the Simplex,
+    ICPR 2014, `URL <http://www.mblondel.org/publications/mblondel-icpr2014.pdf>`_
+
+    :param `unnormalized_arr`: point in n-dimensional space, shape `(n,)`
+    :return: projection of `unnormalized_arr` onto the (n-1)-dimensional probability simplex, shape `(n,)`
+    """
+    point = np.asarray(unnormalized_arr)
+    dim = len(point)
+    # coordinates in decreasing order, and their running sum shifted by the target total (1)
+    descending = np.sort(point)[::-1]
+    shifted_cumsum = np.cumsum(descending) - 1.0
+    ranks = np.arange(1, dim + 1)
+    # the last position satisfying the condition fixes how many coordinates stay positive
+    keep = descending - shifted_cumsum / ranks > 0
+    rho = ranks[keep][-1]
+    # theta is subtracted from every coordinate before clipping at zero
+    theta = shifted_cumsum[keep][-1] / float(rho)
+    return np.maximum(point - theta, 0)
+
+
+class RegressionToSimplex(BaseEstimator):
+    """
+    A minimal regressor whose outputs are probability distributions.
+    Internally it wraps an SVR in a multi-output regressor and then maps every
+    predicted vector onto the probability simplex.
+
+    :param C: regularization parameter for SVR
+    """
+
+    def __init__(self, C=1):
+        self.C = C
+
+    def fit(self, X, y):
+        """
+        Learns the correction
+
+        :param X: array-like of shape `(n_instances, n_classes)` with uncorrected prevalence vectors
+        :param y: array-like of shape `(n_instances, n_classes)` with true prevalence vectors
+        :return: self
+        """
+        svr = SVR(C=self.C)
+        self.reg = MultiOutputRegressor(svr, n_jobs=-1)
+        self.reg.fit(X, y)
+        return self
+
+    def predict(self, X):
+        """
+        Corrects a batch of prevalence vectors
+
+        :param X: array-like with one uncorrected prevalence vector per row
+        :return: np.ndarray with one corrected prevalence vector per row, each one
+            projected onto the probability simplex
+        """
+        raw_predictions = self.reg.predict(X)
+        # the regressor output is unconstrained; project row by row onto the simplex
+        projected = [projection_simplex_sort(row) for row in raw_predictions]
+        return np.asarray(projected)
+
+
+class KDEyRegressor(BaseQuantifier):
+    """
+    This class implements a regressor-based correction on top of a quantifier.
+    The quantifier is taken to be KDEy-ML, which is considered to be already trained (this
+    method simply loads a pickled object).
+    The method then optimizes a regressor that corrects prevalence vectors using the
+    validation samples as training data.
+    The regressor is based on a multioutput SVR and relies on a post-processing to guarantee
+    that the output lies on the probability simplex (see also RegressionToSimplex)
+
+    :param kde_path: path to the pickled, already trained quantifier
+    :param Cs: candidate values of the SVR parameter `C` explored during model selection
+    """
+
+    def __init__(self, kde_path, Cs=np.logspace(-3,3,7)):
+        self.kde_path = kde_path
+        self.Cs = Cs
+
+    def fit(self, val_data: AbstractProtocol):
+        """
+        Loads the pickled quantifier and fits the correcting regressor.
+
+        :param val_data: protocol that, when called, yields `(sample, prevalence)` pairs and
+            whose `total()` method returns the number of samples
+        :return: self
+        """
+        print(f'loading kde from {self.kde_path}')
+        # NOTE: unpickling can execute arbitrary code; only load model files from a trusted source.
+        # The context manager guarantees the file handle is closed once the model is loaded.
+        with open(self.kde_path, 'rb') as model_file:
+            self.kdey = pickle.load(model_file)
+
+        print('representing val data with kde')
+        pbar = tqdm(val_data(), total=val_data.total())
+        Xs, Ys = [], []
+        for sample, prev in pbar:
+            # the quantifier's estimate is the regressor input; the true prevalence is the target
+            prev_hat = self.kdey.quantify(sample)
+            Xs.append(prev_hat)
+            Ys.append(prev)
+
+        Xs = np.asarray(Xs)
+        Ys = np.asarray(Ys)
+
+        def scorer(estimator, X, y):
+            # GridSearchCV maximizes the score, hence the distance is negated
+            y_hat = estimator.predict(X)
+            md = normalized_match_distance(y, y_hat)
+            return -md
+
+        grid = {'C': self.Cs}
+        optim = GridSearchCV(
+            RegressionToSimplex(), param_grid=grid, scoring=scorer, verbose=0, cv=10, n_jobs=64
+        ).fit(Xs, Ys)
+        self.regressor = optim.best_estimator_
+        return self
+
+    def quantify(self, instances):
+        """
+        Estimates the prevalence values of a sample and corrects them with the regressor.
+
+        :param instances: the sample, passed as-is to the underlying quantifier
+        :return: the corrected prevalence vector
+        """
+        prev_hat = self.kdey.quantify(instances)
+        # the regressor works on batches: wrap the single vector and unwrap the single result
+        return self.regressor.predict([prev_hat])[0]
+
+
+if __name__ == '__main__':
+    # Fit the regressor-based correction for task T3 and store the fitted object on disk.
+    train, gen_val, _ = fetch_lequa2024(task='T3', data_home='./data', merge_T3=True)
+    kdey_r = KDEyRegressor('./models/T3/KDEy-ML.pkl')
+    kdey_r.fit(gen_val)
+    # the context manager flushes and closes the output file even if pickling fails
+    with open('./models/T3/KDEyRegressor.pkl', 'wb') as model_file:
+        pickle.dump(kdey_r, model_file, protocol=pickle.HIGHEST_PROTOCOL)
+
--- a/LeQua2024/run_baselines.sh
+++ b/LeQua2024/run_baselines.sh
@ -1,15 +1,6 @@
 #!/bin/bash
 set -x

-# download the official scripts 
-if [ ! -d "scripts" ]; then
-   echo "Downloading the official scripts from the LeQua 2024 github repo"
-   wget https://github.com/HLT-ISTI/LeQua2024_scripts/archive/refs/heads/main.zip
-   unzip main.zip 
-   mv LeQua2024_scripts-main scripts
-   rm main.zip
-fi
-
 # T1: binary (n=2)
 # T2: multiclass (n=28)
 # T3: ordinal (n=5)
--- a/LeQua2024/util_scripts/covariate_shift_plot.py
+++ b/LeQua2024/util_scripts/covariate_shift_plot.py
@ -0,0 +1,120 @@
+import os
+from os.path import join
+import pandas as pd
+import quapy as qp
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+# NOTE(review): hard-coded absolute path (presumably the author's local checkout); all the
+# relative paths below are resolved against this working directory
+os.chdir('/home/moreo/QuaPy/LeQua2024/util_scripts')
+print(os.getcwd())
+
+# sample size stored in QuaPy's global environment (presumably picked up by the qp.error
+# functions used below -- confirm)
+qp.environ['SAMPLE_SIZE']=250
+
+# input files: true prevalences, domain prevalences, and the folder with one result file per method
+true_prevs_path = '../TruePrevalences/T4.test_prevalences/T4/public/test_prevalences.txt'
+domain_prevs_path = '../T4_domain_prevalence/test_domain_prevalences.txt'
+folder = '../Results_CODALAB_2024/extracted/TASK_4'
+
+def load_result_file(path):
+    """Read a CSV file of prevalence values, using its first column as the index.
+
+    :param path: path to the CSV file
+    :return: a tuple `(ids, prevs)` with the index values and the remaining columns,
+        both as numpy arrays
+    """
+    table = pd.read_csv(path, index_col=0)
+    return table.index.to_numpy(), table.values
+
+# result files to include in the plot; commented-out entries are skipped
+method_files = [
+    #'ACC.csv',
+    #'CC.csv',
+    #'DistMatching-y.csv',
+    #'KDEy.csv',
+    #'PACC.csv',
+    'PCC.csv',
+    #'SLD.csv',
+    #'TeamCUFE.csv',
+    #'TeamGMNet.csv',
+    'tobiaslotz.csv'
+]
+
+# display names that replace the file-based method names in the plot
+method_names_nice={
+    'DistMatching-y': 'DM',
+    'TeamGMNet': 'UniOviedo(Team1)',
+    'tobiaslotz': 'Lamarr'
+}
+
+# order of the methods in the plot (passed as hue_order to the boxplot)
+# NOTE(review): this first assignment is dead; it is overwritten immediately below
+desired_order=[
+    'Lamarr',
+    'SLD',
+    'DM',
+    'KDEy',
+    'UniOviedo(Team1)'
+]
+desired_order=[
+    'PCC', 'Lamarr'
+]
+
+# load the true values (sentiment prevalence, domain prevalence)
+true_id, true_prevs = load_result_file(true_prevs_path)
+dom_id, dom_prevs = load_result_file(domain_prevs_path)
+# both files must list the same ids in the same order
+assert (true_id == dom_id).all(), 'unmatched files'
+
+# define the loss for evaluation
+error_name = 'RAE'
+error_log = False
+
+if error_name == 'RAE':
+    err_function_ = qp.error.rae
+elif error_name == 'AE':
+    err_function_ = qp.error.ae
+else:
+    raise ValueError()
+
+if error_log:
+    # report the logarithm of the error; the name is updated so that labels match
+    error_name = f'log({error_name})'
+    err_function = lambda x,y: np.log(err_function_(x,y))
+else:
+    err_function = err_function_
+
+# load the participant and baseline results
+errors = {}
+for method_file in method_files:
+    method_name = method_file.replace('.csv', '')
+    # NOTE(review): `id` shadows the builtin of the same name
+    id, method_prevs = load_result_file(join(folder, method_file))
+    print(method_file)
+    assert (true_id == id).all(), f'unmatched files for {method_file}'
+    method_error = err_function(true_prevs, method_prevs)
+    # store the errors under the display name when one is defined
+    method_name = method_names_nice.get(method_name, method_name)
+    errors[method_name] = method_error
+
+# first column of the domain prevalences (the x axis of the plot is labelled "Prevalence of Books")
+dom_A_prevs = dom_prevs[:,0]
+
+n_bins = 5
+# equal-width bin edges spanning the observed range of dom_A_prevs
+bins = np.linspace(dom_A_prevs.min(), dom_A_prevs.max(), n_bins + 1)
+
+# Create a DataFrame for the data
+df = pd.DataFrame({'dom_A_prevs': dom_A_prevs})
+for method, err in errors.items():
+    df[method] = err
+
+# Assign each value of dom_A_prevs to a bin
+df['bin'] = pd.cut(df['dom_A_prevs'], bins=bins, labels=False, include_lowest=True)
+
+# Convert the DataFrame to long format
+df_long = df.melt(id_vars=['dom_A_prevs', 'bin'], value_vars=errors.keys(), var_name='Método', value_name='Error')
+
+# Create the bin labels for the X axis
+bin_labels = [f"[{bins[i]:.3f}-{bins[i + 1]:.3f}" + (']' if i == n_bins-1 else ')') for i in range(n_bins)]
+df_long['bin_label'] = df_long['bin'].map(dict(enumerate(bin_labels)))
+
+# Create the boxplot with Seaborn
+plt.figure(figsize=(14, 8))
+sns.boxplot(x='bin', y='Error', hue='Método', data=df_long, palette='Set2', showfliers=False, hue_order=desired_order)
+
+# Set the X-axis tick labels to the bin ranges
+plt.xticks(ticks=range(n_bins), labels=bin_labels, rotation=0)
+plt.xlabel("Prevalence of Books")
+plt.ylabel(error_name)
+#plt.title("Boxplots de Errores por Método dentro de Bins de dom_A_prevs")
+plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
+plt.tight_layout()
+plt.grid(True, which='both', linestyle='--', linewidth=0.5)
+#plt.show()
+plt.savefig(f'./t4_{error_name}_pcc.png')
--- a/LeQua2024/util_scripts/prior_shift_plot.py
+++ b/LeQua2024/util_scripts/prior_shift_plot.py
@ -0,0 +1,168 @@
+import os
+from os.path import join
+import pandas as pd
+
+from quapy.data.base import LabelledCollection
+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../')))
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), './')))
+#from LeQua2024.scripts import constants
+#from LeQua2024._lequa2024 import fetch_lequa2024
+import quapy as qp
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+from pathlib import Path
+import glob
+
+
+# NOTE(review): hard-coded absolute path (presumably the author's local checkout); all the
+# relative paths below are resolved against this working directory
+os.chdir('/home/moreo/QuaPy/LeQua2024')
+print(os.getcwd())
+
+
+# sample size stored in QuaPy's global environment (presumably picked up by QuaPy's error
+# functions -- confirm)
+qp.environ['SAMPLE_SIZE']=250
+
+# task number; it selects the true-prevalence file, the results folder and the training data
+TASK=1
+
+true_prevs_path = f'./TruePrevalences/T{TASK}.test_prevalences/T{TASK}/public/test_prevalences.txt'
+folder = F'./Results_CODALAB_2024/extracted/TASK_{TASK}'
+
+def load_result_file(path):
+    """Read a CSV file of prevalence values, using its first column as the index.
+
+    :param path: path to the CSV file
+    :return: a tuple `(ids, prevs)` with the index values and the remaining columns,
+        both as numpy arrays
+    """
+    table = pd.read_csv(path, index_col=0)
+    ids = table.index.to_numpy()
+    return ids, table.values
+
+
+# every CSV file found in the results folder is a candidate method
+method_files = glob.glob(f"{folder}/*.csv")
+
+
+# display names that replace the file-based method names in the plot
+method_names_nice={
+    'DistMatching-y': 'DM',
+    'TeamGMNet': 'UniOviedo(Team1)',
+    'tobiaslotz': 'Lamarr'
+}
+
+# methods (named after their file, without extension) that are left out of the plot
+exclude_methods=[
+    'TeamCUFE',
+    'hustav',
+    'PCC',
+    'CC'
+]
+
+
+# desired_order=[
+#     'Lamarr',
+#     'SLD',
+#     'DM',
+#     'KDEy',
+#     'UniOviedo(Team1)'
+# ]
+# desired_order=[
+#     'PCC', 'Lamarr'
+# ]
+
+# load the true prevalence values (only these are loaded in this script)
+true_id, true_prevs = load_result_file(true_prevs_path)
+
+
+# define the loss for evaluation
+# NOTE(review): `err_function` is not referenced by the code that runs before sys.exit();
+# only `error_name` is used there, to build the name of the output file
+error_name = 'RAE'
+error_log = False
+
+if error_name == 'RAE':
+    err_function_ = qp.error.rae
+elif error_name == 'AE':
+    err_function_ = qp.error.ae
+else:
+    raise ValueError()
+
+if error_log:
+    # report the logarithm of the error; the name is updated so that labels match
+    error_name = f'log({error_name})'
+    err_function = lambda x,y: np.log(err_function_(x,y))
+else:
+    err_function = err_function_
+
+
+def load_vector_documents(path):
+    """
+    Reads a sample of vectorized documents from a CSV file. For an unlabelled
+    sample the labels returned are None
+
+    :param path: path to the data sample containing the raw documents
+    :return: a tuple with the documents (np.ndarray of shape `(n,256)`) and the labels (a np.ndarray of shape `(n,)` if
+        the sample is labelled, or None if the sample is unlabelled), with `n` the number of instances in the sample
+        (250 for T1 and T4, 1000 for T2, and 200 for T3)
+    """
+    matrix = pd.read_csv(path).to_numpy(dtype=float)
+    if matrix.shape[1] != 257:
+        # no label column: every column is a feature
+        return matrix, None
+    # labelled sample: the first column holds the label, the rest are the features
+    labels = matrix[:, 0].astype(int).flatten()
+    features = matrix[:, 1:]
+    return features, labels
+
+#train_prevalence = fetch_lequa2024(task=f'T{TASK}', data_home='./data')
+# NOTE(review): hard-coded absolute path to the training data of the selected task
+train = LabelledCollection.load(f'/home/moreo/QuaPy/LeQua2024/data/lequa2024/T{TASK}/public/training_data.txt', loader_func=load_vector_documents)
+train_prev = train.prevalence()
+#train_prev = np.tile(train_prev, (len(true_id),1))
+
+from quapy.plot import error_by_drift
+
+# load the participant and baseline results
+method_names, estim_prevs = [], []
+for method_file in method_files:
+    # method name = file name without the extension
+    method_name = Path(method_file).name.replace('.csv', '')
+    if method_name in exclude_methods:
+        continue
+    # NOTE(review): `id` shadows the builtin of the same name
+    id, method_prevs = load_result_file(join(folder, method_name+'.csv'))
+    assert (true_id == id).all(), f'unmatched files for {method_file}'
+    method_name = method_names_nice.get(method_name, method_name)
+    method_names.append(method_name)
+    estim_prevs.append(method_prevs)
+
+# one copy of the true prevalences and of the training prevalence per method
+true_prevs = [true_prevs]*len(method_names)
+tr_prevs =[train.prevalence()]*len(method_names)
+# NOTE(review): the plot is requested with error_name='mrae', while the output file name is
+# built from the module-level `error_name` ('RAE' unless changed above) and ends in '_pcc'
+error_by_drift(method_names, 
+               true_prevs, 
+               estim_prevs, 
+               tr_prevs, 
+               error_name='mrae', show_std=True,
+               show_density=True, vlines=True, savepath=f'./util_scripts/t{TASK}_{error_name}_pcc.png')
+# the script stops here; nothing below this call is executed
+sys.exit()
+
+# NOTE(review): this whole section is unreachable because of the sys.exit() call above.
+# It also relies on `errors`, which is never defined in this script, and on `true_prevs`,
+# which has been rebound to a list by then -- it would need fixing before being re-enabled.
+shift=qp.error.ae(train_prev, true_prevs)
+
+n_bins = 5
+# equal-width bin edges spanning the observed range of the shift
+bins = np.linspace(shift.min(), shift.max(), n_bins + 1)
+
+# Create a DataFrame for the data
+df = pd.DataFrame({'dom_A_prevs': shift})
+for method, err in errors.items():
+    df[method] = err
+
+# Assign each value of dom_A_prevs to a bin
+df['bin'] = pd.cut(df['dom_A_prevs'], bins=bins, labels=False, include_lowest=True)
+
+# Convert the DataFrame to long format
+df_long = df.melt(id_vars=['dom_A_prevs', 'bin'], value_vars=errors.keys(), var_name='Método', value_name='Error')
+
+# Create the bin labels for the X axis
+bin_labels = [f"[{bins[i]:.3f}-{bins[i + 1]:.3f}" + (']' if i == n_bins-1 else ')') for i in range(n_bins)]
+df_long['bin_label'] = df_long['bin'].map(dict(enumerate(bin_labels)))
+
+# Create the boxplot with Seaborn
+plt.figure(figsize=(14, 8))
+sns.boxplot(x='bin', y='Error', hue='Método', data=df_long, palette='Set2', showfliers=False)
+
+# Set the X-axis tick labels to the bin ranges
+plt.xticks(ticks=range(n_bins), labels=bin_labels, rotation=0)
+plt.xlabel("Amount of PPS between the training prevalence and the test prevalences, in terms of AE ")
+plt.ylabel(error_name)
+#plt.title("Boxplots de Errores por Método dentro de Bins de dom_A_prevs")
+plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
+plt.tight_layout()
+plt.grid(True, which='both', linestyle='--', linewidth=0.5)
+#plt.show()
+plt.savefig(f'./util_scripts/t{TASK}_{error_name}_pcc.png')
--- a/quapy/plot.py
+++ b/quapy/plot.py
@ -1,6 +1,6 @@
 from collections import defaultdict
 import matplotlib.pyplot as plt
-from matplotlib.cm import get_cmap
+from matplotlib.pyplot import get_cmap
 import numpy as np
 from matplotlib import cm
 from scipy.stats import ttest_ind_from_stats
Author	SHA1	Message	Date
Alejandro Moreo Fernandez	6f7a1e511e	adding scripts for plots (only local)	2024-11-13 17:16:18 +01:00
Alejandro Moreo Fernandez	9d5ff154a0	deleting scripts folder so it can be downloaded	2024-05-30 11:42:48 +02:00
Alejandro Moreo Fernandez	7febaa2693	documenting the regressor	2024-05-29 11:23:17 +02:00
Alejandro Moreo Fernandez	3264e66cc9	adding regressor for T3	2024-05-29 11:12:43 +02:00
Alejandro Moreo Fernandez	a124e791ae	Merge branch 'lequa2024' of github.com:HLT-ISTI/QuaPy into lequa2024	2024-05-28 10:24:55 +02:00
Alejandro Moreo Fernandez	12a44586a8	optimizing T3 for normalized match distance	2024-05-28 10:24:13 +02:00
Alejandro Moreo Fernandez	ea1e2d2813	more baselines, kde, and dm	2024-05-20 10:15:05 +02:00
Alejandro Moreo Fernandez	6cb30edb7b	Update run_baselines.sh	2024-05-16 21:05:06 +02:00
Alejandro Moreo Fernandez	6b754dd845	redundant code cleaned from run_baselines.sh	2024-05-15 12:01:31 +02:00
Alejandro Moreo Fernandez	81fbb54992	kde	2024-05-10 10:15:44 +02:00