Compare commits
10 commits: 8dfb109b41...6f7a1e511e

| Author | SHA1 |
|---|---|
| Alejandro Moreo Fernandez | 6f7a1e511e |
| Alejandro Moreo Fernandez | 9d5ff154a0 |
| Alejandro Moreo Fernandez | 7febaa2693 |
| Alejandro Moreo Fernandez | 3264e66cc9 |
| Alejandro Moreo Fernandez | a124e791ae |
| Alejandro Moreo Fernandez | 12a44586a8 |
| Alejandro Moreo Fernandez | ea1e2d2813 |
| Alejandro Moreo Fernandez | 6cb30edb7b |
| Alejandro Moreo Fernandez | 6b754dd845 |
| Alejandro Moreo Fernandez | 81fbb54992 |
@@ -1,4 +1,4 @@
-Change Log 0.1.8
+Change Log 0.1.8g
 ----------------
 
 - Added Kernel Density Estimation methods (KDEyML, KDEyCS, KDEyHD) as proposed in the paper:
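For context on that changelog entry, a minimal sketch of how the new KDEy methods are invoked (my example, not part of the commit; it assumes the QuaPy 0.1.8 aggregative-quantifier API, and the toy data is made up):

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from quapy.data import LabelledCollection
from quapy.method.aggregative import KDEyML

# toy labelled data (hypothetical): 300 instances, 2 features, 3 classes
X = np.random.randn(300, 2)
y = np.random.randint(0, 3, size=300)
train = LabelledCollection(X, y)

# bandwidth controls the smoothing of the kernel density estimates
quantifier = KDEyML(LogisticRegression(), bandwidth=0.1)
quantifier.fit(train)
prev_hat = quantifier.quantify(X)  # estimated class prevalence vector
```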
(File diff suppressed because it is too large.)
@@ -3,9 +3,12 @@ import pickle
 import os
 import sys
 from os.path import join
 
+import numpy as np
 from sklearn.linear_model import LogisticRegression as LR
+
 from scripts.constants import SAMPLE_SIZE
+from scripts.evaluate import normalized_match_distance
 from LeQua2024._lequa2024 import LEQUA2024_TASKS, fetch_lequa2024, LEQUA2024_ZENODO
 from quapy.method.aggregative import *
 from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
@@ -35,11 +38,18 @@ def wrap_params(cls_params:dict, prefix:str):
 def baselines():
 
     q_params = wrap_params(lr_params, 'classifier')
+    kde_params = {**q_params, 'bandwidth': np.linspace(0.01, 0.20, 20)}
+    dm_params = {**q_params, 'nbins': [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 64]}
 
     yield CC(new_cls()), "CC", q_params
     yield ACC(new_cls()), "ACC", q_params
     yield PCC(new_cls()), "PCC", q_params
     yield PACC(new_cls()), "PACC", q_params
+    yield SLD(new_cls()), "SLD", q_params
+    #yield KDEyML(new_cls()), "KDEy-ML", kde_params
+    #yield KDEyHD(new_cls()), "KDEy-HD", kde_params
+    # yield KDEyCS(new_cls()), "KDEy-CS", kde_params
+    #yield DMy(new_cls()), "DMy", dm_params
 
 
 def main(args):
@@ -77,7 +87,7 @@ def main(args):
         quantifier,
         param_grid,
         protocol=gen_val,
-        error=qp.error.mrae,
+        error=normalized_match_distance if args.task=='T3' else qp.error.mrae,
         refit=False,
         verbose=True,
         n_jobs=-1
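The last hunk switches model selection for T3 (the ordinal task) from MRAE to normalized match distance. A minimal sketch of what match distance computes (my code, assuming unit distance between consecutive classes, as is usual in ordinal quantification; the implementation in `scripts.evaluate` is the authoritative one, and its normalized variant presumably rescales the value):

```python
import numpy as np

def match_distance(p_true, p_hat):
    # Earth Mover's Distance between prevalence vectors over *ordered* classes,
    # with unit distance between consecutive classes: the summed absolute
    # difference between the two cumulative distributions
    return np.abs(np.cumsum(p_true) - np.cumsum(p_hat)).sum()

# e.g., with the n=5 ordered classes of T3: moving 0.2 of mass one step costs 0.2
p = np.array([0.1, 0.2, 0.4, 0.2, 0.1])
q = np.array([0.1, 0.2, 0.2, 0.4, 0.1])
match_distance(p, q)  # 0.2
```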
@@ -7,6 +7,7 @@ from tqdm import tqdm
 from scripts.data import gen_load_samples
 from glob import glob
 from scripts import constants
+from regressor import KDEyRegressor, RegressionToSimplex
 
 """
 LeQua2024 prediction script
@@ -0,0 +1,133 @@
+import pickle
+
+import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.model_selection import GridSearchCV
+from sklearn.multioutput import MultiOutputRegressor
+from sklearn.pipeline import Pipeline
+from sklearn.svm import SVR
+
+from LeQua2024._lequa2024 import fetch_lequa2024
+from quapy.data import LabelledCollection
+from quapy.protocol import AbstractProtocol
+from quapy.method.base import BaseQuantifier
+import quapy.functional as F
+from tqdm import tqdm
+from scripts.evaluate import normalized_match_distance, match_distance
+
+
+def projection_simplex_sort(unnormalized_arr) -> np.ndarray:
+    """Projects a point onto the probability simplex.
+
+    [This code is taken from the devel branch, which will correspond to the future QuaPy 0.1.9.]
+
+    The code is adapted from Mathieu Blondel's BSD-licensed
+    `implementation <https://gist.github.com/mblondel/6f3b7aaad90606b98f71>`_
+    (see function `projection_simplex_sort` in their repo), which accompanies the paper
+
+    Mathieu Blondel, Akinori Fujino, and Naonori Ueda.
+    Large-scale Multiclass Support Vector Machine Training via Euclidean Projection onto the Simplex,
+    ICPR 2014, `URL <http://www.mblondel.org/publications/mblondel-icpr2014.pdf>`_
+
+    :param unnormalized_arr: point in n-dimensional space, shape `(n,)`
+    :return: projection of `unnormalized_arr` onto the (n-1)-dimensional probability simplex, shape `(n,)`
+    """
+    unnormalized_arr = np.asarray(unnormalized_arr)
+    n = len(unnormalized_arr)
+    u = np.sort(unnormalized_arr)[::-1]
+    cssv = np.cumsum(u) - 1.0
+    ind = np.arange(1, n + 1)
+    cond = u - cssv / ind > 0
+    rho = ind[cond][-1]
+    theta = cssv[cond][-1] / float(rho)
+    return np.maximum(unnormalized_arr - theta, 0)
+
+
+class RegressionToSimplex(BaseEstimator):
+    """
+    A very simple regressor of probability distributions.
+    Internally, this class works by invoking a multioutput SVR regressor,
+    followed by a mapping onto the probability simplex.
+
+    :param C: regularization parameter for SVR
+    """
+
+    def __init__(self, C=1):
+        self.C = C
+
+    def fit(self, X, y):
+        """
+        Learns the correction
+
+        :param X: array-like of shape `(n_instances, n_classes)` with uncorrected prevalence vectors
+        :param y: array-like of shape `(n_instances, n_classes)` with true prevalence vectors
+        :return: self
+        """
+        self.reg = MultiOutputRegressor(SVR(C=self.C), n_jobs=-1)
+        self.reg.fit(X, y)
+        return self
+
+    def predict(self, X):
+        """
+        Corrects vectors of prevalence values
+
+        :param X: array-like of shape `(n_instances, n_classes)` with uncorrected prevalence vectors
+        :return: array-like of shape `(n_instances, n_classes)` with corrected prevalence vectors
+        """
+        y_ = self.reg.predict(X)
+        y_ = np.asarray([projection_simplex_sort(y_i) for y_i in y_])
+        return y_
+
+
+class KDEyRegressor(BaseQuantifier):
+    """
+    This class implements a regressor-based correction on top of a quantifier.
+    The quantifier is taken to be KDEy-ML, which is assumed to be already trained (this
+    method simply loads a pickled object).
+    The method then optimizes a regressor that corrects prevalence vectors, using the
+    validation samples as training data.
+    The regressor is based on a multioutput SVR and relies on a post-processing step to
+    guarantee that the output lies on the probability simplex (see also RegressionToSimplex).
+    """
+
+    def __init__(self, kde_path, Cs=np.logspace(-3, 3, 7)):
+        self.kde_path = kde_path
+        self.Cs = Cs
+
+    def fit(self, val_data: AbstractProtocol):
+        print(f'loading kde from {self.kde_path}')
+        self.kdey = pickle.load(open(self.kde_path, 'rb'))
+
+        print('representing val data with kde')
+        pbar = tqdm(val_data(), total=val_data.total())
+        Xs, Ys = [], []
+        for sample, prev in pbar:
+            prev_hat = self.kdey.quantify(sample)
+            Xs.append(prev_hat)
+            Ys.append(prev)
+
+        Xs = np.asarray(Xs)
+        Ys = np.asarray(Ys)
+
+        def scorer(estimator, X, y):
+            y_hat = estimator.predict(X)
+            md = normalized_match_distance(y, y_hat)
+            return -md
+
+        grid = {'C': self.Cs}
+        optim = GridSearchCV(
+            RegressionToSimplex(), param_grid=grid, scoring=scorer, verbose=0, cv=10, n_jobs=64
+        ).fit(Xs, Ys)
+        self.regressor = optim.best_estimator_
+        return self
+
+    def quantify(self, instances):
+        prev_hat = self.kdey.quantify(instances)
+        return self.regressor.predict([prev_hat])[0]
+
+
+if __name__ == '__main__':
+    train, gen_val, _ = fetch_lequa2024(task='T3', data_home='./data', merge_T3=True)
+    kdey_r = KDEyRegressor('./models/T3/KDEy-ML.pkl')
+    kdey_r.fit(gen_val)
+    pickle.dump(kdey_r, open('./models/T3/KDEyRegressor.pkl', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
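As a quick sanity check of `projection_simplex_sort`, a worked example (mine, not part of the commit):

```python
import numpy as np

# projecting an unnormalized vector onto the probability simplex:
# sort descending -> u=[0.8, 0.6, -0.1], cssv=[-0.2, 0.4, 0.3], rho=2, theta=0.2
projection_simplex_sort(np.array([0.8, 0.6, -0.1]))
# -> array([0.6, 0.4, 0. ])   (non-negative, sums to 1, order preserved)
```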
@@ -1,15 +1,6 @@
 #!/bin/bash
 set -x
 
-# download the official scripts
-if [ ! -d "scripts" ]; then
-    echo "Downloading the official scripts from the LeQua 2024 github repo"
-    wget https://github.com/HLT-ISTI/LeQua2024_scripts/archive/refs/heads/main.zip
-    unzip main.zip
-    mv LeQua2024_scripts-main scripts
-    rm main.zip
-fi
-
 # T1: binary (n=2)
 # T2: multiclass (n=28)
 # T3: ordinal (n=5)
@@ -0,0 +1,120 @@
+import os
+from os.path import join
+import pandas as pd
+import quapy as qp
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+os.chdir('/home/moreo/QuaPy/LeQua2024/util_scripts')
+print(os.getcwd())
+
+qp.environ['SAMPLE_SIZE'] = 250
+
+true_prevs_path = '../TruePrevalences/T4.test_prevalences/T4/public/test_prevalences.txt'
+domain_prevs_path = '../T4_domain_prevalence/test_domain_prevalences.txt'
+folder = '../Results_CODALAB_2024/extracted/TASK_4'
+
+
+def load_result_file(path):
+    df = pd.read_csv(path, index_col=0)
+    id = df.index.to_numpy()
+    prevs = df.values
+    return id, prevs
+
+
+method_files = [
+    #'ACC.csv',
+    #'CC.csv',
+    #'DistMatching-y.csv',
+    #'KDEy.csv',
+    #'PACC.csv',
+    'PCC.csv',
+    #'SLD.csv',
+    #'TeamCUFE.csv',
+    #'TeamGMNet.csv',
+    'tobiaslotz.csv'
+]
+
+method_names_nice = {
+    'DistMatching-y': 'DM',
+    'TeamGMNet': 'UniOviedo(Team1)',
+    'tobiaslotz': 'Lamarr'
+}
+
+desired_order = [
+    'Lamarr',
+    'SLD',
+    'DM',
+    'KDEy',
+    'UniOviedo(Team1)'
+]
+desired_order = [
+    'PCC', 'Lamarr'
+]
+
+# load the true values (sentiment prevalence, domain prevalence)
+true_id, true_prevs = load_result_file(true_prevs_path)
+dom_id, dom_prevs = load_result_file(domain_prevs_path)
+assert (true_id == dom_id).all(), 'unmatched files'
+
+# define the loss for evaluation
+error_name = 'RAE'
+error_log = False
+
+if error_name == 'RAE':
+    err_function_ = qp.error.rae
+elif error_name == 'AE':
+    err_function_ = qp.error.ae
+else:
+    raise ValueError()
+
+if error_log:
+    error_name = f'log({error_name})'
+    err_function = lambda x, y: np.log(err_function_(x, y))
+else:
+    err_function = err_function_
+
+# load the participant and baseline results
+errors = {}
+for method_file in method_files:
+    method_name = method_file.replace('.csv', '')
+    id, method_prevs = load_result_file(join(folder, method_file))
+    print(method_file)
+    assert (true_id == id).all(), f'unmatched files for {method_file}'
+    method_error = err_function(true_prevs, method_prevs)
+    method_name = method_names_nice.get(method_name, method_name)
+    errors[method_name] = method_error
+
+dom_A_prevs = dom_prevs[:, 0]
+
+n_bins = 5
+bins = np.linspace(dom_A_prevs.min(), dom_A_prevs.max(), n_bins + 1)
+
+# create a DataFrame for the data
+df = pd.DataFrame({'dom_A_prevs': dom_A_prevs})
+for method, err in errors.items():
+    df[method] = err
+
+# assign each dom_A_prevs value to a bin
+df['bin'] = pd.cut(df['dom_A_prevs'], bins=bins, labels=False, include_lowest=True)
+
+# convert the DataFrame to long format
+df_long = df.melt(id_vars=['dom_A_prevs', 'bin'], value_vars=errors.keys(), var_name='Método', value_name='Error')
+
+# create the bin labels for the X axis
+bin_labels = [f"[{bins[i]:.3f}-{bins[i + 1]:.3f}" + (']' if i == n_bins-1 else ')') for i in range(n_bins)]
+df_long['bin_label'] = df_long['bin'].map(dict(enumerate(bin_labels)))
+
+# create the boxplot with seaborn
+plt.figure(figsize=(14, 8))
+sns.boxplot(x='bin', y='Error', hue='Método', data=df_long, palette='Set2', showfliers=False, hue_order=desired_order)
+
+# set the X-axis tick labels to the bin ranges
+plt.xticks(ticks=range(n_bins), labels=bin_labels, rotation=0)
+plt.xlabel("Prevalence of Books")
+plt.ylabel(error_name)
+#plt.title("Error boxplots per method within dom_A_prevs bins")
+plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
+plt.tight_layout()
+plt.grid(True, which='both', linestyle='--', linewidth=0.5)
+#plt.show()
+plt.savefig(f'./t4_{error_name}_pcc.png')
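This script scores each submission with `qp.error.rae`. For reference, a sketch of relative absolute error as I understand QuaPy implements it (assumption: QuaPy smooths both prevalence vectors with eps = 1/(2·SAMPLE_SIZE) to avoid division by zero; here SAMPLE_SIZE is 250, as set above):

```python
import numpy as np

def rae(p_true, p_hat, eps=1/(2*250)):
    # additive smoothing of both prevalence vectors (assumed to mirror qp.error.smooth)
    smooth = lambda p: (p + eps) / (eps * len(p) + 1)
    p_true, p_hat = smooth(p_true), smooth(p_hat)
    # mean of per-class relative absolute errors
    return np.mean(np.abs(p_hat - p_true) / p_true)
```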
@@ -0,0 +1,168 @@
+import os
+from os.path import join
+import pandas as pd
+
+from quapy.data.base import LabelledCollection
+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../')))
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), './')))
+#from LeQua2024.scripts import constants
+#from LeQua2024._lequa2024 import fetch_lequa2024
+import quapy as qp
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+from pathlib import Path
+import glob
+
+
+os.chdir('/home/moreo/QuaPy/LeQua2024')
+print(os.getcwd())
+
+
+qp.environ['SAMPLE_SIZE'] = 250
+
+TASK = 1
+
+true_prevs_path = f'./TruePrevalences/T{TASK}.test_prevalences/T{TASK}/public/test_prevalences.txt'
+folder = f'./Results_CODALAB_2024/extracted/TASK_{TASK}'
+
+
+def load_result_file(path):
+    df = pd.read_csv(path, index_col=0)
+    id = df.index.to_numpy()
+    prevs = df.values
+    return id, prevs
+
+
+method_files = glob.glob(f"{folder}/*.csv")
+
+
+method_names_nice = {
+    'DistMatching-y': 'DM',
+    'TeamGMNet': 'UniOviedo(Team1)',
+    'tobiaslotz': 'Lamarr'
+}
+
+exclude_methods = [
+    'TeamCUFE',
+    'hustav',
+    'PCC',
+    'CC'
+]
+
+
+# desired_order=[
+#     'Lamarr',
+#     'SLD',
+#     'DM',
+#     'KDEy',
+#     'UniOviedo(Team1)'
+# ]
+# desired_order=[
+#     'PCC', 'Lamarr'
+# ]
+
+# load the true values (sentiment prevalence, domain prevalence)
+true_id, true_prevs = load_result_file(true_prevs_path)
+
+
+# define the loss for evaluation
+error_name = 'RAE'
+error_log = False
+
+if error_name == 'RAE':
+    err_function_ = qp.error.rae
+elif error_name == 'AE':
+    err_function_ = qp.error.ae
+else:
+    raise ValueError()
+
+if error_log:
+    error_name = f'log({error_name})'
+    err_function = lambda x, y: np.log(err_function_(x, y))
+else:
+    err_function = err_function_
+
+
+def load_vector_documents(path):
+    """
+    Loads vectorized documents. In case the sample is unlabelled,
+    the labels returned are None
+
+    :param path: path to the data sample containing the raw documents
+    :return: a tuple with the documents (np.ndarray of shape `(n,256)`) and the labels (a np.ndarray of shape `(n,)` if
+        the sample is labelled, or None if the sample is unlabelled), with `n` the number of instances in the sample
+        (250 for T1 and T4, 1000 for T2, and 200 for T3)
+    """
+    D = pd.read_csv(path).to_numpy(dtype=float)
+    labelled = D.shape[1] == 257
+    if labelled:
+        X, y = D[:, 1:], D[:, 0].astype(int).flatten()
+    else:
+        X, y = D, None
+    return X, y
+
+
+#train_prevalence = fetch_lequa2024(task=f'T{TASK}', data_home='./data')
+train = LabelledCollection.load(f'/home/moreo/QuaPy/LeQua2024/data/lequa2024/T{TASK}/public/training_data.txt', loader_func=load_vector_documents)
+train_prev = train.prevalence()
+#train_prev = np.tile(train_prev, (len(true_id),1))
+
+from quapy.plot import error_by_drift
+
+# load the participant and baseline results
+method_names, estim_prevs = [], []
+for method_file in method_files:
+    method_name = Path(method_file).name.replace('.csv', '')
+    if method_name in exclude_methods:
+        continue
+    id, method_prevs = load_result_file(join(folder, method_name + '.csv'))
+    assert (true_id == id).all(), f'unmatched files for {method_file}'
+    method_name = method_names_nice.get(method_name, method_name)
+    method_names.append(method_name)
+    estim_prevs.append(method_prevs)
+
+true_prevs = [true_prevs] * len(method_names)
+tr_prevs = [train.prevalence()] * len(method_names)
+error_by_drift(method_names,
+               true_prevs,
+               estim_prevs,
+               tr_prevs,
+               error_name='mrae', show_std=True,
+               show_density=True, vlines=True, savepath=f'./util_scripts/t{TASK}_{error_name}_pcc.png')
+sys.exit()
+
+shift = qp.error.ae(train_prev, true_prevs)
+
+n_bins = 5
+bins = np.linspace(shift.min(), shift.max(), n_bins + 1)
+
+# create a DataFrame for the data
+df = pd.DataFrame({'dom_A_prevs': shift})
+for method, err in errors.items():
+    df[method] = err
+
+# assign each shift value to a bin
+df['bin'] = pd.cut(df['dom_A_prevs'], bins=bins, labels=False, include_lowest=True)
+
+# convert the DataFrame to long format
+df_long = df.melt(id_vars=['dom_A_prevs', 'bin'], value_vars=errors.keys(), var_name='Método', value_name='Error')
+
+# create the bin labels for the X axis
+bin_labels = [f"[{bins[i]:.3f}-{bins[i + 1]:.3f}" + (']' if i == n_bins-1 else ')') for i in range(n_bins)]
+df_long['bin_label'] = df_long['bin'].map(dict(enumerate(bin_labels)))
+
+# create the boxplot with seaborn
+plt.figure(figsize=(14, 8))
+sns.boxplot(x='bin', y='Error', hue='Método', data=df_long, palette='Set2', showfliers=False)
+
+# set the X-axis tick labels to the bin ranges
+plt.xticks(ticks=range(n_bins), labels=bin_labels, rotation=0)
+plt.xlabel("Amount of PPS between the training prevalence and the test prevalences, in terms of AE")
+plt.ylabel(error_name)
+#plt.title("Error boxplots per method within shift bins")
+plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
+plt.tight_layout()
+plt.grid(True, which='both', linestyle='--', linewidth=0.5)
+#plt.show()
+plt.savefig(f'./util_scripts/t{TASK}_{error_name}_pcc.png')
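Note that everything after `sys.exit()` in this script is unreachable, and the tail references an `errors` dict that is never defined here. That tail bins test samples by the amount of prior probability shift (PPS), measured as the AE between training and test prevalence vectors; for reference, a sketch consistent with my understanding of `qp.error.ae`:

```python
import numpy as np

def ae(p_true, p_hat):
    # absolute error between prevalence vectors: mean absolute difference per class;
    # with matrices of prevalence vectors, returns one value per row
    return np.abs(p_hat - p_true).mean(axis=-1)
```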
@@ -1,6 +1,6 @@
 from collections import defaultdict
 import matplotlib.pyplot as plt
-from matplotlib.cm import get_cmap
+from matplotlib.pyplot import get_cmap
 import numpy as np
 from matplotlib import cm
 from scipy.stats import ttest_ind_from_stats
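This last change likely tracks Matplotlib's API evolution: `matplotlib.cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in 3.9, while `matplotlib.pyplot.get_cmap` still works; the `matplotlib.colormaps` registry is the forward-compatible spelling. A sketch of the equivalent lookups:

```python
import matplotlib
import matplotlib.pyplot as plt

cmap = plt.get_cmap('tab10')          # what the code now imports and calls
cmap = matplotlib.colormaps['tab10']  # registry lookup; the long-term replacement
```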