Compare commits

...

10 Commits

10 changed files with 6437 additions and 12 deletions

View File

@ -1,4 +1,4 @@
Change Log 0.1.8 Change Log 0.1.8g
---------------- ----------------
- Added Kernel Density Estimation methods (KDEyML, KDEyCS, KDEyHD) as proposed in the paper: - Added Kernel Density Estimation methods (KDEyML, KDEyCS, KDEyHD) as proposed in the paper:

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -3,9 +3,12 @@ import pickle
import os import os
import sys import sys
from os.path import join from os.path import join
import numpy as np
from sklearn.linear_model import LogisticRegression as LR from sklearn.linear_model import LogisticRegression as LR
from scripts.constants import SAMPLE_SIZE from scripts.constants import SAMPLE_SIZE
from scripts.evaluate import normalized_match_distance
from LeQua2024._lequa2024 import LEQUA2024_TASKS, fetch_lequa2024, LEQUA2024_ZENODO from LeQua2024._lequa2024 import LEQUA2024_TASKS, fetch_lequa2024, LEQUA2024_ZENODO
from quapy.method.aggregative import * from quapy.method.aggregative import *
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
@ -35,11 +38,18 @@ def wrap_params(cls_params:dict, prefix:str):
def baselines(): def baselines():
q_params = wrap_params(lr_params, 'classifier') q_params = wrap_params(lr_params, 'classifier')
kde_params = {**q_params, 'bandwidth': np.linspace(0.01, 0.20, 20)}
dm_params = {**q_params, 'nbins': [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 64]}
yield CC(new_cls()), "CC", q_params yield CC(new_cls()), "CC", q_params
yield ACC(new_cls()), "ACC", q_params yield ACC(new_cls()), "ACC", q_params
yield PCC(new_cls()), "PCC", q_params yield PCC(new_cls()), "PCC", q_params
yield PACC(new_cls()), "PACC", q_params yield PACC(new_cls()), "PACC", q_params
yield SLD(new_cls()), "SLD", q_params
#yield KDEyML(new_cls()), "KDEy-ML", kde_params
#yield KDEyHD(new_cls()), "KDEy-HD", kde_params
# yield KDEyCS(new_cls()), "KDEy-CS", kde_params
#yield DMy(new_cls()), "DMy", dm_params
def main(args): def main(args):
@ -77,7 +87,7 @@ def main(args):
quantifier, quantifier,
param_grid, param_grid,
protocol=gen_val, protocol=gen_val,
error=qp.error.mrae, error=normalized_match_distance if args.task=='T3' else qp.error.mrae,
refit=False, refit=False,
verbose=True, verbose=True,
n_jobs=-1 n_jobs=-1

View File

@ -7,6 +7,7 @@ from tqdm import tqdm
from scripts.data import gen_load_samples from scripts.data import gen_load_samples
from glob import glob from glob import glob
from scripts import constants from scripts import constants
from regressor import KDEyRegressor, RegressionToSimplex
""" """
LeQua2024 prediction script LeQua2024 prediction script

133
LeQua2024/regressor.py Normal file
View File

@ -0,0 +1,133 @@
import pickle
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from LeQua2024._lequa2024 import fetch_lequa2024
from quapy.data import LabelledCollection
from quapy.protocol import AbstractProtocol
from quapy.method.base import BaseQuantifier
import quapy.functional as F
from tqdm import tqdm
from scripts.evaluate import normalized_match_distance, match_distance
def projection_simplex_sort(unnormalized_arr) -> np.ndarray:
    """Return the projection of a point onto the probability simplex.

    [This code is taken from the devel branch, that will correspond to the future QuaPy 0.1.9]

    The implementation is adapted from Mathieu Blondel's BSD-licensed
    `implementation <https://gist.github.com/mblondel/6f3b7aaad90606b98f71>`_
    (see function `projection_simplex_sort` in their repo), which accompanies the paper

    Mathieu Blondel, Akinori Fujino, and Naonori Ueda.
    Large-scale Multiclass Support Vector Machine Training via Euclidean Projection onto the Simplex,
    ICPR 2014, `URL <http://www.mblondel.org/publications/mblondel-icpr2014.pdf>`_

    :param `unnormalized_arr`: point in n-dimensional space, shape `(n,)`
    :return: projection of `unnormalized_arr` onto the (n-1)-dimensional probability simplex, shape `(n,)`
    """
    point = np.asarray(unnormalized_arr)
    dims = len(point)

    # coordinates in decreasing order, and their running sum shifted by the target mass (1)
    descending = np.sort(point)[::-1]
    shifted_cumsum = np.cumsum(descending) - 1.0
    ranks = np.arange(1, dims + 1)

    # positions for which the coordinate still exceeds the running threshold
    active = (descending - shifted_cumsum / ranks) > 0

    # the last active position determines the shift applied to every coordinate
    rho = ranks[active][-1]
    theta = shifted_cumsum[active][-1] / float(rho)

    # shift and clip the negative coordinates to zero
    return np.maximum(point - theta, 0)
class RegressionToSimplex(BaseEstimator):
    """
    A very simple regressor of probability distributions.
    Internally, this class works by invoking a multi-output SVR regressor
    followed by a mapping of every prediction onto the probability simplex.

    :param C: regularization parameter for SVR
    """

    def __init__(self, C=1):
        self.C = C

    def fit(self, X, y):
        """
        Learns the correction

        :param X: array-like of shape `(n_instances, n_classes)` with uncorrected prevalence vectors
        :param y: array-like of shape `(n_instances, n_classes)` with true prevalence vectors
        :return: self
        """
        # one independent SVR per output dimension (i.e., per class)
        self.reg = MultiOutputRegressor(SVR(C=self.C), n_jobs=-1)
        self.reg.fit(X, y)
        return self

    def predict(self, X):
        """
        Corrects one or more vectors of prevalence values

        :param X: array-like of shape `(n_instances, n_classes)` with uncorrected prevalence vectors,
            or of shape `(n_classes,)` with one single vector of uncorrected prevalence values
        :return: np.ndarray with the corrected prevalence vectors; the shape is `(n_instances, n_classes)`
            for 2-dimensional inputs, and `(n_classes,)` when one single vector was given
        """
        # the underlying regressor only takes 2-dimensional inputs; a single vector is
        # wrapped into a one-row batch and unwrapped again before returning
        single_vector = np.ndim(X) == 1
        if single_vector:
            X = np.asarray(X).reshape(1, -1)

        raw_predictions = self.reg.predict(X)

        # the raw regressor outputs are not guaranteed to be valid distributions
        corrected = np.asarray([projection_simplex_sort(row) for row in raw_predictions])
        return corrected[0] if single_vector else corrected
class KDEyRegressor(BaseQuantifier):
    """
    This class implements a regressor-based correction on top of a quantifier.
    The quantifier is taken to be KDEy-ML, which is considered to be already trained (this
    method simply loads a pickled object).
    The method then optimizes a regressor that corrects prevalence vectors using the
    validation samples as training data.
    The regressor is based on a multioutput SVR and relies on a post-processing to guarantee
    that the output lies on the probability simplex (see also RegressionToSimplex)

    :param kde_path: path to the pickled, already trained, quantifier
    :param Cs: values of the SVR regularization parameter `C` to explore during model selection
    """

    def __init__(self, kde_path, Cs=np.logspace(-3,3,7)):
        self.kde_path = kde_path
        self.Cs = Cs

    def fit(self, val_data: AbstractProtocol):
        """
        Loads the pickled quantifier and trains the correction on the validation samples

        :param val_data: protocol that, when called, yields pairs `(sample, prevalence)` and
            that reports the number of samples via `total()`
        :return: self
        """
        print(f'loading kde from {self.kde_path}')
        # NOTE: unpickling can execute arbitrary code; `kde_path` must point to a trusted file.
        # The context manager guarantees the file handle is released once the model is loaded.
        with open(self.kde_path, 'rb') as model_file:
            self.kdey = pickle.load(model_file)

        print('representing val data with kde')
        pbar = tqdm(val_data(), total=val_data.total())
        # every validation sample becomes one training pair for the regressor:
        # (prevalence estimated by the quantifier, true prevalence)
        Xs, Ys = [], []
        for sample, prev in pbar:
            prev_hat = self.kdey.quantify(sample)
            Xs.append(prev_hat)
            Ys.append(prev)
        Xs = np.asarray(Xs)
        Ys = np.asarray(Ys)

        def scorer(estimator, X, y):
            # the distance is negated because GridSearchCV selects the highest score
            y_hat = estimator.predict(X)
            md = normalized_match_distance(y, y_hat)
            return (-md)

        grid = {'C': self.Cs}
        # NOTE(review): cv=10 and n_jobs=64 are hard-coded; n_jobs=64 presumably matches the
        # machine this was run on -- confirm before running elsewhere
        optim = GridSearchCV(
            RegressionToSimplex(), param_grid=grid, scoring=scorer, verbose=0, cv=10, n_jobs=64
        ).fit(Xs, Ys)
        self.regressor = optim.best_estimator_
        return self

    def quantify(self, instances):
        """
        Estimates the prevalence of a sample and corrects the estimate with the regressor

        :param instances: the sample, in the format accepted by the `quantify` method of the
            loaded quantifier
        :return: the corrected prevalence vector
        """
        prev_hat = self.kdey.quantify(instances)
        # the regressor works on batches: wrap the single vector and unwrap the single result
        return self.regressor.predict([prev_hat])[0]
if __name__ == '__main__':
    # only the validation protocol is needed here; the training set and the test
    # protocol returned by fetch_lequa2024 are discarded
    _, gen_val, _ = fetch_lequa2024(task='T3', data_home='./data', merge_T3=True)
    kdey_r = KDEyRegressor('./models/T3/KDEy-ML.pkl')
    kdey_r.fit(gen_val)
    # the context manager makes sure the pickle is flushed and the file is closed
    with open('./models/T3/KDEyRegressor.pkl', 'wb') as model_file:
        pickle.dump(kdey_r, model_file, protocol=pickle.HIGHEST_PROTOCOL)

View File

@ -1,15 +1,6 @@
#!/bin/bash #!/bin/bash
set -x set -x
# download the official scripts
if [ ! -d "scripts" ]; then
echo "Downloading the official scripts from the LeQua 2024 github repo"
wget https://github.com/HLT-ISTI/LeQua2024_scripts/archive/refs/heads/main.zip
unzip main.zip
mv LeQua2024_scripts-main scripts
rm main.zip
fi
# T1: binary (n=2) # T1: binary (n=2)
# T2: multiclass (n=28) # T2: multiclass (n=28)
# T3: ordinal (n=5) # T3: ordinal (n=5)

View File

@ -0,0 +1,120 @@
import os
from os.path import join
import pandas as pd
import quapy as qp
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
os.chdir('/home/moreo/QuaPy/LeQua2024/util_scripts')
print(os.getcwd())
qp.environ['SAMPLE_SIZE']=250
true_prevs_path = '../TruePrevalences/T4.test_prevalences/T4/public/test_prevalences.txt'
domain_prevs_path = '../T4_domain_prevalence/test_domain_prevalences.txt'
folder = '../Results_CODALAB_2024/extracted/TASK_4'
def load_result_file(path):
    """
    Loads a result file in csv format, using the first column as the index.

    :param path: path to the csv file
    :return: a tuple `(ids, prevs)` in which `ids` is a np.ndarray with the index values and
        `prevs` is a np.ndarray with the values of the remaining columns, one row per index
    """
    df = pd.read_csv(path, index_col=0)
    # named `sample_ids` rather than `id` so that the builtin `id` is not shadowed
    sample_ids = df.index.to_numpy()
    prevs = df.values
    return sample_ids, prevs
method_files = [
#'ACC.csv',
#'CC.csv',
#'DistMatching-y.csv',
#'KDEy.csv',
#'PACC.csv',
'PCC.csv',
#'SLD.csv',
#'TeamCUFE.csv',
#'TeamGMNet.csv',
'tobiaslotz.csv'
]
method_names_nice={
'DistMatching-y': 'DM',
'TeamGMNet': 'UniOviedo(Team1)',
'tobiaslotz': 'Lamarr'
}
desired_order=[
'Lamarr',
'SLD',
'DM',
'KDEy',
'UniOviedo(Team1)'
]
desired_order=[
'PCC', 'Lamarr'
]
# load the true values (sentiment prevalence, domain prevalence)
true_id, true_prevs = load_result_file(true_prevs_path)
dom_id, dom_prevs = load_result_file(domain_prevs_path)
assert (true_id == dom_id).all(), 'unmatched files'
# define the loss for evaluation
error_name = 'RAE'
error_log = False
if error_name == 'RAE':
err_function_ = qp.error.rae
elif error_name == 'AE':
err_function_ = qp.error.ae
else:
raise ValueError()
if error_log:
error_name = f'log({error_name})'
err_function = lambda x,y: np.log(err_function_(x,y))
else:
err_function = err_function_
# load the participant and baseline results
errors = {}
for method_file in method_files:
method_name = method_file.replace('.csv', '')
id, method_prevs = load_result_file(join(folder, method_file))
print(method_file)
assert (true_id == id).all(), f'unmatched files for {method_file}'
method_error = err_function(true_prevs, method_prevs)
method_name = method_names_nice.get(method_name, method_name)
errors[method_name] = method_error
dom_A_prevs = dom_prevs[:,0]
n_bins = 5
bins = np.linspace(dom_A_prevs.min(), dom_A_prevs.max(), n_bins + 1)
# Build a DataFrame holding the data
df = pd.DataFrame({'dom_A_prevs': dom_A_prevs})
for method, err in errors.items():
df[method] = err
# Assign each value of dom_A_prevs to a bin
df['bin'] = pd.cut(df['dom_A_prevs'], bins=bins, labels=False, include_lowest=True)
# Convert the DataFrame to long format
df_long = df.melt(id_vars=['dom_A_prevs', 'bin'], value_vars=errors.keys(), var_name='Método', value_name='Error')
# Build the bin labels for the X axis
bin_labels = [f"[{bins[i]:.3f}-{bins[i + 1]:.3f}" + (']' if i == n_bins-1 else ')') for i in range(n_bins)]
df_long['bin_label'] = df_long['bin'].map(dict(enumerate(bin_labels)))
# Draw the boxplot with Seaborn
plt.figure(figsize=(14, 8))
sns.boxplot(x='bin', y='Error', hue='Método', data=df_long, palette='Set2', showfliers=False, hue_order=desired_order)
# Label the X-axis ticks with the bin ranges
plt.xticks(ticks=range(n_bins), labels=bin_labels, rotation=0)
plt.xlabel("Prevalence of Books")
plt.ylabel(error_name)
#plt.title("Boxplots de Errores por Método dentro de Bins de dom_A_prevs")
plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
plt.tight_layout()
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
#plt.show()
plt.savefig(f'./t4_{error_name}_pcc.png')

View File

@ -0,0 +1,168 @@
import os
from os.path import join
import pandas as pd
from quapy.data.base import LabelledCollection
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../')))
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), './')))
#from LeQua2024.scripts import constants
#from LeQua2024._lequa2024 import fetch_lequa2024
import quapy as qp
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import glob
os.chdir('/home/moreo/QuaPy/LeQua2024')
print(os.getcwd())
qp.environ['SAMPLE_SIZE']=250
TASK=1
true_prevs_path = f'./TruePrevalences/T{TASK}.test_prevalences/T{TASK}/public/test_prevalences.txt'
folder = F'./Results_CODALAB_2024/extracted/TASK_{TASK}'
def load_result_file(path):
    """
    Loads a result file in csv format, using the first column as the index.

    :param path: path to the csv file
    :return: a tuple `(ids, prevs)` in which `ids` is a np.ndarray with the index values and
        `prevs` is a np.ndarray with the values of the remaining columns, one row per index
    """
    df = pd.read_csv(path, index_col=0)
    # named `sample_ids` rather than `id` so that the builtin `id` is not shadowed
    sample_ids = df.index.to_numpy()
    prevs = df.values
    return sample_ids, prevs
method_files = glob.glob(f"{folder}/*.csv")
method_names_nice={
'DistMatching-y': 'DM',
'TeamGMNet': 'UniOviedo(Team1)',
'tobiaslotz': 'Lamarr'
}
exclude_methods=[
'TeamCUFE',
'hustav',
'PCC',
'CC'
]
# desired_order=[
# 'Lamarr',
# 'SLD',
# 'DM',
# 'KDEy',
# 'UniOviedo(Team1)'
# ]
# desired_order=[
# 'PCC', 'Lamarr'
# ]
# load the true prevalence values of the test samples
true_id, true_prevs = load_result_file(true_prevs_path)
# define the loss for evaluation
error_name = 'RAE'
error_log = False
if error_name == 'RAE':
err_function_ = qp.error.rae
elif error_name == 'AE':
err_function_ = qp.error.ae
else:
raise ValueError()
if error_log:
error_name = f'log({error_name})'
err_function = lambda x,y: np.log(err_function_(x,y))
else:
err_function = err_function_
def load_vector_documents(path):
    """
    Reads a csv file of vectorized documents. A file with exactly 257 columns is
    taken to be labelled, with the label in the first column; any other file is
    taken to be unlabelled, in which case the labels returned are None

    :param path: path to the data sample containing the vectorized documents
    :return: a tuple with the documents (np.ndarray of shape `(n,256)`) and the labels (a np.ndarray of shape `(n,)` if
        the sample is labelled, or None if the sample is unlabelled), with `n` the number of instances in the sample
        (250 for T1 and T4, 1000 for T2, and 200 for T3)
    """
    data = pd.read_csv(path).to_numpy(dtype=float)

    # unlabelled sample: every column is a feature
    if data.shape[1] != 257:
        return data, None

    # labelled sample: the first column holds the label, the remaining ones the features
    labels = data[:, 0].astype(int).flatten()
    vectors = data[:, 1:]
    return vectors, labels
#train_prevalence = fetch_lequa2024(task=f'T{TASK}', data_home='./data')
train = LabelledCollection.load(f'/home/moreo/QuaPy/LeQua2024/data/lequa2024/T{TASK}/public/training_data.txt', loader_func=load_vector_documents)
train_prev = train.prevalence()
#train_prev = np.tile(train_prev, (len(true_id),1))
from quapy.plot import error_by_drift
# load the participant and baseline results
method_names, estim_prevs = [], []
for method_file in method_files:
method_name = Path(method_file).name.replace('.csv', '')
if method_name in exclude_methods:
continue
id, method_prevs = load_result_file(join(folder, method_name+'.csv'))
assert (true_id == id).all(), f'unmatched files for {method_file}'
method_name = method_names_nice.get(method_name, method_name)
method_names.append(method_name)
estim_prevs.append(method_prevs)
true_prevs = [true_prevs]*len(method_names)
tr_prevs =[train.prevalence()]*len(method_names)
error_by_drift(method_names,
true_prevs,
estim_prevs,
tr_prevs,
error_name='mrae', show_std=True,
show_density=True, vlines=True, savepath=f'./util_scripts/t{TASK}_{error_name}_pcc.png')
sys.exit()
shift=qp.error.ae(train_prev, true_prevs)
n_bins = 5
bins = np.linspace(shift.min(), shift.max(), n_bins + 1)
# Build a DataFrame holding the data
df = pd.DataFrame({'dom_A_prevs': shift})
for method, err in errors.items():
df[method] = err
# Assign each value of dom_A_prevs to a bin
df['bin'] = pd.cut(df['dom_A_prevs'], bins=bins, labels=False, include_lowest=True)
# Convert the DataFrame to long format
df_long = df.melt(id_vars=['dom_A_prevs', 'bin'], value_vars=errors.keys(), var_name='Método', value_name='Error')
# Build the bin labels for the X axis
bin_labels = [f"[{bins[i]:.3f}-{bins[i + 1]:.3f}" + (']' if i == n_bins-1 else ')') for i in range(n_bins)]
df_long['bin_label'] = df_long['bin'].map(dict(enumerate(bin_labels)))
# Draw the boxplot with Seaborn
plt.figure(figsize=(14, 8))
sns.boxplot(x='bin', y='Error', hue='Método', data=df_long, palette='Set2', showfliers=False)
# Label the X-axis ticks with the bin ranges
plt.xticks(ticks=range(n_bins), labels=bin_labels, rotation=0)
plt.xlabel("Amount of PPS between the training prevalence and the test prevalences, in terms of AE ")
plt.ylabel(error_name)
#plt.title("Boxplots de Errores por Método dentro de Bins de dom_A_prevs")
plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
plt.tight_layout()
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
#plt.show()
plt.savefig(f'./util_scripts/t{TASK}_{error_name}_pcc.png')

View File

@ -1,6 +1,6 @@
from collections import defaultdict from collections import defaultdict
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from matplotlib.cm import get_cmap from matplotlib.pyplot import get_cmap
import numpy as np import numpy as np
from matplotlib import cm from matplotlib import cm
from scipy.stats import ttest_ind_from_stats from scipy.stats import ttest_ind_from_stats