baseline performance test updated

Lorenzo Volpi 2023-09-26 07:58:40 +02:00
parent 575c1dd6a1
commit db129c4093
11 changed files with 136 additions and 72 deletions

View File

@@ -201,13 +201,13 @@ def rca_score(
    ]
    results = []
    for test in protocol():
-        [f_prev, t_prev] = test.prevalence()
        try:
+            [f_prev, t_prev] = test.prevalence()
            test_pred = c_model_predict(test.X)
            c_model2 = rca.clone_fit(c_model, test.X, test_pred)
            c_model2_predict = getattr(c_model2, predict_method)
            val_pred2 = c_model2_predict(validation.X)
-            rca_score = 1.0 - rca.get_score(val_pred1, val_pred2, validation.y)
+            rca_score = rca.get_score(val_pred1, val_pred2, validation.y)
            results.append({k: v for k, v in zip(cols, [f_prev, t_prev, rca_score])})
        except ValueError:
            results.append({k: v for k, v in zip(cols, [f_prev, t_prev, float("nan")])})
@@ -248,7 +248,7 @@ def rca_star_score(
            c_model2 = rca.clone_fit(c_model, test.X, test_pred)
            c_model2_predict = getattr(c_model2, predict_method)
            val2_pred2 = c_model2_predict(validation2.X)
-            rca_star_score = 1.0 - rca.get_score(val2_pred1, val2_pred2, validation2.y)
+            rca_star_score = rca.get_score(val2_pred1, val2_pred2, validation2.y)
            results.append(
                {k: v for k, v in zip(cols, [f_prev, t_prev, rca_star_score])}
            )
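Both baselines accumulate one record per test sample, mapping the tuples in cols (defined above this hunk and not shown here) to the sample's prevalence pair plus the score. A rough, hypothetical illustration of one such record; the ("base", "F")/("base", "T") naming is an assumption taken from how the comparison script later joins these frames, and ("rca", "score") is a placeholder name:

import pandas as pd

# hypothetical per-sample record; column tuples are assumptions, not the real `cols`
record = {
    ("base", "F"): 0.7,      # negative-class prevalence of the test sample
    ("base", "T"): 0.3,      # positive-class prevalence of the test sample
    ("rca", "score"): 0.92,  # rca.get_score(...) on the validation predictions
}
df = pd.DataFrame([record], columns=pd.MultiIndex.from_tuples(list(record.keys())))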

View File

@@ -1,3 +1,4 @@
+from operator import index
 from typing import Tuple
 import numpy as np
 from quapy.data.base import LabelledCollection
@@ -18,11 +19,29 @@ def get_spambase() -> Tuple[LabelledCollection]:
    train, validation = train.split_stratified(train_prop=TRAIN_VAL_PROP)
    return train, validation, test

+# >>> fetch_rcv1().target_names
+# array(['C11', 'C12', 'C13', 'C14', 'C15', 'C151', 'C1511', 'C152', 'C16',
+# 'C17', 'C171', 'C172', 'C173', 'C174', 'C18', 'C181', 'C182',
+# 'C183', 'C21', 'C22', 'C23', 'C24', 'C31', 'C311', 'C312', 'C313',
+# 'C32', 'C33', 'C331', 'C34', 'C41', 'C411', 'C42', 'CCAT', 'E11',
+# 'E12', 'E121', 'E13', 'E131', 'E132', 'E14', 'E141', 'E142',
+# 'E143', 'E21', 'E211', 'E212', 'E31', 'E311', 'E312', 'E313',
+# 'E41', 'E411', 'E51', 'E511', 'E512', 'E513', 'E61', 'E71', 'ECAT',
+# 'G15', 'G151', 'G152', 'G153', 'G154', 'G155', 'G156', 'G157',
+# 'G158', 'G159', 'GCAT', 'GCRIM', 'GDEF', 'GDIP', 'GDIS', 'GENT',
+# 'GENV', 'GFAS', 'GHEA', 'GJOB', 'GMIL', 'GOBIT', 'GODD', 'GPOL',
+# 'GPRO', 'GREL', 'GSCI', 'GSPO', 'GTOUR', 'GVIO', 'GVOTE', 'GWEA',
+# 'GWELF', 'M11', 'M12', 'M13', 'M131', 'M132', 'M14', 'M141',
+# 'M142', 'M143', 'MCAT'], dtype=object)
-def get_rcv1(sample_size=100):
+def get_rcv1(target:str):
+    sample_size = qp.environ["SAMPLE_SIZE"]
    n_train = 23149
    dataset = fetch_rcv1()

+    if target not in dataset.target_names:
+        raise ValueError("Invalid target")

    def dataset_split(data, labels, classes=[0, 1]) -> Tuple[LabelledCollection]:
        all_train_d, test_d = data[:n_train, :], data[n_train:, :]
        all_train_l, test_l = labels[:n_train], labels[n_train:]
@@ -31,14 +50,13 @@ def get_rcv1(sample_size=100):
        train, validation = all_train.split_stratified(train_prop=TRAIN_VAL_PROP)
        return train, validation, test

-    target_labels = [
-        (target, dataset.target[:, ind].toarray().flatten())
-        for (ind, target) in enumerate(dataset.target_names)
-    ]
-    filtered_target_labels = filter(
-        lambda _, labels: np.sum(labels[n_train:]) >= sample_size, target_labels
-    )
-    return {
-        target: dataset_split(dataset.data, labels, classes=[0, 1])
-        for (target, labels) in filtered_target_labels
-    }
+    target_index = np.where(dataset.target_names == target)[0]
+    target_labels = dataset.target[:, target_index].toarray().flatten()

+    if np.sum(target_labels[n_train:]) < sample_size:
+        raise ValueError("Target has too few positive samples")

+    d = dataset_split(dataset.data, target_labels, classes=[0, 1])
+    return d
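A minimal usage sketch of the reworked loader, outside the diff: it assumes qp.environ["SAMPLE_SIZE"] has been set beforehand (as the comparison script in this commit does) and uses "CCAT" as an example target from the list quoted above.

import quapy as qp

from quacc.dataset import get_rcv1

qp.environ["SAMPLE_SIZE"] = 100  # read inside get_rcv1 to reject overly rare targets
# raises ValueError for an unknown target, or one with fewer positive
# test samples than SAMPLE_SIZE
train, validation, test = get_rcv1("CCAT")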

View File

@@ -8,18 +8,28 @@ def from_name(err_name):
    else:
        return qp.error.from_name(err_name)

+# def f1(prev):
+# # https://github.com/dice-group/gerbil/wiki/Precision,-Recall-and-F1-measure
+# if prev[0] == 0 and prev[1] == 0 and prev[2] == 0:
+# return 1.0
+# elif prev[0] == 0 and prev[1] > 0 and prev[2] == 0:
+# return 0.0
+# elif prev[0] == 0 and prev[1] == 0 and prev[2] > 0:
+# return float('NaN')
+# else:
+# recall = prev[0] / (prev[0] + prev[1])
+# precision = prev[0] / (prev[0] + prev[2])
+# return 2 * (precision * recall) / (precision + recall)

def f1(prev):
-    # https://github.com/dice-group/gerbil/wiki/Precision,-Recall-and-F1-measure
-    if prev[0] == 0 and prev[1] == 0 and prev[2] == 0:
+    den = (2*prev[3]) + prev[1] + prev[2]
+    if den == 0:
        return 1.0
-    elif prev[0] == 0 and prev[1] > 0 and prev[2] == 0:
-        return 0.0
-    elif prev[0] == 0 and prev[1] == 0 and prev[2] > 0:
-        return float('NaN')
-    else:
-        recall = prev[0] / (prev[0] + prev[1])
-        precision = prev[0] / (prev[0] + prev[2])
-        return 2 * (precision * recall) / (precision + recall)
+    return (2*prev[3])/den

def f1e(prev):
    return 1 - f1(prev)

+def mae(prev):
+    return (prev[1] + prev[2]) / sum(prev)
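A quick worked example of the new metrics. The prev vector appears to follow the (TN, FP, FN, TP) ordering used for the prevalence columns in evaluation_report, so f1 reduces to the usual 2*TP / (2*TP + FP + FN) and mae to the plain misclassification rate:

# assumed ordering: prev = [TN, FP, FN, TP], matching the
# ("true"/"estim", ["TN", "FP", "FN", "TP"]) columns built in evaluation_report
prev = [0.50, 0.10, 0.10, 0.30]

f1_val = (2 * prev[3]) / ((2 * prev[3]) + prev[1] + prev[2])  # 0.60 / 0.80 = 0.75
f1e_val = 1 - f1_val                                          # 0.25
mae_val = (prev[1] + prev[2]) / sum(prev)                     # (FP + FN) / total = 0.20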

View File

@@ -80,55 +80,63 @@ def evaluation_report(
    protocol: AbstractStochasticSeededProtocol,
    error_metrics: Iterable[Union[str, Callable]] = "all",
    aggregate: bool = True,
+    prevalence: bool = True,
):
    def _report_columns(err_names):
        base_cols = list(itertools.product(["base"], ["F", "T"]))
        prev_cols = list(itertools.product(["true", "estim"], ["TN", "FP", "FN", "TP"]))
        err_cols = list(itertools.product(["errors"], err_names))
-        return base_cols + prev_cols, err_cols
+        return base_cols, prev_cols, err_cols

    base_prevs, true_prevs, estim_prevs = estimate(estimator, protocol)

    if error_metrics == "all":
-        error_metrics = ["ae", "f1"]
+        error_metrics = ["mae", "f1"]

    error_funcs = [
        error.from_name(e) if isinstance(e, str) else e for e in error_metrics
    ]
    assert all(hasattr(e, "__call__") for e in error_funcs), "invalid error function"
    error_names = [e.__name__ for e in error_funcs]

-    error_cols = error_names.copy()
-    if "f1" in error_cols:
-        error_cols.remove("f1")
-        error_cols.extend(["f1_true", "f1_estim"])
-    if "f1e" in error_cols:
-        error_cols.remove("f1e")
-        error_cols.extend(["f1e_true", "f1e_estim"])
+    error_cols = []
+    for err in error_names:
+        if err == "mae":
+            error_cols.extend(["mae_estim", "mae_true"])
+        elif err == "f1":
+            error_cols.extend(["f1_estim", "f1_true"])
+        elif err == "f1e":
+            error_cols.extend(["f1e_estim", "f1e_true"])
+        else:
+            error_cols.append(err)

    # df_cols = ["base_prev", "true_prev", "estim_prev"] + error_names
-    prev_cols, err_cols = _report_columns(error_cols)
+    base_cols, prev_cols, err_cols = _report_columns(error_cols)

    lst = []
    for base_prev, true_prev, estim_prev in zip(base_prevs, true_prevs, estim_prevs):
-        series = {
-            k: v
-            for (k, v) in zip(
-                prev_cols, np.concatenate((base_prev, true_prev, estim_prev), axis=0)
-            )
-        }
-        for error_name, error_metric in zip(error_names, error_funcs):
-            if error_name == "f1e":
-                series[("errors", "f1e_true")] = error_metric(true_prev)
-                series[("errors", "f1e_estim")] = error_metric(estim_prev)
-                continue
-            if error_name == "f1":
-                f1_true, f1_estim = error_metric(true_prev), error_metric(estim_prev)
-                series[("errors", "f1_true")] = f1_true
-                series[("errors", "f1_estim")] = f1_estim
-                continue
-            score = error_metric(true_prev, estim_prev)
-            series[("errors", error_name)] = score
+        if prevalence:
+            series = {
+                k: v
+                for (k, v) in zip(
+                    base_cols + prev_cols,
+                    np.concatenate((base_prev, true_prev, estim_prev), axis=0),
+                )
+            }
+            df_cols = base_cols + prev_cols + err_cols
+        else:
+            series = {k: v for (k, v) in zip(base_cols, base_prev)}
+            df_cols = base_cols + err_cols

+        for err in error_cols:
+            error_funcs = {
+                "mae_true": lambda: error.mae(true_prev),
+                "mae_estim": lambda: error.mae(estim_prev),
+                "f1_true": lambda: error.f1(true_prev),
+                "f1_estim": lambda: error.f1(estim_prev),
+                "f1e_true": lambda: error.f1e(true_prev),
+                "f1e_estim": lambda: error.f1e(estim_prev),
+            }
+            series[("errors", err)] = error_funcs[err]()

        lst.append(series)
@@ -136,6 +144,6 @@ def evaluation_report(
    df = pd.DataFrame(
        lst,
-        columns=pd.MultiIndex.from_tuples(prev_cols + err_cols),
+        columns=pd.MultiIndex.from_tuples(df_cols),
    )

    return df
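For reference, a small standalone sketch of the column tuples the report now produces, mirroring _report_columns and assuming the default "mae"/"f1" metrics:

import itertools

err_names = ["mae_estim", "mae_true", "f1_estim", "f1_true"]
base_cols = list(itertools.product(["base"], ["F", "T"]))
prev_cols = list(itertools.product(["true", "estim"], ["TN", "FP", "FN", "TP"]))
err_cols = list(itertools.product(["errors"], err_names))

print(base_cols + prev_cols + err_cols)  # df_cols when prevalence=True
print(base_cols + err_cols)              # df_cols when prevalence=False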

View File

@@ -2,6 +2,7 @@ import pandas as pd
import quapy as qp
from quapy.protocol import APP
from sklearn.linear_model import LogisticRegression

+from quacc import utils
import quacc.evaluation as eval
import quacc.baseline as baseline
@@ -10,7 +11,7 @@ from quacc.estimator import (
    MulticlassAccuracyEstimator,
)

-from quacc.dataset import get_imdb, get_spambase
+from quacc.dataset import get_imdb, get_rcv1, get_spambase

qp.environ["SAMPLE_SIZE"] = 100
@@ -109,25 +110,21 @@ def estimate_comparison():
    estimator = BinaryQuantifierAccuracyEstimator(model)
    estimator.fit(validation)
-    df = eval.evaluation_report(estimator, protocol)
+    df = eval.evaluation_report(estimator, protocol, prevalence=False)

-    df_index = [("base", "F"), ("base", "T")]
-    atc_mc_df = baseline.atc_mc(model, validation, protocol)
-    atc_ne_df = baseline.atc_ne(model, validation, protocol)
-    doc_feat_df = baseline.doc_feat(model, validation, protocol)
-    rca_df = baseline.rca_score(model, validation, protocol)
-    rca_star_df = baseline.rca_star_score(model, validation, protocol)
-    bbse_df = baseline.bbse_score(model, validation, protocol)
-    df = df.join(atc_mc_df.set_index(df_index), on=df_index)
-    df = df.join(atc_ne_df.set_index(df_index), on=df_index)
-    df = df.join(doc_feat_df.set_index(df_index), on=df_index)
-    df = df.join(rca_df.set_index(df_index), on=df_index)
-    df = df.join(rca_star_df.set_index(df_index), on=df_index)
-    df = df.join(bbse_df.set_index(df_index), on=df_index)
+    df = utils.combine_dataframes(
+        baseline.atc_mc(model, validation, protocol),
+        baseline.atc_ne(model, validation, protocol),
+        baseline.doc_feat(model, validation, protocol),
+        baseline.rca_score(model, validation, protocol),
+        baseline.rca_star_score(model, validation, protocol),
+        baseline.bbse_score(model, validation, protocol),
+        df,
+        df_index=[("base", "F"), ("base", "T")]
+    )

    print(df.to_string())
    print(df.to_latex(float_format="{:.4f}".format))
+    print(utils.avg_group_report(df).to_latex(float_format="{:.4f}".format))

def main():
    estimate_comparison()

quacc/utils.py Normal file
View File

@@ -0,0 +1,31 @@
import functools

import pandas as pd


def combine_dataframes(*dfs, df_index=[]) -> pd.DataFrame:
    if len(dfs) < 1:
        raise ValueError
    if len(dfs) == 1:
        return dfs[0]

    df = dfs[0]
    for ndf in dfs[1:]:
        df = df.join(ndf.set_index(df_index), on=df_index)

    return df


def avg_group_report(df: pd.DataFrame) -> pd.DataFrame:
    def _reduce_func(s1, s2):
        return {
            (n1, n2): v + s2[(n1, n2)] for ((n1, n2), v) in s1.items()
        }

    lst = df.to_dict(orient="records")[1:-1]
    summed_series = functools.reduce(_reduce_func, lst)
    idx = df.columns.drop([("base", "T"), ("base", "F")])
    avg_report = {
        (n1, n2): (v / len(lst))
        for ((n1, n2), v) in summed_series.items()
        if n1 != "base"
    }
    return pd.DataFrame([avg_report], columns=idx)
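A toy usage sketch of the new helper, with made-up numbers: it assumes the frames share the ("base", "F")/("base", "T") columns, as the report and baseline dataframes do, and the ("errors", ...) column names here are illustrative only. Note also that avg_group_report averages over df.to_dict(orient="records")[1:-1], i.e. it drops the first and last row before averaging, so it needs at least three rows to give a meaningful result.

import pandas as pd

from quacc.utils import combine_dataframes

df_a = pd.DataFrame(
    [[0.9, 0.1, 0.05], [0.5, 0.5, 0.10]],
    columns=pd.MultiIndex.from_tuples(
        [("base", "F"), ("base", "T"), ("errors", "mae_estim")]
    ),
)
df_b = pd.DataFrame(
    [[0.9, 0.1, 0.04], [0.5, 0.5, 0.12]],
    columns=pd.MultiIndex.from_tuples(
        [("base", "F"), ("base", "T"), ("errors", "atc_mc")]
    ),
)

# joins every frame after the first on the shared base-prevalence columns
merged = combine_dataframes(df_a, df_b, df_index=[("base", "F"), ("base", "T")])
print(merged)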