diff --git a/elsahar19_rca/__pycache__/rca.cpython-311.pyc b/elsahar19_rca/__pycache__/rca.cpython-311.pyc
deleted file mode 100644
index aeb753d..0000000
Binary files a/elsahar19_rca/__pycache__/rca.cpython-311.pyc and /dev/null differ
diff --git a/garg22_ATC/__pycache__/ATC_helper.cpython-311.pyc b/garg22_ATC/__pycache__/ATC_helper.cpython-311.pyc
deleted file mode 100644
index c8e0c9d..0000000
Binary files a/garg22_ATC/__pycache__/ATC_helper.cpython-311.pyc and /dev/null differ
diff --git a/guillory21_doc/__pycache__/doc.cpython-311.pyc b/guillory21_doc/__pycache__/doc.cpython-311.pyc
deleted file mode 100644
index ea09f92..0000000
Binary files a/guillory21_doc/__pycache__/doc.cpython-311.pyc and /dev/null differ
diff --git a/jiang18_trustscore/__pycache__/trustscore.cpython-311.pyc b/jiang18_trustscore/__pycache__/trustscore.cpython-311.pyc
deleted file mode 100644
index e787657..0000000
Binary files a/jiang18_trustscore/__pycache__/trustscore.cpython-311.pyc and /dev/null differ
diff --git a/lipton_bbse/__pycache__/labelshift.cpython-311.pyc b/lipton_bbse/__pycache__/labelshift.cpython-311.pyc
deleted file mode 100644
index 10decf8..0000000
Binary files a/lipton_bbse/__pycache__/labelshift.cpython-311.pyc and /dev/null differ
diff --git a/quacc/baseline.py b/quacc/baseline.py
index b96f1bc..9a53db7 100644
--- a/quacc/baseline.py
+++ b/quacc/baseline.py
@@ -201,13 +201,13 @@ def rca_score(
     ]
     results = []
     for test in protocol():
-        [f_prev, t_prev] = test.prevalence()
-        try:
+        try:
+            [f_prev, t_prev] = test.prevalence()
             test_pred = c_model_predict(test.X)
             c_model2 = rca.clone_fit(c_model, test.X, test_pred)
             c_model2_predict = getattr(c_model2, predict_method)
             val_pred2 = c_model2_predict(validation.X)
-            rca_score = 1.0 - rca.get_score(val_pred1, val_pred2, validation.y)
+            rca_score = rca.get_score(val_pred1, val_pred2, validation.y)
             results.append({k: v for k, v in zip(cols, [f_prev, t_prev, rca_score])})
         except ValueError:
             results.append({k: v for k, v in zip(cols, [f_prev, t_prev, float("nan")])})
@@ -248,7 +248,7 @@ def rca_star_score(
             c_model2 = rca.clone_fit(c_model, test.X, test_pred)
             c_model2_predict = getattr(c_model2, predict_method)
             val2_pred2 = c_model2_predict(validation2.X)
-            rca_star_score = 1.0 - rca.get_score(val2_pred1, val2_pred2, validation2.y)
+            rca_star_score = rca.get_score(val2_pred1, val2_pred2, validation2.y)
             results.append(
                 {k: v for k, v in zip(cols, [f_prev, t_prev, rca_star_score])}
             )
diff --git a/quacc/dataset.py b/quacc/dataset.py
index d009a78..8098966 100644
--- a/quacc/dataset.py
+++ b/quacc/dataset.py
@@ -1,3 +1,4 @@
+from operator import index
 from typing import Tuple
 import numpy as np
 from quapy.data.base import LabelledCollection
@@ -18,11 +19,29 @@ def get_spambase() -> Tuple[LabelledCollection]:
     train, validation = train.split_stratified(train_prop=TRAIN_VAL_PROP)
     return train, validation, test
 
+# >>> fetch_rcv1().target_names
+# array(['C11', 'C12', 'C13', 'C14', 'C15', 'C151', 'C1511', 'C152', 'C16',
+#        'C17', 'C171', 'C172', 'C173', 'C174', 'C18', 'C181', 'C182',
+#        'C183', 'C21', 'C22', 'C23', 'C24', 'C31', 'C311', 'C312', 'C313',
+#        'C32', 'C33', 'C331', 'C34', 'C41', 'C411', 'C42', 'CCAT', 'E11',
+#        'E12', 'E121', 'E13', 'E131', 'E132', 'E14', 'E141', 'E142',
+#        'E143', 'E21', 'E211', 'E212', 'E31', 'E311', 'E312', 'E313',
+#        'E41', 'E411', 'E51', 'E511', 'E512', 'E513', 'E61', 'E71', 'ECAT',
+#        'G15', 'G151', 'G152', 'G153', 'G154', 'G155', 'G156', 'G157',
+#        'G158', 'G159', 'GCAT', 'GCRIM', 'GDEF', 'GDIP', 'GDIS', 'GENT',
+#        'GENV', 'GFAS', 'GHEA', 'GJOB', 'GMIL', 'GOBIT', 'GODD', 'GPOL',
+#        'GPRO', 'GREL', 'GSCI', 'GSPO', 'GTOUR', 'GVIO', 'GVOTE', 'GWEA',
+#        'GWELF', 'M11', 'M12', 'M13', 'M131', 'M132', 'M14', 'M141',
+#        'M142', 'M143', 'MCAT'], dtype=object)
 
-def get_rcv1(sample_size=100):
+def get_rcv1(target:str):
+    sample_size = qp.environ["SAMPLE_SIZE"]
     n_train = 23149
     dataset = fetch_rcv1()
 
+    if target not in dataset.target_names:
+        raise ValueError("Invalid target")
+
     def dataset_split(data, labels, classes=[0, 1]) -> Tuple[LabelledCollection]:
         all_train_d, test_d = data[:n_train, :], data[n_train:, :]
         all_train_l, test_l = labels[:n_train], labels[n_train:]
@@ -31,14 +50,13 @@ def get_rcv1(sample_size=100):
         train, validation = all_train.split_stratified(train_prop=TRAIN_VAL_PROP)
         return train, validation, test
 
-    target_labels = [
-        (target, dataset.target[:, ind].toarray().flatten())
-        for (ind, target) in enumerate(dataset.target_names)
-    ]
-    filtered_target_labels = filter(
-        lambda _, labels: np.sum(labels[n_train:]) >= sample_size, target_labels
-    )
-    return {
-        target: dataset_split(dataset.data, labels, classes=[0, 1])
-        for (target, labels) in filtered_target_labels
-    }
+    target_index = np.where(dataset.target_names == target)[0]
+    target_labels = dataset.target[:, target_index].toarray().flatten()
+
+    if np.sum(target_labels[n_train:]) < sample_size:
+        raise ValueError("Target has too few positive samples")
+
+    d = dataset_split(dataset.data, target_labels, classes=[0, 1])
+
+    return d
+
diff --git a/quacc/error.py b/quacc/error.py
index 90e5701..dfd19bd 100644
--- a/quacc/error.py
+++ b/quacc/error.py
@@ -8,18 +8,28 @@ def from_name(err_name):
     else:
         return qp.error.from_name(err_name)
 
+# def f1(prev):
+#     # https://github.com/dice-group/gerbil/wiki/Precision,-Recall-and-F1-measure
+#     if prev[0] == 0 and prev[1] == 0 and prev[2] == 0:
+#         return 1.0
+#     elif prev[0] == 0 and prev[1] > 0 and prev[2] == 0:
+#         return 0.0
+#     elif prev[0] == 0 and prev[1] == 0 and prev[2] > 0:
+#         return float('NaN')
+#     else:
+#         recall = prev[0] / (prev[0] + prev[1])
+#         precision = prev[0] / (prev[0] + prev[2])
+#         return 2 * (precision * recall) / (precision + recall)
+
 def f1(prev):
-    # https://github.com/dice-group/gerbil/wiki/Precision,-Recall-and-F1-measure
-    if prev[0] == 0 and prev[1] == 0 and prev[2] == 0:
+    den = (2*prev[3]) + prev[1] + prev[2]
+    if den == 0:
         return 1.0
-    elif prev[0] == 0 and prev[1] > 0 and prev[2] == 0:
-        return 0.0
-    elif prev[0] == 0 and prev[1] == 0 and prev[2] > 0:
-        return float('NaN')
     else:
-        recall = prev[0] / (prev[0] + prev[1])
-        precision = prev[0] / (prev[0] + prev[2])
-        return 2 * (precision * recall) / (precision + recall)
+        return (2*prev[3])/den
 
 def f1e(prev):
     return 1 - f1(prev)
+
+def mae(prev):
+    return (prev[1] + prev[2]) / sum(prev)
\ No newline at end of file
diff --git a/quacc/evaluation.py b/quacc/evaluation.py
index b07b4f2..a58f86c 100644
--- a/quacc/evaluation.py
+++ b/quacc/evaluation.py
@@ -80,55 +80,63 @@ def evaluation_report(
     protocol: AbstractStochasticSeededProtocol,
     error_metrics: Iterable[Union[str, Callable]] = "all",
     aggregate: bool = True,
+    prevalence: bool = True,
 ):
     def _report_columns(err_names):
         base_cols = list(itertools.product(["base"], ["F", "T"]))
         prev_cols = list(itertools.product(["true", "estim"], ["TN", "FP", "FN", "TP"]))
         err_cols = list(itertools.product(["errors"], err_names))
-        return base_cols + prev_cols, err_cols
+        return base_cols, prev_cols, err_cols
 
     base_prevs, true_prevs, estim_prevs = estimate(estimator, protocol)
 
     if error_metrics == "all":
-        error_metrics = ["ae", "f1"]
+        error_metrics = ["mae", "f1"]
     error_funcs = [
         error.from_name(e) if isinstance(e, str) else e for e in error_metrics
    ]
 
     assert all(hasattr(e, "__call__") for e in error_funcs), "invalid error function"
     error_names = [e.__name__ for e in error_funcs]
-    error_cols = error_names.copy()
-    if "f1" in error_cols:
-        error_cols.remove("f1")
-        error_cols.extend(["f1_true", "f1_estim"])
-    if "f1e" in error_cols:
-        error_cols.remove("f1e")
-        error_cols.extend(["f1e_true", "f1e_estim"])
+    error_cols = []
+    for err in error_names:
+        if err == "mae":
+            error_cols.extend(["mae_estim", "mae_true"])
+        elif err == "f1":
+            error_cols.extend(["f1_estim", "f1_true"])
+        elif err == "f1e":
+            error_cols.extend(["f1e_estim", "f1e_true"])
+        else:
+            error_cols.append(err)
 
     # df_cols = ["base_prev", "true_prev", "estim_prev"] + error_names
-    prev_cols, err_cols = _report_columns(error_cols)
+    base_cols, prev_cols, err_cols = _report_columns(error_cols)
 
     lst = []
 
     for base_prev, true_prev, estim_prev in zip(base_prevs, true_prevs, estim_prevs):
-        series = {
-            k: v
-            for (k, v) in zip(
-                prev_cols, np.concatenate((base_prev, true_prev, estim_prev), axis=0)
-            )
-        }
-        for error_name, error_metric in zip(error_names, error_funcs):
-            if error_name == "f1e":
-                series[("errors", "f1e_true")] = error_metric(true_prev)
-                series[("errors", "f1e_estim")] = error_metric(estim_prev)
-                continue
-            if error_name == "f1":
-                f1_true, f1_estim = error_metric(true_prev), error_metric(estim_prev)
-                series[("errors", "f1_true")] = f1_true
-                series[("errors", "f1_estim")] = f1_estim
-                continue
+        if prevalence:
+            series = {
+                k: v
+                for (k, v) in zip(
+                    base_cols + prev_cols,
+                    np.concatenate((base_prev, true_prev, estim_prev), axis=0),
+                )
+            }
+            df_cols = base_cols + prev_cols + err_cols
+        else:
+            series = {k: v for (k, v) in zip(base_cols, base_prev)}
+            df_cols = base_cols + err_cols
 
-            score = error_metric(true_prev, estim_prev)
-            series[("errors", error_name)] = score
+        for err in error_cols:
+            error_funcs = {
+                "mae_true": lambda: error.mae(true_prev),
+                "mae_estim": lambda: error.mae(estim_prev),
+                "f1_true": lambda: error.f1(true_prev),
+                "f1_estim": lambda: error.f1(estim_prev),
+                "f1e_true": lambda: error.f1e(true_prev),
+                "f1e_estim": lambda: error.f1e(estim_prev),
+            }
+            series[("errors", err)] = error_funcs[err]()
 
         lst.append(series)
 
@@ -136,6 +144,6 @@ def evaluation_report(
 
     df = pd.DataFrame(
         lst,
-        columns=pd.MultiIndex.from_tuples(prev_cols + err_cols),
+        columns=pd.MultiIndex.from_tuples(df_cols),
     )
     return df
diff --git a/quacc/main.py b/quacc/main.py
index 879b3a5..d58a65e 100644
--- a/quacc/main.py
+++ b/quacc/main.py
@@ -2,6 +2,7 @@ import pandas as pd
 import quapy as qp
 from quapy.protocol import APP
 from sklearn.linear_model import LogisticRegression
+from quacc import utils
 
 import quacc.evaluation as eval
 import quacc.baseline as baseline
@@ -10,7 +11,7 @@ from quacc.estimator import (
     MulticlassAccuracyEstimator,
 )
 
-from quacc.dataset import get_imdb, get_spambase
+from quacc.dataset import get_imdb, get_rcv1, get_spambase
 
 qp.environ["SAMPLE_SIZE"] = 100
 
@@ -109,25 +110,21 @@ def estimate_comparison():
     estimator = BinaryQuantifierAccuracyEstimator(model)
     estimator.fit(validation)
-    df = eval.evaluation_report(estimator, protocol)
+    df = eval.evaluation_report(estimator, protocol, prevalence=False)
 
-    df_index = [("base", "F"), ("base", "T")]
+    df = utils.combine_dataframes(
+        baseline.atc_mc(model, validation, protocol),
+        baseline.atc_ne(model, validation, protocol),
+        baseline.doc_feat(model, validation, protocol),
+        baseline.rca_score(model, validation, protocol),
+        baseline.rca_star_score(model, validation, protocol),
+        baseline.bbse_score(model, validation, protocol),
+        df,
+        df_index=[("base", "F"), ("base", "T")]
+    )
 
-    atc_mc_df = baseline.atc_mc(model, validation, protocol)
-    atc_ne_df = baseline.atc_ne(model, validation, protocol)
-    doc_feat_df = baseline.doc_feat(model, validation, protocol)
-    rca_df = baseline.rca_score(model, validation, protocol)
-    rca_star_df = baseline.rca_star_score(model, validation, protocol)
-    bbse_df = baseline.bbse_score(model, validation, protocol)
-
-    df = df.join(atc_mc_df.set_index(df_index), on=df_index)
-    df = df.join(atc_ne_df.set_index(df_index), on=df_index)
-    df = df.join(doc_feat_df.set_index(df_index), on=df_index)
-    df = df.join(rca_df.set_index(df_index), on=df_index)
-    df = df.join(rca_star_df.set_index(df_index), on=df_index)
-    df = df.join(bbse_df.set_index(df_index), on=df_index)
-
-    print(df.to_string())
+    print(df.to_latex(float_format="{:.4f}".format))
+    print(utils.avg_group_report(df).to_latex(float_format="{:.4f}".format))
 
 
 def main():
     estimate_comparison()
diff --git a/quacc/utils.py b/quacc/utils.py
new file mode 100644
index 0000000..6da5b39
--- /dev/null
+++ b/quacc/utils.py
@@ -0,0 +1,31 @@
+
+import functools
+import pandas as pd
+
+def combine_dataframes(*dfs, df_index=[]) -> pd.DataFrame:
+    if len(dfs) < 1:
+        raise ValueError
+    if len(dfs) == 1:
+        return dfs[0]
+    df = dfs[0]
+    for ndf in dfs[1:]:
+        df = df.join(ndf.set_index(df_index), on=df_index)
+
+    return df
+
+
+def avg_group_report(df: pd.DataFrame) -> pd.DataFrame:
+    def _reduce_func(s1, s2):
+        return {
+            (n1, n2): v + s2[(n1, n2)] for ((n1, n2), v) in s1.items()
+        }
+
+    lst = df.to_dict(orient="records")[1:-1]
+    summed_series = functools.reduce(_reduce_func, lst)
+    idx = df.columns.drop([("base", "T"), ("base", "F")])
+    avg_report = {
+        (n1, n2): (v / len(lst))
+        for ((n1, n2), v) in summed_series.items()
+        if n1 != "base"
+    }
+    return pd.DataFrame([avg_report], columns=idx)
\ No newline at end of file
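
A standalone sketch of the reworked metrics in quacc/error.py, reimplemented here only for illustration. It assumes the 4-cell prevalence vector is ordered (TN, FP, FN, TP), matching the prev_cols built in quacc/evaluation.py; the example numbers are made up.

    def f1(prev):
        # F1 = 2*TP / (2*TP + FP + FN); the patched version returns 1.0 when the denominator is 0
        den = 2 * prev[3] + prev[1] + prev[2]
        return 1.0 if den == 0 else 2 * prev[3] / den

    def f1e(prev):
        return 1 - f1(prev)

    def mae(prev):
        # (FP + FN) / total: the misclassified mass of the vector
        return (prev[1] + prev[2]) / sum(prev)

    # A batch with 50% TN, 5% FP, 10% FN, 35% TP
    prev = [0.50, 0.05, 0.10, 0.35]
    print(f1(prev))   # 0.70 / 0.85 = 0.8235...
    print(f1e(prev))  # 0.1764...
    print(mae(prev))  # 0.15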
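
Similarly, a sketch of how the new quacc/utils.py helpers compose, mirroring estimate_comparison in main.py. It assumes the patched package is importable; the ("errors", ...) column labels and all values are invented for the example, only the ("base", F/T) key columns follow the code above.

    import pandas as pd
    from quacc.utils import avg_group_report, combine_dataframes

    df_index = [("base", "F"), ("base", "T")]

    # Two toy reports that share the ("base", F/T) key columns, standing in for
    # eval.evaluation_report and one of the baseline.* outputs.
    df_a = pd.DataFrame(
        [[0.9, 0.1, 0.05], [0.7, 0.3, 0.04], [0.5, 0.5, 0.02], [0.3, 0.7, 0.03]],
        columns=pd.MultiIndex.from_tuples(df_index + [("errors", "mae_estim")]),
    )
    df_b = pd.DataFrame(
        [[0.9, 0.1, 0.06], [0.7, 0.3, 0.05], [0.5, 0.5, 0.03], [0.3, 0.7, 0.04]],
        columns=pd.MultiIndex.from_tuples(df_index + [("errors", "atc_mc")]),
    )

    # Joins every frame after the first onto the ("base", F/T) columns.
    df = combine_dataframes(df_a, df_b, df_index=df_index)
    print(df.to_latex(float_format="{:.4f}".format))

    # Per-column average of the non-"base" columns; note that the implementation
    # averages df.to_dict(orient="records")[1:-1], i.e. the first and last rows
    # are left out.
    print(avg_group_report(df).to_latex(float_format="{:.4f}".format))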