diff --git a/elsahar19_rca/__pycache__/rca.cpython-311.pyc b/elsahar19_rca/__pycache__/rca.cpython-311.pyc
new file mode 100644
index 0000000..aeb753d
Binary files /dev/null and b/elsahar19_rca/__pycache__/rca.cpython-311.pyc differ
diff --git a/garg22_ATC/__pycache__/ATC_helper.cpython-311.pyc b/garg22_ATC/__pycache__/ATC_helper.cpython-311.pyc
index 5b2f09d..c8e0c9d 100644
Binary files a/garg22_ATC/__pycache__/ATC_helper.cpython-311.pyc and b/garg22_ATC/__pycache__/ATC_helper.cpython-311.pyc differ
diff --git a/guillory21_doc/__pycache__/doc.cpython-311.pyc b/guillory21_doc/__pycache__/doc.cpython-311.pyc
index a98676f..ea09f92 100644
Binary files a/guillory21_doc/__pycache__/doc.cpython-311.pyc and b/guillory21_doc/__pycache__/doc.cpython-311.pyc differ
diff --git a/jiang18_trustscore/__pycache__/trustscore.cpython-311.pyc b/jiang18_trustscore/__pycache__/trustscore.cpython-311.pyc
index bfba8b5..e787657 100644
Binary files a/jiang18_trustscore/__pycache__/trustscore.cpython-311.pyc and b/jiang18_trustscore/__pycache__/trustscore.cpython-311.pyc differ
diff --git a/lipton_bbse/__pycache__/labelshift.cpython-311.pyc b/lipton_bbse/__pycache__/labelshift.cpython-311.pyc
new file mode 100644
index 0000000..10decf8
Binary files /dev/null and b/lipton_bbse/__pycache__/labelshift.cpython-311.pyc differ
diff --git a/pyproject.toml b/pyproject.toml
index 9ca845e..2f07619 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,8 +12,10 @@ pandas = "^2.0.3"
 jinja2 = "^3.1.2"
 
 [tool.poetry.scripts]
+main = "quacc.main:main"
 multi = "quacc.main:estimate_multiclass"
 bin = "quacc.main:estimate_binary"
+comp = "quacc.main:estimate_comparison"
 
 
 [tool.poetry.group.dev.dependencies]
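
Note on the pyproject.toml change: the two new [tool.poetry.scripts] entries register `main` and `comp` as console commands (invoked, for instance, as `poetry run comp`). A minimal sketch, not part of the patch, of what the `comp` entry point resolves to once the package is installed:

# Hypothetical stand-in for `poetry run comp`: the script entry simply calls
# quacc.main:estimate_comparison.
from quacc.main import estimate_comparison

if __name__ == "__main__":
    estimate_comparison()
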
diff --git a/quacc/baseline.py b/quacc/baseline.py
index b9922a9..b96f1bc 100644
--- a/quacc/baseline.py
+++ b/quacc/baseline.py
@@ -6,12 +6,18 @@ import quapy as qp
 from quapy.data import LabelledCollection
 from sklearn.base import BaseEstimator
 from sklearn.model_selection import cross_validate
+from quapy.protocol import (
+    AbstractStochasticSeededProtocol,
+    OnLabelledCollectionProtocol,
+)
 
 import elsahar19_rca.rca as rca
 import garg22_ATC.ATC_helper as atc
 import guillory21_doc.doc as doc
 import jiang18_trustscore.trustscore as trustscore
 import lipton_bbse.labelshift as bbse
+import pandas as pd
+import statistics as stats
 
 
 def kfcv(c_model: BaseEstimator, validation: LabelledCollection) -> Dict:
@@ -20,10 +26,34 @@ def kfcv(c_model: BaseEstimator, validation: LabelledCollection) -> Dict:
 
     return {"f1_score": mean(scores["test_f1_macro"])}
 
 
+def avg_groupby_distribution(results):
+    def base_prev(s):
+        return (s[("base", "F")], s[("base", "T")])
+
+    grouped_list = {}
+    for r in results:
+        bp = base_prev(r)
+        if bp in grouped_list.keys():
+            grouped_list[bp].append(r)
+        else:
+            grouped_list[bp] = [r]
+
+    series = []
+    for (fp, tp), r_list in grouped_list.items():
+        assert len(r_list) > 0
+        r_avg = {}
+        r_avg[("base", "F")], r_avg[("base", "T")] = fp, tp
+        for pn in [(n1, n2) for ((n1, n2), _) in r_list[0].items() if n1 != "base"]:
+            r_avg[pn] = stats.mean(map(lambda r: r[pn], r_list))
+        series.append(r_avg)
+
+    return series
+
+
 def atc_mc(
     c_model: BaseEstimator,
     validation: LabelledCollection,
-    test: LabelledCollection,
+    protocol: AbstractStochasticSeededProtocol,
     predict_method="predict_proba",
 ):
     c_model_predict = getattr(c_model, predict_method)
@@ -31,27 +61,39 @@ def atc_mc(
     ## Load ID validation data probs and labels
     val_probs, val_labels = c_model_predict(validation.X), validation.y
 
-    ## Load OOD test data probs
-    test_probs = c_model_predict(test.X)
-
     ## score function, e.g., negative entropy or argmax confidence
     val_scores = atc.get_max_conf(val_probs)
     val_preds = np.argmax(val_probs, axis=-1)
-    test_scores = atc.get_max_conf(test_probs)
-
     _, atc_thres = atc.find_ATC_threshold(val_scores, val_labels == val_preds)
-    atc_accuracy = atc.get_ATC_acc(atc_thres, test_scores)
-    return {
-        "true_acc": 100 * np.mean(np.argmax(test_probs, axis=-1) == test.y),
-        "pred_acc": atc_accuracy,
-    }
+    # ensure that the protocol returns a LabelledCollection for each iteration
+    protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
+
+    cols = [
+        ("base", "F"),
+        ("base", "T"),
+        ("atc_mc", "accuracy"),
+    ]
+    results = []
+    for test in protocol():
+        ## Load OOD test data probs
+        test_probs = c_model_predict(test.X)
+        test_scores = atc.get_max_conf(test_probs)
+        atc_accuracy = 1.0 - (atc.get_ATC_acc(atc_thres, test_scores) / 100.0)
+        [f_prev, t_prev] = test.prevalence()
+        results.append({k: v for k, v in zip(cols, [f_prev, t_prev, atc_accuracy])})
+
+    series = avg_groupby_distribution(results)
+    return pd.DataFrame(
+        series,
+        columns=pd.MultiIndex.from_tuples(cols),
+    )
 
 
 def atc_ne(
     c_model: BaseEstimator,
     validation: LabelledCollection,
-    test: LabelledCollection,
+    protocol: AbstractStochasticSeededProtocol,
     predict_method="predict_proba",
 ):
     c_model_predict = getattr(c_model, predict_method)
@@ -59,22 +101,33 @@ def atc_ne(
     ## Load ID validation data probs and labels
     val_probs, val_labels = c_model_predict(validation.X), validation.y
 
-    ## Load OOD test data probs
-    test_probs = c_model_predict(test.X)
-
     ## score function, e.g., negative entropy or argmax confidence
     val_scores = atc.get_entropy(val_probs)
     val_preds = np.argmax(val_probs, axis=-1)
-
-    test_scores = atc.get_entropy(test_probs)
-
     _, atc_thres = atc.find_ATC_threshold(val_scores, val_labels == val_preds)
-    atc_accuracy = atc.get_ATC_acc(atc_thres, test_scores)
-    return {
-        "true_acc": 100 * np.mean(np.argmax(test_probs, axis=-1) == test.y),
-        "pred_acc": atc_accuracy,
-    }
+    # ensure that the protocol returns a LabelledCollection for each iteration
+    protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
+
+    cols = [
+        ("base", "F"),
+        ("base", "T"),
+        ("atc_ne", "accuracy"),
+    ]
+    results = []
+    for test in protocol():
+        ## Load OOD test data probs
+        test_probs = c_model_predict(test.X)
+        test_scores = atc.get_entropy(test_probs)
+        atc_accuracy = 1.0 - (atc.get_ATC_acc(atc_thres, test_scores) / 100.0)
+        [f_prev, t_prev] = test.prevalence()
+        results.append({k: v for k, v in zip(cols, [f_prev, t_prev, atc_accuracy])})
+
+    series = avg_groupby_distribution(results)
+    return pd.DataFrame(
+        series,
+        columns=pd.MultiIndex.from_tuples(cols),
+    )
 
 
 def trust_score(
@@ -96,70 +149,148 @@ def doc_feat(
     c_model: BaseEstimator,
     validation: LabelledCollection,
-    test: LabelledCollection,
+    protocol: AbstractStochasticSeededProtocol,
     predict_method="predict_proba",
 ):
     c_model_predict = getattr(c_model, predict_method)
 
     val_probs, val_labels = c_model_predict(validation.X), validation.y
-    test_probs = c_model_predict(test.X)
     val_scores = np.max(val_probs, axis=-1)
-    test_scores = np.max(test_probs, axis=-1)
     val_preds = np.argmax(val_probs, axis=-1)
-
     v1acc = np.mean(val_preds == val_labels) * 100
-    return v1acc + doc.get_doc(val_scores, test_scores)
+
+    # ensure that the protocol returns a LabelledCollection for each iteration
+    protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
+
+    cols = [
+        ("base", "F"),
+        ("base", "T"),
+        ("doc_feat", "score"),
+    ]
+    results = []
+    for test in protocol():
+        test_probs = c_model_predict(test.X)
+        test_scores = np.max(test_probs, axis=-1)
+        score = 1.0 - ((v1acc + doc.get_doc(val_scores, test_scores)) / 100.0)
+        [f_prev, t_prev] = test.prevalence()
+        results.append({k: v for k, v in zip(cols, [f_prev, t_prev, score])})
+
+    series = avg_groupby_distribution(results)
+    return pd.DataFrame(
+        series,
+        columns=pd.MultiIndex.from_tuples(cols),
+    )
 
 
 def rca_score(
     c_model: BaseEstimator,
     validation: LabelledCollection,
-    test: LabelledCollection,
+    protocol: AbstractStochasticSeededProtocol,
    predict_method="predict",
 ):
     c_model_predict = getattr(c_model, predict_method)
-    test_pred = c_model_predict(test.X)
-    c_model2 = rca.clone_fit(test.X, test_pred)
-    c_model2_predict = getattr(c_model2, predict_method)
-
     val_pred1 = c_model_predict(validation.X)
-    val_pred2 = c_model2_predict(validation.X)
-    return rca.get_score(val_pred1, val_pred2, validation.y)
+    # ensure that the protocol returns a LabelledCollection for each iteration
+    protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
+
+    cols = [
+        ("base", "F"),
+        ("base", "T"),
+        ("rca", "score"),
+    ]
+    results = []
+    for test in protocol():
+        [f_prev, t_prev] = test.prevalence()
+        try:
+            test_pred = c_model_predict(test.X)
+            c_model2 = rca.clone_fit(c_model, test.X, test_pred)
+            c_model2_predict = getattr(c_model2, predict_method)
+            val_pred2 = c_model2_predict(validation.X)
+            rca_score = 1.0 - rca.get_score(val_pred1, val_pred2, validation.y)
+            results.append({k: v for k, v in zip(cols, [f_prev, t_prev, rca_score])})
+        except ValueError:
+            results.append({k: v for k, v in zip(cols, [f_prev, t_prev, float("nan")])})
+
+    series = avg_groupby_distribution(results)
+    return pd.DataFrame(
+        series,
+        columns=pd.MultiIndex.from_tuples(cols),
+    )
 
 
 def rca_star_score(
     c_model: BaseEstimator,
     validation: LabelledCollection,
-    test: LabelledCollection,
+    protocol: AbstractStochasticSeededProtocol,
     predict_method="predict",
 ):
     c_model_predict = getattr(c_model, predict_method)
     validation1, validation2 = validation.split_stratified(train_prop=0.5)
-    test_pred = c_model_predict(test.X)
     val1_pred = c_model_predict(validation1.X)
-    c_model1 = rca.clone_fit(validation1.X, val1_pred)
-    c_model2 = rca.clone_fit(test.X, test_pred)
+    c_model1 = rca.clone_fit(c_model, validation1.X, val1_pred)
     c_model1_predict = getattr(c_model1, predict_method)
-    c_model2_predict = getattr(c_model2, predict_method)
-
     val2_pred1 = c_model1_predict(validation2.X)
-    val2_pred2 = c_model2_predict(validation2.X)
-    return rca.get_score(val2_pred1, val2_pred2, validation2.y)
+    # ensure that the protocol returns a LabelledCollection for each iteration
+    protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
+
+    cols = [
+        ("base", "F"),
+        ("base", "T"),
+        ("rca*", "score"),
+    ]
+    results = []
+    for test in protocol():
+        [f_prev, t_prev] = test.prevalence()
+        try:
+            test_pred = c_model_predict(test.X)
+            c_model2 = rca.clone_fit(c_model, test.X, test_pred)
+            c_model2_predict = getattr(c_model2, predict_method)
+            val2_pred2 = c_model2_predict(validation2.X)
+            rca_star_score = 1.0 - rca.get_score(val2_pred1, val2_pred2, validation2.y)
+            results.append(
+                {k: v for k, v in zip(cols, [f_prev, t_prev, rca_star_score])}
+            )
+        except ValueError:
+            results.append({k: v for k, v in zip(cols, [f_prev, t_prev, float("nan")])})
+
+    series = avg_groupby_distribution(results)
+    return pd.DataFrame(
+        series,
+        columns=pd.MultiIndex.from_tuples(cols),
+    )
+
-
 def bbse_score(
     c_model: BaseEstimator,
     validation: LabelledCollection,
-    test: LabelledCollection,
+    protocol: AbstractStochasticSeededProtocol,
     predict_method="predict_proba",
 ):
-
     c_model_predict = getattr(c_model, predict_method)
     val_probs, val_labels = c_model_predict(validation.X), validation.y
-    test_probs = c_model_predict(test.X)
-    wt = bbse.estimate_labelshift_ratio(val_labels, val_probs, test_probs, 2)
-    estim_prev = bbse.estimate_target_dist(wt, val_labels, 2)
-    true_prev = test.prevalence()
-    return qp.error.ae(true_prev, estim_prev)
+    # ensure that the protocol returns a LabelledCollection for each iteration
+    protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
+
+    cols = [
+        ("base", "F"),
+        ("base", "T"),
+        ("bbse", "score"),
+    ]
+    results = []
+    for test in protocol():
+        test_probs = c_model_predict(test.X)
+        wt = bbse.estimate_labelshift_ratio(val_labels, val_probs, test_probs, 2)
+        estim_prev = bbse.estimate_target_dist(wt, val_labels, 2)[1]
+        true_prev = test.prevalence()
+        [f_prev, t_prev] = true_prev
+        acc = qp.error.ae(true_prev, estim_prev)
+        results.append({k: v for k, v in zip(cols, [f_prev, t_prev, acc])})
+
+    series = avg_groupby_distribution(results)
+    return pd.DataFrame(
+        series,
+        columns=pd.MultiIndex.from_tuples(cols),
+    )
r[("base", "0")], r[("base", "1")] = _bprev(gs[0]) + r[("base", "F")], r[("base", "T")] = _bprev(gs[0]) - for pn in itertools.product(_prev_col_0, _prev_col_1): + for pn in [(n1, n2) for ((n1, n2), _) in gs[0].items() if n1 != "base"]: r[pn] = stats.mean(map(lambda s: s[pn], gs)) - r = _normalize_prev(r, "true") - r = _normalize_prev(r, "estim") + r = _normalize_prev(r) - for en in itertools.product(_err_col_0, error_names): + for en in itertools.product(["errors"], error_names): r[en] = stats.mean(map(lambda s: s[en], gs)) r_lst.append(r) return r_lst + def evaluation_report( estimator: AccuracyEstimator, protocol: AbstractStochasticSeededProtocol, error_metrics: Iterable[Union[str, Callable]] = "all", aggregate: bool = True, ): + def _report_columns(err_names): + base_cols = list(itertools.product(["base"], ["F", "T"])) + prev_cols = list(itertools.product(["true", "estim"], ["TN", "FP", "FN", "TP"])) + err_cols = list(itertools.product(["errors"], err_names)) + return base_cols + prev_cols, err_cols + base_prevs, true_prevs, estim_prevs = estimate(estimator, protocol) if error_metrics == "all": @@ -114,20 +100,16 @@ def evaluation_report( error_cols = error_names.copy() if "f1" in error_cols: error_cols.remove("f1") - error_cols.extend(["f1_true", "f1_estim", "f1_dist"]) + error_cols.extend(["f1_true", "f1_estim"]) if "f1e" in error_cols: error_cols.remove("f1e") error_cols.extend(["f1e_true", "f1e_estim"]) # df_cols = ["base_prev", "true_prev", "estim_prev"] + error_names - df_cols = _report_columns(error_cols) + prev_cols, err_cols = _report_columns(error_cols) lst = [] for base_prev, true_prev, estim_prev in zip(base_prevs, true_prevs, estim_prevs): - prev_cols = list(itertools.product(_bprev_col_0, _bprev_col_1)) + list( - itertools.product(_prev_col_0, _prev_col_1) - ) - series = { k: v for (k, v) in zip( @@ -143,7 +125,6 @@ def evaluation_report( f1_true, f1_estim = error_metric(true_prev), error_metric(estim_prev) series[("errors", "f1_true")] = f1_true series[("errors", "f1_estim")] = f1_estim - series[("errors", "f1_dist")] = abs(f1_estim - f1_true) continue score = error_metric(true_prev, estim_prev) @@ -151,6 +132,10 @@ def evaluation_report( lst.append(series) - lst = _report_avg_groupby_distribution(lst, error_cols) if aggregate else lst - df = pd.DataFrame(lst, columns=df_cols) + lst = avg_groupby_distribution(lst, error_cols) if aggregate else lst + + df = pd.DataFrame( + lst, + columns=pd.MultiIndex.from_tuples(prev_cols + err_cols), + ) return df diff --git a/quacc/main.py b/quacc/main.py index bbb054e..879b3a5 100644 --- a/quacc/main.py +++ b/quacc/main.py @@ -4,12 +4,13 @@ from quapy.protocol import APP from sklearn.linear_model import LogisticRegression import quacc.evaluation as eval +import quacc.baseline as baseline from quacc.estimator import ( BinaryQuantifierAccuracyEstimator, MulticlassAccuracyEstimator, ) -from quacc.dataset import get_imdb +from quacc.dataset import get_imdb, get_spambase qp.environ["SAMPLE_SIZE"] = 100 @@ -20,7 +21,7 @@ dataset_name = "imdb" def estimate_multiclass(): print(dataset_name) - train, validation, test = get_imdb(dataset_name) + train, validation, test = get_imdb() model = LogisticRegression() @@ -59,7 +60,7 @@ def estimate_multiclass(): def estimate_binary(): print(dataset_name) - train, validation, test = get_imdb(dataset_name) + train, validation, test = get_imdb() model = LogisticRegression() @@ -97,6 +98,39 @@ def estimate_binary(): # print(df.to_html()) print() +def estimate_comparison(): + train, validation, test = 
diff --git a/quacc/main.py b/quacc/main.py
index bbb054e..879b3a5 100644
--- a/quacc/main.py
+++ b/quacc/main.py
@@ -4,12 +4,13 @@ from quapy.protocol import APP
 from sklearn.linear_model import LogisticRegression
 
 import quacc.evaluation as eval
+import quacc.baseline as baseline
 from quacc.estimator import (
     BinaryQuantifierAccuracyEstimator,
     MulticlassAccuracyEstimator,
 )
-from quacc.dataset import get_imdb
+from quacc.dataset import get_imdb, get_spambase
 
 qp.environ["SAMPLE_SIZE"] = 100
 
@@ -20,7 +21,7 @@ dataset_name = "imdb"
 def estimate_multiclass():
     print(dataset_name)
 
-    train, validation, test = get_imdb(dataset_name)
+    train, validation, test = get_imdb()
 
     model = LogisticRegression()
 
@@ -59,7 +60,7 @@ def estimate_multiclass():
 def estimate_binary():
     print(dataset_name)
 
-    train, validation, test = get_imdb(dataset_name)
+    train, validation, test = get_imdb()
 
     model = LogisticRegression()
 
@@ -97,6 +98,39 @@ def estimate_binary():
     # print(df.to_html())
     print()
 
+def estimate_comparison():
+    train, validation, test = get_spambase()
+    model = LogisticRegression()
+    model.fit(*train.Xy)
+
+    n_prevalences = 21
+    repreats = 1000
+    protocol = APP(test, n_prevalences=n_prevalences, repeats=repreats)
+
+    estimator = BinaryQuantifierAccuracyEstimator(model)
+    estimator.fit(validation)
+    df = eval.evaluation_report(estimator, protocol)
+
+    df_index = [("base", "F"), ("base", "T")]
+
+    atc_mc_df = baseline.atc_mc(model, validation, protocol)
+    atc_ne_df = baseline.atc_ne(model, validation, protocol)
+    doc_feat_df = baseline.doc_feat(model, validation, protocol)
+    rca_df = baseline.rca_score(model, validation, protocol)
+    rca_star_df = baseline.rca_star_score(model, validation, protocol)
+    bbse_df = baseline.bbse_score(model, validation, protocol)
+
+    df = df.join(atc_mc_df.set_index(df_index), on=df_index)
+    df = df.join(atc_ne_df.set_index(df_index), on=df_index)
+    df = df.join(doc_feat_df.set_index(df_index), on=df_index)
+    df = df.join(rca_df.set_index(df_index), on=df_index)
+    df = df.join(rca_star_df.set_index(df_index), on=df_index)
+    df = df.join(bbse_df.set_index(df_index), on=df_index)
+
+    print(df.to_string())
+
+def main():
+    estimate_comparison()
 
 if __name__ == "__main__":
-    estimate_binary()
+    main()
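
estimate_comparison evaluates the quantifier-based estimator and every baseline over the same APP protocol and aligns their per-prevalence averages by joining on the base-prevalence columns. A minimal sketch of that join with toy frames (column labels follow the patch; the values are made up):

# Toy illustration of the set_index/join used in estimate_comparison.
import pandas as pd

df_index = [("base", "F"), ("base", "T")]

report = pd.DataFrame(
    [[0.5, 0.5, 0.12], [0.9, 0.1, 0.30]],
    columns=pd.MultiIndex.from_tuples(df_index + [("errors", "mae")]),
)
atc_mc = pd.DataFrame(
    [[0.5, 0.5, 0.15], [0.9, 0.1, 0.28]],
    columns=pd.MultiIndex.from_tuples(df_index + [("atc_mc", "accuracy")]),
)

# Rows are matched on the shared base prevalence, mirroring the joins in main.py.
merged = report.join(atc_mc.set_index(df_index), on=df_index)
print(merged.to_string())
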