diff --git a/elsahar19_rca/__pycache__/rca.cpython-311.pyc b/elsahar19_rca/__pycache__/rca.cpython-311.pyc
new file mode 100644
index 0000000..aeb753d
Binary files /dev/null and b/elsahar19_rca/__pycache__/rca.cpython-311.pyc differ
diff --git a/garg22_ATC/__pycache__/ATC_helper.cpython-311.pyc b/garg22_ATC/__pycache__/ATC_helper.cpython-311.pyc
index 5b2f09d..c8e0c9d 100644
Binary files a/garg22_ATC/__pycache__/ATC_helper.cpython-311.pyc and b/garg22_ATC/__pycache__/ATC_helper.cpython-311.pyc differ
diff --git a/guillory21_doc/__pycache__/doc.cpython-311.pyc b/guillory21_doc/__pycache__/doc.cpython-311.pyc
index a98676f..ea09f92 100644
Binary files a/guillory21_doc/__pycache__/doc.cpython-311.pyc and b/guillory21_doc/__pycache__/doc.cpython-311.pyc differ
diff --git a/jiang18_trustscore/__pycache__/trustscore.cpython-311.pyc b/jiang18_trustscore/__pycache__/trustscore.cpython-311.pyc
index bfba8b5..e787657 100644
Binary files a/jiang18_trustscore/__pycache__/trustscore.cpython-311.pyc and b/jiang18_trustscore/__pycache__/trustscore.cpython-311.pyc differ
diff --git a/lipton_bbse/__pycache__/labelshift.cpython-311.pyc b/lipton_bbse/__pycache__/labelshift.cpython-311.pyc
new file mode 100644
index 0000000..10decf8
Binary files /dev/null and b/lipton_bbse/__pycache__/labelshift.cpython-311.pyc differ
diff --git a/pyproject.toml b/pyproject.toml
index 9ca845e..2f07619 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,8 +12,10 @@ pandas = "^2.0.3"
 jinja2 = "^3.1.2"
 
 [tool.poetry.scripts]
+main = "quacc.main:main"
 multi = "quacc.main:estimate_multiclass"
 bin = "quacc.main:estimate_binary"
+comp = "quacc.main:estimate_comparison"
 
 
 [tool.poetry.group.dev.dependencies]
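
Note on the pyproject.toml change: the two new [tool.poetry.scripts] entries register `main` and `comp` as console commands (invoked, for instance, as `poetry run comp`). A minimal sketch, not part of the patch, of what the `comp` entry point resolves to once the package is installed:

# Hypothetical stand-in for `poetry run comp`: the script entry simply calls
# quacc.main:estimate_comparison.
from quacc.main import estimate_comparison

if __name__ == "__main__":
    estimate_comparison()
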
diff --git a/quacc/baseline.py b/quacc/baseline.py
index b9922a9..b96f1bc 100644
--- a/quacc/baseline.py
+++ b/quacc/baseline.py
@@ -6,12 +6,18 @@ import quapy as qp
 from quapy.data import LabelledCollection
 from sklearn.base import BaseEstimator
 from sklearn.model_selection import cross_validate
+from quapy.protocol import (
+    AbstractStochasticSeededProtocol,
+    OnLabelledCollectionProtocol,
+)
 
 import elsahar19_rca.rca as rca
 import garg22_ATC.ATC_helper as atc
 import guillory21_doc.doc as doc
 import jiang18_trustscore.trustscore as trustscore
 import lipton_bbse.labelshift as bbse
+import pandas as pd
+import statistics as stats
 
 
 def kfcv(c_model: BaseEstimator, validation: LabelledCollection) -> Dict:
@@ -20,10 +26,34 @@ def kfcv(c_model: BaseEstimator, validation: LabelledCollection) -> Dict:
 
     return {"f1_score": mean(scores["test_f1_macro"])}
 
 
+def avg_groupby_distribution(results):
+    def base_prev(s):
+        return (s[("base", "F")], s[("base", "T")])
+
+    grouped_list = {}
+    for r in results:
+        bp = base_prev(r)
+        if bp in grouped_list.keys():
+            grouped_list[bp].append(r)
+        else:
+            grouped_list[bp] = [r]
+
+    series = []
+    for (fp, tp), r_list in grouped_list.items():
+        assert len(r_list) > 0
+        r_avg = {}
+        r_avg[("base", "F")], r_avg[("base", "T")] = fp, tp
+        for pn in [(n1, n2) for ((n1, n2), _) in r_list[0].items() if n1 != "base"]:
+            r_avg[pn] = stats.mean(map(lambda r: r[pn], r_list))
+        series.append(r_avg)
+
+    return series
+
+
 def atc_mc(
     c_model: BaseEstimator,
     validation: LabelledCollection,
-    test: LabelledCollection,
+    protocol: AbstractStochasticSeededProtocol,
     predict_method="predict_proba",
 ):
     c_model_predict = getattr(c_model, predict_method)
@@ -31,27 +61,39 @@ def atc_mc(
     ## Load ID validation data probs and labels
     val_probs, val_labels = c_model_predict(validation.X), validation.y
 
-    ## Load OOD test data probs
-    test_probs = c_model_predict(test.X)
-
     ## score function, e.g., negative entropy or argmax confidence
     val_scores = atc.get_max_conf(val_probs)
     val_preds = np.argmax(val_probs, axis=-1)
-    test_scores = atc.get_max_conf(test_probs)
-
     _, atc_thres = atc.find_ATC_threshold(val_scores, val_labels == val_preds)
-    atc_accuracy = atc.get_ATC_acc(atc_thres, test_scores)
-    return {
-        "true_acc": 100 * np.mean(np.argmax(test_probs, axis=-1) == test.y),
-        "pred_acc": atc_accuracy,
-    }
+    # ensure that the protocol returns a LabelledCollection for each iteration
+    protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
+
+    cols = [
+        ("base", "F"),
+        ("base", "T"),
+        ("atc_mc", "accuracy"),
+    ]
+    results = []
+    for test in protocol():
+        ## Load OOD test data probs
+        test_probs = c_model_predict(test.X)
+        test_scores = atc.get_max_conf(test_probs)
+        atc_accuracy = 1.0 - (atc.get_ATC_acc(atc_thres, test_scores) / 100.0)
+        [f_prev, t_prev] = test.prevalence()
+        results.append({k: v for k, v in zip(cols, [f_prev, t_prev, atc_accuracy])})
+
+    series = avg_groupby_distribution(results)
+    return pd.DataFrame(
+        series,
+        columns=pd.MultiIndex.from_tuples(cols),
+    )
 
 
 def atc_ne(
     c_model: BaseEstimator,
     validation: LabelledCollection,
-    test: LabelledCollection,
+    protocol: AbstractStochasticSeededProtocol,
     predict_method="predict_proba",
 ):
     c_model_predict = getattr(c_model, predict_method)
@@ -59,22 +101,33 @@ def atc_ne(
     ## Load ID validation data probs and labels
     val_probs, val_labels = c_model_predict(validation.X), validation.y
 
-    ## Load OOD test data probs
-    test_probs = c_model_predict(test.X)
-
     ## score function, e.g., negative entropy or argmax confidence
     val_scores = atc.get_entropy(val_probs)
     val_preds = np.argmax(val_probs, axis=-1)
-
-    test_scores = atc.get_entropy(test_probs)
-
     _, atc_thres = atc.find_ATC_threshold(val_scores, val_labels == val_preds)
-    atc_accuracy = atc.get_ATC_acc(atc_thres, test_scores)
-    return {
-        "true_acc": 100 * np.mean(np.argmax(test_probs, axis=-1) == test.y),
-        "pred_acc": atc_accuracy,
-    }
+    # ensure that the protocol returns a LabelledCollection for each iteration
+    protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
+
+    cols = [
+        ("base", "F"),
+        ("base", "T"),
+        ("atc_ne", "accuracy"),
+    ]
+    results = []
+    for test in protocol():
+        ## Load OOD test data probs
+        test_probs = c_model_predict(test.X)
+        test_scores = atc.get_entropy(test_probs)
+        atc_accuracy = 1.0 - (atc.get_ATC_acc(atc_thres, test_scores) / 100.0)
+        [f_prev, t_prev] = test.prevalence()
+        results.append({k: v for k, v in zip(cols, [f_prev, t_prev, atc_accuracy])})
+
+    series = avg_groupby_distribution(results)
+    return pd.DataFrame(
+        series,
+        columns=pd.MultiIndex.from_tuples(cols),
+    )
 
 
 def trust_score(
@@ -96,70 +149,148 @@ def doc_feat(
     c_model: BaseEstimator,
     validation: LabelledCollection,
-    test: LabelledCollection,
+    protocol: AbstractStochasticSeededProtocol,
     predict_method="predict_proba",
 ):
     c_model_predict = getattr(c_model, predict_method)
 
     val_probs, val_labels = c_model_predict(validation.X), validation.y
-    test_probs = c_model_predict(test.X)
     val_scores = np.max(val_probs, axis=-1)
-    test_scores = np.max(test_probs, axis=-1)
     val_preds = np.argmax(val_probs, axis=-1)
-
     v1acc = np.mean(val_preds == val_labels) * 100
-    return v1acc + doc.get_doc(val_scores, test_scores)
+
+    # ensure that the protocol returns a LabelledCollection for each iteration
+    protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
+
+    cols = [
+        ("base", "F"),
+        ("base", "T"),
+        ("doc_feat", "score"),
+    ]
+    results = []
+    for test in protocol():
+        test_probs = c_model_predict(test.X)
+        test_scores = np.max(test_probs, axis=-1)
+        score = 1.0 - ((v1acc + doc.get_doc(val_scores, test_scores)) / 100.0)
+        [f_prev, t_prev] = test.prevalence()
+        results.append({k: v for k, v in zip(cols, [f_prev, t_prev, score])})
+
+    series = avg_groupby_distribution(results)
+    return pd.DataFrame(
+        series,
+        columns=pd.MultiIndex.from_tuples(cols),
+    )
 
 
 def rca_score(
     c_model: BaseEstimator,
     validation: LabelledCollection,
-    test: LabelledCollection,
+    protocol: AbstractStochasticSeededProtocol,
    predict_method="predict",
 ):
     c_model_predict = getattr(c_model, predict_method)
-    test_pred = c_model_predict(test.X)
-    c_model2 = rca.clone_fit(test.X, test_pred)
-    c_model2_predict = getattr(c_model2, predict_method)
-
     val_pred1 = c_model_predict(validation.X)
-    val_pred2 = c_model2_predict(validation.X)
-    return rca.get_score(val_pred1, val_pred2, validation.y)
+    # ensure that the protocol returns a LabelledCollection for each iteration
+    protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
+
+    cols = [
+        ("base", "F"),
+        ("base", "T"),
+        ("rca", "score"),
+    ]
+    results = []
+    for test in protocol():
+        [f_prev, t_prev] = test.prevalence()
+        try:
+            test_pred = c_model_predict(test.X)
+            c_model2 = rca.clone_fit(c_model, test.X, test_pred)
+            c_model2_predict = getattr(c_model2, predict_method)
+            val_pred2 = c_model2_predict(validation.X)
+            rca_score = 1.0 - rca.get_score(val_pred1, val_pred2, validation.y)
+            results.append({k: v for k, v in zip(cols, [f_prev, t_prev, rca_score])})
+        except ValueError:
+            results.append({k: v for k, v in zip(cols, [f_prev, t_prev, float("nan")])})
+
+    series = avg_groupby_distribution(results)
+    return pd.DataFrame(
+        series,
+        columns=pd.MultiIndex.from_tuples(cols),
+    )
 
 
 def rca_star_score(
     c_model: BaseEstimator,
     validation: LabelledCollection,
-    test: LabelledCollection,
+    protocol: AbstractStochasticSeededProtocol,
     predict_method="predict",
 ):
     c_model_predict = getattr(c_model, predict_method)
     validation1, validation2 = validation.split_stratified(train_prop=0.5)
-    test_pred = c_model_predict(test.X)
     val1_pred = c_model_predict(validation1.X)
-    c_model1 = rca.clone_fit(validation1.X, val1_pred)
-    c_model2 = rca.clone_fit(test.X, test_pred)
+    c_model1 = rca.clone_fit(c_model, validation1.X, val1_pred)
     c_model1_predict = getattr(c_model1, predict_method)
-    c_model2_predict = getattr(c_model2, predict_method)
-
     val2_pred1 = c_model1_predict(validation2.X)
-    val2_pred2 = c_model2_predict(validation2.X)
-    return rca.get_score(val2_pred1, val2_pred2, validation2.y)
+    # ensure that the protocol returns a LabelledCollection for each iteration
+    protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
+
+    cols = [
+        ("base", "F"),
+        ("base", "T"),
+        ("rca*", "score"),
+    ]
+    results = []
+    for test in protocol():
+        [f_prev, t_prev] = test.prevalence()
+        try:
+            test_pred = c_model_predict(test.X)
+            c_model2 = rca.clone_fit(c_model, test.X, test_pred)
+            c_model2_predict = getattr(c_model2, predict_method)
+            val2_pred2 = c_model2_predict(validation2.X)
+            rca_star_score = 1.0 - rca.get_score(val2_pred1, val2_pred2, validation2.y)
+            results.append(
+                {k: v for k, v in zip(cols, [f_prev, t_prev, rca_star_score])}
+            )
+        except ValueError:
+            results.append({k: v for k, v in zip(cols, [f_prev, t_prev, float("nan")])})
+
+    series = avg_groupby_distribution(results)
+    return pd.DataFrame(
+        series,
+        columns=pd.MultiIndex.from_tuples(cols),
+    )
+
-
 def bbse_score(
     c_model: BaseEstimator,
     validation: LabelledCollection,
-    test: LabelledCollection,
+    protocol: AbstractStochasticSeededProtocol,
     predict_method="predict_proba",
 ):
-
     c_model_predict = getattr(c_model, predict_method)
     val_probs, val_labels = c_model_predict(validation.X), validation.y
-    test_probs = c_model_predict(test.X)
-    wt = bbse.estimate_labelshift_ratio(val_labels, val_probs, test_probs, 2)
-    estim_prev = bbse.estimate_target_dist(wt, val_labels, 2)
-    true_prev = test.prevalence()
-    return qp.error.ae(true_prev, estim_prev)
+    # ensure that the protocol returns a LabelledCollection for each iteration
+    protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
+
+    cols = [
+        ("base", "F"),
+        ("base", "T"),
+        ("bbse", "score"),
+    ]
+    results = []
+    for test in protocol():
+        test_probs = c_model_predict(test.X)
+        wt = bbse.estimate_labelshift_ratio(val_labels, val_probs, test_probs, 2)
+        estim_prev = bbse.estimate_target_dist(wt, val_labels, 2)[1]
+        true_prev = test.prevalence()
+        [f_prev, t_prev] = true_prev
+        acc = qp.error.ae(true_prev, estim_prev)
+        results.append({k: v for k, v in zip(cols, [f_prev, t_prev, acc])})
+
+    series = avg_groupby_distribution(results)
+    return pd.DataFrame(
+        series,
+        columns=pd.MultiIndex.from_tuples(cols),
+    )
r[("base", "0")], r[("base", "1")] = _bprev(gs[0]) + r[("base", "F")], r[("base", "T")] = _bprev(gs[0]) - for pn in itertools.product(_prev_col_0, _prev_col_1): + for pn in [(n1, n2) for ((n1, n2), _) in gs[0].items() if n1 != "base"]: r[pn] = stats.mean(map(lambda s: s[pn], gs)) - r = _normalize_prev(r, "true") - r = _normalize_prev(r, "estim") + r = _normalize_prev(r) - for en in itertools.product(_err_col_0, error_names): + for en in itertools.product(["errors"], error_names): r[en] = stats.mean(map(lambda s: s[en], gs)) r_lst.append(r) return r_lst + def evaluation_report( estimator: AccuracyEstimator, protocol: AbstractStochasticSeededProtocol, error_metrics: Iterable[Union[str, Callable]] = "all", aggregate: bool = True, ): + def _report_columns(err_names): + base_cols = list(itertools.product(["base"], ["F", "T"])) + prev_cols = list(itertools.product(["true", "estim"], ["TN", "FP", "FN", "TP"])) + err_cols = list(itertools.product(["errors"], err_names)) + return base_cols + prev_cols, err_cols + base_prevs, true_prevs, estim_prevs = estimate(estimator, protocol) if error_metrics == "all": @@ -114,20 +100,16 @@ def evaluation_report( error_cols = error_names.copy() if "f1" in error_cols: error_cols.remove("f1") - error_cols.extend(["f1_true", "f1_estim", "f1_dist"]) + error_cols.extend(["f1_true", "f1_estim"]) if "f1e" in error_cols: error_cols.remove("f1e") error_cols.extend(["f1e_true", "f1e_estim"]) # df_cols = ["base_prev", "true_prev", "estim_prev"] + error_names - df_cols = _report_columns(error_cols) + prev_cols, err_cols = _report_columns(error_cols) lst = [] for base_prev, true_prev, estim_prev in zip(base_prevs, true_prevs, estim_prevs): - prev_cols = list(itertools.product(_bprev_col_0, _bprev_col_1)) + list( - itertools.product(_prev_col_0, _prev_col_1) - ) - series = { k: v for (k, v) in zip( @@ -143,7 +125,6 @@ def evaluation_report( f1_true, f1_estim = error_metric(true_prev), error_metric(estim_prev) series[("errors", "f1_true")] = f1_true series[("errors", "f1_estim")] = f1_estim - series[("errors", "f1_dist")] = abs(f1_estim - f1_true) continue score = error_metric(true_prev, estim_prev) @@ -151,6 +132,10 @@ def evaluation_report( lst.append(series) - lst = _report_avg_groupby_distribution(lst, error_cols) if aggregate else lst - df = pd.DataFrame(lst, columns=df_cols) + lst = avg_groupby_distribution(lst, error_cols) if aggregate else lst + + df = pd.DataFrame( + lst, + columns=pd.MultiIndex.from_tuples(prev_cols + err_cols), + ) return df diff --git a/quacc/main.py b/quacc/main.py index bbb054e..879b3a5 100644 --- a/quacc/main.py +++ b/quacc/main.py @@ -4,12 +4,13 @@ from quapy.protocol import APP from sklearn.linear_model import LogisticRegression import quacc.evaluation as eval +import quacc.baseline as baseline from quacc.estimator import ( BinaryQuantifierAccuracyEstimator, MulticlassAccuracyEstimator, ) -from quacc.dataset import get_imdb +from quacc.dataset import get_imdb, get_spambase qp.environ["SAMPLE_SIZE"] = 100 @@ -20,7 +21,7 @@ dataset_name = "imdb" def estimate_multiclass(): print(dataset_name) - train, validation, test = get_imdb(dataset_name) + train, validation, test = get_imdb() model = LogisticRegression() @@ -59,7 +60,7 @@ def estimate_multiclass(): def estimate_binary(): print(dataset_name) - train, validation, test = get_imdb(dataset_name) + train, validation, test = get_imdb() model = LogisticRegression() @@ -97,6 +98,39 @@ def estimate_binary(): # print(df.to_html()) print() +def estimate_comparison(): + train, validation, test = 
diff --git a/quacc/main.py b/quacc/main.py
index bbb054e..879b3a5 100644
--- a/quacc/main.py
+++ b/quacc/main.py
@@ -4,12 +4,13 @@ from quapy.protocol import APP
 from sklearn.linear_model import LogisticRegression
 
 import quacc.evaluation as eval
+import quacc.baseline as baseline
 from quacc.estimator import (
     BinaryQuantifierAccuracyEstimator,
     MulticlassAccuracyEstimator,
 )
-from quacc.dataset import get_imdb
+from quacc.dataset import get_imdb, get_spambase
 
 qp.environ["SAMPLE_SIZE"] = 100
 
@@ -20,7 +21,7 @@ dataset_name = "imdb"
 def estimate_multiclass():
     print(dataset_name)
 
-    train, validation, test = get_imdb(dataset_name)
+    train, validation, test = get_imdb()
 
     model = LogisticRegression()
 
@@ -59,7 +60,7 @@ def estimate_multiclass():
 def estimate_binary():
     print(dataset_name)
 
-    train, validation, test = get_imdb(dataset_name)
+    train, validation, test = get_imdb()
 
     model = LogisticRegression()
 
@@ -97,6 +98,39 @@ def estimate_binary():
     # print(df.to_html())
     print()
 
+def estimate_comparison():
+    train, validation, test = get_spambase()
+    model = LogisticRegression()
+    model.fit(*train.Xy)
+
+    n_prevalences = 21
+    repreats = 1000
+    protocol = APP(test, n_prevalences=n_prevalences, repeats=repreats)
+
+    estimator = BinaryQuantifierAccuracyEstimator(model)
+    estimator.fit(validation)
+    df = eval.evaluation_report(estimator, protocol)
+
+    df_index = [("base", "F"), ("base", "T")]
+
+    atc_mc_df = baseline.atc_mc(model, validation, protocol)
+    atc_ne_df = baseline.atc_ne(model, validation, protocol)
+    doc_feat_df = baseline.doc_feat(model, validation, protocol)
+    rca_df = baseline.rca_score(model, validation, protocol)
+    rca_star_df = baseline.rca_star_score(model, validation, protocol)
+    bbse_df = baseline.bbse_score(model, validation, protocol)
+
+    df = df.join(atc_mc_df.set_index(df_index), on=df_index)
+    df = df.join(atc_ne_df.set_index(df_index), on=df_index)
+    df = df.join(doc_feat_df.set_index(df_index), on=df_index)
+    df = df.join(rca_df.set_index(df_index), on=df_index)
+    df = df.join(rca_star_df.set_index(df_index), on=df_index)
+    df = df.join(bbse_df.set_index(df_index), on=df_index)
+
+    print(df.to_string())
+
+def main():
+    estimate_comparison()
 
 if __name__ == "__main__":
-    estimate_binary()
+    main()
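
estimate_comparison evaluates the quantifier-based estimator and every baseline over the same APP protocol and aligns their per-prevalence averages by joining on the base-prevalence columns. A minimal sketch of that join with toy frames (column labels follow the patch; the values are made up):

# Toy illustration of the set_index/join used in estimate_comparison.
import pandas as pd

df_index = [("base", "F"), ("base", "T")]

report = pd.DataFrame(
    [[0.5, 0.5, 0.12], [0.9, 0.1, 0.30]],
    columns=pd.MultiIndex.from_tuples(df_index + [("errors", "mae")]),
)
atc_mc = pd.DataFrame(
    [[0.5, 0.5, 0.15], [0.9, 0.1, 0.28]],
    columns=pd.MultiIndex.from_tuples(df_index + [("atc_mc", "accuracy")]),
)

# Rows are matched on the shared base prevalence, mirroring the joins in main.py.
merged = report.join(atc_mc.set_index(df_index), on=df_index)
print(merged.to_string())
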