diff --git a/elsahar19_rca/__pycache__/rca.cpython-311.pyc b/elsahar19_rca/__pycache__/rca.cpython-311.pyc
deleted file mode 100644
index aeb753d..0000000
Binary files a/elsahar19_rca/__pycache__/rca.cpython-311.pyc and /dev/null differ
diff --git a/garg22_ATC/__pycache__/ATC_helper.cpython-311.pyc b/garg22_ATC/__pycache__/ATC_helper.cpython-311.pyc
deleted file mode 100644
index c8e0c9d..0000000
Binary files a/garg22_ATC/__pycache__/ATC_helper.cpython-311.pyc and /dev/null differ
diff --git a/guillory21_doc/__pycache__/doc.cpython-311.pyc b/guillory21_doc/__pycache__/doc.cpython-311.pyc
deleted file mode 100644
index ea09f92..0000000
Binary files a/guillory21_doc/__pycache__/doc.cpython-311.pyc and /dev/null differ
diff --git a/jiang18_trustscore/__pycache__/trustscore.cpython-311.pyc b/jiang18_trustscore/__pycache__/trustscore.cpython-311.pyc
deleted file mode 100644
index e787657..0000000
Binary files a/jiang18_trustscore/__pycache__/trustscore.cpython-311.pyc and /dev/null differ
diff --git a/lipton_bbse/__pycache__/labelshift.cpython-311.pyc b/lipton_bbse/__pycache__/labelshift.cpython-311.pyc
deleted file mode 100644
index 10decf8..0000000
Binary files a/lipton_bbse/__pycache__/labelshift.cpython-311.pyc and /dev/null differ
diff --git a/quacc/baseline.py b/quacc/baseline.py
index b96f1bc..9a53db7 100644
--- a/quacc/baseline.py
+++ b/quacc/baseline.py
@@ -201,13 +201,13 @@ def rca_score(
     ]
     results = []
     for test in protocol():
-        [f_prev, t_prev] = test.prevalence()
-        try:
+        try:
+            [f_prev, t_prev] = test.prevalence()
             test_pred = c_model_predict(test.X)
             c_model2 = rca.clone_fit(c_model, test.X, test_pred)
             c_model2_predict = getattr(c_model2, predict_method)
             val_pred2 = c_model2_predict(validation.X)
-            rca_score = 1.0 - rca.get_score(val_pred1, val_pred2, validation.y)
+            rca_score = rca.get_score(val_pred1, val_pred2, validation.y)
             results.append({k: v for k, v in zip(cols, [f_prev, t_prev, rca_score])})
         except ValueError:
             results.append({k: v for k, v in zip(cols, [f_prev, t_prev, float("nan")])})
@@ -248,7 +248,7 @@ def rca_star_score(
             c_model2 = rca.clone_fit(c_model, test.X, test_pred)
             c_model2_predict = getattr(c_model2, predict_method)
             val2_pred2 = c_model2_predict(validation2.X)
-            rca_star_score = 1.0 - rca.get_score(val2_pred1, val2_pred2, validation2.y)
+            rca_star_score = rca.get_score(val2_pred1, val2_pred2, validation2.y)
             results.append(
                 {k: v for k, v in zip(cols, [f_prev, t_prev, rca_star_score])}
             )
diff --git a/quacc/dataset.py b/quacc/dataset.py
index d009a78..8098966 100644
--- a/quacc/dataset.py
+++ b/quacc/dataset.py
@@ -1,3 +1,4 @@
+from operator import index
 from typing import Tuple
 import numpy as np
 from quapy.data.base import LabelledCollection
@@ -18,11 +19,29 @@ def get_spambase() -> Tuple[LabelledCollection]:
     train, validation = train.split_stratified(train_prop=TRAIN_VAL_PROP)
     return train, validation, test
 
+# >>> fetch_rcv1().target_names
+# array(['C11', 'C12', 'C13', 'C14', 'C15', 'C151', 'C1511', 'C152', 'C16',
+#        'C17', 'C171', 'C172', 'C173', 'C174', 'C18', 'C181', 'C182',
+#        'C183', 'C21', 'C22', 'C23', 'C24', 'C31', 'C311', 'C312', 'C313',
+#        'C32', 'C33', 'C331', 'C34', 'C41', 'C411', 'C42', 'CCAT', 'E11',
+#        'E12', 'E121', 'E13', 'E131', 'E132', 'E14', 'E141', 'E142',
+#        'E143', 'E21', 'E211', 'E212', 'E31', 'E311', 'E312', 'E313',
+#        'E41', 'E411', 'E51', 'E511', 'E512', 'E513', 'E61', 'E71', 'ECAT',
+#        'G15', 'G151', 'G152', 'G153', 'G154', 'G155', 'G156', 'G157',
+#        'G158', 'G159', 'GCAT', 'GCRIM', 'GDEF', 'GDIP', 'GDIS', 'GENT',
+#        'GENV', 'GFAS', 'GHEA', 'GJOB', 'GMIL', 'GOBIT', 'GODD', 'GPOL',
+#        'GPRO', 'GREL', 'GSCI', 'GSPO', 'GTOUR', 'GVIO', 'GVOTE', 'GWEA',
+#        'GWELF', 'M11', 'M12', 'M13', 'M131', 'M132', 'M14', 'M141',
+#        'M142', 'M143', 'MCAT'], dtype=object)
 
-def get_rcv1(sample_size=100):
+def get_rcv1(target:str):
+    sample_size = qp.environ["SAMPLE_SIZE"]
     n_train = 23149
     dataset = fetch_rcv1()
 
+    if target not in dataset.target_names:
+        raise ValueError("Invalid target")
+
     def dataset_split(data, labels, classes=[0, 1]) -> Tuple[LabelledCollection]:
         all_train_d, test_d = data[:n_train, :], data[n_train:, :]
         all_train_l, test_l = labels[:n_train], labels[n_train:]
@@ -31,14 +50,13 @@ def get_rcv1(sample_size=100):
         train, validation = all_train.split_stratified(train_prop=TRAIN_VAL_PROP)
         return train, validation, test
 
-    target_labels = [
-        (target, dataset.target[:, ind].toarray().flatten())
-        for (ind, target) in enumerate(dataset.target_names)
-    ]
-    filtered_target_labels = filter(
-        lambda _, labels: np.sum(labels[n_train:]) >= sample_size, target_labels
-    )
-    return {
-        target: dataset_split(dataset.data, labels, classes=[0, 1])
-        for (target, labels) in filtered_target_labels
-    }
+    target_index = np.where(dataset.target_names == target)[0]
+    target_labels = dataset.target[:, target_index].toarray().flatten()
+
+    if np.sum(target_labels[n_train:]) < sample_size:
+        raise ValueError("Target has too few positive samples")
+
+    d = dataset_split(dataset.data, target_labels, classes=[0, 1])
+
+    return d
+
diff --git a/quacc/error.py b/quacc/error.py
index 90e5701..dfd19bd 100644
--- a/quacc/error.py
+++ b/quacc/error.py
@@ -8,18 +8,28 @@ def from_name(err_name):
     else:
         return qp.error.from_name(err_name)
 
+# def f1(prev):
+#     # https://github.com/dice-group/gerbil/wiki/Precision,-Recall-and-F1-measure
+#     if prev[0] == 0 and prev[1] == 0 and prev[2] == 0:
+#         return 1.0
+#     elif prev[0] == 0 and prev[1] > 0 and prev[2] == 0:
+#         return 0.0
+#     elif prev[0] == 0 and prev[1] == 0 and prev[2] > 0:
+#         return float('NaN')
+#     else:
+#         recall = prev[0] / (prev[0] + prev[1])
+#         precision = prev[0] / (prev[0] + prev[2])
+#         return 2 * (precision * recall) / (precision + recall)
+
 def f1(prev):
-    # https://github.com/dice-group/gerbil/wiki/Precision,-Recall-and-F1-measure
-    if prev[0] == 0 and prev[1] == 0 and prev[2] == 0:
+    den = (2*prev[3]) + prev[1] + prev[2]
+    if den == 0:
         return 1.0
-    elif prev[0] == 0 and prev[1] > 0 and prev[2] == 0:
-        return 0.0
-    elif prev[0] == 0 and prev[1] == 0 and prev[2] > 0:
-        return float('NaN')
     else:
-        recall = prev[0] / (prev[0] + prev[1])
-        precision = prev[0] / (prev[0] + prev[2])
-        return 2 * (precision * recall) / (precision + recall)
+        return (2*prev[3])/den
 
 def f1e(prev):
     return 1 - f1(prev)
+
+def mae(prev):
+    return (prev[1] + prev[2]) / sum(prev)
\ No newline at end of file
diff --git a/quacc/evaluation.py b/quacc/evaluation.py
index b07b4f2..a58f86c 100644
--- a/quacc/evaluation.py
+++ b/quacc/evaluation.py
@@ -80,55 +80,63 @@ def evaluation_report(
     protocol: AbstractStochasticSeededProtocol,
     error_metrics: Iterable[Union[str, Callable]] = "all",
     aggregate: bool = True,
+    prevalence: bool = True,
 ):
     def _report_columns(err_names):
         base_cols = list(itertools.product(["base"], ["F", "T"]))
         prev_cols = list(itertools.product(["true", "estim"], ["TN", "FP", "FN", "TP"]))
         err_cols = list(itertools.product(["errors"], err_names))
-        return base_cols + prev_cols, err_cols
+        return base_cols, prev_cols, err_cols
 
     base_prevs, true_prevs, estim_prevs = estimate(estimator, protocol)
 
     if error_metrics == "all":
-        error_metrics = ["ae", "f1"]
+        error_metrics = ["mae", "f1"]
     error_funcs = [
         error.from_name(e) if isinstance(e, str) else e for e in error_metrics
    ]
 
     assert all(hasattr(e, "__call__") for e in error_funcs), "invalid error function"
     error_names = [e.__name__ for e in error_funcs]
-    error_cols = error_names.copy()
-    if "f1" in error_cols:
-        error_cols.remove("f1")
-        error_cols.extend(["f1_true", "f1_estim"])
-    if "f1e" in error_cols:
-        error_cols.remove("f1e")
-        error_cols.extend(["f1e_true", "f1e_estim"])
+    error_cols = []
+    for err in error_names:
+        if err == "mae":
+            error_cols.extend(["mae_estim", "mae_true"])
+        elif err == "f1":
+            error_cols.extend(["f1_estim", "f1_true"])
+        elif err == "f1e":
+            error_cols.extend(["f1e_estim", "f1e_true"])
+        else:
+            error_cols.append(err)
 
     # df_cols = ["base_prev", "true_prev", "estim_prev"] + error_names
-    prev_cols, err_cols = _report_columns(error_cols)
+    base_cols, prev_cols, err_cols = _report_columns(error_cols)
 
     lst = []
 
     for base_prev, true_prev, estim_prev in zip(base_prevs, true_prevs, estim_prevs):
-        series = {
-            k: v
-            for (k, v) in zip(
-                prev_cols, np.concatenate((base_prev, true_prev, estim_prev), axis=0)
-            )
-        }
-        for error_name, error_metric in zip(error_names, error_funcs):
-            if error_name == "f1e":
-                series[("errors", "f1e_true")] = error_metric(true_prev)
-                series[("errors", "f1e_estim")] = error_metric(estim_prev)
-                continue
-            if error_name == "f1":
-                f1_true, f1_estim = error_metric(true_prev), error_metric(estim_prev)
-                series[("errors", "f1_true")] = f1_true
-                series[("errors", "f1_estim")] = f1_estim
-                continue
+        if prevalence:
+            series = {
+                k: v
+                for (k, v) in zip(
+                    base_cols + prev_cols,
+                    np.concatenate((base_prev, true_prev, estim_prev), axis=0),
+                )
+            }
+            df_cols = base_cols + prev_cols + err_cols
+        else:
+            series = {k: v for (k, v) in zip(base_cols, base_prev)}
+            df_cols = base_cols + err_cols
 
-            score = error_metric(true_prev, estim_prev)
-            series[("errors", error_name)] = score
+        for err in error_cols:
+            error_funcs = {
+                "mae_true": lambda: error.mae(true_prev),
+                "mae_estim": lambda: error.mae(estim_prev),
+                "f1_true": lambda: error.f1(true_prev),
+                "f1_estim": lambda: error.f1(estim_prev),
+                "f1e_true": lambda: error.f1e(true_prev),
+                "f1e_estim": lambda: error.f1e(estim_prev),
+            }
+            series[("errors", err)] = error_funcs[err]()
 
         lst.append(series)
 
@@ -136,6 +144,6 @@ def evaluation_report(
 
     df = pd.DataFrame(
         lst,
-        columns=pd.MultiIndex.from_tuples(prev_cols + err_cols),
+        columns=pd.MultiIndex.from_tuples(df_cols),
     )
     return df
diff --git a/quacc/main.py b/quacc/main.py
index 879b3a5..d58a65e 100644
--- a/quacc/main.py
+++ b/quacc/main.py
@@ -2,6 +2,7 @@ import pandas as pd
 import quapy as qp
 from quapy.protocol import APP
 from sklearn.linear_model import LogisticRegression
+from quacc import utils
 
 import quacc.evaluation as eval
 import quacc.baseline as baseline
@@ -10,7 +11,7 @@ from quacc.estimator import (
     MulticlassAccuracyEstimator,
 )
 
-from quacc.dataset import get_imdb, get_spambase
+from quacc.dataset import get_imdb, get_rcv1, get_spambase
 
 qp.environ["SAMPLE_SIZE"] = 100
 
@@ -109,25 +110,21 @@ def estimate_comparison():
     estimator = BinaryQuantifierAccuracyEstimator(model)
     estimator.fit(validation)
-    df = eval.evaluation_report(estimator, protocol)
+    df = eval.evaluation_report(estimator, protocol, prevalence=False)
 
-    df_index = [("base", "F"), ("base", "T")]
+    df = utils.combine_dataframes(
+        baseline.atc_mc(model, validation, protocol),
+        baseline.atc_ne(model, validation, protocol),
+        baseline.doc_feat(model, validation, protocol),
+        baseline.rca_score(model, validation, protocol),
+        baseline.rca_star_score(model, validation, protocol),
+        baseline.bbse_score(model, validation, protocol),
+        df,
+        df_index=[("base", "F"), ("base", "T")]
+    )
 
-    atc_mc_df = baseline.atc_mc(model, validation, protocol)
-    atc_ne_df = baseline.atc_ne(model, validation, protocol)
-    doc_feat_df = baseline.doc_feat(model, validation, protocol)
-    rca_df = baseline.rca_score(model, validation, protocol)
-    rca_star_df = baseline.rca_star_score(model, validation, protocol)
-    bbse_df = baseline.bbse_score(model, validation, protocol)
-
-    df = df.join(atc_mc_df.set_index(df_index), on=df_index)
-    df = df.join(atc_ne_df.set_index(df_index), on=df_index)
-    df = df.join(doc_feat_df.set_index(df_index), on=df_index)
-    df = df.join(rca_df.set_index(df_index), on=df_index)
-    df = df.join(rca_star_df.set_index(df_index), on=df_index)
-    df = df.join(bbse_df.set_index(df_index), on=df_index)
-
-    print(df.to_string())
+    print(df.to_latex(float_format="{:.4f}".format))
+    print(utils.avg_group_report(df).to_latex(float_format="{:.4f}".format))
 
 
 def main():
     estimate_comparison()
diff --git a/quacc/utils.py b/quacc/utils.py
new file mode 100644
index 0000000..6da5b39
--- /dev/null
+++ b/quacc/utils.py
@@ -0,0 +1,31 @@
+
+import functools
+import pandas as pd
+
+def combine_dataframes(*dfs, df_index=[]) -> pd.DataFrame:
+    if len(dfs) < 1:
+        raise ValueError
+    if len(dfs) == 1:
+        return dfs[0]
+    df = dfs[0]
+    for ndf in dfs[1:]:
+        df = df.join(ndf.set_index(df_index), on=df_index)
+
+    return df
+
+
+def avg_group_report(df: pd.DataFrame) -> pd.DataFrame:
+    def _reduce_func(s1, s2):
+        return {
+            (n1, n2): v + s2[(n1, n2)] for ((n1, n2), v) in s1.items()
+        }
+
+    lst = df.to_dict(orient="records")[1:-1]
+    summed_series = functools.reduce(_reduce_func, lst)
+    idx = df.columns.drop([("base", "T"), ("base", "F")])
+    avg_report = {
+        (n1, n2): (v / len(lst))
+        for ((n1, n2), v) in summed_series.items()
+        if n1 != "base"
+    }
+    return pd.DataFrame([avg_report], columns=idx)
\ No newline at end of file
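
A standalone sketch of the reworked metrics in quacc/error.py, reimplemented here only for illustration. It assumes the 4-cell prevalence vector is ordered (TN, FP, FN, TP), matching the prev_cols built in quacc/evaluation.py; the example numbers are made up.

    def f1(prev):
        # F1 = 2*TP / (2*TP + FP + FN); the patched version returns 1.0 when the denominator is 0
        den = 2 * prev[3] + prev[1] + prev[2]
        return 1.0 if den == 0 else 2 * prev[3] / den

    def f1e(prev):
        return 1 - f1(prev)

    def mae(prev):
        # (FP + FN) / total: the misclassified mass of the vector
        return (prev[1] + prev[2]) / sum(prev)

    # A batch with 50% TN, 5% FP, 10% FN, 35% TP
    prev = [0.50, 0.05, 0.10, 0.35]
    print(f1(prev))   # 0.70 / 0.85 = 0.8235...
    print(f1e(prev))  # 0.1764...
    print(mae(prev))  # 0.15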
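
Similarly, a sketch of how the new quacc/utils.py helpers compose, mirroring estimate_comparison in main.py. It assumes the patched package is importable; the ("errors", ...) column labels and all values are invented for the example, only the ("base", F/T) key columns follow the code above.

    import pandas as pd
    from quacc.utils import avg_group_report, combine_dataframes

    df_index = [("base", "F"), ("base", "T")]

    # Two toy reports that share the ("base", F/T) key columns, standing in for
    # eval.evaluation_report and one of the baseline.* outputs.
    df_a = pd.DataFrame(
        [[0.9, 0.1, 0.05], [0.7, 0.3, 0.04], [0.5, 0.5, 0.02], [0.3, 0.7, 0.03]],
        columns=pd.MultiIndex.from_tuples(df_index + [("errors", "mae_estim")]),
    )
    df_b = pd.DataFrame(
        [[0.9, 0.1, 0.06], [0.7, 0.3, 0.05], [0.5, 0.5, 0.03], [0.3, 0.7, 0.04]],
        columns=pd.MultiIndex.from_tuples(df_index + [("errors", "atc_mc")]),
    )

    # Joins every frame after the first onto the ("base", F/T) columns.
    df = combine_dataframes(df_a, df_b, df_index=df_index)
    print(df.to_latex(float_format="{:.4f}".format))

    # Per-column average of the non-"base" columns; note that the implementation
    # averages df.to_dict(orient="records")[1:-1], i.e. the first and last rows
    # are left out.
    print(avg_group_report(df).to_latex(float_format="{:.4f}".format))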