From d906502c29239622ce2131dc6d5294d6a7fbe46b Mon Sep 17 00:00:00 2001 From: Lorenzo Volpi Date: Fri, 20 Oct 2023 23:36:05 +0200 Subject: [PATCH] Dataset refactored, training sampling added --- .gitignore | 3 +- TODO.html | 55 ++++++++++ TODO.md | 14 ++- quacc/dataset.py | 149 ++++++++++++++++++++------- quacc/environ.py | 31 ++++++ quacc/evaluation/baseline.py | 21 ++-- quacc/evaluation/comp.py | 91 +++++++++++++++++ quacc/evaluation/method.py | 98 ++++-------------- quacc/evaluation/report.py | 189 ++++++++++++++++------------------- quacc/main.py | 37 ++----- quacc/utils.py | 15 +-- 11 files changed, 426 insertions(+), 277 deletions(-) create mode 100644 TODO.html create mode 100644 quacc/environ.py create mode 100644 quacc/evaluation/comp.py diff --git a/.gitignore b/.gitignore index 1ae9719..f450568 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,5 @@ lipton_bbse/__pycache__/* elsahar19_rca/__pycache__/* *.coverage .coverage -scp_sync.py \ No newline at end of file +scp_sync.py +out/* \ No newline at end of file diff --git a/TODO.html b/TODO.html new file mode 100644 index 0000000..ddfdc17 --- /dev/null +++ b/TODO.html @@ -0,0 +1,55 @@ + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/TODO.md b/TODO.md index a3a7321..2f6d846 100644 --- a/TODO.md +++ b/TODO.md @@ -1,8 +1,6 @@ -- aggiungere media tabelle -- plot - - 3 tipi (vedi appunti + garg) -- sistemare kfcv baseline -- aggiungere metodo con CC oltre SLD -- prendere classe più popolosa di rcv1, togliere negativi fino a raggiungere 50/50 - poi fare subsampling con 9 training prvalences (da 0.1-0.9 a 0.9-0.1) -- variare parametro recalibration in SLD \ No newline at end of file +- [ ] aggiungere media tabelle +- [ ] plot; 3 tipi (appunti + email + garg) +- [ ] sistemare kfcv baseline +- [ ] aggiungere metodo con CC oltre SLD +- [x] prendere classe più popolosa di rcv1, togliere negativi fino a raggiungere 50/50; poi fare subsampling con 9 training prvalences (da 0.1-0.9 a 0.9-0.1) +- [ ] variare parametro recalibration in SLD \ No newline at end of file diff --git a/quacc/dataset.py b/quacc/dataset.py index a4e1735..c63552f 100644 --- a/quacc/dataset.py +++ b/quacc/dataset.py @@ -1,26 +1,102 @@ -from typing import Tuple +import math +from typing import List + import numpy as np -from quapy.data.base import LabelledCollection import quapy as qp +from quapy.data.base import LabelledCollection from sklearn.conftest import fetch_rcv1 TRAIN_VAL_PROP = 0.5 -def get_imdb(**kwargs) -> Tuple[LabelledCollection]: - train, test = qp.datasets.fetch_reviews("imdb", tfidf=True).train_test - train, validation = train.split_stratified( - train_prop=TRAIN_VAL_PROP, random_state=0 - ) - return train, validation, test +class DatasetSample: + def __init__( + self, + train: LabelledCollection, + validation: LabelledCollection, + test: LabelledCollection, + ): + self.train = train + self.validation = validation + self.test = test + + @property + def train_prev(self): + return self.train.prevalence() + + @property + def validation_prev(self): + return self.validation.prevalence() + + @property + def prevs(self): + return {"train": self.train_prev, "validation": self.validation_prev} -def get_spambase(**kwargs) -> Tuple[LabelledCollection]: - train, test = qp.datasets.fetch_UCIDataset("spambase", verbose=False).train_test - train, validation = train.split_stratified( - train_prop=TRAIN_VAL_PROP, random_state=0 - ) - return train, validation, test +class Dataset: + def __init__(self, name, n_prevalences=9, target=None): + 
self._name = name + self._target = target + self.n_prevs = n_prevalences + + def __spambase(self): + return qp.datasets.fetch_reviews("imdb", tfidf=True).train_test + + def __imdb(self): + return qp.datasets.fetch_UCIDataset("spambase", verbose=False).train_test + + def __rcv1(self): + n_train = 23149 + available_targets = ["CCAT", "GCAT", "MCAT"] + + if self._target is None or self._target not in available_targets: + raise ValueError("Invalid target") + + dataset = fetch_rcv1() + target_index = np.where(dataset.target_names == self._target)[0] + all_train_d, test_d = dataset.data[:n_train, :], dataset.data[n_train:, :] + labels = dataset.target[:, target_index].toarray().flatten() + all_train_l, test_l = labels[:n_train], labels[n_train:] + all_train = LabelledCollection(all_train_d, all_train_l, classes=[0, 1]) + test = LabelledCollection(test_d, test_l, classes=[0, 1]) + + return all_train, test + + def get(self) -> List[DatasetSample]: + all_train, test = { + "spambase": self.__spambase, + "imdb": self.__imdb, + "rcv1": self.__rcv1, + }[self._name]() + + # resample all_train set to have (0.5, 0.5) prevalence + at_positives = np.sum(all_train.y) + all_train = all_train.sampling( + min(at_positives, len(all_train) - at_positives) * 2, 0.5, random_state=0 + ) + + # sample prevalences + prevalences = np.linspace(0.0, 1.0, num=self.n_prevs + 1, endpoint=False)[1:] + at_size = min(math.floor(len(all_train) * 0.5 / p) for p in prevalences) + datasets = [] + for p in prevalences: + all_train_sampled = all_train.sampling(at_size, p, random_state=0) + train, validation = all_train_sampled.split_stratified( + train_prop=TRAIN_VAL_PROP, random_state=0 + ) + datasets.append(DatasetSample(train, validation, test)) + + return datasets + + def __call__(self): + return self.get() + + @property + def name(self): + if self._name == "rcv1": + return f"{self._name}_{self._target}" + else: + return self._name # >>> fetch_rcv1().target_names @@ -39,33 +115,30 @@ def get_spambase(**kwargs) -> Tuple[LabelledCollection]: # 'M142', 'M143', 'MCAT'], dtype=object) -def get_rcv1(target = "default", **kwargs): - sample_size = qp.environ["SAMPLE_SIZE"] - n_train = 23149 +def rcv1_info(): dataset = fetch_rcv1() + n_train = 23149 - if target == "default": - target = "C12" - - if target not in dataset.target_names: - raise ValueError("Invalid target") - - def dataset_split(data, labels, classes=[0, 1]) -> Tuple[LabelledCollection]: - all_train_d, test_d = data[:n_train, :], data[n_train:, :] - all_train_l, test_l = labels[:n_train], labels[n_train:] - all_train = LabelledCollection(all_train_d, all_train_l, classes=classes) - test = LabelledCollection(test_d, test_l, classes=classes) - train, validation = all_train.split_stratified( - train_prop=TRAIN_VAL_PROP, random_state=0 + targets = [] + for target in range(103): + train_t_prev = np.average(dataset.target[:n_train, target].toarray().flatten()) + test_t_prev = np.average(dataset.target[n_train:, target].toarray().flatten()) + targets.append( + ( + dataset.target_names[target], + { + "train": (1.0 - train_t_prev, train_t_prev), + "test": (1.0 - test_t_prev, test_t_prev), + }, + ) ) - return train, validation, test - target_index = np.where(dataset.target_names == target)[0] - target_labels = dataset.target[:, target_index].toarray().flatten() + targets.sort(key=lambda t: t[1]["train"][1]) + for n, d in targets: + print(f"{n}:") + for k, (fp, tp) in d.items(): + print(f"\t{k}: {fp:.4f}, {tp:.4f}") - if np.sum(target_labels[n_train:]) < sample_size: - raise 
ValueError("Target has too few positive samples") - d = dataset_split(dataset.data, target_labels, classes=[0, 1]) - - return d +if __name__ == "__main__": + rcv1_info() diff --git a/quacc/environ.py b/quacc/environ.py new file mode 100644 index 0000000..cc2f13c --- /dev/null +++ b/quacc/environ.py @@ -0,0 +1,31 @@ +from pathlib import Path + +defalut_env = { + "DATASET_NAME": "rcv1", + "DATASET_TARGET": "CCAT", + "COMP_ESTIMATORS": [ + "OUR_BIN_SLD", + "OUR_MUL_SLD", + "KFCV", + "ATC_MC", + "ATC_NE", + "DOC_FEAT", + # "RCA", + # "RCA_STAR", + ], + "DATASET_N_PREVS": 9, + "OUT_DIR": Path("out"), + "PLOT_OUT_DIR": Path("out/plot"), + "PROTOCOL_N_PREVS": 21, + "PROTOCOL_REPEATS": 100, + "SAMPLE_SIZE": 1000, +} + + +class Environ: + def __init__(self, **kwargs): + for k, v in kwargs.items(): + self.__setattr__(k, v) + + +env = Environ(**defalut_env) diff --git a/quacc/evaluation/baseline.py b/quacc/evaluation/baseline.py index ce36045..f4e969d 100644 --- a/quacc/evaluation/baseline.py +++ b/quacc/evaluation/baseline.py @@ -1,29 +1,28 @@ from statistics import mean -from typing import Dict import numpy as np -from quapy.data import LabelledCollection -from sklearn.base import BaseEstimator -from sklearn.model_selection import cross_validate import sklearn.metrics as metrics +from quapy.data import LabelledCollection from quapy.protocol import ( AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol, ) - -from .report import EvaluationReport +from sklearn.base import BaseEstimator +from sklearn.model_selection import cross_validate import elsahar19_rca.rca as rca import garg22_ATC.ATC_helper as atc import guillory21_doc.doc as doc import jiang18_trustscore.trustscore as trustscore +from .report import EvaluationReport + def kfcv( - c_model: BaseEstimator, + c_model: BaseEstimator, validation: LabelledCollection, protocol: AbstractStochasticSeededProtocol, - predict_method="predict" + predict_method="predict", ): c_model_predict = getattr(c_model, predict_method) @@ -42,12 +41,12 @@ def kfcv( meta_f1 = abs(f1_score - metrics.f1_score(test.y, test_preds)) report.append_row( test.prevalence(), - acc_score=(1. 
- acc_score), + acc_score=(1.0 - acc_score), f1_score=f1_score, acc=meta_acc, f1=meta_f1, ) - + return report @@ -63,7 +62,7 @@ def reference( test_probs = c_model_predict(test.X) test_preds = np.argmax(test_probs, axis=-1) report.append_row( - test.prevalence(), + test.prevalence(), acc_score=(1 - metrics.accuracy_score(test.y, test_preds)), f1_score=metrics.f1_score(test.y, test_preds), ) diff --git a/quacc/evaluation/comp.py b/quacc/evaluation/comp.py new file mode 100644 index 0000000..ccc4e18 --- /dev/null +++ b/quacc/evaluation/comp.py @@ -0,0 +1,91 @@ +import multiprocessing +import time +from typing import List + +import pandas as pd +import quapy as qp +from quapy.protocol import APP +from sklearn.linear_model import LogisticRegression + +from quacc.dataset import Dataset +from quacc.environ import env +from quacc.evaluation import baseline, method +from quacc.evaluation.report import DatasetReport, EvaluationReport + +qp.environ["SAMPLE_SIZE"] = env.SAMPLE_SIZE + +pd.set_option("display.float_format", "{:.4f}".format) + + +class CompEstimator: + __dict = { + "OUR_BIN_SLD": method.evaluate_bin_sld, + "OUR_MUL_SLD": method.evaluate_mul_sld, + "KFCV": baseline.kfcv, + "ATC_MC": baseline.atc_mc, + "ATC_NE": baseline.atc_ne, + "DOC_FEAT": baseline.doc_feat, + "RCA": baseline.rca_score, + "RCA_STAR": baseline.rca_star_score, + } + + def __class_getitem__(cls, e: str | List[str]): + if isinstance(e, str): + try: + return cls.__dict[e] + except KeyError: + raise KeyError(f"Invalid estimator: estimator {e} does not exist") + elif isinstance(e, list): + try: + return [cls.__dict[est] for est in e] + except KeyError as ke: + raise KeyError( + f"Invalid estimator: estimator {ke.args[0]} does not exist" + ) + + +CE = CompEstimator + + +def fit_and_estimate(_estimate, train, validation, test): + model = LogisticRegression() + + model.fit(*train.Xy) + protocol = APP( + test, n_prevalences=env.PROTOCOL_N_PREVS, repeats=env.PROTOCOL_REPEATS + ) + start = time.time() + result = _estimate(model, validation, protocol) + end = time.time() + print(f"{_estimate.__name__}: {end-start:.2f}s") + + return { + "name": _estimate.__name__, + "result": result, + "time": end - start, + } + + +def evaluate_comparison( + dataset: Dataset, estimators=["OUR_BIN_SLD", "OUR_MUL_SLD"] +) -> EvaluationReport: + with multiprocessing.Pool(8) as pool: + dr = DatasetReport(dataset.name) + for d in dataset(): + print(f"train prev.: {d.train_prev}") + start = time.time() + tasks = [(estim, d.train, d.validation, d.test) for estim in CE[estimators]] + results = [pool.apply_async(fit_and_estimate, t) for t in tasks] + results = list(map(lambda r: r.get(), results)) + er = EvaluationReport.combine_reports( + *list(map(lambda r: r["result"], results)), name=dataset.name + ) + times = {r["name"]: r["time"] for r in results} + end = time.time() + times["tot"] = end - start + er.times = times + er.train_prevs = d.prevs + dr.add(er) + print() + + return dr diff --git a/quacc/evaluation/method.py b/quacc/evaluation/method.py index 0c69ba1..e42f203 100644 --- a/quacc/evaluation/method.py +++ b/quacc/evaluation/method.py @@ -1,20 +1,11 @@ -import multiprocessing -import time - -import pandas as pd -import quapy as qp from quapy.data import LabelledCollection from quapy.protocol import ( - APP, AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol, ) from sklearn.base import BaseEstimator -from sklearn.linear_model import LogisticRegression import quacc.error as error -import quacc.evaluation.baseline as baseline -from 
quacc.dataset import get_imdb, get_rcv1, get_spambase from quacc.evaluation.report import EvaluationReport from ..estimator import ( @@ -23,13 +14,6 @@ from ..estimator import ( MulticlassAccuracyEstimator, ) -qp.environ["SAMPLE_SIZE"] = 100 - -pd.set_option("display.float_format", "{:.4f}".format) - -n_prevalences = 21 -repreats = 100 - def estimate( estimator: AccuracyEstimator, @@ -61,11 +45,11 @@ def evaluation_report( acc_score = error.acc(estim_prev) f1_score = error.f1(estim_prev) report.append_row( - base_prev, - acc_score=1. - acc_score, - acc = abs(error.acc(true_prev) - acc_score), + base_prev, + acc_score=1.0 - acc_score, + acc=abs(error.acc(true_prev) - acc_score), f1_score=f1_score, - f1=abs(error.f1(true_prev) - f1_score) + f1=abs(error.f1(true_prev) - f1_score), ) return report @@ -77,7 +61,7 @@ def evaluate( protocol: AbstractStochasticSeededProtocol, method: str, ): - estimator : AccuracyEstimator = { + estimator: AccuracyEstimator = { "bin": BinaryQuantifierAccuracyEstimator, "mul": MulticlassAccuracyEstimator, }[method](c_model) @@ -85,65 +69,17 @@ def evaluate( return evaluation_report(estimator, protocol, method) -def evaluate_binary(model, validation, protocol): - return evaluate(model, validation, protocol, "bin") +def evaluate_bin_sld( + c_model: BaseEstimator, + validation: LabelledCollection, + protocol: AbstractStochasticSeededProtocol, +) -> EvaluationReport: + return evaluate(c_model, validation, protocol, "bin") -def evaluate_multiclass(model, validation, protocol): - return evaluate(model, validation, protocol, "mul") - - -def fit_and_estimate(_estimate, train, validation, test): - model = LogisticRegression() - - model.fit(*train.Xy) - protocol = APP(test, n_prevalences=n_prevalences, repeats=repreats) - start = time.time() - result = _estimate(model, validation, protocol) - end = time.time() - - return { - "name": _estimate.__name__, - "result": result, - "time": end - start, - } - - -def evaluate_comparison(dataset: str, **kwargs) -> EvaluationReport: - train, validation, test = { - "spambase": get_spambase, - "imdb": get_imdb, - "rcv1": get_rcv1, - }[dataset](**kwargs) - - for k,v in kwargs.items(): - print(k, ":", v) - - prevs = { - "train": train.prevalence(), - "validation": validation.prevalence(), - } - - start = time.time() - with multiprocessing.Pool(8) as pool: - estimators = [ - evaluate_binary, - evaluate_multiclass, - baseline.kfcv, - baseline.atc_mc, - baseline.atc_ne, - baseline.doc_feat, - baseline.rca_score, - baseline.rca_star_score, - ] - tasks = [(estim, train, validation, test) for estim in estimators] - results = [pool.apply_async(fit_and_estimate, t) for t in tasks] - results = list(map(lambda r: r.get(), results)) - er = EvaluationReport.combine_reports(*list(map(lambda r: r["result"], results))) - times = {r["name"]:r["time"] for r in results} - end = time.time() - times["tot"] = end - start - er.times = times - er.prevs = prevs - - return er +def evaluate_mul_sld( + c_model: BaseEstimator, + validation: LabelledCollection, + protocol: AbstractStochasticSeededProtocol, +) -> EvaluationReport: + return evaluate(c_model, validation, protocol, "mul") diff --git a/quacc/evaluation/report.py b/quacc/evaluation/report.py index 696bc85..0236b98 100644 --- a/quacc/evaluation/report.py +++ b/quacc/evaluation/report.py @@ -1,143 +1,122 @@ -from typing import Tuple +import math import statistics as stats +from typing import List, Tuple + import numpy as np import pandas as pd - -def _fmt_line(s): - return f"> {s} \n" +from quacc import plot 
+from quacc.utils import fmt_line_md class EvaluationReport: def __init__(self, prefix=None): - self.base = [] - self.dict = {} - self._grouped = False - self._grouped_base = [] - self._grouped_dict = {} - self._dataframe = None - self.prefix = prefix if prefix is not None else "default" - self._times = {} - self._prevs = {} - self._target = "default" + self._prevs = [] + self._dict = {} + self._g_prevs = None + self._g_dict = None + self.name = prefix if prefix is not None else "default" + self.times = {} + self.train_prevs = {} + self.target = "default" def append_row(self, base: np.ndarray | Tuple, **row): if isinstance(base, np.ndarray): base = tuple(base.tolist()) - self.base.append(base) + self._prevs.append(base) for k, v in row.items(): - if (k, self.prefix) in self.dict: - self.dict[(k, self.prefix)].append(v) + if (k, self.name) in self._dict: + self._dict[(k, self.name)].append(v) else: - self.dict[(k, self.prefix)] = [v] - self._grouped = False - self._dataframe = None + self._dict[(k, self.name)] = [v] + self._g_prevs = None @property def columns(self): - return self.dict.keys() + return self._dict.keys() - @property - def grouped(self): - if self._grouped: - return self._grouped_dict + def groupby_prevs(self, metric: str = None): + if self._g_dict is None: + self._g_prevs = [] + self._g_dict = {k: [] for k in self._dict.keys()} - self._grouped_base = [] - self._grouped_dict = {k: [] for k in self.dict.keys()} + last_end = 0 + for ind, bp in enumerate(self._prevs): + if ind < (len(self._prevs) - 1) and bp == self._prevs[ind + 1]: + continue - last_end = 0 - for ind, bp in enumerate(self.base): - if ind < (len(self.base) - 1) and bp == self.base[ind + 1]: - continue + self._g_prevs.append(bp) + for col in self._dict.keys(): + self._g_dict[col].append( + stats.mean(self._dict[col][last_end : ind + 1]) + ) - self._grouped_base.append(bp) - for col in self.dict.keys(): - self._grouped_dict[col].append( - stats.mean(self.dict[col][last_end : ind + 1]) - ) + last_end = ind + 1 - last_end = ind + 1 + filtered_g_dict = self._g_dict + if metric is not None: + filtered_g_dict = { + c1: ls for ((c0, c1), ls) in self._g_dict.items() if c0 == metric + } - self._grouped = True - return self._grouped_dict + return self._g_prevs, filtered_g_dict - @property - def gbase(self): - self.grouped - return self._grouped_base - - def get_dataframe(self, metrics=None): - if self._dataframe is None: - self_columns = sorted(self.columns, key=lambda c: c[0]) - self._dataframe = pd.DataFrame( - self.grouped, - index=self.gbase, - columns=pd.MultiIndex.from_tuples(self_columns), - ) - - df = pd.DataFrame(self._dataframe) - if metrics is not None: - df = df.drop( - [(c0, c1) for (c0, c1) in df.columns if c0 not in metrics], axis=1 - ) - - if len(set(k0 for k0, k1 in df.columns)) == 1: - df = df.droplevel(0, axis=1) - - return df - - def merge(self, other): - if not all(v1 == v2 for v1, v2 in zip(self.base, other.base)): - raise ValueError("other has not same base prevalences of self") - - if len(set(self.dict.keys()).intersection(set(other.dict.keys()))) > 0: - raise ValueError("self and other have matching keys") - - report = EvaluationReport() - report.base = self.base - report.dict = self.dict | other.dict - return report - - @property - def times(self): - return self._times - - @times.setter - def times(self, val): - self._times = val - - @property - def prevs(self): - return self._prevs - - @prevs.setter - def prevs(self, val): - self._prevs = val - - @property - def target(self): - return self._target 
- - @target.setter - def target(self, val): - self._target = val + def get_dataframe(self, metric="acc"): + g_prevs, g_dict = self.groupby_prevs(metric=metric) + return pd.DataFrame( + g_dict, + index=g_prevs, + columns=g_dict.keys(), + ) def to_md(self, *metrics): - res = _fmt_line("target: " + self.target) - for k, v in self.prevs.items(): - res += _fmt_line(f"{k}: {str(v)}") + res = "" + for k, v in self.train_prevs.items(): + res += fmt_line_md(f"{k}: {str(v)}") for k, v in self.times.items(): - res += _fmt_line(f"{k}: {v:.3f}s") + res += fmt_line_md(f"{k}: {v:.3f}s") res += "\n" for m in metrics: - res += self.get_dataframe(metrics=m).to_html() + "\n\n" + res += self.get_dataframe(metric=m).to_html() + "\n\n" return res + def merge(self, other): + if not all(v1 == v2 for v1, v2 in zip(self._prevs, other._prevs)): + raise ValueError("other has not same base prevalences of self") + + if len(set(self._dict.keys()).intersection(set(other._dict.keys()))) > 0: + raise ValueError("self and other have matching keys") + + report = EvaluationReport() + report._prevs = self._prevs + report._dict = self._dict | other._dict + return report + @staticmethod - def combine_reports(*args): + def combine_reports(*args, name="default"): er = args[0] for r in args[1:]: er = er.merge(r) + er.name = name return er + + +class DatasetReport: + def __init__(self, name): + self.name = name + self.ers: List[EvaluationReport] = [] + + def add(self, er: EvaluationReport): + self.ers.append(er) + + def to_md(self, *metrics): + res = f"{self.name}\n\n" + for er in self.ers: + res += f"{er.to_md(*metrics)}\n\n" + + return res + + def __iter__(self): + return (er for er in self.ers) diff --git a/quacc/main.py b/quacc/main.py index fea3800..c900a98 100644 --- a/quacc/main.py +++ b/quacc/main.py @@ -1,33 +1,16 @@ -import traceback -import quacc.evaluation.method as method +import quacc.evaluation.comp as comp +from quacc.dataset import Dataset +from quacc.environ import env -DATASET = "imdb" -OUTPUT_FILE = "out_" + DATASET + ".md" -TARGETS = { - "rcv1" : [ - 'C12', - 'C13', 'C15', 'C151', 'C1511', 'C152', 'C17', 'C172', - 'C18', 'C181', 'C21', 'C24', 'C31', 'C42', 'CCAT' - 'E11', 'E12', 'E21', 'E211', 'E212', 'E41', 'E51', 'ECAT', - 'G15', 'GCAT', 'GCRIM', 'GDIP', 'GPOL', 'GVIO', 'GVOTE', 'GWEA', - 'GWELF', 'M11', 'M12', 'M13', 'M131', 'M132', 'M14', 'M141', - 'M142', 'M143', 'MCAT' - ], - "spambase": ["default"], - "imdb": ["default"], -} def estimate_comparison(): - open(OUTPUT_FILE, "w").close() - targets = TARGETS[DATASET] - for target in targets: - try: - er = method.evaluate_comparison(DATASET, target=target) - er.target = target - with open(OUTPUT_FILE, "a") as f: - f.write(er.to_md(["acc"], ["f1"])) - except Exception: - traceback.print_exc() + dataset = Dataset( + env.DATASET_NAME, target=env.DATASET_TARGET, n_prevalences=env.DATASET_N_PREVS + ) + output_path = env.OUT_DIR / f"{dataset.name}.md" + with open(output_path, "w") as f: + dr = comp.evaluate_comparison(dataset, estimators=env.COMP_ESTIMATORS) + f.write(dr.to_md("acc")) # print(df.to_latex(float_format="{:.4f}".format)) # print(utils.avg_group_report(df).to_latex(float_format="{:.4f}".format)) diff --git a/quacc/utils.py b/quacc/utils.py index d38b9f6..d2b61f0 100644 --- a/quacc/utils.py +++ b/quacc/utils.py @@ -1,7 +1,8 @@ - import functools + import pandas as pd + def combine_dataframes(dfs, df_index=[]) -> pd.DataFrame: if len(dfs) < 1: raise ValueError @@ -10,15 +11,13 @@ def combine_dataframes(dfs, df_index=[]) -> pd.DataFrame: df = dfs[0] for ndf 
in dfs[1:]: df = df.join(ndf.set_index(df_index), on=df_index) - + return df def avg_group_report(df: pd.DataFrame) -> pd.DataFrame: def _reduce_func(s1, s2): - return { - (n1, n2): v + s2[(n1, n2)] for ((n1, n2), v) in s1.items() - } + return {(n1, n2): v + s2[(n1, n2)] for ((n1, n2), v) in s1.items()} lst = df.to_dict(orient="records")[1:-1] summed_series = functools.reduce(_reduce_func, lst) @@ -28,4 +27,8 @@ def avg_group_report(df: pd.DataFrame) -> pd.DataFrame: for ((n1, n2), v) in summed_series.items() if n1 != "base" } - return pd.DataFrame([avg_report], columns=idx) \ No newline at end of file + return pd.DataFrame([avg_report], columns=idx) + + +def fmt_line_md(s): + return f"> {s} \n"
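
Usage sketch (not part of the applied diff, assuming the module layout introduced above): the refactored Dataset class replaces the old get_imdb/get_spambase/get_rcv1 helpers and yields one DatasetSample (train/validation/test split) per training prevalence, while quacc.environ.env centralizes the configuration that was previously hard-coded. The snippet mirrors the new quacc/main.py entry point and adds a loop over the generated samples; the printed values are the per-sample training and validation prevalences.

    import quacc.evaluation.comp as comp
    from quacc.dataset import Dataset
    from quacc.environ import env

    if __name__ == "__main__":
        # Build the dataset handle; for "rcv1" a target class (e.g. "CCAT") is required.
        dataset = Dataset(
            env.DATASET_NAME,
            target=env.DATASET_TARGET,
            n_prevalences=env.DATASET_N_PREVS,
        )

        # Dataset.get() (also reachable via Dataset.__call__) first resamples the
        # training pool to a (0.5, 0.5) prevalence, then draws n_prevalences samples
        # with training prevalences ranging from 0.1 to 0.9.
        for sample in dataset():
            print(sample.train_prev, sample.validation_prev)

        # Run the configured estimators on every sample (evaluate_comparison uses a
        # multiprocessing pool internally) and render the markdown report, as the
        # new estimate_comparison() in quacc/main.py does.
        dr = comp.evaluate_comparison(dataset, estimators=env.COMP_ESTIMATORS)
        print(dr.to_md("acc"))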