diff --git a/.vscode/launch.json b/.vscode/launch.json index f6c8bea..429433a 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -5,22 +5,21 @@ "version": "0.2.0", "configurations": [ - { "name": "main", "type": "python", "request": "launch", "program": "C:\\Users\\Lorenzo Volpi\\source\\tesi\\quacc\\main.py", "console": "integratedTerminal", - "justMyCode": true + "justMyCode": false }, { - "name": "models", + "name": "main_test", "type": "python", "request": "launch", - "program": "C:\\Users\\Lorenzo Volpi\\source\\tesi\\baselines\\models.py", + "program": "C:\\Users\\Lorenzo Volpi\\source\\tesi\\quacc\\main_test.py", "console": "integratedTerminal", "justMyCode": true - } + }, ] } \ No newline at end of file diff --git a/conf.yaml b/conf.yaml index e4e6294..8cf0b81 100644 --- a/conf.yaml +++ b/conf.yaml @@ -5,7 +5,6 @@ debug_conf: &debug_conf DATASET_N_PREVS: 5 DATASET_PREVS: - 0.5 - - 0.1 confs: - DATASET_NAME: imdb @@ -13,17 +12,9 @@ debug_conf: &debug_conf plot_confs: debug: PLOT_ESTIMATORS: + - mul_sld_gs - ref - - atc_mc - - atc_ne PLOT_STDEV: true - debug_plus: - PLOT_ESTIMATORS: - - mul_sld_bcts - - mul_sld - - ref - - atc_mc - - atc_ne test_conf: &test_conf global: diff --git a/quacc.log b/quacc.log index 8fc15af..ffe98ee 100644 --- a/quacc.log +++ b/quacc.log @@ -1473,3 +1473,24 @@ 31/10/23 17:05:50| INFO mul_sld finished [took 29.3523s] 31/10/23 17:06:00| INFO mul_sld_bcts finished [took 39.8376s] 31/10/23 17:06:00| INFO Dataset sample 0.50 of dataset imdb_2prevs finished [took 41.2888s] +---------------------------------------------------------------------------------------------------- +31/10/23 20:19:37| INFO dataset imdb_1prevs +31/10/23 20:19:48| INFO Dataset sample 0.50 of dataset imdb_1prevs started +31/10/23 20:20:07| INFO ref finished [took 17.4125s] +---------------------------------------------------------------------------------------------------- +31/10/23 20:20:50| INFO dataset imdb_1prevs +31/10/23 20:21:01| INFO Dataset sample 0.50 of dataset imdb_1prevs started +31/10/23 20:21:19| INFO ref finished [took 17.0717s] +---------------------------------------------------------------------------------------------------- +31/10/23 20:22:05| INFO dataset imdb_1prevs +31/10/23 20:22:15| INFO Dataset sample 0.50 of dataset imdb_1prevs started +31/10/23 20:22:35| INFO ref finished [took 18.4752s] +---------------------------------------------------------------------------------------------------- +31/10/23 20:23:38| INFO dataset imdb_1prevs +31/10/23 20:23:48| INFO Dataset sample 0.50 of dataset imdb_1prevs started +31/10/23 20:24:08| INFO ref finished [took 18.3216s] +---------------------------------------------------------------------------------------------------- +01/11/23 13:07:19| INFO dataset imdb_1prevs +01/11/23 13:07:27| INFO Dataset sample 0.50 of dataset imdb_1prevs started +01/11/23 13:07:27| ERROR Evaluation over imdb_1prevs failed. Exception: 'Invalid estimator: estimator mul_sld_gs does not exist' +01/11/23 13:07:27| ERROR Failed while saving configuration imdb_debug of imdb_1prevs. 
Exception: cannot access local variable 'dr' where it is not associated with a value diff --git a/quacc/dataset.py b/quacc/dataset.py index c11b0c9..aff0fbb 100644 --- a/quacc/dataset.py +++ b/quacc/dataset.py @@ -71,18 +71,16 @@ class Dataset: return all_train, test - def get_raw(self, validation=True) -> DatasetSample: + def get_raw(self) -> DatasetSample: all_train, test = { "spambase": self.__spambase, "imdb": self.__imdb, "rcv1": self.__rcv1, }[self._name]() - train, val = all_train, None - if validation: - train, val = all_train.split_stratified( - train_prop=TRAIN_VAL_PROP, random_state=0 - ) + train, val = all_train.split_stratified( + train_prop=TRAIN_VAL_PROP, random_state=0 + ) return DatasetSample(train, val, test) diff --git a/quacc/error.py b/quacc/error.py index 6ed7dd4..4393d72 100644 --- a/quacc/error.py +++ b/quacc/error.py @@ -1,13 +1,10 @@ -import quapy as qp +import numpy as np def from_name(err_name): - if err_name == "f1e": - return f1e - elif err_name == "f1": - return f1 - else: - return qp.error.from_name(err_name) + assert err_name in ERROR_NAMES, f"unknown error {err_name}" + callable_error = globals()[err_name] + return callable_error # def f1(prev): @@ -36,5 +33,23 @@ def f1e(prev): return 1 - f1(prev) -def acc(prev): - return (prev[0] + prev[3]) / sum(prev) +def acc(prev: np.ndarray) -> float: + return (prev[0] + prev[3]) / np.sum(prev) + + +def accd(true_prevs: np.ndarray, estim_prevs: np.ndarray) -> np.ndarray: + vacc = np.vectorize(acc, signature="(m)->()") + a_tp = vacc(true_prevs) + a_ep = vacc(estim_prevs) + return np.abs(a_tp - a_ep) + + +def maccd(true_prevs: np.ndarray, estim_prevs: np.ndarray) -> float: + return accd(true_prevs, estim_prevs).mean() + + +ACCURACY_ERROR = {maccd} +ACCURACY_ERROR_SINGLE = {accd} +ACCURACY_ERROR_NAMES = {func.__name__ for func in ACCURACY_ERROR} +ACCURACY_ERROR_SINGLE_NAMES = {func.__name__ for func in ACCURACY_ERROR_SINGLE} +ERROR_NAMES = ACCURACY_ERROR_NAMES | ACCURACY_ERROR_SINGLE_NAMES diff --git a/quacc/evaluation/method.py b/quacc/evaluation/method.py index b15990a..f08bd0b 100644 --- a/quacc/evaluation/method.py +++ b/quacc/evaluation/method.py @@ -1,19 +1,16 @@ from functools import wraps +from typing import Callable, Union import numpy as np -import sklearn.metrics as metrics -from quapy.data import LabelledCollection -from quapy.protocol import AbstractStochasticSeededProtocol -from sklearn.base import BaseEstimator +from quapy.method.aggregative import SLD +from quapy.protocol import UPP, AbstractProtocol, OnLabelledCollectionProtocol +from sklearn.linear_model import LogisticRegression -import quacc.error as error +import quacc as qc from quacc.evaluation.report import EvaluationReport +from quacc.method.model_selection import GridSearchAE -from ..estimator import ( - AccuracyEstimator, - BinaryQuantifierAccuracyEstimator, - MulticlassAccuracyEstimator, -) +from ..method.base import BQAE, MCAE, BaseAccuracyEstimator _methods = {} @@ -28,108 +25,115 @@ def method(func): return wrapper -def estimate( - estimator: AccuracyEstimator, - protocol: AbstractStochasticSeededProtocol, -): - base_prevs, true_prevs, estim_prevs, pred_probas, labels = [], [], [], [], [] - for sample in protocol(): - e_sample, pred_proba = estimator.extend(sample) - estim_prev = estimator.estimate(e_sample.X, ext=True) - base_prevs.append(sample.prevalence()) - true_prevs.append(e_sample.prevalence()) - estim_prevs.append(estim_prev) - pred_probas.append(pred_proba) - labels.append(sample.y) +def evaluate( + estimator: 
BaseAccuracyEstimator, + protocol: AbstractProtocol, + error_metric: Union[Callable, str], +) -> float: + if isinstance(error_metric, str): + error_metric = qc.error.from_name(error_metric) - return base_prevs, true_prevs, estim_prevs, pred_probas, labels + collator_bck_ = protocol.collator + protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection") + + estim_prevs, true_prevs = [], [] + for sample in protocol(): + e_sample = estimator.extend(sample) + estim_prev = estimator.estimate(e_sample.X, ext=True) + estim_prevs.append(estim_prev) + true_prevs.append(e_sample.prevalence()) + + protocol.collator = collator_bck_ + + true_prevs = np.array(true_prevs) + estim_prevs = np.array(estim_prevs) + + return error_metric(true_prevs, estim_prevs) def evaluation_report( - estimator: AccuracyEstimator, - protocol: AbstractStochasticSeededProtocol, + estimator: BaseAccuracyEstimator, + protocol: AbstractProtocol, method: str, ) -> EvaluationReport: - base_prevs, true_prevs, estim_prevs, pred_probas, labels = estimate( - estimator, protocol - ) report = EvaluationReport(name=method) - - for base_prev, true_prev, estim_prev, pred_proba, label in zip( - base_prevs, true_prevs, estim_prevs, pred_probas, labels - ): - pred = np.argmax(pred_proba, axis=-1) - acc_score = error.acc(estim_prev) - f1_score = error.f1(estim_prev) + for sample in protocol(): + e_sample = estimator.extend(sample) + estim_prev = estimator.estimate(e_sample.X, ext=True) + acc_score = qc.error.acc(estim_prev) + f1_score = qc.error.f1(estim_prev) report.append_row( - base_prev, + sample.prevalence(), acc_score=acc_score, - acc=abs(metrics.accuracy_score(label, pred) - acc_score), + acc=abs(qc.error.acc(e_sample.prevalence()) - acc_score), f1_score=f1_score, - f1=abs(error.f1(true_prev) - f1_score), + f1=abs(qc.error.f1(e_sample.prevalence()) - f1_score), ) - report.fit_score = estimator.fit_score - return report -def evaluate( - c_model: BaseEstimator, - validation: LabelledCollection, - protocol: AbstractStochasticSeededProtocol, - method: str, - q_model: str, - **kwargs, -): - estimator: AccuracyEstimator = { - "bin": BinaryQuantifierAccuracyEstimator, - "mul": MulticlassAccuracyEstimator, - }[method](c_model, q_model=q_model.upper(), **kwargs) - estimator.fit(validation) - _method = f"{method}_{q_model}" - if "recalib" in kwargs: - _method += f"_{kwargs['recalib']}" - if ("gs", True) in kwargs.items(): - _method += "_gs" - return evaluation_report(estimator, protocol, _method) - - @method def bin_sld(c_model, validation, protocol) -> EvaluationReport: - return evaluate(c_model, validation, protocol, "bin", "sld") + est = BQAE(c_model, SLD(LogisticRegression())) + est.fit(validation) + return evaluation_report( + estimator=est, + protocol=protocol, + method="bin_sld", + ) @method def mul_sld(c_model, validation, protocol) -> EvaluationReport: - return evaluate(c_model, validation, protocol, "mul", "sld") + est = MCAE(c_model, SLD(LogisticRegression())) + est.fit(validation) + return evaluation_report( + estimator=est, + protocol=protocol, + method="mul_sld", + ) @method def bin_sld_bcts(c_model, validation, protocol) -> EvaluationReport: - return evaluate(c_model, validation, protocol, "bin", "sld", recalib="bcts") + est = BQAE(c_model, SLD(LogisticRegression(), recalib="bcts")) + est.fit(validation) + return evaluation_report( + estimator=est, + protocol=protocol, + method="bin_sld_bcts", + ) @method def mul_sld_bcts(c_model, validation, protocol) -> EvaluationReport: - return evaluate(c_model,
validation, protocol, "mul", "sld", recalib="bcts") - - -@method -def bin_sld_gs(c_model, validation, protocol) -> EvaluationReport: - return evaluate(c_model, validation, protocol, "bin", "sld", gs=True) + est = MCAE(c_model, SLD(LogisticRegression(), recalib="bcts")) + est.fit(validation) + return evaluation_report( + estimator=est, + protocol=protocol, + method="mul_sld_bcts", + ) @method def mul_sld_gs(c_model, validation, protocol) -> EvaluationReport: - return evaluate(c_model, validation, protocol, "mul", "sld", gs=True) - - -@method -def bin_cc(c_model, validation, protocol) -> EvaluationReport: - return evaluate(c_model, validation, protocol, "bin", "cc") - - -@method -def mul_cc(c_model, validation, protocol) -> EvaluationReport: - return evaluate(c_model, validation, protocol, "mul", "cc") + v_train, v_val = validation.split_stratified(0.6, random_state=0) + # GridSearchAE expects a BaseAccuracyEstimator, so wrap the SLD quantifier in an MCAE + model = MCAE(c_model, SLD(LogisticRegression())) + est = GridSearchAE( + model=model, + param_grid={ + "q__classifier__C": np.logspace(-3, 3, 7), + "q__classifier__class_weight": [None, "balanced"], + "q__recalib": [None, "bcts", "vs"], + }, + refit=False, + protocol=UPP(v_val, repeats=100), + verbose=True, + ).fit(v_train) + return evaluation_report( + estimator=est, + protocol=protocol, + method="mul_sld_gs", + ) diff --git a/quacc/main_test.py b/quacc/main_test.py new file mode 100644 index 0000000..7239908 --- /dev/null +++ b/quacc/main_test.py @@ -0,0 +1,76 @@ +from copy import deepcopy +from time import time + +import numpy as np +import win11toast +from quapy.method.aggregative import SLD +from quapy.protocol import APP, UPP +from sklearn.linear_model import LogisticRegression + +from quacc.dataset import Dataset +from quacc.error import acc +from quacc.evaluation.report import CompReport, EvaluationReport +from quacc.method.base import MultiClassAccuracyEstimator +from quacc.method.model_selection import GridSearchAE + + +def test_gs(): + d = Dataset(name="rcv1", target="CCAT", n_prevalences=1).get_raw() + + classifier = LogisticRegression() + classifier.fit(*d.train.Xy) + + quantifier = SLD(LogisticRegression()) + estimator = MultiClassAccuracyEstimator(classifier, quantifier) + estimator.fit(d.validation) + + v_train, v_val = d.validation.split_stratified(0.6, random_state=0) + gs_protocol = UPP(v_val, sample_size=1000, repeats=100) + gs_estimator = GridSearchAE( + model=deepcopy(estimator), + param_grid={ + "q__classifier__C": np.logspace(-3, 3, 7), + "q__classifier__class_weight": [None, "balanced"], + "q__recalib": [None, "bcts", "vs"], + }, + refit=False, + protocol=gs_protocol, + verbose=True, + ).fit(v_train) + + tstart = time() + erb, ergs = EvaluationReport("base"), EvaluationReport("gs") + protocol = APP( + d.test, + sample_size=1000, + n_prevalences=21, + repeats=100, + return_type="labelled_collection", + ) + for sample in protocol(): + e_sample = gs_estimator.extend(sample) + estim_prev_b = estimator.estimate(e_sample.X, ext=True) + estim_prev_gs = gs_estimator.estimate(e_sample.X, ext=True) + erb.append_row( + sample.prevalence(), + acc=abs(acc(e_sample.prevalence()) - acc(estim_prev_b)), + ) + ergs.append_row( + sample.prevalence(), + acc=abs(acc(e_sample.prevalence()) - acc(estim_prev_gs)), + ) + + cr = CompReport( + [erb, ergs], + "test", + train_prev=d.train_prev, + valid_prev=d.validation_prev, + ) + + print(cr.table()) + print(f"[took {time() - tstart:.3f}s]") + win11toast.notify("Test", "completed") + + +if __name__ == "__main__": + test_gs() diff --git a/quacc/method/__pycache__/base.cpython-311.pyc
b/quacc/method/__pycache__/base.cpython-311.pyc new file mode 100644 index 0000000..220398a Binary files /dev/null and b/quacc/method/__pycache__/base.cpython-311.pyc differ diff --git a/quacc/method/__pycache__/model_selection.cpython-311.pyc b/quacc/method/__pycache__/model_selection.cpython-311.pyc new file mode 100644 index 0000000..03b8910 Binary files /dev/null and b/quacc/method/__pycache__/model_selection.cpython-311.pyc differ diff --git a/quacc/estimator.py b/quacc/method/base.py similarity index 63% rename from quacc/estimator.py rename to quacc/method/base.py index 216b8a1..c36636b 100644 --- a/quacc/estimator.py +++ b/quacc/method/base.py @@ -4,7 +4,7 @@ from abc import abstractmethod import numpy as np import quapy as qp from quapy.data import LabelledCollection -from quapy.method.aggregative import CC, SLD +from quapy.method.aggregative import CC, SLD, BaseQuantifier from quapy.model_selection import GridSearchQ from quapy.protocol import UPP from sklearn.base import BaseEstimator @@ -14,9 +14,22 @@ from sklearn.model_selection import cross_val_predict from quacc.data import ExtendedCollection -class AccuracyEstimator: - def __init__(self): +class BaseAccuracyEstimator(BaseQuantifier): + def __init__( + self, + classifier: BaseEstimator, + quantifier: BaseQuantifier, + ): self.fit_score = None + self.__check_classifier(classifier) + self.classifier = classifier + self.quantifier = quantifier + + def __check_classifier(self, classifier): + if not hasattr(classifier, "predict_proba"): + raise ValueError( + f"Passed classifier {classifier.__class__.__name__} cannot predict probabilities." + ) def _gs_params(self, t_val: LabelledCollection): return { @@ -33,85 +46,55 @@ class AccuracyEstimator: "verbose": True, } - def extend(self, base: LabelledCollection, pred_proba=None) -> ExtendedCollection: + def extend(self, coll: LabelledCollection, pred_proba=None) -> ExtendedCollection: if not pred_proba: - pred_proba = self.c_model.predict_proba(base.X) - return ExtendedCollection.extend_collection(base, pred_proba), pred_proba + pred_proba = self.classifier.predict_proba(coll.X) + return ExtendedCollection.extend_collection(coll, pred_proba) @abstractmethod def fit(self, train: LabelledCollection | ExtendedCollection): ... @abstractmethod - def estimate(self, instances, ext=False): + def estimate(self, instances, ext=False) -> np.ndarray: ... 
-AE = AccuracyEstimator +class MultiClassAccuracyEstimator(BaseAccuracyEstimator): + def __init__( + self, + classifier: BaseEstimator, + quantifier: BaseQuantifier, + ): + super().__init__(classifier, quantifier) + def fit(self, train: LabelledCollection): + pred_probs = self.classifier.predict_proba(train.X) + self.e_train = ExtendedCollection.extend_collection(train, pred_probs) -class MulticlassAccuracyEstimator(AccuracyEstimator): - def __init__(self, c_model: BaseEstimator, q_model="SLD", gs=False, recalib=None): - super().__init__() - self.c_model = c_model - self._q_model_name = q_model.upper() - self.e_train = None - self.gs = gs - self.recalib = recalib + self.quantifier.fit(self.e_train) - def fit(self, train: LabelledCollection | ExtendedCollection): - # check if model is fit - # self.model.fit(*train.Xy) - if isinstance(train, LabelledCollection): - pred_prob_train = cross_val_predict( - self.c_model, *train.Xy, method="predict_proba" - ) - self.e_train = ExtendedCollection.extend_collection(train, pred_prob_train) - else: - self.e_train = train + return self - if self._q_model_name == "SLD": - if self.gs: - t_train, t_val = self.e_train.split_stratified(0.6, random_state=0) - gs_params = self._gs_params(t_val) - self.q_model = GridSearchQ( - SLD(LogisticRegression()), - **gs_params, - ) - self.q_model.fit(t_train) - self.fit_score = self.q_model.best_score_ - else: - self.q_model = SLD(LogisticRegression(), recalib=self.recalib) - self.q_model.fit(self.e_train) - elif self._q_model_name == "CC": - self.q_model = CC(LogisticRegression()) - self.q_model.fit(self.e_train) - - def estimate(self, instances, ext=False): + def estimate(self, instances, ext=False) -> np.ndarray: + e_inst = instances if not ext: - pred_prob = self.c_model.predict_proba(instances) + pred_prob = self.classifier.predict_proba(instances) e_inst = ExtendedCollection.extend_instances(instances, pred_prob) - else: - e_inst = instances - estim_prev = self.q_model.quantify(e_inst) + estim_prev = self.quantifier.quantify(e_inst) + return self._check_prevalence_classes(estim_prev) - return self._check_prevalence_classes( - self.e_train.classes_, self.q_model, estim_prev - ) - - def _check_prevalence_classes(self, true_classes, q_model, estim_prev): - if isinstance(q_model, GridSearchQ): - estim_classes = q_model.best_model().classes_ - else: - estim_classes = q_model.classes_ + def _check_prevalence_classes(self, estim_prev) -> np.ndarray: + estim_classes = self.quantifier.classes_ + true_classes = self.e_train.classes_ for _cls in true_classes: if _cls not in estim_classes: estim_prev = np.insert(estim_prev, _cls, [0.0], axis=0) return estim_prev -class BinaryQuantifierAccuracyEstimator(AccuracyEstimator): +class BinaryQuantifierAccuracyEstimator(BaseAccuracyEstimator): def __init__(self, c_model: BaseEstimator, q_model="SLD", gs=False, recalib=None): super().__init__() self.c_model = c_model @@ -190,3 +173,8 @@ class BinaryQuantifierAccuracyEstimator(AccuracyEstimator): return np.asarray(list(map(lambda p: p * norm, q_model.quantify(inst)))) else: return np.asarray([0.0, 0.0]) + + +BAE = BaseAccuracyEstimator +MCAE = MultiClassAccuracyEstimator +BQAE = BinaryQuantifierAccuracyEstimator diff --git a/quacc/method/model_selection.py b/quacc/method/model_selection.py new file mode 100644 index 0000000..a80d5d9 --- /dev/null +++ b/quacc/method/model_selection.py @@ -0,0 +1,204 @@ +import itertools +from copy import deepcopy +from time import time +from typing import Callable, Union + +from quapy.data import 
LabelledCollection +from quapy.protocol import AbstractProtocol, OnLabelledCollectionProtocol + +import quacc as qc +import quacc.evaluation.method as evaluation +from quacc.data import ExtendedCollection +from quacc.method.base import BaseAccuracyEstimator + + +class GridSearchAE(BaseAccuracyEstimator): + def __init__( + self, + model: BaseAccuracyEstimator, + param_grid: dict, + protocol: AbstractProtocol, + error: Union[Callable, str] = qc.error.maccd, + refit=True, + # timeout=-1, + # n_jobs=None, + verbose=False, + ): + self.model = model + self.param_grid = self.__normalize_params(param_grid) + self.protocol = protocol + self.refit = refit + # self.timeout = timeout + # self.n_jobs = qp._get_njobs(n_jobs) + self.verbose = verbose + self.__check_error(error) + assert isinstance(protocol, AbstractProtocol), "unknown protocol" + + def _sout(self, msg): + if self.verbose: + print(f"[{self.__class__.__name__}]: {msg}") + + def __normalize_params(self, params): + __remap = {} + for key in params.keys(): + k, delim, sub_key = key.partition("__") + if delim and k == "q": + __remap[key] = f"quantifier__{sub_key}" + + return {(__remap[k] if k in __remap else k): v for k, v in params.items()} + + def __check_error(self, error): + if error in qc.error.ACCURACY_ERROR: + self.error = error + elif isinstance(error, str): + self.error = qc.error.from_name(error) + elif hasattr(error, "__call__"): + self.error = error + else: + raise ValueError( + f"unexpected error type; must either be a callable function or a str representing\n" + f"the name of an error function in {qc.error.ACCURACY_ERROR_NAMES}" + ) + + def fit(self, training: LabelledCollection): + """Learning routine. Fits methods with all combinations of hyperparameters and selects the one minimizing + the error metric. 
+ + :param training: the training set on which to optimize the hyperparameters + :return: self + """ + params_keys = list(self.param_grid.keys()) + params_values = list(self.param_grid.values()) + + protocol = self.protocol + + self.param_scores_ = {} + self.best_score_ = None + + tinit = time() + + hyper = [ + dict(zip(params_keys, val)) for val in itertools.product(*params_values) + ] + + # self._sout(f"starting model selection with {self.n_jobs =}") + self._sout("starting model selection") + + scores = [self.__params_eval(params, training) for params in hyper] + + for params, score, model in scores: + if score is not None: + if self.best_score_ is None or score < self.best_score_: + self.best_score_ = score + self.best_params_ = params + self.best_model_ = model + self.param_scores_[str(params)] = score + else: + self.param_scores_[str(params)] = "timeout" + + tend = time() - tinit + + if self.best_score_ is None: + raise TimeoutError("no combination of hyperparameters seems to work") + + self._sout( + f"optimization finished: best params {self.best_params_} (score={self.best_score_:.5f}) " + f"[took {tend:.4f}s]" + ) + + if self.refit: + if isinstance(protocol, OnLabelledCollectionProtocol): + self._sout("refitting on the whole development set") + self.best_model_.fit(training + protocol.get_labelled_collection()) + else: + raise RuntimeWarning( + f'"refit" was requested, but the protocol does not ' + f"implement the {OnLabelledCollectionProtocol.__name__} interface" + ) + + return self + + def __params_eval(self, params, training): + protocol = self.protocol + error = self.error + + # if self.timeout > 0: + + # def handler(signum, frame): + # raise TimeoutError() + + # signal.signal(signal.SIGALRM, handler) + + tinit = time() + + # if self.timeout > 0: + # signal.alarm(self.timeout) + + try: + model = deepcopy(self.model) + # overrides default parameters with the parameters being explored at this iteration + model.set_params(**params) + model.fit(training) + score = evaluation.evaluate(model, protocol=protocol, error_metric=error) + + ttime = time() - tinit + self._sout( + f"hyperparams={params}\t got score {score:.5f} [took {ttime:.4f}s]" + ) + + # if self.timeout > 0: + # signal.alarm(0) + # except TimeoutError: + # self._sout(f"timeout ({self.timeout}s) reached for config {params}") + # score = None + except ValueError as e: + self._sout(f"the combination of hyperparameters {params} is invalid") + raise e + except Exception as e: + self._sout(f"something went wrong for config {params}; skipping:") + self._sout(f"\tException: {e}") + # traceback(e) + score = None + + return params, score, model + + def extend(self, coll: LabelledCollection, pred_proba=None) -> ExtendedCollection: + assert hasattr(self, "best_model_"), "extend called before fit" + return self.best_model().extend(coll, pred_proba=pred_proba) + + def estimate(self, instances, ext=False): + """Estimate class prevalence values using the best model found after calling the :meth:`fit` method. + + :param instances: sample containing the instances + :return: an ndarray of shape `(n_classes,)` with class prevalence estimates according to the best model found + by the model selection process. + """ + + assert hasattr(self, "best_model_"), "estimate called before fit" + return self.best_model().estimate(instances, ext=ext) + + def set_params(self, **parameters): + """Sets the hyper-parameters to explore.
+ + :param parameters: a dictionary with keys the parameter names and values the list of values to explore + """ + self.param_grid = parameters + + def get_params(self, deep=True): + """Returns the dictionary of hyper-parameters to explore (`param_grid`) + + :param deep: Unused + :return: the dictionary `param_grid` + """ + return self.param_grid + + def best_model(self): + """ + Returns the best model found after calling the :meth:`fit` method, i.e., the one trained on the combination + of hyper-parameters that minimized the error function. + + :return: a trained quantifier + """ + if hasattr(self, "best_model_"): + return self.best_model_ + raise ValueError("best_model called before fit") diff --git a/quacc/old_main.py b/quacc/old_main.py deleted file mode 100644 index 00ea263..0000000 --- a/quacc/old_main.py +++ /dev/null @@ -1,138 +0,0 @@ -import numpy as np -import scipy as sp -import quapy as qp -from quapy.data import LabelledCollection -from quapy.method.aggregative import SLD -from quapy.protocol import APP, AbstractStochasticSeededProtocol -from sklearn.linear_model import LogisticRegression -from sklearn.model_selection import cross_val_predict - -from .data import get_dataset - -# Extended classes -# -# 0 ~ True 0 -# 1 ~ False 1 -# 2 ~ False 0 -# 3 ~ True 1 -# _____________________ -# | | | -# | True 0 | False 1 | -# |__________|__________| -# | | | -# | False 0 | True 1 | -# |__________|__________| -# -def get_ex_class(classes, true_class, pred_class): - return true_class * classes + pred_class - - -def extend_collection(coll, pred_prob): - n_classes = coll.n_classes - - # n_X = [ X | predicted probs. ] - if isinstance(coll.X, sp.csr_matrix): - pred_prob_csr = sp.csr_matrix(pred_prob) - n_x = sp.hstack([coll.X, pred_prob_csr]) - elif isinstance(coll.X, np.ndarray): - n_x = np.concatenate((coll.X, pred_prob), axis=1) - else: - raise ValueError("Unsupported matrix format") - - # n_y = (exptected y, predicted y) - n_y = [] - for i, true_class in enumerate(coll.y): - pred_class = pred_prob[i].argmax(axis=0) - n_y.append(get_ex_class(n_classes, true_class, pred_class)) - - return LabelledCollection(n_x, np.asarray(n_y), [*range(0, n_classes * n_classes)]) - - -def qf1e_binary(prev): - recall = prev[0] / (prev[0] + prev[1]) - precision = prev[0] / (prev[0] + prev[2]) - - return 1 - 2 * (precision * recall) / (precision + recall) - - -def compute_errors(true_prev, estim_prev, n_instances): - errors = {} - _eps = 1 / (2 * n_instances) - errors = { - "mae": qp.error.mae(true_prev, estim_prev), - "rae": qp.error.rae(true_prev, estim_prev, eps=_eps), - "mrae": qp.error.mrae(true_prev, estim_prev, eps=_eps), - "kld": qp.error.kld(true_prev, estim_prev, eps=_eps), - "nkld": qp.error.nkld(true_prev, estim_prev, eps=_eps), - "true_f1e": qf1e_binary(true_prev), - "estim_f1e": qf1e_binary(estim_prev), - } - - return errors - - -def extend_and_quantify( - model, - q_model, - train, - test: LabelledCollection | AbstractStochasticSeededProtocol, -): - model.fit(*train.Xy) - - pred_prob_train = cross_val_predict(model, *train.Xy, method="predict_proba") - _train = extend_collection(train, pred_prob_train) - - q_model.fit(_train) - - def quantify_extended(test): - pred_prob_test = model.predict_proba(test.X) - _test = extend_collection(test, pred_prob_test) - _estim_prev = q_model.quantify(_test.instances) - # check that _estim_prev has all the classes and eventually fill the missing - # ones with 0 - for _cls in _test.classes_: - if _cls not in q_model.classes_: - _estim_prev = 
np.insert(_estim_prev, _cls, [0.0], axis=0) - print(_estim_prev) - return _test.prevalence(), _estim_prev - - if isinstance(test, LabelledCollection): - _true_prev, _estim_prev = quantify_extended(test) - _errors = compute_errors(_true_prev, _estim_prev, test.X.shape[0]) - return ([test.prevalence()], [_true_prev], [_estim_prev], [_errors]) - - elif isinstance(test, AbstractStochasticSeededProtocol): - orig_prevs, true_prevs, estim_prevs, errors = [], [], [], [] - for index in test.samples_parameters(): - sample = test.sample(index) - _true_prev, _estim_prev = quantify_extended(sample) - - orig_prevs.append(sample.prevalence()) - true_prevs.append(_true_prev) - estim_prevs.append(_estim_prev) - errors.append(compute_errors(_true_prev, _estim_prev, sample.X.shape[0])) - - return orig_prevs, true_prevs, estim_prevs, errors - - - - -def test_1(dataset_name): - train, test = get_dataset(dataset_name) - - orig_prevs, true_prevs, estim_prevs, errors = extend_and_quantify( - LogisticRegression(), - SLD(LogisticRegression()), - train, - APP(test, n_prevalences=11, repeats=1), - ) - - for orig_prev, true_prev, estim_prev, _errors in zip( - orig_prevs, true_prevs, estim_prevs, errors - ): - print(f"original prevalence:\t{orig_prev}") - print(f"true prevalence:\t{true_prev}") - print(f"estimated prevalence:\t{estim_prev}") - for name, err in _errors.items(): - print(f"{name}={err:.3f}") - print() diff --git a/tests/test_baseline.py b/tests/test_baseline.py deleted file mode 100644 index c7a8027..0000000 --- a/tests/test_baseline.py +++ /dev/null @@ -1,20 +0,0 @@ - -from sklearn.linear_model import LogisticRegression -from quacc.evaluation.baseline import kfcv, trust_score -from quacc.dataset import get_spambase - - -class TestBaseline: - - def test_kfcv(self): - train, validation, _ = get_spambase() - c_model = LogisticRegression() - c_model.fit(train.X, train.y) - assert "f1_score" in kfcv(c_model, validation) - - def test_trust_score(self): - train, validation, test = get_spambase() - c_model = LogisticRegression() - c_model.fit(train.X, train.y) - trustscore = trust_score(c_model, train, test) - assert len(trustscore) == len(test.y) \ No newline at end of file diff --git a/tests/test_evaluation/__pycache__/test_baseline.cpython-311-pytest-7.4.2.pyc b/tests/test_evaluation/__pycache__/test_baseline.cpython-311-pytest-7.4.2.pyc new file mode 100644 index 0000000..0aaf325 Binary files /dev/null and b/tests/test_evaluation/__pycache__/test_baseline.cpython-311-pytest-7.4.2.pyc differ diff --git a/tests/test_evaluation/test_baseline.py b/tests/test_evaluation/test_baseline.py new file mode 100644 index 0000000..20fac98 --- /dev/null +++ b/tests/test_evaluation/test_baseline.py @@ -0,0 +1,12 @@ +from sklearn.linear_model import LogisticRegression + +from quacc.dataset import Dataset +from quacc.evaluation.baseline import kfcv + + +class TestBaseline: + def test_kfcv(self): + spambase = Dataset("spambase", n_prevalences=1).get_raw() + c_model = LogisticRegression() + c_model.fit(spambase.train.X, spambase.train.y) + assert "f1_score" in kfcv(c_model, spambase.validation) diff --git a/tests/test_method/__pycache__/test_base.cpython-311-pytest-7.4.2.pyc b/tests/test_method/__pycache__/test_base.cpython-311-pytest-7.4.2.pyc new file mode 100644 index 0000000..526d081 Binary files /dev/null and b/tests/test_method/__pycache__/test_base.cpython-311-pytest-7.4.2.pyc differ diff --git a/tests/test_method/__pycache__/test_model_selection.cpython-311-pytest-7.4.2.pyc 
b/tests/test_method/__pycache__/test_model_selection.cpython-311-pytest-7.4.2.pyc new file mode 100644 index 0000000..f9d7598 Binary files /dev/null and b/tests/test_method/__pycache__/test_model_selection.cpython-311-pytest-7.4.2.pyc differ diff --git a/tests/test_method/test_base/__pycache__/test_BQAE.cpython-311-pytest-7.4.2.pyc b/tests/test_method/test_base/__pycache__/test_BQAE.cpython-311-pytest-7.4.2.pyc new file mode 100644 index 0000000..e1ad6b8 Binary files /dev/null and b/tests/test_method/test_base/__pycache__/test_BQAE.cpython-311-pytest-7.4.2.pyc differ diff --git a/tests/test_method/test_base/__pycache__/test_MCAE.cpython-311-pytest-7.4.2.pyc b/tests/test_method/test_base/__pycache__/test_MCAE.cpython-311-pytest-7.4.2.pyc new file mode 100644 index 0000000..024d5e3 Binary files /dev/null and b/tests/test_method/test_base/__pycache__/test_MCAE.cpython-311-pytest-7.4.2.pyc differ diff --git a/tests/test_estimator.py b/tests/test_method/test_base/test_BQAE.py similarity index 95% rename from tests/test_estimator.py rename to tests/test_method/test_base/test_BQAE.py index d13afe2..f28c71b 100644 --- a/tests/test_estimator.py +++ b/tests/test_method/test_base/test_BQAE.py @@ -1,12 +1,12 @@ -import pytest import numpy as np +import pytest import scipy.sparse as sp from sklearn.linear_model import LogisticRegression -from quacc.estimator import BinaryQuantifierAccuracyEstimator +from quacc.method.base import BinaryQuantifierAccuracyEstimator -class TestBinaryQuantifierAccuracyEstimator: +class TestBQAE: @pytest.mark.parametrize( "instances,preds0,preds1,result", [ diff --git a/tests/test_method/test_base/test_MCAE.py b/tests/test_method/test_base/test_MCAE.py new file mode 100644 index 0000000..b0784a2 --- /dev/null +++ b/tests/test_method/test_base/test_MCAE.py @@ -0,0 +1,2 @@ +class TestMCAE: + pass