# QuAcc — quacc/estimator.py

import math
from abc import abstractmethod
import numpy as np
import quapy as qp
from quapy.data import LabelledCollection
from quapy.method.aggregative import CC, SLD
from quapy.model_selection import GridSearchQ
from quapy.protocol import UPP
from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from quacc.data import ExtendedCollection
class AccuracyEstimator:
    """Base class for estimating classifier accuracy via quantification.

    Subclasses wrap a classifier (``self.c_model``, set by the subclass) and
    one or more quantifiers trained on an :class:`ExtendedCollection`, i.e.
    the original data extended with the classifier's posterior probabilities.
    """

    def __init__(self):
        # Best (lowest) validation error found during model selection;
        # stays None unless a subclass runs grid search.
        self.fit_score = None

    def _gs_params(self, t_val: LabelledCollection) -> dict:
        """Return the keyword arguments for a ``GridSearchQ`` run that
        validates on the UPP protocol over *t_val*."""
        return {
            "param_grid": {
                "classifier__C": np.logspace(-3, 3, 7),
                "classifier__class_weight": [None, "balanced"],
                "recalib": [None, "bcts"],
            },
            "protocol": UPP(t_val, repeats=1000),
            "error": qp.error.mae,
            "refit": False,
            "timeout": -1,
            "n_jobs": None,
            "verbose": True,
        }

    def extend(self, base: LabelledCollection, pred_proba=None):
        """Extend *base* with classifier posterior probabilities.

        :param base: collection to extend.
        :param pred_proba: precomputed posteriors; when ``None`` they are
            computed with ``self.c_model.predict_proba``.
        :return: tuple ``(extended_collection, pred_proba)`` with the
            posteriors actually used.
        """
        # BUGFIX: must test "is None", not truthiness — ``pred_proba`` is
        # typically a numpy array, and ``not array`` raises ValueError for
        # arrays with more than one element.
        if pred_proba is None:
            pred_proba = self.c_model.predict_proba(base.X)
        return ExtendedCollection.extend_collection(base, pred_proba), pred_proba

    @abstractmethod
    def fit(self, train: LabelledCollection | ExtendedCollection):
        ...

    @abstractmethod
    def estimate(self, instances, ext=False):
        ...


# Short alias used elsewhere in the project.
AE = AccuracyEstimator
class MulticlassAccuracyEstimator(AccuracyEstimator):
    """Accuracy estimator based on a single multiclass quantifier trained in
    the extended (instances x classifier-posteriors) space."""

    def __init__(self, c_model: BaseEstimator, q_model="SLD", gs=False, recalib=None):
        """
        :param c_model: fitted classifier whose accuracy is being estimated.
        :param q_model: quantifier name, "SLD" or "CC" (case-insensitive).
        :param gs: when True, select SLD hyperparameters with ``GridSearchQ``.
        :param recalib: recalibration method forwarded to SLD (e.g. "bcts");
            only used when ``gs`` is False.
        """
        super().__init__()
        self.c_model = c_model
        self._q_model_name = q_model.upper()
        self.e_train = None
        self.gs = gs
        self.recalib = recalib

    def fit(self, train: LabelledCollection | ExtendedCollection):
        """Fit the quantifier on *train*.

        A plain ``LabelledCollection`` is first extended with posteriors
        obtained via cross-validation (so the extension is not overfit to
        ``c_model``'s training predictions); an ``ExtendedCollection`` is
        used as-is.  Assumes ``self.c_model`` is already fit.
        """
        if isinstance(train, LabelledCollection):
            pred_prob_train = cross_val_predict(
                self.c_model, *train.Xy, method="predict_proba"
            )
            self.e_train = ExtendedCollection.extend_collection(train, pred_prob_train)
        else:
            self.e_train = train

        if self._q_model_name == "SLD":
            if self.gs:
                t_train, t_val = self.e_train.split_stratified(0.6, random_state=0)
                gs_params = self._gs_params(t_val)
                self.q_model = GridSearchQ(
                    SLD(LogisticRegression()),
                    **gs_params,
                )
                self.q_model.fit(t_train)
                self.fit_score = self.q_model.best_score_
            else:
                self.q_model = SLD(LogisticRegression(), recalib=self.recalib)
                self.q_model.fit(self.e_train)
        elif self._q_model_name == "CC":
            # NOTE(review): ``gs`` is silently ignored for CC — no grid
            # search is available on this branch.
            self.q_model = CC(LogisticRegression())
            self.q_model.fit(self.e_train)

    def estimate(self, instances, ext=False):
        """Estimate the prevalence of the extended classes on *instances*.

        :param instances: raw instances, or already-extended instances when
            ``ext`` is True.
        :param ext: whether *instances* are already in the extended space.
        :return: prevalence vector aligned with ``self.e_train.classes_``.
        """
        if ext:
            e_inst = instances
        else:
            pred_prob = self.c_model.predict_proba(instances)
            e_inst = ExtendedCollection.extend_instances(instances, pred_prob)

        estim_prev = self.q_model.quantify(e_inst)
        # Re-align the estimate with the full set of training classes,
        # padding classes the quantifier never saw with zero prevalence.
        return self._check_prevalence_classes(
            self.e_train.classes_, self.q_model, estim_prev
        )

    def _check_prevalence_classes(self, true_classes, q_model, estim_prev):
        """Insert a 0.0 prevalence for every class in *true_classes* that is
        missing from the quantifier's own ``classes_``.

        NOTE(review): uses each class label as an insertion *position*, so
        this assumes classes are consecutive integer labels — verify for
        non-integer labels.
        """
        if isinstance(q_model, GridSearchQ):
            estim_classes = q_model.best_model().classes_
        else:
            estim_classes = q_model.classes_
        for _cls in true_classes:
            if _cls not in estim_classes:
                estim_prev = np.insert(estim_prev, _cls, [0.0], axis=0)
        return estim_prev
class BinaryQuantifierAccuracyEstimator(AccuracyEstimator):
    """Accuracy estimator that trains one binary quantifier per predicted
    class: the extended training data is split by the classifier's
    prediction and each slice gets its own quantifier."""

    def __init__(self, c_model: BaseEstimator, q_model="SLD", gs=False, recalib=None):
        """
        :param c_model: fitted classifier whose accuracy is being estimated.
        :param q_model: quantifier name, "SLD" or "CC" (case-insensitive).
        :param gs: when True, select SLD hyperparameters with ``GridSearchQ``.
        :param recalib: recalibration method forwarded to SLD (e.g. "bcts");
            only used when ``gs`` is False.
        """
        super().__init__()
        self.c_model = c_model
        self._q_model_name = q_model.upper()
        self.q_models = []
        self.gs = gs
        self.recalib = recalib
        self.e_train = None

    def fit(self, train: LabelledCollection | ExtendedCollection):
        """Fit one quantifier per predicted-class slice of *train*.

        A plain ``LabelledCollection`` is first extended with posteriors
        obtained via cross-validation; an ``ExtendedCollection`` is used
        as-is.  Assumes ``self.c_model`` is already fit.

        :raises TypeError: if *train* is neither a ``LabelledCollection``
            nor an ``ExtendedCollection``.
        """
        if isinstance(train, ExtendedCollection):
            self.e_train = train
        elif isinstance(train, LabelledCollection):
            pred_prob_train = cross_val_predict(
                self.c_model, *train.Xy, method="predict_proba"
            )
            self.e_train = ExtendedCollection.extend_collection(train, pred_prob_train)
        else:
            # BUGFIX: previously fell through with e_train = None, failing
            # later with an opaque AttributeError; fail fast instead.
            raise TypeError(
                f"train must be a LabelledCollection or ExtendedCollection, "
                f"got {type(train).__name__}"
            )

        self.n_classes = self.e_train.n_classes
        e_trains = self.e_train.split_by_pred()

        if self._q_model_name == "SLD":
            fit_scores = []
            for e_train in e_trains:
                if self.gs:
                    t_train, t_val = e_train.split_stratified(0.6, random_state=0)
                    gs_params = self._gs_params(t_val)
                    q_model = GridSearchQ(
                        SLD(LogisticRegression()),
                        **gs_params,
                    )
                    q_model.fit(t_train)
                    fit_scores.append(q_model.best_score_)
                    self.q_models.append(q_model)
                else:
                    q_model = SLD(LogisticRegression(), recalib=self.recalib)
                    q_model.fit(e_train)
                    self.q_models.append(q_model)
            if self.gs:
                # Aggregate the per-slice grid-search scores.
                self.fit_score = np.mean(fit_scores)
        elif self._q_model_name == "CC":
            # NOTE(review): ``gs`` is silently ignored for CC.
            for e_train in e_trains:
                q_model = CC(LogisticRegression())
                q_model.fit(e_train)
                self.q_models.append(q_model)

    def estimate(self, instances, ext=False):
        """Estimate the prevalence of the extended classes on *instances*.

        :param instances: raw instances, or already-extended instances when
            ``ext`` is True.
        :param ext: whether *instances* are already in the extended space.
        :return: flat numpy array interleaving the per-quantifier estimates.
        """
        # TODO: test
        if not ext:
            pred_prob = self.c_model.predict_proba(instances)
            e_inst = ExtendedCollection.extend_instances(instances, pred_prob)
        else:
            e_inst = instances

        # Number of original classes — the extended space presumably has
        # n_classes = _ncl ** 2 (one extended class per (true, predicted)
        # pair); TODO confirm against ExtendedCollection.
        _ncl = int(math.sqrt(self.n_classes))
        s_inst, norms = ExtendedCollection.split_inst_by_pred(_ncl, e_inst)
        estim_prevs = [
            self._quantify_helper(inst, norm, q_model)
            for (inst, norm, q_model) in zip(s_inst, norms, self.q_models)
        ]

        # Interleave the per-quantifier prevalence vectors column-first into
        # one flat vector.
        estim_prev = [prev for prev_row in zip(*estim_prevs) for prev in prev_row]
        return np.asarray(estim_prev)

    def _quantify_helper(self, inst, norm, q_model):
        """Quantify *inst* with *q_model* and rescale by *norm*; an empty
        slice contributes a zero (binary) prevalence."""
        if inst.shape[0] > 0:
            return np.asarray([p * norm for p in q_model.quantify(inst)])
        else:
            return np.asarray([0.0, 0.0])