baselines refactored and updated, report updated
This commit is contained in:
parent 17b8f4bf6d
commit 210f50b617
File diff suppressed because it is too large
@@ -13,9 +13,8 @@ jinja2 = "^3.1.2"
 
 [tool.poetry.scripts]
 main = "quacc.main:main"
-multi = "quacc.main:estimate_multiclass"
-bin = "quacc.main:estimate_binary"
 comp = "quacc.main:estimate_comparison"
+tohost = "scp_sync:scp_sync_to_host"
 
 
 [tool.poetry.group.dev.dependencies]
@@ -1,4 +1,4 @@
-from typing import List, Optional, Self
+from typing import List, Optional
 
 import numpy as np
 import math
@@ -43,7 +43,7 @@ class ExtendedCollection(LabelledCollection):
     ):
         super().__init__(instances, labels, classes=classes)
 
-    def split_by_pred(self) -> List[Self]:
+    def split_by_pred(self):
         _ncl = int(math.sqrt(self.n_classes))
         _indexes = ExtendedCollection._split_index_by_pred(_ncl, self.instances)
         if isinstance(self.instances, np.ndarray):
@@ -129,7 +129,7 @@ class ExtendedCollection(LabelledCollection):
     @classmethod
     def extend_collection(
         cls, base: LabelledCollection, pred_proba: np.ndarray
-    ) -> Self:
+    ):
        n_classes = base.n_classes
 
         # n_X = [ X | predicted probs. ]
@@ -7,17 +7,22 @@ from sklearn.conftest import fetch_rcv1
 TRAIN_VAL_PROP = 0.5
 
 
-def get_imdb() -> Tuple[LabelledCollection]:
+def get_imdb(**kwargs) -> Tuple[LabelledCollection]:
     train, test = qp.datasets.fetch_reviews("imdb", tfidf=True).train_test
-    train, validation = train.split_stratified(train_prop=TRAIN_VAL_PROP)
+    train, validation = train.split_stratified(
+        train_prop=TRAIN_VAL_PROP, random_state=0
+    )
     return train, validation, test
 
 
-def get_spambase() -> Tuple[LabelledCollection]:
+def get_spambase(**kwargs) -> Tuple[LabelledCollection]:
     train, test = qp.datasets.fetch_UCIDataset("spambase", verbose=False).train_test
-    train, validation = train.split_stratified(train_prop=TRAIN_VAL_PROP)
+    train, validation = train.split_stratified(
+        train_prop=TRAIN_VAL_PROP, random_state=0
+    )
     return train, validation, test
 
 
 # >>> fetch_rcv1().target_names
 # array(['C11', 'C12', 'C13', 'C14', 'C15', 'C151', 'C1511', 'C152', 'C16',
 # 'C17', 'C171', 'C172', 'C173', 'C174', 'C18', 'C181', 'C182',
@@ -33,11 +38,15 @@ def get_spambase() -> Tuple[LabelledCollection]:
 # 'GWELF', 'M11', 'M12', 'M13', 'M131', 'M132', 'M14', 'M141',
 # 'M142', 'M143', 'MCAT'], dtype=object)
 
-def get_rcv1(target:str):
+def get_rcv1(target = "default", **kwargs):
     sample_size = qp.environ["SAMPLE_SIZE"]
     n_train = 23149
     dataset = fetch_rcv1()
 
+    if target == "default":
+        target = "C12"
+
     if target not in dataset.target_names:
         raise ValueError("Invalid target")
 
@@ -46,7 +55,9 @@ def get_rcv1(target:str):
         all_train_l, test_l = labels[:n_train], labels[n_train:]
         all_train = LabelledCollection(all_train_d, all_train_l, classes=classes)
         test = LabelledCollection(test_d, test_l, classes=classes)
-        train, validation = all_train.split_stratified(train_prop=TRAIN_VAL_PROP)
+        train, validation = all_train.split_stratified(
+            train_prop=TRAIN_VAL_PROP, random_state=0
+        )
         return train, validation, test
 
     target_index = np.where(dataset.target_names == target)[0]
@@ -58,4 +69,3 @@ def get_rcv1(target:str):
     d = dataset_split(dataset.data, target_labels, classes=[0, 1])
 
     return d
-
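Note: the refactored dataset helpers all return a (train, validation, test) triple of LabelledCollection objects and now pass random_state=0 so the splits are reproducible. A minimal usage sketch (illustrative only; it uses only functions that appear in this diff):

import quapy as qp
from quacc.dataset import get_rcv1, get_spambase

qp.environ["SAMPLE_SIZE"] = 100  # get_rcv1 reads the sample size from quapy's environment

# spambase: binary UCI dataset, train further split 50/50 into train/validation
train, validation, test = get_spambase()

# rcv1: one-vs-rest task for a chosen category; the new default target is "C12"
train, validation, test = get_rcv1(target="C12")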
@@ -24,12 +24,12 @@ def from_name(err_name):
 def f1(prev):
     den = (2*prev[3]) + prev[1] + prev[2]
     if den == 0:
-        return 1.0
+        return 0.0
     else:
         return (2*prev[3])/den
 
 def f1e(prev):
     return 1 - f1(prev)
 
-def mae(prev):
+def acc(prev):
     return (prev[1] + prev[2]) / sum(prev)
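Note: these helpers operate on a 4-cell "extended" prevalence vector rather than on labels. A small sketch of what f1 and acc compute, assuming the cell order [TN, FP, FN, TP] used by the column names in the removed evaluation.py (that ordering is an assumption; error.py itself does not state it):

# Sketch only: mirrors quacc.error.f1 and quacc.error.acc on a plain list.
prev = [0.45, 0.05, 0.10, 0.40]  # assumed [TN, FP, FN, TP]

den = 2 * prev[3] + prev[1] + prev[2]          # 2*TP + FP + FN
f1 = 0.0 if den == 0 else (2 * prev[3]) / den  # F1 from the contingency cells

acc = (prev[1] + prev[2]) / sum(prev)          # (FP + FN) / N; callers in this commit
print(f1, acc)                                 # report 1 - acc as the accuracy score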
@@ -1,149 +0,0 @@
-import itertools
-from quapy.protocol import (
-    OnLabelledCollectionProtocol,
-    AbstractStochasticSeededProtocol,
-)
-from typing import Iterable, Callable, Union
-
-from .estimator import AccuracyEstimator
-import pandas as pd
-import numpy as np
-import quacc.error as error
-import statistics as stats
-
-
-def estimate(
-    estimator: AccuracyEstimator,
-    protocol: AbstractStochasticSeededProtocol,
-):
-    # ensure that the protocol returns a LabelledCollection for each iteration
-    protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
-
-    base_prevs, true_prevs, estim_prevs = [], [], []
-    for sample in protocol():
-        e_sample = estimator.extend(sample)
-        estim_prev = estimator.estimate(e_sample.X, ext=True)
-        base_prevs.append(sample.prevalence())
-        true_prevs.append(e_sample.prevalence())
-        estim_prevs.append(estim_prev)
-
-    return base_prevs, true_prevs, estim_prevs
-
-
-def avg_groupby_distribution(lst, error_names):
-    def _bprev(s):
-        return (s[("base", "F")], s[("base", "T")])
-
-    def _normalize_prev(r):
-        for prev_name in ["true", "estim"]:
-            raw_prev = [v for ((k0, k1), v) in r.items() if k0 == prev_name]
-            norm_prev = [v / sum(raw_prev) for v in raw_prev]
-            for n, v in zip(
-                itertools.product([prev_name], ["TN", "FP", "FN", "TP"]), norm_prev
-            ):
-                r[n] = v
-        return r
-
-    current_bprev = _bprev(lst[0])
-    bprev_cnt = 0
-    g_lst = [[]]
-    for s in lst:
-        if _bprev(s) == current_bprev:
-            g_lst[bprev_cnt].append(s)
-        else:
-            g_lst.append([])
-            bprev_cnt += 1
-            current_bprev = _bprev(s)
-            g_lst[bprev_cnt].append(s)
-
-    r_lst = []
-    for gs in g_lst:
-        assert len(gs) > 0
-        r = {}
-        r[("base", "F")], r[("base", "T")] = _bprev(gs[0])
-
-        for pn in [(n1, n2) for ((n1, n2), _) in gs[0].items() if n1 != "base"]:
-            r[pn] = stats.mean(map(lambda s: s[pn], gs))
-
-        r = _normalize_prev(r)
-
-        for en in itertools.product(["errors"], error_names):
-            r[en] = stats.mean(map(lambda s: s[en], gs))
-
-        r_lst.append(r)
-
-    return r_lst
-
-
-def evaluation_report(
-    estimator: AccuracyEstimator,
-    protocol: AbstractStochasticSeededProtocol,
-    error_metrics: Iterable[Union[str, Callable]] = "all",
-    aggregate: bool = True,
-    prevalence: bool = True,
-):
-    def _report_columns(err_names):
-        base_cols = list(itertools.product(["base"], ["F", "T"]))
-        prev_cols = list(itertools.product(["true", "estim"], ["TN", "FP", "FN", "TP"]))
-        err_cols = list(itertools.product(["errors"], err_names))
-        return base_cols, prev_cols, err_cols
-
-    base_prevs, true_prevs, estim_prevs = estimate(estimator, protocol)
-
-    if error_metrics == "all":
-        error_metrics = ["mae", "f1"]
-
-    error_funcs = [
-        error.from_name(e) if isinstance(e, str) else e for e in error_metrics
-    ]
-    assert all(hasattr(e, "__call__") for e in error_funcs), "invalid error function"
-    error_names = [e.__name__ for e in error_funcs]
-    error_cols = []
-    for err in error_names:
-        if err == "mae":
-            error_cols.extend(["mae estim", "mae true"])
-        elif err == "f1":
-            error_cols.extend(["f1 estim", "f1 true"])
-        elif err == "f1e":
-            error_cols.extend(["f1e estim", "f1e true"])
-        else:
-            error_cols.append(err)
-
-    # df_cols = ["base_prev", "true_prev", "estim_prev"] + error_names
-    base_cols, prev_cols, err_cols = _report_columns(error_cols)
-
-    lst = []
-    for base_prev, true_prev, estim_prev in zip(base_prevs, true_prevs, estim_prevs):
-        if prevalence:
-            series = {
-                k: v
-                for (k, v) in zip(
-                    base_cols + prev_cols,
-                    np.concatenate((base_prev, true_prev, estim_prev), axis=0),
-                )
-            }
-            df_cols = base_cols + prev_cols + err_cols
-        else:
-            series = {k: v for (k, v) in zip(base_cols, base_prev)}
-            df_cols = base_cols + err_cols
-
-        for err in error_cols:
-            error_funcs = {
-                "mae true": lambda: error.mae(true_prev),
-                "mae estim": lambda: error.mae(estim_prev),
-                "f1 true": lambda: error.f1(true_prev),
-                "f1 estim": lambda: error.f1(estim_prev),
-                "f1e true": lambda: error.f1e(true_prev),
-                "f1e estim": lambda: error.f1e(estim_prev),
-            }
-            series[("errors", err)] = error_funcs[err]()
-
-        lst.append(series)
-
-    lst = avg_groupby_distribution(lst, error_cols) if aggregate else lst
-
-    df = pd.DataFrame(
-        lst,
-        columns=pd.MultiIndex.from_tuples(df_cols),
-    )
-    return df
@@ -2,52 +2,73 @@ from statistics import mean
 from typing import Dict
 
 import numpy as np
-import quapy as qp
 from quapy.data import LabelledCollection
 from sklearn.base import BaseEstimator
 from sklearn.model_selection import cross_validate
+import sklearn.metrics as metrics
 from quapy.protocol import (
     AbstractStochasticSeededProtocol,
     OnLabelledCollectionProtocol,
 )
 
+from .report import EvaluationReport
+
 import elsahar19_rca.rca as rca
 import garg22_ATC.ATC_helper as atc
 import guillory21_doc.doc as doc
 import jiang18_trustscore.trustscore as trustscore
-import lipton_bbse.labelshift as bbse
-import pandas as pd
-import statistics as stats
 
 
-def kfcv(c_model: BaseEstimator, validation: LabelledCollection) -> Dict:
-    scoring = ["f1_macro"]
+def kfcv(
+    c_model: BaseEstimator,
+    validation: LabelledCollection,
+    protocol: AbstractStochasticSeededProtocol,
+    predict_method="predict"
+):
+    c_model_predict = getattr(c_model, predict_method)
+
+    scoring = ["accuracy", "f1_macro"]
     scores = cross_validate(c_model, validation.X, validation.y, scoring=scoring)
-    return {"f1_score": mean(scores["test_f1_macro"])}
+    acc_score = mean(scores["test_accuracy"])
+    f1_score = mean(scores["test_f1_macro"])
+
+    # ensure that the protocol returns a LabelledCollection for each iteration
+    protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
+
+    report = EvaluationReport(prefix="kfcv")
+    for test in protocol():
+        test_preds = c_model_predict(test.X)
+        meta_acc = abs(acc_score - metrics.accuracy_score(test.y, test_preds))
+        meta_f1 = abs(f1_score - metrics.f1_score(test.y, test_preds))
+        report.append_row(
+            test.prevalence(),
+            acc_score=(1. - acc_score),
+            f1_score=f1_score,
+            acc=meta_acc,
+            f1=meta_f1,
+        )
+
+    return report
 
 
-def avg_groupby_distribution(results):
-    def base_prev(s):
-        return (s[("base", "F")], s[("base", "T")])
-
-    grouped_list = {}
-    for r in results:
-        bp = base_prev(r)
-        if bp in grouped_list.keys():
-            grouped_list[bp].append(r)
-        else:
-            grouped_list[bp] = [r]
-
-    series = []
-    for (fp, tp), r_list in grouped_list.items():
-        assert len(r_list) > 0
-        r_avg = {}
-        r_avg[("base", "F")], r_avg[("base", "T")] = fp, tp
-        for pn in [(n1, n2) for ((n1, n2), _) in r_list[0].items() if n1 != "base"]:
-            r_avg[pn] = stats.mean(map(lambda r: r[pn], r_list))
-        series.append(r_avg)
-
-    return series
+def reference(
+    c_model: BaseEstimator,
+    validation: LabelledCollection,
+    protocol: AbstractStochasticSeededProtocol,
+):
+    protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
+    c_model_predict = getattr(c_model, "predict_proba")
+    report = EvaluationReport(prefix="ref")
+    for test in protocol():
+        test_probs = c_model_predict(test.X)
+        test_preds = np.argmax(test_probs, axis=-1)
+        report.append_row(
+            test.prevalence(),
+            acc_score=(1 - metrics.accuracy_score(test.y, test_preds)),
+            f1_score=metrics.f1_score(test.y, test_preds),
+        )
+
+    return report
 
 
 def atc_mc(
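Note: after this refactor the baselines no longer return pandas DataFrames; each one walks the protocol samples and fills an EvaluationReport. A hedged usage sketch (the dataset and classifier are illustrative choices; the function names are the ones shown in this diff):

import quapy as qp
from quapy.protocol import APP
from sklearn.linear_model import LogisticRegression

import quacc.evaluation.baseline as baseline
from quacc.dataset import get_spambase

qp.environ["SAMPLE_SIZE"] = 100

train, validation, test = get_spambase()
model = LogisticRegression().fit(*train.Xy)

protocol = APP(test, n_prevalences=21, repeats=100)
report = baseline.kfcv(model, validation, protocol)  # EvaluationReport with prefix "kfcv"
print(report.get_dataframe(metrics=["acc"]))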
@@ -69,26 +90,26 @@ def atc_mc(
     # ensure that the protocol returns a LabelledCollection for each iteration
     protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
 
-    cols = [
-        ("base", "F"),
-        ("base", "T"),
-        ("atc mc", "accuracy"),
-    ]
-    results = []
+    report = EvaluationReport(prefix="atc_mc")
     for test in protocol():
         ## Load OOD test data probs
         test_probs = c_model_predict(test.X)
+        test_preds = np.argmax(test_probs, axis=-1)
         test_scores = atc.get_max_conf(test_probs)
-        atc_accuracy = 1.0 - (atc.get_ATC_acc(atc_thres, test_scores) / 100.0)
-        [f_prev, t_prev] = test.prevalence()
-        results.append({k: v for k, v in zip(cols, [f_prev, t_prev, atc_accuracy])})
-
-    series = avg_groupby_distribution(results)
-    return pd.DataFrame(
-        series,
-        columns=pd.MultiIndex.from_tuples(cols),
-    )
+        atc_accuracy = atc.get_ATC_acc(atc_thres, test_scores)
+        meta_acc = abs(atc_accuracy - metrics.accuracy_score(test.y, test_preds))
+        f1_score = atc.get_ATC_f1(atc_thres, test_scores, test_probs)
+        meta_f1 = abs(f1_score - metrics.f1_score(test.y, test_preds))
+        report.append_row(
+            test.prevalence(),
+            acc=meta_acc,
+            acc_score=1.0 - atc_accuracy,
+            f1_score=f1_score,
+            f1=meta_f1,
+        )
+
+    return report
 
 
 def atc_ne(
     c_model: BaseEstimator,
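Note: the pattern shared by the refactored baselines is that acc_score stores the method's own (inverted) accuracy estimate for a protocol sample, while acc stores the absolute gap between that estimate and the accuracy actually measured on the sample. A minimal sketch of that bookkeeping (illustrative helper, not part of the codebase):

import numpy as np
import sklearn.metrics as metrics

def meta_row(predicted_acc, test_y, test_probs):
    # accuracy the classifier actually achieves on this protocol sample
    test_preds = np.argmax(test_probs, axis=-1)
    true_acc = metrics.accuracy_score(test_y, test_preds)
    # what gets logged: the (inverted) prediction and the meta-error of the prediction
    return {"acc_score": 1.0 - predicted_acc, "acc": abs(predicted_acc - true_acc)}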
@@ -109,26 +130,26 @@ def atc_ne(
     # ensure that the protocol returns a LabelledCollection for each iteration
     protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
 
-    cols = [
-        ("base", "F"),
-        ("base", "T"),
-        ("atc ne", "accuracy"),
-    ]
-    results = []
+    report = EvaluationReport(prefix="atc_ne")
     for test in protocol():
         ## Load OOD test data probs
         test_probs = c_model_predict(test.X)
+        test_preds = np.argmax(test_probs, axis=-1)
         test_scores = atc.get_entropy(test_probs)
-        atc_accuracy = 1.0 - (atc.get_ATC_acc(atc_thres, test_scores) / 100.0)
-        [f_prev, t_prev] = test.prevalence()
-        results.append({k: v for k, v in zip(cols, [f_prev, t_prev, atc_accuracy])})
-
-    series = avg_groupby_distribution(results)
-    return pd.DataFrame(
-        series,
-        columns=pd.MultiIndex.from_tuples(cols),
-    )
+        atc_accuracy = atc.get_ATC_acc(atc_thres, test_scores)
+        meta_acc = abs(atc_accuracy - metrics.accuracy_score(test.y, test_preds))
+        f1_score = atc.get_ATC_f1(atc_thres, test_scores, test_probs)
+        meta_f1 = abs(f1_score - metrics.f1_score(test.y, test_preds))
+        report.append_row(
+            test.prevalence(),
+            acc=meta_acc,
+            acc_score=(1.0 - atc_accuracy),
+            f1_score=f1_score,
+            f1=meta_f1,
+        )
+
+    return report
 
 
 def trust_score(
     c_model: BaseEstimator,
@@ -162,24 +183,16 @@ def doc_feat(
     # ensure that the protocol returns a LabelledCollection for each iteration
     protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
 
-    cols = [
-        ("base", "F"),
-        ("base", "T"),
-        ("doc feat", "score"),
-    ]
-    results = []
+    report = EvaluationReport(prefix="doc_feat")
     for test in protocol():
         test_probs = c_model_predict(test.X)
+        test_preds = np.argmax(test_probs, axis=-1)
         test_scores = np.max(test_probs, axis=-1)
-        score = 1.0 - ((v1acc + doc.get_doc(val_scores, test_scores)) / 100.0)
-        [f_prev, t_prev] = test.prevalence()
-        results.append({k: v for k, v in zip(cols, [f_prev, t_prev, score])})
-
-    series = avg_groupby_distribution(results)
-    return pd.DataFrame(
-        series,
-        columns=pd.MultiIndex.from_tuples(cols),
-    )
+        score = (v1acc + doc.get_doc(val_scores, test_scores)) / 100.0
+        meta_acc = abs(score - metrics.accuracy_score(test.y, test_preds))
+        report.append_row(test.prevalence(), acc=meta_acc, acc_score=(1.0 - score))
+
+    return report
 
 
 def rca_score(
@@ -194,29 +207,24 @@ def rca_score(
     # ensure that the protocol returns a LabelledCollection for each iteration
     protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
 
-    cols = [
-        ("base", "F"),
-        ("base", "T"),
-        ("rca", "score"),
-    ]
-    results = []
+    report = EvaluationReport(prefix="rca")
     for test in protocol():
         try:
-            [f_prev, t_prev] = test.prevalence()
             test_pred = c_model_predict(test.X)
             c_model2 = rca.clone_fit(c_model, test.X, test_pred)
             c_model2_predict = getattr(c_model2, predict_method)
             val_pred2 = c_model2_predict(validation.X)
             rca_score = rca.get_score(val_pred1, val_pred2, validation.y)
-            results.append({k: v for k, v in zip(cols, [f_prev, t_prev, rca_score])})
-        except ValueError:
-            results.append({k: v for k, v in zip(cols, [f_prev, t_prev, float("nan")])})
-
-    series = avg_groupby_distribution(results)
-    return pd.DataFrame(
-        series,
-        columns=pd.MultiIndex.from_tuples(cols),
-    )
+            meta_score = abs(
+                rca_score - (1 - metrics.accuracy_score(test.y, test_pred))
+            )
+            report.append_row(test.prevalence(), acc=meta_score, acc_score=rca_score)
+        except ValueError:
+            report.append_row(
+                test.prevalence(), acc=float("nan"), acc_score=float("nan")
+            )
+
+    return report
 
 
 def rca_star_score(
@@ -226,7 +234,9 @@ def rca_star_score(
     predict_method="predict",
 ):
     c_model_predict = getattr(c_model, predict_method)
-    validation1, validation2 = validation.split_stratified(train_prop=0.5)
+    validation1, validation2 = validation.split_stratified(
+        train_prop=0.5, random_state=0
+    )
     val1_pred = c_model_predict(validation1.X)
     c_model1 = rca.clone_fit(c_model, validation1.X, val1_pred)
     c_model1_predict = getattr(c_model1, predict_method)
@@ -235,62 +245,23 @@ def rca_star_score(
     # ensure that the protocol returns a LabelledCollection for each iteration
     protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
 
-    cols = [
-        ("base", "F"),
-        ("base", "T"),
-        ("rca*", "score"),
-    ]
-    results = []
+    report = EvaluationReport(prefix="rca_star")
     for test in protocol():
-        [f_prev, t_prev] = test.prevalence()
         try:
             test_pred = c_model_predict(test.X)
             c_model2 = rca.clone_fit(c_model, test.X, test_pred)
             c_model2_predict = getattr(c_model2, predict_method)
             val2_pred2 = c_model2_predict(validation2.X)
             rca_star_score = rca.get_score(val2_pred1, val2_pred2, validation2.y)
-            results.append(
-                {k: v for k, v in zip(cols, [f_prev, t_prev, rca_star_score])}
-            )
+            meta_score = abs(
+                rca_star_score - (1 - metrics.accuracy_score(test.y, test_pred))
+            )
+            report.append_row(
+                test.prevalence(), acc=meta_score, acc_score=rca_star_score
+            )
         except ValueError:
-            results.append({k: v for k, v in zip(cols, [f_prev, t_prev, float("nan")])})
-
-    series = avg_groupby_distribution(results)
-    return pd.DataFrame(
-        series,
-        columns=pd.MultiIndex.from_tuples(cols),
-    )
-
-
-def bbse_score(
-    c_model: BaseEstimator,
-    validation: LabelledCollection,
-    protocol: AbstractStochasticSeededProtocol,
-    predict_method="predict_proba",
-):
-    c_model_predict = getattr(c_model, predict_method)
-    val_probs, val_labels = c_model_predict(validation.X), validation.y
-
-    # ensure that the protocol returns a LabelledCollection for each iteration
-    protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
-
-    cols = [
-        ("base", "F"),
-        ("base", "T"),
-        ("bbse", "score"),
-    ]
-    results = []
-    for test in protocol():
-        test_probs = c_model_predict(test.X)
-        wt = bbse.estimate_labelshift_ratio(val_labels, val_probs, test_probs, 2)
-        estim_prev = bbse.estimate_target_dist(wt, val_labels, 2)[1]
-        true_prev = test.prevalence()
-        [f_prev, t_prev] = true_prev
-        acc = qp.error.ae(true_prev, estim_prev)
-        results.append({k: v for k, v in zip(cols, [f_prev, t_prev, acc])})
-
-    series = avg_groupby_distribution(results)
-    return pd.DataFrame(
-        series,
-        columns=pd.MultiIndex.from_tuples(cols),
-    )
+            report.append_row(
+                test.prevalence(), acc=float("nan"), acc_score=float("nan")
+            )
+
+    return report
@@ -0,0 +1,149 @@
+import multiprocessing
+import time
+
+import pandas as pd
+import quapy as qp
+from quapy.data import LabelledCollection
+from quapy.protocol import (
+    APP,
+    AbstractStochasticSeededProtocol,
+    OnLabelledCollectionProtocol,
+)
+from sklearn.base import BaseEstimator
+from sklearn.linear_model import LogisticRegression
+
+import quacc.error as error
+import quacc.evaluation.baseline as baseline
+from quacc.dataset import get_imdb, get_rcv1, get_spambase
+from quacc.evaluation.report import EvaluationReport
+
+from ..estimator import (
+    AccuracyEstimator,
+    BinaryQuantifierAccuracyEstimator,
+    MulticlassAccuracyEstimator,
+)
+
+qp.environ["SAMPLE_SIZE"] = 100
+
+pd.set_option("display.float_format", "{:.4f}".format)
+
+n_prevalences = 21
+repreats = 100
+
+
+def estimate(
+    estimator: AccuracyEstimator,
+    protocol: AbstractStochasticSeededProtocol,
+):
+    # ensure that the protocol returns a LabelledCollection for each iteration
+    protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
+
+    base_prevs, true_prevs, estim_prevs = [], [], []
+    for sample in protocol():
+        e_sample = estimator.extend(sample)
+        estim_prev = estimator.estimate(e_sample.X, ext=True)
+        base_prevs.append(sample.prevalence())
+        true_prevs.append(e_sample.prevalence())
+        estim_prevs.append(estim_prev)
+
+    return base_prevs, true_prevs, estim_prevs
+
+
+def evaluation_report(
+    estimator: AccuracyEstimator,
+    protocol: AbstractStochasticSeededProtocol,
+    method: str,
+) -> EvaluationReport:
+    base_prevs, true_prevs, estim_prevs = estimate(estimator, protocol)
+    report = EvaluationReport(prefix=method)
+
+    for base_prev, true_prev, estim_prev in zip(base_prevs, true_prevs, estim_prevs):
+        acc_score = error.acc(estim_prev)
+        f1_score = error.f1(estim_prev)
+        report.append_row(
+            base_prev,
+            acc_score=1. - acc_score,
+            acc = abs(error.acc(true_prev) - acc_score),
+            f1_score=f1_score,
+            f1=abs(error.f1(true_prev) - f1_score)
+        )
+
+    return report
+
+
+def evaluate(
+    c_model: BaseEstimator,
+    validation: LabelledCollection,
+    protocol: AbstractStochasticSeededProtocol,
+    method: str,
+):
+    estimator : AccuracyEstimator = {
+        "bin": BinaryQuantifierAccuracyEstimator,
+        "mul": MulticlassAccuracyEstimator,
+    }[method](c_model)
+    estimator.fit(validation)
+    return evaluation_report(estimator, protocol, method)
+
+
+def evaluate_binary(model, validation, protocol):
+    return evaluate(model, validation, protocol, "bin")
+
+
+def evaluate_multiclass(model, validation, protocol):
+    return evaluate(model, validation, protocol, "mul")
+
+
+def fit_and_estimate(_estimate, train, validation, test):
+    model = LogisticRegression()
+
+    model.fit(*train.Xy)
+    protocol = APP(test, n_prevalences=n_prevalences, repeats=repreats)
+    start = time.time()
+    result = _estimate(model, validation, protocol)
+    end = time.time()
+
+    return {
+        "name": _estimate.__name__,
+        "result": result,
+        "time": end - start,
+    }
+
+
+def evaluate_comparison(dataset: str, **kwargs) -> EvaluationReport:
+    train, validation, test = {
+        "spambase": get_spambase,
+        "imdb": get_imdb,
+        "rcv1": get_rcv1,
+    }[dataset](**kwargs)
+
+    for k,v in kwargs.items():
+        print(k, ":", v)
+
+    prevs = {
+        "train": train.prevalence(),
+        "validation": validation.prevalence(),
+    }
+
+    start = time.time()
+    with multiprocessing.Pool(8) as pool:
+        estimators = [
+            evaluate_binary,
+            evaluate_multiclass,
+            baseline.kfcv,
+            baseline.atc_mc,
+            baseline.atc_ne,
+            baseline.doc_feat,
+            baseline.rca_score,
+            baseline.rca_star_score,
+        ]
+        tasks = [(estim, train, validation, test) for estim in estimators]
+        results = [pool.apply_async(fit_and_estimate, t) for t in tasks]
+        results = list(map(lambda r: r.get(), results))
+        er = EvaluationReport.combine_reports(*list(map(lambda r: r["result"], results)))
+        times = {r["name"]:r["time"] for r in results}
+        end = time.time()
+        times["tot"] = end - start
+        er.times = times
+        er.prevs = prevs
+
+    return er
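Note: this new module is the driver used by quacc/main.py below; it fits one LogisticRegression per estimator in a process pool and merges all the resulting EvaluationReports. A hedged usage sketch:

# Illustrative only: mirrors what quacc/main.py does with this module.
from quacc.evaluation.method import evaluate_comparison

if __name__ == "__main__":
    er = evaluate_comparison("spambase", target="default")  # one merged EvaluationReport
    print(er.times)                                          # per-baseline wall-clock times
    print(er.get_dataframe(metrics=["acc"]))                 # grouped means of the "acc" columns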
@@ -0,0 +1,162 @@
+from email import header
+from typing import Tuple
+import statistics as stats
+import numpy as np
+import pandas as pd
+
+def _in_div(s):
+    return "<div>" + s + "</div>\n"
+
+def _header_footer(s):
+    return (
+        """
+        <html>
+        <head>
+        <style>
+        .dataframe {
+            tr:hover {
+                background-color: aquamarine;
+            }
+        }
+        </style>
+        </head>
+        <body>
+        """ +
+        s +
+        "</body></html>"
+    )
+
+class EvaluationReport:
+    def __init__(self, prefix=None):
+        self.base = []
+        self.dict = {}
+        self._grouped = False
+        self._grouped_base = []
+        self._grouped_dict = {}
+        self._dataframe = None
+        self.prefix = prefix if prefix is not None else "default"
+        self._times = {}
+        self._prevs = {}
+        self._target = "default"
+
+    def append_row(self, base: np.ndarray | Tuple, **row):
+        if isinstance(base, np.ndarray):
+            base = tuple(base.tolist())
+        self.base.append(base)
+        for k, v in row.items():
+            if (k, self.prefix) in self.dict:
+                self.dict[(k, self.prefix)].append(v)
+            else:
+                self.dict[(k, self.prefix)] = [v]
+        self._grouped = False
+        self._dataframe = None
+
+    @property
+    def columns(self):
+        return self.dict.keys()
+
+    @property
+    def grouped(self):
+        if self._grouped:
+            return self._grouped_dict
+
+        self._grouped_base = []
+        self._grouped_dict = {k: [] for k in self.dict.keys()}
+
+        last_end = 0
+        for ind, bp in enumerate(self.base):
+            if ind < (len(self.base) - 1) and bp == self.base[ind + 1]:
+                continue
+
+            self._grouped_base.append(bp)
+            for col in self.dict.keys():
+                self._grouped_dict[col].append(
+                    stats.mean(self.dict[col][last_end : ind + 1])
+                )
+
+            last_end = ind + 1
+
+        self._grouped = True
+        return self._grouped_dict
+
+    @property
+    def gbase(self):
+        self.grouped
+        return self._grouped_base
+
+    def get_dataframe(self, metrics=None):
+        if self._dataframe is None:
+            self_columns = sorted(self.columns, key=lambda c: c[0])
+            self._dataframe = pd.DataFrame(
+                self.grouped,
+                index=self.gbase,
+                columns=pd.MultiIndex.from_tuples(self_columns),
+            )
+
+        df = pd.DataFrame(self._dataframe)
+        if metrics is not None:
+            df = df.drop(
+                [(c0, c1) for (c0, c1) in df.columns if c0 not in metrics], axis=1
+            )
+
+        if len(set(k0 for k0, k1 in df.columns)) == 1:
+            df = df.droplevel(0, axis=1)
+
+        return df
+
+    def merge(self, other):
+        if not all(v1 == v2 for v1, v2 in zip(self.base, other.base)):
+            raise ValueError("other has not same base prevalences of self")
+
+        if len(set(self.dict.keys()).intersection(set(other.dict.keys()))) > 0:
+            raise ValueError("self and other have matching keys")
+
+        report = EvaluationReport()
+        report.base = self.base
+        report.dict = self.dict | other.dict
+        return report
+
+    @property
+    def times(self):
+        return self._times
+
+    @times.setter
+    def times(self, val):
+        self._times = val
+
+    @property
+    def prevs(self):
+        return self._prevs
+
+    @prevs.setter
+    def prevs(self, val):
+        self._prevs = val
+
+    @property
+    def target(self):
+        return self._target
+
+    @target.setter
+    def target(self, val):
+        self._target = val
+
+    def to_html(self, *metrics):
+        res = _in_div("target: " + self.target)
+        for k,v in self.prevs.items():
+            res += _in_div(f"{k}: {str(v)}")
+        for k,v in self.times.items():
+            res += _in_div(f"{k}: {v:.3f}s")
+        res += "\n"
+        for m in metrics:
+            res += self.get_dataframe(metrics=m).to_html() + "\n\n"
+
+        return _header_footer(res)
+
+
+    @staticmethod
+    def combine_reports(*args):
+        er = args[0]
+        for r in args[1:]:
+            er = er.merge(r)
+
+        return er
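Note: EvaluationReport stores one row per protocol sample under (metric, prefix) column keys and lazily averages consecutive rows that share the same base prevalence. A small self-contained sketch of the API added here:

from quacc.evaluation.report import EvaluationReport

r1 = EvaluationReport(prefix="method_a")
r1.append_row((0.5, 0.5), acc=0.02, acc_score=0.10)
r1.append_row((0.5, 0.5), acc=0.04, acc_score=0.12)  # same base prevalence, so it gets averaged

r2 = EvaluationReport(prefix="method_b")
r2.append_row((0.5, 0.5), acc=0.03, acc_score=0.11)
r2.append_row((0.5, 0.5), acc=0.05, acc_score=0.13)

er = EvaluationReport.combine_reports(r1, r2)  # columns ("acc"|"acc_score", "method_a"|"method_b")
print(er.get_dataframe(metrics=["acc"]))       # one row per base prevalence, "acc" columns only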
quacc/main.py (152 changed lines)
@@ -1,133 +1,41 @@
-import pandas as pd
-import quapy as qp
-from quapy.protocol import APP
-from sklearn.linear_model import LogisticRegression
-from quacc import utils
+import traceback
+import quacc.evaluation.method as method
 
-import quacc.evaluation as eval
-import quacc.baseline as baseline
-from quacc.estimator import (
-    BinaryQuantifierAccuracyEstimator,
-    MulticlassAccuracyEstimator,
-)
-
-from quacc.dataset import get_imdb, get_rcv1, get_spambase
-
-qp.environ["SAMPLE_SIZE"] = 100
-
-pd.set_option("display.float_format", "{:.4f}".format)
-
-dataset_name = "imdb"
-
-
-def estimate_multiclass():
-    print(dataset_name)
-    train, validation, test = get_imdb()
-
-    model = LogisticRegression()
-
-    print(f"fitting model {model.__class__.__name__}...", end=" ", flush=True)
-    model.fit(*train.Xy)
-    print("fit")
-
-    estimator = MulticlassAccuracyEstimator(model)
-
-    print(
-        f"fitting qmodel {estimator.q_model.__class__.__name__}...", end=" ", flush=True
-    )
-    estimator.fit(train)
-    print("fit")
-
-    n_prevalences = 21
-    repreats = 1000
-    protocol = APP(test, n_prevalences=n_prevalences, repeats=repreats)
-    print(
-        f"Tests:\n\
-        protocol={protocol.__class__.__name__}\n\
-        n_prevalences={n_prevalences}\n\
-        repreats={repreats}\n\
-        executing...\n"
-    )
-    df = eval.evaluation_report(
-        estimator,
-        protocol,
-        aggregate=True,
-    )
-    # print(df.to_latex())
-    print(df.to_string())
-    # print(df.to_html())
-    print()
-
-
-def estimate_binary():
-    print(dataset_name)
-    train, validation, test = get_imdb()
-
-    model = LogisticRegression()
-
-    print(f"fitting model {model.__class__.__name__}...", end=" ", flush=True)
-    model.fit(*train.Xy)
-    print("fit")
-
-    estimator = BinaryQuantifierAccuracyEstimator(model)
-
-    print(
-        f"fitting qmodel {estimator.q_model_0.__class__.__name__}...",
-        end=" ",
-        flush=True,
-    )
-    estimator.fit(train)
-    print("fit")
-
-    n_prevalences = 21
-    repreats = 1000
-    protocol = APP(test, n_prevalences=n_prevalences, repeats=repreats)
-    print(
-        f"Tests:\n\
-        protocol={protocol.__class__.__name__}\n\
-        n_prevalences={n_prevalences}\n\
-        repreats={repreats}\n\
-        executing...\n"
-    )
-    df = eval.evaluation_report(
-        estimator,
-        protocol,
-        aggregate=True,
-    )
-    # print(df.to_latex(float_format="{:.4f}".format))
-    print(df.to_string())
-    # print(df.to_html())
-    print()
+DATASET = "imdb"
+OUTPUT_FILE = "out_" + DATASET + ".html"
+TARGETS = {
+    "rcv1" : [
+        'C12',
+        'C13', 'C15', 'C151', 'C1511', 'C152', 'C17', 'C172',
+        'C18', 'C181', 'C21', 'C24', 'C31', 'C42', 'CCAT'
+        'E11', 'E12', 'E21', 'E211', 'E212', 'E41', 'E51', 'ECAT',
+        'G15', 'GCAT', 'GCRIM', 'GDIP', 'GPOL', 'GVIO', 'GVOTE', 'GWEA',
+        'GWELF', 'M11', 'M12', 'M13', 'M131', 'M132', 'M14', 'M141',
+        'M142', 'M143', 'MCAT'
+    ],
+    "spambase": ["default"],
+    "imdb": ["default"],
+}
 
 
 def estimate_comparison():
-    train, validation, test = get_spambase()
-    model = LogisticRegression()
-    model.fit(*train.Xy)
+    open(OUTPUT_FILE, "w").close()
+    targets = TARGETS[DATASET]
+    for target in targets:
+        try:
+            er = method.evaluate_comparison(DATASET, target=target)
+            er.target = target
+            with open(OUTPUT_FILE, "a") as f:
+                f.write(er.to_html(["acc"], ["f1"]))
+        except Exception:
+            traceback.print_exc()
 
-    n_prevalences = 21
-    repreats = 1000
-    protocol = APP(test, n_prevalences=n_prevalences, repeats=repreats)
-
-    estimator = BinaryQuantifierAccuracyEstimator(model)
-    estimator.fit(validation)
-    df = eval.evaluation_report(estimator, protocol, prevalence=False)
-
-    df = utils.combine_dataframes(
-        baseline.atc_mc(model, validation, protocol),
-        baseline.atc_ne(model, validation, protocol),
-        baseline.doc_feat(model, validation, protocol),
-        baseline.rca_score(model, validation, protocol),
-        baseline.rca_star_score(model, validation, protocol),
-        baseline.bbse_score(model, validation, protocol),
-        df,
-        df_index=[("base", "F"), ("base", "T")]
-    )
-
-    print(df.to_latex(float_format="{:.4f}".format))
-    print(utils.avg_group_report(df).to_latex(float_format="{:.4f}".format))
+    # print(df.to_latex(float_format="{:.4f}".format))
+    # print(utils.avg_group_report(df).to_latex(float_format="{:.4f}".format))
 
 
 def main():
     estimate_comparison()
 
 
 if __name__ == "__main__":
     main()
@@ -2,7 +2,7 @@
 import functools
 import pandas as pd
 
-def combine_dataframes(*dfs, df_index=[]) -> pd.DataFrame:
+def combine_dataframes(dfs, df_index=[]) -> pd.DataFrame:
     if len(dfs) < 1:
         raise ValueError
     if len(dfs) == 1: