baseline testing
This commit is contained in:
parent ede348ea27
commit 90c22cc079
5 binary files changed (contents not shown).
pyproject.toml
@@ -12,8 +12,10 @@ pandas = "^2.0.3"
 jinja2 = "^3.1.2"
 
 [tool.poetry.scripts]
+main = "quacc.main:main"
 multi = "quacc.main:estimate_multiclass"
 bin = "quacc.main:estimate_binary"
+comp = "quacc.main:estimate_comparison"
 
 
 [tool.poetry.group.dev.dependencies]
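Note: the two added [tool.poetry.scripts] entries expose the new experiments as console scripts, so the comparison run can be launched with `poetry run comp` (and `poetry run main` for the default entry point). Each entry is just a module:function reference; a minimal Python equivalent of what `comp` resolves to, assuming the quacc package is importable in the current environment:

from quacc.main import estimate_comparison

estimate_comparison()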
quacc/baseline.py
@@ -6,12 +6,18 @@ import quapy as qp
 from quapy.data import LabelledCollection
 from sklearn.base import BaseEstimator
 from sklearn.model_selection import cross_validate
+from quapy.protocol import (
+    AbstractStochasticSeededProtocol,
+    OnLabelledCollectionProtocol,
+)
 
 import elsahar19_rca.rca as rca
 import garg22_ATC.ATC_helper as atc
 import guillory21_doc.doc as doc
 import jiang18_trustscore.trustscore as trustscore
 import lipton_bbse.labelshift as bbse
+import pandas as pd
+import statistics as stats
 
 
 def kfcv(c_model: BaseEstimator, validation: LabelledCollection) -> Dict:
@@ -20,10 +26,34 @@ def kfcv(c_model: BaseEstimator, validation: LabelledCollection) -> Dict:
     return {"f1_score": mean(scores["test_f1_macro"])}
 
 
+def avg_groupby_distribution(results):
+    def base_prev(s):
+        return (s[("base", "F")], s[("base", "T")])
+
+    grouped_list = {}
+    for r in results:
+        bp = base_prev(r)
+        if bp in grouped_list.keys():
+            grouped_list[bp].append(r)
+        else:
+            grouped_list[bp] = [r]
+
+    series = []
+    for (fp, tp), r_list in grouped_list.items():
+        assert len(r_list) > 0
+        r_avg = {}
+        r_avg[("base", "F")], r_avg[("base", "T")] = fp, tp
+        for pn in [(n1, n2) for ((n1, n2), _) in r_list[0].items() if n1 != "base"]:
+            r_avg[pn] = stats.mean(map(lambda r: r[pn], r_list))
+        series.append(r_avg)
+
+    return series
+
+
 def atc_mc(
     c_model: BaseEstimator,
     validation: LabelledCollection,
-    test: LabelledCollection,
+    protocol: AbstractStochasticSeededProtocol,
    predict_method="predict_proba",
 ):
     c_model_predict = getattr(c_model, predict_method)
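Note: avg_groupby_distribution groups the per-sample result rows by their ("base", "F")/("base", "T") prevalence pair and averages every non-"base" column within each group. A toy illustration with hypothetical values:

rows = [
    {("base", "F"): 0.5, ("base", "T"): 0.5, ("atc_mc", "accuracy"): 0.10},
    {("base", "F"): 0.5, ("base", "T"): 0.5, ("atc_mc", "accuracy"): 0.20},
    {("base", "F"): 0.9, ("base", "T"): 0.1, ("atc_mc", "accuracy"): 0.30},
]
# avg_groupby_distribution(rows) would yield two rows:
#   (0.5, 0.5) -> ("atc_mc", "accuracy") == 0.15
#   (0.9, 0.1) -> ("atc_mc", "accuracy") == 0.30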
@@ -31,27 +61,39 @@ def atc_mc(
     ## Load ID validation data probs and labels
     val_probs, val_labels = c_model_predict(validation.X), validation.y
 
-    ## Load OOD test data probs
-    test_probs = c_model_predict(test.X)
-
     ## score function, e.g., negative entropy or argmax confidence
     val_scores = atc.get_max_conf(val_probs)
     val_preds = np.argmax(val_probs, axis=-1)
-    test_scores = atc.get_max_conf(test_probs)
 
     _, atc_thres = atc.find_ATC_threshold(val_scores, val_labels == val_preds)
-    atc_accuracy = atc.get_ATC_acc(atc_thres, test_scores)
 
-    return {
-        "true_acc": 100 * np.mean(np.argmax(test_probs, axis=-1) == test.y),
-        "pred_acc": atc_accuracy,
-    }
+    # ensure that the protocol returns a LabelledCollection for each iteration
+    protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
+    cols = [
+        ("base", "F"),
+        ("base", "T"),
+        ("atc_mc", "accuracy"),
+    ]
+    results = []
+    for test in protocol():
+        ## Load OOD test data probs
+        test_probs = c_model_predict(test.X)
+        test_scores = atc.get_max_conf(test_probs)
+        atc_accuracy = 1.0 - (atc.get_ATC_acc(atc_thres, test_scores) / 100.0)
+        [f_prev, t_prev] = test.prevalence()
+        results.append({k: v for k, v in zip(cols, [f_prev, t_prev, atc_accuracy])})
+
+    series = avg_groupby_distribution(results)
+    return pd.DataFrame(
+        series,
+        columns=pd.MultiIndex.from_tuples(cols),
+    )
 
 
 def atc_ne(
     c_model: BaseEstimator,
     validation: LabelledCollection,
-    test: LabelledCollection,
+    protocol: AbstractStochasticSeededProtocol,
     predict_method="predict_proba",
 ):
     c_model_predict = getattr(c_model, predict_method)
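Note: atc_mc no longer scores a single test set; it iterates over the samples drawn by the protocol, converts each ATC accuracy estimate (a percentage) into an error-like value via 1.0 - acc / 100.0, and aggregates the rows per base prevalence. For readers unfamiliar with ATC (Garg et al., 2022), a minimal sketch of the thresholding idea; this is an assumption about what the vendored garg22_ATC helpers compute, not their actual code:

import numpy as np

def find_atc_threshold(val_scores, val_correct):
    # pick t so that the share of validation scores above t matches
    # the observed validation accuracy
    sorted_scores = np.sort(val_scores)
    k = int((1.0 - np.mean(val_correct)) * len(sorted_scores))
    return sorted_scores[min(k, len(sorted_scores) - 1)]

def predict_accuracy(threshold, test_scores):
    # predicted accuracy (in percent) on an unlabeled test sample
    return 100.0 * float(np.mean(test_scores >= threshold))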
@@ -59,22 +101,33 @@ def atc_ne(
     ## Load ID validation data probs and labels
     val_probs, val_labels = c_model_predict(validation.X), validation.y
 
-    ## Load OOD test data probs
-    test_probs = c_model_predict(test.X)
-
     ## score function, e.g., negative entropy or argmax confidence
     val_scores = atc.get_entropy(val_probs)
     val_preds = np.argmax(val_probs, axis=-1)
 
-    test_scores = atc.get_entropy(test_probs)
-
     _, atc_thres = atc.find_ATC_threshold(val_scores, val_labels == val_preds)
-    atc_accuracy = atc.get_ATC_acc(atc_thres, test_scores)
 
-    return {
-        "true_acc": 100 * np.mean(np.argmax(test_probs, axis=-1) == test.y),
-        "pred_acc": atc_accuracy,
-    }
+    # ensure that the protocol returns a LabelledCollection for each iteration
+    protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
+    cols = [
+        ("base", "F"),
+        ("base", "T"),
+        ("atc_ne", "accuracy"),
+    ]
+    results = []
+    for test in protocol():
+        ## Load OOD test data probs
+        test_probs = c_model_predict(test.X)
+        test_scores = atc.get_entropy(test_probs)
+        atc_accuracy = 1.0 - (atc.get_ATC_acc(atc_thres, test_scores) / 100.0)
+        [f_prev, t_prev] = test.prevalence()
+        results.append({k: v for k, v in zip(cols, [f_prev, t_prev, atc_accuracy])})
+
+    series = avg_groupby_distribution(results)
+    return pd.DataFrame(
+        series,
+        columns=pd.MultiIndex.from_tuples(cols),
+    )
 
 
 def trust_score(
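Note: atc_ne is identical to atc_mc except for the score function: negative entropy instead of maximum confidence. Assumed semantics of the two scores in garg22_ATC.ATC_helper (a sketch, not the vendored code):

import numpy as np

def get_max_conf(probs):
    # confidence of the predicted class
    return np.max(probs, axis=-1)

def get_entropy(probs):
    # negative entropy: closer to zero means a more confident prediction
    return np.sum(probs * np.log(probs + 1e-20), axis=-1)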
@@ -96,70 +149,148 @@ def trust_score(
 def doc_feat(
     c_model: BaseEstimator,
     validation: LabelledCollection,
-    test: LabelledCollection,
+    protocol: AbstractStochasticSeededProtocol,
     predict_method="predict_proba",
 ):
     c_model_predict = getattr(c_model, predict_method)
 
     val_probs, val_labels = c_model_predict(validation.X), validation.y
-    test_probs = c_model_predict(test.X)
     val_scores = np.max(val_probs, axis=-1)
-    test_scores = np.max(test_probs, axis=-1)
     val_preds = np.argmax(val_probs, axis=-1)
 
     v1acc = np.mean(val_preds == val_labels) * 100
-    return v1acc + doc.get_doc(val_scores, test_scores)
+    # ensure that the protocol returns a LabelledCollection for each iteration
+    protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
+
+    cols = [
+        ("base", "F"),
+        ("base", "T"),
+        ("doc_feat", "score"),
+    ]
+    results = []
+    for test in protocol():
+        test_probs = c_model_predict(test.X)
+        test_scores = np.max(test_probs, axis=-1)
+        score = 1.0 - ((v1acc + doc.get_doc(val_scores, test_scores)) / 100.0)
+        [f_prev, t_prev] = test.prevalence()
+        results.append({k: v for k, v in zip(cols, [f_prev, t_prev, score])})
+
+    series = avg_groupby_distribution(results)
+    return pd.DataFrame(
+        series,
+        columns=pd.MultiIndex.from_tuples(cols),
+    )
 
 
 def rca_score(
     c_model: BaseEstimator,
     validation: LabelledCollection,
-    test: LabelledCollection,
+    protocol: AbstractStochasticSeededProtocol,
     predict_method="predict",
 ):
     c_model_predict = getattr(c_model, predict_method)
-    test_pred = c_model_predict(test.X)
-    c_model2 = rca.clone_fit(test.X, test_pred)
-    c_model2_predict = getattr(c_model2, predict_method)
-
     val_pred1 = c_model_predict(validation.X)
-    val_pred2 = c_model2_predict(validation.X)
 
-    return rca.get_score(val_pred1, val_pred2, validation.y)
+    # ensure that the protocol returns a LabelledCollection for each iteration
+    protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
+
+    cols = [
+        ("base", "F"),
+        ("base", "T"),
+        ("rca", "score"),
+    ]
+    results = []
+    for test in protocol():
+        [f_prev, t_prev] = test.prevalence()
+        try:
+            test_pred = c_model_predict(test.X)
+            c_model2 = rca.clone_fit(c_model, test.X, test_pred)
+            c_model2_predict = getattr(c_model2, predict_method)
+            val_pred2 = c_model2_predict(validation.X)
+            rca_score = 1.0 - rca.get_score(val_pred1, val_pred2, validation.y)
+            results.append({k: v for k, v in zip(cols, [f_prev, t_prev, rca_score])})
+        except ValueError:
+            results.append({k: v for k, v in zip(cols, [f_prev, t_prev, float("nan")])})
+
+    series = avg_groupby_distribution(results)
+    return pd.DataFrame(
+        series,
+        columns=pd.MultiIndex.from_tuples(cols),
+    )
 
 
 def rca_star_score(
     c_model: BaseEstimator,
     validation: LabelledCollection,
-    test: LabelledCollection,
+    protocol: AbstractStochasticSeededProtocol,
     predict_method="predict",
 ):
     c_model_predict = getattr(c_model, predict_method)
     validation1, validation2 = validation.split_stratified(train_prop=0.5)
-    test_pred = c_model_predict(test.X)
     val1_pred = c_model_predict(validation1.X)
-    c_model1 = rca.clone_fit(validation1.X, val1_pred)
-    c_model2 = rca.clone_fit(test.X, test_pred)
+    c_model1 = rca.clone_fit(c_model, validation1.X, val1_pred)
     c_model1_predict = getattr(c_model1, predict_method)
-    c_model2_predict = getattr(c_model2, predict_method)
-
     val2_pred1 = c_model1_predict(validation2.X)
-    val2_pred2 = c_model2_predict(validation2.X)
 
-    return rca.get_score(val2_pred1, val2_pred2, validation2.y)
+    # ensure that the protocol returns a LabelledCollection for each iteration
+    protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
+
+    cols = [
+        ("base", "F"),
+        ("base", "T"),
+        ("rca*", "score"),
+    ]
+    results = []
+    for test in protocol():
+        [f_prev, t_prev] = test.prevalence()
+        try:
+            test_pred = c_model_predict(test.X)
+            c_model2 = rca.clone_fit(c_model, test.X, test_pred)
+            c_model2_predict = getattr(c_model2, predict_method)
+            val2_pred2 = c_model2_predict(validation2.X)
+            rca_star_score = 1.0 - rca.get_score(val2_pred1, val2_pred2, validation2.y)
+            results.append(
+                {k: v for k, v in zip(cols, [f_prev, t_prev, rca_star_score])}
+            )
+        except ValueError:
+            results.append({k: v for k, v in zip(cols, [f_prev, t_prev, float("nan")])})
+
+    series = avg_groupby_distribution(results)
+    return pd.DataFrame(
+        series,
+        columns=pd.MultiIndex.from_tuples(cols),
+    )
 
 
+
 def bbse_score(
     c_model: BaseEstimator,
     validation: LabelledCollection,
-    test: LabelledCollection,
+    protocol: AbstractStochasticSeededProtocol,
     predict_method="predict_proba",
 ):
+
     c_model_predict = getattr(c_model, predict_method)
     val_probs, val_labels = c_model_predict(validation.X), validation.y
-    test_probs = c_model_predict(test.X)
-
-    wt = bbse.estimate_labelshift_ratio(val_labels, val_probs, test_probs, 2)
-    estim_prev = bbse.estimate_target_dist(wt, val_labels, 2)
-    true_prev = test.prevalence()
-    return qp.error.ae(true_prev, estim_prev)
+    # ensure that the protocol returns a LabelledCollection for each iteration
+    protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")
+    cols = [
+        ("base", "F"),
+        ("base", "T"),
+        ("bbse", "score"),
+    ]
+    results = []
+    for test in protocol():
+        test_probs = c_model_predict(test.X)
+        wt = bbse.estimate_labelshift_ratio(val_labels, val_probs, test_probs, 2)
+        estim_prev = bbse.estimate_target_dist(wt, val_labels, 2)[1]
+        true_prev = test.prevalence()
+        [f_prev, t_prev] = true_prev
+        acc = qp.error.ae(true_prev, estim_prev)
+        results.append({k: v for k, v in zip(cols, [f_prev, t_prev, acc])})
+
+    series = avg_groupby_distribution(results)
+    return pd.DataFrame(
+        series,
+        columns=pd.MultiIndex.from_tuples(cols),
+    )
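Note: rca_score and rca_star_score now re-fit a clone of the classifier inside the protocol loop (rca.clone_fit takes the source model as its first argument after this change), and each iteration is wrapped in except ValueError: a sample drawn at an extreme prevalence can contain a single class, which makes refitting fail, so such rows are recorded as NaN. bbse_score instead records qp.error.ae between the sample's true prevalence and the prevalence recovered by black-box shift estimation (Lipton et al., 2018). A minimal sketch of the reverse-classification idea behind the RCA baselines (Elsahar & Gallé, 2019), assuming rca.get_score amounts to accuracy on the labeled validation data:

import numpy as np
from sklearn.base import clone

def rca_sketch(c_model, val_X, val_y, test_X):
    # pseudo-label the unlabeled sample with the deployed model,
    # then re-train a clone of that model on the pseudo-labels
    c_model2 = clone(c_model).fit(test_X, c_model.predict(test_X))
    # reverse accuracy: how well the re-trained model does on labeled
    # validation data; the larger the shift, the lower this tends to be
    return np.mean(c_model2.predict(val_X) == val_y)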
quacc/evaluation.py
@@ -23,9 +23,6 @@ def estimate(
     for sample in protocol():
         e_sample = estimator.extend(sample)
         estim_prev = estimator.estimate(e_sample.X, ext=True)
-        # base_prevs.append(_prettyfloat(accuracy, sample.prevalence()))
-        # true_prevs.append(_prettyfloat(accuracy, e_sample.prevalence()))
-        # estim_prevs.append(_prettyfloat(accuracy, estim_prev))
         base_prevs.append(sample.prevalence())
         true_prevs.append(e_sample.prevalence())
         estim_prevs.append(estim_prev)
@@ -33,37 +30,20 @@ def estimate(
     return base_prevs, true_prevs, estim_prevs
 
 
-_bprev_col_0 = ["base"]
-_bprev_col_1 = ["0", "1"]
-_prev_col_0 = ["true", "estim"]
-_prev_col_1 = ["TN", "FP", "FN", "TP"]
-_err_col_0 = ["errors"]
-
-
-def _report_columns(err_names):
-    bprev_cols = list(itertools.product(_bprev_col_0, _bprev_col_1))
-    prev_cols = list(itertools.product(_prev_col_0, _prev_col_1))
-
-    err_1 = err_names
-    err_cols = list(itertools.product(_err_col_0, err_1))
-
-    cols = bprev_cols + prev_cols + err_cols
-
-    return pd.MultiIndex.from_tuples(cols)
-
-def _report_avg_groupby_distribution(lst, error_names):
+def avg_groupby_distribution(lst, error_names):
     def _bprev(s):
-        return (s[("base", "0")], s[("base", "1")])
+        return (s[("base", "F")], s[("base", "T")])
 
-    def _normalize_prev(r, prev_name):
-        raw_prev = [v for ((k0, k1), v) in r.items() if k0 == prev_name]
-        norm_prev = [v/sum(raw_prev) for v in raw_prev]
-        for n, v in zip(itertools.product([prev_name], _prev_col_1), norm_prev):
-            r[n] = v
-
+    def _normalize_prev(r):
+        for prev_name in ["true", "estim"]:
+            raw_prev = [v for ((k0, k1), v) in r.items() if k0 == prev_name]
+            norm_prev = [v / sum(raw_prev) for v in raw_prev]
+            for n, v in zip(
+                itertools.product([prev_name], ["TN", "FP", "FN", "TP"]), norm_prev
+            ):
+                r[n] = v
         return r
 
-
     current_bprev = _bprev(lst[0])
     bprev_cnt = 0
     g_lst = [[]]
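Note: the refactored _normalize_prev folds the two previous calls (one for "true", one for "estim") into a single helper that rescales each four-cell prevalence block so it sums to one. A toy check of the assumed behavior:

r = {
    ("true", "TN"): 0.30, ("true", "FP"): 0.10,
    ("true", "FN"): 0.15, ("true", "TP"): 0.25,
}
total = sum(r.values())
r = {k: v / total for k, v in r.items()}
assert abs(sum(r.values()) - 1.0) < 1e-9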
@@ -80,27 +60,33 @@ def _report_avg_groupby_distribution(lst, error_names):
     for gs in g_lst:
         assert len(gs) > 0
         r = {}
-        r[("base", "0")], r[("base", "1")] = _bprev(gs[0])
+        r[("base", "F")], r[("base", "T")] = _bprev(gs[0])
 
-        for pn in itertools.product(_prev_col_0, _prev_col_1):
+        for pn in [(n1, n2) for ((n1, n2), _) in gs[0].items() if n1 != "base"]:
             r[pn] = stats.mean(map(lambda s: s[pn], gs))
 
-        r = _normalize_prev(r, "true")
-        r = _normalize_prev(r, "estim")
+        r = _normalize_prev(r)
 
-        for en in itertools.product(_err_col_0, error_names):
+        for en in itertools.product(["errors"], error_names):
             r[en] = stats.mean(map(lambda s: s[en], gs))
-
         r_lst.append(r)
 
     return r_lst
 
 
 def evaluation_report(
     estimator: AccuracyEstimator,
     protocol: AbstractStochasticSeededProtocol,
     error_metrics: Iterable[Union[str, Callable]] = "all",
     aggregate: bool = True,
 ):
+
+    def _report_columns(err_names):
+        base_cols = list(itertools.product(["base"], ["F", "T"]))
+        prev_cols = list(itertools.product(["true", "estim"], ["TN", "FP", "FN", "TP"]))
+        err_cols = list(itertools.product(["errors"], err_names))
+        return base_cols + prev_cols, err_cols
+
     base_prevs, true_prevs, estim_prevs = estimate(estimator, protocol)
+
     if error_metrics == "all":
@@ -114,20 +100,16 @@ def evaluation_report(
     error_cols = error_names.copy()
     if "f1" in error_cols:
         error_cols.remove("f1")
-        error_cols.extend(["f1_true", "f1_estim", "f1_dist"])
+        error_cols.extend(["f1_true", "f1_estim"])
     if "f1e" in error_cols:
         error_cols.remove("f1e")
         error_cols.extend(["f1e_true", "f1e_estim"])
 
     # df_cols = ["base_prev", "true_prev", "estim_prev"] + error_names
-    df_cols = _report_columns(error_cols)
+    prev_cols, err_cols = _report_columns(error_cols)
 
     lst = []
     for base_prev, true_prev, estim_prev in zip(base_prevs, true_prevs, estim_prevs):
-        prev_cols = list(itertools.product(_bprev_col_0, _bprev_col_1)) + list(
-            itertools.product(_prev_col_0, _prev_col_1)
-        )
-
         series = {
             k: v
             for (k, v) in zip(
@@ -143,7 +125,6 @@ def evaluation_report(
             f1_true, f1_estim = error_metric(true_prev), error_metric(estim_prev)
             series[("errors", "f1_true")] = f1_true
             series[("errors", "f1_estim")] = f1_estim
-            series[("errors", "f1_dist")] = abs(f1_estim - f1_true)
             continue
 
         score = error_metric(true_prev, estim_prev)
@@ -151,6 +132,10 @@ def evaluation_report(
 
         lst.append(series)
 
-    lst = _report_avg_groupby_distribution(lst, error_cols) if aggregate else lst
-    df = pd.DataFrame(lst, columns=df_cols)
+    lst = avg_groupby_distribution(lst, error_cols) if aggregate else lst
+
+    df = pd.DataFrame(
+        lst,
+        columns=pd.MultiIndex.from_tuples(prev_cols + err_cols),
+    )
     return df
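Note: _report_columns is now local to evaluation_report and returns the prevalence columns and the error columns as separate lists, which are concatenated into one MultiIndex when the DataFrame is built. With hypothetical error names, the resulting layout is roughly:

import pandas as pd

cols = pd.MultiIndex.from_tuples(
    [("base", "F"), ("base", "T")]
    + [(p, c) for p in ("true", "estim") for c in ("TN", "FP", "FN", "TP")]
    + [("errors", "mae"), ("errors", "f1_true"), ("errors", "f1_estim")]
)
df = pd.DataFrame(columns=cols)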
quacc/main.py
@@ -4,12 +4,13 @@ from quapy.protocol import APP
 from sklearn.linear_model import LogisticRegression
 
 import quacc.evaluation as eval
+import quacc.baseline as baseline
 from quacc.estimator import (
     BinaryQuantifierAccuracyEstimator,
     MulticlassAccuracyEstimator,
 )
 
-from quapy.dataset import get_imdb
+from quacc.dataset import get_imdb, get_spambase
 
 qp.environ["SAMPLE_SIZE"] = 100
 
@@ -20,7 +21,7 @@ dataset_name = "imdb"
 
 def estimate_multiclass():
     print(dataset_name)
-    train, validation, test = get_imdb(dataset_name)
+    train, validation, test = get_imdb()
 
     model = LogisticRegression()
 
@@ -59,7 +60,7 @@ def estimate_multiclass():
 
 def estimate_binary():
     print(dataset_name)
-    train, validation, test = get_imdb(dataset_name)
+    train, validation, test = get_imdb()
 
     model = LogisticRegression()
 
@@ -97,6 +98,39 @@ def estimate_binary():
     # print(df.to_html())
     print()
 
+def estimate_comparison():
+    train, validation, test = get_spambase()
+    model = LogisticRegression()
+    model.fit(*train.Xy)
+
+    n_prevalences = 21
+    repeats = 1000
+    protocol = APP(test, n_prevalences=n_prevalences, repeats=repeats)
+
+    estimator = BinaryQuantifierAccuracyEstimator(model)
+    estimator.fit(validation)
+    df = eval.evaluation_report(estimator, protocol)
+
+    df_index = [("base", "F"), ("base", "T")]
+
+    atc_mc_df = baseline.atc_mc(model, validation, protocol)
+    atc_ne_df = baseline.atc_ne(model, validation, protocol)
+    doc_feat_df = baseline.doc_feat(model, validation, protocol)
+    rca_df = baseline.rca_score(model, validation, protocol)
+    rca_star_df = baseline.rca_star_score(model, validation, protocol)
+    bbse_df = baseline.bbse_score(model, validation, protocol)
+
+    df = df.join(atc_mc_df.set_index(df_index), on=df_index)
+    df = df.join(atc_ne_df.set_index(df_index), on=df_index)
+    df = df.join(doc_feat_df.set_index(df_index), on=df_index)
+    df = df.join(rca_df.set_index(df_index), on=df_index)
+    df = df.join(rca_star_df.set_index(df_index), on=df_index)
+    df = df.join(bbse_df.set_index(df_index), on=df_index)
+
+    print(df.to_string())
+
+def main():
+    estimate_comparison()
 
 if __name__ == "__main__":
-    estimate_binary()
+    main()
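Note: estimate_comparison relies on every baseline frame carrying the same ("base", "F")/("base", "T") columns produced by avg_groupby_distribution, so each frame can be re-indexed on that pair and joined onto the quantifier report. A self-contained sketch with made-up numbers:

import pandas as pd

mi = pd.MultiIndex.from_tuples
df_index = [("base", "F"), ("base", "T")]

report = pd.DataFrame(
    [[0.5, 0.5, 0.12], [0.9, 0.1, 0.30]],
    columns=mi([("base", "F"), ("base", "T"), ("errors", "mae")]),
)
atc_mc_df = pd.DataFrame(
    [[0.5, 0.5, 0.15], [0.9, 0.1, 0.28]],
    columns=mi([("base", "F"), ("base", "T"), ("atc_mc", "accuracy")]),
)
merged = report.join(atc_mc_df.set_index(df_index), on=df_index)
print(merged.to_string())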