# gfun_multimodal/evaluation/evaluate.py

import numpy as np
from collections import defaultdict

from joblib import Parallel, delayed

# macroF1, microF1, macroK and microK are provided by the project's metrics module
from evaluation.metrics import *
from sklearn.metrics import accuracy_score, top_k_accuracy_score, f1_score


def evaluation_metrics(y, y_, clf_type):
    """Compute the evaluation metrics for one language's true/predicted labels."""
    if clf_type == "singlelabel":
        return (
            accuracy_score(y, y_),
            # TODO: top_k_accuracy_score(y, y_, k=5) and k=10 require the model
            # logits rather than hard predictions; add them once logits are available.
            f1_score(y, y_, average="macro", zero_division=1),
            f1_score(y, y_, average="micro"),
        )
    elif clf_type == "multilabel":
        return (
            macroF1(y, y_),
            microF1(y, y_),
            macroK(y, y_),
            microK(y, y_),
        )
    else:
        raise ValueError("clf_type must be either 'singlelabel' or 'multilabel'")


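# Note (illustrative): `ly_true` / `ly_pred` below are dicts mapping a language
# code to that language's label array, e.g.
#     ly_true = {"en": y_true_en, "it": y_true_it}
# with `ly_pred` keyed identically; each language is scored independently.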
def evaluate(
    ly_true, ly_pred, metrics=evaluation_metrics, n_jobs=-1, clf_type="multilabel"
):
    """Score predictions per language, optionally in parallel via joblib."""
    if n_jobs == 1:
        return {
            lang: metrics(ly_true[lang], ly_pred[lang], clf_type)
            for lang in ly_true.keys()
        }
    else:
        langs = list(ly_true.keys())
        evals = Parallel(n_jobs=n_jobs)(
            delayed(metrics)(ly_true[lang], ly_pred[lang], clf_type) for lang in langs
        )
        return dict(zip(langs, evals))


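# The per-language tuples produced by evaluation_metrics (and unpacked by
# log_eval below) are ordered as:
#   multilabel:  (macro-F1, micro-F1, macro-K, micro-K)
#   singlelabel: (accuracy, macro-F1, micro-F1)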
def log_eval(l_eval, phase="training", clf_type="multilabel", verbose=True):
    """Print per-language results for `phase` and return the averaged metrics."""
    if verbose:
        print(f"\n[Results {phase}]")
    metrics = []
    if clf_type == "multilabel":
        for lang in l_eval.keys():
            macrof1, microf1, macrok, microk = l_eval[lang]
            metrics.append([macrof1, microf1, macrok, microk])
            if phase != "validation":
                print(f"Lang {lang}: macro-F1 = {macrof1:.3f} micro-F1 = {microf1:.3f}")
        averages = np.mean(np.array(metrics), axis=0)
        if verbose:
            print(
                "Averages: MF1, mF1, MK, mK",
                np.round(averages, 3),
                "\n",
            )
        return averages  # TODO: also return a dict with averaged and per-language scores
    elif clf_type == "singlelabel":
        lang_metrics = defaultdict(dict)
        _metrics = [
            "accuracy",
            # "acc5",   # accuracy-at-5, pending logits (see TODO in evaluation_metrics)
            # "acc10",  # accuracy-at-10, pending logits
            "MF1",  # macro-F1
            "mF1",  # micro-F1
        ]
        for lang in l_eval.keys():
            acc, macrof1, microf1 = l_eval[lang]
            metrics.append([acc, macrof1, microf1])
            for m, v in zip(_metrics, l_eval[lang]):
                lang_metrics[m][lang] = v
            if phase != "validation":
                print(
                    f"Lang {lang}: acc = {acc:.3f} macro-F1 = {macrof1:.3f} micro-F1 = {microf1:.3f}"
                )
        averages = np.mean(np.array(metrics), axis=0)
        if verbose:
            print(
                "Averages: Acc, MF1, mF1",
                np.round(averages, 3),
                "\n",
            )
        avg_metrics = dict(zip(_metrics, averages))
        return avg_metrics, lang_metrics


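if __name__ == "__main__":
    # Minimal usage sketch (added for illustration, not part of the original
    # module): scores synthetic single-label predictions for two hypothetical
    # languages to show the expected {lang: labels} input format. Assumes
    # `evaluation.metrics` is importable from the repo root.
    rng = np.random.default_rng(0)
    ly_true = {lang: rng.integers(0, 4, size=50) for lang in ("en", "it")}
    ly_pred = {lang: rng.integers(0, 4, size=50) for lang in ("en", "it")}

    l_eval = evaluate(ly_true, ly_pred, n_jobs=1, clf_type="singlelabel")
    avg_metrics, lang_metrics = log_eval(l_eval, phase="test", clf_type="singlelabel")
    print("Averaged metrics:", avg_metrics)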