gfun_multimodal/evaluation/evaluate.py

47 lines
1.4 KiB
Python

from joblib import Parallel, delayed
from evaluation.metrics import *
def evaluation_metrics(y, y_):
if len(y.shape) == len(y_.shape) == 1 and len(np.unique(y)) > 2: # single-label
raise NotImplementedError()
else:
return (
macroF1(y, y_),
microF1(y, y_),
macroK(y, y_),
microK(y, y_),
# macroAcc(y, y_),
)
def evaluate(ly_true, ly_pred, metrics=evaluation_metrics, n_jobs=-1):
if n_jobs == 1:
return {lang: metrics(ly_true[lang], ly_pred[lang]) for lang in ly_true.keys()}
else:
langs = list(ly_true.keys())
evals = Parallel(n_jobs=n_jobs)(
delayed(metrics)(ly_true[lang], ly_pred[lang]) for lang in langs
)
return {lang: evals[i] for i, lang in enumerate(langs)}
def log_eval(l_eval, phase="training", verbose=True):
if verbose:
print(f"\n[Results {phase}]")
metrics = []
for lang in l_eval.keys():
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
if phase != "validation":
print(f"Lang {lang}: macro-F1 = {macrof1:.3f} micro-F1 = {microf1:.3f}")
averages = np.mean(np.array(metrics), axis=0)
if verbose:
print(
"Averages: MF1, mF1, MK, mK",
np.round(averages, 3),
"\n",
)
return averages