from collections import defaultdict

import numpy as np
from joblib import Parallel, delayed
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)


def evaluation_metrics(y, y_, clf_type):
    """Compute a tuple of metrics for one language from true labels y and predictions y_."""
    if clf_type == "singlelabel":
        # NOTE: top-k accuracy (k=5, k=10) is intentionally omitted:
        # top_k_accuracy_score needs class scores/logits, not hard predictions.
        return (
            accuracy_score(y, y_),
            f1_score(y, y_, average="macro", zero_division=1),
            f1_score(y, y_, average="micro"),
            precision_score(y, y_, average="macro", zero_division=1),
            recall_score(y, y_, average="macro", zero_division=1),
        )
    elif clf_type == "multilabel":
        # TODO: add macro-K / micro-K if the evaluation.metrics helpers become available.
        return (
            f1_score(y, y_, average="macro", zero_division=1),
            f1_score(y, y_, average="micro"),
            precision_score(y, y_, average="macro", zero_division=1),
            recall_score(y, y_, average="macro", zero_division=1),
        )
    else:
        raise ValueError("clf_type must be either 'singlelabel' or 'multilabel'")


def evaluate(
    ly_true, ly_pred, metrics=evaluation_metrics, n_jobs=-1, clf_type="multilabel"
):
    """Evaluate predictions per language.

    ly_true and ly_pred are dicts mapping a language code to its label arrays.
    Returns a dict mapping each language to the tuple produced by `metrics`.
    """
    langs = list(ly_true.keys())
    if n_jobs == 1:
        return {lang: metrics(ly_true[lang], ly_pred[lang], clf_type) for lang in langs}
    # Evaluate languages in parallel, then re-associate each result with its language.
    evals = Parallel(n_jobs=n_jobs)(
        delayed(metrics)(ly_true[lang], ly_pred[lang], clf_type) for lang in langs
    )
    return dict(zip(langs, evals))


def log_eval(l_eval, phase="training", clf_type="multilabel", verbose=True):
    """Print per-language and averaged metrics, and return the averages."""
    if verbose:
        print(f"\n[Results {phase}]")
    metrics = []
    if clf_type == "multilabel":
        for lang in sorted(l_eval.keys()):
            macrof1, microf1, precision, recall = l_eval[lang]
            metrics.append([macrof1, microf1, precision, recall])
            if verbose and phase != "validation":
                print(f"Lang {lang}: macro-F1 = {macrof1:.3f} micro-F1 = {microf1:.3f}")
        averages = np.mean(np.array(metrics), axis=0)
        if verbose:
            print("Averages: MF1, mF1, MP, MR", np.round(averages, 3), "\n")
        return averages  # TODO: return a dict of averages plus language-specific metrics
    elif clf_type == "singlelabel":
        lang_metrics = defaultdict(dict)
        _metrics = ["accuracy", "MF1", "mF1", "precision", "recall"]
        for lang in sorted(l_eval.keys()):
            acc, macrof1, microf1, precision, recall = l_eval[lang]
            metrics.append([acc, macrof1, microf1, precision, recall])
            for m, v in zip(_metrics, l_eval[lang]):
                lang_metrics[m][lang] = v
            if verbose and phase != "validation":
                print(
                    f"Lang {lang}: acc = {acc:.3f} macro-F1 = {macrof1:.3f} "
                    f"micro-F1 = {microf1:.3f} pr = {precision:.3f} re = {recall:.3f}"
                )
        averages = np.mean(np.array(metrics), axis=0)
        if verbose:
            print("Averages: Acc, MF1, mF1, precision, recall", np.round(averages, 3), "\n")
        avg_metrics = dict(zip(_metrics, averages))
        return avg_metrics, lang_metrics
    else:
        raise ValueError("clf_type must be either 'singlelabel' or 'multilabel'")
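

# ---------------------------------------------------------------------------
# Minimal usage sketch (hypothetical): the language codes and synthetic label
# arrays below are made up for illustration only; real callers pass their own
# per-language ground-truth and prediction arrays.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    # Synthetic single-label data: 3 classes, two "languages".
    ly_true = {lang: rng.integers(0, 3, size=50) for lang in ("en", "da")}
    ly_pred = {lang: rng.integers(0, 3, size=50) for lang in ("en", "da")}

    l_eval = evaluate(ly_true, ly_pred, n_jobs=1, clf_type="singlelabel")
    avg_metrics, lang_metrics = log_eval(l_eval, phase="test", clf_type="singlelabel")
    print("averaged:", avg_metrics)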