diff --git a/evaluation/evaluate.py b/evaluation/evaluate.py
index 7ab0c2b..636148a 100644
--- a/evaluation/evaluate.py
+++ b/evaluation/evaluate.py
@@ -1,8 +1,9 @@
 from joblib import Parallel, delayed
 from collections import defaultdict
 
-from evaluation.metrics import *
-from sklearn.metrics import accuracy_score, top_k_accuracy_score, f1_score
+# from evaluation.metrics import *
+import numpy as np
+from sklearn.metrics import accuracy_score, top_k_accuracy_score, f1_score, precision_score, recall_score
 
 
 def evaluation_metrics(y, y_, clf_type):
@@ -13,13 +14,17 @@ def evaluation_metrics(y, y_, clf_type):
             # TODO: we need logits top_k_accuracy_score(y, y_, k=10),
             f1_score(y, y_, average="macro", zero_division=1),
             f1_score(y, y_, average="micro"),
+            precision_score(y, y_, zero_division=1, average="macro"),
+            recall_score(y, y_, zero_division=1, average="macro"),
         )
     elif clf_type == "multilabel":
         return (
-            macroF1(y, y_),
-            microF1(y, y_),
-            macroK(y, y_),
-            microK(y, y_),
+            f1_score(y, y_, average="macro", zero_division=1),
+            f1_score(y, y_, average="micro"),
+            0,
+            0,
+            # macroK(y, y_),
+            # microK(y, y_),
         )
     else:
         raise ValueError("clf_type must be either 'singlelabel' or 'multilabel'")
@@ -48,8 +53,10 @@ def log_eval(l_eval, phase="training", clf_type="multilabel", verbose=True):
 
     if clf_type == "multilabel":
         for lang in l_eval.keys():
-            macrof1, microf1, macrok, microk = l_eval[lang]
-            metrics.append([macrof1, microf1, macrok, microk])
+            # macrof1, microf1, macrok, microk = l_eval[lang]
+            # metrics.append([macrof1, microf1, macrok, microk])
+            macrof1, microf1, precision, recall = l_eval[lang]
+            metrics.append([macrof1, microf1, precision, recall])
             if phase != "validation":
                 print(f"Lang {lang}: macro-F1 = {macrof1:.3f} micro-F1 = {microf1:.3f}")
         averages = np.mean(np.array(metrics), axis=0)
@@ -69,12 +76,15 @@ def log_eval(l_eval, phase="training", clf_type="multilabel", verbose=True):
            # "acc10",  # "accuracy-at-10",
            "MF1",  # "macro-F1",
            "mF1",  # "micro-F1",
+           "precision",
+           "recall"
        ]
        for lang in l_eval.keys():
            # acc, top5, top10, macrof1, microf1 = l_eval[lang]
-           acc, macrof1, microf1 = l_eval[lang]
+           acc, macrof1, microf1, precision, recall = l_eval[lang]
            # metrics.append([acc, top5, top10, macrof1, microf1])
-           metrics.append([acc, macrof1, microf1])
+           # metrics.append([acc, macrof1, microf1])
+           metrics.append([acc, macrof1, microf1, precision, recall])
            for m, v in zip(_metrics, l_eval[lang]):
                lang_metrics[m][lang] = v
 
@@ -82,7 +92,8 @@ def log_eval(l_eval, phase="training", clf_type="multilabel", verbose=True):
            if phase != "validation":
                print(
                    # f"Lang {lang}: acc = {acc:.3f} acc-top5 = {top5:.3f} acc-top10 = {top10:.3f} macro-F1: {macrof1:.3f} micro-F1 = {microf1:.3f}"
-                   f"Lang {lang}: acc = {acc:.3f} macro-F1: {macrof1:.3f} micro-F1 = {microf1:.3f}"
+                   # f"Lang {lang}: acc = {acc:.3f} macro-F1: {macrof1:.3f} micro-F1 = {microf1:.3f}"
+                   f"Lang {lang}: acc = {acc:.3f} macro-F1: {macrof1:.3f} micro-F1 = {microf1:.3f} pr = {precision:.3f} re = {recall:.3f}"
                )
        averages = np.mean(np.array(metrics), axis=0)
        if verbose:
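
Usage note (illustration only, not part of the patch): after this change the singlelabel branch of evaluation_metrics returns a 5-tuple (accuracy, macro-F1, micro-F1, macro-precision, macro-recall), and the multilabel branch returns a 4-tuple whose last two slots are 0 placeholders where macroK/microK used to be. The sketch below recomputes the singlelabel tuple with standalone sklearn calls on made-up labels, just to show the shape log_eval now unpacks per language.

# Sketch only: reproduces the new singlelabel return tuple outside the repo.
# The sample labels are invented for illustration; top-k accuracy stays out
# until logits/probabilities are available (see the TODO in the patched file).
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

y_true = np.array([0, 1, 2, 1, 0])
y_pred = np.array([0, 2, 2, 1, 0])

result = (
    accuracy_score(y_true, y_pred),
    f1_score(y_true, y_pred, average="macro", zero_division=1),
    f1_score(y_true, y_pred, average="micro"),
    precision_score(y_true, y_pred, zero_division=1, average="macro"),
    recall_score(y_true, y_pred, zero_division=1, average="macro"),
)

# log_eval now expects exactly this 5-tuple per language:
acc, macrof1, microf1, precision, recall = result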