diff --git a/evaluation/evaluate.py b/evaluation/evaluate.py
index 10c2333..dc21e1b 100644
--- a/evaluation/evaluate.py
+++ b/evaluation/evaluate.py
@@ -13,6 +13,9 @@ def evaluation_metrics(y, y_):
         macroK(y, y_),
         microK(y, y_),
         # macroAcc(y, y_),
+        microAcc(
+            y, y_
+        ),  # TODO: we're using micro-averaging for accuracy; it is == to accuracy_score on binary classification
     )
 
 
@@ -32,10 +35,12 @@ def log_eval(l_eval, phase="training", verbose=True):
     print(f"\n[Results {phase}]")
     metrics = []
     for lang in l_eval.keys():
-        macrof1, microf1, macrok, microk = l_eval[lang]
-        metrics.append([macrof1, microf1, macrok, microk])
+        macrof1, microf1, macrok, microk, microAcc = l_eval[lang]
+        metrics.append([macrof1, microf1, macrok, microk, microAcc])
         if phase != "validation":
-            print(f"Lang {lang}: macro-F1 = {macrof1:.3f} micro-F1 = {microf1:.3f}")
+            print(
+                f"Lang {lang}: macro-F1 = {macrof1:.3f} micro-F1 = {microf1:.3f} acc = {microAcc:.3f}"
+            )
     averages = np.mean(np.array(metrics), axis=0)
     if verbose:
         print(
diff --git a/evaluation/metrics.py b/evaluation/metrics.py
index 80cee37..6db4800 100644
--- a/evaluation/metrics.py
+++ b/evaluation/metrics.py
@@ -239,3 +239,7 @@ def microK(true_labels, predicted_labels):
 
 def macroAcc(true_labels, predicted_labels):
     return macro_average(true_labels, predicted_labels, accuracy)
+
+
+def microAcc(true_labels, predicted_labels):
+    return micro_average(true_labels, predicted_labels, accuracy)
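
Note on the TODO above: for single-label binary classification, micro-averaged accuracy reduces to sklearn's accuracy_score, because pooling TP/TN/FP/FN over all decisions and then taking (TP + TN) / total is exactly the fraction of correct predictions. The minimal sketch below illustrates that claim; it assumes micro_average in evaluation/metrics.py pools the confusion counts before applying the metric, and micro_accuracy is a hypothetical stand-in for microAcc, not the repo's actual implementation.

import numpy as np
from sklearn.metrics import accuracy_score

def accuracy(tp, tn, fp, fn):
    # fraction of correct decisions over all decisions
    return (tp + tn) / (tp + tn + fp + fn)

def micro_accuracy(y_true, y_pred):
    # pool the confusion counts over every prediction, then apply the metric once
    tp = np.sum((y_true == 1) & (y_pred == 1))
    tn = np.sum((y_true == 0) & (y_pred == 0))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    return accuracy(tp, tn, fp, fn)

y_true = np.array([1, 0, 1, 1, 0])
y_pred = np.array([1, 0, 0, 1, 1])
print(micro_accuracy(y_true, y_pred), accuracy_score(y_true, y_pred))  # both 0.6
assert np.isclose(micro_accuracy(y_true, y_pred), accuracy_score(y_true, y_pred))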