From 73d1e70ae9507be1995f23c3743a137aba56a892 Mon Sep 17 00:00:00 2001 From: Alex Moreo Date: Thu, 16 Jan 2020 17:28:54 +0100 Subject: [PATCH] refactor --- src/{learning => embeddings}/embeddings.py | 2 +- src/{learning => embeddings}/supervised.py | 0 src/learning/learners.py | 8 +- src/{NN_FPEC_andrea.py => main_deep.py} | 0 src/{MLE_andrea.py => main_embeddings_cls.py} | 0 ...{FPEC_andrea.py => main_multimodal_cls.py} | 14 +-- src/models/{cnn_class.py => cnn_class_bu.py} | 0 src/util/file.py | 3 + src/util/metrics.py | 88 +++++++++++++++++++ 9 files changed, 103 insertions(+), 12 deletions(-) rename src/{learning => embeddings}/embeddings.py (99%) rename src/{learning => embeddings}/supervised.py (100%) rename src/{NN_FPEC_andrea.py => main_deep.py} (100%) rename src/{MLE_andrea.py => main_embeddings_cls.py} (100%) rename src/{FPEC_andrea.py => main_multimodal_cls.py} (91%) rename src/models/{cnn_class.py => cnn_class_bu.py} (100%) diff --git a/src/learning/embeddings.py b/src/embeddings/embeddings.py similarity index 99% rename from src/learning/embeddings.py rename to src/embeddings/embeddings.py index 65a5338..0ca51fc 100644 --- a/src/learning/embeddings.py +++ b/src/embeddings/embeddings.py @@ -3,7 +3,7 @@ import pickle from torchtext.vocab import Vectors import torch from abc import ABC, abstractmethod -from learning.supervised import get_supervised_embeddings +from embeddings.supervised import get_supervised_embeddings from util.decompositions import * from util.SIF_embed import * diff --git a/src/learning/supervised.py b/src/embeddings/supervised.py similarity index 100% rename from src/learning/supervised.py rename to src/embeddings/supervised.py diff --git a/src/learning/learners.py b/src/learning/learners.py index a678905..2a42666 100644 --- a/src/learning/learners.py +++ b/src/learning/learners.py @@ -1,6 +1,6 @@ import numpy as np import time -from learning.embeddings import WordEmbeddings, StorageEmbeddings +from embeddings.embeddings import 
WordEmbeddings, StorageEmbeddings from scipy.sparse import issparse from sklearn.multiclass import OneVsRestClassifier from sklearn.model_selection import GridSearchCV @@ -9,7 +9,7 @@ from joblib import Parallel, delayed from sklearn.feature_extraction.text import TfidfVectorizer from transformers.StandardizeTransformer import StandardizeTransformer from sklearn.decomposition import PCA -from models.cnn_class import CNN_pdr +from models.cnn_class_bu import CNN_pdr def _sort_if_sparse(X): @@ -325,7 +325,7 @@ class MonolingualClassifier: return self.best_params_ -class AndreaCLF(FunnellingPolylingualClassifier): +class FunnellingMultimodal(FunnellingPolylingualClassifier): def __init__(self, we_path, config, @@ -627,7 +627,7 @@ class MonolingualNetSvm: :param word_index: :return: filtered embedding matrix """ - from learning.embeddings import EmbeddingsAligned + from embeddings.embeddings import EmbeddingsAligned type = 'MUSE' path = '/home/andreapdr/CLESA/' MUSE = EmbeddingsAligned(type, path, lang, word_index.keys()) diff --git a/src/NN_FPEC_andrea.py b/src/main_deep.py similarity index 100% rename from src/NN_FPEC_andrea.py rename to src/main_deep.py diff --git a/src/MLE_andrea.py b/src/main_embeddings_cls.py similarity index 100% rename from src/MLE_andrea.py rename to src/main_embeddings_cls.py diff --git a/src/FPEC_andrea.py b/src/main_multimodal_cls.py similarity index 91% rename from src/FPEC_andrea.py rename to src/main_multimodal_cls.py index 3c351b6..ee3a5f6 100644 --- a/src/FPEC_andrea.py +++ b/src/main_multimodal_cls.py @@ -126,13 +126,13 @@ if __name__ == '__main__': result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '') print(f'### PolyEmbedd_andrea_{_config_id}\n') - classifier = AndreaCLF(we_path=op.we_path, - config=config, - first_tier_learner=get_learner(calibrate=True), - meta_learner=get_learner(calibrate=False, kernel='rbf'), - first_tier_parameters=None, # TODO get_params(dense=False),--> first_tier 
should not be optimized - or not? - meta_parameters=get_params(dense=True), - n_jobs=op.n_jobs) + classifier = FunnellingMultimodal(we_path=op.we_path, + config=config, + first_tier_learner=get_learner(calibrate=True), + meta_learner=get_learner(calibrate=False, kernel='rbf'), + first_tier_parameters=None, # TODO get_params(dense=False),--> first_tier should not be optimized - or not? + meta_parameters=get_params(dense=True), + n_jobs=op.n_jobs) print('# Fitting ...') classifier.fit(lXtr, lytr) diff --git a/src/models/cnn_class.py b/src/models/cnn_class_bu.py similarity index 100% rename from src/models/cnn_class.py rename to src/models/cnn_class_bu.py diff --git a/src/util/file.py b/src/util/file.py index de6c4d0..511fccf 100644 --- a/src/util/file.py +++ b/src/util/file.py @@ -33,4 +33,7 @@ def list_files(dir): def makedirs_if_not_exist(path): if not exists(path): makedirs(path) +def create_if_not_exist(path): + if not exists(path): makedirs(path) + diff --git a/src/util/metrics.py b/src/util/metrics.py index 3efef4f..9f6bc24 100644 --- a/src/util/metrics.py +++ b/src/util/metrics.py @@ -1,4 +1,8 @@ import numpy as np +import numpy as np +from scipy.sparse import lil_matrix, issparse +from sklearn.metrics import f1_score, accuracy_score + """ @@ -166,3 +170,87 @@ def smoothmacroK(true_labels, posterior_probabilities): def smoothmicroK(true_labels, posterior_probabilities): return micro_average(true_labels, posterior_probabilities, K, metric_statistics=soft_single_metric_statistics) + + + +""" +Scikit learn provides a full set of evaluation metrics, but they treat special cases differently. +I.e., when the number of true positives, false positives, and false negatives amount to 0, all +affected metrics (precision, recall, and thus f1) output 0 in Scikit learn. +We adhere to the common practice of outputting 1 in this case since the classifier has correctly +classified all examples as negatives. 
+""" + +def evaluation(y_true, y_pred, classification_type): + + if classification_type == 'multilabel': + eval_function = multilabel_eval + elif classification_type == 'singlelabel': + eval_function = singlelabel_eval + + Mf1, mf1, accuracy = eval_function(y_true, y_pred) + + return Mf1, mf1, accuracy + + +def multilabel_eval(y, y_): + + tp = y.multiply(y_) + + fn = lil_matrix(y.shape) + true_ones = y==1 + fn[true_ones]=1-tp[true_ones] + + fp = lil_matrix(y.shape) + pred_ones = y_==1 + if pred_ones.nnz>0: + fp[pred_ones]=1-tp[pred_ones] + + #macro-f1 + tp_macro = np.asarray(tp.sum(axis=0), dtype=int).flatten() + fn_macro = np.asarray(fn.sum(axis=0), dtype=int).flatten() + fp_macro = np.asarray(fp.sum(axis=0), dtype=int).flatten() + + pos_pred = tp_macro+fp_macro + pos_true = tp_macro+fn_macro + prec=np.zeros(shape=tp_macro.shape,dtype=float) + rec=np.zeros(shape=tp_macro.shape,dtype=float) + np.divide(tp_macro, pos_pred, out=prec, where=pos_pred>0) + np.divide(tp_macro, pos_true, out=rec, where=pos_true>0) + den=prec+rec + + macrof1=np.zeros(shape=tp_macro.shape,dtype=float) + np.divide(np.multiply(prec,rec),den,out=macrof1,where=den>0) + macrof1 *=2 + + macrof1[(pos_pred==0)*(pos_true==0)]=1 + macrof1 = np.mean(macrof1) + + #micro-f1 + tp_micro = tp_macro.sum() + fn_micro = fn_macro.sum() + fp_micro = fp_macro.sum() + pos_pred = tp_micro + fp_micro + pos_true = tp_micro + fn_micro + prec = (tp_micro / pos_pred) if pos_pred>0 else 0 + rec = (tp_micro / pos_true) if pos_true>0 else 0 + den = prec+rec + microf1 = 2*prec*rec/den if den>0 else 0 + if pos_pred==pos_true==0: + microf1=1 + + #accuracy + ndecisions = np.multiply(*y.shape) + tn = ndecisions - (tp_micro+fn_micro+fp_micro) + acc = (tp_micro+tn)/ndecisions + + return macrof1,microf1,acc + + +def singlelabel_eval(y, y_): + if issparse(y_): y_ = y_.toarray().flatten() + macrof1 = f1_score(y, y_, average='macro') + microf1 = f1_score(y, y_, average='micro') + acc = accuracy_score(y, y_) + return 
macrof1,microf1,acc +