Alejandro Moreo Fernandez 2020-01-16 17:28:54 +01:00
parent cfd3a609a2
commit 73d1e70ae9
9 changed files with 103 additions and 12 deletions

View File

@@ -3,7 +3,7 @@ import pickle
from torchtext.vocab import Vectors
import torch
from abc import ABC, abstractmethod
-from learning.supervised import get_supervised_embeddings
+from embeddings.supervised import get_supervised_embeddings
from util.decompositions import *
from util.SIF_embed import *

View File

@@ -1,6 +1,6 @@
import numpy as np
import time
-from learning.embeddings import WordEmbeddings, StorageEmbeddings
+from embeddings.embeddings import WordEmbeddings, StorageEmbeddings
from scipy.sparse import issparse
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
@@ -9,7 +9,7 @@ from joblib import Parallel, delayed
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers.StandardizeTransformer import StandardizeTransformer
from sklearn.decomposition import PCA
-from models.cnn_class import CNN_pdr
+from models.cnn_class_bu import CNN_pdr

def _sort_if_sparse(X):
@@ -325,7 +325,7 @@ class MonolingualClassifier:
        return self.best_params_

-class AndreaCLF(FunnellingPolylingualClassifier):
+class FunnellingMultimodal(FunnellingPolylingualClassifier):
    def __init__(self,
                 we_path,
                 config,
@@ -627,7 +627,7 @@ class MonolingualNetSvm:
        :param word_index:
        :return: filtered embedding matrix
        """
-        from learning.embeddings import EmbeddingsAligned
+        from embeddings.embeddings import EmbeddingsAligned
        type = 'MUSE'
        path = '/home/andreapdr/CLESA/'
        MUSE = EmbeddingsAligned(type, path, lang, word_index.keys())
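
For intuition, the filtering step above keeps only the rows of the pretrained matrix whose words occur in the dataset vocabulary, so row i of the result corresponds to word index i. A minimal sketch under assumed inputs (the function name and the dict-based vector store are hypothetical, not the EmbeddingsAligned API):

import numpy as np

def filter_embedding_matrix(vectors, word_index, dim=300):
    # vectors: dict mapping word -> np.ndarray of shape (dim,)
    # word_index: dict mapping word -> row index in the output matrix
    matrix = np.zeros((len(word_index), dim))
    for word, idx in word_index.items():
        if word in vectors:
            matrix[idx] = vectors[word]  # out-of-vocabulary rows remain all-zero
    return matrix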

View File

@@ -126,13 +126,13 @@ if __name__ == '__main__':
    result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')
    print(f'### PolyEmbedd_andrea_{_config_id}\n')

-    classifier = AndreaCLF(we_path=op.we_path,
-                           config=config,
-                           first_tier_learner=get_learner(calibrate=True),
-                           meta_learner=get_learner(calibrate=False, kernel='rbf'),
-                           first_tier_parameters=None,  # TODO get_params(dense=False) --> first_tier should not be optimized - or not?
-                           meta_parameters=get_params(dense=True),
-                           n_jobs=op.n_jobs)
+    classifier = FunnellingMultimodal(we_path=op.we_path,
+                                      config=config,
+                                      first_tier_learner=get_learner(calibrate=True),
+                                      meta_learner=get_learner(calibrate=False, kernel='rbf'),
+                                      first_tier_parameters=None,  # TODO get_params(dense=False) --> first_tier should not be optimized - or not?
+                                      meta_parameters=get_params(dense=True),
+                                      n_jobs=op.n_jobs)

    print('# Fitting ...')
    classifier.fit(lXtr, lytr)

View File

@@ -33,4 +33,7 @@ def list_files(dir):
def makedirs_if_not_exist(path):
    if not exists(path): makedirs(path)

def create_if_not_exist(path):
    if not exists(path): makedirs(path)

View File

@@ -1,4 +1,8 @@
import numpy as np
from scipy.sparse import lil_matrix, issparse
from sklearn.metrics import f1_score, accuracy_score
"""
@@ -166,3 +170,87 @@ def smoothmacroK(true_labels, posterior_probabilities):
def smoothmicroK(true_labels, posterior_probabilities):
    return micro_average(true_labels, posterior_probabilities, K, metric_statistics=soft_single_metric_statistics)
"""
"""
scikit-learn provides a full set of evaluation metrics, but it treats special cases differently.
In particular, when the numbers of true positives, false positives, and false negatives all
amount to 0, every affected metric (precision, recall, and thus F1) evaluates to 0 in
scikit-learn. We adhere instead to the common practice of outputting 1 in this case, since the
classifier has correctly classified all examples as negatives.
"""
def evaluation(y_true, y_pred, classification_type):
    if classification_type == 'multilabel':
        eval_function = multilabel_eval
    elif classification_type == 'singlelabel':
        eval_function = singlelabel_eval
    else:
        raise ValueError(f'unexpected classification_type: {classification_type}')

    Mf1, mf1, accuracy = eval_function(y_true, y_pred)
    return Mf1, mf1, accuracy
def multilabel_eval(y, y_):
    # true positives: element-wise product of the two binary (sparse) matrices
    tp = y.multiply(y_)

    # false negatives: true entries not recovered by the prediction
    fn = lil_matrix(y.shape)
    true_ones = y == 1
    fn[true_ones] = 1 - tp[true_ones]

    # false positives: predicted entries that are not true
    fp = lil_matrix(y.shape)
    pred_ones = y_ == 1
    if pred_ones.nnz > 0:
        fp[pred_ones] = 1 - tp[pred_ones]

    # macro-F1: per-class precision and recall, F1 averaged across classes
    tp_macro = np.asarray(tp.sum(axis=0), dtype=int).flatten()
    fn_macro = np.asarray(fn.sum(axis=0), dtype=int).flatten()
    fp_macro = np.asarray(fp.sum(axis=0), dtype=int).flatten()

    pos_pred = tp_macro + fp_macro
    pos_true = tp_macro + fn_macro
    prec = np.zeros(shape=tp_macro.shape, dtype=float)
    rec = np.zeros(shape=tp_macro.shape, dtype=float)
    np.divide(tp_macro, pos_pred, out=prec, where=pos_pred > 0)
    np.divide(tp_macro, pos_true, out=rec, where=pos_true > 0)
    den = prec + rec

    macrof1 = np.zeros(shape=tp_macro.shape, dtype=float)
    np.divide(np.multiply(prec, rec), den, out=macrof1, where=den > 0)
    macrof1 *= 2

    # special case: a class with no true and no predicted positives counts as a perfect score
    macrof1[(pos_pred == 0) * (pos_true == 0)] = 1
    macrof1 = np.mean(macrof1)

    # micro-F1: counts pooled across all classes before computing precision and recall
    tp_micro = tp_macro.sum()
    fn_micro = fn_macro.sum()
    fp_micro = fp_macro.sum()
    pos_pred = tp_micro + fp_micro
    pos_true = tp_micro + fn_micro
    prec = (tp_micro / pos_pred) if pos_pred > 0 else 0
    rec = (tp_micro / pos_true) if pos_true > 0 else 0
    den = prec + rec
    microf1 = 2 * prec * rec / den if den > 0 else 0
    if pos_pred == pos_true == 0:
        microf1 = 1

    # accuracy: fraction of correct (document, class) decisions
    ndecisions = np.multiply(*y.shape)
    tn = ndecisions - (tp_micro + fn_micro + fp_micro)
    acc = (tp_micro + tn) / ndecisions

    return macrof1, microf1, acc
def singlelabel_eval(y, y_):
    if issparse(y_):
        y_ = y_.toarray().flatten()
    macrof1 = f1_score(y, y_, average='macro')
    microf1 = f1_score(y, y_, average='micro')
    acc = accuracy_score(y, y_)
    return macrof1, microf1, acc
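
As a quick sanity check, the added functions can be exercised on made-up data; a minimal sketch (the matrices below are hypothetical, and it assumes the functions above are importable as a module):

import numpy as np
from scipy.sparse import csr_matrix

# hypothetical 3-document, 2-class multilabel example
y_true = csr_matrix(np.array([[1, 0], [0, 0], [1, 1]]))
y_pred = csr_matrix(np.array([[1, 0], [0, 1], [1, 0]]))

Mf1, mf1, acc = evaluation(y_true, y_pred, classification_type='multilabel')
print(f'macro-F1={Mf1:.3f}  micro-F1={mf1:.3f}  acc={acc:.3f}')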