refactor

commit 73d1e70ae9 (parent cfd3a609a2)
@@ -3,7 +3,7 @@ import pickle
 from torchtext.vocab import Vectors
 import torch
 from abc import ABC, abstractmethod
-from learning.supervised import get_supervised_embeddings
+from embeddings.supervised import get_supervised_embeddings
 from util.decompositions import *
 from util.SIF_embed import *
@@ -1,6 +1,6 @@
 import numpy as np
 import time
-from learning.embeddings import WordEmbeddings, StorageEmbeddings
+from embeddings.embeddings import WordEmbeddings, StorageEmbeddings
 from scipy.sparse import issparse
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.model_selection import GridSearchCV
@@ -9,7 +9,7 @@ from joblib import Parallel, delayed
 from sklearn.feature_extraction.text import TfidfVectorizer
 from transformers.StandardizeTransformer import StandardizeTransformer
 from sklearn.decomposition import PCA
-from models.cnn_class import CNN_pdr
+from models.cnn_class_bu import CNN_pdr


 def _sort_if_sparse(X):
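Note: `_sort_if_sparse` appears above only as an unchanged context line. For readers following along, a minimal sketch of what a helper with this name typically does (an assumption, since its body is not part of this diff, and it presumes CSR/CSC input):

from scipy.sparse import issparse

def _sort_if_sparse(X):
    # CSR/CSC matrices can end up with unsorted column indices after
    # slicing or stacking; sorting them in place keeps downstream
    # estimators (and equality checks) well behaved.
    if issparse(X) and not X.has_sorted_indices:
        X.sort_indices()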
@@ -325,7 +325,7 @@ class MonolingualClassifier:
         return self.best_params_


-class AndreaCLF(FunnellingPolylingualClassifier):
+class FunnellingMultimodal(FunnellingPolylingualClassifier):
     def __init__(self,
                  we_path,
                  config,
@@ -627,7 +627,7 @@ class MonolingualNetSvm:
         :param word_index:
         :return: filtered embedding matrix
         """
-        from learning.embeddings import EmbeddingsAligned
+        from embeddings.embeddings import EmbeddingsAligned
         type = 'MUSE'
         path = '/home/andreapdr/CLESA/'
         MUSE = EmbeddingsAligned(type, path, lang, word_index.keys())
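For reference, the `EmbeddingsAligned(type, path, lang, word_index.keys())` call loads MUSE-aligned vectors for `lang` and keeps only the rows matching the dataset vocabulary (the "filtered embedding matrix" of the docstring). A minimal sketch of that filtering step, with hypothetical names (`pretrained` standing in for a word-to-vector mapping) since the class body lies outside this diff:

import numpy as np

def filter_embedding_matrix(pretrained, word_index, dim=300):
    # One row per vocabulary word; out-of-vocabulary words keep a
    # zero vector so the matrix stays aligned with word_index.
    E = np.zeros((len(word_index), dim))
    for word, idx in word_index.items():
        if word in pretrained:
            E[idx] = pretrained[word]
    return E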
@@ -126,7 +126,7 @@ if __name__ == '__main__':
     result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')

     print(f'### PolyEmbedd_andrea_{_config_id}\n')
-    classifier = AndreaCLF(we_path=op.we_path,
+    classifier = FunnellingMultimodal(we_path=op.we_path,
                            config=config,
                            first_tier_learner=get_learner(calibrate=True),
                            meta_learner=get_learner(calibrate=False, kernel='rbf'),
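`get_learner` is not shown in this diff; it is called once with `calibrate=True` for the first-tier (per-language) classifiers, whose posterior probabilities feed the meta-learner, and once with `kernel='rbf'` for the meta-level SVM. A plausible sketch, assuming it wraps scikit-learn's SVC (names and defaults are guesses, not the repository's actual code):

from sklearn.svm import SVC

def get_learner(calibrate=False, kernel='linear', C=1):
    # probability=True adds an internal cross-validated calibration
    # step so predict_proba returns usable posterior probabilities,
    # which the funnelling meta-learner consumes as features.
    return SVC(kernel=kernel, probability=calibrate, C=C)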
@@ -33,4 +33,7 @@ def list_files(dir):
 def makedirs_if_not_exist(path):
     if not exists(path): makedirs(path)

+def create_if_not_exist(path):
+    if not exists(path): makedirs(path)
+
@@ -1,4 +1,8 @@
 import numpy as np
+import numpy as np
+from scipy.sparse import lil_matrix, issparse
+from sklearn.metrics import f1_score, accuracy_score
+


 """
@@ -166,3 +170,87 @@ def smoothmacroK(true_labels, posterior_probabilities):
 def smoothmicroK(true_labels, posterior_probabilities):
     return micro_average(true_labels, posterior_probabilities, K, metric_statistics=soft_single_metric_statistics)

+
+"""
+Scikit learn provides a full set of evaluation metrics, but they treat special cases differently.
+I.e., when the number of true positives, false positives, and false negatives amount to 0, all
+affected metrics (precision, recall, and thus f1) output 0 in Scikit learn.
+We adhere to the common practice of outputting 1 in this case since the classifier has correctly
+classified all examples as negatives.
+"""
+
+
+def evaluation(y_true, y_pred, classification_type):
+    if classification_type == 'multilabel':
+        eval_function = multilabel_eval
+    elif classification_type == 'singlelabel':
+        eval_function = singlelabel_eval
+
+    Mf1, mf1, accuracy = eval_function(y_true, y_pred)
+
+    return Mf1, mf1, accuracy
+
+
+def multilabel_eval(y, y_):
+    tp = y.multiply(y_)
+
+    fn = lil_matrix(y.shape)
+    true_ones = y == 1
+    fn[true_ones] = 1 - tp[true_ones]
+
+    fp = lil_matrix(y.shape)
+    pred_ones = y_ == 1
+    if pred_ones.nnz > 0:
+        fp[pred_ones] = 1 - tp[pred_ones]
+
+    # macro-f1
+    tp_macro = np.asarray(tp.sum(axis=0), dtype=int).flatten()
+    fn_macro = np.asarray(fn.sum(axis=0), dtype=int).flatten()
+    fp_macro = np.asarray(fp.sum(axis=0), dtype=int).flatten()
+
+    pos_pred = tp_macro + fp_macro
+    pos_true = tp_macro + fn_macro
+    prec = np.zeros(shape=tp_macro.shape, dtype=float)
+    rec = np.zeros(shape=tp_macro.shape, dtype=float)
+    np.divide(tp_macro, pos_pred, out=prec, where=pos_pred > 0)
+    np.divide(tp_macro, pos_true, out=rec, where=pos_true > 0)
+    den = prec + rec
+
+    macrof1 = np.zeros(shape=tp_macro.shape, dtype=float)
+    np.divide(np.multiply(prec, rec), den, out=macrof1, where=den > 0)
+    macrof1 *= 2
+
+    macrof1[(pos_pred == 0) * (pos_true == 0)] = 1
+    macrof1 = np.mean(macrof1)
+
+    # micro-f1
+    tp_micro = tp_macro.sum()
+    fn_micro = fn_macro.sum()
+    fp_micro = fp_macro.sum()
+    pos_pred = tp_micro + fp_micro
+    pos_true = tp_micro + fn_micro
+    prec = (tp_micro / pos_pred) if pos_pred > 0 else 0
+    rec = (tp_micro / pos_true) if pos_true > 0 else 0
+    den = prec + rec
+    microf1 = 2 * prec * rec / den if den > 0 else 0
+    if pos_pred == pos_true == 0:
+        microf1 = 1
+
+    # accuracy
+    ndecisions = np.multiply(*y.shape)
+    tn = ndecisions - (tp_micro + fn_micro + fp_micro)
+    acc = (tp_micro + tn) / ndecisions
+
+    return macrof1, microf1, acc
+
+
+def singlelabel_eval(y, y_):
+    if issparse(y_): y_ = y_.toarray().flatten()
+    macrof1 = f1_score(y, y_, average='macro')
+    microf1 = f1_score(y, y_, average='micro')
+    acc = accuracy_score(y, y_)
+    return macrof1, microf1, acc
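A quick usage sketch of the new `evaluation` entry point on toy multilabel data (hypothetical matrices), illustrating the special case the docstring describes: a class with zero true positives, false positives, and false negatives scores 1 rather than 0:

import numpy as np
from scipy.sparse import csr_matrix

y_true = csr_matrix(np.array([[1, 1, 0],
                              [1, 0, 0]]))  # class 2 never occurs
y_pred = csr_matrix(np.array([[1, 1, 0],
                              [0, 0, 0]]))  # nor is it ever predicted

Mf1, mf1, acc = evaluation(y_true, y_pred, classification_type='multilabel')
# Per-class F1: class 0 -> 2/3, class 1 -> 1, class 2 -> 1 (vacuously
# correct), so Mf1 = (2/3 + 1 + 1) / 3 ~ 0.89, whereas a zero-division
# convention of 0 would report (2/3 + 1 + 0) / 3 ~ 0.56 instead.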