Alejandro Moreo Fernandez 2020-01-16 17:28:54 +01:00
parent cfd3a609a2
commit 73d1e70ae9
9 changed files with 103 additions and 12 deletions

View File

@@ -3,7 +3,7 @@ import pickle
from torchtext.vocab import Vectors
import torch
from abc import ABC, abstractmethod
-from learning.supervised import get_supervised_embeddings
+from embeddings.supervised import get_supervised_embeddings
from util.decompositions import *
from util.SIF_embed import *

View File

@@ -1,6 +1,6 @@
import numpy as np
import time
-from learning.embeddings import WordEmbeddings, StorageEmbeddings
+from embeddings.embeddings import WordEmbeddings, StorageEmbeddings
from scipy.sparse import issparse
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
@@ -9,7 +9,7 @@ from joblib import Parallel, delayed
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers.StandardizeTransformer import StandardizeTransformer
from sklearn.decomposition import PCA
-from models.cnn_class import CNN_pdr
+from models.cnn_class_bu import CNN_pdr

def _sort_if_sparse(X):
@@ -325,7 +325,7 @@ class MonolingualClassifier:
        return self.best_params_

-class AndreaCLF(FunnellingPolylingualClassifier):
+class FunnellingMultimodal(FunnellingPolylingualClassifier):
    def __init__(self,
                 we_path,
                 config,
@@ -627,7 +627,7 @@ class MonolingualNetSvm:
        :param word_index:
        :return: filtered embedding matrix
        """
-        from learning.embeddings import EmbeddingsAligned
+        from embeddings.embeddings import EmbeddingsAligned
        type = 'MUSE'
        path = '/home/andreapdr/CLESA/'
        MUSE = EmbeddingsAligned(type, path, lang, word_index.keys())
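
For intuition, the filtering step above keeps only the rows of the pretrained matrix whose words occur in the dataset vocabulary, so row i of the result corresponds to word index i. A minimal sketch under assumed inputs (the function name and the dict-based vector store are hypothetical, not the EmbeddingsAligned API):

import numpy as np

def filter_embedding_matrix(vectors, word_index, dim=300):
    # vectors: dict mapping word -> np.ndarray of shape (dim,)
    # word_index: dict mapping word -> row index in the output matrix
    matrix = np.zeros((len(word_index), dim))
    for word, idx in word_index.items():
        if word in vectors:
            matrix[idx] = vectors[word]  # out-of-vocabulary rows remain all-zero
    return matrix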

View File

@@ -126,13 +126,13 @@ if __name__ == '__main__':
    result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')
    print(f'### PolyEmbedd_andrea_{_config_id}\n')

-    classifier = AndreaCLF(we_path=op.we_path,
-                           config=config,
-                           first_tier_learner=get_learner(calibrate=True),
-                           meta_learner=get_learner(calibrate=False, kernel='rbf'),
-                           first_tier_parameters=None,  # TODO get_params(dense=False) --> first_tier should not be optimized - or not?
-                           meta_parameters=get_params(dense=True),
-                           n_jobs=op.n_jobs)
+    classifier = FunnellingMultimodal(we_path=op.we_path,
+                                      config=config,
+                                      first_tier_learner=get_learner(calibrate=True),
+                                      meta_learner=get_learner(calibrate=False, kernel='rbf'),
+                                      first_tier_parameters=None,  # TODO get_params(dense=False) --> first_tier should not be optimized - or not?
+                                      meta_parameters=get_params(dense=True),
+                                      n_jobs=op.n_jobs)

    print('# Fitting ...')
    classifier.fit(lXtr, lytr)

View File

@@ -33,4 +33,7 @@ def list_files(dir):
def makedirs_if_not_exist(path):
    if not exists(path): makedirs(path)

def create_if_not_exist(path):
    if not exists(path): makedirs(path)

View File

@@ -1,4 +1,8 @@
import numpy as np
from scipy.sparse import lil_matrix, issparse
from sklearn.metrics import f1_score, accuracy_score
"""
@@ -166,3 +170,87 @@ def smoothmacroK(true_labels, posterior_probabilities):
def smoothmicroK(true_labels, posterior_probabilities):
    return micro_average(true_labels, posterior_probabilities, K, metric_statistics=soft_single_metric_statistics)
"""
"""
scikit-learn provides a full set of evaluation metrics, but it treats special cases differently.
In particular, when the numbers of true positives, false positives, and false negatives all
amount to 0, every affected metric (precision, recall, and thus F1) evaluates to 0 in
scikit-learn. We adhere instead to the common practice of outputting 1 in this case, since the
classifier has correctly classified all examples as negatives.
"""
def evaluation(y_true, y_pred, classification_type):
    if classification_type == 'multilabel':
        eval_function = multilabel_eval
    elif classification_type == 'singlelabel':
        eval_function = singlelabel_eval
    else:
        raise ValueError(f'unexpected classification_type: {classification_type}')

    Mf1, mf1, accuracy = eval_function(y_true, y_pred)
    return Mf1, mf1, accuracy
def multilabel_eval(y, y_):
    # true positives: element-wise product of the two binary (sparse) matrices
    tp = y.multiply(y_)

    # false negatives: true entries not recovered by the prediction
    fn = lil_matrix(y.shape)
    true_ones = y == 1
    fn[true_ones] = 1 - tp[true_ones]

    # false positives: predicted entries that are not true
    fp = lil_matrix(y.shape)
    pred_ones = y_ == 1
    if pred_ones.nnz > 0:
        fp[pred_ones] = 1 - tp[pred_ones]

    # macro-F1: per-class precision and recall, F1 averaged across classes
    tp_macro = np.asarray(tp.sum(axis=0), dtype=int).flatten()
    fn_macro = np.asarray(fn.sum(axis=0), dtype=int).flatten()
    fp_macro = np.asarray(fp.sum(axis=0), dtype=int).flatten()

    pos_pred = tp_macro + fp_macro
    pos_true = tp_macro + fn_macro
    prec = np.zeros(shape=tp_macro.shape, dtype=float)
    rec = np.zeros(shape=tp_macro.shape, dtype=float)
    np.divide(tp_macro, pos_pred, out=prec, where=pos_pred > 0)
    np.divide(tp_macro, pos_true, out=rec, where=pos_true > 0)
    den = prec + rec

    macrof1 = np.zeros(shape=tp_macro.shape, dtype=float)
    np.divide(np.multiply(prec, rec), den, out=macrof1, where=den > 0)
    macrof1 *= 2

    # special case: a class with no true and no predicted positives counts as a perfect score
    macrof1[(pos_pred == 0) * (pos_true == 0)] = 1
    macrof1 = np.mean(macrof1)

    # micro-F1: counts pooled across all classes before computing precision and recall
    tp_micro = tp_macro.sum()
    fn_micro = fn_macro.sum()
    fp_micro = fp_macro.sum()
    pos_pred = tp_micro + fp_micro
    pos_true = tp_micro + fn_micro
    prec = (tp_micro / pos_pred) if pos_pred > 0 else 0
    rec = (tp_micro / pos_true) if pos_true > 0 else 0
    den = prec + rec
    microf1 = 2 * prec * rec / den if den > 0 else 0
    if pos_pred == pos_true == 0:
        microf1 = 1

    # accuracy: fraction of correct (document, class) decisions
    ndecisions = np.multiply(*y.shape)
    tn = ndecisions - (tp_micro + fn_micro + fp_micro)
    acc = (tp_micro + tn) / ndecisions

    return macrof1, microf1, acc
def singlelabel_eval(y, y_):
    if issparse(y_):
        y_ = y_.toarray().flatten()
    macrof1 = f1_score(y, y_, average='macro')
    microf1 = f1_score(y, y_, average='micro')
    acc = accuracy_score(y, y_)
    return macrof1, microf1, acc
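
As a quick sanity check, the added functions can be exercised on made-up data; a minimal sketch (the matrices below are hypothetical, and it assumes the functions above are importable as a module):

import numpy as np
from scipy.sparse import csr_matrix

# hypothetical 3-document, 2-class multilabel example
y_true = csr_matrix(np.array([[1, 0], [0, 0], [1, 1]]))
y_pred = csr_matrix(np.array([[1, 0], [0, 1], [1, 0]]))

Mf1, mf1, acc = evaluation(y_true, y_pred, classification_type='multilabel')
print(f'macro-F1={Mf1:.3f}  micro-F1={mf1:.3f}  acc={acc:.3f}')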