Implemented a method to compute WCE only for well-represented classes;

refactored the MLE class to support WCE, standard embeddings, and their combination;
sketched out an NN implementation for word-embedding compositionality;
SIF embeddings still TODO.
This commit is contained in:
andrea 2020-01-07 17:05:41 +01:00
parent 0e66fbf197
commit 53198a7e2c
10 changed files with 498 additions and 37 deletions
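Note: the first point (restricting WCE to well-represented classes) corresponds to the label-filtering step in the embeddings diff below; a minimal standalone sketch of that filter, with toy label matrices and a hypothetical min_prevalence, would be:

import numpy as np

# Toy multilingual label matrices (docs x classes), one per language -- made-up data.
labels = {
    'en': np.array([[1, 0, 1], [1, 0, 0], [1, 0, 1]]),
    'it': np.array([[1, 1, 0], [0, 1, 0], [1, 0, 1]]),
}
min_prevalence = 1  # hypothetical threshold: keep classes with more than min_prevalence positives

# A class is "well represented" only if it clears the threshold in every language.
well_repr_cats = np.logical_and.reduce(
    [labels[lang].sum(axis=0) > min_prevalence for lang in labels]
)
for lang in labels:
    labels[lang] = labels[lang][:, well_repr_cats]
    print(lang, labels[lang].shape)  # WCE is then computed on the reduced label sets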

(modified file)

@@ -6,7 +6,7 @@ from optparse import OptionParser
from util.file import exists
from util.results import PolylingualClassificationResults
from sklearn.svm import SVC
+from util.util import get_learner, get_params

parser = OptionParser()

@@ -115,7 +115,7 @@ if __name__ == '__main__':
config = {'unsupervised': True,
          'supervised': True,
          'we_type': op.we_type}
-_config_id = 'M_and_F'
+_config_id = 'M+F'
config['reduction'] = 'PCA'
config['max_label_space'] = op.max_labels_S

@@ -125,32 +125,6 @@ if __name__ == '__main__':
result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')
-PLE_test = True
-if PLE_test:
-    ple = PolylingualEmbeddingsClassifier(wordembeddings_path='/home/andreapdr/CLESA/',
-                                          config=config,
-                                          learner=get_learner(calibrate=False),
-                                          c_parameters=get_params(dense=False),
-                                          n_jobs=op.n_jobs)
-    print('# Fitting ...')
-    ple.fit(lXtr, lytr)
-    print('# Evaluating ...')
-    ple_eval = evaluate_method(ple, lXte, lyte)
-    metrics = []
-    for lang in lXte.keys():
-        macrof1, microf1, macrok, microk = ple_eval[lang]
-        metrics.append([macrof1, microf1, macrok, microk])
-        print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
-        results.add_row('MLE', 'svm', 'no', config['we_type'],
-                        'no', 'no', op.optimc, op.dataset.split('/')[-1], ple.time,
-                        lang, macrof1, microf1, macrok, microk, '')
-    print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
-    exit()
print(f'### PolyEmbedd_andrea_{_config_id}\n')
classifier = AndreaCLF(we_path=op.we_path,
                       config=config,

@@ -174,5 +148,5 @@ if __name__ == '__main__':
results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
                (config['max_label_space'], classifier.best_components),
                config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
-               lang, macrof1, microf1, macrok, microk, 'min_prevalence = 0')
+               lang, macrof1, microf1, macrok, microk, '')
print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))

src/MLE_andrea.py (new file, 128 lines)

@@ -0,0 +1,128 @@
import os
from dataset_builder import MultilingualDataset
from learning.learners import *
from util.evaluation import *
from optparse import OptionParser
from util.file import exists
from util.results import PolylingualClassificationResults
from util.util import get_learner, get_params
parser = OptionParser()
parser.add_option("-d", "--dataset", dest="dataset",
                  help="Path to the multilingual dataset processed and stored in .pickle format",
                  default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
parser.add_option("-o", "--output", dest="output",
                  help="Result file", type=str, default='./results/results.csv')
parser.add_option("-e", "--mode-embed", dest="mode_embed",
                  help="Set the embedding to be used [none, unsupervised, supervised, both]", type=str, default='none')
parser.add_option("-w", "--we-path", dest="we_path",
                  help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/')
parser.add_option('-t', "--we-type", dest="we_type", help="Aligned embeddings to use [FastText, MUSE]", type=str,
                  default='MUSE')
parser.add_option("-s", "--set_c", dest="set_c", type=float,
                  help="Set the C parameter", default=1)
parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
                  help="Optimize hyperparameters", default=False)
parser.add_option("-j", "--n_jobs", dest="n_jobs", type=int,
                  help="Number of parallel jobs (default is -1, all)", default=-1)
parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
                  help="If smaller than number of target classes, PCA will be applied to supervised matrix. "
                       "If set to 0 it will automatically search for the best number of components. "
                       "If set to -1 it will apply PCA to the vstacked supervised matrix (PCA dim set to 50 atm)",
                  default=300)
parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
                  help="If smaller than the unsupervised dimension, PCA will be applied to the unsupervised matrix. "
                       "If set to 0 it will automatically search for the best number of components", default=300)
parser.add_option("-l", dest="lang", type=str)
if __name__ == '__main__':
    (op, args) = parser.parse_args()

    assert exists(op.dataset), 'Unable to find file ' + str(op.dataset)
    assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'

    dataset_file = os.path.basename(op.dataset)
    results = PolylingualClassificationResults('./results/PLE_results.csv')

    data = MultilingualDataset.load(op.dataset)
    data.show_dimensions()
    # data.set_view(languages=['en', 'it', 'pt', 'sv'], categories=list(range(10)))
    # data.set_view(languages=[op.lang])
    # data.set_view(categories=list(range(10)))

    lXtr, lytr = data.training()
    lXte, lyte = data.test()

    if op.set_c != -1:
        meta_parameters = None
    else:
        meta_parameters = [{'C': [1e3, 1e2, 1e1, 1, 1e-1]}]

    # Embeddings and WCE config
    _available_mode = ['none', 'unsupervised', 'supervised', 'both']
    _available_type = ['MUSE', 'FastText']
    assert op.mode_embed in _available_mode, f'{op.mode_embed} not in {_available_mode}'
    assert op.we_type in _available_type, f'{op.we_type} not in {_available_type}'

    if op.mode_embed == 'none':
        config = {'unsupervised': False,
                  'supervised': False,
                  'we_type': None}
        _config_id = 'None'
    elif op.mode_embed == 'unsupervised':
        config = {'unsupervised': True,
                  'supervised': False,
                  'we_type': op.we_type}
        _config_id = 'M'
    elif op.mode_embed == 'supervised':
        config = {'unsupervised': False,
                  'supervised': True,
                  'we_type': None}
        _config_id = 'F'
    elif op.mode_embed == 'both':
        config = {'unsupervised': True,
                  'supervised': True,
                  'we_type': op.we_type}
        _config_id = 'M+F'

    config['reduction'] = 'PCA'
    config['max_label_space'] = op.max_labels_S
    config['dim_reduction_unsupervised'] = op.max_labels_U
    # config['post_pca'] = op.post_pca
    # config['plot_covariance_matrices'] = True

    result_id = dataset_file + 'MLE_andrea' + _config_id + ('_optimC' if op.optimc else '')

    ple = PolylingualEmbeddingsClassifier(wordembeddings_path=op.we_path,
                                          config=config,
                                          learner=get_learner(calibrate=False, C=op.set_c),
                                          c_parameters=get_params(dense=False, optimc=op.optimc),
                                          n_jobs=op.n_jobs)

    print('# Fitting ...')
    ple.fit(lXtr, lytr)

    print('# Evaluating ...')
    ple_eval = evaluate_method(ple, lXte, lyte)

    metrics = []
    for lang in lXte.keys():
        macrof1, microf1, macrok, microk = ple_eval[lang]
        metrics.append([macrof1, microf1, macrok, microk])
        print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
        results.add_row('MLE', 'svm', _config_id, config['we_type'],
                        'no', 'no', op.optimc, op.dataset.split('/')[-1], ple.time,
                        lang, macrof1, microf1, macrok, microk, '')
    print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
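Note: assuming the script is run from src/ with the dataset pickle in place, a typical invocation combining MUSE and WCE embeddings would look something like:

python MLE_andrea.py -d /path/to/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -e both -t MUSE -p 300 -u 300 -c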

src/NN_FPEC_andrea.py (new file, 92 lines)

@@ -0,0 +1,92 @@
from optparse import OptionParser
from util.results import PolylingualClassificationResults
from dataset_builder import MultilingualDataset
from keras.preprocessing.text import Tokenizer
from learning.learners import MonolingualNetSvm
from sklearn.svm import SVC
import pickle
parser = OptionParser()
parser.add_option("-d", "--dataset", dest="dataset",
                  help="Path to the multilingual dataset processed and stored in .pickle format",
                  default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
                  help="Optimize hyperparameters", default=False)
parser.add_option("-s", "--set_c", dest="set_c", type=float,
                  help="Set the C parameter", default=1)
(op, args) = parser.parse_args()

###################################################################################################################

def get_learner(calibrate=False, kernel='linear'):
    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1,
               class_weight='balanced', gamma='auto')


def get_params(dense=False):
    if not op.optimc:
        return None
    c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
    kernel = 'rbf' if dense else 'linear'
    return [{'kernel': [kernel], 'C': c_range, 'gamma': ['auto']}]
# PREPROCESS TEXT AND SAVE IT ... both for SVM and NN
def preprocess_data(lXtr, lXte, lytr, lyte):
    tokenized_tr = dict()
    tokenized_te = dict()
    for lang in lXtr.keys():
        alltexts = ' '.join(lXtr[lang])
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(alltexts.split(' '))
        tokenizer.oov_token = len(tokenizer.word_index) + 1
        # tokenizing train set
        sequences_tr = tokenizer.texts_to_sequences(lXtr[lang])
        tokenized_tr[lang] = (tokenizer.word_index, sequences_tr, lytr[lang])
        # tokenizing test set
        sequences_te = tokenizer.texts_to_sequences(lXte[lang])
        tokenized_te[lang] = (tokenizer.word_index, sequences_te, lyte[lang])
    with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'wb') as f:
        pickle.dump(tokenized_tr, f)
    with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_test.pickle', 'wb') as f:
        pickle.dump(tokenized_te, f)
    print('Successfully dumped data')
# def load_preprocessed():
# with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'rb') as f:
# return pickle.load(f)
#
# def build_embedding_matrix(lang, word_index):
# type = 'MUSE'
# path = '/home/andreapdr/CLESA/'
# MUSE = EmbeddingsAligned(type, path, lang, word_index.keys())
# return MUSE
########## MAIN #################################################################################################
if __name__ == '__main__':
    results = PolylingualClassificationResults('./results/NN_FPEC_results.csv')
    data = MultilingualDataset.load(op.dataset)

    lXtr, lytr = data.training()
    lXte, lyte = data.test()

    if op.set_c != -1:
        meta_parameters = None
    else:
        meta_parameters = [{'C': [1e3, 1e2, 1e1, 1, 1e-1]}]

    test_architecture = MonolingualNetSvm(lXtr,
                                          lytr,
                                          first_tier_learner=get_learner(calibrate=True),
                                          first_tier_parameters=None,
                                          n_jobs=1)
    test_architecture.fit()
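Note: the pickles dumped by preprocess_data are what MonolingualNetSvm.load_preprocessed (in the learners diff further down) reads back. A minimal sketch of that round trip, assuming the same hard-coded path and that Italian is among the languages:

import pickle
from keras.preprocessing.sequence import pad_sequences

with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'rb') as f:
    tokenized_tr = pickle.load(f)

# each language maps to a (word_index, tokenized_docs, targets) tuple
word_index, sequences, targets = tokenized_tr['it']
X_it = pad_sequences(sequences, 100, padding='post')  # same padding used later in get_data_and_embed
print(X_it.shape, len(word_index))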

(modified file)

@@ -3,8 +3,9 @@ import pickle
from torchtext.vocab import Vectors
import torch
from abc import ABC, abstractmethod
-from data.supervised import get_supervised_embeddings
+from learning.supervised import get_supervised_embeddings
from util.decompositions import *
+from util.SIF_embed import *

class PretrainedEmbeddings(ABC):

@@ -233,7 +234,6 @@ class StorageEmbeddings:
print(f'# REDUCING LABELS TO min_prevalence = {min_prevalence} in order to compute WCE Matrix ...')
langs = list(docs.keys())
well_repr_cats = np.logical_and.reduce([labels[lang].sum(axis=0) > min_prevalence for lang in langs])
-# lY = {lY[lang][:, well_repr_cats] for lang in langs} TODO not clear
for lang in langs:
    labels[lang] = labels[lang][:, well_repr_cats]
    print(f'Target number reduced to: {labels[lang].shape[1]}\n')

@@ -245,15 +245,15 @@ class StorageEmbeddings:
nC = self.lang_S[lang].shape[1]
print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')
-if max_label_space == 0:
+if max_label_space == 0:  # looking for the best n_components by analyzing the explained_variance_ratio
    print(f'Computing optimal number of PCA components along matrices S')
    optimal_n = get_optimal_dim(self.lang_S, 'S')
    print(f'Applying PCA(n_components={optimal_n})')
    self.lang_S = run_pca(optimal_n, self.lang_S)
-elif max_label_space == -1:
+elif max_label_space == -1:  # applying PCA to the vertically stacked matrix of WCE embeddings
    print(f'Computing PCA on vertical stacked WCE embeddings')
    languages = self.lang_S.keys()
-   _temp_stack = np.vstack([self.lang_S[lang] for lang in languages])
+   _temp_stack = np.vstack([self.lang_S[lang] for lang in languages])  # stacking WCE vertically
    stacked_pca = PCA(n_components=_temp_stack.shape[1])
    stacked_pca.fit(_temp_stack)
    best_n = None

@@ -271,12 +271,15 @@ class StorageEmbeddings:
    print(f'Applying PCA(n_components={i}')
    for lang in languages:
        self.lang_S[lang] = stacked_pca.transform(self.lang_S[lang])
-elif max_label_space <= nC:  # also equal in order to reduce it to the same initial dimension
+elif max_label_space <= nC:  # less or equal, so it can also be reduced to the same initial dimension
    print(f'Computing PCA on Supervised Matrix PCA(n_components:{max_label_space})')
    self.lang_S = run_pca(max_label_space, self.lang_S)
return

+def SIF_embeddings(self):
+    print('todo')  # TODO

def _concatenate_embeddings(self, docs):
    _r = dict()
    for lang in self.lang_U.keys():

@@ -293,6 +296,9 @@ class StorageEmbeddings:
def predict(self, config, docs):
    if config['supervised'] and config['unsupervised']:
        return self._concatenate_embeddings(docs)
+       # todo: test applying PCA to the hstacked MUSE + WCE matrix
+       # _reduced = self._concatenate_embeddings(docs)
+       # return run_pca(300, _reduced)
    elif config['supervised']:
        _r = dict()
        for lang in docs.keys():

@@ -301,4 +307,5 @@ class StorageEmbeddings:
_r = dict()
for lang in docs.keys():
    _r[lang] = docs[lang].dot(self.lang_U[lang])
return _r
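Note: get_optimal_dim and run_pca live in util.decompositions and are not part of this commit; as a purely illustrative sketch of the kind of selection the max_label_space == 0 branch refers to (cumulative explained variance against a hypothetical threshold):

import numpy as np
from sklearn.decomposition import PCA

def guess_optimal_components(lang_matrices, threshold=0.99):
    # Fit a full PCA per language and keep the largest number of components
    # needed to reach the variance threshold in any of them (illustrative only).
    best_n = 0
    for lang, X in lang_matrices.items():
        pca = PCA(n_components=min(X.shape))
        pca.fit(X)
        cumulative = np.cumsum(pca.explained_variance_ratio_)
        n = int(np.searchsorted(cumulative, threshold) + 1)
        best_n = max(best_n, n)
    return best_n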

(modified file)

@@ -1,6 +1,6 @@
import numpy as np
import time
-from data.embeddings import WordEmbeddings, StorageEmbeddings
+from learning.embeddings import WordEmbeddings, StorageEmbeddings
from scipy.sparse import issparse
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV

@@ -9,6 +9,7 @@ from joblib import Parallel, delayed
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers.StandardizeTransformer import StandardizeTransformer
from sklearn.decomposition import PCA
+from models.cnn_class import CNN_pdr

def _sort_if_sparse(X):

@@ -581,3 +582,151 @@ class PolylingualEmbeddingsClassifier:
    def best_params(self):
        return self.model.best_params()
class MonolingualNetSvm:
    """
    Testing: funnelling with a NN managing word-embedding compositionality. An ensemble of n SVMs (n equal to the
    number of training languages) is first fit on the data, generating the document projections in the Z-space. Next,
    the projections are fed to a single NN together with their respective document embeddings. The documents are
    projected into the embedding space while preserving its dimensionality (output dim is 300). These projections
    are horizontally concatenated with the respective Z-space projection and passed through a fully connected layer
    with sigmoid activation and output dim equal to the number of target classes.
    # TODO ATM testing with only 1 language
    """

    def __init__(self, lX, ly, first_tier_learner, first_tier_parameters, n_jobs):
        self.lX = lX
        self.ly = ly
        # SVM attributes
        self.doc_projector = NaivePolylingualClassifier(first_tier_learner, first_tier_parameters,
                                                        n_jobs=n_jobs)
        self.calmode = 'cal'
        self.languages = []
        self.lang_word2idx = dict()
        self.lang_tfidf = {}
        self.base_learner = 'TODO'
        self.parameters = 'TODO'
        # NN attributes
        self.NN = 'TODO'
    def load_preprocessed(self):
        """
        in order to speed up the process, documents are first tokenized in the "main". Here, tokenized docs,
        word_index, and targets are loaded.
        :return: dict[lang] = (word_index, tokenized_docs, targets)
        """
        import pickle
        with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'rb') as f:
            return pickle.load(f)

    def _build_embedding_matrix(self, lang, word_index):
        """
        build embedding matrix by filtering out OOV embeddings
        :param lang:
        :param word_index:
        :return: filtered embedding matrix
        """
        from learning.embeddings import EmbeddingsAligned
        type = 'MUSE'
        path = '/home/andreapdr/CLESA/'
        MUSE = EmbeddingsAligned(type, path, lang, word_index.keys())
        return MUSE
    def get_data_and_embed(self, data_dict):
        from keras.preprocessing.sequence import pad_sequences
        langs = data_dict.keys()
        lang_embedding_matrix = dict()
        nn_lXtr = dict()
        nn_lytr = dict()
        for lang in langs:
            lang_embedding_matrix[lang] = self._build_embedding_matrix(lang, data_dict[lang][0])
            nn_lXtr[lang] = pad_sequences(data_dict[lang][1], 100, padding='post')
            nn_lytr[lang] = [data_dict[lang][2]]
        return nn_lXtr, nn_lytr, lang_embedding_matrix
    def svm_vectorize(self, lX, prediction=False):
        langs = list(lX.keys())
        print(f'# tfidf-vectorizing docs')
        if prediction:
            for lang in langs:
                assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language'
                tfidf_vectorizer = self.lang_tfidf[lang]
                lX[lang] = tfidf_vectorizer.transform(lX[lang])
            return self
        for lang in langs:
            tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
            self.languages.append(lang)
            tfidf_vectorizer.fit(lX[lang])
            lX[lang] = tfidf_vectorizer.transform(lX[lang])
            self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
            self.lang_tfidf[lang] = tfidf_vectorizer
        return lX
    def _get_zspace(self, lXtr, lYtr):
        print('\nfitting the projectors... {}'.format(list(lXtr.keys())))
        self.doc_projector.fit(lXtr, lYtr)

        print('\nprojecting the documents')
        lZ = self._projection(self.doc_projector, lXtr)
        return lZ, lYtr

    def _projection(self, doc_projector, lX):
        """
        Decides the projection function to be applied: predict_proba if the base classifiers are calibrated,
        decision_function otherwise
        :param doc_projector: the document projector (a NaivePolylingualClassifier)
        :param lX: {lang: matrix} to project
        :return: the projection, obtained with predict_proba or decision_function
        """
        if self.calmode == 'cal':
            return doc_projector.predict_proba(lX)
        else:
            l_decision_scores = doc_projector.decision_function(lX)
            if self.calmode == 'sigmoid':
                def sigmoid(x): return 1 / (1 + np.exp(-x))
                for lang in l_decision_scores.keys():
                    l_decision_scores[lang] = sigmoid(l_decision_scores[lang])
            return l_decision_scores
    def fit(self):
        """
        # 1. Fit SVMs to generate posterior probabilities:
        #    1.1 Gather documents and vectorize them as in the other SVM classifiers
        # 2. Fit NN:
        #    2.1 Gather documents and build the NN dataset by indexing wrt the embedding matrix
        #    2.2 Fit the NN first layer to generate the compositional doc embedding
        #    2.3 H-stack doc embedding and posterior probabilities
        #    2.4 Feed the stacked vector to the output layer (sigmoid activation): output dim = nC
        #    2.5 Train it...
        """
        # load pre-processed data
        data_dict = self.load_preprocessed()

        # build embedding matrices and the neural-network document training set
        nn_lXtr, nn_lytr, lang_embedding_matrix = self.get_data_and_embed(data_dict)

        # TF-IDF vectorizing documents for the SVM classifier
        svm_lX = self.svm_vectorize(self.lX)

        # just testing on a smaller subset of data
        test_svm_lX = dict()
        test_svm_ly = dict()
        test_svm_lX['it'] = svm_lX['it'][:10, :]
        test_svm_ly['it'] = self.ly['it'][:10, :]
        test_nn_data = nn_lXtr['it'][:10]

        # projecting documents into the Z-space through the SVMs
        svm_Z, _ = self._get_zspace(test_svm_lX, test_svm_ly)

        # initializing the net and running a forward pass
        net = CNN_pdr(73, 1, 300, len(lang_embedding_matrix['it'].vectors), 300, lang_embedding_matrix['it'].vectors)
        out = net.forward(test_nn_data, svm_Z['it'])
        print('TODO')

    def net(self):
        pass

src/models/cnn_class.py (new file, 42 lines)

@@ -0,0 +1,42 @@
import torch.nn as nn
from torch.nn import functional as F
import torch
class CNN_pdr(nn.Module):
    def __init__(self, output_size, out_channels, compositional_dim, vocab_size, emb_dim, embeddings=None,
                 drop_embedding_range=None, drop_embedding_prop=0, drop_prob=0.5):
        super(CNN_pdr, self).__init__()
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.embeddings = torch.FloatTensor(embeddings)
        self.embedding_layer = nn.Embedding(vocab_size, emb_dim, _weight=self.embeddings)
        self.kernel_heights = kernel_heights = [3, 5, 7]
        self.stride = 1
        self.padding = 0
        self.drop_embedding_range = drop_embedding_range
        self.drop_embedding_prop = drop_embedding_prop
        assert 0 <= drop_embedding_prop <= 1, 'drop_embedding_prop: wrong range'

        self.nC = 73
        self.conv1 = nn.Conv2d(1, compositional_dim, (self.kernel_heights[0], self.emb_dim), self.stride, self.padding)
        self.dropout = nn.Dropout(drop_prob)
        self.label = nn.Linear(len(kernel_heights) * out_channels, output_size)
        self.fC = nn.Linear(compositional_dim + self.nC, self.nC)

    def forward(self, x, svm_output):
        x = torch.LongTensor(x)                      # padded word indices: (batch, seq_len)
        svm_output = torch.FloatTensor(svm_output)   # first-tier posteriors: (batch, nC)
        x = self.embedding_layer(x)                  # (batch, seq_len, emb_dim)
        x = self.conv1(x.unsqueeze(1))               # (batch, compositional_dim, seq_len - k + 1, 1)
        x = F.relu(x.squeeze(3))
        x = F.max_pool1d(x, x.size()[2]).squeeze(2)  # compositional doc embedding: (batch, compositional_dim)
        x = torch.cat((x, svm_output), 1)            # h-stack doc embedding and posteriors
        x = F.sigmoid(self.fC(x))                    # (batch, nC)
        return x  # .detach().numpy()
        # logits = self.label(x)
        # return logits
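Note: a quick shape check of the forward pass with random stand-in data (73 classes and 300-dimensional embeddings, as in the learners diff; all values below are made up) could look like:

import numpy as np

# hypothetical vocabulary of 5000 words with random 300-dim vectors standing in for MUSE
vocab_size, emb_dim, n_classes, seq_len, batch = 5000, 300, 73, 100, 8
fake_embeddings = np.random.randn(vocab_size, emb_dim).astype('float32')

net = CNN_pdr(output_size=n_classes, out_channels=1, compositional_dim=300,
              vocab_size=vocab_size, emb_dim=emb_dim, embeddings=fake_embeddings)

x = np.random.randint(0, vocab_size, size=(batch, seq_len))   # padded word indices
svm_z = np.random.rand(batch, n_classes).astype('float32')    # posterior probabilities from the SVM tier
out = net.forward(x, svm_z)
print(out.shape)  # torch.Size([8, 73])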

(modified file)

@@ -2,6 +2,6 @@ import pandas as pd
import numpy as np

df = pd.read_csv("/home/andreapdr/funneling_pdr/src/results/results.csv", delimiter='\t')
-pivot = pd.pivot_table(df, values=['time', 'macrof1', 'microf1', 'macrok', 'microk'], index=['embed'], aggfunc=[np.mean, np.std])
+pivot = pd.pivot_table(df, values=['time', 'macrof1', 'microf1', 'macrok', 'microk'], index=['method', 'embed'], aggfunc=[np.mean, np.std])
print(pivot)
print('Finished ...')

src/util/SIF_embed.py (new file, 56 lines)

@@ -0,0 +1,56 @@
import numpy as np
from sklearn.decomposition import TruncatedSVD
def get_weighted_average(We, x, w):
    """
    Compute the weighted average vectors
    :param We: We[i,:] is the vector for word i
    :param x: x[i, :] are the indices of the words in sentence i
    :param w: w[i, :] are the weights for the words in sentence i
    :return: emb[i, :] is the weighted average vector for sentence i
    """
    n_samples = x.shape[0]
    emb = np.zeros((n_samples, We.shape[1]))
    for i in range(n_samples):
        emb[i, :] = w[i, :].dot(We[x[i, :], :]) / np.count_nonzero(w[i, :])
    return emb


def compute_pc(X, npc=1):
    """
    Compute the principal components. DO NOT MAKE THE DATA ZERO MEAN!
    :param X: X[i,:] is a data point
    :param npc: number of principal components to compute
    :return: component_[i,:] is the i-th pc
    """
    svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
    svd.fit(X)
    return svd.components_


def remove_pc(X, npc=1):
    """
    Remove the projection on the principal components
    :param X: X[i,:] is a data point
    :param npc: number of principal components to remove
    :return: XX[i, :] is the data point after removing its projection
    """
    pc = compute_pc(X, npc)
    if npc == 1:
        XX = X - X.dot(pc.transpose()) * pc
    else:
        XX = X - X.dot(pc.transpose()).dot(pc)
    return XX
def SIF_embedding(We, x, w, params):
    """
    Compute the sentence embeddings as the weighted average of word vectors, optionally removing the projection on
    the first principal component(s)
    :param We: We[i,:] is the vector for word i
    :param x: x[i, :] are the indices of the words in the i-th sentence
    :param w: w[i, :] are the weights for the words in the i-th sentence
    :param params.rmpc: if >0, remove the projections of the sentence embeddings onto their first principal component(s)
    :return: emb, emb[i, :] is the embedding for sentence i
    """
    emb = get_weighted_average(We, x, w)
    if params.rmpc > 0:
        emb = remove_pc(emb, params.rmpc)
    return emb
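Note: SIF_embed.py follows the usual smooth-inverse-frequency recipe (Arora et al.); a small usage sketch with made-up word probabilities and the standard a / (a + p(w)) weighting, which is not computed in this file, assuming it is run from src/:

import numpy as np
from collections import namedtuple
from util.SIF_embed import SIF_embedding

rng = np.random.default_rng(0)
We = rng.standard_normal((1000, 300))        # word vectors (vocab x dim)
x = rng.integers(0, 1000, size=(20, 15))     # word indices: 20 sentences of length 15
p_w = rng.random(1000); p_w /= p_w.sum()     # made-up unigram probabilities
a = 1e-3
w = a / (a + p_w[x])                         # SIF weights, same shape as x

params = namedtuple('params', 'rmpc')(rmpc=1)  # remove the first principal component
emb = SIF_embedding(We, x, w, params)
print(emb.shape)                             # (20, 300)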

(modified file)

@@ -1,2 +1,15 @@
from sklearn.svm import SVC


def fill_missing_classes(lXtr, lytr):
    pass


def get_learner(calibrate=False, kernel='linear', C=1.0):
    # C is provided explicitly by the calling script (e.g. C=op.set_c in MLE_andrea.py)
    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=C, random_state=1,
               class_weight='balanced', gamma='auto')


def get_params(dense=False, optimc=False):
    # returns None when hyperparameter optimization is disabled
    if not optimc:
        return None
    c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
    kernel = 'rbf' if dense else 'linear'
    return [{'kernel': [kernel], 'C': c_range, 'gamma': ['auto']}]
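Note: a hypothetical smoke test of the two helpers on random data (all names and values below are made up):

import numpy as np
from sklearn.model_selection import GridSearchCV

X = np.random.rand(50, 20)         # dense toy features
y = np.random.randint(0, 2, 50)    # binary toy labels

svm = get_learner(calibrate=False, C=1.0)
grid = get_params(dense=True, optimc=True)   # rbf-kernel grid, since the toy data is dense
search = GridSearchCV(svm, grid, cv=3, n_jobs=-1)
search.fit(X, y)
print(search.best_params_)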