Alejandro Moreo Fernandez 2020-01-16 14:30:20 +01:00
commit cfd3a609a2
12 changed files with 710 additions and 142 deletions

View File

@ -6,7 +6,7 @@ from optparse import OptionParser
from util.file import exists
from util.results import PolylingualClassificationResults
from sklearn.svm import SVC
from util.util import get_learner, get_params
parser = OptionParser()
@ -35,16 +35,22 @@ parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int,
help="Number of parallel jobs (default is -1, all)", default=-1)
parser.add_option("-p", "--pca", dest="max_labels", type=int,
help="If less than number of target classes, will apply PCA to supervised matrix. If set to 0 it"
" will automatically search for the best number of components", default=300)
parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
help="If smaller than number of target classes, PCA will be applied to supervised matrix. "
"If set to 0 it will automatically search for the best number of components. "
"If set to -1 it will apply PCA to the vstacked supervised matrix (PCA dim set to 50 atm)",
default=300)
parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
help="If smaller than Unsupervised Dimension, will apply PCA to unsupervised matrix. If set to 0 it"
" will automatically search for the best number of components", default=300)
help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix."
" If set to 0 it will automatically search for the best number of components", default=300)
parser.add_option("-l", dest="lang", type=str)
# parser.add_option("-a", dest="post_pca",
# help="If set to True, will apply PCA to the z-space (posterior probabilities stacked along with "
# "embedding space", default=False)
def get_learner(calibrate=False, kernel='linear'):
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto')
@ -73,13 +79,12 @@ if __name__ == '__main__':
data = MultilingualDataset.load(op.dataset)
data.show_dimensions()
data.set_view(languages=['en','it', 'pt', 'sv'], categories=list(range(10)))
# data.set_view(languages=['en','it', 'pt', 'sv'], categories=list(range(10)))
# data.set_view(languages=[op.lang])
# data.set_view(categories=list(range(10)))
lXtr, lytr = data.training()
lXte, lyte = data.test()
if op.set_c != -1:
meta_parameters = None
else:
@ -110,12 +115,12 @@ if __name__ == '__main__':
config = {'unsupervised': True,
'supervised': True,
'we_type': op.we_type}
_config_id = 'M_and_F'
_config_id = 'M+F'
##### TODO - config dict is redundant - we already have the op argparse options ...
config['reduction'] = 'PCA'
config['max_label_space'] = op.max_labels
config['max_label_space'] = op.max_labels_S
config['dim_reduction_unsupervised'] = op.max_labels_U
# config['post_pca'] = op.post_pca
# config['plot_covariance_matrices'] = True
result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')
@ -125,7 +130,7 @@ if __name__ == '__main__':
config=config,
first_tier_learner=get_learner(calibrate=True),
meta_learner=get_learner(calibrate=False, kernel='rbf'),
first_tier_parameters=get_params(dense=False),
first_tier_parameters=None, # TODO get_params(dense=False),--> first_tier should not be optimized - or not?
meta_parameters=get_params(dense=True),
n_jobs=op.n_jobs)
@ -140,6 +145,8 @@ if __name__ == '__main__':
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, config['we_type'], op.optimc, op.dataset.split('/')[-1],
classifier.time, lang, macrof1, microf1, macrok, microk, '')
results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
(config['max_label_space'], classifier.best_components),
config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
lang, macrof1, microf1, macrok, microk, '')
print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))

src/MLE_andrea.py (new file, 128 lines)
View File

@ -0,0 +1,128 @@
import os
from dataset_builder import MultilingualDataset
from learning.learners import *
from util.evaluation import *
from optparse import OptionParser
from util.file import exists
from util.results import PolylingualClassificationResults
from util.util import get_learner, get_params
parser = OptionParser()
parser.add_option("-d", "--dataset", dest="dataset",
help="Path to the multilingual dataset processed and stored in .pickle format",
default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
parser.add_option("-o", "--output", dest="output",
help="Result file", type=str, default='./results/results.csv')
parser.add_option("-e", "--mode-embed", dest="mode_embed",
help="Set the embedding to be used [none, unsupervised, supervised, both]", type=str, default='none')
parser.add_option("-w", "--we-path", dest="we_path",
help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/')
parser.add_option('-t', "--we-type", dest="we_type", help="Aligned embeddings to use [FastText, MUSE]", type=str,
default='MUSE')
parser.add_option("-s", "--set_c", dest="set_c",type=float,
help="Set the C parameter", default=1)
parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
help="Optimize hyperparameters", default=False)
parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int,
help="Number of parallel jobs (default is -1, all)", default=-1)
parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
help="If smaller than number of target classes, PCA will be applied to supervised matrix. "
"If set to 0 it will automatically search for the best number of components. "
"If set to -1 it will apply PCA to the vstacked supervised matrix (PCA dim set to 50 atm)",
default=300)
parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix."
" If set to 0 it will automatically search for the best number of components", default=300)
parser.add_option("-l", dest="lang", type=str)
if __name__ == '__main__':
(op, args) = parser.parse_args()
assert exists(op.dataset), 'Unable to find file '+str(op.dataset)
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
dataset_file = os.path.basename(op.dataset)
results = PolylingualClassificationResults('./results/PLE_results.csv')
data = MultilingualDataset.load(op.dataset)
data.show_dimensions()
# data.set_view(languages=['en','it', 'pt', 'sv'], categories=list(range(10)))
# data.set_view(languages=[op.lang])
# data.set_view(categories=list(range(10)))
lXtr, lytr = data.training()
lXte, lyte = data.test()
if op.set_c != -1:
meta_parameters = None
else:
meta_parameters = [{'C': [1e3, 1e2, 1e1, 1, 1e-1]}]
# Embeddings and WCE config
_available_mode = ['none', 'unsupervised', 'supervised', 'both']
_available_type = ['MUSE', 'FastText']
assert op.mode_embed in _available_mode, f'{op.mode_embed} not in {_available_mode}'
assert op.we_type in _available_type, f'{op.we_type} not in {_available_type}'
if op.mode_embed == 'none':
config = {'unsupervised': False,
'supervised': False,
'we_type': None}
_config_id = 'None'
elif op.mode_embed == 'unsupervised':
config = {'unsupervised': True,
'supervised': False,
'we_type': op.we_type}
_config_id = 'M'
elif op.mode_embed == 'supervised':
config = {'unsupervised': False,
'supervised': True,
'we_type': None}
_config_id = 'F'
elif op.mode_embed == 'both':
config = {'unsupervised': True,
'supervised': True,
'we_type': op.we_type}
_config_id = 'M+F'
config['reduction'] = 'PCA'
config['max_label_space'] = op.max_labels_S
config['dim_reduction_unsupervised'] = op.max_labels_U
# config['post_pca'] = op.post_pca
# config['plot_covariance_matrices'] = True
result_id = dataset_file + 'MLE_andrea' + _config_id + ('_optimC' if op.optimc else '')
ple = PolylingualEmbeddingsClassifier(wordembeddings_path='/home/andreapdr/CLESA/',
config = config,
learner=get_learner(calibrate=False),
c_parameters=get_params(dense=False),
n_jobs=op.n_jobs)
print('# Fitting ...')
ple.fit(lXtr, lytr)
print('# Evaluating ...')
ple_eval = evaluate_method(ple, lXte, lyte)
metrics = []
for lang in lXte.keys():
macrof1, microf1, macrok, microk = ple_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
results.add_row('MLE', 'svm', _config_id, config['we_type'],
'no','no', op.optimc, op.dataset.split('/')[-1], ple.time,
lang, macrof1, microf1, macrok, microk, '')
print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))

src/NN_FPEC_andrea.py (new file, 92 lines)
View File

@ -0,0 +1,92 @@
from optparse import OptionParser
from util.results import PolylingualClassificationResults
from dataset_builder import MultilingualDataset
from keras.preprocessing.text import Tokenizer
from learning.learners import MonolingualNetSvm
from sklearn.svm import SVC
import pickle
parser = OptionParser()
parser.add_option("-d", "--dataset", dest="dataset",
help="Path to the multilingual dataset processed and stored in .pickle format",
default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
help="Optimize hyperparameters", default=False)
parser.add_option("-s", "--set_c", dest="set_c",type=float,
help="Set the C parameter", default=1)
(op, args) = parser.parse_args()
###################################################################################################################
def get_learner(calibrate=False, kernel='linear'):
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto')
def get_params(dense=False):
if not op.optimc:
return None
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
kernel = 'rbf' if dense else 'linear'
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
# PREPROCESS TEXT AND SAVE IT ... both for SVM and NN
def preprocess_data(lXtr, lXte, lytr, lyte):
tokenized_tr = dict()
tokenized_te = dict()
for lang in lXtr.keys():
alltexts = ' '.join(lXtr[lang])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(alltexts.split(' '))
tokenizer.oov_token = len(tokenizer.word_index)+1
# dumping train set
sequences_tr = tokenizer.texts_to_sequences(lXtr[lang])
tokenized_tr[lang] = (tokenizer.word_index, sequences_tr, lytr[lang])
# dumping test set
sequences_te = tokenizer.texts_to_sequences(lXte[lang])
tokenized_te[lang] = (tokenizer.word_index, sequences_te, lyte[lang])
with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'wb') as f:
pickle.dump(tokenized_tr, f)
with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_test.pickle', 'wb') as f:
pickle.dump(tokenized_te, f)
print('Successfully dumped data')
# def load_preprocessed():
# with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'rb') as f:
# return pickle.load(f)
#
# def build_embedding_matrix(lang, word_index):
# type = 'MUSE'
# path = '/home/andreapdr/CLESA/'
# MUSE = EmbeddingsAligned(type, path, lang, word_index.keys())
# return MUSE
########## MAIN #################################################################################################
if __name__ == '__main__':
results = PolylingualClassificationResults('./results/NN_FPEC_results.csv')
data = MultilingualDataset.load(op.dataset)
lXtr, lytr = data.training()
lXte, lyte = data.test()
if op.set_c != -1:
meta_parameters = None
else:
meta_parameters = [{'C': [1e3, 1e2, 1e1, 1, 1e-1]}]
test_architecture = MonolingualNetSvm(lXtr,
lytr,
first_tier_learner=get_learner(calibrate=True),
first_tier_parameters=None,
n_jobs=1)
test_architecture.fit()
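Note that preprocess_data is never invoked in the __main__ block above, while MonolingualNetSvm.load_preprocessed (in learning/learners.py) expects the pickles it writes; a one-off preprocessing run along these lines is presumably needed first (a sketch, not part of the commit):
# hypothetical one-off preprocessing step: writes rcv1-2_train.pickle and rcv1-2_test.pickle
# under /home/andreapdr/CLESA/preprocessed_dataset_nn/, which load_preprocessed later reads
data = MultilingualDataset.load(op.dataset)
lXtr, lytr = data.training()
lXte, lyte = data.test()
preprocess_data(lXtr, lXte, lytr, lyte)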

View File

@ -3,8 +3,9 @@ import pickle
from torchtext.vocab import Vectors
import torch
from abc import ABC, abstractmethod
from data.supervised import get_supervised_embeddings
from learning.supervised import get_supervised_embeddings
from util.decompositions import *
from util.SIF_embed import *
class PretrainedEmbeddings(ABC):
@ -151,7 +152,6 @@ class FastTextWikiNews(Vectors):
def __init__(self, cache, language="en", **kwargs):
url = self.url_base.format(language)
# name = self.path.format(language)
name = cache + self._name.format(language)
super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)
@ -211,44 +211,75 @@ class StorageEmbeddings:
def _add_embeddings_unsupervised(self, type, docs, vocs, max_label_space=300):
for lang in docs.keys():
nC = self.lang_U[lang].shape[1]
print(f'# [unsupervised-matrix {type}] for {lang}')
voc = np.asarray(list(zip(*sorted(vocs[lang].items(), key=lambda x: x[1])))[0])
self.lang_U[lang] = EmbeddingsAligned(type, self.path, lang, voc).vectors
# if self.lang_U[lang].shape[1] > dim != 0:
# print(f'unsupervised matrix has more dimensions ({self.lang_U[lang].shape[1]}) than'
# f' the allowed limit {dim}. Applying PCA(n_components={dim})')
# pca = PCA(n_components=dim)
# self.lang_U[lang] = pca.fit_transform(self.lang_U[lang])
print(f'Matrix U (weighted sum) of shape {self.lang_U[lang].shape}\n')
nC = self.lang_U[lang].shape[1]
if max_label_space == 0:
print(f'Computing optimal number of PCA components along matrices U')
optimal_n = get_optimal_dim(self.lang_U, 'U')
self.lang_U = run_pca(optimal_n, self.lang_U)
elif max_label_space < nC:
print(f'Applying PCA to unsupervised matrix U')
self.lang_U = run_pca(max_label_space, self.lang_U)
return
def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space, voc):
# if max_label_space == 0:
# print('Computing optimal number of PCA components along matrices S...')
# optimal_n = self.get_optimal_supervised_components(docs, labels)
# max_label_space = optimal_n
def _add_embeddings_supervised(self, docs, labels, reduction, max_label_space, voc):
only_well_represented_C = False # TODO testing
if only_well_represented_C:
labels = labels.copy()
min_prevalence = 0
print(f'# REDUCING LABELS TO min_prevalence = {min_prevalence} in order to compute WCE Matrix ...')
langs = list(docs.keys())
well_repr_cats = np.logical_and.reduce([labels[lang].sum(axis=0)>min_prevalence for lang in langs])
for lang in langs:
labels[lang] = labels[lang][:, well_repr_cats]
print(f'Target number reduced to: {labels[lang].shape[1]}\n')
for lang in docs.keys(): # compute supervised matrices S - then apply PCA
nC = self.lang_S[lang].shape[1]
print(f'# [supervised-matrix] for {lang}')
self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space, voc[lang], lang)
self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang],
reduction, max_label_space, voc[lang], lang)
nC = self.lang_S[lang].shape[1]
print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')
if max_label_space == 0:
if max_label_space == 0: # looking for best n_components analyzing explained_variance_ratio
print(f'Computing optimal number of PCA components along matrices S')
optimal_n = get_optimal_dim(self.lang_S, 'S')
print(f'Applying PCA(n_components={optimal_n})')
self.lang_S = run_pca(optimal_n, self.lang_S)
elif max_label_space < nC:
elif max_label_space == -1: # applying pca to the vertically stacked matrix of WCE embeddings
print(f'Computing PCA on vertically stacked WCE embeddings')
languages = self.lang_S.keys()
_temp_stack = np.vstack([self.lang_S[lang] for lang in languages]) # stacking WCE vertically
stacked_pca = PCA(n_components=_temp_stack.shape[1])
stacked_pca.fit(_temp_stack)
best_n = None
_r = stacked_pca.explained_variance_ratio_
_r = np.cumsum(_r)
plt.plot(_r, label='Stacked Supervised')
for i in range(len(_r) - 1, 1, -1):
delta = _r[i] - _r[i - 1]
if delta > 0:
best_n = i
break
plt.show()
stacked_pca = PCA(n_components=best_n)
stacked_pca.fit(_temp_stack)
print(f'Applying PCA(n_components={best_n})')
for lang in languages:
self.lang_S[lang] = stacked_pca.transform(self.lang_S[lang])
elif max_label_space <= nC: # <= (rather than <) so PCA can also be applied when max_label_space equals the initial dimension
print(f'Computing PCA on Supervised Matrix PCA(n_components:{max_label_space})')
self.lang_S = run_pca(max_label_space, self.lang_S)
return
def SIF_embeddings(self):
print('todo') # TODO
def _concatenate_embeddings(self, docs):
_r = dict()
for lang in self.lang_U.keys():
@ -259,13 +290,15 @@ class StorageEmbeddings:
if config['unsupervised']:
self._add_embeddings_unsupervised(config['we_type'], docs, vocs, config['dim_reduction_unsupervised'])
if config['supervised']:
self._add_emebeddings_supervised(docs, labels, config['reduction'], config['max_label_space'], vocs)
self._add_embeddings_supervised(docs, labels, config['reduction'], config['max_label_space'], vocs)
return self
def predict(self, config, docs):
if config['supervised'] and config['unsupervised']:
return self._concatenate_embeddings(docs)
# todo testing applying pca to hstack muse + wce
# _reduced = self._concatenate_embeddings(docs)
# return run_pca(300, _reduced)
elif config['supervised']:
_r = dict()
for lang in docs.keys():
@ -274,5 +307,5 @@ class StorageEmbeddings:
_r = dict()
for lang in docs.keys():
_r[lang] = docs[lang].dot(self.lang_U[lang])
return _r
return _r
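SIF_embeddings above is still a stub, but the commit also adds util/SIF_embed.py; a minimal sketch of how those helpers could be wired in, assuming a dense document-term weight matrix W (docs x vocab) and a word-vector matrix U (vocab x dim):
import numpy as np
from util.SIF_embed import remove_pc

def sif_document_embeddings(W, U, npc=1):
    """Weighted-average document vectors with the top principal component(s) removed (SIF)."""
    counts = np.maximum((W != 0).sum(axis=1), 1)   # non-zero weights per document
    emb = W.dot(U) / counts[:, None]               # weighted average of word vectors
    return remove_pc(emb, npc)                     # remove projection on the top principal component(s)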

View File

@ -1,6 +1,6 @@
import numpy as np
import time
from data.embeddings import WordEmbeddings, StorageEmbeddings
from learning.embeddings import WordEmbeddings, StorageEmbeddings
from scipy.sparse import issparse
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
@ -8,7 +8,8 @@ from sklearn.model_selection import KFold
from joblib import Parallel, delayed
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers.StandardizeTransformer import StandardizeTransformer
# from sklearn.decomposition import PCA
from sklearn.decomposition import PCA
from models.cnn_class import CNN_pdr
def _sort_if_sparse(X):
@ -214,11 +215,6 @@ class NaivePolylingualClassifier:
models = Parallel(n_jobs=self.n_jobs)\
(delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]),ly[lang]) for lang in langs)
#
# models = [MonolingualClassifier(self.base_learner, parameters=self.parameters) for lang in langs]
#
# for model, lang in zip(models, langs):
# model.fit(lX[lang], ly[lang])
self.model = {lang: models[i] for i, lang in enumerate(langs)}
self.empty_categories = {lang:self.model[lang].empty_categories for lang in langs}
@ -329,6 +325,132 @@ class MonolingualClassifier:
return self.best_params_
class AndreaCLF(FunnellingPolylingualClassifier):
def __init__(self,
we_path,
config,
first_tier_learner,
meta_learner,
first_tier_parameters=None,
meta_parameters=None,
folded_projections=1,
calmode='cal',
n_jobs=-1):
super().__init__(first_tier_learner,
meta_learner,
first_tier_parameters,
meta_parameters,
folded_projections,
calmode,
n_jobs)
self.pca_independent_space = PCA(n_components=50)
self.we_path = we_path
self.config = config
self.lang_word2idx = dict()
self.languages = []
self.lang_tfidf = {}
self.embedding_space = None
self.model = None
self.time = None
self.best_components = 'not set' # if auto optimize pca, it will store the optimal number of components
def vectorize(self, lX, prediction=False):
langs = list(lX.keys())
print(f'# tfidf-vectorizing docs')
if prediction:
for lang in langs:
assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language'
tfidf_vectorizer = self.lang_tfidf[lang]
lX[lang] = tfidf_vectorizer.transform(lX[lang])
return self
for lang in langs:
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
self.languages.append(lang)
tfidf_vectorizer.fit(lX[lang])
lX[lang] = tfidf_vectorizer.transform(lX[lang])
self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
self.lang_tfidf[lang] = tfidf_vectorizer
return self
def _get_zspace(self, lXtr, lYtr):
print('\nfitting the projectors... {}'.format(list(lXtr.keys())))
self.doc_projector.fit(lXtr, lYtr)
print('\nprojecting the documents')
lZ = self._projection(self.doc_projector, lXtr)
return lZ, lYtr
def fit(self, lX, ly):
tinit = time.time()
print('Vectorizing documents...')
self.vectorize(lX)
for lang in self.languages:
print(f'{lang}->{lX[lang].shape}')
Z, zy = self._get_zspace(lX, ly)
if self.config['supervised'] or self.config['unsupervised']:
self.embedding_space = StorageEmbeddings(self.we_path).fit(self.config, lX, self.lang_word2idx, ly)
_embedding_space = self.embedding_space.predict(self.config, lX)
if self.config['max_label_space'] == 0:
# PCA auto-search: store the number of supervised components actually kept, i.e. the stacked
# embedding width minus the 300 unsupervised (MUSE) dimensions when those are present
_cum_dimension = _embedding_space[list(_embedding_space.keys())[0]].shape[1]
if _cum_dimension - 300 > 0:
_temp = _cum_dimension - 300
else:
_temp = _cum_dimension
self.best_components = _temp
# h_stacking posterior probabilities with (U) and/or (S) matrices
for lang in self.languages:
Z[lang] = np.hstack((Z[lang], _embedding_space[lang]))
# stacking Z space vertically
_vertical_Z = np.vstack([Z[lang] for lang in self.languages])
_vertical_Zy = np.vstack([zy[lang] for lang in self.languages])
self.standardizer = StandardizeTransformer()
_vertical_Z = self.standardizer.fit_predict(_vertical_Z)
# todo testing ...
# if self.config['post_pca']:
# print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
# self.pca_independent_space.fit(_vertical_Z)
# _vertical_Z = self.pca_independent_space.transform(_vertical_Z)
print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
n_jobs=self.n_jobs)
self.model.fit(_vertical_Z, _vertical_Zy)
self.time = time.time() - tinit
print(f'\nTotal training time elapsed: {round((self.time/60), 2)} min')
def predict(self, lX, ly):
print('Vectorizing documents')
self.vectorize(lX, prediction=True)
lZ = self._projection(self.doc_projector, lX)
if self.config['supervised'] or self.config['unsupervised']:
_embedding_space = self.embedding_space.predict(self.config, lX)
for lang in lX.keys():
lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang]))
for lang in lZ.keys():
print(lZ[lang].shape)
# todo testing
lZ[lang] = self.standardizer.predict(lZ[lang])
# if self.config['post_pca']:
# print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
# lZ[lang] = self.pca_independent_space.transform(lZ[lang])
return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
class PolylingualEmbeddingsClassifier:
"""
This classifier creates document embeddings by a tfidf weighted average of polylingual embeddings from the article
@ -340,7 +462,7 @@ class PolylingualEmbeddingsClassifier:
}
url: https://github.com/facebookresearch/MUSE
"""
def __init__(self, wordembeddings_path, learner, c_parameters=None, n_jobs=-1):
def __init__(self, wordembeddings_path, config, learner, c_parameters=None, n_jobs=-1):
"""
:param wordembeddings_path: the path to the directory containing the polylingual embeddings
:param learner: the learner
@ -348,11 +470,15 @@ class PolylingualEmbeddingsClassifier:
:param n_jobs: the number of concurrent threads
"""
self.wordembeddings_path = wordembeddings_path
self.config = config
self.learner = learner
self.c_parameters=c_parameters
self.n_jobs = n_jobs
self.lang_tfidf = {}
self.model = None
self.languages = []
self.lang_word2idx = dict()
self.embedding_space = None
def fit_vectorizers(self, lX):
for lang in lX.keys():
@ -362,6 +488,27 @@ class PolylingualEmbeddingsClassifier:
tfidf.fit(docs)
self.lang_tfidf[lang] = tfidf
def vectorize(self, lX, prediction=False):
langs = list(lX.keys())
print(f'# tfidf-vectorizing docs')
if prediction:
for lang in langs:
assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language'
tfidf_vectorizer = self.lang_tfidf[lang]
lX[lang] = tfidf_vectorizer.transform(lX[lang])
return self
for lang in langs:
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
self.languages.append(lang)
tfidf_vectorizer.fit(lX[lang])
lX[lang] = tfidf_vectorizer.transform(lX[lang])
self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
self.lang_tfidf[lang] = tfidf_vectorizer
return self
def embed(self, docs, lang):
assert lang in self.lang_tfidf, 'unknown language'
tfidf_vectorizer = self.lang_tfidf[lang]
@ -394,31 +541,34 @@ class PolylingualEmbeddingsClassifier:
tinit = time.time()
langs = list(lX.keys())
WEtr, Ytr = [], []
self.fit_vectorizers(lX) # if already fit, does nothing
_lX = dict()
for lang in langs:
_lX[lang] = self.lang_tfidf[lang].transform(lX[lang])
WEtr.append(self.embed(lX[lang], lang))
Ytr.append(ly[lang])
# self.fit_vectorizers(lX) # if already fit, does nothing
self.vectorize(lX)
# config = {'unsupervised' : False, 'supervised': True}
self.embedding_space = StorageEmbeddings(self.wordembeddings_path).fit(self.config, lX, self.lang_word2idx, ly)
WEtr = self.embedding_space.predict(self.config, lX)
# for lang in langs:
# WEtr.append(self.embed(lX[lang], lang)) # todo embed with other matrices
# Ytr.append(ly[lang])
# TODO @Andrea --> here embeddings should be stacked horizontally!
WEtr = np.vstack(WEtr)
Ytr = np.vstack(Ytr)
WEtr = np.vstack([WEtr[lang] for lang in langs])
Ytr = np.vstack([ly[lang] for lang in langs])
self.embed_time = time.time() - tinit
print('fitting the WE-space of shape={}'.format(WEtr.shape))
self.model = MonolingualClassifier(base_learner=self.learner, parameters=self.c_parameters, n_jobs=self.n_jobs)
self.model.fit(_lX['da'], ly['da'])
self.model.fit(WEtr, Ytr)
self.time = time.time() - tinit
return self
def predict(self, lX):
def predict(self, lX, lY):
"""
:param lX: a dictionary {language_label: [list of preprocessed documents]}
"""
assert self.model is not None, 'predict called before fit'
self.vectorize(lX, prediction=True)
langs = list(lX.keys())
lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory
lWEte = self.embedding_space.predict(self.config, lX)
# lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory
return _joblib_transform_multiling(self.model.predict, lWEte, n_jobs=self.n_jobs)
def predict_proba(self, lX):
@ -427,44 +577,78 @@ class PolylingualEmbeddingsClassifier:
"""
assert self.model is not None, 'predict called before fit'
langs = list(lX.keys())
# lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory
return _joblib_transform_multiling(self.model.predict_proba, self.lang_tfidf['da'], n_jobs=self.n_jobs)
lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory
return _joblib_transform_multiling(self.model.predict_proba, lWEte, n_jobs=self.n_jobs)
def best_params(self):
return self.model.best_params()
class AndreaCLF(FunnellingPolylingualClassifier):
def __init__(self,
we_path,
config,
first_tier_learner,
meta_learner,
first_tier_parameters=None,
meta_parameters=None,
folded_projections=1,
calmode='cal',
n_jobs=-1):
super().__init__(first_tier_learner,
meta_learner,
first_tier_parameters,
meta_parameters,
folded_projections,
calmode,
n_jobs)
self.pca_independent_space = PCA(n_components=100)
self.we_path = we_path
self.config = config
self.lang_word2idx = dict()
class MonolingualNetSvm:
"""
testing: funnelling with a NN managing word-embedding compositionality. An ensemble of n SVMs (n equal to the
number of training languages) is first fit on the data, generating the document projections in the Z-space. Next,
the projections are fed to a single NN together with their respective document embeddings. The documents are
projected into the embedding space while preserving the embedding dimensionality (output dim is 300). These
projections are horizontally concatenated with the respective Z-space projections and passed through a fully
connected (fC) layer with sigmoid activation and output dim equal to the number of target classes.
# TODO ATM testing with only 1 language
"""
def __init__(self, lX, ly, first_tier_learner, first_tier_parameters, n_jobs):
self.lX = lX
self.ly = ly
# SVM Attributes
self.doc_projector = NaivePolylingualClassifier(first_tier_learner, first_tier_parameters,
n_jobs=n_jobs)
self.calmode = 'cal'
self.languages = []
self.lang_word2idx = dict()
self.lang_tfidf = {}
self.embedding_space = None
self.model = None
self.time = None
self.base_learner = 'TODO'
self.parameters = 'TODO'
# NN Attributes
self.NN = 'TODO'
def vectorize(self, lX, prediction=False):
def load_preprocessed(self):
"""
To speed up the process, documents are first tokenized in the __main__ script (see preprocess_data in
src/NN_FPEC_andrea.py); here the tokenized docs, word_index, and targets are loaded.
:return: dict[lang] = (word_index, tokenized_docs, targets)
"""
import pickle
with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'rb') as f:
return pickle.load(f)
def _build_embedding_matrix(self, lang, word_index):
"""
build embedding matrix by filtering out OOV embeddings
:param lang:
:param word_index:
:return: filtered embedding matrix
"""
from learning.embeddings import EmbeddingsAligned
type = 'MUSE'
path = '/home/andreapdr/CLESA/'
MUSE = EmbeddingsAligned(type, path, lang, word_index.keys())
return MUSE
def get_data_and_embed(self, data_dict):
from keras.preprocessing.sequence import pad_sequences
langs = data_dict.keys()
lang_embedding_matrix = dict()
nn_lXtr = dict()
nn_lytr = dict()
for lang in langs:
lang_embedding_matrix[lang] = self._build_embedding_matrix(lang, data_dict[lang][0])
nn_lXtr[lang] = pad_sequences(data_dict[lang][1], 100, padding='post')
nn_lytr[lang] = [data_dict[lang][2]]
return nn_lXtr, nn_lytr, lang_embedding_matrix
def svm_vectorize(self, lX, prediction=False):
langs = list(lX.keys())
print(f'# tfidf-vectorizing docs')
if prediction:
@ -473,7 +657,6 @@ class AndreaCLF(FunnellingPolylingualClassifier):
tfidf_vectorizer = self.lang_tfidf[lang]
lX[lang] = tfidf_vectorizer.transform(lX[lang])
return self
for lang in langs:
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
self.languages.append(lang)
@ -481,9 +664,8 @@ class AndreaCLF(FunnellingPolylingualClassifier):
lX[lang] = tfidf_vectorizer.transform(lX[lang])
self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
self.lang_tfidf[lang] = tfidf_vectorizer
return self
return lX
# @override std class method
def _get_zspace(self, lXtr, lYtr):
print('\nfitting the projectors... {}'.format(list(lXtr.keys())))
self.doc_projector.fit(lXtr, lYtr)
@ -493,57 +675,58 @@ class AndreaCLF(FunnellingPolylingualClassifier):
return lZ, lYtr
# @override std class method
def fit(self, lX, ly):
tinit = time.time()
print('Vectorizing documents...')
self.vectorize(lX)
def _projection(self, doc_projector, lX):
"""
Decides the projection function to be applied; predict_proba if the base classifiers are calibrated or
decision_function if otherwise
:param doc_projector: the document projector (a NaivePolylingualClassifier)
:param lX: {lang:matrix} to train
:return: the projection, applied with predict_proba or decision_function
"""
if self.calmode=='cal':
return doc_projector.predict_proba(lX)
else:
l_decision_scores = doc_projector.decision_function(lX)
if self.calmode=='sigmoid':
def sigmoid(x): return 1 / (1 + np.exp(-x))
for lang in l_decision_scores.keys():
l_decision_scores[lang] = sigmoid(l_decision_scores[lang])
return l_decision_scores
for lang in self.languages:
print(f'{lang}->{lX[lang].shape}')
def fit(self):
"""
# 1. Fit SVM to generate posterior probabilities:
# 1.1 Gather documents and vectorize them as in other SVM classifiers
# 2. Fit NN
# 2.1 Gather documents and build NN dataset by indexing wrt embedding matrix
# 2.2 Fit NN first-layer to generate compositional doc embedding
# 2.3 H-stack doc-embed and posterior P
# 2.4 Feed stacked vector to output layer (sigmoid act): output Nc
# 2.5 Train it...
"""
Z, zy = self._get_zspace(lX, ly)
# load pre-processed data
data_dict = self.load_preprocessed()
# build embedding matrices and neural network document training set
nn_lXtr, nn_lytr, lang_embedding_matrix = self.get_data_and_embed(data_dict)
# TF-IDF vectorizing documents for SVM classifier
svm_lX = self.svm_vectorize(self.lX)
if self.config['supervised'] or self.config['unsupervised']:
self.embedding_space = StorageEmbeddings(self.we_path).fit(self.config, lX, self.lang_word2idx, ly)
_embedding_space = self.embedding_space.predict(self.config, lX)
# h_stacking posterior probabilities with (U) and/or (S) matrices
for lang in self.languages:
Z[lang] = np.hstack((Z[lang], _embedding_space[lang]))
# just testing on a smaller subset of data
test_svm_lX = dict()
test_svm_ly = dict()
test_svm_lX['it'] = svm_lX['it'][:10, :]
test_svm_ly['it'] = self.ly['it'][:10, :]
test_nn_data = nn_lXtr['it'][:10]
# stacking Z space vertically
_vertical_Z = np.vstack([Z[lang] for lang in self.languages])
_vertical_Zy = np.vstack([zy[lang] for lang in self.languages])
# projecting document into Z space by SVM
svm_Z, _ = self._get_zspace(test_svm_lX, test_svm_ly)
# todo testing ...
# self.pca_independent_space.fit(_vertical_Z)
# _vertical_Z = self.pca_independent_space.transform(_vertical_Z)
# initializing net and forward pass
net = CNN_pdr(73, 1, 300, len(lang_embedding_matrix['it'].vectors), 300, lang_embedding_matrix['it'].vectors)
out = net.forward(test_nn_data, svm_Z['it'])
self.standardizer = StandardizeTransformer()
_vertical_Z = self.standardizer.fit_predict(_vertical_Z)
print('TODO')
print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
n_jobs=self.n_jobs)
self.model.fit(_vertical_Z, _vertical_Zy)
self.time = time.time() - tinit
print(f'\nTotal training time elapsed: {round((self.time/60), 2)} min')
def predict(self, lX, ly):
print('Vectorizing documents')
self.vectorize(lX, prediction=True)
lZ = self._projection(self.doc_projector, lX)
if self.config['supervised'] or self.config['unsupervised']:
_embedding_space = self.embedding_space.predict(self.config, lX)
for lang in lX.keys():
lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang]))
for lang in lZ.keys():
print(lZ[lang].shape)
# todo testing
# lZ[lang] = self.pca_independent_space.transform(lZ[lang])
lZ[lang] = self.standardizer.predict(lZ[lang])
return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
def net(self):
pass

src/models/cnn_class.py (new file, 42 lines)
View File

@ -0,0 +1,42 @@
import torch.nn as nn
from torch.nn import functional as F
import torch
class CNN_pdr(nn.Module):
def __init__(self, output_size, out_channels, compositional_dim, vocab_size, emb_dim, embeddings=None, drop_embedding_range=None,
drop_embedding_prop=0, drop_prob=0.5):
super(CNN_pdr, self).__init__()
self.vocab_size = vocab_size
self.emb_dim = emb_dim
self.embeddings = torch.FloatTensor(embeddings)
self.embedding_layer = nn.Embedding(vocab_size, emb_dim, _weight=self.embeddings)
self.kernel_heights = kernel_heights=[3,5,7]
self.stride = 1
self.padding = 0
self.drop_embedding_range = drop_embedding_range
self.drop_embedding_prop = drop_embedding_prop
assert 0 <= drop_embedding_prop <= 1, 'drop_embedding_prop: wrong range'
self.nC = 73 # number of target classes (hardcoded; should match output_size)
self.conv1 = nn.Conv2d(1, compositional_dim, (self.kernel_heights[0], self.emb_dim), self.stride, self.padding)
self.dropout = nn.Dropout(drop_prob)
self.label = nn.Linear(len(kernel_heights) * out_channels, output_size)
self.fC = nn.Linear(compositional_dim + self.nC, self.nC)
def forward(self, x, svm_output):
x = torch.LongTensor(x)
svm_output = torch.FloatTensor(svm_output)
x = self.embedding_layer(x)
x = self.conv1(x.unsqueeze(1))
x = F.relu(x.squeeze(3))
x = F.max_pool1d(x, x.size()[2]).squeeze(2)
x = torch.cat((x, svm_output), 1)
x = torch.sigmoid(self.fC(x))
return x #.detach().numpy()
# logits = self.label(x)
# return logits
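A minimal smoke test of the forward pass with toy dimensions matching the call in learners.py (73 target classes, 300-dim embeddings); the vocabulary size, batch size, and sequence length below are made up for illustration:
import numpy as np

V, L, B, nC = 1000, 100, 4, 73                          # toy vocabulary, padded length, batch, classes
emb_matrix = np.random.randn(V, 300).astype('float32')  # stand-in for a MUSE embedding matrix
docs = np.random.randint(0, V, size=(B, L))             # padded word-index sequences
posteriors = np.random.rand(B, nC).astype('float32')    # SVM posterior probabilities (Z-space)

net = CNN_pdr(output_size=nC, out_channels=1, compositional_dim=300,
              vocab_size=V, emb_dim=300, embeddings=emb_matrix)
out = net.forward(docs, posteriors)                     # tensor of shape (B, nC), sigmoid activations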

View File

@ -0,0 +1,7 @@
import pandas as pd
import numpy as np
df = pd.read_csv("/home/andreapdr/funneling_pdr/src/results/results.csv", delimiter='\t')
pivot = pd.pivot_table(df, values=['time', 'macrof1', 'microf1', 'macrok', 'microk'], index=['method', 'embed'], aggfunc=[np.mean, np.std])
print(pivot)
print('Finished ...')
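Since results.csv also carries a 'dataset' column (see util/results.py below), the same pivot can be restricted to a single run; a sketch, assuming the default RCV1/2 pickle name:
rcv = df[df['dataset'] == 'rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle']
print(pd.pivot_table(rcv, values=['macrof1', 'microf1'], index=['method', 'embed'], aggfunc=np.mean))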

src/util/SIF_embed.py (new file, 56 lines)
View File

@ -0,0 +1,56 @@
import numpy as np
from sklearn.decomposition import TruncatedSVD
def get_weighted_average(We, x, w):
"""
Compute the weighted average vectors
:param We: We[i,:] is the vector for word i
:param x: x[i, :] are the indices of the words in sentence i
:param w: w[i, :] are the weights for the words in sentence i
:return: emb[i, :] are the weighted average vector for sentence i
"""
n_samples = x.shape[0]
emb = np.zeros((n_samples, We.shape[1]))
for i in range(n_samples):
emb[i,:] = w[i,:].dot(We[x[i,:],:]) / np.count_nonzero(w[i,:])
return emb
def compute_pc(X,npc=1):
"""
Compute the principal components. DO NOT MAKE THE DATA ZERO MEAN!
:param X: X[i,:] is a data point
:param npc: number of principal components to remove
:return: component_[i,:] is the i-th pc
"""
svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
svd.fit(X)
return svd.components_
def remove_pc(X, npc=1):
"""
Remove the projection on the principal components
:param X: X[i,:] is a data point
:param npc: number of principal components to remove
:return: XX[i, :] is the data point after removing its projection
"""
pc = compute_pc(X, npc)
if npc==1:
XX = X - X.dot(pc.transpose()) * pc
else:
XX = X - X.dot(pc.transpose()).dot(pc)
return XX
def SIF_embedding(We, x, w, params):
"""
Compute the scores between pairs of sentences using weighted average + removing the projection on the first principal component
:param We: We[i,:] is the vector for word i
:param x: x[i, :] are the indices of the words in the i-th sentence
:param w: w[i, :] are the weights for the words in the i-th sentence
:param params.rmpc: if >0, remove the projections of the sentence embeddings to their first principal component
:return: emb, emb[i, :] is the embedding for sentence i
"""
emb = get_weighted_average(We, x, w)
if params.rmpc > 0:
emb = remove_pc(emb, params.rmpc)
return emb
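A small usage sketch with random data; params only needs an rmpc attribute here, so a SimpleNamespace stands in for whatever options object the original SIF code uses:
import numpy as np
from types import SimpleNamespace

We = np.random.randn(500, 300)           # 500 word vectors of dimension 300
x = np.random.randint(0, 500, (10, 20))  # 10 sentences, 20 word indices each
w = np.random.rand(10, 20)               # per-word weights (e.g. SIF or tf-idf weights)

emb = SIF_embedding(We, x, w, SimpleNamespace(rmpc=1))  # (10, 300) sentence embeddings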

View File

@ -2,6 +2,7 @@ from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
def run_pca(dim, X):
"""
:param dim: number of pca components to keep
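The hunk above only shows the top of util/decompositions.py; judging from the call sites in embeddings.py, run_pca reduces a {lang: matrix} dictionary with one PCA per language and get_optimal_dim picks a component count from the cumulative explained-variance curve. A rough sketch of that contract (not the actual implementation):
from sklearn.decomposition import PCA

def run_pca_sketch(dim, lang_matrices):
    """Reduce every per-language matrix to `dim` components, fitting one PCA per language."""
    return {lang: PCA(n_components=dim).fit_transform(X) for lang, X in lang_matrices.items()}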

View File

@ -5,7 +5,8 @@ import numpy as np
class PolylingualClassificationResults:
def __init__(self, file, autoflush=True, verbose=False):
self.file = file
self.columns = ['id', 'method', 'learner', 'embed', 'embed_type', 'optimp', 'dataset', 'time', 'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes']
self.columns = ['method', 'learner', 'embed', 'embed_type', 'pca_s', 'pca_u', 'optimp', 'dataset', 'time',
'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes']
self.autoflush = autoflush
self.verbose = verbose
if os.path.exists(file):
@ -20,8 +21,8 @@ class PolylingualClassificationResults:
def already_calculated(self, id):
return (self.df['id'] == id).any()
def add_row(self, id, method, learner, embed, embed_type, optimp, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
s = pd.Series([id, method, learner, embed, embed_type, optimp, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
def add_row(self, method, learner, embed, embed_type, pca_s, pca_u, optimp, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
s = pd.Series([method, learner, embed, embed_type, pca_s, pca_u, optimp, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
self.df = self.df.append(s, ignore_index=True)
if self.autoflush: self.flush()
self.tell(s.to_string())

View File

@ -1,3 +1,4 @@
from sklearn.svm import SVC
from tqdm import tqdm
import re
import sys
@ -9,3 +10,20 @@ def mask_numbers(data, number_mask='numbermask'):
for text in tqdm(data, desc='masking numbers'):
masked.append(mask.sub(number_mask, text))
return masked
def fill_missing_classes(lXtr, lytr):
pass
# FIXME: `op` (the parsed command-line options) is not defined in this module; it must be injected by the
# importing script (or C and optimc passed explicitly) before get_learner/get_params can be called
def get_learner(calibrate=False, kernel='linear'):
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto')
def get_params(dense=False):
if not op.optimc:
return None
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
kernel = 'rbf' if dense else 'linear'
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
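The grids returned by get_params are in scikit-learn's param_grid format, so (once the op issue above is resolved) they can be passed straight to a grid search; a minimal sketch:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

param_grid = [{'kernel': ['rbf'], 'C': [1e4, 1e3, 1e2, 1e1, 1, 1e-1], 'gamma': ['auto']}]
search = GridSearchCV(SVC(), param_grid=param_grid, cv=5, n_jobs=-1)
# search.fit(X, y)   # X, y: a vectorized training matrix and its labels from one of the scripts above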