merged?
This commit is contained in:
commit cfd3a609a2
@@ -6,7 +6,7 @@ from optparse import OptionParser
from util.file import exists
from util.results import PolylingualClassificationResults
from sklearn.svm import SVC
from util.util import get_learner, get_params

parser = OptionParser()

@@ -35,16 +35,22 @@ parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
parser.add_option("-j", "--n_jobs", dest="n_jobs", type=int,
                  help="Number of parallel jobs (default is -1, all)", default=-1)

parser.add_option("-p", "--pca", dest="max_labels", type=int,
                  help="If less than number of target classes, will apply PCA to supervised matrix. If set to 0 it"
                       " will automatically search for the best number of components", default=300)
parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
                  help="If smaller than number of target classes, PCA will be applied to supervised matrix. "
                       "If set to 0 it will automatically search for the best number of components. "
                       "If set to -1 it will apply PCA to the vstacked supervised matrix (PCA dim set to 50 atm)",
                  default=300)

parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
                  help="If smaller than Unsupervised Dimension, will apply PCA to unsupervised matrix. If set to 0 it"
                       " will automatically search for the best number of components", default=300)
                  help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix."
                       " If set to 0 it will automatically search for the best number of components", default=300)

parser.add_option("-l", dest="lang", type=str)

# parser.add_option("-a", dest="post_pca",
#                   help="If set to True, will apply PCA to the z-space (posterior probabilities stacked along with "
#                        "embedding space", default=False)


def get_learner(calibrate=False, kernel='linear'):
    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto')
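A minimal sketch of how the new --pca / --upca values are meant to be read downstream, based only on the help strings above (the helper name and the returned labels are illustrative, not part of this commit):

def pca_strategy(max_labels, current_dim):
    # 0  -> search automatically for the best number of components
    # -1 -> apply PCA to the vertically stacked supervised matrix (dim fixed to 50 atm)
    # k  -> apply PCA(n_components=k) only if k is smaller than the current dimensionality
    if max_labels == 0:
        return 'auto'
    if max_labels == -1:
        return 'stacked'
    return 'fixed' if max_labels < current_dim else 'none'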
@@ -73,13 +79,12 @@ if __name__ == '__main__':
    data = MultilingualDataset.load(op.dataset)
    data.show_dimensions()

    data.set_view(languages=['en','it', 'pt', 'sv'], categories=list(range(10)))
    # data.set_view(languages=['en','it', 'pt', 'sv'], categories=list(range(10)))
    # data.set_view(languages=[op.lang])
    # data.set_view(categories=list(range(10)))
    lXtr, lytr = data.training()
    lXte, lyte = data.test()

    if op.set_c != -1:
        meta_parameters = None
    else:
@@ -110,12 +115,12 @@ if __name__ == '__main__':
        config = {'unsupervised': True,
                  'supervised': True,
                  'we_type': op.we_type}
        _config_id = 'M_and_F'
        _config_id = 'M+F'

    ##### TODO - config dict is redundant - we already have the op argparse options ...
    config['reduction'] = 'PCA'
    config['max_label_space'] = op.max_labels
    config['max_label_space'] = op.max_labels_S
    config['dim_reduction_unsupervised'] = op.max_labels_U
    # config['post_pca'] = op.post_pca
    # config['plot_covariance_matrices'] = True

    result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')
@@ -125,7 +130,7 @@ if __name__ == '__main__':
                        config=config,
                        first_tier_learner=get_learner(calibrate=True),
                        meta_learner=get_learner(calibrate=False, kernel='rbf'),
                        first_tier_parameters=get_params(dense=False),
                        first_tier_parameters=None,  # TODO get_params(dense=False) --> first_tier should not be optimized - or not?
                        meta_parameters=get_params(dense=True),
                        n_jobs=op.n_jobs)
@@ -140,6 +145,8 @@ if __name__ == '__main__':
        macrof1, microf1, macrok, microk = l_eval[lang]
        metrics.append([macrof1, microf1, macrok, microk])
        print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
        results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, config['we_type'], op.optimc, op.dataset.split('/')[-1],
                        classifier.time, lang, macrof1, microf1, macrok, microk, '')
        results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
                        (config['max_label_space'], classifier.best_components),
                        config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
                        lang, macrof1, microf1, macrok, microk, '')
    print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
@ -0,0 +1,128 @@
|
|||
import os
|
||||
from dataset_builder import MultilingualDataset
|
||||
from learning.learners import *
|
||||
from util.evaluation import *
|
||||
from optparse import OptionParser
|
||||
from util.file import exists
|
||||
from util.results import PolylingualClassificationResults
|
||||
from util.util import get_learner, get_params
|
||||
|
||||
parser = OptionParser()
|
||||
|
||||
parser.add_option("-d", "--dataset", dest="dataset",
|
||||
help="Path to the multilingual dataset processed and stored in .pickle format",
|
||||
default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
|
||||
|
||||
parser.add_option("-o", "--output", dest="output",
|
||||
help="Result file", type=str, default='./results/results.csv')
|
||||
|
||||
parser.add_option("-e", "--mode-embed", dest="mode_embed",
|
||||
help="Set the embedding to be used [none, unsupervised, supervised, both]", type=str, default='none')
|
||||
|
||||
parser.add_option("-w", "--we-path", dest="we_path",
|
||||
help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/')
|
||||
|
||||
parser.add_option('-t', "--we-type", dest="we_type", help="Aligned embeddings to use [FastText, MUSE]", type=str,
|
||||
default='MUSE')
|
||||
|
||||
parser.add_option("-s", "--set_c", dest="set_c",type=float,
|
||||
help="Set the C parameter", default=1)
|
||||
|
||||
parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
|
||||
help="Optimize hyperparameters", default=False)
|
||||
|
||||
parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int,
|
||||
help="Number of parallel jobs (default is -1, all)", default=-1)
|
||||
|
||||
parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
|
||||
help="If smaller than number of target classes, PCA will be applied to supervised matrix. "
|
||||
"If set to 0 it will automatically search for the best number of components. "
|
||||
"If set to -1 it will apply PCA to the vstacked supervised matrix (PCA dim set to 50 atm)",
|
||||
default=300)
|
||||
|
||||
parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
|
||||
help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix."
|
||||
" If set to 0 it will automatically search for the best number of components", default=300)
|
||||
|
||||
parser.add_option("-l", dest="lang", type=str)
|
||||
|
||||
if __name__ == '__main__':
|
||||
(op, args) = parser.parse_args()
|
||||
|
||||
assert exists(op.dataset), 'Unable to find file '+str(op.dataset)
|
||||
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
|
||||
|
||||
dataset_file = os.path.basename(op.dataset)
|
||||
|
||||
results = PolylingualClassificationResults('./results/PLE_results.csv')
|
||||
|
||||
data = MultilingualDataset.load(op.dataset)
|
||||
data.show_dimensions()
|
||||
|
||||
# data.set_view(languages=['en','it', 'pt', 'sv'], categories=list(range(10)))
|
||||
# data.set_view(languages=[op.lang])
|
||||
# data.set_view(categories=list(range(10)))
|
||||
lXtr, lytr = data.training()
|
||||
lXte, lyte = data.test()
|
||||
|
||||
if op.set_c != -1:
|
||||
meta_parameters = None
|
||||
else:
|
||||
meta_parameters = [{'C': [1e3, 1e2, 1e1, 1, 1e-1]}]
|
||||
|
||||
# Embeddings and WCE config
|
||||
_available_mode = ['none', 'unsupervised', 'supervised', 'both']
|
||||
_available_type = ['MUSE', 'FastText']
|
||||
assert op.mode_embed in _available_mode, f'{op.mode_embed} not in {_available_mode}'
|
||||
assert op.we_type in _available_type, f'{op.we_type} not in {_available_type}'
|
||||
|
||||
if op.mode_embed == 'none':
|
||||
config = {'unsupervised': False,
|
||||
'supervised': False,
|
||||
'we_type': None}
|
||||
_config_id = 'None'
|
||||
elif op.mode_embed == 'unsupervised':
|
||||
config = {'unsupervised': True,
|
||||
'supervised': False,
|
||||
'we_type': op.we_type}
|
||||
_config_id = 'M'
|
||||
elif op.mode_embed == 'supervised':
|
||||
config = {'unsupervised': False,
|
||||
'supervised': True,
|
||||
'we_type': None}
|
||||
_config_id = 'F'
|
||||
elif op.mode_embed == 'both':
|
||||
config = {'unsupervised': True,
|
||||
'supervised': True,
|
||||
'we_type': op.we_type}
|
||||
_config_id = 'M+F'
|
||||
|
||||
config['reduction'] = 'PCA'
|
||||
config['max_label_space'] = op.max_labels_S
|
||||
config['dim_reduction_unsupervised'] = op.max_labels_U
|
||||
# config['post_pca'] = op.post_pca
|
||||
# config['plot_covariance_matrices'] = True
|
||||
|
||||
result_id = dataset_file + 'MLE_andrea' + _config_id + ('_optimC' if op.optimc else '')
|
||||
|
||||
ple = PolylingualEmbeddingsClassifier(wordembeddings_path='/home/andreapdr/CLESA/',
|
||||
config = config,
|
||||
learner=get_learner(calibrate=False),
|
||||
c_parameters=get_params(dense=False),
|
||||
n_jobs=op.n_jobs)
|
||||
|
||||
print('# Fitting ...')
|
||||
ple.fit(lXtr, lytr)
|
||||
|
||||
print('# Evaluating ...')
|
||||
ple_eval = evaluate_method(ple, lXte, lyte)
|
||||
|
||||
metrics = []
|
||||
for lang in lXte.keys():
|
||||
macrof1, microf1, macrok, microk = ple_eval[lang]
|
||||
metrics.append([macrof1, microf1, macrok, microk])
|
||||
print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
|
||||
results.add_row('MLE', 'svm', _config_id, config['we_type'],
|
||||
'no','no', op.optimc, op.dataset.split('/')[-1], ple.time,
|
||||
lang, macrof1, microf1, macrok, microk, '')
|
||||
print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
|
||||
|
|
@ -0,0 +1,92 @@
|
|||
from optparse import OptionParser
|
||||
from util.results import PolylingualClassificationResults
|
||||
from dataset_builder import MultilingualDataset
|
||||
from keras.preprocessing.text import Tokenizer
|
||||
from learning.learners import MonolingualNetSvm
|
||||
from sklearn.svm import SVC
|
||||
import pickle
|
||||
|
||||
parser = OptionParser()
|
||||
|
||||
parser.add_option("-d", "--dataset", dest="dataset",
|
||||
help="Path to the multilingual dataset processed and stored in .pickle format",
|
||||
default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
|
||||
|
||||
parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
|
||||
help="Optimize hyperparameters", default=False)
|
||||
|
||||
parser.add_option("-s", "--set_c", dest="set_c",type=float,
|
||||
help="Set the C parameter", default=1)
|
||||
|
||||
(op, args) = parser.parse_args()
|
||||
|
||||
|
||||
###################################################################################################################
|
||||
|
||||
def get_learner(calibrate=False, kernel='linear'):
|
||||
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto')
|
||||
|
||||
|
||||
def get_params(dense=False):
|
||||
if not op.optimc:
|
||||
return None
|
||||
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
|
||||
kernel = 'rbf' if dense else 'linear'
|
||||
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
|
||||
|
||||
|
||||
# PREPROCESS TEXT AND SAVE IT ... both for SVM and NN
|
||||
def preprocess_data(lXtr, lXte, lytr, lyte):
|
||||
tokenized_tr = dict()
|
||||
tokenized_te = dict()
|
||||
for lang in lXtr.keys():
|
||||
alltexts = ' '.join(lXtr[lang])
|
||||
tokenizer = Tokenizer()
|
||||
tokenizer.fit_on_texts(alltexts.split(' '))
|
||||
tokenizer.oov_token = len(tokenizer.word_index)+1
|
||||
# dumping train set
|
||||
sequences_tr = tokenizer.texts_to_sequences(lXtr[lang])
|
||||
tokenized_tr[lang] = (tokenizer.word_index, sequences_tr, lytr[lang])
|
||||
# dumping test set
|
||||
sequences_te = tokenizer.texts_to_sequences(lXte[lang])
|
||||
tokenized_te[lang] = (tokenizer.word_index, sequences_te, lyte[lang])
|
||||
|
||||
    with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'wb') as f:
        pickle.dump(tokenized_tr, f)

    with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_test.pickle', 'wb') as f:
        pickle.dump(tokenized_te, f)

    print('Successfully dumped data')
|
||||
# def load_preprocessed():
|
||||
# with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'rb') as f:
|
||||
# return pickle.load(f)
|
||||
#
|
||||
# def build_embedding_matrix(lang, word_index):
|
||||
# type = 'MUSE'
|
||||
# path = '/home/andreapdr/CLESA/'
|
||||
# MUSE = EmbeddingsAligned(type, path, lang, word_index.keys())
|
||||
# return MUSE
|
||||
|
||||
|
||||
########## MAIN #################################################################################################
|
||||
|
||||
if __name__ == '__main__':
|
||||
results = PolylingualClassificationResults('./results/NN_FPEC_results.csv')
|
||||
data = MultilingualDataset.load(op.dataset)
|
||||
lXtr, lytr = data.training()
|
||||
lXte, lyte = data.test()
|
||||
|
||||
if op.set_c != -1:
|
||||
meta_parameters = None
|
||||
else:
|
||||
meta_parameters = [{'C': [1e3, 1e2, 1e1, 1, 1e-1]}]
|
||||
|
||||
test_architecture = MonolingualNetSvm(lXtr,
|
||||
lytr,
|
||||
first_tier_learner=get_learner(calibrate=True),
|
||||
first_tier_parameters=None,
|
||||
n_jobs=1)
|
||||
|
||||
test_architecture.fit()
|
||||
|
|
@@ -3,8 +3,9 @@ import pickle
from torchtext.vocab import Vectors
import torch
from abc import ABC, abstractmethod
from data.supervised import get_supervised_embeddings
from learning.supervised import get_supervised_embeddings
from util.decompositions import *
from util.SIF_embed import *


class PretrainedEmbeddings(ABC):
@@ -48,7 +49,7 @@ class WordEmbeddings:
            print('loading pkl in {}'.format(we_path + '.pkl'))
            (worddim, we) = pickle.load(open(we_path + '.pkl', 'rb'))
        else:
            word_registry=set()
            word_registry = set()
            lines = open(we_path).readlines()
            nwords, dims = [int(x) for x in lines[0].split()]
            print('reading we of {} dimensions'.format(dims))
@@ -61,13 +62,13 @@ class WordEmbeddings:
            word, *vals = line.split()
            wordp = word_preprocessor(word) if word_preprocessor is not None else word
            if wordp:
                wordp=wordp[0]
                wordp = wordp[0]
                if wordp in word_registry:
                    print('warning: word <{}> generates a duplicate <{}> after preprocessing'.format(word, wordp))
                elif len(vals) == dims:
                    worddim[wordp] = index
                    we[index, :] = np.array(vals).astype(float)
                    index+=1
                    index += 1
            # else:
            #     print('warning: word <{}> generates an empty string after preprocessing'.format(word))
        we = we[:index]
@@ -151,7 +152,6 @@ class FastTextWikiNews(Vectors):

    def __init__(self, cache, language="en", **kwargs):
        url = self.url_base.format(language)
        # name = self.path.format(language)
        name = cache + self._name.format(language)
        super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)

@ -211,44 +211,75 @@ class StorageEmbeddings:
|
|||
|
||||
def _add_embeddings_unsupervised(self, type, docs, vocs, max_label_space=300):
|
||||
for lang in docs.keys():
|
||||
nC = self.lang_U[lang].shape[1]
|
||||
print(f'# [unsupervised-matrix {type}] for {lang}')
|
||||
voc = np.asarray(list(zip(*sorted(vocs[lang].items(), key=lambda x: x[1])))[0])
|
||||
self.lang_U[lang] = EmbeddingsAligned(type, self.path, lang, voc).vectors
|
||||
# if self.lang_U[lang].shape[1] > dim != 0:
|
||||
# print(f'unsupervised matrix has more dimensions ({self.lang_U[lang].shape[1]}) than'
|
||||
# f' the allowed limit {dim}. Applying PCA(n_components={dim})')
|
||||
# pca = PCA(n_components=dim)
|
||||
# self.lang_U[lang] = pca.fit_transform(self.lang_U[lang])
|
||||
print(f'Matrix U (weighted sum) of shape {self.lang_U[lang].shape}\n')
|
||||
nC = self.lang_U[lang].shape[1]
|
||||
if max_label_space == 0:
|
||||
print(f'Computing optimal number of PCA components along matrices U')
|
||||
optimal_n = get_optimal_dim(self.lang_U, 'U')
|
||||
self.lang_U = run_pca(optimal_n, self.lang_U)
|
||||
elif max_label_space < nC:
|
||||
print(f'Applying PCA to unsupervised matrix U')
|
||||
self.lang_U = run_pca(max_label_space, self.lang_U)
|
||||
|
||||
return
|
||||
|
||||
def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space, voc):
|
||||
# if max_label_space == 0:
|
||||
# print('Computing optimal number of PCA components along matrices S...')
|
||||
# optimal_n = self.get_optimal_supervised_components(docs, labels)
|
||||
# max_label_space = optimal_n
|
||||
def _add_embeddings_supervised(self, docs, labels, reduction, max_label_space, voc):
|
||||
only_well_represented_C = False # TODO testing
|
||||
if only_well_represented_C:
|
||||
labels = labels.copy()
|
||||
min_prevalence = 0
|
||||
print(f'# REDUCING LABELS TO min_prevalence = {min_prevalence} in order to compute WCE Matrix ...')
|
||||
langs = list(docs.keys())
|
||||
well_repr_cats = np.logical_and.reduce([labels[lang].sum(axis=0)>min_prevalence for lang in langs])
|
||||
for lang in langs:
|
||||
labels[lang] = labels[lang][:, well_repr_cats]
|
||||
print(f'Target number reduced to: {labels[lang].shape[1]}\n')
|
||||
|
||||
for lang in docs.keys(): # compute supervised matrices S - then apply PCA
|
||||
nC = self.lang_S[lang].shape[1]
|
||||
print(f'# [supervised-matrix] for {lang}')
|
||||
self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space, voc[lang], lang)
|
||||
self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang],
|
||||
reduction, max_label_space, voc[lang], lang)
|
||||
nC = self.lang_S[lang].shape[1]
|
||||
print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')
|
||||
|
||||
if max_label_space == 0:
|
||||
if max_label_space == 0: # looking for best n_components analyzing explained_variance_ratio
|
||||
print(f'Computing optimal number of PCA components along matrices S')
|
||||
optimal_n = get_optimal_dim(self.lang_S, 'S')
|
||||
print(f'Applying PCA(n_components={optimal_n})')
|
||||
self.lang_S = run_pca(optimal_n, self.lang_S)
|
||||
elif max_label_space < nC:
|
||||
elif max_label_space == -1: # applying pca to the verticals stacked matrix of WCE embeddings
|
||||
print(f'Computing PCA on vertical stacked WCE embeddings')
|
||||
languages = self.lang_S.keys()
|
||||
_temp_stack = np.vstack([self.lang_S[lang] for lang in languages]) # stacking WCE vertically
|
||||
stacked_pca = PCA(n_components=_temp_stack.shape[1])
|
||||
stacked_pca.fit(_temp_stack)
|
||||
best_n = None
|
||||
_r = stacked_pca.explained_variance_ratio_
|
||||
_r = np.cumsum(_r)
|
||||
plt.plot(_r, label='Stacked Supervised')
|
||||
for i in range(len(_r) - 1, 1, -1):
|
||||
delta = _r[i] - _r[i - 1]
|
||||
if delta > 0:
|
||||
best_n = i
|
||||
break
|
||||
plt.show()
|
||||
            stacked_pca = PCA(n_components=best_n)
            stacked_pca.fit(_temp_stack)
            print(f'Applying PCA(n_components={best_n})')
            for lang in languages:
                self.lang_S[lang] = stacked_pca.transform(self.lang_S[lang])
elif max_label_space <= nC: # less or equal in order to reduce it to the same initial dimension
|
||||
print(f'Computing PCA on Supervised Matrix PCA(n_components:{max_label_space})')
|
||||
self.lang_S = run_pca(max_label_space, self.lang_S)
|
||||
|
||||
return
|
||||
|
||||
def SIF_embeddings(self):
|
||||
print('todo') # TODO
|
||||
|
||||
def _concatenate_embeddings(self, docs):
|
||||
_r = dict()
|
||||
for lang in self.lang_U.keys():
|
||||
|
|
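For reference, the component-selection idea used for the supervised matrices above (fit a full PCA, inspect the cumulative explained_variance_ratio_, keep the smallest number of components reaching a target share of variance) can be sketched in isolation; the 0.99 threshold and the function name are assumptions, not values taken from this commit:

import numpy as np
from sklearn.decomposition import PCA

def n_components_for_variance(X, threshold=0.99):
    pca = PCA(n_components=min(X.shape))                  # full decomposition
    pca.fit(X)
    cumulative = np.cumsum(pca.explained_variance_ratio_)
    return int(np.argmax(cumulative >= threshold)) + 1    # smallest n reaching the threshold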
@ -259,13 +290,15 @@ class StorageEmbeddings:
|
|||
if config['unsupervised']:
|
||||
self._add_embeddings_unsupervised(config['we_type'], docs, vocs, config['dim_reduction_unsupervised'])
|
||||
if config['supervised']:
|
||||
self._add_emebeddings_supervised(docs, labels, config['reduction'], config['max_label_space'], vocs)
|
||||
self._add_embeddings_supervised(docs, labels, config['reduction'], config['max_label_space'], vocs)
|
||||
return self
|
||||
|
||||
|
||||
def predict(self, config, docs):
|
||||
if config['supervised'] and config['unsupervised']:
|
||||
return self._concatenate_embeddings(docs)
|
||||
# todo testing applying pca to hstack muse + wce
|
||||
# _reduced = self._concatenate_embeddings(docs)
|
||||
# return run_pca(300, _reduced)
|
||||
elif config['supervised']:
|
||||
_r = dict()
|
||||
for lang in docs.keys():
|
||||
|
|
@ -274,5 +307,5 @@ class StorageEmbeddings:
|
|||
_r = dict()
|
||||
for lang in docs.keys():
|
||||
_r[lang] = docs[lang].dot(self.lang_U[lang])
|
||||
return _r
|
||||
|
||||
return _r
|
||||
|
|
@@ -1,6 +1,6 @@
import numpy as np
import time
from data.embeddings import WordEmbeddings, StorageEmbeddings
from learning.embeddings import WordEmbeddings, StorageEmbeddings
from scipy.sparse import issparse
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
@@ -8,7 +8,8 @@ from sklearn.model_selection import KFold
from joblib import Parallel, delayed
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers.StandardizeTransformer import StandardizeTransformer
# from sklearn.decomposition import PCA
from sklearn.decomposition import PCA
from models.cnn_class import CNN_pdr


def _sort_if_sparse(X):
@@ -214,11 +215,6 @@ class NaivePolylingualClassifier:

        models = Parallel(n_jobs=self.n_jobs)\
            (delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]), ly[lang]) for lang in langs)
        #
        # models = [MonolingualClassifier(self.base_learner, parameters=self.parameters) for lang in langs]
        #
        # for model, lang in zip(models, langs):
        #     model.fit(lX[lang], ly[lang])

        self.model = {lang: models[i] for i, lang in enumerate(langs)}
        self.empty_categories = {lang: self.model[lang].empty_categories for lang in langs}
@ -329,6 +325,132 @@ class MonolingualClassifier:
|
|||
return self.best_params_
|
||||
|
||||
|
||||
class AndreaCLF(FunnellingPolylingualClassifier):
|
||||
def __init__(self,
|
||||
we_path,
|
||||
config,
|
||||
first_tier_learner,
|
||||
meta_learner,
|
||||
first_tier_parameters=None,
|
||||
meta_parameters=None,
|
||||
folded_projections=1,
|
||||
calmode='cal',
|
||||
n_jobs=-1):
|
||||
|
||||
super().__init__(first_tier_learner,
|
||||
meta_learner,
|
||||
first_tier_parameters,
|
||||
meta_parameters,
|
||||
folded_projections,
|
||||
calmode,
|
||||
n_jobs)
|
||||
|
||||
self.pca_independent_space = PCA(n_components=50)
|
||||
self.we_path = we_path
|
||||
self.config = config
|
||||
self.lang_word2idx = dict()
|
||||
self.languages = []
|
||||
self.lang_tfidf = {}
|
||||
self.embedding_space = None
|
||||
self.model = None
|
||||
self.time = None
|
||||
self.best_components = 'not set' # if auto optimize pca, it will store the optimal number of components
|
||||
|
||||
def vectorize(self, lX, prediction=False):
|
||||
langs = list(lX.keys())
|
||||
print(f'# tfidf-vectorizing docs')
|
||||
if prediction:
|
||||
|
||||
for lang in langs:
|
||||
assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language'
|
||||
tfidf_vectorizer = self.lang_tfidf[lang]
|
||||
lX[lang] = tfidf_vectorizer.transform(lX[lang])
|
||||
return self
|
||||
|
||||
for lang in langs:
|
||||
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
|
||||
self.languages.append(lang)
|
||||
tfidf_vectorizer.fit(lX[lang])
|
||||
lX[lang] = tfidf_vectorizer.transform(lX[lang])
|
||||
self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
|
||||
self.lang_tfidf[lang] = tfidf_vectorizer
|
||||
return self
|
||||
|
||||
def _get_zspace(self, lXtr, lYtr):
|
||||
print('\nfitting the projectors... {}'.format(list(lXtr.keys())))
|
||||
self.doc_projector.fit(lXtr, lYtr)
|
||||
|
||||
print('\nprojecting the documents')
|
||||
lZ = self._projection(self.doc_projector, lXtr)
|
||||
|
||||
return lZ, lYtr
|
||||
|
||||
def fit(self, lX, ly):
|
||||
tinit = time.time()
|
||||
print('Vectorizing documents...')
|
||||
self.vectorize(lX)
|
||||
|
||||
for lang in self.languages:
|
||||
print(f'{lang}->{lX[lang].shape}')
|
||||
|
||||
Z, zy = self._get_zspace(lX, ly)
|
||||
|
||||
if self.config['supervised'] or self.config['unsupervised']:
|
||||
self.embedding_space = StorageEmbeddings(self.we_path).fit(self.config, lX, self.lang_word2idx, ly)
|
||||
_embedding_space = self.embedding_space.predict(self.config, lX)
|
||||
if self.config['max_label_space'] == 0:
|
||||
_cum_dimension = _embedding_space[list(_embedding_space.keys())[0]].shape[1]
|
||||
if _cum_dimension - 300 > 0:
|
||||
_temp = _cum_dimension - 300
|
||||
else:
|
||||
_temp = _cum_dimension
|
||||
self.best_components = _temp
|
||||
# h_stacking posterior probabilities with (U) and/or (S) matrices
|
||||
for lang in self.languages:
|
||||
Z[lang] = np.hstack((Z[lang], _embedding_space[lang]))
|
||||
|
||||
# stacking Z space vertically
|
||||
_vertical_Z = np.vstack([Z[lang] for lang in self.languages])
|
||||
_vertical_Zy = np.vstack([zy[lang] for lang in self.languages])
|
||||
|
||||
self.standardizer = StandardizeTransformer()
|
||||
_vertical_Z = self.standardizer.fit_predict(_vertical_Z)
|
||||
|
||||
# todo testing ...
|
||||
# if self.config['post_pca']:
|
||||
# print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
|
||||
# self.pca_independent_space.fit(_vertical_Z)
|
||||
# _vertical_Z = self.pca_independent_space.transform(_vertical_Z)
|
||||
|
||||
print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
|
||||
self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
|
||||
n_jobs=self.n_jobs)
|
||||
self.model.fit(_vertical_Z, _vertical_Zy)
|
||||
self.time = time.time() - tinit
|
||||
print(f'\nTotal training time elapsed: {round((self.time/60), 2)} min')
|
||||
|
||||
def predict(self, lX, ly):
|
||||
print('Vectorizing documents')
|
||||
self.vectorize(lX, prediction=True)
|
||||
lZ = self._projection(self.doc_projector, lX)
|
||||
|
||||
if self.config['supervised'] or self.config['unsupervised']:
|
||||
_embedding_space = self.embedding_space.predict(self.config, lX)
|
||||
|
||||
for lang in lX.keys():
|
||||
lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang]))
|
||||
|
||||
for lang in lZ.keys():
|
||||
print(lZ[lang].shape)
|
||||
# todo testing
|
||||
lZ[lang] = self.standardizer.predict(lZ[lang])
|
||||
# if self.config['post_pca']:
|
||||
# print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
|
||||
# lZ[lang] = self.pca_independent_space.transform(lZ[lang])
|
||||
|
||||
return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
|
||||
|
||||
|
||||
class PolylingualEmbeddingsClassifier:
|
||||
"""
|
||||
This classifier creates document embeddings by a tfidf weighted average of polylingual embeddings from the article
|
||||
|
|
@ -340,7 +462,7 @@ class PolylingualEmbeddingsClassifier:
|
|||
}
|
||||
url: https://github.com/facebookresearch/MUSE
|
||||
"""
|
||||
def __init__(self, wordembeddings_path, learner, c_parameters=None, n_jobs=-1):
|
||||
def __init__(self, wordembeddings_path, config, learner, c_parameters=None, n_jobs=-1):
|
||||
"""
|
||||
:param wordembeddings_path: the path to the directory containing the polylingual embeddings
|
||||
:param learner: the learner
|
||||
|
|
@ -348,11 +470,15 @@ class PolylingualEmbeddingsClassifier:
|
|||
:param n_jobs: the number of concurrent threads
|
||||
"""
|
||||
self.wordembeddings_path = wordembeddings_path
|
||||
self.config = config
|
||||
self.learner = learner
|
||||
self.c_parameters=c_parameters
|
||||
self.n_jobs = n_jobs
|
||||
self.lang_tfidf = {}
|
||||
self.model = None
|
||||
self.languages = []
|
||||
self.lang_word2idx = dict()
|
||||
self.embedding_space = None
|
||||
|
||||
def fit_vectorizers(self, lX):
|
||||
for lang in lX.keys():
|
||||
|
|
@ -362,6 +488,27 @@ class PolylingualEmbeddingsClassifier:
|
|||
tfidf.fit(docs)
|
||||
self.lang_tfidf[lang] = tfidf
|
||||
|
||||
|
||||
def vectorize(self, lX, prediction=False):
|
||||
langs = list(lX.keys())
|
||||
print(f'# tfidf-vectorizing docs')
|
||||
if prediction:
|
||||
|
||||
for lang in langs:
|
||||
assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language'
|
||||
tfidf_vectorizer = self.lang_tfidf[lang]
|
||||
lX[lang] = tfidf_vectorizer.transform(lX[lang])
|
||||
return self
|
||||
|
||||
for lang in langs:
|
||||
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
|
||||
self.languages.append(lang)
|
||||
tfidf_vectorizer.fit(lX[lang])
|
||||
lX[lang] = tfidf_vectorizer.transform(lX[lang])
|
||||
self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
|
||||
self.lang_tfidf[lang] = tfidf_vectorizer
|
||||
return self
|
||||
|
||||
def embed(self, docs, lang):
|
||||
assert lang in self.lang_tfidf, 'unknown language'
|
||||
tfidf_vectorizer = self.lang_tfidf[lang]
|
||||
|
|
@ -394,31 +541,34 @@ class PolylingualEmbeddingsClassifier:
|
|||
tinit = time.time()
|
||||
langs = list(lX.keys())
|
||||
WEtr, Ytr = [], []
|
||||
self.fit_vectorizers(lX) # if already fit, does nothing
|
||||
_lX = dict()
|
||||
for lang in langs:
|
||||
_lX[lang] = self.lang_tfidf[lang].transform(lX[lang])
|
||||
WEtr.append(self.embed(lX[lang], lang))
|
||||
Ytr.append(ly[lang])
|
||||
# self.fit_vectorizers(lX) # if already fit, does nothing
|
||||
self.vectorize(lX)
|
||||
# config = {'unsupervised' : False, 'supervised': True}
|
||||
self.embedding_space = StorageEmbeddings(self.wordembeddings_path).fit(self.config, lX, self.lang_word2idx, ly)
|
||||
WEtr = self.embedding_space.predict(self.config, lX)
|
||||
# for lang in langs:
|
||||
# WEtr.append(self.embed(lX[lang], lang)) # todo embed with other matrices
|
||||
# Ytr.append(ly[lang])
|
||||
|
||||
# TODO @Andrea --> here embeddings should be stacked horizontally!
|
||||
WEtr = np.vstack(WEtr)
|
||||
Ytr = np.vstack(Ytr)
|
||||
WEtr = np.vstack([WEtr[lang] for lang in langs])
|
||||
Ytr = np.vstack([ly[lang] for lang in langs])
|
||||
self.embed_time = time.time() - tinit
|
||||
|
||||
print('fitting the WE-space of shape={}'.format(WEtr.shape))
|
||||
self.model = MonolingualClassifier(base_learner=self.learner, parameters=self.c_parameters, n_jobs=self.n_jobs)
|
||||
self.model.fit(_lX['da'], ly['da'])
|
||||
self.model.fit(WEtr, Ytr)
|
||||
self.time = time.time() - tinit
|
||||
return self
|
||||
|
||||
def predict(self, lX):
|
||||
def predict(self, lX, lY):
|
||||
"""
|
||||
:param lX: a dictionary {language_label: [list of preprocessed documents]}
|
||||
"""
|
||||
assert self.model is not None, 'predict called before fit'
|
||||
self.vectorize(lX, prediction=True)
|
||||
langs = list(lX.keys())
|
||||
lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory
|
||||
lWEte = self.embedding_space.predict(self.config, lX)
|
||||
# lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory
|
||||
return _joblib_transform_multiling(self.model.predict, lWEte, n_jobs=self.n_jobs)
|
||||
|
||||
def predict_proba(self, lX):
|
||||
|
|
@ -427,44 +577,78 @@ class PolylingualEmbeddingsClassifier:
|
|||
"""
|
||||
assert self.model is not None, 'predict called before fit'
|
||||
langs = list(lX.keys())
|
||||
# lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory
|
||||
return _joblib_transform_multiling(self.model.predict_proba, self.lang_tfidf['da'], n_jobs=self.n_jobs)
|
||||
lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory
|
||||
return _joblib_transform_multiling(self.model.predict_proba, lWEte, n_jobs=self.n_jobs)
|
||||
|
||||
def best_params(self):
|
||||
return self.model.best_params()
|
||||
|
||||
|
||||
class AndreaCLF(FunnellingPolylingualClassifier):
|
||||
def __init__(self,
|
||||
we_path,
|
||||
config,
|
||||
first_tier_learner,
|
||||
meta_learner,
|
||||
first_tier_parameters=None,
|
||||
meta_parameters=None,
|
||||
folded_projections=1,
|
||||
calmode='cal',
|
||||
n_jobs=-1):
|
||||
|
||||
super().__init__(first_tier_learner,
|
||||
meta_learner,
|
||||
first_tier_parameters,
|
||||
meta_parameters,
|
||||
folded_projections,
|
||||
calmode,
|
||||
n_jobs)
|
||||
|
||||
self.pca_independent_space = PCA(n_components=100)
|
||||
self.we_path = we_path
|
||||
self.config = config
|
||||
self.lang_word2idx = dict()
|
||||
class MonolingualNetSvm:
    """
    testing: funnelling with a NN managing word-embedding compositionality. An ensemble of n SVMs (n equal to the
    number of training languages) is first fit on the data, generating the document projections in the Z-space.
    Next, the projections are fed to a single NN together with their respective document embeddings. The documents
    are projected into the embedding space while preserving their dimensionality (output dim is 300). These
    projections are horizontally concatenated with the respective Z-space projections and passed through a fully
    connected layer with sigmoid activation and output dim equal to the number of target classes.
    # TODO ATM testing with only 1 language
    """
def __init__(self, lX, ly, first_tier_learner, first_tier_parameters, n_jobs):
|
||||
self.lX = lX
|
||||
self.ly = ly
|
||||
# SVM Attributes
|
||||
self.doc_projector = NaivePolylingualClassifier(first_tier_learner, first_tier_parameters,
|
||||
n_jobs=n_jobs)
|
||||
self.calmode = 'cal'
|
||||
self.languages = []
|
||||
self.lang_word2idx = dict()
|
||||
self.lang_tfidf = {}
|
||||
self.embedding_space = None
|
||||
self.model = None
|
||||
self.time = None
|
||||
self.base_learner = 'TODO'
|
||||
self.parameters = 'TODO'
|
||||
# NN Attributes
|
||||
self.NN = 'TODO'
|
||||
|
||||
def vectorize(self, lX, prediction=False):
|
||||
|
||||
def load_preprocessed(self):
|
||||
"""
|
||||
in order to speed up the process, documents are first tokenized in the "main". Here, tokenized docs, word_index, and
|
||||
targets are loaded.
|
||||
:return: dict[lang] = (word_index, tokenized_docs, targets)
|
||||
"""
|
||||
import pickle
|
||||
with open('/home/andreapdr/CLESA/preprocessed_dataset_nn/rcv1-2_train.pickle', 'rb') as f:
|
||||
return pickle.load(f)
|
||||
|
||||
def _build_embedding_matrix(self, lang, word_index):
|
||||
"""
|
||||
build embedding matrix by filtering out OOV embeddings
|
||||
:param lang:
|
||||
:param word_index:
|
||||
:return: filtered embedding matrix
|
||||
"""
|
||||
from learning.embeddings import EmbeddingsAligned
|
||||
type = 'MUSE'
|
||||
path = '/home/andreapdr/CLESA/'
|
||||
MUSE = EmbeddingsAligned(type, path, lang, word_index.keys())
|
||||
return MUSE
|
||||
|
||||
def get_data_and_embed(self, data_dict):
|
||||
from keras.preprocessing.sequence import pad_sequences
|
||||
|
||||
langs = data_dict.keys()
|
||||
lang_embedding_matrix = dict()
|
||||
nn_lXtr = dict()
|
||||
nn_lytr = dict()
|
||||
|
||||
for lang in langs:
|
||||
lang_embedding_matrix[lang] = self._build_embedding_matrix(lang, data_dict[lang][0])
|
||||
nn_lXtr[lang] = pad_sequences(data_dict[lang][1], 100, padding='post')
|
||||
nn_lytr[lang] = [data_dict[lang][2]]
|
||||
|
||||
return nn_lXtr, nn_lytr, lang_embedding_matrix
|
||||
|
||||
def svm_vectorize(self, lX, prediction=False):
|
||||
langs = list(lX.keys())
|
||||
print(f'# tfidf-vectorizing docs')
|
||||
if prediction:
|
||||
|
|
@ -473,7 +657,6 @@ class AndreaCLF(FunnellingPolylingualClassifier):
|
|||
tfidf_vectorizer = self.lang_tfidf[lang]
|
||||
lX[lang] = tfidf_vectorizer.transform(lX[lang])
|
||||
return self
|
||||
|
||||
for lang in langs:
|
||||
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
|
||||
self.languages.append(lang)
|
||||
|
|
@ -481,9 +664,8 @@ class AndreaCLF(FunnellingPolylingualClassifier):
|
|||
lX[lang] = tfidf_vectorizer.transform(lX[lang])
|
||||
self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
|
||||
self.lang_tfidf[lang] = tfidf_vectorizer
|
||||
return self
|
||||
return lX
|
||||
|
||||
# @override std class method
|
||||
def _get_zspace(self, lXtr, lYtr):
|
||||
print('\nfitting the projectors... {}'.format(list(lXtr.keys())))
|
||||
self.doc_projector.fit(lXtr, lYtr)
|
||||
|
|
@ -493,57 +675,58 @@ class AndreaCLF(FunnellingPolylingualClassifier):
|
|||
|
||||
return lZ, lYtr
|
||||
|
||||
# @override std class method
|
||||
def fit(self, lX, ly):
|
||||
tinit = time.time()
|
||||
print('Vectorizing documents...')
|
||||
self.vectorize(lX)
|
||||
def _projection(self, doc_projector, lX):
|
||||
"""
|
||||
Decides the projection function to be applied; predict_proba if the base classifiers are calibrated or
|
||||
decision_function if otherwise
|
||||
:param doc_projector: the document projector (a NaivePolylingualClassifier)
|
||||
:param lX: {lang:matrix} to train
|
||||
:return: the projection, applied with predict_proba or decision_function
|
||||
"""
|
||||
if self.calmode=='cal':
|
||||
return doc_projector.predict_proba(lX)
|
||||
else:
|
||||
l_decision_scores = doc_projector.decision_function(lX)
|
||||
if self.calmode=='sigmoid':
|
||||
def sigmoid(x): return 1 / (1 + np.exp(-x))
|
||||
for lang in l_decision_scores.keys():
|
||||
l_decision_scores[lang] = sigmoid(l_decision_scores[lang])
|
||||
return l_decision_scores
|
||||
|
||||
for lang in self.languages:
|
||||
print(f'{lang}->{lX[lang].shape}')
|
||||
def fit(self):
|
||||
"""
|
||||
# 1. Fit SVM to generate posterior probabilities:
|
||||
# 1.1 Gather documents and vectorize them as in other SVM classifiers
|
||||
# 2. Fit NN
|
||||
# 2.1 Gather documents and build NN dataset by indexing wrt embedding matrix
|
||||
# 2.2 Fit NN first-layer to generate compositional doc embedding
|
||||
# 2.3 H-stack doc-embed and posterior P
|
||||
# 2.4 Feed stacked vector to output layer (sigmoid act): output Nc
|
||||
# 2.5 Train it...
|
||||
"""
|
||||
|
||||
Z, zy = self._get_zspace(lX, ly)
|
||||
# load pre-processed data
|
||||
data_dict = self.load_preprocessed()
|
||||
# build embedding matrices and neural network document training set
|
||||
        nn_lXtr, nn_lytr, lang_embedding_matrix = self.get_data_and_embed(data_dict)
        # TF-IDF vectorizing documents for the SVM classifier
        svm_lX = self.svm_vectorize(self.lX)
|
||||
if self.config['supervised'] or self.config['unsupervised']:
|
||||
self.embedding_space = StorageEmbeddings(self.we_path).fit(self.config, lX, self.lang_word2idx, ly)
|
||||
_embedding_space = self.embedding_space.predict(self.config, lX)
|
||||
# h_stacking posterior probabilities with (U) and/or (S) matrices
|
||||
for lang in self.languages:
|
||||
Z[lang] = np.hstack((Z[lang], _embedding_space[lang]))
|
||||
# just testing on a smaller subset of data
|
||||
test_svm_lX = dict()
|
||||
test_svm_ly = dict()
|
||||
test_svm_lX['it'] = svm_lX['it'][:10, :]
|
||||
test_svm_ly['it'] = self.ly['it'][:10, :]
|
||||
test_nn_data = nn_lXtr['it'][:10]
|
||||
|
||||
# stacking Z space vertically
|
||||
_vertical_Z = np.vstack([Z[lang] for lang in self.languages])
|
||||
_vertical_Zy = np.vstack([zy[lang] for lang in self.languages])
|
||||
# projecting document into Z space by SVM
|
||||
svm_Z, _ = self._get_zspace(test_svm_lX, test_svm_ly)
|
||||
|
||||
# todo testing ...
|
||||
# self.pca_independent_space.fit(_vertical_Z)
|
||||
# _vertical_Z = self.pca_independent_space.transform(_vertical_Z)
|
||||
# initializing net and forward pass
|
||||
net = CNN_pdr(73, 1, 300, len(lang_embedding_matrix['it'].vectors), 300, lang_embedding_matrix['it'].vectors)
|
||||
out = net.forward(test_nn_data, svm_Z['it'])
|
||||
|
||||
self.standardizer = StandardizeTransformer()
|
||||
_vertical_Z = self.standardizer.fit_predict(_vertical_Z)
|
||||
print('TODO')
|
||||
|
||||
print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
|
||||
self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
|
||||
n_jobs=self.n_jobs)
|
||||
self.model.fit(_vertical_Z, _vertical_Zy)
|
||||
self.time = time.time() - tinit
|
||||
print(f'\nTotal training time elapsed: {round((self.time/60), 2)} min')
|
||||
|
||||
def predict(self, lX, ly):
|
||||
print('Vectorizing documents')
|
||||
self.vectorize(lX, prediction=True)
|
||||
lZ = self._projection(self.doc_projector, lX)
|
||||
|
||||
if self.config['supervised'] or self.config['unsupervised']:
|
||||
_embedding_space = self.embedding_space.predict(self.config, lX)
|
||||
|
||||
for lang in lX.keys():
|
||||
lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang]))
|
||||
|
||||
for lang in lZ.keys():
|
||||
print(lZ[lang].shape)
|
||||
# todo testing
|
||||
# lZ[lang] = self.pca_independent_space.transform(lZ[lang])
|
||||
lZ[lang] = self.standardizer.predict(lZ[lang])
|
||||
|
||||
return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
|
||||
def net(self):
|
||||
pass
|
||||
|
|
@@ -0,0 +1,42 @@
import torch.nn as nn
from torch.nn import functional as F
import torch


class CNN_pdr(nn.Module):

    def __init__(self, output_size, out_channels, compositional_dim, vocab_size, emb_dim, embeddings=None, drop_embedding_range=None,
                 drop_embedding_prop=0, drop_prob=0.5):
        super(CNN_pdr, self).__init__()
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.embeddings = torch.FloatTensor(embeddings)
        self.embedding_layer = nn.Embedding(vocab_size, emb_dim, _weight=self.embeddings)
        self.kernel_heights = kernel_heights = [3, 5, 7]
        self.stride = 1
        self.padding = 0
        self.drop_embedding_range = drop_embedding_range
        self.drop_embedding_prop = drop_embedding_prop
        assert 0 <= drop_embedding_prop <= 1, 'drop_embedding_prop: wrong range'
        self.nC = 73

        self.conv1 = nn.Conv2d(1, compositional_dim, (self.kernel_heights[0], self.emb_dim), self.stride, self.padding)
        self.dropout = nn.Dropout(drop_prob)
        self.label = nn.Linear(len(kernel_heights) * out_channels, output_size)
        self.fC = nn.Linear(compositional_dim + self.nC, self.nC)

    def forward(self, x, svm_output):
        x = torch.LongTensor(x)
        svm_output = torch.FloatTensor(svm_output)
        x = self.embedding_layer(x)
        x = self.conv1(x.unsqueeze(1))
        x = F.relu(x.squeeze(3))
        x = F.max_pool1d(x, x.size()[2]).squeeze(2)
        x = torch.cat((x, svm_output), 1)
        x = F.sigmoid(self.fC(x))
        return x  # .detach().numpy()

        # logits = self.label(x)
        # return logits
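A quick shape check of the forward pass above with dummy inputs; every size below is illustrative except output_size=73, which the class also hard-codes as self.nC:

import numpy as np
from models.cnn_class import CNN_pdr

vocab_size, emb_dim, seq_len, n_docs = 5000, 300, 100, 10
dummy_embeddings = np.random.randn(vocab_size, emb_dim).astype(np.float32)
net = CNN_pdr(output_size=73, out_channels=1, compositional_dim=300,
              vocab_size=vocab_size, emb_dim=emb_dim, embeddings=dummy_embeddings)
docs = np.random.randint(0, vocab_size, size=(n_docs, seq_len))  # padded word-index sequences
posteriors = np.random.rand(n_docs, 73)                          # Z-space projections from the SVMs
out = net.forward(docs, posteriors)                              # tensor of shape (n_docs, 73)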
@@ -0,0 +1,7 @@
import pandas as pd
import numpy as np

df = pd.read_csv("/home/andreapdr/funneling_pdr/src/results/results.csv", delimiter='\t')
pivot = pd.pivot_table(df, values=['time', 'macrof1', 'microf1', 'macrok', 'microk'], index=['method', 'embed'], aggfunc=[np.mean, np.std])
print(pivot)
print('Finished ...')
@@ -0,0 +1,56 @@
import numpy as np
from sklearn.decomposition import TruncatedSVD


def get_weighted_average(We, x, w):
    """
    Compute the weighted average vectors
    :param We: We[i,:] is the vector for word i
    :param x: x[i, :] are the indices of the words in sentence i
    :param w: w[i, :] are the weights for the words in sentence i
    :return: emb[i, :] are the weighted average vector for sentence i
    """
    n_samples = x.shape[0]
    emb = np.zeros((n_samples, We.shape[1]))
    for i in range(n_samples):
        emb[i, :] = w[i, :].dot(We[x[i, :], :]) / np.count_nonzero(w[i, :])
    return emb


def compute_pc(X, npc=1):
    """
    Compute the principal components. DO NOT MAKE THE DATA ZERO MEAN!
    :param X: X[i,:] is a data point
    :param npc: number of principal components to remove
    :return: component_[i,:] is the i-th pc
    """
    svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
    svd.fit(X)
    return svd.components_


def remove_pc(X, npc=1):
    """
    Remove the projection on the principal components
    :param X: X[i,:] is a data point
    :param npc: number of principal components to remove
    :return: XX[i, :] is the data point after removing its projection
    """
    pc = compute_pc(X, npc)
    if npc == 1:
        XX = X - X.dot(pc.transpose()) * pc
    else:
        XX = X - X.dot(pc.transpose()).dot(pc)
    return XX


def SIF_embedding(We, x, w, params):
    """
    Compute the scores between pairs of sentences using weighted average + removing the projection on the first principal component
    :param We: We[i,:] is the vector for word i
    :param x: x[i, :] are the indices of the words in the i-th sentence
    :param w: w[i, :] are the weights for the words in the i-th sentence
    :param params.rmpc: if >0, remove the projections of the sentence embeddings to their first principal component
    :return: emb, emb[i, :] is the embedding for sentence i
    """
    emb = get_weighted_average(We, x, w)
    if params.rmpc > 0:
        emb = remove_pc(emb, params.rmpc)
    return emb
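A small usage sketch of the functions above on random data, assuming the module is importable as util.SIF_embed (as in the embeddings imports earlier in this commit); the shapes and the rmpc value are illustrative:

import numpy as np
from types import SimpleNamespace
from util.SIF_embed import SIF_embedding

rng = np.random.RandomState(0)
We = rng.randn(1000, 300)                  # 1000 word vectors of dimension 300
x = rng.randint(0, 1000, size=(50, 20))    # 50 sentences, 20 word indices each
w = rng.rand(50, 20)                       # one weight per word occurrence
params = SimpleNamespace(rmpc=1)           # remove the first principal component
emb = SIF_embedding(We, x, w, params)      # -> (50, 300) sentence embeddings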
@@ -2,6 +2,7 @@ from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt


def run_pca(dim, X):
    """
    :param dim: number of pca components to keep
@@ -46,4 +47,4 @@ def get_optimal_dim(X, embed_type):
    plt.axvline(best_n, color='r', label='optimal N')
    plt.legend()
    plt.show()
    return best_n
    return best_n
@@ -5,7 +5,8 @@ import numpy as np
class PolylingualClassificationResults:
    def __init__(self, file, autoflush=True, verbose=False):
        self.file = file
        self.columns = ['id', 'method', 'learner', 'embed', 'embed_type', 'optimp', 'dataset', 'time', 'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes']
        self.columns = ['method', 'learner', 'embed', 'embed_type', 'pca_s', 'pca_u', 'optimp', 'dataset', 'time',
                        'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes']
        self.autoflush = autoflush
        self.verbose = verbose
        if os.path.exists(file):
@@ -20,8 +21,8 @@ class PolylingualClassificationResults:
    def already_calculated(self, id):
        return (self.df['id'] == id).any()

    def add_row(self, id, method, learner, embed, embed_type, optimp, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
        s = pd.Series([id, method, learner, embed, embed_type, optimp, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
    def add_row(self, method, learner, embed, embed_type, pca_s, pca_u, optimp, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
        s = pd.Series([method, learner, embed, embed_type, pca_s, pca_u, optimp, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
        self.df = self.df.append(s, ignore_index=True)
        if self.autoflush: self.flush()
        self.tell(s.to_string())
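For reference, a call matching the new column layout might look like this (the metric values and the time are made up; the tuple passed as pca_s mirrors the (max_label_space, best_components) pair logged by the main script above):

results = PolylingualClassificationResults('./results/PLE_results.csv')
results.add_row('PolyEmbed_andrea', 'svm', 'M+F', 'MUSE',
                pca_s=(300, 'not set'), pca_u=300, optimp=False,
                dataset='rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle',
                time=123.4, lang='en', macrof1=0.81, microf1=0.84)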
@@ -1,3 +1,4 @@
from sklearn.svm import SVC
from tqdm import tqdm
import re
import sys
@@ -8,4 +9,21 @@ def mask_numbers(data, number_mask='numbermask'):
    masked = []
    for text in tqdm(data, desc='masking numbers'):
        masked.append(mask.sub(number_mask, text))
    return masked
    return masked


def fill_missing_classes(lXtr, lytr):
    pass


def get_learner(calibrate=False, kernel='linear'):
    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto')


def get_params(dense=False):
    if not op.optimc:
        return None
    c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
    kernel = 'rbf' if dense else 'linear'
    return [{'kernel': [kernel], 'C': c_range, 'gamma': ['auto']}]