Alejandro Moreo Fernandez 2019-12-10 16:50:19 +01:00
commit a8d76b6f52
9 changed files with 276 additions and 134 deletions

View File

@@ -1,4 +1,4 @@
-import os, sys
+import os
 from dataset_builder import MultilingualDataset
 from learning.learners import *
 from util.evaluation import *
@@ -11,32 +11,46 @@ from sklearn.svm import SVC
 parser = OptionParser()
 parser.add_option("-d", "--dataset", dest="dataset",
-                  help="Path to the multilingual dataset processed and stored in .pickle format")
+                  help="Path to the multilingual dataset processed and stored in .pickle format",
+                  default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
 parser.add_option("-o", "--output", dest="output",
                   help="Result file", type=str, default='./results/results.csv')
 parser.add_option("-e", "--mode-embed", dest="mode_embed",
-                  help="Set the embedding to be used [none, pretrained, supervised, both]", type=str, default='none')
+                  help="Set the embedding to be used [none, unsupervised, supervised, both]", type=str, default='none')
 parser.add_option("-w", "--we-path", dest="we_path",
-                  help="Path to the polylingual word embeddings", default='../embeddings/')
+                  help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/')
+parser.add_option('-t', "--we-type", dest="we_type", help="Aligned embeddings to use [FastText, MUSE]", type=str,
+                  default='MUSE')
 parser.add_option("-s", "--set_c", dest="set_c",type=float,
                   help="Set the C parameter", default=1)
 parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
-                  help="Optimices hyperparameters", default=False)
+                  help="Optimize hyperparameters", default=False)
 parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int,
                   help="Number of parallel jobs (default is -1, all)", default=-1)
+parser.add_option("-p", "--pca", dest="max_labels", type=int,
+                  help="If less than number of target classes, will apply PCA to supervised matrix. If set to 0 it"
+                       " will automatically search for the best number of components", default=300)
+parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
+                  help="If smaller than Unsupervised Dimension, will apply PCA to unsupervised matrix. If set to 0 it"
+                       " will automatically search for the best number of components", default=300)
+parser.add_option("-l", dest="lang", type=str)

 def get_learner(calibrate=False, kernel='linear'):
-    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced')
+    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto')

-def get_params(dense=False): # TODO kernel function could be useful for meta-classifier
+def get_params(dense=False):
     if not op.optimc:
         return None
     c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
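For orientation, a hypothetical invocation exercising the new options (script name and paths are placeholders, not part of this commit) could look like:

    python main.py -d /path/to/dataset.pickle -e both -t MUSE -w /path/to/embeddings/ -p 300 -u 300 -c -j -1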
@@ -47,7 +61,6 @@ def get_params(dense=False): # TODO kernel function could be useful for meta-classifier
 if __name__ == '__main__':
     (op, args) = parser.parse_args()
     assert exists(op.dataset), 'Unable to find file '+str(op.dataset)
@@ -60,7 +73,9 @@ if __name__ == '__main__':
     data = MultilingualDataset.load(op.dataset)
     data.show_dimensions()
-    # data.set_view(languages=['en','it'], categories=list(range(10)))
+    data.set_view(languages=['en','it', 'pt', 'sv'], categories=list(range(10)))
+    # data.set_view(languages=[op.lang])
+    # data.set_view(categories=list(range(10)))
     lXtr, lytr = data.training()
     lXte, lyte = data.test()
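For clarity, training() and test() return per-language dictionaries (see lXtr()/lXte()/lYtr() in dataset_builder.py below), so downstream code indexes everything by language code; schematically:

    # lXtr: dict mapping a language code to that language's raw training documents
    # lytr: dict mapping a language code to its label matrix (one row per doc, one column per category)
    # e.g. lXtr['en'] -> list of English documents; lytr['en'].shape -> (n_docs_en, n_categories)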
@@ -72,30 +87,42 @@ if __name__ == '__main__':
     # Embeddings and WCE config
     _available_mode = ['none', 'unsupervised', 'supervised', 'both']
-    assert op.mode_embed in _available_mode , f'{op.mode_embed} not in {_available_mode}'
+    _available_type = ['MUSE', 'FastText']
+    assert op.mode_embed in _available_mode, f'{op.mode_embed} not in {_available_mode}'
+    assert op.we_type in _available_type, f'{op.we_type} not in {_available_type}'

     if op.mode_embed == 'none':
         config = {'unsupervised': False,
-                  'supervised': False}
+                  'supervised': False,
+                  'we_type': None}
         _config_id = 'None'
     elif op.mode_embed == 'unsupervised':
         config = {'unsupervised': True,
-                  'supervised': False}
+                  'supervised': False,
+                  'we_type': op.we_type}
         _config_id = 'M'
     elif op.mode_embed == 'supervised':
         config = {'unsupervised': False,
-                  'supervised': True}
+                  'supervised': True,
+                  'we_type': None}
         _config_id = 'F'
     elif op.mode_embed == 'both':
         config = {'unsupervised': True,
-                  'supervised': True}
+                  'supervised': True,
+                  'we_type': op.we_type}
         _config_id = 'M_and_F'
+
+    ##### TODO - config dict is redundant - we have already op argparse ...
+    config['reduction'] = 'PCA'
+    config['max_label_space'] = op.max_labels
+    config['dim_reduction_unsupervised'] = op.max_labels_U
+    # config['plot_covariance_matrices'] = True
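For instance, with -e both and the default -t/-p/-u values, the assembled config ends up roughly as:

    config = {'unsupervised': True,
              'supervised': True,
              'we_type': 'MUSE',
              'reduction': 'PCA',
              'max_label_space': 300,
              'dim_reduction_unsupervised': 300}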
     result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')
     print(f'### PolyEmbedd_andrea_{_config_id}\n')

-    classifier = AndreaCLF(op.we_path,
-                           config,
+    classifier = AndreaCLF(we_path=op.we_path,
+                           config=config,
                            first_tier_learner=get_learner(calibrate=True),
                            meta_learner=get_learner(calibrate=False, kernel='rbf'),
                            first_tier_parameters=get_params(dense=False),
@@ -105,7 +132,7 @@ if __name__ == '__main__':
     print('# Fitting ...')
     classifier.fit(lXtr, lytr)

-    print('# Evaluating ...')
+    print('\n# Evaluating ...')
     l_eval = evaluate_method(classifier, lXte, lyte)

     metrics = []
@@ -113,6 +140,6 @@ if __name__ == '__main__':
         macrof1, microf1, macrok, microk = l_eval[lang]
         metrics.append([macrof1, microf1, macrok, microk])
         print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
-        results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, op.optimc, op.dataset.split('/')[-1],
-                        'not_binary', 'not_ablation', classifier.time, lang, macrof1, microf1, macrok, microk, 'nope')
+        results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, config['we_type'], op.optimc, op.dataset.split('/')[-1],
+                        classifier.time, lang, macrof1, microf1, macrok, microk, '')
     print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))

View File

@@ -1,10 +1,10 @@
 import os
 import pickle
-import numpy as np
 from torchtext.vocab import Vectors
 import torch
 from abc import ABC, abstractmethod
 from data.supervised import get_supervised_embeddings
+from util.decompositions import *

 class PretrainedEmbeddings(ABC):
@@ -110,10 +110,10 @@ class WordEmbeddings:
         # vocabulary is a set of terms to be kept
         active_vocabulary = sorted([w for w in vocabulary if w in self.worddim])
         lost = len(vocabulary)-len(active_vocabulary)
-        if lost>0: #some termr are missing, so it will be replaced by UNK
+        if lost > 0:  # some terms are missing, so it will be replaced by UNK
             print('warning: missing {} terms for lang {}'.format(lost, self.lang))
         self.we = self.get_vectors(active_vocabulary)
-        assert self.we.shape[0]==len(active_vocabulary)
+        assert self.we.shape[0] == len(active_vocabulary)
         self.dimword={i:w for i,w in enumerate(active_vocabulary)}
         self.worddim={w:i for i,w in enumerate(active_vocabulary)}
         return self
@@ -132,12 +132,12 @@ class WordEmbeddings:
             'instances of {} expected'.format(WordEmbeddings.__name__)
         polywe = []
-        worddim={}
-        offset=0
+        worddim = {}
+        offset = 0
         for we in we_list:
             polywe.append(we.we)
             worddim.update({'{}::{}'.format(we.lang, w):d+offset for w,d in we.worddim.items()})
-            offset=len(worddim)
+            offset = len(worddim)
         polywe = np.vstack(polywe)
         return WordEmbeddings(lang='poly', we=polywe, worddim=worddim)
@@ -147,16 +147,41 @@ class FastTextWikiNews(Vectors):
     url_base = 'Cant auto-download MUSE embeddings'
     path = '/storage/andrea/FUNNELING/embeddings/wiki.multi.{}.vec'
-    _name = 'wiki.multi.{}.vec'
+    _name = '/embeddings/wiki.multi.{}.vec'

     def __init__(self, cache, language="en", **kwargs):
         url = self.url_base.format(language)
         # name = self.path.format(language)
         name = cache + self._name.format(language)
+        # print(f'\n\nFASTEXTWIKI-NEW CLASS:\nurl = {url}\nname = {name}\ncache {cache}\nlanguage = {language}')
         super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)

+
+class EmbeddingsAligned(Vectors):
+
+    def __init__(self, type, path, lang, voc):
+        # todo - rewrite as relative path
+        self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec'
+        self.cache_path = '/home/andreapdr/CLESA/embeddings' if type == 'MUSE' else '/home/andreapdr/CLESA/embeddings_polyFASTTEXT'
+        self.path = path + self.name.format(lang)
+        assert os.path.exists(path), f'pre-trained vectors not found in {path}'
+        super(EmbeddingsAligned, self).__init__(self.path, cache=self.cache_path)
+        self.vectors = self.extract(voc)
+
+    def vocabulary(self):
+        return set(self.stoi.keys())
+
+    def extract(self, words):
+        source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.stoi)
+        extraction = torch.zeros((len(words), self.dim))
+        extraction[source_idx] = self.vectors[target_idx]
+        return extraction
+
+    def reduce(self, dim):
+        pca = PCA(n_components=dim)
+        self.vectors = pca.fit_transform(self.vectors)
+        return
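A brief note on extract() above: the reindex() helper inherited from PretrainedEmbeddings matches the requested vocabulary against the loaded word index, so rows of the returned matrix follow the order of `words`, with out-of-vocabulary terms left as zero rows. A minimal sketch of that behaviour with made-up words and a 4-dimensional space (illustrative only, bypassing reindex()):

    import torch
    words = ['cat', 'dog', 'xyzzy']                    # 'xyzzy' is out of vocabulary
    stoi = {'cat': 0, 'dog': 1}                        # word -> row in the loaded vectors
    vectors = torch.tensor([[1., 1., 1., 1.], [2., 2., 2., 2.]])
    source_idx = [i for i, w in enumerate(words) if w in stoi]
    target_idx = [stoi[w] for w in words if w in stoi]
    extraction = torch.zeros((len(words), vectors.shape[1]))
    extraction[source_idx] = vectors[target_idx]       # row 2 ('xyzzy') stays all zeros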
 class FastTextMUSE(PretrainedEmbeddings):

     def __init__(self, path, lang, limit=None):
@@ -164,7 +189,6 @@ class FastTextMUSE(PretrainedEmbeddings):
         print(f'Loading fastText pretrained vectors from {path}')
         assert os.path.exists(path), print(f'pre-trained vectors not found in {path}')
         self.embed = FastTextWikiNews(path, lang, max_vectors=limit)
-        # print('Done')

     def vocabulary(self):
         return set(self.embed.stoi.keys())
@@ -179,21 +203,76 @@ class FastTextMUSE(PretrainedEmbeddings):
         return extraction


-def embedding_matrix(path, voc, lang):
-    vocabulary = np.asarray(list(zip(*sorted(voc.items(), key=lambda x:x[1])))[0])
-
-    print('[embedding matrix]')
-    print(f'# [pretrained-matrix: FastTextMUSE {lang}]')
-    pretrained = FastTextMUSE(path, lang)
-    P = pretrained.extract(vocabulary).numpy()
-    del pretrained
-    print(f'[embedding matrix done] of shape={P.shape}\n')
-    return vocabulary, P
-
-
-def WCE_matrix(Xtr, Ytr, lang):
-    print('\n# [supervised-matrix]')
-    S = get_supervised_embeddings(Xtr[lang], Ytr[lang], max_label_space=50)
-    print(f'[embedding matrix done] of shape={S.shape}\n')
-    return S
+class StorageEmbeddings:
+
+    def __init__(self, path):
+        self.path = path
+        self.lang_U = dict()
+        self.lang_S = dict()
+
+    def _add_embeddings_unsupervised(self, type, docs, vocs, max_label_space=300):
+        for lang in docs.keys():
+            print(f'# [unsupervised-matrix {type}] for {lang}')
+            voc = np.asarray(list(zip(*sorted(vocs[lang].items(), key=lambda x: x[1])))[0])
+            self.lang_U[lang] = EmbeddingsAligned(type, self.path, lang, voc).vectors
+            nC = self.lang_U[lang].shape[1]
+            # if self.lang_U[lang].shape[1] > dim != 0:
+            #     print(f'unsupervised matrix has more dimensions ({self.lang_U[lang].shape[1]}) than'
+            #           f' the allowed limit {dim}. Applying PCA(n_components={dim})')
+            #     pca = PCA(n_components=dim)
+            #     self.lang_U[lang] = pca.fit_transform(self.lang_U[lang])
+            print(f'Matrix U (weighted sum) of shape {self.lang_U[lang].shape}\n')
+        if max_label_space == 0:
+            print(f'Computing optimal number of PCA components along matrices U')
+            optimal_n = get_optimal_dim(self.lang_U, 'U')
+            self.lang_U = run_pca(optimal_n, self.lang_U)
+        elif max_label_space < nC:
+            self.lang_U = run_pca(max_label_space, self.lang_U)
+        return

+    def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space, voc):
+        # if max_label_space == 0:
+        #     print('Computing optimal number of PCA components along matrices S...')
+        #     optimal_n = self.get_optimal_supervised_components(docs, labels)
+        #     max_label_space = optimal_n
+        for lang in docs.keys():  # compute supervised matrices S - then apply PCA
+            print(f'# [supervised-matrix] for {lang}')
+            self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space, voc[lang], lang)
+            nC = self.lang_S[lang].shape[1]
+            print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')
+        if max_label_space == 0:
+            optimal_n = get_optimal_dim(self.lang_S, 'S')
+            self.lang_S = run_pca(optimal_n, self.lang_S)
+        elif max_label_space < nC:
+            self.lang_S = run_pca(max_label_space, self.lang_S)
+        return
+
+    def _concatenate_embeddings(self, docs):
+        _r = dict()
+        for lang in self.lang_U.keys():
+            _r[lang] = np.hstack((docs[lang].dot(self.lang_U[lang]), docs[lang].dot(self.lang_S[lang])))
+        return _r
+
+    def fit(self, config, docs, vocs, labels):
+        if config['unsupervised']:
+            self._add_embeddings_unsupervised(config['we_type'], docs, vocs, config['dim_reduction_unsupervised'])
+        if config['supervised']:
+            self._add_emebeddings_supervised(docs, labels, config['reduction'], config['max_label_space'], vocs)
+        return self
+
+    def predict(self, config, docs):
+        if config['supervised'] and config['unsupervised']:
+            return self._concatenate_embeddings(docs)
+        elif config['supervised']:
+            _r = dict()
+            for lang in docs.keys():
+                _r[lang] = docs[lang].dot(self.lang_S[lang])
+        else:
+            _r = dict()
+            for lang in docs.keys():
+                _r[lang] = docs[lang].dot(self.lang_U[lang])
+        return _r
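A minimal usage sketch of the new StorageEmbeddings container as it is driven from AndreaCLF below (variable names here are illustrative: the docs are assumed to be per-language tf-idf matrices, vocs/labels the matching vocabularies and label matrices):

    config = {'unsupervised': True, 'supervised': True, 'we_type': 'MUSE',
              'reduction': 'PCA', 'max_label_space': 300, 'dim_reduction_unsupervised': 300}
    storage = StorageEmbeddings(we_path).fit(config, lX_tfidf, lang_word2idx, ly)
    # weighted-sum document embeddings: docs.dot(U) and/or docs.dot(S), hstacked per language
    l_embedded = storage.predict(config, lX_tfidf)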

View File

@@ -1,7 +1,7 @@
 from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square
-# from util.common import *
-from sklearn.decomposition import PCA
 import numpy as np
+# from sklearn.decomposition import PCA
+# from sklearn.manifold import TSNE

 def zscores(x, axis=0): #scipy.stats.zscores does not avoid division by 0, which can indeed occur
@@ -41,12 +41,10 @@ def supervised_embeddings_tsr(X,Y, tsr_function=information_gain, max_documents=
     return F

-def get_supervised_embeddings(X, Y, max_label_space=300, binary_structural_problems=-1, method='dotn', dozscore=True):
-    print('computing supervised embeddings...')
+def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, lang='None', binary_structural_problems=-1, method='dotn', dozscore=True):
+    if max_label_space != 0:
+        print('computing supervised embeddings...')
     nC = Y.shape[1]
-    if nC==2 and binary_structural_problems > nC:
-        raise ValueError('not implemented in this branch')

     if method=='ppmi':
         F = supervised_embeddings_ppmi(X, Y)
@@ -60,14 +58,41 @@ def get_supervised_embeddings(X, Y, max_label_space=300, binary_structural_problems
     if dozscore:
         F = zscores(F, axis=0)

-    if nC > max_label_space:
-        print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
-              f'Applying PCA(n_components={max_label_space})')
-        pca = PCA(n_components=max_label_space)
-        F = pca.fit(F).transform(F)
+    # Dumping F-matrix for further studies
+    dump_it = False
+    if dump_it:
+        with open(f'/home/andreapdr/funneling_pdr/src/dumps/WCE_{lang}.tsv', 'w') as outfile:
+            np.savetxt(outfile, F, delimiter='\t')
+        with open(f'/home/andreapdr/funneling_pdr/src/dumps/dict_WCE_{lang}.tsv', 'w') as outfile:
+            for token in voc.keys():
+                outfile.write(token+'\n')

     return F
+
+    # if nC >= max_label_space:
+    #     if reduction == 'PCA':
+    #         if max_label_space == 0:
+    #             pca = PCA(n_components=Y.shape[1])
+    #             pca = pca.fit(F)
+    #             return pca.explained_variance_ratio_
+    #
+    #         print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
+    #               f'Applying PCA(n_components={max_label_space})')
+    #         pca = PCA(n_components=max_label_space)
+    #         F = pca.fit_transform(F)
+    #     elif reduction == 'TSNE':
+    #         print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
+    #               f'Applying t-SNE(n_components={max_label_space})')
+    #         tsne = TSNE(n_components=max_label_space)
+    #         F = tsne.fit_transform(F)
+    #     elif reduction == 'tSVD':
+    #         print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
+    #               f'Applying truncatedSVD(n_components={max_label_space})')
+    #         tSVD = TruncatedSVD(n_components=max_label_space)
+    #         F = tSVD.fit_transform(F)
+    #
+    #     return F
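For reference, a hypothetical call with the new signature (shapes and values are illustrative): X is a (documents x vocabulary) weight matrix, Y a (documents x categories) label matrix, and the returned word-class embedding matrix F has one row per term and one column per category:

    import numpy as np
    X = np.random.rand(100, 500)                    # 100 docs, 500 terms (e.g. tf-idf weights)
    Y = np.random.randint(0, 2, size=(100, 10))     # 10 categories
    F = get_supervised_embeddings(X, Y, reduction='PCA', max_label_space=300)
    # expected shape of F: (500, 10); document embeddings are then obtained as X.dot(F)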

View File

@@ -11,6 +11,8 @@ import numpy as np
 from sklearn.model_selection import train_test_split
 from scipy.sparse import issparse
 import itertools
+from tqdm import tqdm
+import re

 class MultilingualDataset:
@@ -73,10 +75,14 @@ class MultilingualDataset:
         return self.lXte(), self.lYte()

     def lXtr(self):
-        return {lang:Xtr for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
+        return {lang: Xtr for (lang, ((Xtr, _, _), _)) in self.multiling_dataset.items() if
+                lang in self.langs()}
+        # return {lang:self.mask_numbers(Xtr) for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()}

     def lXte(self):
-        return {lang:Xte for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()}
+        return {lang: Xte for (lang, (_, (Xte, _, _))) in self.multiling_dataset.items() if
+                lang in self.langs()}
+        # return {lang:self.mask_numbers(Xte) for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()}

     def lYtr(self):
         return {lang:self.cat_view(Ytr) for (lang, ((_,Ytr,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
@@ -129,6 +135,13 @@ class MultilingualDataset:
     def set_labels(self, labels):
         self.labels = labels

+    def mask_numbers(self, data, number_mask='numbermask'):
+        mask = re.compile(r'\b[0-9][0-9.,-]*\b')
+        masked = []
+        for text in tqdm(data, desc='masking numbers'):
+            masked.append(mask.sub(number_mask, text))
+        return masked
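A quick illustration of the number-masking regex above (the method is currently only referenced from the commented-out lXtr/lXte variants):

    import re
    mask = re.compile(r'\b[0-9][0-9.,-]*\b')
    mask.sub('numbermask', 'Revenue grew 12.5% to 3,400 in 2006')
    # -> 'Revenue grew numbermask% to numbermask in numbermask'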
 # ----------------------------------------------------------------------------------------------------------------------
 # Helpers

View File

@@ -1,15 +1,14 @@
 import numpy as np
 import time
-from data.embeddings import WordEmbeddings, embedding_matrix, WCE_matrix
+from data.embeddings import WordEmbeddings, StorageEmbeddings
 from scipy.sparse import issparse
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.model_selection import GridSearchCV
 from sklearn.model_selection import KFold
 from joblib import Parallel, delayed
 from sklearn.feature_extraction.text import TfidfVectorizer
-from data.supervised import zscores
 from transformers.StandardizeTransformer import StandardizeTransformer
+# from sklearn.decomposition import PCA

 def _sort_if_sparse(X):
@@ -444,7 +443,8 @@ class AndreaCLF(FunnellingPolylingualClassifier):
                  first_tier_parameters=None,
                  meta_parameters=None,
                  folded_projections=1,
-                 calmode='cal', n_jobs=-1):
+                 calmode='cal',
+                 n_jobs=-1):

         super().__init__(first_tier_learner,
                         meta_learner,
@@ -454,13 +454,13 @@ class AndreaCLF(FunnellingPolylingualClassifier):
                         calmode,
                         n_jobs)

+        self.pca_independent_space = PCA(n_components=100)
         self.we_path = we_path
         self.config = config
         self.lang_word2idx = dict()
         self.languages = []
         self.lang_tfidf = {}
-        self.word_embeddings = {}
-        self.supervised_embeddings = {}
+        self.embedding_space = None
         self.model = None
         self.time = None
@@ -479,9 +479,8 @@ class AndreaCLF(FunnellingPolylingualClassifier):
             self.languages.append(lang)
             tfidf_vectorizer.fit(lX[lang])
             lX[lang] = tfidf_vectorizer.transform(lX[lang])
-            _sort_if_sparse(lX[lang])
             self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
-            self.lang_tfidf[lang] = tfidf_vectorizer  # useful at test time
+            self.lang_tfidf[lang] = tfidf_vectorizer
         return self

     # @override std class method
@@ -494,45 +493,6 @@ class AndreaCLF(FunnellingPolylingualClassifier):
         return lZ, lYtr

-    def embed(self, lX, ly, unsupervised=False, supervised=False, prediction=False):
-        """
-        build embedding matrix for given language and returns its weighted sum wrt tf-idf score
-        """
-        _r = dict()
-        languages = list(lX.keys())
-
-        if prediction:
-            for lang in languages:
-                if unsupervised:    # If unsupervised embeddings ...
-                    M = self.word_embeddings[lang]
-                    if supervised:  # and also unsupervised --> get both (M) and (S) weighted sum matrices and hstack them
-                        S = self.supervised_embeddings[lang]
-                        _r[lang] = np.hstack((lX[lang].dot(M), lX[lang].dot(S)))
-                        continue
-                    _r[lang] = lX[lang].dot(M)  # if not supervised --> just get weighted sum of unsupervised (M) embeddings
-                else:               # If not unsupervised --> get (S) matrix and its weighted sum
-                    S = self.supervised_embeddings[lang]
-                    _r[lang] = lX[lang].dot(S)
-            return _r
-
-        if unsupervised:
-            for lang in languages:
-                # print('Test building embedding matrix FastTextMuse ...')
-                _, M = embedding_matrix(self.we_path, self.lang_word2idx[lang], lang)
-                self.word_embeddings[lang] = M
-                _r[lang] = lX[lang].dot(M)
-
-        if supervised:
-            for lang in languages:
-                S = WCE_matrix(lX, ly, lang)
-                # S = np.squeeze(np.asarray(S))  # casting to ndarray to better visualize S while debugging
-                self.supervised_embeddings[lang] = S
-                if unsupervised:
-                    _r[lang] = np.hstack((_r[lang], lX[lang].dot(S)))
-                else:
-                    _r[lang] = lX[lang].dot(S)
-        return _r
-
     # @override std class method
     def fit(self, lX, ly):
         tinit = time.time()
@@ -545,24 +505,22 @@ class AndreaCLF(FunnellingPolylingualClassifier):
         Z, zy = self._get_zspace(lX, ly)

         if self.config['supervised'] or self.config['unsupervised']:
-            # Z vectors is concatenated with doc's embedding weighted sum
-            Z_embedded = dict()
-            l_weighted_em = self.embed(lX, ly,
-                                       unsupervised=self.config['unsupervised'],
-                                       supervised=self.config['supervised'])
-            # stacking Z space horizontally with unsupervised (M) and/or supervised (F) embeddings
-            for lang in list(lX.keys()):
-                Z_embedded[lang] = np.hstack((Z[lang], l_weighted_em[lang]))
-            Z = Z_embedded
+            self.embedding_space = StorageEmbeddings(self.we_path).fit(self.config, lX, self.lang_word2idx, ly)
+            _embedding_space = self.embedding_space.predict(self.config, lX)
+            # h_stacking posterior probabilities with (U) and/or (S) matrices
+            for lang in self.languages:
+                Z[lang] = np.hstack((Z[lang], _embedding_space[lang]))

         # stacking Z space vertically
         _vertical_Z = np.vstack([Z[lang] for lang in self.languages])
         _vertical_Zy = np.vstack([zy[lang] for lang in self.languages])

+        # todo testing ...
+        # self.pca_independent_space.fit(_vertical_Z)
+        # _vertical_Z = self.pca_independent_space.transform(_vertical_Z)
+
         self.standardizer = StandardizeTransformer()
         _vertical_Z = self.standardizer.fit_predict(_vertical_Z)

         print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
         self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
@@ -577,17 +535,15 @@ class AndreaCLF(FunnellingPolylingualClassifier):
         lZ = self._projection(self.doc_projector, lX)

         if self.config['supervised'] or self.config['unsupervised']:
-            l_weighted_em = self.embed(lX, ly,
-                                       unsupervised=self.config['unsupervised'],
-                                       supervised=self.config['supervised'],
-                                       prediction=True)
-            Z_embedded = dict()
+            _embedding_space = self.embedding_space.predict(self.config, lX)
             for lang in lX.keys():
-                Z_embedded[lang] = np.hstack((lZ[lang], l_weighted_em[lang]))
-            lZ = Z_embedded
+                lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang]))

         for lang in lZ.keys():
             print(lZ[lang].shape)
+            # todo testing
+            # lZ[lang] = self.pca_independent_space.transform(lZ[lang])
             lZ[lang] = self.standardizer.predict(lZ[lang])

         return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
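To summarize the refactored flow above as a sketch (names and sizes are illustrative, not part of the commit): per language, the first-tier posterior probabilities are horizontally stacked with the weighted-sum embedding features produced by StorageEmbeddings; all languages are then stacked vertically, standardized, and fed to the meta-classifier.

    import numpy as np
    n_docs, n_cats, emb_dim = 1000, 10, 300
    Z_lang = np.random.rand(n_docs, n_cats)            # first-tier posterior probabilities for one language
    E_lang = np.random.rand(n_docs, emb_dim + n_cats)  # weighted-sum MUSE + WCE features for the same docs
    Z_lang = np.hstack((Z_lang, E_lang))               # per-language block of shape (1000, 320)
    # the per-language blocks are then np.vstack-ed, standardized, and passed to the meta-learner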

View File

@@ -1,7 +0,0 @@
-id method learner embed optimp dataset binary languages time lang macrof1 microf1 macrok microk notes
-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_M PolyEmbed_andrea svm M False test_datasetname not_binary not_ablation 55.56810355186462 da 0.7933333333333333 0.0 0.7933333333333333 0.0 nope
-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_M PolyEmbed_andrea svm M False test_datasetname not_binary not_ablation 55.56810355186462 en 0.7866666666666666 0.0 0.7927111111111111 -0.0003376325207643527 nope
-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_M PolyEmbed_andrea svm M False test_datasetname not_binary not_ablation 55.56810355186462 fr 0.7866666666666666 0.0 0.7930666666666667 -0.0001350530083057411 nope
-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_None PolyEmbed_andrea svm None False test_datasetname not_binary not_ablation 24.031760931015015 da 0.7933333333333333 0.0 0.7933333333333333 0.0 nope
-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_None PolyEmbed_andrea svm None False test_datasetname not_binary not_ablation 24.031760931015015 en 0.7933333333333333 0.0 0.7931111111111111 -0.00013505300830574107 nope
-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_None PolyEmbed_andrea svm None False test_datasetname not_binary not_ablation 24.031760931015015 fr 0.7933333333333333 0.0 0.7933333333333333 0.0 nope

View File

@@ -12,7 +12,7 @@ class StandardizeTransformer:
         self.std = np.clip(std, 1e-5, None)
         self.mean = np.mean(X, axis=self.axis)
         self.yetfit=True
-        print('done')
+        print('done\n')
         return self

     def predict(self, X):
@@ -20,4 +20,4 @@ class StandardizeTransformer:
         return (X - self.mean) / self.std

     def fit_predict(self, X):
         return self.fit(X).predict(X)
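Usage follows the familiar fit/transform pattern (sketch, assuming the default no-argument constructor used in learners.py; the class itself is unchanged apart from the print):

    import numpy as np
    standardizer = StandardizeTransformer()
    Z_train = standardizer.fit_predict(np.random.rand(200, 50))  # z-scores each column (std clipped away from 0)
    Z_test = standardizer.predict(np.random.rand(30, 50))        # reuses the training mean/std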

View File

@@ -0,0 +1,49 @@
+from sklearn.decomposition import PCA
+import numpy as np
+import matplotlib.pyplot as plt
+
+
+def run_pca(dim, X):
+    """
+    :param dim: number of pca components to keep
+    :param X: dictionary str(lang): matrix
+    :return: dict lang: reduced matrix
+    """
+    r = dict()
+    pca = PCA(n_components=dim)
+    for lang in X.keys():
+        r[lang] = pca.fit_transform(X[lang])
+    return r
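A quick sketch of run_pca on two made-up language matrices (note that the PCA is re-fit independently for each language):

    import numpy as np
    X = {'en': np.random.rand(5000, 300), 'it': np.random.rand(4000, 300)}
    X_red = run_pca(50, X)
    # X_red['en'].shape == (5000, 50); X_red['it'].shape == (4000, 50)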
+
+
+def get_optimal_dim(X, embed_type):
+    """
+    :param X: dict str(lang) : csr_matrix of embeddings unsupervised or supervised
+    :param embed_type: (str) embedding matrix type: S or U (WCE supervised or U unsupervised MUSE/FASTTEXT)
+    :return:
+    """
+    _idx = []
+
+    plt.figure(figsize=(15, 10))
+    if embed_type == 'U':
+        plt.title(f'Unsupervised Embeddings {"TODO"} Explained Variance')
+    else:
+        plt.title(f'WCE Explained Variance')
+    plt.xlabel('Number of Components')
+    plt.ylabel('Variance (%)')
+
+    for lang in X.keys():
+        pca = PCA(n_components=X[lang].shape[1])
+        pca.fit(X[lang])
+        _r = pca.explained_variance_ratio_
+        _r = np.cumsum(_r)
+        plt.plot(_r, label=lang)
+        for i in range(len(_r) - 1, 1, -1):
+            delta = _r[i] - _r[i - 1]
+            if delta > 0:
+                _idx.append(i)
+                break
+    best_n = max(_idx)
+    plt.axvline(best_n, color='r', label='optimal N')
+    plt.legend()
+    plt.show()
+    return best_n
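In words, the inner loop walks the cumulative explained-variance curve backwards and records, per language, the last component that still adds variance; the maximum over languages is returned (and drawn as the vertical line). A plot-free sketch of the same selection rule (helper name and data are illustrative):

    import numpy as np
    from sklearn.decomposition import PCA

    def last_informative_component(M):
        # index of the last component whose cumulative explained variance still grows
        cum = np.cumsum(PCA(n_components=M.shape[1]).fit(M).explained_variance_ratio_)
        for i in range(len(cum) - 1, 1, -1):
            if cum[i] - cum[i - 1] > 0:
                return i
        return 1

    X = {'en': np.random.rand(500, 50), 'it': np.random.rand(400, 50)}
    best_n = max(last_informative_component(M) for M in X.values())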

View File

@@ -5,7 +5,7 @@ import numpy as np
 class PolylingualClassificationResults:
     def __init__(self, file, autoflush=True, verbose=False):
         self.file = file
-        self.columns = ['id', 'method', 'learner', 'embed', 'optimp', 'dataset', 'binary', 'languages', 'time', 'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes']
+        self.columns = ['id', 'method', 'learner', 'embed', 'embed_type', 'optimp', 'dataset', 'time', 'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes']
         self.autoflush = autoflush
         self.verbose = verbose
         if os.path.exists(file):
@@ -20,8 +20,8 @@ class PolylingualClassificationResults:
     def already_calculated(self, id):
         return (self.df['id'] == id).any()

-    def add_row(self, id, method, learner, embed, optimp, dataset, binary, ablation_lang, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
-        s = pd.Series([id, method, learner, embed, optimp, dataset, binary, ablation_lang, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
+    def add_row(self, id, method, learner, embed, embed_type, optimp, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
+        s = pd.Series([id, method, learner, embed, embed_type, optimp, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
         self.df = self.df.append(s, ignore_index=True)
         if self.autoflush: self.flush()
         self.tell(s.to_string())
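Matching the call from the main script above, a row is now recorded with the extra embed_type field and without the old binary/languages fields; a self-contained sketch with illustrative values:

    results = PolylingualClassificationResults('./results/results.csv')
    results.add_row(id='some_result_id', method='PolyEmbed_andrea', learner='svm',
                    embed='M_and_F', embed_type='MUSE', optimp=True, dataset='some_dataset.pickle',
                    time=55.6, lang='en', macrof1=0.79, microf1=0.81, macrok=0.78, microk=0.80, notes='')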