Commit a8d76b6f52 (merged)
@@ -1,4 +1,4 @@
import os, sys
import os
from dataset_builder import MultilingualDataset
from learning.learners import *
from util.evaluation import *
@@ -11,32 +11,46 @@ from sklearn.svm import SVC
parser = OptionParser()

parser.add_option("-d", "--dataset", dest="dataset",
help="Path to the multilingual dataset processed and stored in .pickle format")
help="Path to the multilingual dataset processed and stored in .pickle format",
default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")

parser.add_option("-o", "--output", dest="output",
help="Result file", type=str, default='./results/results.csv')

parser.add_option("-e", "--mode-embed", dest="mode_embed",
help="Set the embedding to be used [none, pretrained, supervised, both]", type=str, default='none')
help="Set the embedding to be used [none, unsupervised, supervised, both]", type=str, default='none')

parser.add_option("-w", "--we-path", dest="we_path",
help="Path to the polylingual word embeddings", default='../embeddings/')
help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/')

parser.add_option('-t', "--we-type", dest="we_type", help="Aligned embeddings to use [FastText, MUSE]", type=str,
default='MUSE')

parser.add_option("-s", "--set_c", dest="set_c",type=float,
help="Set the C parameter", default=1)

parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
help="Optimices hyperparameters", default=False)
help="Optimize hyperparameters", default=False)

parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int,
help="Number of parallel jobs (default is -1, all)", default=-1)

parser.add_option("-p", "--pca", dest="max_labels", type=int,
help="If less than number of target classes, will apply PCA to supervised matrix. If set to 0 it"
" will automatically search for the best number of components", default=300)

parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
help="If smaller than Unsupervised Dimension, will apply PCA to unsupervised matrix. If set to 0 it"
" will automatically search for the best number of components", default=300)

parser.add_option("-l", dest="lang", type=str)


def get_learner(calibrate=False, kernel='linear'):
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced')
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto')


def get_params(dense=False): # TODO kernel function could be useful for meta-classifier
def get_params(dense=False):
if not op.optimc:
return None
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
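Editorial sketch, not part of the commit: the c_range above is only returned when --optimc is set, and get_learner now fixes gamma='auto' on the SVC. A minimal illustration of what the two factories end up producing, assuming the grid is eventually consumed by scikit-learn's GridSearchCV (imported in the learners module); the exact dict structure returned by get_params lies outside this hunk:

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# first-tier learner as built by get_learner(calibrate=True) with the default C
svm = SVC(kernel='linear', probability=True, cache_size=1000, C=1,
          random_state=1, class_weight='balanced', gamma='auto')
# hypothetical grid mirroring c_range, relevant only when --optimc is passed
param_grid = {'C': [1e4, 1e3, 1e2, 1e1, 1, 1e-1]}
search = GridSearchCV(svm, param_grid, n_jobs=-1)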
@@ -47,7 +61,6 @@ def get_params(dense=False): # TODO kernel function could be useful for meta-

if __name__ == '__main__':

(op, args) = parser.parse_args()

assert exists(op.dataset), 'Unable to find file '+str(op.dataset)
@@ -60,7 +73,9 @@ if __name__ == '__main__':
data = MultilingualDataset.load(op.dataset)
data.show_dimensions()

# data.set_view(languages=['en','it'], categories=list(range(10)))
data.set_view(languages=['en','it', 'pt', 'sv'], categories=list(range(10)))
# data.set_view(languages=[op.lang])
# data.set_view(categories=list(range(10)))
lXtr, lytr = data.training()
lXte, lyte = data.test()
@@ -72,30 +87,42 @@ if __name__ == '__main__':

# Embeddings and WCE config
_available_mode = ['none', 'unsupervised', 'supervised', 'both']
assert op.mode_embed in _available_mode , f'{op.mode_embed} not in {_available_mode}'
_available_type = ['MUSE', 'FastText']
assert op.mode_embed in _available_mode, f'{op.mode_embed} not in {_available_mode}'
assert op.we_type in _available_type, f'{op.we_type} not in {_available_type}'

if op.mode_embed == 'none':
config = {'unsupervised': False,
'supervised': False}
'supervised': False,
'we_type': None}
_config_id = 'None'
elif op.mode_embed == 'unsupervised':
config = {'unsupervised': True,
'supervised': False}
'supervised': False,
'we_type': op.we_type}
_config_id = 'M'
elif op.mode_embed == 'supervised':
config = {'unsupervised': False,
'supervised': True}
'supervised': True,
'we_type': None}
_config_id = 'F'
elif op.mode_embed == 'both':
config = {'unsupervised': True,
'supervised': True}
'supervised': True,
'we_type': op.we_type}
_config_id = 'M_and_F'

##### TODO - config dict is redundant - we have already op argparse ...
config['reduction'] = 'PCA'
config['max_label_space'] = op.max_labels
config['dim_reduction_unsupervised'] = op.max_labels_U
# config['plot_covariance_matrices'] = True

result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')

print(f'### PolyEmbedd_andrea_{_config_id}\n')
classifier = AndreaCLF(op.we_path,
config,
classifier = AndreaCLF(we_path=op.we_path,
config=config,
first_tier_learner=get_learner(calibrate=True),
meta_learner=get_learner(calibrate=False, kernel='rbf'),
first_tier_parameters=get_params(dense=False),
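For orientation, a hypothetical example of the fully assembled config dict when the script is launched with -e both -t MUSE -p 300 -u 300 (values follow the branches and defaults above; the dict literal itself is not part of the commit):

config = {
    'unsupervised': True,                 # -e both
    'supervised': True,
    'we_type': 'MUSE',                    # -t MUSE
    'reduction': 'PCA',
    'max_label_space': 300,               # -p / --pca
    'dim_reduction_unsupervised': 300,    # -u / --upca
}
# _config_id would be 'M_and_F' in this case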
@@ -105,7 +132,7 @@ if __name__ == '__main__':
print('# Fitting ...')
classifier.fit(lXtr, lytr)

print('# Evaluating ...')
print('\n# Evaluating ...')
l_eval = evaluate_method(classifier, lXte, lyte)

metrics = []
@@ -113,6 +140,6 @@ if __name__ == '__main__':
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, op.optimc, op.dataset.split('/')[-1],
'not_binary', 'not_ablation', classifier.time, lang, macrof1, microf1, macrok, microk, 'nope')
results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, config['we_type'], op.optimc, op.dataset.split('/')[-1],
classifier.time, lang, macrof1, microf1, macrok, microk, '')
print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
@@ -1,10 +1,10 @@
import os
import pickle
import numpy as np
from torchtext.vocab import Vectors
import torch
from abc import ABC, abstractmethod
from data.supervised import get_supervised_embeddings
from util.decompositions import *


class PretrainedEmbeddings(ABC):
@@ -110,10 +110,10 @@ class WordEmbeddings:
# vocabulary is a set of terms to be kept
active_vocabulary = sorted([w for w in vocabulary if w in self.worddim])
lost = len(vocabulary)-len(active_vocabulary)
if lost>0: #some termr are missing, so it will be replaced by UNK
if lost > 0: # some terms are missing, so it will be replaced by UNK
print('warning: missing {} terms for lang {}'.format(lost, self.lang))
self.we = self.get_vectors(active_vocabulary)
assert self.we.shape[0]==len(active_vocabulary)
assert self.we.shape[0] == len(active_vocabulary)
self.dimword={i:w for i,w in enumerate(active_vocabulary)}
self.worddim={w:i for i,w in enumerate(active_vocabulary)}
return self
@@ -132,12 +132,12 @@ class WordEmbeddings:
'instances of {} expected'.format(WordEmbeddings.__name__)

polywe = []
worddim={}
offset=0
worddim = {}
offset = 0
for we in we_list:
polywe.append(we.we)
worddim.update({'{}::{}'.format(we.lang, w):d+offset for w,d in we.worddim.items()})
offset=len(worddim)
offset = len(worddim)
polywe = np.vstack(polywe)

return WordEmbeddings(lang='poly', we=polywe, worddim=worddim)
@@ -147,16 +147,41 @@ class FastTextWikiNews(Vectors):

url_base = 'Cant auto-download MUSE embeddings'
path = '/storage/andrea/FUNNELING/embeddings/wiki.multi.{}.vec'
_name = 'wiki.multi.{}.vec'
_name = '/embeddings/wiki.multi.{}.vec'

def __init__(self, cache, language="en", **kwargs):
url = self.url_base.format(language)
# name = self.path.format(language)
name = cache + self._name.format(language)
# print(f'\n\nFASTEXTWIKI-NEW CLASS:\nurl = {url}\nname = {name}\ncache {cache}\nlanguage = {language}')
super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)


class EmbeddingsAligned(Vectors):

def __init__(self, type, path, lang, voc):
# todo - rewrite as relative path
self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec'
self.cache_path = '/home/andreapdr/CLESA/embeddings' if type == 'MUSE' else '/home/andreapdr/CLESA/embeddings_polyFASTTEXT'
self.path = path + self.name.format(lang)
assert os.path.exists(path), f'pre-trained vectors not found in {path}'
super(EmbeddingsAligned, self).__init__(self.path, cache=self.cache_path)
self.vectors = self.extract(voc)

def vocabulary(self):
return set(self.stoi.keys())

def extract(self, words):
source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.stoi)
extraction = torch.zeros((len(words), self.dim))
extraction[source_idx] = self.vectors[target_idx]
return extraction

def reduce(self, dim):
pca = PCA(n_components=dim)
self.vectors = pca.fit_transform(self.vectors)
return


class FastTextMUSE(PretrainedEmbeddings):

def __init__(self, path, lang, limit=None):
@@ -164,7 +189,6 @@ class FastTextMUSE(PretrainedEmbeddings):
print(f'Loading fastText pretrained vectors from {path}')
assert os.path.exists(path), print(f'pre-trained vectors not found in {path}')
self.embed = FastTextWikiNews(path, lang, max_vectors=limit)
# print('Done')

def vocabulary(self):
return set(self.embed.stoi.keys())
@@ -179,21 +203,76 @@ class FastTextMUSE(PretrainedEmbeddings):
return extraction


def embedding_matrix(path, voc, lang):
vocabulary = np.asarray(list(zip(*sorted(voc.items(), key=lambda x:x[1])))[0])
class StorageEmbeddings:
def __init__(self, path):
self.path = path
self.lang_U = dict()
self.lang_S = dict()

print('[embedding matrix]')
print(f'# [pretrained-matrix: FastTextMUSE {lang}]')
pretrained = FastTextMUSE(path, lang)
P = pretrained.extract(vocabulary).numpy()
del pretrained
print(f'[embedding matrix done] of shape={P.shape}\n')
def _add_embeddings_unsupervised(self, type, docs, vocs, max_label_space=300):
for lang in docs.keys():
nC = self.lang_U[lang].shape[1]
print(f'# [unsupervised-matrix {type}] for {lang}')
voc = np.asarray(list(zip(*sorted(vocs[lang].items(), key=lambda x: x[1])))[0])
self.lang_U[lang] = EmbeddingsAligned(type, self.path, lang, voc).vectors
# if self.lang_U[lang].shape[1] > dim != 0:
# print(f'unsupervised matrix has more dimensions ({self.lang_U[lang].shape[1]}) than'
# f' the allowed limit {dim}. Applying PCA(n_components={dim})')
# pca = PCA(n_components=dim)
# self.lang_U[lang] = pca.fit_transform(self.lang_U[lang])
print(f'Matrix U (weighted sum) of shape {self.lang_U[lang].shape}\n')
if max_label_space == 0:
print(f'Computing optimal number of PCA components along matrices U')
optimal_n = get_optimal_dim(self.lang_U, 'U')
self.lang_U = run_pca(optimal_n, self.lang_U)
elif max_label_space < nC:
self.lang_U = run_pca(max_label_space, self.lang_U)

return vocabulary, P
return

def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space, voc):
# if max_label_space == 0:
# print('Computing optimal number of PCA components along matrices S...')
# optimal_n = self.get_optimal_supervised_components(docs, labels)
# max_label_space = optimal_n
for lang in docs.keys(): # compute supervised matrices S - then apply PCA
nC = self.lang_S[lang].shape[1]
print(f'# [supervised-matrix] for {lang}')
self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space, voc[lang], lang)
print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')

if max_label_space == 0:
optimal_n = get_optimal_dim(self.lang_S, 'S')
self.lang_S = run_pca(optimal_n, self.lang_S)
elif max_label_space < nC:
self.lang_S = run_pca(max_label_space, self.lang_S)

return

def _concatenate_embeddings(self, docs):
_r = dict()
for lang in self.lang_U.keys():
_r[lang] = np.hstack((docs[lang].dot(self.lang_U[lang]), docs[lang].dot(self.lang_S[lang])))
return _r

def fit(self, config, docs, vocs, labels):
if config['unsupervised']:
self._add_embeddings_unsupervised(config['we_type'], docs, vocs, config['dim_reduction_unsupervised'])
if config['supervised']:
self._add_emebeddings_supervised(docs, labels, config['reduction'], config['max_label_space'], vocs)
return self


def WCE_matrix(Xtr, Ytr, lang):
print('\n# [supervised-matrix]')
S = get_supervised_embeddings(Xtr[lang], Ytr[lang], max_label_space=50)
print(f'[embedding matrix done] of shape={S.shape}\n')
return S
def predict(self, config, docs):
if config['supervised'] and config['unsupervised']:
return self._concatenate_embeddings(docs)
elif config['supervised']:
_r = dict()
for lang in docs.keys():
_r[lang] = docs[lang].dot(self.lang_S[lang])
else:
_r = dict()
for lang in docs.keys():
_r[lang] = docs[lang].dot(self.lang_U[lang])
return _r
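A hypothetical usage sketch (not part of the commit) of the new StorageEmbeddings container: docs, vocs and labels stand for the per-language dictionaries the classifier already builds (tf-idf matrices, term-to-index vocabularies, label matrices), and we_path for the embeddings directory passed on the command line:

embedding_space = StorageEmbeddings(we_path).fit(config, docs, vocs, labels)  # builds lang_U and/or lang_S per language
doc_vectors = embedding_space.predict(config, docs)  # {lang: documents-by-embedding-dimensions matrix}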
@@ -1,7 +1,7 @@
from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square
# from util.common import *
from sklearn.decomposition import PCA
import numpy as np
# from sklearn.decomposition import PCA
# from sklearn.manifold import TSNE


def zscores(x, axis=0): #scipy.stats.zscores does not avoid division by 0, which can indeed occur
@@ -41,12 +41,10 @@ def supervised_embeddings_tsr(X,Y, tsr_function=information_gain, max_documents=
return F


def get_supervised_embeddings(X, Y, max_label_space=300, binary_structural_problems=-1, method='dotn', dozscore=True):
print('computing supervised embeddings...')

def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, lang='None', binary_structural_problems=-1, method='dotn', dozscore=True):
if max_label_space != 0:
print('computing supervised embeddings...')
nC = Y.shape[1]
if nC==2 and binary_structural_problems > nC:
raise ValueError('not implemented in this branch')

if method=='ppmi':
F = supervised_embeddings_ppmi(X, Y)
|
@ -60,14 +58,41 @@ def get_supervised_embeddings(X, Y, max_label_space=300, binary_structural_probl
|
|||
if dozscore:
|
||||
F = zscores(F, axis=0)
|
||||
|
||||
if nC > max_label_space:
|
||||
print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
|
||||
f'Applying PCA(n_components={max_label_space})')
|
||||
pca = PCA(n_components=max_label_space)
|
||||
F = pca.fit(F).transform(F)
|
||||
# Dumping F-matrix for further studies
|
||||
dump_it = False
|
||||
if dump_it:
|
||||
with open(f'/home/andreapdr/funneling_pdr/src/dumps/WCE_{lang}.tsv', 'w') as outfile:
|
||||
np.savetxt(outfile, F, delimiter='\t')
|
||||
with open(f'/home/andreapdr/funneling_pdr/src/dumps/dict_WCE_{lang}.tsv', 'w') as outfile:
|
||||
for token in voc.keys():
|
||||
outfile.write(token+'\n')
|
||||
|
||||
return F
|
||||
|
||||
# if nC >= max_label_space:
|
||||
# if reduction == 'PCA':
|
||||
# if max_label_space == 0:
|
||||
# pca = PCA(n_components=Y.shape[1])
|
||||
# pca = pca.fit(F)
|
||||
# return pca.explained_variance_ratio_
|
||||
#
|
||||
# print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
|
||||
# f'Applying PCA(n_components={max_label_space})')
|
||||
# pca = PCA(n_components=max_label_space)
|
||||
# F = pca.fit_transform(F)
|
||||
# elif reduction == 'TSNE':
|
||||
# print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
|
||||
# f'Applying t-SNE(n_components={max_label_space})')
|
||||
# tsne = TSNE(n_components=max_label_space)
|
||||
# F = tsne.fit_transform(F)
|
||||
# elif reduction == 'tSVD':
|
||||
# print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
|
||||
# f'Applying truncatedSVD(n_components={max_label_space})')
|
||||
# tSVD = TruncatedSVD(n_components=max_label_space)
|
||||
# F = tSVD.fit_transform(F)
|
||||
#
|
||||
# return F
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
|
|
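For reference, a hypothetical call matching the new get_supervised_embeddings signature (X being one language's tf-idf matrix, Y its label matrix, voc its term-to-index dictionary; this call does not appear verbatim in the commit):

F = get_supervised_embeddings(X, Y, reduction='PCA', max_label_space=300, voc=voc, lang='en')
# F has one row per vocabulary term; X.dot(F) gives the WCE document representation used downstream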
@@ -11,6 +11,8 @@ import numpy as np
from sklearn.model_selection import train_test_split
from scipy.sparse import issparse
import itertools
from tqdm import tqdm
import re


class MultilingualDataset:
@@ -73,10 +75,14 @@ class MultilingualDataset:
return self.lXte(), self.lYte()

def lXtr(self):
return {lang:Xtr for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
return {lang: Xtr for (lang, ((Xtr, _, _), _)) in self.multiling_dataset.items() if
lang in self.langs()}
# return {lang:self.mask_numbers(Xtr) for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()}

def lXte(self):
return {lang:Xte for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()}
return {lang: Xte for (lang, (_, (Xte, _, _))) in self.multiling_dataset.items() if
lang in self.langs()}
# return {lang:self.mask_numbers(Xte) for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()}

def lYtr(self):
return {lang:self.cat_view(Ytr) for (lang, ((_,Ytr,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
@@ -129,6 +135,13 @@ class MultilingualDataset:
def set_labels(self, labels):
self.labels = labels

def mask_numbers(self, data, number_mask='numbermask'):
mask = re.compile(r'\b[0-9][0-9.,-]*\b')
masked = []
for text in tqdm(data, desc='masking numbers'):
masked.append(mask.sub(number_mask, text))
return masked


# ----------------------------------------------------------------------------------------------------------------------
# Helpers
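To illustrate the new (and for now unused, see the commented-out calls in lXtr/lXte above) mask_numbers helper, a small hypothetical example, with data being a loaded MultilingualDataset instance:

masked = data.mask_numbers(['Revenue grew 12.5% to 1,300 units in 2019'])
# masked == ['Revenue grew numbermask% to numbermask units in numbermask']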
@@ -1,15 +1,14 @@
import numpy as np
import time
from data.embeddings import WordEmbeddings, embedding_matrix, WCE_matrix
from data.embeddings import WordEmbeddings, StorageEmbeddings
from scipy.sparse import issparse
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from joblib import Parallel, delayed
from sklearn.feature_extraction.text import TfidfVectorizer

from data.supervised import zscores
from transformers.StandardizeTransformer import StandardizeTransformer
# from sklearn.decomposition import PCA


def _sort_if_sparse(X):
@@ -444,7 +443,8 @@ class AndreaCLF(FunnellingPolylingualClassifier):
first_tier_parameters=None,
meta_parameters=None,
folded_projections=1,
calmode='cal', n_jobs=-1):
calmode='cal',
n_jobs=-1):

super().__init__(first_tier_learner,
meta_learner,
@@ -454,13 +454,13 @@ class AndreaCLF(FunnellingPolylingualClassifier):
calmode,
n_jobs)

self.pca_independent_space = PCA(n_components=100)
self.we_path = we_path
self.config = config
self.lang_word2idx = dict()
self.languages = []
self.lang_tfidf = {}
self.word_embeddings = {}
self.supervised_embeddings = {}
self.embedding_space = None
self.model = None
self.time = None
@@ -479,9 +479,8 @@ class AndreaCLF(FunnellingPolylingualClassifier):
self.languages.append(lang)
tfidf_vectorizer.fit(lX[lang])
lX[lang] = tfidf_vectorizer.transform(lX[lang])
_sort_if_sparse(lX[lang])
self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
self.lang_tfidf[lang] = tfidf_vectorizer # utile in fase di testing
self.lang_tfidf[lang] = tfidf_vectorizer
return self

# @override std class method
@@ -494,45 +493,6 @@ class AndreaCLF(FunnellingPolylingualClassifier):

return lZ, lYtr

def embed(self, lX, ly, unsupervised=False, supervised=False, prediction=False):
"""
build embedding matrix for given language and returns its weighted sum wrt tf-idf score
"""
_r = dict()
languages = list(lX.keys())

if prediction:
for lang in languages:
if unsupervised: # If unsupervised embeddings ...
M = self.word_embeddings[lang]
if supervised: # and also unsupervised --> get both (M) and (S) weighted sum matrices and hstack them
S = self.supervised_embeddings[lang]
_r[lang] = np.hstack((lX[lang].dot(M), lX[lang].dot(S)))
continue
_r[lang] = lX[lang].dot(M) # if not supervised --> just get weighted sum of unsupervised (M) embeddings
else: # If not unsupervised --> get (S) matrix and its weighted sum
S = self.supervised_embeddings[lang]
_r[lang] = lX[lang].dot(S)
return _r

if unsupervised:
for lang in languages:
# print('Test building embedding matrix FastTextMuse ...')
_, M = embedding_matrix(self.we_path, self.lang_word2idx[lang], lang)
self.word_embeddings[lang] = M
_r[lang] = lX[lang].dot(M)

if supervised:
for lang in languages:
S = WCE_matrix(lX, ly, lang)
# S = np.squeeze(np.asarray(S)) # casting to ndarray to better visualize S while debugging
self.supervised_embeddings[lang] = S
if unsupervised:
_r[lang] = np.hstack((_r[lang], lX[lang].dot(S)))
else:
_r[lang] = lX[lang].dot(S)
return _r

# @override std class method
def fit(self, lX, ly):
tinit = time.time()
@@ -545,24 +505,22 @@ class AndreaCLF(FunnellingPolylingualClassifier):
Z, zy = self._get_zspace(lX, ly)

if self.config['supervised'] or self.config['unsupervised']:
# Z vectors is concatenated with doc's embedding weighted sum
Z_embedded = dict()
l_weighted_em = self.embed(lX, ly,
unsupervised=self.config['unsupervised'],
supervised=self.config['supervised'])

# stacking Z space horizontally with unsupervised (M) and/or supervised (F) embeddings
for lang in list(lX.keys()):
Z_embedded[lang] = np.hstack((Z[lang], l_weighted_em[lang]))
Z = Z_embedded

self.embedding_space = StorageEmbeddings(self.we_path).fit(self.config, lX, self.lang_word2idx, ly)
_embedding_space = self.embedding_space.predict(self.config, lX)
# h_stacking posterior probabilities with (U) and/or (S) matrices
for lang in self.languages:
Z[lang] = np.hstack((Z[lang], _embedding_space[lang]))

# stacking Z space vertically
_vertical_Z = np.vstack([Z[lang] for lang in self.languages])
_vertical_Zy = np.vstack([zy[lang] for lang in self.languages])

# todo testing ...
# self.pca_independent_space.fit(_vertical_Z)
# _vertical_Z = self.pca_independent_space.transform(_vertical_Z)

self.standardizer = StandardizeTransformer()
_vertical_Z = self.standardizer.fit_predict(_vertical_Z)
_vertical_Z = self.standardizer.fit_predict(_vertical_Z)

print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
@@ -577,17 +535,15 @@ class AndreaCLF(FunnellingPolylingualClassifier):
lZ = self._projection(self.doc_projector, lX)

if self.config['supervised'] or self.config['unsupervised']:
l_weighted_em = self.embed(lX, ly,
unsupervised=self.config['unsupervised'],
supervised=self.config['supervised'],
prediction=True)
Z_embedded = dict()
_embedding_space = self.embedding_space.predict(self.config, lX)

for lang in lX.keys():
Z_embedded[lang] = np.hstack((lZ[lang], l_weighted_em[lang]))
lZ = Z_embedded
lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang]))

for lang in lZ.keys():
print(lZ[lang].shape)
# todo testing
# lZ[lang] = self.pca_independent_space.transform(lZ[lang])
lZ[lang] = self.standardizer.predict(lZ[lang])

return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
@@ -1,7 +0,0 @@
id method learner embed optimp dataset binary languages time lang macrof1 microf1 macrok microk notes
jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_M PolyEmbed_andrea svm M False test_datasetname not_binary not_ablation 55.56810355186462 da 0.7933333333333333 0.0 0.7933333333333333 0.0 nope
jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_M PolyEmbed_andrea svm M False test_datasetname not_binary not_ablation 55.56810355186462 en 0.7866666666666666 0.0 0.7927111111111111 -0.0003376325207643527 nope
jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_M PolyEmbed_andrea svm M False test_datasetname not_binary not_ablation 55.56810355186462 fr 0.7866666666666666 0.0 0.7930666666666667 -0.0001350530083057411 nope
jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_None PolyEmbed_andrea svm None False test_datasetname not_binary not_ablation 24.031760931015015 da 0.7933333333333333 0.0 0.7933333333333333 0.0 nope
jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_None PolyEmbed_andrea svm None False test_datasetname not_binary not_ablation 24.031760931015015 en 0.7933333333333333 0.0 0.7931111111111111 -0.00013505300830574107 nope
jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_None PolyEmbed_andrea svm None False test_datasetname not_binary not_ablation 24.031760931015015 fr 0.7933333333333333 0.0 0.7933333333333333 0.0 nope
@@ -12,7 +12,7 @@ class StandardizeTransformer:
self.std = np.clip(std, 1e-5, None)
self.mean = np.mean(X, axis=self.axis)
self.yetfit=True
print('done')
print('done\n')
return self

def predict(self, X):
@@ -20,4 +20,4 @@ class StandardizeTransformer:
return (X - self.mean) / self.std

def fit_predict(self, X):
return self.fit(X).predict(X)
return self.fit(X).predict(X)
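A brief sketch of how the transformer is used by AndreaCLF (see the learners diff above; the variable names here are illustrative, not from the commit): it is fit once on the vertically stacked training Z-space and the same statistics are reused per language at prediction time.

standardizer = StandardizeTransformer()
Z_train = standardizer.fit_predict(Z_train)      # column-wise (X - mean) / std, std clipped at 1e-5
Z_test_lang = standardizer.predict(Z_test_lang)  # reuse the training mean/std on each language's test matrix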
@@ -0,0 +1,49 @@
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

def run_pca(dim, X):
"""
:param dim: number of pca components to keep
:param X: dictionary str(lang): matrix
:return: dict lang: reduced matrix
"""
r = dict()
pca = PCA(n_components=dim)
for lang in X.keys():
r[lang] = pca.fit_transform(X[lang])
return r


def get_optimal_dim(X, embed_type):
"""
:param X: dict str(lang) : csr_matrix of embeddings unsupervised or supervised
:param embed_type: (str) embedding matrix type: S or U (WCE supervised or U unsupervised MUSE/FASTTEXT)
:return:
"""
_idx = []

plt.figure(figsize=(15, 10))
if embed_type == 'U':
plt.title(f'Unsupervised Embeddings {"TODO"} Explained Variance')
else:
plt.title(f'WCE Explained Variance')
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)')

for lang in X.keys():
pca = PCA(n_components=X[lang].shape[1])
pca.fit(X[lang])
_r = pca.explained_variance_ratio_
_r = np.cumsum(_r)
plt.plot(_r, label=lang)
for i in range(len(_r) - 1, 1, -1):
delta = _r[i] - _r[i - 1]
if delta > 0:
_idx.append(i)
break
best_n = max(_idx)
plt.axvline(best_n, color='r', label='optimal N')
plt.legend()
plt.show()
return best_n
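A hypothetical usage sketch of the two new helpers (lang_S stands for the per-language supervised matrices kept in StorageEmbeddings; this snippet is not part of the commit):

best_n = get_optimal_dim(lang_S, 'S')  # plots cumulative explained variance and returns the largest component index at which it still grows
lang_S = run_pca(best_n, lang_S)       # reduces every language's matrix to best_n components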
@@ -5,7 +5,7 @@ import numpy as np
class PolylingualClassificationResults:
def __init__(self, file, autoflush=True, verbose=False):
self.file = file
self.columns = ['id', 'method', 'learner', 'embed', 'optimp', 'dataset', 'binary', 'languages', 'time', 'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes']
self.columns = ['id', 'method', 'learner', 'embed', 'embed_type', 'optimp', 'dataset', 'time', 'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes']
self.autoflush = autoflush
self.verbose = verbose
if os.path.exists(file):
@@ -20,8 +20,8 @@ class PolylingualClassificationResults:
def already_calculated(self, id):
return (self.df['id'] == id).any()

def add_row(self, id, method, learner, embed, optimp, dataset, binary, ablation_lang, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
s = pd.Series([id, method, learner, embed, optimp, dataset, binary, ablation_lang, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
def add_row(self, id, method, learner, embed, embed_type, optimp, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
s = pd.Series([id, method, learner, embed, embed_type, optimp, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
self.df = self.df.append(s, ignore_index=True)
if self.autoflush: self.flush()
self.tell(s.to_string())