get_optimal_supervised_components method - to be polished
This commit is contained in:
parent
4de6b3e250
commit
f074fd97f9
@@ -103,8 +103,8 @@ if __name__ == '__main__':
 
     _config_id = 'M_and_F'
 
     ##### TODO - config dict is redundant - we have already op argparse ...
-    config['reduction'] = 'tSVD'
-    config['max_label_space'] = 50
+    config['reduction'] = 'PCA'
+    config['max_label_space'] = 'optimal'
 
     result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')
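The two changed settings work together: 'PCA' selects the reduction branch that can report per-component explained-variance ratios, and the 'optimal' sentinel asks the embedding storage to estimate the number of components instead of using a fixed cap of 50. A minimal sketch of how the values flow (names from this commit; the surrounding argparse/config handling is assumed):

    config['reduction'] = 'PCA'             # use the PCA branch of get_supervised_embeddings
    config['max_label_space'] = 'optimal'   # sentinel: estimate n_components from explained variance
    # StorageEmbeddings._add_emebeddings_supervised(docs, labels,
    #     config['reduction'], config['max_label_space'])
    # later replaces 'optimal' with the estimated component count.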
@@ -5,7 +5,6 @@ from torchtext.vocab import Vectors
 import torch
 from abc import ABC, abstractmethod
 from data.supervised import get_supervised_embeddings
-from sklearn.decomposition import PCA
 
 
 class PretrainedEmbeddings(ABC):
@@ -244,10 +243,16 @@ class StorageEmbeddings:
         return
 
     def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space):
+        _optimal = dict()
+        # TODO testing optimal max_label_space
+        if max_label_space == 'optimal':
+            print('Computing optimal number of PCA components ...')
+            optimal_n = self.get_optimal_supervised_components(docs, labels)
+            max_label_space = optimal_n
+
         for lang in docs.keys():
             print(f'# [supervised-matrix] for {lang}')
-            # should also pass max_label_space and reduction techniques
-            self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space)
+            self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space, lang)
             print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')
         return
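The pattern introduced here is: estimate the component count once over all languages, then reuse it for every per-language reduction. A self-contained sketch of that pattern with stand-in data (sklearn PCA in place of the project's get_supervised_embeddings):

    import numpy as np
    from sklearn.decomposition import PCA

    docs = {'en': np.random.rand(100, 40), 'it': np.random.rand(120, 40)}  # stand-in matrices
    n_components = 10  # stand-in for get_optimal_supervised_components(docs, labels)

    lang_S = {}
    for lang, F in docs.items():
        lang_S[lang] = PCA(n_components=n_components).fit_transform(F)  # same count for every language
        print(f'# [supervised-matrix] for {lang}: shape={lang_S[lang].shape}')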
@@ -277,22 +282,19 @@ class StorageEmbeddings:
             _r[lang] = docs[lang].dot(self.lang_U[lang])
         return _r
 
+    def get_optimal_supervised_components(self, docs, labels):
+        _idx = []
+        for lang in docs.keys():
+            _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space='optimal').tolist()
+
+            for i in range(len(_r)-1, 1, -1):
+                # todo: if n_components (therfore #n labels) is not big enough every value will be smaller than the next one ...
+                ratio = _r[i]
+                next_ratio = _r[i-1]
+                delta = _r[i] - _r[i-1]
+                if delta > 0:
+                    # if ratio < next_ratio:
+                    _idx.append(i)
+                    break
+        best_n = int(sum(_idx)/len(_idx))
+        return best_n
 
-# def embedding_matrix(type, path, voc, lang):
-#     vocabulary = np.asarray(list(zip(*sorted(voc.items(), key=lambda x: x[1])))[0])
-#
-#     print('[embedding matrix]')
-#     print(f'# [pretrained-matrix: {type} {lang}]')
-#     pretrained = EmbeddingsAligned(type, path, lang)
-#     P = pretrained.extract(vocabulary).numpy()
-#     del pretrained
-#     print(f'[embedding matrix done] of shape={P.shape}\n')
-#
-#     return vocabulary, P
-
-def WCE_matrix(Xtr, Ytr, lang, reduction=None, n_components=50):
-    print('\n# [supervised-matrix]')
-    S = get_supervised_embeddings(Xtr[lang], Ytr[lang])
-    print(f'[embedding matrix done] of shape={S.shape}\n')
-    return S
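The heuristic above walks explained_variance_ratio_ from the tail looking for an inversion (delta > 0) and averages the per-language break indices; as the in-code TODO notes, the ratios are non-increasing by construction, so the inversion may never fire. A more conventional rule, shown here only as a hedged alternative (the 0.95 threshold is illustrative, not from this commit), keeps the smallest count whose cumulative ratio crosses a threshold:

    import numpy as np
    from sklearn.decomposition import PCA

    def optimal_components(F, threshold=0.95):
        # Fit a full PCA, then keep the smallest n whose cumulative
        # explained-variance ratio reaches the threshold.
        evr = PCA(n_components=min(F.shape)).fit(F).explained_variance_ratio_
        return int(np.argmax(np.cumsum(evr) >= threshold)) + 1

sklearn also accepts a float for the same effect: PCA(n_components=0.95) keeps exactly enough components to explain 95% of the variance.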
@@ -40,8 +40,12 @@ def supervised_embeddings_tsr(X,Y, tsr_function=information_gain, max_documents=
     return F
 
 
-def get_supervised_embeddings(X, Y, reduction, max_label_space=300, binary_structural_problems=-1, method='dotn', dozscore=True):
-    print('computing supervised embeddings...')
+def get_supervised_embeddings(X, Y, reduction, max_label_space=300, lang='None', binary_structural_problems=-1, method='dotn', dozscore=True):
+    if max_label_space == 'optimal':
+        max_label_space = 0
+
+    if max_label_space != 0:
+        print('computing supervised embeddings...')
+
     nC = Y.shape[1]
     if nC==2 and binary_structural_problems > nC:
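With this change the function is dual-mode: the 'optimal' sentinel is normalized to 0, which downstream (next hunk) short-circuits the PCA branch into returning the explained-variance ratios instead of a reduced matrix. A hypothetical caller with stand-in data, illustrating both modes:

    import numpy as np
    from data.supervised import get_supervised_embeddings  # module changed in this diff

    X = np.random.rand(200, 1000)              # tf-idf-like document matrix (stand-in)
    Y = (np.random.rand(200, 73) > 0.9) * 1    # multi-label indicator matrix (stand-in)

    # sentinel mode: 1-D array of PCA explained-variance ratios, no reduction applied
    evr = get_supervised_embeddings(X, Y, reduction='PCA', max_label_space='optimal')

    # normal mode: supervised matrix reduced to 50 components
    F = get_supervised_embeddings(X, Y, reduction='PCA', max_label_space=50, lang='en')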
@@ -60,21 +64,40 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, binary_struc
         F = zscores(F, axis=0)
 
     if nC > max_label_space:
+        # TODO testing optimal max_label_space
         if reduction == 'PCA':
+            if max_label_space == 0:
+                pca = PCA(n_components=Y.shape[1])
+                pca = pca.fit(F)
+                return pca.explained_variance_ratio_
+
             print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
                   f'Applying PCA(n_components={max_label_space})')
             pca = PCA(n_components=max_label_space)
-            F = pca.fit(F).transform(F)
+            pca = pca.fit(F)
+
+            ########################################################
+            import matplotlib.pyplot as plt
+
+            plt.figure()
+            plt.plot(np.cumsum(pca.explained_variance_ratio_))
+            plt.xlabel('Number of Components')
+            plt.ylabel('Variance (%)') #
+            plt.title(f'WCE Explained Variance {lang}')
+            plt.show()
+            ########################################################
+
+            F = pca.fit_transform(F)
         elif reduction == 'TSNE':
             print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
                   f'Applying t-SNE(n_components={max_label_space})')
             tsne = TSNE(n_components=max_label_space)
-            F = tsne.fit(F).fit_transform(F)
+            F = tsne.fit_transform(F)
         elif reduction == 'tSVD':
             print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
                   f'Applying truncatedSVD(n_components={max_label_space})')
             tSVD = TruncatedSVD(n_components=max_label_space)
-            F = tSVD.fit(F).fit_transform(F)
+            F = tSVD.fit_transform(F)
 
     return F
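One detail the commit title already flags for polishing: the PCA branch fits twice, once with pca.fit(F) to drive the diagnostic variance plot and again with pca.fit_transform(F) to produce the output, repeating the decomposition. A single-fit equivalent, assuming the same variables as the hunk above, would be:

    pca = PCA(n_components=max_label_space).fit(F)      # fit once
    # plot np.cumsum(pca.explained_variance_ratio_) as in the hunk above
    F = pca.transform(F)                                # reuse the fitted model

The TSNE change in the same hunk goes the other way for a different reason: scikit-learn's TSNE has no standalone transform, so fit_transform is the only correct call there.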
@@ -1,6 +1,6 @@
 import numpy as np
 import time
-from data.embeddings import WordEmbeddings, WCE_matrix, StorageEmbeddings
+from data.embeddings import WordEmbeddings, StorageEmbeddings
 from scipy.sparse import issparse
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.model_selection import GridSearchCV
@@ -493,43 +493,6 @@ class AndreaCLF(FunnellingPolylingualClassifier):
 
         return lZ, lYtr
 
-    # def embed(self, lX, ly, unsupervised=False, supervised=False, prediction=False):
-    #     """
-    #     build embedding matrix for given language and returns its weighted sum wrt tf-idf score
-    #     """
-    #     _r = dict()
-    #     languages = list(lX.keys())
-    #
-    #     if prediction:
-    #         for lang in languages:
-    #             if unsupervised:    # If unsupervised embeddings ...
-    #                 M = self.word_embeddings[lang]
-    #                 if supervised:  # and also unsupervised --> get both (M) and (S) weighted sum matrices and hstack them
-    #                     S = self.supervised_embeddings[lang]
-    #                     _r[lang] = np.hstack((lX[lang].dot(M), lX[lang].dot(S)))
-    #                     continue
-    #                 _r[lang] = lX[lang].dot(M)  # if not supervised --> just get weighted sum of unsupervised (M) embeddings
-    #             else:               # If not unsupervised --> get (S) matrix and its weighted sum
-    #                 S = self.supervised_embeddings[lang]
-    #                 _r[lang] = lX[lang].dot(S)
-    #         return _r
-    #
-    #     if unsupervised:
-    #         for lang in languages:
-    #             _, M = embedding_matrix(self.config['we_type'], self.we_path, self.lang_word2idx[lang], lang)
-    #             self.word_embeddings[lang] = M
-    #             _r[lang] = lX[lang].dot(M)
-    #
-    #     if supervised:
-    #         for lang in languages:
-    #             S = WCE_matrix(lX, ly, lang)
-    #             self.supervised_embeddings[lang] = S
-    #             if unsupervised:
-    #                 _r[lang] = np.hstack((_r[lang], lX[lang].dot(S)))
-    #             else:
-    #                 _r[lang] = lX[lang].dot(S)
-    #     return _r
-
     # @override std class method
     def fit(self, lX, ly):
         tinit = time.time()