Also saving n_components when auto-optimizing it

Removed some unnecessary columns from the result CSV
This commit is contained in:
andrea 2019-12-12 14:33:41 +01:00
parent dd34a96f87
commit 8940c99102
4 changed files with 34 additions and 19 deletions

View File

@@ -47,9 +47,9 @@ parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
 parser.add_option("-l", dest="lang", type=str)
-parser.add_option("-a", dest="post_pca",
-                  help="If set to True, will apply PCA to the z-space (posterior probabilities stacked along with "
-                       "embedding space", default=False)
+# parser.add_option("-a", dest="post_pca",
+#                   help="If set to True, will apply PCA to the z-space (posterior probabilities stacked along with "
+#                        "embedding space", default=False)
 
 def get_learner(calibrate=False, kernel='linear'):
@@ -118,11 +118,10 @@ if __name__ == '__main__':
               'we_type': op.we_type}

    _config_id = 'M_and_F'
-   ##### TODO - config dict is redundant - we have already op argparse ...
    config['reduction'] = 'PCA'
    config['max_label_space'] = op.max_labels_S
    config['dim_reduction_unsupervised'] = op.max_labels_U
-   config['post_pca'] = op.post_pca
+   # config['post_pca'] = op.post_pca
    # config['plot_covariance_matrices'] = True

    result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')

View File

@@ -222,26 +222,40 @@ class StorageEmbeddings:
         elif max_label_space < nC:
             self.lang_U = run_pca(max_label_space, self.lang_U)
         return
 
     def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space, voc):
         for lang in docs.keys():  # compute supervised matrices S - then apply PCA
             print(f'# [supervised-matrix] for {lang}')
-            self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space, voc[lang], lang)
+            self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang],
+                                                          reduction, max_label_space, voc[lang], lang)
             nC = self.lang_S[lang].shape[1]
             print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')
 
         if max_label_space == 0:
             print('Computing optimal number of PCA components along matrices S')
             optimal_n = get_optimal_dim(self.lang_S, 'S')
+            print(f'Applying PCA(n_components={optimal_n})')
             self.lang_S = run_pca(optimal_n, self.lang_S)
         elif max_label_space == -1:
             print('Computing PCA on vertically stacked WCE embeddings')
             languages = self.lang_S.keys()
             _temp_stack = np.vstack([self.lang_S[lang] for lang in languages])
-            stacked_pca = PCA(n_components=50)
+            stacked_pca = PCA(n_components=_temp_stack.shape[1])
             stacked_pca.fit(_temp_stack)
+            best_n = None
+            _r = np.cumsum(stacked_pca.explained_variance_ratio_)
+            plt.plot(_r, label='Stacked Supervised')
+            for i in range(len(_r) - 1, 1, -1):
+                delta = _r[i] - _r[i - 1]
+                if delta > 0:
+                    best_n = i
+                    break
+            plt.show()
+            stacked_pca = PCA(n_components=best_n)
+            stacked_pca.fit(_temp_stack)
+            print(f'Applying PCA(n_components={best_n})')
             for lang in languages:
                 self.lang_S[lang] = stacked_pca.transform(self.lang_S[lang])
         elif max_label_space < nC:
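For context, the selection logic added above is a cumulative explained-variance elbow heuristic: fit a full-rank PCA once, accumulate explained_variance_ratio_, and keep the last component that still adds variance. A minimal self-contained sketch of the same idea (the function name and the random demo data are illustrative, not part of the commit):

import numpy as np
from sklearn.decomposition import PCA

def last_informative_component(X):
    # full-rank fit; assumes n_samples >= n_features, as the diff does
    pca = PCA(n_components=X.shape[1]).fit(X)
    cumulative = np.cumsum(pca.explained_variance_ratio_)
    # walk backwards: the first strictly positive increment from the end
    # marks the last component that contributes any variance
    for i in range(len(cumulative) - 1, 1, -1):
        if cumulative[i] - cumulative[i - 1] > 0:
            return i
    return None

X = np.random.rand(500, 73)              # stand-in for the stacked WCE matrix
best_n = last_informative_component(X)
X_reduced = PCA(n_components=best_n).fit_transform(X)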

View File

@@ -353,7 +353,7 @@ class AndreaCLF(FunnellingPolylingualClassifier):
         self.embedding_space = None
         self.model = None
         self.time = None
-        self.best_components = None  # if auto-optimizing PCA, stores the optimal number of components
+        self.best_components = 'not set'  # if auto-optimizing PCA, stores the optimal number of components
 
     def vectorize(self, lX, prediction=False):
         langs = list(lX.keys())
@@ -398,10 +398,11 @@ class AndreaCLF(FunnellingPolylingualClassifier):
         self.embedding_space = StorageEmbeddings(self.we_path).fit(self.config, lX, self.lang_word2idx, ly)
         _embedding_space = self.embedding_space.predict(self.config, lX)
         if self.config['max_label_space'] == 0:
-            if _embedding_space.shape[1] - 300 > 0:
-                _temp = _embedding_space.shape[1] - 300
+            _cum_dimension = _embedding_space[list(_embedding_space.keys())[0]].shape[1]
+            if _cum_dimension - 300 > 0:
+                _temp = _cum_dimension - 300
             else:
-                _temp = _embedding_space.shape[1]
+                _temp = _cum_dimension
             self.best_components = _temp
         # h_stacking posterior probabilities with (U) and/or (S) matrices
         for lang in self.languages:
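Note that `_embedding_space` is a per-language dict, so the old code's `.shape` on the dict itself would have raised an AttributeError; the new `_cum_dimension` reads the width from one language's matrix. The subtraction assumes each matrix horizontally stacks a 300-dimensional word-embedding block with the PCA-reduced supervised block. A hedged sketch of that bookkeeping (the dict layout and the 300 come from the diff; the helper name is illustrative):

import numpy as np

def recover_n_components(embedding_space, we_dim=300):
    # embedding_space: {lang: matrix of shape (n_docs, we_dim + n_components)}
    first_lang = list(embedding_space.keys())[0]
    total = embedding_space[first_lang].shape[1]   # _cum_dimension in the diff
    # strip the word-embedding block if present; otherwise the whole
    # width belongs to the supervised block
    return total - we_dim if total - we_dim > 0 else total

demo = {'en': np.zeros((8, 300 + 42)), 'it': np.zeros((8, 300 + 42))}
assert recover_n_components(demo) == 42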
@@ -415,10 +416,10 @@ class AndreaCLF(FunnellingPolylingualClassifier):
         _vertical_Z = self.standardizer.fit_predict(_vertical_Z)
 
         # todo testing ...
-        if self.config['post_pca']:
-            print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
-            self.pca_independent_space.fit(_vertical_Z)
-            _vertical_Z = self.pca_independent_space.transform(_vertical_Z)
+        # if self.config['post_pca']:
+        #     print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
+        #     self.pca_independent_space.fit(_vertical_Z)
+        #     _vertical_Z = self.pca_independent_space.transform(_vertical_Z)
 
         print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
         self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
@@ -442,9 +443,9 @@ class AndreaCLF(FunnellingPolylingualClassifier):
             print(lZ[lang].shape)
             # todo testing
             lZ[lang] = self.standardizer.predict(lZ[lang])
-            if self.config['post_pca']:
-                print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
-                lZ[lang] = self.pca_independent_space.transform(lZ[lang])
+            # if self.config['post_pca']:
+            #     print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
+            #     lZ[lang] = self.pca_independent_space.transform(lZ[lang])
 
         return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
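For reference, the post_pca path disabled in the two hunks above followed the usual fit-at-train, transform-at-predict pattern on the Z-space; its target dimensionality was still an open TODO in the code. A sketch of that pattern under those assumptions (the 100-component choice is a placeholder, not from the commit):

import numpy as np
from sklearn.decomposition import PCA

pca_independent_space = PCA(n_components=100)  # placeholder dim; the commit left it as a TODO

def fit_z(vertical_Z):
    # fit on the vertically stacked training Z-space, then project it
    return pca_independent_space.fit_transform(vertical_Z)

def project_z(lZ):
    # reuse the same fitted projection per language at prediction time
    return {lang: pca_independent_space.transform(Z) for lang, Z in lZ.items()}

Z_train_reduced = fit_z(np.random.rand(400, 320))
lZ_reduced = project_z({'en': np.random.rand(50, 320)})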

View File

@@ -2,6 +2,7 @@ from sklearn.decomposition import PCA
 import numpy as np
 import matplotlib.pyplot as plt
 
 
 def run_pca(dim, X):
     """
     :param dim: number of pca components to keep