also saving n_components when auto-optimizing it
removed some unnecessary columns from the results CSV
This commit is contained in:
parent
dd34a96f87
commit
8940c99102
@@ -47,9 +47,9 @@ parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
 parser.add_option("-l", dest="lang", type=str)
-parser.add_option("-a", dest="post_pca",
-                  help="If set to True, will apply PCA to the z-space (posterior probabilities stacked along with "
-                       "embedding space)", default=False)
+# parser.add_option("-a", dest="post_pca",
+#                   help="If set to True, will apply PCA to the z-space (posterior probabilities stacked along with "
+#                        "embedding space)", default=False)


 def get_learner(calibrate=False, kernel='linear'):
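Note that the option commented out above relied on optparse's default store action, so even `-a False` on the command line would assign the truthy string "False". If the flag is ever restored, the conventional boolean form looks like this (a sketch, not code from this repository):

from optparse import OptionParser

parser = OptionParser()
# boolean flag: True when -a is passed, False otherwise
parser.add_option("-a", dest="post_pca", action="store_true", default=False,
                  help="apply PCA to the z-space (posterior probabilities "
                       "stacked along with the embedding space)")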
@@ -118,11 +118,10 @@ if __name__ == '__main__':
               'we_type': op.we_type}
     _config_id = 'M_and_F'

     ##### TODO - config dict is redundant - we already have op from optparse ...
     config['reduction'] = 'PCA'
     config['max_label_space'] = op.max_labels_S
     config['dim_reduction_unsupervised'] = op.max_labels_U
-    config['post_pca'] = op.post_pca
+    # config['post_pca'] = op.post_pca
     # config['plot_covariance_matrices'] = True

     result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')
@@ -222,26 +222,40 @@ class StorageEmbeddings:
         elif max_label_space < nC:
             self.lang_U = run_pca(max_label_space, self.lang_U)

         return

     def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space, voc):
         for lang in docs.keys():  # compute supervised matrices S - then apply PCA
             print(f'# [supervised-matrix] for {lang}')
-            self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space, voc[lang], lang)
+            self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang],
+                                                          reduction, max_label_space, voc[lang], lang)
             nC = self.lang_S[lang].shape[1]
             print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')

         if max_label_space == 0:
             print(f'Computing optimal number of PCA components along matrices S')
             optimal_n = get_optimal_dim(self.lang_S, 'S')
             print(f'Applying PCA(n_components={optimal_n})')
             self.lang_S = run_pca(optimal_n, self.lang_S)
         elif max_label_space == -1:
             print(f'Computing PCA on vertically stacked WCE embeddings')
             languages = self.lang_S.keys()
             _temp_stack = np.vstack([self.lang_S[lang] for lang in languages])
-            stacked_pca = PCA(n_components=50)
+            stacked_pca = PCA(n_components=_temp_stack.shape[1])
             stacked_pca.fit(_temp_stack)
+            best_n = None
+            _r = stacked_pca.explained_variance_ratio_
+            _r = np.cumsum(_r)
+            plt.plot(_r, label='Stacked Supervised')
+            # scan backwards for the last component that still adds explained variance
+            for i in range(len(_r) - 1, 1, -1):
+                delta = _r[i] - _r[i - 1]
+                if delta > 0:
+                    best_n = i
+                    break
+            plt.show()
+            stacked_pca = PCA(n_components=best_n)
+            stacked_pca.fit(_temp_stack)
+            print(f'Applying PCA(n_components={best_n})')
             for lang in languages:
                 self.lang_S[lang] = stacked_pca.transform(self.lang_S[lang])
         elif max_label_space < nC:
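The max_label_space == -1 branch above picks n_components by scanning the cumulative explained-variance curve backwards for the last step that still adds variance. A minimal self-contained sketch of that selection rule (the function name and data are illustrative, not from this repository):

import numpy as np
from sklearn.decomposition import PCA

def last_informative_component(X):
    """Index of the last PCA component that still adds explained variance."""
    # PCA requires n_components <= min(n_samples, n_features), hence
    # min(X.shape) rather than X.shape[1] as used in the diff above.
    full = PCA(n_components=min(X.shape)).fit(X)
    cumulative = np.cumsum(full.explained_variance_ratio_)
    for i in range(len(cumulative) - 1, 1, -1):
        if cumulative[i] - cumulative[i - 1] > 0:
            return i
    return min(X.shape)

X = np.random.randn(200, 73)
print(last_informative_component(X))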
@@ -353,7 +353,7 @@ class AndreaCLF(FunnellingPolylingualClassifier):
         self.embedding_space = None
         self.model = None
         self.time = None
-        self.best_components = None  # if auto-optimizing PCA, it will store the optimal number of components
+        self.best_components = 'not set'  # if auto-optimizing PCA, it will store the optimal number of components

     def vectorize(self, lX, prediction=False):
         langs = list(lX.keys())
@@ -398,10 +398,11 @@ class AndreaCLF(FunnellingPolylingualClassifier):
         self.embedding_space = StorageEmbeddings(self.we_path).fit(self.config, lX, self.lang_word2idx, ly)
         _embedding_space = self.embedding_space.predict(self.config, lX)
         if self.config['max_label_space'] == 0:
-            if _embedding_space.shape[1] - 300 > 0:
-                _temp = _embedding_space.shape[1] - 300
+            _cum_dimension = _embedding_space[list(_embedding_space.keys())[0]].shape[1]
+            if _cum_dimension - 300 > 0:
+                _temp = _cum_dimension - 300
             else:
-                _temp = _embedding_space.shape[1]
+                _temp = _cum_dimension
             self.best_components = _temp
         # h_stacking posterior probabilities with (U) and/or (S) matrices
         for lang in self.languages:
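The new `_cum_dimension` indexes one language's matrix because `_embedding_space` is a {language: matrix} dict rather than a single array; subtracting 300 (presumably the width of the pretrained word embeddings) recovers how many supervised components survived the PCA. A hedged illustration of the bookkeeping, with the dict layout and the 300-dim assumption inferred from the diff:

import numpy as np

EMB_DIM = 300                                   # assumed embedding width
embedding_space = {'en': np.zeros((10, 347))}   # 300 embedding + 47 supervised dims
cum_dimension = embedding_space[list(embedding_space.keys())[0]].shape[1]
best_components = cum_dimension - EMB_DIM if cum_dimension > EMB_DIM else cum_dimension
print(best_components)  # -> 47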
@@ -415,10 +416,10 @@ class AndreaCLF(FunnellingPolylingualClassifier):
         _vertical_Z = self.standardizer.fit_predict(_vertical_Z)

         # todo testing ...
-        if self.config['post_pca']:
-            print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
-            self.pca_independent_space.fit(_vertical_Z)
-            _vertical_Z = self.pca_independent_space.transform(_vertical_Z)
+        # if self.config['post_pca']:
+        #     print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
+        #     self.pca_independent_space.fit(_vertical_Z)
+        #     _vertical_Z = self.pca_independent_space.transform(_vertical_Z)

         print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
         self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
@@ -442,9 +443,9 @@ class AndreaCLF(FunnellingPolylingualClassifier):
             print(lZ[lang].shape)
             # todo testing
             lZ[lang] = self.standardizer.predict(lZ[lang])
-            if self.config['post_pca']:
-                print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
-                lZ[lang] = self.pca_independent_space.transform(lZ[lang])
+            # if self.config['post_pca']:
+            #     print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
+            #     lZ[lang] = self.pca_independent_space.transform(lZ[lang])

         return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
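Both disabled post_pca blocks follow the usual fit-on-train / transform-on-test split: the PCA is fitted once on the stacked training Z-space (previous hunk) and only transform is called at prediction time (this hunk). A generic sketch of that pattern with scikit-learn; the shapes and the 0.99 variance threshold are illustrative, not values from this repository:

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

Z_train = np.random.randn(500, 84)
Z_test = np.random.randn(100, 84)

scaler = StandardScaler()
post_pca = PCA(n_components=0.99)  # keep enough components for 99% of the variance

Z_train = post_pca.fit_transform(scaler.fit_transform(Z_train))  # fit side
Z_test = post_pca.transform(scaler.transform(Z_test))            # predict side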
@@ -2,6 +2,7 @@ from sklearn.decomposition import PCA
 import numpy as np
+import matplotlib.pyplot as plt


 def run_pca(dim, X):
     """
     :param dim: number of pca components to keep
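The hunk cuts off run_pca's docstring and body. From its call sites above (run_pca(optimal_n, self.lang_S), where the second argument is a {language: matrix} dict), a plausible reconstruction is the following sketch; the actual implementation is not shown in this diff:

from sklearn.decomposition import PCA

def run_pca(dim, X):
    """
    :param dim: number of pca components to keep
    :param X: dict of {language: 2-d matrix}
    :return: the same dict with every matrix reduced to `dim` columns
    """
    return {lang: PCA(n_components=dim).fit_transform(M) for lang, M in X.items()}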