From 8940c99102ed65f3a5b8a6fe203136e19e48f7ec Mon Sep 17 00:00:00 2001
From: andrea
Date: Thu, 12 Dec 2019 14:33:41 +0100
Subject: [PATCH] also saving n_components if auto-optimizing it; removed
 some unnecessary columns from result csv

---
 src/FPEC_andrea.py         |  9 ++++-----
 src/data/embeddings.py     | 20 +++++++++++++++++---
 src/learning/learners.py   | 23 ++++++++++++-----------
 src/util/decompositions.py |  1 +
 4 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/src/FPEC_andrea.py b/src/FPEC_andrea.py
index 16934df..0ed414e 100644
--- a/src/FPEC_andrea.py
+++ b/src/FPEC_andrea.py
@@ -47,9 +47,9 @@ parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
 
 parser.add_option("-l", dest="lang", type=str)
 
-parser.add_option("-a", dest="post_pca",
-                  help="If set to True, will apply PCA to the z-space (posterior probabilities stacked along with "
-                       "embedding space", default=False)
+# parser.add_option("-a", dest="post_pca",
+#                   help="If set to True, will apply PCA to the z-space (posterior probabilities stacked along with "
+#                        "embedding space", default=False)
 
 
 def get_learner(calibrate=False, kernel='linear'):
@@ -118,11 +118,10 @@ if __name__ == '__main__':
               'we_type': op.we_type}
 
     _config_id = 'M_and_F'
-    ##### TODO - config dict is redundant - we have already op argparse ...
     config['reduction'] = 'PCA'
     config['max_label_space'] = op.max_labels_S
     config['dim_reduction_unsupervised'] = op.max_labels_U
-    config['post_pca'] = op.post_pca
+    # config['post_pca'] = op.post_pca
     # config['plot_covariance_matrices'] = True
 
     result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')

diff --git a/src/data/embeddings.py b/src/data/embeddings.py
index fb1f135..4b19b4a 100644
--- a/src/data/embeddings.py
+++ b/src/data/embeddings.py
@@ -222,26 +222,40 @@ class StorageEmbeddings:
         elif max_label_space < nC:
             self.lang_U = run_pca(max_label_space, self.lang_U)
-        return
 
     def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space, voc):
         for lang in docs.keys():  # compute supervised matrices S - then apply PCA
             print(f'# [supervised-matrix] for {lang}')
-            self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space, voc[lang], lang)
+            self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang],
+                                                          reduction, max_label_space, voc[lang], lang)
             nC = self.lang_S[lang].shape[1]
             print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')
 
         if max_label_space == 0:
             print(f'Computing optimal number of PCA components along matrices S')
             optimal_n = get_optimal_dim(self.lang_S, 'S')
+            print(f'Applying PCA(n_components={optimal_n})')
             self.lang_S = run_pca(optimal_n, self.lang_S)
         elif max_label_space == -1:
             print(f'Computing PCA on vertical stacked WCE embeddings')
             languages = self.lang_S.keys()
             _temp_stack = np.vstack([self.lang_S[lang] for lang in languages])
-            stacked_pca = PCA(n_components=50)
+            stacked_pca = PCA(n_components=_temp_stack.shape[1])
             stacked_pca.fit(_temp_stack)
+            best_n = None
+            _r = stacked_pca.explained_variance_ratio_
+            _r = np.cumsum(_r)
+            plt.plot(_r, label='Stacked Supervised')
+            for i in range(len(_r) - 1, 1, -1):
+                delta = _r[i] - _r[i - 1]
+                if delta > 0:
+                    best_n = i
+                    break
+            plt.show()
+            stacked_pca = PCA(n_components=best_n)
+            stacked_pca.fit(_temp_stack)
+            print(f'Applying PCA(n_components={best_n})')
             for lang in languages:
                 self.lang_S[lang] = stacked_pca.transform(self.lang_S[lang])
         elif max_label_space < nC:
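Note on the component-selection heuristic added to embeddings.py above: it fits a full-rank PCA on the vertically stacked WCE matrix, takes the cumulative explained-variance ratio, and scans backwards for the last component that still adds variance (i.e. the effective rank of the stack), then refits PCA with that many components. A minimal standalone sketch of the same heuristic follows; the function name choose_n_components is illustrative and not part of the patch, and it assumes the input matrix has at least as many rows as columns.

import numpy as np
from sklearn.decomposition import PCA

def choose_n_components(X):
    # Fit a full-rank PCA (assumes X has at least as many rows as columns).
    pca = PCA(n_components=X.shape[1])
    pca.fit(X)
    # Cumulative fraction of variance explained by the first i components.
    ratio = np.cumsum(pca.explained_variance_ratio_)
    # Walk backwards: the last index where the curve still rises marks the
    # point beyond which additional components explain nothing more.
    best_n = X.shape[1]
    for i in range(len(ratio) - 1, 1, -1):
        if ratio[i] - ratio[i - 1] > 0:
            best_n = i
            break
    return best_n

With such a helper, the two-stage fit in the hunk would reduce to stacked_pca = PCA(n_components=choose_n_components(_temp_stack)).fit(_temp_stack).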
diff --git a/src/learning/learners.py b/src/learning/learners.py
index 1d119e3..5d3f7fa 100644
--- a/src/learning/learners.py
+++ b/src/learning/learners.py
@@ -353,7 +353,7 @@ class AndreaCLF(FunnellingPolylingualClassifier):
         self.embedding_space = None
         self.model = None
         self.time = None
-        self.best_components = None
+        self.best_components = None  # if auto-optimizing PCA, this stores the optimal number of components
 
     def vectorize(self, lX, prediction=False):
         langs = list(lX.keys())
@@ -398,10 +398,11 @@
         self.embedding_space = StorageEmbeddings(self.we_path).fit(self.config, lX, self.lang_word2idx, ly)
         _embedding_space = self.embedding_space.predict(self.config, lX)
         if self.config['max_label_space'] == 0:
-            if _embedding_space.shape[1] - 300 > 0:
-                _temp = _embedding_space.shape[1] - 300
+            _cum_dimension = _embedding_space[list(_embedding_space.keys())[0]].shape[1]
+            if _cum_dimension - 300 > 0:
+                _temp = _cum_dimension - 300
             else:
-                _temp = _embedding_space.shape[1]
+                _temp = _cum_dimension
             self.best_components = _temp
         # h_stacking posterior probabilities with (U) and/or (S) matrices
         for lang in self.languages:
@@ -415,10 +416,10 @@
         _vertical_Z = self.standardizer.fit_predict(_vertical_Z)
 
         # todo testing ...
-        if self.config['post_pca']:
-            print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
-            self.pca_independent_space.fit(_vertical_Z)
-            _vertical_Z = self.pca_independent_space.transform(_vertical_Z)
+        # if self.config['post_pca']:
+        #     print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
+        #     self.pca_independent_space.fit(_vertical_Z)
+        #     _vertical_Z = self.pca_independent_space.transform(_vertical_Z)
 
         print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
         self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
@@ -442,9 +443,9 @@
             print(lZ[lang].shape)
             # todo testing
             lZ[lang] = self.standardizer.predict(lZ[lang])
-            if self.config['post_pca']:
-                print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
-                lZ[lang] = self.pca_independent_space.transform(lZ[lang])
+            # if self.config['post_pca']:
+            #     print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
+            #     lZ[lang] = self.pca_independent_space.transform(lZ[lang])
 
         return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)

diff --git a/src/util/decompositions.py b/src/util/decompositions.py
index 9029b33..7b50ffc 100644
--- a/src/util/decompositions.py
+++ b/src/util/decompositions.py
@@ -2,6 +2,7 @@ from sklearn.decomposition import PCA
 import numpy as np
 import matplotlib.pyplot as plt
 
+
 def run_pca(dim, X):
     """
     :param dim: number of pca components to keep
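For reference, run_pca (whose body is cut off above) is called throughout the patch as run_pca(dim, lang_matrices) with a {language: matrix} dict, e.g. self.lang_U = run_pca(max_label_space, self.lang_U) and self.lang_S = run_pca(optimal_n, self.lang_S). A minimal sketch consistent with that call signature is below; it is an assumption about the helper's behavior inferred from its call sites, not the actual body of src/util/decompositions.py.

from sklearn.decomposition import PCA

def run_pca(dim, X):
    """
    :param dim: number of pca components to keep
    :param X: dict {language: matrix of shape (n_rows, n_dims)}
    :return: dict with each matrix reduced to dim columns
    """
    # Fit and apply an independent PCA per language matrix.
    return {lang: PCA(n_components=dim).fit_transform(mat) for lang, mat in X.items()}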