Plot variance explained by PCA for every language

andrea 2019-12-04 10:16:17 +01:00
parent f074fd97f9
commit ba1a72ff94
2 changed files with 20 additions and 40 deletions

View File

@@ -157,31 +157,6 @@ class FastTextWikiNews(Vectors):
         super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)
-# class EmbeddingsAligned(Vectors):
-#
-#     def __init__(self, type, path, lang):
-#
-#         self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec'
-#         # todo - rewrite as relative path
-#         self.cache_path = '/home/andreapdr/CLESA/embeddings' if type == 'MUSE' else '/home/andreapdr/CLESA/embeddings_polyFASTTEXT'
-#         self.path = path + self.name.format(lang)
-#         assert os.path.exists(path), f'pre-trained vectors not found in {path}'
-#         super(EmbeddingsAligned, self).__init__(self.path, cache=self.cache_path)
-#         # self.vectors = self.extract(voc)
-#
-#     def vocabulary(self):
-#         return set(self.stoi.keys())
-#
-#     def dim(self):
-#         return self.dim
-#
-#     def extract(self, words):
-#         source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.stoi)
-#         extraction = torch.zeros((len(words), self.dim))
-#         extraction[source_idx] = self.vectors[target_idx]
-#         return extraction
 class EmbeddingsAligned(Vectors):

     def __init__(self, type, path, lang, voc):
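
The deleted block was a commented-out duplicate of the EmbeddingsAligned class that survives below it. Its extract/reindex pattern is the interesting part: rows of a zero matrix are filled with pretrained vectors for in-vocabulary words, while out-of-vocabulary words keep an all-zero row. A minimal self-contained sketch of that pattern, with reindex written out as a hypothetical stand-in for PretrainedEmbeddings.reindex (modeled on the call shown above, not the repo's actual implementation):

import torch

def reindex(words, stoi):
    # Collect (position-in-words, row-in-pretrained-matrix) pairs for the
    # words that actually exist in the pretrained vocabulary `stoi`.
    source_idx, target_idx = [], []
    for i, w in enumerate(words):
        if w in stoi:
            source_idx.append(i)
            target_idx.append(stoi[w])
    return source_idx, target_idx

def extract(words, stoi, vectors):
    # Build a (len(words) x dim) matrix: known words receive their pretrained
    # vector, unknown words are left as zero rows.
    extraction = torch.zeros((len(words), vectors.shape[1]))
    source_idx, target_idx = reindex(words, stoi)
    extraction[source_idx] = vectors[target_idx]
    return extraction
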
@@ -283,18 +258,26 @@ class StorageEmbeddings:
         return _r

     def get_optimal_supervised_components(self, docs, labels):
+        import matplotlib.pyplot as plt
         _idx = []
+        plt.figure(figsize=(15, 10))
+        plt.title(f'WCE Explained Variance')
+        plt.xlabel('Number of Components')
+        plt.ylabel('Variance (%)')
         for lang in docs.keys():
             _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space='optimal').tolist()
+            plt.plot(np.cumsum(_r), label=lang)
             for i in range(len(_r)-1, 1, -1):
                 # todo: if n_components (therefore #n labels) is not big enough, every value will be smaller than the next one ...
-                ratio = _r[i]
-                next_ratio = _r[i-1]
-                delta = _r[i] - _r[i-1]
+                delta = _r[i-1] - _r[i]
                 if delta > 0:
-                # if ratio < next_ratio:
                     _idx.append(i)
                     break
         best_n = int(sum(_idx)/len(_idx))
+        plt.vlines(best_n, 0, 1, colors='r', label='optimal N')
+        plt.legend()
+        plt.show()
         return best_n
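
The rewritten loop walks each language's explained-variance ratios from the tail toward the head and records the first index where the ratio still strictly drops (delta > 0), i.e. the point where the trailing flat, near-zero components end; those per-language indices are then averaged into a single best_n. A standalone sketch of the heuristic, with synthetic ratios standing in for the real per-language PCA output:

def optimal_components(ratios_per_lang):
    # ratios_per_lang: dict lang -> explained-variance ratios, sorted in
    # decreasing order as sklearn's PCA returns them.
    idx = []
    for lang, r in ratios_per_lang.items():
        # Walk from the tail: the first i with r[i-1] > r[i] marks the last
        # component that still adds variance before the curve flattens.
        for i in range(len(r) - 1, 1, -1):
            if r[i - 1] - r[i] > 0:
                idx.append(i)
                break
    return int(sum(idx) / len(idx))

ratios = {'en': [0.5, 0.3, 0.1, 0.05, 0.05], 'it': [0.6, 0.2, 0.1, 0.1]}
print(optimal_components(ratios))  # -> 2
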

View File

@@ -75,18 +75,15 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, lang='None',
               f'Applying PCA(n_components={max_label_space})')
         pca = PCA(n_components=max_label_space)
         pca = pca.fit(F)
         ########################################################
-        import matplotlib.pyplot as plt
-        plt.figure()
-        plt.plot(np.cumsum(pca.explained_variance_ratio_))
-        plt.xlabel('Number of Components')
-        plt.ylabel('Variance (%)')
-        plt.title(f'WCE Explained Variance {lang}')
-        plt.show()
+        # import matplotlib.pyplot as plt
+        # plt.figure()
+        # plt.plot(np.cumsum(pca.explained_variance_ratio_))
+        # plt.xlabel('Number of Components')
+        # plt.ylabel('Variance (%)')
+        # plt.title(f'WCE Explained Variance {lang}')
+        # plt.show()
         ########################################################
         F = pca.fit_transform(F)
     elif reduction == 'TSNE':
         print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
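
The block commented out here was the earlier, per-call version of the same diagnostic that the commit moves into get_optimal_supervised_components. As a standalone snippet, the cumulative explained-variance plot it produced looks like this (random data stands in for the supervised matrix F):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

F = np.random.rand(200, 50)   # stand-in for the supervised label matrix F
pca = PCA(n_components=20).fit(F)

plt.figure()
plt.plot(np.cumsum(pca.explained_variance_ratio_))  # cumulative variance curve
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)')
plt.title('WCE Explained Variance')
plt.show()
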