refactored pca methods

2019-12-09 15:39:39 +01:00 · 2019-12-09 15:39:39 +01:00 · 0c6056e7a1
parent 9fa1899a7f
commit 0c6056e7a1
3 changed files with 8 additions and 67 deletions
--- a/src/data/embeddings.py
+++ b/src/data/embeddings.py
@ -1,14 +1,12 @@
 import os
 import pickle
-import numpy as np
 from torchtext.vocab import Vectors
 import torch
 from abc import ABC, abstractmethod
 from data.supervised import get_supervised_embeddings
-import matplotlib.pyplot as plt
-from sklearn.decomposition import PCA
 from util.decompositions import *

+
 class PretrainedEmbeddings(ABC):

    def __init__(self):
@ -112,7 +110,7 @@ class WordEmbeddings:
        # vocabulary is a set of terms to be kept
        active_vocabulary = sorted([w for w in vocabulary if w in self.worddim])
        lost = len(vocabulary)-len(active_vocabulary)
-        if lost > 0: #some terms are missing, so it will be replaced by UNK
+        if lost > 0:    # some terms are missing, so they will be replaced by UNK
            print('warning: missing {} terms for lang {}'.format(lost, self.lang))
        self.we = self.get_vectors(active_vocabulary)
        assert self.we.shape[0] == len(active_vocabulary)
@ -134,12 +132,12 @@ class WordEmbeddings:
            'instances of {} expected'.format(WordEmbeddings.__name__)

        polywe = []
-        worddim={}
-        offset=0
+        worddim = {}
+        offset = 0
        for we in we_list:
            polywe.append(we.we)
            worddim.update({'{}::{}'.format(we.lang, w):d+offset for w,d in we.worddim.items()})
-            offset=len(worddim)
+            offset = len(worddim)
        polywe = np.vstack(polywe)

        return WordEmbeddings(lang='poly', we=polywe, worddim=worddim)
@ -191,7 +189,6 @@ class FastTextMUSE(PretrainedEmbeddings):
        print(f'Loading fastText pretrained vectors from {path}')
        assert os.path.exists(path), print(f'pre-trained vectors not found in {path}')
        self.embed = FastTextWikiNews(path, lang, max_vectors=limit)
-        # print('Done')

    def vocabulary(self):
        return set(self.embed.stoi.keys())
@ -277,59 +274,3 @@ class StorageEmbeddings:
            for lang in docs.keys():
                _r[lang] = docs[lang].dot(self.lang_U[lang])
        return _r
-
-    # @staticmethod
-    # def get_optimal_supervised_components(docs, labels):
-    #     optimal_n = get_optimal_dim(docs, 'S')
-    #     return optimal_n
-        # _idx = []
-        #
-        # plt.figure(figsize=(15, 10))
-        # plt.title(f'WCE Explained Variance')
-        # plt.xlabel('Number of Components')
-        # plt.ylabel('Variance (%)')
-        #
-        # for lang in docs.keys():
-        #     _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space=0).tolist()
-        #     _r = np.cumsum(_r)
-        #     plt.plot(_r, label=lang)
-        #     for i in range(len(_r)-1, 1, -1):
-        #         delta = _r[i] - _r[i-1]
-        #         if delta > 0:
-        #             _idx.append(i)
-        #             break
-        # best_n = max(_idx)
-        # plt.axvline(best_n, color='r', label='optimal N')
-        # plt.legend()
-        # plt.show()
-        # return best_n
-    #
-    # def get_optimal_unsupervised_components(self, type):
-    #     _idx = []
-    #
-    #     plt.figure(figsize=(15, 10))
-    #     plt.title(f'Unsupervised Embeddings {type} Explained Variance')
-    #     plt.xlabel('Number of Components')
-    #     plt.ylabel('Variance (%)')
-    #
-    #     for lang in self.lang_U.keys():
-    #         pca = PCA(n_components=self.lang_U[lang].shape[1])
-    #         pca.fit(self.lang_U[lang])
-    #         _r = pca.explained_variance_ratio_
-    #         _r = np.cumsum(_r)
-    #         plt.plot(_r, label=lang)
-    #         for i in range(len(_r) - 1, 1, -1):
-    #             delta = _r[i] - _r[i - 1]
-    #             if delta > 0:
-    #                 _idx.append(i)
-    #                 break
-    #     best_n = max(_idx)
-    #     plt.axvline(best_n, color='r', label='optimal N')
-    #     plt.legend()
-    #     plt.show()
-    #
-    #     for lang in self.lang_U.keys():
-    #         pca = PCA(n_components=best_n)
-    #         self.lang_U[lang] = pca.fit_transform(self.lang_U[lang])
-    #     return
-
--- a/src/data/supervised.py
+++ b/src/data/supervised.py
@ -1,7 +1,7 @@
 from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square
-from sklearn.decomposition import PCA
-from sklearn.manifold import TSNE
 import numpy as np
+# from sklearn.decomposition import PCA
+# from sklearn.manifold import TSNE


 def zscores(x, axis=0): #scipy.stats.zscores does not avoid division by 0, which can indeed occur
--- a/src/learning/learners.py
+++ b/src/learning/learners.py
@ -8,7 +8,7 @@ from sklearn.model_selection import KFold
 from joblib import Parallel, delayed
 from sklearn.feature_extraction.text import TfidfVectorizer
 from transformers.StandardizeTransformer import StandardizeTransformer
-from sklearn.decomposition import PCA
+# from sklearn.decomposition import PCA


 def _sort_if_sparse(X):