refactored pca methods

andrea 2019-12-09 15:39:39 +01:00
parent 9fa1899a7f
commit 0c6056e7a1
3 changed files with 8 additions and 67 deletions

View File

@@ -1,14 +1,12 @@
 import os
 import pickle
-import numpy as np
 from torchtext.vocab import Vectors
 import torch
 from abc import ABC, abstractmethod
 from data.supervised import get_supervised_embeddings
-import matplotlib.pyplot as plt
-from sklearn.decomposition import PCA
 from util.decompositions import *


 class PretrainedEmbeddings(ABC):

     def __init__(self):
@@ -112,7 +110,7 @@ class WordEmbeddings:
         # vocabulary is a set of terms to be kept
         active_vocabulary = sorted([w for w in vocabulary if w in self.worddim])
         lost = len(vocabulary)-len(active_vocabulary)
-        if lost > 0: #some terms are missing, so it will be replaced by UNK
+        if lost > 0: # some terms are missing, so it will be replaced by UNK
             print('warning: missing {} terms for lang {}'.format(lost, self.lang))
         self.we = self.get_vectors(active_vocabulary)
         assert self.we.shape[0] == len(active_vocabulary)
@@ -134,12 +132,12 @@ class WordEmbeddings:
             'instances of {} expected'.format(WordEmbeddings.__name__)
         polywe = []
-        worddim={}
-        offset=0
+        worddim = {}
+        offset = 0
         for we in we_list:
             polywe.append(we.we)
             worddim.update({'{}::{}'.format(we.lang, w):d+offset for w,d in we.worddim.items()})
-            offset=len(worddim)
+            offset = len(worddim)
         polywe = np.vstack(polywe)
         return WordEmbeddings(lang='poly', we=polywe, worddim=worddim)
@@ -191,7 +189,6 @@ class FastTextMUSE(PretrainedEmbeddings):
         print(f'Loading fastText pretrained vectors from {path}')
         assert os.path.exists(path), print(f'pre-trained vectors not found in {path}')
         self.embed = FastTextWikiNews(path, lang, max_vectors=limit)
-        # print('Done')

     def vocabulary(self):
         return set(self.embed.stoi.keys())
@@ -277,59 +274,3 @@ class StorageEmbeddings:
         for lang in docs.keys():
             _r[lang] = docs[lang].dot(self.lang_U[lang])
         return _r
-
-    # @staticmethod
-    # def get_optimal_supervised_components(docs, labels):
-    #     optimal_n = get_optimal_dim(docs, 'S')
-    #     return optimal_n
-    #     _idx = []
-    #
-    #     plt.figure(figsize=(15, 10))
-    #     plt.title(f'WCE Explained Variance')
-    #     plt.xlabel('Number of Components')
-    #     plt.ylabel('Variance (%)')
-    #
-    #     for lang in docs.keys():
-    #         _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space=0).tolist()
-    #         _r = np.cumsum(_r)
-    #         plt.plot(_r, label=lang)
-    #         for i in range(len(_r)-1, 1, -1):
-    #             delta = _r[i] - _r[i-1]
-    #             if delta > 0:
-    #                 _idx.append(i)
-    #                 break
-    #     best_n = max(_idx)
-    #     plt.axvline(best_n, color='r', label='optimal N')
-    #     plt.legend()
-    #     plt.show()
-    #     return best_n
-    #
-    # def get_optimal_unsupervised_components(self, type):
-    #     _idx = []
-    #
-    #     plt.figure(figsize=(15, 10))
-    #     plt.title(f'Unsupervised Embeddings {type} Explained Variance')
-    #     plt.xlabel('Number of Components')
-    #     plt.ylabel('Variance (%)')
-    #
-    #     for lang in self.lang_U.keys():
-    #         pca = PCA(n_components=self.lang_U[lang].shape[1])
-    #         pca.fit(self.lang_U[lang])
-    #         _r = pca.explained_variance_ratio_
-    #         _r = np.cumsum(_r)
-    #         plt.plot(_r, label=lang)
-    #         for i in range(len(_r) - 1, 1, -1):
-    #             delta = _r[i] - _r[i - 1]
-    #             if delta > 0:
-    #                 _idx.append(i)
-    #                 break
-    #     best_n = max(_idx)
-    #     plt.axvline(best_n, color='r', label='optimal N')
-    #     plt.legend()
-    #     plt.show()
-    #
-    #     for lang in self.lang_U.keys():
-    #         pca = PCA(n_components=best_n)
-    #         self.lang_U[lang] = pca.fit_transform(self.lang_U[lang])
-    #     return
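For context: the commented-out block removed above picked the number of PCA components by cumulating the explained-variance ratio and keeping the last index at which the curve still increases; the refactor appears to move that logic behind get_optimal_dim in util.decompositions, whose actual signature is not shown in this diff. A minimal sketch of the same heuristic (function and variable names here are illustrative, not the project's API):

import numpy as np
from sklearn.decomposition import PCA

def optimal_components(X):
    """Return the last component index at which the cumulative
    explained-variance curve is still increasing, mirroring the
    delta > 0 check in the removed block."""
    pca = PCA(n_components=min(X.shape))
    pca.fit(X)
    cumulative = np.cumsum(pca.explained_variance_ratio_)
    # walk backwards until two consecutive points still differ
    for i in range(len(cumulative) - 1, 1, -1):
        if cumulative[i] - cumulative[i - 1] > 0:
            return i
    return len(cumulative)

In the removed multilingual version, this value was computed per language and the maximum across languages was then used as the shared number of components.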

View File

@@ -1,7 +1,7 @@
 from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square
-from sklearn.decomposition import PCA
-from sklearn.manifold import TSNE
 import numpy as np
+# from sklearn.decomposition import PCA
+# from sklearn.manifold import TSNE


 def zscores(x, axis=0): #scipy.stats.zscores does not avoid division by 0, which can indeed occur
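The zscores helper shown as context above exists because, as its comment notes, a plain z-score divides by the standard deviation even when it is zero. The repository's own body is not part of this hunk; a zero-safe variant along these lines is one way to do it (illustrative sketch, not the project's implementation):

import numpy as np

def zscores_safe(x, axis=0):
    # standardize, but leave zero-variance columns untouched instead of dividing by 0
    std = np.std(x, ddof=1, axis=axis)
    std = np.where(std == 0, 1.0, std)
    return (x - np.mean(x, axis=axis)) / std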

View File

@@ -8,7 +8,7 @@ from sklearn.model_selection import KFold
 from joblib import Parallel, delayed
 from sklearn.feature_extraction.text import TfidfVectorizer
 from transformers.StandardizeTransformer import StandardizeTransformer
-from sklearn.decomposition import PCA
+# from sklearn.decomposition import PCA


 def _sort_if_sparse(X):