Refactored PCA methods

This commit is contained in:
andrea 2019-12-09 15:39:39 +01:00
parent 9fa1899a7f
commit 0c6056e7a1
3 changed files with 8 additions and 67 deletions


@@ -1,14 +1,12 @@
import os
import pickle
import numpy as np
from torchtext.vocab import Vectors
import torch
from abc import ABC, abstractmethod
from data.supervised import get_supervised_embeddings
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from util.decompositions import *
class PretrainedEmbeddings(ABC):
def __init__(self):
@@ -191,7 +189,6 @@ class FastTextMUSE(PretrainedEmbeddings):
print(f'Loading fastText pretrained vectors from {path}')
assert os.path.exists(path), print(f'pre-trained vectors not found in {path}')
self.embed = FastTextWikiNews(path, lang, max_vectors=limit)
# print('Done')
def vocabulary(self):
return set(self.embed.stoi.keys())
@@ -277,59 +274,3 @@ class StorageEmbeddings:
for lang in docs.keys():
_r[lang] = docs[lang].dot(self.lang_U[lang])
return _r
# @staticmethod
# def get_optimal_supervised_components(docs, labels):
# optimal_n = get_optimal_dim(docs, 'S')
# return optimal_n
# _idx = []
#
# plt.figure(figsize=(15, 10))
# plt.title(f'WCE Explained Variance')
# plt.xlabel('Number of Components')
# plt.ylabel('Variance (%)')
#
# for lang in docs.keys():
# _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space=0).tolist()
# _r = np.cumsum(_r)
# plt.plot(_r, label=lang)
# for i in range(len(_r)-1, 1, -1):
# delta = _r[i] - _r[i-1]
# if delta > 0:
# _idx.append(i)
# break
# best_n = max(_idx)
# plt.axvline(best_n, color='r', label='optimal N')
# plt.legend()
# plt.show()
# return best_n
#
# def get_optimal_unsupervised_components(self, type):
# _idx = []
#
# plt.figure(figsize=(15, 10))
# plt.title(f'Unsupervised Embeddings {type} Explained Variance')
# plt.xlabel('Number of Components')
# plt.ylabel('Variance (%)')
#
# for lang in self.lang_U.keys():
# pca = PCA(n_components=self.lang_U[lang].shape[1])
# pca.fit(self.lang_U[lang])
# _r = pca.explained_variance_ratio_
# _r = np.cumsum(_r)
# plt.plot(_r, label=lang)
# for i in range(len(_r) - 1, 1, -1):
# delta = _r[i] - _r[i - 1]
# if delta > 0:
# _idx.append(i)
# break
# best_n = max(_idx)
# plt.axvline(best_n, color='r', label='optimal N')
# plt.legend()
# plt.show()
#
# for lang in self.lang_U.keys():
# pca = PCA(n_components=best_n)
# self.lang_U[lang] = pca.fit_transform(self.lang_U[lang])
# return
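The deleted methods above chose the number of PCA components per language by accumulating the explained-variance ratios and keeping the last index at which the curve still grows; this commit moves that logic behind the new `from util.decompositions import *` import (the removed comment references a `get_optimal_dim` helper). Below is a minimal sketch of what such a helper could look like, reconstructed only from the deleted code; its name comes from that comment, but its signature, plotting behaviour and body are assumptions, not the repository's actual implementation.

# Hypothetical sketch of the helper presumed to live in util.decompositions.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

def get_optimal_dim(lang_matrices, title='Explained Variance', plot=True):
    """Return the largest component index at which the cumulative explained
    variance still increases, taken over all languages."""
    best_per_lang = []
    if plot:
        plt.figure(figsize=(15, 10))
        plt.title(title)
        plt.xlabel('Number of Components')
        plt.ylabel('Variance (%)')
    for lang, X in lang_matrices.items():
        pca = PCA(n_components=X.shape[1])
        pca.fit(X)
        cumulative = np.cumsum(pca.explained_variance_ratio_)
        if plot:
            plt.plot(cumulative, label=lang)
        # walk backwards to the last component that still adds variance
        for i in range(len(cumulative) - 1, 1, -1):
            if cumulative[i] - cumulative[i - 1] > 0:
                best_per_lang.append(i)
                break
    best_n = max(best_per_lang)
    if plot:
        plt.axvline(best_n, color='r', label='optimal N')
        plt.legend()
        plt.show()
    return best_n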


@@ -1,7 +1,7 @@
from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import numpy as np
# from sklearn.decomposition import PCA
# from sklearn.manifold import TSNE
def zscores(x, axis=0): #scipy.stats.zscores does not avoid division by 0, which can indeed occur
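The context line above shows only the signature of `zscores`; its comment notes that `scipy.stats.zscore` does not guard against division by zero. A minimal sketch of a standardization helper with that guard follows, purely illustrative since the function body is outside this diff (the clipping threshold is an assumption).

# Illustrative sketch: the actual body of zscores is not part of this diff.
import numpy as np

def zscores(x, axis=0):
    """Standardize x along axis, clipping the standard deviation away from 0
    so that constant features do not cause a division by zero."""
    std = np.clip(np.std(x, ddof=1, axis=axis), 1e-5, None)  # threshold is an assumption
    return (x - np.mean(x, axis=axis)) / std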


@@ -8,7 +8,7 @@ from sklearn.model_selection import KFold
from joblib import Parallel, delayed
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers.StandardizeTransformer import StandardizeTransformer
from sklearn.decomposition import PCA
# from sklearn.decomposition import PCA
def _sort_if_sparse(X):
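`_sort_if_sparse` is likewise shown only by its signature. Judging from the name, it presumably sorts the indices of a scipy sparse matrix in place when needed; a minimal sketch under that assumption:

# Illustrative sketch; the real implementation is outside this diff's context.
from scipy.sparse import issparse

def _sort_if_sparse(X):
    # CSR/CSC matrices can end up with unsorted column indices after slicing;
    # sorting them keeps downstream estimators that assume sorted indices happy.
    if issparse(X) and hasattr(X, 'sort_indices'):
        X.sort_indices()
    return X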