refactored pca methods
parent 9fa1899a7f
commit 0c6056e7a1
@@ -1,14 +1,12 @@
 import os
 import pickle
-import numpy as np
 from torchtext.vocab import Vectors
 import torch
 from abc import ABC, abstractmethod
 from data.supervised import get_supervised_embeddings
-import matplotlib.pyplot as plt
-from sklearn.decomposition import PCA
 from util.decompositions import *
+

 class PretrainedEmbeddings(ABC):

     def __init__(self):
@@ -112,7 +110,7 @@ class WordEmbeddings:
         # vocabulary is a set of terms to be kept
         active_vocabulary = sorted([w for w in vocabulary if w in self.worddim])
         lost = len(vocabulary)-len(active_vocabulary)
-        if lost > 0: #some terms are missing, so it will be replaced by UNK
+        if lost > 0: # some terms are missing, so it will be replaced by UNK
             print('warning: missing {} terms for lang {}'.format(lost, self.lang))
         self.we = self.get_vectors(active_vocabulary)
         assert self.we.shape[0] == len(active_vocabulary)
@@ -134,12 +132,12 @@ class WordEmbeddings:
             'instances of {} expected'.format(WordEmbeddings.__name__)

         polywe = []
-        worddim={}
-        offset=0
+        worddim = {}
+        offset = 0
         for we in we_list:
             polywe.append(we.we)
             worddim.update({'{}::{}'.format(we.lang, w):d+offset for w,d in we.worddim.items()})
-            offset=len(worddim)
+            offset = len(worddim)
         polywe = np.vstack(polywe)

         return WordEmbeddings(lang='poly', we=polywe, worddim=worddim)
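Note on the hunk above: the offset bookkeeping maps each language's local row index into the stacked polyglot matrix, since after each language is processed offset equals the number of rows registered so far. A minimal standalone sketch of the same idea on toy data (independent of the WordEmbeddings class, names invented here for illustration):

import numpy as np

# toy per-language embedding matrices (rows = words, columns = dimensions)
we_en = np.random.rand(3, 4)   # rows for 'dog', 'cat', 'house'
we_it = np.random.rand(2, 4)   # rows for 'cane', 'casa'

polywe, worddim, offset = [], {}, 0
for lang, words, we in [('en', ['dog', 'cat', 'house'], we_en),
                        ('it', ['cane', 'casa'], we_it)]:
    polywe.append(we)
    # local row index d is shifted by the number of rows already registered
    worddim.update({'{}::{}'.format(lang, w): d + offset for d, w in enumerate(words)})
    offset = len(worddim)
polywe = np.vstack(polywe)

assert np.allclose(polywe[worddim['it::casa']], we_it[1])  # 'it::casa' lands on row 4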
@@ -191,7 +189,6 @@ class FastTextMUSE(PretrainedEmbeddings):
         print(f'Loading fastText pretrained vectors from {path}')
         assert os.path.exists(path), print(f'pre-trained vectors not found in {path}')
         self.embed = FastTextWikiNews(path, lang, max_vectors=limit)
-        # print('Done')

     def vocabulary(self):
         return set(self.embed.stoi.keys())
@@ -277,59 +274,3 @@ class StorageEmbeddings:
         for lang in docs.keys():
             _r[lang] = docs[lang].dot(self.lang_U[lang])
         return _r
-
-    # @staticmethod
-    # def get_optimal_supervised_components(docs, labels):
-    #     optimal_n = get_optimal_dim(docs, 'S')
-    #     return optimal_n
-    #     _idx = []
-    #
-    #     plt.figure(figsize=(15, 10))
-    #     plt.title(f'WCE Explained Variance')
-    #     plt.xlabel('Number of Components')
-    #     plt.ylabel('Variance (%)')
-    #
-    #     for lang in docs.keys():
-    #         _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space=0).tolist()
-    #         _r = np.cumsum(_r)
-    #         plt.plot(_r, label=lang)
-    #         for i in range(len(_r)-1, 1, -1):
-    #             delta = _r[i] - _r[i-1]
-    #             if delta > 0:
-    #                 _idx.append(i)
-    #                 break
-    #     best_n = max(_idx)
-    #     plt.axvline(best_n, color='r', label='optimal N')
-    #     plt.legend()
-    #     plt.show()
-    #     return best_n
-    #
-    # def get_optimal_unsupervised_components(self, type):
-    #     _idx = []
-    #
-    #     plt.figure(figsize=(15, 10))
-    #     plt.title(f'Unsupervised Embeddings {type} Explained Variance')
-    #     plt.xlabel('Number of Components')
-    #     plt.ylabel('Variance (%)')
-    #
-    #     for lang in self.lang_U.keys():
-    #         pca = PCA(n_components=self.lang_U[lang].shape[1])
-    #         pca.fit(self.lang_U[lang])
-    #         _r = pca.explained_variance_ratio_
-    #         _r = np.cumsum(_r)
-    #         plt.plot(_r, label=lang)
-    #         for i in range(len(_r) - 1, 1, -1):
-    #             delta = _r[i] - _r[i - 1]
-    #             if delta > 0:
-    #                 _idx.append(i)
-    #                 break
-    #     best_n = max(_idx)
-    #     plt.axvline(best_n, color='r', label='optimal N')
-    #     plt.legend()
-    #     plt.show()
-    #
-    #     for lang in self.lang_U.keys():
-    #         pca = PCA(n_components=best_n)
-    #         self.lang_U[lang] = pca.fit_transform(self.lang_U[lang])
-    #     return
-
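Note on the hunk above: the commented-out methods removed here picked the number of PCA components per language from the cumulative explained-variance curve (the last index at which the curve still grows), and the get_optimal_dim call suggests this logic now lives in util/decompositions.py, whose contents are not part of this diff. A minimal sketch of that heuristic for a single matrix, as an assumption about what such a helper does rather than the actual implementation:

import numpy as np
from sklearn.decomposition import PCA

def optimal_components(X):
    # Hypothetical helper mirroring the removed commented-out logic; not the real
    # util.decompositions.get_optimal_dim, which this diff does not show.
    pca = PCA(n_components=min(X.shape))
    pca.fit(X)
    cumulative = np.cumsum(pca.explained_variance_ratio_)
    # walk backwards to the last component that still adds explained variance
    for i in range(len(cumulative) - 1, 1, -1):
        if cumulative[i] - cumulative[i - 1] > 0:
            return i
    return len(cumulative)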
@@ -1,7 +1,7 @@
 from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square
-from sklearn.decomposition import PCA
-from sklearn.manifold import TSNE
 import numpy as np
+# from sklearn.decomposition import PCA
+# from sklearn.manifold import TSNE


 def zscores(x, axis=0): #scipy.stats.zscores does not avoid division by 0, which can indeed occur
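Note on the hunk above: the zscores comment explains why the project keeps its own standardization, since scipy's z-score can divide by a zero standard deviation. A minimal sketch of a zero-safe version with the same intent (the name zscores_safe and the exact guard are assumptions; the real function body lies outside this hunk):

import numpy as np

def zscores_safe(x, axis=0):
    # standardize x along axis; constant columns become 0 instead of NaN/inf
    mean = np.mean(x, axis=axis, keepdims=True)
    std = np.std(x, axis=axis, keepdims=True)
    std = np.where(std == 0, 1.0, std)  # avoid division by zero
    return (x - mean) / std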
@@ -8,7 +8,7 @@ from sklearn.model_selection import KFold
 from joblib import Parallel, delayed
 from sklearn.feature_extraction.text import TfidfVectorizer
 from transformers.StandardizeTransformer import StandardizeTransformer
-from sklearn.decomposition import PCA
+# from sklearn.decomposition import PCA


 def _sort_if_sparse(X):