refactored pca methods

andrea 2019-12-09 15:39:39 +01:00
parent 9fa1899a7f
commit 0c6056e7a1
3 changed files with 8 additions and 67 deletions

View File

@@ -1,14 +1,12 @@
 import os
 import pickle
-import numpy as np
 from torchtext.vocab import Vectors
 import torch
 from abc import ABC, abstractmethod
 from data.supervised import get_supervised_embeddings
-import matplotlib.pyplot as plt
-from sklearn.decomposition import PCA
 from util.decompositions import *


 class PretrainedEmbeddings(ABC):

     def __init__(self):
@@ -112,7 +110,7 @@ class WordEmbeddings:
         # vocabulary is a set of terms to be kept
         active_vocabulary = sorted([w for w in vocabulary if w in self.worddim])
         lost = len(vocabulary)-len(active_vocabulary)
-        if lost > 0: #some terms are missing, so it will be replaced by UNK
+        if lost > 0: # some terms are missing, so it will be replaced by UNK
             print('warning: missing {} terms for lang {}'.format(lost, self.lang))
         self.we = self.get_vectors(active_vocabulary)
         assert self.we.shape[0] == len(active_vocabulary)
@@ -134,12 +132,12 @@ class WordEmbeddings:
             'instances of {} expected'.format(WordEmbeddings.__name__)
         polywe = []
-        worddim={}
-        offset=0
+        worddim = {}
+        offset = 0
         for we in we_list:
             polywe.append(we.we)
             worddim.update({'{}::{}'.format(we.lang, w):d+offset for w,d in we.worddim.items()})
-            offset=len(worddim)
+            offset = len(worddim)
         polywe = np.vstack(polywe)
         return WordEmbeddings(lang='poly', we=polywe, worddim=worddim)
@@ -191,7 +189,6 @@ class FastTextMUSE(PretrainedEmbeddings):
         print(f'Loading fastText pretrained vectors from {path}')
         assert os.path.exists(path), print(f'pre-trained vectors not found in {path}')
         self.embed = FastTextWikiNews(path, lang, max_vectors=limit)
-        # print('Done')

     def vocabulary(self):
         return set(self.embed.stoi.keys())
@@ -277,59 +274,3 @@ class StorageEmbeddings:
         for lang in docs.keys():
             _r[lang] = docs[lang].dot(self.lang_U[lang])
         return _r
-
-    # @staticmethod
-    # def get_optimal_supervised_components(docs, labels):
-    #     optimal_n = get_optimal_dim(docs, 'S')
-    #     return optimal_n
-    #     _idx = []
-    #
-    #     plt.figure(figsize=(15, 10))
-    #     plt.title(f'WCE Explained Variance')
-    #     plt.xlabel('Number of Components')
-    #     plt.ylabel('Variance (%)')
-    #
-    #     for lang in docs.keys():
-    #         _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space=0).tolist()
-    #         _r = np.cumsum(_r)
-    #         plt.plot(_r, label=lang)
-    #         for i in range(len(_r)-1, 1, -1):
-    #             delta = _r[i] - _r[i-1]
-    #             if delta > 0:
-    #                 _idx.append(i)
-    #                 break
-    #     best_n = max(_idx)
-    #     plt.axvline(best_n, color='r', label='optimal N')
-    #     plt.legend()
-    #     plt.show()
-    #     return best_n
-    #
-    # def get_optimal_unsupervised_components(self, type):
-    #     _idx = []
-    #
-    #     plt.figure(figsize=(15, 10))
-    #     plt.title(f'Unsupervised Embeddings {type} Explained Variance')
-    #     plt.xlabel('Number of Components')
-    #     plt.ylabel('Variance (%)')
-    #
-    #     for lang in self.lang_U.keys():
-    #         pca = PCA(n_components=self.lang_U[lang].shape[1])
-    #         pca.fit(self.lang_U[lang])
-    #         _r = pca.explained_variance_ratio_
-    #         _r = np.cumsum(_r)
-    #         plt.plot(_r, label=lang)
-    #         for i in range(len(_r) - 1, 1, -1):
-    #             delta = _r[i] - _r[i - 1]
-    #             if delta > 0:
-    #                 _idx.append(i)
-    #                 break
-    #     best_n = max(_idx)
-    #     plt.axvline(best_n, color='r', label='optimal N')
-    #     plt.legend()
-    #     plt.show()
-    #
-    #     for lang in self.lang_U.keys():
-    #         pca = PCA(n_components=best_n)
-    #         self.lang_U[lang] = pca.fit_transform(self.lang_U[lang])
-    #     return
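For context: the commented-out block removed above picked the number of PCA components by cumulating the explained-variance ratio and keeping the last index at which the curve still increases; the refactor appears to move that logic behind get_optimal_dim in util.decompositions, whose actual signature is not shown in this diff. A minimal sketch of the same heuristic (function and variable names here are illustrative, not the project's API):

import numpy as np
from sklearn.decomposition import PCA

def optimal_components(X):
    """Return the last component index at which the cumulative
    explained-variance curve is still increasing, mirroring the
    delta > 0 check in the removed block."""
    pca = PCA(n_components=min(X.shape))
    pca.fit(X)
    cumulative = np.cumsum(pca.explained_variance_ratio_)
    # walk backwards until two consecutive points still differ
    for i in range(len(cumulative) - 1, 1, -1):
        if cumulative[i] - cumulative[i - 1] > 0:
            return i
    return len(cumulative)

In the removed multilingual version, this value was computed per language and the maximum across languages was then used as the shared number of components.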

View File

@@ -1,7 +1,7 @@
 from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square
-from sklearn.decomposition import PCA
-from sklearn.manifold import TSNE
 import numpy as np
+# from sklearn.decomposition import PCA
+# from sklearn.manifold import TSNE


 def zscores(x, axis=0): #scipy.stats.zscores does not avoid division by 0, which can indeed occur
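The zscores helper shown as context above exists because, as its comment notes, a plain z-score divides by the standard deviation even when it is zero. The repository's own body is not part of this hunk; a zero-safe variant along these lines is one way to do it (illustrative sketch, not the project's implementation):

import numpy as np

def zscores_safe(x, axis=0):
    # standardize, but leave zero-variance columns untouched instead of dividing by 0
    std = np.std(x, ddof=1, axis=axis)
    std = np.where(std == 0, 1.0, std)
    return (x - np.mean(x, axis=axis)) / std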

View File

@@ -8,7 +8,7 @@ from sklearn.model_selection import KFold
 from joblib import Parallel, delayed
 from sklearn.feature_extraction.text import TfidfVectorizer
 from transformers.StandardizeTransformer import StandardizeTransformer
-from sklearn.decomposition import PCA
+# from sklearn.decomposition import PCA


 def _sort_if_sparse(X):