Plot variance explained by PCA for every language

andrea 2019-12-04 10:16:17 +01:00
parent f074fd97f9
commit ba1a72ff94
2 changed files with 20 additions and 40 deletions

View File

@@ -157,31 +157,6 @@ class FastTextWikiNews(Vectors):
         super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)
-# class EmbeddingsAligned(Vectors):
-#
-#     def __init__(self, type, path, lang):
-#
-#         self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec'
-#         # todo - rewrite as relative path
-#         self.cache_path = '/home/andreapdr/CLESA/embeddings' if type == 'MUSE' else '/home/andreapdr/CLESA/embeddings_polyFASTTEXT'
-#         self.path = path + self.name.format(lang)
-#         assert os.path.exists(path), f'pre-trained vectors not found in {path}'
-#         super(EmbeddingsAligned, self).__init__(self.path, cache=self.cache_path)
-#         # self.vectors = self.extract(voc)
-#
-#     def vocabulary(self):
-#         return set(self.stoi.keys())
-#
-#     def dim(self):
-#         return self.dim
-#
-#     def extract(self, words):
-#         source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.stoi)
-#         extraction = torch.zeros((len(words), self.dim))
-#         extraction[source_idx] = self.vectors[target_idx]
-#         return extraction
 class EmbeddingsAligned(Vectors):

     def __init__(self, type, path, lang, voc):
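
The deleted block was a commented-out duplicate of the EmbeddingsAligned class that survives below it. Its extract/reindex pattern is the interesting part: rows of a zero matrix are filled with pretrained vectors for in-vocabulary words, while out-of-vocabulary words keep an all-zero row. A minimal self-contained sketch of that pattern, with reindex written out as a hypothetical stand-in for PretrainedEmbeddings.reindex (modeled on the call shown above, not the repo's actual implementation):

import torch

def reindex(words, stoi):
    # Collect (position-in-words, row-in-pretrained-matrix) pairs for the
    # words that actually exist in the pretrained vocabulary `stoi`.
    source_idx, target_idx = [], []
    for i, w in enumerate(words):
        if w in stoi:
            source_idx.append(i)
            target_idx.append(stoi[w])
    return source_idx, target_idx

def extract(words, stoi, vectors):
    # Build a (len(words) x dim) matrix: known words receive their pretrained
    # vector, unknown words are left as zero rows.
    extraction = torch.zeros((len(words), vectors.shape[1]))
    source_idx, target_idx = reindex(words, stoi)
    extraction[source_idx] = vectors[target_idx]
    return extraction
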
@@ -283,18 +258,26 @@ class StorageEmbeddings:
         return _r

     def get_optimal_supervised_components(self, docs, labels):
+        import matplotlib.pyplot as plt
         _idx = []
+        plt.figure(figsize=(15, 10))
+        plt.title(f'WCE Explained Variance')
+        plt.xlabel('Number of Components')
+        plt.ylabel('Variance (%)')
         for lang in docs.keys():
             _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space='optimal').tolist()
+            plt.plot(np.cumsum(_r), label=lang)
             for i in range(len(_r)-1, 1, -1):
                 # todo: if n_components (therefore #n labels) is not big enough, every value will be smaller than the next one ...
-                ratio = _r[i]
-                next_ratio = _r[i-1]
-                delta = _r[i] - _r[i-1]
+                delta = _r[i-1] - _r[i]
                 if delta > 0:
-                # if ratio < next_ratio:
                     _idx.append(i)
                     break
         best_n = int(sum(_idx)/len(_idx))
+        plt.vlines(best_n, 0, 1, colors='r', label='optimal N')
+        plt.legend()
+        plt.show()
         return best_n
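
The rewritten loop walks each language's explained-variance ratios from the tail toward the head and records the first index where the ratio still strictly drops (delta > 0), i.e. the point where the trailing flat, near-zero components end; those per-language indices are then averaged into a single best_n. A standalone sketch of the heuristic, with synthetic ratios standing in for the real per-language PCA output:

def optimal_components(ratios_per_lang):
    # ratios_per_lang: dict lang -> explained-variance ratios, sorted in
    # decreasing order as sklearn's PCA returns them.
    idx = []
    for lang, r in ratios_per_lang.items():
        # Walk from the tail: the first i with r[i-1] > r[i] marks the last
        # component that still adds variance before the curve flattens.
        for i in range(len(r) - 1, 1, -1):
            if r[i - 1] - r[i] > 0:
                idx.append(i)
                break
    return int(sum(idx) / len(idx))

ratios = {'en': [0.5, 0.3, 0.1, 0.05, 0.05], 'it': [0.6, 0.2, 0.1, 0.1]}
print(optimal_components(ratios))  # -> 2
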

View File

@@ -75,18 +75,15 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, lang='None',
               f'Applying PCA(n_components={max_label_space})')
         pca = PCA(n_components=max_label_space)
         pca = pca.fit(F)
         ########################################################
-        import matplotlib.pyplot as plt
-        plt.figure()
-        plt.plot(np.cumsum(pca.explained_variance_ratio_))
-        plt.xlabel('Number of Components')
-        plt.ylabel('Variance (%)')
-        plt.title(f'WCE Explained Variance {lang}')
-        plt.show()
+        # import matplotlib.pyplot as plt
+        # plt.figure()
+        # plt.plot(np.cumsum(pca.explained_variance_ratio_))
+        # plt.xlabel('Number of Components')
+        # plt.ylabel('Variance (%)')
+        # plt.title(f'WCE Explained Variance {lang}')
+        # plt.show()
         ########################################################
         F = pca.fit_transform(F)
     elif reduction == 'TSNE':
         print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
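
The block commented out here was the earlier, per-call version of the same diagnostic that the commit moves into get_optimal_supervised_components. As a standalone snippet, the cumulative explained-variance plot it produced looks like this (random data stands in for the supervised matrix F):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

F = np.random.rand(200, 50)   # stand-in for the supervised label matrix F
pca = PCA(n_components=20).fit(F)

plt.figure()
plt.plot(np.cumsum(pca.explained_variance_ratio_))  # cumulative variance curve
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)')
plt.title('WCE Explained Variance')
plt.show()
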