diff --git a/src/data/embeddings.py b/src/data/embeddings.py
index d1ad651..b5b253a 100644
--- a/src/data/embeddings.py
+++ b/src/data/embeddings.py
@@ -157,31 +157,6 @@ class FastTextWikiNews(Vectors):
         super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)
 
 
-# class EmbeddingsAligned(Vectors):
-#
-#     def __init__(self, type, path, lang):
-#
-#         self.name = '/embeddings/wiki.multi.{}.vec' if type == 'MUSE' else '/embeddings_polyFASTTEXT/wiki.{}.align.vec'
-#         # todo - rewrite as relative path
-#         self.cache_path = '/home/andreapdr/CLESA/embeddings' if type == 'MUSE' else '/home/andreapdr/CLESA/embeddings_polyFASTTEXT'
-#         self.path = path + self.name.format(lang)
-#         assert os.path.exists(path), f'pre-trained vectors not found in {path}'
-#         super(EmbeddingsAligned, self).__init__(self.path, cache=self.cache_path)
-#         # self.vectors = self.extract(voc)
-#
-#     def vocabulary(self):
-#         return set(self.stoi.keys())
-#
-#     def dim(self):
-#         return self.dim
-#
-#     def extract(self, words):
-#         source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.stoi)
-#         extraction = torch.zeros((len(words), self.dim))
-#         extraction[source_idx] = self.vectors[target_idx]
-#         return extraction
-
-
 class EmbeddingsAligned(Vectors):
 
     def __init__(self, type, path, lang, voc):
@@ -283,18 +258,26 @@ class StorageEmbeddings:
         return _r
 
     def get_optimal_supervised_components(self, docs, labels):
+        import matplotlib.pyplot as plt
+
         _idx = []
+
+        plt.figure(figsize=(15, 10))
+        plt.title('WCE Explained Variance')
+        plt.xlabel('Number of Components')
+        plt.ylabel('Variance (%)')
+
         for lang in docs.keys():
             _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space='optimal').tolist()
-
+            plt.plot(np.cumsum(_r), label=lang)
             for i in range(len(_r)-1, 1, -1):
                 # todo: if n_components (therefore #n labels) is not big enough, every value will be smaller than the next one ...
-                ratio = _r[i]
-                next_ratio = _r[i-1]
-                delta = _r[i] - _r[i-1]
+                delta = _r[i-1] - _r[i]
                 if delta > 0:
-                    # if ratio < next_ratio:
                     _idx.append(i)
                     break
         best_n = int(sum(_idx)/len(_idx))
+        plt.vlines(best_n, 0, 1, colors='r', label='optimal N')
+        plt.legend()
+        plt.show()
         return best_n
diff --git a/src/data/supervised.py b/src/data/supervised.py
index f365dfd..02f8c84 100755
--- a/src/data/supervised.py
+++ b/src/data/supervised.py
@@ -75,18 +75,15 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, lang='None',
               f'Applying PCA(n_components={max_label_space})')
         pca = PCA(n_components=max_label_space)
         pca = pca.fit(F)
 
-        ########################################################
-        import matplotlib.pyplot as plt
-
-        plt.figure()
-        plt.plot(np.cumsum(pca.explained_variance_ratio_))
-        plt.xlabel('Number of Components')
-        plt.ylabel('Variance (%)') #
-        plt.title(f'WCE Explained Variance {lang}')
-        plt.show()
+        # import matplotlib.pyplot as plt
+        # plt.figure()
+        # plt.plot(np.cumsum(pca.explained_variance_ratio_))
+        # plt.xlabel('Number of Components')
+        # plt.ylabel('Variance (%)')
+        # plt.title(f'WCE Explained Variance {lang}')
+        # plt.show()
         ########################################################
-
         F = pca.fit_transform(F)
     elif reduction == 'TSNE':
         print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
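
Note: the elbow heuristic added in get_optimal_supervised_components scans the PCA
explained-variance ratios back to front and keeps the last index at which the curve
still drops, i.e. the point where the numerically flat tail begins, then averages that
index across languages. A minimal standalone sketch of the same idea (optimal_components,
tol, and the toy matrix are illustrative names only, not part of the repo's API; a small
tolerance stands in for the strict delta > 0 test to absorb floating-point noise in the
flat tail):

    import numpy as np
    from sklearn.decomposition import PCA

    def optimal_components(F, tol=1e-12):
        # Per-component explained-variance ratios: a non-increasing sequence summing to 1.
        ratios = PCA().fit(F).explained_variance_ratio_
        # Scan back to front; the first index (from the end) where the curve still
        # strictly decreases marks the start of the flat tail.
        for i in range(len(ratios) - 1, 1, -1):
            if ratios[i - 1] - ratios[i] > tol:
                return i
        # No flat tail found: keep every component.
        return len(ratios)

    # Toy check: a 100x20 matrix of exact rank 5 keeps 5 components.
    rng = np.random.default_rng(0)
    F = rng.normal(size=(100, 5)) @ rng.normal(size=(5, 20))
    print(optimal_components(F))  # -> 5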
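
The plotting wrapped around the heuristic simply overlays each language's cumulative
explained-variance curve so the chosen component count can be read off visually. A sketch
with dummy data standing in for the get_supervised_embeddings output (ratios_by_lang and
the random matrices are illustrative only):

    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.decomposition import PCA

    rng = np.random.default_rng(1)
    # Dummy per-language explained-variance ratios.
    ratios_by_lang = {lang: PCA().fit(rng.normal(size=(50, 10))).explained_variance_ratio_
                      for lang in ('en', 'it')}

    plt.figure(figsize=(15, 10))
    plt.title('WCE Explained Variance')
    plt.xlabel('Number of Components')
    plt.ylabel('Variance (%)')
    for lang, ratios in ratios_by_lang.items():
        plt.plot(np.cumsum(ratios), label=lang)  # cumulative curve saturates at 1.0
    plt.vlines(5, 0, 1, colors='r', label='optimal N')  # vertical marker, as in the patch
    plt.legend()
    plt.show()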