From 0c6056e7a13aafdcfe03b6688298533837e03747 Mon Sep 17 00:00:00 2001
From: andrea
Date: Mon, 9 Dec 2019 15:39:39 +0100
Subject: [PATCH] refactored pca methods

---
 src/data/embeddings.py   | 69 +++-------------------------------------
 src/data/supervised.py   |  4 +--
 src/learning/learners.py |  2 +-
 3 files changed, 8 insertions(+), 67 deletions(-)

diff --git a/src/data/embeddings.py b/src/data/embeddings.py
index 2c02592..66e830f 100644
--- a/src/data/embeddings.py
+++ b/src/data/embeddings.py
@@ -1,14 +1,12 @@
 import os
 import pickle
-import numpy as np
 from torchtext.vocab import Vectors
 import torch
 from abc import ABC, abstractmethod
 from data.supervised import get_supervised_embeddings
-import matplotlib.pyplot as plt
-from sklearn.decomposition import PCA
 from util.decompositions import *
+
 
 class PretrainedEmbeddings(ABC):
 
     def __init__(self):
@@ -112,7 +110,7 @@ class WordEmbeddings:
         # vocabulary is a set of terms to be kept
         active_vocabulary = sorted([w for w in vocabulary if w in self.worddim])
         lost = len(vocabulary)-len(active_vocabulary)
-        if lost > 0: #some terms are missing, so it will be replaced by UNK
+        if lost > 0:  # some terms are missing, so it will be replaced by UNK
             print('warning: missing {} terms for lang {}'.format(lost, self.lang))
         self.we = self.get_vectors(active_vocabulary)
         assert self.we.shape[0] == len(active_vocabulary)
@@ -134,12 +132,12 @@ class WordEmbeddings:
             'instances of {} expected'.format(WordEmbeddings.__name__)
 
         polywe = []
-        worddim={}
-        offset=0
+        worddim = {}
+        offset = 0
         for we in we_list:
             polywe.append(we.we)
             worddim.update({'{}::{}'.format(we.lang, w):d+offset for w,d in we.worddim.items()})
-            offset=len(worddim)
+            offset = len(worddim)
         polywe = np.vstack(polywe)
         return WordEmbeddings(lang='poly', we=polywe, worddim=worddim)
@@ -191,7 +189,6 @@ class FastTextMUSE(PretrainedEmbeddings):
         print(f'Loading fastText pretrained vectors from {path}')
         assert os.path.exists(path), print(f'pre-trained vectors not found in {path}')
         self.embed = FastTextWikiNews(path, lang, max_vectors=limit)
-        # print('Done')
 
     def vocabulary(self):
         return set(self.embed.stoi.keys())
@@ -277,59 +274,3 @@ class StorageEmbeddings:
         for lang in docs.keys():
             _r[lang] = docs[lang].dot(self.lang_U[lang])
         return _r
-
-    # @staticmethod
-    # def get_optimal_supervised_components(docs, labels):
-    #     optimal_n = get_optimal_dim(docs, 'S')
-    #     return optimal_n
-    #     _idx = []
-    #
-    #     plt.figure(figsize=(15, 10))
-    #     plt.title(f'WCE Explained Variance')
-    #     plt.xlabel('Number of Components')
-    #     plt.ylabel('Variance (%)')
-    #
-    #     for lang in docs.keys():
-    #         _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space=0).tolist()
-    #         _r = np.cumsum(_r)
-    #         plt.plot(_r, label=lang)
-    #         for i in range(len(_r)-1, 1, -1):
-    #             delta = _r[i] - _r[i-1]
-    #             if delta > 0:
-    #                 _idx.append(i)
-    #                 break
-    #     best_n = max(_idx)
-    #     plt.axvline(best_n, color='r', label='optimal N')
-    #     plt.legend()
-    #     plt.show()
-    #     return best_n
-    #
-    # def get_optimal_unsupervised_components(self, type):
-    #     _idx = []
-    #
-    #     plt.figure(figsize=(15, 10))
-    #     plt.title(f'Unsupervised Embeddings {type} Explained Variance')
-    #     plt.xlabel('Number of Components')
-    #     plt.ylabel('Variance (%)')
-    #
-    #     for lang in self.lang_U.keys():
-    #         pca = PCA(n_components=self.lang_U[lang].shape[1])
-    #         pca.fit(self.lang_U[lang])
-    #         _r = pca.explained_variance_ratio_
-    #         _r = np.cumsum(_r)
-    #         plt.plot(_r, label=lang)
-    #         for i in range(len(_r) - 1, 1, -1):
-    #             delta = _r[i] - _r[i - 1]
-    #             if delta > 0:
-    #                 _idx.append(i)
-    #                 break
-    #     best_n = max(_idx)
-    #     plt.axvline(best_n, color='r', label='optimal N')
-    #     plt.legend()
-    #     plt.show()
-    #
-    #     for lang in self.lang_U.keys():
-    #         pca = PCA(n_components=best_n)
-    #         self.lang_U[lang] = pca.fit_transform(self.lang_U[lang])
-    #     return
-
diff --git a/src/data/supervised.py b/src/data/supervised.py
index bbd8c37..d2d7aab 100755
--- a/src/data/supervised.py
+++ b/src/data/supervised.py
@@ -1,7 +1,7 @@
 from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square
-from sklearn.decomposition import PCA
-from sklearn.manifold import TSNE
 import numpy as np
+# from sklearn.decomposition import PCA
+# from sklearn.manifold import TSNE
 
 
 def zscores(x, axis=0): #scipy.stats.zscores does not avoid division by 0, which can indeed occur
diff --git a/src/learning/learners.py b/src/learning/learners.py
index c4c69fd..96e200c 100644
--- a/src/learning/learners.py
+++ b/src/learning/learners.py
@@ -8,7 +8,7 @@ from sklearn.model_selection import KFold
 from joblib import Parallel, delayed
 from sklearn.feature_extraction.text import TfidfVectorizer
 from transformers.StandardizeTransformer import StandardizeTransformer
-from sklearn.decomposition import PCA
+# from sklearn.decomposition import PCA
 
 
 def _sort_if_sparse(X):
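
For context, here is a minimal sketch of what the consolidated helper this patch points to might look like. The patch itself only shows the wildcard import `from util.decompositions import *` and, in the deleted commented-out code, the call `get_optimal_dim(docs, 'S')`; the helper's name, its two-argument signature, and its body below are assumptions reconstructed from the deleted `get_optimal_unsupervised_components` logic, not the repository's actual implementation.

    # Hypothetical reconstruction of util/decompositions.py::get_optimal_dim,
    # mirroring the explained-variance loop removed from StorageEmbeddings.
    # The (lang_matrices, kind) signature is inferred from the deleted call
    # get_optimal_dim(docs, 'S') and may differ in the real module.
    import numpy as np
    from sklearn.decomposition import PCA


    def get_optimal_dim(lang_matrices, kind):
        """Fit a full-rank PCA per language, walk the cumulative
        explained-variance curve backwards to the last component that
        still adds variance, and return the maximum across languages."""
        best_per_lang = []
        for lang, X in lang_matrices.items():
            pca = PCA(n_components=min(X.shape))
            cumulative = np.cumsum(pca.fit(X).explained_variance_ratio_)
            for i in range(len(cumulative) - 1, 1, -1):
                # last index where the cumulative curve still increased
                if cumulative[i] - cumulative[i - 1] > 0:
                    best_per_lang.append(i)
                    break
        best_n = max(best_per_lang)
        print(f'[{kind}] optimal number of components: {best_n}')
        return best_n

A caller such as `StorageEmbeddings` would then refit each language's matrix with `PCA(n_components=best_n).fit_transform(...)`, as the deleted `get_optimal_unsupervised_components` did; centralizing this in one helper is what lets the patch drop the duplicated plotting-and-selection code and the `matplotlib`/`sklearn.decomposition` imports from `embeddings.py`.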