From 9fa1899a7f1d3f73349bf20909aa0e98596fb31f Mon Sep 17 00:00:00 2001 From: andrea Date: Mon, 9 Dec 2019 15:37:52 +0100 Subject: [PATCH] refactored pca methods --- src/FPEC_andrea.py | 28 +++++--- src/data/embeddings.py | 133 +++++++++++++++++++++++++------------ src/data/supervised.py | 61 ++++++++--------- src/learning/learners.py | 17 ++--- src/util/decompositions.py | 49 ++++++++++++++ src/util/results.py | 6 +- 6 files changed, 199 insertions(+), 95 deletions(-) create mode 100644 src/util/decompositions.py diff --git a/src/FPEC_andrea.py b/src/FPEC_andrea.py index 185bcc2..1618c33 100644 --- a/src/FPEC_andrea.py +++ b/src/FPEC_andrea.py @@ -1,4 +1,4 @@ -import os, sys +import os from dataset_builder import MultilingualDataset from learning.learners import * from util.evaluation import * @@ -21,7 +21,7 @@ parser.add_option("-e", "--mode-embed", dest="mode_embed", help="Set the embedding to be used [none, unsupervised, supervised, both]", type=str, default='none') parser.add_option("-w", "--we-path", dest="we_path", - help="Path to the polylingual word embeddings", default='../embeddings/') + help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/') parser.add_option('-t', "--we-type", dest="we_type", help="Aligned embeddings to use [FastText, MUSE]", type=str, default='MUSE') @@ -30,11 +30,21 @@ parser.add_option("-s", "--set_c", dest="set_c",type=float, help="Set the C parameter", default=1) parser.add_option("-c", "--optimc", dest="optimc", action='store_true', - help="Optimices hyperparameters", default=False) + help="Optimize hyperparameters", default=False) parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int, help="Number of parallel jobs (default is -1, all)", default=-1) +parser.add_option("-p", "--pca", dest="max_labels", type=int, + help="If less than number of target classes, will apply PCA to supervised matrix. If set to 0 it" + " will automatically search for the best number of components", default=300) + +parser.add_option("-u", "--upca", dest="max_labels_U", type=int, + help="If smaller than Unsupervised Dimension, will apply PCA to unsupervised matrix. If set to 0 it" + " will automatically search for the best number of components", default=300) + +parser.add_option("-l", dest="lang", type=str) + def get_learner(calibrate=False, kernel='linear'): return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto') @@ -51,7 +61,6 @@ def get_params(dense=False): if __name__ == '__main__': - (op, args) = parser.parse_args() assert exists(op.dataset), 'Unable to find file '+str(op.dataset) @@ -64,8 +73,9 @@ if __name__ == '__main__': data = MultilingualDataset.load(op.dataset) data.show_dimensions() - # data.set_view(languages=['en','it'], categories=list(range(10))) - # data.set_view(languages=['en','it']) + data.set_view(languages=['en','it', 'pt', 'sv'], categories=list(range(10))) + # data.set_view(languages=[op.lang]) + # data.set_view(categories=list(range(10))) lXtr, lytr = data.training() lXte, lyte = data.test() @@ -104,7 +114,9 @@ if __name__ == '__main__': ##### TODO - config dict is redundant - we have already op argparse ... 
config['reduction'] = 'PCA' - config['max_label_space'] = 300 + config['max_label_space'] = op.max_labels + config['dim_reduction_unsupervised'] = op.max_labels_U + # config['plot_covariance_matrices'] = True result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '') @@ -129,5 +141,5 @@ if __name__ == '__main__': metrics.append([macrof1, microf1, macrok, microk]) print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1)) results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, config['we_type'], op.optimc, op.dataset.split('/')[-1], - 'not_binary', 'not_ablation', classifier.time, lang, macrof1, microf1, macrok, microk, '') + classifier.time, lang, macrof1, microf1, macrok, microk, '') print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0)) diff --git a/src/data/embeddings.py b/src/data/embeddings.py index 8005dad..2c02592 100644 --- a/src/data/embeddings.py +++ b/src/data/embeddings.py @@ -5,7 +5,9 @@ from torchtext.vocab import Vectors import torch from abc import ABC, abstractmethod from data.supervised import get_supervised_embeddings - +import matplotlib.pyplot as plt +from sklearn.decomposition import PCA +from util.decompositions import * class PretrainedEmbeddings(ABC): @@ -110,10 +112,10 @@ class WordEmbeddings: # vocabulary is a set of terms to be kept active_vocabulary = sorted([w for w in vocabulary if w in self.worddim]) lost = len(vocabulary)-len(active_vocabulary) - if lost>0: #some termr are missing, so it will be replaced by UNK + if lost > 0: #some terms are missing, so it will be replaced by UNK print('warning: missing {} terms for lang {}'.format(lost, self.lang)) self.we = self.get_vectors(active_vocabulary) - assert self.we.shape[0]==len(active_vocabulary) + assert self.we.shape[0] == len(active_vocabulary) self.dimword={i:w for i,w in enumerate(active_vocabulary)} self.worddim={w:i for i,w in enumerate(active_vocabulary)} return self @@ -153,7 +155,6 @@ class FastTextWikiNews(Vectors): url = self.url_base.format(language) # name = self.path.format(language) name = cache + self._name.format(language) - # print(f'\n\nFASTEXTWIKI-NEW CLASS:\nurl = {url}\nname = {name}\ncache {cache}\nlanguage = {language}') super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs) @@ -171,15 +172,17 @@ class EmbeddingsAligned(Vectors): def vocabulary(self): return set(self.stoi.keys()) - def dim(self): - return self.dim - def extract(self, words): source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.stoi) extraction = torch.zeros((len(words), self.dim)) extraction[source_idx] = self.vectors[target_idx] return extraction + def reduce(self, dim): + pca = PCA(n_components=dim) + self.vectors = pca.fit_transform(self.vectors) + return + class FastTextMUSE(PretrainedEmbeddings): @@ -209,26 +212,44 @@ class StorageEmbeddings: self.lang_U = dict() self.lang_S = dict() - def _add_embeddings_unsupervised(self, type, docs, vocs): + def _add_embeddings_unsupervised(self, type, docs, vocs, max_label_space=300): for lang in docs.keys(): + nC = self.lang_U[lang].shape[1] print(f'# [unsupervised-matrix {type}] for {lang}') voc = np.asarray(list(zip(*sorted(vocs[lang].items(), key=lambda x: x[1])))[0]) self.lang_U[lang] = EmbeddingsAligned(type, self.path, lang, voc).vectors + # if self.lang_U[lang].shape[1] > dim != 0: + # print(f'unsupervised matrix has more dimensions ({self.lang_U[lang].shape[1]}) than' + # f' the allowed limit {dim}. 
Applying PCA(n_components={dim})') + # pca = PCA(n_components=dim) + # self.lang_U[lang] = pca.fit_transform(self.lang_U[lang]) print(f'Matrix U (weighted sum) of shape {self.lang_U[lang].shape}\n') + if max_label_space == 0: + print(f'Computing optimal number of PCA components along matrices U') + optimal_n = get_optimal_dim(self.lang_U, 'U') + self.lang_U = run_pca(optimal_n, self.lang_U) + elif max_label_space < nC: + self.lang_U = run_pca(max_label_space, self.lang_U) + return def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space, voc): - _optimal = dict() - # TODO testing optimal max_label_space - if max_label_space == 'optimal': - print('Computing optimal number of PCA components ...') - optimal_n = self.get_optimal_supervised_components(docs, labels) - max_label_space = optimal_n - - for lang in docs.keys(): + # if max_label_space == 0: + # print('Computing optimal number of PCA components along matrices S...') + # optimal_n = self.get_optimal_supervised_components(docs, labels) + # max_label_space = optimal_n + for lang in docs.keys(): # compute supervised matrices S - then apply PCA + nC = self.lang_S[lang].shape[1] print(f'# [supervised-matrix] for {lang}') self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space, voc[lang], lang) print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n') + + if max_label_space == 0: + optimal_n = get_optimal_dim(self.lang_S, 'S') + self.lang_S = run_pca(optimal_n, self.lang_S) + elif max_label_space < nC: + self.lang_S = run_pca(max_label_space, self.lang_S) + return def _concatenate_embeddings(self, docs): @@ -239,7 +260,7 @@ class StorageEmbeddings: def fit(self, config, docs, vocs, labels): if config['unsupervised']: - self._add_embeddings_unsupervised(config['we_type'], docs, vocs) + self._add_embeddings_unsupervised(config['we_type'], docs, vocs, config['dim_reduction_unsupervised']) if config['supervised']: self._add_emebeddings_supervised(docs, labels, config['reduction'], config['max_label_space'], vocs) return self @@ -257,28 +278,58 @@ class StorageEmbeddings: _r[lang] = docs[lang].dot(self.lang_U[lang]) return _r - def get_optimal_supervised_components(self, docs, labels): - import matplotlib.pyplot as plt + # @staticmethod + # def get_optimal_supervised_components(docs, labels): + # optimal_n = get_optimal_dim(docs, 'S') + # return optimal_n + # _idx = [] + # + # plt.figure(figsize=(15, 10)) + # plt.title(f'WCE Explained Variance') + # plt.xlabel('Number of Components') + # plt.ylabel('Variance (%)') + # + # for lang in docs.keys(): + # _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space=0).tolist() + # _r = np.cumsum(_r) + # plt.plot(_r, label=lang) + # for i in range(len(_r)-1, 1, -1): + # delta = _r[i] - _r[i-1] + # if delta > 0: + # _idx.append(i) + # break + # best_n = max(_idx) + # plt.axvline(best_n, color='r', label='optimal N') + # plt.legend() + # plt.show() + # return best_n + # + # def get_optimal_unsupervised_components(self, type): + # _idx = [] + # + # plt.figure(figsize=(15, 10)) + # plt.title(f'Unsupervised Embeddings {type} Explained Variance') + # plt.xlabel('Number of Components') + # plt.ylabel('Variance (%)') + # + # for lang in self.lang_U.keys(): + # pca = PCA(n_components=self.lang_U[lang].shape[1]) + # pca.fit(self.lang_U[lang]) + # _r = pca.explained_variance_ratio_ + # _r = np.cumsum(_r) + # plt.plot(_r, label=lang) + # for i in range(len(_r) - 1, 1, -1): + # delta = _r[i] - _r[i - 1] + # if 
delta > 0: + # _idx.append(i) + # break + # best_n = max(_idx) + # plt.axvline(best_n, color='r', label='optimal N') + # plt.legend() + # plt.show() + # + # for lang in self.lang_U.keys(): + # pca = PCA(n_components=best_n) + # self.lang_U[lang] = pca.fit_transform(self.lang_U[lang]) + # return - _idx = [] - - plt.figure(figsize=(15, 10)) - plt.title(f'WCE Explained Variance') - plt.xlabel('Number of Components') - plt.ylabel('Variance (%)') - - for lang in docs.keys(): - _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space='optimal').tolist() - _r = np.cumsum(_r) - plt.plot(_r, label=lang) - for i in range(len(_r)-1, 1, -1): - # todo: if n_components (therfore #n labels) is not big enough every value will be smaller than the next one ... - delta = _r[i] - _r[i-1] - if delta > 0: - _idx.append(i) - break - best_n = int(sum(_idx)/len(_idx)) - plt.vlines(best_n, 0, 1, colors='r', label='optimal N') - plt.legend() - plt.show() - return best_n diff --git a/src/data/supervised.py b/src/data/supervised.py index d8e1f7d..bbd8c37 100755 --- a/src/data/supervised.py +++ b/src/data/supervised.py @@ -1,5 +1,5 @@ from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square -from sklearn.decomposition import PCA, TruncatedSVD +from sklearn.decomposition import PCA from sklearn.manifold import TSNE import numpy as np @@ -41,15 +41,9 @@ def supervised_embeddings_tsr(X,Y, tsr_function=information_gain, max_documents= def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, lang='None', binary_structural_problems=-1, method='dotn', dozscore=True): - if max_label_space == 'optimal': - max_label_space = 0 - if max_label_space != 0: print('computing supervised embeddings...') - nC = Y.shape[1] - if nC==2 and binary_structural_problems > nC: - raise ValueError('not implemented in this branch') if method=='ppmi': F = supervised_embeddings_ppmi(X, Y) @@ -64,8 +58,7 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, la F = zscores(F, axis=0) # Dumping F-matrix for further studies - # TODO im not sure if voc.keys and F matrix indices are "aligned" correctly - dump_it = True + dump_it = False if dump_it: with open(f'/home/andreapdr/funneling_pdr/src/dumps/WCE_{lang}.tsv', 'w') as outfile: np.savetxt(outfile, F, delimiter='\t') @@ -73,34 +66,32 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, la for token in voc.keys(): outfile.write(token+'\n') - - - if nC > max_label_space: - # TODO testing optimal max_label_space - if reduction == 'PCA': - if max_label_space == 0: - pca = PCA(n_components=Y.shape[1]) - pca = pca.fit(F) - return pca.explained_variance_ratio_ - - print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' - f'Applying PCA(n_components={max_label_space})') - pca = PCA(n_components=max_label_space) - pca = pca.fit(F) - F = pca.fit_transform(F) - elif reduction == 'TSNE': - print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' - f'Applying t-SNE(n_components={max_label_space})') - tsne = TSNE(n_components=max_label_space) - F = tsne.fit_transform(F) - elif reduction == 'tSVD': - print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. 
' - f'Applying truncatedSVD(n_components={max_label_space})') - tSVD = TruncatedSVD(n_components=max_label_space) - F = tSVD.fit_transform(F) - return F + # if nC >= max_label_space: + # if reduction == 'PCA': + # if max_label_space == 0: + # pca = PCA(n_components=Y.shape[1]) + # pca = pca.fit(F) + # return pca.explained_variance_ratio_ + # + # print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' + # f'Applying PCA(n_components={max_label_space})') + # pca = PCA(n_components=max_label_space) + # F = pca.fit_transform(F) + # elif reduction == 'TSNE': + # print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' + # f'Applying t-SNE(n_components={max_label_space})') + # tsne = TSNE(n_components=max_label_space) + # F = tsne.fit_transform(F) + # elif reduction == 'tSVD': + # print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' + # f'Applying truncatedSVD(n_components={max_label_space})') + # tSVD = TruncatedSVD(n_components=max_label_space) + # F = tSVD.fit_transform(F) + # + # return F + diff --git a/src/learning/learners.py b/src/learning/learners.py index aed1094..c4c69fd 100644 --- a/src/learning/learners.py +++ b/src/learning/learners.py @@ -8,6 +8,7 @@ from sklearn.model_selection import KFold from joblib import Parallel, delayed from sklearn.feature_extraction.text import TfidfVectorizer from transformers.StandardizeTransformer import StandardizeTransformer +from sklearn.decomposition import PCA def _sort_if_sparse(X): @@ -453,13 +454,12 @@ class AndreaCLF(FunnellingPolylingualClassifier): calmode, n_jobs) + self.pca_independent_space = PCA(n_components=100) self.we_path = we_path self.config = config self.lang_word2idx = dict() self.languages = [] self.lang_tfidf = {} - # self.word_embeddings = {} - # self.supervised_embeddings = {} self.embedding_space = None self.model = None self.time = None @@ -515,6 +515,10 @@ class AndreaCLF(FunnellingPolylingualClassifier): _vertical_Z = np.vstack([Z[lang] for lang in self.languages]) _vertical_Zy = np.vstack([zy[lang] for lang in self.languages]) + # todo testing ... 
+ # self.pca_independent_space.fit(_vertical_Z) + # _vertical_Z = self.pca_independent_space.transform(_vertical_Z) + self.standardizer = StandardizeTransformer() _vertical_Z = self.standardizer.fit_predict(_vertical_Z) @@ -532,17 +536,14 @@ class AndreaCLF(FunnellingPolylingualClassifier): if self.config['supervised'] or self.config['unsupervised']: _embedding_space = self.embedding_space.predict(self.config, lX) - # l_weighted_em = self.embed(lX, ly, - # unsupervised=self.config['unsupervised'], - # supervised=self.config['supervised'], - # prediction=True) - # Z_embedded = dict() + for lang in lX.keys(): lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang])) - # lZ = Z_embedded for lang in lZ.keys(): print(lZ[lang].shape) + # todo testing + # lZ[lang] = self.pca_independent_space.transform(lZ[lang]) lZ[lang] = self.standardizer.predict(lZ[lang]) return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs) diff --git a/src/util/decompositions.py b/src/util/decompositions.py new file mode 100644 index 0000000..9029b33 --- /dev/null +++ b/src/util/decompositions.py @@ -0,0 +1,49 @@ +from sklearn.decomposition import PCA +import numpy as np +import matplotlib.pyplot as plt + +def run_pca(dim, X): + """ + :param dim: number of pca components to keep + :param X: dictionary str(lang): matrix + :return: dict lang: reduced matrix + """ + r = dict() + pca = PCA(n_components=dim) + for lang in X.keys(): + r[lang] = pca.fit_transform(X[lang]) + return r + + +def get_optimal_dim(X, embed_type): + """ + :param X: dict str(lang) : csr_matrix of embeddings unsupervised or supervised + :param embed_type: (str) embedding matrix type: S or U (WCE supervised or U unsupervised MUSE/FASTTEXT) + :return: + """ + _idx = [] + + plt.figure(figsize=(15, 10)) + if embed_type == 'U': + plt.title(f'Unsupervised Embeddings {"TODO"} Explained Variance') + else: + plt.title(f'WCE Explained Variance') + plt.xlabel('Number of Components') + plt.ylabel('Variance (%)') + + for lang in X.keys(): + pca = PCA(n_components=X[lang].shape[1]) + pca.fit(X[lang]) + _r = pca.explained_variance_ratio_ + _r = np.cumsum(_r) + plt.plot(_r, label=lang) + for i in range(len(_r) - 1, 1, -1): + delta = _r[i] - _r[i - 1] + if delta > 0: + _idx.append(i) + break + best_n = max(_idx) + plt.axvline(best_n, color='r', label='optimal N') + plt.legend() + plt.show() + return best_n \ No newline at end of file diff --git a/src/util/results.py b/src/util/results.py index 22e8021..7c25bec 100644 --- a/src/util/results.py +++ b/src/util/results.py @@ -5,7 +5,7 @@ import numpy as np class PolylingualClassificationResults: def __init__(self, file, autoflush=True, verbose=False): self.file = file - self.columns = ['id', 'method', 'learner', 'embed', 'embed_type', 'optimp', 'dataset', 'binary', 'languages', 'time', 'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes'] + self.columns = ['id', 'method', 'learner', 'embed', 'embed_type', 'optimp', 'dataset', 'time', 'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes'] self.autoflush = autoflush self.verbose = verbose if os.path.exists(file): @@ -20,8 +20,8 @@ class PolylingualClassificationResults: def already_calculated(self, id): return (self.df['id'] == id).any() - def add_row(self, id, method, learner, embed, embed_type, optimp, dataset, binary, ablation_lang, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''): - s = pd.Series([id, method, learner, embed, embed_type, optimp, dataset, binary, ablation_lang, time, lang, macrof1, microf1, macrok, microk, 
notes], index=self.columns) + def add_row(self, id, method, learner, embed, embed_type, optimp, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''): + s = pd.Series([id, method, learner, embed, embed_type, optimp, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns) self.df = self.df.append(s, ignore_index=True) if self.autoflush: self.flush() self.tell(s.to_string())
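
Below, for reference, is a minimal usage sketch (not part of the patch) of the PCA helpers added in src/util/decompositions.py. It mirrors the logic that StorageEmbeddings._add_embeddings_unsupervised now follows when the new -u/--upca option is set to 0 (search for the number of components automatically) or to a value smaller than the embedding dimensionality (reduce to that size). The language codes and the 500x300 random matrices are made-up placeholders.

import numpy as np
from util.decompositions import run_pca, get_optimal_dim

# Hypothetical per-language unsupervised embedding matrices (terms x dimensions).
lang_U = {lang: np.random.rand(500, 300) for lang in ('en', 'it')}

max_labels_U = 0  # mirrors op.max_labels_U (-u/--upca); 0 triggers the automatic search
if max_labels_U == 0:
    # Plots the cumulative explained-variance curve per language and returns the
    # largest component index after which the curve stops increasing.
    best_n = get_optimal_dim(lang_U, 'U')
    lang_U = run_pca(best_n, lang_U)
elif max_labels_U < lang_U['en'].shape[1]:
    lang_U = run_pca(max_labels_U, lang_U)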