diff --git a/src/FPEC_andrea.py b/src/FPEC_andrea.py
index 9be7c42..137e6cc 100644
--- a/src/FPEC_andrea.py
+++ b/src/FPEC_andrea.py
@@ -103,8 +103,8 @@ if __name__ == '__main__':
     _config_id = 'M_and_F'

     ##### TODO - config dict is redundant - we have already op argparse ...
-    config['reduction'] = 'tSVD'
-    config['max_label_space'] = 50
+    config['reduction'] = 'PCA'
+    config['max_label_space'] = 'optimal'

     result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')

diff --git a/src/data/embeddings.py b/src/data/embeddings.py
index 66a14d0..d1ad651 100644
--- a/src/data/embeddings.py
+++ b/src/data/embeddings.py
@@ -5,7 +5,6 @@ from torchtext.vocab import Vectors
 import torch
 from abc import ABC, abstractmethod
 from data.supervised import get_supervised_embeddings
-from sklearn.decomposition import PCA


 class PretrainedEmbeddings(ABC):
@@ -244,10 +243,15 @@ class StorageEmbeddings:
         return

     def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space):
+        # TODO testing optimal max_label_space
+        if max_label_space == 'optimal':
+            print('Computing optimal number of PCA components ...')
+            optimal_n = self.get_optimal_supervised_components(docs, labels)
+            max_label_space = optimal_n
+
         for lang in docs.keys():
             print(f'# [supervised-matrix] for {lang}')
-            # should also pass max_label_space and reduction techniques
-            self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space)
+            self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space, lang)
             print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')
         return

@@ -277,22 +281,18 @@ class StorageEmbeddings:
             _r[lang] = docs[lang].dot(self.lang_U[lang])
         return _r

+    def get_optimal_supervised_components(self, docs, labels):
+        _idx = []
+        for lang in docs.keys():
+            # PCA probe: returns the explained variance ratio of every component
+            _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space='optimal').tolist()
-# def embedding_matrix(type, path, voc, lang):
-#     vocabulary = np.asarray(list(zip(*sorted(voc.items(), key=lambda x: x[1])))[0])
-#
-#     print('[embedding matrix]')
-#     print(f'# [pretrained-matrix: {type} {lang}]')
-#     pretrained = EmbeddingsAligned(type, path, lang)
-#     P = pretrained.extract(vocabulary).numpy()
-#     del pretrained
-#     print(f'[embedding matrix done] of shape={P.shape}\n')
-#
-#     return vocabulary, P
-
-
-def WCE_matrix(Xtr, Ytr, lang, reduction=None, n_components=50):
-    print('\n# [supervised-matrix]')
-    S = get_supervised_embeddings(Xtr[lang], Ytr[lang])
-    print(f'[embedding matrix done] of shape={S.shape}\n')
-    return S
+            for i in range(len(_r)-1, 1, -1):
+                # TODO: if n_components (and therefore the number of labels) is not big enough, every ratio will be smaller than the one before it ...
+                delta = _r[i] - _r[i-1]
+                if delta > 0:
+                    _idx.append(i)
+                    break
+        # average the per-language elbow indices; fall back to the full label space if no elbow was found
+        best_n = int(sum(_idx) / len(_idx)) if _idx else len(_r)
+        return best_n

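Note on the elbow search above: get_optimal_supervised_components scans the PCA explained-variance ratios from the tail and keeps the first index at which the ratio increases, then averages the per-language indices. Below is a minimal, self-contained sketch of the same heuristic on synthetic data; elbow_index, the random matrix, and the fallback value are illustrative and not part of this patch. As the inline TODO already hints, sklearn sorts explained_variance_ratio_ in non-increasing order, so a positive delta can only arise from numerical noise, and the fallback branch will usually be taken.

    import numpy as np
    from sklearn.decomposition import PCA

    def elbow_index(ratios):
        # scan from the tail, as get_optimal_supervised_components does
        for i in range(len(ratios) - 1, 1, -1):
            if ratios[i] - ratios[i - 1] > 0:  # ratio grows again -> elbow
                return i
        return len(ratios)  # no elbow found: keep the full label space

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 30))
    ratios = PCA(n_components=30).fit(X).explained_variance_ratio_.tolist()
    print(elbow_index(ratios))
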
diff --git a/src/data/supervised.py b/src/data/supervised.py
index b3c4fb9..f365dfd 100755
--- a/src/data/supervised.py
+++ b/src/data/supervised.py
@@ -40,8 +40,14 @@ def supervised_embeddings_tsr(X,Y, tsr_function=information_gain, max_documents=
     return F


-def get_supervised_embeddings(X, Y, reduction, max_label_space=300, binary_structural_problems=-1, method='dotn', dozscore=True):
-    print('computing supervised embeddings...')
+def get_supervised_embeddings(X, Y, reduction, max_label_space=300, lang=None, binary_structural_problems=-1, method='dotn', dozscore=True):
+    # 'optimal' is encoded internally as max_label_space == 0: the call then
+    # becomes a PCA probe returning the explained variance ratios instead of F
+    if max_label_space == 'optimal':
+        max_label_space = 0
+
+    if max_label_space != 0:
+        print('computing supervised embeddings...')
     nC = Y.shape[1]

     if nC==2 and binary_structural_problems > nC:
@@ -60,21 +66,42 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, binary_struc
         F = zscores(F, axis=0)

     if nC > max_label_space:
+        # TODO testing optimal max_label_space
         if reduction == 'PCA':
+            if max_label_space == 0:
+                # probe mode: fit a full-rank PCA and hand back the spectrum
+                pca = PCA(n_components=Y.shape[1])
+                pca = pca.fit(F)
+                return pca.explained_variance_ratio_
+
             print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
                   f'Applying PCA(n_components={max_label_space})')
             pca = PCA(n_components=max_label_space)
-            F = pca.fit(F).transform(F)
+            pca = pca.fit(F)
+
+            ########################################################
+            # debug plot: cumulative explained variance per language
+            import matplotlib.pyplot as plt
+
+            plt.figure()
+            plt.plot(np.cumsum(pca.explained_variance_ratio_))
+            plt.xlabel('Number of Components')
+            plt.ylabel('Cumulative Explained Variance')
+            plt.title(f'WCE Explained Variance {lang}')
+            plt.show()
+            ########################################################
+
+            F = pca.transform(F)
         elif reduction == 'TSNE':
             print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
                   f'Applying t-SNE(n_components={max_label_space})')
             tsne = TSNE(n_components=max_label_space)
-            F = tsne.fit(F).fit_transform(F)
+            F = tsne.fit_transform(F)
         elif reduction == 'tSVD':
             print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
                   f'Applying truncatedSVD(n_components={max_label_space})')
             tSVD = TruncatedSVD(n_components=max_label_space)
-            F = tSVD.fit(F).fit_transform(F)
+            F = tSVD.fit_transform(F)

     return F

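If the intent of the 'optimal' probe is simply "enough components to explain a given share of the variance", scikit-learn can do the selection directly: a float n_components in (0, 1) keeps the smallest number of components reaching that fraction. A minimal sketch on random data; the 0.95 threshold and the matrix shape are illustrative assumptions, not values used by this patch.

    import numpy as np
    from sklearn.decomposition import PCA

    rng = np.random.default_rng(0)
    F = rng.normal(size=(500, 73))  # stand-in for the supervised matrix F

    pca = PCA(n_components=0.95, svd_solver='full').fit(F)
    print(f'components kept for 95% of the variance: {pca.n_components_}')
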
diff --git a/src/learning/learners.py b/src/learning/learners.py
index 89bda7e..aed1094 100644
--- a/src/learning/learners.py
+++ b/src/learning/learners.py
@@ -1,6 +1,6 @@
 import numpy as np
 import time
-from data.embeddings import WordEmbeddings, WCE_matrix, StorageEmbeddings
+from data.embeddings import WordEmbeddings, StorageEmbeddings
 from scipy.sparse import issparse
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.model_selection import GridSearchCV
@@ -493,43 +493,6 @@ class AndreaCLF(FunnellingPolylingualClassifier):

         return lZ, lYtr

-    # def embed(self, lX, ly, unsupervised=False, supervised=False, prediction=False):
-    #     """
-    #     build embedding matrix for given language and returns its weighted sum wrt tf-idf score
-    #     """
-    #     _r = dict()
-    #     languages = list(lX.keys())
-    #
-    #     if prediction:
-    #         for lang in languages:
-    #             if unsupervised:  # If unsupervised embeddings ...
-    #                 M = self.word_embeddings[lang]
-    #                 if supervised:  # and also unsupervised --> get both (M) and (S) weighted sum matrices and hstack them
-    #                     S = self.supervised_embeddings[lang]
-    #                     _r[lang] = np.hstack((lX[lang].dot(M), lX[lang].dot(S)))
-    #                     continue
-    #                 _r[lang] = lX[lang].dot(M)  # if not supervised --> just get weighted sum of unsupervised (M) embeddings
-    #             else:  # If not unsupervised --> get (S) matrix and its weighted sum
-    #                 S = self.supervised_embeddings[lang]
-    #                 _r[lang] = lX[lang].dot(S)
-    #         return _r
-    #
-    #     if unsupervised:
-    #         for lang in languages:
-    #             _, M = embedding_matrix(self.config['we_type'], self.we_path, self.lang_word2idx[lang], lang)
-    #             self.word_embeddings[lang] = M
-    #             _r[lang] = lX[lang].dot(M)
-    #
-    #     if supervised:
-    #         for lang in languages:
-    #             S = WCE_matrix(lX, ly, lang)
-    #             self.supervised_embeddings[lang] = S
-    #             if unsupervised:
-    #                 _r[lang] = np.hstack((_r[lang], lX[lang].dot(S)))
-    #             else:
-    #                 _r[lang] = lX[lang].dot(S)
-    #     return _r
-
     # @override std class method
     def fit(self, lX, ly):
         tinit = time.time()
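
For reference, the projection the removed embed method performed (and which StorageEmbeddings now centralizes) is a tf-idf-weighted sum of embedding rows via a dot product, with the unsupervised and supervised projections stacked horizontally. A toy sketch with random stand-ins for the tf-idf matrix and the two embedding matrices; all shapes are illustrative, not the project's data.

    import numpy as np

    rng = np.random.default_rng(0)
    X = rng.random((10, 5000))        # docs x vocabulary (tf-idf weights)
    M = rng.normal(size=(5000, 300))  # pretrained (unsupervised) embeddings
    S = rng.normal(size=(5000, 73))   # word-class (supervised) embeddings

    Z = np.hstack((X.dot(M), X.dot(S)))  # docs x (300 + 73), as in embed()
    print(Z.shape)
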