get_optimal_supervised_components method - to be polished

2019-12-03 19:57:11 +01:00 · 2019-12-03 19:57:11 +01:00 · f074fd97f9
parent 4de6b3e250
commit f074fd97f9
4 changed files with 54 additions and 66 deletions
--- a/src/FPEC_andrea.py
+++ b/src/FPEC_andrea.py
@ -103,8 +103,8 @@ if __name__ == '__main__':
        _config_id = 'M_and_F'

    ##### TODO - config dict is redundant - we have already op argparse ...
-    config['reduction'] = 'tSVD'
-    config['max_label_space'] = 50
+    config['reduction'] = 'PCA'
+    config['max_label_space'] = 'optimal'

    result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')

--- a/src/data/embeddings.py
+++ b/src/data/embeddings.py
@ -5,7 +5,6 @@ from torchtext.vocab import Vectors
 import torch
 from abc import ABC, abstractmethod
 from data.supervised import get_supervised_embeddings
-from sklearn.decomposition import PCA


 class PretrainedEmbeddings(ABC):
@ -244,10 +243,16 @@ class StorageEmbeddings:
        return

    def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space):
+        _optimal = dict()
+        # TODO testing optimal max_label_space
+        if max_label_space == 'optimal':
+            print('Computing optimal number of PCA components ...')
+            optimal_n = self.get_optimal_supervised_components(docs, labels)
+            max_label_space = optimal_n
+
        for lang in docs.keys():
            print(f'# [supervised-matrix] for {lang}')
-            # should also pass max_label_space and reduction techniques
-            self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space)
+            self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space, lang)
            print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')
        return

@ -277,22 +282,19 @@ class StorageEmbeddings:
                _r[lang] = docs[lang].dot(self.lang_U[lang])
        return _r

+    def get_optimal_supervised_components(self, docs, labels):
+        _idx = []
+        for lang in docs.keys():
+            _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space='optimal').tolist()

-# def embedding_matrix(type, path, voc, lang):
-#     vocabulary = np.asarray(list(zip(*sorted(voc.items(), key=lambda x: x[1])))[0])
-#
-#     print('[embedding matrix]')
-#     print(f'# [pretrained-matrix: {type} {lang}]')
-#     pretrained = EmbeddingsAligned(type, path, lang)
-#     P = pretrained.extract(vocabulary).numpy()
-#     del pretrained
-#     print(f'[embedding matrix done] of shape={P.shape}\n')
-#
-#     return vocabulary, P
-
-
-def WCE_matrix(Xtr, Ytr, lang, reduction=None, n_components=50):
-    print('\n# [supervised-matrix]')
-    S = get_supervised_embeddings(Xtr[lang], Ytr[lang])
-    print(f'[embedding matrix done] of shape={S.shape}\n')
-    return S
+            for i in range(len(_r)-1, 1, -1):
+                # todo: if n_components (therefore #n labels) is not big enough every value will be smaller than the next one ...
+                ratio = _r[i]
+                next_ratio = _r[i-1]
+                delta = _r[i] - _r[i-1]
+                if delta > 0:
+                # if ratio < next_ratio:
+                    _idx.append(i)
+                    break
+        best_n = int(sum(_idx)/len(_idx))
+        return best_n
--- a/src/data/supervised.py
+++ b/src/data/supervised.py
@ -40,8 +40,12 @@ def supervised_embeddings_tsr(X,Y, tsr_function=information_gain, max_documents=
    return F


-def get_supervised_embeddings(X, Y, reduction, max_label_space=300, binary_structural_problems=-1, method='dotn', dozscore=True):
-    print('computing supervised embeddings...')
+def get_supervised_embeddings(X, Y, reduction, max_label_space=300, lang='None', binary_structural_problems=-1, method='dotn', dozscore=True):
+    if max_label_space == 'optimal':
+        max_label_space = 0
+
+    if max_label_space != 0:
+        print('computing supervised embeddings...')

    nC = Y.shape[1]
    if nC==2 and binary_structural_problems > nC:
@ -60,21 +64,40 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, binary_struc
        F = zscores(F, axis=0)

    if nC > max_label_space:
+        # TODO testing optimal max_label_space
        if reduction == 'PCA':
+            if max_label_space == 0:
+                pca = PCA(n_components=Y.shape[1])
+                pca = pca.fit(F)
+                return pca.explained_variance_ratio_
+
            print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
                  f'Applying PCA(n_components={max_label_space})')
            pca = PCA(n_components=max_label_space)
-            F = pca.fit(F).transform(F)
+            pca = pca.fit(F)
+
+            ########################################################
+            import matplotlib.pyplot as plt
+
+            plt.figure()
+            plt.plot(np.cumsum(pca.explained_variance_ratio_))
+            plt.xlabel('Number of Components')
+            plt.ylabel('Variance (%)')  #
+            plt.title(f'WCE Explained Variance {lang}')
+            plt.show()
+            ########################################################
+
+            F = pca.fit_transform(F)
        elif reduction == 'TSNE':
            print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
                  f'Applying t-SNE(n_components={max_label_space})')
            tsne = TSNE(n_components=max_label_space)
-            F = tsne.fit(F).fit_transform(F)
+            F = tsne.fit_transform(F)
        elif reduction == 'tSVD':
            print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
                  f'Applying truncatedSVD(n_components={max_label_space})')
            tSVD = TruncatedSVD(n_components=max_label_space)
-            F = tSVD.fit(F).fit_transform(F)
+            F = tSVD.fit_transform(F)

    return F

--- a/src/learning/learners.py
+++ b/src/learning/learners.py
@ -1,6 +1,6 @@
 import numpy as np
 import time
-from data.embeddings import WordEmbeddings, WCE_matrix, StorageEmbeddings
+from data.embeddings import WordEmbeddings, StorageEmbeddings
 from scipy.sparse import issparse
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.model_selection import GridSearchCV
@ -493,43 +493,6 @@ class AndreaCLF(FunnellingPolylingualClassifier):

        return lZ, lYtr

-    # def embed(self, lX, ly, unsupervised=False, supervised=False, prediction=False):
-    #     """
-    #     build embedding matrix for given language and returns its weighted sum wrt tf-idf score
-    #     """
-    #     _r = dict()
-    #     languages = list(lX.keys())
-    #
-    #     if prediction:
-    #         for lang in languages:
-    #             if unsupervised:    # If unsupervised embeddings ...
-    #                 M = self.word_embeddings[lang]
-    #                 if supervised:  # and also unsupervised --> get both (M) and (S) weighted sum matrices and hstack them
-    #                     S = self.supervised_embeddings[lang]
-    #                     _r[lang] = np.hstack((lX[lang].dot(M), lX[lang].dot(S)))
-    #                     continue
-    #                 _r[lang] = lX[lang].dot(M)  # if not supervised --> just get weighted sum of unsupervised (M) embeddings
-    #             else:   # If not unsupervised --> get (S) matrix and its weighted sum
-    #                 S = self.supervised_embeddings[lang]
-    #                 _r[lang] = lX[lang].dot(S)
-    #         return _r
-    #
-    #     if unsupervised:
-    #         for lang in languages:
-    #             _, M = embedding_matrix(self.config['we_type'], self.we_path, self.lang_word2idx[lang], lang)
-    #             self.word_embeddings[lang] = M
-    #             _r[lang] = lX[lang].dot(M)
-    #
-    #     if supervised:
-    #         for lang in languages:
-    #             S = WCE_matrix(lX, ly, lang)
-    #             self.supervised_embeddings[lang] = S
-    #             if unsupervised:
-    #                 _r[lang] = np.hstack((_r[lang], lX[lang].dot(S)))
-    #             else:
-    #                 _r[lang] = lX[lang].dot(S)
-    #     return _r
-
    # @override std class method
    def fit(self, lX, ly):
        tinit = time.time()