From 9fa1899a7f1d3f73349bf20909aa0e98596fb31f Mon Sep 17 00:00:00 2001 From: andrea Date: Mon, 9 Dec 2019 15:37:52 +0100 Subject: [PATCH] refactored pca methods --- src/FPEC_andrea.py | 28 +++++--- src/data/embeddings.py | 133 +++++++++++++++++++++++++------------ src/data/supervised.py | 61 ++++++++--------- src/learning/learners.py | 17 ++--- src/util/decompositions.py | 49 ++++++++++++++ src/util/results.py | 6 +- 6 files changed, 199 insertions(+), 95 deletions(-) create mode 100644 src/util/decompositions.py diff --git a/src/FPEC_andrea.py b/src/FPEC_andrea.py index 185bcc2..1618c33 100644 --- a/src/FPEC_andrea.py +++ b/src/FPEC_andrea.py @@ -1,4 +1,4 @@ -import os, sys +import os from dataset_builder import MultilingualDataset from learning.learners import * from util.evaluation import * @@ -21,7 +21,7 @@ parser.add_option("-e", "--mode-embed", dest="mode_embed", help="Set the embedding to be used [none, unsupervised, supervised, both]", type=str, default='none') parser.add_option("-w", "--we-path", dest="we_path", - help="Path to the polylingual word embeddings", default='../embeddings/') + help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/') parser.add_option('-t', "--we-type", dest="we_type", help="Aligned embeddings to use [FastText, MUSE]", type=str, default='MUSE') @@ -30,11 +30,21 @@ parser.add_option("-s", "--set_c", dest="set_c",type=float, help="Set the C parameter", default=1) parser.add_option("-c", "--optimc", dest="optimc", action='store_true', - help="Optimices hyperparameters", default=False) + help="Optimize hyperparameters", default=False) parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int, help="Number of parallel jobs (default is -1, all)", default=-1) +parser.add_option("-p", "--pca", dest="max_labels", type=int, + help="If less than number of target classes, will apply PCA to supervised matrix. If set to 0 it" + " will automatically search for the best number of components", default=300) + +parser.add_option("-u", "--upca", dest="max_labels_U", type=int, + help="If smaller than Unsupervised Dimension, will apply PCA to unsupervised matrix. If set to 0 it" + " will automatically search for the best number of components", default=300) + +parser.add_option("-l", dest="lang", type=str) + def get_learner(calibrate=False, kernel='linear'): return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto') @@ -51,7 +61,6 @@ def get_params(dense=False): if __name__ == '__main__': - (op, args) = parser.parse_args() assert exists(op.dataset), 'Unable to find file '+str(op.dataset) @@ -64,8 +73,9 @@ if __name__ == '__main__': data = MultilingualDataset.load(op.dataset) data.show_dimensions() - # data.set_view(languages=['en','it'], categories=list(range(10))) - # data.set_view(languages=['en','it']) + data.set_view(languages=['en','it', 'pt', 'sv'], categories=list(range(10))) + # data.set_view(languages=[op.lang]) + # data.set_view(categories=list(range(10))) lXtr, lytr = data.training() lXte, lyte = data.test() @@ -104,7 +114,9 @@ if __name__ == '__main__': ##### TODO - config dict is redundant - we have already op argparse ... 
config['reduction'] = 'PCA' - config['max_label_space'] = 300 + config['max_label_space'] = op.max_labels + config['dim_reduction_unsupervised'] = op.max_labels_U + # config['plot_covariance_matrices'] = True result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '') @@ -129,5 +141,5 @@ if __name__ == '__main__': metrics.append([macrof1, microf1, macrok, microk]) print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1)) results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, config['we_type'], op.optimc, op.dataset.split('/')[-1], - 'not_binary', 'not_ablation', classifier.time, lang, macrof1, microf1, macrok, microk, '') + classifier.time, lang, macrof1, microf1, macrok, microk, '') print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0)) diff --git a/src/data/embeddings.py b/src/data/embeddings.py index 8005dad..2c02592 100644 --- a/src/data/embeddings.py +++ b/src/data/embeddings.py @@ -5,7 +5,9 @@ from torchtext.vocab import Vectors import torch from abc import ABC, abstractmethod from data.supervised import get_supervised_embeddings - +import matplotlib.pyplot as plt +from sklearn.decomposition import PCA +from util.decompositions import * class PretrainedEmbeddings(ABC): @@ -110,10 +112,10 @@ class WordEmbeddings: # vocabulary is a set of terms to be kept active_vocabulary = sorted([w for w in vocabulary if w in self.worddim]) lost = len(vocabulary)-len(active_vocabulary) - if lost>0: #some termr are missing, so it will be replaced by UNK + if lost > 0: #some terms are missing, so it will be replaced by UNK print('warning: missing {} terms for lang {}'.format(lost, self.lang)) self.we = self.get_vectors(active_vocabulary) - assert self.we.shape[0]==len(active_vocabulary) + assert self.we.shape[0] == len(active_vocabulary) self.dimword={i:w for i,w in enumerate(active_vocabulary)} self.worddim={w:i for i,w in enumerate(active_vocabulary)} return self @@ -153,7 +155,6 @@ class FastTextWikiNews(Vectors): url = self.url_base.format(language) # name = self.path.format(language) name = cache + self._name.format(language) - # print(f'\n\nFASTEXTWIKI-NEW CLASS:\nurl = {url}\nname = {name}\ncache {cache}\nlanguage = {language}') super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs) @@ -171,15 +172,17 @@ class EmbeddingsAligned(Vectors): def vocabulary(self): return set(self.stoi.keys()) - def dim(self): - return self.dim - def extract(self, words): source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.stoi) extraction = torch.zeros((len(words), self.dim)) extraction[source_idx] = self.vectors[target_idx] return extraction + def reduce(self, dim): + pca = PCA(n_components=dim) + self.vectors = pca.fit_transform(self.vectors) + return + class FastTextMUSE(PretrainedEmbeddings): @@ -209,26 +212,44 @@ class StorageEmbeddings: self.lang_U = dict() self.lang_S = dict() - def _add_embeddings_unsupervised(self, type, docs, vocs): + def _add_embeddings_unsupervised(self, type, docs, vocs, max_label_space=300): for lang in docs.keys(): + nC = self.lang_U[lang].shape[1] print(f'# [unsupervised-matrix {type}] for {lang}') voc = np.asarray(list(zip(*sorted(vocs[lang].items(), key=lambda x: x[1])))[0]) self.lang_U[lang] = EmbeddingsAligned(type, self.path, lang, voc).vectors + # if self.lang_U[lang].shape[1] > dim != 0: + # print(f'unsupervised matrix has more dimensions ({self.lang_U[lang].shape[1]}) than' + # f' the allowed limit {dim}. 
Applying PCA(n_components={dim})') + # pca = PCA(n_components=dim) + # self.lang_U[lang] = pca.fit_transform(self.lang_U[lang]) print(f'Matrix U (weighted sum) of shape {self.lang_U[lang].shape}\n') + if max_label_space == 0: + print(f'Computing optimal number of PCA components along matrices U') + optimal_n = get_optimal_dim(self.lang_U, 'U') + self.lang_U = run_pca(optimal_n, self.lang_U) + elif max_label_space < nC: + self.lang_U = run_pca(max_label_space, self.lang_U) + return def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space, voc): - _optimal = dict() - # TODO testing optimal max_label_space - if max_label_space == 'optimal': - print('Computing optimal number of PCA components ...') - optimal_n = self.get_optimal_supervised_components(docs, labels) - max_label_space = optimal_n - - for lang in docs.keys(): + # if max_label_space == 0: + # print('Computing optimal number of PCA components along matrices S...') + # optimal_n = self.get_optimal_supervised_components(docs, labels) + # max_label_space = optimal_n + for lang in docs.keys(): # compute supervised matrices S - then apply PCA + nC = self.lang_S[lang].shape[1] print(f'# [supervised-matrix] for {lang}') self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space, voc[lang], lang) print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n') + + if max_label_space == 0: + optimal_n = get_optimal_dim(self.lang_S, 'S') + self.lang_S = run_pca(optimal_n, self.lang_S) + elif max_label_space < nC: + self.lang_S = run_pca(max_label_space, self.lang_S) + return def _concatenate_embeddings(self, docs): @@ -239,7 +260,7 @@ class StorageEmbeddings: def fit(self, config, docs, vocs, labels): if config['unsupervised']: - self._add_embeddings_unsupervised(config['we_type'], docs, vocs) + self._add_embeddings_unsupervised(config['we_type'], docs, vocs, config['dim_reduction_unsupervised']) if config['supervised']: self._add_emebeddings_supervised(docs, labels, config['reduction'], config['max_label_space'], vocs) return self @@ -257,28 +278,58 @@ class StorageEmbeddings: _r[lang] = docs[lang].dot(self.lang_U[lang]) return _r - def get_optimal_supervised_components(self, docs, labels): - import matplotlib.pyplot as plt + # @staticmethod + # def get_optimal_supervised_components(docs, labels): + # optimal_n = get_optimal_dim(docs, 'S') + # return optimal_n + # _idx = [] + # + # plt.figure(figsize=(15, 10)) + # plt.title(f'WCE Explained Variance') + # plt.xlabel('Number of Components') + # plt.ylabel('Variance (%)') + # + # for lang in docs.keys(): + # _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space=0).tolist() + # _r = np.cumsum(_r) + # plt.plot(_r, label=lang) + # for i in range(len(_r)-1, 1, -1): + # delta = _r[i] - _r[i-1] + # if delta > 0: + # _idx.append(i) + # break + # best_n = max(_idx) + # plt.axvline(best_n, color='r', label='optimal N') + # plt.legend() + # plt.show() + # return best_n + # + # def get_optimal_unsupervised_components(self, type): + # _idx = [] + # + # plt.figure(figsize=(15, 10)) + # plt.title(f'Unsupervised Embeddings {type} Explained Variance') + # plt.xlabel('Number of Components') + # plt.ylabel('Variance (%)') + # + # for lang in self.lang_U.keys(): + # pca = PCA(n_components=self.lang_U[lang].shape[1]) + # pca.fit(self.lang_U[lang]) + # _r = pca.explained_variance_ratio_ + # _r = np.cumsum(_r) + # plt.plot(_r, label=lang) + # for i in range(len(_r) - 1, 1, -1): + # delta = _r[i] - _r[i - 1] + # if 
delta > 0: + # _idx.append(i) + # break + # best_n = max(_idx) + # plt.axvline(best_n, color='r', label='optimal N') + # plt.legend() + # plt.show() + # + # for lang in self.lang_U.keys(): + # pca = PCA(n_components=best_n) + # self.lang_U[lang] = pca.fit_transform(self.lang_U[lang]) + # return - _idx = [] - - plt.figure(figsize=(15, 10)) - plt.title(f'WCE Explained Variance') - plt.xlabel('Number of Components') - plt.ylabel('Variance (%)') - - for lang in docs.keys(): - _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space='optimal').tolist() - _r = np.cumsum(_r) - plt.plot(_r, label=lang) - for i in range(len(_r)-1, 1, -1): - # todo: if n_components (therfore #n labels) is not big enough every value will be smaller than the next one ... - delta = _r[i] - _r[i-1] - if delta > 0: - _idx.append(i) - break - best_n = int(sum(_idx)/len(_idx)) - plt.vlines(best_n, 0, 1, colors='r', label='optimal N') - plt.legend() - plt.show() - return best_n diff --git a/src/data/supervised.py b/src/data/supervised.py index d8e1f7d..bbd8c37 100755 --- a/src/data/supervised.py +++ b/src/data/supervised.py @@ -1,5 +1,5 @@ from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square -from sklearn.decomposition import PCA, TruncatedSVD +from sklearn.decomposition import PCA from sklearn.manifold import TSNE import numpy as np @@ -41,15 +41,9 @@ def supervised_embeddings_tsr(X,Y, tsr_function=information_gain, max_documents= def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, lang='None', binary_structural_problems=-1, method='dotn', dozscore=True): - if max_label_space == 'optimal': - max_label_space = 0 - if max_label_space != 0: print('computing supervised embeddings...') - nC = Y.shape[1] - if nC==2 and binary_structural_problems > nC: - raise ValueError('not implemented in this branch') if method=='ppmi': F = supervised_embeddings_ppmi(X, Y) @@ -64,8 +58,7 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, la F = zscores(F, axis=0) # Dumping F-matrix for further studies - # TODO im not sure if voc.keys and F matrix indices are "aligned" correctly - dump_it = True + dump_it = False if dump_it: with open(f'/home/andreapdr/funneling_pdr/src/dumps/WCE_{lang}.tsv', 'w') as outfile: np.savetxt(outfile, F, delimiter='\t') @@ -73,34 +66,32 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, la for token in voc.keys(): outfile.write(token+'\n') - - - if nC > max_label_space: - # TODO testing optimal max_label_space - if reduction == 'PCA': - if max_label_space == 0: - pca = PCA(n_components=Y.shape[1]) - pca = pca.fit(F) - return pca.explained_variance_ratio_ - - print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' - f'Applying PCA(n_components={max_label_space})') - pca = PCA(n_components=max_label_space) - pca = pca.fit(F) - F = pca.fit_transform(F) - elif reduction == 'TSNE': - print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' - f'Applying t-SNE(n_components={max_label_space})') - tsne = TSNE(n_components=max_label_space) - F = tsne.fit_transform(F) - elif reduction == 'tSVD': - print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. 
' - f'Applying truncatedSVD(n_components={max_label_space})') - tSVD = TruncatedSVD(n_components=max_label_space) - F = tSVD.fit_transform(F) - return F + # if nC >= max_label_space: + # if reduction == 'PCA': + # if max_label_space == 0: + # pca = PCA(n_components=Y.shape[1]) + # pca = pca.fit(F) + # return pca.explained_variance_ratio_ + # + # print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' + # f'Applying PCA(n_components={max_label_space})') + # pca = PCA(n_components=max_label_space) + # F = pca.fit_transform(F) + # elif reduction == 'TSNE': + # print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' + # f'Applying t-SNE(n_components={max_label_space})') + # tsne = TSNE(n_components=max_label_space) + # F = tsne.fit_transform(F) + # elif reduction == 'tSVD': + # print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' + # f'Applying truncatedSVD(n_components={max_label_space})') + # tSVD = TruncatedSVD(n_components=max_label_space) + # F = tSVD.fit_transform(F) + # + # return F + diff --git a/src/learning/learners.py b/src/learning/learners.py index aed1094..c4c69fd 100644 --- a/src/learning/learners.py +++ b/src/learning/learners.py @@ -8,6 +8,7 @@ from sklearn.model_selection import KFold from joblib import Parallel, delayed from sklearn.feature_extraction.text import TfidfVectorizer from transformers.StandardizeTransformer import StandardizeTransformer +from sklearn.decomposition import PCA def _sort_if_sparse(X): @@ -453,13 +454,12 @@ class AndreaCLF(FunnellingPolylingualClassifier): calmode, n_jobs) + self.pca_independent_space = PCA(n_components=100) self.we_path = we_path self.config = config self.lang_word2idx = dict() self.languages = [] self.lang_tfidf = {} - # self.word_embeddings = {} - # self.supervised_embeddings = {} self.embedding_space = None self.model = None self.time = None @@ -515,6 +515,10 @@ class AndreaCLF(FunnellingPolylingualClassifier): _vertical_Z = np.vstack([Z[lang] for lang in self.languages]) _vertical_Zy = np.vstack([zy[lang] for lang in self.languages]) + # todo testing ... 
+ # self.pca_independent_space.fit(_vertical_Z) + # _vertical_Z = self.pca_independent_space.transform(_vertical_Z) + self.standardizer = StandardizeTransformer() _vertical_Z = self.standardizer.fit_predict(_vertical_Z) @@ -532,17 +536,14 @@ class AndreaCLF(FunnellingPolylingualClassifier): if self.config['supervised'] or self.config['unsupervised']: _embedding_space = self.embedding_space.predict(self.config, lX) - # l_weighted_em = self.embed(lX, ly, - # unsupervised=self.config['unsupervised'], - # supervised=self.config['supervised'], - # prediction=True) - # Z_embedded = dict() + for lang in lX.keys(): lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang])) - # lZ = Z_embedded for lang in lZ.keys(): print(lZ[lang].shape) + # todo testing + # lZ[lang] = self.pca_independent_space.transform(lZ[lang]) lZ[lang] = self.standardizer.predict(lZ[lang]) return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs) diff --git a/src/util/decompositions.py b/src/util/decompositions.py new file mode 100644 index 0000000..9029b33 --- /dev/null +++ b/src/util/decompositions.py @@ -0,0 +1,49 @@ +from sklearn.decomposition import PCA +import numpy as np +import matplotlib.pyplot as plt + +def run_pca(dim, X): + """ + :param dim: number of pca components to keep + :param X: dictionary str(lang): matrix + :return: dict lang: reduced matrix + """ + r = dict() + pca = PCA(n_components=dim) + for lang in X.keys(): + r[lang] = pca.fit_transform(X[lang]) + return r + + +def get_optimal_dim(X, embed_type): + """ + :param X: dict str(lang) : csr_matrix of embeddings unsupervised or supervised + :param embed_type: (str) embedding matrix type: S or U (WCE supervised or U unsupervised MUSE/FASTTEXT) + :return: + """ + _idx = [] + + plt.figure(figsize=(15, 10)) + if embed_type == 'U': + plt.title(f'Unsupervised Embeddings {"TODO"} Explained Variance') + else: + plt.title(f'WCE Explained Variance') + plt.xlabel('Number of Components') + plt.ylabel('Variance (%)') + + for lang in X.keys(): + pca = PCA(n_components=X[lang].shape[1]) + pca.fit(X[lang]) + _r = pca.explained_variance_ratio_ + _r = np.cumsum(_r) + plt.plot(_r, label=lang) + for i in range(len(_r) - 1, 1, -1): + delta = _r[i] - _r[i - 1] + if delta > 0: + _idx.append(i) + break + best_n = max(_idx) + plt.axvline(best_n, color='r', label='optimal N') + plt.legend() + plt.show() + return best_n \ No newline at end of file diff --git a/src/util/results.py b/src/util/results.py index 22e8021..7c25bec 100644 --- a/src/util/results.py +++ b/src/util/results.py @@ -5,7 +5,7 @@ import numpy as np class PolylingualClassificationResults: def __init__(self, file, autoflush=True, verbose=False): self.file = file - self.columns = ['id', 'method', 'learner', 'embed', 'embed_type', 'optimp', 'dataset', 'binary', 'languages', 'time', 'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes'] + self.columns = ['id', 'method', 'learner', 'embed', 'embed_type', 'optimp', 'dataset', 'time', 'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes'] self.autoflush = autoflush self.verbose = verbose if os.path.exists(file): @@ -20,8 +20,8 @@ class PolylingualClassificationResults: def already_calculated(self, id): return (self.df['id'] == id).any() - def add_row(self, id, method, learner, embed, embed_type, optimp, dataset, binary, ablation_lang, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''): - s = pd.Series([id, method, learner, embed, embed_type, optimp, dataset, binary, ablation_lang, time, lang, macrof1, microf1, macrok, microk, 
notes], index=self.columns) + def add_row(self, id, method, learner, embed, embed_type, optimp, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''): + s = pd.Series([id, method, learner, embed, embed_type, optimp, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns) self.df = self.df.append(s, ignore_index=True) if self.autoflush: self.flush() self.tell(s.to_string())
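
Below, for reference, is a minimal usage sketch (not part of the patch) of the PCA helpers added in src/util/decompositions.py. It mirrors the logic that StorageEmbeddings._add_embeddings_unsupervised now follows when the new -u/--upca option is set to 0 (search for the number of components automatically) or to a value smaller than the embedding dimensionality (reduce to that size). The language codes and the 500x300 random matrices are made-up placeholders.

import numpy as np
from util.decompositions import run_pca, get_optimal_dim

# Hypothetical per-language unsupervised embedding matrices (terms x dimensions).
lang_U = {lang: np.random.rand(500, 300) for lang in ('en', 'it')}

max_labels_U = 0  # mirrors op.max_labels_U (-u/--upca); 0 triggers the automatic search
if max_labels_U == 0:
    # Plots the cumulative explained-variance curve per language and returns the
    # largest component index after which the curve stops increasing.
    best_n = get_optimal_dim(lang_U, 'U')
    lang_U = run_pca(best_n, lang_U)
elif max_labels_U < lang_U['en'].shape[1]:
    lang_U = run_pca(max_labels_U, lang_U)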