from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt


def run_pca(dim, X):
    """
    :param dim: number of PCA components to keep
    :param X: dict str(lang): embedding matrix
    :return: dict str(lang): reduced matrix
    """
    r = dict()
    pca = PCA(n_components=dim)
    for lang in X.keys():
        # PCA needs dense input; fit_transform refits the model on each
        # language's matrix, so every language gets its own projection
        _X = X[lang].toarray() if hasattr(X[lang], 'toarray') else X[lang]
        r[lang] = pca.fit_transform(_X)
    return r
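
# A minimal usage sketch (illustrative only: the matrices and shapes below
# are made up, not from the original project):
#
#   X = {'en': np.random.rand(100, 300), 'it': np.random.rand(100, 300)}
#   reduced = run_pca(50, X)
#   reduced['en'].shape  # -> (100, 50)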


def get_optimal_dim(X, embed_type):
    """
    Plot the cumulative explained-variance curve of each language's
    embedding matrix and return a heuristic optimal number of components.

    :param X: dict str(lang): csr_matrix of embeddings, supervised or unsupervised
    :param embed_type: (str) embedding matrix type: 'S' (WCE supervised) or
        'U' (unsupervised MUSE/FastText)
    :return: the largest component index, across languages, at which the
        cumulative explained variance still increases
    """
    _idx = []

    plt.figure(figsize=(15, 10))
    if embed_type == 'U':
        plt.title(f'Unsupervised Embeddings {"TODO"} Explained Variance')
    else:
        plt.title('WCE Explained Variance')
    plt.xlabel('Number of Components')
    plt.ylabel('Variance (%)')

    for lang in X.keys():
        # PCA requires dense input, and n_components may not exceed
        # min(n_samples, n_features)
        _X = X[lang].toarray() if hasattr(X[lang], 'toarray') else X[lang]
        pca = PCA(n_components=min(_X.shape))
        pca.fit(_X)
        _r = np.cumsum(pca.explained_variance_ratio_)
        plt.plot(_r, label=lang)
        # scan backwards for the last component that still adds variance
        for i in range(len(_r) - 1, 1, -1):
            delta = _r[i] - _r[i - 1]
            if delta > 0:
                _idx.append(i)
                break

    best_n = max(_idx)
    plt.axvline(best_n, color='r', label='optimal N')
    plt.legend()
    plt.show()
    return best_n
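

# A minimal end-to-end sketch (illustrative only): synthetic Gaussian data
# stands in for real embedding matrices, and the languages and 300-dim
# shapes are assumptions, not values from the original project.
if __name__ == '__main__':
    rng = np.random.default_rng(0)
    X = {lang: rng.standard_normal((50, 300)) for lang in ('en', 'it')}
    n = get_optimal_dim(X, embed_type='U')  # plots the variance curves
    reduced = run_pca(n, X)
    print({lang: m.shape for lang, m in reduced.items()})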