# gFun/src/transformers/dci.py

import numpy as np
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix, issparse
from scipy.spatial.distance import cosine
import operator
import functools
import math, sys
# from sklearn.externals.joblib import Parallel, delayed
from joblib import Parallel, delayed
class DistributionalCorrespondenceIndexing:
    """
    Distributional Correspondence Indexing (DCI).

    Projects the feature space of each domain into a common, pivot-defined vector space:
    each feature is represented by its distributional correspondence (as measured by a
    distributional correspondence function, dcf) with a shared set of pivot features.
    Documents are then embedded by multiplying their bag-of-features vectors by the
    feature-projection matrix of their domain.
    """

    # dcfs that operate on sets of non-zero event indexes (probabilistic counters)
    prob_dcf = ['linear', 'pmi']
    # dcfs that operate directly on the raw distributional vectors
    vect_dcf = ['cosine']
    valid_dcf = prob_dcf + vect_dcf
    valid_post = ['normal', 'l2', None]

    def __init__(self, dcf='cosine', post='normal', n_jobs=-1):
        """
        :param dcf: a distributional correspondence function name (e.g., 'cosine') or a callable f(u,v) which
            measures the distributional correspondence between vectors u and v
        :param post: post-processing function to apply to document embeddings. Default is to standardize it into a
            normal distribution; other functions allowed are 'l2' or None
        :param n_jobs: number of parallel jobs for fitting/transforming (-1 means all processors)
        :raises ValueError: if post or dcf is not recognized
        """
        if post not in self.valid_post:
            # note: valid_post contains None, so str() is required before joining
            raise ValueError("unknown post processing function; valid ones are [%s]"
                             % ', '.join(str(p) for p in self.valid_post))
        if isinstance(dcf, str):
            if dcf not in self.valid_dcf:
                raise ValueError("unknown dcf; use any in [%s]" % ', '.join(self.valid_dcf))
            # plain (unbound) function; dcf_dist calls it as self.dcf(self, ...)
            self.dcf = getattr(DistributionalCorrespondenceIndexing, dcf)
        elif callable(dcf):
            # user callables have signature f(u, v); wrap them so the internal calling
            # convention dcf(self, u, v) forwards only (u, v).
            # NOTE(review): a lambda is not picklable, so user callables require a
            # thread/sequential joblib backend (n_jobs=1 is always safe).
            self.dcf = lambda _self, u, v: dcf(u, v)
        else:
            raise ValueError('param dcf should either be a valid dcf name in [%s] or a callable comparing two vectors'
                             % ', '.join(self.valid_dcf))
        self.post = post
        self.domains = None
        self.dFP = None  # {domain: (features x pivots) projection matrix}, set by fit
        self.n_jobs = n_jobs

    def fit(self, dU, dP):
        """
        :param dU: a dictionary of {domain:dsm_matrix}, where dsm is a document-by-term matrix representing the
            distributional semantic model for a specific domain
        :param dP: a dictionary {domain:pivot_matrix} where domain is a string representing each domain,
            and pivot_matrix has shape (d,p) with d the dimensionality of the distributional space, and p the
            number of pivots
        :return: self
        """
        self.domains = list(dP.keys())
        assert len(np.unique([P.shape[1] for P in dP.values()])) == 1, "inconsistent number of pivots across domains"
        assert set(dU.keys()) == set(self.domains), "inconsistent domains in dU and dP"
        assert not [1 for d in self.domains if dU[d].shape[0] != dP[d].shape[0]], \
            "inconsistent dimensions between distributional and pivot spaces"
        self.dimensions = list(dP.values())[0].shape[1]
        # embed the feature space from each domain using the pivots of that domain:
        # dcf_dist works feature-wise, hence the transposition of both matrices
        transformations = Parallel(n_jobs=self.n_jobs)(
            delayed(self.dcf_dist)(dU[d].transpose(), dP[d].transpose()) for d in self.domains
        )
        self.dFP = {d: transformations[i] for i, d in enumerate(self.domains)}
        return self  # was missing: fit_transform relies on fit returning self

    def _dom_transform(self, X, FP):
        """Project documents X into the pivot space via FP and apply the post-processing."""
        _X = X.dot(FP)
        if self.post == 'l2':
            _X = normalize(_X, norm='l2', axis=1)
        elif self.post == 'normal':
            # standardize each dimension; clip std to avoid division by ~0 on constant columns
            std = np.clip(np.std(_X, axis=0), 1e-5, None)
            _X = (_X - np.mean(_X, axis=0)) / std
        return _X

    def transform(self, dX):
        """
        :param dX: a dictionary {domain:dsm}, where dsm (distributional semantic model) is, e.g.,
            a document-by-term csr_matrix; every domain must have been seen during fit
        :return: a dictionary {domain:embedded_documents}
        """
        assert self.dFP is not None, 'transform method called before fit'
        assert set(dX.keys()).issubset(self.domains), 'domains in dX are not in scope'
        domains = list(dX.keys())
        transformations = Parallel(n_jobs=self.n_jobs)(
            delayed(self._dom_transform)(dX[d], self.dFP[d]) for d in domains
        )
        return {d: transformations[i] for i, d in enumerate(domains)}

    def fit_transform(self, dU, dP, dX):
        """Equivalent to fit(dU, dP) followed by transform(dX)."""
        return self.fit(dU, dP).transform(dX)

    def _prevalence(self, v):
        """Fraction of non-zero entries in vector v (works for sparse and dense arrays)."""
        if issparse(v):
            return float(v.nnz) / functools.reduce(operator.mul, v.shape, 1)  # works for arrays of any rank
        elif isinstance(v, np.ndarray):
            return float(v[v > 0].size) / v.size
        raise TypeError('unsupported vector type for prevalence computation')

    def linear(self, u, v, D):
        """Linear dcf: tpr + tnr - 1 (in [-1,1]) from the 4-cell contingency of index-sets u, v over D events."""
        tp, fp, fn, tn = self._get_4cellcounters(u, v, D)
        den1 = tp + fn
        den2 = tn + fp
        tpr = (tp * 1. / den1) if den1 != 0 else 0.
        tnr = (tn * 1. / den2) if den2 != 0 else 0.
        return tpr + tnr - 1

    def pmi(self, u, v, D):
        """Pointwise mutual information dcf between index-sets u and v over D events (0 when undefined)."""
        tp, fp, fn, tn = self._get_4cellcounters(u, v, D)
        Pxy = tp * 1. / D
        Pxny = fp * 1. / D
        Pnxy = fn * 1. / D
        Px = Pxy + Pxny
        Py = Pxy + Pnxy
        if (Px == 0 or Py == 0 or Pxy == 0):
            return 0.0
        score = math.log2(Pxy / (Px * Py))
        # defensive check: with the guard above the ratio is finite and positive,
        # so this should be unreachable; raise rather than sys.exit() in library code
        if np.isnan(score) or np.isinf(score):
            raise ValueError('pmi produced a non-finite score')
        return score

    def cosine(self, u, v):
        """Cosine dcf: cosine *similarity* between u and v, corrected by the expected
        chance similarity sqrt(prevalence(u) * prevalence(v))."""
        pu = self._prevalence(u)
        pv = self._prevalence(v)
        # scipy's cosine() returns the cosine DISTANCE (1 - similarity); the dcf
        # is defined over the similarity, hence the 1.- conversion
        return (1. - cosine(u, v)) - np.sqrt(pu * pv)

    def _get_4cellcounters(self, u, v, D):
        """
        :param u: a set of indexes with a non-zero value
        :param v: a set of indexes with a non-zero value
        :param D: the number of events (i.e., all possible indexes)
        :return: the 4-cell contingency values (tp, fp, fn, tn)
        """
        common = u.intersection(v)
        tp = len(common)
        fp = len(u) - len(common)
        fn = len(v) - len(common)
        tn = D - (tp + fp + fn)
        return tp, fp, fn, tn

    def dcf_dist(self, U, V):
        """
        Compute the (nU x nV) matrix of dcf scores between every row of U and every row of V.
        Probabilistic dcfs receive sets of non-zero indexes plus the event count D;
        vectorial dcfs (and user callables) receive the raw dense rows.
        """
        nU, D = U.shape
        nV = V.shape[0]
        if issparse(U): U = U.toarray()
        if issparse(V): V = V.toarray()
        dists = np.zeros((nU, nV))
        if self.dcf.__name__ in self.prob_dcf:
            def hits_index(v):
                return set(np.argwhere(v > 0).reshape(-1).tolist())
            # precompute the non-zero index sets of V once; reused for every row of U
            Vhits = {i: hits_index(V[i]) for i in range(nV)}
            for i in range(nU):
                Ui_hits = hits_index(U[i])
                for j in range(nV):
                    dists[i, j] = self.dcf(self, Ui_hits, Vhits[j], D)
        else:
            for i in range(nU):
                for j in range(nV):
                    dists[i, j] = self.dcf(self, U[i], V[j])
        return dists