155 lines
6.2 KiB
Python
155 lines
6.2 KiB
Python
import numpy as np
|
|
from sklearn.preprocessing import normalize
|
|
from scipy.sparse import csr_matrix, issparse
|
|
from scipy.spatial.distance import cosine
|
|
import operator
|
|
import functools
|
|
import math, sys
|
|
# from sklearn.externals.joblib import Parallel, delayed
|
|
from joblib import Parallel, delayed
|
|
|
|
|
|
class DistributionalCorrespondenceIndexing:
    """Project per-domain document matrices onto a pivot-based feature embedding.

    For every domain, ``fit`` compares each feature (column of the domain's
    matrix) against each pivot (column of the domain's pivot matrix) with a
    distributional correspondence function (dcf), producing a
    features-by-pivots matrix. ``transform`` multiplies document matrices by
    that matrix and optionally post-processes the result.
    """

    # dcf names whose functions receive sets of non-zero indexes plus the event count D
    prob_dcf = ['linear', 'pmi']
    # dcf names whose functions receive the two raw vectors
    vect_dcf = ['cosine']
    valid_dcf = prob_dcf + vect_dcf
    valid_post = ['normal', 'l2', None]

    def __init__(self, dcf='cosine', post='normal', n_jobs=-1):
        """
        :param dcf: a distributional correspondence function name (one of ``valid_dcf``) or a
            callable f(u, v) which measures the distributional correspondence between the
            vectors u and v
        :param post: post-processing function to apply to document embeddings. Default ('normal')
            standardizes each dimension to zero mean and unit variance; other accepted values
            are 'l2' or None
        :param n_jobs: number of jobs handed to joblib's ``Parallel`` in ``fit`` and ``transform``
        :raises ValueError: if ``post`` or ``dcf`` is not one of the accepted values
        """
        if post not in self.valid_post:
            # valid_post contains None, so the items must be stringified before joining
            raise ValueError("unknown post processing function; valid ones are [%s]"
                             % ', '.join(map(str, self.valid_post)))

        if isinstance(dcf, str):
            if dcf not in self.valid_dcf:
                raise ValueError("unknown dcf; use any in [%s]" % ', '.join(self.valid_dcf))
            # plain function taken from the class; it is invoked with an explicit `self`
            self.dcf = getattr(DistributionalCorrespondenceIndexing, dcf)
            self._dcf_is_builtin = True
        elif callable(dcf):
            # user-supplied f(u, v); invoked with exactly the two vectors
            self.dcf = dcf
            self._dcf_is_builtin = False
        else:
            raise ValueError('param dcf should either be a valid dcf name in [%s] or a callable '
                             'comparing two vectors' % ', '.join(self.valid_dcf))
        self.post = post
        self.domains = None
        self.dFP = None
        self.n_jobs = n_jobs

    def fit(self, dU, dP):
        """
        :param dU: a dictionary of {domain:dsm_matrix}, where dsm is a document-by-term matrix
            representing the distributional semantic model for a specific domain
        :param dP: a dictionary {domain:pivot_matrix} where domain is a string representing each
            domain, and pivot_matrix has shape (d,p) with d the dimensionality of the
            distributional space (it must equal the number of rows of dU[domain]), and p the
            number of pivots
        :return: self
        """
        self.domains = list(dP.keys())
        assert len(np.unique([P.shape[1] for P in dP.values()])) == 1, \
            "inconsistent number of pivots across domains"
        assert set(dU.keys()) == set(self.domains), "inconsistent domains in dU and dP"
        assert not [1 for d in self.domains if dU[d].shape[0] != dP[d].shape[0]], \
            "inconsistent dimensions between distributional and pivot spaces"
        self.dimensions = list(dP.values())[0].shape[1]
        # embed the feature space from each domain using the pivots of that domain;
        # after transposing, rows are features (resp. pivots) described over the d events
        transformations = Parallel(n_jobs=self.n_jobs)(
            delayed(self.dcf_dist)(dU[d].transpose(), dP[d].transpose()) for d in self.domains
        )
        self.dFP = dict(zip(self.domains, transformations))
        # returning self is what the docstring promises and what fit_transform relies on
        return self

    def _dom_transform(self, X, FP):
        """Project the documents in X through the feature-by-pivot matrix FP.

        :param X: matrix supporting ``X.dot(FP)`` (e.g., a document-by-term csr_matrix)
        :param FP: feature-by-pivot matrix as produced by ``dcf_dist``
        :return: ``X.dot(FP)``, L2-normalized by rows if ``self.post == 'l2'``, standardized by
            columns if ``self.post == 'normal'``, untouched otherwise
        """
        _X = X.dot(FP)
        if self.post == 'l2':
            _X = normalize(_X, norm='l2', axis=1)
        elif self.post == 'normal':
            # clip the deviation from below to avoid dividing by (near) zero on constant columns
            std = np.clip(np.std(_X, axis=0), 1e-5, None)
            _X = (_X - np.mean(_X, axis=0)) / std
        return _X

    def transform(self, dX):
        """
        :param dX: a dictionary of {domain:dsm}, where dsm (distributional semantic model) is,
            e.g., a document-by-term csr_matrix; every domain must have been seen in ``fit``
        :return: a dictionary {domain:embedded_matrix} with the same keys as dX
        """
        assert self.dFP is not None, 'transform method called before fit'
        assert set(dX.keys()).issubset(self.domains), 'domains in dX are not scope'
        domains = list(dX.keys())
        transformations = Parallel(n_jobs=self.n_jobs)(
            delayed(self._dom_transform)(dX[d], self.dFP[d]) for d in domains
        )
        return dict(zip(domains, transformations))

    def fit_transform(self, dU, dP, dX):
        """Fit on (dU, dP) and return the transformation of dX (see ``fit`` and ``transform``)."""
        return self.fit(dU, dP).transform(dX)

    def _prevalence(self, v):
        """Return the fraction of entries of v considered "active".

        For sparse inputs this is the number of stored entries (``nnz``) over the total number of
        cells; for anything else it is the fraction of entries strictly greater than zero.
        """
        if issparse(v):
            # the product of the shape works for arrays of any rank
            return float(v.nnz) / functools.reduce(operator.mul, v.shape, 1)
        # coerce array-likes (lists, tuples, ...) instead of silently returning None
        v = np.asarray(v)
        return float(np.count_nonzero(v > 0)) / v.size

    def linear(self, u, v, D):
        """Return tpr + tnr - 1 computed from the 4-cell contingency table of u and v.

        :param u: a set of indexes with a non-zero value
        :param v: a set of indexes with a non-zero value
        :param D: the number of events (i.e., all possible indexes)
        :return: a rate is taken as 0 whenever its denominator is 0
        """
        tp, fp, fn, tn = self._get_4cellcounters(u, v, D)
        den1 = tp + fn
        den2 = tn + fp
        tpr = (tp * 1. / den1) if den1 != 0 else 0.
        tnr = (tn * 1. / den2) if den2 != 0 else 0.
        return tpr + tnr - 1

    def pmi(self, u, v, D):
        """Return log2(P(x,y) / (P(x) P(y))) estimated from the index sets u and v.

        :param u: a set of indexes with a non-zero value
        :param v: a set of indexes with a non-zero value
        :param D: the number of events (i.e., all possible indexes)
        :return: the score, or 0.0 if any of the involved probabilities is 0
        :raises ValueError: if the resulting score is NaN or infinite
        """
        tp, fp, fn, tn = self._get_4cellcounters(u, v, D)

        Pxy = tp * 1. / D
        Pxny = fp * 1. / D
        Pnxy = fn * 1. / D
        Px = Pxy + Pxny
        Py = Pxy + Pnxy

        if Px == 0 or Py == 0 or Pxy == 0:
            return 0.0

        score = math.log2(Pxy / (Px * Py))
        if np.isnan(score) or np.isinf(score):
            # raise instead of terminating the interpreter: library code must not call sys.exit()
            raise ValueError('pmi produced a non-finite score (tp=%s, fp=%s, fn=%s, D=%s)'
                             % (tp, fp, fn, D))
        return score

    def cosine(self, u, v):
        """Return ``scipy.spatial.distance.cosine(u, v) - sqrt(prevalence(u) * prevalence(v))``.

        :param u: a vector
        :param v: a vector
        """
        pu = self._prevalence(u)
        pv = self._prevalence(v)
        # NOTE(review): inside a method the bare name `cosine` resolves to the module-level
        # scipy function, which is the cosine *distance* (1 - similarity) -- confirm that the
        # distance, and not the similarity, is what is intended here.
        return cosine(u, v) - np.sqrt(pu * pv)

    def _get_4cellcounters(self, u, v, D):
        """
        :param u: a set of indexes with a non-zero value
        :param v: a set of indexes with a non-zero value
        :param D: the number of events (i.e., all possible indexes)
        :return: the 4-cell contingency values (tp, fp, fn, tn)
        """
        common = u.intersection(v)
        tp = len(common)
        fp = len(u) - len(common)
        fn = len(v) - len(common)
        tn = D - (tp + fp + fn)
        return tp, fp, fn, tn

    def dcf_dist(self, U, V):
        """Compute the dcf between every row of U and every row of V.

        :param U: matrix of shape (nU, D), dense or scipy-sparse (sparse inputs are densified)
        :param V: matrix with D columns and nV rows, dense or scipy-sparse
        :return: a dense array of shape (nU, nV) where cell (i, j) is dcf(U[i], V[j])
        """
        nU, D = U.shape
        nV = V.shape[0]
        if issparse(U):
            U = U.toarray()
        if issparse(V):
            V = V.toarray()

        if self._dcf_is_builtin:
            # built-in dcfs are stored as plain functions of the class and need `self`
            score = functools.partial(self.dcf, self)
            probabilistic = self.dcf.__name__ in self.prob_dcf
        else:
            # user callables follow the documented f(u, v) contract
            score = self.dcf
            probabilistic = False

        dists = np.zeros((nU, nV))
        if probabilistic:
            def hits_index(row):
                # indexes of the strictly positive entries of the row
                return set(np.flatnonzero(row > 0).tolist())

            # the pivot sets are reused for every row of U, so build them only once
            V_hits = [hits_index(V[j]) for j in range(nV)]
            for i in range(nU):
                Ui_hits = hits_index(U[i])
                for j, Vj_hits in enumerate(V_hits):
                    dists[i, j] = score(Ui_hits, Vj_hits, D)
        else:
            for i in range(nU):
                for j in range(nV):
                    dists[i, j] = score(U[i], V[j])
        return dists
|
|
|