Graph random walks functions

Import packages

In [2]:
import random
import igraph
import numpy as np
import scipy as sp
import math
from sklearn import metrics
from sklearn import svm
from sklearn import manifold
from sklearn.datasets import *
from sklearn.neighbors import NearestNeighbors
import matplotlib as mpl
import matplotlib.pyplot as plt
import time
import seaborn as sns
%matplotlib inline
In [ ]:
c = %config InlineBackend.rc
c['savefig.dpi'] = 144
%config InlineBackend.figure_format='png'
%config InlineBackend.rc = c
%matplotlib inline
sns.set_style("darkgrid", {"legend.frameon": 1})
In [3]:
def abline():
    gca = plt.gca()
    gca.set_autoscale_on(False)
    gca.plot(gca.get_xlim(),gca.get_ylim())
In [4]:
import scipy.weave as sw
from scipy.weave import converters
a  = 1
# test weave is working
sw.inline('printf("%d\\n",a);',['a'])
In [25]:
def weight_hit_time(source, maxt, nrun, klist, kweight = None):
    assert(type(source) == type(1))
    assert(type(maxt) == type(1))
    assert(type(nrun) == type(1))
    # construct flattened list of neighborhood indices
    klens=[0]+[len(item) for item in klist]
    # kind[i] points to the start of neighborhood for vertex [i]
    kind = np.cumsum(np.array(klens))
    kind.astype(int)
    # kwflat is the flattened weight matrix, used for weighted sampling.
    if kweight is None:
        kweight=[np.ones(len(sublist))/float(len(sublist)) for sublist in klist]
    kwtemp = [np.cumsum(nbweight) for nbweight in kweight]
    kwflat = np.array([val for sublist in kwtemp for val in sublist])
    # kflat[kind[i] - kind[i+1]] are the neighbors of i.
    kflat = np.array([val for sublist in klist for val in sublist])
    kflat.astype(int)
    
    hittimes = np.zeros((len(klist),maxt))
    hittemp = np.zeros(len(klist))
        
    code = """\
    for(int i=0; i < nrun; i++){
        for(int k=0; k < Nhittemp[0]; k++)
            hittemp[k] = -1;
        int j = source;
        HITTIMES2(j,0)++;
        int t = 1;
        while(t < maxt){
            int kl = kind[j+1]-kind[j];
            // draw random sample from weighted neighborhood
            float rv = drand48();
            int jn = 0;
            while(rv > kwflat[kind[j] + jn])
                jn++;
            // now maintain the importance weights and list of nodes
            j = kflat[kind[j] + jn];
            if(hittemp[j] < 0){
                hittemp[j] = t;
                HITTIMES2(j,t)++;
            }
            t++;
        }
    }
    """
    sw.inline(code,['maxt','nrun','kflat','kwflat','kind','hittimes','source','hittemp'],verbose=1, headers=["<math.h>"], compiler="gcc")
    return (hittimes)

klist = [[0, 1],[0, 1]]
kweight = [np.array([0.5,0.5]),np.array([0.5,0.5])]
weight_hit_time(1, 10000, 1000, klist, kweight=kweight)
Out[25]:
array([[    0.,   511.,   243., ...,     0.,     0.,     0.],
       [ 1000.,   489.,   273., ...,     0.,     0.,     0.]])
In [11]:
def fast_hit_time(jinit, nrun, maxt, klist):
    assert(type(jinit) == type(1))
    assert(type(nrun) == type(1))
    assert(type(maxt) == type(1))
    # construct flattened list of neighborhood indices
    klens=[0]+[len(item) for item in klist]
    # kind[i] points to the start of neighborhood for vertex [i]
    kind = np.cumsum(np.array(klens))
    kind.astype(int)
    # kflat[kind[i] - kind[i+1]] are the neighbors of i.
    kflat = np.array([val for sublist in klist for val in sublist])
    kflat.astype(int)
    j=jinit
    t=maxt
    
    hittimes = np.zeros((len(klist), maxt))
    hittemp = np.zeros(len(klist))
    
    code = """\
    for(int i=0; i < nrun; i++){
        int j = jinit;
        for(int k=0; k < Nhittemp[0]; k++)
            hittemp[k] = -1;
        for(int t=0; t < maxt; t++){
           int kl = kind[j+1]-kind[j];
           int jn = rand() % kl;
           j = kflat[kind[j] + jn];
           if(hittemp[j] < 0){
               hittemp[j] = t;
               HITTIMES2(j,t)++;
            }
        }
    }
    """
    sw.inline(code,['jinit','maxt','nrun','kflat','kind','hittemp','hittimes'],verbose=1, headers=["<math.h>"], compiler="gcc")
    return hittimes
klist = [[0, 1],[0, 1]]
fast_hit_time(0, 200, 10, klist)
Out[11]:
array([[ 115.,   45.,   17.,   16.,    3.,    2.,    1.,    1.,    0.,
           0.],
       [  85.,   58.,   35.,   14.,    6.,    1.,    0.,    0.,    1.,
           0.]])
In [9]:
def fast_list_stationary(jinit, nrun, klist, kweight = None):
    assert(type(jinit) == type(1))
    assert(type(nrun) == type(1))
    # construct flattened list of neighborhood indices
    klens=[0]+[len(item) for item in klist]
    # kind[i] points to the start of neighborhood for vertex [i]
    kind = np.cumsum(np.array(klens))
    kind.astype(int)
    # kwflat is the flattened weight matrix, used for weighted sampling.
    if kweight is None:
        kweight=[np.ones(len(sublist))/float(len(sublist)) for sublist in klist]
    kwtemp = [np.cumsum(nbweight) for nbweight in kweight]
    kwflat = np.array([val for sublist in kwtemp for val in sublist])
    # kflat[kind[i] - kind[i+1]] are the neighbors of i.
    kflat = np.array([val for sublist in klist for val in sublist])
    kflat.astype(int)
    csamp = np.ones(len(klist))
    j=jinit
    code = """\
    for(int i=0; i < nrun; i++){
       int kl = kind[j+1]-kind[j];
       float rv = drand48();
       int jn = 0;
       while(rv > kwflat[kind[j] + jn])
           jn++;
       j = kflat[kind[j] + jn];
       csamp[j]++;
    }
    """
    sw.inline(code,['j','nrun','kflat','kind','kwflat','csamp'],verbose=1, headers=["<math.h>"], compiler="gcc")
    return csamp/np.sum(csamp)
jinit = 0
nrun = 100000
klist = [[0, 1],[0, 1]]
kweight = [np.array([0.7,0.3]),np.array([0.5,0.5])]
fast_list_stationary(jinit, nrun, klist, kweight)
Out[9]:
array([ 0.62639747,  0.37360253])
In [6]:
def slow_hitting_time(adj, i, j, kmax, nbs=False):
    if nbs:
        jset = np.nonzero(adj[j,:])[0]
    else:
        jset = j
    nnode = adj.shape[0]
    vec = np.zeros(nnode)
    vec[i]=1
    prs = np.zeros(kmax+1)
    adj2 = sp.sparse.csr_matrix(adj.T)
    for k in xrange(kmax):
        vec = adj2.dot(vec)
        prs[k] = np.sum(vec[jset])
        vec[jset] = 0
    prs[kmax] = np.sum(vec)
    return prs
In [4]:
def get_stationary(n_trunc, n_run, adj):
    pi = (np.zeros(n_trunc)+1.0)/float(n_trunc)
    adj2 = sp.sparse.csr_matrix(adj.T)
    for i in xrange(n_run):
        pi = adj2.dot(pi)
    pi = pi + 1.0/( n_run * n_trunc )
    pi = pi / sum(pi)
    return pi

def reweighted_walk(dest, epsest, x, nbrs):
    jumpp=1/pow(epsest,2.0)
    jumpp=jumpp/max(jumpp)
    rwmat = np.zeros((len(dest), len(dest)))
    knind = np.delete(nbrs.kneighbors(x)[1],0,1)
    for i in xrange(knind.shape[0]):
        inds=knind[i,]
        djump=1.0/dest[inds]
        rwmat[i,inds]=djump/sum(djump)*jumpp[i]
        rwmat[i,i]=1.0-jumpp[i]
    return rwmat

Calculate $\varepsilon$ via the relation $\varepsilon(x)^d p(x) \propto k$ which implies, $\varepsilon(x) = k/p(x)^{1/d}$

In [7]:
def fit_graph(adj, n_run = 1000, d = 1):
    pi = get_stationary(adj.shape[0], n_run, adj)
    degree = np.sum(adj > 0,1)
    dest = pow(pi,d/(d+2.0))*pow(degree,2/(d+2.0))
    dest = dest/sum(dest)
    epsest = pow(pi,-1.0/(d+2.0))*pow(degree,1.0/(d+2.0))
    return (pi, dest, epsest)
    
def fit_knn(x, k):
    nbrs = NearestNeighbors(n_neighbors=k).fit(x)
    adj = get_graph(nbrs, x, k)
    return (nbrs, adj)
In [21]:
def ltht(hittimes, beta, ofs=1):
    nsamples=np.sum(hittimes,1)
    runtime=hittimes.shape[1]
    return -np.log(np.dot(hittimes,np.exp(-beta*(np.arange(runtime)+ofs)))/nsamples + np.exp(-beta*runtime) * (nsamples-np.sum(hittimes,1))/nsamples)
In [32]:
def adj_to_nblist(adj):
    return [np.nonzero(adj[i,:])[0] for i in xrange(adj.shape[0])]
In [4]:
def get_graph(nbrs, x, k):
    return nbrs.kneighbors_graph(x).toarray()*1.0/k

def get_dist(adj, eps):
    distmat=np.dot(np.diag(eps),adj)
    np.fill_diagonal(distmat,0)
    return distmat

The directed Laplacian is defined by Yanhua Li and Zhi-Li Zhang in 'Random Walks on Digraphs, The Generalized Digraph Laplacian, and The Degree of Asymmetry'

Their definition is: $$\tilde{L} = \Pi^{1/2} (I-P) \Pi^{-1/2}$$ Where $\Pi^{1/2} = \text{diag}(\sqrt{\pi})$.

In [5]:
def make_laplace(mat):
    return np.diag(np.sum(mat,1)) - mat

def make_directed_laplace_rw(mat, pi):
    lapone = np.diag(np.ones(mat.shape[0])) - mat
    return np.dot(np.dot(np.diag(np.sqrt(pi)),lapone),np.diag(1.0/np.sqrt(pi)))

def make_directed_laplace(mat):
    pi_tmp = get_stationary(mat.shape[0], 1000, mat)
    return make_directed_laplace(mat, pi_tmp)

The diffusion kernel is defined in Kondor et al for a undirected laplacian as:

$$e^{\beta L}$$

This should generalize very directly to the directed case via the SVD.

In [ ]:
def make_diffkern(lap, beta):
    u, d, v = np.linalg.svd(-1 * lap)
    gram=np.dot(np.dot(u,np.diag(np.exp(d*beta))),v)
    return gram

The hitting time between two points on a directed graph is defined by Li-Zhang as:

$$H_{ij} = \frac{\tilde{L}^+_{jj}}{\pi_j} - \frac{\tilde{L}^+_{ij}}{\sqrt{\pi_i\pi_j}}$$

The commute time is obviously: $$C_{ij} = H_{ij} + H_{ji}$$

In [ ]:
def get_htime(lap, pi):
    lapinv = np.linalg.pinv(lap)
    isqrt = np.sqrt(1.0/pi)
    selfterm = np.tile(np.diag(lapinv)/pi, (lapinv.shape[0], 1))
    crossterm = np.dot(np.dot(np.diag(isqrt),lapinv),np.diag(isqrt))
    hittime = selfterm - crossterm
    return hittime
In [ ]:
def get_ctime(lap, pi):
    lapinv = np.linalg.pinv(lap)
    isqrt = np.sqrt(1.0/pi)
    selfterm = np.tile(np.diag(lapinv)/pi, (lapinv.shape[0], 1))
    crossterm = np.dot(np.dot(np.diag(isqrt),lapinv),np.diag(isqrt))
    hittime = selfterm - crossterm
    commtime = hittime + np.transpose(hittime)
    commtime[commtime < 0] = 0
    return commtime
In [ ]:
def get_sps(adj, eps):
    sps=sp.sparse.csgraph.shortest_path(get_dist(adj,eps))
    sps = sps + np.transpose(sps)
    return sps

Plotting and data generation

In [6]:
def get_histo(din, nbin, xvec):
    minx = min(xvec)
    maxx = max(xvec)
    step = (maxx - minx)/nbin
    nzs = np.zeros(nbin)
    for i in xrange(nbin):
        lb = i*step+minx
        ub = (i+1)*step+minx
        isel = np.squeeze((xvec > lb) & (xvec < ub))
        nzs[i]= sum(din[isel])
    return nzs
In [7]:
def get_mix():
    xc=random.uniform(0,0.75)
    if xc > 0.5:
        return xc*2-0.5
    return xc

def dens_mix(xc):
    xr = np.ones(len(np.squeeze(xc)))
    xr[np.squeeze(xc) > 0.5] = 0.5
    return xr