(Editor Note: These notes are not my own text they are copied from MacKay and the Astrophysics source below. They’ll be gradually edited over the next few months I provide them because writing them was useful for me)

The Hamiltonian Monte Carlo is a Metropolis method, applicable to continuous state spaces, that makes use of gradient information to reduce random walk behaviour.

These notes are based on a combination of http://python4mpia.github.io/index.html and David MacKays book on Inference and Information theory.

For many systems whose probability can be written in the form

not only but also its gradient with respect to

can be readily evaluated. It seems wasteful to use a simple random-walk

Metropolis method when this gradient is available – the gradient indicates

which direction one should go in to find states that have higher probability!

**Overview**

In the Hamiltonian Monte Carlo method, the state space x is augmented by

momentum variables p, and there is an alternation of two types of proposal.

The first proposal randomize the momentum variable, leaving the state x un-changed. The second proposal changes both x and p using simulated Hamiltonian dynamics as defined by the Hamiltonian

,

where is a ‘kinetic energy’ such as These two proposals are used to create

(asymptotically) samples from the joint density

This density is separable, so the marginal distribution of is the

desired distribution . So, simply discarding the momentum variables, we obtain a sequence of samples that asymptotically come from

**An algorithm for Hamiltonian Monte-Carlo**

First, we need to compute the gradient of our objective function.

Hamiltonian Monte-Carlo makes use of the fact, that we can write our likelihood as

where is the “energy”. The algorithm then uses

Hamiltonian dynamics to modify the way how candidates are proposed:

This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.

Learn more about bidirectional Unicode characters

import numpy,math | |

import matplotlib.pyplot as plt | |

import random as random | |

random.seed(1) # set random seed. | |

# Draw random samples from Salpeter IMF. | |

# N … number of samples. | |

# alpha … power-law index. | |

# M_min … lower bound of mass interval. | |

# M_max … upper bound of mass interval. | |

def sampleFromSalpeter(N, alpha, M_min, M_max): | |

# Convert limits from M to logM. | |

log_M_Min = math.log(M_min) | |

log_M_Max = math.log(M_max) | |

# Since Salpeter SMF decays, maximum likelihood occurs at M_min | |

maxlik = math.pow(M_min, 1.0 – alpha) | |

# Prepare array for output masses. | |

Masses = [] | |

# Fill in array. | |

while (len(Masses) < N): | |

# Draw candidate from logM interval. | |

logM = random.uniform(log_M_Min,log_M_Max) | |

M = math.exp(logM) | |

# Compute likelihood of candidate from Salpeter SMF. | |

likelihood = math.pow(M, 1.0 – alpha) | |

# Accept randomly. | |

u = random.uniform(0.0,maxlik) | |

if (u < likelihood): | |

Masses.append(M) | |

return Masses | |

log_M_min = math.log(1.0) | |

log_M_max = math.log(100.0) | |

def evaluateLogLikelihood(params, D, N, M_min, M_max): | |

alpha = params[0] | |

c = (1.0 – alpha)/(math.pow(M_max, 1.0–alpha) – math.pow(M_min, 1.0–alpha)) | |

# return Log Likelihood | |

return N*math.log(c) – alpha*D | |

# Generate toy data. | |

N = 1000000 # Draw 1 Million examples | |

alpha = 2.35 | |

M_min = 1.0 | |

M_max = 100.0 | |

Masses = sampleFromSalpeter(N, alpha, M_min, M_max) | |

LogM = numpy.log(numpy.array(Masses)) | |

D = numpy.mean(LogM)*N | |

# Define gradient of log-likelihood. | |

def evaluateGradient(params, D, N, M_min, M_max, log_M_min, log_M_max): | |

alpha = params[0] # extract alpha | |

grad = log_M_min*math.pow(M_min, 1.0–alpha) – log_M_max*math.pow(M_max, 1.0–alpha) | |

grad = 1.0 + grad*(1.0 – alpha)/(math.pow(M_max, 1.0–alpha)– math.pow(M_min, 1.0–alpha)) | |

grad = –D – N*grad/(1.0 – alpha) | |

return numpy.array(grad) | |

log_M_min = math.log(1.0) | |

log_M_max = math.log(100.0) | |

# Initial guess for alpha as array. | |

guess = [3.0] | |

# Prepare storing MCMC chain. | |

A = [guess] | |

# define stepsize of MCMC. | |

stepsize = 0.000047 | |

accepted = 0.0 | |

import copy | |

# Hamiltonian Monte-Carlo. | |

for n in range(10000): | |

old_alpha = A[len(A)–1] | |

# Remember, energy = -loglik | |

old_energy = –evaluateLogLikelihood(old_alpha, D, N, M_min, | |

M_max) | |

old_grad = –evaluateGradient(old_alpha, D, N, M_min, | |

M_max, log_M_min, log_M_max) | |

new_alpha = copy.copy(old_alpha) # deep copy of array | |

new_grad = copy.copy(old_grad) # deep copy of array | |

# Suggest new candidate using gradient + Hamiltonian dynamics. | |

# draw random momentum vector from unit Gaussian. | |

p = random.gauss(0.0, 1.0) | |

H = numpy.dot(p,p)/2.0 + old_energy # compute Hamiltonian | |

# Do 5 Leapfrog steps. | |

for tau in range(5): | |

# make half step in p | |

p = p – stepsize*new_grad/2.0 | |

# make full step in alpha | |

new_alpha = new_alpha + stepsize*p | |

# compute new gradient | |

new_grad = –evaluateGradient(old_alpha, D, N, M_min, | |

M_max, log_M_min, log_M_max) | |

# make half step in p | |

p = p – stepsize*new_grad/2.0 | |

# Compute new Hamiltonian. Remember, energy = -loglik. | |

new_energy = –evaluateLogLikelihood(new_alpha, D, N, M_min, | |

M_max) | |

newH = numpy.dot(p,p)/2.0 + new_energy | |

dH = newH – H | |

# Accept new candidate in Monte-Carlo fashion. | |

if (dH < 0.0): | |

A.append(new_alpha) | |

accepted = accepted + 1.0 | |

else: | |

u = random.uniform(0.0,1.0) | |

if (u < math.exp(–dH)): | |

A.append(new_alpha) | |

accepted = accepted + 1.0 | |

else: | |

A.append(old_alpha) | |

# Discard first half of MCMC chain and thin out the rest. | |

Clean = [] | |

for n in range(len(A)/2,len(A)): | |

if (n % 10 == 0): | |

Clean.append(A[n][0]) | |

# Print Monte-Carlo estimate of alpha. | |

print "Mean: "+str(numpy.mean(Clean)) | |

print "Sigma: "+str(numpy.std(Clean)) | |

plt.figure(1) | |

plt.hist(Clean, 20, histtype='step', lw=3) | |

plt.xticks([2.346,2.348,2.35,2.352,2.354], | |

[2.346,2.348,2.35,2.352,2.354]) | |

plt.xlim(2.345,2.355) | |

plt.xlabel(r'$\alpha$', fontsize=24) | |

plt.ylabel(r'$\cal L($Data$;\alpha)$', fontsize=24) | |

plt.savefig('example-HamiltonianMC-results.png') | |

print "Acceptance rate = "+str(accepted/float(len(A))) |

The true value we used to generate the data was . The Monte-Carlo estimate is . Mean: 2.35130302322 Sigma: 0.00147375480435 Acceptance rate = 0.645735426457