A guided walk through the Metropolis algorithmm

The following vignette contains code to accompany the paper “Markov unchained: a guided walk through the Metropolis algorithm.” The code walks the reader through

The data

A case control study of leukemia (y=1) and residence around strong magnetic fields (x=1)

y = c(rep(1, 36), rep(0, 198)) # leukemia cases
x = c(rep(1, 3), rep(0, 33), rep(1, 5), rep(0, 193)) # exposure

Helper functions

These functions will be used throughout

#helper functions
expit <- function(mu) 1/(1+exp(-mu))

loglik = function(y,x,beta){
  # calculate the log likelihood
  lli = dbinom(y, 1, expit(beta[1] + x*beta[2]), log=TRUE)
  sum(lli)
}

riskdifference = function(y,x,beta){
  # baseline odds (offset)
  # calculate a risk difference
  poprisk = 4.8/100000
  popodds = poprisk/(1-poprisk)
  studyodds = mean(y)/(1-mean(y))
  r1 = expit(log(popodds/studyodds) + beta[1] + beta[2])
  r0 = expit(log(popodds/studyodds) + beta[1])
  mean(r1-r0)
}

Maximum likelihood estimates

data = data.frame(leuk=y, magfield=x)
mod = glm(leuk ~ magfield, family=binomial(), data=data)
summary(mod)$coefficients

beta1 = summary(mod)$coefficients[2,1]
se1 = summary(mod)$coefficients[2,2]
cat("\n\nMaximum likelihood beta coefficient (95% CI)\n")
round(c(beta=beta1, ll=beta1+se1*qnorm(0.025), ul=beta1+se1*qnorm(0.975)), 2)

cat("\n\nMaximum likelihood odds ratio (95% CI)\n")
round(exp(c(beta=beta1, ll=beta1+se1*qnorm(0.025), ul=beta1+se1*qnorm(0.975))), 2)

cat("\n\nMaximum likelihood risk difference (multiplied by 1000) \n")
round(c(rd_1000=riskdifference(y,x,mod$coefficients)*1000), 2)
##              Estimate Std. Error   z value     Pr(>|z|)
## (Intercept) -1.766183   0.188373 -9.375988 6.853094e-21
## magfield     1.255357   0.754200  1.664488 9.601492e-02
## 
## 
## Maximum likelihood beta coefficient (95% CI)
##  beta    ll    ul 
##  1.26 -0.22  2.73 
## 
## 
## Maximum likelihood odds ratio (95% CI)
##  beta    ll    ul 
##  3.51  0.80 15.39 
## 
## 
## Maximum likelihood risk difference (multiplied by 1000) 
## rd_1000 
##    0.11

Random walk metropolis

# initialize
M=10000
set.seed(91828)
beta_post = matrix(nrow=M, ncol=2)
colnames(beta_post) = c('beta0', 'beta1')
accept = numeric(M)
rd = numeric(M)
beta_post[1,] = c(2,-3)
rd[1] = riskdifference(y,x,beta_post[1,])
accept[1] = 1
for(i in 2:M){
  oldb = beta_post[i-1,]
  prop = rnorm(2, sd=0.2)
  newb = oldb+prop
  num = loglik(y,x,newb)
  den = loglik(y,x,oldb)
  acceptprob = exp(num-den)
  acc = (acceptprob > runif(1))
  if(acc){
    beta_post[i,] = newb 
    accept[i] = 1
  }else{
    beta_post[i,] = oldb 
    accept[i] = 0
  }
  rd[i] = 1000*riskdifference(y,x,beta_post[i,])
}

Inspecting output

mean(accept)
## [1] 0.6551
summary(beta_post)
##      beta0            beta1        
##  Min.   :-2.518   Min.   :-3.9483  
##  1st Qu.:-1.902   1st Qu.: 0.7389  
##  Median :-1.776   Median : 1.2292  
##  Mean   :-1.770   Mean   : 1.1714  
##  3rd Qu.:-1.651   3rd Qu.: 1.7004  
##  Max.   : 2.000   Max.   : 3.9189
init = beta_post[1,]
postmean = apply(beta_post[-c(1:1000),], 2, mean)
cat("Posterior mean\n")
## Posterior mean
round(postmean, 2)
## beta0 beta1 
## -1.78  1.22
plot(beta_post, pch=19, col=rgb(0,0,0,0.05), xlab=expression(beta[0]), ylab=expression(beta[1]), xlim=c(-2.5,2.5), ylim=c(-4.5,4.5))
points(init[1], init[2], col="red", pch=19)
points(postmean[1], postmean[2], col="orange", pch=19)
legend("topright", col=c("red", "orange"), legend=c("Initial value", "Post. mean"), pch=19)

plot of chunk Inspecting

plot(beta_post[,2], type='l',  ylab=expression(beta[1]), xlab="Iteration", ylim=c(-4, 4))

plot of chunk Inspecting

plot(rd, type='l',  ylab="RD*1000", xlab="Iteration", ylim=c(-4, 4))

plot of chunk Inspecting

plot(density(beta_post[-c(1:1000),2]), xlab=expression(beta[1]), ylab="Density", main="")

plot of chunk Inspecting

plot(density(rd[-c(1:1000)]), xlab="RD*1000", ylab="Density", main="")

plot of chunk Inspecting

Guided metropolis

# initialize
M=10000
set.seed(91828)
beta_post_guide = matrix(nrow=M, ncol=2)
colnames(beta_post_guide) = c('beta0', 'beta1')
accept = numeric(M)
rd_guide = numeric(M)
beta_post_guide[1,] = c(2,-3)
rd_guide[1] = riskdifference(y,x,beta_post_guide[1,])
accept[1] = 1
dir = 1
for(i in 2:M){
  oldb = beta_post_guide[i-1,]
  prop = dir*abs(rnorm(2, sd=0.2))
  newb = oldb+prop
  num = loglik(y,x,newb)
  den = loglik(y,x,oldb)
  acceptprob = exp(num-den)
  acc = (acceptprob > runif(1))
  if(acc){
    beta_post_guide[i,] = newb 
    accept[i] = 1
  }else{
    beta_post_guide[i,] = oldb 
    accept[i] = 0
    dir = dir*-1
  }
  rd_guide[i] = 1000*riskdifference(y,x,beta_post_guide[i,])
}

postmean = apply(beta_post_guide[-c(1:1000),], 2, mean)
cat("Posterior mean, guided\n")
## Posterior mean, guided
round(postmean, 2)
## beta0 beta1 
## -1.80  1.41

Contrasting output with random walk

col1 = rgb(0,0,0,.5)
col2 = rgb(1,0,0,.35)
par(mfcol=c(1,2))

#trace plots
plot(beta_post[1:200,2], type='l',  ylab=expression(beta[1]), xlab="Iteration", ylim=c(-4, 4), col=col1)
lines(beta_post_guide[1:200,2], col=col2)
legend("topright", lty=1, col=c(col1, col2), legend=c("Rand. walk", "Guided"))
plot(9800:10000, beta_post[9800:10000,2], type='l',  ylab=expression(beta[1]), xlab="Iteration", ylim=c(-4, 4), col=col1)
lines(9800:10000, beta_post_guide[9800:10000,2], col=col2)
legend("topright", lty=1, col=c(col1, col2), legend=c("Rand. walk", "Guided"))

plot of chunk Comparing guided

# density plots
plot(density(beta_post_guide[-c(1:1000),2]), col=col2, xlab=expression(beta[1]), ylab="Density", main="")
lines(density(beta_post[-c(1:1000),2]), col=col1)
legend("bottomright", lty=1, col=c(col1, col2), legend=c("Rand. walk", "Guided"))


plot(density(rd_guide[-c(1:1000)]), xlab="RD*1000", ylab="Density", main="", col=col2)
lines(density(rd[-c(1:1000)]), col=col1)
legend("topright", lty=1, col=c(col1, col2), legend=c("Rand. walk", "Guided"))

plot of chunk Comparing guided

par(mfcol=c(1,1))

Guided, adaptive metropolis algorithm

# initialize
M=10000
burnin=1000
set.seed(91828)
beta_post_adaptguide = matrix(nrow=M+burnin, ncol=2)
colnames(beta_post_adaptguide) = c('beta0', 'beta1')
accept = numeric(M+burnin)
rd_adaptguide = numeric(M+burnin)
beta_post_adaptguide[1,] = c(2,-3)
rd_adaptguide[1] = riskdifference(y,x,beta_post[1,])
accept[1] = 1
prop.sigma = c(0.2, 0.2)
dir = 1
for(i in 2:(M+burnin)){
  if((i < burnin) & (i > 25)){
    prop.sigma = apply(beta_post_adaptguide[max(1, i-100):(i-1),], 2, sd)
  }
  oldb = beta_post_adaptguide[i-1,]
  prop = dir*abs(rnorm(2, sd=prop.sigma))
  newb = oldb+prop
  num = loglik(y,x,newb)
  den = loglik(y,x,oldb)
  acceptprob = exp(num-den)
  acc = (acceptprob > runif(1))
  if(acc){
    beta_post_adaptguide[i,] = newb 
    accept[i] = 1
  }else{
    beta_post_adaptguide[i,] = oldb 
    accept[i] = 0
    dir = dir*-1
  }
  rd_adaptguide[i] = 1000*riskdifference(y,x,beta_post_adaptguide[i,])
}
postmean = apply(beta_post_adaptguide[-c(1:1000),], 2, mean)
cat("Posterior mean, guided and adaptive\n")
## Posterior mean, guided and adaptive
round(postmean, 2)
## beta0 beta1 
## -1.78  1.22

Contrasting output

col1 = rgb(0,0,0,.5)
col2 = rgb(1,0,0,.35)
par(mfcol=c(1,2))

#trace plots
plot(beta_post[1:200,2], type='l',  ylab=expression(beta[1]), xlab="Iteration", ylim=c(-4, 4), col=col1)
lines(beta_post_adaptguide[1:200,2], col=col2)
legend("topright", lty=1, col=c(col1, col2), legend=c("Rand. walk", "Guided, adaptive"))
plot(9800:10000, beta_post[9800:10000,2], type='l',  ylab=expression(beta[1]), xlab="Iteration", ylim=c(-4, 4), col=col1)
lines(9800:10000, beta_post_adaptguide[9800:10000,2], col=col2)
legend("topright", lty=1, col=c(col1, col2), legend=c("Rand. walk", "Guided, adaptive"))

plot of chunk Comparing 2

# density plots
plot(density(beta_post_adaptguide[-c(1:1000),2]), col=col2, xlab=expression(beta[1]), ylab="Density", main="")
lines(density(beta_post[-c(1:1000),2]), col=col1)
legend("bottomright", lty=1, col=c(col1, col2), legend=c("Rand. walk", "Guided, adaptive"))


plot(density(rd_adaptguide[-c(1:1000)]), xlab="RD*1000", ylab="Density", main="", col=col2)
lines(density(rd[-c(1:1000)]), col=col1)
legend("topright", lty=1, col=c(col1, col2), legend=c("Rand. walk", "Guided, adaptive"))

plot of chunk Comparing 2

par(mfcol=c(1,1))

Guided, adaptive metropolis algorithm using normal priors

# initialize
M=10000
burnin=1000
set.seed(91828)
beta_post_adaptguide2 = matrix(nrow=M+burnin, ncol=2)
colnames(beta_post_adaptguide2) = c('beta0', 'beta1')
accept = numeric(M+burnin)
rd_adaptguide2 = numeric(M+burnin)
beta_post_adaptguide2[1,] = c(2,-3)
rd_adaptguide2[1] = riskdifference(y,x,beta_post[1,])
accept[1] = 1
prop.sigma = c(0.2, 0.2)
dir = 1
for(i in 2:(M+burnin)){
  if((i < burnin) & (i > 25)){
    prop.sigma = apply(beta_post_adaptguide2[max(1, i-100):(i-1),], 2, sd)
  }
  oldb = beta_post_adaptguide2[i-1,]
  prop = dir*abs(rnorm(2, sd=prop.sigma))
  newb = oldb+prop
  num = loglik(y,x,newb) + dnorm(newb[1], mean=0, sd=sqrt(100), log=TRUE) + dnorm(newb[2], mean=0, sd=sqrt(0.5), log=TRUE)
  den = loglik(y,x,oldb) + dnorm(oldb[1], mean=0, sd=sqrt(100), log=TRUE) + dnorm(oldb[2], mean=0, sd=sqrt(0.5), log=TRUE)
  acceptprob = exp(num-den)
  acc = (acceptprob > runif(1))
  if(acc){
    beta_post_adaptguide2[i,] = newb 
    accept[i] = 1
  }else{
    beta_post_adaptguide2[i,] = oldb 
    accept[i] = 0
    dir = dir*-1
  }
  rd_adaptguide2[i] = 1000*riskdifference(y,x,beta_post_adaptguide2[i,])
}
postmean = apply(beta_post_adaptguide2[-c(1:1000),], 2, mean)
cat("Posterior mean, guided and adaptive\n")
## Posterior mean, guided and adaptive
round(postmean, 2)
## beta0 beta1 
## -1.75  0.54

Inspecting output

mean(accept)
## [1] 0.5552727
init = beta_post_adaptguide[1,]
postmean = apply(beta_post_adaptguide[-c(1:1000),], 2, mean)
cat("Posterior mean, uniform priors\n")
## Posterior mean, uniform priors
round(postmean, 2)
## beta0 beta1 
## -1.78  1.22
init2 = beta_post_adaptguide2[1,]
postmean2 = apply(beta_post_adaptguide2[-c(1:1000),], 2, mean)
cat("Posterior mean, informative normal priors\n")
## Posterior mean, informative normal priors
round(postmean2, 2)
## beta0 beta1 
## -1.75  0.54
par(mfcol=c(1,2))
plot(beta_post_adaptguide, pch=19, col=rgb(0,0,0,0.05), xlab=expression(beta[0]), ylab=expression(beta[1]), xlim=c(-2.5,2.5), ylim=c(-4.5,4.5), main="Uniform priors")
points(init[1], init[2], col="red", pch=19)
points(postmean[1], postmean[2], col="orange", pch=19)
legend("topright", col=c("red", "orange"), legend=c("Initial value", "Post. mean"), pch=19)

plot(beta_post_adaptguide2, pch=19, col=rgb(0,0,0,0.05), xlab=expression(beta[0]), ylab=expression(beta[1]), xlim=c(-2.5,2.5), ylim=c(-4.5,4.5), main="Informative priors")
points(init2[1], init2[2], col="red", pch=19)
points(postmean2[1], postmean2[2], col="orange", pch=19)
legend("topright", col=c("red", "orange"), legend=c("Initial value", "Post. mean"), pch=19)

plot of chunk Inspecting output with normal priors

par(mfcol=c(1,2))
plot(beta_post_adaptguide[,2], type='l',  ylab=expression(beta[1]), xlab="Iteration", ylim=c(-4, 4))
plot(beta_post_adaptguide2[,2], type='l',  ylab=expression(beta[1]), xlab="Iteration", ylim=c(-4, 4))

plot of chunk Inspecting output with normal priors

plot(density(beta_post_adaptguide[-c(1:1000),2]), xlab=expression(beta[1]), ylab="Density", main="", xlim=c(-4, 4))
plot(density(beta_post_adaptguide2[-c(1:1000), 2]), xlab=expression(beta[1]), ylab="Density", main="", xlim=c(-4, 4))

plot of chunk Inspecting output with normal priors

par(mfcol=c(2,1))

plot(rd_adaptguide, type='l',  ylab="RD*1000", xlab="Iteration", ylim=c(-.2, .5))
plot(rd_adaptguide2, type='l',  ylab="RD*1000", xlab="Iteration", ylim=c(-.2, .5))

plot of chunk Inspecting output with normal priors

plot(density(rd_adaptguide[-c(1:1000)]), xlab=expression(beta[1]), ylab="Density", main="", xlim=c(-.2, .5))
plot(density(rd_adaptguide2[-c(1:1000)]), xlab="RD*1000", ylab="Density", main="", xlim=c(-.2, .5))

plot of chunk Inspecting output with normal priors

par(mfcol=c(1,1))

Using the R package

Given what you know, running the R package function metropolis_glm should be fairly straightforward. The following example calls in the case-control data used above and compares a randome Walk metropolis algorithmn (with N(0, 0.05), N(0, 0.1) proposal distribution) with a guided, adaptive algorithm.

library(metropolis)
## Loading required package: coda
## 
## Attaching package: 'metropolis'
## The following object is masked _by_ '.GlobalEnv':
## 
##     expit
data("magfields", package="metropolis")

# random walk
res.rw = metropolis_glm(y ~ x, data=magfields, family=binomial(), iter=20000, burnin=3000, adapt=FALSE, guided=FALSE, block=TRUE, inits=c(2,-3), control = metropolis.control(prop.sigma.start = c(0.05, .1)))

summary(res.rw, keepburn = FALSE)
## $nsamples
## [1] 20000
## 
## $sd
## (Intercept)           x 
##   0.1870940   0.8565178 
## 
## $se
## (Intercept)           x 
##   0.0109079   0.1120727 
## 
## $ESS_parms
## [1] 294.19637  58.40808
## 
## $postmean
##                  mean normal_lci normal_uci
## (Intercept) -1.778688 -2.1453924  -1.411984
## x            1.240300 -0.4384751   2.919075
## 
## $postmedian
##                median   pctl_lci  pctl_uci
## (Intercept) -1.774736 -2.1594707 -1.413966
## x            1.260167 -0.5570205  2.934671
## 
## $postmode
##                  mode    hpd_lci   hpd_uci
## (Intercept) -1.766332 -2.1746278 -1.431942
## x            1.257571 -0.5541435  2.938285
plot(res.rw, par = 1:2, keepburn=TRUE)

plot of chunk Using the R packageplot of chunk Using the R package

# guided, adaptive
res.ga = metropolis_glm(y ~ x, data=magfields, family=binomial(), iter=20000, burnin=3000, adapt=TRUE, guided=TRUE, block=FALSE, inits=c(2,-3))

summary(res.ga, keepburn = FALSE)
## $nsamples
## [1] 20000
## 
## $sd
## (Intercept)           x 
##   0.1887433   0.8039617 
## 
## $se
## (Intercept)           x 
## 0.002139775 0.009397736 
## 
## $ESS_parms
## [1] 7780.489 7318.537
## 
## $postmean
##                  mean normal_lci normal_uci
## (Intercept) -1.783400 -2.1533369  -1.413463
## x            1.195689 -0.3800755   2.771454
## 
## $postmedian
##                median   pctl_lci  pctl_uci
## (Intercept) -1.779390 -2.1692698 -1.429546
## x            1.204767 -0.4308701  2.719736
## 
## $postmode
##                  mode    hpd_lci   hpd_uci
## (Intercept) -1.764664 -2.1311923 -1.396892
## x            1.246514 -0.4152461  2.725762
plot(res.ga, par = 1:2, keepburn=TRUE)

plot of chunk Using the R packageplot of chunk Using the R package

Using the R package, smart initial values

Finally, we can use the “glm” option in initial values to initialize the chain with the MLE estimates. This can eke out slightly more efficiency and allow reduced burnin.

# guided, adaptive
res.ga.init = metropolis_glm(y ~ x, data=magfields, family=binomial(), iter=20000, burnin=1000, adapt=TRUE, guided=TRUE, block=FALSE, inits="glm")

summary(res.ga.init, keepburn = FALSE)
## $nsamples
## [1] 20000
## 
## $sd
## (Intercept)           x 
##   0.1910264   0.8117856 
## 
## $se
## (Intercept)           x 
## 0.002187349 0.009279116 
## 
## $ESS_parms
## [1] 7626.942 7653.666
## 
## $postmean
##                  mean normal_lci normal_uci
## (Intercept) -1.779687 -2.1540987  -1.405275
## x            1.190642 -0.4004577   2.781742
## 
## $postmedian
##                median   pctl_lci  pctl_uci
## (Intercept) -1.774436 -2.1665633 -1.418378
## x            1.230554 -0.5138752  2.703918
## 
## $postmode
##                  mode    hpd_lci   hpd_uci
## (Intercept) -1.766599 -2.1459323 -1.399196
## x            1.246608 -0.4553881  2.749118
plot(res.ga.init, par = 1:2, keepburn=TRUE)

plot of chunk Initial valuesplot of chunk Initial values

Extending the logistic model results after samples are generated

Using the function, “risk difference” from above, we can use the output from our prior model to get Bayesian estimates of the risk diffence. In general, this is a useful way to extend MCMC simulations to new estimands that may not be directly parameterized in the model.

# guided, adaptive
beta = as.matrix(res.ga.init$parms[, c("b_0", "b_1")])

y = magfields$y
X = cbind(rep(1, dim(magfields)[1]), magfields$x)

1000*riskdifference(y,X,beta[1,])
## [1] 0.1132425
# calculate risk difference for every value of model coefficients
rd.ga.init = apply(beta, 1, function(b) 1000*riskdifference(y,X,b))

par(mfcol=c(2,1))
 plot(density(rd.ga.init), xlab = "RD*1000", ylab="Kernel density", main="", xlim=c(-.2, 2.5))
 plot(rd.ga.init, type='l', xlab = "RD*1000", ylab="Iteration", ylim=c(-.2, 2.5))

plot of chunk Risk difference after glm_metropolis

par(mfcol=c(1,1))

# posterior mean, median, credible intervals
mean(rd.ga.init[-c(1:1000)])
## [1] 0.1517853
quantile(rd.ga.init[-c(1:1000)], p = c(.5, .025, .975) )
##         50%        2.5%       97.5% 
##  0.10763217 -0.01949789  0.59457738