Estimating the SE of MCMC outputs; different methods

# Strategy to manage correctly the MCMC timeseries to get SE
set.seed(123)
val <- rnorm(30, 10, 2)
dnormx <- function(data, x) {
  data <- unlist(data)
  return(-sum(dnorm(data, mean=x['mean'], sd=x['sd'], log=TRUE)))
}
parameters_mcmc <- data.frame(Density=c('dnorm', 'dlnorm'), 
                              Prior1=c(10, 0.5), Prior2=c(2, 0.5), SDProp=c(0.35, 0.2), 
                              Min=c(-3, 0), Max=c(100, 10), Init=c(10, 2), stringsAsFactors = FALSE, 
                              row.names=c('mean', 'sd'))

# Use of trace and traceML parameters
# trace=1 : Only one likelihood is printed

mcmc_run <- MHalgoGen(n.iter=50000, parameters=parameters_mcmc, data=val, 
                      likelihood=dnormx, n.chains=1, n.adapt=100, thin=1, trace=1)

# The function as.paramaters gives the best likelihood; it is not really what we want
# in the context of MCMC; it is more related to ML
as.parameters(mcmc_run)

# I have 50000 values for mean and sd
nrow(mcmc_run$resultMCMC[[1]])
# These values can be considered as the best estimate; note that this value is not biased by autocorrelation
colMeans(mcmc_run$resultMCMC[[1]])

# and how is known these mean is related to se; the sd gives the dispersion of the values
apply(mcmc_run$resultMCMC[[1]], MARGIN = 2, FUN=sd)
# And the dispersion of the mean is SE=SD/sqrt(N)
apply(mcmc_run$resultMCMC[[1]], MARGIN = 2, FUN=sd)/sqrt(nrow(mcmc_run$resultMCMC[[1]]))

# But this value is biased because it does not take into account large autocorrelation
acf_mean <- acf(mcmc_run$resultMCMC[[1]][, "mean"])
(lag_mean <- which(acf_mean$acf<0.01)[1])
acf_sd <- acf(mcmc_run$resultMCMC[[1]][, "sd"])
(lag_sd <- which(acf_sd$acf<0.01)[1])
# Then I thin the data using this value
series_mean <- mcmc_run$resultMCMC[[1]][seq(from=1, to=nrow(mcmc_run$resultMCMC[[1]]), by=lag_mean), "mean"]
# I verify that acf is nearly 0: perfect
acf(series_mean)
mean(series_mean)
sd(series_mean)/sqrt(length(series_mean))
series_sd <- mcmc_run$resultMCMC[[1]][seq(from=1, to=nrow(mcmc_run$resultMCMC[[1]]), by=lag_mean), "sd"]
# I verify that acf is nearly 0: perfect
acf(series_sd)
length(series_sd)
mean(series_sd)
sd(series_sd)/sqrt(length(series_sd))

# An alternative using coda package
mcmc_run.mcmc <- as.mcmc(mcmc_run)
library(coda)
batchSE(mcmc_run.mcmc, batchSize=100)
# Note that SE is dependent on batchSize
batchSE(mcmc_run.mcmc, batchSize=10)
batchSE(mcmc_run.mcmc, batchSize=1000)

# An alternative is: it gives the size of timeseries removing the autocorrelation using the spectral density
effectiveSize(mcmc_run.mcmc) 
# This is proportional to the lag value I used before but is less stringent
(lag <- nrow(mcmc_run.mcmc[[1]])/effectiveSize(mcmc_run.mcmc))
c(mean=lag_mean, sd=lag_sd)
# I verify that acf is removed; not completly; high ar at lag 1 is still observed
series_mean_ef <- mcmc_run$resultMCMC[[1]][seq(from=1, to=nrow(mcmc_run$resultMCMC[[1]]), by=lag[1]), "mean"]
acf(series_mean_ef)
series_sd_ef <- mcmc_run$resultMCMC[[1]][seq(from=1, to=nrow(mcmc_run$resultMCMC[[1]]), by=lag[2]), "sd"]
acf(series_sd_ef)
# The batch method using lag from AR1 model
c(mean=batchSE(mcmc_run.mcmc, batchSize=lag[1])[1], 
sd=batchSE(mcmc_run.mcmc, batchSize=lag[2])[2])
# The batch method using lag from acf<0.01
c(mean=batchSE(mcmc_run.mcmc, batchSize=lag_mean)[1], 
sd=batchSE(mcmc_run.mcmc, batchSize=lag_sd)[2])

# Now let try the timeseries se of coda
serie_mean_efs <- mcmc_run$resultMCMC[[1]][seq(from=1, to=nrow(mcmc_run$resultMCMC[[1]]), by=lag[1]), "mean"]
sd(serie_mean_efs)/sqrt(length(serie_mean_efs))
serie_sd_efs <- mcmc_run$resultMCMC[[1]][seq(from=1, to=nrow(mcmc_run$resultMCMC[[1]]), by=lag[2]), "sd"]
sd(serie_sd_efs)/sqrt(length(serie_sd_efs))
s <- summary(mcmc_run.mcmc)
s$statistics[, "Time-series SE"]
# Compare with
c(mean=sd(series_mean)/sqrt(length(series_mean)), sd= sd(series_sd)/sqrt(length(series_sd)))
# or
c(mean=sd(serie_mean_efs)/sqrt(length(serie_mean_efs)),sd= sd(serie_sd_efs)/sqrt(length(serie_sd_efs)))
# Manipulating the number of observations can change a lot the SE of each parameter
# It can be used to "manipulate" the output; take care!

# note that the thin parameter of the MCMC object does not influence the output
# It is just here to remember how the data have been generated
attributes(mcmc_run.mcmc[[1]])$mcpar <- c(101, 50100, 5000)
summary(mcmc_run.mcmc)

# I use the CI limits to ensure absence of auto-correlation
# This code is taken from getFromNamespace("plot.acf", ns="stats")
x.n.used <- nrow(mcmc_run$resultMCMC[[1]])
ci <- 0.95
clim0 <- qnorm((1 + ci)/2)/sqrt(x.n.used)

# I search the lag without auto-correlation for mean
clim <- clim0 * sqrt(cumsum(c(1, 2 * acf_mean$acf[-1, 1, 1]^2)))
lag_acf_mean <- which(acf_mean$acf<clim)[1]
# Then I thin the data using this value
series_acf_mean <- mcmc_run$resultMCMC[[1]][seq(from=1, to=nrow(mcmc_run$resultMCMC[[1]]), by=lag_acf_mean), "mean"]
# I verify that acf is within the CI: perfect
acf(series_acf_mean)
mean(series_acf_mean)
sd(series_acf_mean)/sqrt(length(series_acf_mean))

I search the lag without auto-correlation for sd
clim <- clim0 * sqrt(cumsum(c(1, 2 * acf_sd$acf[-1, 1, 1]^2)))
lag_acf_sd <- which(acf_sd$acf<clim)[1]
# Then I thin the data using this value
series_acf_sd <- mcmc_run$resultMCMC[[1]][seq(from=1, to=nrow(mcmc_run$resultMCMC[[1]]), by=lag_acf_sd), "sd"]
# I verify that acf is within the CI: perfect
acf(series_acf_sd)
mean(series_acf_sd)
sd(series_acf_sd)/sqrt(length(series_acf_sd))

# Compare this with the time-series se of coda:
s$statistics[, "Time-series SE"]
c(mean=sd(series_acf_mean)/sqrt(length(series_acf_mean)), sd=sd(series_acf_sd)/sqrt(length(series_acf_sd)))

In conclusion, the time series SE of coda package is potentially biased because first order autocorrelation is still present; it makes the SE of estimates smaller than expected.

Note that I wait for the confirmation by the author of coda package.

# Using time series SE from coda based on spectral density
> s$statistics[, "Time-series SE"]
       mean          sd 
0.004577868 0.004126673 
# Using absence of autocorrelation criteria based on CI=0.95
> c(mean=sd(series_acf_mean)/sqrt(length(series_acf_mean)), sd=sd(series_acf_sd)/sqrt(length(series_acf_sd)))
       mean          sd 
0.006586575 0.006465851 

Commentaires

Posts les plus consultés de ce blog

Standard error from Hessian Matrix... what can be done when problem occurs

stepAIC from package MASS with AICc

Install treemix in ubuntu 20.04