Estimating the SE of MCMC outputs; different methods
# Strategy to manage correctly the MCMC timeseries to get SE
set.seed(123)
val <- rnorm(30, 10, 2)
dnormx <- function(data, x) {
data <- unlist(data)
return(-sum(dnorm(data, mean=x['mean'], sd=x['sd'], log=TRUE)))
}
parameters_mcmc <- data.frame(Density=c('dnorm', 'dlnorm'),
Prior1=c(10, 0.5), Prior2=c(2, 0.5), SDProp=c(0.35, 0.2),
Min=c(-3, 0), Max=c(100, 10), Init=c(10, 2), stringsAsFactors = FALSE,
row.names=c('mean', 'sd'))
set.seed(123)
val <- rnorm(30, 10, 2)
dnormx <- function(data, x) {
data <- unlist(data)
return(-sum(dnorm(data, mean=x['mean'], sd=x['sd'], log=TRUE)))
}
parameters_mcmc <- data.frame(Density=c('dnorm', 'dlnorm'),
Prior1=c(10, 0.5), Prior2=c(2, 0.5), SDProp=c(0.35, 0.2),
Min=c(-3, 0), Max=c(100, 10), Init=c(10, 2), stringsAsFactors = FALSE,
row.names=c('mean', 'sd'))
# Use of trace and traceML parameters
# trace=1 : Only one likelihood is printed
mcmc_run <- MHalgoGen(n.iter=50000, parameters=parameters_mcmc, data=val,
likelihood=dnormx, n.chains=1, n.adapt=100, thin=1, trace=1)
# The function as.paramaters gives the best likelihood; it is not really what we want
# in the context of MCMC; it is more related to ML
as.parameters(mcmc_run)
# I have 50000 values for mean and sd
nrow(mcmc_run$resultMCMC[[1]])
# These values can be considered as the best estimate; note that this value is not biased by autocorrelation
colMeans(mcmc_run$resultMCMC[[1]])
# and how is known these mean is related to se; the sd gives the dispersion of the values
apply(mcmc_run$resultMCMC[[1]], MARGIN = 2, FUN=sd)
# And the dispersion of the mean is SE=SD/sqrt(N)
apply(mcmc_run$resultMCMC[[1]], MARGIN = 2, FUN=sd)/sqrt(nrow(mcmc_run$resultMCMC[[1]]))
# But this value is biased because it does not take into account large autocorrelation
acf_mean <- acf(mcmc_run$resultMCMC[[1]][, "mean"])
(lag_mean <- which(acf_mean$acf<0.01)[1])
acf_sd <- acf(mcmc_run$resultMCMC[[1]][, "sd"])
(lag_sd <- which(acf_sd$acf<0.01)[1])
# Then I thin the data using this value
series_mean <- mcmc_run$resultMCMC[[1]][seq(from=1, to=nrow(mcmc_run$resultMCMC[[1]]), by=lag_mean), "mean"]
# I verify that acf is nearly 0: perfect
acf(series_mean)
mean(series_mean)
sd(series_mean)/sqrt(length(series_mean))
series_sd <- mcmc_run$resultMCMC[[1]][seq(from=1, to=nrow(mcmc_run$resultMCMC[[1]]), by=lag_mean), "sd"]
# I verify that acf is nearly 0: perfect
acf(series_sd)
length(series_sd)
mean(series_sd)
sd(series_sd)/sqrt(length(series_sd))
# An alternative using coda package
mcmc_run.mcmc <- as.mcmc(mcmc_run)
library(coda)
batchSE(mcmc_run.mcmc, batchSize=100)
# Note that SE is dependent on batchSize
batchSE(mcmc_run.mcmc, batchSize=10)
batchSE(mcmc_run.mcmc, batchSize=1000)
# An alternative is: it gives the size of timeseries removing the autocorrelation using the spectral density
effectiveSize(mcmc_run.mcmc)
# This is proportional to the lag value I used before but is less stringent
(lag <- nrow(mcmc_run.mcmc[[1]])/effectiveSize(mcmc_run.mcmc))
c(mean=lag_mean, sd=lag_sd)
# I verify that acf is removed; not completly; high ar at lag 1 is still observed
series_mean_ef <- mcmc_run$resultMCMC[[1]][seq(from=1, to=nrow(mcmc_run$resultMCMC[[1]]), by=lag[1]), "mean"]
acf(series_mean_ef)
series_sd_ef <- mcmc_run$resultMCMC[[1]][seq(from=1, to=nrow(mcmc_run$resultMCMC[[1]]), by=lag[2]), "sd"]
acf(series_sd_ef)
# The batch method using lag from AR1 model
c(mean=batchSE(mcmc_run.mcmc, batchSize=lag[1])[1],
sd=batchSE(mcmc_run.mcmc, batchSize=lag[2])[2])
# The batch method using lag from acf<0.01
c(mean=batchSE(mcmc_run.mcmc, batchSize=lag_mean)[1],
sd=batchSE(mcmc_run.mcmc, batchSize=lag_sd)[2])
# Now let try the timeseries se of coda
serie_mean_efs <- mcmc_run$resultMCMC[[1]][seq(from=1, to=nrow(mcmc_run$resultMCMC[[1]]), by=lag[1]), "mean"]
sd(serie_mean_efs)/sqrt(length(serie_mean_efs))
serie_sd_efs <- mcmc_run$resultMCMC[[1]][seq(from=1, to=nrow(mcmc_run$resultMCMC[[1]]), by=lag[2]), "sd"]
sd(serie_sd_efs)/sqrt(length(serie_sd_efs))
s <- summary(mcmc_run.mcmc)
s$statistics[, "Time-series SE"]
# Compare with
c(mean=sd(series_mean)/sqrt(length(series_mean)), sd= sd(series_sd)/sqrt(length(series_sd)))
# or
c(mean=sd(serie_mean_efs)/sqrt(length(serie_mean_efs)),sd= sd(serie_sd_efs)/sqrt(length(serie_sd_efs)))
# Manipulating the number of observations can change a lot the SE of each parameter
# It can be used to "manipulate" the output; take care!
# note that the thin parameter of the MCMC object does not influence the output
# It is just here to remember how the data have been generated
attributes(mcmc_run.mcmc[[1]])$mcpar <- c(101, 50100, 5000)
summary(mcmc_run.mcmc)
# I use the CI limits to ensure absence of auto-correlation
# This code is taken from getFromNamespace("plot.acf", ns="stats")
x.n.used <- nrow(mcmc_run$resultMCMC[[1]])
ci <- 0.95
clim0 <- qnorm((1 + ci)/2)/sqrt(x.n.used)
# I search the lag without auto-correlation for mean
clim <- clim0 * sqrt(cumsum(c(1, 2 * acf_mean$acf[-1, 1, 1]^2)))
lag_acf_mean <- which(acf_mean$acf<clim)[1]
# Then I thin the data using this value
series_acf_mean <- mcmc_run$resultMCMC[[1]][seq(from=1, to=nrow(mcmc_run$resultMCMC[[1]]), by=lag_acf_mean), "mean"]
# I verify that acf is within the CI: perfect
acf(series_acf_mean)
mean(series_acf_mean)
sd(series_acf_mean)/sqrt(length(series_acf_mean))
# I search the lag without auto-correlation for sd
clim <- clim0 * sqrt(cumsum(c(1, 2 * acf_sd$acf[-1, 1, 1]^2)))
lag_acf_sd <- which(acf_sd$acf<clim)[1]
# Then I thin the data using this value
series_acf_sd <- mcmc_run$resultMCMC[[1]][seq(from=1, to=nrow(mcmc_run$resultMCMC[[1]]), by=lag_acf_sd), "sd"]
# I verify that acf is within the CI: perfect
acf(series_acf_sd)
mean(series_acf_sd)
sd(series_acf_sd)/sqrt(length(series_acf_sd))
# Compare this with the time-series se of coda:
s$statistics[, "Time-series SE"]
c(mean=sd(series_acf_mean)/sqrt(length(series_acf_mean)), sd=sd(series_acf_sd)/sqrt(length(series_acf_sd)))
In conclusion, the time series SE of coda package is potentially biased because first order autocorrelation is still present; it makes the SE of estimates smaller than expected.
Note that I wait for the confirmation by the author of coda package.
# Using time series SE from coda based on spectral density
> s$statistics[, "Time-series SE"]
mean sd
0.004577868 0.004126673
# Using absence of autocorrelation criteria based on CI=0.95
mean sd
0.004577868 0.004126673
# Using absence of autocorrelation criteria based on CI=0.95
> c(mean=sd(series_acf_mean)/sqrt(length(series_acf_mean)), sd=sd(series_acf_sd)/sqrt(length(series_acf_sd)))
mean sd
0.006586575 0.006465851
Commentaires
Enregistrer un commentaire