What are the consequences of replacing missing data with the median?
The conclusion is that it artificially reduces the variability of the correlation coefficient. It is bad practice, but it is much better than doing nothing!
# Simulation: how does median imputation of missing values affect the
# Spearman correlation between two independently drawn normal samples?
#
# In every replicate A and B are drawn independently. Each value of B is
# then kept or replaced by NA at random (Bprime), and the gaps are filled
# with the median of the values that remain observed (Bter).
# Three correlations are stored per replicate:
#   cor.original - A vs. the complete B (reference)
#   cor.na       - A vs. Bprime, incomplete pairs dropped
#   cor.median   - A vs. Bter (median-imputed)
n.sim <- 10000
n.obs <- 100

# Preallocate the result vectors instead of growing them with c() on every
# iteration, which copies the whole vector each time.
cor.original <- numeric(n.sim)
cor.na <- numeric(n.sim)
cor.median <- numeric(n.sim)

for (i in seq_len(n.sim)) {
  A <- rnorm(n.obs, mean = 100, sd = 20)
  B <- rnorm(n.obs, mean = 100, sd = 20)

  # A 0/1 coin flip per element decides whether the value is kept (1) or
  # marked as missing (0).
  Bprime <- ifelse(sample(c(0, 1), n.obs, replace = TRUE), B, NA)

  # Impute with the median of the observed values only; using the complete
  # vector B would leak information that is unavailable once data are missing.
  Bter <- ifelse(is.na(Bprime), median(Bprime, na.rm = TRUE), Bprime)

  cor.original[i] <- cor(x = A, y = B, method = "spearman")
  cor.na[i] <- cor(
    x = A, y = Bprime, method = "spearman", use = "complete.obs"
  )
  cor.median[i] <- cor(
    x = A, y = Bter, method = "spearman", use = "complete.obs"
  )
}
# Compare the three sets of correlation coefficients: stacked histograms on
# a common x-axis, followed by their quantiles.
layout(1:3)

# The breaks span the full range a correlation coefficient can take, because
# hist() stops with an error when a value lies outside the supplied breaks.
# xlim still restricts the displayed window to [-0.6, 0.6] and the bins keep
# their 0.05 width.
cor.breaks <- seq(from = -1, to = 1, by = 0.05)
hist(cor.original, xlim = c(-0.6, 0.6), breaks = cor.breaks)
hist(cor.na, xlim = c(-0.6, 0.6), breaks = cor.breaks)
hist(cor.median, xlim = c(-0.6, 0.6), breaks = cor.breaks)

# Default quantiles: minimum, quartiles and maximum of each vector.
quantile(cor.original)
quantile(cor.na)
quantile(cor.median)
# Repeat run of the simulation: Spearman correlation between two
# independently drawn normal samples, with and without median imputation
# of randomly removed values.
#
# All three result vectors are reinitialised here, so cor.original does not
# keep accumulating replicates left over from an earlier run and always has
# the same length as cor.na and cor.median.
#   cor.original - A vs. the complete B (reference)
#   cor.na       - A vs. Bprime, incomplete pairs dropped
#   cor.median   - A vs. Bter (median-imputed)
n.sim <- 10000
n.obs <- 100

# Preallocate the result vectors instead of growing them with c() on every
# iteration, which copies the whole vector each time.
cor.original <- numeric(n.sim)
cor.na <- numeric(n.sim)
cor.median <- numeric(n.sim)

for (i in seq_len(n.sim)) {
  A <- rnorm(n.obs, mean = 100, sd = 20)
  B <- rnorm(n.obs, mean = 100, sd = 20)

  # A 0/1 coin flip per element decides whether the value is kept (1) or
  # marked as missing (0).
  Bprime <- ifelse(sample(c(0, 1), n.obs, replace = TRUE), B, NA)

  # Impute with the median of the observed values only; using the complete
  # vector B would leak information that is unavailable once data are missing.
  Bter <- ifelse(is.na(Bprime), median(Bprime, na.rm = TRUE), Bprime)

  cor.original[i] <- cor(x = A, y = B, method = "spearman")
  cor.na[i] <- cor(
    x = A, y = Bprime, method = "spearman", use = "complete.obs"
  )
  cor.median[i] <- cor(
    x = A, y = Bter, method = "spearman", use = "complete.obs"
  )
}
# Compare the three sets of correlation coefficients: stacked histograms on
# a common x-axis, followed by their quantiles.
layout(1:3)

# The breaks span the full range a correlation coefficient can take, because
# hist() stops with an error when a value lies outside the supplied breaks.
# xlim still restricts the displayed window to [-0.6, 0.6] and the bins keep
# their 0.05 width.
cor.breaks <- seq(from = -1, to = 1, by = 0.05)
hist(cor.original, xlim = c(-0.6, 0.6), breaks = cor.breaks)
hist(cor.na, xlim = c(-0.6, 0.6), breaks = cor.breaks)
hist(cor.median, xlim = c(-0.6, 0.6), breaks = cor.breaks)

# Default quantiles: minimum, quartiles and maximum of each vector.
quantile(cor.original)
quantile(cor.na)
quantile(cor.median)
Commentaires
Enregistrer un commentaire