Further information on efficient concatenation
In a previous post (http://biostatsr.blogspot.fr/2016/02/comparison-of-methods-to-concatenate.html), I showed that using c <- c(c, x) for concatenation is very inefficient. Here I test whether this inefficiency depends on the length of the c object.
Vector
c <- NULL
t <- NULL
# Append pi 1000 times per step and record the user time of each step
# (system.time(...)[1] is the "user.self" component)
for (j in 1:40) {
  t <- c(t, system.time(
    for (i in 1:1000) c <- c(c, pi)
  )[1])
}
plot(1:40, t, bty="n", las=1, ylim=c(0, 0.4), xlim=c(1, 40))
And the answer is clearly: yes! The longer the vector c is, the more time it takes to append a new element. This is because c(c, pi) allocates a fresh vector and copies every existing element into it, so each append costs time proportional to the current length, and building a vector of length n this way takes O(n^2) operations overall.
This effect is of course not observed if elements are instead replaced inside a preallocated vector. Note that the y-scales of the following plots are not the same.
c <- rep(NA, 40000)
t <- NULL
# Replace elements of a preallocated vector; (j-1)*1000+i walks through positions 1..40000
for (j in 1:40) {
  t <- c(t, system.time(
    for (i in 1:1000) c[(j-1)*1000+i] <- pi
  )[1])
}
plot(1:40, t, bty="n", las=1, ylim=c(0, 0.005), xlim=c(1, 40))
Or better, computing the range of target positions directly in the loop header:
c <- rep(NA, 40000)
t <- NULL
for (j in 1:40) {
  t <- c(t, system.time(
    for (i in ((j-1)*1000+1):(j*1000)) c[i] <- pi
  )[1])
}
plot(1:40, t, bty="n", las=1, ylim=c(0, 0.005), xlim=c(1, 40))
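When the target positions are known in advance, the inner loop can even be dropped entirely in favour of a single vectorized assignment. A minimal sketch of this variant (my addition, using the same preallocated vector and indices as above):
c <- rep(NA, 40000)
t <- NULL
for (j in 1:40) {
  t <- c(t, system.time(
    c[((j-1)*1000+1):(j*1000)] <- pi   # one vectorized write of 1000 elements
  )[1])
}
plot(1:40, t, bty="n", las=1, xlim=c(1, 40))
A single assignment of 1000 elements replaces 1000 separate calls to [<-, so each step should be faster still.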
If you have no direct way to compute the position that must be changed, the most efficient approach is to maintain a counter that records where the next element should be written:
cpt <- 1
c <- rep(NA, 40000)
t <- NULL
for (j in 1:40) {
  t <- c(t, system.time(
    # cpt always points at the next free position
    for (i in 1:1000) { c[cpt] <- pi; cpt <- cpt + 1 }
  )[1])
}
plot(1:40, t, bty="n", las=1, ylim=c(0, 0.005), xlim=c(1, 40))
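If even the final length is unknown, so that the vector cannot be preallocated at its exact size, a classic strategy (my addition, not benchmarked in the original post) is to grow the buffer geometrically: double its capacity whenever the counter runs past the end, and truncate the unused tail once all elements have arrived. Each element is then copied only a constant number of times on average. A minimal sketch:
buf <- numeric(1000)                 # initial capacity, an arbitrary choice
cpt <- 0
for (x in rnorm(25000)) {            # example stream of unknown length
  if (cpt == length(buf))
    length(buf) <- 2 * length(buf)   # double the capacity (pads with NA)
  cpt <- cpt + 1
  buf[cpt] <- x
}
buf <- buf[seq_len(cpt)]             # drop the unused tail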
Data.frame
The effect is even more drastic for a data.frame:
t <- NULL
df <- data.frame(col1=numeric(), col2=character(), stringsAsFactors = FALSE)
# Grow the data.frame one row at a time with rbind()
for (j in 1:40) {
  print(j)
  t <- c(t, system.time(
    for (i in 1:1000)
      df <- rbind(df, data.frame(col1=i, col2=as.character(i), stringsAsFactors = FALSE))
  )[1])
}
plot(1:40, t, bty="n", las=1, ylim=c(0, 20))
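For comparison, the usual remedy (my addition, not timed in the original post) is to collect the pieces in a list and bind them in a single call at the end, so that no intermediate data.frame is copied row by row. A minimal sketch:
rows <- vector("list", 40000)        # preallocated list of one-row pieces
for (i in 1:40000) {
  rows[[i]] <- data.frame(col1 = i, col2 = as.character(i),
                          stringsAsFactors = FALSE)
}
df <- do.call(rbind, rows)           # one rbind over all pieces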
Note again that the y-scale of the next plot is very different:
t <- NULL
# Preallocate typed columns (NA_real_ and NA_character_) so no coercion is needed later
df <- data.frame(col1=rep(NA_real_, 40000), col2=rep(NA_character_, 40000), stringsAsFactors = FALSE)
cpt <- 1
for (j in 1:40) {
  print(j)
  t <- c(t, system.time(
    for (i in 1:1000) {
      df[cpt, "col1"] <- i
      df[cpt, "col2"] <- as.character(i)
      cpt <- cpt + 1
    }
  )[1])
}
plot(1:40, t, bty="n", las=1, ylim=c(0, 2))
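Even this preallocated version pays a price: every df[cpt, ...] assignment goes through the data.frame method of [<-, which is far more expensive than writing into a plain vector. A sketch of a variant that is typically faster still (my addition, not timed in the original post): fill ordinary preallocated vectors and assemble the data.frame once at the end.
n <- 40000
col1 <- rep(NA_real_, n)
col2 <- rep(NA_character_, n)
for (i in 1:n) {
  col1[i] <- i
  col2[i] <- as.character(i)
}
df <- data.frame(col1 = col1, col2 = col2, stringsAsFactors = FALSE)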