The price we pay

So the price for using these abstractions comes down mainly to the overhead of casting formulas into functions, plus the additional overhead of S4 dispatch. This matters less once the work done by the applied function itself grows, either in computation time or in the number of iterations.

library("rbenchmark")
library("dat")

# few elements and a trivial function: the conversion and dispatch overhead is visible
benchmark(
  flatmap(1:3, x ~ x^2),
  sapply(1:3, function(x) x^2),
  sapply(1:3, as.function(x ~ x^2)),
  flatmap(1:3, function(x) x^2)
)
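
Part of what the formula-based calls pay for is the conversion of the formula into an ordinary function. A minimal sketch of that conversion, using the same as.function() method for formulas as in the sapply variant above (the left-hand side of the formula names the argument, the right-hand side is the body; the name square is only illustrative):

square <- as.function(x ~ x^2)  # roughly equivalent to function(x) x^2
square(3)                       # 9

As stated above, this fixed cost matters less once the mapped function does more work or is applied to more elements, which the next benchmark illustrates.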

# more iterations: the fixed overhead matters less relative to the actual work
benchmark(
  flatmap(1:1e4, x ~ x^2),
  sapply(1:1e4, function(x) x^2)
)

The same pattern can be seen for the multivariate map implementation:

benchmark(
  flatmap(1:3 ~ 1:3, f(x, y) ~ x + y),
  mapply(function(x, y) x + y, 1:3, 1:3),
  mapply(as.function(f(x, y) ~ x + y), 1:3, 1:3)
)
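
Here, too, the formula is cast into a function before it is applied. A minimal sketch of the bivariate conversion, assuming it mirrors the mapply call above (the left-hand side f(x, y) declares the argument names; g is only illustrative):

g <- as.function(f(x, y) ~ x + y)
g(1, 2) # 3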

S4 vs. data.table

There is some overhead generated by this package. To some extent this is because we try to preserve the class we operate on (a quick check of this follows the setup below). In most tests, however, the larger part seems to come from using dplyr to communicate with data.table, which has its price.

library("data.table")
library("dplyr")
options("dat.use.dplyr" = FALSE)
N <- 2e7 # more is not possible on a small laptop
K <- 100
set.seed(1)

DT <- data.table(
  id1 = sample(sprintf("id%03d",1:K), N, TRUE),      # large groups (char)
  id2 = sample(sprintf("id%03d",1:K), N, TRUE),      # large groups (char)
  id3 = sample(sprintf("id%010d",1:(N/K)), N, TRUE), # small groups (char)
  id4 = sample(K, N, TRUE),                          # large groups (int)
  id5 = sample(K, N, TRUE),                          # large groups (int)
  id6 = sample(N/K, N, TRUE),                        # small groups (int)
  v1 =  sample(5, N, TRUE),                          # int in range [1,5]
  v2 =  sample(5, N, TRUE),                          # int in range [1,5]
  v3 =  sample(round(runif(100,max=100),4), N, TRUE) # numeric e.g. 23.5749
)

setClass("DataTable", contains = "data.table")  # S4 class extending data.table
setMethod("[", "DataTable", mutar)              # `[` on this class dispatches to dat's mutar
DT4 <- new("DataTable", DT)
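
Part of the overhead mentioned above buys class preservation: operations on the wrapper are supposed to return an object of the same class. A quick check, as a sketch only, assuming the aggregation keeps the class as intended:

class(DT4[V1 ~ sum(v1), sby = "id1"]) # should still be "DataTable" if the class is preserved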

cat("GB =", round(sum(gc()[,2]) / 1024, 3), "\n")
format(object.size(DT), units = "MB")
format(object.size(DT4), units = "MB")

# Each query is run with data.table directly, with the S4 wrapper (dat's
# mutar), and with dplyr; every call is timed twice.
system.time(DT[, sum(v1), keyby = id1])
system.time(DT[, sum(v1), keyby = id1])
system.time(DT4[V1 ~ sum(v1), sby = "id1"])
system.time(DT4[V1 ~ sum(v1), sby = "id1"])
system.time(group_by(DT, id1) %>% summarise(V1 = sum(v1)))
system.time(group_by(DT, id1) %>% summarise(V1 = sum(v1)))

system.time(DT[, sum(v1), keyby = "id1,id2"])
system.time(DT[, sum(v1), keyby = "id1,id2"])
system.time(DT4[V1 ~ sum(v1), sby = c("id1", "id2")])
system.time(DT4[V1 ~ sum(v1), sby = c("id1", "id2")])
system.time(group_by(DT, id1, id2) %>% summarise(V1 = sum(v1)))
system.time(group_by(DT, id1, id2) %>% summarise(V1 = sum(v1)))

system.time(DT[, list(sum(v1), mean(v3)), keyby = id3])
system.time(DT[, list(sum(v1), mean(v3)), keyby = id3])
system.time(DT4[V1 ~ sum(v1), V3 ~ mean(v3), sby = "id3"])
system.time(DT4[V1 ~ sum(v1), V3 ~ mean(v3), sby = "id3"])
system.time(group_by(DT, id3) %>% summarise(V1 = sum(v1), V3 = mean(v3)))
system.time(group_by(DT, id3) %>% summarise(V1 = sum(v1), V3 = mean(v3)))

system.time(DT[, lapply(.SD, mean), keyby = id4, .SDcols = 7:9])
system.time(DT[, lapply(.SD, mean), keyby = id4, .SDcols = 7:9])
# FL expands the formula over the columns matched by .n
system.time(DT4[FL(.n ~ mean(.n), .n = "^v[1-3]"), sby = "id4"])
system.time(DT4[FL(.n ~ mean(.n), .n = "^v[1-3]"), sby = "id4"])
system.time(group_by(DT, id4) %>% summarise(V1 = mean(v1), V2 = mean(v2), V3 = mean(v3)))
system.time(group_by(DT, id4) %>% summarise(V1 = mean(v1), V2 = mean(v2), V3 = mean(v3)))

system.time(DT[, lapply(.SD, sum), keyby = id6, .SDcols = 7:9])
system.time(DT[, lapply(.SD, sum), keyby = id6, .SDcols = 7:9])
system.time(DT4[FL(.n ~ sum(.n), .n = "v1:v3"), sby = "id6"])
system.time(DT4[FL(.n ~ sum(.n), .n = "v1:v3"), sby = "id6"])
system.time(group_by(DT, id6) %>% summarise(V1 = sum(v1), V2 = sum(v2), V3 = sum(v3)))
system.time(group_by(DT, id6) %>% summarise(V1 = sum(v1), V2 = sum(v2), V3 = sum(v3)))