The motivation for the parglm package is to provide a parallel version of the glm function. It solves the iteratively re-weighted least squares problem using a QR decomposition with column pivoting, computed with the DGEQP3 function from LAPACK. The computation is done in parallel as in the bam function in the mgcv package. The cost is an additional \(O(Mp^2 + p^3)\) where \(p\) is the number of coefficients and \(M\) is the number of chunks to be computed in parallel. The advantage is that you do not need to compile the package with an optimized BLAS or LAPACK that supports multithreading.
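The chunked computation can be illustrated with a small example: the R factor of a tall matrix can be recovered by computing the QR decomposition of each chunk, stacking the chunks' R factors, and computing the QR decomposition of the stack. Below is a minimal serial sketch in base R; it only illustrates the idea and is not parglm's actual implementation.

# illustrative only: combine per-chunk R factors into the R factor of the
# full matrix (this is not parglm's actual implementation)
set.seed(1)
A <- matrix(rnorm(1000 * 5), 1000, 5)
idx <- split(seq_len(nrow(A)), rep(1:4, each = 250)) # M = 4 chunks
Rs <- lapply(idx, function(i) qr.R(qr(A[i, , drop = FALSE])))
R_chunked <- qr.R(qr(do.call(rbind, Rs))) # QR of the stacked R factors
R_full    <- qr.R(qr(A))                  # QR of the full matrix
all.equal(abs(R_chunked), abs(R_full))    # equal up to signs of the rows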

Example of computation time

Below, we estimate a logistic regression with 1,000,000 observations and 50 covariates. We vary the number of cores being used with the nthreads argument to parglm.control.

#####
# simulate
(n <- 1000000L) # number of observations
#> [1] 1000000
(p <- 50L)      # number of covariates
#> [1] 50
set.seed(68024947)
X <- matrix(rnorm(n * p, 1/p, 1/sqrt(p)), nrow = n, ncol = p)
# Bernoulli outcomes from a logistic model with linear predictor rowSums(X) - 1
df <- data.frame(y = 1/(1 + exp(-(rowSums(X) - 1))) > runif(n), X)
#####
# compute and measure time. First, set up the call to evaluate
library(microbenchmark)
library(speedglm)
#> Loading required package: Matrix
#> Loading required package: MASS
library(parglm)
n_threads <- 18L # largest number of threads to benchmark
cl <- list(
  quote(microbenchmark), 
  glm      = quote(glm     (y ~ ., binomial(), df)), 
  speedglm = quote(speedglm(y ~ ., family = binomial(), data = df)), 
  times = 11L)
cl <- c(
  cl, lapply(1:n_threads, function(i) bquote(parglm(
      y ~ ., binomial(), df, control = parglm.control(nthreads = .(i))))))
names(cl)[5:(5L + n_threads - 1L)] <- paste0("parglm.", 1:n_threads)
cl <- as.call(cl)
cl # the call we make
#> microbenchmark(glm = glm(y ~ ., binomial(), df), speedglm = speedglm(y ~ 
#>     ., family = binomial(), data = df), times = 11L, parglm.1 = parglm(y ~ 
#>     ., binomial(), df, control = parglm.control(nthreads = 1L)), 
#>     parglm.2 = parglm(y ~ ., binomial(), df, control = parglm.control(nthreads = 2L)), 
#>     parglm.3 = parglm(y ~ ., binomial(), df, control = parglm.control(nthreads = 3L)), 
#>     parglm.4 = parglm(y ~ ., binomial(), df, control = parglm.control(nthreads = 4L)), 
#>     parglm.5 = parglm(y ~ ., binomial(), df, control = parglm.control(nthreads = 5L)), 
#>     parglm.6 = parglm(y ~ ., binomial(), df, control = parglm.control(nthreads = 6L)), 
#>     parglm.7 = parglm(y ~ ., binomial(), df, control = parglm.control(nthreads = 7L)), 
#>     parglm.8 = parglm(y ~ ., binomial(), df, control = parglm.control(nthreads = 8L)), 
#>     parglm.9 = parglm(y ~ ., binomial(), df, control = parglm.control(nthreads = 9L)), 
#>     parglm.10 = parglm(y ~ ., binomial(), df, control = parglm.control(nthreads = 10L)), 
#>     parglm.11 = parglm(y ~ ., binomial(), df, control = parglm.control(nthreads = 11L)), 
#>     parglm.12 = parglm(y ~ ., binomial(), df, control = parglm.control(nthreads = 12L)), 
#>     parglm.13 = parglm(y ~ ., binomial(), df, control = parglm.control(nthreads = 13L)), 
#>     parglm.14 = parglm(y ~ ., binomial(), df, control = parglm.control(nthreads = 14L)), 
#>     parglm.15 = parglm(y ~ ., binomial(), df, control = parglm.control(nthreads = 15L)), 
#>     parglm.16 = parglm(y ~ ., binomial(), df, control = parglm.control(nthreads = 16L)), 
#>     parglm.17 = parglm(y ~ ., binomial(), df, control = parglm.control(nthreads = 17L)), 
#>     parglm.18 = parglm(y ~ ., binomial(), df, control = parglm.control(nthreads = 18L)))

out <- eval(cl)
out # result from `microbenchmark`
#> Unit: seconds
#>       expr          min           lq         mean       median           uq          max neval
#>        glm 14.664183436 15.142964143 15.359909494 15.357251653 15.573597313 16.136311482    11
#>   speedglm  4.160371122  4.351599990  4.505175301  4.534135479  4.680091949  4.785079991    11
#>   parglm.1 14.040355754 14.256539875 14.447254389 14.317053364 14.492168656 15.270763969    11
#>   parglm.2  8.097443764  8.230703448  8.328262646  8.347219467  8.385989789  8.556541071    11
#>   parglm.3  6.168558256  6.347356559  6.424184280  6.374353850  6.464724652  6.849861483    11
#>   parglm.4  5.165502747  5.283149803  5.417304522  5.363076448  5.529647848  5.772380820    11
#>   parglm.5  4.810409093  4.885059232  4.924094087  4.898993941  4.939256538  5.206973645    11
#>   parglm.6  4.398426582  4.498921604  4.572517292  4.536117187  4.600189266  4.825965083    11
#>   parglm.7  3.989745670  4.289205034  4.438808745  4.312579607  4.470814191  5.456749171    11
#>   parglm.8  3.734255472  3.845036883  3.933654374  3.886812792  3.998508303  4.268670305    11
#>   parglm.9  3.606355736  3.784380729  3.834237312  3.844424493  3.888851395  4.008523665    11
#>  parglm.10  3.506911881  3.591147161  3.693741434  3.752214612  3.774280622  3.833249127    11
#>  parglm.11  3.477937149  3.570801288  3.662652198  3.643519327  3.722089552  4.012767892    11
#>  parglm.12  3.425783029  3.639414636  3.688808502  3.715098478  3.777905645  3.837944544    11
#>  parglm.13  3.474147284  3.538742400  3.635620782  3.632767966  3.657921917  3.970971003    11
#>  parglm.14  3.521833685  3.538202849  3.601729880  3.607005144  3.649676988  3.691742054    11
#>  parglm.15  3.305234132  3.491077809  3.550258434  3.522581336  3.578856413  3.924502651    11
#>  parglm.16  3.240033543  3.430452463  3.454401389  3.444174413  3.481927329  3.681751508    11
#>  parglm.17  3.203461849  3.501414602  3.585395625  3.584022080  3.627848398  4.059285339    11
#>  parglm.18  3.307874988  3.428346126  3.486111342  3.483652460  3.558279670  3.623624636    11

The plot below shows the median run times versus the number of cores. The dashed line is the median run time of glm and the dotted line is the median run time of speedglm. We could have used glm.fit and parglm.fit instead. This would make the relative difference bigger, as both glm and parglm call e.g., model.matrix and model.frame, which take some time. To show this point, we first compute how much time this takes and then we make the plot. The solid line is the computation time of model.matrix and model.frame.

modmat_time <- microbenchmark(
  modmat_time = {
    mf <- model.frame(y ~ ., df); model.matrix(terms(mf), mf)
  }, times = 10)
modmat_time # time taken by `model.matrix` and `model.frame`
#> Unit: milliseconds
#>         expr        min          lq        mean      median          uq         max neval
#>  modmat_time 975.142461 1038.157711 1123.963102 1108.838174 1186.264465 1295.933611    10
par(mar = c(4.5, 4.5, .5, .5))
# median run time in seconds for each expression
o <- aggregate(time ~ expr, out, median)[, 2] / 10^9
ylim <- range(o, 0); ylim[2] <- ylim[2] + .04 * diff(ylim)
plot(1:n_threads, o[-(1:2)], xlab = "Number of cores", yaxs = "i",
     ylim = ylim, ylab = "Run time", pch = 16)
abline(h = o[1], lty = 2) # glm
abline(h = o[2], lty = 3) # speedglm
abline(h = median(modmat_time$time) / 10^9, lty = 1) # model.frame + model.matrix
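
As mentioned above, the formula interface can be skipped by calling the fitting routines directly. A sketch is below; it assumes parglm.fit accepts the design matrix, response, family, and control list in the same manner as glm.fit, so check the package documentation for the exact interface.

# sketch: call the fitting routines on the design matrix directly, skipping
# the formula interface; parglm.fit is assumed to mirror glm.fit here
mf <- model.frame(y ~ ., df)
Xm <- model.matrix(terms(mf), mf)
fit_glm    <- glm.fit   (Xm, df$y, family = binomial())
fit_parglm <- parglm.fit(Xm, df$y, family = binomial(),
                         control = parglm.control(nthreads = 4L))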

It is worth mentioning that speedglm computes the cross product of the weighted design matrix and solves the normal equations. This is advantageous in terms of computation cost but may lead to unstable solutions, since forming the cross product squares the condition number of the problem. You can alter the number of observations in each parallel chunk with the block_size argument of parglm.control.
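The conditioning point can be seen directly with a small made-up example:

# sketch: the cross product squares the condition number
set.seed(2)
x <- rnorm(100)
Z <- cbind(1, x, x + rnorm(100, sd = 1e-4)) # two nearly collinear columns
kappa(Z)            # condition number of the design matrix
kappa(crossprod(Z)) # roughly the square of the above

As for the block_size argument, a hypothetical call with chunks of 10,000 observations could look like:

# hypothetical call: four threads with 10000 observations per chunk
fit <- parglm(y ~ ., binomial(), df,
              control = parglm.control(nthreads = 4L, block_size = 10000L))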

The single-threaded performance of parglm may be slower when there are more coefficients. The cause seems to be the difference between the LAPACK and LINPACK implementations, presumably in the QR decomposition method and/or the qr.qty method. On Windows, parglm does seem slower when built with Rtools, and the reason seems to be the LAPACK routine behind qr.qty, dormqr, which is slower than the LINPACK routine, dqrsl. Below is an illustration of the cost on this machine.

qr1 <- qr(X)                # LINPACK QR (R's default)
qr2 <- qr(X, LAPACK = TRUE) # LAPACK QR
microbenchmark::microbenchmark(
  `qr LINPACK`     = qr(X), 
  `qr LAPACK`      = qr(X, LAPACK = TRUE), 
  `qr.qty LINPACK` = qr.qty(qr1, df$y), 
  `qr.qty LAPACK`  = qr.qty(qr2, df$y), 
  times = 11)
#> Unit: milliseconds
#>            expr         min          lq         mean      median           uq         max neval
#>      qr LINPACK 3139.280366 3139.972717 3172.8539782 3165.433464 3174.0074470 3323.203293    11
#>       qr LAPACK 2505.847082 2508.290932 2530.4196005 2511.217432 2525.3808670 2683.391819    11
#>  qr.qty LINPACK  484.775217  487.073020  498.7921702  490.264909  508.2082865  532.550945    11
#>   qr.qty LAPACK  527.644000  531.328690  536.2391545  533.904810  534.9791345  570.248979    11

Session info

sessionInfo()
#> R version 3.4.1 (2017-06-30)
#> Platform: x86_64-redhat-linux-gnu (64-bit)
#> Running under: Amazon Linux AMI 2018.03
#> 
#> Matrix products: default
#> BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] speedglm_0.3-2       MASS_7.3-47          Matrix_1.2-10       
#> [4] microbenchmark_1.4-6 parglm_0.1.0        
#> 
#> loaded via a namespace (and not attached):
#>  [1] Rcpp_0.12.18     codetools_0.2-15 lattice_0.20-35  digest_0.6.15   
#>  [5] rprojroot_1.3-2  grid_3.4.1       backports_1.1.2  magrittr_1.5    
#>  [9] evaluate_0.11    stringi_1.2.4    rmarkdown_1.10   tools_3.4.1     
#> [13] stringr_1.3.1    yaml_2.2.0       compiler_3.4.1   htmltools_0.3.6 
#> [17] knitr_1.20