Copyright Eli Lilly and Company

Data frames in R for Make

Reproducibility and high-performance computing

drake

Will Landau

drake in action

Demo

Workflow plan data frame

library(drake)
load_basic_example()
my_plan
##                    target                                      command
## 1             'report.md'   my_knit('report.Rmd', report_dependencies)
## 2                   small                                  simulate(5)
## 3                   large                                 simulate(50)
## 4     report_dependencies      c(small, large, coef_regression2_small)
## 5       regression1_small                                  reg1(small)
## 6       regression1_large                                  reg1(large)
## 7       regression2_small                                  reg2(small)
## 8       regression2_large                                  reg2(large)
## 9  summ_regression1_small suppressWarnings(summary(regression1_small))
## 10 summ_regression1_large suppressWarnings(summary(regression1_large))
## 11 summ_regression2_small suppressWarnings(summary(regression2_small))
## 12 summ_regression2_large suppressWarnings(summary(regression2_large))
## 13 coef_regression1_small                      coef(regression1_small)
## 14 coef_regression1_large                      coef(regression1_large)
## 15 coef_regression2_small                      coef(regression2_small)
## 16 coef_regression2_large                      coef(regression2_large)
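
A plan like this does not have to come from load_basic_example(): it is an ordinary data frame you can write yourself. A minimal hand-written sketch using the plan() helper (listed under Utilities below), reusing simulate() and reg1() from the basic example:

library(drake)
# Each named argument becomes one row: target = command.
my_mini_plan <- plan(
  small = simulate(5),
  regression1_small = reg1(small)
)
my_mini_plan # A data frame with "target" and "command" columns.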

Network graph

# The graph is interactive! Hover, click, drag, zoom, pan.
plot_graph(my_plan)

Just the targets

plot_graph(my_plan, targets_only = TRUE)

Execution

make(my_plan)
## import 'report.Rmd'
## import c
## import summary
## import suppressWarnings
## import coef
## import knit
## import data.frame
## import rpois
## import stats::rnorm
## import lm
## import my_knit
## import simulate
## import reg1
## import reg2
## build small
## build large
## build regression1_small
## build regression1_large
## build regression2_small
## build regression2_large
## build summ_regression1_small
## build summ_regression1_large
## build summ_regression2_small
## build summ_regression2_large
## build coef_regression1_small
## build coef_regression1_large
## build coef_regression2_small
## build coef_regression2_large
## build report_dependencies
## build 'report.md'

Results

loadd(small)
small
##             x y
## 1  1.43420148 1
## 2 -0.07729196 0
## 3  0.73913723 0
## 4 -1.75860473 1
## 5 -0.06982523 1
readd(coef_regression2_large)
## (Intercept)          x2 
##   0.6447784   0.1950497
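
Targets live in a hidden cache in the project directory, so readd() and loadd() work in any later R session. A quick sketch of the main accessors (see the Cache utilities below; exact signatures may vary by drake version):

cached()            # Names of all cached targets and imports.
built()             # Only the targets built by make().
readd(large)        # Return a target's value without attaching it.
loadd(small, large) # Load several targets into the workspace at once.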

Reproducibility

plot_graph(my_plan)

Reproducibility

reg2 <- function(d) { # Change one of your functions.
  d$x3 <- d$x^3
  lm(y ~ x3, data = d)
}
outdated(my_plan, verbose = FALSE) # Some targets are now out of date.
## [1] "'report.md'"            "coef_regression2_large"
## [3] "coef_regression2_small" "regression2_large"     
## [5] "regression2_small"      "report_dependencies"   
## [7] "summ_regression2_large" "summ_regression2_small"
missed(my_plan, verbose = FALSE) # But our workspace has all we need.

Reproducibility

plot_graph(my_plan)

Reproducibility

make(my_plan) # Only rebuild the outdated targets.
## import 'report.Rmd'
## import c
## import summary
## import suppressWarnings
## import coef
## import knit
## import data.frame
## import rpois
## import stats::rnorm
## import lm
## import my_knit
## import simulate
## import reg1
## import reg2
## build regression2_small
## build regression2_large
## build summ_regression2_small
## build summ_regression2_large
## build coef_regression2_small
## build coef_regression2_large
## build report_dependencies
## build 'report.md'

High-performance computing

How many jobs could help?

max_useful_jobs(my_plan)

Parallel processes: low overhead, lightweight

make(my_plan, jobs = 2) # Backend chosen based on platform.
make(my_plan, parallelism = "mclapply", jobs = 2) # Mac/Linux
make(my_plan, parallelism = "parLapply", jobs = 2) # Windows too

Parallel R sessions: high overhead, heavy duty

make(my_plan, parallelism = "Makefile", jobs = 2)
make(my_plan, parallelism = "Makefile", command = "make",
     args = c("--jobs=2", "--silent"))

Supercomputing

my_script.R:

# Your setup...
make(my_plan, parallelism = "Makefile", jobs = 8,
  prepend = "SHELL = ./shell.sh")

shell.sh (write with shell_file()):

#!/bin/bash
shift
echo "module load R; $*" | qsub -sync y -cwd -j y

Run on a cluster or supercomputer:

chmod +x shell.sh
nohup nice -19 R CMD BATCH my_script.R &
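
A sketch of generating and adapting shell.sh; the qsub line above targets SGE, so substitute your own scheduler's submission command if needed:

library(drake)
shell_file() # Write a template shell.sh to the working directory.
# Then edit the submission line to match your scheduler and
# mark the script executable as shown above.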

Utilities

Workflow plan

plan()
analyses()
summaries()
evaluate()
expand()
gather()
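
A hedged sketch of how these helpers could generate the plan above; the ..dataset.. wildcard follows the quickstart vignette, and the argument order of analyses() is assumed:

datasets <- plan(
  small = simulate(5),
  large = simulate(50))
methods <- plan(
  regression1 = reg1(..dataset..),
  regression2 = reg2(..dataset..))
# Substitute each dataset into each method:
# regression1_small, regression1_large, etc.
my_analyses <- analyses(methods, datasets)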

Dependency network

outdated()
missed()
plot_graph()
read_graph()
dataframes_graph()
deps()
tracked()
max_useful_jobs()
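
For instance, deps() reports what drake detects as the dependencies of a function or command, which is how the graph gets built; a small sketch, assuming deps() accepts both forms:

deps(reg1)          # Objects and functions that reg1() depends on.
deps("reg1(small)") # Dependencies of a command: reg1 and small.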

Cache

clean()
cached()
imported()
built()
readd()
loadd()
find_project()
find_cache()
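
A cautionary sketch: clean() is destructive, emptying the hidden cache so nothing survives for the next make():

clean()       # With no arguments, remove everything from the cache.
make(my_plan) # All targets rebuild from scratch.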

Debugging

check()
session()
in_progress()
progress()
read_config()
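
A sketch of a typical debugging pass, assuming these helpers behave as their names suggest:

check(my_plan) # Look for problems in the plan before running make().
session()      # sessionInfo() from the most recent make().
progress()     # Per-target build status from the last make().
in_progress()  # Targets that were mid-build if make() was interrupted.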

Learning

  • Basic example

load_basic_example()
examples_drake() # List examples.
example_drake("basic") # Generate code files.

  • Tutorials

vignette("drake") # High-level overview.
vignette("quickstart") # Deep dive.
vignette("caution") # Pitfalls.
