Copyright Eli Lilly and Company

Data frames in R for Make

Reproducibility and high-performance computing

drake

Will Landau

drake in action

Demo

Workflow plan data frame

library(drake)
load_basic_example()
my_plan
##                    target                                      command
## 1             'report.md'   my_knit('report.Rmd', report_dependencies)
## 2                   small                                  simulate(5)
## 3                   large                                 simulate(50)
## 4     report_dependencies      c(small, large, coef_regression2_small)
## 5       regression1_small                                  reg1(small)
## 6       regression1_large                                  reg1(large)
## 7       regression2_small                                  reg2(small)
## 8       regression2_large                                  reg2(large)
## 9  summ_regression1_small suppressWarnings(summary(regression1_small))
## 10 summ_regression1_large suppressWarnings(summary(regression1_large))
## 11 summ_regression2_small suppressWarnings(summary(regression2_small))
## 12 summ_regression2_large suppressWarnings(summary(regression2_large))
## 13 coef_regression1_small                      coef(regression1_small)
## 14 coef_regression1_large                      coef(regression1_large)
## 15 coef_regression2_small                      coef(regression2_small)
## 16 coef_regression2_large                      coef(regression2_large)
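
A plan like this does not have to come from load_basic_example(): it is an ordinary data frame you can write yourself. A minimal hand-written sketch using the plan() helper (listed under Utilities below), reusing simulate() and reg1() from the basic example:

library(drake)
# Each named argument becomes one row: target = command.
my_mini_plan <- plan(
  small = simulate(5),
  regression1_small = reg1(small)
)
my_mini_plan # A data frame with "target" and "command" columns.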

Network graph

# The graph is interactive! Hover, click, drag, zoom, pan.
plot_graph(my_plan)

Just the targets

plot_graph(my_plan, targets_only = TRUE)

Execution

make(my_plan)
## import 'report.Rmd'
## import c
## import summary
## import suppressWarnings
## import coef
## import knit
## import data.frame
## import rpois
## import stats::rnorm
## import lm
## import my_knit
## import simulate
## import reg1
## import reg2
## build small
## build large
## build regression1_small
## build regression1_large
## build regression2_small
## build regression2_large
## build summ_regression1_small
## build summ_regression1_large
## build summ_regression2_small
## build summ_regression2_large
## build coef_regression1_small
## build coef_regression1_large
## build coef_regression2_small
## build coef_regression2_large
## build report_dependencies
## build 'report.md'

Results

loadd(small)
small
##             x y
## 1  1.43420148 1
## 2 -0.07729196 0
## 3  0.73913723 0
## 4 -1.75860473 1
## 5 -0.06982523 1
readd(coef_regression2_large)
## (Intercept)          x2 
##   0.6447784   0.1950497
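
Targets live in a hidden cache in the project directory, so readd() and loadd() work in any later R session. A quick sketch of the main accessors (see the Cache utilities below; exact signatures may vary by drake version):

cached()            # Names of all cached targets and imports.
built()             # Only the targets built by make().
readd(large)        # Return a target's value without attaching it.
loadd(small, large) # Load several targets into the workspace at once.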

Reproducibility

plot_graph(my_plan)

Reproducibility

reg2 <- function(d) { # Change one of your functions.
  d$x3 <- d$x^3
  lm(y ~ x3, data = d)
}
outdated(my_plan, verbose = FALSE) # Some targets are now out of date.
## [1] "'report.md'"            "coef_regression2_large"
## [3] "coef_regression2_small" "regression2_large"     
## [5] "regression2_small"      "report_dependencies"   
## [7] "summ_regression2_large" "summ_regression2_small"
missed(my_plan, verbose = FALSE) # But our workspace has all we need.

Reproducibility

plot_graph(my_plan)

Reproducibility

make(my_plan) # Only rebuild the outdated targets.
## import 'report.Rmd'
## import c
## import summary
## import suppressWarnings
## import coef
## import knit
## import data.frame
## import rpois
## import stats::rnorm
## import lm
## import my_knit
## import simulate
## import reg1
## import reg2
## build regression2_small
## build regression2_large
## build summ_regression2_small
## build summ_regression2_large
## build coef_regression2_small
## build coef_regression2_large
## build report_dependencies
## build 'report.md'

High-performance computing

How many jobs could help?

max_useful_jobs(my_plan)

Parallel processes: low overhead, lightweight

make(my_plan, jobs = 2) # Backend chosen based on platform.
make(my_plan, parallelism = "mclapply", jobs = 2) # Mac/Linux
make(my_plan, parallelism = "parLapply", jobs = 2) # Windows too

Parallel R sessions: high overhead, heavy duty

make(my_plan, parallelism = "Makefile", jobs = 2)
make(my_plan, parallelism = "Makefile", command = "make",
     args = c("--jobs=2", "--silent"))

Supercomputing

my_script.R:

# Your setup...
make(my_plan, parallelism = "Makefile", jobs = 8,
  prepend = "SHELL = ./shell.sh")

shell.sh (write with shell_file()):

#!/bin/bash
shift
echo "module load R; $*" | qsub -sync y -cwd -j y

Run on a cluster or supercomputer:

chmod +x shell.sh
nohup nice -19 R CMD BATCH my_script.R &
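
A sketch of generating and adapting shell.sh; the qsub line above targets SGE, so substitute your own scheduler's submission command if needed:

library(drake)
shell_file() # Write a template shell.sh to the working directory.
# Then edit the submission line to match your scheduler and
# mark the script executable as shown above.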

Utilities

Workflow plan

plan()
analyses()
summaries()
evaluate()
expand()
gather()
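
A hedged sketch of how these helpers could generate the plan above; the ..dataset.. wildcard follows the quickstart vignette, and the argument order of analyses() is assumed:

datasets <- plan(
  small = simulate(5),
  large = simulate(50))
methods <- plan(
  regression1 = reg1(..dataset..),
  regression2 = reg2(..dataset..))
# Substitute each dataset into each method:
# regression1_small, regression1_large, etc.
my_analyses <- analyses(methods, datasets)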

Dependency network

outdated()
missed()
plot_graph()
read_graph()
dataframes_graph()
deps()
tracked()
max_useful_jobs()
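
For instance, deps() reports what drake detects as the dependencies of a function or command, which is how the graph gets built; a small sketch, assuming deps() accepts both forms:

deps(reg1)          # Objects and functions that reg1() depends on.
deps("reg1(small)") # Dependencies of a command: reg1 and small.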

Cache

clean()
cached()
imported()
built()
readd()
loadd()
find_project()
find_cache()
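
A cautionary sketch: clean() is destructive, emptying the hidden cache so nothing survives for the next make():

clean()       # With no arguments, remove everything from the cache.
make(my_plan) # All targets rebuild from scratch.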

Debugging

check()
session()
in_progress()
progress()
read_config()
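
A sketch of a typical debugging pass, assuming these helpers behave as their names suggest:

check(my_plan) # Look for problems in the plan before running make().
session()      # sessionInfo() from the most recent make().
progress()     # Per-target build status from the last make().
in_progress()  # Targets that were mid-build if make() was interrupted.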

Learning

  • Basic example

load_basic_example()
examples_drake() # List examples.
example_drake("basic") # Generate code files.

  • Tutorials

vignette("drake") # High-level overview.
vignette("quickstart") # Deep dive.
vignette("caution") # Pitfalls.
