## ----ratio, fig.cap = "Storage space required using `split_by` relative to storing a single file.", echo = FALSE----
# Evaluate the disk space ratio on a grid of `a`, `b` and `r` values.
combinations <- expand.grid(
  a = c(0.25, 0.5, 1, 2, 4),
  b = seq(0, 1, length.out = 41),
  r = c(10, 100, 1000)
)
combinations$ratio <- with(
  combinations,
  (a * b + b + 1) / (a + 1 + 1 / r)
)
# One line per value of `a`, one facet per value of `r`; the dashed horizontal
# line marks a ratio of 1.
ggplot(combinations, aes(x = b, y = ratio, colour = factor(a))) +
  geom_hline(yintercept = 1, linetype = 2) +
  geom_line() +
  facet_wrap(~ paste("r =", r)) +
  scale_x_continuous(
    expression(b~{"="}~N[s]~{"/"}~N), # nolint
    labels = function(x) {
      paste0(100 * x, "%")
    }
  ) +
  scale_y_continuous(
    "Relative amount of disk space",
    labels = function(x) {
      paste0(100 * x, "%")
    }
  ) +
  scale_colour_manual(
    paste("a = s", "r", sep = " / "),
    values = inbo_colours,
    labels = c("1/4", "1/2", "1", "2", "4")
  )

## ----load_data, echo = FALSE-----------------------------------------------------
# Example dataset shipped with the git2rdata package.
airbag <- readRDS(
  system.file("efficiency", "airbag.rds", package = "git2rdata")
)

## ----set_tmp_dir-----------------------------------------------------------------
library(git2rdata)
# Fresh temporary directory that receives the files written by `write_vc()`.
root <- tempfile("git2rdata-split-by")
dir.create(root)

## ----get_write_timings, eval = system.file("split_by", "write_timings.rds", package = "git2rdata") == ""----
# library(microbenchmark)
# mb <- microbenchmark(
#   part_1 = write_vc(airbag, "part_1", root, sorting = "X"),
#   part_2 = write_vc(airbag, "part_2", root, sorting = "X", split_by = "airbag"),
#   part_3 = write_vc(airbag, "part_3", root, sorting = "X", split_by = "abcat"),
#   part_4 = write_vc(
#     airbag, "part_4", root, sorting = "X", split_by = c("airbag", "sex")
#   ),
#   part_5 = write_vc(airbag, "part_5", root, sorting = "X", split_by = "dvcat"),
#   part_6 = write_vc(
#     airbag, "part_6", root, sorting = "X", split_by = "yearacc"
#   ),
#   part_15 = write_vc(
#     airbag, "part_15", root, sorting = "X", split_by = c("dvcat", "abcat")
#   ),
#   part_45 = write_vc(
#     airbag, "part_45", root, sorting = "X", split_by = "yearVeh"
#   ),
#   part_270 = write_vc(
#     airbag, "part_270", root, sorting = "X", split_by = c("yearacc", "yearVeh")
#   )
# )
# mb$time <- mb$time / 1e6

## ----store_write_timings, echo = FALSE-------------------------------------------
# Store freshly measured write timings when the package ships none; otherwise
# reuse the shipped timings.
# NOTE(review): the benchmark above is commented out in this script, so `mb`
# presumably only exists in the first branch when that chunk was evaluated from
# the vignette source -- confirm before running this branch stand-alone.
if (system.file("split_by", "write_timings.rds", package = "git2rdata") == "") {
  dir.create(file.path("..", "inst", "split_by"), showWarnings = FALSE)
  saveRDS(mb, file.path("..", "inst", "split_by", "write_timings.rds"))
} else {
  mb <- readRDS(
    system.file("split_by", "write_timings.rds", package = "git2rdata")
  )
}

## ----plot_write_timings, echo = FALSE, fig.cap = "Boxplot of the write timings for different number of parts."----
# Turn the benchmark labels into numbers ("part_15" -> 15). Indexing the
# per-level vector with the factor assigns every timing the number of its level.
mb$combinations <- as.integer(gsub("part_", "", levels(mb$expr)))[mb$expr]
ggplot(mb, aes(x = combinations, y = time)) +
  geom_boxplot(aes(group = combinations)) +
  scale_x_log10("Number of parts") +
  scale_y_log10("Time (in milliseconds)")

## ----get_read_timings, eval = system.file("split_by", "read_timings.rds", package = "git2rdata") == ""----
# mb_r <- microbenchmark(
#   part_1 = read_vc("part_1", root),
#   part_2 = read_vc("part_2", root),
#   part_3 = read_vc("part_3", root),
#   part_4 = read_vc("part_4", root),
#   part_5 = read_vc("part_5", root),
#   part_6 = read_vc("part_6", root),
#   part_15 = read_vc("part_15", root),
#   part_45 = read_vc("part_45", root),
#   part_270 = read_vc("part_270", root)
# )
# mb_r$time <- mb_r$time / 1e6

## ----store_read_timings, echo = FALSE--------------------------------------------
# Same logic as for the write timings. The target directory is created here as
# well, so saving does not depend on the write timings branch having run first.
if (system.file("split_by", "read_timings.rds", package = "git2rdata") == "") {
  dir.create(file.path("..", "inst", "split_by"), showWarnings = FALSE)
  saveRDS(mb_r, file.path("..", "inst", "split_by", "read_timings.rds"))
} else {
  mb_r <- readRDS(
    system.file("split_by", "read_timings.rds", package = "git2rdata")
  )
}

## ----plot_read_timings, echo = FALSE, fig.cap = "Boxplot of the read timings for the different number of parts."----
# Turn the benchmark labels into numbers, as done for the write timings.
mb_r$combinations <- as.integer(gsub("part_", "", levels(mb_r$expr)))[mb_r$expr]
ggplot(mb_r, aes(x = combinations, y = time)) +
  geom_boxplot(aes(group = combinations)) +
  scale_x_log10("Number of parts") +
  scale_y_log10("Time (in milliseconds)")