From 9d25615ee1d5fe28480c73cf238b83797d007484 Mon Sep 17 00:00:00 2001 From: George Stagg Date: Wed, 11 Sep 2024 16:08:01 +0100 Subject: [PATCH] Add function to append filesystem metadata to a given `tar` archive (#40) * Add and export make_tar_index function * Use `make_tar_index()` when building R packages * Update NEWS.md * Append VFS metadata to R package .tgz output * Improve VFS metadata encoding in .tgz file * Rename make_tar_index to add_tar_index * Update pkgdown documentation * Explicitly write metadata values as integer type * Embed filesystem metadata as a tar entry * Set highest compression level when repacking tar * Early exit tar processing on existing metadata * Deal with hard and symbolic links in tar indexing --- NAMESPACE | 1 + NEWS.md | 8 +- R/build.R | 27 +++-- R/lib.R | 14 ++- R/repo.R | 18 +-- R/tar.R | 229 +++++++++++++++++++++++++++++++++++ _pkgdown.yml | 3 +- inst/pkgdown.yml | 13 ++ man/add_list.Rd | 6 +- man/add_pkg.Rd | 8 +- man/add_repo.Rd | 6 +- man/add_tar_index.Rd | 32 +++++ man/build.Rd | 8 +- man/make_vfs_library.Rd | 9 +- man/make_vfs_repo.Rd | 17 +-- vignettes/mount-fs-image.Rmd | 75 ++++++++---- vignettes/mount-host-dir.Rmd | 2 + vignettes/tar-metadata.Rmd | 42 +++++++ 18 files changed, 451 insertions(+), 67 deletions(-) create mode 100644 R/tar.R create mode 100644 inst/pkgdown.yml create mode 100644 man/add_tar_index.Rd create mode 100644 vignettes/tar-metadata.Rmd diff --git a/NAMESPACE b/NAMESPACE index b550c1d..e0a016d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,6 +3,7 @@ export(add_list) export(add_pkg) export(add_repo) +export(add_tar_index) export(build) export(file_packager) export(make_library) diff --git a/NEWS.md b/NEWS.md index 0d7ca50..96e914c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,12 @@ # rwasm (development version) -* Support for a new `compression` argument in `build()`, `add_pkg()`, `make_vfs_library()`, and other related functions. When enabled, VFS images will be compressed using `gzip`. 
Note: Loading compressed VFS images requires at least version 0.4.1 of webR (#39). +## New features + +* When building R packages with `compress` set to `TRUE`, use the binary R package `.tgz` file for the Emscripten filesystem image data and generate custom metadata rather than using Emscripten's `file_packager` tool. + +* Support for a new `compress` argument in `file_packager()`, `make_vfs_library()`, and other related functions. When enabled, VFS images will be compressed using `gzip` (#39). + +Note: Mounting processed `.tgz` archives or compressed VFS images requires at least version 0.4.2 of webR. # rwasm 0.1.0 diff --git a/R/build.R b/R/build.R index 088625e..0059cdf 100644 --- a/R/build.R +++ b/R/build.R @@ -18,7 +18,7 @@ build <- function(packages, out_dir = ".", remotes = NULL, dependencies = FALSE, - compress = FALSE) { + compress = TRUE) { tmp_dir <- tempfile() on.exit(unlink(tmp_dir, recursive = TRUE)) dir.create(tmp_dir) @@ -215,16 +215,21 @@ wasm_build <- function(pkg, tarball_path, contrib_bin, compress) { bin_dest <- fs::path(contrib_bin, paste0(pkg, "_", bin_ver, ".tgz")) fs::file_copy(bin_path, bin_dest, overwrite = TRUE) - # Build an Emscripten filesystem image for the package - tmp_bin_dir <- fs::path(tempfile()) - on.exit(unlink(tmp_bin_dir, recursive = TRUE), add = TRUE) - untar(bin_dest, exdir = tmp_bin_dir) - file_packager( - fs::dir_ls(tmp_bin_dir)[[1]], - contrib_bin, - fs::path_file(bin_dest), - compress - ) + if (compress) { + # Use binary .tgz file to build Emscripten filesystem image metadata + add_tar_index(bin_dest, strip = 1) + } else { + # Build an uncompressed Emscripten filesystem image for the package + tmp_bin_dir <- fs::path(tempfile()) + on.exit(unlink(tmp_bin_dir, recursive = TRUE), add = TRUE) + untar(bin_dest, exdir = tmp_bin_dir) + file_packager( + fs::dir_ls(tmp_bin_dir)[[1]], + contrib_bin, + fs::path_file(bin_dest), + compress = FALSE + ) + } invisible(NULL) } diff --git a/R/lib.R b/R/lib.R index 5df4b3f..f37b651 
100644 --- a/R/lib.R +++ b/R/lib.R @@ -48,13 +48,14 @@ make_library <- function(repo_dir = "./repo", lib_dir = "./lib", strip = NULL) { #' #' Each filesystem image is generated using Emscripten's [file_packager()] tool #' and the output `.data` and `.js.metadata` filesystem image files are written -#' to the repository in the same directory as the package binary `.tar.gz` -#' files. +#' to the repository in the same directory as the package binary `.tgz` files. #' #' The resulting filesystem images may then be used by webR to download and -#' install R packages faster by mounting the `.data` images to the Emscripten -#' virtual filesystem, rather than decompressing and extracting the equivalent -#' `.tar.gz` files. +#' install R packages by mounting the `.data` images to the Emscripten virtual +#' filesystem. +#' +#' When `compress` is `TRUE`, an additional file with extension `".data.gz"` is +#' also output containing a compressed version of the filesystem data. #' #' @inheritParams add_pkg #' @@ -100,6 +101,9 @@ make_vfs_repo <- function(repo_dir = "./repo", compress = FALSE) { #' tool and the output `.data` and `.js.metadata` filesystem image files are #' written to the directory `out_dir`. #' +#' When `compress` is `TRUE`, an additional file with extension `".data.gz"` is +#' also output containing a compressed version of the filesystem data. +#' #' The resulting image can be downloaded by webR and mounted on the Emscripten #' virtual filesystem as an efficient way to provide a pre-configured R library, #' without installing each R package individually. diff --git a/R/repo.R b/R/repo.R index 16a1313..3e1bd4e 100644 --- a/R/repo.R +++ b/R/repo.R @@ -76,12 +76,14 @@ add_list <- function(list_file, ...) { #' source. Defaults to `NA`, meaning prefer a built-in list of references to #' packages pre-modified for use with webR. #' @param dependencies Dependency specification for packages to additionally -#' add to the repository. 
Defaults to `FALSE`, meaning no additional packages. -#' Use `NA` to install only hard dependencies whereas `TRUE` installs all -#' optional dependencies as well. See [pkgdepends::as_pkg_dependencies] -#' for details. -#' @inheritParams file_packager -#' +#' add to the repository. Defaults to `FALSE`, meaning no additional packages. +#' Use `NA` to install only hard dependencies whereas `TRUE` installs all +#' optional dependencies as well. See [pkgdepends::as_pkg_dependencies] +#' for details. +#' @param compress When `TRUE`, add and compress Emscripten virtual filesystem +#' metadata in the resulting R package binary `.tgz` files. Otherwise, +#' [file_packager()] is used to create uncompressed virtual filesystem images +#' included in the output binary package repository. Defaults to `TRUE`. #' @importFrom dplyr rows_update select #' @importFrom pkgdepends new_pkg_download_proposal #' @export @@ -89,7 +91,7 @@ add_pkg <- function(packages, repo_dir = "./repo", remotes = NA, dependencies = FALSE, - compress = FALSE) { + compress = TRUE) { # Set up pkgdepends configuration config <- ppm_config config$dependencies <- dependencies @@ -185,7 +187,7 @@ prefer_remotes <- function(package_info, remotes = NA) { update_repo <- function(package_info, remotes = NA, repo_dir = "./repo", - compress = FALSE) { + compress = TRUE) { r_version <- R_system_version(getOption("rwasm.webr_version")) writeLines(sprintf("Processing %d package(s).", nrow(package_info))) diff --git a/R/tar.R b/R/tar.R new file mode 100644 index 0000000..108137d --- /dev/null +++ b/R/tar.R @@ -0,0 +1,229 @@ +#' Add Emscripten virtual filesystem metadata to a given `tar` archive +#' +#' Calculates file offsets and other metadata for content stored in an +#' (optionally gzip compressed) `tar` archive. Once added, the `tar` archive +#' with metadata can be mounted as an Emscripten filesystem image, making the +#' contents of the archive available to the WebAssembly R process. 
+#' +#' The virtual filesystem metadata is appended to the end of the `tar` archive, +#' with the output replacing the original file. The resulting archive should be +#' hosted online so that its URL can be provided to webR for mounting on the +#' virtual filesystem. +#' +#' If `strip` is greater than `0` the virtual filesystem metadata is generated +#' such that when mounted by webR the specified number of leading path elements +#' are removed. Useful for R package binaries where data files are stored in the +#' original `.tgz` file under a subdirectory. Files with fewer path name +#' elements than the specified amount are skipped. +#' +#' @param file Filename of the `tar` archive for which metadata is to be added. +#' @param strip Remove the specified number of leading path elements when +#' mounting with webR. Defaults to `0`. +#' @export +add_tar_index <- function(file, strip = 0) { + file <- fs::path_norm(file) + file_ext <- tolower(fs::path_ext(file)) + file_base <- fs::path_ext_remove(file) + + message(paste("Appending virtual filesystem metadata for:", file)) + + # Check if our tar is compatible + if (!any(file_ext == c("tgz", "gz", "tar"))) { + stop(paste0("Can't make index for \"", file, + "\". 
Only uncompressed or `gzip` compressed tar files can be indexed.")) + } + + # Handle two-component extensions + if (file_ext == "gz") { + file_base <- fs::path_ext_remove(file_base) + } + + # Read archive contents, decompressing if necessary + gzip <- any(file_ext == c("tgz", "gz")) + data <- readBin(file, "raw", n = file.size(file)) + if (gzip) { + data <- memDecompress(data) + } + + # Build metadata from source .tar file + con <- rawConnection(data, open = "rb") + on.exit(close(con), add = TRUE) + entries <- read_tar_offsets(con, strip) + tar_end <- seek(con) + + metadata <- list( + files = entries, + gzip = gzip, + remote_package_size = length(data) + ) + + # Add metadata as additional .tar entry + entry <- create_metadata_entry(metadata) + json_block <- as.integer(tar_end / 512) + 1L + + # Append additional metadata hint for webR + magic <- charToRaw('webR') + reserved <- raw(4) # reserved for future use + block <- writeBin(json_block, raw(), size = 4, endian = "big") + len <- writeBin(entry$length, raw(), size = 4, endian = "big") + hint <- c(magic, reserved, block, len) + + # Build new .tar archive data + data <- c(data[1:tar_end], entry$data, raw(1024), hint) + + # Write output and move into place + out <- tempfile() + out_con <- if (gzip) { + gzfile(out, open = "wb", compression = 9) + } else { + file(out, open = "wb") + } + writeBin(data, out_con, size = 1L) + close(out_con) + fs::file_copy(out, file, overwrite = TRUE) +} + +create_metadata_entry <- function(metadata) { + # metadata contents + json <- charToRaw(jsonlite::toJSON(metadata, auto_unbox = TRUE)) + len <- length(json) + blocks <- ceiling(len/512) + length(json) <- 512 * blocks + + # entry header + timestamp <- as.integer(Sys.time()) + header <- raw(512) + header[1:15] <- charToRaw('.vfs-index.json') # filename + header[101:108] <- charToRaw('0000644 ') # mode + header[109:116] <- charToRaw('0000000 ') # uid + header[117:124] <- charToRaw('0000000 ') # gid + header[125:136] <- 
charToRaw(sprintf("%011o ", len)) # length + header[137:148] <- charToRaw(sprintf("%011o ", timestamp)) # timestamp + header[149:156] <- charToRaw(' ') # placeholder + header[157:157] <- charToRaw('0') # type + header[258:262] <- charToRaw('ustar') # ustar magic + header[264:265] <- charToRaw('00') # ustar version + header[266:269] <- charToRaw('root') # user + header[298:302] <- charToRaw('wheel') # group + + # populate checksum field + checksum <- raw(8) + checksum[1:6] <- charToRaw(sprintf("%06o", sum(as.integer(header)))) + checksum[8] <- charToRaw(' ') + header[149:156] <- checksum + + list(data = c(header, json), length = len) +} + +read_tar_offsets <- function(con, strip) { + entries <- list() + next_filename <- NULL + + while (TRUE) { + # Read tar entry header block + header <- readBin(con, "raw", n = 512) + + # Basic tar filename + filename <- rawToChar(header[1:100]) + + # Empty header indicates end of archive, early exit for existing metadata + if (all(header == 0) || filename == ".vfs-index.json") { + # Return connection position to just before this header + seek(con, -512, origin = "current") + break + } + + # Entry size and offset + offset <- seek(con) + size <- strtoi(sub("\\s.*", "", rawToChar(header[125:136])), 8) + file_blocks <- ceiling(size / 512) + + # Skip directories, global, and vendor-specific extended headers + type <- rawToChar(header[157]) + if (grepl("5|g|[A-Z]", type)) { + next + } + + # Handle PAX extended header + if (type == "x") { + pax_data <- readBin(con, "raw", n = 512 * ceiling(size / 512)) + pax_data <- pax_data[1:max(which(pax_data != as.raw(0x00)))] + lines <- raw_split(pax_data, "\n") + for (line in lines) { + payload <- raw_split(line, " ")[[2]] + kv <- raw_split(payload, "=") + if (rawToChar(kv[[1]]) == "path") { + next_filename <- rawToChar(kv[[2]]) + break + } + } + next + } + + # Apply ustar formatted extended filename + magic <- rawToChar(header[258:263]) + if (magic == "ustar"){ + prefix <- rawToChar(header[346:501]) 
+ filename <- paste(prefix, filename, sep = "/") + } + + # Apply PAX formatted extended filename + if (!is.null(next_filename)) { + filename <- next_filename + next_filename <- NULL + } + + # Strip path elements, ignoring leading slash, skip if no path remains + if (strip > 0) { + filename <- gsub("^/", "", filename) + parts <- fs::path_split(filename)[[1]] + parts <- parts[-strip:-1] + if (length(parts) == 0) { + seek(con, 512 * file_blocks, origin = "current") + next + } + filename <- fs::path_join(c("/", parts)) + } + + # Calculate file offsets + entry <- list(filename = filename, start = offset, end = offset + size) + + # Deal with hard and symbolic links + if (grepl("1|2", type)) { + link_name <- rawToChar(header[158:257]) + if (type == "2") { + link_name <- fs::path_norm(fs::path(fs::path_dir(filename), link_name)) + } + link_entry <- Find(\(e) e$filename == link_name, entries) + entry$start = link_entry$start + entry$end = link_entry$end + file_blocks <- 0 + } + + entries <- append(entries, list(entry)) + + # Skip to next entry header + seek(con, 512 * file_blocks, origin = "current") + } + entries +} + +# Split the elements of a raw vector x according to matches of element `split` +raw_split <- function(x, split) { + if (is.character(split)) { + split <- charToRaw(split) + } + + start <- 1 + out <- list() + for (end in which(x == split)) { + out <- c(out, list(x[start:(end - 1)])) + start <- end + 1 + } + + if (start <= length(x)) { + out <- c(out, list(x[start:length(x)])) + } + + out +} diff --git a/_pkgdown.yml b/_pkgdown.yml index 5340db7..d9aed8a 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,4 +1,5 @@ url: https://r-wasm.github.io/rwasm/ template: bootstrap: 5 - +deploy: + install_metadata: true diff --git a/inst/pkgdown.yml b/inst/pkgdown.yml new file mode 100644 index 0000000..91f6cbf --- /dev/null +++ b/inst/pkgdown.yml @@ -0,0 +1,13 @@ +pandoc: '3.2' +pkgdown: 2.0.9.9000 +pkgdown_sha: 34ee692e4ce10c8abfb863cc782da771838558f7 +articles: + 
github-actions: github-actions.html + mount-fs-image: mount-fs-image.html + mount-host-dir: mount-host-dir.html + rwasm: rwasm.html + tar-metadata: tar-metadata.html +last_built: 2024-09-10T15:29Z +urls: + reference: https://r-wasm.github.io/rwasm/reference + article: https://r-wasm.github.io/rwasm/articles diff --git a/man/add_list.Rd b/man/add_list.Rd index a536b91..fe31084 100644 --- a/man/add_list.Rd +++ b/man/add_list.Rd @@ -21,8 +21,10 @@ add to the repository. Defaults to \code{FALSE}, meaning no additional packages. Use \code{NA} to install only hard dependencies whereas \code{TRUE} installs all optional dependencies as well. See \link[pkgdepends:as_pkg_dependencies]{pkgdepends::as_pkg_dependencies} for details.} - \item{\code{compress}}{Logical. If \code{TRUE}, a compressed version of the filesystem -data is included in the output. Defaults to \code{FALSE}.} + \item{\code{compress}}{When \code{TRUE}, add and compress Emscripten virtual filesystem +metadata in the resulting R package binary \code{.tgz} files. Otherwise, +\code{\link[=file_packager]{file_packager()}} is used to create uncompressed virtual filesystem images +included in the output binary package repository. Defaults to \code{TRUE}.} }} } \description{ diff --git a/man/add_pkg.Rd b/man/add_pkg.Rd index 2610456..d26cb5a 100644 --- a/man/add_pkg.Rd +++ b/man/add_pkg.Rd @@ -9,7 +9,7 @@ add_pkg( repo_dir = "./repo", remotes = NA, dependencies = FALSE, - compress = FALSE + compress = TRUE ) } \arguments{ @@ -27,8 +27,10 @@ Use \code{NA} to install only hard dependencies whereas \code{TRUE} installs all optional dependencies as well. See \link[pkgdepends:as_pkg_dependencies]{pkgdepends::as_pkg_dependencies} for details.} -\item{compress}{Logical. If \code{TRUE}, a compressed version of the filesystem -data is included in the output. 
Defaults to \code{FALSE}.} +\item{compress}{When \code{TRUE}, add and compress Emscripten virtual filesystem +metadata in the resulting R package binary \code{.tgz} files. Otherwise, +\code{\link[=file_packager]{file_packager()}} is used to create uncompressed virtual filesystem images +included in the output binary package repository. Defaults to \code{TRUE}.} } \description{ Downloads and builds the \link[pkgdepends:pkg_refs]{R package references} given diff --git a/man/add_repo.Rd b/man/add_repo.Rd index e9ce42e..9597119 100644 --- a/man/add_repo.Rd +++ b/man/add_repo.Rd @@ -25,8 +25,10 @@ add to the repository. Defaults to \code{FALSE}, meaning no additional packages. Use \code{NA} to install only hard dependencies whereas \code{TRUE} installs all optional dependencies as well. See \link[pkgdepends:as_pkg_dependencies]{pkgdepends::as_pkg_dependencies} for details.} - \item{\code{compress}}{Logical. If \code{TRUE}, a compressed version of the filesystem -data is included in the output. Defaults to \code{FALSE}.} + \item{\code{compress}}{When \code{TRUE}, add and compress Emscripten virtual filesystem +metadata in the resulting R package binary \code{.tgz} files. Otherwise, +\code{\link[=file_packager]{file_packager()}} is used to create uncompressed virtual filesystem images +included in the output binary package repository. 
Defaults to \code{TRUE}.} }} } \description{ diff --git a/man/add_tar_index.Rd b/man/add_tar_index.Rd new file mode 100644 index 0000000..c3ba1d6 --- /dev/null +++ b/man/add_tar_index.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tar.R +\name{add_tar_index} +\alias{add_tar_index} +\title{Add Emscripten virtual filesystem metadata to a given \code{tar} archive} +\usage{ +add_tar_index(file, strip = 0) +} +\arguments{ +\item{file}{Filename of the \code{tar} archive for which metadata is to be added.} + +\item{strip}{Remove the specified number of leading path elements when +mounting with webR. Defaults to \code{0}.} +} +\description{ +Calculates file offsets and other metadata for content stored in an +(optionally gzip compressed) \code{tar} archive. Once added, the \code{tar} archive +with metadata can be mounted as an Emscripten filesystem image, making the +contents of the archive available to the WebAssembly R process. +} +\details{ +The virtual filesystem metadata is appended to the end of the \code{tar} archive, +with the output replacing the original file. The resulting archive should be +hosted online so that its URL can be provided to webR for mounting on the +virtual filesystem. + +If \code{strip} is greater than \code{0} the virtual filesystem metadata is generated +such that when mounted by webR the specified number of leading path elements +are removed. Useful for R package binaries where data files are stored in the +original \code{.tgz} file under a subdirectory. Files with fewer path name +elements than the specified amount are skipped. 
+} diff --git a/man/build.Rd b/man/build.Rd index 027e5a6..1ec5b75 100644 --- a/man/build.Rd +++ b/man/build.Rd @@ -9,7 +9,7 @@ build( out_dir = ".", remotes = NULL, dependencies = FALSE, - compress = FALSE + compress = TRUE ) } \arguments{ @@ -28,8 +28,10 @@ Use \code{NA} to install only hard dependencies whereas \code{TRUE} installs all optional dependencies as well. See \link[pkgdepends:as_pkg_dependencies]{pkgdepends::as_pkg_dependencies} for details.} -\item{compress}{Logical. If \code{TRUE}, a compressed version of the filesystem -data is included in the output. Defaults to \code{FALSE}.} +\item{compress}{When \code{TRUE}, add and compress Emscripten virtual filesystem +metadata in the resulting R package binary \code{.tgz} files. Otherwise, +\code{\link[=file_packager]{file_packager()}} is used to create uncompressed virtual filesystem images +included in the output binary package repository. Defaults to \code{TRUE}.} } \description{ Downloads and builds the \link[pkgdepends:pkg_refs]{R package references} given diff --git a/man/make_vfs_library.Rd b/man/make_vfs_library.Rd index 0381fe5..ef7e6a9 100644 --- a/man/make_vfs_library.Rd +++ b/man/make_vfs_library.Rd @@ -20,8 +20,10 @@ to \code{"./vfs"}.} \item{repo_dir}{The package repository directory. Defaults to \code{"./repo"}.} -\item{compress}{Logical. If \code{TRUE}, a compressed version of the filesystem -data is included in the output. Defaults to \code{FALSE}.} +\item{compress}{When \code{TRUE}, add and compress Emscripten virtual filesystem +metadata in the resulting R package binary \code{.tgz} files. Otherwise, +\code{\link[=file_packager]{file_packager()}} is used to create uncompressed virtual filesystem images +included in the output binary package repository. 
Defaults to \code{TRUE}.} \item{...}{ Arguments passed on to \code{\link[=make_library]{make_library}} @@ -39,6 +41,9 @@ A single filesystem image is generated using Emscripten's \code{\link[=file_pack tool and the output \code{.data} and \code{.js.metadata} filesystem image files are written to the directory \code{out_dir}. +When \code{compress} is \code{TRUE}, an additional file with extension \code{".data.gz"} is +also output containing a compressed version of the filesystem data. + The resulting image can be downloaded by webR and mounted on the Emscripten virtual filesystem as an efficient way to provide a pre-configured R library, without installing each R package individually. diff --git a/man/make_vfs_repo.Rd b/man/make_vfs_repo.Rd index 3e43386..650e226 100644 --- a/man/make_vfs_repo.Rd +++ b/man/make_vfs_repo.Rd @@ -9,8 +9,10 @@ make_vfs_repo(repo_dir = "./repo", compress = FALSE) \arguments{ \item{repo_dir}{The package repository directory. Defaults to \code{"./repo"}.} -\item{compress}{Logical. If \code{TRUE}, a compressed version of the filesystem -data is included in the output. Defaults to \code{FALSE}.} +\item{compress}{When \code{TRUE}, add and compress Emscripten virtual filesystem +metadata in the resulting R package binary \code{.tgz} files. Otherwise, +\code{\link[=file_packager]{file_packager()}} is used to create uncompressed virtual filesystem images +included in the output binary package repository. Defaults to \code{TRUE}.} } \description{ Creates an Emscripten filesystem image for each R package that exists in the @@ -19,11 +21,12 @@ package repository directory \code{repo_dir}. \details{ Each filesystem image is generated using Emscripten's \code{\link[=file_packager]{file_packager()}} tool and the output \code{.data} and \code{.js.metadata} filesystem image files are written -to the repository in the same directory as the package binary \code{.tar.gz} -files. 
+to the repository in the same directory as the package binary \code{.tgz} files. The resulting filesystem images may then be used by webR to download and -install R packages faster by mounting the \code{.data} images to the Emscripten -virtual filesystem, rather than decompressing and extracting the equivalent -\code{.tar.gz} files. +install R packages by mounting the \code{.data} images to the Emscripten virtual +filesystem. + +When \code{compress} is \code{TRUE}, an additional file with extension \code{".data.gz"} is +also output containing a compressed version of the filesystem data. } diff --git a/vignettes/mount-fs-image.Rmd b/vignettes/mount-fs-image.Rmd index f60ecdc..9231011 100644 --- a/vignettes/mount-fs-image.Rmd +++ b/vignettes/mount-fs-image.Rmd @@ -7,47 +7,82 @@ vignette: > %\VignetteEncoding{UTF-8} --- -## Introduction +The Emscripten WebAssembly (Wasm) environment provides a virtual filesystem (VFS) which supports the concept of *mounting*. With this, an entire file and directory structure can be packaged into a filesystem image, efficiently making individual files or entire R package libraries available for use in webR. -The Emscripten WebAssembly environment provides a virtual filesystem (VFS) which supports the concept of *mounting*. With this, an entire file and directory structure can be packaged into a filesystem image to be loaded and mounted at runtime by WebAssembly (Wasm) applications. We can take advantage of this interface to efficiently mount R package libraries, pre-packaged and containing potentially many related R packages, in the VFS accessible to webR. +## Create filesystem images -## Building an R package library +### Emscripten's `file_packager` tool -To build an R package library image we must first build one or more Wasm R packages using `add_pkg()`. As an example, let's build a package with a few hard dependencies. 
Ensure that you are running R in an environment with access to Wasm development tools^[See the "Setting up the WebAssembly toolchain" section in `vignette("rwasm")` for further details.], then run: +The [`file_packager`](https://emscripten.org/docs/porting/files/packaging_files.html#packaging-using-the-file-packager-tool) tool, provided by Emscripten, takes in a directory structure as input and produces a webR compatible filesystem image as output. The [`file_packager`](https://emscripten.org/docs/porting/files/packaging_files.html#packaging-using-the-file-packager-tool) tool may be invoked from the [rwasm](https://r-wasm.github.io/rwasm/) package: ```{r eval=FALSE} -rwasm::add_pkg("dplyr") +> rwasm::file_packager("./input", out_dir = ".", out_name = "output") ``` -After the build process has completed, the new `repo` directory contains a CRAN-like package repository with R packages build for Wasm. +It can also be invoked directly using its CLI^[See the [`file_packager`](https://emscripten.org/docs/porting/files/packaging_files.html#packaging-using-the-file-packager-tool) Emscripten documentation for details.], if you prefer: -Next, run the following to build an Emscripten VFS image: +```bash +$ file_packager output.data --preload ./input@/ \ + --separate-metadata --js-output=output.js +``` + +In the above examples, the files in the directory `./input` are packaged and an output filesystem image is created^[When using the `file_packager` CLI, a third file named `output.js` will also be created. If you only plan to mount the image using webR, this file may be discarded.] consisting of a data file, `output.data`, and a metadata file, `output.js.metadata`. + +To prepare for mounting the filesystem image with webR, ensure that both files have the same basename (in this example, `output`). The resulting URLs or relative paths for the two files should differ only by the file extension. 
+ +#### Compression + +Filesystem image `.data` files may optionally be `gzip` compressed prior to deployment. The file extension for compressed filesystem images should be `.data.gz`, and compression should be indicated by setting the property `gzip: true` on the metadata JSON stored in the `.js.metadata` file. + +**NOTE**: Loading compressed VFS images requires at least version 0.4.1 of webR. + +### Mount `.tar` archives as a filesystem image + +Archives in `.tar` format, optionally gzip compressed as `.tar.gz` or `.tgz` files, can also be used as filesystem images by pre-processing the `.tar` archive using the `rwasm::add_tar_index()` function. The function reads archive contents and appends the required filesystem metadata to the end of the `.tar` archive data in a way that is understood by webR. For further information about the format see the [Technical details for .tar archive metadata](tar-metadata.html) article. ```{r eval=FALSE} -rwasm::make_vfs_library() +> rwasm::add_tar_index("./path/to/archive.tar.gz") +# Appending virtual filesystem metadata for: ./path/to/archive.tar.gz ``` -By default, this function will create a new directory named `vfs` if it does not exist. The files `vfs/library.data` and `vfs/library.js.metadata` together form an Emscripten filesystem image containing an R package library consisting of all the packages previously added to the CRAN-like repository in `repo` using `add_pkg()`. +Once processed by `rwasm::add_tar_index()`, the `.tar` archive can be deployed and used directly as a filesystem image. -### Packaging arbitrary data +## Mounting filesystem images -It is also possible to package an arbitrary data directory as an Emscripten filesystem image using the `file_packager()` function: +When running in a web browser, the [`webr::mount()`](https://docs.r-wasm.org/webr/latest/api/r.qmd#mount) function downloads and mounts a filesystem image from a URL source, using the `WORKERFS` filesystem type. 
```{r eval=FALSE} -rwasm::file_packager("./some/data/directory", out_name = "output_image.data") +webr::mount( + mountpoint = "/data", + source = "https://example.com/output.data" +) ``` -Again, this function writes output filesystem images to the `vfs` directory by default. +Filesystem images should be deployed to static file hosting^[e.g. GitHub Pages, Netlify, AWS S3, etc.] and the resulting URL provided as the source argument. The image will be mounted in the virtual filesystem under the path given by the `mountpoint` argument. If the `mountpoint` directory does not exist, it will be created prior to mounting. -### Compression +When running under Node.js, the source may also be provided as a relative path to a filesystem image on disk. -The `add_pkg()`, `make_vfs_library()`, `file_packager()` and other related functions support the `compression` argument. The default value is `FALSE`, but when `TRUE` VFS images will be `gzip` compressed for deployment. For some types of package content, the savings in file size with compression can be significant. +To test filesystem images before deployment, serve them using a local static webserver. See the Local Testing section below for an example using `httpuv::runStaticServer()` in R. -**NOTE**: Loading compressed VFS images requires at least version 0.4.1 of webR. +## Building an R package library image -## Mounting filesystem images +Multiple R packages can be collected and bundled into a single filesystem image for mounting. + +To build an R package library image we must first build one or more Wasm R packages using `add_pkg()`. As an example, let's build a package with a few hard dependencies.
Ensure that you are running R in an environment with access to Wasm development tools^[See the "Setting up the WebAssembly toolchain" section in `vignette("rwasm")` for further details.], then run: + +```{r eval=FALSE} +rwasm::add_pkg("dplyr") +``` + +After the build process has completed, the new `repo` directory contains a CRAN-like package repository with R packages built for Wasm. -The filesystem image(s) should now be hosted by a web server so that it is available at some URL. Such a URL can then be passed to `webr::mount()` to be made available on the virtual filesystem for the Wasm R process. +Next, run the following to build an Emscripten VFS image: + +```{r eval=FALSE} +rwasm::make_vfs_library() +``` + +By default, this function will create a new directory named `vfs` if it does not exist. The files `vfs/library.data` and `vfs/library.js.metadata` together form an Emscripten filesystem image containing an R package library consisting of all the packages previously added to the CRAN-like repository in `repo` using `add_pkg()`. ### Local testing @@ -92,7 +127,3 @@ library(dplyr) #> #> intersect, setdiff, setequal, union ``` - -### Deployment - -The filesystem image files should be deployed to the static file hosting service of your choice, so that they are available for download anywhere. See the "Deployment to static hosting" section in `vignette("rwasm")` for an example of how to host static files with GitHub pages, substituting the `repo` directory for the `vfs` directory containing Emscripten filesystem images. diff --git a/vignettes/mount-host-dir.Rmd b/vignettes/mount-host-dir.Rmd index 8b13e75..d40ec82 100644 --- a/vignettes/mount-host-dir.Rmd +++ b/vignettes/mount-host-dir.Rmd @@ -11,6 +11,8 @@ vignette: > When running under Node.js, the Emscripten WebAssembly environment can make available the contents of a directory on the host filesystem.
In addition to providing webR access to external data files, a pre-prepared R package library can be mounted from the host filesystem. This avoids the need to download potentially large R packages or filesystem images over the network. +See the [webR documentation for more details](https://docs.r-wasm.org/webr/latest/mounting.html#mount-an-existing-host-directory) on mounting host directories under Node.js. + ## Building an R package library To build an R package library, we must first build one or more Wasm R packages using `add_pkg()`. As an example, let's build a package with a few hard dependencies. Ensure that you are running R in an environment with access to Wasm development tools^[See the "Setting up the WebAssembly toolchain" section in `vignette("rwasm")` for further details.], then run: diff --git a/vignettes/tar-metadata.Rmd b/vignettes/tar-metadata.Rmd new file mode 100644 index 0000000..5052469 --- /dev/null +++ b/vignettes/tar-metadata.Rmd @@ -0,0 +1,42 @@ +--- +title: "Technical details for .tar archive metadata" +output: rmarkdown::html_document +vignette: > + %\VignetteIndexEntry{Technical details for .tar archive metadata} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +The `rwasm::add_tar_index()` function appends Emscripten filesystem metadata to an (optionally gzip compressed) `.tar` archive. +The resulting output can be directly mounted by webR to the virtual filesystem, making the content of the archive available to the WebAssembly R process. + +See the [Mounting filesystem images](mount-fs-image.html) article for more information about mounting filesystem images. + +## Filesystem metadata + +Virtual filesystem metadata is a JavaScript object, encoded as a JSON string. The format is defined and output by Emscripten's `file_packager` tool and understood by [webR's mounting API](mount-fs-image.html). 
The metadata object gives the location of each file in the archive to be mounted, and takes the following format: + +```javascript +{ + files: { + filename: string; + start: number; + end: number; + }[], +}; +``` + +## Archive data layout + +A `.tar` archive that can be directly mounted by webR includes filesystem metadata as a file named `.vfs-index.json` at the top level of the archive. The `.tar` archive may also include a "metadata hint" at the very end of the file, after the end-of-archive marker. Appending additional hint data is optional, but allows for more efficient mounting of archive contents to the virtual filesystem. + +The resulting `.tar` file may be gzip compressed, with file extension `.tar.gz` or `.tgz`. + +| Field | Size | Description | +|-|---|-------------| +| 0 | Variable | Standard `.tar` data, including the end-of-archive marker. | +| 1 | 4 bytes | Magic bytes: The string `"webR"`, UTF8 encoded (`0x77656252`). | +| 2 | 4 bytes | Reserved, currently `0x00000000`. | +| 3 | 4 bytes | Offset of `.vfs-index.json`, in units of 512-byte blocks. Signed integer, big endian. | +| 4 | 4 bytes | Length of `.vfs-index.json`, in bytes. Signed integer, big endian. | +Table: Data layout for a `.tar` archive containing filesystem metadata.