Add function to append filesystem metadata to a given tar archive (#40

) * Add and export make_tar_index function * Use `make_tar_index()` when building R packages * Update NEWS.md * Append VFS metadata to R package .tgz output * Improve VFS metadata encoding in .tgz file * Rename make_tar_index to add_tar_index * Update pkgdown documentation * Explicitly write metadata values as integer type * Embed filesystem metadata as a tar entry * Set highest compression level when repacking tar * Early exit tar processing on existing metadata * Deal with hard and symbolic links in tar indexing
r-wasm · Sep 11, 2024 · 9d25615 · 9d25615
1 parent 478609e
commit 9d25615
Show file tree

Hide file tree

Showing 18 changed files with 451 additions and 67 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -3,6 +3,7 @@
 export(add_list)
 export(add_pkg)
 export(add_repo)
+export(add_tar_index)
 export(build)
 export(file_packager)
 export(make_library)

diff --git a/NEWS.md b/NEWS.md
@@ -1,6 +1,12 @@
 # rwasm (development version)
 
-* Support for a new `compression` argument in `build()`, `add_pkg()`, `make_vfs_library()`, and other related functions. When enabled, VFS images will be compressed using `gzip`. Note: Loading compressed VFS images requires at least version 0.4.1 of webR (#39).
+## New features
+
+* When building R packages with `compress` set to `TRUE`, use the binary R package `.tgz` file for the Emscripten filesystem image data and generate custom metadata rather than using Emscripten's `file_packager` tool.
+
+* Support for a new `compress` argument in `file_packager()`, `make_vfs_library()`, and other related functions. When enabled, VFS images will be compressed using `gzip` (#39).
+
+Note: Mounting processed `.tgz` archives or compressed VFS images requires at least version 0.4.2 of webR.
 
 # rwasm 0.1.0
 

diff --git a/R/build.R b/R/build.R
@@ -18,7 +18,7 @@ build <- function(packages,
                   out_dir = ".",
                   remotes = NULL,
                   dependencies = FALSE,
-                  compress = FALSE) {
+                  compress = TRUE) {
   tmp_dir <- tempfile()
   on.exit(unlink(tmp_dir, recursive = TRUE))
   dir.create(tmp_dir)
@@ -215,16 +215,21 @@ wasm_build <- function(pkg, tarball_path, contrib_bin, compress) {
   bin_dest <- fs::path(contrib_bin, paste0(pkg, "_", bin_ver, ".tgz"))
   fs::file_copy(bin_path, bin_dest, overwrite = TRUE)
 
-  # Build an Emscripten filesystem image for the package
-  tmp_bin_dir <- fs::path(tempfile())
-  on.exit(unlink(tmp_bin_dir, recursive = TRUE), add = TRUE)
-  untar(bin_dest, exdir = tmp_bin_dir)
-  file_packager(
-    fs::dir_ls(tmp_bin_dir)[[1]],
-    contrib_bin,
-    fs::path_file(bin_dest),
-    compress
-  )
+  if (compress) {
+    # Use binary .tgz file to build Emscripten filesystem image metadata
+    add_tar_index(bin_dest, strip = 1)
+  } else {
+    # Build an uncompressed Emscripten filesystem image for the package
+    tmp_bin_dir <- fs::path(tempfile())
+    on.exit(unlink(tmp_bin_dir, recursive = TRUE), add = TRUE)
+    untar(bin_dest, exdir = tmp_bin_dir)
+    file_packager(
+      fs::dir_ls(tmp_bin_dir)[[1]],
+      contrib_bin,
+      fs::path_file(bin_dest),
+      compress = FALSE
+    )
+  }
 
   invisible(NULL)
 }
diff --git a/R/lib.R b/R/lib.R
@@ -48,13 +48,14 @@ make_library <- function(repo_dir = "./repo", lib_dir = "./lib", strip = NULL) {
 #'
 #' Each filesystem image is generated using Emscripten's [file_packager()] tool
 #' and the output `.data` and `.js.metadata` filesystem image files are written
-#' to the repository in the same directory as the package binary `.tar.gz`
-#' files.
+#' to the repository in the same directory as the package binary `.tgz` files.
 #'
 #' The resulting filesystem images may then be used by webR to download and
-#' install R packages faster by mounting the `.data` images to the Emscripten
-#' virtual filesystem, rather than decompressing and extracting the equivalent
-#' `.tar.gz` files.
+#' install R packages by mounting the `.data` images to the Emscripten virtual
+#' filesystem.
+#'
+#' When `compress` is `TRUE`, an additional file with extension `".data.gz"` is
+#' also output containing a compressed version of the filesystem data.
 #'
 #' @inheritParams add_pkg
 #'
@@ -100,6 +101,9 @@ make_vfs_repo <- function(repo_dir = "./repo", compress = FALSE) {
 #' tool and the output `.data` and `.js.metadata` filesystem image files are
 #' written to the directory `out_dir`.
 #'
+#' When `compress` is `TRUE`, an additional file with extension `".data.gz"` is
+#' also output containing a compressed version of the filesystem data.
+#'
 #' The resulting image can be downloaded by webR and mounted on the Emscripten
 #' virtual filesystem as an efficient way to provide a pre-configured R library,
 #' without installing each R package individually.

diff --git a/R/repo.R b/R/repo.R
@@ -76,20 +76,22 @@ add_list <- function(list_file, ...) {
 #'   source. Defaults to `NA`, meaning prefer a built-in list of references to
 #'   packages pre-modified for use with webR.
 #' @param dependencies Dependency specification for packages to additionally
-#' add to the repository. Defaults to `FALSE`, meaning no additional packages.
-#' Use `NA` to install only hard dependencies whereas `TRUE` installs all
-#' optional dependencies as well. See [pkgdepends::as_pkg_dependencies]
-#' for details.
-#' @inheritParams file_packager
-#'
+#'   add to the repository. Defaults to `FALSE`, meaning no additional packages.
+#'   Use `NA` to install only hard dependencies whereas `TRUE` installs all
+#'   optional dependencies as well. See [pkgdepends::as_pkg_dependencies]
+#'   for details.
+#' @param compress When `TRUE`, add and compress Emscripten virtual filesystem
+#'   metadata in the resulting R package binary `.tgz` files. Otherwise,
+#'   [file_packager()] is used to create uncompressed virtual filesystem images
+#'   included in the output binary package repository. Defaults to `TRUE`.
 #' @importFrom dplyr rows_update select
 #' @importFrom pkgdepends new_pkg_download_proposal
 #' @export
 add_pkg <- function(packages,
                     repo_dir = "./repo",
                     remotes = NA,
                     dependencies = FALSE,
-                    compress = FALSE) {
+                    compress = TRUE) {
   # Set up pkgdepends configuration
   config <- ppm_config
   config$dependencies <- dependencies
@@ -185,7 +187,7 @@ prefer_remotes <- function(package_info, remotes = NA) {
 update_repo <- function(package_info,
                         remotes = NA,
                         repo_dir = "./repo",
-                        compress = FALSE) {
+                        compress = TRUE) {
   r_version <- R_system_version(getOption("rwasm.webr_version"))
 
   writeLines(sprintf("Processing %d package(s).", nrow(package_info)))

diff --git a/R/tar.R b/R/tar.R
@@ -0,0 +1,229 @@
+#' Add Emscripten virtual filesystem metadata to a given `tar` archive
+#'
+#' Calculates file offsets and other metadata for content stored in an
+#' (optionally gzip compressed) `tar` archive. Once added, the `tar` archive
+#' with metadata can be mounted as an Emscripten filesystem image, making the
+#' contents of the archive available to the WebAssembly R process.
+#'
+#' The virtual filesystem metadata is appended to the end of the `tar` archive,
+#' with the output replacing the original file. The resulting archive should be
+#' hosted online so that its URL can be provided to webR for mounting on the
+#' virtual filesystem.
+#'
+#' If `strip` is greater than `0` the virtual filesystem metadata is generated
+#' such that when mounted by webR the specified number of leading path elements
+#' are removed. Useful for R package binaries where data files are stored in the
+#' original `.tgz` file under a subdirectory. Files with fewer path name
+#' elements than the specified amount are skipped.
+#'
+#' @param file Filename of the `tar` archive for which metadata is to be added.
+#' @param strip Remove the specified number of leading path elements when
+#'   mounting with webR. Defaults to `0`.
+#' @export
+add_tar_index <- function(file, strip = 0) {
+  file <- fs::path_norm(file)
+  file_ext <- tolower(fs::path_ext(file))
+  file_base <- fs::path_ext_remove(file)
+
+  message(paste("Appending virtual filesystem metadata for:", file))
+
+  # Check if our tar is compatible
+  if (!any(file_ext == c("tgz", "gz", "tar"))) {
+    stop(paste0("Can't make index for \"", file,
+      "\". Only uncompressed or `gzip` compressed tar files can be indexed."))
+  }
+
+  # Handle two-component extensions
+  if (file_ext == "gz") {
+    file_base <- fs::path_ext_remove(file_base)
+  }
+
+  # Read archive contents, decompressing if necessary
+  gzip <- any(file_ext == c("tgz", "gz"))
+  data <- readBin(file, "raw", n = file.size(file))
+  if (gzip) {
+    data <- memDecompress(data)
+  }
+
+  # Build metadata from source .tar file
+  con <- rawConnection(data, open = "rb")
+  on.exit(close(con), add = TRUE)
+  entries <- read_tar_offsets(con, strip)
+  tar_end <- seek(con)
+
+  metadata <- list(
+    files = entries,
+    gzip = gzip,
+    remote_package_size = length(data)
+  )
+
+  # Add metadata as additional .tar entry
+  entry <- create_metadata_entry(metadata)
+  json_block <- as.integer(tar_end / 512) + 1L
+
+  # Append additional metadata hint for webR
+  magic <- charToRaw('webR')
+  reserved <- raw(4) # reserved for future use
+  block <- writeBin(json_block, raw(), size = 4, endian = "big")
+  len <- writeBin(entry$length, raw(), size = 4, endian = "big")
+  hint <- c(magic, reserved, block, len)
+
+  # Build new .tar archive data
+  data <- c(data[1:tar_end], entry$data, raw(1024), hint)
+
+  # Write output and move into place
+  out <- tempfile()
+  out_con <- if (gzip) {
+    gzfile(out, open = "wb", compression = 9)
+  } else {
+    file(out, open = "wb")
+  }
+  writeBin(data, out_con, size = 1L)
+  close(out_con)
+  fs::file_copy(out, file, overwrite = TRUE)
+}
+
+create_metadata_entry <- function(metadata) {
+  # metadata contents
+  json <- charToRaw(jsonlite::toJSON(metadata, auto_unbox = TRUE))
+  len <- length(json)
+  blocks <- ceiling(len/512)
+  length(json) <- 512 * blocks
+
+  # entry header
+  timestamp <- as.integer(Sys.time())
+  header <- raw(512)
+  header[1:15] <- charToRaw('.vfs-index.json')               # filename
+  header[101:108] <- charToRaw('0000644 ')                   # mode
+  header[109:116] <- charToRaw('0000000 ')                   # uid
+  header[117:124] <- charToRaw('0000000 ')                   # gid
+  header[125:136] <- charToRaw(sprintf("%011o ", len))       # length
+  header[137:148] <- charToRaw(sprintf("%011o ", timestamp)) # timestamp
+  header[149:156] <- charToRaw('        ')                   # placeholder
+  header[157:157] <- charToRaw('0')                          # type
+  header[258:262] <- charToRaw('ustar')                      # ustar magic
+  header[264:265] <- charToRaw('00')                         # ustar version
+  header[266:269] <- charToRaw('root')                       # user
+  header[298:302] <- charToRaw('wheel')                      # group
+
+  # populate checksum field
+  checksum <- raw(8)
+  checksum[1:6] <- charToRaw(sprintf("%06o", sum(as.integer(header))))
+  checksum[8] <- charToRaw(' ')
+  header[149:156] <- checksum
+
+  list(data = c(header, json), length = len)
+}
+
+read_tar_offsets <- function(con, strip) {
+  entries <- list()
+  next_filename <- NULL
+
+  while (TRUE) {
+    # Read tar entry header block
+    header <- readBin(con, "raw", n = 512)
+
+    # Basic tar filename
+    filename <- rawToChar(header[1:100])
+
+    # Empty header indicates end of archive, early exit for existing metadata
+    if (all(header == 0) || filename == ".vfs-index.json") {
+      # Return connection position to just before this header
+      seek(con, -512, origin = "current")
+      break
+    }
+
+    # Entry size and offset
+    offset <- seek(con)
+    size <- strtoi(sub("\\s.*", "", rawToChar(header[125:136])), 8)
+    file_blocks <- ceiling(size / 512)
+
+    # Skip directories, global, and vendor-specific extended headers
+    type <- rawToChar(header[157])
+    if (grepl("5|g|[A-Z]", type)) {
+      next
+    }
+
+    # Handle PAX extended header
+    if (type == "x") {
+      pax_data <- readBin(con, "raw", n = 512 * ceiling(size / 512))
+      pax_data <- pax_data[1:max(which(pax_data != as.raw(0x00)))]
+      lines <- raw_split(pax_data, "\n")
+      for (line in lines) {
+        payload <- raw_split(line, " ")[[2]]
+        kv <- raw_split(payload, "=")
+        if (rawToChar(kv[[1]]) == "path") {
+          next_filename <- rawToChar(kv[[2]])
+          break
+        }
+      }
+      next
+    }
+
+    # Apply ustar formatted extended filename
+    magic <- rawToChar(header[258:263])
+    if (magic == "ustar"){
+      prefix <- rawToChar(header[346:501])
+      filename <- paste(prefix, filename, sep = "/")
+    }
+
+    # Apply PAX formatted extended filename
+    if (!is.null(next_filename)) {
+      filename <- next_filename
+      next_filename <- NULL
+    }
+
+    # Strip path elements, ignoring leading slash, skip if no path remains
+    if (strip > 0) {
+      filename <- gsub("^/", "", filename)
+      parts <- fs::path_split(filename)[[1]]
+      parts <- parts[-strip:-1]
+      if (length(parts) == 0) {
+        seek(con, 512 * file_blocks, origin = "current")
+        next
+      }
+      filename <- fs::path_join(c("/", parts))
+    }
+
+    # Calculate file offsets
+    entry <- list(filename = filename, start = offset, end = offset + size)
+
+    # Deal with hard and symbolic links
+    if (grepl("1|2", type)) {
+      link_name <- rawToChar(header[158:257])
+      if (type == "2") {
+        link_name <- fs::path_norm(fs::path(fs::path_dir(filename), link_name))
+      }
+      link_entry <- Find(\(e) e$filename == link_name, entries)
+      entry$start = link_entry$start
+      entry$end = link_entry$end
+      file_blocks <- 0
+    }
+
+    entries <- append(entries, list(entry))
+
+    # Skip to next entry header
+    seek(con, 512 * file_blocks, origin = "current")
+  }
+  entries
+}
+
+# Split the elements of a raw vector x according to matches of element `split`
+raw_split <- function(x, split) {
+  if (is.character(split)) {
+    split <- charToRaw(split)
+  }
+
+  start <- 1
+  out <- list()
+  for (end in which(x == split)) {
+    out <- c(out, list(x[start:(end - 1)]))
+    start <- end + 1
+  }
+
+  if (start <= length(x)) {
+    out <- c(out, list(x[start:length(x)]))
+  }
+
+  out
+}
diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -1,4 +1,5 @@
 url: https://r-wasm.github.io/rwasm/
 template:
   bootstrap: 5
-
+deploy:
+  install_metadata: true
diff --git a/inst/pkgdown.yml b/inst/pkgdown.yml
@@ -0,0 +1,13 @@
+pandoc: '3.2'
+pkgdown: 2.0.9.9000
+pkgdown_sha: 34ee692e4ce10c8abfb863cc782da771838558f7
+articles:
+  github-actions: github-actions.html
+  mount-fs-image: mount-fs-image.html
+  mount-host-dir: mount-host-dir.html
+  rwasm: rwasm.html
+  tar-metadata: tar-metadata.html
+last_built: 2024-09-10T15:29Z
+urls:
+  reference: https://r-wasm.github.io/rwasm/reference
+  article: https://r-wasm.github.io/rwasm/articles
diff --git a/man/add_list.Rd b/man/add_list.Rd