Skip to content

Commit

Permalink
Add functions to export and import packets to/from a zip file.
Browse files Browse the repository at this point in the history
While the ability to push and pull from remote locations that orderly
already supports works well for back-and-forth collaboration, it is not
as well suited to produce a one-time artefact that may be released
and shared publicly, without depending on a server or shared location.
One use case for this is for publishing a reproducible set of analyses
to accompany a paper.

This commit adds a pair of functions, orderly_export_zip and
orderly_import_zip. These allow a set of packets (and their transitive
dependencies) to be exported as a standalone zip file, containing both
the metadata and files. The zip file can then be imported into a
different repository.

The zip file is formatted as a metadata directory, with a file per
packet, and a content-addressed file store.
  • Loading branch information
plietar committed Mar 19, 2024
1 parent 490391b commit 4dfdccb
Show file tree
Hide file tree
Showing 10 changed files with 404 additions and 4 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ Imports:
rlang,
rstudioapi,
withr,
yaml
yaml,
zip
Suggests:
DBI,
RSQLite,
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,11 @@ export(orderly_copy_files)
export(orderly_dependency)
export(orderly_description)
export(orderly_example)
export(orderly_export_zip)
export(orderly_gitignore_update)
export(orderly_hash_data)
export(orderly_hash_file)
export(orderly_import_zip)
export(orderly_init)
export(orderly_interactive_set_search_options)
export(orderly_list_src)
Expand Down
121 changes: 121 additions & 0 deletions R/export.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
##' Export packets as a zip file.
##'
##' The packets can be imported into a different repository using the
##' [orderly2::orderly_import_zip] function.
##'
##' This is useful as a one-time way to publish your results, for example
##' as an artefact accompanying a paper. For back-and-forth collaboration,
##' a shared location should be privileged.
##'
##' @param path the path where the zip file will be created
##'
##' @param packets One or more packets to export
##'
##' @inheritParams orderly_metadata
##'
##' @return Nothing
##' @export
orderly_export_zip <- function(path, packets, root = NULL, locate = TRUE) {
  root <- root_open(root, locate = locate, require_orderly = FALSE,
                    call = environment())

  ## Expand the requested packets to their transitive dependency closure,
  ## then collect every file hash those packets reference.
  metadata <- root$index$data()$metadata
  ids <- find_all_dependencies(packets, metadata)
  hashes <- find_all_files(ids, metadata)

  ## Stage the archive layout in a temporary directory (cleaned up on
  ## exit): one metadata file per packet plus a content-addressed store.
  staging <- withr::local_tempfile()
  fs::dir_create(c(staging, file.path(staging, "metadata")))
  store <- file_store$new(file.path(staging, "files"))

  fs::file_copy(
    file.path(root$path, ".outpack", "metadata", ids),
    file.path(staging, "metadata", ids))

  for (hash in hashes) {
    store$put(find_file_by_hash(root, hash), hash)
  }

  ## Resolve to an absolute path: zip::zip() resolves relative paths
  ## against its `root` working directory, not the caller's.
  zip::zip(fs::path_abs(path), root = staging, files = c("metadata", "files"))
  invisible()
}

##' Import packets from a zip file.
##'
##' The zip file is expected to contain a `metadata/` directory with one
##' file per packet and a content-addressed `files/` store, as produced by
##' [orderly2::orderly_export_zip].
##'
##' @param path the path to the zip file to be imported.
##'
##' @inheritParams orderly_metadata
##'
##' @return Invisibly, the IDs of the imported packets
##' @export
orderly_import_zip <- function(path, root = NULL, locate = TRUE) {
  root <- root_open(root, locate = locate, require_orderly = FALSE,
                    call = environment())
  index <- root$index$data()

  hash_algorithm <- root$config$core$hash_algorithm

  ## Extract the archive into a temporary directory, removed on exit.
  src <- withr::local_tempfile()
  zip::unzip(path, exdir = src)
  store <- file_store$new(file.path(src, "files"))

  ## Packet ids are simply the file names in the metadata directory.
  ids <- dir(file.path(src, "metadata"))

  # TODO: is using the root's hash algorithm correct? What if the origin had
  # used a different hash, now there are two hashes for the same packet. We
  # don't record the hash algorithm anywhere in the zip files, maybe we should.
  metadata_hashes <- hash_metadata_files(
    file.path(src, "metadata", ids), hash_algorithm)

  ## known_packets: metadata already present in this repository.
  ## missing_packets: not yet listed as unpacked locally.
  known_packets <- ids %in% names(index$metadata)
  missing_packets <- !(ids %in% index$unpacked)

  ## Refuse to import if any already-known packet's incoming metadata hash
  ## disagrees with the hash this repository recorded for it.
  import_check_hashes(src, ids[known_packets], metadata_hashes[known_packets],
                      root, call = environment())

  fs::file_copy(
    file.path(src, "metadata", ids[!known_packets]),
    file.path(root$path, ".outpack", "metadata", ids[!known_packets]))

  if (root$config$core$use_file_store) {
    # The index needs reloading to take into account the new metadata we just
    # pulled.
    index <- root$index$data()
    files <- find_all_files(ids, index$metadata)
    ## Only copy files the local store does not already hold.
    files <- files[!root$files$exists(files)]
    for (hash in files) {
      file_path <- store$get(hash, root$files$tmp(), overwrite = FALSE)
      root$files$put(file_path, hash, move = TRUE)
    }
  }

  for (i in which(missing_packets)) {
    if (!is.null(root$config$core$path_archive)) {
      location_pull_files_archive(ids[[i]], store, root)
    }
    ## NOTE(review): `local` is assumed to be a package-level constant
    ## naming the "local" location -- confirm it is defined elsewhere.
    mark_packet_known(ids[[i]], local, metadata_hashes[[i]], Sys.time(), root)
  }

  invisible(ids)
}

## Check that metadata being imported does not conflict with metadata this
## repository already knows about.
##
## For every packet id that exists both in the zip and locally, the hash of
## the incoming metadata file must match the hash recorded in this
## repository's location index; a mismatch means the two disagree about the
## packet's contents, and we refuse to import.
##
## @param src the unpacked zip directory (currently unused; kept so the
##   signature stays stable for callers)
## @param ids packet ids present both in the zip and in this repository
## @param hashes hashes of the incoming metadata files, parallel to `ids`
## @param root the opened outpack root
## @param call caller environment, used for error reporting
import_check_hashes <- function(src, ids, hashes, root, call) {
  index <- root$index$data()

  hash_here <- index$location$hash[match(ids, index$location$packet)]
  ## NOTE(review): if an id is known in index$metadata but absent from
  ## index$location, `hash_here` is NA, the comparison propagates NA and
  ## `if (any(err))` would error. Assumed unreachable -- confirm.
  err <- hashes != hash_here
  if (any(err)) {
    cli::cli_abort(
      c("Imported file has conflicting metadata",
        x = paste("This is {.strong really} bad news. The zip file contains",
                  "packets with a different hash than the metadata already in",
                  "this repository. I'm not going to import this new metadata",
                  "but there's no guarantee that the older metadata is",
                  "actually what you want!"),
        i = "Conflicts for: {squote(ids[err])}",
        i = "We would be interested in this case, please let us know"),
      call = call)
  }
  invisible()
}
4 changes: 1 addition & 3 deletions R/location.R
Original file line number Diff line number Diff line change
Expand Up @@ -676,10 +676,8 @@ location_build_push_plan <- function(packet_id, location_name, root) {
files_msg <- character(0)
} else {
packet_id_msg <- sort(packet_id_msg)
metadata <- metadata
## All files across all missing ids:
files <- unique(unlist(
lapply(packet_id_msg, function(i) metadata[[i]]$files$hash)))
files <- find_all_files(packet_id_msg, metadata)

## Which of these does the server not know about:
files_msg <- driver$list_unknown_files(files)
Expand Down
8 changes: 8 additions & 0 deletions R/outpack_hash.R
Original file line number Diff line number Diff line change
Expand Up @@ -112,3 +112,11 @@ rehash <- function(data, hash_function, expected) {
algorithm <- hash_parse(expected)$algorithm
hash_function(data, algorithm)
}

## Hash a vector of metadata files. Leading and trailing newline
## characters are ignored, so the hash reflects the trimmed content.
## Returns a character vector of hashes, one per input path.
hash_metadata_files <- function(path, hash_algorithm) {
  hash_one <- function(p) hash_data(read_string(p), hash_algorithm)
  vcapply(path, hash_one)
}
1 change: 1 addition & 0 deletions R/outpack_insert.R
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ outpack_insert_packet <- function(path, json, root = NULL) {
## TODO: once we get more flexible remotes, this will get moved into
## its own thing.
hash <- hash_data(json, hash_algorithm)

time <- Sys.time()
mark_packet_known(id, local, hash, time, root)
}
Expand Down
4 changes: 4 additions & 0 deletions R/outpack_misc.R
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@ find_all_dependencies <- function(id, metadata) {
sort(ret)
}

## Collect the file hashes referenced by a set of packets, with any
## overlap between packets removed.
find_all_files <- function(id, metadata) {
  per_packet <- lapply(id, function(i) metadata[[i]]$files$hash)
  unique(unlist(per_packet))
}

validate_parameters <- function(parameters, call) {
if (is.null(parameters) || length(parameters) == 0) {
Expand Down
37 changes: 37 additions & 0 deletions man/orderly_export_zip.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

29 changes: 29 additions & 0 deletions man/orderly_import_zip.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 4dfdccb

Please sign in to comment.