diff --git a/DESCRIPTION b/DESCRIPTION
index 131cf5a9..79becd23 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -24,7 +24,8 @@ Imports:
     rlang,
     rstudioapi,
     withr,
-    yaml
+    yaml,
+    zip
 Suggests:
     DBI,
     RSQLite,
diff --git a/NAMESPACE b/NAMESPACE
index 4c96b240..205924ad 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -12,9 +12,11 @@ export(orderly_copy_files)
 export(orderly_dependency)
 export(orderly_description)
 export(orderly_example)
+export(orderly_export_zip)
 export(orderly_gitignore_update)
 export(orderly_hash_data)
 export(orderly_hash_file)
+export(orderly_import_zip)
 export(orderly_init)
 export(orderly_interactive_set_search_options)
 export(orderly_list_src)
diff --git a/R/export.R b/R/export.R
new file mode 100644
index 00000000..d19be5d9
--- /dev/null
+++ b/R/export.R
@@ -0,0 +1,121 @@
+##' Export packets as a zip file.
+##'
+##' The packets can be imported into a different repository using the
+##' [orderly2::orderly_import_zip] function.
+##'
+##' This is useful as a one-time way to publish your results, for example as
+##' an artefact accompanying a paper. For back-and-forth collaboration, a
+##' shared location should be preferred.
+##'
+##' @param path The path where the zip file will be created.
+##'
+##' @param packets One or more packets to export.
+##'
+##' @inheritParams orderly_metadata
+##'
+##' @return Nothing
+##' @export
+orderly_export_zip <- function(path, packets, root = NULL, locate = TRUE) {
+  root <- root_open(root, locate = locate, require_orderly = FALSE,
+                    call = environment())
+
+  metadata <- root$index$data()$metadata
+  packets <- find_all_dependencies(packets, metadata)
+  files <- find_all_files(packets, metadata)
+
+  dest <- withr::local_tempfile()
+  fs::dir_create(dest)
+  fs::dir_create(file.path(dest, "metadata"))
+  store <- file_store$new(file.path(dest, "files"))
+
+  fs::file_copy(
+    file.path(root$path, ".outpack", "metadata", packets),
+    file.path(dest, "metadata", packets))
+
+  for (hash in files) {
+    store$put(find_file_by_hash(root, hash), hash)
+  }
+
+  zip::zip(fs::path_abs(path), root = dest, files = c("metadata", "files"))
+  invisible()
+}
+
+##' Import packets from a zip file.
+##'
+##' @param path The path to the zip file to be imported.
+##'
+##' @inheritParams orderly_metadata
+##'
+##' @return Invisibly, the IDs of the imported packets
+##' @export
+orderly_import_zip <- function(path, root = NULL, locate = TRUE) {
+  root <- root_open(root, locate = locate, require_orderly = FALSE,
+                    call = environment())
+  index <- root$index$data()
+
+  hash_algorithm <- root$config$core$hash_algorithm
+
+  src <- withr::local_tempfile()
+  zip::unzip(path, exdir = src)
+  store <- file_store$new(file.path(src, "files"))
+
+  ids <- dir(file.path(src, "metadata"))
+
+  ## TODO: is using the root's hash algorithm correct? If the origin used a
+  ## different algorithm, there would be two hashes for the same packet. We
+  ## don't record the hash algorithm anywhere in the zip files; maybe we should.
+  metadata_hashes <- hash_metadata_files(
+    file.path(src, "metadata", ids), hash_algorithm)
+
+  known_packets <- ids %in% names(index$metadata)
+  missing_packets <- !(ids %in% index$unpacked)
+
+  import_check_hashes(src, ids[known_packets], metadata_hashes[known_packets],
+                      root, call = environment())
+
+  fs::file_copy(
+    file.path(src, "metadata", ids[!known_packets]),
+    file.path(root$path, ".outpack", "metadata", ids[!known_packets]))
+
+  if (root$config$core$use_file_store) {
+    ## The index needs reloading to take into account the new metadata we
+    ## just pulled.
+    index <- root$index$data()
+    files <- find_all_files(ids, index$metadata)
+    files <- files[!root$files$exists(files)]
+    for (hash in files) {
+      file_path <- store$get(hash, root$files$tmp(), overwrite = FALSE)
+      root$files$put(file_path, hash, move = TRUE)
+    }
+  }
+
+  for (i in which(missing_packets)) {
+    if (!is.null(root$config$core$path_archive)) {
+      location_pull_files_archive(ids[[i]], store, root)
+    }
+    mark_packet_known(ids[[i]], local, metadata_hashes[[i]], Sys.time(), root)
+  }
+
+  invisible(ids)
+}
+
+import_check_hashes <- function(src, ids, hashes, root, call) {
+  index <- root$index$data()
+  hash_algorithm <- root$config$core$hash_algorithm
+
+  hash_here <- index$location$hash[match(ids, index$location$packet)]
+  err <- hashes != hash_here
+  if (any(err)) {
+    cli::cli_abort(
+      c("Imported file has conflicting metadata",
+        x = paste("This is {.strong really} bad news. The zip file contains",
+                  "packets with a different hash than the metadata already in",
+                  "this repository. I'm not going to import this new metadata",
+                  "but there's no guarantee that the older metadata is",
+                  "actually what you want!"),
+        i = "Conflicts for: {squote(ids[err])}",
+        i = "We would be interested in this case; please let us know"),
+      call = call)
+  }
+  invisible()
+}
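
A minimal usage sketch of the export side (not a definitive recipe; the paths are hypothetical, and orderly_search("latest") is just one way to pick an id, assuming a source root with at least one completed packet):

    library(orderly2)
    # Pick the most recent packet in a hypothetical source root.
    id <- orderly_search("latest", root = "path/to/source")
    orderly_export_zip("results.zip", id, root = "path/to/source")
    # results.zip now contains metadata/ plus a content-addressed files/
    # store covering the packet and all of its transitive dependencies.
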
diff --git a/R/location.R b/R/location.R
index 820af691..62b6461b 100644
--- a/R/location.R
+++ b/R/location.R
@@ -676,10 +676,8 @@ location_build_push_plan <- function(packet_id, location_name, root) {
     files_msg <- character(0)
   } else {
     packet_id_msg <- sort(packet_id_msg)
-    metadata <- metadata
     ## All files across all missing ids:
-    files <- unique(unlist(
-      lapply(packet_id_msg, function(i) metadata[[i]]$files$hash)))
+    files <- find_all_files(packet_id_msg, metadata)
 
     ## Which of these does the server not know about:
     files_msg <- driver$list_unknown_files(files)
diff --git a/R/outpack_hash.R b/R/outpack_hash.R
index df1e98ea..f135cae6 100644
--- a/R/outpack_hash.R
+++ b/R/outpack_hash.R
@@ -112,3 +112,11 @@ rehash <- function(data, hash_function, expected) {
  algorithm <- hash_parse(expected)$algorithm
  hash_function(data, algorithm)
 }
+
+## Metadata files are hashed ignoring any leading and trailing newline
+## characters.
+hash_metadata_files <- function(path, hash_algorithm) {
+  vcapply(path, function(p) {
+    hash_data(read_string(p), hash_algorithm)
+  })
+}
diff --git a/R/outpack_insert.R b/R/outpack_insert.R
index e9ab53e6..c13c74d3 100644
--- a/R/outpack_insert.R
+++ b/R/outpack_insert.R
@@ -39,6 +39,7 @@ outpack_insert_packet <- function(path, json, root = NULL) {
   ## TODO: once we get more flexible remotes, this will get moved into
   ## its own thing.
   hash <- hash_data(json, hash_algorithm)
+  time <- Sys.time()
   mark_packet_known(id, local, hash, time, root)
 }
 
diff --git a/R/outpack_misc.R b/R/outpack_misc.R
index 1270804f..97933332 100644
--- a/R/outpack_misc.R
+++ b/R/outpack_misc.R
@@ -63,6 +63,10 @@ find_all_dependencies <- function(id, metadata) {
   sort(ret)
 }
 
+## Get all the files used by a set of packets, with any duplicates removed.
+find_all_files <- function(id, metadata) {
+  unique(unlist(lapply(id, function(i) metadata[[i]]$files$hash)))
+}
 
 validate_parameters <- function(parameters, call) {
   if (is.null(parameters) || length(parameters) == 0) {
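
find_all_files() just flattens the per-packet hash lists and drops duplicates, which is what makes the file store in the zip deduplicated. A minimal sketch with a hand-built metadata list of the same shape the function reads:

    metadata <- list(
      a = list(files = list(hash = "sha256:aaa")),
      b = list(files = list(hash = c("sha256:aaa", "sha256:bbb"))))
    find_all_files(c("a", "b"), metadata)
    #> [1] "sha256:aaa" "sha256:bbb"   # the shared hash appears only once
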
diff --git a/man/orderly_export_zip.Rd b/man/orderly_export_zip.Rd
new file mode 100644
index 00000000..a207b1ea
--- /dev/null
+++ b/man/orderly_export_zip.Rd
@@ -0,0 +1,37 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/export.R
+\name{orderly_export_zip}
+\alias{orderly_export_zip}
+\title{Export packets as a zip file.}
+\usage{
+orderly_export_zip(path, packets, root = NULL, locate = TRUE)
+}
+\arguments{
+\item{path}{The path where the zip file will be created.}
+
+\item{packets}{One or more packets to export.}
+
+\item{root}{The path to the root directory, or \code{NULL} (the
+default) to search for one from the current working directory if
+\code{locate} is \code{TRUE}. This function does not require that the
+directory is configured for orderly, and can be any \code{outpack}
+root (see \link{orderly_init} for details).}
+
+\item{locate}{Logical, indicating if the root should be searched
+for. If \code{TRUE}, then we look in the directory given for \code{root}
+(or the working directory if \code{NULL}) and then up through its
+parents until we find an \code{.outpack} directory or
+\code{orderly_config.yml}.}
+}
+\value{
+Nothing
+}
+\description{
+The packets can be imported into a different repository using the
+\link{orderly_import_zip} function.
+}
+\details{
+This is useful as a one-time way to publish your results, for example as
+an artefact accompanying a paper. For back-and-forth collaboration, a
+shared location should be preferred.
+}
diff --git a/man/orderly_import_zip.Rd b/man/orderly_import_zip.Rd
new file mode 100644
index 00000000..bd9c66cb
--- /dev/null
+++ b/man/orderly_import_zip.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/export.R
+\name{orderly_import_zip}
+\alias{orderly_import_zip}
+\title{Import packets from a zip file.}
+\usage{
+orderly_import_zip(path, root = NULL, locate = TRUE)
+}
+\arguments{
+\item{path}{The path to the zip file to be imported.}
+
+\item{root}{The path to the root directory, or \code{NULL} (the
+default) to search for one from the current working directory if
+\code{locate} is \code{TRUE}. This function does not require that the
+directory is configured for orderly, and can be any \code{outpack}
+root (see \link{orderly_init} for details).}
+
+\item{locate}{Logical, indicating if the root should be searched
+for. If \code{TRUE}, then we look in the directory given for \code{root}
+(or the working directory if \code{NULL}) and then up through its
+parents until we find an \code{.outpack} directory or
+\code{orderly_config.yml}.}
+}
+\value{
+Invisibly, the IDs of the imported packets
+}
+\description{
+Import packets from a zip file.
+}
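
On the receiving side the whole import is one call. A minimal sketch, assuming results.zip was produced as in the earlier sketch and a second root exists at the hypothetical path/to/destination:

    ids <- orderly_import_zip("results.zip", root = "path/to/destination")
    # Returns, invisibly, the ids now present in the destination root;
    # re-importing the same zip is a no-op and returns the same ids.
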
diff --git a/tests/testthat/test-export.R b/tests/testthat/test-export.R
new file mode 100644
index 00000000..36d3a760
--- /dev/null
+++ b/tests/testthat/test-export.R
@@ -0,0 +1,199 @@
+export_info <- function(path) {
+  listing <- zip::zip_list(path)$filename
+
+  metadata <- grep("^metadata/.*[^/]$", listing, value = TRUE)
+  metadata <- sub("^metadata/", "", metadata)
+
+  files <- grep("^files/.*[^/]$", listing, value = TRUE)
+  files <- sub("^files/", "", files)
+
+  list(metadata = metadata, files = files)
+}
+
+test_that("Exporting a packet includes its transitive dependencies", {
+  root <- create_temporary_root()
+  ids <- create_random_packet_chain(root, 3)
+  other <- create_random_packet(root)
+
+  path <- tempfile()
+  orderly_export_zip(path, ids[[3]], root = root)
+
+  info <- export_info(path)
+  expect_setequal(info$metadata, ids)
+
+  # The root packet has one file and each downstream packet has two (a source
+  # file and a data file). Each downstream packet actually has three files,
+  # but one is a copy of its parent's data, hence deduplicated and not counted.
+  expect_equal(length(info$files), 5)
+})
+
+test_that("Can export multiple packets", {
+  root <- create_temporary_root()
+  first <- create_random_packet(root)
+  second <- create_random_packet(root)
+  ids <- c(first, second)
+
+  path <- tempfile()
+  orderly_export_zip(path, ids, root = root)
+
+  info <- export_info(path)
+  expect_setequal(info$metadata, ids)
+  expect_equal(length(info$files), 2)
+})
+
+test_that("Can export from a file store", {
+  root <- create_temporary_root(use_file_store = TRUE)
+  ids <- create_random_packet_chain(root, 3)
+
+  path <- tempfile()
+  orderly_export_zip(path, ids[[3]], root = root)
+
+  info <- export_info(path)
+  expect_setequal(info$metadata, ids)
+  expect_equal(length(info$files), 5)
+})
+
+test_that("Packet files are de-duplicated when exported", {
+  root <- create_temporary_root()
+  ids <- c(create_deterministic_packet(root), create_deterministic_packet(root))
+
+  path <- tempfile()
+  orderly_export_zip(path, ids, root = root)
+
+  info <- export_info(path)
+  expect_setequal(info$metadata, ids)
+  expect_equal(length(info$files), 1)
+})
+
+test_that("Can import a zip file", {
+  upstream <- create_temporary_root()
+  downstream <- create_temporary_root()
+
+  id <- create_random_packet(upstream)
+
+  path <- tempfile()
+  orderly_export_zip(path, id, root = upstream)
+
+  imported <- orderly_import_zip(path, root = downstream)
+  expect_equal(imported, id)
+
+  index <- downstream$index$data()
+  expect_setequal(names(index$metadata), id)
+  expect_mapequal(index$metadata, upstream$index$data()$metadata)
+  expect_setequal(index$unpacked, id)
+
+  files <- upstream$index$metadata(id)$files
+  file_paths <- file.path(downstream$path, downstream$config$core$path_archive,
+                          upstream$index$metadata(id)$name, id, files$path)
+
+  for (i in seq_along(file_paths)) {
+    expect_no_error(hash_validate_file(file_paths[[i]], files$hash[[i]]))
+  }
+})
+
+test_that("Can import a zip file to a file store", {
+  upstream <- create_temporary_root()
+  downstream <- create_temporary_root(use_file_store = TRUE)
+
+  ids <- create_random_packet_chain(upstream, 3)
+
+  path <- tempfile()
+  orderly_export_zip(path, ids[[3]], root = upstream)
+  orderly_import_zip(path, root = downstream)
+
+  index <- downstream$index$data()
+  expect_setequal(names(index$metadata), ids)
+  expect_mapequal(index$metadata, upstream$index$data()$metadata)
+  expect_setequal(index$unpacked, ids)
+
+  for (id in ids) {
+    files <- upstream$index$metadata(id)$files
+    expect_true(all(downstream$files$exists(files$hash)))
+  }
+})
+
+test_that("Importing a zip file is idempotent", {
+  upstream <- create_temporary_root()
+  downstream <- create_temporary_root()
+
+  id <- create_random_packet(upstream)
+
+  path <- tempfile()
+  orderly_export_zip(path, id, root = upstream)
+  imported_once <- orderly_import_zip(path, root = downstream)
+  imported_twice <- orderly_import_zip(path, root = downstream)
+
+  expect_equal(imported_once, id)
+  expect_equal(imported_twice, id)
+
+  index <- downstream$index$data()
+  expect_setequal(names(index$metadata), id)
+  expect_mapequal(index$metadata, upstream$index$data()$metadata)
+  expect_setequal(index$unpacked, id)
+})
+
+test_that("New packets are imported", {
+  upstream <- create_temporary_root()
+
+  first_id <- create_random_packet(upstream)
+  first_zip <- tempfile()
+  orderly_export_zip(first_zip, first_id, root = upstream)
+
+  second_id <- create_random_packet(upstream)
+  second_zip <- tempfile()
+  orderly_export_zip(second_zip, c(first_id, second_id), root = upstream)
+
+  downstream <- create_temporary_root()
+
+  orderly_import_zip(first_zip, root = downstream)
+  index <- downstream$index$data()
+  expect_setequal(names(index$metadata), first_id)
+  expect_setequal(index$unpacked, first_id)
+
+  orderly_import_zip(second_zip, root = downstream)
+  index <- downstream$index$data()
+  expect_setequal(names(index$metadata), c(first_id, second_id))
+  expect_mapequal(index$metadata, upstream$index$data()$metadata)
+  expect_setequal(index$unpacked, c(first_id, second_id))
+})
+
+test_that("Can import packet with existing metadata", {
+  upstream <- create_temporary_root(use_file_store = TRUE)
+  id <- create_random_packet(upstream)
+
+  # We want to bring the packet's metadata into the downstream repository,
+  # but not copy any of the actual files (yet). We do this by adding a path
+  # location and pulling the metadata from it.
+  downstream <- create_temporary_root()
+  orderly_location_add("upstream", "path", list(path = upstream$path),
+                       root = downstream)
+  orderly_location_pull_metadata(root = downstream)
+
+  index <- downstream$index$data()
+  expect_setequal(names(index$metadata), id)
+  expect_equal(length(index$unpacked), 0)
+
+  path <- tempfile()
+  orderly_export_zip(path, id, root = upstream)
+  orderly_import_zip(path, root = downstream)
+
+  index <- downstream$index$data()
+  expect_setequal(names(index$metadata), id)
+  expect_setequal(index$unpacked, id)
+})
+
+test_that("Importing a zip file with mismatching metadata fails", {
+  upstream <- create_temporary_root()
+  downstream <- create_temporary_root()
+
+  id <- outpack_id()
+  create_random_packet(upstream, id = id)
+  create_random_packet(downstream, id = id)
+
+  path <- tempfile()
+  orderly_export_zip(path, id, root = upstream)
+
+  expect_error(
+    orderly_import_zip(path, root = downstream),
+    "Imported file has conflicting metadata")
+})
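
Taken together, the tests encode the expected contract; a minimal end-to-end sketch under the same assumptions (two hypothetical roots, with ids already created in the source):

    orderly_export_zip("results.zip", ids, root = "path/to/source")
    imported <- orderly_import_zip("results.zip", root = "path/to/destination")
    stopifnot(setequal(imported, ids))  # same packets, now unpacked downstream
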