From 841cca66bed8cebdfc611b08cfad316c4bac7c8c Mon Sep 17 00:00:00 2001 From: Paul Lietar Date: Tue, 25 Jun 2024 14:04:30 +0100 Subject: [PATCH] Add a function to compare the contents of two packets. The new `orderly_compare_packets` function takes in two packet IDs and produces a diff of the two packets' contents. This includes a comparison of both the packets' metadata and of their files. The actual diff of the files is computed using the diffobj package. A parameter of the function can be used to control which components of the packets are considered in the comparison, and take be one of "everything", "metadata", "files" and "artefacts". This feature was requested as part of the epireview project, where one report's implementation got very messy and needs refactoring, but in the absence of any way of comparing two runs of the same report it is difficult to determine whether the refactor has any unexpected effect on the output. --- DESCRIPTION | 2 + NAMESPACE | 3 + R/compare.R | 211 +++++++++++++++++++++++++++++++ R/util.R | 1 + man/orderly_compare_packets.Rd | 54 ++++++++ man/orderly_shared_resource.Rd | 13 +- tests/testthat/_snaps/compare.md | 83 ++++++++++++ tests/testthat/helper-orderly.R | 7 + tests/testthat/helper-outpack.R | 15 +++ tests/testthat/test-compare.R | 161 +++++++++++++++++++++++ 10 files changed, 547 insertions(+), 3 deletions(-) create mode 100644 R/compare.R create mode 100644 man/orderly_compare_packets.Rd create mode 100644 tests/testthat/_snaps/compare.md create mode 100644 tests/testthat/test-compare.R diff --git a/DESCRIPTION b/DESCRIPTION index 8ffc3210..5270b0e8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -17,6 +17,7 @@ BugReports: https://github.com/mrc-ide/orderly2/issues Imports: R6, cli, + diffobj, fs, gert (>= 1.9.3), jsonlite, @@ -35,6 +36,7 @@ Suggests: pkgload, processx, rmarkdown, + stringr, testthat (>= 3.0.0) Config/testthat/edition: 3 Remotes: diff --git a/NAMESPACE b/NAMESPACE index 4c96b240..00a3f725 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,11 +1,14 @@ # Generated by roxygen2: do not edit by hand +S3method(format,orderly_compare_packets) S3method(format,orderly_query) S3method(print,orderly_cleanup_status) +S3method(print,orderly_compare_packets) S3method(print,orderly_query_explain) export(orderly_artefact) export(orderly_cleanup) export(orderly_cleanup_status) +export(orderly_compare_packets) export(orderly_config) export(orderly_config_set) export(orderly_copy_files) diff --git a/R/compare.R b/R/compare.R new file mode 100644 index 00000000..e8b671e7 --- /dev/null +++ b/R/compare.R @@ -0,0 +1,211 @@ +is_binary_file <- function(path, n = 1024) { + # This is a pretty crude heuristic, but it seems good enough. + # It is actually similar to what the `diff` tool does. + data <- readBin(path, "raw", n) + as.raw(0) %in% data +} + +compare_metadata <- function(target, current) { + # id and time will almost always differ, but not in any interesting way. + # files may differ (especially the hashes), but we compare the files in + # detail seperately. + exclude <- c("id", "time", "files") + + target_filtered <- target[!(names(target) %in% exclude)] + current_filtered <- current[!(names(current) %in% exclude)] + if (!identical(target_filtered, current_filtered)) { + diffobj::diffPrint( + target_filtered, + current_filtered, + tar.banner = target$id, + cur.banner = current$id, + rds = FALSE, + mode = "unified", + style = list(pad = FALSE, wrap = FALSE), + interactive = FALSE) + } else { + NULL + } +} + +compare_filesets <- function(target, current) { + files <- merge(x = target, y = current, by = "path", all = TRUE) + status <- ifelse( + is.na(files$hash.x), "added", + ifelse(is.na(files$hash.y), "removed", + ifelse(files$hash.x == files$hash.y, "unchanged", "modified"))) + data.frame(path = files$path, status = status) +} + +compare_files <- function(target, current, files, root, search_options) { + if (is.null(files)) { + return(NULL) + } + + path_target <- withr::local_tempdir() + path_current <- withr::local_tempdir() + + # Copying the files into a temporary directory is fairly wasteful and, as + # long as the packet is unpacked already, we could read the files from the + # archive or file store directly. Nevertheless this makes accessing the files + # very straightforward, and covers the case where the file only exists + # remotely transparent. + + orderly_copy_files(target, dest = path_target, files = files, + options = search_options, root = root) + + orderly_copy_files(current, dest = path_current, files = files, + options = search_options, root = root) + + ret <- lapply(files, function(p) { + if (is_binary_file(file.path(path_target, p)) || + is_binary_file(file.path(path_current, p))) { + NULL + } else { + diffobj::diffFile( + file.path(path_target, p), + file.path(path_current, p), + tar.banner = file.path(target, p), + cur.banner = file.path(current, p), + rds = FALSE, + mode = "unified", + style = list(pad = FALSE, wrap = FALSE), + interactive = FALSE) + } + }) + names(ret) <- files + + ret +} + +##' Compare the metadata and contents of two packets. +##' +##' Insignificant differences in the metadata (eg. different dates and packet +##' IDs) are excluded from the comparison. +##' +##' If either packet is not unpacked, but `search_options`'s `allow_remote` is +##' `TRUE`, we will try to request files from remote locations, as necessary. +##' +##' @title Compare two packets +##' +##' @param target The id of the packet to use in the comparison. +##' @param current The id of the other packet against which to compare. +##' @param what One of "everything" (the default), "metadata", "files" or +##' "artefacts", retricting what components of the packet to compare. This is +##' useful when it is known for example that the source code of a report what +##' changed, and one is only interested in the effect on its output. +##' +##' @inheritParams orderly_metadata +##' +##' @return If the packets have identical contents, TRUE is returned. Otherwise +##' an object detailing the differences is returned. While the object can be +##' inspected, its contents is subject to change. In both cases, the returned +##' value has class `orderly_compare_packets`, allowing a user friendly +##' display of the result. +##' +##' @export +orderly_compare_packets <- function( + target, current, what = c("everything", "metadata", "files", "artefacts"), + search_options = NULL, root = NULL, locate = TRUE) { + validate_outpack_id(target, call = environment()) + validate_outpack_id(current, call = environment()) + what <- rlang::arg_match(what) + + root <- root_open(root, locate = locate, require_orderly = FALSE, + call = environment()) + + meta_target <- orderly_metadata(target, root = root) + meta_current <- orderly_metadata(current, root = root) + if (what %in% c("everything", "metadata")) { + metadata_diff <- compare_metadata(meta_target, meta_current) + } else { + metadata_diff <- NULL + } + + if (what == "artefacts") { + if (is.null(meta_target$custom$orderly) || + is.null(meta_current$custom$orderly)) { + cli::cli_abort("Cannot compare artefacts of non-orderly packets") + } + + artefacts_target <- unlist(meta_target$custom$orderly$artefacts$paths) + artefacts_current <- unlist(meta_current$custom$orderly$artefacts$paths) + files <- compare_filesets( + meta_target$files[meta_target$files$path %in% artefacts_target, ], + meta_current$files[meta_current$files$path %in% artefacts_current, ]) + } else if (what %in% c("everything", "files")) { + files <- compare_filesets(meta_target$files, meta_current$files) + } else { + files <- data.frame(path = NULL, status = NULL) + } + + if (is.null(metadata_diff) && all(files$status == "unchanged")) { + ret <- TRUE + } else { + idx <- files$status == "modified" + files$diff[idx] <- compare_files(target, current, files[idx, ]$path, + search_options = search_options, + root = root) + + ret <- list(packets = c(target = target, current = current), + metadata_diff = metadata_diff, + files = files) + } + + class(ret) <- "orderly_compare_packets" + + ret +} + + +#' @export +format.orderly_compare_packets <- function(x) { + cli::cli_format_method({ + if (isTRUE(x)) { + cli::cli_alert_success("Packets are identical") + } else { + target <- x$packets[[1]] + current <- x$packets[[2]] + + cli::cli_alert_info("Comparing packets {target} and {current}") + + if (!is.null(x$metadata_diff)) { + cli::cli_alert_warning("Packet metadata differs:") + cli::cli_div(theme = list(div = list("margin-left" = 2))) + cli::cli_verbatim(as.character(x$metadata_diff)) + cli::cli_end() + } + + removed <- x$files[x$files$status == "removed", ] + if (nrow(removed) > 0) { + cli::cli_alert_warning("The following files only exist in packet {current}:") + cli::cli_ul(removed$path) + } + + added <- x$files[x$files$status == "added", ] + if (nrow(added) > 0) { + cli::cli_alert_warning("The following files only exist in packet {target}:") + cli::cli_ul(added$path) + } + + modified <- x$files[x$files$status == "modified", ] + if (nrow(modified) > 0) { + cli::cli_alert_warning("The following files exist in both packets but have different contents:") + cli::cli_ul() + for (i in seq(nrow(modified))) { + cli::cli_li("{modified$path[[i]]}") + if (!is.null(modified$diff[[i]])) { + cli::cli_verbatim(as.character(modified$diff[[i]])) + } + } + cli::cli_end() + } + } + }) +} + + +#' @export +print.orderly_compare_packets <- function(x, ...) { + cat(format(x, ...), sep = "\n") +} diff --git a/R/util.R b/R/util.R index ee7e8f24..e0928989 100644 --- a/R/util.R +++ b/R/util.R @@ -646,6 +646,7 @@ is_testing <- function() { } #' Given a character vector, missing names are filled using the value. +#' @noRd fill_missing_names <- function(x) { if (is.null(names(x))) { names(x) <- x diff --git a/man/orderly_compare_packets.Rd b/man/orderly_compare_packets.Rd new file mode 100644 index 00000000..a03de707 --- /dev/null +++ b/man/orderly_compare_packets.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compare.R +\name{orderly_compare_packets} +\alias{orderly_compare_packets} +\title{Compare two packets} +\usage{ +orderly_compare_packets( + target, + current, + what = c("everything", "metadata", "files", "artefacts"), + search_options = NULL, + root = NULL, + locate = TRUE +) +} +\arguments{ +\item{target}{The id of the packet to use in the comparison.} + +\item{current}{The id of the other packet against which to compare.} + +\item{what}{One of "everything" (the default), "metadata", "files" or +"artefacts", retricting what components of the packet to compare. This is +useful when it is known for example that the source code of a report what +changed, and one is only interested in the effect on its output.} + +\item{root}{The path to the root directory, or \code{NULL} (the +default) to search for one from the current working directory if +\code{locate} is \code{TRUE}. This function does not require that the +directory is configured for orderly, and can be any \code{outpack} +root (see \link{orderly_init} for details).} + +\item{locate}{Logical, indicating if the root should be searched +for. If \code{TRUE}, then we looks in the directory given for \code{root} +(or the working directory if \code{NULL}) and then up through its +parents until it finds an \code{.outpack} directory or +\code{orderly_config.yml}} +} +\value{ +If the packets have identical contents, TRUE is returned. Otherwise +an object detailing the differences is returned. While the object can be +inspected, its contents is subject to change. In both cases, the returned +value has class \code{orderly_compare_packets}, allowing a user friendly +display of the result. +} +\description{ +Compare the metadata and contents of two packets. +} +\details{ +Insignificant differences in the metadata (eg. different dates and packet +IDs) are excluded from the comparison. + +If either packet is not unpacked, but \code{search_options}'s \code{allow_remote} is +\code{TRUE}, we will try to request files from remote locations, as necessary. +} diff --git a/man/orderly_shared_resource.Rd b/man/orderly_shared_resource.Rd index a30d15b0..167b4724 100644 --- a/man/orderly_shared_resource.Rd +++ b/man/orderly_shared_resource.Rd @@ -7,9 +7,16 @@ orderly_shared_resource(...) } \arguments{ -\item{...}{Named arguments corresponding to shared resources to -copy. The name will be the destination filename, while the value -is the filename within the shared resource directory.} +\item{...}{The shared resources to copy. If arguments are named, the name +will be the destination file while the value is the filename within the +shared resource directory. + +You can use a limited form of string interpolation in the names of +this argument; using \verb{$\{variable\}} will pick up values from +\code{envir} and substitute them into your string. This is similar +to the interpolation you might be familiar with from +\code{glue::glue} or similar, but much simpler with no concatenation +or other fancy features supported.} } \value{ Invisibly, a data.frame with columns \code{here} (the fileames diff --git a/tests/testthat/_snaps/compare.md b/tests/testthat/_snaps/compare.md new file mode 100644 index 00000000..68689446 --- /dev/null +++ b/tests/testthat/_snaps/compare.md @@ -0,0 +1,83 @@ +# Comparing a packet to itself returns TRUE + + Code + print(result) + Output + v Packets are identical + +# Comparing packets ignores ID and time differences + + Code + print(result) + Output + v Packets are identical + +# Can compare packets with different metadata + + Code + print(all) + Output + i Comparing packets 19700101-000000-00000001 and 19700101-000000-00000002 + ! Packet metadata differs: + < 19700101-000000-00000001 + > 19700101-000000-00000002 + @@ 7,5 / 7,5 @@ + $parameters + $parameters$A + < [1] "foo" + > [1] "bar" + + + +# Can compare packets with different file contents + + Code + print(all) + Output + i Comparing packets 19700101-000000-00000001 and 19700101-000000-00000002 + ! The following files exist in both packets but have different contents: + * data.R + < 19700101-000000-00000001/data.R + > 19700101-000000-00000002/data.R + @@ 1 / 1 @@ + < writeLines("Hello", "data.txt") + > writeLines("World", "data.txt") + * data.txt + < 19700101-000000-00000001/data.txt + > 19700101-000000-00000002/data.txt + @@ 1 / 1 @@ + < Hello + > World + +# Can compare artefacts only + + Code + print(files) + Output + i Comparing packets 19700101-000000-00000001 and 19700101-000000-00000002 + ! The following files exist in both packets but have different contents: + * data.R + < 19700101-000000-00000001/data.R + > 19700101-000000-00000002/data.R + @@ 1,4 / 1,4 @@ + { + orderly_artefact("Output", "output.txt") + < writeLines(toString(2 + 1), "output.txt") + > writeLines(toString(1 + 2), "output.txt") + } + +# Can compare packets with binary contents + + Code + print(result) + Output + i Comparing packets 19700101-000000-00000001 and 19700101-000000-00000002 + ! The following files exist in both packets but have different contents: + * data.rds + * data.txt + < 19700101-000000-00000001/data.txt + > 19700101-000000-00000002/data.txt + @@ 1 / 1 @@ + < Hello + > World + diff --git a/tests/testthat/helper-orderly.R b/tests/testthat/helper-orderly.R index 373786c6..0040e582 100644 --- a/tests/testthat/helper-orderly.R +++ b/tests/testthat/helper-orderly.R @@ -125,3 +125,10 @@ orderly_init_quietly <- function(...) { orderly_run_quietly <- function(..., echo = FALSE) { suppressMessages(orderly_run(..., echo = echo)) } + +orderly_run_snippet <- function(root, name, expr, ...) { + fs::dir_create(file.path(root$path, "src", name)) + writeLines(deparse(substitute(expr)), + file.path(root$path, "src", name, sprintf("%s.R", name))) + orderly_run_quietly(name, root = root, ...) +} diff --git a/tests/testthat/helper-outpack.R b/tests/testthat/helper-outpack.R index c6c5e9cc..d7aebccb 100644 --- a/tests/testthat/helper-outpack.R +++ b/tests/testthat/helper-outpack.R @@ -185,3 +185,18 @@ forcibly_truncate_file <- function(path) { file.create(path) fs::file_chmod(path, permissions) } + +#' Scrub packets from an output. +#' +#' This returns a transformation suitable to be passed to `expect_snapshot`. +#' The specified packet IDs are removed from the output, and replaced with +#' stable strings of the same length. +#' +#' @param ... the list of packet IDs to remove from the output. +#' @noRd +scrub_packets <- function(...) { + ids <- c(...) + replacements <- sprintf("19700101-000000-%08x", seq_along(ids)) + names(replacements) <- ids + function(x) stringr::str_replace_all(x, replacements) +} diff --git a/tests/testthat/test-compare.R b/tests/testthat/test-compare.R new file mode 100644 index 00000000..8575acd6 --- /dev/null +++ b/tests/testthat/test-compare.R @@ -0,0 +1,161 @@ +test_that("Comparing a packet to itself returns TRUE", { + root <- create_temporary_root() + id <- create_random_packet(root) + + result <- orderly_compare_packets(id, id, root = root) + expect_true(result) + expect_snapshot(print(result), transform = scrub_packets(id)) +}) + + +test_that("Comparing packets ignores ID and time differences", { + root <- create_temporary_root() + id1 <- create_deterministic_packet(root) + id2 <- create_deterministic_packet(root) + + meta1 <- orderly_metadata(id1, root = root) + meta2 <- orderly_metadata(id2, root = root) + expect_false(isTRUE(all.equal(meta1$id, meta2$id))) + expect_false(isTRUE(all.equal(meta1$time, meta2$time))) + + result <- orderly_compare_packets(id1, id2, root = root) + expect_true(result) + expect_snapshot(print(result), transform = scrub_packets(id1, id2)) +}) + + +test_that("Can compare packets with different metadata", { + root <- create_temporary_root() + + p1 <- create_deterministic_packet(root, "data", list(A = "foo")) + p2 <- create_deterministic_packet(root, "data", list(A = "bar")) + + all <- orderly_compare_packets(p1, p2, root = root) + metadata <- orderly_compare_packets(p1, p2, what = "metadata", root = root) + files <- orderly_compare_packets(p1, p2, what = "files", root = root) + + expect_false(isTRUE(all)) + expect_false(isTRUE(metadata)) + expect_true(files) + + expect_snapshot(print(all), transform = scrub_packets(p1, p2)) +}) + + +test_that("Can compare packets with different file contents", { + root <- create_temporary_root() + + p1 <- orderly_run_snippet(root, "data", writeLines("Hello", "data.txt")) + p2 <- orderly_run_snippet(root, "data", writeLines("World", "data.txt")) + + all <- orderly_compare_packets(p1, p2, root = root) + metadata <- orderly_compare_packets(p1, p2, what = "metadata", root = root) + files <- orderly_compare_packets(p1, p2, what = "files", root = root) + + expect_false(isTRUE(all)) + expect_true(metadata) + expect_false(isTRUE(files)) + + expect_snapshot(print(all), transform = scrub_packets(p1, p2)) +}) + + +test_that("Can compare artefacts only", { + root <- create_temporary_root() + + p1 <- orderly_run_snippet(root, "data", { + orderly_artefact("Output", "output.txt") + writeLines(toString(2 + 1), "output.txt") + }) + + p2 <- orderly_run_snippet(root, "data", { + orderly_artefact("Output", "output.txt") + writeLines(toString(1 + 2), "output.txt") + }) + + files <- orderly_compare_packets(p1, p2, what = "files", root = root) + artefacts <- orderly_compare_packets(p1, p2, what = "artefacts", root = root) + + # The packet files, in particular the source code, are different. However the + # different snippets produce identical artefacts. + expect_false(isTRUE(files)) + expect_true(artefacts) + + expect_snapshot(print(files), transform = scrub_packets(p1, p2)) +}) + + +test_that("Can detect newly declared artefact", { + root <- create_temporary_root() + + p1 <- orderly_run_snippet(root, "data", { + writeLines("Hello", "hello.txt") + }) + + p2 <- orderly_run_snippet(root, "data", { + orderly_artefact("Output", "hello.txt") + writeLines("Hello", "hello.txt") + }) + + files <- orderly_compare_packets(p1, p2, what = "files", root = root) + artefacts <- orderly_compare_packets(p1, p2, what = "artefacts", root = root) + + f <- files$files[files$files$path == "hello.txt", ] + expect_equal(f$status, "unchanged") + + # When comparing artefacts, the file is reported as "added", because while it + # exists in the original packet, it was not an artefact. + f <- artefacts$files[artefacts$files$path == "hello.txt", ] + expect_equal(f$status, "added") +}) + + +test_that("Can compare packets with binary contents", { + root <- create_temporary_root() + + p1 <- orderly_run_snippet(root, "data", { + orderly_artefact("Outputs", c("data.txt", "data.rds")) + writeLines("Hello", "data.txt") + saveRDS(1:10, "data.rds") + }) + + p2 <- orderly_run_snippet(root, "data", { + orderly_artefact("Outputs", c("data.txt", "data.rds")) + writeLines("World", "data.txt") + saveRDS(11:20, "data.rds") + }) + + result <- orderly_compare_packets(p1, p2, what = "artefacts", root = root) + expect_false(isTRUE(result)) + + # We can't render a diff for binary files, so these are set to NULL, unlike + # the text file. + text <- result$files[result$files$path == "data.txt", ] + binary <- result$files[result$files$path == "data.rds", ] + expect_false(is.null(text$diff[[1]])) + expect_true(is.null(binary$diff[[1]])) + + expect_snapshot(print(result), transform = scrub_packets(p1, p2)) +}) + + +test_that("Cannot compare artefacts of non-orderly packets", { + root <- create_temporary_root() + + src <- withr::local_tempdir() + writeLines("World", file.path(src, "data.txt")) + p <- outpack_packet_start_quietly(src, "data", root = root) + outpack_packet_end_quietly(p) + + # Only orderly packets have the right metadata needed to infer what is or + # isn't an artefact. + expect_error( + orderly_compare_packets(p$id, p$id, what = "artefacts", root = root), + "Cannot compare artefacts of non-orderly packets") + + # We can still do full packet comparison, even without the orderly-specific + # metadata. + expect_no_error(orderly_compare_packets(p$id, p$id, root = root)) + expect_no_error( + orderly_compare_packets(p$id, p$id, what = "files", root = root)) +})