Skip to content

Commit

Permalink
Add a function to compare the contents of two packets.
Browse files Browse the repository at this point in the history
The new `orderly_compare_packets` function takes in two packet IDs and
produces a diff of the two packets' contents. This includes a comparison
of both the packets' metadata and of their files. The actual diff of the
files is computed using the diffobj package.

A parameter of the function can be used to control which components of
the packets are considered in the comparison, and take be one of
"everything", "metadata", "files" and "artefacts".

This feature was requested as part of the epireview project, where one
report's implementation got very messy and needs refactoring, but in the
absence of any way of comparing two runs of the same report it is
difficult to determine whether the refactor has any unexpected effect on
the output.
  • Loading branch information
plietar committed Jun 25, 2024
1 parent f8282a3 commit 841cca6
Show file tree
Hide file tree
Showing 10 changed files with 547 additions and 3 deletions.
2 changes: 2 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ BugReports: https://github.com/mrc-ide/orderly2/issues
Imports:
R6,
cli,
diffobj,
fs,
gert (>= 1.9.3),
jsonlite,
Expand All @@ -35,6 +36,7 @@ Suggests:
pkgload,
processx,
rmarkdown,
stringr,
testthat (>= 3.0.0)
Config/testthat/edition: 3
Remotes:
Expand Down
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
# Generated by roxygen2: do not edit by hand

S3method(format,orderly_compare_packets)
S3method(format,orderly_query)
S3method(print,orderly_cleanup_status)
S3method(print,orderly_compare_packets)
S3method(print,orderly_query_explain)
export(orderly_artefact)
export(orderly_cleanup)
export(orderly_cleanup_status)
export(orderly_compare_packets)
export(orderly_config)
export(orderly_config_set)
export(orderly_copy_files)
Expand Down
211 changes: 211 additions & 0 deletions R/compare.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
is_binary_file <- function(path, n = 1024) {
# This is a pretty crude heuristic, but it seems good enough.
# It is actually similar to what the `diff` tool does.
data <- readBin(path, "raw", n)
as.raw(0) %in% data
}

compare_metadata <- function(target, current) {
# id and time will almost always differ, but not in any interesting way.
# files may differ (especially the hashes), but we compare the files in
# detail seperately.
exclude <- c("id", "time", "files")

target_filtered <- target[!(names(target) %in% exclude)]
current_filtered <- current[!(names(current) %in% exclude)]
if (!identical(target_filtered, current_filtered)) {
diffobj::diffPrint(
target_filtered,
current_filtered,
tar.banner = target$id,
cur.banner = current$id,
rds = FALSE,
mode = "unified",
style = list(pad = FALSE, wrap = FALSE),
interactive = FALSE)
} else {
NULL
}
}

compare_filesets <- function(target, current) {
files <- merge(x = target, y = current, by = "path", all = TRUE)
status <- ifelse(
is.na(files$hash.x), "added",
ifelse(is.na(files$hash.y), "removed",
ifelse(files$hash.x == files$hash.y, "unchanged", "modified")))
data.frame(path = files$path, status = status)
}

compare_files <- function(target, current, files, root, search_options) {
if (is.null(files)) {
return(NULL)
}

path_target <- withr::local_tempdir()
path_current <- withr::local_tempdir()

# Copying the files into a temporary directory is fairly wasteful and, as
# long as the packet is unpacked already, we could read the files from the
# archive or file store directly. Nevertheless this makes accessing the files
# very straightforward, and covers the case where the file only exists
# remotely transparent.

orderly_copy_files(target, dest = path_target, files = files,
options = search_options, root = root)

orderly_copy_files(current, dest = path_current, files = files,
options = search_options, root = root)

ret <- lapply(files, function(p) {
if (is_binary_file(file.path(path_target, p)) ||
is_binary_file(file.path(path_current, p))) {
NULL
} else {
diffobj::diffFile(
file.path(path_target, p),
file.path(path_current, p),
tar.banner = file.path(target, p),
cur.banner = file.path(current, p),
rds = FALSE,
mode = "unified",
style = list(pad = FALSE, wrap = FALSE),
interactive = FALSE)
}
})
names(ret) <- files

ret
}

##' Compare the metadata and contents of two packets.
##'
##' Insignificant differences in the metadata (eg. different dates and packet
##' IDs) are excluded from the comparison.
##'
##' If either packet is not unpacked, but `search_options`'s `allow_remote` is
##' `TRUE`, we will try to request files from remote locations, as necessary.
##'
##' @title Compare two packets
##'
##' @param target The id of the packet to use in the comparison.
##' @param current The id of the other packet against which to compare.
##' @param what One of "everything" (the default), "metadata", "files" or
##' "artefacts", retricting what components of the packet to compare. This is
##' useful when it is known for example that the source code of a report what
##' changed, and one is only interested in the effect on its output.
##'
##' @inheritParams orderly_metadata
##'
##' @return If the packets have identical contents, TRUE is returned. Otherwise
##' an object detailing the differences is returned. While the object can be
##' inspected, its contents is subject to change. In both cases, the returned
##' value has class `orderly_compare_packets`, allowing a user friendly
##' display of the result.
##'
##' @export
orderly_compare_packets <- function(
target, current, what = c("everything", "metadata", "files", "artefacts"),
search_options = NULL, root = NULL, locate = TRUE) {
validate_outpack_id(target, call = environment())
validate_outpack_id(current, call = environment())
what <- rlang::arg_match(what)

root <- root_open(root, locate = locate, require_orderly = FALSE,
call = environment())

meta_target <- orderly_metadata(target, root = root)
meta_current <- orderly_metadata(current, root = root)
if (what %in% c("everything", "metadata")) {
metadata_diff <- compare_metadata(meta_target, meta_current)
} else {
metadata_diff <- NULL
}

if (what == "artefacts") {
if (is.null(meta_target$custom$orderly) ||
is.null(meta_current$custom$orderly)) {
cli::cli_abort("Cannot compare artefacts of non-orderly packets")
}

artefacts_target <- unlist(meta_target$custom$orderly$artefacts$paths)
artefacts_current <- unlist(meta_current$custom$orderly$artefacts$paths)
files <- compare_filesets(
meta_target$files[meta_target$files$path %in% artefacts_target, ],
meta_current$files[meta_current$files$path %in% artefacts_current, ])
} else if (what %in% c("everything", "files")) {
files <- compare_filesets(meta_target$files, meta_current$files)
} else {
files <- data.frame(path = NULL, status = NULL)
}

if (is.null(metadata_diff) && all(files$status == "unchanged")) {
ret <- TRUE
} else {
idx <- files$status == "modified"
files$diff[idx] <- compare_files(target, current, files[idx, ]$path,
search_options = search_options,
root = root)

ret <- list(packets = c(target = target, current = current),
metadata_diff = metadata_diff,
files = files)
}

class(ret) <- "orderly_compare_packets"

ret
}


#' @export
format.orderly_compare_packets <- function(x) {
cli::cli_format_method({
if (isTRUE(x)) {
cli::cli_alert_success("Packets are identical")
} else {
target <- x$packets[[1]]
current <- x$packets[[2]]

cli::cli_alert_info("Comparing packets {target} and {current}")

if (!is.null(x$metadata_diff)) {
cli::cli_alert_warning("Packet metadata differs:")
cli::cli_div(theme = list(div = list("margin-left" = 2)))
cli::cli_verbatim(as.character(x$metadata_diff))
cli::cli_end()
}

removed <- x$files[x$files$status == "removed", ]
if (nrow(removed) > 0) {
cli::cli_alert_warning("The following files only exist in packet {current}:")
cli::cli_ul(removed$path)
}

added <- x$files[x$files$status == "added", ]
if (nrow(added) > 0) {
cli::cli_alert_warning("The following files only exist in packet {target}:")
cli::cli_ul(added$path)
}

modified <- x$files[x$files$status == "modified", ]
if (nrow(modified) > 0) {
cli::cli_alert_warning("The following files exist in both packets but have different contents:")
cli::cli_ul()
for (i in seq(nrow(modified))) {
cli::cli_li("{modified$path[[i]]}")
if (!is.null(modified$diff[[i]])) {
cli::cli_verbatim(as.character(modified$diff[[i]]))
}
}
cli::cli_end()
}
}
})
}


#' @export
print.orderly_compare_packets <- function(x, ...) {
cat(format(x, ...), sep = "\n")
}
1 change: 1 addition & 0 deletions R/util.R
Original file line number Diff line number Diff line change
Expand Up @@ -646,6 +646,7 @@ is_testing <- function() {
}

#' Given a character vector, missing names are filled using the value.
#' @noRd
fill_missing_names <- function(x) {
if (is.null(names(x))) {
names(x) <- x
Expand Down
54 changes: 54 additions & 0 deletions man/orderly_compare_packets.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 10 additions & 3 deletions man/orderly_shared_resource.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 841cca6

Please sign in to comment.