From 459390242ba2eab4c6b9df6863e70d5746337244 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Paul=20Li=C3=A9tar?= <pl2113@ic.ac.uk>
Date: Wed, 23 Oct 2024 15:06:25 +0100
Subject: [PATCH] WIP

---
 DESCRIPTION                    |   2 +-
 NAMESPACE                      |   2 +
 R/compare.R                    | 354 ++++++++++++++-------------------
 man/orderly_compare_packets.Rd |  28 +--
 4 files changed, 162 insertions(+), 224 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index c1570a8c..c016dbef 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: orderly2
 Title: Orderly Next Generation
-Version: 1.99.49
+Version: 1.99.50
 Authors@R: c(person("Rich", "FitzJohn", role = c("aut", "cre"),
                     email = "rich.fitzjohn@gmail.com"),
              person("Robert", "Ashton", role = "aut"),
diff --git a/NAMESPACE b/NAMESPACE
index 804ca6aa..40c895bd 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,7 +1,9 @@
 # Generated by roxygen2: do not edit by hand
 
+S3method(format,orderly_comparison)
 S3method(format,orderly_query)
 S3method(print,orderly_cleanup_status)
+S3method(print,orderly_comparison)
 S3method(print,orderly_query_explain)
 export(orderly_artefact)
 export(orderly_cleanup)
diff --git a/R/compare.R b/R/compare.R
index c78423c3..d332714b 100644
--- a/R/compare.R
+++ b/R/compare.R
@@ -1,31 +1,10 @@
 is_binary_file <- function(path, n = 1024) {
-  # This is a pretty crude heuristic, but it seems good enough.
-  # It is actually similar to what the `diff` tool does.
-  data <- readBin(path, "raw", n)
-  as.raw(0) %in% data
-}
-
-compare_metadata <- function(target, current) {
-  # id and time will almost always differ, but not in any interesting way.
-  # files may differ (especially the hashes), but we compare the files in
-  # detail seperately.
-  exclude <- c("id", "time", "files")
-
-  target_filtered <- target[!(names(target) %in% exclude)]
-  current_filtered <- current[!(names(current) %in% exclude)]
-  if (!identical(target_filtered, current_filtered)) {
-    diffobj::diffPrint(
-      target_filtered,
-      current_filtered,
-      tar.banner = target$id,
-      cur.banner = current$id,
-      rds = FALSE,
-      mode = "unified",
-      style = list(pad = FALSE, wrap = FALSE),
-      interactive = FALSE)
-  } else {
-    NULL
-  }
+  vlapply(path, function(p) {
+    # This is a pretty crude heuristic, but it seems good enough.
+    # It is actually similar to what the `diff` tool does.
+    data <- readBin(path, "raw", n)
+    as.raw(0) %in% data
+  })
 }
 
 compare_filesets <- function(target, current) {
@@ -37,43 +16,6 @@ compare_filesets <- function(target, current) {
   data.frame(path = files$path, status = status)
 }
 
-compare_files <- function(target, current, files, root, search_options) {
-  path_target <- withr::local_tempdir()
-  path_current <- withr::local_tempdir()
-
-  # Copying the files into a temporary directory is fairly wasteful and, as
-  # long as the packet is unpacked already, we could read the files from the
-  # archive or file store directly. Nevertheless this makes accessing the files
-  # very straightforward, and covers the case where the file only exists
-  # remotely transparent.
-
-  orderly_copy_files(target, dest = path_target, files = files,
-                     options = search_options, root = root)
-
-  orderly_copy_files(current, dest = path_current, files = files,
-                     options = search_options, root = root)
-
-  ret <- lapply(files, function(p) {
-    if (is_binary_file(file.path(path_target, p)) ||
-        is_binary_file(file.path(path_current, p))) {
-      NULL
-    } else {
-      diffobj::diffChr(
-        read_file_lossy(file.path(path_target, p)),
-        read_file_lossy(file.path(path_current, p)),
-        tar.banner = file.path(target, p),
-        cur.banner = file.path(current, p),
-        rds = FALSE,
-        mode = "unified",
-        style = list(pad = FALSE, wrap = FALSE),
-        interactive = FALSE)
-    }
-  })
-  names(ret) <- files
-
-  ret
-}
-
 ##' Compare the metadata and contents of two packets.
 ##'
 ##' Insignificant differences in the metadata (eg. different dates and packet
@@ -102,173 +44,173 @@ compare_files <- function(target, current, files, root, search_options) {
 ##'   difference was found.
 ##'
 ##' @export
-orderly_compare_packets <- function(
-  target, current, search_options = NULL, root = NULL, locate = TRUE,
-  what = c("metadata", "files")) {
-  what <- rlang::arg_match(what, multiple = TRUE,
-                           values = c("metadata", "files", "artefacts"))
-  if (length(what) == 0) {
-    cli::cli_abort("{.code what} must not be empty")
-  }
-  if ("artefacts" %in% what && "files" %in% what) {
-    cli::cli_abort('{.code what} must contain both "files" and "artefacts"')
-  }
-
+orderly_compare_packets <- function(target, current, 
+                                    root = NULL,
+                                    location = NULL,
+                                    allow_remote = NULL,
+                                    pull_metadata = NULL) {
+  root <- root_open(root, require_orderly = FALSE)
   validate_outpack_id(target, call = environment())
   validate_outpack_id(current, call = environment())
 
-  root <- root_open(root, locate = locate, require_orderly = FALSE,
-                    call = environment())
+  ret <- list(
+    root = root,
+    target = orderly_metadata(target, root = root),
+    current = orderly_metadata(current, root = root))
+  class(ret) <- "orderly_comparison"
+  ret
+}
 
-  meta_target <- orderly_metadata(target, root = root)
-  meta_current <- orderly_metadata(current, root = root)
+compare_files <- function(x, verbose, name) {
+  diff <- compare_filesets(x$target$files, x$current$files)
 
-  if ("metadata" %in% what) {
-    metadata <- compare_metadata(meta_target, meta_current)
-  } else {
-    metadata <- NULL
-  }
+  modified <- diff[diff$status == "modified", ]
+  target_only <- diff[diff$status == "removed", ]
+  current_only <- diff[diff$status == "added", ]
 
-  if ("files" %in% what) {
-    files <- compare_filesets(meta_target$files, meta_current$files)
-  } else if ("artefacts" %in% what) {
-    if (is.null(meta_target$custom$orderly) ||
-        is.null(meta_current$custom$orderly)) {
-      cli::cli_abort("Cannot compare artefacts of non-orderly packets")
+  if (nrow(target_only) > 0) {
+    cli::cli_alert_info("The following files only exist in packet {x$target$id}")
+    cli::cli_ul(target_only$path)
+  }
+  if (nrow(current_only) > 0) {
+    cli::cli_alert_info("The following files only exist in packet {x$current$id}")
+    cli::cli_ul(current_only$path)
+  }
+  if (nrow(modified) > 0) {
+    if (verbose) {
+      compare_file_contents(x, modified$path)
+    } else {
+      cli::cli_alert_info("The following files exist in both packets but have different contents:")
+      cli::cli_ul(modified$path)
+      cli::cli_alert_info("Use {.code orderly_comparison_explain({name}, \"files\", verbose = TRUE)} to compare the files' contents.")
     }
+  }
+  if (nrow(modified) == 0 && nrow(target_only) == 0 && nrow(current_only) == 0) {
+    cli::cli_alert_info("The files across the two packets are identical.")
+  }
+}
 
-    artefacts_target <- unlist(meta_target$custom$orderly$artefacts$paths)
-    artefacts_current <- unlist(meta_current$custom$orderly$artefacts$paths)
-    files <- compare_filesets(
-      meta_target$files[meta_target$files$path %in% artefacts_target, ],
-      meta_current$files[meta_current$files$path %in% artefacts_current, ])
-  } else {
-    files <- data.frame(path = character(0), status = character(0))
+compare_file_contents <- function(x, files) {
+  path_target <- withr::local_tempdir()
+  path_current <- withr::local_tempdir()
+
+  # Copying the files into a temporary directory is fairly wasteful and, as
+  # long as the packet is unpacked already, we could read the files from the
+  # archive or file store directly. Nevertheless this makes accessing the files
+  # very straightforward, and covers the case where the file only exists
+  # remotely transparent.
+
+  orderly_copy_files(x$target$id, dest = path_target, files = files, root = root)
+  orderly_copy_files(x$current$id, dest = path_current, files = files, root = root)
+
+  binary_files <- is_binary_file(file.path(path_target, files)) ||
+                  is_binary_file(file.path(path_current, files))
+
+  for (f in files[!binary_files]) {
+    cli::cli_verbatim(diffobj::diffChr(
+      read_file_lossy(file.path(path_target, f)),
+      read_file_lossy(file.path(path_current, f)),
+      tar.banner = file.path(x$target$id, f),
+      cur.banner = file.path(x$current$id, f),
+      rds = FALSE,
+      mode = "unified"))
   }
 
-  idx <- files$status == "modified"
-  files$diff[idx] <- compare_files(target, current, files[idx, ]$path,
-                                   search_options = search_options,
-                                   root = root)
+  if (any(binary_files)) {
+    cli::cli_alert_warning("The following files differ across packets, but could not be compared as their content is binary:")
+    cli::cli_ul(files[binary_files])
+  }
+}
 
-  orderly_packet_diff$new(what, target, current, metadata, files)
+compare_metadata_attribute <- function(x, attribute, verbose, name) {
+  if (attribute == "files") {
+    cli::cli_alert_info("Comparing files")
+    compare_files(x, verbose, name)
+  } else {
+    cli::cli_alert_info("Comparing attribute {.code {attribute}}")
+    cli::cli_verbatim(as.character(diffobj::diffPrint(
+      x$target[[attribute]],
+      x$current[[attribute]],
+      tar.banner = sprintf("%s$%s", x$target$id, attribute),
+      cur.banner = sprintf("%s$%s", x$current$id, attribute),
+      rds = FALSE,
+      mode = "unified"
+    )))
+  }
 }
 
+compare_metadata_attribute_list <- function(target, current) {
+  target_names <- setdiff(names(target), c("id", "time"))
+  current_names <- setdiff(names(current), c("id", "time"))
 
-orderly_packet_diff <- R6::R6Class(
-  "orderly_packet_diff",
-  private = list(
-    what = NULL,
-    target = NULL,
-    current = NULL,
-    metadata = NULL,
-    files = NULL,
+  intersected <- intersect(target_names, current_names)
+  differences <- vlapply(intersected, function(n) {
+    !identical(target[[n]], current[[n]])
+  })
 
-    print_metadata = function() {
-      if (!is.null(private$metadata)) {
-        cli::cli_alert_warning("Packet metadata differs:")
-        cli::cli_div(theme = list(div = list("margin-left" = 2)))
-        cli::cli_verbatim(as.character(private$metadata))
-        cli::cli_end()
-      }
-    },
+  target_only <- setdiff(target_names, current_names)
+  current_only <- setdiff(current_names, target_names)
 
-    print_files = function(verbose) {
-      name <- if ("artefacts" %in% private$what) {
-        "artefacts"
-      } else {
-        "files"
-      }
+  list(differences = intersected[differences],
+       target_only = target_only,
+       current_only = current_only)
+}
 
-      removed <- private$files[private$files$status == "removed", ]
-      if (nrow(removed) > 0) {
-        cli::cli_alert_warning(
-          "The following {name} only exist in packet {private$current}:")
-        cli::cli_ul(removed$path)
-      }
+##' @export
+format.orderly_comparison <- function(x, ..., name = deparse(substitute(x))) {
+  cli::cli_format_method({
+    cli::cli_alert_info("Comparing packets {x$target$id} and {x$current$id}...")
+    cli::cli_div(theme = list(div = list("margin-left" = 2)))
 
-      added <- private$files[private$files$status == "added", ]
-      if (nrow(added) > 0) {
-        cli::cli_alert_warning(
-          "The following {name} only exist in packet {private$target}:")
-        cli::cli_ul(added$path)
-      }
+    diff <- compare_metadata_attribute_list(x$target, x$current)
 
-      modified <- private$files[private$files$status == "modified", ]
-      if (nrow(modified) > 0) {
-        binary <- sapply(modified$diff, is.null)
+    if (length(diff$differences) > 0) {
+      cli::cli_alert_info("The following attributes are different across the two packets:")
+      cli::cli_ul(diff$differences)
+    }
 
-        cli::cli_alert_warning(
-          paste("The following {name} exist in both packets but have",
-                "different contents:"))
+    if (length(diff$target_only) > 0) {
+      cli::cli_alert_info("The following attributes only exist in packet {x$target$only}:")
+      cli::cli_ul(diff$target_only)
+    }
 
-        cli::cli_ul()
-        for (i in seq_len(nrow(modified))) {
-          cli::cli_li("{modified$path[[i]]}")
-          if (verbose) {
-            if (!binary[[i]]) {
-              cli::cli_div(theme = list(div = list("margin-left" = 2)))
-              cli::cli_verbatim(as.character(modified$diff[[i]]))
-              cli::cli_end()
-            }
-          }
-        }
-        if (verbose && any(binary)) {
-          cli::cli_alert_warning(
-            "Contents of binary file{?s} {modified$path[binary]} were omitted")
-        }
-        if (!verbose) {
-          cli::cli_alert_info(paste(
-            "Print the comparison with {.code verbose = TRUE} to display the",
-            "differences in the {name}' contents"))
-        }
-        cli::cli_end()
-      }
+    if (length(diff$current_only) > 0) {
+      cli::cli_alert_info("The following attributes only exist in packet {x$current$only}:")
+      cli::cli_ul(diff$current_only)
     }
-  ),
 
-  public = list(
-    initialize = function(what, target, current, metadata, files) {
-      private$what <- what
-      private$target <- target
-      private$current <- current
-      private$metadata <- metadata
-      private$files <- files
-    },
+    if (length(diff$differences) > 0 ||
+        length(diff$target_only) > 0 ||
+        length(diff$current_only) > 0) {
+      cli::cli_alert_info("Use {.code orderly_comparison_explain({name})} to examine the differences in detail.")
+    } else {
+      cli::cli_alert_success("The two packets are identical, up to trivial differences.")
+    }
+  })
+}
 
-    is_equal = function() {
-      is.null(private$metadata) && all(private$files$status == "unchanged")
-    },
+##' @export
+print.orderly_comparison <- function(x, ..., name = deparse(substitute(x))) {
+  cat(format(x, ..., name = name), sep = "\n")
+}
 
-    format = function(verbose = FALSE, ...) {
-      target <- private$target
-      current <- private$current
+orderly_comparison_explain <- function(x, attributes  = NULL, ..., verbose = FALSE, name = deparse(substitute(x))) {
+  rlang::check_dots_empty()
 
-      cli::cli_format_method({
-        if (self$is_equal()) {
-          msg <- if (setequal(private$what, c("metadata", "files"))) {
-            "Packets {target} and {current} are identical"
-          } else if (identical(private$what, "metadata")) {
-            "Metadata of packets {target} and {current} is identical"
-          } else if (identical(private$what, "files")) {
-            "Files of packets {target} and {current} are identical"
-          } else if (identical(private$what, "artefacts")) {
-            "Artefacts of packets {target} and {current} are identical"
-          } else if (setequal(private$what, c("metadata", "artefacts"))) {
-            paste("Metadata and artefacts of packets {target} and {current}",
-                  "are identical")
-          } else {
-            stop("Unhandled combination of `what`")
-          }
-          cli::cli_alert_success(msg)
-        } else {
-          cli::cli_alert_info(
-            "Comparing packets {private$target} and {private$current}")
+  cli::cli_alert_info("Comparing packets {x$target$id} and {x$current$id}...")
 
-          private$print_metadata()
-          private$print_files(verbose = verbose)
-        }
-      })
+  if (is.null(attributes)) {
+    diff <- compare_metadata_attribute_list(x$target, x$current)
+    attributes <- union(diff$differences,
+                        union(diff$target_only, diff$current_only))
+    if (length(attributes) == 0) {
+      cli::cli_alert_success("The two packets are identical, up to trivial differences.")
+      return()
     }
-  )
-)
+  }
+
+  for (n in attributes) {
+    compare_metadata_attribute(
+      x, attribute = n, verbose = verbose, name = name)
+  }
+}
diff --git a/man/orderly_compare_packets.Rd b/man/orderly_compare_packets.Rd
index 84194223..8d7e71e7 100644
--- a/man/orderly_compare_packets.Rd
+++ b/man/orderly_compare_packets.Rd
@@ -7,10 +7,10 @@
 orderly_compare_packets(
   target,
   current,
-  search_options = NULL,
   root = NULL,
-  locate = TRUE,
-  what = c("metadata", "files")
+  location = NULL,
+  allow_remote = NULL,
+  pull_metadata = NULL
 )
 }
 \arguments{
@@ -18,26 +18,20 @@ orderly_compare_packets(
 
 \item{current}{The id of the other packet against which to compare.}
 
-\item{search_options}{Options for locating packet files. If there are no
-copies of the files locally, they can be downloaded automatically from a
-remote location on-demand if \code{allow_remote} is \code{TRUE}.}
-
 \item{root}{The path to the root directory, or \code{NULL} (the
-default) to search for one from the current working directory if
-\code{locate} is \code{TRUE}. This function does not require that the
-directory is configured for orderly, and can be any \code{outpack}
-root (see \link{orderly_init} for details).}
-
-\item{locate}{Logical, indicating if the root should be searched
-for.  If \code{TRUE}, then we looks in the directory given for \code{root}
-(or the working directory if \code{NULL}) and then up through its
-parents until it finds an \code{.outpack} directory or
-\code{orderly_config.yml}}
+default) to search for one from the current working
+directory. This function does not require that the directory is
+configured for orderly, and can be any \code{outpack} root (see
+\link{orderly_init} for details).}
 
 \item{what}{One or more of "metadata", "files" and "artefacts", retricting
 what components of the packet to compare. This is useful when it is known
 for example that the source code of a report what changed, and one is only
 interested in the effect on its output.}
+
+\item{search_options}{Options for locating packet files. If there are no
+copies of the files locally, they can be downloaded automatically from a
+remote location on-demand if \code{allow_remote} is \code{TRUE}.}
 }
 \value{
 An R6 object of class \code{orderly_packet_diff} is returned. Printing