From 1cd3d07318eae07cdfdc7204f25dd3786b8c32e3 Mon Sep 17 00:00:00 2001 From: Paul Lietar Date: Wed, 20 Mar 2024 09:44:42 +0000 Subject: [PATCH] Don't add a trailing newline to metadata files. By default, the base writeLines function adds a trailing newline to the end of the file, even when the input data did not contain any. While this is generally an acceptable behaviour, in the case of metadata files, we also hash the data and embed it in location files. Because the hashing happens over the in-memory data, it does not include the trailing newline. The hash of the on-disk file therefore ends up being different from the hash stored in the location file. It hasn't caused any practical issues because we trim leading and trailing whitespaces when reading the files back, getting the original serialized metadata that had been hashed. Similarly, when pushing a metadata file to a remote server, the client trims whitespaces from the metadata file. The behaviour is nevertheless confusing and it would be better to have consitent contents and hashes all around. The old behaviour of trimming whitespaces on read is preserved, as we must maintain compatibility with old files that do contain the newline. --- R/location.R | 2 +- R/location_path.R | 2 +- R/outpack_insert.R | 2 +- tests/testthat/test-location-path.R | 30 ++++++++++++++++++++++++++++ tests/testthat/test-outpack-packet.R | 12 +++++++++++ tests/testthat/test-util.R | 12 +++++++++++ 6 files changed, 57 insertions(+), 3 deletions(-) diff --git a/R/location.R b/R/location.R index d0223dd2..c7b2b13a 100644 --- a/R/location.R +++ b/R/location.R @@ -474,7 +474,7 @@ location_pull_metadata <- function(location_name, root, call) { "Please let us know how this might have happened."), i = hint_remove), call) - writeLines(metadata[[i]], filename[[i]]) + writeLines(metadata[[i]], filename[[i]], sep="") } } diff --git a/R/location_path.R b/R/location_path.R index 19d894d7..15ffd5bc 100644 --- a/R/location_path.R +++ b/R/location_path.R @@ -92,7 +92,7 @@ location_path_import_metadata <- function(str, hash, root) { root$files$get(meta$files$hash, dst, TRUE) } - writeLines(str, file.path(root$path, ".outpack", "metadata", id)) + writeLines(str, file.path(root$path, ".outpack", "metadata", id), sep="") time <- Sys.time() mark_packet_known(id, local, hash, time, root) } diff --git a/R/outpack_insert.R b/R/outpack_insert.R index e9ab53e6..2129e22f 100644 --- a/R/outpack_insert.R +++ b/R/outpack_insert.R @@ -34,7 +34,7 @@ outpack_insert_packet <- function(path, json, root = NULL) { } path_meta <- file.path(root$path, ".outpack", "metadata", id) - writeLines(json, path_meta) + writeLines(json, path_meta, sep="") ## TODO: once we get more flexible remotes, this will get moved into ## its own thing. diff --git a/tests/testthat/test-location-path.R b/tests/testthat/test-location-path.R index 5a483279..a66e6e3e 100644 --- a/tests/testthat/test-location-path.R +++ b/tests/testthat/test-location-path.R @@ -319,3 +319,33 @@ test_that("Push single packet", { files_used <- lapply(id, function(id) client$index$metadata(id)$files$hash) expect_setequal(plan$files, unique(unlist(files_used, FALSE, FALSE))) }) + + +test_that("Can read metadata files with a trailing newline", { + # Past versions of orderly2 wrote metadata files with a trailing newline + # character, despite the fact that the newline was not included when hashing. + # + # This has been fixed by not writing the newline anymore, but for + # compatibility we need to ensure we can still read those metadata files and + # get a correct hash. + + root <- create_temporary_root() + id <- create_random_packet(root) + path <- file.path(root$path, ".outpack", "metadata", id) + + # Calling writeLines adds the trailing newline and mimicks the old orderly2 + # behaviour. + old_size <- file.info(path)$size + writeLines(read_string(path), path) + expect_equal(file.info(path)$size, old_size + 1) + + # Reading the metadata from a location at that path correctly strips the + # newline and hashes correctly. + loc <- orderly_location_path$new(root$path) + packets <- loc$list() + data <- loc$metadata(id) + expect_equal(nchar(data), old_size, ignore_attr = TRUE) + + expected_hash <- packets[packets$packet == id]$hash + expect_no_error(hash_validate_data(data, expected_hash)) +}) diff --git a/tests/testthat/test-outpack-packet.R b/tests/testthat/test-outpack-packet.R index 41ec5adf..2478e516 100644 --- a/tests/testthat/test-outpack-packet.R +++ b/tests/testthat/test-outpack-packet.R @@ -823,3 +823,15 @@ test_that("can overwrite dependencies", { hash_file(file.path(path_src, "data.rds")), hash_file(file.path(root$path, "archive", "data", id, "data.rds"))) }) + + +test_that("metadata files match their hash", { + root <- create_temporary_root() + id <- create_random_packet(root) + + location <- root$index$location(local) + expected_hash <- location[location$packet == id]$hash + + path <- file.path(root$path, ".outpack", "metadata", id) + expect_no_error(hash_validate_file(expected_hash, path)) +}) diff --git a/tests/testthat/test-util.R b/tests/testthat/test-util.R index f06bc747..e2fb228d 100644 --- a/tests/testthat/test-util.R +++ b/tests/testthat/test-util.R @@ -321,3 +321,15 @@ test_that("can gracefully cope with rds save failure", { "some error") expect_equal(dir(tmp), character()) }) + + +test_that("read_string strips newlines", { + path <- tempfile() + writeLines(c("", "12345678"), path) + + # 8 characters, a leading newline and a trailing one + expect_equal(file.info(path)$size, 10) + + result <- expect_silent(read_string(path)) + expect_equal(result, "12345678") +})