From 846d0730b561841168610dc249e6bbe63901f806 Mon Sep 17 00:00:00 2001 From: Paul Lietar Date: Wed, 20 Mar 2024 09:44:42 +0000 Subject: [PATCH 1/4] Don't add a trailing newline to metadata files. By default, the base writeLines function adds a trailing newline to the end of the file, even when the input data did not contain any. While this is generally an acceptable behaviour, in the case of metadata files, we also hash the data and embed it in location files. Because the hashing happens over the in-memory data, it does not include the trailing newline. The hash of the on-disk file therefore ends up being different from the hash stored in the location file. It hasn't caused any practical issues because we trim leading and trailing whitespaces when reading the files back, getting the original serialized metadata that had been hashed. Similarly, when pushing a metadata file to a remote server, the client trims whitespaces from the metadata file. The behaviour is nevertheless confusing and it would be better to have consitent contents and hashes all around. The old behaviour of trimming whitespaces on read is preserved, as we must maintain compatibility with old files that do contain the newline. --- R/location.R | 2 +- R/location_path.R | 2 +- R/outpack_insert.R | 2 +- tests/testthat/test-location-path.R | 30 ++++++++++++++++++++++++++++ tests/testthat/test-outpack-packet.R | 12 +++++++++++ tests/testthat/test-util.R | 12 +++++++++++ 6 files changed, 57 insertions(+), 3 deletions(-) diff --git a/R/location.R b/R/location.R index d0223dd2..c7b2b13a 100644 --- a/R/location.R +++ b/R/location.R @@ -474,7 +474,7 @@ location_pull_metadata <- function(location_name, root, call) { "Please let us know how this might have happened."), i = hint_remove), call) - writeLines(metadata[[i]], filename[[i]]) + writeLines(metadata[[i]], filename[[i]], sep="") } } diff --git a/R/location_path.R b/R/location_path.R index 19d894d7..15ffd5bc 100644 --- a/R/location_path.R +++ b/R/location_path.R @@ -92,7 +92,7 @@ location_path_import_metadata <- function(str, hash, root) { root$files$get(meta$files$hash, dst, TRUE) } - writeLines(str, file.path(root$path, ".outpack", "metadata", id)) + writeLines(str, file.path(root$path, ".outpack", "metadata", id), sep="") time <- Sys.time() mark_packet_known(id, local, hash, time, root) } diff --git a/R/outpack_insert.R b/R/outpack_insert.R index e9ab53e6..2129e22f 100644 --- a/R/outpack_insert.R +++ b/R/outpack_insert.R @@ -34,7 +34,7 @@ outpack_insert_packet <- function(path, json, root = NULL) { } path_meta <- file.path(root$path, ".outpack", "metadata", id) - writeLines(json, path_meta) + writeLines(json, path_meta, sep="") ## TODO: once we get more flexible remotes, this will get moved into ## its own thing. diff --git a/tests/testthat/test-location-path.R b/tests/testthat/test-location-path.R index 5a483279..a66e6e3e 100644 --- a/tests/testthat/test-location-path.R +++ b/tests/testthat/test-location-path.R @@ -319,3 +319,33 @@ test_that("Push single packet", { files_used <- lapply(id, function(id) client$index$metadata(id)$files$hash) expect_setequal(plan$files, unique(unlist(files_used, FALSE, FALSE))) }) + + +test_that("Can read metadata files with a trailing newline", { + # Past versions of orderly2 wrote metadata files with a trailing newline + # character, despite the fact that the newline was not included when hashing. + # + # This has been fixed by not writing the newline anymore, but for + # compatibility we need to ensure we can still read those metadata files and + # get a correct hash. + + root <- create_temporary_root() + id <- create_random_packet(root) + path <- file.path(root$path, ".outpack", "metadata", id) + + # Calling writeLines adds the trailing newline and mimicks the old orderly2 + # behaviour. + old_size <- file.info(path)$size + writeLines(read_string(path), path) + expect_equal(file.info(path)$size, old_size + 1) + + # Reading the metadata from a location at that path correctly strips the + # newline and hashes correctly. + loc <- orderly_location_path$new(root$path) + packets <- loc$list() + data <- loc$metadata(id) + expect_equal(nchar(data), old_size, ignore_attr = TRUE) + + expected_hash <- packets[packets$packet == id]$hash + expect_no_error(hash_validate_data(data, expected_hash)) +}) diff --git a/tests/testthat/test-outpack-packet.R b/tests/testthat/test-outpack-packet.R index 41ec5adf..2478e516 100644 --- a/tests/testthat/test-outpack-packet.R +++ b/tests/testthat/test-outpack-packet.R @@ -823,3 +823,15 @@ test_that("can overwrite dependencies", { hash_file(file.path(path_src, "data.rds")), hash_file(file.path(root$path, "archive", "data", id, "data.rds"))) }) + + +test_that("metadata files match their hash", { + root <- create_temporary_root() + id <- create_random_packet(root) + + location <- root$index$location(local) + expected_hash <- location[location$packet == id]$hash + + path <- file.path(root$path, ".outpack", "metadata", id) + expect_no_error(hash_validate_file(expected_hash, path)) +}) diff --git a/tests/testthat/test-util.R b/tests/testthat/test-util.R index f06bc747..e2fb228d 100644 --- a/tests/testthat/test-util.R +++ b/tests/testthat/test-util.R @@ -321,3 +321,15 @@ test_that("can gracefully cope with rds save failure", { "some error") expect_equal(dir(tmp), character()) }) + + +test_that("read_string strips newlines", { + path <- tempfile() + writeLines(c("", "12345678"), path) + + # 8 characters, a leading newline and a trailing one + expect_equal(file.info(path)$size, 10) + + result <- expect_silent(read_string(path)) + expect_equal(result, "12345678") +}) From 241127abd5020b174732a51415ae09f0336e8286 Mon Sep 17 00:00:00 2001 From: Paul Lietar Date: Wed, 3 Apr 2024 13:19:43 +0100 Subject: [PATCH 2/4] fix test --- tests/testthat/test-outpack-packet.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-outpack-packet.R b/tests/testthat/test-outpack-packet.R index 2478e516..32c06648 100644 --- a/tests/testthat/test-outpack-packet.R +++ b/tests/testthat/test-outpack-packet.R @@ -833,5 +833,5 @@ test_that("metadata files match their hash", { expected_hash <- location[location$packet == id]$hash path <- file.path(root$path, ".outpack", "metadata", id) - expect_no_error(hash_validate_file(expected_hash, path)) + expect_no_error(hash_validate_file(path, expected_hash)) }) From 6527497b2a01c0b2bc1b9cfc4bd3d09096a8dbc5 Mon Sep 17 00:00:00 2001 From: Paul Lietar Date: Wed, 3 Apr 2024 13:34:57 +0100 Subject: [PATCH 3/4] style fix --- R/location.R | 2 +- R/location_path.R | 2 +- R/outpack_insert.R | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/R/location.R b/R/location.R index c7b2b13a..e25d4088 100644 --- a/R/location.R +++ b/R/location.R @@ -474,7 +474,7 @@ location_pull_metadata <- function(location_name, root, call) { "Please let us know how this might have happened."), i = hint_remove), call) - writeLines(metadata[[i]], filename[[i]], sep="") + writeLines(metadata[[i]], filename[[i]], sep = "") } } diff --git a/R/location_path.R b/R/location_path.R index 15ffd5bc..daea4594 100644 --- a/R/location_path.R +++ b/R/location_path.R @@ -92,7 +92,7 @@ location_path_import_metadata <- function(str, hash, root) { root$files$get(meta$files$hash, dst, TRUE) } - writeLines(str, file.path(root$path, ".outpack", "metadata", id), sep="") + writeLines(str, file.path(root$path, ".outpack", "metadata", id), sep = "") time <- Sys.time() mark_packet_known(id, local, hash, time, root) } diff --git a/R/outpack_insert.R b/R/outpack_insert.R index 2129e22f..44101e24 100644 --- a/R/outpack_insert.R +++ b/R/outpack_insert.R @@ -34,7 +34,7 @@ outpack_insert_packet <- function(path, json, root = NULL) { } path_meta <- file.path(root$path, ".outpack", "metadata", id) - writeLines(json, path_meta, sep="") + writeLines(json, path_meta, sep = "") ## TODO: once we get more flexible remotes, this will get moved into ## its own thing. From 36facd6aff5cc89c9923003594e24df3503c04bf Mon Sep 17 00:00:00 2001 From: Paul Lietar Date: Wed, 3 Apr 2024 13:42:48 +0100 Subject: [PATCH 4/4] Fix tests on windows. --- tests/testthat/test-location-path.R | 5 +++-- tests/testthat/test-util.R | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/testthat/test-location-path.R b/tests/testthat/test-location-path.R index a66e6e3e..a951dd6c 100644 --- a/tests/testthat/test-location-path.R +++ b/tests/testthat/test-location-path.R @@ -334,10 +334,11 @@ test_that("Can read metadata files with a trailing newline", { path <- file.path(root$path, ".outpack", "metadata", id) # Calling writeLines adds the trailing newline and mimicks the old orderly2 - # behaviour. + # behaviour. The size will be one or two bytes bigger than the actual data, + # depending on whether the newline is `\n` or `\r\n`. old_size <- file.info(path)$size writeLines(read_string(path), path) - expect_equal(file.info(path)$size, old_size + 1) + expect_gte(file.info(path)$size, old_size + 1) # Reading the metadata from a location at that path correctly strips the # newline and hashes correctly. diff --git a/tests/testthat/test-util.R b/tests/testthat/test-util.R index e2fb228d..bac860c5 100644 --- a/tests/testthat/test-util.R +++ b/tests/testthat/test-util.R @@ -328,7 +328,8 @@ test_that("read_string strips newlines", { writeLines(c("", "12345678"), path) # 8 characters, a leading newline and a trailing one - expect_equal(file.info(path)$size, 10) + # Each newline may be one of two bytes each, depending on the platform. + expect_gte(file.info(path)$size, 10) result <- expect_silent(read_string(path)) expect_equal(result, "12345678")