From d6951c8656ada6e66697887632cfb2ef0d3c8d4f Mon Sep 17 00:00:00 2001 From: Will Manning Date: Thu, 26 Sep 2024 16:04:40 +0100 Subject: [PATCH] fix: edge case in filling ALP encoded child on patches (#939) Realized that there's an unhandled edge case in #924, [commented here](https://github.com/spiraldb/vortex/pull/924/files#r1776099681) Essentially, on develop, if we have two chunks and the first chunk is all patches and the second chunk has 0 patches, then the patched values won't get filled in the encoded array. Not the end of the world (they're presumably full of integer approximations that don't round-trip), but if it's a case of outlier large values that are getting patched, then the encoded values will end up bitpacking poorly. This PR fixes that. --- encodings/alp/src/alp.rs | 51 +++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/encodings/alp/src/alp.rs b/encodings/alp/src/alp.rs index a7f6a8b105..710b8b25b3 100644 --- a/encodings/alp/src/alp.rs +++ b/encodings/alp/src/alp.rs @@ -176,6 +176,30 @@ fn encode_chunk_unchecked( let chunk_patch_count = chunk_patch_count; // immutable hereafter assert_eq!(encoded_output.len(), num_prev_encoded + chunk.len()); + if chunk_patch_count > 0 { + // we need to gather the patches for this chunk + // preallocate space for the patches (plus one because our loop may attempt to write one past the end) + patch_indices.reserve(chunk_patch_count + 1); + patch_values.reserve(chunk_patch_count + 1); + + // record the patches in this chunk + let patch_indices_mut = patch_indices.spare_capacity_mut(); + let patch_values_mut = patch_values.spare_capacity_mut(); + let mut chunk_patch_index = 0; + for i in num_prev_encoded..encoded_output.len() { + let decoded = T::decode_single(encoded_output[i], exp); + // write() is only safe to call more than once because the values are primitive (i.e., Drop is a no-op) + patch_indices_mut[chunk_patch_index].write(i as u64); + patch_values_mut[chunk_patch_index].write(chunk[i - num_prev_encoded]); + chunk_patch_index += (decoded != chunk[i - num_prev_encoded]) as usize; + } + assert_eq!(chunk_patch_index, chunk_patch_count); + unsafe { + patch_indices.set_len(num_prev_patches + chunk_patch_count); + patch_values.set_len(num_prev_patches + chunk_patch_count); + } + } + // find the first successfully encoded value (i.e., not patched) // this is our fill value for missing values if fill_value.is_none() && (num_prev_encoded + chunk_patch_count < encoded_output.len()) { @@ -188,33 +212,6 @@ fn encode_chunk_unchecked( } } - // if there are no patches, we are done - if chunk_patch_count == 0 { - return; - } - - // we need to gather the patches for this chunk - // preallocate space for the patches (plus one because our loop may attempt to write one past the end) - patch_indices.reserve(chunk_patch_count + 1); - patch_values.reserve(chunk_patch_count + 1); - - // record the patches in this chunk - let patch_indices_mut = patch_indices.spare_capacity_mut(); - let patch_values_mut = patch_values.spare_capacity_mut(); - let mut chunk_patch_index = 0; - for i in num_prev_encoded..encoded_output.len() { - let decoded = T::decode_single(encoded_output[i], exp); - // write() is only safe to call more than once because the values are primitive (i.e., Drop is a no-op) - patch_indices_mut[chunk_patch_index].write(i as u64); - patch_values_mut[chunk_patch_index].write(chunk[i - num_prev_encoded]); - chunk_patch_index += (decoded != chunk[i - num_prev_encoded]) as usize; - } - assert_eq!(chunk_patch_index, chunk_patch_count); - unsafe { - patch_indices.set_len(num_prev_patches + chunk_patch_count); - patch_values.set_len(num_prev_patches + chunk_patch_count); - } - // replace the patched values in the encoded array with the fill value // for better downstream compression if let Some(fill_value) = fill_value {