From d6951c8656ada6e66697887632cfb2ef0d3c8d4f Mon Sep 17 00:00:00 2001
From: Will Manning <will@spiraldb.com>
Date: Thu, 26 Sep 2024 16:04:40 +0100
Subject: [PATCH] fix: edge case in filling ALP encoded child on patches (#939)

Realized that there's an unhandled edge case in #924, [commented
here](https://github.com/spiraldb/vortex/pull/924/files#r1776099681)

Essentially, on develop, if we have two chunks and the first chunk is
all patches and the second chunk has 0 patches, then the patched values
won't get filled in the encoded array. Not the end of the world (they're
presumably full of integer approximations that don't round-trip), but if
it's a case of outlier large values that are getting patched, then the
encoded values will end up bitpacking poorly.

This PR fixes that.
---
 encodings/alp/src/alp.rs | 51 +++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 27 deletions(-)

diff --git a/encodings/alp/src/alp.rs b/encodings/alp/src/alp.rs
index a7f6a8b105..710b8b25b3 100644
--- a/encodings/alp/src/alp.rs
+++ b/encodings/alp/src/alp.rs
@@ -176,6 +176,30 @@ fn encode_chunk_unchecked<T: ALPFloat>(
     let chunk_patch_count = chunk_patch_count; // immutable hereafter
     assert_eq!(encoded_output.len(), num_prev_encoded + chunk.len());
 
+    if chunk_patch_count > 0 {
+        // we need to gather the patches for this chunk
+        // preallocate space for the patches (plus one because our loop may attempt to write one past the end)
+        patch_indices.reserve(chunk_patch_count + 1);
+        patch_values.reserve(chunk_patch_count + 1);
+
+        // record the patches in this chunk
+        let patch_indices_mut = patch_indices.spare_capacity_mut();
+        let patch_values_mut = patch_values.spare_capacity_mut();
+        let mut chunk_patch_index = 0;
+        for i in num_prev_encoded..encoded_output.len() {
+            let decoded = T::decode_single(encoded_output[i], exp);
+            // write() is only safe to call more than once because the values are primitive (i.e., Drop is a no-op)
+            patch_indices_mut[chunk_patch_index].write(i as u64);
+            patch_values_mut[chunk_patch_index].write(chunk[i - num_prev_encoded]);
+            chunk_patch_index += (decoded != chunk[i - num_prev_encoded]) as usize;
+        }
+        assert_eq!(chunk_patch_index, chunk_patch_count);
+        unsafe {
+            patch_indices.set_len(num_prev_patches + chunk_patch_count);
+            patch_values.set_len(num_prev_patches + chunk_patch_count);
+        }
+    }
+
     // find the first successfully encoded value (i.e., not patched)
     // this is our fill value for missing values
     if fill_value.is_none() && (num_prev_encoded + chunk_patch_count < encoded_output.len()) {
@@ -188,33 +212,6 @@ fn encode_chunk_unchecked<T: ALPFloat>(
         }
     }
 
-    // if there are no patches, we are done
-    if chunk_patch_count == 0 {
-        return;
-    }
-
-    // we need to gather the patches for this chunk
-    // preallocate space for the patches (plus one because our loop may attempt to write one past the end)
-    patch_indices.reserve(chunk_patch_count + 1);
-    patch_values.reserve(chunk_patch_count + 1);
-
-    // record the patches in this chunk
-    let patch_indices_mut = patch_indices.spare_capacity_mut();
-    let patch_values_mut = patch_values.spare_capacity_mut();
-    let mut chunk_patch_index = 0;
-    for i in num_prev_encoded..encoded_output.len() {
-        let decoded = T::decode_single(encoded_output[i], exp);
-        // write() is only safe to call more than once because the values are primitive (i.e., Drop is a no-op)
-        patch_indices_mut[chunk_patch_index].write(i as u64);
-        patch_values_mut[chunk_patch_index].write(chunk[i - num_prev_encoded]);
-        chunk_patch_index += (decoded != chunk[i - num_prev_encoded]) as usize;
-    }
-    assert_eq!(chunk_patch_index, chunk_patch_count);
-    unsafe {
-        patch_indices.set_len(num_prev_patches + chunk_patch_count);
-        patch_values.set_len(num_prev_patches + chunk_patch_count);
-    }
-
     // replace the patched values in the encoded array with the fill value
     // for better downstream compression
     if let Some(fill_value) = fill_value {