Skip to content

Commit

Permalink
comments & review
Browse files Browse the repository at this point in the history
  • Loading branch information
sebastiantia committed Feb 19, 2025
1 parent a0fae19 commit 72562e0
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 13 deletions.
11 changes: 6 additions & 5 deletions kernel/src/log_segment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -231,9 +231,10 @@ impl LogSegment {

/// Returns an iterator over checkpoint data, processing sidecar files when necessary.
///
/// Checkpoint data is returned directly if:
/// - Processing a multi-part checkpoint
/// - Schema does not contain file actions
/// By default, `create_checkpoint_stream` checks for the presence of sidecar files, and
/// reads their contents if present. Checking for sidecar files is skipped if:
/// - The checkpoint is a multi-part checkpoint
/// - The checkpoint read schema does not contain the add action
///
/// For single-part checkpoints, any referenced sidecar files are processed. These
/// sidecar files contain the actual add actions that would otherwise be
Expand Down Expand Up @@ -288,8 +289,8 @@ impl LogSegment {
// This closure maps the checkpoint batch to an iterator of batches
// by chaining the checkpoint batch with sidecar batches if they exist.

// 1. In the case where the schema does not contain add/remove actions, we return the checkpoint
// batch directly as sidecar files only have to be read when the schema contains add/remove actions.
// 1. In the case where the schema does not contain the add action, we return the checkpoint
// batch directly as sidecar files only have to be read when the schema contains the add action.
// 2. Multi-part checkpoint batches never have sidecar actions, so the batch is returned as-is.
let sidecar_content = if need_add_actions && checkpoint_parts.len() == 1 {
Self::process_sidecars(
Expand Down
20 changes: 12 additions & 8 deletions kernel/src/log_segment/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ use url::Url;

use crate::actions::visitors::AddVisitor;
use crate::actions::{
get_log_add_schema, get_log_schema, Add, Sidecar, ADD_NAME, METADATA_NAME, SIDECAR_NAME,
get_log_add_schema, get_log_schema, Add, Sidecar, ADD_NAME, METADATA_NAME, REMOVE_NAME,
SIDECAR_NAME,
};
use crate::engine::arrow_data::ArrowEngineData;
use crate::engine::default::executor::tokio::TokioBackgroundExecutor;
Expand Down Expand Up @@ -788,12 +789,12 @@ fn test_checkpoint_batch_with_sidecars_returns_sidecar_batches() -> DeltaResult<

add_sidecar_to_store(
&store,
add_batch_simple(get_log_schema().clone()),
add_batch_simple(get_log_schema().project(&[ADD_NAME, REMOVE_NAME])?),
"sidecarfile1.parquet",
)?;
add_sidecar_to_store(
&store,
add_batch_with_remove(get_log_schema().clone()),
add_batch_with_remove(get_log_schema().project(&[ADD_NAME, REMOVE_NAME])?),
"sidecarfile2.parquet",
)?;

Expand Down Expand Up @@ -1084,9 +1085,12 @@ fn test_create_checkpoint_stream_reads_json_checkpoint_batch_without_sidecars()
Ok(())
}

// Encapsulates logic that has already been tested but tests the interaction between the functions,
// such as performing a map operation on the returned sidecar batches from `process_sidecars`
// to include the is_log_batch flag
// Tests the end-to-end process of creating a checkpoint stream.
// Verifies that:
// - The checkpoint file is read and produces batches containing references to sidecar files.
// - As sidecar references are present, the corresponding sidecar files are processed correctly.
// - Batches from both the checkpoint file and sidecar files are returned.
// - Each returned batch is correctly flagged with is_log_batch set to false
#[test]
fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar_batches(
) -> DeltaResult<()> {
Expand All @@ -1108,12 +1112,12 @@ fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar_batch

add_sidecar_to_store(
&store,
add_batch_simple(get_log_schema().clone()),
add_batch_simple(get_log_schema().project(&[ADD_NAME, REMOVE_NAME])?),
"sidecarfile1.parquet",
)?;
add_sidecar_to_store(
&store,
add_batch_with_remove(get_log_schema().clone()),
add_batch_with_remove(get_log_schema().project(&[ADD_NAME, REMOVE_NAME])?),
"sidecarfile2.parquet",
)?;

Expand Down

0 comments on commit 72562e0

Please sign in to comment.