From 524af05d3d7feb15f3f9e5f98d47ed4cd62297f9 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Wed, 9 Aug 2023 14:56:37 -0400
Subject: [PATCH 01/32] Prototype TopK operator

---
 datafusion/core/src/physical_plan/mod.rs      |   2 +
 .../core/src/physical_plan/sorts/sort.rs      |  77 ++-
 datafusion/core/src/physical_plan/topk/mod.rs | 516 ++++++++++++++++++
 .../tests/sqllogictests/test_files/aal.slt    | 202 +++++++
 .../tests/sqllogictests/test_files/window.slt | 101 ++--
 5 files changed, 825 insertions(+), 73 deletions(-)
 create mode 100644 datafusion/core/src/physical_plan/topk/mod.rs
 create mode 100644 datafusion/core/tests/sqllogictests/test_files/aal.slt

diff --git a/datafusion/core/src/physical_plan/mod.rs b/datafusion/core/src/physical_plan/mod.rs
index c73e61aea190..c60dbd6a44c5 100644
--- a/datafusion/core/src/physical_plan/mod.rs
+++ b/datafusion/core/src/physical_plan/mod.rs
@@ -17,6 +17,7 @@
 
 //! Traits for physical query plan, supporting parallel execution for partitioned relations.
 
+mod topk;
 mod visitor;
 pub use self::metrics::Metric;
 use self::metrics::MetricsSet;
@@ -27,6 +28,7 @@ use crate::datasource::physical_plan::FileScanConfig;
 use crate::physical_plan::expressions::PhysicalSortExpr;
 use datafusion_common::Result;
 pub use datafusion_common::{ColumnStatistics, Statistics};
+pub use topk::TopK;
 pub use visitor::{accept, visit_execution_plan, ExecutionPlanVisitor};
 
 use arrow::datatypes::SchemaRef;
diff --git a/datafusion/core/src/physical_plan/sorts/sort.rs b/datafusion/core/src/physical_plan/sorts/sort.rs
index 52936dc55e6e..9f6ff0fb6d03 100644
--- a/datafusion/core/src/physical_plan/sorts/sort.rs
+++ b/datafusion/core/src/physical_plan/sorts/sort.rs
@@ -26,6 +26,7 @@ use crate::physical_plan::metrics::{
 };
 use crate::physical_plan::sorts::merge::streaming_merge;
 use crate::physical_plan::stream::{RecordBatchReceiverStream, RecordBatchStreamAdapter};
+use crate::physical_plan::topk::TopK;
 use crate::physical_plan::{
     DisplayAs, DisplayFormatType, Distribution, EmptyRecordBatchStream, ExecutionPlan,
     Partitioning, SendableRecordBatchStream, Statistics,
 };
@@ -759,7 +760,12 @@ impl DisplayAs for SortExec {
         let expr: Vec<String> = self.expr.iter().map(|e| e.to_string()).collect();
         match self.fetch {
             Some(fetch) => {
-                write!(f, "SortExec: fetch={fetch}, expr=[{}]", expr.join(","))
+                write!(
+                    f,
+                    // TODO should this say topk?
+                    "SortExec: fetch={fetch}, expr=[{}]",
+                    expr.join(",")
+                )
             }
             None => write!(f, "SortExec: expr=[{}]", expr.join(",")),
         }
@@ -847,29 +853,54 @@ impl ExecutionPlan for SortExec {
 
         trace!("End SortExec's input.execute for partition: {}", partition);
 
-        let mut sorter = ExternalSorter::new(
-            partition,
-            input.schema(),
-            self.expr.clone(),
-            context.session_config().batch_size(),
-            self.fetch,
-            execution_options.sort_spill_reservation_bytes,
-            execution_options.sort_in_place_threshold_bytes,
-            &self.metrics_set,
-            context.runtime_env(),
-        );
+        if let Some(fetch) = self.fetch.as_ref() {
+            let mut topk = TopK::try_new(
+                partition,
+                input.schema(),
+                self.expr.clone(),
+                *fetch,
+                context.session_config().batch_size(),
+                context.runtime_env(),
+                &self.metrics_set,
+                partition,
+            )?;
+
+            Ok(Box::pin(RecordBatchStreamAdapter::new(
+                self.schema(),
+                futures::stream::once(async move {
+                    while let Some(batch) = input.next().await {
+                        let batch = batch?;
+                        topk.insert_batch(batch)?;
+                    }
+                    topk.emit()
+                })
+                .try_flatten(),
+            )))
+        } else {
+            let mut sorter = ExternalSorter::new(
+                partition,
+                input.schema(),
+                self.expr.clone(),
+                context.session_config().batch_size(),
+                self.fetch,
+                execution_options.sort_spill_reservation_bytes,
+                execution_options.sort_in_place_threshold_bytes,
+                &self.metrics_set,
+                context.runtime_env(),
+            );
 
-        Ok(Box::pin(RecordBatchStreamAdapter::new(
-            self.schema(),
-            futures::stream::once(async move {
-                while let Some(batch) = input.next().await {
-                    let batch = batch?;
-                    sorter.insert_batch(batch).await?;
-                }
-                sorter.sort()
-            })
-            .try_flatten(),
-        )))
+            Ok(Box::pin(RecordBatchStreamAdapter::new(
+                self.schema(),
+                futures::stream::once(async move {
+                    while let Some(batch) = input.next().await {
+                        let batch = batch?;
+                        sorter.insert_batch(batch).await?;
+                    }
+                    sorter.sort()
+                })
+                .try_flatten(),
+            )))
+        }
     }
 
     fn metrics(&self) -> Option<MetricsSet> {
diff --git a/datafusion/core/src/physical_plan/topk/mod.rs b/datafusion/core/src/physical_plan/topk/mod.rs
new file mode 100644
index 000000000000..d626f0806698
--- /dev/null
+++ b/datafusion/core/src/physical_plan/topk/mod.rs
@@ -0,0 +1,516 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
//! TopK: Combination of Sort / LIMIT

use arrow::{
    compute::interleave,
    row::{OwnedRow, RowConverter, Rows, SortField},
};
use std::{cmp::Ordering, sync::Arc};

use arrow_array::{Array, ArrayRef, RecordBatch};
use arrow_schema::SchemaRef;
use datafusion_common::Result;
use datafusion_execution::{
    memory_pool::{MemoryConsumer, MemoryReservation},
    runtime_env::RuntimeEnv,
};
use datafusion_physical_expr::PhysicalSortExpr;
use hashbrown::HashMap;

use crate::physical_plan::{stream::RecordBatchStreamAdapter, SendableRecordBatchStream};

use super::metrics::{BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricBuilder};

/// Global TopK
///
/// # Background
///
/// "Top K" is a common query optimization used for queries such as
/// "find the top 3 customers by revenue". The (simplified) SQL for
/// such a query might be:
///
/// ```sql
/// SELECT customer_id, revenue FROM 'sales.csv' ORDER BY revenue DESC limit 3;
/// ```
///
/// The simple plan would be:
///
/// ```
/// > explain SELECT customer_id, revenue FROM sales ORDER BY revenue DESC limit 3;
/// +--------------+----------------------------------------+
/// | plan_type    | plan                                   |
/// +--------------+----------------------------------------+
/// | logical_plan | Limit: 3                               |
/// |              |   Sort: revenue DESC NULLS FIRST       |
/// |              |     Projection: customer_id, revenue   |
/// |              |       TableScan: sales                 |
/// +--------------+----------------------------------------+
/// ```
///
/// While this plan produces the correct answer, it fully sorts the
/// input before discarding everything other than the top 3 elements.
///
/// The same answer can be produced by simply keeping track of the top
/// N elements, reducing the total amount of required buffer memory.
///
/// # Structure
///
/// This operator tracks the top K items using a `TopKHeap`.
pub struct TopK {
    /// schema of the output (and the input)
    schema: SchemaRef,
    /// Runtime metrics
    metrics: TopKMetrics,
    /// Memory reservation for this operator
    reservation: MemoryReservation,
    /// The target number of rows for output batches
    batch_size: usize,
    /// sort expressions
    expr: Arc<[PhysicalSortExpr]>,
    /// row converter, for sort keys
    row_converter: RowConverter,
    /// scratch space for converting rows
    scratch_rows: Rows,
    /// stores the top k values and their sort key values, in order
    heap: TopKHeap,
}

impl TopK {
    /// Create a new [`TopK`] that stores the top `k` values, as
    /// defined by the sort expressions in `expr`.
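    ///
    /// A rough sketch of how a caller drives the operator (illustrative
    /// only: `input`, `sort_exprs`, and the other bindings here are
    /// assumed, mirroring the `SortExec` integration above rather than
    /// documenting a tested API):
    ///
    /// ```ignore
    /// let mut topk = TopK::try_new(
    ///     partition, schema, sort_exprs, 3, batch_size, runtime, &metrics, partition,
    /// )?;
    /// while let Some(batch) = input.next().await {
    ///     topk.insert_batch(batch?)?; // keep only the best 3 rows seen so far
    /// }
    /// let stream = topk.emit()?; // top 3 rows, in sort order
    /// ```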
    // TODO: make a builder or some other nicer API to avoid the
    // clippy warning
    #[allow(clippy::too_many_arguments)]
    pub fn try_new(
        partition_id: usize,
        schema: SchemaRef,
        expr: Vec<PhysicalSortExpr>,
        k: usize,
        batch_size: usize,
        runtime: Arc<RuntimeEnv>,
        metrics: &ExecutionPlanMetricsSet,
        partition: usize,
    ) -> Result<Self> {
        let reservation = MemoryConsumer::new(format!("TopK[{partition_id}]"))
            .register(&runtime.memory_pool);

        let expr: Arc<[PhysicalSortExpr]> = expr.into();

        let sort_fields: Vec<_> = expr
            .iter()
            .map(|e| {
                Ok(SortField::new_with_options(
                    e.expr.data_type(&schema)?,
                    e.options,
                ))
            })
            .collect::<Result<Vec<_>>>()?;

        let row_converter = RowConverter::new(sort_fields)?;
        let scratch_rows = row_converter.empty_rows(
            batch_size,
            20 * batch_size, // guesstimate 20 bytes per row
        );

        Ok(Self {
            schema,
            metrics: TopKMetrics::new(metrics, partition),
            reservation,
            batch_size,
            expr,
            row_converter,
            scratch_rows,
            heap: TopKHeap::new(k),
        })
    }

    /// Insert `batch`, remembering it if any of its values are among
    /// the top k seen so far.
    pub fn insert_batch(&mut self, batch: RecordBatch) -> Result<()> {
        // updates the elapsed_compute metric on drop
        let _timer = self.metrics.baseline.elapsed_compute().timer();

        let sort_keys: Vec<ArrayRef> = self
            .expr
            .iter()
            .map(|expr| {
                let value = expr.expr.evaluate(&batch)?;
                Ok(value.into_array(batch.num_rows()))
            })
            .collect::<Result<Vec<_>>>()?;

        // reuse existing `Rows` to avoid reallocations
        let rows = &mut self.scratch_rows;
        rows.clear();
        self.row_converter.append(rows, &sort_keys)?;

        // TODO make this algorithmically better?:
        // 1. only check topk values in rows
        // 2. only do one update through top_k

        let mut batch_entry = self.heap.register_batch(batch);
        for (index, row) in rows.iter().enumerate() {
            match self.heap.k_largest() {
                // heap has k items, and the current row is not smaller
                // than the largest of them: skip
                Some(largest) if largest.row.row() <= row => {}
                // don't yet have k items, or the new item is smaller
                // than the largest of the current top k
                None | Some(_) => {
                    self.heap.add(&mut batch_entry, row.owned(), index);
                    self.metrics.row_replacements.add(1);
                }
            }
        }
        self.heap.insert_batch_entry(batch_entry);

        // update memory reservation
        self.reservation.try_resize(self.size())?;
        Ok(())
    }

    /// Returns the top k results broken into `batch_size` [`RecordBatch`]es
    pub fn emit(self) -> Result<SendableRecordBatchStream> {
        let Self {
            schema,
            metrics,
            reservation: _,
            batch_size,
            expr: _,
            row_converter: _,
            scratch_rows: _,
            heap,
        } = self;
        let _timer = metrics.baseline.elapsed_compute().timer(); // time updated on drop

        let mut batch = heap.emit(schema.clone())?;
        metrics.baseline.output_rows().add(batch.num_rows());

        // break into record batches as needed
        let mut batches = vec![];
        loop {
            if batch.num_rows() <= batch_size {
                batches.push(Ok(batch));
                break;
            } else {
                batches.push(Ok(batch.slice(0, batch_size)));
                let remaining = batch.num_rows() - batch_size;
                batch = batch.slice(batch_size, remaining);
            }
        }
        Ok(Box::pin(RecordBatchStreamAdapter::new(
            schema,
            futures::stream::iter(batches),
        )))
    }

    /// return the size of memory used by this operator, in bytes
    fn size(&self) -> usize {
        std::mem::size_of::<Self>()
            + self.row_converter.size()
            + self.scratch_rows.size()
            + self.heap.size()
    }
}

struct TopKMetrics {
    /// common baseline metrics (elapsed compute, output rows)
    pub baseline: BaselineMetrics,

    /// count of how many rows were replaced in the heap
    pub row_replacements: Count,
}

impl TopKMetrics {
    fn new(metrics: &ExecutionPlanMetricsSet, partition: usize) -> Self {
        Self {
            baseline: BaselineMetrics::new(metrics, partition),
            row_replacements: MetricBuilder::new(metrics)
                .counter("row_replacements", partition),
        }
    }
}

/// This structure keeps at most the *smallest* k items, using the
/// [arrow::row] format for sort keys. While it is called "topK", for
/// values like `1, 2, 3, 4, 5` the "top 3" really means the
/// *smallest* 3, `1, 2, 3`, not the *largest* 3, `3, 4, 5`.
///
/// Using the `Row` format handles things such as ascending vs
/// descending and nulls first vs nulls last.
///
/// It doesn't use `BinaryHeap` from the Rust standard library because
/// it is important to check the current minimum value in the heap
/// prior to creating a new value to insert.
struct TopKHeap {
    /// The maximum size of this heap.
    k: usize,
    /// Storage for at most `k` items, in ascending order. `inner[0]`
    /// holds the smallest value of the smallest k so far, and
    /// `inner[len-1]` holds the largest value of the smallest k so far.
    inner: Vec<TopKRow>,
    /// Stores the original row values (`TopKRow` only has the sort key)
    store: RecordBatchStore,
    /// The size of all `OwnedRow`s held by this heap
    owned_row_bytes: usize,
}

impl TopKHeap {
    fn new(k: usize) -> Self {
        assert!(k > 0);
        Self {
            k,
            inner: Vec::with_capacity(k),
            store: RecordBatchStore::new(),
            owned_row_bytes: 0,
        }
    }

    /// Register a [`RecordBatch`] with the heap, returning the
    /// appropriate entry
    pub fn register_batch(&mut self, batch: RecordBatch) -> RecordBatchEntry {
        self.store.register(batch)
    }

    /// Insert a [`RecordBatchEntry`] created by a previous call to
    /// [`Self::register_batch`] into storage.
    pub fn insert_batch_entry(&mut self, entry: RecordBatchEntry) {
        self.store.insert(entry)
    }

    /// Returns the largest value stored by the heap if there are k
    /// items, otherwise returns None
    fn k_largest(&self) -> Option<&TopKRow> {
        if self.inner.len() < self.k {
            None
        } else {
            self.inner.last()
        }
    }

    /// Adds `row` to this heap. If inserting this new item would
    /// increase the size past `k`, removes the previously largest item.
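    ///
    /// Worked example (values assumed for illustration): with `k = 3`
    /// and sort keys `[1, 4, 9]` stored, adding `7` inserts it in
    /// sorted position and pops `9`, leaving `[1, 4, 7]`. A key such
    /// as `10` never reaches `add`: `insert_batch` first consults
    /// [`Self::k_largest`] and skips keys that cannot beat the current
    /// top k.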
    fn add(&mut self, batch_entry: &mut RecordBatchEntry, row: OwnedRow, index: usize) {
        assert!(self.inner.len() <= self.k);

        batch_entry.uses += 1;

        self.owned_row_bytes += owned_row_size(&row);

        // put the new row into the correct location to maintain that
        // self.inner is sorted in ascending order
        let insertion_point = self
            .inner
            .partition_point(|current_row| current_row.row <= row);
        self.inner.insert(
            insertion_point,
            TopKRow {
                row,
                batch_id: batch_entry.id,
                index,
            },
        );

        // limit size to k items
        if self.inner.len() > self.k {
            // The popped entry is the largest of the k+1 items;
            // decrement the use count of the batch it came from
            if let Some(prev_max) = self.inner.pop() {
                if prev_max.batch_id == batch_entry.id {
                    batch_entry.uses -= 1;
                } else {
                    self.store.unuse(prev_max.batch_id);
                }
                // update memory accounting
                let prev_size = owned_row_size(&prev_max.row);
                assert!(self.owned_row_bytes >= prev_size);
                self.owned_row_bytes -= prev_size;
            }
        }
    }

    /// Returns the values stored in this heap, from values low to high, as a single
    /// [`RecordBatch`]
    pub fn emit(&self, schema: SchemaRef) -> Result<RecordBatch> {
        // Indices for each row within its respective RecordBatch
        let indices: Vec<_> = self
            .inner
            .iter()
            .enumerate()
            .map(|(i, k)| (i, k.index))
            .collect();

        let num_columns = {
            let Some(first_value) = self.inner.get(0) else {
                return Ok(RecordBatch::new_empty(schema));
            };
            self.store
                .get(first_value.batch_id)
                .expect("invalid batch id")
                .batch
                .num_columns()
        };

        // build the output columns one at a time, using the
        // `interleave` kernel to pick rows from different arrays
        let output_columns: Vec<_> = (0..num_columns)
            .map(|col| {
                let input_arrays: Vec<_> = self
                    .inner
                    .iter()
                    .map(|k| {
                        let entry =
                            self.store.get(k.batch_id).expect("invalid stored batch id");
                        entry.batch.column(col) as &dyn Array
                    })
                    .collect();

                // at this point `indices` contains indexes within the
                // rows and `input_arrays` contains a reference to the
                // relevant Array for that index. `interleave` pulls
                // them together into a single new array
                Ok(interleave(&input_arrays, &indices)?)
            })
            .collect::<Result<Vec<_>>>()?;

        Ok(RecordBatch::try_new(schema, output_columns)?)
    }

    /// return the size of memory used by this heap, in bytes
    fn size(&self) -> usize {
        std::mem::size_of::<Self>()
            + (self.inner.capacity() * std::mem::size_of::<TopKRow>())
            + self.store.size()
            + self.owned_row_bytes
    }
}

/// Size of memory owned by `row` until row::size() is available
/// TODO file upstream ticket in arrow-rs to add this
fn owned_row_size(row: &OwnedRow) -> usize {
    std::mem::size_of_val(row) + row.as_ref().len() // underlying data, doesn't account for capacity
}

/// Represents one of the top K rows. Orders according to `OwnedRow`
#[derive(Debug, PartialEq)]
struct TopKRow {
    /// the value of the sort key for this row
    row: OwnedRow,
    /// the index in this record batch the row came from
    index: usize,
    /// the RecordBatch this row came from: an id into a [`RecordBatchStore`]
    batch_id: u32,
}

impl Eq for TopKRow {}

impl PartialOrd for TopKRow {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for TopKRow {
    fn cmp(&self, other: &Self) -> Ordering {
        self.row.cmp(&other.row)
    }
}

#[derive(Debug)]
struct RecordBatchEntry {
    id: u32,
    batch: RecordBatch,
    // for this batch, how many times has it been used
    uses: usize,
}

/// This structure tracks [`RecordBatch`] by an id so that:
///
/// 1. The batches can be tracked via an id that can be copied cheaply
/// 2. The total memory held by all batches is tracked
#[derive(Debug)]
struct RecordBatchStore {
    /// id generator
    next_id: u32,
    /// storage
    batches: HashMap<u32, RecordBatchEntry>,
    /// total size of all record batches tracked by this store
    batches_size: usize,
}

impl RecordBatchStore {
    fn new() -> Self {
        Self {
            next_id: 0,
            batches: HashMap::new(),
            batches_size: 0,
        }
    }

    /// Register this batch with the store and assign an ID. No
    /// attempt is made to compare this batch to other batches
    pub fn register(&mut self, batch: RecordBatch) -> RecordBatchEntry {
        let id = self.next_id;
        self.next_id += 1;
        RecordBatchEntry { id, batch, uses: 0 }
    }

    /// Insert a record batch entry into this store, tracking its
    /// memory use, if it has any uses
    pub fn insert(&mut self, entry: RecordBatchEntry) {
        // uses of 0 means that none of the rows in the batch were stored in the topk
        if entry.uses > 0 {
            self.batches_size += entry.batch.get_array_memory_size();
            self.batches.insert(entry.id, entry);
        }
    }

    fn get(&self, id: u32) -> Option<&RecordBatchEntry> {
        self.batches.get(&id)
    }

    /// remove a use from the specified batch id. If the use count
    /// reaches zero the batch entry is removed from the store
    ///
    /// panics if there were no remaining uses of id
    pub fn unuse(&mut self, id: u32) {
        let remove = if let Some(batch_entry) = self.batches.get_mut(&id) {
            batch_entry.uses = batch_entry.uses.checked_sub(1).expect("underflow");
            batch_entry.uses == 0
        } else {
            panic!("No entry for id {id}");
        };

        if remove {
            let old_entry = self.batches.remove(&id).unwrap();
            self.batches_size = self
                .batches_size
                .checked_sub(old_entry.batch.get_array_memory_size())
                .unwrap();
        }
    }

    /// returns the size of memory used by this store, including all
    /// referenced `RecordBatch`es, in bytes
    pub fn size(&self) -> usize {
        std::mem::size_of::<Self>()
            + self.batches.capacity()
                * (std::mem::size_of::<u32>() + std::mem::size_of::<RecordBatchEntry>())
            + self.batches_size
    }
}
diff --git a/datafusion/core/tests/sqllogictests/test_files/aal.slt b/datafusion/core/tests/sqllogictests/test_files/aal.slt
new file mode 100644
index 000000000000..f19c79e8d1cb
--- /dev/null
+++ b/datafusion/core/tests/sqllogictests/test_files/aal.slt
@@ -0,0 +1,202 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.
You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Tests for development + +statement ok +create table aal(x int) as values (10), (2), (3), (0), (5), (4), (3), (2), (1), (3), (8); + +query I +select * from aal order by x; +---- +0 +1 +2 +2 +3 +3 +3 +4 +5 +8 +10 + +query I +select * from aal order by x limit 3; +---- +0 +1 +2 + +query I +select * from aal order by x desc limit 3; +---- +10 +8 +5 + + + + +statement ok +CREATE EXTERNAL TABLE aggregate_test_100 ( + c1 VARCHAR NOT NULL, + c2 TINYINT NOT NULL, + c3 SMALLINT NOT NULL, + c4 SMALLINT, + c5 INT, + c6 BIGINT NOT NULL, + c7 SMALLINT NOT NULL, + c8 INT NOT NULL, + c9 BIGINT UNSIGNED NOT NULL, + c10 VARCHAR NOT NULL, + c11 FLOAT NOT NULL, + c12 DOUBLE NOT NULL, + c13 VARCHAR NOT NULL +) +STORED AS CSV +WITH HEADER ROW +LOCATION '../../testing/data/csv/aggregate_test_100.csv' + +query TT +explain select * from aggregate_test_100 ORDER BY c13 desc limit 5; +---- +logical_plan +Limit: skip=0, fetch=5 +--Sort: aggregate_test_100.c13 DESC NULLS FIRST, fetch=5 +----TableScan: aggregate_test_100 projection=[c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13] +physical_plan +GlobalLimitExec: skip=0, fetch=5 +--SortExec: fetch=5, expr=[c13@12 DESC] +----CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13], has_header=true + + + + +query T +select c13 from aggregate_test_100 ORDER BY c13; +---- +0VVIHzxWtNOFLtnhjHEKjXaJOSLJfm +0keZ5G8BffGwgF2RwQD59TFzMStxCB +0og6hSkhbX8AC1ktFS4kounvTzy8Vo +1aOcrEGd0cOqZe2I5XBOm0nDcwtBZO +2T3wSlHdEmASmO0xcXHnndkKEt6bz8 +3BEOHQsMEFZ58VcNTOJYShTBpAPzbt +4HX6feIvmNXBN7XGqgO4YVBkhu8GDI +4JznSdBajNWhu4hRQwjV1FjTTxY68i +52mKlRE3aHCBZtjECq6sY9OqVf8Dze +56MZa5O1hVtX4c5sbnCfxuX5kDChqI +6FPJlLAcaQ5uokyOWZ9HGdLZObFvOZ +6WfVFBVGJSQb7FhA7E0lBwdvjfZnSW +6oIXZuIPIqEoPBvFmbt2Nxy3tryGUE +6x93sxYioWuq5c9Kkk8oTAAORM7cH0 +802bgTGl6Bk5TlkPYYTxp5JkKyaYUA +8LIh0b6jmDGm87BmIyjdxNIpX4ugjD +90gAtmGEeIqUTbo1ZrxCvWtsseukXC +9UbObCsVkmYpJGcGrgfK90qOnwb2Lj +AFGCj7OWlEB5QfniEFgonMq90Tq5uH +ALuRhobVWbnQTTWZdSOk0iVe8oYFhW +Amn2K87Db5Es3dFQO9cw9cvpAM6h35 +AyYVExXK6AR2qUTxNZ7qRHQOVGMLcz +BJqx5WokrmrrezZA0dUbleMYkG5U2O +BPtQMxnuSPpxMExYV9YkDa6cAN7GP3 +BsM5ZAYifRh5Lw3Y8X1r53I0cTJnfE +C2GT5KVyOPZpgKVl110TyZO0NcJ434 +DuJNG8tufSqW0ZstHqWj3aGvFLMg4A +EcCuckwsF3gV1Ecgmh5v4KM8g1ozif +ErJFw6hzZ5fmI5r8bhE4JzlscnhKZU +F7NSTjWvQJyBburN7CXRUlbgp2dIrA +Fi4rJeTQq4eXj8Lxg3Hja5hBVTVV5u +H5j5ZHy1FGesOAHjkQEDYCucbpKWRu +HKSMQ9nTnwXCJIte1JrM1dtYnDtJ8g +IWl0G3ZlMNf7WT8yjIB49cx7MmYOmr +IZTkHMLvIKuiLjhDjYMmIHxh166we4 +Ig1QcuKsjHXkproePdERo2w0mYzIqd +JHNgc2UCaiXOdmkxwDDyGhRlO0mnBQ +JN0VclewmjwYlSl8386MlWv5rEhWCz +JafwVLSVk5AVoXFuzclesQ000EE2k1 +KJFcmTVjdkCMv94wYCtfHMFhzyRsmH +Ktb7GQ0N1DrxwkCkEUsTaIXk0xYinn +Ld2ej8NEv5zNcqU60FwpHeZKBhfpiV +LiEBxds3X0Uw0lxiYjDqrkAaAwoiIW +MXhhH1Var3OzzJCtI9VNyYvA0q8UyJ +MeSTAXq8gVxVjbEjgkvU9YLte0X9uE +NEhyk8uIx4kEULJGa8qIyFjjBcP2G6 +O66j6PaYuZhEUtqV6fuU7TyjM2WxC5 +OF7fQ37GzaZ5ikA2oMyvleKtgnLjXh +OPwBqCEK5PWTjWaiOyL45u2NLTaDWv +Oq6J4Rx6nde0YlhOIJkFsX2MsSvAQ0 +Ow5PGpfTm4dXCfTDsXAOTatXRoAydR +QEHVvcP8gxI6EMJIrvcnIhgzPNjIvv +QJYm7YRA3YetcBHI5wkMZeLXVmfuNy 
+QYlaIAnJA6r8rlAb6f59wcxvcPcWFf +RilTlL1tKkPOUFuzmLydHAVZwv1OGl +Sfx0vxv1skzZWT1PqVdoRDdO6Sb6xH +TTQUwpMNSXZqVBKAFvXu7OlWvKXJKX +TtDKUZxzVxsq758G6AWPSYuZgVgbcl +VDhtJkYjAYPykCgOU9x3v7v3t4SO1a +VY0zXmXeksCT8BzvpzpPLbmU9Kp9Y4 +Vp3gmWunM5A7wOC9YW2JroFqTWjvTi +WHmjWk2AY4c6m7DA4GitUx6nmb1yYS +XemNcT1xp61xcM1Qz3wZ1VECCnq06O +Z2sWcQr0qyCJRMHDpRy3aQr7PkHtkK +aDxBtor7Icd9C5hnTvvw5NrIre740e +akiiY5N0I44CMwEnBL6RTBk7BRkxEj +b3b9esRhTzFEawbs6XhpKnD9ojutHB +bgK1r6v3BCTh0aejJUhkA1Hn6idXGp +cBGc0kSm32ylBDnxogG727C0uhZEYZ +cq4WSAIFwx3wwTUS5bp1wCe71R6U5I +dVdvo6nUD5FgCgsbOZLds28RyGTpnx +e2Gh6Ov8XkXoFdJWhl0EjwEHlMDYyG +f9ALCzwDAKmdu7Rk2msJaB1wxe5IBX +fuyvs0w7WsKSlXqJ1e6HFSoLmx03AG +gTpyQnEODMcpsPnJMZC66gh33i3m0b +gpo8K5qtYePve6jyPt6xgJx4YOVjms +gxfHWUF8XgY2KdFxigxvNEXe2V2XMl +i6RQVXKUh7MzuGMDaNclUYnFUAireU +ioEncce3mPOXD2hWhpZpCPWGATG6GU +jQimhdepw3GKmioWUlVSWeBVRKFkY3 +l7uwDoTepWwnAP0ufqtHJS3CRi7RfP +lqhzgLsXZ8JhtpeeUWWNbMz8PHI705 +m6jD0LBIQWaMfenwRCTANI9eOdyyto +mhjME0zBHbrK6NMkytMTQzOssOa1gF +mzbkwXKrPeZnxg2Kn1LRF5hYSsmksS +nYVJnVicpGRqKZibHyBAmtmzBXAFfT +oHJMNvWuunsIMIWFnYG31RCfkOo2V7 +oLZ21P2JEDooxV1pU31cIxQHEeeoLu +okOkcWflkNXIy4R8LzmySyY1EC3sYd +pLk3i59bZwd5KBZrI1FiweYTd5hteG +pTeu0WMjBRTaNRT15rLCuEh3tBJVc5 +qnPOOmslCJaT45buUisMRnM0rc77EK +t6fQUjJejPcjc04wHvHTPe55S65B4V +ukOiFGGFnQJDHFgZxHMpvhD3zybF0M +ukyD7b0Efj7tNlFSRmzZ0IqkEzg2a8 +waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs +wwXqSGKLyBQyPkonlzBNYUJTCo4LRS +xipQ93429ksjNcXPX5326VSg1xJZcW +y7C453hRWd4E7ImjNDWlpexB8nUqjh +ydkwycaISlYSlEq3TlkS2m15I2pcp8 + + +query TIIIIIIIITRRT +select * from aggregate_test_100 ORDER BY c13 desc limit 5; +---- +a 4 -38 20744 762932956 308913475857409919 7 45465 1787652631 878137512938218976 0.7459874 0.021825780392 ydkwycaISlYSlEq3TlkS2m15I2pcp8 +d 1 -98 13630 -1991133944 1184110014998006843 220 2986 225513085 9634106610243643486 0.89651865 0.164088254508 y7C453hRWd4E7ImjNDWlpexB8nUqjh +e 2 52 -12056 -1090239422 9011500141803970147 238 4168 2013662838 12565360638488684051 0.6694766 0.391444365692 xipQ93429ksjNcXPX5326VSg1xJZcW +d 1 -72 25590 1188089983 3090286296481837049 241 832 3542840110 5885937420286765261 0.41980565 0.215354023438 wwXqSGKLyBQyPkonlzBNYUJTCo4LRS +a 1 -5 12636 794623392 2909750622865366631 15 24022 2669374863 4776679784701509574 0.29877836 0.253725340799 waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs diff --git a/datafusion/core/tests/sqllogictests/test_files/window.slt b/datafusion/core/tests/sqllogictests/test_files/window.slt index cd257aaa92de..45a3bb583450 100644 --- a/datafusion/core/tests/sqllogictests/test_files/window.slt +++ b/datafusion/core/tests/sqllogictests/test_files/window.slt @@ -2597,6 +2597,7 @@ SELECT # test_source_sorted_builtin query TT EXPLAIN SELECT + ts, FIRST_VALUE(inc_col) OVER(ORDER BY ts RANGE BETWEEN 10 PRECEDING and 1 FOLLOWING) as fv1, FIRST_VALUE(inc_col) OVER(ORDER BY ts ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as fv2, LAST_VALUE(inc_col) OVER(ORDER BY ts RANGE BETWEEN 10 PRECEDING and 1 FOLLOWING) as lv1, @@ -2626,24 +2627,23 @@ EXPLAIN SELECT LIMIT 5; ---- logical_plan -Projection: fv1, fv2, lv1, lv2, nv1, nv2, rn1, rn2, rank1, rank2, dense_rank1, dense_rank2, lag1, lag2, lead1, lead2, fvr1, fvr2, lvr1, lvr2, lagr1, lagr2, leadr1, leadr2 ---Limit: skip=0, fetch=5 -----Sort: annotated_data_finite.ts DESC NULLS FIRST, fetch=5 -------Projection: FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING AS fv1, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS 
BETWEEN 10 PRECEDING AND 1 FOLLOWING AS fv2, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING AS lv1, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS lv2, NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING AS nv1, NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS nv2, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING AS rn1, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS rn2, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING AS rank1, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS rank2, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING AS dense_rank1, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS dense_rank2, LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING AS lag1, LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS lag2, LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING AS lead1, LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS lead2, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING AS fvr1, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS fvr2, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING AS lvr1, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS lvr2, LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING AS lagr1, LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS lagr2, LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING AS leadr1, LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS leadr2, annotated_data_finite.ts ---------WindowAggr: windowExpr=[[FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, 
LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, NTH_VALUE(annotated_data_finite.inc_col, Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, NTH_VALUE(annotated_data_finite.inc_col, Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, LAG(annotated_data_finite.inc_col, Int64(1), Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, LAG(annotated_data_finite.inc_col, Int64(2), Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, LEAD(annotated_data_finite.inc_col, Int64(-1), Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, LEAD(annotated_data_finite.inc_col, Int64(4), Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING]] -----------WindowAggr: windowExpr=[[FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, LAG(annotated_data_finite.inc_col, Int64(1), Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, LAG(annotated_data_finite.inc_col, Int64(2), Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, LEAD(annotated_data_finite.inc_col, Int64(-1), Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, LEAD(annotated_data_finite.inc_col, Int64(4), Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING]] -------------TableScan: annotated_data_finite projection=[ts, inc_col] +Limit: skip=0, fetch=5 +--Sort: annotated_data_finite.ts DESC NULLS FIRST, fetch=5 +----Projection: annotated_data_finite.ts, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING AS fv1, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS fv2, LAST_VALUE(annotated_data_finite.inc_col) 
ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING AS lv1, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS lv2, NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING AS nv1, NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS nv2, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING AS rn1, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS rn2, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING AS rank1, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS rank2, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING AS dense_rank1, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS dense_rank2, LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING AS lag1, LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS lag2, LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING AS lead1, LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS lead2, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING AS fvr1, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS fvr2, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING AS lvr1, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS lvr2, LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING AS lagr1, LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS lagr2, LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING AS leadr1, LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS leadr2 +------WindowAggr: windowExpr=[[FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 
FOLLOWING, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, NTH_VALUE(annotated_data_finite.inc_col, Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, NTH_VALUE(annotated_data_finite.inc_col, Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, LAG(annotated_data_finite.inc_col, Int64(1), Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, LAG(annotated_data_finite.inc_col, Int64(2), Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, LEAD(annotated_data_finite.inc_col, Int64(-1), Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, LEAD(annotated_data_finite.inc_col, Int64(4), Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING]] +--------WindowAggr: windowExpr=[[FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, LAG(annotated_data_finite.inc_col, Int64(1), Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, LAG(annotated_data_finite.inc_col, Int64(2), Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, LEAD(annotated_data_finite.inc_col, Int64(-1), Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, LEAD(annotated_data_finite.inc_col, Int64(4), Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING]] +----------TableScan: annotated_data_finite projection=[ts, inc_col] physical_plan -ProjectionExec: expr=[fv1@0 as fv1, fv2@1 as fv2, lv1@2 as lv1, lv2@3 as lv2, nv1@4 as nv1, nv2@5 as nv2, rn1@6 as rn1, rn2@7 as rn2, rank1@8 as rank1, rank2@9 as rank2, dense_rank1@10 as dense_rank1, dense_rank2@11 as dense_rank2, lag1@12 as lag1, lag2@13 as lag2, lead1@14 as lead1, lead2@15 as lead2, fvr1@16 as fvr1, fvr2@17 as fvr2, lvr1@18 as lvr1, lvr2@19 as lvr2, lagr1@20 as lagr1, lagr2@21 as lagr2, leadr1@22 as leadr1, leadr2@23 as leadr2] ---GlobalLimitExec: skip=0, fetch=5 -----SortExec: fetch=5, expr=[ts@24 DESC] -------ProjectionExec: 
expr=[FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@10 as fv1, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@11 as fv2, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@12 as lv1, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@13 as lv2, NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@14 as nv1, NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@15 as nv2, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@16 as rn1, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@17 as rn2, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@18 as rank1, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@19 as rank2, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@20 as dense_rank1, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@21 as dense_rank2, LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@22 as lag1, LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@23 as lag2, LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@24 as lead1, LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@25 as lead2, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@2 as fvr1, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@3 as fvr2, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@4 as lvr1, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@5 as lvr2, LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@6 as lagr1, LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@7 as lagr2, LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@8 as leadr1, LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@9 as leadr2, ts@0 as ts] 
---------BoundedWindowAggExec: wdw=[FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "ROW_NUMBER() ORDER BY 
[annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, 
LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }], mode=[Sorted] -----------BoundedWindowAggExec: wdw=[FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)) }, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)) }, LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)) }, 
LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)) }], mode=[Sorted] -------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_1.csv]]}, projection=[ts, inc_col], output_ordering=[ts@0 ASC NULLS LAST], has_header=true +GlobalLimitExec: skip=0, fetch=5 +--SortExec: fetch=5, expr=[ts@0 DESC] +----ProjectionExec: expr=[ts@0 as ts, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@10 as fv1, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@11 as fv2, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@12 as lv1, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@13 as lv2, NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@14 as nv1, NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@15 as nv2, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@16 as rn1, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@17 as rn2, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@18 as rank1, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@19 as rank2, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@20 as dense_rank1, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@21 as dense_rank2, LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@22 as lag1, LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@23 as lag2, LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@24 as lead1, LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS 
LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@25 as lead2, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@2 as fvr1, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@3 as fvr2, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@4 as lvr1, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@5 as lvr2, LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@6 as lagr1, LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@7 as lagr2, LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@8 as leadr1, LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@9 as leadr2] +------BoundedWindowAggExec: wdw=[FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] 
RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] 
RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }], mode=[Sorted] +--------BoundedWindowAggExec: wdw=[FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)) }, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS 
BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)) }, LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)) }, LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)) }], mode=[Sorted] +----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_1.csv]]}, projection=[ts, inc_col], output_ordering=[ts@0 ASC NULLS LAST], has_header=true -query IIIIIIIIIIIIIIIIIIIIIIII +query IIIIIIIIIIIIIIIIIIIIIIIII SELECT + ts, FIRST_VALUE(inc_col) OVER(ORDER BY ts RANGE BETWEEN 10 PRECEDING and 1 FOLLOWING) as fv1, FIRST_VALUE(inc_col) OVER(ORDER BY ts ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as fv2, LAST_VALUE(inc_col) OVER(ORDER BY ts RANGE BETWEEN 10 PRECEDING and 1 FOLLOWING) as lv1, @@ -2672,11 +2672,11 @@ SELECT ORDER BY ts DESC LIMIT 5; ---- -289 269 305 305 305 283 100 100 99 99 86 86 301 296 301 1004 305 305 301 301 1001 1002 1001 289 -289 266 305 305 305 278 99 99 99 99 86 86 296 291 296 1004 305 305 301 296 305 1002 305 286 -289 261 296 301 NULL 275 98 98 98 98 85 85 291 289 291 1004 305 305 296 291 301 305 301 283 -286 259 291 296 NULL 272 97 97 97 97 84 84 289 286 289 1004 305 305 291 289 296 301 296 278 -275 254 289 291 289 269 96 96 96 96 83 83 286 283 286 305 305 305 289 286 291 296 291 275 +264 289 266 305 305 305 278 99 99 99 99 86 86 296 291 296 1004 305 305 301 296 305 1002 305 286 +264 289 269 305 305 305 283 100 100 99 99 86 86 301 296 301 1004 305 305 301 301 
1001 1002 1001 289 +262 289 261 296 301 NULL 275 98 98 98 98 85 85 291 289 291 1004 305 305 296 291 301 305 301 283 +258 286 259 291 296 NULL 272 97 97 97 97 84 84 289 286 289 1004 305 305 291 289 296 301 296 278 +254 275 254 289 291 289 269 96 96 96 96 83 83 286 283 286 305 305 305 289 286 291 296 291 275 # test_source_sorted_unbounded_preceding @@ -3197,8 +3197,9 @@ drop table annotated_data_infinite2 # window3 spec is not used in window functions. # The query should still work. -query RR +query IRR SELECT + C3, MAX(c12) OVER window1, MIN(c12) OVER window2 as max1 FROM aggregate_test_100 @@ -3208,14 +3209,15 @@ SELECT ORDER BY C3 LIMIT 5 ---- -0.970671228336 0.970671228336 -0.850672105305 0.850672105305 -0.152498292972 0.152498292972 -0.369363046006 0.369363046006 -0.56535284223 0.56535284223 +-117 0.850672105305 0.850672105305 +-117 0.970671228336 0.970671228336 +-111 0.152498292972 0.152498292972 +-107 0.369363046006 0.369363046006 +-106 0.56535284223 0.56535284223 query TT EXPLAIN SELECT + C3, MAX(c12) OVER window1 as min1, MIN(c12) OVER window2 as max1 FROM aggregate_test_100 @@ -3226,30 +3228,29 @@ EXPLAIN SELECT LIMIT 5 ---- logical_plan -Projection: min1, max1 ---Limit: skip=0, fetch=5 -----Sort: aggregate_test_100.c3 ASC NULLS LAST, fetch=5 -------Projection: MAX(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS min1, MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS max1, aggregate_test_100.c3 ---------WindowAggr: windowExpr=[[MAX(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] -----------Projection: aggregate_test_100.c3, aggregate_test_100.c12, MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING -------------WindowAggr: windowExpr=[[MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] ---------------TableScan: aggregate_test_100 projection=[c3, c11, c12] +Limit: skip=0, fetch=5 +--Sort: aggregate_test_100.c3 ASC NULLS LAST, fetch=5 +----Projection: aggregate_test_100.c3, MAX(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS min1, MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS max1 +------WindowAggr: windowExpr=[[MAX(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] +--------Projection: aggregate_test_100.c3, aggregate_test_100.c12, MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING +----------WindowAggr: windowExpr=[[MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] +------------TableScan: aggregate_test_100 projection=[c3, c11, c12] physical_plan -ProjectionExec: expr=[min1@0 as min1, max1@1 as max1] ---GlobalLimitExec: skip=0, fetch=5 -----SortExec: fetch=5, expr=[c3@2 ASC NULLS LAST] -------ProjectionExec: expr=[MAX(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as min1, MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED 
PRECEDING AND UNBOUNDED FOLLOWING@2 as max1, c3@0 as c3] ---------BoundedWindowAggExec: wdw=[MAX(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "MAX(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Float64(NULL)), end_bound: CurrentRow }], mode=[Sorted] -----------SortExec: expr=[c12@1 ASC NULLS LAST] -------------ProjectionExec: expr=[c3@0 as c3, c12@2 as c12, MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@3 as MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING] ---------------WindowAggExec: wdw=[MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)) }] -----------------SortExec: expr=[c11@1 ASC NULLS LAST] -------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c3, c11, c12], has_header=true +GlobalLimitExec: skip=0, fetch=5 +--SortExec: fetch=5, expr=[c3@0 ASC NULLS LAST] +----ProjectionExec: expr=[c3@0 as c3, MAX(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as min1, MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@2 as max1] +------BoundedWindowAggExec: wdw=[MAX(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "MAX(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Float64(NULL)), end_bound: CurrentRow }], mode=[Sorted] +--------SortExec: expr=[c12@1 ASC NULLS LAST] +----------ProjectionExec: expr=[c3@0 as c3, c12@2 as c12, MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@3 as MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING] +------------WindowAggExec: wdw=[MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)) }] +--------------SortExec: expr=[c11@1 ASC NULLS LAST] +----------------CsvExec: file_groups={1 group: 
[[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c3, c11, c12], has_header=true # window1 spec is used multiple times under different aggregations. # The query should still work. -query RR +query IRR SELECT + C3, MAX(c12) OVER window1 as min1, MIN(c12) OVER window1 as max1 FROM aggregate_test_100 @@ -3257,11 +3258,11 @@ SELECT ORDER BY C3 LIMIT 5 ---- -0.970671228336 0.014793053078 -0.850672105305 0.014793053078 -0.152498292972 0.014793053078 -0.369363046006 0.014793053078 -0.56535284223 0.014793053078 +-117 0.850672105305 0.014793053078 +-117 0.970671228336 0.014793053078 +-111 0.152498292972 0.014793053078 +-107 0.369363046006 0.014793053078 +-106 0.56535284223 0.014793053078 query TT EXPLAIN SELECT From d4c09f283397fa817ee48c1ae285fea11c6de661 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 14 Aug 2023 14:45:28 -0400 Subject: [PATCH 02/32] Avoid use of Row --- datafusion/core/src/physical_plan/topk/mod.rs | 141 ++++++++++++------ 1 file changed, 94 insertions(+), 47 deletions(-) diff --git a/datafusion/core/src/physical_plan/topk/mod.rs b/datafusion/core/src/physical_plan/topk/mod.rs index d626f0806698..06d3ad33c161 100644 --- a/datafusion/core/src/physical_plan/topk/mod.rs +++ b/datafusion/core/src/physical_plan/topk/mod.rs @@ -19,7 +19,7 @@ use arrow::{ compute::interleave, - row::{OwnedRow, RowConverter, Rows, SortField}, + row::{RowConverter, Rows, SortField}, }; use std::{cmp::Ordering, sync::Arc}; @@ -169,11 +169,11 @@ impl TopK { match self.heap.k_largest() { // heap has k items, and the current row is not // smaller than the curret smallest k value, skip - Some(largest) if largest.row.row() <= row => {} + Some(largest) if largest.row.as_slice() <= row.as_ref() => {} // don't yet have k items or new item is greater than // current min top k None | Some(_) => { - self.heap.add(&mut batch_entry, row.owned(), index); + self.heap.add(&mut batch_entry, row, index); self.metrics.row_replacements.add(1); } } @@ -210,7 +210,8 @@ impl TopK { break; } else { batches.push(Ok(batch.slice(0, batch_size))); - batch = batch.slice(batch_size, batch.num_rows()); + let remaining_length = batch.num_rows() - batch_size; + batch = batch.slice(batch_size, remaining_length); } } Ok(Box::pin(RecordBatchStreamAdapter::new( @@ -266,8 +267,8 @@ struct TopKHeap { inner: Vec, /// Storage the original row values (TopKRow only has the sort key) store: RecordBatchStore, - /// The size of all `OwnedRows`s held by this heap - owned_row_bytes: usize, + /// The size of all owned data held by this heap + owned_bytes: usize, } impl TopKHeap { @@ -277,7 +278,7 @@ impl TopKHeap { k, inner: Vec::with_capacity(k), store: RecordBatchStore::new(), - owned_row_bytes: 0, + owned_bytes: 0, } } @@ -306,42 +307,44 @@ impl TopKHeap { /// Adds `row` to this heap. If inserting this new item would /// increase the size past `k`, removes the previously smallest /// item. 
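The `add` hunk below maintains a classic bounded top-k invariant over byte-comparable sort keys: reject a row cheaply when it cannot beat the current cutoff, otherwise insert it in sorted position and evict. A minimal standalone sketch of that invariant, using the sorted-buffer-plus-`partition_point` approach this revision takes; the names here are illustrative, not DataFusion's:

/// Illustrative only: keep the `k` smallest byte-comparable keys.
struct MiniTopK {
    k: usize,
    /// sorted ascending; the last entry is the current cutoff
    keys: Vec<Vec<u8>>,
}

impl MiniTopK {
    fn new(k: usize) -> Self {
        assert!(k > 0);
        Self { k, keys: Vec::with_capacity(k) }
    }

    fn add(&mut self, key: &[u8]) {
        // full and not below the cutoff: cannot be in the top k
        if self.keys.len() == self.k && key >= self.keys.last().unwrap().as_slice() {
            return;
        }
        // insert in sorted position, then drop the evicted largest entry
        let at = self.keys.partition_point(|cur| cur.as_slice() <= key);
        self.keys.insert(at, key.to_vec());
        self.keys.truncate(self.k);
    }
}

fn main() {
    let mut topk = MiniTopK::new(2);
    for key in [b"d".as_slice(), b"a".as_slice(), b"c".as_slice(), b"b".as_slice()] {
        topk.add(key);
    }
    // only the two smallest keys survive
    assert_eq!(topk.keys, vec![b"a".to_vec(), b"b".to_vec()]);
}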
-    fn add(&mut self, batch_entry: &mut RecordBatchEntry, row: OwnedRow, index: usize) {
+    fn add(
+        &mut self,
+        batch_entry: &mut RecordBatchEntry,
+        row: impl AsRef<[u8]>,
+        index: usize,
+    ) {
+        let batch_id = batch_entry.id;
+        batch_entry.uses += 1;
+
         assert!(self.inner.len() <= self.k);
+        let row = row.as_ref();
-        batch_entry.uses += 1;
+        // Reuse storage for evicted item if possible
+        let new_top_k = if self.inner.len() == self.k {
+            let prev_min = self.inner.pop().unwrap();
+
+            // Update batch use
+            if prev_min.batch_id == batch_entry.id {
+                batch_entry.uses -= 1;
+            } else {
+                self.store.unuse(prev_min.batch_id);
+            }
-        self.owned_row_bytes += owned_row_size(&row);
+            // update memory accounting
+            self.owned_bytes -= prev_min.owned_size();
+            prev_min.with_new_row(row, batch_id, index)
+        } else {
+            TopKRow::new(row, batch_id, index)
+        };
+
+        self.owned_bytes += new_top_k.owned_size();

         // put the new row into the correct location to maintain that
         // self.inner is sorted in descending order
         let insertion_point = self
             .inner
-            .partition_point(|current_row| current_row.row <= row);
-        self.inner.insert(
-            insertion_point,
-            TopKRow {
-                row,
-                batch_id: batch_entry.id,
-                index,
-            },
-        );
-
-        // limit size to k items
-        if self.inner.len() > self.k {
-            // If there was a previous minimum value, decrement its use
-            if let Some(prev_min) = self.inner.pop() {
-                if prev_min.batch_id == batch_entry.id {
-                    batch_entry.uses -= 1;
-                } else {
-                    self.store.unuse(prev_min.batch_id);
-                }
-                // update memory accounting
-                let prev_size = owned_row_size(&prev_min.row);
-                assert!(self.owned_row_bytes >= prev_size);
-                self.owned_row_bytes -= prev_size;
-            }
-        }
+            .partition_point(|current_row| current_row.row() <= row.as_ref());
+        self.inner.insert(insertion_point, new_top_k);
     }

     /// Returns the values stored in this heap, from values low to high, as a single
@@ -396,25 +399,69 @@ impl TopKHeap {
         std::mem::size_of::<Self>()
             + (self.inner.capacity() * std::mem::size_of::<TopKRow>())
             + self.store.size()
-            + self.owned_row_bytes
+            + self.owned_bytes
     }
 }

-/// Size of memory owned by `row` until row::size() is available
-/// TODO file upstream ticket in arrow-rs to add this
-fn owned_row_size(row: &OwnedRow) -> usize {
-    std::mem::size_of_val(row) + row.as_ref().len() // underlying data, doesn't account for capacity
-}
-
-/// Represents one of the top K rows. Orders according to `OwnedRow`
+/// Represents one of the top K rows held in this heap. Orders
+/// according to memcmp of row (e.g. the arrow Row format, but could
+/// also be primtive values)
+///
+/// Reuses allocations to minimize runtime overhead of creating new Vecs
 #[derive(Debug, PartialEq)]
 struct TopKRow {
-    /// the value of the sort key for this row
-    row: OwnedRow,
-    /// the index in this record batch the row came from
-    index: usize,
+    /// the value of the sort key for this row. This contains the
+    /// bytes that could be stored in `OwnedRow` but uses `Vec<u8>` to
+    /// reuse allocations.
+    row: Vec<u8>,
     /// the RecordBatch this row came from: an id into a [`RecordBatchStore`]
     batch_id: u32,
+    /// the index in this record batch the row came from
+    index: usize,
+}
+
+impl TopKRow {
+    /// Create a new TopKRow with new allocation
+    fn new(row: impl AsRef<[u8]>, batch_id: u32, index: usize) -> Self {
+        Self {
+            row: row.as_ref().to_vec(),
+            batch_id,
+            index,
+        }
+    }
+
+    /// Create a new TopKRow reusing the existing allocation
+    fn with_new_row(
+        self,
+        new_row: impl AsRef<[u8]>,
+        batch_id: u32,
+        index: usize,
+    ) -> Self {
+        let Self {
+            mut row,
+            batch_id: _,
+            index: _,
+        } = self;
+        row.clear();
+        row.extend_from_slice(new_row.as_ref());
+
+        Self {
+            row,
+            batch_id,
+            index,
+        }
+    }
+
+    /// Returns the number of bytes owned by this row in the heap (not
+    /// including itself)
+    fn owned_size(&self) -> usize {
+        self.row.capacity()
+    }
+
+    /// Returns a slice to the owned row value
+    fn row(&self) -> &[u8] {
+        self.row.as_slice()
+    }
 }

 impl Eq for TopKRow {}

From 948c1a2b6578749beecc8b3456a8af96b82ecd8b Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Tue, 22 Aug 2023 14:26:00 -0400
Subject: [PATCH 03/32] start working on compaction

---
 datafusion/core/src/physical_plan/mod.rs      |  2 +-
 datafusion/core/src/physical_plan/topk/mod.rs | 47 ++++++++++++-------
 2 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/datafusion/core/src/physical_plan/mod.rs b/datafusion/core/src/physical_plan/mod.rs
index f544d7cce5e5..ce13e46a7ec6 100644
--- a/datafusion/core/src/physical_plan/mod.rs
+++ b/datafusion/core/src/physical_plan/mod.rs
@@ -27,8 +27,8 @@ use self::{
 use crate::datasource::physical_plan::FileScanConfig;
 use crate::physical_plan::expressions::PhysicalSortExpr;
 use datafusion_common::Result;
-pub use topk::TopK;
 pub use datafusion_common::{internal_err, ColumnStatistics, Statistics};
+pub use topk::TopK;
 pub use visitor::{accept, visit_execution_plan, ExecutionPlanVisitor};

 use arrow::datatypes::SchemaRef;
diff --git a/datafusion/core/src/physical_plan/topk/mod.rs b/datafusion/core/src/physical_plan/topk/mod.rs
index 06d3ad33c161..2efb65fefff2 100644
--- a/datafusion/core/src/physical_plan/topk/mod.rs
+++ b/datafusion/core/src/physical_plan/topk/mod.rs
@@ -129,14 +129,14 @@ impl TopK {
         );

         Ok(Self {
-            schema,
+            schema: schema.clone(),
             metrics: TopKMetrics::new(metrics, partition),
             reservation,
             batch_size,
             expr,
             row_converter,
             scratch_rows,
-            heap: TopKHeap::new(k),
+            heap: TopKHeap::new(k, schema),
         })
     }

@@ -199,7 +199,7 @@ impl TopK {
         } = self;
         let _timer = metrics.baseline.elapsed_compute().timer(); // time updated on drop

-        let mut batch = heap.emit(schema.clone())?;
+        let mut batch = heap.emit()?;
         metrics.baseline.output_rows().add(batch.num_rows());

         // break into record batches as needed
@@ -259,7 +259,7 @@ impl TopKMetrics {
 /// it is important to check the current minimum value in the heap
 /// prior to creating a new value to insert.
 struct TopKHeap {
-    /// The maximum size of this heap.
+    /// The maximum number of elemenents to store in this heap.
     k: usize,
     /// Storage for up at most `k` items, in ascending
     /// order.
`inner[0]` holds the smallest value of the smallest k @@ -272,12 +272,12 @@ struct TopKHeap { } impl TopKHeap { - fn new(k: usize) -> Self { + fn new(k: usize, schema: SchemaRef) -> Self { assert!(k > 0); Self { k, inner: Vec::with_capacity(k), - store: RecordBatchStore::new(), + store: RecordBatchStore::new(schema), owned_bytes: 0, } } @@ -349,7 +349,9 @@ impl TopKHeap { /// Returns the values stored in this heap, from values low to high, as a single /// [`RecordBatch`] - pub fn emit(&self, schema: SchemaRef) -> Result { + pub fn emit(&self) -> Result { + let schema = self.store.schema().clone(); + // Indicies for each row within its respective RecordBatch let indicies: Vec<_> = self .inner @@ -358,16 +360,7 @@ impl TopKHeap { .map(|(i, k)| (i, k.index)) .collect(); - let num_columns = { - let Some(first_value) = self.inner.get(0) else { - return Ok(RecordBatch::new_empty(schema)); - }; - self.store - .get(first_value.batch_id) - .expect("invalid batch id") - .batch - .num_columns() - }; + let num_columns = schema.fields().len(); // build the output columns one at time, using the // `interleave` kernel to pick rows from different arrays @@ -394,6 +387,11 @@ impl TopKHeap { Ok(RecordBatch::try_new(schema, output_columns)?) } + /// Compact this heap, rewriting all stored batches + fn compact(&mut self) { + //let new_batch = self.emit( + } + /// return the size of memory used by this heap, in bytes fn size(&self) -> usize { std::mem::size_of::() @@ -498,14 +496,17 @@ struct RecordBatchStore { batches: HashMap, /// total size of all record batches tracked by this store batches_size: usize, + /// schema of the batches + schema: SchemaRef, } impl RecordBatchStore { - fn new() -> Self { + fn new(schema: SchemaRef) -> Self { Self { next_id: 0, batches: HashMap::new(), batches_size: 0, + schema, } } @@ -531,6 +532,16 @@ impl RecordBatchStore { self.batches.get(&id) } + /// returns the total number of batches stored in this store + fn len(&self) -> usize { + self.batches.len() + } + + /// return the schema of batches stored + fn schema(&self) -> &SchemaRef { + &self.schema + } + /// remove a use from the specified batch id. If the use count /// reaches zero the batch entry is removed from the store /// From 354d687caf8ad0e75fa27e28779844842a535724 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 22 Aug 2023 15:03:31 -0400 Subject: [PATCH 04/32] checkpoint --- datafusion/core/src/physical_plan/topk/mod.rs | 56 ++++++++++++++++++- datafusion/sqllogictest/test_files/aal.slt | 15 +++++ 2 files changed, 68 insertions(+), 3 deletions(-) diff --git a/datafusion/core/src/physical_plan/topk/mod.rs b/datafusion/core/src/physical_plan/topk/mod.rs index 2efb65fefff2..e6993b70d4c6 100644 --- a/datafusion/core/src/physical_plan/topk/mod.rs +++ b/datafusion/core/src/physical_plan/topk/mod.rs @@ -143,6 +143,8 @@ impl TopK { /// Insert `batch`, remembering it if any of its values are among /// the top k seen so far. 
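For context on the `insert_batch` body that follows: the sort keys can be compared as raw bytes because they go through arrow-rs's `Row` encoding (`RowConverter`), which makes even multi-column keys memcmp-comparable. A tiny self-contained demonstration of that property; this snippet assumes the `arrow` crate and is not part of the patch:

use std::sync::Arc;
use arrow::array::{ArrayRef, Int32Array};
use arrow::datatypes::DataType;
use arrow::row::{RowConverter, SortField};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // one SortField per sort-key column
    let mut converter = RowConverter::new(vec![SortField::new(DataType::Int32)])?;
    let col: ArrayRef = Arc::new(Int32Array::from(vec![3, 1, 2]));
    let rows = converter.convert_columns(&[col])?;

    // the encoded rows compare with plain memcmp, matching the sort order
    assert!(rows.row(1) < rows.row(2)); // encodes 1 < 2
    assert!(rows.row(2) < rows.row(0)); // encodes 2 < 3
    Ok(())
}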
pub fn insert_batch(&mut self, batch: RecordBatch) -> Result<()> { + use log::info; + info!("INSERTING {} rows", batch.num_rows()); // Updates on drop let _timer = self.metrics.baseline.elapsed_compute().timer(); @@ -180,6 +182,9 @@ impl TopK { } self.heap.insert_batch_entry(batch_entry); + // conserve memory + self.heap.maybe_compact()?; + // update memory reservation self.reservation.try_resize(self.size())?; Ok(()) @@ -352,6 +357,10 @@ impl TopKHeap { pub fn emit(&self) -> Result { let schema = self.store.schema().clone(); + if self.store.is_empty() { + return Ok(RecordBatch::new_empty(schema)); + } + // Indicies for each row within its respective RecordBatch let indicies: Vec<_> = self .inner @@ -387,9 +396,39 @@ impl TopKHeap { Ok(RecordBatch::try_new(schema, output_columns)?) } - /// Compact this heap, rewriting all stored batches - fn compact(&mut self) { - //let new_batch = self.emit( + /// Compact this heap, rewriting all stored batches into a single + /// input batch + pub fn maybe_compact(&mut self) -> Result<()>{ + // don't compact if the store has less than ten batches + if self.store.len() <= 10 { + return Ok(()); + } + + panic!("Disco"); + + // at first, compact the entire thing always into a new batch + // (maybe we can get fancier in the future about ignoring + // batches that have a high usage ratio already + + // Note: new batch is in the same order as inner + let new_batch = self.emit()?; + + // clear all old entires in store (this invalidates all + // store_ids in `inner`) + self.store.clear(); + + let mut batch_entry = self.register_batch(new_batch); + batch_entry.uses = self.inner.len(); + + // rewrite all existing entries to use the new batch, and + // remove old entries. The sortedness and their relative + // position do not change + for (i, topk_row) in self.inner.iter_mut().enumerate() { + topk_row.batch_id = batch_entry.id; + topk_row.index = i; + } + self.insert_batch_entry(batch_entry); + Ok(()) } /// return the size of memory used by this heap, in bytes @@ -528,6 +567,12 @@ impl RecordBatchStore { } } + /// Clear all values in this store, invalidating all previous batch ids + fn clear(&mut self) { + self.batches.clear(); + self.batches_size = 0; + } + fn get(&self, id: u32) -> Option<&RecordBatchEntry> { self.batches.get(&id) } @@ -537,6 +582,11 @@ impl RecordBatchStore { self.batches.len() } + /// returns true if the store has nothing stored + fn is_empty(&self) -> bool { + self.batches.is_empty() + } + /// return the schema of batches stored fn schema(&self) -> &SchemaRef { &self.schema diff --git a/datafusion/sqllogictest/test_files/aal.slt b/datafusion/sqllogictest/test_files/aal.slt index f19c79e8d1cb..4d8346edf23e 100644 --- a/datafusion/sqllogictest/test_files/aal.slt +++ b/datafusion/sqllogictest/test_files/aal.slt @@ -200,3 +200,18 @@ d 1 -98 13630 -1991133944 1184110014998006843 220 2986 225513085 963410661024364 e 2 52 -12056 -1090239422 9011500141803970147 238 4168 2013662838 12565360638488684051 0.6694766 0.391444365692 xipQ93429ksjNcXPX5326VSg1xJZcW d 1 -72 25590 1188089983 3090286296481837049 241 832 3542840110 5885937420286765261 0.41980565 0.215354023438 wwXqSGKLyBQyPkonlzBNYUJTCo4LRS a 1 -5 12636 794623392 2909750622865366631 15 24022 2669374863 4776679784701509574 0.29877836 0.253725340799 waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs + + + +## -- make tiny batches to trigger batch compaction +statement ok +set datafusion.execution.batch_size = 7 + +query TIIIIIIIITRRT +select * from aggregate_test_100 ORDER BY c13 desc limit 5; +---- +a 4 -38 20744 
762932956 308913475857409919 7 45465 1787652631 878137512938218976 0.7459874 0.021825780392 ydkwycaISlYSlEq3TlkS2m15I2pcp8 +d 1 -98 13630 -1991133944 1184110014998006843 220 2986 225513085 9634106610243643486 0.89651865 0.164088254508 y7C453hRWd4E7ImjNDWlpexB8nUqjh +e 2 52 -12056 -1090239422 9011500141803970147 238 4168 2013662838 12565360638488684051 0.6694766 0.391444365692 xipQ93429ksjNcXPX5326VSg1xJZcW +d 1 -72 25590 1188089983 3090286296481837049 241 832 3542840110 5885937420286765261 0.41980565 0.215354023438 wwXqSGKLyBQyPkonlzBNYUJTCo4LRS +a 1 -5 12636 794623392 2909750622865366631 15 24022 2669374863 4776679784701509574 0.29877836 0.253725340799 waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs From afea7d3e38faf0ec149c78849177a20d640bb541 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 22 Aug 2023 15:10:11 -0400 Subject: [PATCH 05/32] update --- datafusion/core/src/physical_plan/topk/mod.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/datafusion/core/src/physical_plan/topk/mod.rs b/datafusion/core/src/physical_plan/topk/mod.rs index e6993b70d4c6..64d2090da866 100644 --- a/datafusion/core/src/physical_plan/topk/mod.rs +++ b/datafusion/core/src/physical_plan/topk/mod.rs @@ -399,13 +399,14 @@ impl TopKHeap { /// Compact this heap, rewriting all stored batches into a single /// input batch pub fn maybe_compact(&mut self) -> Result<()>{ + use log::info; + info!("Have {} batches in store", self.store.len()); // don't compact if the store has less than ten batches - if self.store.len() <= 10 { + //if self.store.len() <= 10 { + if self.store.len() <= 2 { return Ok(()); } - panic!("Disco"); - // at first, compact the entire thing always into a new batch // (maybe we can get fancier in the future about ignoring // batches that have a high usage ratio already @@ -428,6 +429,7 @@ impl TopKHeap { topk_row.index = i; } self.insert_batch_entry(batch_entry); + info!("COMPACTION DONE: Have {} batches in store", self.store.len()); Ok(()) } From 69b86ab0bce7aa469f6912ab673af1d3f2135873 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 23 Aug 2023 08:03:56 -0400 Subject: [PATCH 06/32] checkpoint --- datafusion/core/src/physical_plan/topk/mod.rs | 30 +++++++++++++++---- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/datafusion/core/src/physical_plan/topk/mod.rs b/datafusion/core/src/physical_plan/topk/mod.rs index 64d2090da866..8fdefae544f6 100644 --- a/datafusion/core/src/physical_plan/topk/mod.rs +++ b/datafusion/core/src/physical_plan/topk/mod.rs @@ -399,13 +399,20 @@ impl TopKHeap { /// Compact this heap, rewriting all stored batches into a single /// input batch pub fn maybe_compact(&mut self) -> Result<()>{ - use log::info; - info!("Have {} batches in store", self.store.len()); - // don't compact if the store has less than ten batches - //if self.store.len() <= 10 { - if self.store.len() <= 2 { + + // we compact if the number of "unused" rows in the store is + // past some pre-defined threshold. 
Target holding up to + // around 20 batches, but handle cases of large k where some + // batches might be partially full + let target_batch_size = 8024; + let max_unused_rows = 20 * target_batch_size + self.k; + + // don't compact if the store has only one batch or + if self.store.len() <= 2 || self.store.unused_rows() < max_unused_rows { return Ok(()); } + use log::info; + info!("Have {} batches in store, COMPACTING", self.store.len()); // at first, compact the entire thing always into a new batch // (maybe we can get fancier in the future about ignoring @@ -442,6 +449,8 @@ impl TopKHeap { } } + + /// Represents one of the top K rows held in this heap. Orders /// according to memcmp of row (e.g. the arrow Row format, but could /// also be primtive values) @@ -584,6 +593,17 @@ impl RecordBatchStore { self.batches.len() } + /// Returns the total number of rows in batches minus the number + /// which are in use + fn unused_rows(&self) -> usize { + self.batches + .values() + .map(|batch_entry| { + batch_entry.batch.num_rows() - batch_entry.uses + }) + .sum() + } + /// returns true if the store has nothing stored fn is_empty(&self) -> bool { self.batches.is_empty() From c8b415c1e9da90921f89c56e576c370a50a1a0cc Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 23 Aug 2023 08:58:21 -0400 Subject: [PATCH 07/32] fmt --- datafusion/core/src/physical_plan/topk/mod.rs | 155 +++++++++--------- 1 file changed, 77 insertions(+), 78 deletions(-) diff --git a/datafusion/core/src/physical_plan/topk/mod.rs b/datafusion/core/src/physical_plan/topk/mod.rs index 8fdefae544f6..5e242746e6c0 100644 --- a/datafusion/core/src/physical_plan/topk/mod.rs +++ b/datafusion/core/src/physical_plan/topk/mod.rs @@ -21,7 +21,7 @@ use arrow::{ compute::interleave, row::{RowConverter, Rows, SortField}, }; -use std::{cmp::Ordering, sync::Arc}; +use std::{cmp::Ordering, collections::BinaryHeap, sync::Arc}; use arrow_array::{Array, ArrayRef, RecordBatch}; use arrow_schema::SchemaRef; @@ -143,8 +143,6 @@ impl TopK { /// Insert `batch`, remembering it if any of its values are among /// the top k seen so far. 
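The hunks that follow swap the sorted `Vec` for a `std::collections::BinaryHeap`: a max-heap holding the smallest k rows seen so far, so the heap's top is the cutoff and a non-qualifying row is rejected without shifting k elements. The shape of that idea as a small standalone sketch (illustrative integers; the real heap stores `TopKRow`s):

use std::collections::BinaryHeap;

/// Illustrative only: the k smallest values via a max-heap of size k.
fn top_k_smallest(values: impl IntoIterator<Item = i64>, k: usize) -> Vec<i64> {
    assert!(k > 0);
    let mut heap: BinaryHeap<i64> = BinaryHeap::with_capacity(k);
    for v in values {
        if heap.len() < k {
            heap.push(v);
        } else if v < *heap.peek().unwrap() {
            // evict the largest of the current k in O(log k)
            heap.pop();
            heap.push(v);
        }
    }
    // "reverse the reverse": emit low to high
    let mut out = heap.into_vec();
    out.sort();
    out
}

fn main() {
    assert_eq!(top_k_smallest([5, 1, 4, 2, 3], 2), vec![1, 2]);
}

Relative to the earlier sorted-`Vec` version, an accepted row costs O(log k) instead of an O(k) shifting insert, while the common rejection path stays O(1).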
pub fn insert_batch(&mut self, batch: RecordBatch) -> Result<()> { - use log::info; - info!("INSERTING {} rows", batch.num_rows()); // Updates on drop let _timer = self.metrics.baseline.elapsed_compute().timer(); @@ -168,12 +166,11 @@ impl TopK { let mut batch_entry = self.heap.register_batch(batch); for (index, row) in rows.iter().enumerate() { - match self.heap.k_largest() { - // heap has k items, and the current row is not - // smaller than the curret smallest k value, skip - Some(largest) if largest.row.as_slice() <= row.as_ref() => {} - // don't yet have k items or new item is greater than - // current min top k + match self.heap.max() { + // heap has k items, and the new row is greater than the + // current max in the heap ==> it is not a new topk + Some(max_row) if row.as_ref() >= max_row.row.as_slice() => {} + // don't yet have k items or new item is lower than the currently k low values None | Some(_) => { self.heap.add(&mut batch_entry, row, index); self.metrics.row_replacements.add(1); @@ -190,7 +187,7 @@ impl TopK { Ok(()) } - /// Returns the top k results broken into `batch_size` [`RecordBatch`]es + /// Returns the top k results broken into `batch_size` [`RecordBatch`]es, consuming the heap pub fn emit(self) -> Result { let Self { schema, @@ -200,7 +197,7 @@ impl TopK { expr: _, row_converter: _, scratch_rows: _, - heap, + mut heap, } = self; let _timer = metrics.baseline.elapsed_compute().timer(); // time updated on drop @@ -266,10 +263,9 @@ impl TopKMetrics { struct TopKHeap { /// The maximum number of elemenents to store in this heap. k: usize, - /// Storage for up at most `k` items, in ascending - /// order. `inner[0]` holds the smallest value of the smallest k - /// so far, `inner[len-1]` holds the largest value smallest k so far. - inner: Vec, + /// Storage for up at most `k` items using a BinaryHeap. Reverserd + /// so that the smallest k so far is on the top + inner: BinaryHeap, /// Storage the original row values (TopKRow only has the sort key) store: RecordBatchStore, /// The size of all owned data held by this heap @@ -281,7 +277,7 @@ impl TopKHeap { assert!(k > 0); Self { k, - inner: Vec::with_capacity(k), + inner: BinaryHeap::new(), store: RecordBatchStore::new(schema), owned_bytes: 0, } @@ -300,12 +296,13 @@ impl TopKHeap { } /// Returns the largest value stored by the heap if there are k - /// items, otherwise returns None - fn k_largest(&self) -> Option<&TopKRow> { + /// items, otherwise returns None. 
Remember this structure is + /// keeping the "smallest" k values + fn max(&self) -> Option<&TopKRow> { if self.inner.len() < self.k { None } else { - self.inner.last() + self.inner.peek() } } @@ -344,26 +341,33 @@ impl TopKHeap { self.owned_bytes += new_top_k.owned_size(); - // put the new row into the correct location to maintain that - // self.inner is sorted in descending order - let insertion_point = self - .inner - .partition_point(|current_row| current_row.row() <= row.as_ref()); - self.inner.insert(insertion_point, new_top_k); + // put the new row into the heap + self.inner.push(new_top_k) } - /// Returns the values stored in this heap, from values low to high, as a single - /// [`RecordBatch`] - pub fn emit(&self) -> Result { + /// Returns the values stored in this heap, from values low to + /// high, as a single [`RecordBatch`], resetting the inner heap + pub fn emit(&mut self) -> Result { + Ok(self.emit_with_state()?.0) + } + + /// Returns the values stored in this heap, from values low to + /// high, as a single [`RecordBatch`], and a sorted vec of heap contents + + pub fn emit_with_state(&mut self) -> Result<(RecordBatch, Vec)> { let schema = self.store.schema().clone(); + let mut topk_rows = std::mem::take(&mut self.inner).into_vec(); + + // sort low to high (reverse the reverse) + topk_rows.sort(); + if self.store.is_empty() { - return Ok(RecordBatch::new_empty(schema)); + return Ok((RecordBatch::new_empty(schema), topk_rows)); } // Indicies for each row within its respective RecordBatch - let indicies: Vec<_> = self - .inner + let indicies: Vec<_> = topk_rows .iter() .enumerate() .map(|(i, k)| (i, k.index)) @@ -375,8 +379,7 @@ impl TopKHeap { // `interleave` kernel to pick rows from different arrays let output_columns: Vec<_> = (0..num_columns) .map(|col| { - let input_arrays: Vec<_> = self - .inner + let input_arrays: Vec<_> = topk_rows .iter() .map(|k| { let entry = @@ -393,50 +396,50 @@ impl TopKHeap { }) .collect::>()?; - Ok(RecordBatch::try_new(schema, output_columns)?) + let new_batch = RecordBatch::try_new(schema, output_columns)?; + Ok((new_batch, topk_rows)) } /// Compact this heap, rewriting all stored batches into a single /// input batch - pub fn maybe_compact(&mut self) -> Result<()>{ - - // we compact if the number of "unused" rows in the store is - // past some pre-defined threshold. Target holding up to - // around 20 batches, but handle cases of large k where some - // batches might be partially full - let target_batch_size = 8024; - let max_unused_rows = 20 * target_batch_size + self.k; - - // don't compact if the store has only one batch or - if self.store.len() <= 2 || self.store.unused_rows() < max_unused_rows { - return Ok(()); - } - use log::info; - info!("Have {} batches in store, COMPACTING", self.store.len()); - - // at first, compact the entire thing always into a new batch - // (maybe we can get fancier in the future about ignoring - // batches that have a high usage ratio already - - // Note: new batch is in the same order as inner - let new_batch = self.emit()?; - - // clear all old entires in store (this invalidates all - // store_ids in `inner`) - self.store.clear(); - - let mut batch_entry = self.register_batch(new_batch); - batch_entry.uses = self.inner.len(); - - // rewrite all existing entries to use the new batch, and - // remove old entries. 
The sortedness and their relative - // position do not change - for (i, topk_row) in self.inner.iter_mut().enumerate() { - topk_row.batch_id = batch_entry.id; - topk_row.index = i; - } - self.insert_batch_entry(batch_entry); - info!("COMPACTION DONE: Have {} batches in store", self.store.len()); + pub fn maybe_compact(&mut self) -> Result<()> { + // // we compact if the number of "unused" rows in the store is + // // past some pre-defined threshold. Target holding up to + // // around 20 batches, but handle cases of large k where some + // // batches might be partially full + // let target_batch_size = 8024; + // let max_unused_rows = 20 * target_batch_size + self.k; + + // // don't compact if the store has only one batch or + // if self.store.len() <= 2 || self.store.unused_rows() < max_unused_rows { + // return Ok(()); + // } + // use log::info; + // info!("Have {} batches in store, COMPACTING", self.store.len()); + + // // at first, compact the entire thing always into a new batch + // // (maybe we can get fancier in the future about ignoring + // // batches that have a high usage ratio already + + // // Note: new batch is in the same order as inner + // let new_batch = self.emit()?; + + // // clear all old entires in store (this invalidates all + // // store_ids in `inner`) + // self.store.clear(); + + // let mut batch_entry = self.register_batch(new_batch); + // batch_entry.uses = self.inner.len(); + + // // rewrite all existing entries to use the new batch, and + // // remove old entries. The sortedness and their relative + // // position do not change + // for (i, topk_row) in self.inner.iter_mut().enumerate() { + // topk_row.batch_id = batch_entry.id; + // topk_row.index = i; + // } + // self.insert_batch_entry(batch_entry); + // info!("COMPACTION DONE: Have {} batches in store", self.store.len()); Ok(()) } @@ -449,8 +452,6 @@ impl TopKHeap { } } - - /// Represents one of the top K rows held in this heap. Orders /// according to memcmp of row (e.g. 
the arrow Row format, but could /// also be primtive values) @@ -598,9 +599,7 @@ impl RecordBatchStore { fn unused_rows(&self) -> usize { self.batches .values() - .map(|batch_entry| { - batch_entry.batch.num_rows() - batch_entry.uses - }) + .map(|batch_entry| batch_entry.batch.num_rows() - batch_entry.uses) .sum() } From 0337e310cd3d998cdfb105b6cba41aa28a34b8b1 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 23 Aug 2023 11:35:47 -0400 Subject: [PATCH 08/32] Fix compaction --- datafusion/core/src/physical_plan/topk/mod.rs | 94 +++++++++++-------- datafusion/sqllogictest/test_files/aal.slt | 2 +- 2 files changed, 55 insertions(+), 41 deletions(-) diff --git a/datafusion/core/src/physical_plan/topk/mod.rs b/datafusion/core/src/physical_plan/topk/mod.rs index 5e242746e6c0..faf68bcd5ac2 100644 --- a/datafusion/core/src/physical_plan/topk/mod.rs +++ b/datafusion/core/src/physical_plan/topk/mod.rs @@ -136,7 +136,7 @@ impl TopK { expr, row_converter, scratch_rows, - heap: TopKHeap::new(k, schema), + heap: TopKHeap::new(k, batch_size, schema), }) } @@ -169,7 +169,7 @@ impl TopK { match self.heap.max() { // heap has k items, and the new row is greater than the // current max in the heap ==> it is not a new topk - Some(max_row) if row.as_ref() >= max_row.row.as_slice() => {} + Some(max_row) if row.as_ref() >= max_row.row() => {} // don't yet have k items or new item is lower than the currently k low values None | Some(_) => { self.heap.add(&mut batch_entry, row, index); @@ -263,6 +263,8 @@ impl TopKMetrics { struct TopKHeap { /// The maximum number of elemenents to store in this heap. k: usize, + /// The target number of rows for output batches + batch_size: usize, /// Storage for up at most `k` items using a BinaryHeap. Reverserd /// so that the smallest k so far is on the top inner: BinaryHeap, @@ -273,10 +275,15 @@ struct TopKHeap { } impl TopKHeap { - fn new(k: usize, schema: SchemaRef) -> Self { + fn new( + k: usize, + batch_size: usize, + schema: SchemaRef + ) -> Self { assert!(k > 0); Self { k, + batch_size, inner: BinaryHeap::new(), store: RecordBatchStore::new(schema), owned_bytes: 0, @@ -403,43 +410,50 @@ impl TopKHeap { /// Compact this heap, rewriting all stored batches into a single /// input batch pub fn maybe_compact(&mut self) -> Result<()> { - // // we compact if the number of "unused" rows in the store is - // // past some pre-defined threshold. Target holding up to - // // around 20 batches, but handle cases of large k where some - // // batches might be partially full - // let target_batch_size = 8024; - // let max_unused_rows = 20 * target_batch_size + self.k; - - // // don't compact if the store has only one batch or - // if self.store.len() <= 2 || self.store.unused_rows() < max_unused_rows { - // return Ok(()); - // } - // use log::info; - // info!("Have {} batches in store, COMPACTING", self.store.len()); - - // // at first, compact the entire thing always into a new batch - // // (maybe we can get fancier in the future about ignoring - // // batches that have a high usage ratio already - - // // Note: new batch is in the same order as inner - // let new_batch = self.emit()?; - - // // clear all old entires in store (this invalidates all - // // store_ids in `inner`) - // self.store.clear(); - - // let mut batch_entry = self.register_batch(new_batch); - // batch_entry.uses = self.inner.len(); - - // // rewrite all existing entries to use the new batch, and - // // remove old entries. 
The sortedness and their relative - // // position do not change - // for (i, topk_row) in self.inner.iter_mut().enumerate() { - // topk_row.batch_id = batch_entry.id; - // topk_row.index = i; - // } - // self.insert_batch_entry(batch_entry); - // info!("COMPACTION DONE: Have {} batches in store", self.store.len()); + // we compact if the number of "unused" rows in the store is + // past some pre-defined threshold. Target holding up to + // around 20 batches, but handle cases of large k where some + // batches might be partially full + let max_unused_rows = (20 * self.batch_size) + self.k; + let unused_rows = self.store.unused_rows(); + use log::info; + //info!("{} batches in store, unused rows in store: {}, max unused rows: {}", + //self.store.len(), unused_rows, max_unused_rows); + + // don't compact if the store has only one batch or + if self.store.len() <= 2 || unused_rows < max_unused_rows { + //if self.store.len() <= 2 { + return Ok(()); + } + info!("Have {} batches in store, COMPACTING", self.store.len()); + + // at first, compact the entire thing always into a new batch + // (maybe we can get fancier in the future about ignoring + // batches that have a high usage ratio already + + // Note: new batch is in the same order as inner + let num_rows = self.inner.len(); + let (new_batch, mut topk_rows) = self.emit_with_state()?; + + // clear all old entires in store (this invalidates all + // store_ids in `inner`) + self.store.clear(); + + let mut batch_entry = self.register_batch(new_batch); + batch_entry.uses = num_rows; + + // rewrite all existing entries to use the new batch, and + // remove old entries. The sortedness and their relative + // position do not change + for (i, topk_row) in topk_rows.iter_mut().enumerate() { + topk_row.batch_id = batch_entry.id; + topk_row.index = i; + } + self.insert_batch_entry(batch_entry); + // restore the heap + self.inner = BinaryHeap::from(topk_rows); + + info!("COMPACTION DONE: Have {} batches in store", self.store.len()); Ok(()) } diff --git a/datafusion/sqllogictest/test_files/aal.slt b/datafusion/sqllogictest/test_files/aal.slt index 4d8346edf23e..bbab912956e1 100644 --- a/datafusion/sqllogictest/test_files/aal.slt +++ b/datafusion/sqllogictest/test_files/aal.slt @@ -205,7 +205,7 @@ a 1 -5 12636 794623392 2909750622865366631 15 24022 2669374863 47766797847015095 ## -- make tiny batches to trigger batch compaction statement ok -set datafusion.execution.batch_size = 7 +set datafusion.execution.batch_size = 2 query TIIIIIIIITRRT select * from aggregate_test_100 ORDER BY c13 desc limit 5; From db196fb1a13520b336df9a18171986d59eb7b2d1 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 24 Aug 2023 16:02:52 -0400 Subject: [PATCH 09/32] add location for re-encoding --- datafusion/core/src/physical_plan/topk/mod.rs | 30 ++++++++++++++++--- datafusion/sqllogictest/test_files/aal.slt | 15 ++++++++++ 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/datafusion/core/src/physical_plan/topk/mod.rs b/datafusion/core/src/physical_plan/topk/mod.rs index faf68bcd5ac2..e2868c3b7fa3 100644 --- a/datafusion/core/src/physical_plan/topk/mod.rs +++ b/datafusion/core/src/physical_plan/topk/mod.rs @@ -18,13 +18,13 @@ //! 
TopK: Combination of Sort / LIMIT use arrow::{ - compute::interleave, + error::ArrowError, row::{RowConverter, Rows, SortField}, }; use std::{cmp::Ordering, collections::BinaryHeap, sync::Arc}; use arrow_array::{Array, ArrayRef, RecordBatch}; -use arrow_schema::SchemaRef; +use arrow_schema::{DataType, SchemaRef}; use datafusion_common::Result; use datafusion_execution::{ memory_pool::{MemoryConsumer, MemoryReservation}, @@ -359,8 +359,8 @@ impl TopKHeap { } /// Returns the values stored in this heap, from values low to - /// high, as a single [`RecordBatch`], and a sorted vec of heap contents - + /// high, as a single [`RecordBatch`], and a sorted vec of the + /// current heap's contents pub fn emit_with_state(&mut self) -> Result<(RecordBatch, Vec)> { let schema = self.store.schema().clone(); @@ -657,3 +657,25 @@ impl RecordBatchStore { + self.batches_size } } + + +/// wrapper over [`arrow::compute::interleave`] that re-encodes +/// dictionaries that have a low usage (values referenced) + fn interleave( + values: &[&dyn Array], + indices: &[(usize, usize)], +) -> Result { + // for now, always re-encode only string dictionaries + if !values.is_empty() { + match values[0].data_type() { + DataType::Dictionary(_key_type, value_type) if value_type.as_ref() == &DataType::Utf8 => { + + //todo!() + return arrow::compute::interleave(values, indices); + } + _ => { } + } + } + // fallback to arrow + arrow::compute::interleave(values, indices) + } diff --git a/datafusion/sqllogictest/test_files/aal.slt b/datafusion/sqllogictest/test_files/aal.slt index bbab912956e1..36dc0d9fdcf9 100644 --- a/datafusion/sqllogictest/test_files/aal.slt +++ b/datafusion/sqllogictest/test_files/aal.slt @@ -215,3 +215,18 @@ d 1 -98 13630 -1991133944 1184110014998006843 220 2986 225513085 963410661024364 e 2 52 -12056 -1090239422 9011500141803970147 238 4168 2013662838 12565360638488684051 0.6694766 0.391444365692 xipQ93429ksjNcXPX5326VSg1xJZcW d 1 -72 25590 1188089983 3090286296481837049 241 832 3542840110 5885937420286765261 0.41980565 0.215354023438 wwXqSGKLyBQyPkonlzBNYUJTCo4LRS a 1 -5 12636 794623392 2909750622865366631 15 24022 2669374863 4776679784701509574 0.29877836 0.253725340799 waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs + + +## make an example for + +statement ok +create table dict as select c1, c2, c3, c13, arrow_cast(c13, 'Dictionary(Int32, Utf8)') as c13_dict from aggregate_test_100; + +query TIIT? 
+select * from dict order by c13 desc limit 5;
+----
+a 4 -38 ydkwycaISlYSlEq3TlkS2m15I2pcp8 ydkwycaISlYSlEq3TlkS2m15I2pcp8
+d 1 -98 y7C453hRWd4E7ImjNDWlpexB8nUqjh y7C453hRWd4E7ImjNDWlpexB8nUqjh
+e 2 52 xipQ93429ksjNcXPX5326VSg1xJZcW xipQ93429ksjNcXPX5326VSg1xJZcW
+d 1 -72 wwXqSGKLyBQyPkonlzBNYUJTCo4LRS wwXqSGKLyBQyPkonlzBNYUJTCo4LRS
+a 1 -5 waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs

From f12307596f6c0f0255d40c682c8a456d13dff980 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Thu, 24 Aug 2023 16:16:32 -0400
Subject: [PATCH 10/32] Start sketching dictionary interleave

---
 datafusion/core/src/physical_plan/topk/mod.rs | 60 ++++++++++++-------
 datafusion/core/tests/sql/order.rs            |  4 +-
 .../simplify_expressions/expr_simplifier.rs   |  4 +-
 .../src/simplify_expressions/regex.rs         |  4 +-
 datafusion/sql/src/statement.rs               |  4 +-
 .../substrait/src/logical_plan/consumer.rs    | 26 ++++----
 .../substrait/src/logical_plan/producer.rs    |  5 +-
 7 files changed, 65 insertions(+), 42 deletions(-)

diff --git a/datafusion/core/src/physical_plan/topk/mod.rs b/datafusion/core/src/physical_plan/topk/mod.rs
index e2868c3b7fa3..3b8546cc876c 100644
--- a/datafusion/core/src/physical_plan/topk/mod.rs
+++ b/datafusion/core/src/physical_plan/topk/mod.rs
@@ -23,7 +23,7 @@ use arrow::{
 };
 use std::{cmp::Ordering, collections::BinaryHeap, sync::Arc};
 
-use arrow_array::{Array, ArrayRef, RecordBatch};
+use arrow_array::{downcast_dictionary_array, Array, ArrayRef, RecordBatch};
 use arrow_schema::{DataType, SchemaRef};
 use datafusion_common::Result;
 use datafusion_execution::{
@@ -275,11 +275,7 @@ struct TopKHeap {
 }
 
 impl TopKHeap {
-    fn new(
-        k: usize,
-        batch_size: usize,
-        schema: SchemaRef
-    ) -> Self {
+    fn new(k: usize, batch_size: usize, schema: SchemaRef) -> Self {
         assert!(k > 0);
         Self {
             k,
@@ -453,7 +449,10 @@ impl TopKHeap {
         // restore the heap
         self.inner = BinaryHeap::from(topk_rows);
 
-        info!("COMPACTION DONE: Have {} batches in store", self.store.len());
+        info!(
+            "COMPACTION DONE: Have {} batches in store",
+            self.store.len()
+        );
 
         Ok(())
     }
@@ -658,24 +657,39 @@ impl RecordBatchStore {
     }
 }
 
-
 /// wrapper over [`arrow::compute::interleave`] that re-encodes
 /// dictionaries that have a low usage (values referenced)
- fn interleave(
+fn interleave(
     values: &[&dyn Array],
     indices: &[(usize, usize)],
 ) -> Result {
-    // for now, always re-encode only string dictionaries
-    if !values.is_empty() {
-        match values[0].data_type() {
-            DataType::Dictionary(_key_type, value_type) if value_type.as_ref() == &DataType::Utf8 => {
-
-                //todo!()
-                return arrow::compute::interleave(values, indices);
-            }
-            _ => { }
-        }
-    }
-    // fallback to arrow
-    arrow::compute::interleave(values, indices)
- }
+    // for now, always re-encode only string dictionaries
+    if !values.is_empty() {
+        match values[0].data_type() {
+            DataType::Dictionary(_key_type, value_type)
+                if value_type.as_ref() == &DataType::Utf8 =>
+            {
+                return interleave_dictionary(values, indices);
+            }
+            _ => {}
+        }
+    }
+    // fallback to arrow
+    arrow::compute::interleave(values, indices)
+}
+
+// we don't need a specialized version for each index type
+fn interleave_dictionary(
+    values: &[&dyn Array],
+    indices: &[(usize, usize)],
+) -> Result {
+    todo!()
+}
+
+/// returns a reference to the values of this dictionary
+fn values(array: &ArrayRef) -> &ArrayRef {
+    downcast_dictionary_array!(
+        array => return array.values(),
+        _ => unreachable!("Non dictionary type")
+    )
+}
diff --git a/datafusion/core/tests/sql/order.rs b/datafusion/core/tests/sql/order.rs
index 
3981fbaa4d7a..a400a78fc914 100644 --- a/datafusion/core/tests/sql/order.rs +++ b/datafusion/core/tests/sql/order.rs @@ -48,7 +48,9 @@ async fn sort_with_lots_of_repetition_values() -> Result<()> { async fn create_external_table_with_order() -> Result<()> { let ctx = SessionContext::new(); let sql = "CREATE EXTERNAL TABLE dt (a_id integer, a_str string, a_bool boolean) STORED AS CSV WITH ORDER (a_id ASC) LOCATION 'file://path/to/table';"; - let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = ctx.state().create_logical_plan(sql).await? else { + let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = + ctx.state().create_logical_plan(sql).await? + else { panic!("Wrong command") }; diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 895432026b48..3cf564f367ba 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -412,7 +412,9 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> { }) if list.len() == 1 && matches!(list.first(), Some(Expr::ScalarSubquery { .. })) => { - let Expr::ScalarSubquery(subquery) = list.remove(0) else { unreachable!() }; + let Expr::ScalarSubquery(subquery) = list.remove(0) else { + unreachable!() + }; Expr::InSubquery(InSubquery::new(expr, subquery, negated)) } diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index 5094623b82c0..b9d9821b43f0 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -203,7 +203,9 @@ fn anchored_literal_to_expr(v: &[Hir]) -> Option { match v.len() { 2 => Some(lit("")), 3 => { - let HirKind::Literal(l) = v[1].kind() else { return None }; + let HirKind::Literal(l) = v[1].kind() else { + return None; + }; like_str_from_literal(l).map(lit) } _ => None, diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index c9a34bfaf220..8676e2a6e76a 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -499,10 +499,10 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { "DELETE FROM only supports single table, got: joins".to_string(), )); } - let TableFactor::Table{name, ..} = table_factor.relation else { + let TableFactor::Table { name, .. 
} = table_factor.relation else { return Err(DataFusionError::NotImplemented(format!( "DELETE FROM only supports single table, got: {table_factor:?}" - ))) + ))); }; Ok(name) diff --git a/datafusion/substrait/src/logical_plan/consumer.rs b/datafusion/substrait/src/logical_plan/consumer.rs index 4e4d71ddb604..54f1facb4ada 100644 --- a/datafusion/substrait/src/logical_plan/consumer.rs +++ b/datafusion/substrait/src/logical_plan/consumer.rs @@ -573,9 +573,9 @@ pub async fn from_substrait_sorts( Some(k) => match k { Direction(d) => { let Some(direction) = SortDirection::from_i32(*d) else { - return Err(DataFusionError::NotImplemented( - format!("Unsupported Substrait SortDirection value {d}"), - )) + return Err(DataFusionError::NotImplemented(format!( + "Unsupported Substrait SortDirection value {d}" + ))); }; match direction { @@ -1313,27 +1313,27 @@ async fn make_datafusion_like( } let Some(ArgType::Value(expr_substrait)) = &f.arguments[0].arg_type else { - return Err(DataFusionError::NotImplemented( - format!("Invalid arguments type for `{fn_name}` expr") - )) + return Err(DataFusionError::NotImplemented(format!( + "Invalid arguments type for `{fn_name}` expr" + ))); }; let expr = from_substrait_rex(expr_substrait, input_schema, extensions) .await? .as_ref() .clone(); let Some(ArgType::Value(pattern_substrait)) = &f.arguments[1].arg_type else { - return Err(DataFusionError::NotImplemented( - format!("Invalid arguments type for `{fn_name}` expr") - )) + return Err(DataFusionError::NotImplemented(format!( + "Invalid arguments type for `{fn_name}` expr" + ))); }; let pattern = from_substrait_rex(pattern_substrait, input_schema, extensions) .await? .as_ref() .clone(); let Some(ArgType::Value(escape_char_substrait)) = &f.arguments[2].arg_type else { - return Err(DataFusionError::NotImplemented( - format!("Invalid arguments type for `{fn_name}` expr") - )) + return Err(DataFusionError::NotImplemented(format!( + "Invalid arguments type for `{fn_name}` expr" + ))); }; let escape_char_expr = from_substrait_rex(escape_char_substrait, input_schema, extensions) @@ -1343,7 +1343,7 @@ async fn make_datafusion_like( let Expr::Literal(ScalarValue::Utf8(escape_char)) = escape_char_expr else { return Err(DataFusionError::Substrait(format!( "Expect Utf8 literal for escape char, but found {escape_char_expr:?}", - ))) + ))); }; Ok(Arc::new(Expr::Like(Like { diff --git a/datafusion/substrait/src/logical_plan/producer.rs b/datafusion/substrait/src/logical_plan/producer.rs index 79cd8995c6c6..d1f46c9858a0 100644 --- a/datafusion/substrait/src/logical_plan/producer.rs +++ b/datafusion/substrait/src/logical_plan/producer.rs @@ -1664,7 +1664,10 @@ mod test { println!("Checking round trip of {scalar:?}"); let substrait = to_substrait_literal(&scalar)?; - let Expression { rex_type: Some(RexType::Literal(substrait_literal)) } = substrait else { + let Expression { + rex_type: Some(RexType::Literal(substrait_literal)), + } = substrait + else { panic!("Expected Literal expression, got {substrait:?}"); }; From 157379a21820db8e4da636c7bb4adaca7693f282 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 24 Aug 2023 16:41:39 -0400 Subject: [PATCH 11/32] checkpoint --- datafusion/core/src/physical_plan/topk/mod.rs | 82 ++++++++++++++++--- 1 file changed, 71 insertions(+), 11 deletions(-) diff --git a/datafusion/core/src/physical_plan/topk/mod.rs b/datafusion/core/src/physical_plan/topk/mod.rs index 3b8546cc876c..f48ba32025d9 100644 --- a/datafusion/core/src/physical_plan/topk/mod.rs +++ 
b/datafusion/core/src/physical_plan/topk/mod.rs
@@ -18,12 +18,11 @@
 
 //! TopK: Combination of Sort / LIMIT
 
 use arrow::{
-    error::ArrowError,
     row::{RowConverter, Rows, SortField},
 };
 use std::{cmp::Ordering, collections::BinaryHeap, sync::Arc};
 
-use arrow_array::{downcast_dictionary_array, Array, ArrayRef, RecordBatch};
+use arrow_array::{downcast_dictionary_array, Array, ArrayRef, RecordBatch, builder::StringBuilder, cast::AsArray, StringArray, Int32Array, types::Int32Type, DictionaryArray};
 use arrow_schema::{DataType, SchemaRef};
 use datafusion_common::Result;
 use datafusion_execution::{
@@ -31,7 +30,7 @@ use datafusion_execution::{
     runtime_env::RuntimeEnv,
 };
 use datafusion_physical_expr::PhysicalSortExpr;
-use hashbrown::HashMap;
+use hashbrown::{HashMap, HashSet};
 
 use crate::physical_plan::{stream::RecordBatchStreamAdapter, SendableRecordBatchStream};
 
@@ -662,32 +661,93 @@ impl RecordBatchStore {
 fn interleave(
     values: &[&dyn Array],
     indices: &[(usize, usize)],
-) -> Result {
+) -> Result {
     // for now, always re-encode only string dictionaries
     if !values.is_empty() {
         match values[0].data_type() {
             DataType::Dictionary(_key_type, value_type)
                 if value_type.as_ref() == &DataType::Utf8 =>
             {
-                return interleave_dictionary(values, indices);
+                return interleave_and_repack_dictionary(values, indices);
             }
             _ => {}
         }
     }
     // fallback to arrow
-    arrow::compute::interleave(values, indices)
+    Ok(arrow::compute::interleave(values, indices)?)
 }
 
-// we don't need a specialized version for each index type
-fn interleave_dictionary(
+/// Special interleave kernel that re-creates the dictionary values,
+/// ensuring no unused space
+fn interleave_and_repack_dictionary(
     values: &[&dyn Array],
     indices: &[(usize, usize)],
-) -> Result {
-    todo!()
+) -> Result {
+    let existing_values = HashSet::new();
+
+    let data_type = values[0].data_type();
+
+    // repack to a new StringArray
+    let mut new_values = StringBuilder::new();
+    // we could specialize this and avoid the copy of the index, but
+    // that seems like a lot of codegen overhead
+    let mut new_keys = vec![];
+
+    for (array_idx, row_idx) in indices {
+        // look up value,
+        let array = values[*array_idx];
+        downcast_dictionary_array!(
+            array=> {
+                if let Some(key) = array.key(*row_idx) {
+                    let values: &StringArray = array.values().as_string();
+                    if values.is_valid(key) {
+                        let current_value = values.value(key);
+                        println!("Current value is {current_value}");
+                        todo!();
+                    } else {
+                        new_keys.push(None)
+                    }
+                }
+                else {
+                    new_keys.push(None);
+                }
+
+
+            }
+            _ => unreachable!("Non dictionary type")
+
+        )
+    }
+
+    // form the output
+    let DataType::Dictionary(key_type, value_type) = data_type else {
+        unreachable!("non dictionary type");
+    };
+
+    let new_values: ArrayRef = Arc::new(new_values.finish());
+    match key_type.as_ref() {
+        DataType::Int32 => {
+            // check the keys will fit in this array
+            if new_values.len() >= i32::MAX as usize {
+                panic!("todo make a real error message");
+            }
+
+            let new_keys: Int32Array = new_keys.iter().map(|v| v.map(|v| v as i32)).collect();
+
+            Ok(Arc::new(DictionaryArray::try_new(new_keys, new_values)?))
+        }
+        _ => {
+            // handle other keys
+            todo!()
+        }
+    }
+
+
+
+}
 
 /// returns a reference to the values of this dictionary
-fn values(array: &ArrayRef) -> &ArrayRef {
+fn get_dict_values(array: &ArrayRef) -> &ArrayRef {
     downcast_dictionary_array!(
         array => return array.values(),
         _ => unreachable!("Non dictionary type")
     )
 }

From 682127af5898de0f6dec81d037a5efbe54c07f5b Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Thu, 24 Aug 2023 
16:50:29 -0400
Subject: [PATCH 12/32] initial specialized dictionary

---
 datafusion/core/src/physical_plan/topk/mod.rs | 36 ++++++++-----------
 1 file changed, 15 insertions(+), 21 deletions(-)

diff --git a/datafusion/core/src/physical_plan/topk/mod.rs b/datafusion/core/src/physical_plan/topk/mod.rs
index f48ba32025d9..e5ba0a69e392 100644
--- a/datafusion/core/src/physical_plan/topk/mod.rs
+++ b/datafusion/core/src/physical_plan/topk/mod.rs
@@ -22,7 +22,7 @@ use arrow::{
 };
 use std::{cmp::Ordering, collections::BinaryHeap, sync::Arc};
 
-use arrow_array::{downcast_dictionary_array, Array, ArrayRef, RecordBatch, builder::StringBuilder, cast::AsArray, StringArray, Int32Array, types::Int32Type, DictionaryArray};
+use arrow_array::{downcast_dictionary_array, Array, ArrayRef, RecordBatch, builder::StringBuilder, cast::AsArray, StringArray, Int32Array, DictionaryArray};
 use arrow_schema::{DataType, SchemaRef};
 use datafusion_common::Result;
 use datafusion_execution::{
@@ -30,7 +30,7 @@ use datafusion_execution::{
     runtime_env::RuntimeEnv,
 };
 use datafusion_physical_expr::PhysicalSortExpr;
-use hashbrown::{HashMap, HashSet};
+use hashbrown::{HashMap};
 
 use crate::physical_plan::{stream::RecordBatchStreamAdapter, SendableRecordBatchStream};
 
@@ -683,14 +683,11 @@ fn interleave_and_repack_dictionary(
     values: &[&dyn Array],
     indices: &[(usize, usize)],
 ) -> Result {
-    let existing_values = HashSet::new();
-
     let data_type = values[0].data_type();
 
-    // repack to a new StringArray
+    // maps strings to new keys (indexes)
+    let mut new_value_to_key = HashMap::new();
     let mut new_values = StringBuilder::new();
-    // we could specialize this and avoid the copy of the index, but
-    // that seems like a lot of codegen overhead
     let mut new_keys = vec![];
 
     for (array_idx, row_idx) in indices {
@@ -702,8 +699,16 @@ fn interleave_and_repack_dictionary(
                 let values: &StringArray = array.values().as_string();
                 if values.is_valid(key) {
                     let current_value = values.value(key);
-                    println!("Current value is {current_value}");
-                    todo!();
+                    if let Some(new_key) = new_value_to_key.get(current_value) {
+                        // value was already in the set
+                        new_keys.push(Some(*new_key))
+                    } else {
+                        // value not yet seen
+                        let new_key = new_value_to_key.len();
+                        new_values.append_value(current_value);
+                        new_keys.push(Some(new_key));
+                        new_value_to_key.insert(current_value, new_key);
+                    }
                 } else {
                     new_keys.push(None)
                 }
@@ -720,7 +725,7 @@ fn interleave_and_repack_dictionary(
     }
 
     // form the output
-    let DataType::Dictionary(key_type, value_type) = data_type else {
+    let DataType::Dictionary(key_type, _value_type) = data_type else {
         unreachable!("non dictionary type");
     };
 
@@ -741,15 +746,4 @@ fn interleave_and_repack_dictionary(
             todo!()
         }
     }
-
-
-
-}
-
-/// returns a reference to the values of this dictionary
-fn get_dict_values(array: &ArrayRef) -> &ArrayRef {
-    downcast_dictionary_array!(
-        array => return array.values(),
-        _ => unreachable!("Non dictionary type")
-    )
-}

From a1ea62ecac4ecc4bfad96eee1068ffa02e49f9ae Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Thu, 24 Aug 2023 16:54:05 -0400
Subject: [PATCH 13/32] finish initial special interleave

---
 datafusion/core/src/physical_plan/topk/mod.rs | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/datafusion/core/src/physical_plan/topk/mod.rs b/datafusion/core/src/physical_plan/topk/mod.rs
index e5ba0a69e392..8169471472bf 100644
--- a/datafusion/core/src/physical_plan/topk/mod.rs
+++ b/datafusion/core/src/physical_plan/topk/mod.rs
@@ -716,8 +716,6 @@ fn interleave_and_repack_dictionary(
                 else { 
new_keys.push(None);
                 }
-
-
             }
             _ => unreachable!("Non dictionary type")
 
@@ -736,7 +734,6 @@ fn interleave_and_repack_dictionary(
             if new_values.len() >= i32::MAX as usize {
                 panic!("todo make a real error message");
             }
-
             let new_keys: Int32Array = new_keys.iter().map(|v| v.map(|v| v as i32)).collect();
 
             Ok(Arc::new(DictionaryArray::try_new(new_keys, new_values)?))

From 5e65130adab1b90cad8d785bed766230632d4f38 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Thu, 24 Aug 2023 17:58:55 -0400
Subject: [PATCH 14/32] Complete dictionary order

---
 datafusion/core/src/physical_plan/topk/mod.rs | 53 ++++++++++++-------
 1 file changed, 35 insertions(+), 18 deletions(-)

diff --git a/datafusion/core/src/physical_plan/topk/mod.rs b/datafusion/core/src/physical_plan/topk/mod.rs
index 8169471472bf..5e1ec8d5152a 100644
--- a/datafusion/core/src/physical_plan/topk/mod.rs
+++ b/datafusion/core/src/physical_plan/topk/mod.rs
@@ -17,12 +17,14 @@
 
 //! TopK: Combination of Sort / LIMIT
 
-use arrow::{
-    row::{RowConverter, Rows, SortField},
-};
+use arrow::row::{RowConverter, Rows, SortField};
 use std::{cmp::Ordering, collections::BinaryHeap, sync::Arc};
 
-use arrow_array::{downcast_dictionary_array, Array, ArrayRef, RecordBatch, builder::StringBuilder, cast::AsArray, StringArray, Int32Array, DictionaryArray};
+use arrow_array::{
+    builder::StringBuilder, cast::AsArray, downcast_dictionary_array, Array, ArrayRef,
+    DictionaryArray, Int16Array, Int32Array, Int64Array, Int8Array, RecordBatch,
+    StringArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
+};
 use arrow_schema::{DataType, SchemaRef};
 use datafusion_common::Result;
 use datafusion_execution::{
@@ -30,7 +32,7 @@ use datafusion_execution::{
     runtime_env::RuntimeEnv,
 };
 use datafusion_physical_expr::PhysicalSortExpr;
-use hashbrown::{HashMap};
+use hashbrown::HashMap;
 
 use crate::physical_plan::{stream::RecordBatchStreamAdapter, SendableRecordBatchStream};
 
@@ -658,10 +660,7 @@ impl RecordBatchStore {
 
 /// wrapper over [`arrow::compute::interleave`] that re-encodes
 /// dictionaries that have a low usage (values referenced)
-fn interleave(
-    values: &[&dyn Array],
-    indices: &[(usize, usize)],
-) -> Result {
+fn interleave(values: &[&dyn Array], indices: &[(usize, usize)]) -> Result {
     // for now, always re-encode only string dictionaries
     if !values.is_empty() {
         match values[0].data_type() {
@@ -688,7 +687,7 @@ fn interleave_and_repack_dictionary(
     // maps strings to new keys (indexes)
     let mut new_value_to_key = HashMap::new();
     let mut new_values = StringBuilder::new();
-    let mut new_keys = vec![];
+    let mut new_keys = vec![];
 
     for (array_idx, row_idx) in indices {
         // look up value,
@@ -728,19 +727,37 @@ fn interleave_and_repack_dictionary(
     };
 
     let new_values: ArrayRef = Arc::new(new_values.finish());
-    match key_type.as_ref() {
-        DataType::Int32 => {
+
+    // creates a $ARRAY_TYPE array from $NEW_KEYS and $NEW_VALUES
+    use datafusion_common::DataFusionError;
+    macro_rules! 
make_keys {
+        ($PRIM_TYPE:ty, $ARRAY_TYPE:ty, $NEW_KEYS:ident, $NEW_VALUES:ident) => {{
             // check the keys will fit in this array
-            if new_values.len() >= i32::MAX as usize {
-                panic!("todo make a real error message");
+            if $NEW_VALUES.len() >= <$PRIM_TYPE>::MAX as usize {
+                return Err(DataFusionError::Execution(format!(
+                    "keys did not fit in prim type -- TODO MAKE BETTER"
+                )));
             }
-            let new_keys: Int32Array = new_keys.iter().map(|v| v.map(|v| v as i32)).collect();
+            let new_keys: $ARRAY_TYPE = new_keys
+                .iter()
+                .map(|v| v.map(|v| v as $PRIM_TYPE))
+                .collect();
             Ok(Arc::new(DictionaryArray::try_new(new_keys, new_values)?))
-        }
+        }};
+    }
+
+    match key_type.as_ref() {
+        DataType::Int8 => make_keys!(i8, Int8Array, new_keys, new_values),
+        DataType::Int16 => make_keys!(i16, Int16Array, new_keys, new_values),
+        DataType::Int32 => make_keys!(i32, Int32Array, new_keys, new_values),
+        DataType::Int64 => make_keys!(i64, Int64Array, new_keys, new_values),
+        DataType::UInt8 => make_keys!(u8, UInt8Array, new_keys, new_values),
+        DataType::UInt16 => make_keys!(u16, UInt16Array, new_keys, new_values),
+        DataType::UInt32 => make_keys!(u32, UInt32Array, new_keys, new_values),
+        DataType::UInt64 => make_keys!(u64, UInt64Array, new_keys, new_values),
         _ => {
             // handle other keys
-            todo!()
+            unreachable!("invalid key type");
         }
     }
 }

From 4a30c4cf4890fdb405903e80a5930fdd30c99b57 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20Heres?=
Date: Mon, 2 Oct 2023 14:11:11 +0200
Subject: [PATCH 15/32] Merge

---
 parquet-testing | 2 +-
 testing         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/parquet-testing b/parquet-testing
index a11fc8f148f8..e45cd23f784a 160000
--- a/parquet-testing
+++ b/parquet-testing
@@ -1 +1 @@
-Subproject commit a11fc8f148f8a7a89d9281cc0da3eb9d56095fbf
+Subproject commit e45cd23f784aab3d6bf0701f8f4e621469ed3be7
diff --git a/testing b/testing
index e81d0c6de359..98fceecd024d 160000
--- a/testing
+++ b/testing
@@ -1 +1 @@
-Subproject commit e81d0c6de35948b3be7984af8e00413b314cde6e
+Subproject commit 98fceecd024dccd2f8a00e32fc144975f218acf4

From d9c596ff4dbb57e680aab58271f3658abfae0a03 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20Heres?=
Date: Mon, 2 Oct 2023 14:26:59 +0200
Subject: [PATCH 16/32] fmt

---
 datafusion/physical-plan/src/lib.rs        | 2 +-
 datafusion/physical-plan/src/sorts/sort.rs | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs
index 0165f808002b..3071fadcb1a2 100644
--- a/datafusion/physical-plan/src/lib.rs
+++ b/datafusion/physical-plan/src/lib.rs
@@ -25,9 +25,9 @@ use self::{
     coalesce_partitions::CoalescePartitionsExec, display::DisplayableExecutionPlan,
 };
 pub use datafusion_common::{internal_err, ColumnStatistics, Statistics};
-pub use topk::TopK;
 use datafusion_common::{plan_err, Result};
 use datafusion_physical_expr::PhysicalSortExpr;
+pub use topk::TopK;
 pub use visitor::{accept, visit_execution_plan, ExecutionPlanVisitor};
 
 use arrow::datatypes::SchemaRef;
diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs
index 8c66b269b363..b76b183c0942 100644
--- a/datafusion/physical-plan/src/sorts/sort.rs
+++ b/datafusion/physical-plan/src/sorts/sort.rs
@@ -24,9 +24,9 @@ use crate::expressions::PhysicalSortExpr;
 use crate::metrics::{
     BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricBuilder, MetricsSet,
 };
-use crate::topk::TopK;
 use crate::sorts::merge::streaming_merge;
 use 
crate::stream::{RecordBatchReceiverStream, RecordBatchStreamAdapter}; +use crate::topk::TopK; use crate::{ DisplayAs, DisplayFormatType, Distribution, EmptyRecordBatchStream, ExecutionPlan, Partitioning, SendableRecordBatchStream, Statistics, From c0f89c114095309d4032686e2e32ddb8f0c49aba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Mon, 2 Oct 2023 14:50:09 +0200 Subject: [PATCH 17/32] Cleanup --- datafusion/physical-plan/src/topk/mod.rs | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index fcf161b3821f..16c7508f92d7 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -370,8 +370,8 @@ impl TopKHeap { return Ok((RecordBatch::new_empty(schema), topk_rows)); } - // Indicies for each row within its respective RecordBatch - let indicies: Vec<_> = topk_rows + // Indices for each row within its respective RecordBatch + let indices: Vec<_> = topk_rows .iter() .enumerate() .map(|(i, k)| (i, k.index)) @@ -396,7 +396,7 @@ impl TopKHeap { // rows and `input_arrays` contains a reference to the // relevant Array for that index. `interleave` pulls // them together into a single new array - Ok(interleave(&input_arrays, &indicies)?) + Ok(interleave(&input_arrays, &indices)?) }) .collect::>()?; @@ -413,17 +413,12 @@ impl TopKHeap { // batches might be partially full let max_unused_rows = (20 * self.batch_size) + self.k; let unused_rows = self.store.unused_rows(); - use log::info; - //info!("{} batches in store, unused rows in store: {}, max unused rows: {}", - //self.store.len(), unused_rows, max_unused_rows); // don't compact if the store has only one batch or if self.store.len() <= 2 || unused_rows < max_unused_rows { //if self.store.len() <= 2 { return Ok(()); } - info!("Have {} batches in store, COMPACTING", self.store.len()); - // at first, compact the entire thing always into a new batch // (maybe we can get fancier in the future about ignoring // batches that have a high usage ratio already @@ -450,10 +445,6 @@ impl TopKHeap { // restore the heap self.inner = BinaryHeap::from(topk_rows); - info!( - "COMPACTION DONE: Have {} batches in store", - self.store.len() - ); Ok(()) } From 466d4b627d260f69dbbc3bf346b7617f6e34e582 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Mon, 2 Oct 2023 15:11:17 +0200 Subject: [PATCH 18/32] Fix test --- datafusion/physical-plan/src/sorts/sort.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index b76b183c0942..703f80d90d2b 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -1074,7 +1074,7 @@ mod tests { assert_eq!(result.len(), 1); let metrics = sort_exec.metrics().unwrap(); - let did_it_spill = metrics.spill_count().unwrap() > 0; + let did_it_spill = metrics.spill_count().unwrap_or(0) > 0; assert_eq!(did_it_spill, expect_spillage, "with fetch: {fetch:?}"); } Ok(()) From 33065ad8598cf373908b633b56170828895edf21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Mon, 2 Oct 2023 16:24:44 +0200 Subject: [PATCH 19/32] Cleanup --- datafusion/physical-plan/src/topk/mod.rs | 109 +---------------------- 1 file changed, 4 insertions(+), 105 deletions(-) diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index 16c7508f92d7..0ee9b7ea42c8 100644 
--- a/datafusion/physical-plan/src/topk/mod.rs
+++ b/datafusion/physical-plan/src/topk/mod.rs
@@ -17,7 +17,10 @@
 
 //! TopK: Combination of Sort / LIMIT
 
-use arrow::row::{RowConverter, Rows, SortField};
+use arrow::{
+    compute::interleave,
+    row::{RowConverter, Rows, SortField},
+};
 use std::{cmp::Ordering, collections::BinaryHeap, sync::Arc};
 
 use arrow_array::{
@@ -648,107 +651,3 @@ impl RecordBatchStore {
             + self.batches_size
     }
 }
-
-/// wrapper over [`arrow::compute::interleave`] that re-encodes
-/// dictionaries that have a low usage (values referenced)
-fn interleave(values: &[&dyn Array], indices: &[(usize, usize)]) -> Result {
-    // for now, always re-encode only string dictionaries
-    if !values.is_empty() {
-        match values[0].data_type() {
-            DataType::Dictionary(_key_type, value_type)
-                if value_type.as_ref() == &DataType::Utf8 =>
-            {
-                return interleave_and_repack_dictionary(values, indices);
-            }
-            _ => {}
-        }
-    }
-    // fallback to arrow
-    Ok(arrow::compute::interleave(values, indices)?)
-}
-
-/// Special interleave kernel that re-creates the dictionary values,
-/// ensuring no unused space
-fn interleave_and_repack_dictionary(
-    values: &[&dyn Array],
-    indices: &[(usize, usize)],
-) -> Result {
-    let data_type = values[0].data_type();
-
-    // maps strings to new keys (indexes)
-    let mut new_value_to_key = HashMap::new();
-    let mut new_values = StringBuilder::new();
-    let mut new_keys = vec![];
-
-    for (array_idx, row_idx) in indices {
-        // look up value,
-        let array = values[*array_idx];
-        downcast_dictionary_array!(
-            array=> {
-                if let Some(key) = array.key(*row_idx) {
-                    let values: &StringArray = array.values().as_string();
-                    if values.is_valid(key) {
-                        let current_value = values.value(key);
-                        if let Some(new_key) = new_value_to_key.get(current_value) {
-                            // value was already in the set
-                            new_keys.push(Some(*new_key))
-                        } else {
-                            // value not yet seen
-                            let new_key = new_value_to_key.len();
-                            new_values.append_value(current_value);
-                            new_keys.push(Some(new_key));
-                            new_value_to_key.insert(current_value, new_key);
-                        }
-                    } else {
-                        new_keys.push(None)
-                    }
-                }
-                else {
-                    new_keys.push(None);
-                }
-            }
-            _ => unreachable!("Non dictionary type")
-
-        )
-    }
-
-    // form the output
-    let DataType::Dictionary(key_type, _value_type) = data_type else {
-        unreachable!("non dictionary type");
-    };
-
-    let new_values: ArrayRef = Arc::new(new_values.finish());
-
-    // creates a $ARRAY_TYPE array from $NEW_KEYS and $NEW_VALUES
-    use datafusion_common::DataFusionError;
-    macro_rules! 
make_keys {
-        ($PRIM_TYPE:ty, $ARRAY_TYPE:ty, $NEW_KEYS:ident, $NEW_VALUES:ident) => {{
-            // check the keys will fit in this array
-            if $NEW_VALUES.len() >= <$PRIM_TYPE>::MAX as usize {
-                return Err(DataFusionError::Execution(format!(
-                    "keys did not fit in prim type -- TODO MAKE BETTER"
-                )));
-            }
-            let new_keys: $ARRAY_TYPE = new_keys
-                .iter()
-                .map(|v| v.map(|v| v as $PRIM_TYPE))
-                .collect();
-            Ok(Arc::new(DictionaryArray::try_new(new_keys, new_values)?))
-        }};
-    }
-
-    match key_type.as_ref() {
-        DataType::Int8 => make_keys!(i8, Int8Array, new_keys, new_values),
-        DataType::Int16 => make_keys!(i16, Int16Array, new_keys, new_values),
-        DataType::Int32 => make_keys!(i32, Int32Array, new_keys, new_values),
-        DataType::Int64 => make_keys!(i64, Int64Array, new_keys, new_values),
-        DataType::UInt8 => make_keys!(u8, UInt8Array, new_keys, new_values),
-        DataType::UInt16 => make_keys!(u16, UInt16Array, new_keys, new_values),
-        DataType::UInt32 => make_keys!(u32, UInt32Array, new_keys, new_values),
-        DataType::UInt64 => make_keys!(u64, UInt64Array, new_keys, new_values),
-        _ => {
-            // handle other keys
-            unreachable!("invalid key type");
-        }
-    }
-}

From e31718ec808c8c3b2cb5d64405445f01762e2497 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20Heres?=
Date: Mon, 2 Oct 2023 17:03:40 +0200
Subject: [PATCH 20/32] Make test deterministic

---
 datafusion/sqllogictest/test_files/decimal.slt | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/datafusion/sqllogictest/test_files/decimal.slt b/datafusion/sqllogictest/test_files/decimal.slt
index a326a0cc4941..d7632138a84e 100644
--- a/datafusion/sqllogictest/test_files/decimal.slt
+++ b/datafusion/sqllogictest/test_files/decimal.slt
@@ -507,27 +507,26 @@ select * from decimal_simple where c1 >= 0.00004 order by c1;
 
 query RRIBR
-select * from decimal_simple where c1 >= 0.00004 order by c1 limit 10;
+select * from decimal_simple where c1 >= 0.00004 order by c1, c3 limit 10;
 ----
 0.00004 0.000000000004 5 true 0.000044
+0.00004 0.000000000004 8 false 0.000044
 0.00004 0.000000000004 12 false 0.00004
 0.00004 0.000000000004 14 true 0.00004
-0.00004 0.000000000004 8 false 0.000044
-0.00005 0.000000000005 9 true 0.000052
+0.00005 0.000000000005 1 false 0.0001
 0.00005 0.000000000005 4 true 0.000078
 0.00005 0.000000000005 8 false 0.000033
+0.00005 0.000000000005 9 true 0.000052
 0.00005 0.000000000005 100 true 0.000068
-0.00005 0.000000000005 1 false 0.0001
-
 
 query RRIBR
-select * from decimal_simple where c1 >= 0.00004 order by c1 limit 5;
+select * from decimal_simple where c1 >= 0.00004 order by c1, c3 limit 5;
 ----
 0.00004 0.000000000004 5 true 0.000044
+0.00004 0.000000000004 8 false 0.000044
 0.00004 0.000000000004 12 false 0.00004
 0.00004 0.000000000004 14 true 0.00004
-0.00004 0.000000000004 8 false 0.000044
-0.00005 0.000000000005 9 true 0.000052
+0.00005 0.000000000005 1 false 0.0001
 
 
 query RRIBR

From 40ef4488acc8fdae8b8f5811c581a7a31b31fd82 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20Heres?=
Date: Mon, 2 Oct 2023 17:59:48 +0200
Subject: [PATCH 21/32] Clippy, doctest

---
 datafusion/physical-plan/src/topk/mod.rs | 10 +++------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs
index 0ee9b7ea42c8..3cd1eaca5b03 100644
--- a/datafusion/physical-plan/src/topk/mod.rs
+++ b/datafusion/physical-plan/src/topk/mod.rs
@@ -23,12 +23,8 @@ use arrow::{
 };
 use std::{cmp::Ordering, collections::BinaryHeap, sync::Arc};
 
-use 
arrow_array::{ - builder::StringBuilder, cast::AsArray, downcast_dictionary_array, Array, ArrayRef, - DictionaryArray, Int16Array, Int32Array, Int64Array, Int8Array, RecordBatch, - StringArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, -}; -use arrow_schema::{DataType, SchemaRef}; +use arrow_array::{Array, ArrayRef, RecordBatch}; +use arrow_schema::SchemaRef; use datafusion_common::Result; use datafusion_execution::{ memory_pool::{MemoryConsumer, MemoryReservation}, @@ -55,7 +51,7 @@ use super::metrics::{BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricBuil /// /// The simple plan would be: /// -/// ``` +/// ```sql /// > explain SELECT customer_id, revenue FROM sales ORDER BY revenue DESC limit 3; /// +--------------+----------------------------------------+ /// | plan_type | plan | From c373ce312fefdf12fd0ebb3eee627f1b12812532 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Tue, 3 Oct 2023 13:35:31 +0200 Subject: [PATCH 22/32] Use into_sorted_vec --- datafusion/physical-plan/src/topk/mod.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index 3cd1eaca5b03..6bdfc1b8b776 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -360,10 +360,8 @@ impl TopKHeap { pub fn emit_with_state(&mut self) -> Result<(RecordBatch, Vec)> { let schema = self.store.schema().clone(); - let mut topk_rows = std::mem::take(&mut self.inner).into_vec(); - - // sort low to high (reverse the reverse) - topk_rows.sort(); + // generate sorted rows + let topk_rows = std::mem::take(&mut self.inner).into_sorted_vec(); if self.store.is_empty() { return Ok((RecordBatch::new_empty(schema), topk_rows)); From bd72ad878bb4d74b503936963b1388cb4e1fcf4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Tue, 3 Oct 2023 14:35:10 +0200 Subject: [PATCH 23/32] Fix nondeterministic tests --- datafusion/sqllogictest/test_files/window.slt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt index 4ba0d6cc3e40..5fb5a04c6709 100644 --- a/datafusion/sqllogictest/test_files/window.slt +++ b/datafusion/sqllogictest/test_files/window.slt @@ -2673,7 +2673,7 @@ SELECT LEAD(inc_col, -1, 1001) OVER(ORDER BY ts DESC RANGE BETWEEN 1 PRECEDING and 10 FOLLOWING) AS leadr1, LEAD(inc_col, 4, 1004) OVER(ORDER BY ts DESC ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as leadr2 FROM annotated_data_finite - ORDER BY ts DESC + ORDER BY ts DESC, fv2 LIMIT 5; ---- 264 289 266 305 305 305 278 99 99 99 99 86 86 296 291 296 1004 305 305 301 296 305 1002 305 286 @@ -3274,13 +3274,13 @@ drop table annotated_data_infinite2 query IRR SELECT C3, - MAX(c12) OVER window1, - MIN(c12) OVER window2 as max1 + MAX(c12) OVER window1 as max1, + MIN(c12) OVER window2 as max2 FROM aggregate_test_100 WINDOW window1 AS (ORDER BY C12), window2 AS (PARTITION BY C11), window3 AS (ORDER BY C1) - ORDER BY C3 + ORDER BY C3, max2 LIMIT 5 ---- -117 0.850672105305 0.850672105305 @@ -3329,7 +3329,7 @@ SELECT MIN(c12) OVER window1 as max1 FROM aggregate_test_100 WINDOW window1 AS (ORDER BY C12) - ORDER BY C3 + ORDER BY C3, min1 LIMIT 5 ---- -117 0.850672105305 0.014793053078 From 84ffae8ae0f20d785a99a00d884279baccff99b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Tue, 3 Oct 2023 14:42:35 +0200 Subject: [PATCH 24/32] Update cargo.lock --- 
datafusion-cli/Cargo.lock | 184 ++++++++++++++++++++------------------ 1 file changed, 99 insertions(+), 85 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 775f8ec87e38..ab7f24922899 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -77,9 +77,9 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b84bf0a05bbb2a83e5eb6fa36bb6e87baa08193c35ff52bbf6b38d8af2890e46" +checksum = "7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87" [[package]] name = "arrayref" @@ -143,7 +143,7 @@ dependencies = [ "chrono", "chrono-tz", "half", - "hashbrown 0.14.0", + "hashbrown 0.14.1", "num", ] @@ -234,7 +234,7 @@ dependencies = [ "arrow-schema", "chrono", "half", - "indexmap 2.0.0", + "indexmap 2.0.2", "lexical-core", "num", "serde", @@ -268,7 +268,7 @@ dependencies = [ "arrow-data", "arrow-schema", "half", - "hashbrown 0.14.0", + "hashbrown 0.14.1", ] [[package]] @@ -734,9 +734,9 @@ dependencies = [ [[package]] name = "brotli" -version = "3.3.4" +version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1a0b1dbcc8ae29329621f8d4f0d835787c1c38bb1401979b49d13b0b305ff68" +checksum = "516074a47ef4bce09577a3b379392300159ce5b1ba2e501ff1c819950066100f" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -745,9 +745,9 @@ dependencies = [ [[package]] name = "brotli-decompressor" -version = "2.3.4" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b6561fd3f895a11e8f72af2cb7d22e08366bebc2b6b57f7744c4bda27034744" +checksum = "da74e2b81409b1b743f8f0c62cc6254afefb8b8e50bbfe3735550f7aeefa3448" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -1026,9 +1026,9 @@ dependencies = [ [[package]] name = "ctor" -version = "0.2.4" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f34ba9a9bcb8645379e9de8cb3ecfcf4d1c85ba66d90deb3259206fa5aa193b" +checksum = "37e366bff8cd32dd8754b0991fb66b279dc48f598c3a18914852a6673deef583" dependencies = [ "quote", "syn 2.0.37", @@ -1041,7 +1041,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856" dependencies = [ "cfg-if", - "hashbrown 0.14.0", + "hashbrown 0.14.1", "lock_api", "once_cell", "parking_lot_core", @@ -1072,9 +1072,9 @@ dependencies = [ "futures", "glob", "half", - "hashbrown 0.14.0", - "indexmap 2.0.0", - "itertools 0.11.0", + "hashbrown 0.14.1", + "indexmap 2.0.2", + "itertools", "log", "num_cpus", "object_store", @@ -1145,7 +1145,7 @@ dependencies = [ "datafusion-common", "datafusion-expr", "futures", - "hashbrown 0.14.0", + "hashbrown 0.14.1", "log", "object_store", "parking_lot", @@ -1177,8 +1177,8 @@ dependencies = [ "datafusion-common", "datafusion-expr", "datafusion-physical-expr", - "hashbrown 0.14.0", - "itertools 0.11.0", + "hashbrown 0.14.1", + "itertools", "log", "regex-syntax", ] @@ -1199,10 +1199,10 @@ dependencies = [ "datafusion-common", "datafusion-expr", "half", - "hashbrown 0.14.0", + "hashbrown 0.14.1", "hex", - "indexmap 2.0.0", - "itertools 0.11.0", + "indexmap 2.0.2", + "itertools", "libc", "log", "md-5", @@ -1232,9 +1232,9 @@ dependencies = [ "datafusion-physical-expr", "futures", "half", - "hashbrown 0.14.0", - "indexmap 2.0.0", - "itertools 0.11.0", + "hashbrown 0.14.1", + "indexmap 2.0.2", + "itertools", "log", "once_cell", "parking_lot", @@ 
-1368,9 +1368,9 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.3" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "136526188508e25c6fef639d7927dfb3e0e3084488bf202267829cf7fc23dbdd" +checksum = "add4f07d43996f76ef320709726a556a9d4f965d9410d8d0271132d2f8293480" dependencies = [ "errno-dragonfly", "libc", @@ -1408,9 +1408,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.0.0" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764" +checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" [[package]] name = "fd-lock" @@ -1639,9 +1639,9 @@ checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" [[package]] name = "hashbrown" -version = "0.14.0" +version = "0.14.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a" +checksum = "7dfda62a12f55daeae5015f81b0baea145391cb4520f86c248fc615d72640d12" dependencies = [ "ahash", "allocator-api2", @@ -1821,12 +1821,12 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.0.0" +version = "2.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5477fe2230a79769d8dc68e0eabf5437907c0457a5614a9e8dddb67f65eb65d" +checksum = "8adf3ddd720272c6ea8bf59463c04e0f93d0bbf7c5439b691bca2987e0270897" dependencies = [ "equivalent", - "hashbrown 0.14.0", + "hashbrown 0.14.1", ] [[package]] @@ -1850,15 +1850,6 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6" -[[package]] -name = "itertools" -version = "0.10.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" -dependencies = [ - "either", -] - [[package]] name = "itertools" version = "0.11.0" @@ -1986,9 +1977,9 @@ dependencies = [ [[package]] name = "linux-raw-sys" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a9bad9f94746442c783ca431b22403b519cd7fbeed0533fdd6328b2f2212128" +checksum = "3852614a3bd9ca9804678ba6be5e3b8ce76dfc902cae004e3e0c44051b6e88db" [[package]] name = "lock_api" @@ -2039,18 +2030,19 @@ dependencies = [ [[package]] name = "md-5" -version = "0.10.5" +version = "0.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6365506850d44bff6e2fbcb5176cf63650e48bd45ef2fe2665ae1570e0f4b9ca" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" dependencies = [ + "cfg-if", "digest", ] [[package]] name = "memchr" -version = "2.6.3" +version = "2.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c" +checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" [[package]] name = "mimalloc" @@ -2211,9 +2203,9 @@ dependencies = [ [[package]] name = "object_store" -version = "0.7.0" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d359e231e5451f4f9fa889d56e3ce34f8724f1a61db2107739359717cf2bbf08" +checksum = "f930c88a43b1c3f6e776dfe495b4afab89882dbc81530c632db2ed65451ebcb4" dependencies = [ "async-trait", "base64", @@ 
-2222,7 +2214,7 @@ dependencies = [ "futures", "humantime", "hyper", - "itertools 0.10.5", + "itertools", "parking_lot", "percent-encoding", "quick-xml", @@ -2315,7 +2307,7 @@ dependencies = [ "chrono", "flate2", "futures", - "hashbrown 0.14.0", + "hashbrown 0.14.1", "lz4", "num", "num-bigint", @@ -2357,7 +2349,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e1d3afd2628e69da2be385eb6f2fd57c8ac7977ceeff6dc166ff1657b0e386a9" dependencies = [ "fixedbitset", - "indexmap 2.0.0", + "indexmap 2.0.2", ] [[package]] @@ -2451,7 +2443,7 @@ dependencies = [ "anstyle", "difflib", "float-cmp", - "itertools 0.11.0", + "itertools", "normalize-line-endings", "predicates-core", "regex", @@ -2514,9 +2506,9 @@ dependencies = [ [[package]] name = "quick-xml" -version = "0.28.2" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ce5e73202a820a31f8a0ee32ada5e21029c81fd9e3ebf668a40832e4219d9d1" +checksum = "eff6510e86862b57b210fd8cbe8ed3f0d7d600b9c2863cd4549a2e033c66e956" dependencies = [ "memchr", "serde", @@ -2602,9 +2594,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.9.5" +version = "1.9.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "697061221ea1b4a94a624f67d0ae2bfe4e22b8a17b6a192afb11046542cc8c47" +checksum = "ebee201405406dbf528b8b672104ae6d6d63e6d118cb10e4d51abbc7b58044ff" dependencies = [ "aho-corasick", "memchr", @@ -2614,9 +2606,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.3.8" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795" +checksum = "59b23e92ee4318893fa3fe3e6fb365258efbfe6ac6ab30f090cdcbb7aa37efa9" dependencies = [ "aho-corasick", "memchr", @@ -2631,9 +2623,9 @@ checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" [[package]] name = "reqwest" -version = "0.11.20" +version = "0.11.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e9ad3fe7488d7e34558a2033d45a0c90b72d97b4f80705666fea71472e2e6a1" +checksum = "046cd98826c46c2ac8ddecae268eb5c2e58628688a5fc7a2643704a73faba95b" dependencies = [ "base64", "bytes", @@ -2657,6 +2649,7 @@ dependencies = [ "serde", "serde_json", "serde_urlencoded", + "system-configuration", "tokio", "tokio-rustls 0.24.1", "tokio-util", @@ -2728,9 +2721,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.14" +version = "0.38.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "747c788e9ce8e92b12cd485c49ddf90723550b654b32508f979b71a7b1ecda4f" +checksum = "d2f9da0cbd88f9f09e7814e388301c8414c51c62aa6ce1e4b5c551d49d96e531" dependencies = [ "bitflags 2.4.0", "errno", @@ -2786,9 +2779,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.101.5" +version = "0.101.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45a27e3b59326c16e23d30aeb7a36a24cc0d29e71d68ff611cdfb4a01d013bed" +checksum = "3c7d5dece342910d9ba34d259310cae3e0154b873b35408b787b59bce53d34fe" dependencies = [ "ring", "untrusted", @@ -2888,9 +2881,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.18" +version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0293b4b29daaf487284529cc2f5675b8e57c61f70167ba415a463651fd6a918" +checksum = "ad977052201c6de01a8ef2aa3378c4bd23217a056337d1d6da40468d267a4fb0" [[package]] name = "seq-macro" @@ -2943,9 +2936,9 @@ 
dependencies = [ [[package]] name = "sha2" -version = "0.10.7" +version = "0.10.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "479fb9d862239e610720565ca91403019f2f00410f1864c5aa7479b950a76ed8" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" dependencies = [ "cfg-if", "cpufeatures", @@ -3135,6 +3128,27 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "system-configuration" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7" +dependencies = [ + "bitflags 1.3.2", + "core-foundation", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75fb188eb626b924683e3b95e3a48e63551fcfb51949de2f06a9d91dbee93c9" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "tempfile" version = "3.8.0" @@ -3142,7 +3156,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb94d2f3cc536af71caac6b6fcebf65860b347e7ce0cc9ebe8f70d3e521054ef" dependencies = [ "cfg-if", - "fastrand 2.0.0", + "fastrand 2.0.1", "redox_syscall 0.3.5", "rustix", "windows-sys", @@ -3171,18 +3185,18 @@ checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d" [[package]] name = "thiserror" -version = "1.0.48" +version = "1.0.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d6d7a740b8a666a7e828dd00da9c0dc290dff53154ea77ac109281de90589b7" +checksum = "1177e8c6d7ede7afde3585fd2513e611227efd6481bd78d2e82ba1ce16557ed4" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.48" +version = "1.0.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49922ecae66cc8a249b77e68d1d0623c1b2c514f0060c27cdc68bd62a1219d35" +checksum = "10712f02019e9288794769fba95cd6847df9874d49d871d062172f9dd41bc4cc" dependencies = [ "proc-macro2", "quote", @@ -3202,9 +3216,9 @@ dependencies = [ [[package]] name = "time" -version = "0.3.28" +version = "0.3.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17f6bb557fd245c28e6411aa56b6403c689ad95061f50e4be16c274e70a17e48" +checksum = "426f806f4089c493dcac0d24c29c01e2c38baf8e30f1b716ee37e83d200b18fe" dependencies = [ "deranged", "serde", @@ -3214,15 +3228,15 @@ dependencies = [ [[package]] name = "time-core" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" +checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" [[package]] name = "time-macros" -version = "0.2.14" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a942f44339478ef67935ab2bbaec2fb0322496cf3cbe84b261e06ac3814c572" +checksum = "4ad70d68dba9e1f8aceda7aa6711965dfec1cac869f311a51bd08b3a2ccbce20" dependencies = [ "time-core", ] @@ -3617,9 +3631,9 @@ dependencies = [ [[package]] name = "webpki" -version = "0.22.1" +version = "0.22.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0e74f82d49d545ad128049b7e88f6576df2da6b02e9ce565c6f533be576957e" +checksum = "07ecc0cd7cac091bf682ec5efa18b1cff79d617b84181f38b3951dbe135f607f" dependencies = [ "ring", "untrusted", @@ -3749,9 +3763,9 @@ dependencies = [ [[package]] name = "xmlparser" -version 
= "0.13.5" +version = "0.13.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d25c75bf9ea12c4040a97f829154768bbbce366287e2dc044af160cd79a13fd" +checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" [[package]] name = "xz2" From 592b10e5f3455200f3b3dddb6f79b85f1fdc9208 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Tue, 3 Oct 2023 20:28:28 +0200 Subject: [PATCH 25/32] Update datafusion/physical-plan/src/topk/mod.rs Co-authored-by: Andrew Lamb --- datafusion/physical-plan/src/topk/mod.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index 6bdfc1b8b776..c1a16e74d8c3 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -122,6 +122,8 @@ impl TopK { }) .collect::>()?; + // TODO there is potential to add special cases for single column sort fields + // to improve performance let row_converter = RowConverter::new(sort_fields)?; let scratch_rows = row_converter.empty_rows( batch_size, From 47ee1994c389ec7425d1ed19effbdb92b5d44ce9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Tue, 3 Oct 2023 20:28:46 +0200 Subject: [PATCH 26/32] Update datafusion/physical-plan/src/topk/mod.rs Co-authored-by: Andrew Lamb --- datafusion/physical-plan/src/topk/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index c1a16e74d8c3..c4bd4ec555f4 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -142,7 +142,7 @@ impl TopK { }) } - /// Insert `batch`, remembering it if any of its values are among + /// Insert `batch`, remembering if any of its values are among /// the top k seen so far. pub fn insert_batch(&mut self, batch: RecordBatch) -> Result<()> { // Updates on drop From 2c3363769a631d5c0af4af46b644ad6c8f3804e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Wed, 4 Oct 2023 11:24:27 +0200 Subject: [PATCH 27/32] Update datafusion/physical-plan/src/topk/mod.rs Co-authored-by: Andrew Lamb --- datafusion/physical-plan/src/topk/mod.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index c4bd4ec555f4..b672a86682a8 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -258,10 +258,6 @@ impl TopKMetrics { /// /// Using the `Row` format handles things such as ascending vs /// descending and nulls first vs nulls last. -/// -/// It doesn't use `BinaryHeap` in the Rust standard library because -/// it is important to check the current minimum value in the heap -/// prior to creating a new value to insert. struct TopKHeap { /// The maximum number of elemenents to store in this heap. 
k: usize, From c9121ccf2dc0aee9abf04460e30f5bce1e96a1c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Wed, 4 Oct 2023 11:28:51 +0200 Subject: [PATCH 28/32] Update datafusion/physical-plan/src/topk/mod.rs Co-authored-by: Andrew Lamb --- datafusion/physical-plan/src/topk/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index b672a86682a8..ba445b4d2348 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -67,7 +67,7 @@ use super::metrics::{BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricBuil /// input before discarding everything other than the top 3 elements. /// /// The same answer can be produced by simply keeping track of the top -/// N elements, reducing the total amount of required buffer memory. +/// K=3 elements, reducing the total amount of required buffer memory. /// /// # Structure /// From 0dc3488bdc55ae3950f4b15464488c40adca55b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Wed, 4 Oct 2023 11:29:37 +0200 Subject: [PATCH 29/32] Add / update some comments --- datafusion/physical-plan/src/topk/mod.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index ba445b4d2348..4638c0dcf264 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -163,9 +163,8 @@ impl TopK { self.row_converter.append(rows, &sort_keys)?; // TODO make this algorithmically better?: - // 1. only check topk values in rows - // 2. only do one update through top_k - + // Idea: filter out rows >= self.heap.max() early (before passing to `RowConverter`) + // this avoids some work and also might be better vectorizable. 
let mut batch_entry = self.heap.register_batch(batch); for (index, row) in rows.iter().enumerate() { match self.heap.max() { @@ -409,9 +408,9 @@ impl TopKHeap { let max_unused_rows = (20 * self.batch_size) + self.k; let unused_rows = self.store.unused_rows(); - // don't compact if the store has only one batch or + // don't compact if the store has one extra batch or + // unused rows is under the threshold if self.store.len() <= 2 || unused_rows < max_unused_rows { - //if self.store.len() <= 2 { return Ok(()); } // at first, compact the entire thing always into a new batch From 0470306950eee9614706ea8c451d24880e2d77d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Wed, 4 Oct 2023 13:55:09 +0200 Subject: [PATCH 30/32] Rename test file --- datafusion/sqllogictest/test_files/{aal.slt => topk.slt} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename datafusion/sqllogictest/test_files/{aal.slt => topk.slt} (100%) diff --git a/datafusion/sqllogictest/test_files/aal.slt b/datafusion/sqllogictest/test_files/topk.slt similarity index 100% rename from datafusion/sqllogictest/test_files/aal.slt rename to datafusion/sqllogictest/test_files/topk.slt From 0c59fe1975f628259b7fa19ff3a3e13609fc2c34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Wed, 4 Oct 2023 13:55:49 +0200 Subject: [PATCH 31/32] Rename table as well --- datafusion/sqllogictest/test_files/topk.slt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion/sqllogictest/test_files/topk.slt b/datafusion/sqllogictest/test_files/topk.slt index 36dc0d9fdcf9..70a90e9daf59 100644 --- a/datafusion/sqllogictest/test_files/topk.slt +++ b/datafusion/sqllogictest/test_files/topk.slt @@ -18,10 +18,10 @@ # Tests for development statement ok -create table aal(x int) as values (10), (2), (3), (0), (5), (4), (3), (2), (1), (3), (8); +create table topk(x int) as values (10), (2), (3), (0), (5), (4), (3), (2), (1), (3), (8); query I -select * from aal order by x; +select * from topk order by x; ---- 0 1 @@ -36,14 +36,14 @@ select * from aal order by x; 10 query I -select * from aal order by x limit 3; +select * from topk order by x limit 3; ---- 0 1 2 query I -select * from aal order by x desc limit 3; +select * from topk order by x desc limit 3; ---- 10 8 From 6bb299bae6ac0c30c0303b4d02f127f2da4a57ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Thu, 5 Oct 2023 08:50:46 +0200 Subject: [PATCH 32/32] Update datafusion/sqllogictest/test_files/topk.slt Co-authored-by: Andrew Lamb --- datafusion/sqllogictest/test_files/topk.slt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/test_files/topk.slt b/datafusion/sqllogictest/test_files/topk.slt index 70a90e9daf59..8d3b70139d35 100644 --- a/datafusion/sqllogictest/test_files/topk.slt +++ b/datafusion/sqllogictest/test_files/topk.slt @@ -217,7 +217,7 @@ d 1 -72 25590 1188089983 3090286296481837049 241 832 3542840110 5885937420286765 a 1 -5 12636 794623392 2909750622865366631 15 24022 2669374863 4776679784701509574 0.29877836 0.253725340799 waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs -## make an example for +## make an example for dictionary encoding statement ok create table dict as select c1, c2, c3, c13, arrow_cast(c13, 'Dictionary(Int32, Utf8)') as c13_dict from aggregate_test_100;
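
A closing note for readers following the series. Stripped of the arrow plumbing, the operator these patches build reduces to a single technique: keep a bounded max-heap of the best k rows, and only pay for an insert when a candidate beats the current worst of the top k. The sketch below shows that idea over plain integers, using `std::collections::BinaryHeap` and `into_sorted_vec` as patch 22 does. It is an illustration only (the function name `top_k` and the `i32` data are invented for this note); the real operator compares arrow `Row`-encoded sort keys and stores the surviving rows in `RecordBatch`es.

```rust
use std::collections::BinaryHeap;

/// Return the `k` smallest items in ascending order
/// (i.e. `ORDER BY x LIMIT k`); descending just flips the comparison.
fn top_k<T: Ord>(items: impl IntoIterator<Item = T>, k: usize) -> Vec<T> {
    assert!(k > 0);
    // max-heap: the root is the "worst" item currently kept, so most
    // candidates are rejected after a single comparison against it
    let mut heap: BinaryHeap<T> = BinaryHeap::with_capacity(k);
    for item in items {
        if heap.len() < k {
            heap.push(item);
        } else if heap.peek().map_or(false, |max| item < *max) {
            heap.pop();
            heap.push(item);
        }
    }
    // emit low to high, mirroring `TopKHeap::emit_with_state`
    heap.into_sorted_vec()
}

fn main() {
    // the same values as the `topk` table in topk.slt above
    let data = [10, 2, 3, 0, 5, 4, 3, 2, 1, 3, 8];
    assert_eq!(top_k(data, 3), vec![0, 1, 2]);
    // descending via negation, matching the `order by x desc limit 3` test
    assert_eq!(top_k(data.map(|x| -x), 3), vec![-10, -8, -5]);
}
```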
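
One more sketch, for the part of the series that was ultimately abandoned: patches 10 through 14 prototype a dictionary-aware `interleave` that rebuilds a string dictionary so compacted output stops pinning values nobody references, and patch 19 backs it out in favor of plain `arrow::compute::interleave`. The re-keying loop at its core can be shown without arrow at all. In the sketch below, `repack`, `old_values`, and `old_keys` are hypothetical names invented for this note; the prototype operated on arrow `DictionaryArray`s with `Utf8` values, not on slices.

```rust
use std::collections::HashMap;

/// Rebuild a dictionary encoding (keys plus values table) so the new
/// values table holds only entries that are still referenced, numbered
/// in first-use order. `None` keys stand in for nulls.
fn repack(
    old_values: &[&str],
    old_keys: &[Option<usize>],
) -> (Vec<String>, Vec<Option<usize>>) {
    // maps a value to its key (index) in the new values table
    let mut new_value_to_key: HashMap<&str, usize> = HashMap::new();
    let mut new_values: Vec<String> = Vec::new();
    let mut new_keys = Vec::with_capacity(old_keys.len());

    for key in old_keys.iter().copied() {
        new_keys.push(key.map(|k| {
            let value = old_values[k];
            *new_value_to_key.entry(value).or_insert_with(|| {
                // first time this value is seen: append it, assign a key
                new_values.push(value.to_string());
                new_values.len() - 1
            })
        }));
    }
    (new_values, new_keys)
}

fn main() {
    // after a top-k selection, "b" and "d" are no longer referenced,
    // so repacking drops them from the values table
    let values = ["a", "b", "c", "d"];
    let keys = [Some(2), Some(0), None, Some(2)];
    let (new_values, new_keys) = repack(&values, &keys);
    assert_eq!(new_values, vec!["c", "a"]);
    assert_eq!(new_keys, vec![Some(0), Some(1), None, Some(0)]);
}
```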