From 524af05d3d7feb15f3f9e5f98d47ed4cd62297f9 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Wed, 9 Aug 2023 14:56:37 -0400
Subject: [PATCH 01/32] Prototype TopK operator

---
 datafusion/core/src/physical_plan/mod.rs      |   2 +
 .../core/src/physical_plan/sorts/sort.rs      |  77 ++-
 datafusion/core/src/physical_plan/topk/mod.rs | 516 ++++++++++++++++++
 .../tests/sqllogictests/test_files/aal.slt    | 202 +++++++
 .../tests/sqllogictests/test_files/window.slt | 101 ++--
 5 files changed, 825 insertions(+), 73 deletions(-)
 create mode 100644 datafusion/core/src/physical_plan/topk/mod.rs
 create mode 100644 datafusion/core/tests/sqllogictests/test_files/aal.slt

diff --git a/datafusion/core/src/physical_plan/mod.rs b/datafusion/core/src/physical_plan/mod.rs
index c73e61aea190..c60dbd6a44c5 100644
--- a/datafusion/core/src/physical_plan/mod.rs
+++ b/datafusion/core/src/physical_plan/mod.rs
@@ -17,6 +17,7 @@
 
 //! Traits for physical query plan, supporting parallel execution for partitioned relations.
 
+mod topk;
 mod visitor;
 pub use self::metrics::Metric;
 use self::metrics::MetricsSet;
@@ -27,6 +28,7 @@ use crate::datasource::physical_plan::FileScanConfig;
 use crate::physical_plan::expressions::PhysicalSortExpr;
 use datafusion_common::Result;
 pub use datafusion_common::{ColumnStatistics, Statistics};
+pub use topk::TopK;
 pub use visitor::{accept, visit_execution_plan, ExecutionPlanVisitor};
 
 use arrow::datatypes::SchemaRef;
diff --git a/datafusion/core/src/physical_plan/sorts/sort.rs b/datafusion/core/src/physical_plan/sorts/sort.rs
index 52936dc55e6e..9f6ff0fb6d03 100644
--- a/datafusion/core/src/physical_plan/sorts/sort.rs
+++ b/datafusion/core/src/physical_plan/sorts/sort.rs
@@ -26,6 +26,7 @@ use crate::physical_plan::metrics::{
 };
 use crate::physical_plan::sorts::merge::streaming_merge;
 use crate::physical_plan::stream::{RecordBatchReceiverStream, RecordBatchStreamAdapter};
+use crate::physical_plan::topk::TopK;
 use crate::physical_plan::{
     DisplayAs, DisplayFormatType, Distribution, EmptyRecordBatchStream, ExecutionPlan,
     Partitioning, SendableRecordBatchStream, Statistics,
 };
@@ -759,7 +760,12 @@ impl DisplayAs for SortExec {
         let expr: Vec<String> = self.expr.iter().map(|e| e.to_string()).collect();
         match self.fetch {
             Some(fetch) => {
-                write!(f, "SortExec: fetch={fetch}, expr=[{}]", expr.join(","))
+                write!(
+                    f,
+                    // TODO should this say topk?
+                    "SortExec: fetch={fetch}, expr=[{}]",
+                    expr.join(",")
+                )
             }
             None => write!(f, "SortExec: expr=[{}]", expr.join(",")),
         }
@@ -847,29 +853,54 @@ impl ExecutionPlan for SortExec {
 
         trace!("End SortExec's input.execute for partition: {}", partition);
 
-        let mut sorter = ExternalSorter::new(
-            partition,
-            input.schema(),
-            self.expr.clone(),
-            context.session_config().batch_size(),
-            self.fetch,
-            execution_options.sort_spill_reservation_bytes,
-            execution_options.sort_in_place_threshold_bytes,
-            &self.metrics_set,
-            context.runtime_env(),
-        );
+        if let Some(fetch) = self.fetch.as_ref() {
+            let mut topk = TopK::try_new(
+                partition,
+                input.schema(),
+                self.expr.clone(),
+                *fetch,
+                context.session_config().batch_size(),
+                context.runtime_env(),
+                &self.metrics_set,
+                partition,
+            )?;
+
+            Ok(Box::pin(RecordBatchStreamAdapter::new(
+                self.schema(),
+                futures::stream::once(async move {
+                    while let Some(batch) = input.next().await {
+                        let batch = batch?;
+                        topk.insert_batch(batch)?;
+                    }
+                    topk.emit()
+                })
+                .try_flatten(),
+            )))
+        } else {
+            let mut sorter = ExternalSorter::new(
+                partition,
+                input.schema(),
+                self.expr.clone(),
+                context.session_config().batch_size(),
+                self.fetch,
+                execution_options.sort_spill_reservation_bytes,
+                execution_options.sort_in_place_threshold_bytes,
+                &self.metrics_set,
+                context.runtime_env(),
+            );
 
-        Ok(Box::pin(RecordBatchStreamAdapter::new(
-            self.schema(),
-            futures::stream::once(async move {
-                while let Some(batch) = input.next().await {
-                    let batch = batch?;
-                    sorter.insert_batch(batch).await?;
-                }
-                sorter.sort()
-            })
-            .try_flatten(),
-        )))
+            Ok(Box::pin(RecordBatchStreamAdapter::new(
+                self.schema(),
+                futures::stream::once(async move {
+                    while let Some(batch) = input.next().await {
+                        let batch = batch?;
+                        sorter.insert_batch(batch).await?;
+                    }
+                    sorter.sort()
+                })
+                .try_flatten(),
+            )))
+        }
     }
 
     fn metrics(&self) -> Option<MetricsSet> {
diff --git a/datafusion/core/src/physical_plan/topk/mod.rs b/datafusion/core/src/physical_plan/topk/mod.rs
new file mode 100644
index 000000000000..d626f0806698
--- /dev/null
+++ b/datafusion/core/src/physical_plan/topk/mod.rs
@@ -0,0 +1,516 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
//! TopK: Combination of Sort / LIMIT

use arrow::{
    compute::interleave,
    row::{OwnedRow, RowConverter, Rows, SortField},
};
use std::{cmp::Ordering, sync::Arc};

use arrow_array::{Array, ArrayRef, RecordBatch};
use arrow_schema::SchemaRef;
use datafusion_common::Result;
use datafusion_execution::{
    memory_pool::{MemoryConsumer, MemoryReservation},
    runtime_env::RuntimeEnv,
};
use datafusion_physical_expr::PhysicalSortExpr;
use hashbrown::HashMap;

use crate::physical_plan::{stream::RecordBatchStreamAdapter, SendableRecordBatchStream};

use super::metrics::{BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricBuilder};

/// Global TopK
///
/// # Background
///
/// "Top K" is a common query optimization used for queries such as
/// "find the top 3 customers by revenue". The (simplified) SQL for
/// such a query might be:
///
/// ```sql
/// SELECT customer_id, revenue FROM 'sales.csv' ORDER BY revenue DESC limit 3;
/// ```
///
/// The simple plan would be:
///
/// ```
/// > explain SELECT customer_id, revenue FROM sales ORDER BY revenue DESC limit 3;
/// +--------------+----------------------------------------+
/// | plan_type    | plan                                   |
/// +--------------+----------------------------------------+
/// | logical_plan | Limit: 3                               |
/// |              |   Sort: revenue DESC NULLS FIRST       |
/// |              |     Projection: customer_id, revenue   |
/// |              |       TableScan: sales                 |
/// +--------------+----------------------------------------+
/// ```
///
/// While this plan produces the correct answer, it fully sorts the
/// input before discarding everything other than the top 3 elements.
///
/// The same answer can be produced by simply keeping track of the top
/// N elements, reducing the total amount of required buffer memory.
///
/// # Structure
///
/// This operator tracks the top K items using a `TopKHeap`.
pub struct TopK {
    /// schema of the output (and the input)
    schema: SchemaRef,
    /// Runtime metrics
    metrics: TopKMetrics,
    /// Memory reservation for this operator
    reservation: MemoryReservation,
    /// The target number of rows for output batches
    batch_size: usize,
    /// sort expressions
    expr: Arc<[PhysicalSortExpr]>,
    /// row converter, for sort keys
    row_converter: RowConverter,
    /// scratch space for converting rows
    scratch_rows: Rows,
    /// stores the top k values and their sort key values, in order
    heap: TopKHeap,
}

impl TopK {
    /// Create a new [`TopK`] that stores the top `k` values, as
    /// defined by the sort expressions in `expr`.
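    ///
    /// A rough sketch of how a caller drives the operator (illustrative
    /// only: `input`, `sort_exprs`, and the other bindings here are
    /// assumed, mirroring the `SortExec` integration above rather than
    /// documenting a tested API):
    ///
    /// ```ignore
    /// let mut topk = TopK::try_new(
    ///     partition, schema, sort_exprs, 3, batch_size, runtime, &metrics, partition,
    /// )?;
    /// while let Some(batch) = input.next().await {
    ///     topk.insert_batch(batch?)?; // keep only the best 3 rows seen so far
    /// }
    /// let stream = topk.emit()?; // top 3 rows, in sort order
    /// ```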
    // TODO: make a builder or some other nicer API to avoid the
    // clippy warning
    #[allow(clippy::too_many_arguments)]
    pub fn try_new(
        partition_id: usize,
        schema: SchemaRef,
        expr: Vec<PhysicalSortExpr>,
        k: usize,
        batch_size: usize,
        runtime: Arc<RuntimeEnv>,
        metrics: &ExecutionPlanMetricsSet,
        partition: usize,
    ) -> Result<Self> {
        let reservation = MemoryConsumer::new(format!("TopK[{partition_id}]"))
            .register(&runtime.memory_pool);

        let expr: Arc<[PhysicalSortExpr]> = expr.into();

        let sort_fields: Vec<_> = expr
            .iter()
            .map(|e| {
                Ok(SortField::new_with_options(
                    e.expr.data_type(&schema)?,
                    e.options,
                ))
            })
            .collect::<Result<Vec<_>>>()?;

        let row_converter = RowConverter::new(sort_fields)?;
        let scratch_rows = row_converter.empty_rows(
            batch_size,
            20 * batch_size, // guesstimate 20 bytes per row
        );

        Ok(Self {
            schema,
            metrics: TopKMetrics::new(metrics, partition),
            reservation,
            batch_size,
            expr,
            row_converter,
            scratch_rows,
            heap: TopKHeap::new(k),
        })
    }

    /// Insert `batch`, remembering it if any of its values are among
    /// the top k seen so far.
    pub fn insert_batch(&mut self, batch: RecordBatch) -> Result<()> {
        // updates the elapsed_compute metric on drop
        let _timer = self.metrics.baseline.elapsed_compute().timer();

        let sort_keys: Vec<ArrayRef> = self
            .expr
            .iter()
            .map(|expr| {
                let value = expr.expr.evaluate(&batch)?;
                Ok(value.into_array(batch.num_rows()))
            })
            .collect::<Result<Vec<_>>>()?;

        // reuse existing `Rows` to avoid reallocations
        let rows = &mut self.scratch_rows;
        rows.clear();
        self.row_converter.append(rows, &sort_keys)?;

        // TODO make this algorithmically better?:
        // 1. only check topk values in rows
        // 2. only do one update through top_k

        let mut batch_entry = self.heap.register_batch(batch);
        for (index, row) in rows.iter().enumerate() {
            match self.heap.k_largest() {
                // heap has k items, and the current row is not smaller
                // than the largest of them: skip
                Some(largest) if largest.row.row() <= row => {}
                // don't yet have k items, or the new item is smaller
                // than the largest of the current top k
                None | Some(_) => {
                    self.heap.add(&mut batch_entry, row.owned(), index);
                    self.metrics.row_replacements.add(1);
                }
            }
        }
        self.heap.insert_batch_entry(batch_entry);

        // update memory reservation
        self.reservation.try_resize(self.size())?;
        Ok(())
    }

    /// Returns the top k results broken into `batch_size` [`RecordBatch`]es
    pub fn emit(self) -> Result<SendableRecordBatchStream> {
        let Self {
            schema,
            metrics,
            reservation: _,
            batch_size,
            expr: _,
            row_converter: _,
            scratch_rows: _,
            heap,
        } = self;
        let _timer = metrics.baseline.elapsed_compute().timer(); // time updated on drop

        let mut batch = heap.emit(schema.clone())?;
        metrics.baseline.output_rows().add(batch.num_rows());

        // break into record batches as needed
        let mut batches = vec![];
        loop {
            if batch.num_rows() <= batch_size {
                batches.push(Ok(batch));
                break;
            } else {
                batches.push(Ok(batch.slice(0, batch_size)));
                let remaining = batch.num_rows() - batch_size;
                batch = batch.slice(batch_size, remaining);
            }
        }
        Ok(Box::pin(RecordBatchStreamAdapter::new(
            schema,
            futures::stream::iter(batches),
        )))
    }

    /// return the size of memory used by this operator, in bytes
    fn size(&self) -> usize {
        std::mem::size_of::<Self>()
            + self.row_converter.size()
            + self.scratch_rows.size()
            + self.heap.size()
    }
}

struct TopKMetrics {
    /// common baseline metrics (elapsed compute, output rows)
    pub baseline: BaselineMetrics,

    /// count of how many rows were replaced in the heap
    pub row_replacements: Count,
}

impl TopKMetrics {
    fn new(metrics: &ExecutionPlanMetricsSet, partition: usize) -> Self {
        Self {
            baseline: BaselineMetrics::new(metrics, partition),
            row_replacements: MetricBuilder::new(metrics)
                .counter("row_replacements", partition),
        }
    }
}

/// This structure keeps at most the *smallest* k items, using the
/// [arrow::row] format for sort keys. While it is called "topK", for
/// values like `1, 2, 3, 4, 5` the "top 3" really means the
/// *smallest* 3, `1, 2, 3`, not the *largest* 3, `3, 4, 5`.
///
/// Using the `Row` format handles things such as ascending vs
/// descending and nulls first vs nulls last.
///
/// It doesn't use `BinaryHeap` from the Rust standard library because
/// it is important to check the current minimum value in the heap
/// prior to creating a new value to insert.
struct TopKHeap {
    /// The maximum size of this heap.
    k: usize,
    /// Storage for at most `k` items, in ascending order. `inner[0]`
    /// holds the smallest value of the smallest k so far, and
    /// `inner[len-1]` holds the largest value of the smallest k so far.
    inner: Vec<TopKRow>,
    /// Stores the original row values (`TopKRow` only has the sort key)
    store: RecordBatchStore,
    /// The size of all `OwnedRow`s held by this heap
    owned_row_bytes: usize,
}

impl TopKHeap {
    fn new(k: usize) -> Self {
        assert!(k > 0);
        Self {
            k,
            inner: Vec::with_capacity(k),
            store: RecordBatchStore::new(),
            owned_row_bytes: 0,
        }
    }

    /// Register a [`RecordBatch`] with the heap, returning the
    /// appropriate entry
    pub fn register_batch(&mut self, batch: RecordBatch) -> RecordBatchEntry {
        self.store.register(batch)
    }

    /// Insert a [`RecordBatchEntry`] created by a previous call to
    /// [`Self::register_batch`] into storage.
    pub fn insert_batch_entry(&mut self, entry: RecordBatchEntry) {
        self.store.insert(entry)
    }

    /// Returns the largest value stored by the heap if there are k
    /// items, otherwise returns None
    fn k_largest(&self) -> Option<&TopKRow> {
        if self.inner.len() < self.k {
            None
        } else {
            self.inner.last()
        }
    }

    /// Adds `row` to this heap. If inserting this new item would
    /// increase the size past `k`, removes the previously largest item.
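    ///
    /// Worked example (values assumed for illustration): with `k = 3`
    /// and sort keys `[1, 4, 9]` stored, adding `7` inserts it in
    /// sorted position and pops `9`, leaving `[1, 4, 7]`. A key such
    /// as `10` never reaches `add`: `insert_batch` first consults
    /// [`Self::k_largest`] and skips keys that cannot beat the current
    /// top k.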
    fn add(&mut self, batch_entry: &mut RecordBatchEntry, row: OwnedRow, index: usize) {
        assert!(self.inner.len() <= self.k);

        batch_entry.uses += 1;

        self.owned_row_bytes += owned_row_size(&row);

        // put the new row into the correct location to maintain that
        // self.inner is sorted in ascending order
        let insertion_point = self
            .inner
            .partition_point(|current_row| current_row.row <= row);
        self.inner.insert(
            insertion_point,
            TopKRow {
                row,
                batch_id: batch_entry.id,
                index,
            },
        );

        // limit size to k items
        if self.inner.len() > self.k {
            // The popped entry is the largest of the k+1 items;
            // decrement the use count of the batch it came from
            if let Some(prev_max) = self.inner.pop() {
                if prev_max.batch_id == batch_entry.id {
                    batch_entry.uses -= 1;
                } else {
                    self.store.unuse(prev_max.batch_id);
                }
                // update memory accounting
                let prev_size = owned_row_size(&prev_max.row);
                assert!(self.owned_row_bytes >= prev_size);
                self.owned_row_bytes -= prev_size;
            }
        }
    }

    /// Returns the values stored in this heap, from values low to high, as a single
    /// [`RecordBatch`]
    pub fn emit(&self, schema: SchemaRef) -> Result<RecordBatch> {
        // Indices for each row within its respective RecordBatch
        let indices: Vec<_> = self
            .inner
            .iter()
            .enumerate()
            .map(|(i, k)| (i, k.index))
            .collect();

        let num_columns = {
            let Some(first_value) = self.inner.get(0) else {
                return Ok(RecordBatch::new_empty(schema));
            };
            self.store
                .get(first_value.batch_id)
                .expect("invalid batch id")
                .batch
                .num_columns()
        };

        // build the output columns one at a time, using the
        // `interleave` kernel to pick rows from different arrays
        let output_columns: Vec<_> = (0..num_columns)
            .map(|col| {
                let input_arrays: Vec<_> = self
                    .inner
                    .iter()
                    .map(|k| {
                        let entry =
                            self.store.get(k.batch_id).expect("invalid stored batch id");
                        entry.batch.column(col) as &dyn Array
                    })
                    .collect();

                // at this point `indices` contains indexes within the
                // rows and `input_arrays` contains a reference to the
                // relevant Array for that index. `interleave` pulls
                // them together into a single new array
                Ok(interleave(&input_arrays, &indices)?)
            })
            .collect::<Result<Vec<_>>>()?;

        Ok(RecordBatch::try_new(schema, output_columns)?)
    }

    /// return the size of memory used by this heap, in bytes
    fn size(&self) -> usize {
        std::mem::size_of::<Self>()
            + (self.inner.capacity() * std::mem::size_of::<TopKRow>())
            + self.store.size()
            + self.owned_row_bytes
    }
}

/// Size of memory owned by `row` until row::size() is available
/// TODO file upstream ticket in arrow-rs to add this
fn owned_row_size(row: &OwnedRow) -> usize {
    std::mem::size_of_val(row) + row.as_ref().len() // underlying data, doesn't account for capacity
}

/// Represents one of the top K rows. Orders according to `OwnedRow`
#[derive(Debug, PartialEq)]
struct TopKRow {
    /// the value of the sort key for this row
    row: OwnedRow,
    /// the index in this record batch the row came from
    index: usize,
    /// the RecordBatch this row came from: an id into a [`RecordBatchStore`]
    batch_id: u32,
}

impl Eq for TopKRow {}

impl PartialOrd for TopKRow {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for TopKRow {
    fn cmp(&self, other: &Self) -> Ordering {
        self.row.cmp(&other.row)
    }
}

#[derive(Debug)]
struct RecordBatchEntry {
    id: u32,
    batch: RecordBatch,
    // for this batch, how many times has it been used
    uses: usize,
}

/// This structure tracks [`RecordBatch`] by an id so that:
///
/// 1. The batches can be tracked via an id that can be copied cheaply
/// 2. The total memory held by all batches is tracked
#[derive(Debug)]
struct RecordBatchStore {
    /// id generator
    next_id: u32,
    /// storage
    batches: HashMap<u32, RecordBatchEntry>,
    /// total size of all record batches tracked by this store
    batches_size: usize,
}

impl RecordBatchStore {
    fn new() -> Self {
        Self {
            next_id: 0,
            batches: HashMap::new(),
            batches_size: 0,
        }
    }

    /// Register this batch with the store and assign an ID. No
    /// attempt is made to compare this batch to other batches
    pub fn register(&mut self, batch: RecordBatch) -> RecordBatchEntry {
        let id = self.next_id;
        self.next_id += 1;
        RecordBatchEntry { id, batch, uses: 0 }
    }

    /// Insert a record batch entry into this store, tracking its
    /// memory use, if it has any uses
    pub fn insert(&mut self, entry: RecordBatchEntry) {
        // uses of 0 means that none of the rows in the batch were stored in the topk
        if entry.uses > 0 {
            self.batches_size += entry.batch.get_array_memory_size();
            self.batches.insert(entry.id, entry);
        }
    }

    fn get(&self, id: u32) -> Option<&RecordBatchEntry> {
        self.batches.get(&id)
    }

    /// remove a use from the specified batch id. If the use count
    /// reaches zero the batch entry is removed from the store
    ///
    /// panics if there were no remaining uses of id
    pub fn unuse(&mut self, id: u32) {
        let remove = if let Some(batch_entry) = self.batches.get_mut(&id) {
            batch_entry.uses = batch_entry.uses.checked_sub(1).expect("underflow");
            batch_entry.uses == 0
        } else {
            panic!("No entry for id {id}");
        };

        if remove {
            let old_entry = self.batches.remove(&id).unwrap();
            self.batches_size = self
                .batches_size
                .checked_sub(old_entry.batch.get_array_memory_size())
                .unwrap();
        }
    }

    /// returns the size of memory used by this store, including all
    /// referenced `RecordBatch`es, in bytes
    pub fn size(&self) -> usize {
        std::mem::size_of::<Self>()
            + self.batches.capacity()
                * (std::mem::size_of::<u32>() + std::mem::size_of::<RecordBatchEntry>())
            + self.batches_size
    }
}
diff --git a/datafusion/core/tests/sqllogictests/test_files/aal.slt b/datafusion/core/tests/sqllogictests/test_files/aal.slt
new file mode 100644
index 000000000000..f19c79e8d1cb
--- /dev/null
+++ b/datafusion/core/tests/sqllogictests/test_files/aal.slt
@@ -0,0 +1,202 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.
You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Tests for development + +statement ok +create table aal(x int) as values (10), (2), (3), (0), (5), (4), (3), (2), (1), (3), (8); + +query I +select * from aal order by x; +---- +0 +1 +2 +2 +3 +3 +3 +4 +5 +8 +10 + +query I +select * from aal order by x limit 3; +---- +0 +1 +2 + +query I +select * from aal order by x desc limit 3; +---- +10 +8 +5 + + + + +statement ok +CREATE EXTERNAL TABLE aggregate_test_100 ( + c1 VARCHAR NOT NULL, + c2 TINYINT NOT NULL, + c3 SMALLINT NOT NULL, + c4 SMALLINT, + c5 INT, + c6 BIGINT NOT NULL, + c7 SMALLINT NOT NULL, + c8 INT NOT NULL, + c9 BIGINT UNSIGNED NOT NULL, + c10 VARCHAR NOT NULL, + c11 FLOAT NOT NULL, + c12 DOUBLE NOT NULL, + c13 VARCHAR NOT NULL +) +STORED AS CSV +WITH HEADER ROW +LOCATION '../../testing/data/csv/aggregate_test_100.csv' + +query TT +explain select * from aggregate_test_100 ORDER BY c13 desc limit 5; +---- +logical_plan +Limit: skip=0, fetch=5 +--Sort: aggregate_test_100.c13 DESC NULLS FIRST, fetch=5 +----TableScan: aggregate_test_100 projection=[c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13] +physical_plan +GlobalLimitExec: skip=0, fetch=5 +--SortExec: fetch=5, expr=[c13@12 DESC] +----CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13], has_header=true + + + + +query T +select c13 from aggregate_test_100 ORDER BY c13; +---- +0VVIHzxWtNOFLtnhjHEKjXaJOSLJfm +0keZ5G8BffGwgF2RwQD59TFzMStxCB +0og6hSkhbX8AC1ktFS4kounvTzy8Vo +1aOcrEGd0cOqZe2I5XBOm0nDcwtBZO +2T3wSlHdEmASmO0xcXHnndkKEt6bz8 +3BEOHQsMEFZ58VcNTOJYShTBpAPzbt +4HX6feIvmNXBN7XGqgO4YVBkhu8GDI +4JznSdBajNWhu4hRQwjV1FjTTxY68i +52mKlRE3aHCBZtjECq6sY9OqVf8Dze +56MZa5O1hVtX4c5sbnCfxuX5kDChqI +6FPJlLAcaQ5uokyOWZ9HGdLZObFvOZ +6WfVFBVGJSQb7FhA7E0lBwdvjfZnSW +6oIXZuIPIqEoPBvFmbt2Nxy3tryGUE +6x93sxYioWuq5c9Kkk8oTAAORM7cH0 +802bgTGl6Bk5TlkPYYTxp5JkKyaYUA +8LIh0b6jmDGm87BmIyjdxNIpX4ugjD +90gAtmGEeIqUTbo1ZrxCvWtsseukXC +9UbObCsVkmYpJGcGrgfK90qOnwb2Lj +AFGCj7OWlEB5QfniEFgonMq90Tq5uH +ALuRhobVWbnQTTWZdSOk0iVe8oYFhW +Amn2K87Db5Es3dFQO9cw9cvpAM6h35 +AyYVExXK6AR2qUTxNZ7qRHQOVGMLcz +BJqx5WokrmrrezZA0dUbleMYkG5U2O +BPtQMxnuSPpxMExYV9YkDa6cAN7GP3 +BsM5ZAYifRh5Lw3Y8X1r53I0cTJnfE +C2GT5KVyOPZpgKVl110TyZO0NcJ434 +DuJNG8tufSqW0ZstHqWj3aGvFLMg4A +EcCuckwsF3gV1Ecgmh5v4KM8g1ozif +ErJFw6hzZ5fmI5r8bhE4JzlscnhKZU +F7NSTjWvQJyBburN7CXRUlbgp2dIrA +Fi4rJeTQq4eXj8Lxg3Hja5hBVTVV5u +H5j5ZHy1FGesOAHjkQEDYCucbpKWRu +HKSMQ9nTnwXCJIte1JrM1dtYnDtJ8g +IWl0G3ZlMNf7WT8yjIB49cx7MmYOmr +IZTkHMLvIKuiLjhDjYMmIHxh166we4 +Ig1QcuKsjHXkproePdERo2w0mYzIqd +JHNgc2UCaiXOdmkxwDDyGhRlO0mnBQ +JN0VclewmjwYlSl8386MlWv5rEhWCz +JafwVLSVk5AVoXFuzclesQ000EE2k1 +KJFcmTVjdkCMv94wYCtfHMFhzyRsmH +Ktb7GQ0N1DrxwkCkEUsTaIXk0xYinn +Ld2ej8NEv5zNcqU60FwpHeZKBhfpiV +LiEBxds3X0Uw0lxiYjDqrkAaAwoiIW +MXhhH1Var3OzzJCtI9VNyYvA0q8UyJ +MeSTAXq8gVxVjbEjgkvU9YLte0X9uE +NEhyk8uIx4kEULJGa8qIyFjjBcP2G6 +O66j6PaYuZhEUtqV6fuU7TyjM2WxC5 +OF7fQ37GzaZ5ikA2oMyvleKtgnLjXh +OPwBqCEK5PWTjWaiOyL45u2NLTaDWv +Oq6J4Rx6nde0YlhOIJkFsX2MsSvAQ0 +Ow5PGpfTm4dXCfTDsXAOTatXRoAydR +QEHVvcP8gxI6EMJIrvcnIhgzPNjIvv +QJYm7YRA3YetcBHI5wkMZeLXVmfuNy 
+QYlaIAnJA6r8rlAb6f59wcxvcPcWFf +RilTlL1tKkPOUFuzmLydHAVZwv1OGl +Sfx0vxv1skzZWT1PqVdoRDdO6Sb6xH +TTQUwpMNSXZqVBKAFvXu7OlWvKXJKX +TtDKUZxzVxsq758G6AWPSYuZgVgbcl +VDhtJkYjAYPykCgOU9x3v7v3t4SO1a +VY0zXmXeksCT8BzvpzpPLbmU9Kp9Y4 +Vp3gmWunM5A7wOC9YW2JroFqTWjvTi +WHmjWk2AY4c6m7DA4GitUx6nmb1yYS +XemNcT1xp61xcM1Qz3wZ1VECCnq06O +Z2sWcQr0qyCJRMHDpRy3aQr7PkHtkK +aDxBtor7Icd9C5hnTvvw5NrIre740e +akiiY5N0I44CMwEnBL6RTBk7BRkxEj +b3b9esRhTzFEawbs6XhpKnD9ojutHB +bgK1r6v3BCTh0aejJUhkA1Hn6idXGp +cBGc0kSm32ylBDnxogG727C0uhZEYZ +cq4WSAIFwx3wwTUS5bp1wCe71R6U5I +dVdvo6nUD5FgCgsbOZLds28RyGTpnx +e2Gh6Ov8XkXoFdJWhl0EjwEHlMDYyG +f9ALCzwDAKmdu7Rk2msJaB1wxe5IBX +fuyvs0w7WsKSlXqJ1e6HFSoLmx03AG +gTpyQnEODMcpsPnJMZC66gh33i3m0b +gpo8K5qtYePve6jyPt6xgJx4YOVjms +gxfHWUF8XgY2KdFxigxvNEXe2V2XMl +i6RQVXKUh7MzuGMDaNclUYnFUAireU +ioEncce3mPOXD2hWhpZpCPWGATG6GU +jQimhdepw3GKmioWUlVSWeBVRKFkY3 +l7uwDoTepWwnAP0ufqtHJS3CRi7RfP +lqhzgLsXZ8JhtpeeUWWNbMz8PHI705 +m6jD0LBIQWaMfenwRCTANI9eOdyyto +mhjME0zBHbrK6NMkytMTQzOssOa1gF +mzbkwXKrPeZnxg2Kn1LRF5hYSsmksS +nYVJnVicpGRqKZibHyBAmtmzBXAFfT +oHJMNvWuunsIMIWFnYG31RCfkOo2V7 +oLZ21P2JEDooxV1pU31cIxQHEeeoLu +okOkcWflkNXIy4R8LzmySyY1EC3sYd +pLk3i59bZwd5KBZrI1FiweYTd5hteG +pTeu0WMjBRTaNRT15rLCuEh3tBJVc5 +qnPOOmslCJaT45buUisMRnM0rc77EK +t6fQUjJejPcjc04wHvHTPe55S65B4V +ukOiFGGFnQJDHFgZxHMpvhD3zybF0M +ukyD7b0Efj7tNlFSRmzZ0IqkEzg2a8 +waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs +wwXqSGKLyBQyPkonlzBNYUJTCo4LRS +xipQ93429ksjNcXPX5326VSg1xJZcW +y7C453hRWd4E7ImjNDWlpexB8nUqjh +ydkwycaISlYSlEq3TlkS2m15I2pcp8 + + +query TIIIIIIIITRRT +select * from aggregate_test_100 ORDER BY c13 desc limit 5; +---- +a 4 -38 20744 762932956 308913475857409919 7 45465 1787652631 878137512938218976 0.7459874 0.021825780392 ydkwycaISlYSlEq3TlkS2m15I2pcp8 +d 1 -98 13630 -1991133944 1184110014998006843 220 2986 225513085 9634106610243643486 0.89651865 0.164088254508 y7C453hRWd4E7ImjNDWlpexB8nUqjh +e 2 52 -12056 -1090239422 9011500141803970147 238 4168 2013662838 12565360638488684051 0.6694766 0.391444365692 xipQ93429ksjNcXPX5326VSg1xJZcW +d 1 -72 25590 1188089983 3090286296481837049 241 832 3542840110 5885937420286765261 0.41980565 0.215354023438 wwXqSGKLyBQyPkonlzBNYUJTCo4LRS +a 1 -5 12636 794623392 2909750622865366631 15 24022 2669374863 4776679784701509574 0.29877836 0.253725340799 waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs diff --git a/datafusion/core/tests/sqllogictests/test_files/window.slt b/datafusion/core/tests/sqllogictests/test_files/window.slt index cd257aaa92de..45a3bb583450 100644 --- a/datafusion/core/tests/sqllogictests/test_files/window.slt +++ b/datafusion/core/tests/sqllogictests/test_files/window.slt @@ -2597,6 +2597,7 @@ SELECT # test_source_sorted_builtin query TT EXPLAIN SELECT + ts, FIRST_VALUE(inc_col) OVER(ORDER BY ts RANGE BETWEEN 10 PRECEDING and 1 FOLLOWING) as fv1, FIRST_VALUE(inc_col) OVER(ORDER BY ts ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as fv2, LAST_VALUE(inc_col) OVER(ORDER BY ts RANGE BETWEEN 10 PRECEDING and 1 FOLLOWING) as lv1, @@ -2626,24 +2627,23 @@ EXPLAIN SELECT LIMIT 5; ---- logical_plan -Projection: fv1, fv2, lv1, lv2, nv1, nv2, rn1, rn2, rank1, rank2, dense_rank1, dense_rank2, lag1, lag2, lead1, lead2, fvr1, fvr2, lvr1, lvr2, lagr1, lagr2, leadr1, leadr2 ---Limit: skip=0, fetch=5 -----Sort: annotated_data_finite.ts DESC NULLS FIRST, fetch=5 -------Projection: FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING AS fv1, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS 
BETWEEN 10 PRECEDING AND 1 FOLLOWING AS fv2, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING AS lv1, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS lv2, NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING AS nv1, NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS nv2, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING AS rn1, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS rn2, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING AS rank1, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS rank2, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING AS dense_rank1, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS dense_rank2, LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING AS lag1, LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS lag2, LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING AS lead1, LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS lead2, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING AS fvr1, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS fvr2, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING AS lvr1, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS lvr2, LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING AS lagr1, LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS lagr2, LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING AS leadr1, LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS leadr2, annotated_data_finite.ts ---------WindowAggr: windowExpr=[[FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, 
LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, NTH_VALUE(annotated_data_finite.inc_col, Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, NTH_VALUE(annotated_data_finite.inc_col, Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, LAG(annotated_data_finite.inc_col, Int64(1), Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, LAG(annotated_data_finite.inc_col, Int64(2), Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, LEAD(annotated_data_finite.inc_col, Int64(-1), Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, LEAD(annotated_data_finite.inc_col, Int64(4), Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING]] -----------WindowAggr: windowExpr=[[FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, LAG(annotated_data_finite.inc_col, Int64(1), Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, LAG(annotated_data_finite.inc_col, Int64(2), Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, LEAD(annotated_data_finite.inc_col, Int64(-1), Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, LEAD(annotated_data_finite.inc_col, Int64(4), Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING]] -------------TableScan: annotated_data_finite projection=[ts, inc_col] +Limit: skip=0, fetch=5 +--Sort: annotated_data_finite.ts DESC NULLS FIRST, fetch=5 +----Projection: annotated_data_finite.ts, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING AS fv1, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS fv2, LAST_VALUE(annotated_data_finite.inc_col) 
ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING AS lv1, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS lv2, NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING AS nv1, NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS nv2, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING AS rn1, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS rn2, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING AS rank1, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS rank2, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING AS dense_rank1, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS dense_rank2, LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING AS lag1, LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS lag2, LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING AS lead1, LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS lead2, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING AS fvr1, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS fvr2, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING AS lvr1, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS lvr2, LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING AS lagr1, LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS lagr2, LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING AS leadr1, LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING AS leadr2 +------WindowAggr: windowExpr=[[FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 
FOLLOWING, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, NTH_VALUE(annotated_data_finite.inc_col, Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, NTH_VALUE(annotated_data_finite.inc_col, Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, LAG(annotated_data_finite.inc_col, Int64(1), Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, LAG(annotated_data_finite.inc_col, Int64(2), Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, LEAD(annotated_data_finite.inc_col, Int64(-1), Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, LEAD(annotated_data_finite.inc_col, Int64(4), Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING]] +--------WindowAggr: windowExpr=[[FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, LAG(annotated_data_finite.inc_col, Int64(1), Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, LAG(annotated_data_finite.inc_col, Int64(2), Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, LEAD(annotated_data_finite.inc_col, Int64(-1), Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, LEAD(annotated_data_finite.inc_col, Int64(4), Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING]] +----------TableScan: annotated_data_finite projection=[ts, inc_col] physical_plan -ProjectionExec: expr=[fv1@0 as fv1, fv2@1 as fv2, lv1@2 as lv1, lv2@3 as lv2, nv1@4 as nv1, nv2@5 as nv2, rn1@6 as rn1, rn2@7 as rn2, rank1@8 as rank1, rank2@9 as rank2, dense_rank1@10 as dense_rank1, dense_rank2@11 as dense_rank2, lag1@12 as lag1, lag2@13 as lag2, lead1@14 as lead1, lead2@15 as lead2, fvr1@16 as fvr1, fvr2@17 as fvr2, lvr1@18 as lvr1, lvr2@19 as lvr2, lagr1@20 as lagr1, lagr2@21 as lagr2, leadr1@22 as leadr1, leadr2@23 as leadr2] ---GlobalLimitExec: skip=0, fetch=5 -----SortExec: fetch=5, expr=[ts@24 DESC] -------ProjectionExec: 
expr=[FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@10 as fv1, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@11 as fv2, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@12 as lv1, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@13 as lv2, NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@14 as nv1, NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@15 as nv2, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@16 as rn1, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@17 as rn2, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@18 as rank1, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@19 as rank2, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@20 as dense_rank1, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@21 as dense_rank2, LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@22 as lag1, LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@23 as lag2, LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@24 as lead1, LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@25 as lead2, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@2 as fvr1, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@3 as fvr2, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@4 as lvr1, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@5 as lvr2, LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@6 as lagr1, LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@7 as lagr2, LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@8 as leadr1, LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@9 as leadr2, ts@0 as ts] 
---------BoundedWindowAggExec: wdw=[FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "ROW_NUMBER() ORDER BY 
[annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, 
LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }], mode=[Sorted] -----------BoundedWindowAggExec: wdw=[FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)) }, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)) }, LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)) }, 
LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)) }], mode=[Sorted] -------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_1.csv]]}, projection=[ts, inc_col], output_ordering=[ts@0 ASC NULLS LAST], has_header=true +GlobalLimitExec: skip=0, fetch=5 +--SortExec: fetch=5, expr=[ts@0 DESC] +----ProjectionExec: expr=[ts@0 as ts, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@10 as fv1, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@11 as fv2, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@12 as lv1, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@13 as lv2, NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@14 as nv1, NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@15 as nv2, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@16 as rn1, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@17 as rn2, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@18 as rank1, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@19 as rank2, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@20 as dense_rank1, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@21 as dense_rank2, LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@22 as lag1, LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@23 as lag2, LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@24 as lead1, LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS 
LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@25 as lead2, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@2 as fvr1, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@3 as fvr2, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@4 as lvr1, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@5 as lvr2, LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@6 as lagr1, LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@7 as lagr2, LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING@8 as leadr1, LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING@9 as leadr2] +------BoundedWindowAggExec: wdw=[FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] 
RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "NTH_VALUE(annotated_data_finite.inc_col,Int64(5)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "ROW_NUMBER() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "DENSE_RANK() ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] 
RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }, LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(10)), end_bound: Following(UInt64(1)) }], mode=[Sorted] +--------BoundedWindowAggExec: wdw=[FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "FIRST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)) }, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(1)), end_bound: Following(Int32(10)) }, LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LAST_VALUE(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS 
BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)) }, LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "LAG(annotated_data_finite.inc_col,Int64(1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LAG(annotated_data_finite.inc_col,Int64(2),Int64(1002)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)) }, LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING: Ok(Field { name: "LEAD(annotated_data_finite.inc_col,Int64(-1),Int64(1001)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Int32(10)), end_bound: Following(Int32(1)) }, LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING: Ok(Field { name: "LEAD(annotated_data_finite.inc_col,Int64(4),Int64(1004)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(10)) }], mode=[Sorted] +----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_1.csv]]}, projection=[ts, inc_col], output_ordering=[ts@0 ASC NULLS LAST], has_header=true -query IIIIIIIIIIIIIIIIIIIIIIII +query IIIIIIIIIIIIIIIIIIIIIIIII SELECT + ts, FIRST_VALUE(inc_col) OVER(ORDER BY ts RANGE BETWEEN 10 PRECEDING and 1 FOLLOWING) as fv1, FIRST_VALUE(inc_col) OVER(ORDER BY ts ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as fv2, LAST_VALUE(inc_col) OVER(ORDER BY ts RANGE BETWEEN 10 PRECEDING and 1 FOLLOWING) as lv1, @@ -2672,11 +2672,11 @@ SELECT ORDER BY ts DESC LIMIT 5; ---- -289 269 305 305 305 283 100 100 99 99 86 86 301 296 301 1004 305 305 301 301 1001 1002 1001 289 -289 266 305 305 305 278 99 99 99 99 86 86 296 291 296 1004 305 305 301 296 305 1002 305 286 -289 261 296 301 NULL 275 98 98 98 98 85 85 291 289 291 1004 305 305 296 291 301 305 301 283 -286 259 291 296 NULL 272 97 97 97 97 84 84 289 286 289 1004 305 305 291 289 296 301 296 278 -275 254 289 291 289 269 96 96 96 96 83 83 286 283 286 305 305 305 289 286 291 296 291 275 +264 289 266 305 305 305 278 99 99 99 99 86 86 296 291 296 1004 305 305 301 296 305 1002 305 286 +264 289 269 305 305 305 283 100 100 99 99 86 86 301 296 301 1004 305 305 301 301 
1001 1002 1001 289 +262 289 261 296 301 NULL 275 98 98 98 98 85 85 291 289 291 1004 305 305 296 291 301 305 301 283 +258 286 259 291 296 NULL 272 97 97 97 97 84 84 289 286 289 1004 305 305 291 289 296 301 296 278 +254 275 254 289 291 289 269 96 96 96 96 83 83 286 283 286 305 305 305 289 286 291 296 291 275 # test_source_sorted_unbounded_preceding @@ -3197,8 +3197,9 @@ drop table annotated_data_infinite2 # window3 spec is not used in window functions. # The query should still work. -query RR +query IRR SELECT + C3, MAX(c12) OVER window1, MIN(c12) OVER window2 as max1 FROM aggregate_test_100 @@ -3208,14 +3209,15 @@ SELECT ORDER BY C3 LIMIT 5 ---- -0.970671228336 0.970671228336 -0.850672105305 0.850672105305 -0.152498292972 0.152498292972 -0.369363046006 0.369363046006 -0.56535284223 0.56535284223 +-117 0.850672105305 0.850672105305 +-117 0.970671228336 0.970671228336 +-111 0.152498292972 0.152498292972 +-107 0.369363046006 0.369363046006 +-106 0.56535284223 0.56535284223 query TT EXPLAIN SELECT + C3, MAX(c12) OVER window1 as min1, MIN(c12) OVER window2 as max1 FROM aggregate_test_100 @@ -3226,30 +3228,29 @@ EXPLAIN SELECT LIMIT 5 ---- logical_plan -Projection: min1, max1 ---Limit: skip=0, fetch=5 -----Sort: aggregate_test_100.c3 ASC NULLS LAST, fetch=5 -------Projection: MAX(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS min1, MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS max1, aggregate_test_100.c3 ---------WindowAggr: windowExpr=[[MAX(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] -----------Projection: aggregate_test_100.c3, aggregate_test_100.c12, MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING -------------WindowAggr: windowExpr=[[MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] ---------------TableScan: aggregate_test_100 projection=[c3, c11, c12] +Limit: skip=0, fetch=5 +--Sort: aggregate_test_100.c3 ASC NULLS LAST, fetch=5 +----Projection: aggregate_test_100.c3, MAX(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS min1, MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS max1 +------WindowAggr: windowExpr=[[MAX(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] +--------Projection: aggregate_test_100.c3, aggregate_test_100.c12, MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING +----------WindowAggr: windowExpr=[[MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING]] +------------TableScan: aggregate_test_100 projection=[c3, c11, c12] physical_plan -ProjectionExec: expr=[min1@0 as min1, max1@1 as max1] ---GlobalLimitExec: skip=0, fetch=5 -----SortExec: fetch=5, expr=[c3@2 ASC NULLS LAST] -------ProjectionExec: expr=[MAX(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as min1, MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED 
PRECEDING AND UNBOUNDED FOLLOWING@2 as max1, c3@0 as c3] ---------BoundedWindowAggExec: wdw=[MAX(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "MAX(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Float64(NULL)), end_bound: CurrentRow }], mode=[Sorted] -----------SortExec: expr=[c12@1 ASC NULLS LAST] -------------ProjectionExec: expr=[c3@0 as c3, c12@2 as c12, MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@3 as MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING] ---------------WindowAggExec: wdw=[MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)) }] -----------------SortExec: expr=[c11@1 ASC NULLS LAST] -------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c3, c11, c12], has_header=true +GlobalLimitExec: skip=0, fetch=5 +--SortExec: fetch=5, expr=[c3@0 ASC NULLS LAST] +----ProjectionExec: expr=[c3@0 as c3, MAX(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as min1, MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@2 as max1] +------BoundedWindowAggExec: wdw=[MAX(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "MAX(aggregate_test_100.c12) ORDER BY [aggregate_test_100.c12 ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Float64(NULL)), end_bound: CurrentRow }], mode=[Sorted] +--------SortExec: expr=[c12@1 ASC NULLS LAST] +----------ProjectionExec: expr=[c3@0 as c3, c12@2 as c12, MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@3 as MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING] +------------WindowAggExec: wdw=[MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "MIN(aggregate_test_100.c12) PARTITION BY [aggregate_test_100.c11] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)) }] +--------------SortExec: expr=[c11@1 ASC NULLS LAST] +----------------CsvExec: file_groups={1 group: 
[[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c3, c11, c12], has_header=true # window1 spec is used multiple times under different aggregations. # The query should still work. -query RR +query IRR SELECT + C3, MAX(c12) OVER window1 as min1, MIN(c12) OVER window1 as max1 FROM aggregate_test_100 @@ -3257,11 +3258,11 @@ SELECT ORDER BY C3 LIMIT 5 ---- -0.970671228336 0.014793053078 -0.850672105305 0.014793053078 -0.152498292972 0.014793053078 -0.369363046006 0.014793053078 -0.56535284223 0.014793053078 +-117 0.850672105305 0.014793053078 +-117 0.970671228336 0.014793053078 +-111 0.152498292972 0.014793053078 +-107 0.369363046006 0.014793053078 +-106 0.56535284223 0.014793053078 query TT EXPLAIN SELECT From d4c09f283397fa817ee48c1ae285fea11c6de661 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 14 Aug 2023 14:45:28 -0400 Subject: [PATCH 02/32] Avoid use of Row --- datafusion/core/src/physical_plan/topk/mod.rs | 141 ++++++++++++------ 1 file changed, 94 insertions(+), 47 deletions(-) diff --git a/datafusion/core/src/physical_plan/topk/mod.rs b/datafusion/core/src/physical_plan/topk/mod.rs index d626f0806698..06d3ad33c161 100644 --- a/datafusion/core/src/physical_plan/topk/mod.rs +++ b/datafusion/core/src/physical_plan/topk/mod.rs @@ -19,7 +19,7 @@ use arrow::{ compute::interleave, - row::{OwnedRow, RowConverter, Rows, SortField}, + row::{RowConverter, Rows, SortField}, }; use std::{cmp::Ordering, sync::Arc}; @@ -169,11 +169,11 @@ impl TopK { match self.heap.k_largest() { // heap has k items, and the current row is not // smaller than the curret smallest k value, skip - Some(largest) if largest.row.row() <= row => {} + Some(largest) if largest.row.as_slice() <= row.as_ref() => {} // don't yet have k items or new item is greater than // current min top k None | Some(_) => { - self.heap.add(&mut batch_entry, row.owned(), index); + self.heap.add(&mut batch_entry, row, index); self.metrics.row_replacements.add(1); } } @@ -210,7 +210,8 @@ impl TopK { break; } else { batches.push(Ok(batch.slice(0, batch_size))); - batch = batch.slice(batch_size, batch.num_rows()); + let remaining_length = batch.num_rows() - batch_size; + batch = batch.slice(batch_size, remaining_length); } } Ok(Box::pin(RecordBatchStreamAdapter::new( @@ -266,8 +267,8 @@ struct TopKHeap { inner: Vec, /// Storage the original row values (TopKRow only has the sort key) store: RecordBatchStore, - /// The size of all `OwnedRows`s held by this heap - owned_row_bytes: usize, + /// The size of all owned data held by this heap + owned_bytes: usize, } impl TopKHeap { @@ -277,7 +278,7 @@ impl TopKHeap { k, inner: Vec::with_capacity(k), store: RecordBatchStore::new(), - owned_row_bytes: 0, + owned_bytes: 0, } } @@ -306,42 +307,44 @@ impl TopKHeap { /// Adds `row` to this heap. If inserting this new item would /// increase the size past `k`, removes the previously smallest /// item. 
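The `add` hunk below maintains a classic bounded top-k invariant over byte-comparable sort keys: reject a row cheaply when it cannot beat the current cutoff, otherwise insert it in sorted position and evict. A minimal standalone sketch of that invariant, using the sorted-buffer-plus-`partition_point` approach this revision takes; the names here are illustrative, not DataFusion's:

/// Illustrative only: keep the `k` smallest byte-comparable keys.
struct MiniTopK {
    k: usize,
    /// sorted ascending; the last entry is the current cutoff
    keys: Vec<Vec<u8>>,
}

impl MiniTopK {
    fn new(k: usize) -> Self {
        assert!(k > 0);
        Self { k, keys: Vec::with_capacity(k) }
    }

    fn add(&mut self, key: &[u8]) {
        // full and not below the cutoff: cannot be in the top k
        if self.keys.len() == self.k && key >= self.keys.last().unwrap().as_slice() {
            return;
        }
        // insert in sorted position, then drop the evicted largest entry
        let at = self.keys.partition_point(|cur| cur.as_slice() <= key);
        self.keys.insert(at, key.to_vec());
        self.keys.truncate(self.k);
    }
}

fn main() {
    let mut topk = MiniTopK::new(2);
    for key in [b"d".as_slice(), b"a".as_slice(), b"c".as_slice(), b"b".as_slice()] {
        topk.add(key);
    }
    // only the two smallest keys survive
    assert_eq!(topk.keys, vec![b"a".to_vec(), b"b".to_vec()]);
}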
-    fn add(&mut self, batch_entry: &mut RecordBatchEntry, row: OwnedRow, index: usize) {
+    fn add(
+        &mut self,
+        batch_entry: &mut RecordBatchEntry,
+        row: impl AsRef<[u8]>,
+        index: usize,
+    ) {
+        let batch_id = batch_entry.id;
+        batch_entry.uses += 1;
+
         assert!(self.inner.len() <= self.k);
+        let row = row.as_ref();
-        batch_entry.uses += 1;
+        // Reuse storage for evicted item if possible
+        let new_top_k = if self.inner.len() == self.k {
+            let prev_min = self.inner.pop().unwrap();
+
+            // Update batch use
+            if prev_min.batch_id == batch_entry.id {
+                batch_entry.uses -= 1;
+            } else {
+                self.store.unuse(prev_min.batch_id);
+            }
-        self.owned_row_bytes += owned_row_size(&row);
+            // update memory accounting
+            self.owned_bytes -= prev_min.owned_size();
+            prev_min.with_new_row(row, batch_id, index)
+        } else {
+            TopKRow::new(row, batch_id, index)
+        };
+
+        self.owned_bytes += new_top_k.owned_size();

         // put the new row into the correct location to maintain that
         // self.inner is sorted in descending order
         let insertion_point = self
             .inner
-            .partition_point(|current_row| current_row.row <= row);
-        self.inner.insert(
-            insertion_point,
-            TopKRow {
-                row,
-                batch_id: batch_entry.id,
-                index,
-            },
-        );
-
-        // limit size to k items
-        if self.inner.len() > self.k {
-            // If there was a previous minimum value, decrement its use
-            if let Some(prev_min) = self.inner.pop() {
-                if prev_min.batch_id == batch_entry.id {
-                    batch_entry.uses -= 1;
-                } else {
-                    self.store.unuse(prev_min.batch_id);
-                }
-                // update memory accounting
-                let prev_size = owned_row_size(&prev_min.row);
-                assert!(self.owned_row_bytes >= prev_size);
-                self.owned_row_bytes -= prev_size;
-            }
-        }
+            .partition_point(|current_row| current_row.row() <= row.as_ref());
+        self.inner.insert(insertion_point, new_top_k);
     }

     /// Returns the values stored in this heap, from values low to high, as a single
@@ -396,25 +399,69 @@ impl TopKHeap {
         std::mem::size_of::<Self>()
             + (self.inner.capacity() * std::mem::size_of::<TopKRow>())
             + self.store.size()
-            + self.owned_row_bytes
+            + self.owned_bytes
     }
 }

-/// Size of memory owned by `row` until row::size() is available
-/// TODO file upstream ticket in arrow-rs to add this
-fn owned_row_size(row: &OwnedRow) -> usize {
-    std::mem::size_of_val(row) + row.as_ref().len() // underlying data, doesn't account for capacity
-}
-
-/// Represents one of the top K rows. Orders according to `OwnedRow`
+/// Represents one of the top K rows held in this heap. Orders
+/// according to memcmp of row (e.g. the arrow Row format, but could
+/// also be primtive values)
+///
+/// Reuses allocations to minimize runtime overhead of creating new Vecs
 #[derive(Debug, PartialEq)]
 struct TopKRow {
-    /// the value of the sort key for this row
-    row: OwnedRow,
-    /// the index in this record batch the row came from
-    index: usize,
+    /// the value of the sort key for this row. This contains the
+    /// bytes that could be stored in `OwnedRow` but uses `Vec<u8>` to
+    /// reuse allocations.
+    row: Vec<u8>,
     /// the RecordBatch this row came from: an id into a [`RecordBatchStore`]
     batch_id: u32,
+    /// the index in this record batch the row came from
+    index: usize,
+}
+
+impl TopKRow {
+    /// Create a new TopKRow with new allocation
+    fn new(row: impl AsRef<[u8]>, batch_id: u32, index: usize) -> Self {
+        Self {
+            row: row.as_ref().to_vec(),
+            batch_id,
+            index,
+        }
+    }
+
+    /// Create a new TopKRow reusing the existing allocation
+    fn with_new_row(
+        self,
+        new_row: impl AsRef<[u8]>,
+        batch_id: u32,
+        index: usize,
+    ) -> Self {
+        let Self {
+            mut row,
+            batch_id: _,
+            index: _,
+        } = self;
+        row.clear();
+        row.extend_from_slice(new_row.as_ref());
+
+        Self {
+            row,
+            batch_id,
+            index,
+        }
+    }
+
+    /// Returns the number of bytes owned by this row in the heap (not
+    /// including itself)
+    fn owned_size(&self) -> usize {
+        self.row.capacity()
+    }
+
+    /// Returns a slice to the owned row value
+    fn row(&self) -> &[u8] {
+        self.row.as_slice()
+    }
 }

 impl Eq for TopKRow {}

From 948c1a2b6578749beecc8b3456a8af96b82ecd8b Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Tue, 22 Aug 2023 14:26:00 -0400
Subject: [PATCH 03/32] start working on compaction

---
 datafusion/core/src/physical_plan/mod.rs      |  2 +-
 datafusion/core/src/physical_plan/topk/mod.rs | 47 ++++++++++++-------
 2 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/datafusion/core/src/physical_plan/mod.rs b/datafusion/core/src/physical_plan/mod.rs
index f544d7cce5e5..ce13e46a7ec6 100644
--- a/datafusion/core/src/physical_plan/mod.rs
+++ b/datafusion/core/src/physical_plan/mod.rs
@@ -27,8 +27,8 @@ use self::{
 use crate::datasource::physical_plan::FileScanConfig;
 use crate::physical_plan::expressions::PhysicalSortExpr;
 use datafusion_common::Result;
-pub use topk::TopK;
 pub use datafusion_common::{internal_err, ColumnStatistics, Statistics};
+pub use topk::TopK;
 pub use visitor::{accept, visit_execution_plan, ExecutionPlanVisitor};

 use arrow::datatypes::SchemaRef;
diff --git a/datafusion/core/src/physical_plan/topk/mod.rs b/datafusion/core/src/physical_plan/topk/mod.rs
index 06d3ad33c161..2efb65fefff2 100644
--- a/datafusion/core/src/physical_plan/topk/mod.rs
+++ b/datafusion/core/src/physical_plan/topk/mod.rs
@@ -129,14 +129,14 @@ impl TopK {
         );

         Ok(Self {
-            schema,
+            schema: schema.clone(),
             metrics: TopKMetrics::new(metrics, partition),
             reservation,
             batch_size,
             expr,
             row_converter,
             scratch_rows,
-            heap: TopKHeap::new(k),
+            heap: TopKHeap::new(k, schema),
         })
     }

@@ -199,7 +199,7 @@ impl TopK {
         } = self;
         let _timer = metrics.baseline.elapsed_compute().timer(); // time updated on drop

-        let mut batch = heap.emit(schema.clone())?;
+        let mut batch = heap.emit()?;
         metrics.baseline.output_rows().add(batch.num_rows());

         // break into record batches as needed
@@ -259,7 +259,7 @@ impl TopKMetrics {
 /// it is important to check the current minimum value in the heap
 /// prior to creating a new value to insert.
 struct TopKHeap {
-    /// The maximum size of this heap.
+    /// The maximum number of elemenents to store in this heap.
     k: usize,
     /// Storage for up at most `k` items, in ascending
     /// order.
`inner[0]` holds the smallest value of the smallest k @@ -272,12 +272,12 @@ struct TopKHeap { } impl TopKHeap { - fn new(k: usize) -> Self { + fn new(k: usize, schema: SchemaRef) -> Self { assert!(k > 0); Self { k, inner: Vec::with_capacity(k), - store: RecordBatchStore::new(), + store: RecordBatchStore::new(schema), owned_bytes: 0, } } @@ -349,7 +349,9 @@ impl TopKHeap { /// Returns the values stored in this heap, from values low to high, as a single /// [`RecordBatch`] - pub fn emit(&self, schema: SchemaRef) -> Result { + pub fn emit(&self) -> Result { + let schema = self.store.schema().clone(); + // Indicies for each row within its respective RecordBatch let indicies: Vec<_> = self .inner @@ -358,16 +360,7 @@ impl TopKHeap { .map(|(i, k)| (i, k.index)) .collect(); - let num_columns = { - let Some(first_value) = self.inner.get(0) else { - return Ok(RecordBatch::new_empty(schema)); - }; - self.store - .get(first_value.batch_id) - .expect("invalid batch id") - .batch - .num_columns() - }; + let num_columns = schema.fields().len(); // build the output columns one at time, using the // `interleave` kernel to pick rows from different arrays @@ -394,6 +387,11 @@ impl TopKHeap { Ok(RecordBatch::try_new(schema, output_columns)?) } + /// Compact this heap, rewriting all stored batches + fn compact(&mut self) { + //let new_batch = self.emit( + } + /// return the size of memory used by this heap, in bytes fn size(&self) -> usize { std::mem::size_of::() @@ -498,14 +496,17 @@ struct RecordBatchStore { batches: HashMap, /// total size of all record batches tracked by this store batches_size: usize, + /// schema of the batches + schema: SchemaRef, } impl RecordBatchStore { - fn new() -> Self { + fn new(schema: SchemaRef) -> Self { Self { next_id: 0, batches: HashMap::new(), batches_size: 0, + schema, } } @@ -531,6 +532,16 @@ impl RecordBatchStore { self.batches.get(&id) } + /// returns the total number of batches stored in this store + fn len(&self) -> usize { + self.batches.len() + } + + /// return the schema of batches stored + fn schema(&self) -> &SchemaRef { + &self.schema + } + /// remove a use from the specified batch id. If the use count /// reaches zero the batch entry is removed from the store /// From 354d687caf8ad0e75fa27e28779844842a535724 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 22 Aug 2023 15:03:31 -0400 Subject: [PATCH 04/32] checkpoint --- datafusion/core/src/physical_plan/topk/mod.rs | 56 ++++++++++++++++++- datafusion/sqllogictest/test_files/aal.slt | 15 +++++ 2 files changed, 68 insertions(+), 3 deletions(-) diff --git a/datafusion/core/src/physical_plan/topk/mod.rs b/datafusion/core/src/physical_plan/topk/mod.rs index 2efb65fefff2..e6993b70d4c6 100644 --- a/datafusion/core/src/physical_plan/topk/mod.rs +++ b/datafusion/core/src/physical_plan/topk/mod.rs @@ -143,6 +143,8 @@ impl TopK { /// Insert `batch`, remembering it if any of its values are among /// the top k seen so far. 
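For context on the `insert_batch` body that follows: the sort keys can be compared as raw bytes because they go through arrow-rs's `Row` encoding (`RowConverter`), which makes even multi-column keys memcmp-comparable. A tiny self-contained demonstration of that property; this snippet assumes the `arrow` crate and is not part of the patch:

use std::sync::Arc;
use arrow::array::{ArrayRef, Int32Array};
use arrow::datatypes::DataType;
use arrow::row::{RowConverter, SortField};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // one SortField per sort-key column
    let mut converter = RowConverter::new(vec![SortField::new(DataType::Int32)])?;
    let col: ArrayRef = Arc::new(Int32Array::from(vec![3, 1, 2]));
    let rows = converter.convert_columns(&[col])?;

    // the encoded rows compare with plain memcmp, matching the sort order
    assert!(rows.row(1) < rows.row(2)); // encodes 1 < 2
    assert!(rows.row(2) < rows.row(0)); // encodes 2 < 3
    Ok(())
}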
pub fn insert_batch(&mut self, batch: RecordBatch) -> Result<()> { + use log::info; + info!("INSERTING {} rows", batch.num_rows()); // Updates on drop let _timer = self.metrics.baseline.elapsed_compute().timer(); @@ -180,6 +182,9 @@ impl TopK { } self.heap.insert_batch_entry(batch_entry); + // conserve memory + self.heap.maybe_compact()?; + // update memory reservation self.reservation.try_resize(self.size())?; Ok(()) @@ -352,6 +357,10 @@ impl TopKHeap { pub fn emit(&self) -> Result { let schema = self.store.schema().clone(); + if self.store.is_empty() { + return Ok(RecordBatch::new_empty(schema)); + } + // Indicies for each row within its respective RecordBatch let indicies: Vec<_> = self .inner @@ -387,9 +396,39 @@ impl TopKHeap { Ok(RecordBatch::try_new(schema, output_columns)?) } - /// Compact this heap, rewriting all stored batches - fn compact(&mut self) { - //let new_batch = self.emit( + /// Compact this heap, rewriting all stored batches into a single + /// input batch + pub fn maybe_compact(&mut self) -> Result<()>{ + // don't compact if the store has less than ten batches + if self.store.len() <= 10 { + return Ok(()); + } + + panic!("Disco"); + + // at first, compact the entire thing always into a new batch + // (maybe we can get fancier in the future about ignoring + // batches that have a high usage ratio already + + // Note: new batch is in the same order as inner + let new_batch = self.emit()?; + + // clear all old entires in store (this invalidates all + // store_ids in `inner`) + self.store.clear(); + + let mut batch_entry = self.register_batch(new_batch); + batch_entry.uses = self.inner.len(); + + // rewrite all existing entries to use the new batch, and + // remove old entries. The sortedness and their relative + // position do not change + for (i, topk_row) in self.inner.iter_mut().enumerate() { + topk_row.batch_id = batch_entry.id; + topk_row.index = i; + } + self.insert_batch_entry(batch_entry); + Ok(()) } /// return the size of memory used by this heap, in bytes @@ -528,6 +567,12 @@ impl RecordBatchStore { } } + /// Clear all values in this store, invalidating all previous batch ids + fn clear(&mut self) { + self.batches.clear(); + self.batches_size = 0; + } + fn get(&self, id: u32) -> Option<&RecordBatchEntry> { self.batches.get(&id) } @@ -537,6 +582,11 @@ impl RecordBatchStore { self.batches.len() } + /// returns true if the store has nothing stored + fn is_empty(&self) -> bool { + self.batches.is_empty() + } + /// return the schema of batches stored fn schema(&self) -> &SchemaRef { &self.schema diff --git a/datafusion/sqllogictest/test_files/aal.slt b/datafusion/sqllogictest/test_files/aal.slt index f19c79e8d1cb..4d8346edf23e 100644 --- a/datafusion/sqllogictest/test_files/aal.slt +++ b/datafusion/sqllogictest/test_files/aal.slt @@ -200,3 +200,18 @@ d 1 -98 13630 -1991133944 1184110014998006843 220 2986 225513085 963410661024364 e 2 52 -12056 -1090239422 9011500141803970147 238 4168 2013662838 12565360638488684051 0.6694766 0.391444365692 xipQ93429ksjNcXPX5326VSg1xJZcW d 1 -72 25590 1188089983 3090286296481837049 241 832 3542840110 5885937420286765261 0.41980565 0.215354023438 wwXqSGKLyBQyPkonlzBNYUJTCo4LRS a 1 -5 12636 794623392 2909750622865366631 15 24022 2669374863 4776679784701509574 0.29877836 0.253725340799 waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs + + + +## -- make tiny batches to trigger batch compaction +statement ok +set datafusion.execution.batch_size = 7 + +query TIIIIIIIITRRT +select * from aggregate_test_100 ORDER BY c13 desc limit 5; +---- +a 4 -38 20744 
762932956 308913475857409919 7 45465 1787652631 878137512938218976 0.7459874 0.021825780392 ydkwycaISlYSlEq3TlkS2m15I2pcp8 +d 1 -98 13630 -1991133944 1184110014998006843 220 2986 225513085 9634106610243643486 0.89651865 0.164088254508 y7C453hRWd4E7ImjNDWlpexB8nUqjh +e 2 52 -12056 -1090239422 9011500141803970147 238 4168 2013662838 12565360638488684051 0.6694766 0.391444365692 xipQ93429ksjNcXPX5326VSg1xJZcW +d 1 -72 25590 1188089983 3090286296481837049 241 832 3542840110 5885937420286765261 0.41980565 0.215354023438 wwXqSGKLyBQyPkonlzBNYUJTCo4LRS +a 1 -5 12636 794623392 2909750622865366631 15 24022 2669374863 4776679784701509574 0.29877836 0.253725340799 waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs From afea7d3e38faf0ec149c78849177a20d640bb541 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 22 Aug 2023 15:10:11 -0400 Subject: [PATCH 05/32] update --- datafusion/core/src/physical_plan/topk/mod.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/datafusion/core/src/physical_plan/topk/mod.rs b/datafusion/core/src/physical_plan/topk/mod.rs index e6993b70d4c6..64d2090da866 100644 --- a/datafusion/core/src/physical_plan/topk/mod.rs +++ b/datafusion/core/src/physical_plan/topk/mod.rs @@ -399,13 +399,14 @@ impl TopKHeap { /// Compact this heap, rewriting all stored batches into a single /// input batch pub fn maybe_compact(&mut self) -> Result<()>{ + use log::info; + info!("Have {} batches in store", self.store.len()); // don't compact if the store has less than ten batches - if self.store.len() <= 10 { + //if self.store.len() <= 10 { + if self.store.len() <= 2 { return Ok(()); } - panic!("Disco"); - // at first, compact the entire thing always into a new batch // (maybe we can get fancier in the future about ignoring // batches that have a high usage ratio already @@ -428,6 +429,7 @@ impl TopKHeap { topk_row.index = i; } self.insert_batch_entry(batch_entry); + info!("COMPACTION DONE: Have {} batches in store", self.store.len()); Ok(()) } From 69b86ab0bce7aa469f6912ab673af1d3f2135873 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 23 Aug 2023 08:03:56 -0400 Subject: [PATCH 06/32] checkpoint --- datafusion/core/src/physical_plan/topk/mod.rs | 30 +++++++++++++++---- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/datafusion/core/src/physical_plan/topk/mod.rs b/datafusion/core/src/physical_plan/topk/mod.rs index 64d2090da866..8fdefae544f6 100644 --- a/datafusion/core/src/physical_plan/topk/mod.rs +++ b/datafusion/core/src/physical_plan/topk/mod.rs @@ -399,13 +399,20 @@ impl TopKHeap { /// Compact this heap, rewriting all stored batches into a single /// input batch pub fn maybe_compact(&mut self) -> Result<()>{ - use log::info; - info!("Have {} batches in store", self.store.len()); - // don't compact if the store has less than ten batches - //if self.store.len() <= 10 { - if self.store.len() <= 2 { + + // we compact if the number of "unused" rows in the store is + // past some pre-defined threshold. 
Target holding up to + // around 20 batches, but handle cases of large k where some + // batches might be partially full + let target_batch_size = 8024; + let max_unused_rows = 20 * target_batch_size + self.k; + + // don't compact if the store has only one batch or + if self.store.len() <= 2 || self.store.unused_rows() < max_unused_rows { return Ok(()); } + use log::info; + info!("Have {} batches in store, COMPACTING", self.store.len()); // at first, compact the entire thing always into a new batch // (maybe we can get fancier in the future about ignoring @@ -442,6 +449,8 @@ impl TopKHeap { } } + + /// Represents one of the top K rows held in this heap. Orders /// according to memcmp of row (e.g. the arrow Row format, but could /// also be primtive values) @@ -584,6 +593,17 @@ impl RecordBatchStore { self.batches.len() } + /// Returns the total number of rows in batches minus the number + /// which are in use + fn unused_rows(&self) -> usize { + self.batches + .values() + .map(|batch_entry| { + batch_entry.batch.num_rows() - batch_entry.uses + }) + .sum() + } + /// returns true if the store has nothing stored fn is_empty(&self) -> bool { self.batches.is_empty() From c8b415c1e9da90921f89c56e576c370a50a1a0cc Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 23 Aug 2023 08:58:21 -0400 Subject: [PATCH 07/32] fmt --- datafusion/core/src/physical_plan/topk/mod.rs | 155 +++++++++--------- 1 file changed, 77 insertions(+), 78 deletions(-) diff --git a/datafusion/core/src/physical_plan/topk/mod.rs b/datafusion/core/src/physical_plan/topk/mod.rs index 8fdefae544f6..5e242746e6c0 100644 --- a/datafusion/core/src/physical_plan/topk/mod.rs +++ b/datafusion/core/src/physical_plan/topk/mod.rs @@ -21,7 +21,7 @@ use arrow::{ compute::interleave, row::{RowConverter, Rows, SortField}, }; -use std::{cmp::Ordering, sync::Arc}; +use std::{cmp::Ordering, collections::BinaryHeap, sync::Arc}; use arrow_array::{Array, ArrayRef, RecordBatch}; use arrow_schema::SchemaRef; @@ -143,8 +143,6 @@ impl TopK { /// Insert `batch`, remembering it if any of its values are among /// the top k seen so far. 
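The hunks that follow swap the sorted `Vec` for a `std::collections::BinaryHeap`: a max-heap holding the smallest k rows seen so far, so the heap's top is the cutoff and a non-qualifying row is rejected without shifting k elements. The shape of that idea as a small standalone sketch (illustrative integers; the real heap stores `TopKRow`s):

use std::collections::BinaryHeap;

/// Illustrative only: the k smallest values via a max-heap of size k.
fn top_k_smallest(values: impl IntoIterator<Item = i64>, k: usize) -> Vec<i64> {
    assert!(k > 0);
    let mut heap: BinaryHeap<i64> = BinaryHeap::with_capacity(k);
    for v in values {
        if heap.len() < k {
            heap.push(v);
        } else if v < *heap.peek().unwrap() {
            // evict the largest of the current k in O(log k)
            heap.pop();
            heap.push(v);
        }
    }
    // "reverse the reverse": emit low to high
    let mut out = heap.into_vec();
    out.sort();
    out
}

fn main() {
    assert_eq!(top_k_smallest([5, 1, 4, 2, 3], 2), vec![1, 2]);
}

Relative to the earlier sorted-`Vec` version, an accepted row costs O(log k) instead of an O(k) shifting insert, while the common rejection path stays O(1).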
pub fn insert_batch(&mut self, batch: RecordBatch) -> Result<()> { - use log::info; - info!("INSERTING {} rows", batch.num_rows()); // Updates on drop let _timer = self.metrics.baseline.elapsed_compute().timer(); @@ -168,12 +166,11 @@ impl TopK { let mut batch_entry = self.heap.register_batch(batch); for (index, row) in rows.iter().enumerate() { - match self.heap.k_largest() { - // heap has k items, and the current row is not - // smaller than the curret smallest k value, skip - Some(largest) if largest.row.as_slice() <= row.as_ref() => {} - // don't yet have k items or new item is greater than - // current min top k + match self.heap.max() { + // heap has k items, and the new row is greater than the + // current max in the heap ==> it is not a new topk + Some(max_row) if row.as_ref() >= max_row.row.as_slice() => {} + // don't yet have k items or new item is lower than the currently k low values None | Some(_) => { self.heap.add(&mut batch_entry, row, index); self.metrics.row_replacements.add(1); @@ -190,7 +187,7 @@ impl TopK { Ok(()) } - /// Returns the top k results broken into `batch_size` [`RecordBatch`]es + /// Returns the top k results broken into `batch_size` [`RecordBatch`]es, consuming the heap pub fn emit(self) -> Result { let Self { schema, @@ -200,7 +197,7 @@ impl TopK { expr: _, row_converter: _, scratch_rows: _, - heap, + mut heap, } = self; let _timer = metrics.baseline.elapsed_compute().timer(); // time updated on drop @@ -266,10 +263,9 @@ impl TopKMetrics { struct TopKHeap { /// The maximum number of elemenents to store in this heap. k: usize, - /// Storage for up at most `k` items, in ascending - /// order. `inner[0]` holds the smallest value of the smallest k - /// so far, `inner[len-1]` holds the largest value smallest k so far. - inner: Vec, + /// Storage for up at most `k` items using a BinaryHeap. Reverserd + /// so that the smallest k so far is on the top + inner: BinaryHeap, /// Storage the original row values (TopKRow only has the sort key) store: RecordBatchStore, /// The size of all owned data held by this heap @@ -281,7 +277,7 @@ impl TopKHeap { assert!(k > 0); Self { k, - inner: Vec::with_capacity(k), + inner: BinaryHeap::new(), store: RecordBatchStore::new(schema), owned_bytes: 0, } @@ -300,12 +296,13 @@ impl TopKHeap { } /// Returns the largest value stored by the heap if there are k - /// items, otherwise returns None - fn k_largest(&self) -> Option<&TopKRow> { + /// items, otherwise returns None. 
Remember this structure is + /// keeping the "smallest" k values + fn max(&self) -> Option<&TopKRow> { if self.inner.len() < self.k { None } else { - self.inner.last() + self.inner.peek() } } @@ -344,26 +341,33 @@ impl TopKHeap { self.owned_bytes += new_top_k.owned_size(); - // put the new row into the correct location to maintain that - // self.inner is sorted in descending order - let insertion_point = self - .inner - .partition_point(|current_row| current_row.row() <= row.as_ref()); - self.inner.insert(insertion_point, new_top_k); + // put the new row into the heap + self.inner.push(new_top_k) } - /// Returns the values stored in this heap, from values low to high, as a single - /// [`RecordBatch`] - pub fn emit(&self) -> Result { + /// Returns the values stored in this heap, from values low to + /// high, as a single [`RecordBatch`], resetting the inner heap + pub fn emit(&mut self) -> Result { + Ok(self.emit_with_state()?.0) + } + + /// Returns the values stored in this heap, from values low to + /// high, as a single [`RecordBatch`], and a sorted vec of heap contents + + pub fn emit_with_state(&mut self) -> Result<(RecordBatch, Vec)> { let schema = self.store.schema().clone(); + let mut topk_rows = std::mem::take(&mut self.inner).into_vec(); + + // sort low to high (reverse the reverse) + topk_rows.sort(); + if self.store.is_empty() { - return Ok(RecordBatch::new_empty(schema)); + return Ok((RecordBatch::new_empty(schema), topk_rows)); } // Indicies for each row within its respective RecordBatch - let indicies: Vec<_> = self - .inner + let indicies: Vec<_> = topk_rows .iter() .enumerate() .map(|(i, k)| (i, k.index)) @@ -375,8 +379,7 @@ impl TopKHeap { // `interleave` kernel to pick rows from different arrays let output_columns: Vec<_> = (0..num_columns) .map(|col| { - let input_arrays: Vec<_> = self - .inner + let input_arrays: Vec<_> = topk_rows .iter() .map(|k| { let entry = @@ -393,50 +396,50 @@ impl TopKHeap { }) .collect::>()?; - Ok(RecordBatch::try_new(schema, output_columns)?) + let new_batch = RecordBatch::try_new(schema, output_columns)?; + Ok((new_batch, topk_rows)) } /// Compact this heap, rewriting all stored batches into a single /// input batch - pub fn maybe_compact(&mut self) -> Result<()>{ - - // we compact if the number of "unused" rows in the store is - // past some pre-defined threshold. Target holding up to - // around 20 batches, but handle cases of large k where some - // batches might be partially full - let target_batch_size = 8024; - let max_unused_rows = 20 * target_batch_size + self.k; - - // don't compact if the store has only one batch or - if self.store.len() <= 2 || self.store.unused_rows() < max_unused_rows { - return Ok(()); - } - use log::info; - info!("Have {} batches in store, COMPACTING", self.store.len()); - - // at first, compact the entire thing always into a new batch - // (maybe we can get fancier in the future about ignoring - // batches that have a high usage ratio already - - // Note: new batch is in the same order as inner - let new_batch = self.emit()?; - - // clear all old entires in store (this invalidates all - // store_ids in `inner`) - self.store.clear(); - - let mut batch_entry = self.register_batch(new_batch); - batch_entry.uses = self.inner.len(); - - // rewrite all existing entries to use the new batch, and - // remove old entries. 
The sortedness and their relative - // position do not change - for (i, topk_row) in self.inner.iter_mut().enumerate() { - topk_row.batch_id = batch_entry.id; - topk_row.index = i; - } - self.insert_batch_entry(batch_entry); - info!("COMPACTION DONE: Have {} batches in store", self.store.len()); + pub fn maybe_compact(&mut self) -> Result<()> { + // // we compact if the number of "unused" rows in the store is + // // past some pre-defined threshold. Target holding up to + // // around 20 batches, but handle cases of large k where some + // // batches might be partially full + // let target_batch_size = 8024; + // let max_unused_rows = 20 * target_batch_size + self.k; + + // // don't compact if the store has only one batch or + // if self.store.len() <= 2 || self.store.unused_rows() < max_unused_rows { + // return Ok(()); + // } + // use log::info; + // info!("Have {} batches in store, COMPACTING", self.store.len()); + + // // at first, compact the entire thing always into a new batch + // // (maybe we can get fancier in the future about ignoring + // // batches that have a high usage ratio already + + // // Note: new batch is in the same order as inner + // let new_batch = self.emit()?; + + // // clear all old entires in store (this invalidates all + // // store_ids in `inner`) + // self.store.clear(); + + // let mut batch_entry = self.register_batch(new_batch); + // batch_entry.uses = self.inner.len(); + + // // rewrite all existing entries to use the new batch, and + // // remove old entries. The sortedness and their relative + // // position do not change + // for (i, topk_row) in self.inner.iter_mut().enumerate() { + // topk_row.batch_id = batch_entry.id; + // topk_row.index = i; + // } + // self.insert_batch_entry(batch_entry); + // info!("COMPACTION DONE: Have {} batches in store", self.store.len()); Ok(()) } @@ -449,8 +452,6 @@ impl TopKHeap { } } - - /// Represents one of the top K rows held in this heap. Orders /// according to memcmp of row (e.g. 
the arrow Row format, but could /// also be primtive values) @@ -598,9 +599,7 @@ impl RecordBatchStore { fn unused_rows(&self) -> usize { self.batches .values() - .map(|batch_entry| { - batch_entry.batch.num_rows() - batch_entry.uses - }) + .map(|batch_entry| batch_entry.batch.num_rows() - batch_entry.uses) .sum() } From 0337e310cd3d998cdfb105b6cba41aa28a34b8b1 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 23 Aug 2023 11:35:47 -0400 Subject: [PATCH 08/32] Fix compaction --- datafusion/core/src/physical_plan/topk/mod.rs | 94 +++++++++++-------- datafusion/sqllogictest/test_files/aal.slt | 2 +- 2 files changed, 55 insertions(+), 41 deletions(-) diff --git a/datafusion/core/src/physical_plan/topk/mod.rs b/datafusion/core/src/physical_plan/topk/mod.rs index 5e242746e6c0..faf68bcd5ac2 100644 --- a/datafusion/core/src/physical_plan/topk/mod.rs +++ b/datafusion/core/src/physical_plan/topk/mod.rs @@ -136,7 +136,7 @@ impl TopK { expr, row_converter, scratch_rows, - heap: TopKHeap::new(k, schema), + heap: TopKHeap::new(k, batch_size, schema), }) } @@ -169,7 +169,7 @@ impl TopK { match self.heap.max() { // heap has k items, and the new row is greater than the // current max in the heap ==> it is not a new topk - Some(max_row) if row.as_ref() >= max_row.row.as_slice() => {} + Some(max_row) if row.as_ref() >= max_row.row() => {} // don't yet have k items or new item is lower than the currently k low values None | Some(_) => { self.heap.add(&mut batch_entry, row, index); @@ -263,6 +263,8 @@ impl TopKMetrics { struct TopKHeap { /// The maximum number of elemenents to store in this heap. k: usize, + /// The target number of rows for output batches + batch_size: usize, /// Storage for up at most `k` items using a BinaryHeap. Reverserd /// so that the smallest k so far is on the top inner: BinaryHeap, @@ -273,10 +275,15 @@ struct TopKHeap { } impl TopKHeap { - fn new(k: usize, schema: SchemaRef) -> Self { + fn new( + k: usize, + batch_size: usize, + schema: SchemaRef + ) -> Self { assert!(k > 0); Self { k, + batch_size, inner: BinaryHeap::new(), store: RecordBatchStore::new(schema), owned_bytes: 0, @@ -403,43 +410,50 @@ impl TopKHeap { /// Compact this heap, rewriting all stored batches into a single /// input batch pub fn maybe_compact(&mut self) -> Result<()> { - // // we compact if the number of "unused" rows in the store is - // // past some pre-defined threshold. Target holding up to - // // around 20 batches, but handle cases of large k where some - // // batches might be partially full - // let target_batch_size = 8024; - // let max_unused_rows = 20 * target_batch_size + self.k; - - // // don't compact if the store has only one batch or - // if self.store.len() <= 2 || self.store.unused_rows() < max_unused_rows { - // return Ok(()); - // } - // use log::info; - // info!("Have {} batches in store, COMPACTING", self.store.len()); - - // // at first, compact the entire thing always into a new batch - // // (maybe we can get fancier in the future about ignoring - // // batches that have a high usage ratio already - - // // Note: new batch is in the same order as inner - // let new_batch = self.emit()?; - - // // clear all old entires in store (this invalidates all - // // store_ids in `inner`) - // self.store.clear(); - - // let mut batch_entry = self.register_batch(new_batch); - // batch_entry.uses = self.inner.len(); - - // // rewrite all existing entries to use the new batch, and - // // remove old entries. 
The sortedness and their relative - // // position do not change - // for (i, topk_row) in self.inner.iter_mut().enumerate() { - // topk_row.batch_id = batch_entry.id; - // topk_row.index = i; - // } - // self.insert_batch_entry(batch_entry); - // info!("COMPACTION DONE: Have {} batches in store", self.store.len()); + // we compact if the number of "unused" rows in the store is + // past some pre-defined threshold. Target holding up to + // around 20 batches, but handle cases of large k where some + // batches might be partially full + let max_unused_rows = (20 * self.batch_size) + self.k; + let unused_rows = self.store.unused_rows(); + use log::info; + //info!("{} batches in store, unused rows in store: {}, max unused rows: {}", + //self.store.len(), unused_rows, max_unused_rows); + + // don't compact if the store has only one batch or + if self.store.len() <= 2 || unused_rows < max_unused_rows { + //if self.store.len() <= 2 { + return Ok(()); + } + info!("Have {} batches in store, COMPACTING", self.store.len()); + + // at first, compact the entire thing always into a new batch + // (maybe we can get fancier in the future about ignoring + // batches that have a high usage ratio already + + // Note: new batch is in the same order as inner + let num_rows = self.inner.len(); + let (new_batch, mut topk_rows) = self.emit_with_state()?; + + // clear all old entires in store (this invalidates all + // store_ids in `inner`) + self.store.clear(); + + let mut batch_entry = self.register_batch(new_batch); + batch_entry.uses = num_rows; + + // rewrite all existing entries to use the new batch, and + // remove old entries. The sortedness and their relative + // position do not change + for (i, topk_row) in topk_rows.iter_mut().enumerate() { + topk_row.batch_id = batch_entry.id; + topk_row.index = i; + } + self.insert_batch_entry(batch_entry); + // restore the heap + self.inner = BinaryHeap::from(topk_rows); + + info!("COMPACTION DONE: Have {} batches in store", self.store.len()); Ok(()) } diff --git a/datafusion/sqllogictest/test_files/aal.slt b/datafusion/sqllogictest/test_files/aal.slt index 4d8346edf23e..bbab912956e1 100644 --- a/datafusion/sqllogictest/test_files/aal.slt +++ b/datafusion/sqllogictest/test_files/aal.slt @@ -205,7 +205,7 @@ a 1 -5 12636 794623392 2909750622865366631 15 24022 2669374863 47766797847015095 ## -- make tiny batches to trigger batch compaction statement ok -set datafusion.execution.batch_size = 7 +set datafusion.execution.batch_size = 2 query TIIIIIIIITRRT select * from aggregate_test_100 ORDER BY c13 desc limit 5; From db196fb1a13520b336df9a18171986d59eb7b2d1 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 24 Aug 2023 16:02:52 -0400 Subject: [PATCH 09/32] add location for re-encoding --- datafusion/core/src/physical_plan/topk/mod.rs | 30 ++++++++++++++++--- datafusion/sqllogictest/test_files/aal.slt | 15 ++++++++++ 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/datafusion/core/src/physical_plan/topk/mod.rs b/datafusion/core/src/physical_plan/topk/mod.rs index faf68bcd5ac2..e2868c3b7fa3 100644 --- a/datafusion/core/src/physical_plan/topk/mod.rs +++ b/datafusion/core/src/physical_plan/topk/mod.rs @@ -18,13 +18,13 @@ //! 
TopK: Combination of Sort / LIMIT use arrow::{ - compute::interleave, + error::ArrowError, row::{RowConverter, Rows, SortField}, }; use std::{cmp::Ordering, collections::BinaryHeap, sync::Arc}; use arrow_array::{Array, ArrayRef, RecordBatch}; -use arrow_schema::SchemaRef; +use arrow_schema::{DataType, SchemaRef}; use datafusion_common::Result; use datafusion_execution::{ memory_pool::{MemoryConsumer, MemoryReservation}, @@ -359,8 +359,8 @@ impl TopKHeap { } /// Returns the values stored in this heap, from values low to - /// high, as a single [`RecordBatch`], and a sorted vec of heap contents - + /// high, as a single [`RecordBatch`], and a sorted vec of the + /// current heap's contents pub fn emit_with_state(&mut self) -> Result<(RecordBatch, Vec)> { let schema = self.store.schema().clone(); @@ -657,3 +657,25 @@ impl RecordBatchStore { + self.batches_size } } + + +/// wrapper over [`arrow::compute::interleave`] that re-encodes +/// dictionaries that have a low usage (values referenced) + fn interleave( + values: &[&dyn Array], + indices: &[(usize, usize)], +) -> Result { + // for now, always re-encode only string dictionaries + if !values.is_empty() { + match values[0].data_type() { + DataType::Dictionary(_key_type, value_type) if value_type.as_ref() == &DataType::Utf8 => { + + //todo!() + return arrow::compute::interleave(values, indices); + } + _ => { } + } + } + // fallback to arrow + arrow::compute::interleave(values, indices) + } diff --git a/datafusion/sqllogictest/test_files/aal.slt b/datafusion/sqllogictest/test_files/aal.slt index bbab912956e1..36dc0d9fdcf9 100644 --- a/datafusion/sqllogictest/test_files/aal.slt +++ b/datafusion/sqllogictest/test_files/aal.slt @@ -215,3 +215,18 @@ d 1 -98 13630 -1991133944 1184110014998006843 220 2986 225513085 963410661024364 e 2 52 -12056 -1090239422 9011500141803970147 238 4168 2013662838 12565360638488684051 0.6694766 0.391444365692 xipQ93429ksjNcXPX5326VSg1xJZcW d 1 -72 25590 1188089983 3090286296481837049 241 832 3542840110 5885937420286765261 0.41980565 0.215354023438 wwXqSGKLyBQyPkonlzBNYUJTCo4LRS a 1 -5 12636 794623392 2909750622865366631 15 24022 2669374863 4776679784701509574 0.29877836 0.253725340799 waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs + + +## make an example for + +statement ok +create table dict as select c1, c2, c3, c13, arrow_cast(c13, 'Dictionary(Int32, Utf8)') as c13_dict from aggregate_test_100; + +query TIIT? 
+select * from dict order by c13 desc limit 5;
+----
+a 4 -38 ydkwycaISlYSlEq3TlkS2m15I2pcp8 ydkwycaISlYSlEq3TlkS2m15I2pcp8
+d 1 -98 y7C453hRWd4E7ImjNDWlpexB8nUqjh y7C453hRWd4E7ImjNDWlpexB8nUqjh
+e 2 52 xipQ93429ksjNcXPX5326VSg1xJZcW xipQ93429ksjNcXPX5326VSg1xJZcW
+d 1 -72 wwXqSGKLyBQyPkonlzBNYUJTCo4LRS wwXqSGKLyBQyPkonlzBNYUJTCo4LRS
+a 1 -5 waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs

From f12307596f6c0f0255d40c682c8a456d13dff980 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Thu, 24 Aug 2023 16:16:32 -0400
Subject: [PATCH 10/32] Start sketching dictionary interleave

---
 datafusion/core/src/physical_plan/topk/mod.rs | 60 ++++++++++++-------
 datafusion/core/tests/sql/order.rs            |  4 +-
 .../simplify_expressions/expr_simplifier.rs   |  4 +-
 .../src/simplify_expressions/regex.rs         |  4 +-
 datafusion/sql/src/statement.rs               |  4 +-
 .../substrait/src/logical_plan/consumer.rs    | 26 ++++----
 .../substrait/src/logical_plan/producer.rs    |  5 +-
 7 files changed, 65 insertions(+), 42 deletions(-)

diff --git a/datafusion/core/src/physical_plan/topk/mod.rs b/datafusion/core/src/physical_plan/topk/mod.rs
index e2868c3b7fa3..3b8546cc876c 100644
--- a/datafusion/core/src/physical_plan/topk/mod.rs
+++ b/datafusion/core/src/physical_plan/topk/mod.rs
@@ -23,7 +23,7 @@ use arrow::{
 };
 use std::{cmp::Ordering, collections::BinaryHeap, sync::Arc};
 
-use arrow_array::{Array, ArrayRef, RecordBatch};
+use arrow_array::{downcast_dictionary_array, Array, ArrayRef, RecordBatch};
 use arrow_schema::{DataType, SchemaRef};
 use datafusion_common::Result;
 use datafusion_execution::{
@@ -275,11 +275,7 @@ struct TopKHeap {
 }
 
 impl TopKHeap {
-    fn new(
-        k: usize,
-        batch_size: usize,
-        schema: SchemaRef
-    ) -> Self {
+    fn new(k: usize, batch_size: usize, schema: SchemaRef) -> Self {
         assert!(k > 0);
         Self {
             k,
@@ -453,7 +449,10 @@ impl TopKHeap {
         // restore the heap
         self.inner = BinaryHeap::from(topk_rows);
 
-        info!("COMPACTION DONE: Have {} batches in store", self.store.len());
+        info!(
+            "COMPACTION DONE: Have {} batches in store",
+            self.store.len()
+        );
 
         Ok(())
     }
@@ -658,24 +657,39 @@ impl RecordBatchStore {
     }
 }
 
-
 /// wrapper over [`arrow::compute::interleave`] that re-encodes
 /// dictionaries that have a low usage (values referenced)
- fn interleave(
+fn interleave(
     values: &[&dyn Array],
     indices: &[(usize, usize)],
 ) -> Result {
-    // for now, always re-encode only string dictionaries
-    if !values.is_empty() {
-        match values[0].data_type() {
-            DataType::Dictionary(_key_type, value_type) if value_type.as_ref() == &DataType::Utf8 => {
-
-                //todo!()
-                return arrow::compute::interleave(values, indices);
-            }
-            _ => { }
-        }
-    }
-    // fallback to arrow
-    arrow::compute::interleave(values, indices)
- }
+    // for now, always re-encode only string dictionaries
+    if !values.is_empty() {
+        match values[0].data_type() {
+            DataType::Dictionary(_key_type, value_type)
+                if value_type.as_ref() == &DataType::Utf8 =>
+            {
+                return interleave_dictionary(values, indices);
+            }
+            _ => {}
+        }
+    }
+    // fallback to arrow
+    arrow::compute::interleave(values, indices)
+}
+
+// we don't need a specialized version for each index type
+fn interleave_dictionary(
+    values: &[&dyn Array],
+    indices: &[(usize, usize)],
+) -> Result {
+    todo!()
+}
+
+/// returns a reference to the values of this dictionary
+fn values(array: &ArrayRef) -> &ArrayRef {
+    downcast_dictionary_array!(
+        array => return array.values(),
+        _ => unreachable!("Non dictionary type")
+    )
+}
diff --git a/datafusion/core/tests/sql/order.rs b/datafusion/core/tests/sql/order.rs
index 
3981fbaa4d7a..a400a78fc914 100644 --- a/datafusion/core/tests/sql/order.rs +++ b/datafusion/core/tests/sql/order.rs @@ -48,7 +48,9 @@ async fn sort_with_lots_of_repetition_values() -> Result<()> { async fn create_external_table_with_order() -> Result<()> { let ctx = SessionContext::new(); let sql = "CREATE EXTERNAL TABLE dt (a_id integer, a_str string, a_bool boolean) STORED AS CSV WITH ORDER (a_id ASC) LOCATION 'file://path/to/table';"; - let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = ctx.state().create_logical_plan(sql).await? else { + let LogicalPlan::Ddl(DdlStatement::CreateExternalTable(cmd)) = + ctx.state().create_logical_plan(sql).await? + else { panic!("Wrong command") }; diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 895432026b48..3cf564f367ba 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -412,7 +412,9 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> { }) if list.len() == 1 && matches!(list.first(), Some(Expr::ScalarSubquery { .. })) => { - let Expr::ScalarSubquery(subquery) = list.remove(0) else { unreachable!() }; + let Expr::ScalarSubquery(subquery) = list.remove(0) else { + unreachable!() + }; Expr::InSubquery(InSubquery::new(expr, subquery, negated)) } diff --git a/datafusion/optimizer/src/simplify_expressions/regex.rs b/datafusion/optimizer/src/simplify_expressions/regex.rs index 5094623b82c0..b9d9821b43f0 100644 --- a/datafusion/optimizer/src/simplify_expressions/regex.rs +++ b/datafusion/optimizer/src/simplify_expressions/regex.rs @@ -203,7 +203,9 @@ fn anchored_literal_to_expr(v: &[Hir]) -> Option { match v.len() { 2 => Some(lit("")), 3 => { - let HirKind::Literal(l) = v[1].kind() else { return None }; + let HirKind::Literal(l) = v[1].kind() else { + return None; + }; like_str_from_literal(l).map(lit) } _ => None, diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index c9a34bfaf220..8676e2a6e76a 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -499,10 +499,10 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { "DELETE FROM only supports single table, got: joins".to_string(), )); } - let TableFactor::Table{name, ..} = table_factor.relation else { + let TableFactor::Table { name, .. 
} = table_factor.relation else { return Err(DataFusionError::NotImplemented(format!( "DELETE FROM only supports single table, got: {table_factor:?}" - ))) + ))); }; Ok(name) diff --git a/datafusion/substrait/src/logical_plan/consumer.rs b/datafusion/substrait/src/logical_plan/consumer.rs index 4e4d71ddb604..54f1facb4ada 100644 --- a/datafusion/substrait/src/logical_plan/consumer.rs +++ b/datafusion/substrait/src/logical_plan/consumer.rs @@ -573,9 +573,9 @@ pub async fn from_substrait_sorts( Some(k) => match k { Direction(d) => { let Some(direction) = SortDirection::from_i32(*d) else { - return Err(DataFusionError::NotImplemented( - format!("Unsupported Substrait SortDirection value {d}"), - )) + return Err(DataFusionError::NotImplemented(format!( + "Unsupported Substrait SortDirection value {d}" + ))); }; match direction { @@ -1313,27 +1313,27 @@ async fn make_datafusion_like( } let Some(ArgType::Value(expr_substrait)) = &f.arguments[0].arg_type else { - return Err(DataFusionError::NotImplemented( - format!("Invalid arguments type for `{fn_name}` expr") - )) + return Err(DataFusionError::NotImplemented(format!( + "Invalid arguments type for `{fn_name}` expr" + ))); }; let expr = from_substrait_rex(expr_substrait, input_schema, extensions) .await? .as_ref() .clone(); let Some(ArgType::Value(pattern_substrait)) = &f.arguments[1].arg_type else { - return Err(DataFusionError::NotImplemented( - format!("Invalid arguments type for `{fn_name}` expr") - )) + return Err(DataFusionError::NotImplemented(format!( + "Invalid arguments type for `{fn_name}` expr" + ))); }; let pattern = from_substrait_rex(pattern_substrait, input_schema, extensions) .await? .as_ref() .clone(); let Some(ArgType::Value(escape_char_substrait)) = &f.arguments[2].arg_type else { - return Err(DataFusionError::NotImplemented( - format!("Invalid arguments type for `{fn_name}` expr") - )) + return Err(DataFusionError::NotImplemented(format!( + "Invalid arguments type for `{fn_name}` expr" + ))); }; let escape_char_expr = from_substrait_rex(escape_char_substrait, input_schema, extensions) @@ -1343,7 +1343,7 @@ async fn make_datafusion_like( let Expr::Literal(ScalarValue::Utf8(escape_char)) = escape_char_expr else { return Err(DataFusionError::Substrait(format!( "Expect Utf8 literal for escape char, but found {escape_char_expr:?}", - ))) + ))); }; Ok(Arc::new(Expr::Like(Like { diff --git a/datafusion/substrait/src/logical_plan/producer.rs b/datafusion/substrait/src/logical_plan/producer.rs index 79cd8995c6c6..d1f46c9858a0 100644 --- a/datafusion/substrait/src/logical_plan/producer.rs +++ b/datafusion/substrait/src/logical_plan/producer.rs @@ -1664,7 +1664,10 @@ mod test { println!("Checking round trip of {scalar:?}"); let substrait = to_substrait_literal(&scalar)?; - let Expression { rex_type: Some(RexType::Literal(substrait_literal)) } = substrait else { + let Expression { + rex_type: Some(RexType::Literal(substrait_literal)), + } = substrait + else { panic!("Expected Literal expression, got {substrait:?}"); }; From 157379a21820db8e4da636c7bb4adaca7693f282 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 24 Aug 2023 16:41:39 -0400 Subject: [PATCH 11/32] checkpoint --- datafusion/core/src/physical_plan/topk/mod.rs | 82 ++++++++++++++++--- 1 file changed, 71 insertions(+), 11 deletions(-) diff --git a/datafusion/core/src/physical_plan/topk/mod.rs b/datafusion/core/src/physical_plan/topk/mod.rs index 3b8546cc876c..f48ba32025d9 100644 --- a/datafusion/core/src/physical_plan/topk/mod.rs +++ 
b/datafusion/core/src/physical_plan/topk/mod.rs
@@ -18,12 +18,11 @@
 
 //! TopK: Combination of Sort / LIMIT
 
 use arrow::{
-    error::ArrowError,
     row::{RowConverter, Rows, SortField},
 };
 use std::{cmp::Ordering, collections::BinaryHeap, sync::Arc};
 
-use arrow_array::{downcast_dictionary_array, Array, ArrayRef, RecordBatch};
+use arrow_array::{downcast_dictionary_array, Array, ArrayRef, RecordBatch, builder::StringBuilder, cast::AsArray, StringArray, Int32Array, types::Int32Type, DictionaryArray};
 use arrow_schema::{DataType, SchemaRef};
 use datafusion_common::Result;
 use datafusion_execution::{
@@ -31,7 +30,7 @@ use datafusion_execution::{
     runtime_env::RuntimeEnv,
 };
 use datafusion_physical_expr::PhysicalSortExpr;
-use hashbrown::HashMap;
+use hashbrown::{HashMap, HashSet};
 
 use crate::physical_plan::{stream::RecordBatchStreamAdapter, SendableRecordBatchStream};
 
@@ -662,32 +661,93 @@ impl RecordBatchStore {
 fn interleave(
     values: &[&dyn Array],
     indices: &[(usize, usize)],
-) -> Result {
+) -> Result {
     // for now, always re-encode only string dictionaries
     if !values.is_empty() {
         match values[0].data_type() {
             DataType::Dictionary(_key_type, value_type)
                 if value_type.as_ref() == &DataType::Utf8 =>
             {
-                return interleave_dictionary(values, indices);
+                return interleave_and_repack_dictionary(values, indices);
             }
             _ => {}
         }
     }
     // fallback to arrow
-    arrow::compute::interleave(values, indices)
+    Ok(arrow::compute::interleave(values, indices)?)
 }
 
-// we don't need a specialized version for each index type
-fn interleave_dictionary(
+/// Special interleave kernel that re-creates the dictionary values,
+/// ensuring no unused space
+fn interleave_and_repack_dictionary(
     values: &[&dyn Array],
     indices: &[(usize, usize)],
-) -> Result {
-    todo!()
+) -> Result {
+    let existing_values = HashSet::new();
+
+    let data_type = values[0].data_type();
+
+    // repack to a new StringArray
+    let mut new_values = StringBuilder::new();
+    // we could specialize this and avoid the copy of the index, but
+    // that seems like a lot of codegen overhead
+    let mut new_keys = vec![];
+
+    for (array_idx, row_idx) in indices {
+        // look up value,
+        let array = values[*array_idx];
+        downcast_dictionary_array!(
+            array=> {
+                if let Some(key) = array.key(*row_idx) {
+                    let values: &StringArray = array.values().as_string();
+                    if values.is_valid(key) {
+                        let current_value = values.value(key);
+                        println!("Current value is {current_value}");
+                        todo!();
+                    } else {
+                        new_keys.push(None)
+                    }
+                }
+                else {
+                    new_keys.push(None);
+                }
+
+
+            }
+            _ => unreachable!("Non dictionary type")
+
+        )
+    }
+
+    // form the output
+    let DataType::Dictionary(key_type, value_type) = data_type else {
+        unreachable!("non dictionary type");
+    };
+
+    let new_values: ArrayRef = Arc::new(new_values.finish());
+    match key_type.as_ref() {
+        DataType::Int32 => {
+            // check the keys will fit in this array
+            if new_values.len() >= i32::MAX as usize {
+                panic!("todo make a real error message");
+            }
+
+            let new_keys: Int32Array = new_keys.iter().map(|v| v.map(|v| v as i32)).collect();
+
+            Ok(Arc::new(DictionaryArray::try_new(new_keys, new_values)?))
+        }
+        _ => {
+            // handle other keys
+            todo!()
+        }
+    }
+
+
+
+}
 
 /// returns a reference to the values of this dictionary
-fn values(array: &ArrayRef) -> &ArrayRef {
+fn get_dict_values(array: &ArrayRef) -> &ArrayRef {
     downcast_dictionary_array!(
         array => return array.values(),
         _ => unreachable!("Non dictionary type")
     )
 }

From 682127af5898de0f6dec81d037a5efbe54c07f5b Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Thu, 24 Aug 2023 
16:50:29 -0400
Subject: [PATCH 12/32] initial specialized dictionary

---
 datafusion/core/src/physical_plan/topk/mod.rs | 36 ++++++++-----------
 1 file changed, 15 insertions(+), 21 deletions(-)

diff --git a/datafusion/core/src/physical_plan/topk/mod.rs b/datafusion/core/src/physical_plan/topk/mod.rs
index f48ba32025d9..e5ba0a69e392 100644
--- a/datafusion/core/src/physical_plan/topk/mod.rs
+++ b/datafusion/core/src/physical_plan/topk/mod.rs
@@ -22,7 +22,7 @@ use arrow::{
 };
 use std::{cmp::Ordering, collections::BinaryHeap, sync::Arc};
 
-use arrow_array::{downcast_dictionary_array, Array, ArrayRef, RecordBatch, builder::StringBuilder, cast::AsArray, StringArray, Int32Array, types::Int32Type, DictionaryArray};
+use arrow_array::{downcast_dictionary_array, Array, ArrayRef, RecordBatch, builder::StringBuilder, cast::AsArray, StringArray, Int32Array, DictionaryArray};
 use arrow_schema::{DataType, SchemaRef};
 use datafusion_common::Result;
 use datafusion_execution::{
@@ -30,7 +30,7 @@ use datafusion_execution::{
     runtime_env::RuntimeEnv,
 };
 use datafusion_physical_expr::PhysicalSortExpr;
-use hashbrown::{HashMap, HashSet};
+use hashbrown::{HashMap};
 
 use crate::physical_plan::{stream::RecordBatchStreamAdapter, SendableRecordBatchStream};
 
@@ -683,14 +683,11 @@ fn interleave_and_repack_dictionary(
     values: &[&dyn Array],
     indices: &[(usize, usize)],
 ) -> Result {
-    let existing_values = HashSet::new();
-
     let data_type = values[0].data_type();
 
-    // repack to a new StringArray
+    // maps strings to new keys (indexes)
+    let mut new_value_to_key = HashMap::new();
     let mut new_values = StringBuilder::new();
-    // we could specialize this and avoid the copy of the index, but
-    // that seems like a lot of codegen overhead
     let mut new_keys = vec![];
 
     for (array_idx, row_idx) in indices {
@@ -702,8 +699,16 @@ fn interleave_and_repack_dictionary(
                 let values: &StringArray = array.values().as_string();
                 if values.is_valid(key) {
                     let current_value = values.value(key);
-                    println!("Current value is {current_value}");
-                    todo!();
+                    if let Some(new_key) = new_value_to_key.get(current_value) {
+                        // value was already in the set
+                        new_keys.push(Some(*new_key))
+                    } else {
+                        // value not yet seen
+                        let new_key = new_value_to_key.len();
+                        new_values.append_value(current_value);
+                        new_keys.push(Some(new_key));
+                        new_value_to_key.insert(current_value, new_key);
+                    }
                 } else {
                     new_keys.push(None)
                 }
@@ -720,7 +725,7 @@ fn interleave_and_repack_dictionary(
     }
 
     // form the output
-    let DataType::Dictionary(key_type, value_type) = data_type else {
+    let DataType::Dictionary(key_type, _value_type) = data_type else {
         unreachable!("non dictionary type");
     };
 
@@ -741,15 +746,4 @@ fn interleave_and_repack_dictionary(
             todo!()
         }
     }
-
-
-
-}
-
-/// returns a reference to the values of this dictionary
-fn get_dict_values(array: &ArrayRef) -> &ArrayRef {
-    downcast_dictionary_array!(
-        array => return array.values(),
-        _ => unreachable!("Non dictionary type")
-    )
-}

From a1ea62ecac4ecc4bfad96eee1068ffa02e49f9ae Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Thu, 24 Aug 2023 16:54:05 -0400
Subject: [PATCH 13/32] finish initial special interleave

---
 datafusion/core/src/physical_plan/topk/mod.rs | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/datafusion/core/src/physical_plan/topk/mod.rs b/datafusion/core/src/physical_plan/topk/mod.rs
index e5ba0a69e392..8169471472bf 100644
--- a/datafusion/core/src/physical_plan/topk/mod.rs
+++ b/datafusion/core/src/physical_plan/topk/mod.rs
@@ -716,8 +716,6 @@ fn interleave_and_repack_dictionary(
                 else { 
new_keys.push(None);
                 }
-
-
             }
             _ => unreachable!("Non dictionary type")
 
@@ -736,7 +734,6 @@ fn interleave_and_repack_dictionary(
             if new_values.len() >= i32::MAX as usize {
                 panic!("todo make a real error message");
             }
-
             let new_keys: Int32Array = new_keys.iter().map(|v| v.map(|v| v as i32)).collect();
 
             Ok(Arc::new(DictionaryArray::try_new(new_keys, new_values)?))

From 5e65130adab1b90cad8d785bed766230632d4f38 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Thu, 24 Aug 2023 17:58:55 -0400
Subject: [PATCH 14/32] Complete dictionary order

---
 datafusion/core/src/physical_plan/topk/mod.rs | 53 ++++++++++++-------
 1 file changed, 35 insertions(+), 18 deletions(-)

diff --git a/datafusion/core/src/physical_plan/topk/mod.rs b/datafusion/core/src/physical_plan/topk/mod.rs
index 8169471472bf..5e1ec8d5152a 100644
--- a/datafusion/core/src/physical_plan/topk/mod.rs
+++ b/datafusion/core/src/physical_plan/topk/mod.rs
@@ -17,12 +17,14 @@
 
 //! TopK: Combination of Sort / LIMIT
 
-use arrow::{
-    row::{RowConverter, Rows, SortField},
-};
+use arrow::row::{RowConverter, Rows, SortField};
 use std::{cmp::Ordering, collections::BinaryHeap, sync::Arc};
 
-use arrow_array::{downcast_dictionary_array, Array, ArrayRef, RecordBatch, builder::StringBuilder, cast::AsArray, StringArray, Int32Array, DictionaryArray};
+use arrow_array::{
+    builder::StringBuilder, cast::AsArray, downcast_dictionary_array, Array, ArrayRef,
+    DictionaryArray, Int16Array, Int32Array, Int64Array, Int8Array, RecordBatch,
+    StringArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
+};
 use arrow_schema::{DataType, SchemaRef};
 use datafusion_common::Result;
 use datafusion_execution::{
@@ -30,7 +32,7 @@ use datafusion_execution::{
     runtime_env::RuntimeEnv,
 };
 use datafusion_physical_expr::PhysicalSortExpr;
-use hashbrown::{HashMap};
+use hashbrown::HashMap;
 
 use crate::physical_plan::{stream::RecordBatchStreamAdapter, SendableRecordBatchStream};
 
@@ -658,10 +660,7 @@ impl RecordBatchStore {
 
 /// wrapper over [`arrow::compute::interleave`] that re-encodes
 /// dictionaries that have a low usage (values referenced)
-fn interleave(
-    values: &[&dyn Array],
-    indices: &[(usize, usize)],
-) -> Result {
+fn interleave(values: &[&dyn Array], indices: &[(usize, usize)]) -> Result {
     // for now, always re-encode only string dictionaries
     if !values.is_empty() {
         match values[0].data_type() {
@@ -688,7 +687,7 @@ fn interleave_and_repack_dictionary(
     // maps strings to new keys (indexes)
     let mut new_value_to_key = HashMap::new();
     let mut new_values = StringBuilder::new();
-    let mut new_keys = vec![];
+    let mut new_keys = vec![];
 
     for (array_idx, row_idx) in indices {
         // look up value,
@@ -728,19 +727,37 @@ fn interleave_and_repack_dictionary(
     };
 
     let new_values: ArrayRef = Arc::new(new_values.finish());
-    match key_type.as_ref() {
-        DataType::Int32 => {
+
+    // creates a $ARRAY_TYPE array from $NEW_KEYS and $NEW_VALUES
+    use datafusion_common::DataFusionError;
+    macro_rules! 
make_keys {
+        ($PRIM_TYPE:ty, $ARRAY_TYPE:ty, $NEW_KEYS:ident, $NEW_VALUES:ident) => {{
             // check the keys will fit in this array
-            if new_values.len() >= i32::MAX as usize {
-                panic!("todo make a real error message");
+            if $NEW_VALUES.len() >= <$PRIM_TYPE>::MAX as usize {
+                return Err(DataFusionError::Execution(format!(
+                    "keys did not fit in prim type -- TODO MAKE BETTER"
+                )));
             }
-            let new_keys: Int32Array = new_keys.iter().map(|v| v.map(|v| v as i32)).collect();
+            let new_keys: $ARRAY_TYPE = new_keys
+                .iter()
+                .map(|v| v.map(|v| v as $PRIM_TYPE))
+                .collect();
             Ok(Arc::new(DictionaryArray::try_new(new_keys, new_values)?))
-        }
+        }};
+    }
+
+    match key_type.as_ref() {
+        DataType::Int8 => make_keys!(i8, Int8Array, new_keys, new_values),
+        DataType::Int16 => make_keys!(i16, Int16Array, new_keys, new_values),
+        DataType::Int32 => make_keys!(i32, Int32Array, new_keys, new_values),
+        DataType::Int64 => make_keys!(i64, Int64Array, new_keys, new_values),
+        DataType::UInt8 => make_keys!(u8, UInt8Array, new_keys, new_values),
+        DataType::UInt16 => make_keys!(u16, UInt16Array, new_keys, new_values),
+        DataType::UInt32 => make_keys!(u32, UInt32Array, new_keys, new_values),
+        DataType::UInt64 => make_keys!(u64, UInt64Array, new_keys, new_values),
         _ => {
             // handle other keys
-            todo!()
+            unreachable!("invalid key type");
         }
     }
 }

From 4a30c4cf4890fdb405903e80a5930fdd30c99b57 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20Heres?=
Date: Mon, 2 Oct 2023 14:11:11 +0200
Subject: [PATCH 15/32] Merge

---
 parquet-testing | 2 +-
 testing         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/parquet-testing b/parquet-testing
index a11fc8f148f8..e45cd23f784a 160000
--- a/parquet-testing
+++ b/parquet-testing
@@ -1 +1 @@
-Subproject commit a11fc8f148f8a7a89d9281cc0da3eb9d56095fbf
+Subproject commit e45cd23f784aab3d6bf0701f8f4e621469ed3be7
diff --git a/testing b/testing
index e81d0c6de359..98fceecd024d 160000
--- a/testing
+++ b/testing
@@ -1 +1 @@
-Subproject commit e81d0c6de35948b3be7984af8e00413b314cde6e
+Subproject commit 98fceecd024dccd2f8a00e32fc144975f218acf4

From d9c596ff4dbb57e680aab58271f3658abfae0a03 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20Heres?=
Date: Mon, 2 Oct 2023 14:26:59 +0200
Subject: [PATCH 16/32] fmt

---
 datafusion/physical-plan/src/lib.rs        | 2 +-
 datafusion/physical-plan/src/sorts/sort.rs | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs
index 0165f808002b..3071fadcb1a2 100644
--- a/datafusion/physical-plan/src/lib.rs
+++ b/datafusion/physical-plan/src/lib.rs
@@ -25,9 +25,9 @@ use self::{
     coalesce_partitions::CoalescePartitionsExec, display::DisplayableExecutionPlan,
 };
 pub use datafusion_common::{internal_err, ColumnStatistics, Statistics};
-pub use topk::TopK;
 use datafusion_common::{plan_err, Result};
 use datafusion_physical_expr::PhysicalSortExpr;
+pub use topk::TopK;
 pub use visitor::{accept, visit_execution_plan, ExecutionPlanVisitor};
 
 use arrow::datatypes::SchemaRef;
diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs
index 8c66b269b363..b76b183c0942 100644
--- a/datafusion/physical-plan/src/sorts/sort.rs
+++ b/datafusion/physical-plan/src/sorts/sort.rs
@@ -24,9 +24,9 @@ use crate::expressions::PhysicalSortExpr;
 use crate::metrics::{
     BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricBuilder, MetricsSet,
 };
-use crate::topk::TopK;
 use crate::sorts::merge::streaming_merge;
 use 
crate::stream::{RecordBatchReceiverStream, RecordBatchStreamAdapter}; +use crate::topk::TopK; use crate::{ DisplayAs, DisplayFormatType, Distribution, EmptyRecordBatchStream, ExecutionPlan, Partitioning, SendableRecordBatchStream, Statistics, From c0f89c114095309d4032686e2e32ddb8f0c49aba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Mon, 2 Oct 2023 14:50:09 +0200 Subject: [PATCH 17/32] Cleanup --- datafusion/physical-plan/src/topk/mod.rs | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index fcf161b3821f..16c7508f92d7 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -370,8 +370,8 @@ impl TopKHeap { return Ok((RecordBatch::new_empty(schema), topk_rows)); } - // Indicies for each row within its respective RecordBatch - let indicies: Vec<_> = topk_rows + // Indices for each row within its respective RecordBatch + let indices: Vec<_> = topk_rows .iter() .enumerate() .map(|(i, k)| (i, k.index)) @@ -396,7 +396,7 @@ impl TopKHeap { // rows and `input_arrays` contains a reference to the // relevant Array for that index. `interleave` pulls // them together into a single new array - Ok(interleave(&input_arrays, &indicies)?) + Ok(interleave(&input_arrays, &indices)?) }) .collect::>()?; @@ -413,17 +413,12 @@ impl TopKHeap { // batches might be partially full let max_unused_rows = (20 * self.batch_size) + self.k; let unused_rows = self.store.unused_rows(); - use log::info; - //info!("{} batches in store, unused rows in store: {}, max unused rows: {}", - //self.store.len(), unused_rows, max_unused_rows); // don't compact if the store has only one batch or if self.store.len() <= 2 || unused_rows < max_unused_rows { //if self.store.len() <= 2 { return Ok(()); } - info!("Have {} batches in store, COMPACTING", self.store.len()); - // at first, compact the entire thing always into a new batch // (maybe we can get fancier in the future about ignoring // batches that have a high usage ratio already @@ -450,10 +445,6 @@ impl TopKHeap { // restore the heap self.inner = BinaryHeap::from(topk_rows); - info!( - "COMPACTION DONE: Have {} batches in store", - self.store.len() - ); Ok(()) } From 466d4b627d260f69dbbc3bf346b7617f6e34e582 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Mon, 2 Oct 2023 15:11:17 +0200 Subject: [PATCH 18/32] Fix test --- datafusion/physical-plan/src/sorts/sort.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index b76b183c0942..703f80d90d2b 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -1074,7 +1074,7 @@ mod tests { assert_eq!(result.len(), 1); let metrics = sort_exec.metrics().unwrap(); - let did_it_spill = metrics.spill_count().unwrap() > 0; + let did_it_spill = metrics.spill_count().unwrap_or(0) > 0; assert_eq!(did_it_spill, expect_spillage, "with fetch: {fetch:?}"); } Ok(()) From 33065ad8598cf373908b633b56170828895edf21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Mon, 2 Oct 2023 16:24:44 +0200 Subject: [PATCH 19/32] Cleanup --- datafusion/physical-plan/src/topk/mod.rs | 109 +---------------------- 1 file changed, 4 insertions(+), 105 deletions(-) diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index 16c7508f92d7..0ee9b7ea42c8 100644 
--- a/datafusion/physical-plan/src/topk/mod.rs
+++ b/datafusion/physical-plan/src/topk/mod.rs
@@ -17,7 +17,10 @@
 
 //! TopK: Combination of Sort / LIMIT
 
-use arrow::row::{RowConverter, Rows, SortField};
+use arrow::{
+    compute::interleave,
+    row::{RowConverter, Rows, SortField},
+};
 use std::{cmp::Ordering, collections::BinaryHeap, sync::Arc};
 
 use arrow_array::{
@@ -648,107 +651,3 @@ impl RecordBatchStore {
             + self.batches_size
     }
 }
-
-/// wrapper over [`arrow::compute::interleave`] that re-encodes
-/// dictionaries that have a low usage (values referenced)
-fn interleave(values: &[&dyn Array], indices: &[(usize, usize)]) -> Result {
-    // for now, always re-encode only string dictionaries
-    if !values.is_empty() {
-        match values[0].data_type() {
-            DataType::Dictionary(_key_type, value_type)
-                if value_type.as_ref() == &DataType::Utf8 =>
-            {
-                return interleave_and_repack_dictionary(values, indices);
-            }
-            _ => {}
-        }
-    }
-    // fallback to arrow
-    Ok(arrow::compute::interleave(values, indices)?)
-}
-
-/// Special interleave kernel that re-creates the dictionary values,
-/// ensuring no unused space
-fn interleave_and_repack_dictionary(
-    values: &[&dyn Array],
-    indices: &[(usize, usize)],
-) -> Result {
-    let data_type = values[0].data_type();
-
-    // maps strings to new keys (indexes)
-    let mut new_value_to_key = HashMap::new();
-    let mut new_values = StringBuilder::new();
-    let mut new_keys = vec![];
-
-    for (array_idx, row_idx) in indices {
-        // look up value,
-        let array = values[*array_idx];
-        downcast_dictionary_array!(
-            array=> {
-                if let Some(key) = array.key(*row_idx) {
-                    let values: &StringArray = array.values().as_string();
-                    if values.is_valid(key) {
-                        let current_value = values.value(key);
-                        if let Some(new_key) = new_value_to_key.get(current_value) {
-                            // value was already in the set
-                            new_keys.push(Some(*new_key))
-                        } else {
-                            // value not yet seen
-                            let new_key = new_value_to_key.len();
-                            new_values.append_value(current_value);
-                            new_keys.push(Some(new_key));
-                            new_value_to_key.insert(current_value, new_key);
-                        }
-                    } else {
-                        new_keys.push(None)
-                    }
-                }
-                else {
-                    new_keys.push(None);
-                }
-            }
-            _ => unreachable!("Non dictionary type")
-
-        )
-    }
-
-    // form the output
-    let DataType::Dictionary(key_type, _value_type) = data_type else {
-        unreachable!("non dictionary type");
-    };
-
-    let new_values: ArrayRef = Arc::new(new_values.finish());
-
-    // creates a $ARRAY_TYPE array from $NEW_KEYS and $NEW_VALUES
-    use datafusion_common::DataFusionError;
-    macro_rules! 
make_keys {
-        ($PRIM_TYPE:ty, $ARRAY_TYPE:ty, $NEW_KEYS:ident, $NEW_VALUES:ident) => {{
-            // check the keys will fit in this array
-            if $NEW_VALUES.len() >= <$PRIM_TYPE>::MAX as usize {
-                return Err(DataFusionError::Execution(format!(
-                    "keys did not fit in prim type -- TODO MAKE BETTER"
-                )));
-            }
-            let new_keys: $ARRAY_TYPE = new_keys
-                .iter()
-                .map(|v| v.map(|v| v as $PRIM_TYPE))
-                .collect();
-            Ok(Arc::new(DictionaryArray::try_new(new_keys, new_values)?))
-        }};
-    }
-
-    match key_type.as_ref() {
-        DataType::Int8 => make_keys!(i8, Int8Array, new_keys, new_values),
-        DataType::Int16 => make_keys!(i16, Int16Array, new_keys, new_values),
-        DataType::Int32 => make_keys!(i32, Int32Array, new_keys, new_values),
-        DataType::Int64 => make_keys!(i64, Int64Array, new_keys, new_values),
-        DataType::UInt8 => make_keys!(u8, UInt8Array, new_keys, new_values),
-        DataType::UInt16 => make_keys!(u16, UInt16Array, new_keys, new_values),
-        DataType::UInt32 => make_keys!(u32, UInt32Array, new_keys, new_values),
-        DataType::UInt64 => make_keys!(u64, UInt64Array, new_keys, new_values),
-        _ => {
-            // handle other keys
-            unreachable!("invalid key type");
-        }
-    }
-}

From e31718ec808c8c3b2cb5d64405445f01762e2497 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20Heres?=
Date: Mon, 2 Oct 2023 17:03:40 +0200
Subject: [PATCH 20/32] Make test deterministic

---
 datafusion/sqllogictest/test_files/decimal.slt | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/datafusion/sqllogictest/test_files/decimal.slt b/datafusion/sqllogictest/test_files/decimal.slt
index a326a0cc4941..d7632138a84e 100644
--- a/datafusion/sqllogictest/test_files/decimal.slt
+++ b/datafusion/sqllogictest/test_files/decimal.slt
@@ -507,27 +507,26 @@ select * from decimal_simple where c1 >= 0.00004 order by c1;
 
 query RRIBR
-select * from decimal_simple where c1 >= 0.00004 order by c1 limit 10;
+select * from decimal_simple where c1 >= 0.00004 order by c1, c3 limit 10;
 ----
 0.00004 0.000000000004 5 true 0.000044
+0.00004 0.000000000004 8 false 0.000044
 0.00004 0.000000000004 12 false 0.00004
 0.00004 0.000000000004 14 true 0.00004
-0.00004 0.000000000004 8 false 0.000044
-0.00005 0.000000000005 9 true 0.000052
+0.00005 0.000000000005 1 false 0.0001
 0.00005 0.000000000005 4 true 0.000078
 0.00005 0.000000000005 8 false 0.000033
+0.00005 0.000000000005 9 true 0.000052
 0.00005 0.000000000005 100 true 0.000068
-0.00005 0.000000000005 1 false 0.0001
-
 
 query RRIBR
-select * from decimal_simple where c1 >= 0.00004 order by c1 limit 5;
+select * from decimal_simple where c1 >= 0.00004 order by c1, c3 limit 5;
 ----
 0.00004 0.000000000004 5 true 0.000044
+0.00004 0.000000000004 8 false 0.000044
 0.00004 0.000000000004 12 false 0.00004
 0.00004 0.000000000004 14 true 0.00004
-0.00004 0.000000000004 8 false 0.000044
-0.00005 0.000000000005 9 true 0.000052
+0.00005 0.000000000005 1 false 0.0001
 
 
 query RRIBR

From 40ef4488acc8fdae8b8f5811c581a7a31b31fd82 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20Heres?=
Date: Mon, 2 Oct 2023 17:59:48 +0200
Subject: [PATCH 21/32] Clippy, doctest

---
 datafusion/physical-plan/src/topk/mod.rs | 10 +++------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs
index 0ee9b7ea42c8..3cd1eaca5b03 100644
--- a/datafusion/physical-plan/src/topk/mod.rs
+++ b/datafusion/physical-plan/src/topk/mod.rs
@@ -23,12 +23,8 @@ use arrow::{
 };
 use std::{cmp::Ordering, collections::BinaryHeap, sync::Arc};
 
-use 
arrow_array::{ - builder::StringBuilder, cast::AsArray, downcast_dictionary_array, Array, ArrayRef, - DictionaryArray, Int16Array, Int32Array, Int64Array, Int8Array, RecordBatch, - StringArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, -}; -use arrow_schema::{DataType, SchemaRef}; +use arrow_array::{Array, ArrayRef, RecordBatch}; +use arrow_schema::SchemaRef; use datafusion_common::Result; use datafusion_execution::{ memory_pool::{MemoryConsumer, MemoryReservation}, @@ -55,7 +51,7 @@ use super::metrics::{BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricBuil /// /// The simple plan would be: /// -/// ``` +/// ```sql /// > explain SELECT customer_id, revenue FROM sales ORDER BY revenue DESC limit 3; /// +--------------+----------------------------------------+ /// | plan_type | plan | From c373ce312fefdf12fd0ebb3eee627f1b12812532 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Tue, 3 Oct 2023 13:35:31 +0200 Subject: [PATCH 22/32] Use into_sorted_vec --- datafusion/physical-plan/src/topk/mod.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index 3cd1eaca5b03..6bdfc1b8b776 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -360,10 +360,8 @@ impl TopKHeap { pub fn emit_with_state(&mut self) -> Result<(RecordBatch, Vec)> { let schema = self.store.schema().clone(); - let mut topk_rows = std::mem::take(&mut self.inner).into_vec(); - - // sort low to high (reverse the reverse) - topk_rows.sort(); + // generate sorted rows + let topk_rows = std::mem::take(&mut self.inner).into_sorted_vec(); if self.store.is_empty() { return Ok((RecordBatch::new_empty(schema), topk_rows)); From bd72ad878bb4d74b503936963b1388cb4e1fcf4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Tue, 3 Oct 2023 14:35:10 +0200 Subject: [PATCH 23/32] Fix nondeterministic tests --- datafusion/sqllogictest/test_files/window.slt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt index 4ba0d6cc3e40..5fb5a04c6709 100644 --- a/datafusion/sqllogictest/test_files/window.slt +++ b/datafusion/sqllogictest/test_files/window.slt @@ -2673,7 +2673,7 @@ SELECT LEAD(inc_col, -1, 1001) OVER(ORDER BY ts DESC RANGE BETWEEN 1 PRECEDING and 10 FOLLOWING) AS leadr1, LEAD(inc_col, 4, 1004) OVER(ORDER BY ts DESC ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as leadr2 FROM annotated_data_finite - ORDER BY ts DESC + ORDER BY ts DESC, fv2 LIMIT 5; ---- 264 289 266 305 305 305 278 99 99 99 99 86 86 296 291 296 1004 305 305 301 296 305 1002 305 286 @@ -3274,13 +3274,13 @@ drop table annotated_data_infinite2 query IRR SELECT C3, - MAX(c12) OVER window1, - MIN(c12) OVER window2 as max1 + MAX(c12) OVER window1 as max1, + MIN(c12) OVER window2 as max2 FROM aggregate_test_100 WINDOW window1 AS (ORDER BY C12), window2 AS (PARTITION BY C11), window3 AS (ORDER BY C1) - ORDER BY C3 + ORDER BY C3, max2 LIMIT 5 ---- -117 0.850672105305 0.850672105305 @@ -3329,7 +3329,7 @@ SELECT MIN(c12) OVER window1 as max1 FROM aggregate_test_100 WINDOW window1 AS (ORDER BY C12) - ORDER BY C3 + ORDER BY C3, min1 LIMIT 5 ---- -117 0.850672105305 0.014793053078 From 84ffae8ae0f20d785a99a00d884279baccff99b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Tue, 3 Oct 2023 14:42:35 +0200 Subject: [PATCH 24/32] Update cargo.lock --- 
datafusion-cli/Cargo.lock | 184 ++++++++++++++++++++------------------ 1 file changed, 99 insertions(+), 85 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 775f8ec87e38..ab7f24922899 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -77,9 +77,9 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b84bf0a05bbb2a83e5eb6fa36bb6e87baa08193c35ff52bbf6b38d8af2890e46" +checksum = "7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87" [[package]] name = "arrayref" @@ -143,7 +143,7 @@ dependencies = [ "chrono", "chrono-tz", "half", - "hashbrown 0.14.0", + "hashbrown 0.14.1", "num", ] @@ -234,7 +234,7 @@ dependencies = [ "arrow-schema", "chrono", "half", - "indexmap 2.0.0", + "indexmap 2.0.2", "lexical-core", "num", "serde", @@ -268,7 +268,7 @@ dependencies = [ "arrow-data", "arrow-schema", "half", - "hashbrown 0.14.0", + "hashbrown 0.14.1", ] [[package]] @@ -734,9 +734,9 @@ dependencies = [ [[package]] name = "brotli" -version = "3.3.4" +version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1a0b1dbcc8ae29329621f8d4f0d835787c1c38bb1401979b49d13b0b305ff68" +checksum = "516074a47ef4bce09577a3b379392300159ce5b1ba2e501ff1c819950066100f" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -745,9 +745,9 @@ dependencies = [ [[package]] name = "brotli-decompressor" -version = "2.3.4" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b6561fd3f895a11e8f72af2cb7d22e08366bebc2b6b57f7744c4bda27034744" +checksum = "da74e2b81409b1b743f8f0c62cc6254afefb8b8e50bbfe3735550f7aeefa3448" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -1026,9 +1026,9 @@ dependencies = [ [[package]] name = "ctor" -version = "0.2.4" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f34ba9a9bcb8645379e9de8cb3ecfcf4d1c85ba66d90deb3259206fa5aa193b" +checksum = "37e366bff8cd32dd8754b0991fb66b279dc48f598c3a18914852a6673deef583" dependencies = [ "quote", "syn 2.0.37", @@ -1041,7 +1041,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856" dependencies = [ "cfg-if", - "hashbrown 0.14.0", + "hashbrown 0.14.1", "lock_api", "once_cell", "parking_lot_core", @@ -1072,9 +1072,9 @@ dependencies = [ "futures", "glob", "half", - "hashbrown 0.14.0", - "indexmap 2.0.0", - "itertools 0.11.0", + "hashbrown 0.14.1", + "indexmap 2.0.2", + "itertools", "log", "num_cpus", "object_store", @@ -1145,7 +1145,7 @@ dependencies = [ "datafusion-common", "datafusion-expr", "futures", - "hashbrown 0.14.0", + "hashbrown 0.14.1", "log", "object_store", "parking_lot", @@ -1177,8 +1177,8 @@ dependencies = [ "datafusion-common", "datafusion-expr", "datafusion-physical-expr", - "hashbrown 0.14.0", - "itertools 0.11.0", + "hashbrown 0.14.1", + "itertools", "log", "regex-syntax", ] @@ -1199,10 +1199,10 @@ dependencies = [ "datafusion-common", "datafusion-expr", "half", - "hashbrown 0.14.0", + "hashbrown 0.14.1", "hex", - "indexmap 2.0.0", - "itertools 0.11.0", + "indexmap 2.0.2", + "itertools", "libc", "log", "md-5", @@ -1232,9 +1232,9 @@ dependencies = [ "datafusion-physical-expr", "futures", "half", - "hashbrown 0.14.0", - "indexmap 2.0.0", - "itertools 0.11.0", + "hashbrown 0.14.1", + "indexmap 2.0.2", + "itertools", "log", "once_cell", "parking_lot", @@ 
-1368,9 +1368,9 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.3" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "136526188508e25c6fef639d7927dfb3e0e3084488bf202267829cf7fc23dbdd" +checksum = "add4f07d43996f76ef320709726a556a9d4f965d9410d8d0271132d2f8293480" dependencies = [ "errno-dragonfly", "libc", @@ -1408,9 +1408,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.0.0" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764" +checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" [[package]] name = "fd-lock" @@ -1639,9 +1639,9 @@ checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" [[package]] name = "hashbrown" -version = "0.14.0" +version = "0.14.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a" +checksum = "7dfda62a12f55daeae5015f81b0baea145391cb4520f86c248fc615d72640d12" dependencies = [ "ahash", "allocator-api2", @@ -1821,12 +1821,12 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.0.0" +version = "2.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5477fe2230a79769d8dc68e0eabf5437907c0457a5614a9e8dddb67f65eb65d" +checksum = "8adf3ddd720272c6ea8bf59463c04e0f93d0bbf7c5439b691bca2987e0270897" dependencies = [ "equivalent", - "hashbrown 0.14.0", + "hashbrown 0.14.1", ] [[package]] @@ -1850,15 +1850,6 @@ version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6" -[[package]] -name = "itertools" -version = "0.10.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" -dependencies = [ - "either", -] - [[package]] name = "itertools" version = "0.11.0" @@ -1986,9 +1977,9 @@ dependencies = [ [[package]] name = "linux-raw-sys" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a9bad9f94746442c783ca431b22403b519cd7fbeed0533fdd6328b2f2212128" +checksum = "3852614a3bd9ca9804678ba6be5e3b8ce76dfc902cae004e3e0c44051b6e88db" [[package]] name = "lock_api" @@ -2039,18 +2030,19 @@ dependencies = [ [[package]] name = "md-5" -version = "0.10.5" +version = "0.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6365506850d44bff6e2fbcb5176cf63650e48bd45ef2fe2665ae1570e0f4b9ca" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" dependencies = [ + "cfg-if", "digest", ] [[package]] name = "memchr" -version = "2.6.3" +version = "2.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c" +checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" [[package]] name = "mimalloc" @@ -2211,9 +2203,9 @@ dependencies = [ [[package]] name = "object_store" -version = "0.7.0" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d359e231e5451f4f9fa889d56e3ce34f8724f1a61db2107739359717cf2bbf08" +checksum = "f930c88a43b1c3f6e776dfe495b4afab89882dbc81530c632db2ed65451ebcb4" dependencies = [ "async-trait", "base64", @@ 
-2222,7 +2214,7 @@ dependencies = [ "futures", "humantime", "hyper", - "itertools 0.10.5", + "itertools", "parking_lot", "percent-encoding", "quick-xml", @@ -2315,7 +2307,7 @@ dependencies = [ "chrono", "flate2", "futures", - "hashbrown 0.14.0", + "hashbrown 0.14.1", "lz4", "num", "num-bigint", @@ -2357,7 +2349,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e1d3afd2628e69da2be385eb6f2fd57c8ac7977ceeff6dc166ff1657b0e386a9" dependencies = [ "fixedbitset", - "indexmap 2.0.0", + "indexmap 2.0.2", ] [[package]] @@ -2451,7 +2443,7 @@ dependencies = [ "anstyle", "difflib", "float-cmp", - "itertools 0.11.0", + "itertools", "normalize-line-endings", "predicates-core", "regex", @@ -2514,9 +2506,9 @@ dependencies = [ [[package]] name = "quick-xml" -version = "0.28.2" +version = "0.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ce5e73202a820a31f8a0ee32ada5e21029c81fd9e3ebf668a40832e4219d9d1" +checksum = "eff6510e86862b57b210fd8cbe8ed3f0d7d600b9c2863cd4549a2e033c66e956" dependencies = [ "memchr", "serde", @@ -2602,9 +2594,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.9.5" +version = "1.9.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "697061221ea1b4a94a624f67d0ae2bfe4e22b8a17b6a192afb11046542cc8c47" +checksum = "ebee201405406dbf528b8b672104ae6d6d63e6d118cb10e4d51abbc7b58044ff" dependencies = [ "aho-corasick", "memchr", @@ -2614,9 +2606,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.3.8" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795" +checksum = "59b23e92ee4318893fa3fe3e6fb365258efbfe6ac6ab30f090cdcbb7aa37efa9" dependencies = [ "aho-corasick", "memchr", @@ -2631,9 +2623,9 @@ checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" [[package]] name = "reqwest" -version = "0.11.20" +version = "0.11.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e9ad3fe7488d7e34558a2033d45a0c90b72d97b4f80705666fea71472e2e6a1" +checksum = "046cd98826c46c2ac8ddecae268eb5c2e58628688a5fc7a2643704a73faba95b" dependencies = [ "base64", "bytes", @@ -2657,6 +2649,7 @@ dependencies = [ "serde", "serde_json", "serde_urlencoded", + "system-configuration", "tokio", "tokio-rustls 0.24.1", "tokio-util", @@ -2728,9 +2721,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.14" +version = "0.38.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "747c788e9ce8e92b12cd485c49ddf90723550b654b32508f979b71a7b1ecda4f" +checksum = "d2f9da0cbd88f9f09e7814e388301c8414c51c62aa6ce1e4b5c551d49d96e531" dependencies = [ "bitflags 2.4.0", "errno", @@ -2786,9 +2779,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.101.5" +version = "0.101.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45a27e3b59326c16e23d30aeb7a36a24cc0d29e71d68ff611cdfb4a01d013bed" +checksum = "3c7d5dece342910d9ba34d259310cae3e0154b873b35408b787b59bce53d34fe" dependencies = [ "ring", "untrusted", @@ -2888,9 +2881,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.18" +version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0293b4b29daaf487284529cc2f5675b8e57c61f70167ba415a463651fd6a918" +checksum = "ad977052201c6de01a8ef2aa3378c4bd23217a056337d1d6da40468d267a4fb0" [[package]] name = "seq-macro" @@ -2943,9 +2936,9 @@ 
dependencies = [ [[package]] name = "sha2" -version = "0.10.7" +version = "0.10.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "479fb9d862239e610720565ca91403019f2f00410f1864c5aa7479b950a76ed8" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" dependencies = [ "cfg-if", "cpufeatures", @@ -3135,6 +3128,27 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "system-configuration" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7" +dependencies = [ + "bitflags 1.3.2", + "core-foundation", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75fb188eb626b924683e3b95e3a48e63551fcfb51949de2f06a9d91dbee93c9" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "tempfile" version = "3.8.0" @@ -3142,7 +3156,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb94d2f3cc536af71caac6b6fcebf65860b347e7ce0cc9ebe8f70d3e521054ef" dependencies = [ "cfg-if", - "fastrand 2.0.0", + "fastrand 2.0.1", "redox_syscall 0.3.5", "rustix", "windows-sys", @@ -3171,18 +3185,18 @@ checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d" [[package]] name = "thiserror" -version = "1.0.48" +version = "1.0.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d6d7a740b8a666a7e828dd00da9c0dc290dff53154ea77ac109281de90589b7" +checksum = "1177e8c6d7ede7afde3585fd2513e611227efd6481bd78d2e82ba1ce16557ed4" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.48" +version = "1.0.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49922ecae66cc8a249b77e68d1d0623c1b2c514f0060c27cdc68bd62a1219d35" +checksum = "10712f02019e9288794769fba95cd6847df9874d49d871d062172f9dd41bc4cc" dependencies = [ "proc-macro2", "quote", @@ -3202,9 +3216,9 @@ dependencies = [ [[package]] name = "time" -version = "0.3.28" +version = "0.3.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17f6bb557fd245c28e6411aa56b6403c689ad95061f50e4be16c274e70a17e48" +checksum = "426f806f4089c493dcac0d24c29c01e2c38baf8e30f1b716ee37e83d200b18fe" dependencies = [ "deranged", "serde", @@ -3214,15 +3228,15 @@ dependencies = [ [[package]] name = "time-core" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" +checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" [[package]] name = "time-macros" -version = "0.2.14" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a942f44339478ef67935ab2bbaec2fb0322496cf3cbe84b261e06ac3814c572" +checksum = "4ad70d68dba9e1f8aceda7aa6711965dfec1cac869f311a51bd08b3a2ccbce20" dependencies = [ "time-core", ] @@ -3617,9 +3631,9 @@ dependencies = [ [[package]] name = "webpki" -version = "0.22.1" +version = "0.22.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0e74f82d49d545ad128049b7e88f6576df2da6b02e9ce565c6f533be576957e" +checksum = "07ecc0cd7cac091bf682ec5efa18b1cff79d617b84181f38b3951dbe135f607f" dependencies = [ "ring", "untrusted", @@ -3749,9 +3763,9 @@ dependencies = [ [[package]] name = "xmlparser" -version 
= "0.13.5" +version = "0.13.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d25c75bf9ea12c4040a97f829154768bbbce366287e2dc044af160cd79a13fd" +checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" [[package]] name = "xz2" From 592b10e5f3455200f3b3dddb6f79b85f1fdc9208 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Tue, 3 Oct 2023 20:28:28 +0200 Subject: [PATCH 25/32] Update datafusion/physical-plan/src/topk/mod.rs Co-authored-by: Andrew Lamb --- datafusion/physical-plan/src/topk/mod.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index 6bdfc1b8b776..c1a16e74d8c3 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -122,6 +122,8 @@ impl TopK { }) .collect::>()?; + // TODO there is potential to add special cases for single column sort fields + // to improve performance let row_converter = RowConverter::new(sort_fields)?; let scratch_rows = row_converter.empty_rows( batch_size, From 47ee1994c389ec7425d1ed19effbdb92b5d44ce9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Tue, 3 Oct 2023 20:28:46 +0200 Subject: [PATCH 26/32] Update datafusion/physical-plan/src/topk/mod.rs Co-authored-by: Andrew Lamb --- datafusion/physical-plan/src/topk/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index c1a16e74d8c3..c4bd4ec555f4 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -142,7 +142,7 @@ impl TopK { }) } - /// Insert `batch`, remembering it if any of its values are among + /// Insert `batch`, remembering if any of its values are among /// the top k seen so far. pub fn insert_batch(&mut self, batch: RecordBatch) -> Result<()> { // Updates on drop From 2c3363769a631d5c0af4af46b644ad6c8f3804e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Wed, 4 Oct 2023 11:24:27 +0200 Subject: [PATCH 27/32] Update datafusion/physical-plan/src/topk/mod.rs Co-authored-by: Andrew Lamb --- datafusion/physical-plan/src/topk/mod.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index c4bd4ec555f4..b672a86682a8 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -258,10 +258,6 @@ impl TopKMetrics { /// /// Using the `Row` format handles things such as ascending vs /// descending and nulls first vs nulls last. -/// -/// It doesn't use `BinaryHeap` in the Rust standard library because -/// it is important to check the current minimum value in the heap -/// prior to creating a new value to insert. struct TopKHeap { /// The maximum number of elemenents to store in this heap. 
k: usize, From c9121ccf2dc0aee9abf04460e30f5bce1e96a1c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Wed, 4 Oct 2023 11:28:51 +0200 Subject: [PATCH 28/32] Update datafusion/physical-plan/src/topk/mod.rs Co-authored-by: Andrew Lamb --- datafusion/physical-plan/src/topk/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index b672a86682a8..ba445b4d2348 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -67,7 +67,7 @@ use super::metrics::{BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricBuil /// input before discarding everything other than the top 3 elements. /// /// The same answer can be produced by simply keeping track of the top -/// N elements, reducing the total amount of required buffer memory. +/// K=3 elements, reducing the total amount of required buffer memory. /// /// # Structure /// From 0dc3488bdc55ae3950f4b15464488c40adca55b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Wed, 4 Oct 2023 11:29:37 +0200 Subject: [PATCH 29/32] Add / update some comments --- datafusion/physical-plan/src/topk/mod.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/datafusion/physical-plan/src/topk/mod.rs b/datafusion/physical-plan/src/topk/mod.rs index ba445b4d2348..4638c0dcf264 100644 --- a/datafusion/physical-plan/src/topk/mod.rs +++ b/datafusion/physical-plan/src/topk/mod.rs @@ -163,9 +163,8 @@ impl TopK { self.row_converter.append(rows, &sort_keys)?; // TODO make this algorithmically better?: - // 1. only check topk values in rows - // 2. only do one update through top_k - + // Idea: filter out rows >= self.heap.max() early (before passing to `RowConverter`) + // this avoids some work and also might be better vectorizable. 
let mut batch_entry = self.heap.register_batch(batch); for (index, row) in rows.iter().enumerate() { match self.heap.max() { @@ -409,9 +408,9 @@ impl TopKHeap { let max_unused_rows = (20 * self.batch_size) + self.k; let unused_rows = self.store.unused_rows(); - // don't compact if the store has only one batch or + // don't compact if the store has one extra batch or + // unused rows is under the threshold if self.store.len() <= 2 || unused_rows < max_unused_rows { - //if self.store.len() <= 2 { return Ok(()); } // at first, compact the entire thing always into a new batch From 0470306950eee9614706ea8c451d24880e2d77d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Wed, 4 Oct 2023 13:55:09 +0200 Subject: [PATCH 30/32] Rename test file --- datafusion/sqllogictest/test_files/{aal.slt => topk.slt} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename datafusion/sqllogictest/test_files/{aal.slt => topk.slt} (100%) diff --git a/datafusion/sqllogictest/test_files/aal.slt b/datafusion/sqllogictest/test_files/topk.slt similarity index 100% rename from datafusion/sqllogictest/test_files/aal.slt rename to datafusion/sqllogictest/test_files/topk.slt From 0c59fe1975f628259b7fa19ff3a3e13609fc2c34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Wed, 4 Oct 2023 13:55:49 +0200 Subject: [PATCH 31/32] Rename table as well --- datafusion/sqllogictest/test_files/topk.slt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion/sqllogictest/test_files/topk.slt b/datafusion/sqllogictest/test_files/topk.slt index 36dc0d9fdcf9..70a90e9daf59 100644 --- a/datafusion/sqllogictest/test_files/topk.slt +++ b/datafusion/sqllogictest/test_files/topk.slt @@ -18,10 +18,10 @@ # Tests for development statement ok -create table aal(x int) as values (10), (2), (3), (0), (5), (4), (3), (2), (1), (3), (8); +create table topk(x int) as values (10), (2), (3), (0), (5), (4), (3), (2), (1), (3), (8); query I -select * from aal order by x; +select * from topk order by x; ---- 0 1 @@ -36,14 +36,14 @@ select * from aal order by x; 10 query I -select * from aal order by x limit 3; +select * from topk order by x limit 3; ---- 0 1 2 query I -select * from aal order by x desc limit 3; +select * from topk order by x desc limit 3; ---- 10 8 From 6bb299bae6ac0c30c0303b4d02f127f2da4a57ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20Heres?= Date: Thu, 5 Oct 2023 08:50:46 +0200 Subject: [PATCH 32/32] Update datafusion/sqllogictest/test_files/topk.slt Co-authored-by: Andrew Lamb --- datafusion/sqllogictest/test_files/topk.slt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/test_files/topk.slt b/datafusion/sqllogictest/test_files/topk.slt index 70a90e9daf59..8d3b70139d35 100644 --- a/datafusion/sqllogictest/test_files/topk.slt +++ b/datafusion/sqllogictest/test_files/topk.slt @@ -217,7 +217,7 @@ d 1 -72 25590 1188089983 3090286296481837049 241 832 3542840110 5885937420286765 a 1 -5 12636 794623392 2909750622865366631 15 24022 2669374863 4776679784701509574 0.29877836 0.253725340799 waIGbOGl1PM6gnzZ4uuZt4E2yDWRHs -## make an example for +## make an example for dictionary encoding statement ok create table dict as select c1, c2, c3, c13, arrow_cast(c13, 'Dictionary(Int32, Utf8)') as c13_dict from aggregate_test_100;
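
A closing note for readers following the series. Stripped of the arrow plumbing, the operator these patches build reduces to a single technique: keep a bounded max-heap of the best k rows, and only pay for an insert when a candidate beats the current worst of the top k. The sketch below shows that idea over plain integers, using `std::collections::BinaryHeap` and `into_sorted_vec` as patch 22 does. It is an illustration only (the function name `top_k` and the `i32` data are invented for this note); the real operator compares arrow `Row`-encoded sort keys and stores the surviving rows in `RecordBatch`es.

```rust
use std::collections::BinaryHeap;

/// Return the `k` smallest items in ascending order
/// (i.e. `ORDER BY x LIMIT k`); descending just flips the comparison.
fn top_k<T: Ord>(items: impl IntoIterator<Item = T>, k: usize) -> Vec<T> {
    assert!(k > 0);
    // max-heap: the root is the "worst" item currently kept, so most
    // candidates are rejected after a single comparison against it
    let mut heap: BinaryHeap<T> = BinaryHeap::with_capacity(k);
    for item in items {
        if heap.len() < k {
            heap.push(item);
        } else if heap.peek().map_or(false, |max| item < *max) {
            heap.pop();
            heap.push(item);
        }
    }
    // emit low to high, mirroring `TopKHeap::emit_with_state`
    heap.into_sorted_vec()
}

fn main() {
    // the same values as the `topk` table in topk.slt above
    let data = [10, 2, 3, 0, 5, 4, 3, 2, 1, 3, 8];
    assert_eq!(top_k(data, 3), vec![0, 1, 2]);
    // descending via negation, matching the `order by x desc limit 3` test
    assert_eq!(top_k(data.map(|x| -x), 3), vec![-10, -8, -5]);
}
```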
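
One more sketch, for the part of the series that was ultimately abandoned: patches 10 through 14 prototype a dictionary-aware `interleave` that rebuilds a string dictionary so compacted output stops pinning values nobody references, and patch 19 backs it out in favor of plain `arrow::compute::interleave`. The re-keying loop at its core can be shown without arrow at all. In the sketch below, `repack`, `old_values`, and `old_keys` are hypothetical names invented for this note; the prototype operated on arrow `DictionaryArray`s with `Utf8` values, not on slices.

```rust
use std::collections::HashMap;

/// Rebuild a dictionary encoding (keys plus values table) so the new
/// values table holds only entries that are still referenced, numbered
/// in first-use order. `None` keys stand in for nulls.
fn repack(
    old_values: &[&str],
    old_keys: &[Option<usize>],
) -> (Vec<String>, Vec<Option<usize>>) {
    // maps a value to its key (index) in the new values table
    let mut new_value_to_key: HashMap<&str, usize> = HashMap::new();
    let mut new_values: Vec<String> = Vec::new();
    let mut new_keys = Vec::with_capacity(old_keys.len());

    for key in old_keys.iter().copied() {
        new_keys.push(key.map(|k| {
            let value = old_values[k];
            *new_value_to_key.entry(value).or_insert_with(|| {
                // first time this value is seen: append it, assign a key
                new_values.push(value.to_string());
                new_values.len() - 1
            })
        }));
    }
    (new_values, new_keys)
}

fn main() {
    // after a top-k selection, "b" and "d" are no longer referenced,
    // so repacking drops them from the values table
    let values = ["a", "b", "c", "d"];
    let keys = [Some(2), Some(0), None, Some(2)];
    let (new_values, new_keys) = repack(&values, &keys);
    assert_eq!(new_values, vec!["c", "a"]);
    assert_eq!(new_keys, vec![Some(0), Some(1), None, Some(0)]);
}
```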