From c670836641b5b25e17dae8c1bd6e57df8985ca7a Mon Sep 17 00:00:00 2001 From: doki Date: Wed, 28 Feb 2024 00:02:18 +0800 Subject: [PATCH 01/13] Add OffsetsBuilder --- arrow-buffer/src/buffer/offset.rs | 11 +- arrow-buffer/src/builder/mod.rs | 5 +- arrow-buffer/src/builder/offset.rs | 222 +++++++++++++++++++++++++++++ arrow-buffer/src/lib.rs | 5 + 4 files changed, 240 insertions(+), 3 deletions(-) create mode 100644 arrow-buffer/src/builder/offset.rs diff --git a/arrow-buffer/src/buffer/offset.rs b/arrow-buffer/src/buffer/offset.rs index 652d30c3b0ab..410c2c47566d 100644 --- a/arrow-buffer/src/buffer/offset.rs +++ b/arrow-buffer/src/buffer/offset.rs @@ -16,7 +16,7 @@ // under the License. use crate::buffer::ScalarBuffer; -use crate::{ArrowNativeType, MutableBuffer}; +use crate::{ArrowNativeType, MutableBuffer, OffsetsBuilder}; use std::ops::Deref; /// A non-empty buffer of monotonically increasing, positive integers. @@ -55,7 +55,6 @@ use std::ops::Deref; /// (offsets[i], /// offsets[i+1]) /// ``` - #[derive(Debug, Clone)] pub struct OffsetBuffer(ScalarBuffer); @@ -174,6 +173,14 @@ impl AsRef<[T]> for OffsetBuffer { } } +impl TryFrom for OffsetBuffer { + type Error = String; + + fn try_from(value: OffsetsBuilder) -> Result { + value.finish() + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/arrow-buffer/src/builder/mod.rs b/arrow-buffer/src/builder/mod.rs index d5d5a7d3f18d..f7e0e29dace4 100644 --- a/arrow-buffer/src/builder/mod.rs +++ b/arrow-buffer/src/builder/mod.rs @@ -18,9 +18,12 @@ //! Buffer builders mod boolean; -pub use boolean::*; mod null; +mod offset; + +pub use boolean::*; pub use null::*; +pub use offset::*; use crate::{ArrowNativeType, Buffer, MutableBuffer}; use std::{iter, marker::PhantomData}; diff --git a/arrow-buffer/src/builder/offset.rs b/arrow-buffer/src/builder/offset.rs new file mode 100644 index 000000000000..bce0ef4757d8 --- /dev/null +++ b/arrow-buffer/src/builder/offset.rs @@ -0,0 +1,222 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::{ArrowNativeType, OffsetBuffer, ScalarBuffer}; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct OffsetsBuilder { + offsets: Vec, +} + +/// builder for [`OffsetBuffer`] +impl OffsetsBuilder { + /// create a new builder containing only 1 zero offset + pub fn new(capacity: usize) -> Self { + let mut offsets = Vec::with_capacity(capacity + 1); + offsets.push(0); + Self::new_unchecked(offsets) + } + + /// create a new builder containing capacity number of zero offsets + pub fn new_zeroed(capacity: usize) -> Self { + let offsets = vec![0usize; capacity + 1]; + Self::new_unchecked(offsets) + } + + /// create from offsets + /// caller guarantees that offsets are monotonically increasing values + #[inline] + pub fn new_unchecked(offsets: Vec) -> Self { + Self { offsets } + } + + /// try to push a [`ArrowNativeType`] length into the builder. + /// it returns an Err if length cannot be safely cast to usize + pub fn try_push_length(&mut self, length: impl ArrowNativeType) -> Result<(), String> { + let usize_len = length + .to_usize() + .ok_or(format!("The length {length:?} cannot safely cast to usize").to_string())?; + self.push_usize_length(usize_len); + Ok(()) + } + + pub fn push_usize_length(&mut self, length: usize) { + let last_offset = self.offsets.last().unwrap(); + let next_offset = last_offset + length; + self.offsets.push(next_offset); + } + + /// try to extend the builder with an Iterator of [`ArrowNativeType`] lengths + /// returns an Err if any length cannot be safely cast to usize + pub fn try_extend_from_lengths( + &mut self, + lengths: impl IntoIterator, + ) -> Result<(), String> { + let usize_lengths = lengths + .into_iter() + .map(|len| { + len.to_usize() + .ok_or(format!("Cannot safely convert length {len} to usize")) + }) + .collect::, String>>()?; + self.extend_from_usize_lengths(usize_lengths); + Ok(()) + } + + /// extend with an Iterator of usize lengths + pub fn extend_from_usize_lengths(&mut self, lengths: impl IntoIterator) { + let lengths_iter = lengths.into_iter(); + let size_hint = match lengths_iter.size_hint().1 { + Some(h_bound) => h_bound, + None => lengths_iter.size_hint().0, + }; + self.offsets.reserve(size_hint); + lengths_iter.for_each(|length| self.push_usize_length(length)) + } + + /// extend from another OffsetsBuilder + /// it get a lengths iterator from another builder and extend the builder with the iter + pub fn extend_from_builder(&mut self, offsets_builder: OffsetsBuilder) { + let lengths = offsets_builder.lengths(); + self.extend_from_usize_lengths(lengths); + } + + /// takes the builder itself and returns an [`OffsetBuffer`] + pub fn finish(self) -> Result, String> { + let scala_buf = ScalarBuffer::from( + self.offsets + .into_iter() + .map(|offset| { + O::from_usize(offset).ok_or("Cannot safely convert offset to usize".to_string()) + }) + .collect::, String>>()?, + ); + Ok(OffsetBuffer::new(scala_buf)) + } + + pub fn capacity(&self) -> usize { + self.offsets.capacity() + } + + pub fn len(&self) -> usize { + self.offsets.len() + } + + pub fn last(&self) -> usize { + *self.offsets.last().unwrap() + } + + /// get an iterator of lengths from builder's underlying offsets + pub fn lengths(&self) -> impl IntoIterator { + let mut lengths = self + .offsets + .iter() + .scan(0usize, |prev, offset| { + let ret = Some(*offset - *prev); + *prev = *offset; + ret + }) + .collect::>(); + lengths.remove(0); + lengths + } +} + +impl> From for OffsetsBuilder { + fn from(value: IntoIter) -> Self { + let mut offsets_vec: Vec = value + .into_iter() + .scan(0usize, |prev, len| { + *prev = len + *prev; + Some(*prev) + }) + .collect(); + offsets_vec.insert(0, 0); + OffsetsBuilder::new_unchecked(offsets_vec) + } +} + +#[cfg(test)] +mod tests { + use crate::{OffsetBuffer, OffsetsBuilder}; + + #[test] + fn new_zeroed() -> Result<(), String> { + let builder = OffsetsBuilder::new_zeroed(3); + assert_eq!(builder.finish::()?.to_vec(), vec![0, 0, 0, 0]); + Ok(()) + } + + #[test] + fn test_from() -> Result<(), String> { + let lengths = vec![1, 2, 3, 0, 3, 2, 1]; + let builder: OffsetsBuilder = lengths.into(); + + assert_eq!(builder.last(), 12); + assert_eq!(builder.len(), 8); + + let offsets = builder.finish::()?; + assert_eq!(offsets.to_vec(), vec![0, 1, 3, 6, 6, 9, 11, 12]); + Ok(()) + } + + #[test] + fn test_push() -> Result<(), String> { + let lengths = vec![1, 2, 3, 0, 3, 2, 1]; + let mut builder: OffsetsBuilder = lengths.into(); + builder.push_usize_length(1); + builder.try_push_length(2)?; + builder.try_push_length(0i16)?; + let offsets: OffsetBuffer = builder.try_into()?; + let expect_offsets = vec![0, 1, 3, 6, 6, 9, 11, 12, 13, 15, 15]; + assert_eq!(offsets.to_vec(), expect_offsets); + Ok(()) + } + + #[test] + fn test_try_push_unexpect() -> Result<(), String> { + let lengths = vec![1, 2, 3]; + let mut builder: OffsetsBuilder = lengths.into(); + match builder.try_push_length(-1i32) { + Err(err) => { + assert_eq!("The length -1 cannot safely cast to usize", err); + Ok(()) + } + Ok(_) => return Err("builder.finish should return Err".to_string()), + } + } + + #[test] + fn test_extend() -> Result<(), String> { + let lengths = vec![1, 2, 3]; + let mut builder: OffsetsBuilder = lengths.into(); + + let extend_lengths = vec![4, 4]; + builder.extend_from_usize_lengths(extend_lengths); + + let extend_i32_lengths: Vec = vec![5, 5]; + builder.try_extend_from_lengths(extend_i32_lengths)?; + + let another_builder = vec![1, 2, 3].into(); + builder.extend_from_builder(another_builder); + + let offsets: OffsetBuffer = builder.finish()?; + let expect_offsets = vec![0, 1, 3, 6, 10, 14, 19, 24, 25, 27, 30]; + assert_eq!(offsets.to_vec(), expect_offsets); + Ok(()) + } +} diff --git a/arrow-buffer/src/lib.rs b/arrow-buffer/src/lib.rs index 612897af9bed..8dbee1c36567 100644 --- a/arrow-buffer/src/lib.rs +++ b/arrow-buffer/src/lib.rs @@ -22,16 +22,21 @@ pub mod alloc; pub mod buffer; + pub use buffer::*; pub mod builder; + pub use builder::*; mod bigint; mod bytes; mod native; + pub use bigint::i256; pub use native::*; + mod util; + pub use util::*; From d365746a340d495bf47c6b34357cdf997d09e8d1 Mon Sep 17 00:00:00 2001 From: doki Date: Wed, 28 Feb 2024 00:35:07 +0800 Subject: [PATCH 02/13] clippy & fmt --- arrow-buffer/src/builder/offset.rs | 3 ++- arrow-buffer/src/lib.rs | 15 ++++----------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/arrow-buffer/src/builder/offset.rs b/arrow-buffer/src/builder/offset.rs index bce0ef4757d8..175693b34829 100644 --- a/arrow-buffer/src/builder/offset.rs +++ b/arrow-buffer/src/builder/offset.rs @@ -112,6 +112,7 @@ impl OffsetsBuilder { self.offsets.capacity() } + #[allow(clippy::len_without_is_empty)] pub fn len(&self) -> usize { self.offsets.len() } @@ -141,7 +142,7 @@ impl> From for OffsetsBuilder { let mut offsets_vec: Vec = value .into_iter() .scan(0usize, |prev, len| { - *prev = len + *prev; + *prev += len; Some(*prev) }) .collect(); diff --git a/arrow-buffer/src/lib.rs b/arrow-buffer/src/lib.rs index 8dbee1c36567..e0f0d4046391 100644 --- a/arrow-buffer/src/lib.rs +++ b/arrow-buffer/src/lib.rs @@ -21,22 +21,15 @@ #![cfg_attr(miri, feature(strict_provenance))] pub mod alloc; +mod bigint; pub mod buffer; - -pub use buffer::*; - pub mod builder; - -pub use builder::*; - -mod bigint; mod bytes; mod native; +mod util; pub use bigint::i256; - +pub use buffer::*; +pub use builder::*; pub use native::*; - -mod util; - pub use util::*; From 5f4c246d59cf438f7d5b0031c2adb60fef8bb5d5 Mon Sep 17 00:00:00 2001 From: doki Date: Sun, 3 Mar 2024 10:56:21 +0800 Subject: [PATCH 03/13] change underlying data type from usize to O: ArrowNativeType --- arrow-buffer/src/buffer/offset.rs | 10 +- arrow-buffer/src/builder/offset.rs | 145 +++++++++++++++-------------- 2 files changed, 78 insertions(+), 77 deletions(-) diff --git a/arrow-buffer/src/buffer/offset.rs b/arrow-buffer/src/buffer/offset.rs index 410c2c47566d..38fd262edb1c 100644 --- a/arrow-buffer/src/buffer/offset.rs +++ b/arrow-buffer/src/buffer/offset.rs @@ -17,7 +17,7 @@ use crate::buffer::ScalarBuffer; use crate::{ArrowNativeType, MutableBuffer, OffsetsBuilder}; -use std::ops::Deref; +use std::ops::{Add, Deref, Sub}; /// A non-empty buffer of monotonically increasing, positive integers. /// @@ -173,10 +173,10 @@ impl AsRef<[T]> for OffsetBuffer { } } -impl TryFrom for OffsetBuffer { - type Error = String; - - fn try_from(value: OffsetsBuilder) -> Result { +impl + Sub> From> + for OffsetBuffer +{ + fn from(value: OffsetsBuilder) -> Self { value.finish() } } diff --git a/arrow-buffer/src/builder/offset.rs b/arrow-buffer/src/builder/offset.rs index 175693b34829..6e632c34e3a0 100644 --- a/arrow-buffer/src/builder/offset.rs +++ b/arrow-buffer/src/builder/offset.rs @@ -15,97 +15,92 @@ // specific language governing permissions and limitations // under the License. +use std::ops::{Add, Sub}; + use crate::{ArrowNativeType, OffsetBuffer, ScalarBuffer}; #[derive(Debug, Clone, PartialEq, Eq)] -pub struct OffsetsBuilder { - offsets: Vec, +pub struct OffsetsBuilder { + offsets: Vec, } /// builder for [`OffsetBuffer`] -impl OffsetsBuilder { +impl + Sub> OffsetsBuilder { /// create a new builder containing only 1 zero offset pub fn new(capacity: usize) -> Self { let mut offsets = Vec::with_capacity(capacity + 1); - offsets.push(0); + offsets.push(O::usize_as(0)); Self::new_unchecked(offsets) } /// create a new builder containing capacity number of zero offsets pub fn new_zeroed(capacity: usize) -> Self { - let offsets = vec![0usize; capacity + 1]; + let offsets = vec![O::usize_as(0); capacity + 1]; Self::new_unchecked(offsets) } /// create from offsets /// caller guarantees that offsets are monotonically increasing values #[inline] - pub fn new_unchecked(offsets: Vec) -> Self { + pub fn new_unchecked(offsets: Vec) -> Self { Self { offsets } } - /// try to push a [`ArrowNativeType`] length into the builder. - /// it returns an Err if length cannot be safely cast to usize - pub fn try_push_length(&mut self, length: impl ArrowNativeType) -> Result<(), String> { - let usize_len = length - .to_usize() - .ok_or(format!("The length {length:?} cannot safely cast to usize").to_string())?; - self.push_usize_length(usize_len); - Ok(()) - } - - pub fn push_usize_length(&mut self, length: usize) { + /// push a length into the builder. + #[inline] + pub fn push_length(&mut self, length: O) { let last_offset = self.offsets.last().unwrap(); - let next_offset = last_offset + length; + let next_offset = *last_offset + length; self.offsets.push(next_offset); } - /// try to extend the builder with an Iterator of [`ArrowNativeType`] lengths - /// returns an Err if any length cannot be safely cast to usize - pub fn try_extend_from_lengths( - &mut self, - lengths: impl IntoIterator, - ) -> Result<(), String> { - let usize_lengths = lengths - .into_iter() - .map(|len| { - len.to_usize() - .ok_or(format!("Cannot safely convert length {len} to usize")) - }) - .collect::, String>>()?; - self.extend_from_usize_lengths(usize_lengths); + #[inline] + pub fn try_push_usize_length(&mut self, length: usize) -> Result<(), String> { + self.push_length(O::from_usize(length).ok_or(format!( + "Cannot safely convert usize length {length} to offset" + ))?); Ok(()) } - /// extend with an Iterator of usize lengths - pub fn extend_from_usize_lengths(&mut self, lengths: impl IntoIterator) { + /// extend the builder with an Iterator of lengths + pub fn extend_from_lengths(&mut self, lengths: impl IntoIterator) { let lengths_iter = lengths.into_iter(); let size_hint = match lengths_iter.size_hint().1 { Some(h_bound) => h_bound, None => lengths_iter.size_hint().0, }; self.offsets.reserve(size_hint); - lengths_iter.for_each(|length| self.push_usize_length(length)) + lengths_iter.for_each(|length| self.push_length(length)); + } + + /// extend with an Iterator of usize lengths + pub fn try_extend_from_usize_lengths( + &mut self, + lengths: impl IntoIterator, + ) -> Result<(), String> { + self.extend_from_lengths( + lengths + .into_iter() + .map(|u_len| { + O::from_usize(u_len).ok_or(format!( + "Cannot safely convert usize length {u_len} to offset" + )) + }) + .collect::, String>>()?, + ); + Ok(()) } /// extend from another OffsetsBuilder /// it get a lengths iterator from another builder and extend the builder with the iter - pub fn extend_from_builder(&mut self, offsets_builder: OffsetsBuilder) { + pub fn extend_from_builder(&mut self, offsets_builder: OffsetsBuilder) { let lengths = offsets_builder.lengths(); - self.extend_from_usize_lengths(lengths); + self.extend_from_lengths(lengths); } /// takes the builder itself and returns an [`OffsetBuffer`] - pub fn finish(self) -> Result, String> { - let scala_buf = ScalarBuffer::from( - self.offsets - .into_iter() - .map(|offset| { - O::from_usize(offset).ok_or("Cannot safely convert offset to usize".to_string()) - }) - .collect::, String>>()?, - ); - Ok(OffsetBuffer::new(scala_buf)) + pub fn finish(self) -> OffsetBuffer { + OffsetBuffer::new(ScalarBuffer::from(self.offsets)) } pub fn capacity(&self) -> usize { @@ -117,36 +112,38 @@ impl OffsetsBuilder { self.offsets.len() } - pub fn last(&self) -> usize { + pub fn last(&self) -> O { *self.offsets.last().unwrap() } /// get an iterator of lengths from builder's underlying offsets - pub fn lengths(&self) -> impl IntoIterator { + pub fn lengths(&self) -> impl IntoIterator { let mut lengths = self .offsets .iter() - .scan(0usize, |prev, offset| { + .scan(O::usize_as(0), |prev, offset| { let ret = Some(*offset - *prev); *prev = *offset; ret }) - .collect::>(); + .collect::>(); lengths.remove(0); lengths } } -impl> From for OffsetsBuilder { +impl, O: ArrowNativeType + Add + Sub> + From for OffsetsBuilder +{ fn from(value: IntoIter) -> Self { - let mut offsets_vec: Vec = value + let mut offsets_vec: Vec = value .into_iter() - .scan(0usize, |prev, len| { - *prev += len; + .scan(O::usize_as(0), |prev, len| { + *prev = *prev + len; Some(*prev) }) .collect(); - offsets_vec.insert(0, 0); + offsets_vec.insert(0, O::usize_as(0)); OffsetsBuilder::new_unchecked(offsets_vec) } } @@ -157,20 +154,20 @@ mod tests { #[test] fn new_zeroed() -> Result<(), String> { - let builder = OffsetsBuilder::new_zeroed(3); - assert_eq!(builder.finish::()?.to_vec(), vec![0, 0, 0, 0]); + let builder: OffsetsBuilder = OffsetsBuilder::new_zeroed(3); + assert_eq!(builder.finish().to_vec(), vec![0, 0, 0, 0]); Ok(()) } #[test] fn test_from() -> Result<(), String> { let lengths = vec![1, 2, 3, 0, 3, 2, 1]; - let builder: OffsetsBuilder = lengths.into(); + let builder: OffsetsBuilder = lengths.into(); assert_eq!(builder.last(), 12); assert_eq!(builder.len(), 8); - let offsets = builder.finish::()?; + let offsets = builder.finish(); assert_eq!(offsets.to_vec(), vec![0, 1, 3, 6, 6, 9, 11, 12]); Ok(()) } @@ -178,11 +175,11 @@ mod tests { #[test] fn test_push() -> Result<(), String> { let lengths = vec![1, 2, 3, 0, 3, 2, 1]; - let mut builder: OffsetsBuilder = lengths.into(); - builder.push_usize_length(1); - builder.try_push_length(2)?; - builder.try_push_length(0i16)?; - let offsets: OffsetBuffer = builder.try_into()?; + let mut builder: OffsetsBuilder = lengths.into(); + builder.try_push_usize_length(1)?; + builder.push_length(2); + builder.push_length(0); + let offsets: OffsetBuffer = builder.into(); let expect_offsets = vec![0, 1, 3, 6, 6, 9, 11, 12, 13, 15, 15]; assert_eq!(offsets.to_vec(), expect_offsets); Ok(()) @@ -191,10 +188,14 @@ mod tests { #[test] fn test_try_push_unexpect() -> Result<(), String> { let lengths = vec![1, 2, 3]; - let mut builder: OffsetsBuilder = lengths.into(); - match builder.try_push_length(-1i32) { + let mut builder: OffsetsBuilder = lengths.into(); + let len = 1 << 20; + match builder.try_push_usize_length(1 << 20) { Err(err) => { - assert_eq!("The length -1 cannot safely cast to usize", err); + assert_eq!( + format!("Cannot safely convert usize length {len} to offset"), + err + ); Ok(()) } Ok(_) => return Err("builder.finish should return Err".to_string()), @@ -204,18 +205,18 @@ mod tests { #[test] fn test_extend() -> Result<(), String> { let lengths = vec![1, 2, 3]; - let mut builder: OffsetsBuilder = lengths.into(); + let mut builder: OffsetsBuilder = lengths.into(); let extend_lengths = vec![4, 4]; - builder.extend_from_usize_lengths(extend_lengths); + builder.try_extend_from_usize_lengths(extend_lengths)?; let extend_i32_lengths: Vec = vec![5, 5]; - builder.try_extend_from_lengths(extend_i32_lengths)?; + builder.extend_from_lengths(extend_i32_lengths); let another_builder = vec![1, 2, 3].into(); builder.extend_from_builder(another_builder); - let offsets: OffsetBuffer = builder.finish()?; + let offsets = builder.finish(); let expect_offsets = vec![0, 1, 3, 6, 10, 14, 19, 24, 25, 27, 30]; assert_eq!(offsets.to_vec(), expect_offsets); Ok(()) From b3a4353e8184bee2f097ab8e2c1fd4c6294f24e5 Mon Sep 17 00:00:00 2001 From: doki Date: Mon, 4 Mar 2024 12:01:57 +0800 Subject: [PATCH 04/13] add some mem management apis --- arrow-buffer/src/builder/offset.rs | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/arrow-buffer/src/builder/offset.rs b/arrow-buffer/src/builder/offset.rs index 6e632c34e3a0..6352518f1b4e 100644 --- a/arrow-buffer/src/builder/offset.rs +++ b/arrow-buffer/src/builder/offset.rs @@ -54,6 +54,7 @@ impl + Sub> OffsetsBuilder { self.offsets.push(next_offset); } + /// try to safely push a length of usize type into builder #[inline] pub fn try_push_usize_length(&mut self, length: usize) -> Result<(), String> { self.push_length(O::from_usize(length).ok_or(format!( @@ -69,7 +70,7 @@ impl + Sub> OffsetsBuilder { Some(h_bound) => h_bound, None => lengths_iter.size_hint().0, }; - self.offsets.reserve(size_hint); + self.reserve(size_hint); lengths_iter.for_each(|length| self.push_length(length)); } @@ -104,7 +105,7 @@ impl + Sub> OffsetsBuilder { } pub fn capacity(&self) -> usize { - self.offsets.capacity() + self.offsets.capacity() - 1 } #[allow(clippy::len_without_is_empty)] @@ -112,10 +113,23 @@ impl + Sub> OffsetsBuilder { self.offsets.len() } + /// Last offset pub fn last(&self) -> O { *self.offsets.last().unwrap() } + pub fn reserve(&mut self, additional: usize) { + self.offsets.reserve(additional); + } + + pub fn reserve_exact(&mut self, additional: usize) { + self.offsets.reserve_exact(additional); + } + + pub fn shrink_to_fit(&mut self) { + self.offsets.shrink_to_fit(); + } + /// get an iterator of lengths from builder's underlying offsets pub fn lengths(&self) -> impl IntoIterator { let mut lengths = self From 33c84af10d4b8d6329f09b68c7160781f03752d0 Mon Sep 17 00:00:00 2001 From: doki Date: Mon, 4 Mar 2024 12:02:57 +0800 Subject: [PATCH 05/13] clippy --- arrow-buffer/src/builder/offset.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-buffer/src/builder/offset.rs b/arrow-buffer/src/builder/offset.rs index 6352518f1b4e..285506802916 100644 --- a/arrow-buffer/src/builder/offset.rs +++ b/arrow-buffer/src/builder/offset.rs @@ -212,7 +212,7 @@ mod tests { ); Ok(()) } - Ok(_) => return Err("builder.finish should return Err".to_string()), + Ok(_) => Err("builder.finish should return Err".to_string()), } } From 9a8c3cb63f7b9702e43310fb38000e64124f20c3 Mon Sep 17 00:00:00 2001 From: doki Date: Thu, 14 Mar 2024 11:36:22 +0800 Subject: [PATCH 06/13] optimize finish and remove lengths method --- arrow-buffer/src/buffer/offset.rs | 6 +- arrow-buffer/src/builder/offset.rs | 134 ++++++++++++----------------- 2 files changed, 57 insertions(+), 83 deletions(-) diff --git a/arrow-buffer/src/buffer/offset.rs b/arrow-buffer/src/buffer/offset.rs index 38fd262edb1c..d7f66e3a6aff 100644 --- a/arrow-buffer/src/buffer/offset.rs +++ b/arrow-buffer/src/buffer/offset.rs @@ -16,7 +16,7 @@ // under the License. use crate::buffer::ScalarBuffer; -use crate::{ArrowNativeType, MutableBuffer, OffsetsBuilder}; +use crate::{ArrowNativeType, MutableBuffer, OffsetBufferBuilder}; use std::ops::{Add, Deref, Sub}; /// A non-empty buffer of monotonically increasing, positive integers. @@ -173,10 +173,10 @@ impl AsRef<[T]> for OffsetBuffer { } } -impl + Sub> From> +impl + Sub> From> for OffsetBuffer { - fn from(value: OffsetsBuilder) -> Self { + fn from(value: OffsetBufferBuilder) -> Self { value.finish() } } diff --git a/arrow-buffer/src/builder/offset.rs b/arrow-buffer/src/builder/offset.rs index 285506802916..0660d23c349f 100644 --- a/arrow-buffer/src/builder/offset.rs +++ b/arrow-buffer/src/builder/offset.rs @@ -17,97 +17,86 @@ use std::ops::{Add, Sub}; -use crate::{ArrowNativeType, OffsetBuffer, ScalarBuffer}; +use crate::{ArrowNativeType, OffsetBuffer}; #[derive(Debug, Clone, PartialEq, Eq)] -pub struct OffsetsBuilder { +pub struct OffsetBufferBuilder { offsets: Vec, } -/// builder for [`OffsetBuffer`] -impl + Sub> OffsetsBuilder { - /// create a new builder containing only 1 zero offset +/// Builder of [`OffsetBuffer`] +impl + Sub> OffsetBufferBuilder { + /// Create a new builder containing only 1 zero offset. pub fn new(capacity: usize) -> Self { let mut offsets = Vec::with_capacity(capacity + 1); offsets.push(O::usize_as(0)); - Self::new_unchecked(offsets) + unsafe { Self::new_unchecked(offsets) } } - /// create a new builder containing capacity number of zero offsets + /// Create a new builder containing capacity number of zero offsets. pub fn new_zeroed(capacity: usize) -> Self { let offsets = vec![O::usize_as(0); capacity + 1]; - Self::new_unchecked(offsets) + unsafe { Self::new_unchecked(offsets) } } - /// create from offsets - /// caller guarantees that offsets are monotonically increasing values + /// # Safety + /// Create from offsets. + /// Caller guarantees that offsets are monotonically increasing values. #[inline] - pub fn new_unchecked(offsets: Vec) -> Self { + pub unsafe fn new_unchecked(offsets: Vec) -> Self { Self { offsets } } - /// push a length into the builder. + /// Try to safely push a length of usize type into builder #[inline] - pub fn push_length(&mut self, length: O) { + pub fn try_push_length(&mut self, length: usize) -> Result<(), String> { let last_offset = self.offsets.last().unwrap(); - let next_offset = *last_offset + length; + let next_offset = *last_offset + O::from_usize(length).expect("offset overflow"); self.offsets.push(next_offset); - } - - /// try to safely push a length of usize type into builder - #[inline] - pub fn try_push_usize_length(&mut self, length: usize) -> Result<(), String> { - self.push_length(O::from_usize(length).ok_or(format!( - "Cannot safely convert usize length {length} to offset" - ))?); Ok(()) } - /// extend the builder with an Iterator of lengths - pub fn extend_from_lengths(&mut self, lengths: impl IntoIterator) { + /// Extend the builder with an Iterator of lengths + pub fn try_extend_from_lengths( + &mut self, + lengths: impl IntoIterator, + ) -> Result<(), String> { let lengths_iter = lengths.into_iter(); let size_hint = match lengths_iter.size_hint().1 { Some(h_bound) => h_bound, None => lengths_iter.size_hint().0, }; self.reserve(size_hint); - lengths_iter.for_each(|length| self.push_length(length)); - } - - /// extend with an Iterator of usize lengths - pub fn try_extend_from_usize_lengths( - &mut self, - lengths: impl IntoIterator, - ) -> Result<(), String> { - self.extend_from_lengths( - lengths - .into_iter() - .map(|u_len| { - O::from_usize(u_len).ok_or(format!( - "Cannot safely convert usize length {u_len} to offset" - )) - }) - .collect::, String>>()?, - ); + for len in lengths_iter { + self.try_push_length(len)?; + } Ok(()) } - /// extend from another OffsetsBuilder - /// it get a lengths iterator from another builder and extend the builder with the iter - pub fn extend_from_builder(&mut self, offsets_builder: OffsetsBuilder) { - let lengths = offsets_builder.lengths(); - self.extend_from_lengths(lengths); + /// Extend from another OffsetsBuilder. + /// It gets a lengths iterator from another builder and extend the builder with the iter. + pub fn extend_from_builder(&mut self, another: OffsetBufferBuilder) { + self.reserve(another.len() - 1); + let mut last_offset = another.offsets[0]; + for i in 1..another.offsets.len() { + let cur_offset = another.offsets[i]; + let len = cur_offset - last_offset; + self.offsets.push(self.last() + len); + last_offset = cur_offset; + } } - /// takes the builder itself and returns an [`OffsetBuffer`] + /// Takes the builder itself and returns an [`OffsetBuffer`] pub fn finish(self) -> OffsetBuffer { - OffsetBuffer::new(ScalarBuffer::from(self.offsets)) + unsafe { OffsetBuffer::new_unchecked(self.offsets.into()) } } + /// Capacity of offsets pub fn capacity(&self) -> usize { - self.offsets.capacity() - 1 + self.offsets.capacity() } + /// Length of the Offsets #[allow(clippy::len_without_is_empty)] pub fn len(&self) -> usize { self.offsets.len() @@ -129,25 +118,10 @@ impl + Sub> OffsetsBuilder { pub fn shrink_to_fit(&mut self) { self.offsets.shrink_to_fit(); } - - /// get an iterator of lengths from builder's underlying offsets - pub fn lengths(&self) -> impl IntoIterator { - let mut lengths = self - .offsets - .iter() - .scan(O::usize_as(0), |prev, offset| { - let ret = Some(*offset - *prev); - *prev = *offset; - ret - }) - .collect::>(); - lengths.remove(0); - lengths - } } impl, O: ArrowNativeType + Add + Sub> - From for OffsetsBuilder + From for OffsetBufferBuilder { fn from(value: IntoIter) -> Self { let mut offsets_vec: Vec = value @@ -158,17 +132,17 @@ impl, O: ArrowNativeType + Add + Su }) .collect(); offsets_vec.insert(0, O::usize_as(0)); - OffsetsBuilder::new_unchecked(offsets_vec) + unsafe { OffsetBufferBuilder::new_unchecked(offsets_vec) } } } #[cfg(test)] mod tests { - use crate::{OffsetBuffer, OffsetsBuilder}; + use crate::{OffsetBuffer, OffsetBufferBuilder}; #[test] fn new_zeroed() -> Result<(), String> { - let builder: OffsetsBuilder = OffsetsBuilder::new_zeroed(3); + let builder: OffsetBufferBuilder = OffsetBufferBuilder::new_zeroed(3); assert_eq!(builder.finish().to_vec(), vec![0, 0, 0, 0]); Ok(()) } @@ -176,7 +150,7 @@ mod tests { #[test] fn test_from() -> Result<(), String> { let lengths = vec![1, 2, 3, 0, 3, 2, 1]; - let builder: OffsetsBuilder = lengths.into(); + let builder: OffsetBufferBuilder = lengths.into(); assert_eq!(builder.last(), 12); assert_eq!(builder.len(), 8); @@ -189,10 +163,10 @@ mod tests { #[test] fn test_push() -> Result<(), String> { let lengths = vec![1, 2, 3, 0, 3, 2, 1]; - let mut builder: OffsetsBuilder = lengths.into(); - builder.try_push_usize_length(1)?; - builder.push_length(2); - builder.push_length(0); + let mut builder: OffsetBufferBuilder = lengths.into(); + builder.try_push_length(1usize)?; + builder.try_push_length(2usize)?; + builder.try_push_length(0usize)?; let offsets: OffsetBuffer = builder.into(); let expect_offsets = vec![0, 1, 3, 6, 6, 9, 11, 12, 13, 15, 15]; assert_eq!(offsets.to_vec(), expect_offsets); @@ -202,9 +176,9 @@ mod tests { #[test] fn test_try_push_unexpect() -> Result<(), String> { let lengths = vec![1, 2, 3]; - let mut builder: OffsetsBuilder = lengths.into(); + let mut builder: OffsetBufferBuilder = lengths.into(); let len = 1 << 20; - match builder.try_push_usize_length(1 << 20) { + match builder.try_push_length(1 << 20) { Err(err) => { assert_eq!( format!("Cannot safely convert usize length {len} to offset"), @@ -219,13 +193,13 @@ mod tests { #[test] fn test_extend() -> Result<(), String> { let lengths = vec![1, 2, 3]; - let mut builder: OffsetsBuilder = lengths.into(); + let mut builder: OffsetBufferBuilder = lengths.into(); let extend_lengths = vec![4, 4]; - builder.try_extend_from_usize_lengths(extend_lengths)?; + builder.try_extend_from_lengths(extend_lengths)?; - let extend_i32_lengths: Vec = vec![5, 5]; - builder.extend_from_lengths(extend_i32_lengths); + let extend_i32_lengths: Vec = vec![5, 5]; + builder.try_extend_from_lengths(extend_i32_lengths)?; let another_builder = vec![1, 2, 3].into(); builder.extend_from_builder(another_builder); From 3b61d88d0c30511991b7d1c4434a1e1254ba01a3 Mon Sep 17 00:00:00 2001 From: doki Date: Thu, 14 Mar 2024 11:57:49 +0800 Subject: [PATCH 07/13] impl FromIterator & Extend --- arrow-buffer/src/builder/offset.rs | 74 ++++++++++++++++-------------- 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/arrow-buffer/src/builder/offset.rs b/arrow-buffer/src/builder/offset.rs index 0660d23c349f..653a5269cf9f 100644 --- a/arrow-buffer/src/builder/offset.rs +++ b/arrow-buffer/src/builder/offset.rs @@ -39,8 +39,8 @@ impl + Sub> OffsetBufferBuilder unsafe { Self::new_unchecked(offsets) } } - /// # Safety /// Create from offsets. + /// # Safety /// Caller guarantees that offsets are monotonically increasing values. #[inline] pub unsafe fn new_unchecked(offsets: Vec) -> Self { @@ -50,38 +50,26 @@ impl + Sub> OffsetBufferBuilder /// Try to safely push a length of usize type into builder #[inline] pub fn try_push_length(&mut self, length: usize) -> Result<(), String> { - let last_offset = self.offsets.last().unwrap(); - let next_offset = *last_offset + O::from_usize(length).expect("offset overflow"); - self.offsets.push(next_offset); + self.push_length(O::from_usize(length).ok_or("offset overflow".to_string())?); Ok(()) } - /// Extend the builder with an Iterator of lengths - pub fn try_extend_from_lengths( - &mut self, - lengths: impl IntoIterator, - ) -> Result<(), String> { - let lengths_iter = lengths.into_iter(); - let size_hint = match lengths_iter.size_hint().1 { - Some(h_bound) => h_bound, - None => lengths_iter.size_hint().0, - }; - self.reserve(size_hint); - for len in lengths_iter { - self.try_push_length(len)?; - } - Ok(()) + /// Push a length of usize type into builder + pub fn push_length(&mut self, length: O) { + let last_offset = self.offsets.last().unwrap(); + let next_offset = *last_offset + length; + self.offsets.push(next_offset); } /// Extend from another OffsetsBuilder. /// It gets a lengths iterator from another builder and extend the builder with the iter. - pub fn extend_from_builder(&mut self, another: OffsetBufferBuilder) { + pub fn extend_with_builder(&mut self, another: OffsetBufferBuilder) { self.reserve(another.len() - 1); let mut last_offset = another.offsets[0]; for i in 1..another.offsets.len() { let cur_offset = another.offsets[i]; let len = cur_offset - last_offset; - self.offsets.push(self.last() + len); + self.push_length(len); last_offset = cur_offset; } } @@ -120,11 +108,12 @@ impl + Sub> OffsetBufferBuilder } } +/// Convert an [`IntoIterator`] of lengths to [`OffsetBufferBuilder`] impl, O: ArrowNativeType + Add + Sub> From for OffsetBufferBuilder { - fn from(value: IntoIter) -> Self { - let mut offsets_vec: Vec = value + fn from(lengths: IntoIter) -> Self { + let mut offsets_vec: Vec = lengths .into_iter() .scan(O::usize_as(0), |prev, len| { *prev = *prev + len; @@ -136,6 +125,29 @@ impl, O: ArrowNativeType + Add + Su } } +/// Convert an [`FromIterator`] of lengths to [`OffsetBufferBuilder`] +impl + Sub> FromIterator + for OffsetBufferBuilder +{ + fn from_iter>(lengths: T) -> Self { + lengths.into() + } +} + +impl + Sub> Extend for OffsetBufferBuilder { + fn extend>(&mut self, lengths: T) { + let lengths_iter = lengths.into_iter(); + let size_hint = match lengths_iter.size_hint().1 { + Some(h_bound) => h_bound, + None => lengths_iter.size_hint().0, + }; + self.reserve(size_hint); + for len in lengths_iter { + self.push_length(len); + } + } +} + #[cfg(test)] mod tests { use crate::{OffsetBuffer, OffsetBufferBuilder}; @@ -178,12 +190,9 @@ mod tests { let lengths = vec![1, 2, 3]; let mut builder: OffsetBufferBuilder = lengths.into(); let len = 1 << 20; - match builder.try_push_length(1 << 20) { + match builder.try_push_length(len) { Err(err) => { - assert_eq!( - format!("Cannot safely convert usize length {len} to offset"), - err - ); + assert_eq!(format!("offset overflow"), err); Ok(()) } Ok(_) => Err("builder.finish should return Err".to_string()), @@ -195,14 +204,11 @@ mod tests { let lengths = vec![1, 2, 3]; let mut builder: OffsetBufferBuilder = lengths.into(); - let extend_lengths = vec![4, 4]; - builder.try_extend_from_lengths(extend_lengths)?; - - let extend_i32_lengths: Vec = vec![5, 5]; - builder.try_extend_from_lengths(extend_i32_lengths)?; + let extend_lengths = vec![4, 4, 5, 5]; + builder.extend(extend_lengths); let another_builder = vec![1, 2, 3].into(); - builder.extend_from_builder(another_builder); + builder.extend_with_builder(another_builder); let offsets = builder.finish(); let expect_offsets = vec![0, 1, 3, 6, 10, 14, 19, 24, 25, 27, 30]; From 2bf238b6c0cfcbcd01a922508725fb9119b955b9 Mon Sep 17 00:00:00 2001 From: doki Date: Fri, 15 Mar 2024 08:17:16 +0800 Subject: [PATCH 08/13] update --- arrow-buffer/src/builder/offset.rs | 40 +++++++++++------------------- 1 file changed, 15 insertions(+), 25 deletions(-) diff --git a/arrow-buffer/src/builder/offset.rs b/arrow-buffer/src/builder/offset.rs index 653a5269cf9f..81138e94afb6 100644 --- a/arrow-buffer/src/builder/offset.rs +++ b/arrow-buffer/src/builder/offset.rs @@ -50,30 +50,23 @@ impl + Sub> OffsetBufferBuilder /// Try to safely push a length of usize type into builder #[inline] pub fn try_push_length(&mut self, length: usize) -> Result<(), String> { - self.push_length(O::from_usize(length).ok_or("offset overflow".to_string())?); + let last_offset = self.offsets.last().unwrap(); + let next_offset = + *last_offset + O::from_usize(length).ok_or("offset overflow".to_string())?; + self.offsets.push(next_offset); Ok(()) } - /// Push a length of usize type into builder - pub fn push_length(&mut self, length: O) { + /// Push a length of usize type without overflow checking. + /// # Safety + /// This doesn't check offset overflow, the caller must ensure it. + #[inline] + pub unsafe fn push_length_unchecked(&mut self, length: usize) { let last_offset = self.offsets.last().unwrap(); - let next_offset = *last_offset + length; + let next_offset = *last_offset + O::usize_as(length); self.offsets.push(next_offset); } - /// Extend from another OffsetsBuilder. - /// It gets a lengths iterator from another builder and extend the builder with the iter. - pub fn extend_with_builder(&mut self, another: OffsetBufferBuilder) { - self.reserve(another.len() - 1); - let mut last_offset = another.offsets[0]; - for i in 1..another.offsets.len() { - let cur_offset = another.offsets[i]; - let len = cur_offset - last_offset; - self.push_length(len); - last_offset = cur_offset; - } - } - /// Takes the builder itself and returns an [`OffsetBuffer`] pub fn finish(self) -> OffsetBuffer { unsafe { OffsetBuffer::new_unchecked(self.offsets.into()) } @@ -81,7 +74,7 @@ impl + Sub> OffsetBufferBuilder /// Capacity of offsets pub fn capacity(&self) -> usize { - self.offsets.capacity() + self.offsets.capacity() - 1 } /// Length of the Offsets @@ -142,9 +135,9 @@ impl + Sub> Extend for Offse None => lengths_iter.size_hint().0, }; self.reserve(size_hint); - for len in lengths_iter { - self.push_length(len); - } + lengths_iter.for_each(|len| unsafe { + self.push_length_unchecked(len.as_usize()); + }); } } @@ -207,11 +200,8 @@ mod tests { let extend_lengths = vec![4, 4, 5, 5]; builder.extend(extend_lengths); - let another_builder = vec![1, 2, 3].into(); - builder.extend_with_builder(another_builder); - let offsets = builder.finish(); - let expect_offsets = vec![0, 1, 3, 6, 10, 14, 19, 24, 25, 27, 30]; + let expect_offsets = vec![0, 1, 3, 6, 10, 14, 19, 24]; assert_eq!(offsets.to_vec(), expect_offsets); Ok(()) } From 1117419bdc10d86c2ef6abd30255b69cf1831929 Mon Sep 17 00:00:00 2001 From: doki Date: Fri, 15 Mar 2024 12:32:20 +0800 Subject: [PATCH 09/13] update --- arrow-buffer/src/builder/offset.rs | 68 +++++++++++++++--------------- arrow-buffer/src/lib.rs | 12 +++--- 2 files changed, 40 insertions(+), 40 deletions(-) diff --git a/arrow-buffer/src/builder/offset.rs b/arrow-buffer/src/builder/offset.rs index 81138e94afb6..0d0bc8803f00 100644 --- a/arrow-buffer/src/builder/offset.rs +++ b/arrow-buffer/src/builder/offset.rs @@ -41,18 +41,38 @@ impl + Sub> OffsetBufferBuilder /// Create from offsets. /// # Safety - /// Caller guarantees that offsets are monotonically increasing values. + /// Caller guarantees that offsets are monotonically increasing values + /// and the offsets vec is not empty. #[inline] pub unsafe fn new_unchecked(offsets: Vec) -> Self { Self { offsets } } + /// Try to create an [`OffsetBufferBuilder`] with a lengths iterator + pub fn try_from_lengths(lengths: impl IntoIterator) -> Result { + let lengths_iter = lengths.into_iter(); + let size_hint = match lengths_iter.size_hint().1 { + Some(h_bound) => h_bound, + None => lengths_iter.size_hint().0, + }; + let mut builder = OffsetBufferBuilder::new(size_hint); + for len in lengths_iter { + builder.try_push_length(len)?; + } + Ok(builder) + } + /// Try to safely push a length of usize type into builder #[inline] pub fn try_push_length(&mut self, length: usize) -> Result<(), String> { let last_offset = self.offsets.last().unwrap(); - let next_offset = - *last_offset + O::from_usize(length).ok_or("offset overflow".to_string())?; + let next_offset = O::from_usize( + last_offset + .as_usize() + .checked_add(length) + .ok_or("offset overflow".to_string())?, + ) + .ok_or("offset overflow")?; self.offsets.push(next_offset); Ok(()) } @@ -84,49 +104,27 @@ impl + Sub> OffsetBufferBuilder } /// Last offset + #[inline] pub fn last(&self) -> O { *self.offsets.last().unwrap() } + #[inline] pub fn reserve(&mut self, additional: usize) { self.offsets.reserve(additional); } + #[inline] pub fn reserve_exact(&mut self, additional: usize) { self.offsets.reserve_exact(additional); } + #[inline] pub fn shrink_to_fit(&mut self) { self.offsets.shrink_to_fit(); } } -/// Convert an [`IntoIterator`] of lengths to [`OffsetBufferBuilder`] -impl, O: ArrowNativeType + Add + Sub> - From for OffsetBufferBuilder -{ - fn from(lengths: IntoIter) -> Self { - let mut offsets_vec: Vec = lengths - .into_iter() - .scan(O::usize_as(0), |prev, len| { - *prev = *prev + len; - Some(*prev) - }) - .collect(); - offsets_vec.insert(0, O::usize_as(0)); - unsafe { OffsetBufferBuilder::new_unchecked(offsets_vec) } - } -} - -/// Convert an [`FromIterator`] of lengths to [`OffsetBufferBuilder`] -impl + Sub> FromIterator - for OffsetBufferBuilder -{ - fn from_iter>(lengths: T) -> Self { - lengths.into() - } -} - impl + Sub> Extend for OffsetBufferBuilder { fn extend>(&mut self, lengths: T) { let lengths_iter = lengths.into_iter(); @@ -135,8 +133,8 @@ impl + Sub> Extend for Offse None => lengths_iter.size_hint().0, }; self.reserve(size_hint); - lengths_iter.for_each(|len| unsafe { - self.push_length_unchecked(len.as_usize()); + lengths_iter.for_each(|len| { + self.try_push_length(len.as_usize()).unwrap(); }); } } @@ -155,7 +153,7 @@ mod tests { #[test] fn test_from() -> Result<(), String> { let lengths = vec![1, 2, 3, 0, 3, 2, 1]; - let builder: OffsetBufferBuilder = lengths.into(); + let builder = OffsetBufferBuilder::::try_from_lengths(lengths)?; assert_eq!(builder.last(), 12); assert_eq!(builder.len(), 8); @@ -168,7 +166,7 @@ mod tests { #[test] fn test_push() -> Result<(), String> { let lengths = vec![1, 2, 3, 0, 3, 2, 1]; - let mut builder: OffsetBufferBuilder = lengths.into(); + let mut builder = OffsetBufferBuilder::::try_from_lengths(lengths)?; builder.try_push_length(1usize)?; builder.try_push_length(2usize)?; builder.try_push_length(0usize)?; @@ -181,7 +179,7 @@ mod tests { #[test] fn test_try_push_unexpect() -> Result<(), String> { let lengths = vec![1, 2, 3]; - let mut builder: OffsetBufferBuilder = lengths.into(); + let mut builder = OffsetBufferBuilder::::try_from_lengths(lengths)?; let len = 1 << 20; match builder.try_push_length(len) { Err(err) => { @@ -195,7 +193,7 @@ mod tests { #[test] fn test_extend() -> Result<(), String> { let lengths = vec![1, 2, 3]; - let mut builder: OffsetBufferBuilder = lengths.into(); + let mut builder = OffsetBufferBuilder::::try_from_lengths(lengths)?; let extend_lengths = vec![4, 4, 5, 5]; builder.extend(extend_lengths); diff --git a/arrow-buffer/src/lib.rs b/arrow-buffer/src/lib.rs index e0f0d4046391..612897af9bed 100644 --- a/arrow-buffer/src/lib.rs +++ b/arrow-buffer/src/lib.rs @@ -21,15 +21,17 @@ #![cfg_attr(miri, feature(strict_provenance))] pub mod alloc; -mod bigint; pub mod buffer; +pub use buffer::*; + pub mod builder; +pub use builder::*; + +mod bigint; mod bytes; mod native; -mod util; - pub use bigint::i256; -pub use buffer::*; -pub use builder::*; + pub use native::*; +mod util; pub use util::*; From e28edd6489ba1ab396a8020143a02a9cf8ede451 Mon Sep 17 00:00:00 2001 From: doki Date: Fri, 15 Mar 2024 21:00:48 +0800 Subject: [PATCH 10/13] add try_extend --- arrow-buffer/src/builder/offset.rs | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/arrow-buffer/src/builder/offset.rs b/arrow-buffer/src/builder/offset.rs index 0d0bc8803f00..9cfeba578c65 100644 --- a/arrow-buffer/src/builder/offset.rs +++ b/arrow-buffer/src/builder/offset.rs @@ -70,7 +70,7 @@ impl + Sub> OffsetBufferBuilder last_offset .as_usize() .checked_add(length) - .ok_or("offset overflow".to_string())?, + .ok_or("offset overflow")?, ) .ok_or("offset overflow")?; self.offsets.push(next_offset); @@ -87,6 +87,20 @@ impl + Sub> OffsetBufferBuilder self.offsets.push(next_offset); } + /// Try to extend an IntoIterator of lengths + pub fn try_extend>(&mut self, lengths: T) -> Result<(), String> { + let lengths_iter = lengths.into_iter(); + let size_hint = match lengths_iter.size_hint().1 { + Some(h_bound) => h_bound, + None => lengths_iter.size_hint().0, + }; + self.reserve(size_hint); + for len in lengths_iter { + self.try_push_length(len.as_usize())?; + } + Ok(()) + } + /// Takes the builder itself and returns an [`OffsetBuffer`] pub fn finish(self) -> OffsetBuffer { unsafe { OffsetBuffer::new_unchecked(self.offsets.into()) } @@ -127,15 +141,7 @@ impl + Sub> OffsetBufferBuilder impl + Sub> Extend for OffsetBufferBuilder { fn extend>(&mut self, lengths: T) { - let lengths_iter = lengths.into_iter(); - let size_hint = match lengths_iter.size_hint().1 { - Some(h_bound) => h_bound, - None => lengths_iter.size_hint().0, - }; - self.reserve(size_hint); - lengths_iter.for_each(|len| { - self.try_push_length(len.as_usize()).unwrap(); - }); + self.try_extend(lengths).unwrap() } } @@ -169,7 +175,9 @@ mod tests { let mut builder = OffsetBufferBuilder::::try_from_lengths(lengths)?; builder.try_push_length(1usize)?; builder.try_push_length(2usize)?; - builder.try_push_length(0usize)?; + unsafe { + builder.push_length_unchecked(0usize); + } let offsets: OffsetBuffer = builder.into(); let expect_offsets = vec![0, 1, 3, 6, 6, 9, 11, 12, 13, 15, 15]; assert_eq!(offsets.to_vec(), expect_offsets); From 8984e91ca965ac2fdeb7f21d942b14c3df83412a Mon Sep 17 00:00:00 2001 From: doki Date: Sat, 16 Mar 2024 08:10:02 +0800 Subject: [PATCH 11/13] update unit tests --- arrow-buffer/src/builder/offset.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arrow-buffer/src/builder/offset.rs b/arrow-buffer/src/builder/offset.rs index 9cfeba578c65..432334722a8d 100644 --- a/arrow-buffer/src/builder/offset.rs +++ b/arrow-buffer/src/builder/offset.rs @@ -203,9 +203,12 @@ mod tests { let lengths = vec![1, 2, 3]; let mut builder = OffsetBufferBuilder::::try_from_lengths(lengths)?; - let extend_lengths = vec![4, 4, 5, 5]; + let extend_lengths = vec![4, 4]; builder.extend(extend_lengths); + let extend_lengths = vec![5, 5]; + builder.try_extend(extend_lengths)?; + let offsets = builder.finish(); let expect_offsets = vec![0, 1, 3, 6, 10, 14, 19, 24]; assert_eq!(offsets.to_vec(), expect_offsets); From 72c25bd262110fd1eb11af6e55de9d10f8bac7bb Mon Sep 17 00:00:00 2001 From: doki Date: Sat, 16 Mar 2024 08:16:35 +0800 Subject: [PATCH 12/13] update --- arrow-buffer/src/builder/offset.rs | 33 +++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/arrow-buffer/src/builder/offset.rs b/arrow-buffer/src/builder/offset.rs index 432334722a8d..d3d531345800 100644 --- a/arrow-buffer/src/builder/offset.rs +++ b/arrow-buffer/src/builder/offset.rs @@ -87,7 +87,7 @@ impl + Sub> OffsetBufferBuilder self.offsets.push(next_offset); } - /// Try to extend an IntoIterator of lengths + /// Try to extend builder with an IntoIterator of lengths pub fn try_extend>(&mut self, lengths: T) -> Result<(), String> { let lengths_iter = lengths.into_iter(); let size_hint = match lengths_iter.size_hint().1 { @@ -107,14 +107,20 @@ impl + Sub> OffsetBufferBuilder } /// Capacity of offsets + #[inline] pub fn capacity(&self) -> usize { self.offsets.capacity() - 1 } /// Length of the Offsets - #[allow(clippy::len_without_is_empty)] + #[inline] pub fn len(&self) -> usize { - self.offsets.len() + self.offsets.len() - 1 + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.len() == 0 } /// Last offset @@ -150,19 +156,36 @@ mod tests { use crate::{OffsetBuffer, OffsetBufferBuilder}; #[test] - fn new_zeroed() -> Result<(), String> { + fn test_new_zeroed() -> Result<(), String> { let builder: OffsetBufferBuilder = OffsetBufferBuilder::new_zeroed(3); + assert_eq!(builder.is_empty(), false); + assert_eq!(builder.capacity(), 3); + assert_eq!(builder.len(), 3); assert_eq!(builder.finish().to_vec(), vec![0, 0, 0, 0]); Ok(()) } + #[test] + fn test_capacity_len() -> Result<(), String> { + let mut builder = OffsetBufferBuilder::::new(1); + assert_eq!(builder.is_empty(), true); + assert_eq!(builder.capacity(), 1); + builder.try_push_length(1)?; + assert_eq!(builder.is_empty(), false); + assert_eq!(builder.capacity(), 1); + assert_eq!(builder.len(), 1); + Ok(()) + } + #[test] fn test_from() -> Result<(), String> { let lengths = vec![1, 2, 3, 0, 3, 2, 1]; let builder = OffsetBufferBuilder::::try_from_lengths(lengths)?; assert_eq!(builder.last(), 12); - assert_eq!(builder.len(), 8); + assert_eq!(builder.capacity(), 7); + assert_eq!(builder.len(), 7); + assert_eq!(builder.is_empty(), false); let offsets = builder.finish(); assert_eq!(offsets.to_vec(), vec![0, 1, 3, 6, 6, 9, 11, 12]); From d4ce5614ffc6e847c2a049f3592b52ddeeb6227a Mon Sep 17 00:00:00 2001 From: doki Date: Sat, 16 Mar 2024 08:26:41 +0800 Subject: [PATCH 13/13] update comments of capacity and len --- arrow-buffer/src/builder/offset.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-buffer/src/builder/offset.rs b/arrow-buffer/src/builder/offset.rs index d3d531345800..ae3cda4baed3 100644 --- a/arrow-buffer/src/builder/offset.rs +++ b/arrow-buffer/src/builder/offset.rs @@ -106,13 +106,13 @@ impl + Sub> OffsetBufferBuilder unsafe { OffsetBuffer::new_unchecked(self.offsets.into()) } } - /// Capacity of offsets + /// Capacity of offsets excluding the first zero offset #[inline] pub fn capacity(&self) -> usize { self.offsets.capacity() - 1 } - /// Length of the Offsets + /// Length of the offsets excluding the first zero offset #[inline] pub fn len(&self) -> usize { self.offsets.len() - 1