From 7995dd4adf4e3cc2061d353dccbe034703dacd74 Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Fri, 14 Feb 2025 12:00:09 +0100 Subject: [PATCH] Encode `RowId` as `FixedSizeBinaryArray` (#9038) ### Related * Closes https://github.com/rerun-io/rerun/issues/8992 ### What Change how we encode `RowId` in arrow from a weird `StructArray` to a `FixedSizeBinaryArray` ([same as how UUID:s are usually encoded](https://arrow.apache.org/docs/format/CanonicalExtensions.html#uuid)). The result is code that is shorter, simpler, and faster. And less embarrassing. This is a breaking change (a lot of those going around anyways). --- crates/store/re_chunk/src/chunk.rs | 111 ++++------------- crates/store/re_chunk/src/iter.rs | 11 +- crates/store/re_chunk/src/merge.rs | 4 +- crates/store/re_chunk/src/shuffle.rs | 28 ++--- crates/store/re_chunk/src/slice.rs | 24 +--- .../formatting__format_chunk_store.snap | 2 +- crates/store/re_dataframe/src/query.rs | 23 +--- crates/store/re_sorbet/src/chunk_batch.rs | 9 +- crates/store/re_types_core/Cargo.toml | 2 +- crates/store/re_types_core/src/id.rs | 30 ++++- crates/store/re_types_core/src/lib.rs | 2 +- crates/store/re_types_core/src/tuid.rs | 112 +++++------------- crates/utils/re_tuid/src/lib.rs | 58 ++++----- .../viewer/re_chunk_store_ui/src/chunk_ui.rs | 2 +- 14 files changed, 138 insertions(+), 280 deletions(-) diff --git a/crates/store/re_chunk/src/chunk.rs b/crates/store/re_chunk/src/chunk.rs index 3f6281960f49..33a25a8a428c 100644 --- a/crates/store/re_chunk/src/chunk.rs +++ b/crates/store/re_chunk/src/chunk.rs @@ -3,8 +3,8 @@ use std::sync::atomic::{AtomicU64, Ordering}; use ahash::HashMap; use arrow::{ array::{ - Array as ArrowArray, ArrayRef as ArrowArrayRef, ListArray as ArrowListArray, - StructArray as ArrowStructArray, UInt64Array as ArrowUInt64Array, + Array as ArrowArray, ArrayRef as ArrowArrayRef, FixedSizeBinaryArray, + ListArray as ArrowListArray, }, buffer::{NullBuffer as ArrowNullBuffer, ScalarBuffer as ArrowScalarBuffer}, }; @@ -15,8 +15,7 @@ use re_arrow_util::ArrowArrayDowncastRef as _; use re_byte_size::SizeBytes as _; use re_log_types::{EntityPath, ResolvedTimeRange, Time, TimeInt, TimePoint, Timeline}; use re_types_core::{ - ComponentDescriptor, ComponentName, DeserializationError, Loggable, LoggableBatch, - SerializationError, + ComponentDescriptor, ComponentName, DeserializationError, Loggable, SerializationError, }; use crate::{ChunkId, RowId}; @@ -199,7 +198,7 @@ pub struct Chunk { pub(crate) is_sorted: bool, /// The respective [`RowId`]s for each row of data. - pub(crate) row_ids: ArrowStructArray, + pub(crate) row_ids: FixedSizeBinaryArray, /// The time columns. /// @@ -378,18 +377,9 @@ impl Chunk { .take(self.row_ids.len()) .collect_vec(); - #[allow(clippy::unwrap_used)] - let row_ids = ::to_arrow(&row_ids) - // Unwrap: native RowIds cannot fail to serialize. - .unwrap() - .downcast_array_ref::() - // Unwrap: RowId schema is known in advance to be a struct array -- always. - .unwrap() - .clone(); - Self { id, - row_ids, + row_ids: RowId::arrow_from_slice(&row_ids), ..self.clone() } } @@ -407,14 +397,7 @@ impl Chunk { .take(self.row_ids.len()) .collect_vec(); - #[allow(clippy::unwrap_used)] - let row_ids = ::to_arrow(&row_ids) - // Unwrap: native RowIds cannot fail to serialize. - .unwrap() - .downcast_array_ref::() - // Unwrap: RowId schema is known in advance to be a struct array -- always. - .unwrap() - .clone(); + let row_ids = RowId::arrow_from_slice(&row_ids); Self { row_ids, ..self } } @@ -740,7 +723,7 @@ impl Chunk { id: ChunkId, entity_path: EntityPath, is_sorted: Option, - row_ids: ArrowStructArray, + row_ids: FixedSizeBinaryArray, timelines: IntMap, components: ChunkComponents, ) -> ChunkResult { @@ -779,19 +762,7 @@ impl Chunk { components: ChunkComponents, ) -> ChunkResult { re_tracing::profile_function!(); - let row_ids = row_ids - .to_arrow() - // NOTE: impossible, but better safe than sorry. - .map_err(|err| ChunkError::Malformed { - reason: format!("RowIds failed to serialize: {err}"), - })? - .downcast_array_ref::() - // NOTE: impossible, but better safe than sorry. - .ok_or_else(|| ChunkError::Malformed { - reason: "RowIds failed to downcast".to_owned(), - })? - .clone(); - + let row_ids = RowId::arrow_from_slice(row_ids); Self::new(id, entity_path, is_sorted, row_ids, timelines, components) } @@ -836,7 +807,7 @@ impl Chunk { id: ChunkId, entity_path: EntityPath, is_sorted: Option, - row_ids: ArrowStructArray, + row_ids: FixedSizeBinaryArray, components: ChunkComponents, ) -> ChunkResult { Self::new( @@ -856,11 +827,7 @@ impl Chunk { entity_path, heap_size_bytes: Default::default(), is_sorted: true, - row_ids: arrow::array::StructBuilder::from_fields( - re_types_core::tuid_arrow_fields(), - 0, - ) - .finish(), + row_ids: RowId::arrow_from_slice(&[]), timelines: Default::default(), components: Default::default(), } @@ -1128,34 +1095,21 @@ impl Chunk { } #[inline] - pub fn row_ids_array(&self) -> &ArrowStructArray { + pub fn row_ids_array(&self) -> &FixedSizeBinaryArray { &self.row_ids } - /// Returns the [`RowId`]s in their raw-est form: a tuple of (times, counters) arrays. #[inline] - pub fn row_ids_raw(&self) -> (&ArrowUInt64Array, &ArrowUInt64Array) { - let [times, counters] = self.row_ids.columns() else { - panic!("RowIds are corrupt -- this should be impossible (sanity checked)"); - }; - - #[allow(clippy::unwrap_used)] - let times = times.downcast_array_ref::().unwrap(); // sanity checked - - #[allow(clippy::unwrap_used)] - let counters = counters.downcast_array_ref::().unwrap(); // sanity checked - - (times, counters) + pub fn row_ids_slice(&self) -> &[RowId] { + RowId::slice_from_arrow(&self.row_ids) } /// All the [`RowId`] in this chunk. /// /// This could be in any order if this chunk is unsorted. #[inline] - pub fn row_ids(&self) -> impl Iterator + '_ { - let (times, counters) = self.row_ids_raw(); - izip!(times.values(), counters.values()) - .map(|(&time, &counter)| RowId::from_u128((time as u128) << 64 | (counter as u128))) + pub fn row_ids(&self) -> impl ExactSizeIterator + '_ { + self.row_ids_slice().iter().copied() } /// Returns an iterator over the [`RowId`]s of a [`Chunk`], for a given component. @@ -1195,41 +1149,20 @@ impl Chunk { return None; } - let (times, counters) = self.row_ids_raw(); - let (times, counters) = (times.values(), counters.values()); + let row_ids = self.row_ids_slice(); #[allow(clippy::unwrap_used)] // checked above - let (index_min, index_max) = if self.is_sorted() { + Some(if self.is_sorted() { ( - ( - times.first().copied().unwrap(), - counters.first().copied().unwrap(), - ), - ( - times.last().copied().unwrap(), - counters.last().copied().unwrap(), - ), + row_ids.first().copied().unwrap(), + row_ids.last().copied().unwrap(), ) } else { ( - ( - times.iter().min().copied().unwrap(), - counters.iter().min().copied().unwrap(), - ), - ( - times.iter().max().copied().unwrap(), - counters.iter().max().copied().unwrap(), - ), + row_ids.iter().min().copied().unwrap(), + row_ids.iter().max().copied().unwrap(), ) - }; - - let (time_min, counter_min) = index_min; - let (time_max, counter_max) = index_max; - - Some(( - RowId::from_u128((time_min as u128) << 64 | (counter_min as u128)), - RowId::from_u128((time_max as u128) << 64 | (counter_max as u128)), - )) + }) } #[inline] diff --git a/crates/store/re_chunk/src/iter.rs b/crates/store/re_chunk/src/iter.rs index ea8de69e39ec..65fd484d47f6 100644 --- a/crates/store/re_chunk/src/iter.rs +++ b/crates/store/re_chunk/src/iter.rs @@ -702,16 +702,7 @@ impl Iterator for ChunkIndicesIter { let i = self.index; self.index += 1; - let row_id = { - let (times, incs) = self.chunk.row_ids_raw(); - let times = times.values(); - let incs = incs.values(); - - let time = *times.get(i)?; - let inc = *incs.get(i)?; - - RowId::from_u128(((time as u128) << 64) | (inc as u128)) - }; + let row_id = *self.chunk.row_ids_slice().get(i)?; if let Some(time_column) = &self.time_column { let time = *time_column.times_raw().get(i)?; diff --git a/crates/store/re_chunk/src/merge.rs b/crates/store/re_chunk/src/merge.rs index 9e258b44ab38..1894349ed700 100644 --- a/crates/store/re_chunk/src/merge.rs +++ b/crates/store/re_chunk/src/merge.rs @@ -1,4 +1,4 @@ -use arrow::array::StructArray as ArrowStructArray; +use arrow::array::FixedSizeBinaryArray; use arrow::array::{Array as ArrowArray, ListArray as ArrowListArray}; use arrow::buffer::ScalarBuffer as ArrowScalarBuffer; use itertools::{izip, Itertools}; @@ -51,7 +51,7 @@ impl Chunk { #[allow(clippy::unwrap_used)] // concatenating 2 RowId arrays must yield another RowId array row_ids - .downcast_array_ref::() + .downcast_array_ref::() .unwrap() .clone() }; diff --git a/crates/store/re_chunk/src/shuffle.rs b/crates/store/re_chunk/src/shuffle.rs index 6d1a22c484a3..258f0d1be429 100644 --- a/crates/store/re_chunk/src/shuffle.rs +++ b/crates/store/re_chunk/src/shuffle.rs @@ -1,13 +1,9 @@ -use std::sync::Arc; - use arrow::{ - array::{ - Array as ArrowArray, ListArray as ArrowListArray, StructArray as ArrowStructArray, - UInt64Array as ArrowUInt64Array, - }, + array::{Array as ArrowArray, ListArray as ArrowListArray}, buffer::{OffsetBuffer as ArrowOffsets, ScalarBuffer as ArrowScalarBuffer}, }; use itertools::Itertools as _; + use re_log_types::Timeline; use crate::{Chunk, TimeColumn}; @@ -87,7 +83,7 @@ impl Chunk { let swaps = { re_tracing::profile_scope!("swaps"); - let row_ids = self.row_ids().collect_vec(); + let row_ids = self.row_ids_slice(); let mut swaps = (0..row_ids.len()).collect::>(); swaps.sort_by_key(|&i| row_ids[i]); swaps @@ -134,7 +130,7 @@ impl Chunk { let swaps = { re_tracing::profile_scope!("swaps"); - let row_ids = chunk.row_ids().collect_vec(); + let row_ids = chunk.row_ids_slice(); let times = time_column.times_raw().to_vec(); let mut swaps = (0..times.len()).collect::>(); swaps.sort_by_key(|&i| (times[i], row_ids[i])); @@ -204,21 +200,13 @@ impl Chunk { { re_tracing::profile_scope!("row ids"); - let (times, counters) = self.row_ids_raw(); - let (times, counters) = (times.values(), counters.values()); + let row_ids = self.row_ids_slice(); - let mut sorted_times = times.to_vec(); - let mut sorted_counters = counters.to_vec(); + let mut sorted_row_ids = row_ids.to_vec(); for (to, from) in swaps.iter().copied().enumerate() { - sorted_times[to] = times[from]; - sorted_counters[to] = counters[from]; + sorted_row_ids[to] = row_ids[from]; } - - let times = Arc::new(ArrowUInt64Array::from(sorted_times)); - let counters = Arc::new(ArrowUInt64Array::from(sorted_counters)); - - self.row_ids = - ArrowStructArray::new(self.row_ids.fields().clone(), vec![times, counters], None); + self.row_ids = re_types_core::RowId::arrow_from_slice(&sorted_row_ids); } let Self { diff --git a/crates/store/re_chunk/src/slice.rs b/crates/store/re_chunk/src/slice.rs index 0e1dbebe5422..08c7dacf692e 100644 --- a/crates/store/re_chunk/src/slice.rs +++ b/crates/store/re_chunk/src/slice.rs @@ -32,23 +32,9 @@ impl Chunk { .and_then(|per_desc| per_desc.get(component_desc))?; if self.is_sorted() { - let row_id_128 = row_id.as_u128(); - let row_id_time_ns = (row_id_128 >> 64) as u64; - let row_id_inc = (row_id_128 & (!0 >> 64)) as u64; - - let (times, incs) = self.row_ids_raw(); - let times = times.values(); - let incs = incs.values(); - - let mut index = times.partition_point(|&time| time < row_id_time_ns); - while index < incs.len() && incs[index] < row_id_inc { - index += 1; - } - - let found_it = - times.get(index) == Some(&row_id_time_ns) && incs.get(index) == Some(&row_id_inc); - - (found_it && list_array.is_valid(index)).then(|| list_array.value(index)) + let row_ids = self.row_ids_slice(); + let index = row_ids.binary_search(&row_id).ok()?; + list_array.is_valid(index).then(|| list_array.value(index)) } else { self.row_ids() .find_position(|id| *id == row_id) @@ -427,7 +413,7 @@ impl Chunk { entity_path, heap_size_bytes: _, is_sorted: _, - row_ids, + row_ids: _, timelines, components, } = self; @@ -439,7 +425,7 @@ impl Chunk { entity_path: entity_path.clone(), heap_size_bytes: Default::default(), is_sorted: true, - row_ids: arrow::array::StructBuilder::from_fields(row_ids.fields().clone(), 0).finish(), + row_ids: RowId::arrow_from_slice(&[]), timelines: timelines .iter() .map(|(&timeline, time_column)| (timeline, time_column.emptied())) diff --git a/crates/store/re_chunk_store/tests/snapshots/formatting__format_chunk_store.snap b/crates/store/re_chunk_store/tests/snapshots/formatting__format_chunk_store.snap index 3ed50160c31a..dfebca01c7e9 100644 --- a/crates/store/re_chunk_store/tests/snapshots/formatting__format_chunk_store.snap +++ b/crates/store/re_chunk_store/tests/snapshots/formatting__format_chunk_store.snap @@ -23,7 +23,7 @@ ChunkStore { │ ┌──────────────────────────────────┬────────────────────────┬───────────────────────────────┬──────────────────────────────┬──────────────────────────────┐ │ │ │ RowId ┆ frame_nr ┆ log_time ┆ example.MyColor ┆ example.MyIndex │ │ │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ - │ │ type: "Struct[2]" ┆ type: "i64" ┆ type: "Timestamp(ns)" ┆ type: "List[u32]" ┆ type: "List[u64]" │ │ + │ │ type: "FixedSizeBinary[16]" ┆ type: "i64" ┆ type: "Timestamp(ns)" ┆ type: "List[u32]" ┆ type: "List[u64]" │ │ │ │ ARROW:extension:name: "TUID" ┆ index_name: "frame_nr" ┆ index_name: "log_time" ┆ component: "example.MyColor" ┆ component: "example.MyIndex" │ │ │ │ kind: "control" ┆ is_sorted: "true" ┆ is_sorted: "true" ┆ kind: "data" ┆ kind: "data" │ │ │ │ ┆ kind: "index" ┆ kind: "index" ┆ ┆ │ │ diff --git a/crates/store/re_dataframe/src/query.rs b/crates/store/re_dataframe/src/query.rs index 3a4d5e7d5c00..fffa602c51de 100644 --- a/crates/store/re_dataframe/src/query.rs +++ b/crates/store/re_dataframe/src/query.rs @@ -925,22 +925,7 @@ impl QueryHandle { .timelines() .get(&state.filtered_index) .map_or(cur_index_times_empty, |time_column| time_column.times_raw()); - let cur_index_row_ids = cur_chunk.row_ids_raw(); - - // NOTE: "Deserializing" everything into a native vec is way too much for rustc to - // follow and doesn't get optimized at all -- we have to work with raw arrow data - // all the way, so this gets a bit complicated. - let cur_index_row_id_at = |at: usize| { - let (times, incs) = cur_index_row_ids; - - let times = times.values(); - let incs = incs.values(); - - let time = *times.get(at)?; - let inc = *incs.get(at)?; - - Some(RowId::from_u128(((time as u128) << 64) | (inc as u128))) - }; + let cur_index_row_ids = cur_chunk.row_ids_slice(); let (index_value, cur_row_id) = 'walk: loop { let (Some(mut index_value), Some(mut cur_row_id)) = ( @@ -948,7 +933,7 @@ impl QueryHandle { .get(cur_cursor_value as usize) .copied() .map(TimeInt::new_temporal), - cur_index_row_id_at(cur_cursor_value as usize), + cur_index_row_ids.get(cur_cursor_value as usize).copied(), ) else { continue 'overlaps; }; @@ -962,7 +947,9 @@ impl QueryHandle { .get(cur_cursor_value as usize + 1) .copied() .map(TimeInt::new_temporal), - cur_index_row_id_at(cur_cursor_value as usize + 1), + cur_index_row_ids + .get(cur_cursor_value as usize + 1) + .copied(), ) { if next_index_value == *cur_index_value { index_value = next_index_value; diff --git a/crates/store/re_sorbet/src/chunk_batch.rs b/crates/store/re_sorbet/src/chunk_batch.rs index 5aa3720077f2..bcd648a2b952 100644 --- a/crates/store/re_sorbet/src/chunk_batch.rs +++ b/crates/store/re_sorbet/src/chunk_batch.rs @@ -1,7 +1,6 @@ use arrow::{ array::{ - ArrayRef as ArrowArrayRef, AsArray, RecordBatch as ArrowRecordBatch, - StructArray as ArrowStructArray, + ArrayRef as ArrowArrayRef, AsArray, FixedSizeBinaryArray, RecordBatch as ArrowRecordBatch, }, datatypes::Fields as ArrowFields, }; @@ -79,13 +78,11 @@ impl ChunkBatch { } /// The `RowId` column. - pub fn row_id_column(&self) -> (&RowIdColumnDescriptor, &ArrowStructArray) { + pub fn row_id_column(&self) -> (&RowIdColumnDescriptor, &FixedSizeBinaryArray) { // The first column is always the row IDs. ( self.schema.row_id_column(), - self.columns()[0] - .as_struct_opt() - .expect("Row IDs should be encoded as struct"), + self.columns()[0].as_fixed_size_binary(), ) } } diff --git a/crates/store/re_types_core/Cargo.toml b/crates/store/re_types_core/Cargo.toml index 1bc2c7fcc022..3a3d34aab241 100644 --- a/crates/store/re_types_core/Cargo.toml +++ b/crates/store/re_types_core/Cargo.toml @@ -41,7 +41,7 @@ re_error.workspace = true re_log.workspace = true re_string_interner.workspace = true re_tracing.workspace = true -re_tuid.workspace = true +re_tuid = { workspace = true, features = ["bytemuck"] } # External anyhow.workspace = true diff --git a/crates/store/re_types_core/src/id.rs b/crates/store/re_types_core/src/id.rs index 354606460e44..4651e27c7d70 100644 --- a/crates/store/re_types_core/src/id.rs +++ b/crates/store/re_types_core/src/id.rs @@ -1,3 +1,7 @@ +use arrow::array::Array; + +use crate::Loggable as _; + /// A unique ID for a `Chunk`. /// /// `Chunk`s are the atomic unit of ingestion, transport, storage, events and GC in Rerun. @@ -111,7 +115,7 @@ impl std::ops::DerefMut for ChunkId { } } -crate::delegate_arrow_tuid!(ChunkId as "rerun.controls.ChunkId"); +crate::delegate_arrow_tuid!(ChunkId as "rerun.controls.ChunkId"); // Used in the dataplatform // --- @@ -154,7 +158,19 @@ crate::delegate_arrow_tuid!(ChunkId as "rerun.controls.ChunkId"); /// /// This has very important implications when inserting data far into the past or into the future: /// think carefully about your `RowId`s in these cases. -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[repr(C, align(1))] +#[derive( + Debug, + Clone, + Copy, + PartialEq, + Eq, + PartialOrd, + Ord, + Hash, + bytemuck::AnyBitPattern, + bytemuck::NoUninit, +)] #[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] pub struct RowId(pub(crate) re_tuid::Tuid); @@ -215,6 +231,16 @@ impl RowId { pub fn from_u128(id: u128) -> Self { Self(re_tuid::Tuid::from_u128(id)) } + + pub fn arrow_from_slice(slice: &[Self]) -> arrow::array::FixedSizeBinaryArray { + crate::tuids_to_arrow(bytemuck::cast_slice(slice)) + } + + /// Panics if the array is of the wrong width + pub fn slice_from_arrow(array: &arrow::array::FixedSizeBinaryArray) -> &[Self] { + debug_assert_eq!(array.data_type(), &Self::arrow_datatype()); + bytemuck::cast_slice(array.value_data()) + } } impl re_byte_size::SizeBytes for RowId { diff --git a/crates/store/re_types_core/src/lib.rs b/crates/store/re_types_core/src/lib.rs index 1025b26a2656..3b761cc05f68 100644 --- a/crates/store/re_types_core/src/lib.rs +++ b/crates/store/re_types_core/src/lib.rs @@ -56,7 +56,7 @@ pub use self::{ DeserializationError, DeserializationResult, ResultExt, SerializationError, SerializationResult, _Backtrace, }, - tuid::tuid_arrow_fields, + tuid::tuids_to_arrow, view::{View, ViewClassIdentifier}, }; diff --git a/crates/store/re_types_core/src/tuid.rs b/crates/store/re_types_core/src/tuid.rs index b17b5336ca65..82f47394e30b 100644 --- a/crates/store/re_types_core/src/tuid.rs +++ b/crates/store/re_types_core/src/tuid.rs @@ -1,29 +1,30 @@ use std::sync::Arc; use arrow::{ - array::{ArrayRef, StructArray, UInt64Array}, - datatypes::{DataType, Field, Fields}, + array::{ArrayRef, AsArray, FixedSizeBinaryArray, FixedSizeBinaryBuilder}, + datatypes::DataType, }; -use re_arrow_util::ArrowArrayDowncastRef as _; use re_tuid::Tuid; use crate::{DeserializationError, Loggable}; // --- -// TODO(emilk): This is a bit ugly… but good enough for now? -pub fn tuid_arrow_fields() -> Fields { - Fields::from(vec![ - Field::new("time_ns", DataType::UInt64, false), - Field::new("inc", DataType::UInt64, false), - ]) +const BYTE_WIDTH: i32 = std::mem::size_of::() as i32; + +pub fn tuids_to_arrow(tuids: &[Tuid]) -> FixedSizeBinaryArray { + #[allow(clippy::unwrap_used)] // Can't fail + ::to_arrow(tuids.iter()) + .unwrap() + .as_fixed_size_binary() + .clone() } impl Loggable for Tuid { #[inline] fn arrow_datatype() -> arrow::datatypes::DataType { - DataType::Struct(tuid_arrow_fields()) + DataType::FixedSizeBinary(BYTE_WIDTH) } fn to_arrow_opt<'a>( @@ -40,92 +41,37 @@ impl Loggable for Tuid { #[inline] fn to_arrow<'a>( - data: impl IntoIterator>>, + iter: impl IntoIterator>>, ) -> crate::SerializationResult where Self: 'a, { - let (time_ns_values, inc_values): (Vec<_>, Vec<_>) = data - .into_iter() - .map(Into::into) - .map(|tuid| (tuid.nanoseconds_since_epoch(), tuid.inc())) - .unzip(); - - let values: Vec = vec![ - Arc::new(UInt64Array::from(time_ns_values)), - Arc::new(UInt64Array::from(inc_values)), - ]; - let validity = None; - - Ok(Arc::new(StructArray::new( - Fields::from(vec![ - Field::new("time_ns", DataType::UInt64, false), - Field::new("inc", DataType::UInt64, false), - ]), - values, - validity, - ))) + let iter = iter.into_iter(); + + let mut builder = FixedSizeBinaryBuilder::with_capacity(iter.size_hint().0, BYTE_WIDTH); + for tuid in iter { + #[allow(clippy::unwrap_used)] // Can't fail because `BYTE_WIDTH` is correct. + builder.append_value(tuid.into().as_bytes()).unwrap(); + } + + Ok(Arc::new(builder.finish())) } fn from_arrow(array: &dyn ::arrow::array::Array) -> crate::DeserializationResult> { - let expected_datatype = Self::arrow_datatype(); - let actual_datatype = array.data_type(); - if actual_datatype != &expected_datatype { + let Some(array) = array.as_fixed_size_binary_opt() else { return Err(DeserializationError::datatype_mismatch( - expected_datatype, - actual_datatype.clone(), + Self::arrow_datatype(), + array.data_type().clone(), )); - } - - // NOTE: Unwrap is safe everywhere below, datatype is checked above. - // NOTE: We don't even look at the validity, our datatype says we don't care. - - #[allow(clippy::unwrap_used)] - let array = array.downcast_array_ref::().unwrap(); - - // TODO(cmc): Can we rely on the fields ordering from the datatype? I would assume not - // since we generally cannot rely on anything when it comes to arrow… - // If we could, that would also impact our codegen deserialization path. - let (time_ns_index, inc_index) = { - let mut time_ns_index = None; - let mut inc_index = None; - for (i, field) in array.fields().iter().enumerate() { - if field.name() == "time_ns" { - time_ns_index = Some(i); - } else if field.name() == "inc" { - inc_index = Some(i); - } - } - #[allow(clippy::unwrap_used)] - (time_ns_index.unwrap(), inc_index.unwrap()) - }; - - let get_buffer = |field_index: usize| { - #[allow(clippy::unwrap_used)] - array.columns()[field_index] - .downcast_array_ref::() - .unwrap() - .values() }; - let time_ns_buffer = get_buffer(time_ns_index); - let inc_buffer = get_buffer(inc_index); + // NOTE: We don't even look at the validity, our datatype says we don't care. - if time_ns_buffer.len() != inc_buffer.len() { - return Err(DeserializationError::mismatched_struct_field_lengths( - "time_ns", - time_ns_buffer.len(), - "inc", - inc_buffer.len(), - )); - } + let uuids: &[Self] = bytemuck::try_cast_slice(array.value_data()).map_err(|err| { + DeserializationError::ValidationError(format!("Bad length of Tuid array: {err}")) + })?; - Ok(time_ns_buffer - .iter() - .copied() - .zip(inc_buffer.iter().copied()) - .map(|(time_ns, inc)| Self::from_nanos_and_inc(time_ns, inc)) - .collect()) + Ok(uuids.to_vec()) } } diff --git a/crates/utils/re_tuid/src/lib.rs b/crates/utils/re_tuid/src/lib.rs index d98f3f76997c..6978741cca8d 100644 --- a/crates/utils/re_tuid/src/lib.rs +++ b/crates/utils/re_tuid/src/lib.rs @@ -9,34 +9,32 @@ /// TUID: Time-based Unique Identifier. /// /// Time-ordered globally unique 128-bit identifiers. +/// +/// The raw bytes of the `Tuid` sorts in time order as the `Tuid` itself, +/// and the `Tuid` is byte-aligned so you can just transmute between `Tuid` and raw bytes. #[repr(C, align(1))] -#[derive(Clone, Copy, PartialEq, Eq, Hash)] -#[cfg_attr(feature = "bytemuck", derive(bytemuck::AnyBitPattern))] +#[derive(Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd)] +#[cfg_attr( + feature = "bytemuck", + derive(bytemuck::AnyBitPattern, bytemuck::NoUninit) +)] pub struct Tuid { /// Approximate nanoseconds since epoch. - /// A LE u64 encoded as bytes to keep the alignment of `Tuid` to 1. + /// + /// A big-endian u64 encoded as bytes to keep the alignment of `Tuid` to 1. + /// + /// We use big-endian so that the raw bytes of the `Tuid` sorts in time order. time_ns: [u8; 8], /// Initialized to something random on each thread, /// then incremented for each new [`Tuid`] being allocated. - /// A LE u64 encoded as bytes to keep the alignment of `Tuid` to 1. + /// + /// Uses big-endian u64 encoded as bytes to keep the alignment of `Tuid` to 1. + /// + /// We use big-endian so that the raw bytes of the `Tuid` sorts in creation order. inc: [u8; 8], } -impl Ord for Tuid { - #[inline] - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - self.as_u128().cmp(&other.as_u128()) - } -} - -impl PartialOrd for Tuid { - #[inline] - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - impl Tuid { /// We give an actual name to [`Tuid`], and inject that name into the Arrow datatype extensions, /// as a hack so that we can compactly format them when printing Arrow data to the terminal. @@ -98,8 +96,8 @@ impl Tuid { /// All ones. pub const MAX: Self = Self { - time_ns: u64::MAX.to_le_bytes(), - inc: u64::MAX.to_le_bytes(), + time_ns: u64::MAX.to_be_bytes(), + inc: u64::MAX.to_be_bytes(), }; /// Create a new unique [`Tuid`] based on the current time. @@ -138,8 +136,8 @@ impl Tuid { #[inline] pub fn from_nanos_and_inc(time_ns: u64, inc: u64) -> Self { Self { - time_ns: time_ns.to_le_bytes(), - inc: inc.to_le_bytes(), + time_ns: time_ns.to_be_bytes(), + inc: inc.to_be_bytes(), } } @@ -153,12 +151,18 @@ impl Tuid { ((self.nanoseconds_since_epoch() as u128) << 64) | (self.inc() as u128) } + /// Returns most significant byte first (big endian). + #[inline] + pub fn as_bytes(&self) -> [u8; 16] { + self.as_u128().to_be_bytes() + } + /// Approximate nanoseconds since unix epoch. /// /// The upper 64 bits of the [`Tuid`]. #[inline] pub fn nanoseconds_since_epoch(&self) -> u64 { - u64::from_le_bytes(self.time_ns) + u64::from_be_bytes(self.time_ns) } /// The increment part of the [`Tuid`]. @@ -166,7 +170,7 @@ impl Tuid { /// The lower 64 bits of the [`Tuid`]. #[inline] pub fn inc(&self) -> u64 { - u64::from_le_bytes(self.inc) + u64::from_be_bytes(self.inc) } /// Returns the next logical [`Tuid`]. @@ -182,7 +186,7 @@ impl Tuid { Self { time_ns, - inc: u64::from_le_bytes(inc).wrapping_add(1).to_le_bytes(), + inc: u64::from_be_bytes(inc).wrapping_add(1).to_be_bytes(), } } @@ -199,7 +203,7 @@ impl Tuid { let Self { time_ns, inc } = *self; Self { time_ns, - inc: u64::from_le_bytes(inc).wrapping_add(n).to_le_bytes(), + inc: u64::from_be_bytes(inc).wrapping_add(n).to_be_bytes(), } } @@ -248,7 +252,7 @@ fn nanos_since_epoch() -> u64 { fn random_u64() -> u64 { let mut bytes = [0_u8; 8]; getrandom::getrandom(&mut bytes).expect("Couldn't get random bytes"); - u64::from_le_bytes(bytes) + u64::from_be_bytes(bytes) } impl re_byte_size::SizeBytes for Tuid { diff --git a/crates/viewer/re_chunk_store_ui/src/chunk_ui.rs b/crates/viewer/re_chunk_store_ui/src/chunk_ui.rs index efa7ec106bf9..915bbe091248 100644 --- a/crates/viewer/re_chunk_store_ui/src/chunk_ui.rs +++ b/crates/viewer/re_chunk_store_ui/src/chunk_ui.rs @@ -65,7 +65,7 @@ impl ChunkUi { } }; - let row_ids = chunk.row_ids().collect_vec(); + let row_ids = chunk.row_ids_slice(); let reverse = self.sort_column.direction == SortDirection::Descending; //