From 99a59ad37e833e1129f7e1abd25738a0adfae73a Mon Sep 17 00:00:00 2001 From: PSeitz Date: Fri, 26 Apr 2024 06:03:28 +0200 Subject: [PATCH] remove zero byte check (#2379) remove zero byte checks in columnar. zero bytes are converted during serialization now. unify code paths extend test for expected column names --- columnar/src/columnar/writer/mod.rs | 88 +++++++++++------------------ src/indexer/mod.rs | 60 +++++++++++++++++++- 2 files changed, 91 insertions(+), 57 deletions(-) diff --git a/columnar/src/columnar/writer/mod.rs b/columnar/src/columnar/writer/mod.rs index 32b31b9019..1fbc9d85de 100644 --- a/columnar/src/columnar/writer/mod.rs +++ b/columnar/src/columnar/writer/mod.rs @@ -59,22 +59,6 @@ pub struct ColumnarWriter { buffers: SpareBuffers, } -#[inline] -fn mutate_or_create_column( - arena_hash_map: &mut ArenaHashMap, - column_name: &str, - updater: TMutator, -) where - V: Copy + 'static, - TMutator: FnMut(Option) -> V, -{ - assert!( - !column_name.as_bytes().contains(&0u8), - "key may not contain the 0 byte" - ); - arena_hash_map.mutate_or_create(column_name.as_bytes(), updater); -} - impl ColumnarWriter { pub fn mem_usage(&self) -> usize { self.arena.mem_usage() @@ -175,9 +159,8 @@ impl ColumnarWriter { }, &mut self.dictionaries, ); - mutate_or_create_column( - hash_map, - column_name, + hash_map.mutate_or_create( + column_name.as_bytes(), |column_opt: Option| { let mut column_writer = if let Some(column_writer) = column_opt { column_writer @@ -192,24 +175,21 @@ impl ColumnarWriter { ); } ColumnType::Bool => { - mutate_or_create_column( - &mut self.bool_field_hash_map, - column_name, + self.bool_field_hash_map.mutate_or_create( + column_name.as_bytes(), |column_opt: Option| column_opt.unwrap_or_default(), ); } ColumnType::DateTime => { - mutate_or_create_column( - &mut self.datetime_field_hash_map, - column_name, + self.datetime_field_hash_map.mutate_or_create( + column_name.as_bytes(), |column_opt: Option| column_opt.unwrap_or_default(), ); } ColumnType::I64 | ColumnType::F64 | ColumnType::U64 => { let numerical_type = column_type.numerical_type().unwrap(); - mutate_or_create_column( - &mut self.numerical_field_hash_map, - column_name, + self.numerical_field_hash_map.mutate_or_create( + column_name.as_bytes(), |column_opt: Option| { let mut column: NumericalColumnWriter = column_opt.unwrap_or_default(); column.force_numerical_type(numerical_type); @@ -217,9 +197,8 @@ impl ColumnarWriter { }, ); } - ColumnType::IpAddr => mutate_or_create_column( - &mut self.ip_addr_field_hash_map, - column_name, + ColumnType::IpAddr => self.ip_addr_field_hash_map.mutate_or_create( + column_name.as_bytes(), |column_opt: Option| column_opt.unwrap_or_default(), ), } @@ -232,9 +211,8 @@ impl ColumnarWriter { numerical_value: T, ) { let (hash_map, arena) = (&mut self.numerical_field_hash_map, &mut self.arena); - mutate_or_create_column( - hash_map, - column_name, + hash_map.mutate_or_create( + column_name.as_bytes(), |column_opt: Option| { let mut column: NumericalColumnWriter = column_opt.unwrap_or_default(); column.record_numerical_value(doc, numerical_value.into(), arena); @@ -244,10 +222,6 @@ impl ColumnarWriter { } pub fn record_ip_addr(&mut self, doc: RowId, column_name: &str, ip_addr: Ipv6Addr) { - assert!( - !column_name.as_bytes().contains(&0u8), - "key may not contain the 0 byte" - ); let (hash_map, arena) = (&mut self.ip_addr_field_hash_map, &mut self.arena); hash_map.mutate_or_create( column_name.as_bytes(), @@ -261,24 +235,30 @@ impl ColumnarWriter { pub fn record_bool(&mut self, doc: RowId, column_name: &str, val: bool) { let (hash_map, arena) = (&mut self.bool_field_hash_map, &mut self.arena); - mutate_or_create_column(hash_map, column_name, |column_opt: Option| { - let mut column: ColumnWriter = column_opt.unwrap_or_default(); - column.record(doc, val, arena); - column - }); + hash_map.mutate_or_create( + column_name.as_bytes(), + |column_opt: Option| { + let mut column: ColumnWriter = column_opt.unwrap_or_default(); + column.record(doc, val, arena); + column + }, + ); } pub fn record_datetime(&mut self, doc: RowId, column_name: &str, datetime: common::DateTime) { let (hash_map, arena) = (&mut self.datetime_field_hash_map, &mut self.arena); - mutate_or_create_column(hash_map, column_name, |column_opt: Option| { - let mut column: ColumnWriter = column_opt.unwrap_or_default(); - column.record( - doc, - NumericalValue::I64(datetime.into_timestamp_nanos()), - arena, - ); - column - }); + hash_map.mutate_or_create( + column_name.as_bytes(), + |column_opt: Option| { + let mut column: ColumnWriter = column_opt.unwrap_or_default(); + column.record( + doc, + NumericalValue::I64(datetime.into_timestamp_nanos()), + arena, + ); + column + }, + ); } pub fn record_str(&mut self, doc: RowId, column_name: &str, value: &str) { @@ -303,10 +283,6 @@ impl ColumnarWriter { } pub fn record_bytes(&mut self, doc: RowId, column_name: &str, value: &[u8]) { - assert!( - !column_name.as_bytes().contains(&0u8), - "key may not contain the 0 byte" - ); let (hash_map, arena, dictionaries) = ( &mut self.bytes_field_hash_map, &mut self.arena, diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index 692e2c1087..a14f129711 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -182,7 +182,7 @@ mod tests_mmap { let index = Index::create_in_ram(schema_builder.build()); let mut index_writer = index.writer_for_tests().unwrap(); index_writer - .add_document(doc!(field=>json!({format!("{field_name_in}"): "test1"}))) + .add_document(doc!(field=>json!({format!("{field_name_in}"): "test1", format!("num{field_name_in}"): 10}))) .unwrap(); index_writer .add_document(doc!(field=>json!({format!("a{field_name_in}"): "test2"}))) @@ -260,6 +260,64 @@ mod tests_mmap { "test6", ); test_agg(format!("json.{field_name_out}a").as_str(), "test7"); + + // `.` is stored as `\u{0001}` internally in tantivy + let field_name_out_internal = if field_name_out == "." { + "\u{0001}" + } else { + field_name_out + }; + + let mut fields = reader.searcher().segment_readers()[0] + .inverted_index(field) + .unwrap() + .list_encoded_fields() + .unwrap(); + assert_eq!(fields.len(), 8); + fields.sort(); + let mut expected_fields = vec![ + (format!("a{field_name_out_internal}"), Type::Str), + (format!("a{field_name_out_internal}a"), Type::Str), + ( + format!("a{field_name_out_internal}a{field_name_out_internal}"), + Type::Str, + ), + ( + format!("a{field_name_out_internal}\u{1}ab{field_name_out_internal}"), + Type::Str, + ), + ( + format!("a{field_name_out_internal}\u{1}a{field_name_out_internal}"), + Type::Str, + ), + (format!("{field_name_out_internal}a"), Type::Str), + (format!("{field_name_out_internal}"), Type::Str), + (format!("num{field_name_out_internal}"), Type::I64), + ]; + expected_fields.sort(); + assert_eq!(fields, expected_fields); + // Check columnar reader + let mut columns = reader.searcher().segment_readers()[0] + .fast_fields() + .columnar() + .list_columns() + .unwrap() + .into_iter() + .map(|(name, _)| name) + .collect::>(); + let mut expected_columns = vec![ + format!("json\u{1}{field_name_out_internal}"), + format!("json\u{1}{field_name_out_internal}a"), + format!("json\u{1}a{field_name_out_internal}"), + format!("json\u{1}a{field_name_out_internal}a"), + format!("json\u{1}a{field_name_out_internal}a{field_name_out_internal}"), + format!("json\u{1}a{field_name_out_internal}\u{1}ab{field_name_out_internal}"), + format!("json\u{1}a{field_name_out_internal}\u{1}a{field_name_out_internal}"), + format!("json\u{1}num{field_name_out_internal}"), + ]; + columns.sort(); + expected_columns.sort(); + assert_eq!(columns, expected_columns); } #[test]