Skip to content

Commit

Permalink
remove zero byte check (#2379)
Browse files Browse the repository at this point in the history
remove zero byte checks in columnar. zero bytes are converted during serialization now.
unify code paths
extend test for expected column names
  • Loading branch information
PSeitz authored Apr 26, 2024
1 parent 6a66a71 commit 99a59ad
Show file tree
Hide file tree
Showing 2 changed files with 91 additions and 57 deletions.
88 changes: 32 additions & 56 deletions columnar/src/columnar/writer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,22 +59,6 @@ pub struct ColumnarWriter {
buffers: SpareBuffers,
}

#[inline]
fn mutate_or_create_column<V, TMutator>(
arena_hash_map: &mut ArenaHashMap,
column_name: &str,
updater: TMutator,
) where
V: Copy + 'static,
TMutator: FnMut(Option<V>) -> V,
{
assert!(
!column_name.as_bytes().contains(&0u8),
"key may not contain the 0 byte"
);
arena_hash_map.mutate_or_create(column_name.as_bytes(), updater);
}

impl ColumnarWriter {
pub fn mem_usage(&self) -> usize {
self.arena.mem_usage()
Expand Down Expand Up @@ -175,9 +159,8 @@ impl ColumnarWriter {
},
&mut self.dictionaries,
);
mutate_or_create_column(
hash_map,
column_name,
hash_map.mutate_or_create(
column_name.as_bytes(),
|column_opt: Option<StrOrBytesColumnWriter>| {
let mut column_writer = if let Some(column_writer) = column_opt {
column_writer
Expand All @@ -192,34 +175,30 @@ impl ColumnarWriter {
);
}
ColumnType::Bool => {
mutate_or_create_column(
&mut self.bool_field_hash_map,
column_name,
self.bool_field_hash_map.mutate_or_create(
column_name.as_bytes(),
|column_opt: Option<ColumnWriter>| column_opt.unwrap_or_default(),
);
}
ColumnType::DateTime => {
mutate_or_create_column(
&mut self.datetime_field_hash_map,
column_name,
self.datetime_field_hash_map.mutate_or_create(
column_name.as_bytes(),
|column_opt: Option<ColumnWriter>| column_opt.unwrap_or_default(),
);
}
ColumnType::I64 | ColumnType::F64 | ColumnType::U64 => {
let numerical_type = column_type.numerical_type().unwrap();
mutate_or_create_column(
&mut self.numerical_field_hash_map,
column_name,
self.numerical_field_hash_map.mutate_or_create(
column_name.as_bytes(),
|column_opt: Option<NumericalColumnWriter>| {
let mut column: NumericalColumnWriter = column_opt.unwrap_or_default();
column.force_numerical_type(numerical_type);
column
},
);
}
ColumnType::IpAddr => mutate_or_create_column(
&mut self.ip_addr_field_hash_map,
column_name,
ColumnType::IpAddr => self.ip_addr_field_hash_map.mutate_or_create(
column_name.as_bytes(),
|column_opt: Option<ColumnWriter>| column_opt.unwrap_or_default(),
),
}
Expand All @@ -232,9 +211,8 @@ impl ColumnarWriter {
numerical_value: T,
) {
let (hash_map, arena) = (&mut self.numerical_field_hash_map, &mut self.arena);
mutate_or_create_column(
hash_map,
column_name,
hash_map.mutate_or_create(
column_name.as_bytes(),
|column_opt: Option<NumericalColumnWriter>| {
let mut column: NumericalColumnWriter = column_opt.unwrap_or_default();
column.record_numerical_value(doc, numerical_value.into(), arena);
Expand All @@ -244,10 +222,6 @@ impl ColumnarWriter {
}

pub fn record_ip_addr(&mut self, doc: RowId, column_name: &str, ip_addr: Ipv6Addr) {
assert!(
!column_name.as_bytes().contains(&0u8),
"key may not contain the 0 byte"
);
let (hash_map, arena) = (&mut self.ip_addr_field_hash_map, &mut self.arena);
hash_map.mutate_or_create(
column_name.as_bytes(),
Expand All @@ -261,24 +235,30 @@ impl ColumnarWriter {

pub fn record_bool(&mut self, doc: RowId, column_name: &str, val: bool) {
let (hash_map, arena) = (&mut self.bool_field_hash_map, &mut self.arena);
mutate_or_create_column(hash_map, column_name, |column_opt: Option<ColumnWriter>| {
let mut column: ColumnWriter = column_opt.unwrap_or_default();
column.record(doc, val, arena);
column
});
hash_map.mutate_or_create(
column_name.as_bytes(),
|column_opt: Option<ColumnWriter>| {
let mut column: ColumnWriter = column_opt.unwrap_or_default();
column.record(doc, val, arena);
column
},
);
}

pub fn record_datetime(&mut self, doc: RowId, column_name: &str, datetime: common::DateTime) {
let (hash_map, arena) = (&mut self.datetime_field_hash_map, &mut self.arena);
mutate_or_create_column(hash_map, column_name, |column_opt: Option<ColumnWriter>| {
let mut column: ColumnWriter = column_opt.unwrap_or_default();
column.record(
doc,
NumericalValue::I64(datetime.into_timestamp_nanos()),
arena,
);
column
});
hash_map.mutate_or_create(
column_name.as_bytes(),
|column_opt: Option<ColumnWriter>| {
let mut column: ColumnWriter = column_opt.unwrap_or_default();
column.record(
doc,
NumericalValue::I64(datetime.into_timestamp_nanos()),
arena,
);
column
},
);
}

pub fn record_str(&mut self, doc: RowId, column_name: &str, value: &str) {
Expand All @@ -303,10 +283,6 @@ impl ColumnarWriter {
}

pub fn record_bytes(&mut self, doc: RowId, column_name: &str, value: &[u8]) {
assert!(
!column_name.as_bytes().contains(&0u8),
"key may not contain the 0 byte"
);
let (hash_map, arena, dictionaries) = (
&mut self.bytes_field_hash_map,
&mut self.arena,
Expand Down
60 changes: 59 additions & 1 deletion src/indexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ mod tests_mmap {
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(field=>json!({format!("{field_name_in}"): "test1"})))
.add_document(doc!(field=>json!({format!("{field_name_in}"): "test1", format!("num{field_name_in}"): 10})))
.unwrap();
index_writer
.add_document(doc!(field=>json!({format!("a{field_name_in}"): "test2"})))
Expand Down Expand Up @@ -260,6 +260,64 @@ mod tests_mmap {
"test6",
);
test_agg(format!("json.{field_name_out}a").as_str(), "test7");

// `.` is stored as `\u{0001}` internally in tantivy
let field_name_out_internal = if field_name_out == "." {
"\u{0001}"
} else {
field_name_out
};

let mut fields = reader.searcher().segment_readers()[0]
.inverted_index(field)
.unwrap()
.list_encoded_fields()
.unwrap();
assert_eq!(fields.len(), 8);
fields.sort();
let mut expected_fields = vec![
(format!("a{field_name_out_internal}"), Type::Str),
(format!("a{field_name_out_internal}a"), Type::Str),
(
format!("a{field_name_out_internal}a{field_name_out_internal}"),
Type::Str,
),
(
format!("a{field_name_out_internal}\u{1}ab{field_name_out_internal}"),
Type::Str,
),
(
format!("a{field_name_out_internal}\u{1}a{field_name_out_internal}"),
Type::Str,
),
(format!("{field_name_out_internal}a"), Type::Str),
(format!("{field_name_out_internal}"), Type::Str),

Check warning on line 294 in src/indexer/mod.rs

View workflow job for this annotation

GitHub Actions / clippy

useless use of `format!`

warning: useless use of `format!` --> src/indexer/mod.rs:294:14 | 294 | (format!("{field_name_out_internal}"), Type::Str), | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ help: consider using `.to_string()`: `field_name_out_internal.to_string()` | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#useless_format = note: `#[warn(clippy::useless_format)]` on by default
(format!("num{field_name_out_internal}"), Type::I64),
];
expected_fields.sort();
assert_eq!(fields, expected_fields);
// Check columnar reader
let mut columns = reader.searcher().segment_readers()[0]
.fast_fields()
.columnar()
.list_columns()
.unwrap()
.into_iter()
.map(|(name, _)| name)
.collect::<Vec<_>>();
let mut expected_columns = vec![
format!("json\u{1}{field_name_out_internal}"),
format!("json\u{1}{field_name_out_internal}a"),
format!("json\u{1}a{field_name_out_internal}"),
format!("json\u{1}a{field_name_out_internal}a"),
format!("json\u{1}a{field_name_out_internal}a{field_name_out_internal}"),
format!("json\u{1}a{field_name_out_internal}\u{1}ab{field_name_out_internal}"),
format!("json\u{1}a{field_name_out_internal}\u{1}a{field_name_out_internal}"),
format!("json\u{1}num{field_name_out_internal}"),
];
columns.sort();
expected_columns.sort();
assert_eq!(columns, expected_columns);
}

#[test]
Expand Down

0 comments on commit 99a59ad

Please sign in to comment.