From e1679f3fb943bdcf9effc1ce3613637e6e87b39d Mon Sep 17 00:00:00 2001 From: PSeitz Date: Tue, 21 May 2024 10:16:08 +0200 Subject: [PATCH] compact doc (#2402) * compact doc * add any value type * pass references when building CompactDoc * remove OwnedValue from API * clippy * clippy * fail on large documents * fmt * cleanup * cleanup * implement Value for different types fix serde_json date Value implementation * fmt * cleanup * fmt * cleanup * store positions instead of pos+len * remove nodes array * remove mediumvec * cleanup * infallible serialize into vec * remove positions indirection * remove 24MB limitation in document use u32 for Addr Remove the 3 byte addressing limitation and use VInt instead * cleanup * extend test * cleanup, add comments * rename, remove pub --- benches/index-bench.rs | 10 +- common/src/vint.rs | 2 +- examples/date_time_field.rs | 11 +- examples/faceted_search_with_tweaked_score.rs | 5 +- src/fastfield/facet_reader.rs | 1 - src/indexer/doc_id_mapping.rs | 12 +- src/indexer/index_writer.rs | 3 +- src/indexer/merger.rs | 1 - src/indexer/merger_sorted_index_test.rs | 1 - src/indexer/segment_writer.rs | 26 +- src/lib.rs | 15 +- src/macros.rs | 3 +- src/schema/document/default_document.rs | 678 +++++++++++++++--- src/schema/document/existing_type_impls.rs | 152 +++- src/schema/document/mod.rs | 6 +- src/schema/document/owned_value.rs | 12 +- src/schema/document/se.rs | 2 + src/schema/document/value.rs | 63 ++ src/schema/field_type.rs | 39 +- src/schema/field_value.rs | 46 -- src/schema/mod.rs | 2 - src/schema/schema.rs | 12 +- src/store/mod.rs | 14 +- src/store/reader.rs | 1 - 24 files changed, 883 insertions(+), 234 deletions(-) delete mode 100644 src/schema/field_value.rs diff --git a/benches/index-bench.rs b/benches/index-bench.rs index f9ae63b686..eb48d4487e 100644 --- a/benches/index-bench.rs +++ b/benches/index-bench.rs @@ -18,7 +18,7 @@ fn benchmark( benchmark_dynamic_json(b, input, schema, commit, parse_json) } else { _benchmark(b, 
input, schema, commit, parse_json, |schema, doc_json| { - TantivyDocument::parse_json(&schema, doc_json).unwrap() + TantivyDocument::parse_json(schema, doc_json).unwrap() }) } } @@ -90,8 +90,7 @@ fn benchmark_dynamic_json( ) { let json_field = schema.get_field("json").unwrap(); _benchmark(b, input, schema, commit, parse_json, |_schema, doc_json| { - let json_val: serde_json::Map = - serde_json::from_str(doc_json).unwrap(); + let json_val: serde_json::Value = serde_json::from_str(doc_json).unwrap(); tantivy::doc!(json_field=>json_val) }) } @@ -138,12 +137,13 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) { for (prefix, schema, is_dynamic) in benches { for commit in [false, true] { let suffix = if commit { "with-commit" } else { "no-commit" }; - for parse_json in [false] { + { + let parse_json = false; // for parse_json in [false, true] { let suffix = if parse_json { format!("{}-with-json-parsing", suffix) } else { - format!("{}", suffix) + suffix.to_string() }; let bench_name = format!("{}{}", prefix, suffix); diff --git a/common/src/vint.rs b/common/src/vint.rs index 64d4f7e145..b09e73b928 100644 --- a/common/src/vint.rs +++ b/common/src/vint.rs @@ -151,7 +151,7 @@ pub fn read_u32_vint_no_advance(data: &[u8]) -> (u32, usize) { (result, vlen) } /// Write a `u32` as a vint payload. 
-pub fn write_u32_vint(val: u32, writer: &mut W) -> io::Result<()> { +pub fn write_u32_vint(val: u32, writer: &mut W) -> io::Result<()> { let mut buf = [0u8; 8]; let data = serialize_vint_u32(val, &mut buf); writer.write_all(data) diff --git a/examples/date_time_field.rs b/examples/date_time_field.rs index aba9411d3e..658c66aeae 100644 --- a/examples/date_time_field.rs +++ b/examples/date_time_field.rs @@ -4,7 +4,7 @@ use tantivy::collector::TopDocs; use tantivy::query::QueryParser; -use tantivy::schema::{DateOptions, Document, OwnedValue, Schema, INDEXED, STORED, STRING}; +use tantivy::schema::{DateOptions, Document, Schema, INDEXED, STORED, STRING}; use tantivy::{Index, IndexWriter, TantivyDocument}; fn main() -> tantivy::Result<()> { @@ -61,10 +61,11 @@ fn main() -> tantivy::Result<()> { assert_eq!(count_docs.len(), 1); for (_score, doc_address) in count_docs { let retrieved_doc = searcher.doc::(doc_address)?; - assert!(matches!( - retrieved_doc.get_first(occurred_at), - Some(OwnedValue::Date(_)) - )); + assert!(retrieved_doc + .get_first(occurred_at) + .unwrap() + .as_datetime() + .is_some(),); assert_eq!( retrieved_doc.to_json(&schema), r#"{"event":["comment"],"occurred_at":["2022-06-22T13:00:00.22Z"]}"# diff --git a/examples/faceted_search_with_tweaked_score.rs b/examples/faceted_search_with_tweaked_score.rs index 371818eda3..d21a1c3d4c 100644 --- a/examples/faceted_search_with_tweaked_score.rs +++ b/examples/faceted_search_with_tweaked_score.rs @@ -51,7 +51,7 @@ fn main() -> tantivy::Result<()> { let reader = index.reader()?; let searcher = reader.searcher(); { - let facets = vec![ + let facets = [ Facet::from("/ingredient/egg"), Facet::from("/ingredient/oil"), Facet::from("/ingredient/garlic"), @@ -94,9 +94,8 @@ fn main() -> tantivy::Result<()> { .doc::(*doc_id) .unwrap() .get_first(title) - .and_then(|v| v.as_str()) + .and_then(|v| v.as_str().map(|el| el.to_string())) .unwrap() - .to_owned() }) .collect(); assert_eq!(titles, vec!["Fried egg", "Egg 
rolls"]); diff --git a/src/fastfield/facet_reader.rs b/src/fastfield/facet_reader.rs index 7312427796..dc62b2c1a4 100644 --- a/src/fastfield/facet_reader.rs +++ b/src/fastfield/facet_reader.rs @@ -62,7 +62,6 @@ impl FacetReader { #[cfg(test)] mod tests { - use crate::schema::document::Value; use crate::schema::{Facet, FacetOptions, SchemaBuilder, STORED}; use crate::{DocAddress, Index, IndexWriter, TantivyDocument}; diff --git a/src/indexer/doc_id_mapping.rs b/src/indexer/doc_id_mapping.rs index 63460eda30..b3c1ea2f08 100644 --- a/src/indexer/doc_id_mapping.rs +++ b/src/indexer/doc_id_mapping.rs @@ -306,12 +306,10 @@ mod tests_indexsorting { let my_string_field = index.schema().get_field("string_field").unwrap(); let searcher = index.reader()?.searcher(); { - assert_eq!( - searcher - .doc::(DocAddress::new(0, 0))? - .get_first(my_string_field), - None - ); + assert!(searcher + .doc::(DocAddress::new(0, 0))? + .get_first(my_string_field) + .is_none()); assert_eq!( searcher .doc::(DocAddress::new(0, 3))? 
@@ -344,7 +342,7 @@ mod tests_indexsorting { Some("blublub") ); let doc = searcher.doc::(DocAddress::new(0, 4))?; - assert_eq!(doc.get_first(my_string_field), None); + assert!(doc.get_first(my_string_field).is_none()); } // sort by field desc let index = create_test_index( diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index c13853bc0e..f12310a29a 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -814,7 +814,6 @@ mod tests { use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN; use crate::indexer::NoMergePolicy; use crate::query::{BooleanQuery, Occur, Query, QueryParser, TermQuery}; - use crate::schema::document::Value; use crate::schema::{ self, Facet, FacetOptions, IndexRecordOption, IpAddrOptions, NumericOptions, Schema, TextFieldIndexing, TextOptions, FAST, INDEXED, STORED, STRING, TEXT, @@ -2013,7 +2012,7 @@ mod tests { let mut bool2 = doc.get_all(multi_bools); assert_eq!(bool, bool2.next().unwrap().as_bool().unwrap()); assert_ne!(bool, bool2.next().unwrap().as_bool().unwrap()); - assert_eq!(None, bool2.next()) + assert!(bool2.next().is_none()) } } } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 9dd027a594..88494e8dfa 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -795,7 +795,6 @@ mod tests { use crate::collector::{Count, FacetCollector}; use crate::index::{Index, SegmentId}; use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery}; - use crate::schema::document::Value; use crate::schema::{ Facet, FacetOptions, IndexRecordOption, NumericOptions, TantivyDocument, Term, TextFieldIndexing, INDEXED, TEXT, diff --git a/src/indexer/merger_sorted_index_test.rs b/src/indexer/merger_sorted_index_test.rs index d698b357fa..9f345845d4 100644 --- a/src/indexer/merger_sorted_index_test.rs +++ b/src/indexer/merger_sorted_index_test.rs @@ -5,7 +5,6 @@ mod tests { use crate::index::Index; use crate::postings::Postings; use crate::query::QueryParser; - use 
crate::schema::document::Value; use crate::schema::{ self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions, TextFieldIndexing, TextOptions, diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 0e1be366c2..4a02b0a19a 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -499,7 +499,6 @@ mod tests { use crate::fastfield::FastValue; use crate::postings::{Postings, TermInfo}; use crate::query::{PhraseQuery, QueryParser}; - use crate::schema::document::Value; use crate::schema::{ Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, STORED, STRING, TEXT, @@ -555,9 +554,12 @@ mod tests { let reader = StoreReader::open(directory.open_read(path).unwrap(), 0).unwrap(); let doc = reader.get::(0).unwrap(); - assert_eq!(doc.field_values().len(), 2); - assert_eq!(doc.field_values()[0].value().as_str(), Some("A")); - assert_eq!(doc.field_values()[1].value().as_str(), Some("title")); + assert_eq!(doc.field_values().count(), 2); + assert_eq!(doc.get_all(text_field).next().unwrap().as_str(), Some("A")); + assert_eq!( + doc.get_all(text_field).nth(1).unwrap().as_str(), + Some("title") + ); } #[test] fn test_simple_json_indexing() { @@ -641,7 +643,7 @@ mod tests { let mut schema_builder = Schema::builder(); let json_field = schema_builder.add_json_field("json", STORED | TEXT); let schema = schema_builder.build(); - let json_val: serde_json::Map = serde_json::from_str( + let json_val: serde_json::Value = serde_json::from_str( r#"{ "toto": "titi", "float": -0.2, @@ -669,14 +671,10 @@ mod tests { doc_id: 0u32, }) .unwrap(); - let serdeser_json_val = serde_json::from_str::>( - &doc.to_json(&schema), - ) - .unwrap() - .get("json") - .unwrap()[0] - .as_object() + let serdeser_json_val = serde_json::from_str::(&doc.to_json(&schema)) .unwrap() + .get("json") + .unwrap()[0] .clone(); assert_eq!(json_val, serdeser_json_val); let segment_reader = searcher.segment_reader(0u32); @@ -840,7 
+838,7 @@ mod tests { let mut schema_builder = Schema::builder(); let json_field = schema_builder.add_json_field("json", STRING); let schema = schema_builder.build(); - let json_val: serde_json::Map = + let json_val: serde_json::Value = serde_json::from_str(r#"{"mykey": "two tokens"}"#).unwrap(); let doc = doc!(json_field=>json_val); let index = Index::create_in_ram(schema); @@ -880,7 +878,7 @@ mod tests { let mut schema_builder = Schema::builder(); let json_field = schema_builder.add_json_field("json", TEXT); let schema = schema_builder.build(); - let json_val: serde_json::Map = serde_json::from_str( + let json_val: serde_json::Value = serde_json::from_str( r#"{"mykey": [{"field": "hello happy tax payer"}, {"field": "nothello"}]}"#, ) .unwrap(); diff --git a/src/lib.rs b/src/lib.rs index 85b71b3c8b..5ac35c98dd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -935,7 +935,7 @@ pub mod tests { let mut schema_builder = Schema::builder(); let json_field = schema_builder.add_json_field("json", STORED | TEXT); let schema = schema_builder.build(); - let json_val: serde_json::Map = serde_json::from_str( + let json_val: serde_json::Value = serde_json::from_str( r#"{ "signed": 2, "float": 2.0, @@ -1025,13 +1025,16 @@ pub mod tests { text_field => "some other value", other_text_field => "short"); assert_eq!(document.len(), 3); - let values: Vec<&OwnedValue> = document.get_all(text_field).collect(); + let values: Vec = document.get_all(text_field).map(OwnedValue::from).collect(); assert_eq!(values.len(), 2); - assert_eq!(values[0].as_str(), Some("tantivy")); - assert_eq!(values[1].as_str(), Some("some other value")); - let values: Vec<&OwnedValue> = document.get_all(other_text_field).collect(); + assert_eq!(values[0].as_ref().as_str(), Some("tantivy")); + assert_eq!(values[1].as_ref().as_str(), Some("some other value")); + let values: Vec = document + .get_all(other_text_field) + .map(OwnedValue::from) + .collect(); assert_eq!(values.len(), 1); - assert_eq!(values[0].as_str(), 
Some("short")); + assert_eq!(values[0].as_ref().as_str(), Some("short")); } #[test] diff --git a/src/macros.rs b/src/macros.rs index 2303a7f11c..ad32154d58 100644 --- a/src/macros.rs +++ b/src/macros.rs @@ -41,6 +41,7 @@ /// ); /// # } /// ``` + #[macro_export] macro_rules! doc( () => { @@ -52,7 +53,7 @@ macro_rules! doc( { let mut document = $crate::TantivyDocument::default(); $( - document.add_field_value($field, $value); + document.add_field_value($field, &$value); )* document } diff --git a/src/schema/document/default_document.rs b/src/schema/document/default_document.rs index fcf374dfed..5b65fc3ebc 100644 --- a/src/schema/document/default_document.rs +++ b/src/schema/document/default_document.rs @@ -1,93 +1,64 @@ use std::collections::{BTreeMap, HashMap, HashSet}; +use std::io::{self, Read, Write}; use std::net::Ipv6Addr; -use common::DateTime; +use columnar::MonotonicallyMappableToU128; +use common::{read_u32_vint_no_advance, serialize_vint_u32, BinarySerializable, DateTime, VInt}; use serde_json::Map; +pub use CompactDoc as TantivyDocument; +use super::{ReferenceValue, ReferenceValueLeaf, Value}; use crate::schema::document::{ DeserializeError, Document, DocumentDeserialize, DocumentDeserializer, }; use crate::schema::field_type::ValueParsingError; -use crate::schema::field_value::FieldValueIter; -use crate::schema::{Facet, Field, FieldValue, NamedFieldDocument, OwnedValue, Schema}; +use crate::schema::{Facet, Field, NamedFieldDocument, OwnedValue, Schema}; use crate::tokenizer::PreTokenizedString; -/// TantivyDocument provides a default implementation of the `Document` trait. -/// It is the object that can be indexed and then searched for. -/// -/// Documents are fundamentally a collection of unordered couples `(field, value)`. -/// In this list, one field may appear more than once. 
-#[derive(Clone, Debug, serde::Serialize, serde::Deserialize, Default)] -pub struct TantivyDocument { - field_values: Vec, +#[repr(packed)] +#[derive(Debug, Clone)] +/// A field value pair in the compact tantivy document +struct FieldValueAddr { + pub field: u16, + pub value_addr: ValueAddr, } -impl Document for TantivyDocument { - type Value<'a> = &'a OwnedValue; - type FieldsValuesIter<'a> = FieldValueIter<'a>; - - fn iter_fields_and_values(&self) -> Self::FieldsValuesIter<'_> { - FieldValueIter(self.field_values.iter()) - } +#[derive(Debug, Clone)] +/// The default document in tantivy. It encodes data in a compact form. +pub struct CompactDoc { + /// `node_data` is a vec of bytes, where each value is serialized into bytes and stored. It + /// includes all the data of the document and also metadata like where the nodes are located + /// in an object or array. + pub node_data: Vec, + /// The root (Field, Value) pairs + field_values: Vec, } -impl DocumentDeserialize for TantivyDocument { - fn deserialize<'de, D>(mut deserializer: D) -> Result - where D: DocumentDeserializer<'de> { - let mut field_values = Vec::with_capacity(deserializer.size_hint()); - - while let Some((field, value)) = deserializer.next_field()? 
{ - field_values.push(FieldValue::new(field, value)); - } - - Ok(Self { field_values }) +impl Default for CompactDoc { + fn default() -> Self { + Self::new() } } -impl From> for TantivyDocument { - fn from(field_values: Vec) -> Self { - Self { field_values } - } -} - -impl PartialEq for TantivyDocument { - fn eq(&self, other: &Self) -> bool { - // super slow, but only here for tests - let convert_to_comparable_map = |field_values: &[FieldValue]| { - let mut field_value_set: HashMap> = Default::default(); - for field_value in field_values.iter() { - let value = serde_json::to_string(field_value.value()).unwrap(); - field_value_set - .entry(field_value.field()) - .or_default() - .insert(value); - } - field_value_set - }; - let self_field_values: HashMap> = - convert_to_comparable_map(&self.field_values); - let other_field_values: HashMap> = - convert_to_comparable_map(&other.field_values); - self_field_values.eq(&other_field_values) +impl CompactDoc { + /// Creates a new, empty document object + /// The reserved capacity is for the total serialized data + pub fn with_capacity(bytes: usize) -> CompactDoc { + CompactDoc { + node_data: Vec::with_capacity(bytes), + field_values: Vec::with_capacity(4), + } } -} - -impl Eq for TantivyDocument {} - -impl IntoIterator for TantivyDocument { - type Item = FieldValue; - type IntoIter = std::vec::IntoIter; - - fn into_iter(self) -> Self::IntoIter { - self.field_values.into_iter() + /// Creates a new, empty document object + pub fn new() -> CompactDoc { + CompactDoc::with_capacity(1024) } -} -impl TantivyDocument { - /// Creates a new, empty document object - pub fn new() -> TantivyDocument { - TantivyDocument::default() + /// Shrinks the capacity of the document to fit the data + pub fn shrink_to_fit(&mut self) { + self.node_data.shrink_to_fit(); + self.field_values.shrink_to_fit(); } /// Returns the length of the document.
@@ -99,83 +70,116 @@ impl TantivyDocument { pub fn add_facet(&mut self, field: Field, path: F) where Facet: From { let facet = Facet::from(path); - let value = OwnedValue::Facet(facet); - self.add_field_value(field, value); + self.add_leaf_field_value(field, ReferenceValueLeaf::Facet(facet.encoded_str())); } /// Add a text field. - pub fn add_text(&mut self, field: Field, text: S) { - let value = OwnedValue::Str(text.to_string()); - self.add_field_value(field, value); + pub fn add_text>(&mut self, field: Field, text: S) { + self.add_leaf_field_value(field, ReferenceValueLeaf::Str(text.as_ref())); } /// Add a pre-tokenized text field. pub fn add_pre_tokenized_text(&mut self, field: Field, pre_tokenized_text: PreTokenizedString) { - self.add_field_value(field, pre_tokenized_text); + self.add_leaf_field_value(field, pre_tokenized_text); } /// Add a u64 field pub fn add_u64(&mut self, field: Field, value: u64) { - self.add_field_value(field, value); + self.add_leaf_field_value(field, value); } /// Add a IP address field. Internally only Ipv6Addr is used. 
pub fn add_ip_addr(&mut self, field: Field, value: Ipv6Addr) { - self.add_field_value(field, value); + self.add_leaf_field_value(field, value); } /// Add a i64 field pub fn add_i64(&mut self, field: Field, value: i64) { - self.add_field_value(field, value); + self.add_leaf_field_value(field, value); } /// Add a f64 field pub fn add_f64(&mut self, field: Field, value: f64) { - self.add_field_value(field, value); + self.add_leaf_field_value(field, value); } /// Add a bool field pub fn add_bool(&mut self, field: Field, value: bool) { - self.add_field_value(field, value); + self.add_leaf_field_value(field, value); } /// Add a date field with unspecified time zone offset pub fn add_date(&mut self, field: Field, value: DateTime) { - self.add_field_value(field, value); + self.add_leaf_field_value(field, value); } /// Add a bytes field - pub fn add_bytes>>(&mut self, field: Field, value: T) { - self.add_field_value(field, value.into()); + pub fn add_bytes(&mut self, field: Field, value: &[u8]) { + self.add_leaf_field_value(field, value); } /// Add a dynamic object field pub fn add_object(&mut self, field: Field, object: BTreeMap) { - self.add_field_value(field, object); + self.add_field_value(field, &OwnedValue::from(object)); } /// Add a (field, value) to the document. - pub fn add_field_value>(&mut self, field: Field, typed_val: T) { + /// + /// `OwnedValue` implements Value, which should be easiest to use, but is not the most + /// performant. + pub fn add_field_value<'a, V: Value<'a>>(&mut self, field: Field, value: V) { + let field_value = FieldValueAddr { + field: field + .field_id() + .try_into() + .expect("support only up to u16::MAX field ids"), + value_addr: self.add_value(value), + }; + self.field_values.push(field_value); + } + + /// Add a (field, leaf value) to the document. + /// Leaf values don't have nested values. 
+ pub fn add_leaf_field_value<'a, T: Into>>( + &mut self, + field: Field, + typed_val: T, + ) { let value = typed_val.into(); - let field_value = FieldValue { field, value }; + let field_value = FieldValueAddr { + field: field + .field_id() + .try_into() + .expect("support only up to u16::MAX field ids"), + value_addr: self.add_value_leaf(value), + }; self.field_values.push(field_value); } /// field_values accessor - pub fn field_values(&self) -> &[FieldValue] { - &self.field_values + pub fn field_values( + &self, + ) -> impl Iterator>)> { + self.field_values.iter().map(|field_val| { + let field = Field::from_field_id(field_val.field as u32); + let val = self.extract_value(field_val.value_addr).unwrap(); + (field, val) + }) } - /// Returns all of the `FieldValue`s associated the given field - pub fn get_all(&self, field: Field) -> impl Iterator { + /// Returns all of the `ReferenceValue`s associated the given field + pub fn get_all( + &self, + field: Field, + ) -> impl Iterator>> + '_ { self.field_values .iter() - .filter(move |field_value| field_value.field() == field) - .map(FieldValue::value) + .filter(move |field_value| Field::from_field_id(field_value.field as u32) == field) + .map(|val| self.extract_value(val.value_addr).unwrap()) } - /// Returns the first `FieldValue` associated the given field - pub fn get_first(&self, field: Field) -> Option<&OwnedValue> { + /// Returns the first `ReferenceValue` associated the given field + pub fn get_first(&self, field: Field) -> Option>> { self.get_all(field).next() } @@ -183,12 +187,12 @@ impl TantivyDocument { pub fn convert_named_doc( schema: &Schema, named_doc: NamedFieldDocument, - ) -> Result { - let mut document = TantivyDocument::new(); + ) -> Result { + let mut document = Self::new(); for (field_name, values) in named_doc.0 { if let Ok(field) = schema.get_field(&field_name) { for value in values { - document.add_field_value(field, value); + document.add_field_value(field, &value); } } } @@ -196,7 +200,7 @@ impl 
TantivyDocument { } /// Build a document object from a json-object. - pub fn parse_json(schema: &Schema, doc_json: &str) -> Result { + pub fn parse_json(schema: &Schema, doc_json: &str) -> Result { let json_obj: Map = serde_json::from_str(doc_json).map_err(|_| DocParsingError::invalid_json(doc_json))?; Self::from_json_object(schema, json_obj) @@ -206,8 +210,8 @@ impl TantivyDocument { pub fn from_json_object( schema: &Schema, json_obj: Map, - ) -> Result { - let mut doc = TantivyDocument::default(); + ) -> Result { + let mut doc = Self::default(); for (field_name, json_value) in json_obj { if let Ok(field) = schema.get_field(&field_name) { let field_entry = schema.get_field_entry(field); @@ -218,20 +222,457 @@ impl TantivyDocument { let value = field_type .value_from_json(json_item) .map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?; - doc.add_field_value(field, value); + doc.add_field_value(field, &value); } } _ => { let value = field_type .value_from_json(json_value) .map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?; - doc.add_field_value(field, value); + doc.add_field_value(field, &value); } } } } Ok(doc) } + + fn add_value_leaf(&mut self, leaf: ReferenceValueLeaf) -> ValueAddr { + let type_id = ValueType::from(&leaf); + // Write into `node_data` and return u32 position as its address + // Null and bool are inlined into the address + let val_addr = match leaf { + ReferenceValueLeaf::Null => 0, + ReferenceValueLeaf::Str(bytes) => { + write_bytes_into(&mut self.node_data, bytes.as_bytes()) + } + ReferenceValueLeaf::Facet(bytes) => { + write_bytes_into(&mut self.node_data, bytes.as_bytes()) + } + ReferenceValueLeaf::Bytes(bytes) => write_bytes_into(&mut self.node_data, bytes), + ReferenceValueLeaf::U64(num) => write_into(&mut self.node_data, num), + ReferenceValueLeaf::I64(num) => write_into(&mut self.node_data, num), + ReferenceValueLeaf::F64(num) => write_into(&mut self.node_data, num), + ReferenceValueLeaf::Bool(b) => b as u32, + 
ReferenceValueLeaf::Date(date) => { + write_into(&mut self.node_data, date.into_timestamp_nanos()) + } + ReferenceValueLeaf::IpAddr(num) => write_into(&mut self.node_data, num.to_u128()), + ReferenceValueLeaf::PreTokStr(pre_tok) => write_into(&mut self.node_data, *pre_tok), + }; + ValueAddr { type_id, val_addr } + } + /// Adds a value to `node_data` and returns its `ValueAddr` + fn add_value<'a, V: Value<'a>>(&mut self, value: V) -> ValueAddr { + let value = value.as_value(); + let type_id = ValueType::from(&value); + match value { + ReferenceValue::Leaf(leaf) => self.add_value_leaf(leaf), + ReferenceValue::Array(elements) => { + // addresses of the elements in node_data + // Reusing a vec would be nicer, but it's not easy because of the recursion + // A global vec would work if every writer gets its discriminator + let mut addresses = Vec::new(); + for elem in elements { + let value_addr = self.add_value(elem); + write_into(&mut addresses, value_addr); + } + ValueAddr { + type_id, + val_addr: write_bytes_into(&mut self.node_data, &addresses), + } + } + ReferenceValue::Object(entries) => { + // addresses of the elements in node_data + let mut addresses = Vec::new(); + for (key, value) in entries { + let key_addr = self.add_value_leaf(ReferenceValueLeaf::Str(key)); + let value_addr = self.add_value(value); + write_into(&mut addresses, key_addr); + write_into(&mut addresses, value_addr); + } + ValueAddr { + type_id, + val_addr: write_bytes_into(&mut self.node_data, &addresses), + } + } + } + } + + fn extract_value( + &self, + ref_value: ValueAddr, + ) -> io::Result>> { + match ref_value.type_id { + ValueType::Null => Ok(ReferenceValueLeaf::Null.into()), + ValueType::Str => { + let str_ref = self.extract_str(ref_value.val_addr); + Ok(ReferenceValueLeaf::Str(str_ref).into()) + } + ValueType::Facet => { + let str_ref = self.extract_str(ref_value.val_addr); + Ok(ReferenceValueLeaf::Facet(str_ref).into()) + } + ValueType::Bytes => { + let data =
self.extract_bytes(ref_value.val_addr); + Ok(ReferenceValueLeaf::Bytes(data).into()) + } + ValueType::U64 => self + .read_from::(ref_value.val_addr) + .map(ReferenceValueLeaf::U64) + .map(Into::into), + ValueType::I64 => self + .read_from::(ref_value.val_addr) + .map(ReferenceValueLeaf::I64) + .map(Into::into), + ValueType::F64 => self + .read_from::(ref_value.val_addr) + .map(ReferenceValueLeaf::F64) + .map(Into::into), + ValueType::Bool => Ok(ReferenceValueLeaf::Bool(ref_value.val_addr != 0).into()), + ValueType::Date => self + .read_from::(ref_value.val_addr) + .map(|ts| ReferenceValueLeaf::Date(DateTime::from_timestamp_nanos(ts))) + .map(Into::into), + ValueType::IpAddr => self + .read_from::(ref_value.val_addr) + .map(|num| ReferenceValueLeaf::IpAddr(Ipv6Addr::from_u128(num))) + .map(Into::into), + ValueType::PreTokStr => self + .read_from::(ref_value.val_addr) + .map(Into::into) + .map(ReferenceValueLeaf::PreTokStr) + .map(Into::into), + ValueType::Object => Ok(ReferenceValue::Object(CompactDocObjectIter::new( + self, + ref_value.val_addr, + )?)), + ValueType::Array => Ok(ReferenceValue::Array(CompactDocArrayIter::new( + self, + ref_value.val_addr, + )?)), + } + } + + /// get &[u8] reference from node_data + fn extract_bytes(&self, addr: Addr) -> &[u8] { + binary_deserialize_bytes(self.get_slice(addr)) + } + + /// get &str reference from node_data + fn extract_str(&self, addr: Addr) -> &str { + let data = self.extract_bytes(addr); + // Utf-8 checks would have a noticeable performance overhead here + unsafe { std::str::from_utf8_unchecked(data) } + } + + /// deserialized owned value from node_data + fn read_from(&self, addr: Addr) -> io::Result { + let data_slice = &self.node_data[addr as usize..]; + let mut cursor = std::io::Cursor::new(data_slice); + T::deserialize(&mut cursor) + } + + /// get slice from address. The returned slice is open ended + fn get_slice(&self, addr: Addr) -> &[u8] { + &self.node_data[addr as usize..] 
+ } +} + +/// BinarySerializable alternative to read references +fn binary_deserialize_bytes(data: &[u8]) -> &[u8] { + let (len, bytes_read) = read_u32_vint_no_advance(data); + &data[bytes_read..bytes_read + len as usize] +} + +/// Write bytes and return the position of the written data. +/// +/// BinarySerializable alternative to write references +fn write_bytes_into(vec: &mut Vec, data: &[u8]) -> u32 { + let pos = vec.len() as u32; + let mut buf = [0u8; 8]; + let len_vint_bytes = serialize_vint_u32(data.len() as u32, &mut buf); + vec.extend_from_slice(len_vint_bytes); + vec.extend_from_slice(data); + pos +} + +/// Serialize and return the position +fn write_into(vec: &mut Vec, value: T) -> u32 { + let pos = vec.len() as u32; + value.serialize(vec).unwrap(); + pos +} + +impl PartialEq for CompactDoc { + fn eq(&self, other: &Self) -> bool { + // super slow, but only here for tests + let convert_to_comparable_map = |doc: &CompactDoc| { + let mut field_value_set: HashMap> = Default::default(); + for field_value in doc.field_values.iter() { + let value: OwnedValue = doc.extract_value(field_value.value_addr).unwrap().into(); + let value = serde_json::to_string(&value).unwrap(); + field_value_set + .entry(Field::from_field_id(field_value.field as u32)) + .or_default() + .insert(value); + } + field_value_set + }; + let self_field_values: HashMap> = convert_to_comparable_map(self); + let other_field_values: HashMap> = convert_to_comparable_map(other); + self_field_values.eq(&other_field_values) + } +} + +impl Eq for CompactDoc {} + +impl DocumentDeserialize for CompactDoc { + fn deserialize<'de, D>(mut deserializer: D) -> Result + where D: DocumentDeserializer<'de> { + let mut doc = CompactDoc::default(); + // TODO: Deserializing into OwnedValue is wasteful. The deserializer should be able to work + // on slices and referenced data. + while let Some((field, value)) = deserializer.next_field::()? 
{ + doc.add_field_value(field, &value); + } + Ok(doc) + } +} + +/// A value of Compact Doc needs a reference to the container to extract its payload +#[derive(Debug, Clone, Copy)] +pub struct CompactDocValue<'a> { + container: &'a CompactDoc, + value: ValueAddr, +} +impl<'a> Value<'a> for CompactDocValue<'a> { + type ArrayIter = CompactDocArrayIter<'a>; + + type ObjectIter = CompactDocObjectIter<'a>; + + fn as_value(&self) -> ReferenceValue<'a, Self> { + self.container.extract_value(self.value).unwrap() + } +} + +/// The address in the vec +type Addr = u32; + +#[derive(Clone, Copy, Default)] +#[repr(packed)] +/// The value type and the address to its payload in the container. +struct ValueAddr { + type_id: ValueType, + /// This is the address to the value in the vec, except for bool and null, which are inlined + val_addr: Addr, +} +impl BinarySerializable for ValueAddr { + fn serialize(&self, writer: &mut W) -> io::Result<()> { + self.type_id.serialize(writer)?; + VInt(self.val_addr as u64).serialize(writer) + } + + fn deserialize(reader: &mut R) -> io::Result { + let type_id = ValueType::deserialize(reader)?; + let val_addr = VInt::deserialize(reader)?.0 as u32; + Ok(ValueAddr { type_id, val_addr }) + } +} +impl std::fmt::Debug for ValueAddr { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let val_addr = self.val_addr; + f.write_fmt(format_args!("{:?} at {:?}", self.type_id, val_addr)) + } +} + +/// A enum representing a value for tantivy to index. +/// +/// Any changes need to be reflected in `BinarySerializable` for `ValueType` +/// +/// We can't use [schema::Type] or [columnar::ColumnType] here, because they are missing +/// some items like Array and PreTokStr. +#[derive(Default, Clone, Copy, Debug, PartialEq)] +#[repr(u8)] +pub enum ValueType { + /// A null value. + #[default] + Null = 0, + /// The str type is used for any text information. 
+ Str = 1, + /// Unsigned 64-bits Integer `u64` + U64 = 2, + /// Signed 64-bits Integer `i64` + I64 = 3, + /// 64-bits Float `f64` + F64 = 4, + /// Date/time with nanoseconds precision + Date = 5, + /// Facet + Facet = 6, + /// Arbitrarily sized byte array + Bytes = 7, + /// IpV6 Address. Internally there is no IpV4, it needs to be converted to `Ipv6Addr`. + IpAddr = 8, + /// Bool value + Bool = 9, + /// Pre-tokenized str type, + PreTokStr = 10, + /// Object + Object = 11, + /// Array + Array = 12, +} + +impl BinarySerializable for ValueType { + fn serialize(&self, writer: &mut W) -> io::Result<()> { + (*self as u8).serialize(writer)?; + Ok(()) + } + + fn deserialize(reader: &mut R) -> io::Result { + let num = u8::deserialize(reader)?; + let type_id = if (0..=12).contains(&num) { + unsafe { std::mem::transmute(num) } + } else { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("Invalid value type id: {}", num), + )); + }; + Ok(type_id) + } +} + +impl<'a, V: Value<'a>> From<&ReferenceValue<'a, V>> for ValueType { + fn from(value: &ReferenceValue<'a, V>) -> Self { + match value { + ReferenceValue::Leaf(leaf) => leaf.into(), + ReferenceValue::Array(_) => ValueType::Array, + ReferenceValue::Object(_) => ValueType::Object, + } + } +} +impl<'a> From<&ReferenceValueLeaf<'a>> for ValueType { + fn from(value: &ReferenceValueLeaf<'a>) -> Self { + match value { + ReferenceValueLeaf::Null => ValueType::Null, + ReferenceValueLeaf::Str(_) => ValueType::Str, + ReferenceValueLeaf::U64(_) => ValueType::U64, + ReferenceValueLeaf::I64(_) => ValueType::I64, + ReferenceValueLeaf::F64(_) => ValueType::F64, + ReferenceValueLeaf::Bool(_) => ValueType::Bool, + ReferenceValueLeaf::Date(_) => ValueType::Date, + ReferenceValueLeaf::IpAddr(_) => ValueType::IpAddr, + ReferenceValueLeaf::PreTokStr(_) => ValueType::PreTokStr, + ReferenceValueLeaf::Facet(_) => ValueType::Facet, + ReferenceValueLeaf::Bytes(_) => ValueType::Bytes, + } + } +} + +#[derive(Debug,
Clone)] +/// The Iterator for the object values in the compact document +pub struct CompactDocObjectIter<'a> { + container: &'a CompactDoc, + node_addresses_slice: &'a [u8], +} + +impl<'a> CompactDocObjectIter<'a> { + fn new(container: &'a CompactDoc, addr: Addr) -> io::Result { + // Objects are `&[ValueAddr]` serialized into bytes + let node_addresses_slice = container.extract_bytes(addr); + Ok(Self { + container, + node_addresses_slice, + }) + } +} + +impl<'a> Iterator for CompactDocObjectIter<'a> { + type Item = (&'a str, CompactDocValue<'a>); + + fn next(&mut self) -> Option { + if self.node_addresses_slice.is_empty() { + return None; + } + let key_addr = ValueAddr::deserialize(&mut self.node_addresses_slice).ok()?; + let key = self.container.extract_str(key_addr.val_addr); + let value = ValueAddr::deserialize(&mut self.node_addresses_slice).ok()?; + let value = CompactDocValue { + container: self.container, + value, + }; + return Some((key, value)); + } +} + +#[derive(Debug, Clone)] +/// The Iterator for the array values in the compact document +pub struct CompactDocArrayIter<'a> { + container: &'a CompactDoc, + node_addresses_slice: &'a [u8], +} + +impl<'a> CompactDocArrayIter<'a> { + fn new(container: &'a CompactDoc, addr: Addr) -> io::Result { + // Arrays are &[ValueAddr] serialized into bytes + let node_addresses_slice = container.extract_bytes(addr); + Ok(Self { + container, + node_addresses_slice, + }) + } +} + +impl<'a> Iterator for CompactDocArrayIter<'a> { + type Item = CompactDocValue<'a>; + + fn next(&mut self) -> Option { + if self.node_addresses_slice.is_empty() { + return None; + } + let value = ValueAddr::deserialize(&mut self.node_addresses_slice).ok()?; + let value = CompactDocValue { + container: self.container, + value, + }; + return Some(value); + } +} + +impl Document for CompactDoc { + type Value<'a> = CompactDocValue<'a>; + type FieldsValuesIter<'a> = FieldValueIterRef<'a>; + + fn iter_fields_and_values(&self) -> 
Self::FieldsValuesIter<'_> { + FieldValueIterRef { + slice: self.field_values.iter(), + container: self, + } + } +} + +/// A helper wrapper for creating an iterator over the field values +pub struct FieldValueIterRef<'a> { + slice: std::slice::Iter<'a, FieldValueAddr>, + container: &'a CompactDoc, +} + +impl<'a> Iterator for FieldValueIterRef<'a> { + type Item = (Field, CompactDocValue<'a>); + + fn next(&mut self) -> Option { + self.slice.next().map(|field_value| { + ( + Field::from_field_id(field_value.field as u32), + CompactDocValue::<'a> { + container: self.container, + value: field_value.value_addr, + }, + ) + }) + } } /// Error that may happen when deserializing @@ -264,7 +705,40 @@ mod tests { let text_field = schema_builder.add_text_field("title", TEXT); let mut doc = TantivyDocument::default(); doc.add_text(text_field, "My title"); - assert_eq!(doc.field_values().len(), 1); + assert_eq!(doc.field_values().count(), 1); + + let schema = schema_builder.build(); + let _val = doc.get_first(text_field).unwrap(); + let _json = doc.to_named_doc(&schema); + } + + #[test] + fn test_json_value() { + let json_str = r#"{ + "toto": "titi", + "float": -0.2, + "bool": true, + "unsigned": 1, + "signed": -2, + "complexobject": { + "field.with.dot": 1 + }, + "date": "1985-04-12T23:20:50.52Z", + "my_arr": [2, 3, {"my_key": "two tokens"}, 4, {"nested_array": [2, 5, 6, [7, 8, {"a": [{"d": {"e":[99]}}, 9000]}, 9, 10], [5, 5]]}] + }"#; + let json_val: std::collections::BTreeMap = + serde_json::from_str(json_str).unwrap(); + + let mut schema_builder = Schema::builder(); + let json_field = schema_builder.add_json_field("json", TEXT); + let mut doc = TantivyDocument::default(); + doc.add_object(json_field, json_val); + + let schema = schema_builder.build(); + let json = doc.to_json(&schema); + let actual_json: serde_json::Value = serde_json::from_str(&json).unwrap(); + let expected_json: serde_json::Value = serde_json::from_str(json_str).unwrap(); + 
assert_eq!(actual_json["json"][0], expected_json); } // TODO: Should this be re-added with the serialize method diff --git a/src/schema/document/existing_type_impls.rs b/src/schema/document/existing_type_impls.rs index eaeaed2cd9..fcda1989fe 100644 --- a/src/schema/document/existing_type_impls.rs +++ b/src/schema/document/existing_type_impls.rs @@ -5,21 +5,39 @@ //! and don't care about some of the more specialised types or only want to customise //! part of the document structure. use std::collections::{btree_map, hash_map, BTreeMap, HashMap}; +use std::iter::Empty; +use std::net::Ipv6Addr; +use common::DateTime; use serde_json::Number; +use time::format_description::well_known::Rfc3339; +use time::OffsetDateTime; +use super::facet::Facet; use super::ReferenceValueLeaf; use crate::schema::document::{ ArrayAccess, DeserializeError, Document, DocumentDeserialize, DocumentDeserializer, ObjectAccess, ReferenceValue, Value, ValueDeserialize, ValueDeserializer, ValueVisitor, }; use crate::schema::Field; +use crate::tokenizer::PreTokenizedString; // Serde compatibility support. 
+pub fn can_be_rfc3339_date_time(text: &str) -> bool { + if let Some(&first_byte) = text.as_bytes().first() { + if first_byte.is_ascii_digit() { + return true; + } + } + + false +} + impl<'a> Value<'a> for &'a serde_json::Value { type ArrayIter = std::slice::Iter<'a, serde_json::Value>; type ObjectIter = JsonObjectIter<'a>; + #[inline] fn as_value(&self) -> ReferenceValue<'a, Self> { match self { serde_json::Value::Null => ReferenceValueLeaf::Null.into(), @@ -35,7 +53,19 @@ impl<'a> Value<'a> for &'a serde_json::Value { panic!("Unsupported serde_json number {number}"); } } - serde_json::Value::String(val) => ReferenceValueLeaf::Str(val).into(), + serde_json::Value::String(text) => { + if can_be_rfc3339_date_time(text) { + match OffsetDateTime::parse(text, &Rfc3339) { + Ok(dt) => { + let dt_utc = dt.to_offset(time::UtcOffset::UTC); + ReferenceValueLeaf::Date(DateTime::from_utc(dt_utc)).into() + } + Err(_) => ReferenceValueLeaf::Str(text).into(), + } + } else { + ReferenceValueLeaf::Str(text).into() + } + } serde_json::Value::Array(elements) => ReferenceValue::Array(elements.iter()), serde_json::Value::Object(object) => { ReferenceValue::Object(JsonObjectIter(object.iter())) @@ -44,6 +74,126 @@ impl<'a> Value<'a> for &'a serde_json::Value { } } +impl<'a> Value<'a> for &'a String { + type ArrayIter = Empty<&'a String>; + type ObjectIter = Empty<(&'a str, &'a String)>; + #[inline] + fn as_value(&self) -> ReferenceValue<'a, Self> { + ReferenceValue::Leaf(ReferenceValueLeaf::Str(self)) + } +} + +impl<'a> Value<'a> for &'a Facet { + type ArrayIter = Empty<&'a Facet>; + type ObjectIter = Empty<(&'a str, &'a Facet)>; + #[inline] + fn as_value(&self) -> ReferenceValue<'a, Self> { + ReferenceValue::Leaf(ReferenceValueLeaf::Facet(self.encoded_str())) + } +} + +impl<'a> Value<'a> for &'a u64 { + type ArrayIter = Empty<&'a u64>; + type ObjectIter = Empty<(&'a str, &'a u64)>; + #[inline] + fn as_value(&self) -> ReferenceValue<'a, Self> { + 
ReferenceValue::Leaf(ReferenceValueLeaf::U64(**self)) + } +} + +impl<'a> Value<'a> for &'a i64 { + type ArrayIter = Empty<&'a i64>; + type ObjectIter = Empty<(&'a str, &'a i64)>; + #[inline] + fn as_value(&self) -> ReferenceValue<'a, Self> { + ReferenceValue::Leaf(ReferenceValueLeaf::I64(**self)) + } +} +impl<'a> Value<'a> for &'a f64 { + type ArrayIter = Empty<&'a f64>; + type ObjectIter = Empty<(&'a str, &'a f64)>; + #[inline] + fn as_value(&self) -> ReferenceValue<'a, Self> { + ReferenceValue::Leaf(ReferenceValueLeaf::F64(**self)) + } +} +impl<'a> Value<'a> for &'a bool { + type ArrayIter = Empty<&'a bool>; + type ObjectIter = Empty<(&'a str, &'a bool)>; + #[inline] + fn as_value(&self) -> ReferenceValue<'a, Self> { + ReferenceValue::Leaf(ReferenceValueLeaf::Bool(**self)) + } +} +impl<'a> Value<'a> for &'a str { + type ArrayIter = Empty<&'a str>; + type ObjectIter = Empty<(&'a str, &'a str)>; + #[inline] + fn as_value(&self) -> ReferenceValue<'a, Self> { + ReferenceValue::Leaf(ReferenceValueLeaf::Str(self)) + } +} +impl<'a> Value<'a> for &'a &'a str { + type ArrayIter = Empty<&'a &'a str>; + type ObjectIter = Empty<(&'a str, &'a &'a str)>; + #[inline] + fn as_value(&self) -> ReferenceValue<'a, Self> { + ReferenceValue::Leaf(ReferenceValueLeaf::Str(self)) + } +} + +impl<'a> Value<'a> for &'a [u8] { + type ArrayIter = Empty<&'a [u8]>; + type ObjectIter = Empty<(&'a str, &'a [u8])>; + #[inline] + fn as_value(&self) -> ReferenceValue<'a, Self> { + ReferenceValue::Leaf(ReferenceValueLeaf::Bytes(self)) + } +} + +impl<'a> Value<'a> for &'a &'a [u8] { + type ArrayIter = Empty<&'a &'a [u8]>; + type ObjectIter = Empty<(&'a str, &'a &'a [u8])>; + #[inline] + fn as_value(&self) -> ReferenceValue<'a, Self> { + ReferenceValue::Leaf(ReferenceValueLeaf::Bytes(self)) + } +} + +impl<'a> Value<'a> for &'a Vec { + type ArrayIter = Empty<&'a Vec>; + type ObjectIter = Empty<(&'a str, &'a Vec)>; + #[inline] + fn as_value(&self) -> ReferenceValue<'a, Self> { + 
ReferenceValue::Leaf(ReferenceValueLeaf::Bytes(self)) + } +} + +impl<'a> Value<'a> for &'a DateTime { + type ArrayIter = Empty<&'a DateTime>; + type ObjectIter = Empty<(&'a str, &'a DateTime)>; + #[inline] + fn as_value(&self) -> ReferenceValue<'a, Self> { + ReferenceValue::Leaf(ReferenceValueLeaf::Date(**self)) + } +} +impl<'a> Value<'a> for &'a Ipv6Addr { + type ArrayIter = Empty<&'a Ipv6Addr>; + type ObjectIter = Empty<(&'a str, &'a Ipv6Addr)>; + #[inline] + fn as_value(&self) -> ReferenceValue<'a, Self> { + ReferenceValue::Leaf(ReferenceValueLeaf::IpAddr(**self)) + } +} +impl<'a> Value<'a> for &'a PreTokenizedString { + type ArrayIter = Empty<&'a PreTokenizedString>; + type ObjectIter = Empty<(&'a str, &'a PreTokenizedString)>; + #[inline] + fn as_value(&self) -> ReferenceValue<'a, Self> { + ReferenceValue::Leaf(ReferenceValueLeaf::PreTokStr(Box::new((*self).clone()))) + } +} + impl ValueDeserialize for serde_json::Value { fn deserialize<'de, D>(deserializer: D) -> Result where D: ValueDeserializer<'de> { diff --git a/src/schema/document/mod.rs b/src/schema/document/mod.rs index 16c7eaddfa..91ce894c4d 100644 --- a/src/schema/document/mod.rs +++ b/src/schema/document/mod.rs @@ -172,7 +172,9 @@ pub use self::de::{ ArrayAccess, DeserializeError, DocumentDeserialize, DocumentDeserializer, ObjectAccess, ValueDeserialize, ValueDeserializer, ValueType, ValueVisitor, }; -pub use self::default_document::{DocParsingError, TantivyDocument}; +pub use self::default_document::{ + CompactDocArrayIter, CompactDocObjectIter, CompactDocValue, DocParsingError, TantivyDocument, +}; pub use self::owned_value::OwnedValue; pub(crate) use self::se::BinaryDocumentSerializer; pub use self::value::{ReferenceValue, ReferenceValueLeaf, Value}; @@ -233,7 +235,7 @@ pub trait Document: Send + Sync + 'static { let field_name = schema.get_field_name(field); let values: Vec = field_values .into_iter() - .map(|val| val.as_value().into()) + .map(|val| OwnedValue::from(val.as_value())) .collect(); 
field_map.insert(field_name.to_string(), values); } diff --git a/src/schema/document/owned_value.rs b/src/schema/document/owned_value.rs index a70eb7d1cd..9fbf1f8c26 100644 --- a/src/schema/document/owned_value.rs +++ b/src/schema/document/owned_value.rs @@ -8,6 +8,7 @@ use serde::de::{MapAccess, SeqAccess}; use time::format_description::well_known::Rfc3339; use time::OffsetDateTime; +use super::existing_type_impls::can_be_rfc3339_date_time; use super::ReferenceValueLeaf; use crate::schema::document::{ ArrayAccess, DeserializeError, ObjectAccess, ReferenceValue, Value, ValueDeserialize, @@ -375,16 +376,6 @@ impl From> for OwnedValue { } } -fn can_be_rfc3339_date_time(text: &str) -> bool { - if let Some(&first_byte) = text.as_bytes().first() { - if first_byte.is_ascii_digit() { - return true; - } - } - - false -} - impl From for OwnedValue { fn from(value: serde_json::Value) -> Self { match value { @@ -472,6 +463,7 @@ mod tests { let mut doc = TantivyDocument::default(); doc.add_bytes(bytes_field, "".as_bytes()); let json_string = doc.to_json(&schema); + assert_eq!(json_string, r#"{"my_bytes":[""]}"#); } diff --git a/src/schema/document/se.rs b/src/schema/document/se.rs index f1eed1027e..98e62da3b0 100644 --- a/src/schema/document/se.rs +++ b/src/schema/document/se.rs @@ -25,6 +25,7 @@ where W: Write /// Attempts to serialize a given document and write the output /// to the writer. 
+ #[inline] pub(crate) fn serialize_doc(&mut self, doc: &D) -> io::Result<()> where D: Document { let stored_field_values = || { @@ -679,6 +680,7 @@ mod tests { ); } + #[inline] fn serialize_doc(doc: &D, schema: &Schema) -> Vec { let mut writer = Vec::new(); diff --git a/src/schema/document/value.rs b/src/schema/document/value.rs index 81edeef3f9..a8f8839dae 100644 --- a/src/schema/document/value.rs +++ b/src/schema/document/value.rs @@ -159,6 +159,69 @@ pub enum ReferenceValueLeaf<'a> { PreTokStr(Box), } +impl From for ReferenceValueLeaf<'_> { + #[inline] + fn from(value: u64) -> Self { + ReferenceValueLeaf::U64(value) + } +} + +impl From for ReferenceValueLeaf<'_> { + #[inline] + fn from(value: i64) -> Self { + ReferenceValueLeaf::I64(value) + } +} + +impl From for ReferenceValueLeaf<'_> { + #[inline] + fn from(value: f64) -> Self { + ReferenceValueLeaf::F64(value) + } +} + +impl From for ReferenceValueLeaf<'_> { + #[inline] + fn from(value: bool) -> Self { + ReferenceValueLeaf::Bool(value) + } +} + +impl<'a> From<&'a str> for ReferenceValueLeaf<'a> { + #[inline] + fn from(value: &'a str) -> Self { + ReferenceValueLeaf::Str(value) + } +} + +impl<'a> From<&'a [u8]> for ReferenceValueLeaf<'a> { + #[inline] + fn from(value: &'a [u8]) -> Self { + ReferenceValueLeaf::Bytes(value) + } +} + +impl From for ReferenceValueLeaf<'_> { + #[inline] + fn from(value: DateTime) -> Self { + ReferenceValueLeaf::Date(value) + } +} + +impl From for ReferenceValueLeaf<'_> { + #[inline] + fn from(value: Ipv6Addr) -> Self { + ReferenceValueLeaf::IpAddr(value) + } +} + +impl From for ReferenceValueLeaf<'_> { + #[inline] + fn from(val: PreTokenizedString) -> Self { + ReferenceValueLeaf::PreTokStr(Box::new(val)) + } +} + impl<'a, T: Value<'a> + ?Sized> From> for ReferenceValue<'a, T> { #[inline] fn from(value: ReferenceValueLeaf<'a>) -> Self { diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index 828cf238e9..baa5200383 100644 --- a/src/schema/field_type.rs +++ 
b/src/schema/field_type.rs @@ -568,21 +568,21 @@ mod tests { let schema = schema_builder.build(); let doc = TantivyDocument::parse_json(&schema, r#"{"id": 100}"#).unwrap(); assert_eq!( - &OwnedValue::Str("100".to_string()), - doc.get_first(text_field).unwrap() + OwnedValue::Str("100".to_string()), + doc.get_first(text_field).unwrap().into() ); let doc = TantivyDocument::parse_json(&schema, r#"{"id": true}"#).unwrap(); assert_eq!( - &OwnedValue::Str("true".to_string()), - doc.get_first(text_field).unwrap() + OwnedValue::Str("true".to_string()), + doc.get_first(text_field).unwrap().into() ); // Not sure if this null coercion is the best approach let doc = TantivyDocument::parse_json(&schema, r#"{"id": null}"#).unwrap(); assert_eq!( - &OwnedValue::Str("null".to_string()), - doc.get_first(text_field).unwrap() + OwnedValue::Str("null".to_string()), + doc.get_first(text_field).unwrap().into() ); } @@ -595,9 +595,18 @@ mod tests { let schema = schema_builder.build(); let doc_json = r#"{"i64": "100", "u64": "100", "f64": "100"}"#; let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap(); - assert_eq!(&OwnedValue::I64(100), doc.get_first(i64_field).unwrap()); - assert_eq!(&OwnedValue::U64(100), doc.get_first(u64_field).unwrap()); - assert_eq!(&OwnedValue::F64(100.0), doc.get_first(f64_field).unwrap()); + assert_eq!( + OwnedValue::I64(100), + doc.get_first(i64_field).unwrap().into() + ); + assert_eq!( + OwnedValue::U64(100), + doc.get_first(u64_field).unwrap().into() + ); + assert_eq!( + OwnedValue::F64(100.0), + doc.get_first(f64_field).unwrap().into() + ); } #[test] @@ -607,11 +616,17 @@ mod tests { let schema = schema_builder.build(); let doc_json = r#"{"bool": "true"}"#; let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap(); - assert_eq!(&OwnedValue::Bool(true), doc.get_first(bool_field).unwrap()); + assert_eq!( + OwnedValue::Bool(true), + doc.get_first(bool_field).unwrap().into() + ); let doc_json = r#"{"bool": "false"}"#; let doc = 
TantivyDocument::parse_json(&schema, doc_json).unwrap(); - assert_eq!(&OwnedValue::Bool(false), doc.get_first(bool_field).unwrap()); + assert_eq!( + OwnedValue::Bool(false), + doc.get_first(bool_field).unwrap().into() + ); } #[test] @@ -646,7 +661,7 @@ mod tests { let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap(); let date = doc.get_first(date_field).unwrap(); // Time zone is converted to UTC - assert_eq!("Date(2019-10-12T05:20:50.52Z)", format!("{date:?}")); + assert_eq!("Leaf(Date(2019-10-12T05:20:50.52Z))", format!("{date:?}")); } #[test] diff --git a/src/schema/field_value.rs b/src/schema/field_value.rs deleted file mode 100644 index ac5851ebd6..0000000000 --- a/src/schema/field_value.rs +++ /dev/null @@ -1,46 +0,0 @@ -use crate::schema::{Field, OwnedValue}; - -/// `FieldValue` holds together a `Field` and its `Value`. -#[allow(missing_docs)] -#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] -pub struct FieldValue { - pub field: Field, - pub value: OwnedValue, -} - -impl FieldValue { - /// Constructor - pub fn new(field: Field, value: OwnedValue) -> FieldValue { - FieldValue { field, value } - } - - /// Field accessor - pub fn field(&self) -> Field { - self.field - } - - /// Value accessor - pub fn value(&self) -> &OwnedValue { - &self.value - } -} - -impl From for OwnedValue { - fn from(field_value: FieldValue) -> Self { - field_value.value - } -} - -/// A helper wrapper for creating standard iterators -/// out of the fields iterator trait. 
-pub struct FieldValueIter<'a>(pub(crate) std::slice::Iter<'a, FieldValue>); - -impl<'a> Iterator for FieldValueIter<'a> { - type Item = (Field, &'a OwnedValue); - - fn next(&mut self) -> Option { - self.0 - .next() - .map(|field_value| (field_value.field, &field_value.value)) - } -} diff --git a/src/schema/mod.rs b/src/schema/mod.rs index b4c3b037e9..3400a60acb 100644 --- a/src/schema/mod.rs +++ b/src/schema/mod.rs @@ -114,7 +114,6 @@ pub(crate) mod term; mod field_entry; mod field_type; -mod field_value; mod bytes_options; mod date_time_options; @@ -138,7 +137,6 @@ pub use self::facet_options::FacetOptions; pub use self::field::Field; pub use self::field_entry::FieldEntry; pub use self::field_type::{FieldType, Type}; -pub use self::field_value::FieldValue; pub use self::flags::{COERCE, FAST, INDEXED, STORED}; pub use self::index_record_option::IndexRecordOption; pub use self::ip_options::{IntoIpv6Addr, IpAddrOptions}; diff --git a/src/schema/schema.rs b/src/schema/schema.rs index d3215a37c3..a91fb67137 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -645,15 +645,15 @@ mod tests { let doc = TantivyDocument::convert_named_doc(&schema, NamedFieldDocument(named_doc_map)).unwrap(); assert_eq!( - doc.get_all(title).collect::>(), + doc.get_all(title).map(OwnedValue::from).collect::>(), vec![ - &OwnedValue::from("title1".to_string()), - &OwnedValue::from("title2".to_string()) + OwnedValue::from("title1".to_string()), + OwnedValue::from("title2".to_string()) ] ); assert_eq!( - doc.get_all(val).collect::>(), - vec![&OwnedValue::from(14u64), &OwnedValue::from(-1i64)] + doc.get_all(val).map(OwnedValue::from).collect::>(), + vec![OwnedValue::from(14u64), OwnedValue::from(-1i64)] ); } @@ -682,7 +682,7 @@ mod tests { let schema = schema_builder.build(); { let doc = TantivyDocument::parse_json(&schema, "{}").unwrap(); - assert!(doc.field_values().is_empty()); + assert!(doc.field_values().next().is_none()); } { let doc = TantivyDocument::parse_json( diff --git 
a/src/store/mod.rs b/src/store/mod.rs index 1cb1a1101c..e5e28be45b 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -59,7 +59,6 @@ pub mod tests { use super::*; use crate::directory::{Directory, RamDirectory, WritePtr}; use crate::fastfield::AliveBitSet; - use crate::schema::document::Value; use crate::schema::{ self, Schema, TantivyDocument, TextFieldIndexing, TextOptions, STORED, TEXT, }; @@ -92,8 +91,8 @@ pub mod tests { StoreWriter::new(writer, compressor, blocksize, separate_thread).unwrap(); for i in 0..num_docs { let mut doc = TantivyDocument::default(); - doc.add_field_value(field_body, LOREM.to_string()); - doc.add_field_value(field_title, format!("Doc {i}")); + doc.add_text(field_body, LOREM); + doc.add_text(field_title, format!("Doc {i}")); store_writer.store(&doc, &schema).unwrap(); } store_writer.close().unwrap(); @@ -119,7 +118,7 @@ pub mod tests { let store = StoreReader::open(store_file, 10)?; for i in 0..NUM_DOCS as u32 { assert_eq!( - *store + store .get::(i)? .get_first(field_title) .unwrap() @@ -131,7 +130,12 @@ pub mod tests { for doc in store.iter::(Some(&alive_bitset)) { let doc = doc?; - let title_content = doc.get_first(field_title).unwrap().as_str().unwrap(); + let title_content = doc + .get_first(field_title) + .unwrap() + .as_str() + .unwrap() + .to_string(); if !title_content.starts_with("Doc ") { panic!("unexpected title_content {title_content}"); } diff --git a/src/store/reader.rs b/src/store/reader.rs index 44f0df993e..1e4432e5f3 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -403,7 +403,6 @@ mod tests { use super::*; use crate::directory::RamDirectory; - use crate::schema::document::Value; use crate::schema::{Field, TantivyDocument}; use crate::store::tests::write_lorem_ipsum_store; use crate::store::Compressor;