Skip to content

Commit

Permalink
store document length as fast field (#4861)
Browse files Browse the repository at this point in the history
* add document_length to docmapper config

* add _doc_length to documents when required by docmapper config
  • Loading branch information
trinity-1686a authored Apr 15, 2024
1 parent 144074d commit 7082685
Show file tree
Hide file tree
Showing 22 changed files with 100 additions and 9 deletions.
5 changes: 5 additions & 0 deletions quickwit/quickwit-config/src/index_config/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,9 @@ pub struct DocMapping {
pub max_num_partitions: NonZeroU32,
#[serde(default)]
pub tokenizers: Vec<TokenizerEntry>,
/// If true, record the length of each source document in the `_doc_length` fast field.
#[serde(default)]
pub document_length: bool,
}

#[derive(Clone, Debug, Serialize, Deserialize, utoipa::ToSchema)]
Expand Down Expand Up @@ -454,6 +457,7 @@ impl TestableForRegression for IndexConfig {
max_num_partitions: NonZeroU32::new(100).unwrap(),
timestamp_field: Some("timestamp".to_string()),
tokenizers: vec![tokenizer],
document_length: false,
};
let retention_policy = Some(RetentionPolicy {
retention_period: "90 days".to_string(),
Expand Down Expand Up @@ -531,6 +535,7 @@ pub fn build_doc_mapper(
partition_key: doc_mapping.partition_key.clone(),
max_num_partitions: doc_mapping.max_num_partitions,
tokenizers: doc_mapping.tokenizers.clone(),
document_length: doc_mapping.document_length,
};
Ok(Arc::new(builder.try_build()?))
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ use crate::query_builder::build_query;
use crate::routing_expression::RoutingExpr;
use crate::{
Cardinality, DocMapper, DocParsingError, Mode, QueryParserError, TokenizerEntry, WarmupInfo,
DYNAMIC_FIELD_NAME, FIELD_PRESENCE_FIELD_NAME, SOURCE_FIELD_NAME,
DOCUMENT_LEN_FIELD_NAME, DYNAMIC_FIELD_NAME, FIELD_PRESENCE_FIELD_NAME, SOURCE_FIELD_NAME,
};

const FIELD_PRESENCE_FIELD: Field = Field::from_field_id(0u32);
Expand All @@ -69,6 +69,8 @@ pub struct DefaultDocMapper {
/// This field is only valid when using the schema associated with the default
/// doc mapper, and therefore cannot be used in the `query` method.
dynamic_field: Option<Field>,
/// Field in which the len of the source document is stored as a fast field.
document_len_field: Option<Field>,
/// Default list of field names used for search.
default_search_field_names: Vec<String>,
/// Timestamp field name.
Expand Down Expand Up @@ -158,6 +160,13 @@ impl TryFrom<DefaultDocMapperBuilder> for DefaultDocMapper {
None
};

let document_len_field = if builder.document_length {
let document_len_field_options = tantivy::schema::NumericOptions::default().set_fast();
Some(schema_builder.add_u64_field(DOCUMENT_LEN_FIELD_NAME, document_len_field_options))
} else {
None
};

// Adding regular fields.
let MappingNodeRoot {
field_mappings,
Expand Down Expand Up @@ -226,7 +235,6 @@ impl TryFrom<DefaultDocMapperBuilder> for DefaultDocMapper {
default_search_field_name
)
}
let dynamic_field = schema.get_field(DYNAMIC_FIELD_NAME).ok();
let (default_search_field, _json_path) = schema
.find_field_with_default(default_search_field_name, dynamic_field)
.with_context(|| {
Expand Down Expand Up @@ -262,6 +270,7 @@ impl TryFrom<DefaultDocMapperBuilder> for DefaultDocMapper {
index_field_presence: builder.index_field_presence,
source_field,
dynamic_field,
document_len_field,
default_search_field_names,
timestamp_field_name: builder.timestamp_field,
field_mappings,
Expand Down Expand Up @@ -378,6 +387,7 @@ impl From<DefaultDocMapper> for DefaultDocMapperBuilder {
partition_key: partition_key_opt,
max_num_partitions: default_doc_mapper.max_num_partitions,
tokenizers: default_doc_mapper.tokenizer_entries,
// Carry the setting over from the built mapper: `document_len_field` is only `Some`
// when `document_length` was enabled on the builder, so hard-coding `false` here
// would silently drop the option when converting back to a builder.
document_length: default_doc_mapper.document_len_field.is_some(),
}
}
}
Expand Down Expand Up @@ -561,6 +571,7 @@ impl DocMapper for DefaultDocMapper {
fn doc_from_json_obj(
&self,
json_obj: JsonObject,
document_len: u64,
) -> Result<(Partition, Document), DocParsingError> {
let partition: Partition = self.partition_key.eval_hash(&json_obj);

Expand Down Expand Up @@ -613,6 +624,10 @@ impl DocMapper for DefaultDocMapper {
}
}

if let Some(document_len_field) = self.document_len_field {
document.add_u64(document_len_field, document_len);
}

// The capacity is inexact here.

if self.index_field_presence {
Expand Down Expand Up @@ -729,8 +744,8 @@ mod tests {
use crate::default_doc_mapper::field_mapping_entry::DEFAULT_TOKENIZER_NAME;
use crate::default_doc_mapper::mapping_tree::value_to_pretokenized;
use crate::{
DefaultDocMapperBuilder, DocMapper, DocParsingError, DYNAMIC_FIELD_NAME,
FIELD_PRESENCE_FIELD_NAME, SOURCE_FIELD_NAME,
DefaultDocMapperBuilder, DocMapper, DocParsingError, DOCUMENT_LEN_FIELD_NAME,
DYNAMIC_FIELD_NAME, FIELD_PRESENCE_FIELD_NAME, SOURCE_FIELD_NAME,
};

fn example_json_doc_value() -> JsonValue {
Expand Down Expand Up @@ -786,7 +801,7 @@ mod tests {
let json_doc = example_json_doc_value();
let doc_mapper = crate::default_doc_mapper_for_test();
let (_, document) = doc_mapper
.doc_from_json_obj(json_doc.as_object().unwrap().clone())
.doc_from_json_obj(json_doc.as_object().unwrap().clone(), 0)
.unwrap();
let schema = doc_mapper.schema();
// 9 property entry + 1 field "_source" + 2 fields values for "tags" field
Expand Down Expand Up @@ -1265,7 +1280,7 @@ mod tests {
"image": "YWJj"
});
let (_, document) = doc_mapper
.doc_from_json_obj(json_doc_value.as_object().unwrap().clone())
.doc_from_json_obj(json_doc_value.as_object().unwrap().clone(), 0)
.unwrap();

// 2 properties, + 1 value for "_source" + 2 for field presence.
Expand Down Expand Up @@ -2014,6 +2029,20 @@ mod tests {
);
}

/// Enabling `document_length` in the doc mapper config must record the byte
/// length of the raw JSON document under `DOCUMENT_LEN_FIELD_NAME`.
#[test]
fn test_length_field() {
    let doc_mapper_config = r#"{
"document_length": true,
"mode": "dynamic"
}"#;
    let raw_doc = r#"{ "some_obj": { "json_obj": {"hello": 2} } }"#;
    // The expected value is the length in bytes of the document as it was submitted.
    let expected_len = raw_doc.len() as u64;
    test_doc_from_json_test_aux(
        doc_mapper_config,
        DOCUMENT_LEN_FIELD_NAME,
        raw_doc,
        vec![expected_len.into()],
    );
}

fn default_doc_mapper_query_aux(
doc_mapper: &dyn DocMapper,
query: &str,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@ pub struct DefaultDocMapperBuilder {
/// User-defined tokenizers.
#[serde(default)]
pub tokenizers: Vec<TokenizerEntry>,
/// If true, record the length of each source document in the `_doc_length` fast field.
#[serde(default)]
pub document_length: bool,
}

/// Defines how an unmapped field should be handled.
Expand Down
5 changes: 3 additions & 2 deletions quickwit/quickwit-doc-mapper/src/doc_mapper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ pub trait DocMapper: Send + Sync + Debug + DynClone + 'static {
fn doc_from_json_obj(
&self,
json_obj: JsonObject,
document_len: u64,
) -> Result<(Partition, Document), DocParsingError>;

/// Parses a JSON byte slice into a tantivy [`Document`].
Expand All @@ -65,7 +66,7 @@ pub trait DocMapper: Send + Sync + Debug + DynClone + 'static {
.unwrap_or_else(|_| "document contains some invalid UTF-8 characters".to_string());
DocParsingError::NotJsonObject(json_doc_sample)
})?;
self.doc_from_json_obj(json_obj)
self.doc_from_json_obj(json_obj, json_doc.len() as u64)
}

/// Parses a JSON string into a tantivy [`Document`].
Expand All @@ -74,7 +75,7 @@ pub trait DocMapper: Send + Sync + Debug + DynClone + 'static {
let json_doc_sample: String = json_doc.chars().take(20).chain("...".chars()).collect();
DocParsingError::NotJsonObject(json_doc_sample)
})?;
self.doc_from_json_obj(json_obj)
self.doc_from_json_obj(json_obj, json_doc.len() as u64)
}

/// Converts a tantivy named Document to the json format.
Expand Down
4 changes: 4 additions & 0 deletions quickwit/quickwit-doc-mapper/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,15 @@ pub const SOURCE_FIELD_NAME: &str = "_source";
/// Field name reserved for storing the dynamically indexed fields.
pub const DYNAMIC_FIELD_NAME: &str = "_dynamic";

/// Field name reserved for storing the length of the source document.
///
/// The value is a `u64` stored as a fast field, and the field only exists when
/// `document_length` is enabled in the doc mapper configuration. The doc mapper's
/// JSON parsing helpers pass the byte length of the raw JSON input as the value.
pub const DOCUMENT_LEN_FIELD_NAME: &str = "_doc_length";

/// Quickwit reserved field names.
///
/// Lists the names of the internal fields: the source field, the dynamic field,
/// the field presence field and the document length field.
// NOTE(review): presumably user-defined field mappings using one of these names are
// rejected — confirm at the usage site, which is not visible here.
const QW_RESERVED_FIELD_NAMES: &[&str] = &[
SOURCE_FIELD_NAME,
DYNAMIC_FIELD_NAME,
FIELD_PRESENCE_FIELD_NAME,
DOCUMENT_LEN_FIELD_NAME,
];

/// Cardinality of a field.
Expand Down
4 changes: 3 additions & 1 deletion quickwit/quickwit-indexing/src/actors/doc_processor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -461,7 +461,9 @@ impl DocProcessor {
fn process_json_doc(&self, json_doc: JsonDoc) -> Result<ProcessedDoc, DocProcessorError> {
let num_bytes = json_doc.num_bytes;

let (partition, doc) = self.doc_mapper.doc_from_json_obj(json_doc.json_obj)?;
let (partition, doc) = self
.doc_mapper
.doc_from_json_obj(json_doc.json_obj, json_doc.num_bytes as u64)?;
let timestamp_opt = self.extract_timestamp(&doc)?;
Ok(ProcessedDoc {
doc,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
"create_timestamp": 1789,
"index_config": {
"doc_mapping": {
"document_length": false,
"dynamic_mapping": {
"expand_dots": true,
"fast": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
"create_timestamp": 1789,
"index_config": {
"doc_mapping": {
"document_length": false,
"dynamic_mapping": {
"expand_dots": true,
"fast": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
"create_timestamp": 1789,
"index_config": {
"doc_mapping": {
"document_length": false,
"dynamic_mapping": {
"expand_dots": true,
"fast": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
"create_timestamp": 1789,
"index_config": {
"doc_mapping": {
"document_length": false,
"dynamic_mapping": {
"expand_dots": true,
"fast": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"create_timestamp": 1789,
"index_config": {
"doc_mapping": {
"document_length": false,
"dynamic_mapping": {
"expand_dots": true,
"fast": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"create_timestamp": 1789,
"index_config": {
"doc_mapping": {
"document_length": false,
"dynamic_mapping": {
"expand_dots": true,
"fast": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"create_timestamp": 1789,
"index_config": {
"doc_mapping": {
"document_length": false,
"dynamic_mapping": {
"expand_dots": true,
"fast": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"create_timestamp": 1789,
"index_config": {
"doc_mapping": {
"document_length": false,
"dynamic_mapping": {
"expand_dots": true,
"fast": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"create_timestamp": 1789,
"index_config": {
"doc_mapping": {
"document_length": false,
"dynamic_mapping": {
"expand_dots": true,
"fast": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"create_timestamp": 1789,
"index_config": {
"doc_mapping": {
"document_length": false,
"dynamic_mapping": {
"expand_dots": true,
"fast": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
{
"description": "Test description.",
"doc_mapping": {
"document_length": false,
"dynamic_mapping": {
"expand_dots": true,
"fast": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
{
"description": "Test description.",
"doc_mapping": {
"document_length": false,
"dynamic_mapping": {
"expand_dots": true,
"fast": {
Expand Down
1 change: 1 addition & 0 deletions quickwit/quickwit-metastore/test-data/manifest/v0.8.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
{
"description": "Test description.",
"doc_mapping": {
"document_length": false,
"dynamic_mapping": {
"expand_dots": true,
"fast": {
Expand Down
1 change: 1 addition & 0 deletions quickwit/quickwit-search/src/collector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1488,6 +1488,7 @@ mod tests {
// Stub: both arguments are ignored and calling this method panics via
// `unimplemented!()`.
fn doc_from_json_obj(
&self,
_json_obj: quickwit_doc_mapper::JsonObject,
_doc_len: u64,
) -> Result<(u64, TantivyDocument), quickwit_doc_mapper::DocParsingError> {
unimplemented!()
}
Expand Down
32 changes: 32 additions & 0 deletions quickwit/rest-api-tests/scenarii/aggregations/0002-doc-len.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Test summing doc len
method: [GET]
engines:
- quickwit
endpoint: _elastic/aggregations/_search
json:
query: { match_all: {} }
aggs:
doc_len:
sum:
field: "_doc_length"
expected:
aggregations:
doc_len:
value: 913.0
---
# Test doc len isn't shown when querying documents
method: [GET]
engines:
- quickwit
endpoint: _elastic/aggregations/_search
json:
query:
term:
id:
value: 1
expected:
hits:
hits:
- _source:
$expect: "not '_doc_length' in val"
---
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ json:
dynamic_mapping:
tokenizer: default
fast: true
document_length: true
field_mappings:
- name: date
type: datetime
Expand Down

0 comments on commit 7082685

Please sign in to comment.