Exist queries match subpath fields

quickwit-oss · Dec 11, 2024 · 03c1e84 · 03c1e84
1 parent 0f99d4f
commit 03c1e84
Show file tree

Hide file tree

Showing 6 changed files with 272 additions and 18 deletions.
diff --git a/columnar/src/columnar/reader/mod.rs b/columnar/src/columnar/reader/mod.rs
@@ -1,6 +1,7 @@
 use std::{fmt, io, mem};
 
 use common::file_slice::FileSlice;
+use common::json_path_writer::JSON_PATH_SEGMENT_SEP;
 use common::BinarySerializable;
 use sstable::{Dictionary, RangeSSTable};
 
@@ -153,15 +154,9 @@ impl ColumnarReader {
         //
         // This is in turn equivalent to searching for the range
         // `[column_name,\0`..column_name\1)`.
-        // TODO can we get some more generic `prefix(..)` logic in the dictionary.
-        let mut start_key = column_name.to_string();
-        start_key.push('\0');
-        let mut end_key = column_name.to_string();
-        end_key.push(1u8 as char);
-        self.column_dictionary
-            .range()
-            .ge(start_key.as_bytes())
-            .lt(end_key.as_bytes())
+        let mut prefix = column_name.to_string();
+        prefix.push('\0');
+        self.column_dictionary.prefix_range(prefix.as_bytes())
     }
 
     pub async fn read_columns_async(
@@ -184,6 +179,21 @@ impl ColumnarReader {
         read_all_columns_in_stream(stream, &self.column_data, self.format_version)
     }
 
+    /// Returns all columns nested under `root_path`, i.e. columns whose name starts
+    /// with `root_path` immediately followed by `JSON_PATH_SEGMENT_SEP`.
+    ///
+    /// There can be more than one column associated with each column name,
+    /// provided they have different types.
+    pub fn read_subpath_columns(&self, root_path: &str) -> io::Result<Vec<DynamicColumnHandle>> {
+        let mut prefix = root_path.to_string();
+        prefix.push(JSON_PATH_SEGMENT_SEP as char);
+        let stream = self
+            .column_dictionary
+            .prefix_range(prefix.as_bytes())
+            .into_stream()?;
+        read_all_columns_in_stream(stream, &self.column_data, self.format_version)
+    }
+
     /// Return the number of columns in the columnar.
     pub fn num_columns(&self) -> usize {
         self.column_dictionary.num_terms()
@@ -192,6 +202,8 @@ impl ColumnarReader {
 
 #[cfg(test)]
 mod tests {
+    use common::json_path_writer::JSON_PATH_SEGMENT_SEP;
+
     use crate::{ColumnType, ColumnarReader, ColumnarWriter};
 
     #[test]
@@ -224,6 +236,64 @@ mod tests {
         assert_eq!(columns[0].1.column_type(), ColumnType::U64);
     }
 
+    #[test]
+    fn test_read_columns() {
+        let mut columnar_writer = ColumnarWriter::default();
+        columnar_writer.record_column_type("col", ColumnType::U64, false);
+        columnar_writer.record_numerical(1, "col", 1u64);
+        let mut buffer = Vec::new();
+        columnar_writer.serialize(2, &mut buffer).unwrap();
+        let columnar = ColumnarReader::open(buffer).unwrap();
+        {
+            let columns = columnar.read_columns("col").unwrap(); // exact column name
+            assert_eq!(columns.len(), 1);
+            assert_eq!(columns[0].column_type(), ColumnType::U64);
+        }
+        {
+            let columns = columnar.read_columns("other").unwrap(); // name never recorded
+            assert_eq!(columns.len(), 0);
+        }
+    }
+
+    #[test]
+    fn test_read_subpath_columns() {
+        let mut columnar_writer = ColumnarWriter::default();
+        columnar_writer.record_str(
+            0,
+            &format!("col1{}subcol1", JSON_PATH_SEGMENT_SEP as char),
+            "hello",
+        );
+        columnar_writer.record_numerical(
+            0,
+            &format!("col1{}subcol2", JSON_PATH_SEGMENT_SEP as char),
+            1i64,
+        );
+        columnar_writer.record_str(1, "col1", "hello"); // the root itself must not be returned
+        columnar_writer.record_str(0, "col2", "hello"); // a column without any subpath
+        let mut buffer = Vec::new();
+        columnar_writer.serialize(2, &mut buffer).unwrap();
+
+        let columnar = ColumnarReader::open(buffer).unwrap();
+        {
+            let columns = columnar.read_subpath_columns("col1").unwrap();
+            assert_eq!(columns.len(), 2); // subcol1 and subcol2 only, not `col1`
+            assert_eq!(columns[0].column_type(), ColumnType::Str);
+            assert_eq!(columns[1].column_type(), ColumnType::I64);
+        }
+        {
+            let columns = columnar.read_subpath_columns("col1.subcol1").unwrap();
+            assert_eq!(columns.len(), 0); // no column is nested under this path
+        }
+        {
+            let columns = columnar.read_subpath_columns("col2").unwrap();
+            assert_eq!(columns.len(), 0); // `col2` exists but has no subpath columns
+        }
+        {
+            let columns = columnar.read_subpath_columns("other").unwrap();
+            assert_eq!(columns.len(), 0); // unknown root
+        }
+    }
+
     #[test]
     #[should_panic(expected = "Input type forbidden")]
     fn test_list_columns_strict_typing_panics_on_wrong_types() {

diff --git a/columnar/src/columnar/writer/mod.rs b/columnar/src/columnar/writer/mod.rs
@@ -285,7 +285,6 @@ impl ColumnarWriter {
                 .map(|(column_name, addr)| (column_name, ColumnType::DateTime, addr)),
         );
         columns.sort_unstable_by_key(|(column_name, col_type, _)| (*column_name, *col_type));
-
         let (arena, buffers, dictionaries) = (&self.arena, &mut self.buffers, &self.dictionaries);
         let mut symbol_byte_buffer: Vec<u8> = Vec::new();
         for (column_name, column_type, addr) in columns {

diff --git a/src/fastfield/readers.rs b/src/fastfield/readers.rs
@@ -217,7 +217,7 @@ impl FastFieldReaders {
         Ok(dynamic_column.into())
     }
 
-    /// Returning a `dynamic_column_handle`.
+    /// Returns a `dynamic_column_handle`.
     pub fn dynamic_column_handle(
         &self,
         field_name: &str,
@@ -234,7 +234,7 @@ impl FastFieldReaders {
         Ok(dynamic_column_handle_opt)
     }
 
-    /// Returning all `dynamic_column_handle`.
+    /// Returns all `dynamic_column_handle` that match the given field name.
     pub fn dynamic_column_handles(
         &self,
         field_name: &str,
@@ -250,6 +250,22 @@ impl FastFieldReaders {
         Ok(dynamic_column_handles)
     }
 
+    /// Returns the handles of all columns nested under `root_path` (none if it does not resolve).
+    pub fn dynamic_subpath_column_handles(
+        &self,
+        root_path: &str,
+    ) -> crate::Result<Vec<DynamicColumnHandle>> {
+        let Some(resolved_field_name) = self.resolve_field(root_path)? else {
+            return Ok(Vec::new());
+        };
+        let dynamic_column_handles = self
+            .columnar
+            .read_subpath_columns(&resolved_field_name)?
+            .into_iter()
+            .collect(); // NOTE(review): redundant if the call above already yields a Vec — confirm
+        Ok(dynamic_column_handles)
+    }
+
     #[doc(hidden)]
     pub async fn list_dynamic_column_handles(
         &self,

diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs
@@ -422,6 +422,7 @@ mod tests {
     use std::collections::BTreeMap;
     use std::path::{Path, PathBuf};
 
+    use columnar::ColumnType;
     use tempfile::TempDir;
 
     use crate::collector::{Count, TopDocs};
@@ -431,15 +432,15 @@ mod tests {
     use crate::query::{PhraseQuery, QueryParser};
     use crate::schema::{
         Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, Value,
-        DATE_TIME_PRECISION_INDEXED, STORED, STRING, TEXT,
+        DATE_TIME_PRECISION_INDEXED, FAST, STORED, STRING, TEXT,
     };
     use crate::store::{Compressor, StoreReader, StoreWriter};
     use crate::time::format_description::well_known::Rfc3339;
     use crate::time::OffsetDateTime;
     use crate::tokenizer::{PreTokenizedString, Token};
     use crate::{
-        DateTime, Directory, DocAddress, DocSet, Index, IndexWriter, TantivyDocument, Term,
-        TERMINATED,
+        DateTime, Directory, DocAddress, DocSet, Index, IndexWriter, SegmentReader,
+        TantivyDocument, Term, TERMINATED,
     };
 
     #[test]
@@ -841,6 +842,75 @@ mod tests {
         assert_eq!(searcher.search(&phrase_query, &Count).unwrap(), 0);
     }
 
+    #[test]
+    fn test_json_fast() {
+        let mut schema_builder = Schema::builder();
+        let json_field = schema_builder.add_json_field("json", FAST);
+        let schema = schema_builder.build();
+        let json_val: serde_json::Value = serde_json::from_str(
+            r#"{
+            "toto": "titi",
+            "float": -0.2,
+            "bool": true,
+            "unsigned": 1,
+            "signed": -2,
+            "complexobject": {
+                "field.with.dot": 1
+            },
+            "date": "1985-04-12T23:20:50.52Z",
+            "my_arr": [2, 3, {"my_key": "two tokens"}, 4]
+        }"#,
+        )
+        .unwrap();
+        let doc = doc!(json_field=>json_val.clone());
+        let index = Index::create_in_ram(schema.clone());
+        let mut writer = index.writer_for_tests().unwrap();
+        writer.add_document(doc).unwrap();
+        writer.commit().unwrap();
+        let reader = index.reader().unwrap();
+        let searcher = reader.searcher();
+        let segment_reader = searcher.segment_reader(0u32);
+
+        fn assert_type(reader: &SegmentReader, field: &str, typ: ColumnType) {
+            let cols = reader.fast_fields().dynamic_column_handles(field).unwrap();
+            assert_eq!(cols.len(), 1, "{}", field);
+            assert_eq!(cols[0].column_type(), typ, "{}", field);
+        }
+        assert_type(segment_reader, "json.toto", ColumnType::Str);
+        assert_type(segment_reader, "json.float", ColumnType::F64);
+        assert_type(segment_reader, "json.bool", ColumnType::Bool);
+        assert_type(segment_reader, "json.unsigned", ColumnType::I64); // `1` ends up as I64
+        assert_type(segment_reader, "json.signed", ColumnType::I64);
+        assert_type(
+            segment_reader,
+            "json.complexobject.field\\.with\\.dot", // dots inside a key are backslash-escaped
+            ColumnType::I64,
+        );
+        assert_type(segment_reader, "json.date", ColumnType::DateTime);
+        assert_type(segment_reader, "json.my_arr", ColumnType::I64); // numbers held by the array
+        assert_type(segment_reader, "json.my_arr.my_key", ColumnType::Str);
+
+        fn assert_empty(reader: &SegmentReader, field: &str) {
+            let cols = reader.fast_fields().dynamic_column_handles(field).unwrap();
+            assert_eq!(cols.len(), 0);
+        }
+        assert_empty(segment_reader, "unknown");
+        assert_empty(segment_reader, "json"); // exact-name lookup does not return nested columns
+        assert_empty(segment_reader, "json.toto.titi");
+
+        let sub_columns = segment_reader
+            .fast_fields()
+            .dynamic_subpath_column_handles("json")
+            .unwrap();
+        assert_eq!(sub_columns.len(), 9); // one column per path checked with `assert_type`
+
+        let subsub_columns = segment_reader
+            .fast_fields()
+            .dynamic_subpath_column_handles("json.complexobject")
+            .unwrap();
+        assert_eq!(subsub_columns.len(), 1); // only `field.with.dot`
+    }
+
     #[test]
     fn test_json_term_with_numeric_merge_panic_regression_bug_2283() {
         // https://github.com/quickwit-oss/tantivy/issues/2283

diff --git a/src/query/exist_query.rs b/src/query/exist_query.rs
@@ -7,9 +7,23 @@ use crate::docset::{DocSet, TERMINATED};
 use crate::index::SegmentReader;
 use crate::query::explanation::does_not_match;
 use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight};
+use crate::schema::Type;
 use crate::{DocId, Score, TantivyError};
 
-/// Query that matches all documents with a non-null value in the specified field.
+/// Query that matches all documents with a non-null value in the specified
+/// field.
+///
+/// If the field path is the root of a JSON field or lies inside one, all subpaths are also
+/// checked and the document is matched if a non-null value exists in any subpath.
+/// For example, assuming the following document where `myfield` is a JSON fast field:
+/// ```json
+/// {
+///   "myfield": {
+///     "mysubfield": "hello"
+///   }
+/// }
+/// ```
+/// "exists" queries on either `myfield` or `myfield.mysubfield` will match the document.
 ///
 /// All of the matched documents get the score 1.0.
 #[derive(Clone, Debug)]
@@ -43,20 +57,27 @@ impl Query for ExistsQuery {
         }
         Ok(Box::new(ExistsWeight {
             field_name: self.field_name.clone(),
+            field_type: field_type.value_type(),
         }))
     }
 }
 
 /// Weight associated with the `ExistsQuery` query.
 pub struct ExistsWeight {
     field_name: String,
+    field_type: Type, // `scorer` also reads subpath columns when this is `Type::Json`
 }
 
 impl Weight for ExistsWeight {
     fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
         let fast_field_reader = reader.fast_fields();
-        let dynamic_columns: crate::Result<Vec<DynamicColumn>> = fast_field_reader
-            .dynamic_column_handles(&self.field_name)?
+        let mut column_handles = fast_field_reader.dynamic_column_handles(&self.field_name)?;
+        if self.field_type == Type::Json {
+            let mut sub_columns =
+                fast_field_reader.dynamic_subpath_column_handles(&self.field_name)?;
+            column_handles.append(&mut sub_columns);
+        }
+        let dynamic_columns: crate::Result<Vec<DynamicColumn>> = column_handles
             .into_iter()
             .map(|handle| handle.open().map_err(|io_error| io_error.into()))
             .collect();
@@ -233,6 +254,7 @@ mod tests {
         assert_eq!(count_existing_fields(&searcher, "json.all")?, 100);
         assert_eq!(count_existing_fields(&searcher, "json.even")?, 50);
         assert_eq!(count_existing_fields(&searcher, "json.odd")?, 50);
+        assert_eq!(count_existing_fields(&searcher, "json")?, 100);
 
         // Handling of non-existing fields:
         assert_eq!(count_existing_fields(&searcher, "json.absent")?, 0);