Add wildcard docs (#5632)

Also improve tests
quickwit-oss · Jan 14, 2025 · dee631b · dee631b
1 parent d8e98b7
commit dee631b
Show file tree

Hide file tree

Showing 3 changed files with 47 additions and 9 deletions.
diff --git a/docs/reference/query-language.md b/docs/reference/query-language.md
@@ -75,14 +75,21 @@ Matches documents if the targeted field contains a token equal to the provided t
 
 `field:value` will match any document where the field 'field' has a token 'value'.
 
-### Term Prefix `field:prefix*`
+### Wildcard `field:wil?car*d`
 ```
-term_prefix = term '*'
+wildcard = [term_char\*\?]+
 ```
 
-Matches documents if the targeted field contains a token which starts with the provided value.
+Matches documents if the targeted field contains a token that matches the wildcard:
+- `?` replaces one and only one term character
+- `*` replaces any number of term characters or an empty string
+
+Examples:
+- `field:quick*` will match any document where the field 'field' has a token like `quickwit` or `quickstart`, but not `qui` or `abcd`.
+- `field:h?llo` will match any document where the field 'field' has a token like `hello` or `hallo`, but not `heillo` or `hllo`.
+
+Queries with prefixes (`field:qui*`) are much more efficient than queries starting with a wildcard (`field:*wit`).
 
-`field:quick*` will match any document where the field 'field' has a token like `quickwit` or `quickstart`, but not `qui` or `abcd`.
 
 ### Term set `field:IN [a b c]`
 ```
@@ -110,9 +117,16 @@ slop = '~' [01-9]+
 
 ```
 
-Matches if the field contains the sequence of token provided. `field:"looks good to me"` will match any document containing that sequence of tokens.
+Matches if the field contains the sequence of tokens provided:
+- `field:"looks good to me"` will match any document containing that sequence of tokens.
+- `field:"look* good to me"` with the default tokenizer is equivalent to `field:"look good to me"`, i.e. the '*' character is pruned by the tokenizer and not interpreted as a wildcard.
+
+:::info
+
 The field must have been configured with `record: position` when indexing.
 
+:::
+
 ###### Slop operator
 It is also possible to add a slop, which allows matching a sequence with some distance. For instance `"looks to me"~1` will match "looks good to me", but not "looks very good to me".
 Transposition costs 2, e.g. `"A B"~1` will not match `"B A"` but it would with `"A B"~2`.

diff --git a/quickwit/quickwit-doc-mapper/src/query_builder.rs b/quickwit/quickwit-doc-mapper/src/query_builder.rs
@@ -593,6 +593,17 @@ mod test {
     #[test]
     fn test_wildcard_query() {
         check_build_query_static_mode("title:hello*", Vec::new(), TestExpectation::Ok("Regex"));
+        check_build_query_static_mode(
+            "title:\"hello world\"*",
+            Vec::new(),
+            TestExpectation::Ok("PhrasePrefixQuery"),
+        );
+        // the tokenizer removes '*' chars, making it a simple PhraseQuery (not RegexPhraseQuery)
+        check_build_query_static_mode(
+            "title:\"hello* world*\"",
+            Vec::new(),
+            TestExpectation::Ok("PhraseQuery"),
+        );
         check_build_query_static_mode(
             "foo:bar*",
             Vec::new(),

diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/0005-query_string_query.yaml
@@ -152,7 +152,7 @@ json:
       query: "to AND the"
 status_code: 400
 ---
-# trailing wildcard
+# wildcard
 json:
   query:
     query_string:
@@ -164,7 +164,19 @@ expected:
     total:
       value: 2
 ---
-# trailing wildcard
+# wildcard
+json:
+  query:
+    query_string:
+      default_field: payload.description
+      lenient: true
+      query: "Jour?al AND unix"
+expected:
+  hits:
+    total:
+      value: 2
+---
+# wildcard
 json:
   query:
     query_string:
@@ -188,13 +200,14 @@ expected:
     total:
       value: 3
 ---
-# trailing wildcard
+# escaped wildcard
 json:
   query:
     query_string:
       default_field: payload.description
       lenient: true
-      query: "jour\\*"
+      # ? char removed by tokenizer
+      query: "jour\\?"
 expected:
   hits:
     total: