From a2ebcde156cea8eb7999b53c95394297d2b629e7 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 12 Mar 2024 12:42:35 +0800 Subject: [PATCH] handle all type conversion cases, add tests, improve python test lib --- quickwit/quickwit-search/src/collector.rs | 142 ++++++-- quickwit/rest-api-tests/run_tests.py | 15 +- .../es_compatibility/0018-search_after.yaml | 316 ++++++++++++++++++ .../scenarii/es_compatibility/0020-stats.yaml | 6 +- .../es_compatibility/0021-cat-indices.yaml | 2 + .../es_compatibility/_setup.quickwit.yaml | 74 ++++ .../es_compatibility/_teardown.quickwit.yaml | 5 + 7 files changed, 530 insertions(+), 30 deletions(-) diff --git a/quickwit/quickwit-search/src/collector.rs b/quickwit/quickwit-search/src/collector.rs index 2204bc2a1c5..f8b7242110c 100644 --- a/quickwit/quickwit-search/src/collector.rs +++ b/quickwit/quickwit-search/src/collector.rs @@ -128,7 +128,7 @@ impl SortByComponent { } } -#[derive(Copy, Clone, Debug)] +#[derive(Copy, Clone, Debug, Eq, PartialEq)] enum SortFieldType { U64, I64, @@ -158,6 +158,7 @@ impl SortingFieldExtractorComponent { #[inline] fn extract_typed_sort_value_opt(&self, doc_id: DocId, score: Score) -> Option { match self { + // Tie breaks are not handled here, but in SegmentPartialHit SortingFieldExtractorComponent::DocId => None, SortingFieldExtractorComponent::FastField { sort_column, .. } => { sort_column.first(doc_id) @@ -169,6 +170,9 @@ impl SortingFieldExtractorComponent { #[inline] /// Converts u64 fast field values to its correct type. /// The conversion is delayed for performance reasons. + /// + /// This is used to convert `search_after` sort value to a u64 representation that will respect + /// the same order as the `SortValue` representation. fn convert_u64_ff_val_to_sort_value(&self, sort_value: u64) -> SortValue { let map_fast_field_to_value = |fast_field_value, field_type| match field_type { SortFieldType::U64 => SortValue::U64(fast_field_value), @@ -188,12 +192,12 @@ impl SortingFieldExtractorComponent { /// Converts fast field values into their u64 fast field representation. /// /// Returns None if value is out of bounds of target value. - /// None means that the search_after will be disabled as everything matches. + /// None means that the search_after will be disabled and everything matches. /// /// What's currently missing is to signal that _nothing_ matches to generate an optimized - /// query. + /// query. For now we just choose the max value of the target type. #[inline] - fn convert_to_u64_ff_val(&self, sort_value: SortValue) -> Option { + fn convert_to_u64_ff_val(&self, sort_value: SortValue, sort_order: SortOrder) -> Option { match self { SortingFieldExtractorComponent::DocId => match sort_value { SortValue::U64(val) => Some(val), @@ -206,6 +210,37 @@ impl SortingFieldExtractorComponent { // representation of the fast field. // This requires this weird conversion of first casting into the target type // (if possible) and then to its u64 presentation. + // + // For the conversion into the target type it's important to know if the target + // type does not cover the whole range of the source type. In that case we need to + // add additional conversion checks, to see if it matches everything + // or nothing. (Which also depends on the sort order). + // Below are the visual representations of the value ranges of the different types. + // Note: DateTime is equal to I64 and omitted. + // + // Bool value range (0, 1): + // <-> + // + // I64 value range (signed 64-bit integer): + // <------------------------------------> + // -2^63 2^63-1 + // U64 value range (unsigned 64-bit integer): + // <------------------------------------> + // 0 2^64-1 + // F64 value range (64-bit floating point, conceptual, not to scale): + // <--------------------------------------------------------------------> + // Very negative numbers Very positive numbers + // + // Those conversions have limited target type value space: + // - [X] U64 -> I64 + // - [X] F64 -> I64 + // - [X] I64 -> U64 + // - [X] F64 -> U64 + // + // - [X] F64 -> Bool + // - [X] I64 -> Bool + // - [X] U64 -> Bool + // let val = match (sort_value, sort_field_type) { // Same field type, no conversion needed. (SortValue::U64(val), SortFieldType::U64) => val, @@ -213,49 +248,101 @@ impl SortingFieldExtractorComponent { (SortValue::Boolean(val), SortFieldType::Bool) => val.to_u64(), (SortValue::I64(val), SortFieldType::I64) => val.to_u64(), (SortValue::U64(mut val), SortFieldType::I64) => { + if sort_order == SortOrder::Desc && val > i64::MAX as u64 { + return None; + } // Add a limit to avoid overflow. val = val.min(i64::MAX as u64); (val as i64).to_u64() } (SortValue::U64(val), SortFieldType::F64) => (val as f64).to_u64(), (SortValue::U64(mut val), SortFieldType::DateTime) => { + // Match everything + if sort_order == SortOrder::Desc && val > i64::MAX as u64 { + return None; + } // Add a limit to avoid overflow. val = val.min(i64::MAX as u64); DateTime::from_timestamp_nanos(val as i64).to_u64() } - // questionable number into boolean conversions - (SortValue::U64(val), SortFieldType::Bool) => (val != 0).to_u64(), - (SortValue::I64(val), SortFieldType::Bool) => (val != 0).to_u64(), - (SortValue::F64(val), SortFieldType::Bool) => (val != 0.0).to_u64(), (SortValue::I64(val), SortFieldType::U64) => { - if val < 0 { + if val < 0 && sort_order == SortOrder::Asc { return None; } - val as u64 + if val < 0 && sort_order == SortOrder::Desc { + u64::MIN // matches nothing as search_after is not inclusive + } else { + val as u64 + } } (SortValue::I64(val), SortFieldType::F64) => (val as f64).to_u64(), (SortValue::I64(val), SortFieldType::DateTime) => { DateTime::from_timestamp_nanos(val).to_u64() } - (SortValue::F64(val), SortFieldType::DateTime) => { - DateTime::from_timestamp_nanos(val as i64).to_u64() - } (SortValue::F64(val), SortFieldType::U64) => { - if val < 0.0 { + let all_values_ahead1 = + val < u64::MIN as f64 && sort_order == SortOrder::Asc; + let all_values_ahead2 = + val > u64::MAX as f64 && sort_order == SortOrder::Desc; + if all_values_ahead1 || all_values_ahead2 { return None; } - (val.floor() as u64).to_u64() + // f64 cast already handles under/overflow and clamps the value + (val as u64).to_u64() } - (SortValue::F64(val), SortFieldType::I64) => { - if val < i64::MIN as f64 { + (SortValue::F64(val), SortFieldType::I64) + | (SortValue::F64(val), SortFieldType::DateTime) => { + let all_values_ahead1 = + val < i64::MIN as f64 && sort_order == SortOrder::Asc; + let all_values_ahead2 = + val > i64::MAX as f64 && sort_order == SortOrder::Desc; + if all_values_ahead1 || all_values_ahead2 { return None; } - (val as i64).to_u64() + // f64 cast already handles under/overflow and clamps the value + let val_i64 = val as i64; + + if *sort_field_type == SortFieldType::DateTime { + DateTime::from_timestamp_nanos(val_i64).to_u64() + } else { + val_i64.to_u64() + } } - // Disable search after for bool conversion into other types - // So it would be a match all ... but maybe it should be a match none instead? // Not sure when we hit this, it's probably are very rare case. - (SortValue::Boolean(_val), _) => return None, + (SortValue::Boolean(val), SortFieldType::U64) => val as u64, + (SortValue::Boolean(val), SortFieldType::F64) => (val as u64 as f64).to_u64(), + (SortValue::Boolean(val), SortFieldType::I64) => (val as i64).to_u64(), + (SortValue::Boolean(val), SortFieldType::DateTime) => { + DateTime::from_timestamp_nanos(val as i64).to_u64() + } + (SortValue::U64(mut val), SortFieldType::Bool) => { + let all_values_ahead1 = val > 1 && sort_order == SortOrder::Desc; + if all_values_ahead1 { + return None; + } + // clamp value for comparison + val = val.min(1).max(0); + (val == 1).to_u64() + } + (SortValue::I64(mut val), SortFieldType::Bool) => { + let all_values_ahead1 = val > 1 && sort_order == SortOrder::Desc; + let all_values_ahead2 = val < 0 && sort_order == SortOrder::Asc; + if all_values_ahead1 || all_values_ahead2 { + return None; + } + // clamp value for comparison + val = val.min(1).max(0); + (val == 1).to_u64() + } + (SortValue::F64(mut val), SortFieldType::Bool) => { + let all_values_ahead1 = val > 1.0 && sort_order == SortOrder::Desc; + let all_values_ahead2 = val < 0.0 && sort_order == SortOrder::Asc; + if all_values_ahead1 || all_values_ahead2 { + return None; + } + val = val.min(1.0).max(0.0); + (val >= 0.5).to_u64() // Is this correct? + } }; Some(val) } @@ -405,6 +492,8 @@ struct SearchAfterSegment { impl SearchAfterSegment { fn new( search_after: Option, + sort_order1: SortOrder, + sort_order2: SortOrder, score_extractor: &SortingFieldExtractorPair, ) -> Option { let Some(search_after) = search_after else { @@ -418,7 +507,7 @@ impl SearchAfterSegment { { if let Some(new_value) = score_extractor .first - .convert_to_u64_ff_val(search_after_sort_value) + .convert_to_u64_ff_val(search_after_sort_value, sort_order1) { sort_value = Some(new_value); } else { @@ -437,7 +526,9 @@ impl SearchAfterSegment { .second .as_ref() .expect("Internal error: Got sort_value2, but no sort extractor"); - if let Some(new_value) = extractor.convert_to_u64_ff_val(search_after_sort_value) { + if let Some(new_value) = + extractor.convert_to_u64_ff_val(search_after_sort_value, sort_order2) + { sort_value2 = Some(new_value); } } @@ -454,7 +545,7 @@ impl SearchAfterSegment { impl QuickwitSegmentCollector { #[inline] fn collect_top_k(&mut self, doc_id: DocId, score: Score) { - let (sort_value, sort_value2) = + let (sort_value, sort_value2): (Option, Option) = self.score_extractor.extract_typed_sort_value(doc_id, score); if let Some(search_after) = &self.search_after { @@ -804,7 +895,8 @@ impl Collector for QuickwitCollector { _ => Ordering::Equal, }; // Convert search_after into fast field u64 - let search_after = SearchAfterSegment::new(self.search_after.clone(), &score_extractor); + let search_after = + SearchAfterSegment::new(self.search_after.clone(), order1, order2, &score_extractor); Ok(QuickwitSegmentCollector { num_hits: 0u64, split_id: self.split_id.clone(), diff --git a/quickwit/rest-api-tests/run_tests.py b/quickwit/rest-api-tests/run_tests.py index 5e65e6668ff..70d0f314e12 100755 --- a/quickwit/rest-api-tests/run_tests.py +++ b/quickwit/rest-api-tests/run_tests.py @@ -137,7 +137,18 @@ def check_result(result, expected, context_path = ""): def check_result_list(result, expected, context_path=""): if len(result) != len(expected): - raise(Exception("Wrong length at context %s" % context_path)) + if len(expected) != 0: + # get keys from the expected dicts and filter result to print only the keys that are in the expected dicts + expected_keys = set().union(*expected) + filtered_result = [{k: v for k, v in d.items() if k in expected_keys} for d in result] + # Check if the length differs by more than five + if abs(len(filtered_result) - len(expected)) > 5: + # Show only the first 5 elements followed by ellipsis if there are more + display_filtered_result = filtered_result[:5] + ['...'] if len(filtered_result) > 5 else filtered_result + else: + display_filtered_result = filtered_result + raise Exception("Wrong length at context %s. Expected: %s Received: %s,\n Expected \n%s \n Received \n%s" % (context_path, len(expected), len(result), display_filtered_result, expected)) + raise Exception("Wrong length at context %s. Expected: %s Received: %s" % (context_path, len(expected), len(result))) for (i, (left, right)) in enumerate(zip(result, expected)): check_result(left, right, context_path + "[%s]" % i) @@ -242,7 +253,7 @@ def run_scenario(self, path, script): num_steps_executed += 1 except Exception as e: print("🔴 %s" % scenario_path) - print("Failed at step %d" % i) + print(f"Failed at step '{step['desc']}'" if 'desc' in step else f"Failed at step {i}") print(step) print(e) print("--------------") diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/0018-search_after.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/0018-search_after.yaml index bd24f4fb718..2f61636f994 100644 --- a/quickwit/rest-api-tests/scenarii/es_compatibility/0018-search_after.yaml +++ b/quickwit/rest-api-tests/scenarii/es_compatibility/0018-search_after.yaml @@ -143,3 +143,319 @@ expected: - sort: [1422748816000000000] - sort: [1422748816000000000] - sort: [1422748816000000000] +--- # i64 to u64 +endpoint: "search_after/_search" +engines: + - quickwit +json: + size: 1 + query: + match_all: {} + sort: + - val_u64: + order: asc + search_after: [-10] +expected: + hits: + total: + value: 5 + relation: eq + hits: + - sort: [0] +--- # f64 to u64 +endpoint: "search_after/_search" +engines: + - quickwit +json: + size: 1 + query: + match_all: {} + sort: + - val_u64: + order: asc + search_after: [0.2] +expected: + hits: + total: + value: 5 + relation: eq + hits: + - sort: [20] +--- # u64 to i64 +endpoint: "search_after/_search" +desc: "search after u64 to i64 asc" +engines: + - quickwit +json: + size: 5 + query: + match_all: {} + sort: + - val_i64: + order: asc + search_after: [250] +expected: + hits: + total: + value: 5 + relation: eq + hits: + - sort: [300] + - sort: [9223372036854775807] + - sort: [9223372036854775807] +--- # u64 to i64 +endpoint: "search_after/_search" +desc: "search after u64 to i64 desc" +engines: + - quickwit +json: + size: 5 + query: + match_all: {} + sort: + - val_i64: + order: desc + search_after: [250] +expected: + hits: + total: + value: 5 + relation: eq + hits: + - sort: [200] + - sort: [-100] +--- # u64 to i64 corner case. We are exceeding i64::MAX, so we don't get any results. +desc: "search after u64 to i64 corner case exceeding i64::MAX asc" +endpoint: "search_after/_search" +engines: + - quickwit +json: + size: 5 + query: + match_all: {} + sort: + - val_i64: + order: asc + search_after: [18_000_000_000_000_000_000] +expected: + hits: + total: + value: 5 + relation: eq + hits: + $expect: "len(val) == 0" +--- # u64 to i64 corner case.We are exceeding i64::MAX, but with desc we get ALL the results. +desc: "search after u64 to i64 corner case exceeding i64::MAX desc" +endpoint: "search_after/_search" +engines: + - quickwit +json: + size: 5 + query: + match_all: {} + sort: + - val_i64: + order: desc + search_after: [18_000_000_000_000_000_000] +expected: + hits: + total: + value: 5 + relation: eq + hits: + - sort: [9_223_372_036_854_775_807] + - sort: [9_223_372_036_854_775_807] + - sort: [300] + - sort: [200] + - sort: [-100] +--- # u64 to i64 corner case +desc: "search after u64 to i64 corner case one below i64::MAX asc" +endpoint: "search_after/_search" +engines: + - quickwit +json: + size: 1 + query: + match_all: {} + sort: + - val_i64: + order: asc + search_after: [9_223_372_036_854_775_806] +expected: + hits: + total: + value: 5 + relation: eq + hits: + - sort: [9_223_372_036_854_775_807] +--- +desc: "search after u64 to i64 corner case exactly i64::MAX asc" +endpoint: "search_after/_search" +engines: + - quickwit +json: + size: 1 + query: + match_all: {} + sort: + - val_i64: + order: asc + search_after: [9_223_372_036_854_775_807] +expected: + hits: + total: + value: 5 + relation: eq + hits: + $expect: "len(val) == 0" +--- +desc: "search after u64 to i64 corner case one above i64::MAX asc" +endpoint: "search_after/_search" +engines: + - quickwit +json: + size: 1 + query: + match_all: {} + sort: + - val_i64: + order: asc + search_after: [9_223_372_036_854_775_808] +expected: + hits: + total: + value: 5 + relation: eq + hits: + $expect: "len(val) == 0" +--- +desc: "search after f64 to i64 corner case" +endpoint: "search_after/_search" +engines: + - quickwit +json: + size: 1 + query: + match_all: {} + sort: + - val_i64: + order: asc + search_after: [9_223_372_036_854_500_000.5] # lower the value we seem to hit some f64 accuracy issue here +expected: + hits: + total: + value: 5 + relation: eq + hits: + - sort: [9_223_372_036_854_775_807] +--- +desc: "search after f64 to i64 out of bounds asc match nothing" +endpoint: "search_after/_search" +engines: + - quickwit +json: + size: 1 + query: + match_all: {} + sort: + - val_i64: + order: asc + search_after: [19_223_372_036_854_500_000.5] +expected: + hits: + total: + value: 5 + relation: eq + hits: + $expect: "len(val) == 0" +--- +desc: "search after f64 to i64 out of bounds desc match everything" +endpoint: "search_after/_search" +engines: + - quickwit +json: + size: 5 + query: + match_all: {} + sort: + - val_i64: + order: desc + search_after: [19_223_372_036_854_500_000.5] +expected: + hits: + total: + value: 5 + relation: eq + hits: + - sort: [9_223_372_036_854_775_807] + - sort: [9_223_372_036_854_775_807] + - sort: [300] + - sort: [200] + - sort: [-100] +--- +desc: "search after on mixed column asc" +endpoint: "search_after/_search" +engines: + - quickwit +json: + size: 5 + query: + match_all: {} + sort: + - mixed_type: + order: asc + search_after: [-10] +expected: + hits: + total: + value: 5 + relation: eq + hits: + - sort: [0] + - sort: [True] + - sort: [10.5] + - sort: [18000000000000000000] +--- +desc: "search after on mixed column desc match nothing" +endpoint: "search_after/_search" +engines: + - quickwit +json: + size: 5 + query: + match_all: {} + sort: + - mixed_type: + order: desc + search_after: [-10] +expected: + hits: + total: + value: 5 + relation: eq + hits: + $expect: "len(val) == 0" +--- +desc: "search after on mixed column desc" +endpoint: "search_after/_search" +engines: + - quickwit +json: + size: 5 + query: + match_all: {} + sort: + - mixed_type: + order: desc + search_after: [2] +expected: + hits: + total: + value: 5 + relation: eq + hits: + - sort: [True] + - sort: [0] + - sort: [-10] + + diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/0020-stats.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/0020-stats.yaml index 522dcbedaa8..ac1c44054a0 100644 --- a/quickwit/rest-api-tests/scenarii/es_compatibility/0020-stats.yaml +++ b/quickwit/rest-api-tests/scenarii/es_compatibility/0020-stats.yaml @@ -65,12 +65,12 @@ expected: _all: primaries: docs: - count: 100 + count: 105 total: segments: - count: 1 + count: 5 docs: - count: 100 + count: 105 indices: gharchive: primaries: diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/0021-cat-indices.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/0021-cat-indices.yaml index ca59de616b3..19ad4b4c48d 100644 --- a/quickwit/rest-api-tests/scenarii/es_compatibility/0021-cat-indices.yaml +++ b/quickwit/rest-api-tests/scenarii/es_compatibility/0021-cat-indices.yaml @@ -20,6 +20,8 @@ expected: docs.count: '0' - index: otel-traces-v0_7 docs.count: '0' +- index: search_after + docs.count: '5' --- method: [GET] engines: diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/_setup.quickwit.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/_setup.quickwit.yaml index 49ac827c44f..7905cf43cca 100644 --- a/quickwit/rest-api-tests/scenarii/es_compatibility/_setup.quickwit.yaml +++ b/quickwit/rest-api-tests/scenarii/es_compatibility/_setup.quickwit.yaml @@ -4,6 +4,12 @@ api_root: http://localhost:7280/api/v1/ endpoint: indexes/gharchive status_code: null --- +# Delete possibly remaining index +method: DELETE +api_root: http://localhost:7280/api/v1/ +endpoint: indexes/empty_index +status_code: null +--- # Create index method: POST api_root: http://localhost:7280/api/v1/ @@ -16,6 +22,7 @@ json: - name: created_at type: datetime fast: true +sleep_after: 3 --- # Create index method: POST @@ -55,3 +62,70 @@ params: refresh: "true" headers: {"Content-Type": "application/json", "content-encoding": "gzip"} body_from_file: gharchive-bulk.json.gz +sleep_after: 3 +--- +# Delete possibly remaining index +method: DELETE +endpoint: indexes/search_after +status_code: null +--- +# Create index +method: POST +api_root: http://localhost:7280/api/v1/ +endpoint: indexes/ +json: + version: "0.7" + index_id: search_after + doc_mapping: + mode: dynamic + dynamic_mapping: + tokenizer: default + fast: true + field_mappings: + - name: val_u64 + type: u64 + fast: true + - name: val_f64 + type: f64 + fast: true + - name: val_i64 + type: i64 + fast: true +sleep_after: 3 +--- +# Ingest documents split #1 +method: POST +api_root: http://localhost:7280/api/v1/ +endpoint: search_after/ingest +params: + commit: force +ndjson: + - {"mixed_type": 18_000_000_000_000_000_000, "val_i64": -100, "val_f64": 100.5, "val_u64": 0} # mixed_type is a u64 + - {"mixed_type": 0, "val_i64": 9_223_372_036_854_775_807, "val_f64": 110, "val_u64": 18_000_000_000_000_000_000} # to enforce u64 type on val_u64 we need a value > 2^63, or it will take i64 (maybe we should change this) +--- +# Ingest documents split #2 +method: POST +api_root: http://localhost:7280/api/v1/ +endpoint: search_after/ingest +params: + commit: force +ndjson: + - {"mixed_type": 10.5, "val_i64": 200, "val_f64": 200.0, "val_u64": 20} #mixed_type is a f64 +--- +# Ingest documents split #3 +method: POST +api_root: http://localhost:7280/api/v1/ +endpoint: search_after/ingest +params: + commit: force +ndjson: + - {"mixed_type": -10, "val_i64": 300, "val_f64": 300.0, "val_u64": 0} #mixed_type is a i64 +--- +# Ingest documents split #4 +method: POST +api_root: http://localhost:7280/api/v1/ +endpoint: search_after/ingest +params: + commit: force +ndjson: + - {"mixed_type": true, "val_i64": 9_223_372_036_854_775_807, "val_f64": 300.0, "val_u64": 0} # i64::MAX diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/_teardown.quickwit.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/_teardown.quickwit.yaml index 746e2160601..b9612bfad96 100644 --- a/quickwit/rest-api-tests/scenarii/es_compatibility/_teardown.quickwit.yaml +++ b/quickwit/rest-api-tests/scenarii/es_compatibility/_teardown.quickwit.yaml @@ -22,3 +22,8 @@ method: DELETE api_root: http://localhost:7280/api/v1/ endpoint: indexes/test_index1 status_code: null +--- # Cleanup +method: DELETE +api_root: http://localhost:7280/api/v1/ +endpoint: indexes/search_after +status_code: null