handle all type conversion cases, add tests, improve python test lib

quickwit-oss · Mar 12, 2024 · a2ebcde · a2ebcde
1 parent fca2c1f
commit a2ebcde
Show file tree

Hide file tree

Showing 7 changed files with 530 additions and 30 deletions.
diff --git a/quickwit/quickwit-search/src/collector.rs b/quickwit/quickwit-search/src/collector.rs
@@ -128,7 +128,7 @@ impl SortByComponent {
     }
 }
 
-#[derive(Copy, Clone, Debug)]
+#[derive(Copy, Clone, Debug, Eq, PartialEq)]
 enum SortFieldType {
     U64,
     I64,
@@ -158,6 +158,7 @@ impl SortingFieldExtractorComponent {
     #[inline]
     fn extract_typed_sort_value_opt(&self, doc_id: DocId, score: Score) -> Option<u64> {
         match self {
+            // Tie breaks are not handled here, but in SegmentPartialHit
             SortingFieldExtractorComponent::DocId => None,
             SortingFieldExtractorComponent::FastField { sort_column, .. } => {
                 sort_column.first(doc_id)
@@ -169,6 +170,9 @@ impl SortingFieldExtractorComponent {
     #[inline]
     /// Converts u64 fast field values to its correct type.
     /// The conversion is delayed for performance reasons.
+    ///
+    /// This is used to convert `search_after` sort value to a u64 representation that will respect
+    /// the same order as the `SortValue` representation.
     fn convert_u64_ff_val_to_sort_value(&self, sort_value: u64) -> SortValue {
         let map_fast_field_to_value = |fast_field_value, field_type| match field_type {
             SortFieldType::U64 => SortValue::U64(fast_field_value),
@@ -188,12 +192,12 @@ impl SortingFieldExtractorComponent {
     /// Converts fast field values into their u64 fast field representation.
     ///
     /// Returns None if value is out of bounds of target value.
-    /// None means that the search_after will be disabled as everything matches.
+    /// None means that the search_after will be disabled and everything matches.
     ///
     /// What's currently missing is to signal that _nothing_ matches to generate an optimized
-    /// query.
+    /// query. For now we just choose the max value of the target type.
     #[inline]
-    fn convert_to_u64_ff_val(&self, sort_value: SortValue) -> Option<u64> {
+    fn convert_to_u64_ff_val(&self, sort_value: SortValue, sort_order: SortOrder) -> Option<u64> {
         match self {
             SortingFieldExtractorComponent::DocId => match sort_value {
                 SortValue::U64(val) => Some(val),
@@ -206,56 +210,139 @@ impl SortingFieldExtractorComponent {
                 // representation of the fast field.
                 // This requires this weird conversion of first casting into the target type
                 // (if possible) and then to its u64 presentation.
+                //
+                // For the conversion into the target type it's important to know if the target
+                // type does not cover the whole range of the source type. In that case we need to
+                // add additional conversion checks, to see if it matches everything
+                // or nothing. (Which also depends on the sort order).
+                // Below are the visual representations of the value ranges of the different types.
+                // Note: DateTime is equal to I64 and omitted.
+                //
+                //     Bool value range (0, 1):
+                //                        <->
+                //
+                //     I64 value range (signed 64-bit integer):
+                //     <------------------------------------>
+                //     -2^63                             2^63-1
+                //     U64 value range (unsigned 64-bit integer):
+                //                        <------------------------------------>
+                //                        0                                  2^64-1
+                // F64 value range (64-bit floating point, conceptual, not to scale):
+                // <-------------------------------------------------------------------->
+                // Very negative numbers                                       Very positive numbers
+                //
+                // Those conversions have limited target type value space:
+                // - [X] U64 -> I64
+                // - [X] F64 -> I64
+                // - [X] I64 -> U64
+                // - [X] F64 -> U64
+                //
+                // - [X] F64 -> Bool
+                // - [X] I64 -> Bool
+                // - [X] U64 -> Bool
+                //
                 let val = match (sort_value, sort_field_type) {
                     // Same field type, no conversion needed.
                     (SortValue::U64(val), SortFieldType::U64) => val,
                     (SortValue::F64(val), SortFieldType::F64) => val.to_u64(),
                     (SortValue::Boolean(val), SortFieldType::Bool) => val.to_u64(),
                     (SortValue::I64(val), SortFieldType::I64) => val.to_u64(),
                     (SortValue::U64(mut val), SortFieldType::I64) => {
+                        if sort_order == SortOrder::Desc && val > i64::MAX as u64 {
+                            return None;
+                        }
                         // Add a limit to avoid overflow.
                         val = val.min(i64::MAX as u64);
                         (val as i64).to_u64()
                     }
                     (SortValue::U64(val), SortFieldType::F64) => (val as f64).to_u64(),
                     (SortValue::U64(mut val), SortFieldType::DateTime) => {
+                        // Match everything
+                        if sort_order == SortOrder::Desc && val > i64::MAX as u64 {
+                            return None;
+                        }
                         // Add a limit to avoid overflow.
                         val = val.min(i64::MAX as u64);
                         DateTime::from_timestamp_nanos(val as i64).to_u64()
                     }
-                    // questionable number into boolean conversions
-                    (SortValue::U64(val), SortFieldType::Bool) => (val != 0).to_u64(),
-                    (SortValue::I64(val), SortFieldType::Bool) => (val != 0).to_u64(),
-                    (SortValue::F64(val), SortFieldType::Bool) => (val != 0.0).to_u64(),
                     (SortValue::I64(val), SortFieldType::U64) => {
-                        if val < 0 {
+                        if val < 0 && sort_order == SortOrder::Asc {
                             return None;
                         }
-                        val as u64
+                        if val < 0 && sort_order == SortOrder::Desc {
+                            u64::MIN // matches nothing as search_after is not inclusive
+                        } else {
+                            val as u64
+                        }
                     }
                     (SortValue::I64(val), SortFieldType::F64) => (val as f64).to_u64(),
                     (SortValue::I64(val), SortFieldType::DateTime) => {
                         DateTime::from_timestamp_nanos(val).to_u64()
                     }
-                    (SortValue::F64(val), SortFieldType::DateTime) => {
-                        DateTime::from_timestamp_nanos(val as i64).to_u64()
-                    }
                     (SortValue::F64(val), SortFieldType::U64) => {
-                        if val < 0.0 {
+                        let all_values_ahead1 =
+                            val < u64::MIN as f64 && sort_order == SortOrder::Asc;
+                        let all_values_ahead2 =
+                            val > u64::MAX as f64 && sort_order == SortOrder::Desc;
+                        if all_values_ahead1 || all_values_ahead2 {
                             return None;
                         }
-                        (val.floor() as u64).to_u64()
+                        // f64 cast already handles under/overflow and clamps the value
+                        (val as u64).to_u64()
                     }
-                    (SortValue::F64(val), SortFieldType::I64) => {
-                        if val < i64::MIN as f64 {
+                    (SortValue::F64(val), SortFieldType::I64)
+                    | (SortValue::F64(val), SortFieldType::DateTime) => {
+                        let all_values_ahead1 =
+                            val < i64::MIN as f64 && sort_order == SortOrder::Asc;
+                        let all_values_ahead2 =
+                            val > i64::MAX as f64 && sort_order == SortOrder::Desc;
+                        if all_values_ahead1 || all_values_ahead2 {
                             return None;
                         }
-                        (val as i64).to_u64()
+                        // f64 cast already handles under/overflow and clamps the value
+                        let val_i64 = val as i64;
+
+                        if *sort_field_type == SortFieldType::DateTime {
+                            DateTime::from_timestamp_nanos(val_i64).to_u64()
+                        } else {
+                            val_i64.to_u64()
+                        }
                     }
-                    // Disable search after for bool conversion into other types
-                    // So it would be a match all ...  but maybe it should be a match none instead?
                     // Not sure when we hit this, it's probably are very rare case.
-                    (SortValue::Boolean(_val), _) => return None,
+                    (SortValue::Boolean(val), SortFieldType::U64) => val as u64,
+                    (SortValue::Boolean(val), SortFieldType::F64) => (val as u64 as f64).to_u64(),
+                    (SortValue::Boolean(val), SortFieldType::I64) => (val as i64).to_u64(),
+                    (SortValue::Boolean(val), SortFieldType::DateTime) => {
+                        DateTime::from_timestamp_nanos(val as i64).to_u64()
+                    }
+                    (SortValue::U64(mut val), SortFieldType::Bool) => {
+                        let all_values_ahead1 = val > 1 && sort_order == SortOrder::Desc;
+                        if all_values_ahead1 {
+                            return None;
+                        }
+                        // clamp value for comparison
+                        val = val.min(1).max(0);
+                        (val == 1).to_u64()
+                    }
+                    (SortValue::I64(mut val), SortFieldType::Bool) => {
+                        let all_values_ahead1 = val > 1 && sort_order == SortOrder::Desc;
+                        let all_values_ahead2 = val < 0 && sort_order == SortOrder::Asc;
+                        if all_values_ahead1 || all_values_ahead2 {
+                            return None;
+                        }
+                        // clamp value for comparison
+                        val = val.min(1).max(0);
+                        (val == 1).to_u64()
+                    }
+                    (SortValue::F64(mut val), SortFieldType::Bool) => {
+                        let all_values_ahead1 = val > 1.0 && sort_order == SortOrder::Desc;
+                        let all_values_ahead2 = val < 0.0 && sort_order == SortOrder::Asc;
+                        if all_values_ahead1 || all_values_ahead2 {
+                            return None;
+                        }
+                        val = val.min(1.0).max(0.0);
+                        (val >= 0.5).to_u64() // Is this correct?
+                    }
                 };
                 Some(val)
             }
@@ -405,6 +492,8 @@ struct SearchAfterSegment {
 impl SearchAfterSegment {
     fn new(
         search_after: Option<PartialHit>,
+        sort_order1: SortOrder,
+        sort_order2: SortOrder,
         score_extractor: &SortingFieldExtractorPair,
     ) -> Option<Self> {
         let Some(search_after) = search_after else {
@@ -418,7 +507,7 @@ impl SearchAfterSegment {
         {
             if let Some(new_value) = score_extractor
                 .first
-                .convert_to_u64_ff_val(search_after_sort_value)
+                .convert_to_u64_ff_val(search_after_sort_value, sort_order1)
             {
                 sort_value = Some(new_value);
             } else {
@@ -437,7 +526,9 @@ impl SearchAfterSegment {
                 .second
                 .as_ref()
                 .expect("Internal error: Got sort_value2, but no sort extractor");
-            if let Some(new_value) = extractor.convert_to_u64_ff_val(search_after_sort_value) {
+            if let Some(new_value) =
+                extractor.convert_to_u64_ff_val(search_after_sort_value, sort_order2)
+            {
                 sort_value2 = Some(new_value);
             }
         }
@@ -454,7 +545,7 @@ impl SearchAfterSegment {
 impl QuickwitSegmentCollector {
     #[inline]
     fn collect_top_k(&mut self, doc_id: DocId, score: Score) {
-        let (sort_value, sort_value2) =
+        let (sort_value, sort_value2): (Option<u64>, Option<u64>) =
             self.score_extractor.extract_typed_sort_value(doc_id, score);
 
         if let Some(search_after) = &self.search_after {
@@ -804,7 +895,8 @@ impl Collector for QuickwitCollector {
             _ => Ordering::Equal,
         };
         // Convert search_after into fast field u64
-        let search_after = SearchAfterSegment::new(self.search_after.clone(), &score_extractor);
+        let search_after =
+            SearchAfterSegment::new(self.search_after.clone(), order1, order2, &score_extractor);
         Ok(QuickwitSegmentCollector {
             num_hits: 0u64,
             split_id: self.split_id.clone(),

diff --git a/quickwit/rest-api-tests/run_tests.py b/quickwit/rest-api-tests/run_tests.py
@@ -137,7 +137,18 @@ def check_result(result, expected, context_path = ""):
 
 def check_result_list(result, expected, context_path=""):
     if len(result) != len(expected):
-        raise(Exception("Wrong length at context %s" % context_path))
+        if len(expected) != 0:
+            # get keys from the expected dicts and filter result to print only the keys that are in the expected dicts
+            expected_keys = set().union(*expected)
+            filtered_result = [{k: v for k, v in d.items() if k in expected_keys} for d in result]
+            # Check if the length differs by more than five
+            if abs(len(filtered_result) - len(expected)) > 5:
+                # Show only the first 5 elements followed by ellipsis if there are more
+                display_filtered_result = filtered_result[:5] + ['...'] if len(filtered_result) > 5 else filtered_result
+            else:
+                display_filtered_result = filtered_result
+            raise Exception("Wrong length at context %s. Expected: %s Received: %s,\n Expected \n%s \n Received \n%s" % (context_path, len(expected), len(result), display_filtered_result, expected))
+        raise Exception("Wrong length at context %s. Expected: %s Received: %s" % (context_path, len(expected), len(result)))
     for (i, (left, right)) in enumerate(zip(result, expected)):
         check_result(left, right, context_path + "[%s]" % i)
 
@@ -242,7 +253,7 @@ def run_scenario(self, path, script):
                 num_steps_executed += 1
             except Exception as e:
                 print("🔴 %s" % scenario_path)
-                print("Failed at step %d" % i)
+                print(f"Failed at step '{step['desc']}'" if 'desc' in step else f"Failed at step {i}")
                 print(step)
                 print(e)
                 print("--------------")