Skip to content

Commit

Permalink
handle all type conversion cases, add tests, improve python test lib
Browse files Browse the repository at this point in the history
  • Loading branch information
PSeitz committed Mar 12, 2024
1 parent fca2c1f commit a2ebcde
Show file tree
Hide file tree
Showing 7 changed files with 530 additions and 30 deletions.
142 changes: 117 additions & 25 deletions quickwit/quickwit-search/src/collector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ impl SortByComponent {
}
}

#[derive(Copy, Clone, Debug)]
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
enum SortFieldType {
U64,
I64,
Expand Down Expand Up @@ -158,6 +158,7 @@ impl SortingFieldExtractorComponent {
#[inline]
fn extract_typed_sort_value_opt(&self, doc_id: DocId, score: Score) -> Option<u64> {
match self {
// Tie breaks are not handled here, but in SegmentPartialHit
SortingFieldExtractorComponent::DocId => None,
SortingFieldExtractorComponent::FastField { sort_column, .. } => {
sort_column.first(doc_id)
Expand All @@ -169,6 +170,9 @@ impl SortingFieldExtractorComponent {
#[inline]
/// Converts u64 fast field values to its correct type.
/// The conversion is delayed for performance reasons.
///
/// This is used to convert `search_after` sort value to a u64 representation that will respect
/// the same order as the `SortValue` representation.
fn convert_u64_ff_val_to_sort_value(&self, sort_value: u64) -> SortValue {
let map_fast_field_to_value = |fast_field_value, field_type| match field_type {
SortFieldType::U64 => SortValue::U64(fast_field_value),
Expand All @@ -188,12 +192,12 @@ impl SortingFieldExtractorComponent {
/// Converts fast field values into their u64 fast field representation.
///
/// Returns None if value is out of bounds of target value.
/// None means that the search_after will be disabled as everything matches.
/// None means that the search_after will be disabled and everything matches.
///
/// What's currently missing is to signal that _nothing_ matches to generate an optimized
/// query.
/// query. For now we just choose the max value of the target type.
#[inline]
fn convert_to_u64_ff_val(&self, sort_value: SortValue) -> Option<u64> {
fn convert_to_u64_ff_val(&self, sort_value: SortValue, sort_order: SortOrder) -> Option<u64> {
match self {
SortingFieldExtractorComponent::DocId => match sort_value {
SortValue::U64(val) => Some(val),
Expand All @@ -206,56 +210,139 @@ impl SortingFieldExtractorComponent {
// representation of the fast field.
// This requires this weird conversion of first casting into the target type
// (if possible) and then to its u64 presentation.
//
// For the conversion into the target type it's important to know if the target
// type does not cover the whole range of the source type. In that case we need to
// add additional conversion checks, to see if it matches everything
// or nothing. (Which also depends on the sort order).
// Below are the visual representations of the value ranges of the different types.
// Note: DateTime is equal to I64 and omitted.
//
// Bool value range (0, 1):
// <->
//
// I64 value range (signed 64-bit integer):
// <------------------------------------>
// -2^63 2^63-1
// U64 value range (unsigned 64-bit integer):
// <------------------------------------>
// 0 2^64-1
// F64 value range (64-bit floating point, conceptual, not to scale):
// <-------------------------------------------------------------------->
// Very negative numbers Very positive numbers
//
// Those conversions have limited target type value space:
// - [X] U64 -> I64
// - [X] F64 -> I64
// - [X] I64 -> U64
// - [X] F64 -> U64
//
// - [X] F64 -> Bool
// - [X] I64 -> Bool
// - [X] U64 -> Bool
//
let val = match (sort_value, sort_field_type) {
// Same field type, no conversion needed.
(SortValue::U64(val), SortFieldType::U64) => val,
(SortValue::F64(val), SortFieldType::F64) => val.to_u64(),
(SortValue::Boolean(val), SortFieldType::Bool) => val.to_u64(),
(SortValue::I64(val), SortFieldType::I64) => val.to_u64(),
(SortValue::U64(mut val), SortFieldType::I64) => {
if sort_order == SortOrder::Desc && val > i64::MAX as u64 {
return None;
}
// Add a limit to avoid overflow.
val = val.min(i64::MAX as u64);
(val as i64).to_u64()
}
(SortValue::U64(val), SortFieldType::F64) => (val as f64).to_u64(),
(SortValue::U64(mut val), SortFieldType::DateTime) => {
// Match everything
if sort_order == SortOrder::Desc && val > i64::MAX as u64 {
return None;
}
// Add a limit to avoid overflow.
val = val.min(i64::MAX as u64);
DateTime::from_timestamp_nanos(val as i64).to_u64()
}
// questionable number into boolean conversions
(SortValue::U64(val), SortFieldType::Bool) => (val != 0).to_u64(),
(SortValue::I64(val), SortFieldType::Bool) => (val != 0).to_u64(),
(SortValue::F64(val), SortFieldType::Bool) => (val != 0.0).to_u64(),
(SortValue::I64(val), SortFieldType::U64) => {
if val < 0 {
if val < 0 && sort_order == SortOrder::Asc {
return None;
}
val as u64
if val < 0 && sort_order == SortOrder::Desc {
u64::MIN // matches nothing as search_after is not inclusive
} else {
val as u64
}
}
(SortValue::I64(val), SortFieldType::F64) => (val as f64).to_u64(),
(SortValue::I64(val), SortFieldType::DateTime) => {
DateTime::from_timestamp_nanos(val).to_u64()
}
(SortValue::F64(val), SortFieldType::DateTime) => {
DateTime::from_timestamp_nanos(val as i64).to_u64()
}
(SortValue::F64(val), SortFieldType::U64) => {
if val < 0.0 {
let all_values_ahead1 =
val < u64::MIN as f64 && sort_order == SortOrder::Asc;
let all_values_ahead2 =
val > u64::MAX as f64 && sort_order == SortOrder::Desc;
if all_values_ahead1 || all_values_ahead2 {
return None;
}
(val.floor() as u64).to_u64()
// f64 cast already handles under/overflow and clamps the value
(val as u64).to_u64()
}
(SortValue::F64(val), SortFieldType::I64) => {
if val < i64::MIN as f64 {
(SortValue::F64(val), SortFieldType::I64)
| (SortValue::F64(val), SortFieldType::DateTime) => {
let all_values_ahead1 =
val < i64::MIN as f64 && sort_order == SortOrder::Asc;
let all_values_ahead2 =
val > i64::MAX as f64 && sort_order == SortOrder::Desc;
if all_values_ahead1 || all_values_ahead2 {
return None;
}
(val as i64).to_u64()
// f64 cast already handles under/overflow and clamps the value
let val_i64 = val as i64;

if *sort_field_type == SortFieldType::DateTime {
DateTime::from_timestamp_nanos(val_i64).to_u64()
} else {
val_i64.to_u64()
}
}
// Disable search after for bool conversion into other types
// So it would be a match all ... but maybe it should be a match none instead?
// Not sure when we hit this, it's probably are very rare case.
(SortValue::Boolean(_val), _) => return None,
(SortValue::Boolean(val), SortFieldType::U64) => val as u64,
(SortValue::Boolean(val), SortFieldType::F64) => (val as u64 as f64).to_u64(),
(SortValue::Boolean(val), SortFieldType::I64) => (val as i64).to_u64(),
(SortValue::Boolean(val), SortFieldType::DateTime) => {
DateTime::from_timestamp_nanos(val as i64).to_u64()
}
(SortValue::U64(mut val), SortFieldType::Bool) => {
let all_values_ahead1 = val > 1 && sort_order == SortOrder::Desc;
if all_values_ahead1 {
return None;
}
// clamp value for comparison
val = val.min(1).max(0);
(val == 1).to_u64()
}
(SortValue::I64(mut val), SortFieldType::Bool) => {
let all_values_ahead1 = val > 1 && sort_order == SortOrder::Desc;
let all_values_ahead2 = val < 0 && sort_order == SortOrder::Asc;
if all_values_ahead1 || all_values_ahead2 {
return None;
}
// clamp value for comparison
val = val.min(1).max(0);
(val == 1).to_u64()
}
(SortValue::F64(mut val), SortFieldType::Bool) => {
let all_values_ahead1 = val > 1.0 && sort_order == SortOrder::Desc;
let all_values_ahead2 = val < 0.0 && sort_order == SortOrder::Asc;
if all_values_ahead1 || all_values_ahead2 {
return None;
}
val = val.min(1.0).max(0.0);
(val >= 0.5).to_u64() // Is this correct?
}
};
Some(val)
}
Expand Down Expand Up @@ -405,6 +492,8 @@ struct SearchAfterSegment {
impl SearchAfterSegment {
fn new(
search_after: Option<PartialHit>,
sort_order1: SortOrder,
sort_order2: SortOrder,
score_extractor: &SortingFieldExtractorPair,
) -> Option<Self> {
let Some(search_after) = search_after else {
Expand All @@ -418,7 +507,7 @@ impl SearchAfterSegment {
{
if let Some(new_value) = score_extractor
.first
.convert_to_u64_ff_val(search_after_sort_value)
.convert_to_u64_ff_val(search_after_sort_value, sort_order1)
{
sort_value = Some(new_value);
} else {
Expand All @@ -437,7 +526,9 @@ impl SearchAfterSegment {
.second
.as_ref()
.expect("Internal error: Got sort_value2, but no sort extractor");
if let Some(new_value) = extractor.convert_to_u64_ff_val(search_after_sort_value) {
if let Some(new_value) =
extractor.convert_to_u64_ff_val(search_after_sort_value, sort_order2)
{
sort_value2 = Some(new_value);
}
}
Expand All @@ -454,7 +545,7 @@ impl SearchAfterSegment {
impl QuickwitSegmentCollector {
#[inline]
fn collect_top_k(&mut self, doc_id: DocId, score: Score) {
let (sort_value, sort_value2) =
let (sort_value, sort_value2): (Option<u64>, Option<u64>) =
self.score_extractor.extract_typed_sort_value(doc_id, score);

if let Some(search_after) = &self.search_after {
Expand Down Expand Up @@ -804,7 +895,8 @@ impl Collector for QuickwitCollector {
_ => Ordering::Equal,
};
// Convert search_after into fast field u64
let search_after = SearchAfterSegment::new(self.search_after.clone(), &score_extractor);
let search_after =
SearchAfterSegment::new(self.search_after.clone(), order1, order2, &score_extractor);
Ok(QuickwitSegmentCollector {
num_hits: 0u64,
split_id: self.split_id.clone(),
Expand Down
15 changes: 13 additions & 2 deletions quickwit/rest-api-tests/run_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,18 @@ def check_result(result, expected, context_path = ""):

def check_result_list(result, expected, context_path=""):
if len(result) != len(expected):
raise(Exception("Wrong length at context %s" % context_path))
if len(expected) != 0:
# get keys from the expected dicts and filter result to print only the keys that are in the expected dicts
expected_keys = set().union(*expected)
filtered_result = [{k: v for k, v in d.items() if k in expected_keys} for d in result]
# Check if the length differs by more than five
if abs(len(filtered_result) - len(expected)) > 5:
# Show only the first 5 elements followed by ellipsis if there are more
display_filtered_result = filtered_result[:5] + ['...'] if len(filtered_result) > 5 else filtered_result
else:
display_filtered_result = filtered_result
raise Exception("Wrong length at context %s. Expected: %s Received: %s,\n Expected \n%s \n Received \n%s" % (context_path, len(expected), len(result), display_filtered_result, expected))
raise Exception("Wrong length at context %s. Expected: %s Received: %s" % (context_path, len(expected), len(result)))
for (i, (left, right)) in enumerate(zip(result, expected)):
check_result(left, right, context_path + "[%s]" % i)

Expand Down Expand Up @@ -242,7 +253,7 @@ def run_scenario(self, path, script):
num_steps_executed += 1
except Exception as e:
print("🔴 %s" % scenario_path)
print("Failed at step %d" % i)
print(f"Failed at step '{step['desc']}'" if 'desc' in step else f"Failed at step {i}")
print(step)
print(e)
print("--------------")
Expand Down
Loading

0 comments on commit a2ebcde

Please sign in to comment.