From ac827abe1b66b1dfa02ce65ae857477f68667843 Mon Sep 17 00:00:00 2001 From: Jonathan Chen Date: Thu, 24 Oct 2024 07:26:37 -0400 Subject: [PATCH] feat: Migrate Map Functions (#13047) * add page * small fixes * delete md * Migrate map functions --- datafusion/functions-nested/src/map.rs | 69 ++++++++- .../functions-nested/src/map_extract.rs | 49 +++++- datafusion/functions-nested/src/map_keys.rs | 41 ++++- datafusion/functions-nested/src/map_values.rs | 41 ++++- dev/update_function_docs.sh | 1 - .../source/user-guide/sql/scalar_functions.md | 145 ------------------ .../user-guide/sql/scalar_functions_new.md | 144 +++++++++++++++++ 7 files changed, 334 insertions(+), 156 deletions(-) diff --git a/datafusion/functions-nested/src/map.rs b/datafusion/functions-nested/src/map.rs index 29afe4a7f3be..d7dce3bacbe1 100644 --- a/datafusion/functions-nested/src/map.rs +++ b/datafusion/functions-nested/src/map.rs @@ -17,7 +17,7 @@ use std::any::Any; use std::collections::{HashSet, VecDeque}; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::ArrayData; use arrow_array::{Array, ArrayRef, MapArray, OffsetSizeTrait, StructArray}; @@ -27,7 +27,10 @@ use arrow_schema::{DataType, Field, SchemaBuilder}; use datafusion_common::utils::{fixed_size_list_to_arrays, list_to_arrays}; use datafusion_common::{exec_err, Result, ScalarValue}; use datafusion_expr::expr::ScalarFunction; -use datafusion_expr::{ColumnarValue, Expr, ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_MAP; +use datafusion_expr::{ + ColumnarValue, Documentation, Expr, ScalarUDFImpl, Signature, Volatility, +}; use crate::make_array::make_array; @@ -238,7 +241,69 @@ impl ScalarUDFImpl for MapFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> { make_map_batch(args) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_map_doc()) + } } + +static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new(); + +fn get_map_doc() -> &'static Documentation { + 
DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_MAP) + .with_description( + "Returns an Arrow map with the specified key-value pairs.\n\n\ + The `make_map` function creates a map from two lists: one for keys and one for values. Each key must be unique and non-null." + ) + .with_syntax_example( + "map(key, value)\nmap(key: value)\nmake_map(['key1', 'key2'], ['value1', 'value2'])" + ) + .with_sql_example( + r#"```sql + -- Using map function + SELECT MAP('type', 'test'); + ---- + {type: test} + + SELECT MAP(['POST', 'HEAD', 'PATCH'], [41, 33, null]); + ---- + {POST: 41, HEAD: 33, PATCH: } + + SELECT MAP([[1,2], [3,4]], ['a', 'b']); + ---- + {[1, 2]: a, [3, 4]: b} + + SELECT MAP { 'a': 1, 'b': 2 }; + ---- + {a: 1, b: 2} + + -- Using make_map function + SELECT MAKE_MAP(['POST', 'HEAD'], [41, 33]); + ---- + {POST: 41, HEAD: 33} + + SELECT MAKE_MAP(['key1', 'key2'], ['value1', null]); + ---- + {key1: value1, key2: } + ```"# + ) + .with_argument( + "key", + "For `map`: Expression to be used for key. Can be a constant, column, function, or any combination of arithmetic or string operators.\n\ + For `make_map`: The list of keys to be used in the map. Each key must be unique and non-null." + ) + .with_argument( + "value", + "For `map`: Expression to be used for value. Can be a constant, column, function, or any combination of arithmetic or string operators.\n\ + For `make_map`: The list of values to be mapped to the corresponding keys." 
+ ) + .build() + .unwrap() + }) +} + fn get_element_type(data_type: &DataType) -> Result<&DataType> { match data_type { DataType::List(element) => Ok(element.data_type()), diff --git a/datafusion/functions-nested/src/map_extract.rs b/datafusion/functions-nested/src/map_extract.rs index 9f0c4ad29c60..d2bb6595fe76 100644 --- a/datafusion/functions-nested/src/map_extract.rs +++ b/datafusion/functions-nested/src/map_extract.rs @@ -26,9 +26,12 @@ use arrow_buffer::OffsetBuffer; use arrow_schema::Field; use datafusion_common::{cast::as_map_array, exec_err, Result}; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_MAP; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use std::vec; use crate::utils::{get_map_entry_field, make_scalar_function}; @@ -101,6 +104,48 @@ impl ScalarUDFImpl for MapExtract { field.first().unwrap().data_type().clone(), ]) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_map_extract_doc()) + } +} + +static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new(); + +fn get_map_extract_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_MAP) + .with_description( + "Returns a list containing the value for the given key or an empty list if the key is not present in the map.", + ) + .with_syntax_example("map_extract(map, key)") + .with_sql_example( + r#"```sql +SELECT map_extract(MAP {'a': 1, 'b': NULL, 'c': 3}, 'a'); +---- +[1] + +SELECT map_extract(MAP {1: 'one', 2: 'two'}, 2); +---- +['two'] + +SELECT map_extract(MAP {'x': 10, 'y': NULL, 'z': 30}, 'y'); +---- +[] +```"#, + ) + .with_argument( + "map", + "Map expression. Can be a constant, column, or function, and any combination of map operators.", + ) + .with_argument( + "key", + "Key to extract from the map. 
Can be a constant, column, or function, any combination of arithmetic or string operators, or a named expression of the previously listed.", + ) + .build() + .unwrap() + }) } fn general_map_extract_inner( diff --git a/datafusion/functions-nested/src/map_keys.rs b/datafusion/functions-nested/src/map_keys.rs index 0b1cebb27c86..f28de1c3b2c7 100644 --- a/datafusion/functions-nested/src/map_keys.rs +++ b/datafusion/functions-nested/src/map_keys.rs @@ -21,12 +21,13 @@ use crate::utils::{get_map_entry_field, make_scalar_function}; use arrow_array::{Array, ArrayRef, ListArray}; use arrow_schema::{DataType, Field}; use datafusion_common::{cast::as_map_array, exec_err, Result}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_MAP; use datafusion_expr::{ - ArrayFunctionSignature, ColumnarValue, ScalarUDFImpl, Signature, TypeSignature, - Volatility, + ArrayFunctionSignature, ColumnarValue, Documentation, ScalarUDFImpl, Signature, + TypeSignature, Volatility, }; use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; make_udf_expr_and_func!( MapKeysFunc, @@ -81,6 +82,40 @@ impl ScalarUDFImpl for MapKeysFunc { fn invoke(&self, args: &[ColumnarValue]) -> datafusion_common::Result<ColumnarValue> { make_scalar_function(map_keys_inner)(args) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_map_keys_doc()) + } +} + +static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new(); + +fn get_map_keys_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_MAP) + .with_description( + "Returns a list of all keys in the map." + ) + .with_syntax_example("map_keys(map)") + .with_sql_example( + r#"```sql +SELECT map_keys(MAP {'a': 1, 'b': NULL, 'c': 3}); +---- +[a, b, c] + +SELECT map_keys(map([100, 5], [42, 43])); +---- +[100, 5] +```"#, + ) + .with_argument( + "map", + "Map expression. Can be a constant, column, or function, and any combination of map operators." 
+ ) + .build() + .unwrap() + }) } fn map_keys_inner(args: &[ArrayRef]) -> Result<ArrayRef> { diff --git a/datafusion/functions-nested/src/map_values.rs b/datafusion/functions-nested/src/map_values.rs index 58c0d74eed5f..2b19d9fbbc76 100644 --- a/datafusion/functions-nested/src/map_values.rs +++ b/datafusion/functions-nested/src/map_values.rs @@ -21,12 +21,13 @@ use crate::utils::{get_map_entry_field, make_scalar_function}; use arrow_array::{Array, ArrayRef, ListArray}; use arrow_schema::{DataType, Field}; use datafusion_common::{cast::as_map_array, exec_err, Result}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_MAP; use datafusion_expr::{ - ArrayFunctionSignature, ColumnarValue, ScalarUDFImpl, Signature, TypeSignature, - Volatility, + ArrayFunctionSignature, ColumnarValue, Documentation, ScalarUDFImpl, Signature, + TypeSignature, Volatility, }; use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; make_udf_expr_and_func!( MapValuesFunc, @@ -81,6 +82,40 @@ impl ScalarUDFImpl for MapValuesFunc { fn invoke(&self, args: &[ColumnarValue]) -> datafusion_common::Result<ColumnarValue> { make_scalar_function(map_values_inner)(args) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_map_values_doc()) + } +} + +static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new(); + +fn get_map_values_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_MAP) + .with_description( + "Returns a list of all values in the map." + ) + .with_syntax_example("map_values(map)") + .with_sql_example( + r#"```sql +SELECT map_values(MAP {'a': 1, 'b': NULL, 'c': 3}); +---- +[1, , 3] + +SELECT map_values(map([100, 5], [42, 43])); +---- +[42, 43] +```"#, + ) + .with_argument( + "map", + "Map expression. Can be a constant, column, or function, and any combination of map operators." 
+ ) + .build() + .unwrap() + }) } fn map_values_inner(args: &[ArrayRef]) -> Result<ArrayRef> { diff --git a/dev/update_function_docs.sh b/dev/update_function_docs.sh index f1f26c8b2f58..13bc22afcc13 100755 --- a/dev/update_function_docs.sh +++ b/dev/update_function_docs.sh @@ -297,4 +297,3 @@ echo "Running prettier" npx prettier@2.3.2 --write "$TARGET_FILE" echo "'$TARGET_FILE' successfully updated!" - diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index 547ea108080e..203411428777 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -199,151 +199,6 @@ Unwraps struct fields into columns. +-----------------------+-----------------------+ ``` -## Map Functions - -- [map](#map) -- [make_map](#make_map) -- [map_extract](#map_extract) -- [map_keys](#map_keys) -- [map_values](#map_values) - -### `map` - -Returns an Arrow map with the specified key-value pairs. - -``` -map(key, value) -map(key: value) -``` - -#### Arguments - -- **key**: Expression to be used for key. - Can be a constant, column, or function, any combination of arithmetic or - string operators, or a named expression of previous listed. -- **value**: Expression to be used for value. - Can be a constant, column, or function, any combination of arithmetic or - string operators, or a named expression of previous listed. - -#### Example - -``` -SELECT MAP(['POST', 'HEAD', 'PATCH'], [41, 33, null]); ----- -{POST: 41, HEAD: 33, PATCH: } - -SELECT MAP([[1,2], [3,4]], ['a', 'b']); ----- -{[1, 2]: a, [3, 4]: b} - -SELECT MAP { 'a': 1, 'b': 2 }; ----- -{a: 1, b: 2} -``` - -### `make_map` - -Returns an Arrow map with the specified key-value pairs. - -``` -make_map(key_1, value_1, ..., key_n, value_n) -``` - -#### Arguments - -- **key_n**: Expression to be used for key. - Can be a constant, column, or function, any combination of arithmetic or - string operators, or a named expression of previous listed. 
-- **value_n**: Expression to be used for value. - Can be a constant, column, or function, any combination of arithmetic or - string operators, or a named expression of previous listed. - -#### Example - -``` -SELECT MAKE_MAP('POST', 41, 'HEAD', 33, 'PATCH', null); ----- -{POST: 41, HEAD: 33, PATCH: } -``` - -### `map_extract` - -Return a list containing the value for a given key or an empty list if the key is not contained in the map. - -``` -map_extract(map, key) -``` - -#### Arguments - -- `map`: Map expression. - Can be a constant, column, or function, and any combination of map operators. -- `key`: Key to extract from the map. - Can be a constant, column, or function, any combination of arithmetic or - string operators, or a named expression of previous listed. - -#### Example - -``` -SELECT map_extract(MAP {'a': 1, 'b': NULL, 'c': 3}, 'a'); ----- -[1] -``` - -#### Aliases - -- element_at - -### `map_keys` - -Return a list of all keys in the map. - -``` -map_keys(map) -``` - -#### Arguments - -- `map`: Map expression. - Can be a constant, column, or function, and any combination of map operators. - -#### Example - -``` -SELECT map_keys(MAP {'a': 1, 'b': NULL, 'c': 3}); ----- -[a, b, c] - -select map_keys(map([100, 5], [42,43])); ----- -[100, 5] -``` - -### `map_values` - -Return a list of all values in the map. - -``` -map_values(map) -``` - -#### Arguments - -- `map`: Map expression. - Can be a constant, column, or function, and any combination of map operators. 
- -#### Example - -``` -SELECT map_values(MAP {'a': 1, 'b': NULL, 'c': 3}); ----- -[1, , 3] - -select map_values(map([100, 5], [42,43])); ----- -[42, 43] -``` - ## Other Functions See the new documentation [`here`](https://datafusion.apache.org/user-guide/sql/scalar_functions_new.html) diff --git a/docs/source/user-guide/sql/scalar_functions_new.md b/docs/source/user-guide/sql/scalar_functions_new.md index 1f4ec1c27858..7d0280dbc28f 100644 --- a/docs/source/user-guide/sql/scalar_functions_new.md +++ b/docs/source/user-guide/sql/scalar_functions_new.md @@ -3898,6 +3898,150 @@ select struct(a as field_a, b) from t; - row +## Map Functions + +- [element_at](#element_at) +- [map](#map) +- [map_extract](#map_extract) +- [map_keys](#map_keys) +- [map_values](#map_values) + +### `element_at` + +_Alias of [map_extract](#map_extract)._ + +### `map` + +Returns an Arrow map with the specified key-value pairs. + +The `make_map` function creates a map from two lists: one for keys and one for values. Each key must be unique and non-null. + +``` +map(key, value) +map(key: value) +make_map(['key1', 'key2'], ['value1', 'value2']) +``` + +#### Arguments + +- **key**: For `map`: Expression to be used for key. Can be a constant, column, function, or any combination of arithmetic or string operators. + For `make_map`: The list of keys to be used in the map. Each key must be unique and non-null. +- **value**: For `map`: Expression to be used for value. Can be a constant, column, function, or any combination of arithmetic or string operators. + For `make_map`: The list of values to be mapped to the corresponding keys. 
+ +#### Example + +```sql + -- Using map function + SELECT MAP('type', 'test'); + ---- + {type: test} + + SELECT MAP(['POST', 'HEAD', 'PATCH'], [41, 33, null]); + ---- + {POST: 41, HEAD: 33, PATCH: } + + SELECT MAP([[1,2], [3,4]], ['a', 'b']); + ---- + {[1, 2]: a, [3, 4]: b} + + SELECT MAP { 'a': 1, 'b': 2 }; + ---- + {a: 1, b: 2} + + -- Using make_map function + SELECT MAKE_MAP(['POST', 'HEAD'], [41, 33]); + ---- + {POST: 41, HEAD: 33} + + SELECT MAKE_MAP(['key1', 'key2'], ['value1', null]); + ---- + {key1: value1, key2: } +``` + + +### `map_extract` + +Returns a list containing the value for the given key or an empty list if the key is not present in the map. + +``` + +map_extract(map, key) + +``` +#### Arguments + +- **map**: Map expression. Can be a constant, column, or function, and any combination of map operators. +- **key**: Key to extract from the map. Can be a constant, column, or function, any combination of arithmetic or string operators, or a named expression of the previously listed. + +#### Example + +```sql +SELECT map_extract(MAP {'a': 1, 'b': NULL, 'c': 3}, 'a'); +---- +[1] + +SELECT map_extract(MAP {1: 'one', 2: 'two'}, 2); +---- +['two'] + +SELECT map_extract(MAP {'x': 10, 'y': NULL, 'z': 30}, 'y'); +---- +[] +``` + +#### Aliases + +- element_at + +### `map_keys` + +Returns a list of all keys in the map. + +``` +map_keys(map) +``` + +#### Arguments + +- **map**: Map expression. Can be a constant, column, or function, and any combination of map operators. + +#### Example + +```sql +SELECT map_keys(MAP {'a': 1, 'b': NULL, 'c': 3}); +---- +[a, b, c] + +SELECT map_keys(map([100, 5], [42, 43])); +---- +[100, 5] +``` + +### `map_values` + +Returns a list of all values in the map. + +``` +map_values(map) +``` + +#### Arguments + +- **map**: Map expression. Can be a constant, column, or function, and any combination of map operators. 
+ +#### Example + +```sql +SELECT map_values(MAP {'a': 1, 'b': NULL, 'c': 3}); +---- +[1, , 3] + +SELECT map_values(map([100, 5], [42, 43])); +---- +[42, 43] +``` + ## Hashing Functions - [digest](#digest)