From bb8b54b5d3228d6bd4d48523191b8307a12dfc71 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Mon, 14 Oct 2024 23:47:29 -0700 Subject: [PATCH] feat: add some s2 geo functions (#4823) * feat: add first batch of s2 functions * refactor: update reusable code from main * test: add sqlness tests for s2 * feat: add tostring function for s2 * Update src/common/function/src/scalars/geo/s2.rs Co-authored-by: Lei, HUANG <6406592+v0y4g3r@users.noreply.github.com> * Apply suggestions from code review * one more change Signed-off-by: Ruihang Xia --------- Signed-off-by: Ruihang Xia Co-authored-by: Ruihang Xia Co-authored-by: Lei, HUANG <6406592+v0y4g3r@users.noreply.github.com> --- Cargo.lock | 52 +++- src/common/function/Cargo.toml | 3 +- src/common/function/src/scalars/geo.rs | 13 +- .../function/src/scalars/geo/encoding.rs | 2 +- src/common/function/src/scalars/geo/h3.rs | 28 +- .../function/src/scalars/geo/helpers.rs | 26 +- src/common/function/src/scalars/geo/s2.rs | 275 ++++++++++++++++++ .../standalone/common/function/geo.result | 15 + .../cases/standalone/common/function/geo.sql | 9 + 9 files changed, 383 insertions(+), 40 deletions(-) create mode 100644 src/common/function/src/scalars/geo/s2.rs diff --git a/Cargo.lock b/Cargo.lock index eee5d4ee5d7f..b096335763aa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -230,6 +230,15 @@ dependencies = [ "tonic-build", ] +[[package]] +name = "approx" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f2a05fd1bd10b2527e20a2cd32d8873d115b8b39fe219ee25f42a8aca6ba278" +dependencies = [ + "num-traits", +] + [[package]] name = "approx" version = "0.5.1" @@ -985,6 +994,7 @@ dependencies = [ "num-bigint", "num-integer", "num-traits", + "serde", ] [[package]] @@ -1548,6 +1558,16 @@ dependencies = [ "vob", ] +[[package]] +name = "cgmath" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a98d30140e3296250832bbaaff83b27dcd6fa3cc70fb6f1f3e5c9c0023b5317" +dependencies = [ + "approx 0.4.0", + "num-traits", +] + [[package]] name = "chrono" version = "0.4.38" @@ -2054,6 +2074,7 @@ dependencies = [ "once_cell", "paste", "ron", + "s2", "serde", "serde_json", "session", @@ -3971,6 +3992,15 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28a80e3145d8ad11ba0995949bbcf48b9df2be62772b3d351ef017dff6ecb853" +[[package]] +name = "float_extras" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b22b70f8649ea2315955f1a36d964b0e4da482dfaa5f0d04df0d1fb7c338ab7a" +dependencies = [ + "libc", +] + [[package]] name = "flow" version = "0.9.3" @@ -4401,7 +4431,7 @@ version = "0.7.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ff16065e5720f376fbced200a5ae0f47ace85fd70b7e54269790281353b6d61" dependencies = [ - "approx", + "approx 0.5.1", "num-traits", "serde", ] @@ -6883,7 +6913,7 @@ version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d506eb7e08d6329505faa8a3a00a5dcc6de9f76e0c77e4b75763ae3c770831ff" dependencies = [ - "approx", + "approx 0.5.1", "matrixmultiply", "nalgebra-macros", "num-complex", @@ -10274,6 +10304,20 @@ version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" +[[package]] +name = "s2" +version = "0.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc7fbc04bb52c40b5f48c9bb2d2961375301916e0c25d9f373750654d588cd5c" +dependencies = [ + "bigdecimal 0.3.1", + "cgmath", + "float_extras", + "lazy_static", + "libm", + "serde", +] + [[package]] name = "safe-proc-macro2" version = "1.0.67" @@ -10912,7 +10956,7 @@ version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0b7840f121a46d63066ee7a99fc81dcabbc6105e437cae43528cea199b5a05f" dependencies = [ - "approx", + "approx 0.5.1", "num-complex", "num-traits", "paste", @@ -11383,7 +11427,7 @@ version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b35a062dbadac17a42e0fc64c27f419b25d6fae98572eb43c8814c9e873d7721" dependencies = [ - "approx", + "approx 0.5.1", "lazy_static", "nalgebra", "num-traits", diff --git a/src/common/function/Cargo.toml b/src/common/function/Cargo.toml index e9a3dd6b5558..6b23762d90c2 100644 --- a/src/common/function/Cargo.toml +++ b/src/common/function/Cargo.toml @@ -9,7 +9,7 @@ workspace = true [features] default = ["geo"] -geo = ["geohash", "h3o"] +geo = ["geohash", "h3o", "s2"] [dependencies] api.workspace = true @@ -35,6 +35,7 @@ num = "0.4" num-traits = "0.2" once_cell.workspace = true paste = "1.0" +s2 = { version = "0.0.12", optional = true } serde.workspace = true serde_json.workspace = true session.workspace = true diff --git a/src/common/function/src/scalars/geo.rs b/src/common/function/src/scalars/geo.rs index e47a1de9f2fa..866acddc4dc9 100644 --- a/src/common/function/src/scalars/geo.rs +++ b/src/common/function/src/scalars/geo.rs @@ -17,8 +17,7 @@ pub(crate) mod encoding; mod geohash; mod h3; mod helpers; - -use geohash::{GeohashFunction, GeohashNeighboursFunction}; +mod s2; use crate::function_registry::FunctionRegistry; @@ -27,8 +26,8 @@ pub(crate) struct GeoFunctions; impl GeoFunctions { pub fn register(registry: &FunctionRegistry) { // geohash - registry.register(Arc::new(GeohashFunction)); - registry.register(Arc::new(GeohashNeighboursFunction)); + registry.register(Arc::new(geohash::GeohashFunction)); + registry.register(Arc::new(geohash::GeohashNeighboursFunction)); // h3 index registry.register(Arc::new(h3::H3LatLngToCell)); @@ -55,5 +54,11 @@ impl GeoFunctions { registry.register(Arc::new(h3::H3GridDiskDistances)); registry.register(Arc::new(h3::H3GridDistance)); registry.register(Arc::new(h3::H3GridPathCells)); + + // s2 + registry.register(Arc::new(s2::S2LatLngToCell)); + registry.register(Arc::new(s2::S2CellLevel)); + registry.register(Arc::new(s2::S2CellToToken)); + registry.register(Arc::new(s2::S2CellParent)); } } diff --git a/src/common/function/src/scalars/geo/encoding.rs b/src/common/function/src/scalars/geo/encoding.rs index 06001205989a..145cd24bff05 100644 --- a/src/common/function/src/scalars/geo/encoding.rs +++ b/src/common/function/src/scalars/geo/encoding.rs @@ -17,7 +17,7 @@ use std::sync::Arc; use common_error::ext::{BoxedError, PlainError}; use common_error::status_code::StatusCode; use common_macro::{as_aggr_func_creator, AggrFuncTypeStore}; -use common_query::error::{self, InvalidFuncArgsSnafu, InvalidInputStateSnafu, Result}; +use common_query::error::{self, InvalidInputStateSnafu, Result}; use common_query::logical_plan::accumulator::AggrFuncTypeStore; use common_query::logical_plan::{Accumulator, AggregateFunctionCreator}; use common_query::prelude::AccumulatorCreatorFunction; diff --git a/src/common/function/src/scalars/geo/h3.rs b/src/common/function/src/scalars/geo/h3.rs index 00c567d8d727..7f98c3147a61 100644 --- a/src/common/function/src/scalars/geo/h3.rs +++ b/src/common/function/src/scalars/geo/h3.rs @@ -16,7 +16,7 @@ use std::str::FromStr; use common_error::ext::{BoxedError, PlainError}; use common_error::status_code::StatusCode; -use common_query::error::{self, InvalidFuncArgsSnafu, Result}; +use common_query::error::{self, Result}; use common_query::prelude::{Signature, TypeSignature}; use datafusion::logical_expr::Volatility; use datatypes::prelude::ConcreteDataType; @@ -29,9 +29,9 @@ use datatypes::vectors::{ use derive_more::Display; use h3o::{CellIndex, LatLng, Resolution}; use once_cell::sync::Lazy; -use snafu::{ensure, ResultExt}; +use snafu::ResultExt; -use super::helpers::{ensure_columns_len, ensure_columns_n}; +use super::helpers::{ensure_and_coerce, ensure_columns_len, ensure_columns_n}; use crate::function::{Function, FunctionContext}; static CELL_TYPES: Lazy> = Lazy::new(|| { @@ -382,15 +382,7 @@ impl Function for H3CellResolution { } fn eval(&self, _func_ctx: FunctionContext, columns: &[VectorRef]) -> Result { - ensure!( - columns.len() == 1, - InvalidFuncArgsSnafu { - err_msg: format!( - "The length of the args is not correct, expect 1, provided : {}", - columns.len() - ), - } - ); + ensure_columns_n!(columns, 1); let cell_vec = &columns[0]; let size = cell_vec.len(); @@ -982,18 +974,6 @@ fn value_to_resolution(v: Value) -> Result { .context(error::ExecuteSnafu) } -macro_rules! ensure_and_coerce { - ($compare:expr, $coerce:expr) => {{ - ensure!( - $compare, - InvalidFuncArgsSnafu { - err_msg: "Argument was outside of acceptable range " - } - ); - Ok($coerce) - }}; -} - fn value_to_position(v: Value) -> Result { match v { Value::Int8(v) => ensure_and_coerce!(v >= 0, v as u64), diff --git a/src/common/function/src/scalars/geo/helpers.rs b/src/common/function/src/scalars/geo/helpers.rs index f07eaefb15bf..22d47f54e481 100644 --- a/src/common/function/src/scalars/geo/helpers.rs +++ b/src/common/function/src/scalars/geo/helpers.rs @@ -14,15 +14,15 @@ macro_rules! ensure_columns_len { ($columns:ident) => { - ensure!( + snafu::ensure!( $columns.windows(2).all(|c| c[0].len() == c[1].len()), - InvalidFuncArgsSnafu { + common_query::error::InvalidFuncArgsSnafu { err_msg: "The length of input columns are in different size" } ) }; ($column_a:ident, $column_b:ident, $($column_n:ident),*) => { - ensure!( + snafu::ensure!( { let mut result = $column_a.len() == $column_b.len(); $( @@ -30,7 +30,7 @@ macro_rules! ensure_columns_len { )* result } - InvalidFuncArgsSnafu { + common_query::error::InvalidFuncArgsSnafu { err_msg: "The length of input columns are in different size" } ) @@ -41,9 +41,9 @@ pub(super) use ensure_columns_len; macro_rules! ensure_columns_n { ($columns:ident, $n:literal) => { - ensure!( + snafu::ensure!( $columns.len() == $n, - InvalidFuncArgsSnafu { + common_query::error::InvalidFuncArgsSnafu { err_msg: format!( "The length of arguments is not correct, expect {}, provided : {}", stringify!($n), @@ -59,3 +59,17 @@ macro_rules! ensure_columns_n { } pub(super) use ensure_columns_n; + +macro_rules! ensure_and_coerce { + ($compare:expr, $coerce:expr) => {{ + snafu::ensure!( + $compare, + common_query::error::InvalidFuncArgsSnafu { + err_msg: "Argument was outside of acceptable range " + } + ); + Ok($coerce) + }}; +} + +pub(super) use ensure_and_coerce; diff --git a/src/common/function/src/scalars/geo/s2.rs b/src/common/function/src/scalars/geo/s2.rs new file mode 100644 index 000000000000..6e40dc300fd5 --- /dev/null +++ b/src/common/function/src/scalars/geo/s2.rs @@ -0,0 +1,275 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use common_query::error::{InvalidFuncArgsSnafu, Result}; +use common_query::prelude::{Signature, TypeSignature}; +use datafusion::logical_expr::Volatility; +use datatypes::prelude::ConcreteDataType; +use datatypes::scalars::ScalarVectorBuilder; +use datatypes::value::Value; +use datatypes::vectors::{MutableVector, StringVectorBuilder, UInt64VectorBuilder, VectorRef}; +use derive_more::Display; +use once_cell::sync::Lazy; +use s2::cellid::{CellID, MAX_LEVEL}; +use s2::latlng::LatLng; +use snafu::ensure; + +use crate::function::{Function, FunctionContext}; +use crate::scalars::geo::helpers::{ensure_and_coerce, ensure_columns_len, ensure_columns_n}; + +static CELL_TYPES: Lazy> = Lazy::new(|| { + vec![ + ConcreteDataType::int64_datatype(), + ConcreteDataType::uint64_datatype(), + ] +}); + +static COORDINATE_TYPES: Lazy> = Lazy::new(|| { + vec![ + ConcreteDataType::float32_datatype(), + ConcreteDataType::float64_datatype(), + ] +}); + +static LEVEL_TYPES: Lazy> = Lazy::new(|| { + vec![ + ConcreteDataType::int8_datatype(), + ConcreteDataType::int16_datatype(), + ConcreteDataType::int32_datatype(), + ConcreteDataType::int64_datatype(), + ConcreteDataType::uint8_datatype(), + ConcreteDataType::uint16_datatype(), + ConcreteDataType::uint32_datatype(), + ConcreteDataType::uint64_datatype(), + ] +}); + +/// Function that returns [s2] encoding cellid for a given geospatial coordinate. +/// +/// [s2]: http://s2geometry.io +#[derive(Clone, Debug, Default, Display)] +#[display("{}", self.name())] +pub struct S2LatLngToCell; + +impl Function for S2LatLngToCell { + fn name(&self) -> &str { + "s2_latlng_to_cell" + } + + fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result { + Ok(ConcreteDataType::uint64_datatype()) + } + + fn signature(&self) -> Signature { + let mut signatures = Vec::with_capacity(COORDINATE_TYPES.len()); + for coord_type in COORDINATE_TYPES.as_slice() { + signatures.push(TypeSignature::Exact(vec![ + // latitude + coord_type.clone(), + // longitude + coord_type.clone(), + ])); + } + Signature::one_of(signatures, Volatility::Stable) + } + + fn eval(&self, _func_ctx: FunctionContext, columns: &[VectorRef]) -> Result { + ensure_columns_n!(columns, 2); + + let lat_vec = &columns[0]; + let lon_vec = &columns[1]; + + let size = lat_vec.len(); + let mut results = UInt64VectorBuilder::with_capacity(size); + + for i in 0..size { + let lat = lat_vec.get(i).as_f64_lossy(); + let lon = lon_vec.get(i).as_f64_lossy(); + + let result = match (lat, lon) { + (Some(lat), Some(lon)) => { + let coord = LatLng::from_degrees(lat, lon); + ensure!( + coord.is_valid(), + InvalidFuncArgsSnafu { + err_msg: "The input coordinates are invalid", + } + ); + let cellid = CellID::from(coord); + let encoded: u64 = cellid.0; + Some(encoded) + } + _ => None, + }; + + results.push(result); + } + + Ok(results.to_vector()) + } +} + +/// Return the level of current s2 cell +#[derive(Clone, Debug, Default, Display)] +#[display("{}", self.name())] +pub struct S2CellLevel; + +impl Function for S2CellLevel { + fn name(&self) -> &str { + "s2_cell_level" + } + + fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result { + Ok(ConcreteDataType::uint64_datatype()) + } + + fn signature(&self) -> Signature { + signature_of_cell() + } + + fn eval(&self, _func_ctx: FunctionContext, columns: &[VectorRef]) -> Result { + ensure_columns_n!(columns, 1); + + let cell_vec = &columns[0]; + let size = cell_vec.len(); + let mut results = UInt64VectorBuilder::with_capacity(size); + + for i in 0..size { + let cell = cell_from_value(cell_vec.get(i)); + let res = cell.map(|cell| cell.level()); + + results.push(res); + } + + Ok(results.to_vector()) + } +} + +/// Return the string presentation of the cell +#[derive(Clone, Debug, Default, Display)] +#[display("{}", self.name())] +pub struct S2CellToToken; + +impl Function for S2CellToToken { + fn name(&self) -> &str { + "s2_cell_to_token" + } + + fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result { + Ok(ConcreteDataType::string_datatype()) + } + + fn signature(&self) -> Signature { + signature_of_cell() + } + + fn eval(&self, _func_ctx: FunctionContext, columns: &[VectorRef]) -> Result { + ensure_columns_n!(columns, 1); + + let cell_vec = &columns[0]; + let size = cell_vec.len(); + let mut results = StringVectorBuilder::with_capacity(size); + + for i in 0..size { + let cell = cell_from_value(cell_vec.get(i)); + let res = cell.map(|cell| cell.to_token()); + + results.push(res.as_deref()); + } + + Ok(results.to_vector()) + } +} + +/// Return parent at given level of current s2 cell +#[derive(Clone, Debug, Default, Display)] +#[display("{}", self.name())] +pub struct S2CellParent; + +impl Function for S2CellParent { + fn name(&self) -> &str { + "s2_cell_parent" + } + + fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result { + Ok(ConcreteDataType::uint64_datatype()) + } + + fn signature(&self) -> Signature { + signature_of_cell_and_level() + } + + fn eval(&self, _func_ctx: FunctionContext, columns: &[VectorRef]) -> Result { + ensure_columns_n!(columns, 2); + + let cell_vec = &columns[0]; + let level_vec = &columns[1]; + let size = cell_vec.len(); + let mut results = UInt64VectorBuilder::with_capacity(size); + + for i in 0..size { + let cell = cell_from_value(cell_vec.get(i)); + let level = value_to_level(level_vec.get(i))?; + let result = cell.map(|cell| cell.parent(level).0); + + results.push(result); + } + + Ok(results.to_vector()) + } +} + +fn signature_of_cell() -> Signature { + let mut signatures = Vec::with_capacity(CELL_TYPES.len()); + for cell_type in CELL_TYPES.as_slice() { + signatures.push(TypeSignature::Exact(vec![cell_type.clone()])); + } + + Signature::one_of(signatures, Volatility::Stable) +} + +fn signature_of_cell_and_level() -> Signature { + let mut signatures = Vec::with_capacity(CELL_TYPES.len() * LEVEL_TYPES.len()); + for cell_type in CELL_TYPES.as_slice() { + for level_type in LEVEL_TYPES.as_slice() { + signatures.push(TypeSignature::Exact(vec![ + cell_type.clone(), + level_type.clone(), + ])); + } + } + Signature::one_of(signatures, Volatility::Stable) +} + +fn cell_from_value(v: Value) -> Option { + match v { + Value::Int64(v) => Some(CellID(v as u64)), + Value::UInt64(v) => Some(CellID(v)), + _ => None, + } +} + +fn value_to_level(v: Value) -> Result { + match v { + Value::Int8(v) => ensure_and_coerce!(v >= 0 && v <= MAX_LEVEL as i8, v as u64), + Value::Int16(v) => ensure_and_coerce!(v >= 0 && v <= MAX_LEVEL as i16, v as u64), + Value::Int32(v) => ensure_and_coerce!(v >= 0 && v <= MAX_LEVEL as i32, v as u64), + Value::Int64(v) => ensure_and_coerce!(v >= 0 && v <= MAX_LEVEL as i64, v as u64), + Value::UInt8(v) => ensure_and_coerce!(v <= MAX_LEVEL as u8, v as u64), + Value::UInt16(v) => ensure_and_coerce!(v <= MAX_LEVEL as u16, v as u64), + Value::UInt32(v) => ensure_and_coerce!(v <= MAX_LEVEL as u32, v as u64), + Value::UInt64(v) => ensure_and_coerce!(v <= MAX_LEVEL, v), + _ => unreachable!(), + } +} diff --git a/tests/cases/standalone/common/function/geo.result b/tests/cases/standalone/common/function/geo.result index 75caeb886b5b..8c9460c738fa 100644 --- a/tests/cases/standalone/common/function/geo.result +++ b/tests/cases/standalone/common/function/geo.result @@ -259,6 +259,21 @@ SELECT geohash_neighbours(37.76938, -122.3889, 11); | [9q8yygxnefv, 9q8yygxnefu, 9q8yygxnefs, 9q8yygxnefk, 9q8yygxnefm, 9q8yygxnefq, 9q8yygxnefw, 9q8yygxnefy] | +----------------------------------------------------------------------------------------------------------+ +WITH cell_cte AS ( + SELECT s2_latlng_to_cell(37.76938, -122.3889) AS cell +) +SELECT cell, + s2_cell_to_token(cell), + s2_cell_level(cell), + s2_cell_parent(cell, 3) +FROM cell_cte; + ++---------------------+---------------------------------+------------------------------+----------------------------------------+ +| cell | s2_cell_to_token(cell_cte.cell) | s2_cell_level(cell_cte.cell) | s2_cell_parent(cell_cte.cell,Int64(3)) | ++---------------------+---------------------------------+------------------------------+----------------------------------------+ +| 9263763445276221387 | 808f7fc59ef01fcb | 30 | 9277415232383221760 | ++---------------------+---------------------------------+------------------------------+----------------------------------------+ + SELECT json_encode_path(37.76938, -122.3889, 1728083375::TimestampSecond); +----------------------------------------------------------------------------------------------------------------------+ diff --git a/tests/cases/standalone/common/function/geo.sql b/tests/cases/standalone/common/function/geo.sql index af2a16517df1..205d45ddd5f5 100644 --- a/tests/cases/standalone/common/function/geo.sql +++ b/tests/cases/standalone/common/function/geo.sql @@ -83,6 +83,15 @@ SELECT geohash(37.76938, -122.3889, 11::UInt64); SELECT geohash_neighbours(37.76938, -122.3889, 11); +WITH cell_cte AS ( + SELECT s2_latlng_to_cell(37.76938, -122.3889) AS cell +) +SELECT cell, + s2_cell_to_token(cell), + s2_cell_level(cell), + s2_cell_parent(cell, 3) +FROM cell_cte; + SELECT json_encode_path(37.76938, -122.3889, 1728083375::TimestampSecond); SELECT json_encode_path(lat, lon, ts)