feat: add function to aggregate path into a geojson path (#4798)

* feat: add geojson function to aggregate paths
* test: add sqlness results
* test: add sqlness
* refactor: corrected to aggregation function
* chore: update comments
* fix: make linter happy again
* refactor: rename to remove `geo` from `geojson` function name

The return type is not GeoJSON at all. It is only compatible with the coordinates part of GeoJSON and with Superset's deck.gl path plugin.
GreptimeTeam · Oct 9, 2024 · 1991892 · 1991892
1 parent 5f0a83b
commit 1991892
Show file tree

Hide file tree

Showing 17 changed files with 367 additions and 19 deletions.
diff --git a/src/common/function/src/scalars/aggregate.rs b/src/common/function/src/scalars/aggregate.rs
@@ -31,6 +31,7 @@ pub use polyval::PolyvalAccumulatorCreator;
 pub use scipy_stats_norm_cdf::ScipyStatsNormCdfAccumulatorCreator;
 pub use scipy_stats_norm_pdf::ScipyStatsNormPdfAccumulatorCreator;
 
+use super::geo::encoding::JsonPathEncodeFunctionCreator;
 use crate::function_registry::FunctionRegistry;
 
 /// A function creates `AggregateFunctionCreator`.
@@ -91,5 +92,7 @@ impl AggregateFunctions {
         register_aggr_func!("argmin", 1, ArgminAccumulatorCreator);
         register_aggr_func!("scipystatsnormcdf", 2, ScipyStatsNormCdfAccumulatorCreator);
         register_aggr_func!("scipystatsnormpdf", 2, ScipyStatsNormPdfAccumulatorCreator);
+
+        register_aggr_func!("json_encode_path", 3, JsonPathEncodeFunctionCreator);
     }
 }
diff --git a/src/common/function/src/scalars/aggregate/argmax.rs b/src/common/function/src/scalars/aggregate/argmax.rs
@@ -16,7 +16,10 @@ use std::cmp::Ordering;
 use std::sync::Arc;
 
 use common_macro::{as_aggr_func_creator, AggrFuncTypeStore};
-use common_query::error::{BadAccumulatorImplSnafu, CreateAccumulatorSnafu, Result};
+use common_query::error::{
+    BadAccumulatorImplSnafu, CreateAccumulatorSnafu, InvalidInputStateSnafu, Result,
+};
+use common_query::logical_plan::accumulator::AggrFuncTypeStore;
 use common_query::logical_plan::{Accumulator, AggregateFunctionCreator};
 use common_query::prelude::*;
 use datatypes::prelude::*;

diff --git a/src/common/function/src/scalars/aggregate/argmin.rs b/src/common/function/src/scalars/aggregate/argmin.rs
@@ -16,7 +16,10 @@ use std::cmp::Ordering;
 use std::sync::Arc;
 
 use common_macro::{as_aggr_func_creator, AggrFuncTypeStore};
-use common_query::error::{BadAccumulatorImplSnafu, CreateAccumulatorSnafu, Result};
+use common_query::error::{
+    BadAccumulatorImplSnafu, CreateAccumulatorSnafu, InvalidInputStateSnafu, Result,
+};
+use common_query::logical_plan::accumulator::AggrFuncTypeStore;
 use common_query::logical_plan::{Accumulator, AggregateFunctionCreator};
 use common_query::prelude::*;
 use datatypes::prelude::*;

diff --git a/src/common/function/src/scalars/aggregate/diff.rs b/src/common/function/src/scalars/aggregate/diff.rs
@@ -17,8 +17,10 @@ use std::sync::Arc;
 
 use common_macro::{as_aggr_func_creator, AggrFuncTypeStore};
 use common_query::error::{
-    CreateAccumulatorSnafu, DowncastVectorSnafu, FromScalarValueSnafu, Result,
+    CreateAccumulatorSnafu, DowncastVectorSnafu, FromScalarValueSnafu, InvalidInputStateSnafu,
+    Result,
 };
+use common_query::logical_plan::accumulator::AggrFuncTypeStore;
 use common_query::logical_plan::{Accumulator, AggregateFunctionCreator};
 use common_query::prelude::*;
 use datatypes::prelude::*;

diff --git a/src/common/function/src/scalars/aggregate/mean.rs b/src/common/function/src/scalars/aggregate/mean.rs
@@ -17,8 +17,10 @@ use std::sync::Arc;
 
 use common_macro::{as_aggr_func_creator, AggrFuncTypeStore};
 use common_query::error::{
-    BadAccumulatorImplSnafu, CreateAccumulatorSnafu, DowncastVectorSnafu, Result,
+    BadAccumulatorImplSnafu, CreateAccumulatorSnafu, DowncastVectorSnafu, InvalidInputStateSnafu,
+    Result,
 };
+use common_query::logical_plan::accumulator::AggrFuncTypeStore;
 use common_query::logical_plan::{Accumulator, AggregateFunctionCreator};
 use common_query::prelude::*;
 use datatypes::prelude::*;

diff --git a/src/common/function/src/scalars/aggregate/polyval.rs b/src/common/function/src/scalars/aggregate/polyval.rs
@@ -18,8 +18,9 @@ use std::sync::Arc;
 use common_macro::{as_aggr_func_creator, AggrFuncTypeStore};
 use common_query::error::{
     self, BadAccumulatorImplSnafu, CreateAccumulatorSnafu, DowncastVectorSnafu,
-    FromScalarValueSnafu, InvalidInputColSnafu, Result,
+    FromScalarValueSnafu, InvalidInputColSnafu, InvalidInputStateSnafu, Result,
 };
+use common_query::logical_plan::accumulator::AggrFuncTypeStore;
 use common_query::logical_plan::{Accumulator, AggregateFunctionCreator};
 use common_query::prelude::*;
 use datatypes::prelude::*;

diff --git a/src/common/function/src/scalars/aggregate/scipy_stats_norm_cdf.rs b/src/common/function/src/scalars/aggregate/scipy_stats_norm_cdf.rs
@@ -17,8 +17,10 @@ use std::sync::Arc;
 use common_macro::{as_aggr_func_creator, AggrFuncTypeStore};
 use common_query::error::{
     self, BadAccumulatorImplSnafu, CreateAccumulatorSnafu, DowncastVectorSnafu,
-    FromScalarValueSnafu, GenerateFunctionSnafu, InvalidInputColSnafu, Result,
+    FromScalarValueSnafu, GenerateFunctionSnafu, InvalidInputColSnafu, InvalidInputStateSnafu,
+    Result,
 };
+use common_query::logical_plan::accumulator::AggrFuncTypeStore;
 use common_query::logical_plan::{Accumulator, AggregateFunctionCreator};
 use common_query::prelude::*;
 use datatypes::prelude::*;

diff --git a/src/common/function/src/scalars/aggregate/scipy_stats_norm_pdf.rs b/src/common/function/src/scalars/aggregate/scipy_stats_norm_pdf.rs
@@ -17,8 +17,10 @@ use std::sync::Arc;
 use common_macro::{as_aggr_func_creator, AggrFuncTypeStore};
 use common_query::error::{
     self, BadAccumulatorImplSnafu, CreateAccumulatorSnafu, DowncastVectorSnafu,
-    FromScalarValueSnafu, GenerateFunctionSnafu, InvalidInputColSnafu, Result,
+    FromScalarValueSnafu, GenerateFunctionSnafu, InvalidInputColSnafu, InvalidInputStateSnafu,
+    Result,
 };
+use common_query::logical_plan::accumulator::AggrFuncTypeStore;
 use common_query::logical_plan::{Accumulator, AggregateFunctionCreator};
 use common_query::prelude::*;
 use datatypes::prelude::*;

diff --git a/src/common/function/src/scalars/geo.rs b/src/common/function/src/scalars/geo.rs
@@ -13,8 +13,10 @@
 // limitations under the License.
 
 use std::sync::Arc;
+pub(crate) mod encoding;
 mod geohash;
 mod h3;
+mod helpers;
 
 use geohash::{GeohashFunction, GeohashNeighboursFunction};
 

diff --git a/src/common/function/src/scalars/geo/encoding.rs b/src/common/function/src/scalars/geo/encoding.rs
@@ -0,0 +1,223 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::sync::Arc;
+
+use common_error::ext::{BoxedError, PlainError};
+use common_error::status_code::StatusCode;
+use common_macro::{as_aggr_func_creator, AggrFuncTypeStore};
+use common_query::error::{self, InvalidFuncArgsSnafu, InvalidInputStateSnafu, Result};
+use common_query::logical_plan::accumulator::AggrFuncTypeStore;
+use common_query::logical_plan::{Accumulator, AggregateFunctionCreator};
+use common_query::prelude::AccumulatorCreatorFunction;
+use common_time::Timestamp;
+use datatypes::prelude::ConcreteDataType;
+use datatypes::value::{ListValue, Value};
+use datatypes::vectors::VectorRef;
+use snafu::{ensure, ResultExt};
+
+use super::helpers::{ensure_columns_len, ensure_columns_n};
+
+/// Accumulator of lat, lng, timestamp tuples
+///
+/// `update_batch` pushes exactly one entry onto each of the three vectors per
+/// input row, and `evaluate` zips them back together by position, so index
+/// `i` across `lat`, `lng` and `timestamp` is treated as one point.
+#[derive(Debug)]
+pub struct JsonPathAccumulator {
+    // Element type reported for the timestamp list returned by `state()`.
+    timestamp_type: ConcreteDataType,
+    // Latitudes as produced by `Value::as_f64_lossy`; `None` presumably means
+    // the input was null or not numeric (TODO confirm against `Value`).
+    lat: Vec<Option<f64>>,
+    // Longitudes, same conversion and `None` meaning as `lat`.
+    lng: Vec<Option<f64>>,
+    // Timestamps as produced by `Value::as_timestamp`; a `None` entry is
+    // sorted as if it were second 0 in `evaluate`.
+    timestamp: Vec<Option<Timestamp>>,
+}
+
+impl JsonPathAccumulator {
+    /// Builds an accumulator with no points yet, remembering `timestamp_type`
+    /// as the element type of the timestamp list exposed by `state()`.
+    fn new(timestamp_type: ConcreteDataType) -> Self {
+        Self {
+            timestamp_type,
+            lat: Vec::new(),
+            lng: Vec::new(),
+            timestamp: Vec::new(),
+        }
+    }
+}
+
+impl Accumulator for JsonPathAccumulator {
+    /// Exports everything accumulated so far as three list values, in the
+    /// order latitudes, longitudes, timestamps.
+    fn state(&self) -> Result<Vec<Value>> {
+        let lat_items = self.lat.iter().copied().map(Value::from).collect();
+        let lng_items = self.lng.iter().copied().map(Value::from).collect();
+        let ts_items = self.timestamp.iter().copied().map(Value::from).collect();
+
+        Ok(vec![
+            Value::List(ListValue::new(lat_items, ConcreteDataType::float64_datatype())),
+            Value::List(ListValue::new(lng_items, ConcreteDataType::float64_datatype())),
+            Value::List(ListValue::new(ts_items, self.timestamp_type.clone())),
+        ])
+    }
+
+    /// Consumes one batch of raw input rows.
+    ///
+    /// `columns` must be exactly `[lat, lng, timestamp]`, all of the same
+    /// length; each position in those vectors is one row of the dataset.
+    /// Returns an `InvalidFuncArgs` error otherwise.
+    fn update_batch(&mut self, columns: &[VectorRef]) -> Result<()> {
+        ensure_columns_n!(columns, 3);
+
+        let (lat_col, lng_col, ts_col) = (&columns[0], &columns[1], &columns[2]);
+        // The macro above guarantees the three columns are equally long, so
+        // the latitude column's length is the row count for all of them.
+        let rows = lat_col.len();
+
+        self.lat
+            .extend((0..rows).map(|row| lat_col.get(row).as_f64_lossy()));
+        self.lng
+            .extend((0..rows).map(|row| lng_col.get(row).as_f64_lossy()));
+        self.timestamp
+            .extend((0..rows).map(|row| ts_col.get(row).as_timestamp()));
+
+        Ok(())
+    }
+
+    /// Folds the exported states of child accumulators into this one.
+    ///
+    /// `states` must be exactly `[lat, lng, timestamp]`, where every row of
+    /// each vector is a `Value::List` holding all values one child
+    /// accumulator returned from its `state()` call. Rows that hold no list
+    /// are skipped; a row that is not a list at all fails with an `Execute`
+    /// error.
+    fn merge_batch(&mut self, states: &[VectorRef]) -> Result<()> {
+        ensure_columns_n!(states, 3);
+
+        let (lat_states, lng_states, ts_states) = (&states[0], &states[1], &states[2]);
+
+        for row in 0..lat_states.len() {
+            let lat_state = lat_states.get(row);
+            let lat_list = lat_state
+                .as_list()
+                .map_err(BoxedError::new)
+                .context(error::ExecuteSnafu)?;
+            if let Some(list) = lat_list {
+                self.lat
+                    .extend(list.items().iter().map(|v| v.as_f64_lossy()));
+            }
+
+            let lng_state = lng_states.get(row);
+            let lng_list = lng_state
+                .as_list()
+                .map_err(BoxedError::new)
+                .context(error::ExecuteSnafu)?;
+            if let Some(list) = lng_list {
+                self.lng
+                    .extend(list.items().iter().map(|v| v.as_f64_lossy()));
+            }
+
+            let ts_state = ts_states.get(row);
+            let ts_list = ts_state
+                .as_list()
+                .map_err(BoxedError::new)
+                .context(error::ExecuteSnafu)?;
+            if let Some(list) = ts_list {
+                self.timestamp
+                    .extend(list.items().iter().map(|v| v.as_timestamp()));
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Produces the final value: a JSON string holding an array of
+    /// `[lng, lat]` pairs ordered by timestamp.
+    fn evaluate(&self) -> Result<Value> {
+        let mut points: Vec<(&Option<f64>, &Option<f64>, &Option<Timestamp>)> = self
+            .lat
+            .iter()
+            .zip(&self.lng)
+            .zip(&self.timestamp)
+            .map(|((lat, lng), ts)| (lat, lng, ts))
+            .collect();
+
+        // Order the points by time; a missing timestamp sorts as second 0.
+        let null_ts = Timestamp::new_second(0);
+        points.sort_unstable_by_key(|point| point.2.unwrap_or(null_ts));
+
+        // Longitude goes first in each pair to stay compatible with geojson
+        // coordinates.
+        let coordinates: Vec<Vec<&Option<f64>>> = points
+            .into_iter()
+            .map(|(lat, lng, _)| vec![lng, lat])
+            .collect();
+
+        let encoded = serde_json::to_string(&coordinates)
+            .map_err(|e| {
+                let message = format!("Serialization failure: {}", e);
+                BoxedError::new(PlainError::new(message, StatusCode::EngineExecuteQuery))
+            })
+            .context(error::ExecuteSnafu)?;
+
+        Ok(Value::String(encoded.into()))
+    }
+}
+
+/// Aggregate function that accepts rows of lat, lng and timestamp, sorts
+/// them by timestamp and encodes them into a geojson-like path.
+///
+/// The result is a JSON string containing an array of `[lng, lat]` pairs.
+///
+/// Example:
+///
+/// ```sql
+/// SELECT json_encode_path(lat, lon, timestamp) FROM table [group by ...];
+/// ```
+///
+#[as_aggr_func_creator]
+#[derive(Debug, Default, AggrFuncTypeStore)]
+pub struct JsonPathEncodeFunctionCreator {}
+
+impl AggregateFunctionCreator for JsonPathEncodeFunctionCreator {
+    /// Returns the factory that builds a [`JsonPathAccumulator`] for a call
+    /// with the given argument types.
+    ///
+    /// The factory expects exactly three argument types, `[lat, lng,
+    /// timestamp]`, and fails with an `InvalidFuncArgs` error otherwise.
+    fn creator(&self) -> AccumulatorCreatorFunction {
+        let creator: AccumulatorCreatorFunction = Arc::new(move |types: &[ConcreteDataType]| {
+            // Report a wrong number of argument types as an error rather than
+            // panicking on the `types[2]` lookup below.
+            ensure!(
+                types.len() == 3,
+                InvalidFuncArgsSnafu {
+                    err_msg: format!(
+                        "The length of arguments is not correct, expect 3, provided : {}",
+                        types.len()
+                    ),
+                }
+            );
+
+            // The third argument is the timestamp column; its type becomes
+            // the element type of the accumulator's timestamp state list.
+            let ts_type = types[2].clone();
+            Ok(Box::new(JsonPathAccumulator::new(ts_type)))
+        });
+
+        creator
+    }
+
+    /// The aggregate always yields a string (the JSON-encoded path).
+    fn output_type(&self) -> Result<ConcreteDataType> {
+        Ok(ConcreteDataType::string_datatype())
+    }
+
+    /// Describes the intermediate state: a list of float64 for latitudes, a
+    /// list of float64 for longitudes, and a list of the input timestamp type.
+    ///
+    /// Fails with `InvalidInputState` unless exactly three input types are
+    /// stored on this creator.
+    fn state_types(&self) -> Result<Vec<ConcreteDataType>> {
+        let input_types = self.input_types()?;
+        ensure!(input_types.len() == 3, InvalidInputStateSnafu);
+
+        let timestamp_type = input_types[2].clone();
+
+        Ok(vec![
+            ConcreteDataType::list_datatype(ConcreteDataType::float64_datatype()),
+            ConcreteDataType::list_datatype(ConcreteDataType::float64_datatype()),
+            ConcreteDataType::list_datatype(timestamp_type),
+        ])
+    }
+}
diff --git a/src/common/function/src/scalars/geo/helpers.rs b/src/common/function/src/scalars/geo/helpers.rs
@@ -0,0 +1,61 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/// Ensures that all given columns have the same length.
+///
+/// Two forms are accepted:
+/// - `ensure_columns_len!(columns)`: `columns` is a slice of columns; every
+///   adjacent pair is compared.
+/// - `ensure_columns_len!(a, b)` / `ensure_columns_len!(a, b, c, ...)`: two or
+///   more individually named columns, each compared against the first. A
+///   trailing comma is allowed.
+///
+/// On a mismatch the enclosing function returns early with an
+/// `InvalidFuncArgs` error. The expansion refers to `ensure!` and
+/// `InvalidFuncArgsSnafu` by bare name, so both must be in scope wherever the
+/// macro is invoked.
+macro_rules! ensure_columns_len {
+    ($columns:ident) => {
+        ensure!(
+            $columns.windows(2).all(|c| c[0].len() == c[1].len()),
+            InvalidFuncArgsSnafu {
+                err_msg: "The length of input columns are in different size"
+            }
+        )
+    };
+    ($column_a:ident, $column_b:ident $(, $column_n:ident)* $(,)?) => {
+        ensure!(
+            $column_a.len() == $column_b.len()
+                $(&& $column_a.len() == $column_n.len())*,
+            InvalidFuncArgsSnafu {
+                err_msg: "The length of input columns are in different size"
+            }
+        )
+    };
+}
+
+pub(super) use ensure_columns_len;
+
+/// Ensures that `$columns` holds exactly `$n` columns and, when `$n` is
+/// greater than one, that all of those columns have the same length.
+///
+/// On failure the enclosing function returns early with an `InvalidFuncArgs`
+/// error. The expansion refers to `ensure!`, `InvalidFuncArgsSnafu` and
+/// `ensure_columns_len!` by bare name; none of them is imported in this file,
+/// so they must be in scope wherever the macro is invoked.
+macro_rules! ensure_columns_n {
+    ($columns:ident, $n:literal) => {
+        // First check the number of columns against the expected count.
+        ensure!(
+            $columns.len() == $n,
+            InvalidFuncArgsSnafu {
+                err_msg: format!(
+                    "The length of arguments is not correct, expect {}, provided : {}",
+                    stringify!($n),
+                    $columns.len()
+                ),
+            }
+        );
+
+        // A single column has nothing to be compared against, so the length
+        // check only applies when more than one column is expected.
+        if $n > 1 {
+            ensure_columns_len!($columns);
+        }
+    };
+}
+
+pub(super) use ensure_columns_n;