apache · jayzhan211 · Jun 8, 2024 · Jun 7, 2024 · Jun 7, 2024 · Jun 7, 2024
diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs
@@ -168,20 +168,6 @@ pub fn max(expr: Expr) -> Expr {
     ))
 }
 
-/// Create an expression to represent the sum() aggregate function
-///
-/// TODO: Remove this function and use `sum` from `datafusion_functions_aggregate::expr_fn` instead
-pub fn sum(expr: Expr) -> Expr {
-    Expr::AggregateFunction(AggregateFunction::new(
-        aggregate_function::AggregateFunction::Sum,
-        vec![expr],
-        false,
-        None,
-        None,
-        None,
-    ))
-}
-
 /// Create an expression to represent the array_agg() aggregate function
 pub fn array_agg(expr: Expr) -> Expr {
     Expr::AggregateFunction(AggregateFunction::new(

diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs
@@ -1719,7 +1719,7 @@ pub fn unnest_with_options(
 mod tests {
     use super::*;
     use crate::logical_plan::StringifiedPlan;
-    use crate::{col, expr, expr_fn::exists, in_subquery, lit, scalar_subquery, sum};
+    use crate::{col, expr, expr_fn::exists, in_subquery, lit, scalar_subquery};
 
     use datafusion_common::SchemaError;
 
@@ -1775,28 +1775,6 @@ mod tests {
         );
     }
 
-    #[test]
-    fn plan_builder_aggregate() -> Result<()> {
-        let plan =
-            table_scan(Some("employee_csv"), &employee_schema(), Some(vec![3, 4]))?
-                .aggregate(
-                    vec![col("state")],
-                    vec![sum(col("salary")).alias("total_salary")],
-                )?
-                .project(vec![col("state"), col("total_salary")])?
-                .limit(2, Some(10))?
-                .build()?;
-
-        let expected = "Limit: skip=2, fetch=10\
-                \n  Projection: employee_csv.state, total_salary\
-                \n    Aggregate: groupBy=[[employee_csv.state]], aggr=[[SUM(employee_csv.salary) AS total_salary]]\
-                \n      TableScan: employee_csv projection=[state, salary]";
-
-        assert_eq!(expected, format!("{plan:?}"));
-
-        Ok(())
-    }
-
     #[test]
     fn plan_builder_sort() -> Result<()> {
         let plan =
@@ -2037,36 +2015,6 @@ mod tests {
         }
     }
 
-    #[test]
-    fn aggregate_non_unique_names() -> Result<()> {
-        let plan = table_scan(
-            Some("employee_csv"),
-            &employee_schema(),
-            // project state and salary by column index
-            Some(vec![3, 4]),
-        )?
-        // two columns with the same name => error
-        .aggregate(vec![col("state")], vec![sum(col("salary")).alias("state")]);
-
-        match plan {
-            Err(DataFusionError::SchemaError(
-                SchemaError::AmbiguousReference {
-                    field:
-                        Column {
-                            relation: Some(TableReference::Bare { table }),
-                            name,
-                        },
-                },
-                _,
-            )) => {
-                assert_eq!(*"employee_csv", *table);
-                assert_eq!("state", &name);
-                Ok(())
-            }
-            _ => plan_err!("Plan should have returned an DataFusionError::SchemaError"),
-        }
-    }
-
     fn employee_schema() -> Schema {
         Schema::new(vec![
             Field::new("id", DataType::Int32, false),

diff --git a/datafusion/optimizer/Cargo.toml b/datafusion/optimizer/Cargo.toml
@@ -50,6 +50,7 @@ hashbrown = { workspace = true }
 indexmap = { workspace = true }
 itertools = { workspace = true }
 log = { workspace = true }
+paste = "1.0.14"
 regex-syntax = "0.8.0"
 [dev-dependencies]
 ctor = { workspace = true }

diff --git a/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs b/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs
@@ -117,13 +117,14 @@ fn analyze_internal(plan: LogicalPlan) -> Result<Transformed<LogicalPlan>> {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::test::function_stub::sum;
     use crate::test::*;
     use arrow::datatypes::DataType;
     use datafusion_common::ScalarValue;
     use datafusion_expr::expr::Sort;
     use datafusion_expr::{
         col, count, exists, expr, in_subquery, logical_plan::LogicalPlanBuilder, max,
-        out_ref_col, scalar_subquery, sum, wildcard, AggregateFunction, WindowFrame,
+        out_ref_col, scalar_subquery, wildcard, AggregateFunction, WindowFrame,
         WindowFrameBound, WindowFrameUnits,
     };
     use std::sync::Arc;

diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs
@@ -840,13 +840,15 @@ mod test {
     use arrow::datatypes::Schema;
 
     use datafusion_expr::logical_plan::{table_scan, JoinType};
-    use datafusion_expr::{avg, lit, logical_plan::builder::LogicalPlanBuilder, sum};
+
+    use datafusion_expr::{avg, lit, logical_plan::builder::LogicalPlanBuilder};
     use datafusion_expr::{
         grouping_set, AccumulatorFactoryFunction, AggregateUDF, Signature,
         SimpleAggregateUDF, Volatility,
     };
 
     use crate::optimizer::OptimizerContext;
+    use crate::test::function_stub::sum;
     use crate::test::*;
 
     use super::*;

diff --git a/datafusion/optimizer/src/eliminate_filter.rs b/datafusion/optimizer/src/eliminate_filter.rs
@@ -91,10 +91,11 @@ mod tests {
 
     use datafusion_common::{Result, ScalarValue};
     use datafusion_expr::{
-        col, lit, logical_plan::builder::LogicalPlanBuilder, sum, Expr, LogicalPlan,
+        col, lit, logical_plan::builder::LogicalPlanBuilder, Expr, LogicalPlan,
     };
 
     use crate::eliminate_filter::EliminateFilter;
+    use crate::test::function_stub::sum;
     use crate::test::*;
 
     fn assert_optimized_plan_equal(plan: LogicalPlan, expected: &str) -> Result<()> {

diff --git a/datafusion/optimizer/src/eliminate_limit.rs b/datafusion/optimizer/src/eliminate_limit.rs
@@ -100,11 +100,11 @@ mod tests {
     use datafusion_expr::{
         col,
         logical_plan::{builder::LogicalPlanBuilder, JoinType},
-        sum,
     };
     use std::sync::Arc;
 
     use crate::push_down_limit::PushDownLimit;
+    use crate::test::function_stub::sum;
 
     fn observe(_plan: &LogicalPlan, _rule: &dyn OptimizerRule) {}
     fn assert_optimized_plan_eq(plan: LogicalPlan, expected: &str) -> Result<()> {

diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs
@@ -1090,13 +1090,14 @@ mod tests {
     use datafusion_expr::expr::ScalarFunction;
     use datafusion_expr::logical_plan::table_scan;
     use datafusion_expr::{
-        col, in_list, in_subquery, lit, sum, ColumnarValue, Extension, ScalarUDF,
+        col, in_list, in_subquery, lit, ColumnarValue, Extension, ScalarUDF,
         ScalarUDFImpl, Signature, TableSource, TableType, UserDefinedLogicalNodeCore,
         Volatility,
     };
 
     use crate::optimizer::Optimizer;
     use crate::rewrite_disjunctive_predicate::RewriteDisjunctivePredicate;
+    use crate::test::function_stub::sum;
     use crate::test::*;
     use crate::OptimizerContext;
 

diff --git a/datafusion/optimizer/src/scalar_subquery_to_join.rs b/datafusion/optimizer/src/scalar_subquery_to_join.rs
@@ -400,10 +400,9 @@ mod tests {
     use super::*;
     use crate::test::*;
 
+    use crate::test::function_stub::sum;
     use arrow::datatypes::DataType;
-    use datafusion_expr::{
-        col, lit, max, min, out_ref_col, scalar_subquery, sum, Between,
-    };
+    use datafusion_expr::{col, lit, max, min, out_ref_col, scalar_subquery, Between};
 
     /// Test multiple correlated subqueries
     #[test]

diff --git a/datafusion/optimizer/src/single_distinct_to_groupby.rs b/datafusion/optimizer/src/single_distinct_to_groupby.rs
@@ -360,12 +360,13 @@ impl OptimizerRule for SingleDistinctToGroupBy {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::test::function_stub::sum;
     use crate::test::*;
     use datafusion_expr::expr;
     use datafusion_expr::expr::GroupingSet;
     use datafusion_expr::{
         count, count_distinct, lit, logical_plan::builder::LogicalPlanBuilder, max, min,
-        sum, AggregateFunction,
+        AggregateFunction,
     };
 
     fn assert_optimized_plan_equal(plan: LogicalPlan, expected: &str) -> Result<()> {

diff --git a/datafusion/optimizer/src/test/function_stub.rs b/datafusion/optimizer/src/test/function_stub.rs
@@ -0,0 +1,188 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::any::Any;
+
+use arrow::datatypes::{
+    DataType, Field, DECIMAL128_MAX_PRECISION, DECIMAL256_MAX_PRECISION,
+};
+use datafusion_common::{exec_err, Result};
+use datafusion_expr::{
+    expr::AggregateFunction,
+    function::{AccumulatorArgs, StateFieldsArgs},
+    utils::AggregateOrderSensitivity,
+    Accumulator, AggregateUDFImpl, Expr, GroupsAccumulator, ReversedUDAF, Signature,
+    Volatility,
+};
+
+macro_rules! create_func {
+    ($UDAF:ty, $AGGREGATE_UDF_FN:ident) => {
+        paste::paste! {
+            /// Singleton instance of [$UDAF], ensures the UDAF is only created once
+            /// named STATIC_$(UDAF). For example `STATIC_FirstValue`
+            #[allow(non_upper_case_globals)]
+            static [< STATIC_ $UDAF >]: std::sync::OnceLock<std::sync::Arc<datafusion_expr::AggregateUDF>> =
+            std::sync::OnceLock::new();
+
+            /// AggregateFunction that returns a [AggregateUDF] for [$UDAF]
+            ///
+            /// [AggregateUDF]: datafusion_expr::AggregateUDF
+            pub fn $AGGREGATE_UDF_FN() -> std::sync::Arc<datafusion_expr::AggregateUDF> {
+                [< STATIC_ $UDAF >]
+                    .get_or_init(|| {
+                        std::sync::Arc::new(datafusion_expr::AggregateUDF::from(<$UDAF>::default()))
+                    })
+                    .clone()
+            }
+        }
+    }
+}
+
+create_func!(Sum, sum_udaf);
+
+pub(crate) fn sum(expr: Expr) -> Expr {
+    Expr::AggregateFunction(AggregateFunction::new_udf(
+        sum_udaf(),
+        vec![expr],
+        false,
+        None,
+        None,
+        None,
+    ))
+}
+
+#[derive(Debug)]
+pub struct Sum {
-pub struct Sum {
+/// Stub `sum` used for optimizer testing
+pub struct Sum {
-pub struct Sum {
+/// Stub `sum` used for optimizer testing
+pub struct Sum {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl Sum {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::user_defined(Volatility::Immutable),
+            aliases: vec!["sum".to_string()],
+        }
+    }
+}
+
+impl Default for Sum {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl AggregateUDFImpl for Sum {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "SUM"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
+        if arg_types.len() != 1 {
+            return exec_err!("SUM expects exactly one argument");
+        }
+
+        // Refer to https://www.postgresql.org/docs/8.2/functions-aggregate.html doc
+        // smallint, int, bigint, real, double precision, decimal, or interval.
+
+        fn coerced_type(data_type: &DataType) -> Result<DataType> {
+            match data_type {
+                DataType::Dictionary(_, v) => coerced_type(v),
+                // in the spark, the result type is DECIMAL(min(38,precision+10), s)
+                // ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala#L66
+                DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => {
+                    Ok(data_type.clone())
+                }
+                dt if dt.is_signed_integer() => Ok(DataType::Int64),
+                dt if dt.is_unsigned_integer() => Ok(DataType::UInt64),
+                dt if dt.is_floating() => Ok(DataType::Float64),
+                _ => exec_err!("Sum not supported for {}", data_type),
+            }
+        }
+
+        Ok(vec![coerced_type(&arg_types[0])?])
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        match &arg_types[0] {
+            DataType::Int64 => Ok(DataType::Int64),
+            DataType::UInt64 => Ok(DataType::UInt64),
+            DataType::Float64 => Ok(DataType::Float64),
+            DataType::Decimal128(precision, scale) => {
+                // in the spark, the result type is DECIMAL(min(38,precision+10), s)
+                // ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala#L66
+                let new_precision = DECIMAL128_MAX_PRECISION.min(*precision + 10);
+                Ok(DataType::Decimal128(new_precision, *scale))
+            }
+            DataType::Decimal256(precision, scale) => {
+                // in the spark, the result type is DECIMAL(min(38,precision+10), s)
+                // ref: https://github.com/apache/spark/blob/fcf636d9eb8d645c24be3db2d599aba2d7e2955a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Sum.scala#L66
+                let new_precision = DECIMAL256_MAX_PRECISION.min(*precision + 10);
+                Ok(DataType::Decimal256(new_precision, *scale))
+            }
+            other => {
+                exec_err!("[return_type] SUM not supported for {}", other)
+            }
+        }
+    }
+
+    fn accumulator(&self, _args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
+        unreachable!("stub should not have accumulate()")
+    }
+
+    fn state_fields(&self, _args: StateFieldsArgs) -> Result<Vec<Field>> {
+        unreachable!("stub should not have state_fields()")
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+
+    fn groups_accumulator_supported(&self, _args: AccumulatorArgs) -> bool {
+        false
+    }
+
+    fn create_groups_accumulator(
+        &self,
+        _args: AccumulatorArgs,
+    ) -> Result<Box<dyn GroupsAccumulator>> {
+        unreachable!("stub should not have accumulate()")
+    }
+
+    fn create_sliding_accumulator(
+        &self,
+        _args: AccumulatorArgs,
+    ) -> Result<Box<dyn Accumulator>> {
+        unreachable!("stub should not have accumulate()")
+    }
+
+    fn reverse_expr(&self) -> ReversedUDAF {
+        ReversedUDAF::Identical
+    }
+
+    fn order_sensitivity(&self) -> AggregateOrderSensitivity {
+        AggregateOrderSensitivity::Insensitive
+    }
+}