From 79b0734df6c8d16f244b0c1db956e3567bca4534 Mon Sep 17 00:00:00 2001
From: kamille <caoruiqiu.crq@antgroup.com>
Date: Wed, 9 Oct 2024 01:51:11 +0800
Subject: [PATCH] make `data_gen_rounds` settable again, and add more tests.

---
 .../core/tests/fuzz_cases/aggregate_fuzz.rs   | 25 ++++++++++++++++---
 .../fuzz_cases/aggregation_fuzzer/fuzzer.rs   | 25 ++++++++++++++-----
 2 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs
index 0aad4b502c0e..c0fe3851c059 100644
--- a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs
+++ b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs
@@ -62,7 +62,7 @@ async fn test_group_by_single_int64() {
 
     // Define data generator config
     let columns = vec![
-        ColumnDescr::new("a", DataType::Int64),
+        ColumnDescr::new("a", DataType::Int32),
         ColumnDescr::new("b", DataType::Int64),
         ColumnDescr::new("c", DataType::Int64),
     ];
@@ -79,10 +79,13 @@ async fn test_group_by_single_int64() {
     // Build fuzzer
     let fuzzer = builder
         .data_gen_config(data_gen_config)
+        .data_gen_rounds(32)
         .add_sql("SELECT b, sum(a) FROM fuzz_table GROUP BY b")
+        .add_sql("SELECT b, sum(distinct a) FROM fuzz_table GROUP BY b")
         .add_sql("SELECT b, max(a) FROM fuzz_table GROUP BY b")
         .add_sql("SELECT b, min(a) FROM fuzz_table GROUP BY b")
         .add_sql("SELECT b, count(a) FROM fuzz_table GROUP BY b")
+        .add_sql("SELECT b, count(distinct a) FROM fuzz_table GROUP BY b")
         .add_sql("SELECT b, avg(a) FROM fuzz_table GROUP BY b")
         .table_name("fuzz_table")
         .build();
@@ -97,7 +100,7 @@ async fn test_group_by_single_string() {
 
     // Define data generator config
     let columns = vec![
-        ColumnDescr::new("a", DataType::Int64),
+        ColumnDescr::new("a", DataType::Int32),
         ColumnDescr::new("b", DataType::Utf8),
         ColumnDescr::new("c", DataType::Int64),
     ];
@@ -114,7 +117,14 @@ async fn test_group_by_single_string() {
     // Build fuzzer
     let fuzzer = builder
         .data_gen_config(data_gen_config)
+        .data_gen_rounds(32)
         .add_sql("SELECT b, sum(a) FROM fuzz_table GROUP BY b")
+        .add_sql("SELECT b, sum(distinct a) FROM fuzz_table GROUP BY b")
+        .add_sql("SELECT b, max(a) FROM fuzz_table GROUP BY b")
+        .add_sql("SELECT b, min(a) FROM fuzz_table GROUP BY b")
+        .add_sql("SELECT b, count(a) FROM fuzz_table GROUP BY b")
+        .add_sql("SELECT b, count(distinct a) FROM fuzz_table GROUP BY b")
+        .add_sql("SELECT b, avg(a) FROM fuzz_table GROUP BY b")
         .table_name("fuzz_table")
         .build();
 
@@ -128,7 +138,7 @@ async fn test_group_by_mixed_string_int64() {
 
     // Define data generator config
     let columns = vec![
-        ColumnDescr::new("a", DataType::Int64),
+        ColumnDescr::new("a", DataType::Int32),
         ColumnDescr::new("b", DataType::Utf8),
         ColumnDescr::new("c", DataType::Int64),
         ColumnDescr::new("d", DataType::Int32),
@@ -146,7 +156,14 @@ async fn test_group_by_mixed_string_int64() {
     // Build fuzzer
     let fuzzer = builder
         .data_gen_config(data_gen_config)
-        .add_sql("SELECT b, c, sum(a) FROM fuzz_table GROUP BY b,c")
+        .data_gen_rounds(32)
+        .add_sql("SELECT b, c, sum(a) FROM fuzz_table GROUP BY b, c")
+        .add_sql("SELECT b, c, sum(distinct a) FROM fuzz_table GROUP BY b,c")
+        .add_sql("SELECT b, c, max(a) FROM fuzz_table GROUP BY b, c")
+        .add_sql("SELECT b, c, min(a) FROM fuzz_table GROUP BY b, c")
+        .add_sql("SELECT b, c, count(a) FROM fuzz_table GROUP BY b, c")
+        .add_sql("SELECT b, c, count(distinct a) FROM fuzz_table GROUP BY b, c")
+        .add_sql("SELECT b, c, avg(a) FROM fuzz_table GROUP BY b, c")
         .table_name("fuzz_table")
         .build();
 
diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs
index 3f5aa3577cc7..c9a28d9f2602 100644
--- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs
+++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs
@@ -29,11 +29,6 @@ use crate::fuzz_cases::aggregation_fuzzer::{
     run_sql,
 };
 
-/// Rounds to call `generate` of [`DatasetGenerator`]
-/// in [`AggregationFuzzer`] `len(sort_keys_set) + 1` datasets
-/// will be generated for testing.
-const DATA_GEN_ROUNDS: usize = 16;
-
 /// Rounds to call `generate` of [`SessionContextGenerator`]
 /// in [`AggregationFuzzer`], `ctx_gen_rounds` random [`SessionContext`]
 /// will generated for each dataset for testing.
@@ -50,6 +45,9 @@ pub struct AggregationFuzzerBuilder {
     /// Used to generate `dataset_generator` in [`AggregationFuzzer`],
     /// no default, and required to set
     data_gen_config: Option<DatasetGeneratorConfig>,
+
+    /// See `data_gen_rounds` in [`AggregationFuzzer`], default 16
+    data_gen_rounds: usize,
 }
 
 impl AggregationFuzzerBuilder {
@@ -58,6 +56,7 @@ impl AggregationFuzzerBuilder {
             candidate_sqls: Vec::new(),
             table_name: None,
             data_gen_config: None,
+            data_gen_rounds: 16,
         }
     }
 
@@ -76,11 +75,17 @@ impl AggregationFuzzerBuilder {
         self
     }
 
+    pub fn data_gen_rounds(mut self, data_gen_rounds: usize) -> Self {
+        self.data_gen_rounds = data_gen_rounds;
+        self
+    }
+
     pub fn build(self) -> AggregationFuzzer {
         assert!(!self.candidate_sqls.is_empty());
         let candidate_sqls = self.candidate_sqls;
         let table_name = self.table_name.expect("table_name is required");
         let data_gen_config = self.data_gen_config.expect("data_gen_config is required");
+        let data_gen_rounds = self.data_gen_rounds;
 
         let dataset_generator = DatasetGenerator::new(data_gen_config);
 
@@ -88,6 +93,7 @@ impl AggregationFuzzerBuilder {
             candidate_sqls,
             table_name,
             dataset_generator,
+            data_gen_rounds,
         }
     }
 }
@@ -110,6 +116,13 @@ pub struct AggregationFuzzer {
 
     /// Dataset generator used to randomly generate datasets
     dataset_generator: DatasetGenerator,
+
+    /// Rounds to call `generate` of [`DatasetGenerator`],
+    /// `len(sort_keys_set) + 1` datasets will be generated for testing.
+    ///
+    /// It is suggested to set this to at least 2x the number of
+    /// `candidate_sqls` for better test coverage.
+    data_gen_rounds: usize,
 }
 
 /// Query group including the tested dataset and its sql query
@@ -124,7 +137,7 @@ impl AggregationFuzzer {
         let mut rng = thread_rng();
 
         // Loop to generate datasets and its query
-        for _ in 0..DATA_GEN_ROUNDS {
+        for _ in 0..self.data_gen_rounds {
             // Generate datasets first
             let datasets = self
                 .dataset_generator