From ef4f61414db721df8f6121f5dc53993a7562b67a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 8 Jan 2025 03:08:17 +0000 Subject: [PATCH 01/21] chore(deps): Bump downcast-rs from 1.2.0 to 2.0.1 (#20063) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 16 +++++++++++----- src/expr/core/Cargo.toml | 2 +- src/frontend/Cargo.toml | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 74687e9ee8bdb..43645d8b8b745 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4195,9 +4195,15 @@ checksum = "1435fa1053d8b2fbbe9be7e97eca7f33d37b28409959813daefc1446a14247f1" [[package]] name = "downcast-rs" -version = "1.2.0" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75b325c5dbd37f80359721ad39aca5a29fb04c89279657cffdda8736d0c0b9d2" + +[[package]] +name = "downcast-rs" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ea835d29036a4087793836fa931b08837ad5e957da9e23886b29586fb9b6650" +checksum = "ea8a8b81cacc08888170eef4d13b775126db426d0b348bee9d18c2c1eaf123cf" [[package]] name = "duct" @@ -7146,7 +7152,7 @@ dependencies = [ "async-task", "bincode 1.3.3", "bytes", - "downcast-rs", + "downcast-rs 1.2.1", "futures-util", "lazy_static", "libc", @@ -11118,7 +11124,7 @@ dependencies = [ "await-tree", "chrono", "const-currying", - "downcast-rs", + "downcast-rs 2.0.1", "easy-ext", "educe", "either", @@ -11230,7 +11236,7 @@ dependencies = [ "bytes", "chrono", "clap", - "downcast-rs", + "downcast-rs 2.0.1", "dyn-clone", "easy-ext", "educe", diff --git a/src/expr/core/Cargo.toml b/src/expr/core/Cargo.toml index 16db4ea8f2691..f9be0b84c36d7 100644 --- a/src/expr/core/Cargo.toml +++ b/src/expr/core/Cargo.toml @@ -19,7 +19,7 @@ chrono = { version = "0.4", default-features = false, features = [ "std", ] } const-currying = "0.0.4" -downcast-rs = "1.2" +downcast-rs = "2.0" easy-ext = "1" educe = "0.6" either = "1" diff --git a/src/frontend/Cargo.toml b/src/frontend/Cargo.toml index 9dccfb56380df..50af8afef3240 100644 --- a/src/frontend/Cargo.toml +++ b/src/frontend/Cargo.toml @@ -21,7 +21,7 @@ bk-tree = "0.5.0" bytes = "1" chrono = { version = "0.4", default-features = false } clap = { workspace = true } -downcast-rs = "1.2" +downcast-rs = "2.0" dyn-clone = "1.0.14" easy-ext = "1" educe = "0.6" From f89c1b71cd0aa25a5cd378572c8777a17340f1cc Mon Sep 17 00:00:00 2001 From: xxchan Date: Wed, 8 Jan 2025 11:30:41 +0800 Subject: [PATCH 02/21] build(deps): unpatch prost (#20066) Signed-off-by: xxchan --- Cargo.lock | 128 +++++++++++++++++++++-------------------------------- Cargo.toml | 2 - 2 files changed, 50 insertions(+), 80 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 43645d8b8b745..e2547112a44e3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -562,8 +562,8 @@ dependencies = [ "bytes", "futures", "paste", - "prost 0.13.1", - "prost-types 0.13.1", + "prost 0.13.4", + "prost-types 0.13.4", "tokio", "tonic", ] @@ -2651,8 +2651,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "86ed14aa9c9f927213c6e4f3ef75faaad3406134efe84ba2cb7983431d5f0931" dependencies = [ "futures-core", - "prost 0.13.1", - "prost-types 0.13.1", + "prost 0.13.4", + "prost-types 0.13.4", "tonic", "tracing-core", ] @@ -2670,8 +2670,8 @@ dependencies = [ "hdrhistogram", "humantime", "hyper-util", - "prost 0.13.1", - "prost-types 0.13.1", + "prost 
0.13.4", + "prost-types 0.13.4", "serde", "serde_json", "thread_local", @@ -5536,8 +5536,8 @@ version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0edfdfb507593d47605b3bb2fb36628b391e3d397e520b85852dea2412c8e2d1" dependencies = [ - "prost 0.13.1", - "prost-types 0.13.1", + "prost 0.13.4", + "prost-types 0.13.4", "tonic", ] @@ -5564,7 +5564,7 @@ dependencies = [ "google-cloud-gax", "google-cloud-googleapis", "google-cloud-token", - "prost-types 0.13.1", + "prost-types 0.13.4", "thiserror 1.0.63", "tokio", "tokio-util", @@ -8307,7 +8307,7 @@ dependencies = [ "opentelemetry", "opentelemetry-proto", "opentelemetry_sdk", - "prost 0.13.1", + "prost 0.13.4", "thiserror 1.0.63", "tokio", "tonic", @@ -8321,7 +8321,7 @@ checksum = "30ee9f20bff9c984511a02f082dc8ede839e4a9bf15cc2487c8d6fea5ad850d9" dependencies = [ "opentelemetry", "opentelemetry_sdk", - "prost 0.13.1", + "prost 0.13.4", "tonic", ] @@ -8417,7 +8417,7 @@ dependencies = [ "datasize", "hex", "itertools 0.13.0", - "prost 0.13.1", + "prost 0.13.4", "rust-embed", "schnellru", "serde", @@ -8716,8 +8716,8 @@ checksum = "6eea3058763d6e656105d1403cb04e0a41b7bbac6362d413e7c33be0c32279c9" dependencies = [ "heck 0.5.0", "itertools 0.13.0", - "prost 0.13.1", - "prost-types 0.13.1", + "prost 0.13.4", + "prost-types 0.13.4", ] [[package]] @@ -9401,20 +9401,11 @@ dependencies = [ "prost-derive 0.12.6", ] -[[package]] -name = "prost" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e13db3d3fde688c61e2446b4d843bc27a7e8af269a69440c0308021dc92333cc" -dependencies = [ - "bytes", - "prost-derive 0.13.1", -] - [[package]] name = "prost" version = "0.13.4" -source = "git+https://github.com/xxchan/prost.git?rev=0eb1c7b09976cf6b5231e4b8d87bb5908ae6a163#0eb1c7b09976cf6b5231e4b8d87bb5908ae6a163" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c0fef6c4230e4ccf618a35c59d7ede15dea37de8427500f50aff708806e42ec" dependencies = [ "bytes", "prost-derive 0.13.4", @@ -9445,10 +9436,11 @@ dependencies = [ [[package]] name = "prost-build" version = "0.13.4" -source = "git+https://github.com/xxchan/prost.git?rev=0eb1c7b09976cf6b5231e4b8d87bb5908ae6a163#0eb1c7b09976cf6b5231e4b8d87bb5908ae6a163" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0f3e5beed80eb580c68e2c600937ac2c4eedabdfd5ef1e5b7ea4f3fba84497b" dependencies = [ "heck 0.5.0", - "itertools 0.13.0", + "itertools 0.10.5", "log", "multimap 0.10.0", "once_cell", @@ -9487,26 +9479,14 @@ dependencies = [ "syn 2.0.87", ] -[[package]] -name = "prost-derive" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18bec9b0adc4eba778b33684b7ba3e7137789434769ee3ce3930463ef904cfca" -dependencies = [ - "anyhow", - "itertools 0.13.0", - "proc-macro2", - "quote", - "syn 2.0.87", -] - [[package]] name = "prost-derive" version = "0.13.4" -source = "git+https://github.com/xxchan/prost.git?rev=0eb1c7b09976cf6b5231e4b8d87bb5908ae6a163#0eb1c7b09976cf6b5231e4b8d87bb5908ae6a163" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "157c5a9d7ea5c2ed2d9fb8f495b64759f7816c7eaea54ba3978f0d63000162e3" dependencies = [ "anyhow", - "itertools 0.13.0", + "itertools 0.10.5", "proc-macro2", "quote", "syn 2.0.87", @@ -9531,8 +9511,8 @@ dependencies = [ "logos", "miette", "once_cell", - "prost 0.13.1", - "prost-types 0.13.1", + "prost 0.13.4", + "prost-types 0.13.4", "serde", "serde-value", ] @@ -9546,19 +9526,11 @@ dependencies = [ 
"prost 0.11.9", ] -[[package]] -name = "prost-types" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cee5168b05f49d4b0ca581206eb14a7b22fafd963efe729ac48eb03266e25cc2" -dependencies = [ - "prost 0.13.1", -] - [[package]] name = "prost-types" version = "0.13.4" -source = "git+https://github.com/xxchan/prost.git?rev=0eb1c7b09976cf6b5231e4b8d87bb5908ae6a163#0eb1c7b09976cf6b5231e4b8d87bb5908ae6a163" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc2f1e56baa61e93533aebc21af4d2134b70f66275e0fcdf3cbe43d77ff7e8fc" dependencies = [ "prost 0.13.4", ] @@ -9577,9 +9549,9 @@ checksum = "873f359bdecdfe6e353752f97cb9ee69368df55b16363ed2216da85e03232a58" dependencies = [ "bytes", "miette", - "prost 0.13.1", + "prost 0.13.4", "prost-reflect", - "prost-types 0.13.1", + "prost-types 0.13.4", "protox-parse", "thiserror 1.0.63", ] @@ -9592,7 +9564,7 @@ checksum = "a3a462d115462c080ae000c29a47f0b3985737e5d3a995fcdbcaa5c782068dde" dependencies = [ "logos", "miette", - "prost-types 0.13.1", + "prost-types 0.13.4", "thiserror 1.0.63", ] @@ -10345,7 +10317,7 @@ dependencies = [ "bytes", "itertools 0.13.0", "parking_lot 0.12.1", - "prost 0.13.1", + "prost 0.13.4", "risingwave_common", "risingwave_hummock_sdk", "risingwave_meta_model", @@ -10586,7 +10558,7 @@ dependencies = [ "pretty_assertions", "procfs 0.17.0", "prometheus", - "prost 0.13.1", + "prost 0.13.4", "rand", "regex", "reqwest 0.12.4", @@ -10735,7 +10707,7 @@ dependencies = [ "bincode 1.3.3", "hex", "parking_lot 0.12.1", - "prost 0.13.1", + "prost 0.13.4", "risingwave_pb", "serde", "thiserror 1.0.63", @@ -10798,7 +10770,7 @@ dependencies = [ "jsonbb", "madsim-tokio", "madsim-tonic", - "prost 0.13.1", + "prost 0.13.4", "risingwave_common", "risingwave_common_heap_profiling", "risingwave_common_service", @@ -10832,7 +10804,7 @@ dependencies = [ "maplit", "pprof", "prometheus", - "prost 0.13.1", + "prost 0.13.4", "rand", "risingwave_batch", "risingwave_batch_executors", @@ -10938,9 +10910,9 @@ dependencies = [ "pretty_assertions", "proc-macro2", "prometheus", - "prost 0.13.1", + "prost 0.13.4", "prost-reflect", - "prost-types 0.13.1", + "prost-types 0.13.4", "pulsar", "quote", "rand", @@ -11010,10 +10982,10 @@ dependencies = [ "jsonschema-transpiler", "madsim-tokio", "num-bigint", - "prost 0.13.1", + "prost 0.13.4", "prost-build 0.13.4", "prost-reflect", - "prost-types 0.13.1", + "prost-types 0.13.4", "protox", "risingwave_common", "risingwave_pb", @@ -11042,7 +11014,7 @@ dependencies = [ "itertools 0.13.0", "madsim-tokio", "madsim-tonic", - "prost 0.13.1", + "prost 0.13.4", "regex", "risingwave_common", "risingwave_connector", @@ -11269,7 +11241,7 @@ dependencies = [ "pretty-xmlish", "pretty_assertions", "prometheus", - "prost 0.13.1", + "prost 0.13.4", "quick-xml 0.37.1", "rand", "risingwave_batch", @@ -11380,7 +11352,7 @@ dependencies = [ "madsim-tokio", "mockall", "parking_lot 0.12.1", - "prost 0.13.1", + "prost 0.13.4", "risingwave_common", "risingwave_hummock_sdk", "risingwave_pb", @@ -11399,7 +11371,7 @@ dependencies = [ "futures", "jni", "madsim-tokio", - "prost 0.13.1", + "prost 0.13.4", "risingwave_common", "risingwave_expr", "risingwave_hummock_sdk", @@ -11427,7 +11399,7 @@ dependencies = [ "jni", "madsim-tokio", "paste", - "prost 0.13.1", + "prost 0.13.4", "risingwave_common", "risingwave_pb", "thiserror 1.0.63", @@ -11497,7 +11469,7 @@ dependencies = [ "parking_lot 0.12.1", "prometheus", "prometheus-http-query", - "prost 0.13.1", + "prost 0.13.4", "rand", 
"risingwave_backup", "risingwave_common", @@ -11558,7 +11530,7 @@ dependencies = [ name = "risingwave_meta_model" version = "2.3.0-alpha" dependencies = [ - "prost 0.13.1", + "prost 0.13.4", "risingwave_common", "risingwave_pb", "sea-orm", @@ -11676,7 +11648,7 @@ dependencies = [ "madsim-tonic-build", "pbjson", "pbjson-build", - "prost 0.13.1", + "prost 0.13.4", "prost-build 0.13.4", "prost-helpers", "risingwave_error", @@ -11942,7 +11914,7 @@ dependencies = [ "parking_lot 0.12.1", "procfs 0.17.0", "prometheus", - "prost 0.13.1", + "prost 0.13.4", "rand", "risingwave_backup", "risingwave_common", @@ -12012,7 +11984,7 @@ dependencies = [ "prehash", "pretty_assertions", "prometheus", - "prost 0.13.1", + "prost 0.13.4", "rand", "risingwave_common", "risingwave_common_estimate_size", @@ -12050,7 +12022,7 @@ version = "2.3.0-alpha" dependencies = [ "jsonbb", "madsim-tokio", - "prost 0.13.1", + "prost 0.13.4", "reqwest 0.12.4", "risingwave_pb", "thiserror-ext", @@ -14607,7 +14579,7 @@ dependencies = [ "hyper-util", "percent-encoding", "pin-project", - "prost 0.13.1", + "prost 0.13.4", "rustls-pemfile 2.2.0", "socket2 0.5.6", "tokio", diff --git a/Cargo.toml b/Cargo.toml index 17a6d1ff82670..771601e38e8a0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -379,8 +379,6 @@ tokio-postgres = { git = "https://github.com/madsim-rs/rust-postgres.git", rev = sqlx = { git = "https://github.com/madsim-rs/sqlx.git", rev = "3efe6d0065963db2a2b7f30dee69f647e28dec81" } # patch to remove preserve_order from serde_json bson = { git = "https://github.com/risingwavelabs/bson-rust", rev = "e5175ec" } -# TODO: unpatch after PR merged https://github.com/tokio-rs/prost/pull/1210 -prost-build = { git = "https://github.com/xxchan/prost.git", rev = "0eb1c7b09976cf6b5231e4b8d87bb5908ae6a163" } [workspace.metadata.dylint] libraries = [{ path = "./lints" }] From 212addb20c86255293aa81f3e32d41db84b365cc Mon Sep 17 00:00:00 2001 From: Bugen Zhao Date: Wed, 8 Jan 2025 12:54:50 +0800 Subject: [PATCH 03/21] fix(ci): increase timeout for e2e source test in main-cron (#20069) --- ci/workflows/main-cron.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/workflows/main-cron.yml b/ci/workflows/main-cron.yml index b12d473c679de..2261f6dd49548 100644 --- a/ci/workflows/main-cron.yml +++ b/ci/workflows/main-cron.yml @@ -190,7 +190,7 @@ steps: <<: *docker-compose-common run: source-test-env - ./ci/plugins/upload-failure-logs - timeout_in_minutes: 15 + timeout_in_minutes: 20 retry: *auto-retry - label: "end-to-end sink test ({{matrix.backend}} backend)" From f7e5a667ff3d2fcbe04e8631357509663d9625a3 Mon Sep 17 00:00:00 2001 From: Takahiro Ebato Date: Wed, 8 Jan 2025 14:09:37 +0900 Subject: [PATCH 04/21] fix(sqlparser): make LIMIT and OFFSET order flexible (#20026) --- src/sqlparser/src/parser.rs | 20 ++++++++++---------- src/sqlparser/tests/testdata/select.yaml | 22 ++++++++++++++++++++++ 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/src/sqlparser/src/parser.rs b/src/sqlparser/src/parser.rs index e81c06371d617..9eb3d9e439967 100644 --- a/src/sqlparser/src/parser.rs +++ b/src/sqlparser/src/parser.rs @@ -4248,17 +4248,17 @@ impl Parser<'_> { vec![] }; - let limit = if self.parse_keyword(Keyword::LIMIT) { - self.parse_limit()? - } else { - None - }; + let mut limit = None; + let mut offset = None; + for _x in 0..2 { + if limit.is_none() && self.parse_keyword(Keyword::LIMIT) { + limit = self.parse_limit()? + } - let offset = if self.parse_keyword(Keyword::OFFSET) { - Some(self.parse_offset()?) 
- } else { - None - }; + if offset.is_none() && self.parse_keyword(Keyword::OFFSET) { + offset = Some(self.parse_offset()?) + } + } let fetch = if self.parse_keyword(Keyword::FETCH) { if limit.is_some() { diff --git a/src/sqlparser/tests/testdata/select.yaml b/src/sqlparser/tests/testdata/select.yaml index 72de88bd1dfb4..78f7c6a001dbd 100644 --- a/src/sqlparser/tests/testdata/select.yaml +++ b/src/sqlparser/tests/testdata/select.yaml @@ -237,3 +237,25 @@ SELECT a0 + b0 FROM a_log, "B_log" formatted_sql: WITH a_log AS changelog from a, "B_log" AS changelog from public."B" SELECT a0 + b0 FROM a_log, "B_log" formatted_ast: 'Query(Query { with: Some(With { recursive: false, cte_tables: [Cte { alias: TableAlias { name: Ident { value: "a_log", quote_style: None }, columns: [] }, cte_inner: ChangeLog(ObjectName([Ident { value: "a", quote_style: None }])) }, Cte { alias: TableAlias { name: Ident { value: "B_log", quote_style: Some(''"'') }, columns: [] }, cte_inner: ChangeLog(ObjectName([Ident { value: "public", quote_style: None }, Ident { value: "B", quote_style: Some(''"'') }])) }] }), body: Select(Select { distinct: All, projection: [UnnamedExpr(BinaryOp { left: Identifier(Ident { value: "a0", quote_style: None }), op: Plus, right: Identifier(Ident { value: "b0", quote_style: None }) })], from: [TableWithJoins { relation: Table { name: ObjectName([Ident { value: "a_log", quote_style: None }]), alias: None, as_of: None }, joins: [] }, TableWithJoins { relation: Table { name: ObjectName([Ident { value: "B_log", quote_style: Some(''"'') }]), alias: None, as_of: None }, joins: [] }], lateral_views: [], selection: None, group_by: [], having: None }), order_by: [], limit: None, offset: None, fetch: None })' +- input: SELECT * FROM t LIMIT 1 + formatted_sql: SELECT * FROM t LIMIT 1 + formatted_ast: 'Query(Query { with: None, body: Select(Select { distinct: All, projection: [Wildcard(None)], from: [TableWithJoins { relation: Table { name: ObjectName([Ident { value: "t", quote_style: None }]), alias: None, as_of: None }, joins: [] }], lateral_views: [], selection: None, group_by: [], having: None }), order_by: [], limit: Some(Value(Number("1"))), offset: None, fetch: None })' +- input: SELECT * FROM t OFFSET 1 + formatted_sql: SELECT * FROM t OFFSET 1 + formatted_ast: 'Query(Query { with: None, body: Select(Select { distinct: All, projection: [Wildcard(None)], from: [TableWithJoins { relation: Table { name: ObjectName([Ident { value: "t", quote_style: None }]), alias: None, as_of: None }, joins: [] }], lateral_views: [], selection: None, group_by: [], having: None }), order_by: [], limit: None, offset: Some("1"), fetch: None })' +- input: SELECT * FROM t LIMIT 1 OFFSET 2 + formatted_sql: SELECT * FROM t LIMIT 1 OFFSET 2 + formatted_ast: 'Query(Query { with: None, body: Select(Select { distinct: All, projection: [Wildcard(None)], from: [TableWithJoins { relation: Table { name: ObjectName([Ident { value: "t", quote_style: None }]), alias: None, as_of: None }, joins: [] }], lateral_views: [], selection: None, group_by: [], having: None }), order_by: [], limit: Some(Value(Number("1"))), offset: Some("2"), fetch: None })' +- input: SELECT * FROM t OFFSET 2 LIMIT 1 + formatted_sql: SELECT * FROM t LIMIT 1 OFFSET 2 + formatted_ast: 'Query(Query { with: None, body: Select(Select { distinct: All, projection: [Wildcard(None)], from: [TableWithJoins { relation: Table { name: ObjectName([Ident { value: "t", quote_style: None }]), alias: None, as_of: None }, joins: [] }], lateral_views: [], selection: None, 
group_by: [], having: None }), order_by: [], limit: Some(Value(Number("1"))), offset: Some("2"), fetch: None })' +- input: SELECT * FROM t LIMIT 1 LIMIT 2 + error_msg: |- + sql parser error: expected end of statement, found: LIMIT + LINE 1: SELECT * FROM t LIMIT 1 LIMIT 2 + ^ +- input: SELECT * FROM t OFFSET 1 OFFSET 2 + error_msg: |- + sql parser error: expected end of statement, found: OFFSET + LINE 1: SELECT * FROM t OFFSET 1 OFFSET 2 + ^ From 1ac43758611096ad4c953bc48b7e601e50b9877f Mon Sep 17 00:00:00 2001 From: Noel Kwan <47273164+kwannoel@users.noreply.github.com> Date: Wed, 8 Jan 2025 13:35:27 +0800 Subject: [PATCH 05/21] bench(iceberg): add iceberg predicate pushdown bench (#20047) --- ci/scripts/e2e-iceberg-sink-v2-test.sh | 3 + .../iceberg/benches/predicate_pushdown.slt | 65 +++++++++++++++++++ .../iceberg/benches/predicate_pushdown.toml | 11 ++++ .../predicate_pushdown/filter.slt.part | 4 ++ .../predicate_pushdown/point_get.slt.part | 4 ++ 5 files changed, 87 insertions(+) create mode 100644 e2e_test/iceberg/benches/predicate_pushdown.slt create mode 100644 e2e_test/iceberg/benches/predicate_pushdown.toml create mode 100644 e2e_test/iceberg/benches/predicate_pushdown/filter.slt.part create mode 100644 e2e_test/iceberg/benches/predicate_pushdown/point_get.slt.part diff --git a/ci/scripts/e2e-iceberg-sink-v2-test.sh b/ci/scripts/e2e-iceberg-sink-v2-test.sh index d1ebbce419055..4825e4ab11b1a 100755 --- a/ci/scripts/e2e-iceberg-sink-v2-test.sh +++ b/ci/scripts/e2e-iceberg-sink-v2-test.sh @@ -35,6 +35,7 @@ echo "--- preparing iceberg" cd e2e_test/iceberg bash ./start_spark_connect_server.sh +echo "--- Running tests" # Don't remove the `--quiet` option since poetry has a bug when printing output, see # https://github.com/python-poetry/poetry/issues/3412 poetry update --quiet @@ -52,6 +53,8 @@ poetry run python main.py -t ./test_case/iceberg_source_all_delete.toml poetry run python main.py -t ./test_case/iceberg_source_explain_for_delete.toml poetry run python main.py -t ./test_case/iceberg_predicate_pushdown.toml +echo "--- Running benchmarks" +poetry run python main.py -t ./benches/predicate_pushdown.toml echo "--- Kill cluster" cd ../../ diff --git a/e2e_test/iceberg/benches/predicate_pushdown.slt b/e2e_test/iceberg/benches/predicate_pushdown.slt new file mode 100644 index 0000000000000..355b0858db9c1 --- /dev/null +++ b/e2e_test/iceberg/benches/predicate_pushdown.slt @@ -0,0 +1,65 @@ +statement ok +CREATE TABLE t1 (i1 int, i2 varchar, i3 varchar); + +statement ok +INSERT INTO t1 select key, 'some long string of text', 'another long string of text' from generate_series(1, 1000000) as key; + +statement ok +CREATE SINK sink1 AS select * from t1 WITH ( + connector = 'iceberg', + type = 'append-only', + force_append_only = 'true', + database.name = 'demo_db', + table.name = 't1', + catalog.name = 'demo', + catalog.type = 'storage', + warehouse.path = 's3a://icebergdata/demo', + s3.endpoint = 'http://127.0.0.1:9301', + s3.region = 'us-east-1', + s3.access.key = 'hummockadmin', + s3.secret.key = 'hummockadmin', + commit_checkpoint_interval = 1, + create_table_if_not_exists = 'true' +); + +statement ok +CREATE SOURCE iceberg_t1_source +WITH ( + connector = 'iceberg', + s3.endpoint = 'http://127.0.0.1:9301', + s3.region = 'us-east-1', + s3.access.key = 'hummockadmin', + s3.secret.key = 'hummockadmin', + s3.path.style.access = 'true', + catalog.type = 'storage', + warehouse.path = 's3a://icebergdata/demo', + database.name = 'demo_db', + table.name = 't1', +); + +statement ok +flush; + 
+query I +select count(*) from iceberg_t1_source; +---- +1000000 + +# warmup +include ./predicate_pushdown/point_get.slt.part +# bench +include ./predicate_pushdown/point_get.slt.part + +# warmup +include ./predicate_pushdown/filter.slt.part +# bench +include ./predicate_pushdown/filter.slt.part + +statement ok +DROP SINK sink1; + +statement ok +DROP SOURCE iceberg_t1_source; + +statement ok +DROP TABLE t1 cascade; \ No newline at end of file diff --git a/e2e_test/iceberg/benches/predicate_pushdown.toml b/e2e_test/iceberg/benches/predicate_pushdown.toml new file mode 100644 index 0000000000000..018bfd0cdb6d7 --- /dev/null +++ b/e2e_test/iceberg/benches/predicate_pushdown.toml @@ -0,0 +1,11 @@ +init_sqls = [ + 'CREATE SCHEMA IF NOT EXISTS demo_db', + 'DROP TABLE IF EXISTS demo_db.t1', +] + +slt = 'benches/predicate_pushdown.slt' + +drop_sqls = [ + 'DROP TABLE IF EXISTS demo_db.t1', + 'DROP SCHEMA IF EXISTS demo_db', +] diff --git a/e2e_test/iceberg/benches/predicate_pushdown/filter.slt.part b/e2e_test/iceberg/benches/predicate_pushdown/filter.slt.part new file mode 100644 index 0000000000000..897bd4966182d --- /dev/null +++ b/e2e_test/iceberg/benches/predicate_pushdown/filter.slt.part @@ -0,0 +1,4 @@ +query I +select count(*) from iceberg_t1_source where i1 > 1001 and i1 < 1110; +---- +108 \ No newline at end of file diff --git a/e2e_test/iceberg/benches/predicate_pushdown/point_get.slt.part b/e2e_test/iceberg/benches/predicate_pushdown/point_get.slt.part new file mode 100644 index 0000000000000..6051155902a8b --- /dev/null +++ b/e2e_test/iceberg/benches/predicate_pushdown/point_get.slt.part @@ -0,0 +1,4 @@ +query I +select * from iceberg_t1_source where i1 = 100000; +---- + 100000 some long string of text another long string of text \ No newline at end of file From 52532c7ec6dabad2105146f873c2f95c9c8ea704 Mon Sep 17 00:00:00 2001 From: William Wen <44139337+wenym1@users.noreply.github.com> Date: Wed, 8 Jan 2025 16:01:53 +0800 Subject: [PATCH 06/21] refactor(storage): remove StateStoreWrite and extract separate StateStoreReadLog trait (#20051) --- .../src/hummock/iterator/change_log.rs | 6 +- .../src/hummock/store/hummock_storage.rs | 7 +- .../hummock/store/local_hummock_storage.rs | 16 - src/storage/src/hummock/utils.rs | 91 ++++- src/storage/src/mem_table.rs | 361 +----------------- src/storage/src/memory.rs | 284 +++++++++++++- src/storage/src/monitor/monitored_store.rs | 5 +- src/storage/src/monitor/traced_store.rs | 5 +- src/storage/src/panic_store.rs | 18 +- src/storage/src/store.rs | 75 ++-- src/storage/src/store_impl.rs | 64 ++-- .../log_store_impl/kv_log_store/reader.rs | 66 ++-- 12 files changed, 483 insertions(+), 515 deletions(-) diff --git a/src/storage/src/hummock/iterator/change_log.rs b/src/storage/src/hummock/iterator/change_log.rs index a10981c35fe16..0fd227020a527 100644 --- a/src/storage/src/hummock/iterator/change_log.rs +++ b/src/storage/src/hummock/iterator/change_log.rs @@ -545,10 +545,10 @@ mod tests { use crate::mem_table::{KeyOp, MemTable, MemTableHummockIterator, MemTableStore}; use crate::memory::MemoryStateStore; use crate::store::{ - ChangeLogValue, NewLocalOptions, OpConsistencyLevel, ReadLogOptions, StateStoreIter, - StateStoreRead, CHECK_BYTES_EQUAL, + ChangeLogValue, NewLocalOptions, OpConsistencyLevel, ReadLogOptions, StateStoreReadLog, + CHECK_BYTES_EQUAL, }; - use crate::StateStore; + use crate::{StateStore, StateStoreIter}; #[tokio::test] async fn test_empty() { diff --git a/src/storage/src/hummock/store/hummock_storage.rs 
b/src/storage/src/hummock/store/hummock_storage.rs index 0caae17c6c204..07aff03b4e804 100644 --- a/src/storage/src/hummock/store/hummock_storage.rs +++ b/src/storage/src/hummock/store/hummock_storage.rs @@ -79,7 +79,7 @@ impl Drop for HummockStorageShutdownGuard { } /// `HummockStorage` is the entry point of the Hummock state store backend. -/// It implements the `StateStore` and `StateStoreRead` traits but not the `StateStoreWrite` trait +/// It implements the `StateStore` and `StateStoreRead` traits but without any write method /// since all writes should be done via `LocalHummockStorage` to ensure the single writer property /// of hummock. `LocalHummockStorage` instance can be created via `new_local` call. /// Hummock is the state store backend. @@ -589,7 +589,6 @@ impl HummockStorage { } impl StateStoreRead for HummockStorage { - type ChangeLogIter = ChangeLogIterator; type Iter = HummockStorageIterator; type RevIter = HummockStorageRevIterator; @@ -635,6 +634,10 @@ impl StateStoreRead for HummockStorage { ); self.rev_iter_inner(key_range, epoch, read_options) } +} + +impl StateStoreReadLog for HummockStorage { + type ChangeLogIter = ChangeLogIterator; async fn iter_log( &self, diff --git a/src/storage/src/hummock/store/local_hummock_storage.rs b/src/storage/src/hummock/store/local_hummock_storage.rs index 989eb878b0798..69952838e9829 100644 --- a/src/storage/src/hummock/store/local_hummock_storage.rs +++ b/src/storage/src/hummock/store/local_hummock_storage.rs @@ -32,7 +32,6 @@ use super::version::{StagingData, VersionUpdate}; use crate::error::StorageResult; use crate::hummock::event_handler::hummock_event_handler::HummockEventSender; use crate::hummock::event_handler::{HummockEvent, HummockReadVersionRef, LocalInstanceGuard}; -use crate::hummock::iterator::change_log::ChangeLogIterator; use crate::hummock::iterator::{ Backward, BackwardUserIterator, ConcatIteratorInner, Forward, HummockIteratorUnion, IteratorFactory, MergeIterator, UserIterator, @@ -244,7 +243,6 @@ impl LocalHummockStorage { } impl StateStoreRead for LocalHummockStorage { - type ChangeLogIter = ChangeLogIterator; type Iter = HummockStorageIterator; type RevIter = HummockStorageRevIterator; @@ -279,20 +277,6 @@ impl StateStoreRead for LocalHummockStorage { self.rev_iter_flushed(key_range, epoch, read_options) .instrument(tracing::trace_span!("hummock_rev_iter")) } - - async fn iter_log( - &self, - epoch_range: (u64, u64), - key_range: TableKeyRange, - options: ReadLogOptions, - ) -> StorageResult { - let version = self.read_version.read().committed().clone(); - let iter = self - .hummock_version_reader - .iter_log(version, epoch_range, key_range, options) - .await?; - Ok(iter) - } } impl LocalStateStore for LocalHummockStorage { diff --git a/src/storage/src/hummock/utils.rs b/src/storage/src/hummock/utils.rs index 6f42260bcc9c1..7f30f63d83e19 100644 --- a/src/storage/src/hummock/utils.rs +++ b/src/storage/src/hummock/utils.rs @@ -23,9 +23,11 @@ use std::time::{Duration, Instant}; use bytes::Bytes; use foyer::CacheHint; +use futures::{pin_mut, Stream, StreamExt}; use parking_lot::Mutex; use risingwave_common::catalog::{TableId, TableOption}; use risingwave_common::config::StorageMemoryConfig; +use risingwave_expr::codegen::try_stream; use risingwave_hummock_sdk::can_concat; use risingwave_hummock_sdk::compaction_group::StateTableId; use risingwave_hummock_sdk::key::{ @@ -35,12 +37,12 @@ use risingwave_hummock_sdk::sstable_info::SstableInfo; use tokio::sync::oneshot::{channel, Receiver, Sender}; use 
super::{HummockError, HummockResult, SstableStoreRef}; -use crate::error::StorageResult; +use crate::error::{StorageError, StorageResult}; use crate::hummock::local_version::pinned_version::PinnedVersion; use crate::hummock::CachePolicy; use crate::mem_table::{KeyOp, MemTableError}; use crate::monitor::MemoryCollector; -use crate::store::{OpConsistencyLevel, ReadOptions, StateStoreRead}; +use crate::store::{OpConsistencyLevel, ReadOptions, StateStoreKeyedRow, StateStoreRead}; pub fn range_overlap( search_key_range: &R, @@ -716,6 +718,91 @@ impl MemoryCollector for HummockMemoryCollector { } } +#[try_stream(ok = StateStoreKeyedRow, error = StorageError)] +pub(crate) async fn merge_stream<'a>( + mem_table_iter: impl Iterator, &'a KeyOp)> + 'a, + inner_stream: impl Stream> + 'static, + table_id: TableId, + epoch: u64, + rev: bool, +) { + let inner_stream = inner_stream.peekable(); + pin_mut!(inner_stream); + + let mut mem_table_iter = mem_table_iter.fuse().peekable(); + + loop { + match (inner_stream.as_mut().peek().await, mem_table_iter.peek()) { + (None, None) => break, + // The mem table side has come to an end, return data from the shared storage. + (Some(_), None) => { + let (key, value) = inner_stream.next().await.unwrap()?; + yield (key, value) + } + // The stream side has come to an end, return data from the mem table. + (None, Some(_)) => { + let (key, key_op) = mem_table_iter.next().unwrap(); + match key_op { + KeyOp::Insert(value) | KeyOp::Update((_, value)) => { + yield (FullKey::new(table_id, key.clone(), epoch), value.clone()) + } + _ => {} + } + } + (Some(Ok((inner_key, _))), Some((mem_table_key, _))) => { + debug_assert_eq!(inner_key.user_key.table_id, table_id); + let mut ret = inner_key.user_key.table_key.cmp(mem_table_key); + if rev { + ret = ret.reverse(); + } + match ret { + Ordering::Less => { + // yield data from storage + let (key, value) = inner_stream.next().await.unwrap()?; + yield (key, value); + } + Ordering::Equal => { + // both memtable and storage contain the key, so we advance both + // iterators and return the data in memory. + + let (_, key_op) = mem_table_iter.next().unwrap(); + let (key, old_value_in_inner) = inner_stream.next().await.unwrap()?; + match key_op { + KeyOp::Insert(value) => { + yield (key.clone(), value.clone()); + } + KeyOp::Delete(_) => {} + KeyOp::Update((old_value, new_value)) => { + debug_assert!(old_value == &old_value_in_inner); + + yield (key, new_value.clone()); + } + } + } + Ordering::Greater => { + // yield data from mem table + let (key, key_op) = mem_table_iter.next().unwrap(); + + match key_op { + KeyOp::Insert(value) => { + yield (FullKey::new(table_id, key.clone(), epoch), value.clone()); + } + KeyOp::Delete(_) => {} + KeyOp::Update(_) => unreachable!( + "memtable update should always be paired with a storage key" + ), + } + } + } + } + (Some(Err(_)), Some(_)) => { + // Throw the error. + return Err(inner_stream.next().await.unwrap().unwrap_err()); + } + } + } +} + #[cfg(test)] mod tests { use std::future::{poll_fn, Future}; diff --git a/src/storage/src/mem_table.rs b/src/storage/src/mem_table.rs index 2694f1433c0fa..61807517a20a4 100644 --- a/src/storage/src/mem_table.rs +++ b/src/storage/src/mem_table.rs @@ -12,37 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::cmp::Ordering; use std::collections::btree_map::Entry; use std::collections::BTreeMap; -use std::future::Future; -use std::ops::Bound::{Excluded, Included, Unbounded}; +use std::ops::Bound::{Included, Unbounded}; use std::ops::RangeBounds; -use std::sync::Arc; use bytes::Bytes; -use futures::{pin_mut, Stream, StreamExt}; -use futures_async_stream::try_stream; -use itertools::Itertools; -use risingwave_common::bitmap::Bitmap; -use risingwave_common::catalog::{TableId, TableOption}; -use risingwave_common::hash::{VirtualNode, VnodeBitmapExt}; use risingwave_common_estimate_size::{EstimateSize, KvSize}; -use risingwave_hummock_sdk::key::{prefixed_range_with_vnode, FullKey, TableKey, TableKeyRange}; -use risingwave_hummock_sdk::table_watermark::WatermarkDirection; +use risingwave_hummock_sdk::key::TableKey; use thiserror::Error; use thiserror_ext::AsReport; use tracing::error; -use crate::error::{StorageError, StorageResult}; use crate::hummock::iterator::{Backward, Forward, FromRustIterator, RustIteratorBuilder}; use crate::hummock::shared_buffer::shared_buffer_batch::{SharedBufferBatch, SharedBufferBatchId}; -use crate::hummock::utils::{ - do_delete_sanity_check, do_insert_sanity_check, do_update_sanity_check, sanity_check_enabled, -}; +use crate::hummock::utils::sanity_check_enabled; use crate::hummock::value::HummockValue; use crate::row_serde::value_serde::ValueRowSerde; -use crate::storage_value::StorageValue; use crate::store::*; pub type ImmutableMemtable = SharedBufferBatch; @@ -427,347 +413,6 @@ impl KeyOp { } } -#[try_stream(ok = StateStoreKeyedRow, error = StorageError)] -pub(crate) async fn merge_stream<'a>( - mem_table_iter: impl Iterator, &'a KeyOp)> + 'a, - inner_stream: impl Stream> + 'static, - table_id: TableId, - epoch: u64, - rev: bool, -) { - let inner_stream = inner_stream.peekable(); - pin_mut!(inner_stream); - - let mut mem_table_iter = mem_table_iter.fuse().peekable(); - - loop { - match (inner_stream.as_mut().peek().await, mem_table_iter.peek()) { - (None, None) => break, - // The mem table side has come to an end, return data from the shared storage. - (Some(_), None) => { - let (key, value) = inner_stream.next().await.unwrap()?; - yield (key, value) - } - // The stream side has come to an end, return data from the mem table. - (None, Some(_)) => { - let (key, key_op) = mem_table_iter.next().unwrap(); - match key_op { - KeyOp::Insert(value) | KeyOp::Update((_, value)) => { - yield (FullKey::new(table_id, key.clone(), epoch), value.clone()) - } - _ => {} - } - } - (Some(Ok((inner_key, _))), Some((mem_table_key, _))) => { - debug_assert_eq!(inner_key.user_key.table_id, table_id); - let mut ret = inner_key.user_key.table_key.cmp(mem_table_key); - if rev { - ret = ret.reverse(); - } - match ret { - Ordering::Less => { - // yield data from storage - let (key, value) = inner_stream.next().await.unwrap()?; - yield (key, value); - } - Ordering::Equal => { - // both memtable and storage contain the key, so we advance both - // iterators and return the data in memory. 
- - let (_, key_op) = mem_table_iter.next().unwrap(); - let (key, old_value_in_inner) = inner_stream.next().await.unwrap()?; - match key_op { - KeyOp::Insert(value) => { - yield (key.clone(), value.clone()); - } - KeyOp::Delete(_) => {} - KeyOp::Update((old_value, new_value)) => { - debug_assert!(old_value == &old_value_in_inner); - - yield (key, new_value.clone()); - } - } - } - Ordering::Greater => { - // yield data from mem table - let (key, key_op) = mem_table_iter.next().unwrap(); - - match key_op { - KeyOp::Insert(value) => { - yield (FullKey::new(table_id, key.clone(), epoch), value.clone()); - } - KeyOp::Delete(_) => {} - KeyOp::Update(_) => unreachable!( - "memtable update should always be paired with a storage key" - ), - } - } - } - } - (Some(Err(_)), Some(_)) => { - // Throw the error. - return Err(inner_stream.next().await.unwrap().unwrap_err()); - } - } - } -} - -pub struct MemtableLocalStateStore { - mem_table: MemTable, - inner: S, - - epoch: Option, - - table_id: TableId, - op_consistency_level: OpConsistencyLevel, - table_option: TableOption, - vnodes: Arc, -} - -impl MemtableLocalStateStore { - pub fn new(inner: S, option: NewLocalOptions) -> Self { - Self { - inner, - mem_table: MemTable::new(option.op_consistency_level.clone()), - epoch: None, - table_id: option.table_id, - op_consistency_level: option.op_consistency_level, - table_option: option.table_option, - vnodes: option.vnodes, - } - } - - pub fn inner(&self) -> &S { - &self.inner - } -} - -impl LocalStateStore for MemtableLocalStateStore { - type Iter<'a> = impl StateStoreIter + 'a; - type RevIter<'a> = impl StateStoreIter + 'a; - - async fn get( - &self, - key: TableKey, - read_options: ReadOptions, - ) -> StorageResult> { - match self.mem_table.buffer.get(&key) { - None => self.inner.get(key, self.epoch(), read_options).await, - Some(op) => match op { - KeyOp::Insert(value) | KeyOp::Update((_, value)) => Ok(Some(value.clone())), - KeyOp::Delete(_) => Ok(None), - }, - } - } - - #[allow(clippy::manual_async_fn)] - fn iter( - &self, - key_range: TableKeyRange, - read_options: ReadOptions, - ) -> impl Future>> + Send + '_ { - async move { - let iter = self - .inner - .iter(key_range.clone(), self.epoch(), read_options) - .await?; - Ok(FromStreamStateStoreIter::new(Box::pin(merge_stream( - self.mem_table.iter(key_range), - iter.into_stream(to_owned_item), - self.table_id, - self.epoch(), - false, - )))) - } - } - - #[allow(clippy::manual_async_fn)] - fn rev_iter( - &self, - key_range: TableKeyRange, - read_options: ReadOptions, - ) -> impl Future>> + Send + '_ { - async move { - let iter = self - .inner - .rev_iter(key_range.clone(), self.epoch(), read_options) - .await?; - Ok(FromStreamStateStoreIter::new(Box::pin(merge_stream( - self.mem_table.rev_iter(key_range), - iter.into_stream(to_owned_item), - self.table_id, - self.epoch(), - true, - )))) - } - } - - fn insert( - &mut self, - key: TableKey, - new_val: Bytes, - old_val: Option, - ) -> StorageResult<()> { - match old_val { - None => self.mem_table.insert(key, new_val)?, - Some(old_val) => self.mem_table.update(key, old_val, new_val)?, - }; - Ok(()) - } - - fn delete(&mut self, key: TableKey, old_val: Bytes) -> StorageResult<()> { - Ok(self.mem_table.delete(key, old_val)?) - } - - async fn flush(&mut self) -> StorageResult { - let buffer = self.mem_table.drain().into_parts(); - let mut kv_pairs = Vec::with_capacity(buffer.len()); - for (key, key_op) in buffer { - match key_op { - // Currently, some executors do not strictly comply with these semantics. 
As - // a workaround you may call disable the check by initializing the - // state store with `op_consistency_level=Inconsistent`. - KeyOp::Insert(value) => { - if sanity_check_enabled() { - do_insert_sanity_check( - &key, - &value, - &self.inner, - self.epoch(), - self.table_id, - self.table_option, - &self.op_consistency_level, - ) - .await?; - } - kv_pairs.push((key, StorageValue::new_put(value))); - } - KeyOp::Delete(old_value) => { - if sanity_check_enabled() { - do_delete_sanity_check( - &key, - &old_value, - &self.inner, - self.epoch(), - self.table_id, - self.table_option, - &self.op_consistency_level, - ) - .await?; - } - kv_pairs.push((key, StorageValue::new_delete())); - } - KeyOp::Update((old_value, new_value)) => { - if sanity_check_enabled() { - do_update_sanity_check( - &key, - &old_value, - &new_value, - &self.inner, - self.epoch(), - self.table_id, - self.table_option, - &self.op_consistency_level, - ) - .await?; - } - kv_pairs.push((key, StorageValue::new_put(new_value))); - } - } - } - self.inner.ingest_batch( - kv_pairs, - vec![], - WriteOptions { - epoch: self.epoch(), - table_id: self.table_id, - }, - ) - } - - fn epoch(&self) -> u64 { - self.epoch.expect("should have set the epoch") - } - - fn is_dirty(&self) -> bool { - self.mem_table.is_dirty() - } - - #[allow(clippy::unused_async)] - async fn init(&mut self, options: InitOptions) -> StorageResult<()> { - assert!( - self.epoch.replace(options.epoch.curr).is_none(), - "epoch in local state store of table id {:?} is init for more than once", - self.table_id - ); - - Ok(()) - } - - fn seal_current_epoch(&mut self, next_epoch: u64, opts: SealCurrentEpochOptions) { - assert!(!self.is_dirty()); - if let Some(value_checker) = opts.switch_op_consistency_level { - self.mem_table.op_consistency_level.update(&value_checker); - } - let prev_epoch = self - .epoch - .replace(next_epoch) - .expect("should have init epoch before seal the first epoch"); - assert!( - next_epoch > prev_epoch, - "new epoch {} should be greater than current epoch: {}", - next_epoch, - prev_epoch - ); - if let Some((direction, watermarks)) = opts.table_watermarks { - let delete_ranges = watermarks - .iter() - .flat_map(|vnode_watermark| { - let inner_range = match direction { - WatermarkDirection::Ascending => { - (Unbounded, Excluded(vnode_watermark.watermark().clone())) - } - WatermarkDirection::Descending => { - (Excluded(vnode_watermark.watermark().clone()), Unbounded) - } - }; - vnode_watermark - .vnode_bitmap() - .iter_vnodes() - .map(move |vnode| { - let (start, end) = - prefixed_range_with_vnode(inner_range.clone(), vnode); - (start.map(|key| key.0.clone()), end.map(|key| key.0.clone())) - }) - }) - .collect_vec(); - if let Err(e) = self.inner.ingest_batch( - Vec::new(), - delete_ranges, - WriteOptions { - epoch: self.epoch(), - table_id: self.table_id, - }, - ) { - error!(error = %e.as_report(), "failed to write delete ranges of table watermark"); - } - } - } - - async fn try_flush(&mut self) -> StorageResult<()> { - Ok(()) - } - - fn update_vnode_bitmap(&mut self, vnodes: Arc) -> Arc { - std::mem::replace(&mut self.vnodes, vnodes) - } - - fn get_table_watermark(&self, _vnode: VirtualNode) -> Option { - // TODO: may store the written table watermark and have a correct implementation - None - } -} - #[cfg(test)] mod tests { use bytes::{BufMut, Bytes, BytesMut}; diff --git a/src/storage/src/memory.rs b/src/storage/src/memory.rs index 5f4a631494113..bd388fc47222b 100644 --- a/src/storage/src/memory.rs +++ b/src/storage/src/memory.rs @@ -14,18 
+14,31 @@ use std::cmp::Ordering; use std::collections::{BTreeMap, BTreeSet, VecDeque}; +use std::future::Future; use std::ops::Bound::{Excluded, Included, Unbounded}; use std::ops::{Bound, RangeBounds}; use std::sync::{Arc, LazyLock}; use bytes::Bytes; +use itertools::Itertools; use parking_lot::RwLock; -use risingwave_common::catalog::TableId; -use risingwave_hummock_sdk::key::{FullKey, TableKey, TableKeyRange, UserKey}; +use risingwave_common::bitmap::Bitmap; +use risingwave_common::catalog::{TableId, TableOption}; +use risingwave_common::hash::{VirtualNode, VnodeBitmapExt}; +use risingwave_hummock_sdk::key::{ + prefixed_range_with_vnode, FullKey, TableKey, TableKeyRange, UserKey, +}; +use risingwave_hummock_sdk::table_watermark::WatermarkDirection; use risingwave_hummock_sdk::{HummockEpoch, HummockReadEpoch}; +use thiserror_ext::AsReport; +use tracing::error; use crate::error::StorageResult; -use crate::mem_table::MemtableLocalStateStore; +use crate::hummock::utils::{ + do_delete_sanity_check, do_insert_sanity_check, do_update_sanity_check, merge_stream, + sanity_check_enabled, +}; +use crate::mem_table::{KeyOp, MemTable}; use crate::storage_value::StorageValue; use crate::store::*; @@ -603,7 +616,6 @@ impl RangeKvStateStore { } impl StateStoreRead for RangeKvStateStore { - type ChangeLogIter = RangeKvStateStoreChangeLogIter; type Iter = RangeKvStateStoreIter; type RevIter = RangeKvStateStoreRevIter; @@ -663,6 +675,10 @@ impl StateStoreRead for RangeKvStateStore { true, )) } +} + +impl StateStoreReadLog for RangeKvStateStore { + type ChangeLogIter = RangeKvStateStoreChangeLogIter; async fn iter_log( &self, @@ -692,8 +708,8 @@ impl StateStoreRead for RangeKvStateStore { } } -impl StateStoreWrite for RangeKvStateStore { - fn ingest_batch( +impl RangeKvStateStore { + pub(crate) fn ingest_batch( &self, mut kv_pairs: Vec<(TableKey, StorageValue)>, delete_ranges: Vec<(Bound, Bound)>, @@ -735,7 +751,7 @@ impl StateStoreWrite for RangeKvStateStore { } impl StateStore for RangeKvStateStore { - type Local = MemtableLocalStateStore; + type Local = RangeKvLocalStateStore; #[allow(clippy::unused_async)] async fn try_wait_epoch( @@ -748,7 +764,259 @@ impl StateStore for RangeKvStateStore { } async fn new_local(&self, option: NewLocalOptions) -> Self::Local { - MemtableLocalStateStore::new(self.clone(), option) + RangeKvLocalStateStore::new(self.clone(), option) + } +} + +pub struct RangeKvLocalStateStore { + mem_table: MemTable, + inner: RangeKvStateStore, + + epoch: Option, + + table_id: TableId, + op_consistency_level: OpConsistencyLevel, + table_option: TableOption, + vnodes: Arc, +} + +impl RangeKvLocalStateStore { + pub fn new(inner: RangeKvStateStore, option: NewLocalOptions) -> Self { + Self { + inner, + mem_table: MemTable::new(option.op_consistency_level.clone()), + epoch: None, + table_id: option.table_id, + op_consistency_level: option.op_consistency_level, + table_option: option.table_option, + vnodes: option.vnodes, + } + } +} + +impl LocalStateStore for RangeKvLocalStateStore { + type Iter<'a> = impl StateStoreIter + 'a; + type RevIter<'a> = impl StateStoreIter + 'a; + + async fn get( + &self, + key: TableKey, + read_options: ReadOptions, + ) -> StorageResult> { + match self.mem_table.buffer.get(&key) { + None => self.inner.get(key, self.epoch(), read_options).await, + Some(op) => match op { + KeyOp::Insert(value) | KeyOp::Update((_, value)) => Ok(Some(value.clone())), + KeyOp::Delete(_) => Ok(None), + }, + } + } + + #[allow(clippy::manual_async_fn)] + fn iter( + &self, + key_range: 
TableKeyRange, + read_options: ReadOptions, + ) -> impl Future>> + Send + '_ { + async move { + let iter = self + .inner + .iter(key_range.clone(), self.epoch(), read_options) + .await?; + Ok(FromStreamStateStoreIter::new(Box::pin(merge_stream( + self.mem_table.iter(key_range), + iter.into_stream(to_owned_item), + self.table_id, + self.epoch(), + false, + )))) + } + } + + #[allow(clippy::manual_async_fn)] + fn rev_iter( + &self, + key_range: TableKeyRange, + read_options: ReadOptions, + ) -> impl Future>> + Send + '_ { + async move { + let iter = self + .inner + .rev_iter(key_range.clone(), self.epoch(), read_options) + .await?; + Ok(FromStreamStateStoreIter::new(Box::pin(merge_stream( + self.mem_table.rev_iter(key_range), + iter.into_stream(to_owned_item), + self.table_id, + self.epoch(), + true, + )))) + } + } + + fn insert( + &mut self, + key: TableKey, + new_val: Bytes, + old_val: Option, + ) -> StorageResult<()> { + match old_val { + None => self.mem_table.insert(key, new_val)?, + Some(old_val) => self.mem_table.update(key, old_val, new_val)?, + }; + Ok(()) + } + + fn delete(&mut self, key: TableKey, old_val: Bytes) -> StorageResult<()> { + Ok(self.mem_table.delete(key, old_val)?) + } + + async fn flush(&mut self) -> StorageResult { + let buffer = self.mem_table.drain().into_parts(); + let mut kv_pairs = Vec::with_capacity(buffer.len()); + for (key, key_op) in buffer { + match key_op { + // Currently, some executors do not strictly comply with these semantics. As + // a workaround you may call disable the check by initializing the + // state store with `op_consistency_level=Inconsistent`. + KeyOp::Insert(value) => { + if sanity_check_enabled() { + do_insert_sanity_check( + &key, + &value, + &self.inner, + self.epoch(), + self.table_id, + self.table_option, + &self.op_consistency_level, + ) + .await?; + } + kv_pairs.push((key, StorageValue::new_put(value))); + } + KeyOp::Delete(old_value) => { + if sanity_check_enabled() { + do_delete_sanity_check( + &key, + &old_value, + &self.inner, + self.epoch(), + self.table_id, + self.table_option, + &self.op_consistency_level, + ) + .await?; + } + kv_pairs.push((key, StorageValue::new_delete())); + } + KeyOp::Update((old_value, new_value)) => { + if sanity_check_enabled() { + do_update_sanity_check( + &key, + &old_value, + &new_value, + &self.inner, + self.epoch(), + self.table_id, + self.table_option, + &self.op_consistency_level, + ) + .await?; + } + kv_pairs.push((key, StorageValue::new_put(new_value))); + } + } + } + self.inner.ingest_batch( + kv_pairs, + vec![], + WriteOptions { + epoch: self.epoch(), + table_id: self.table_id, + }, + ) + } + + fn epoch(&self) -> u64 { + self.epoch.expect("should have set the epoch") + } + + fn is_dirty(&self) -> bool { + self.mem_table.is_dirty() + } + + #[allow(clippy::unused_async)] + async fn init(&mut self, options: InitOptions) -> StorageResult<()> { + assert!( + self.epoch.replace(options.epoch.curr).is_none(), + "epoch in local state store of table id {:?} is init for more than once", + self.table_id + ); + + Ok(()) + } + + fn seal_current_epoch(&mut self, next_epoch: u64, opts: SealCurrentEpochOptions) { + assert!(!self.is_dirty()); + if let Some(value_checker) = opts.switch_op_consistency_level { + self.mem_table.op_consistency_level.update(&value_checker); + } + let prev_epoch = self + .epoch + .replace(next_epoch) + .expect("should have init epoch before seal the first epoch"); + assert!( + next_epoch > prev_epoch, + "new epoch {} should be greater than current epoch: {}", + next_epoch, + 
prev_epoch + ); + if let Some((direction, watermarks)) = opts.table_watermarks { + let delete_ranges = watermarks + .iter() + .flat_map(|vnode_watermark| { + let inner_range = match direction { + WatermarkDirection::Ascending => { + (Unbounded, Excluded(vnode_watermark.watermark().clone())) + } + WatermarkDirection::Descending => { + (Excluded(vnode_watermark.watermark().clone()), Unbounded) + } + }; + vnode_watermark + .vnode_bitmap() + .iter_vnodes() + .map(move |vnode| { + let (start, end) = + prefixed_range_with_vnode(inner_range.clone(), vnode); + (start.map(|key| key.0.clone()), end.map(|key| key.0.clone())) + }) + }) + .collect_vec(); + if let Err(e) = self.inner.ingest_batch( + Vec::new(), + delete_ranges, + WriteOptions { + epoch: self.epoch(), + table_id: self.table_id, + }, + ) { + error!(error = %e.as_report(), "failed to write delete ranges of table watermark"); + } + } + } + + async fn try_flush(&mut self) -> StorageResult<()> { + Ok(()) + } + + fn update_vnode_bitmap(&mut self, vnodes: Arc) -> Arc { + std::mem::replace(&mut self.vnodes, vnodes) + } + + fn get_table_watermark(&self, _vnode: VirtualNode) -> Option { + // TODO: may store the written table watermark and have a correct implementation + None } } diff --git a/src/storage/src/monitor/monitored_store.rs b/src/storage/src/monitor/monitored_store.rs index 3e795f72ab47a..96a225cc22635 100644 --- a/src/storage/src/monitor/monitored_store.rs +++ b/src/storage/src/monitor/monitored_store.rs @@ -175,7 +175,6 @@ impl MonitoredStateStore { } impl StateStoreRead for MonitoredStateStore { - type ChangeLogIter = impl StateStoreReadChangeLogIter; type Iter = impl StateStoreReadIter; type RevIter = impl StateStoreReadIter; @@ -217,6 +216,10 @@ impl StateStoreRead for MonitoredStateStore { self.inner.rev_iter(key_range, epoch, read_options), ) } +} + +impl StateStoreReadLog for MonitoredStateStore { + type ChangeLogIter = impl StateStoreReadChangeLogIter; fn iter_log( &self, diff --git a/src/storage/src/monitor/traced_store.rs b/src/storage/src/monitor/traced_store.rs index 4fa24d425953b..fc4aecc6c4054 100644 --- a/src/storage/src/monitor/traced_store.rs +++ b/src/storage/src/monitor/traced_store.rs @@ -285,7 +285,6 @@ impl StateStore for TracedStateStore { } impl StateStoreRead for TracedStateStore { - type ChangeLogIter = impl StateStoreReadChangeLogIter; type Iter = impl StateStoreReadIter; type RevIter = impl StateStoreReadIter; @@ -336,6 +335,10 @@ impl StateStoreRead for TracedStateStore { ); self.traced_iter(self.inner.rev_iter(key_range, epoch, read_options), span) } +} + +impl StateStoreReadLog for TracedStateStore { + type ChangeLogIter = impl StateStoreReadChangeLogIter; fn iter_log( &self, diff --git a/src/storage/src/panic_store.rs b/src/storage/src/panic_store.rs index 1ca087696cc6c..1f04281a38a11 100644 --- a/src/storage/src/panic_store.rs +++ b/src/storage/src/panic_store.rs @@ -13,7 +13,6 @@ // limitations under the License. use std::marker::PhantomData; -use std::ops::Bound; use std::sync::Arc; use bytes::Bytes; @@ -23,7 +22,6 @@ use risingwave_hummock_sdk::key::{TableKey, TableKeyRange}; use risingwave_hummock_sdk::HummockReadEpoch; use crate::error::StorageResult; -use crate::storage_value::StorageValue; use crate::store::*; /// A panic state store. 
If a workload is fully in-memory, we can use this state store to @@ -32,7 +30,6 @@ use crate::store::*; pub struct PanicStateStore; impl StateStoreRead for PanicStateStore { - type ChangeLogIter = PanicStateStoreIter; type Iter = PanicStateStoreIter; type RevIter = PanicStateStoreIter; @@ -65,6 +62,10 @@ impl StateStoreRead for PanicStateStore { ) -> StorageResult { panic!("should not read from the state store!"); } +} + +impl StateStoreReadLog for PanicStateStore { + type ChangeLogIter = PanicStateStoreIter; async fn iter_log( &self, @@ -76,17 +77,6 @@ impl StateStoreRead for PanicStateStore { } } -impl StateStoreWrite for PanicStateStore { - fn ingest_batch( - &self, - _kv_pairs: Vec<(TableKey, StorageValue)>, - _delete_ranges: Vec<(Bound, Bound)>, - _write_options: WriteOptions, - ) -> StorageResult { - panic!("should not write to the state store!"); - } -} - impl LocalStateStore for PanicStateStore { type Iter<'a> = PanicStateStoreIter; type RevIter<'a> = PanicStateStoreIter; diff --git a/src/storage/src/store.rs b/src/storage/src/store.rs index da8fdf206ad5d..531ca629d4d65 100644 --- a/src/storage/src/store.rs +++ b/src/storage/src/store.rs @@ -17,7 +17,6 @@ use std::default::Default; use std::fmt::{Debug, Formatter}; use std::future::Future; use std::marker::PhantomData; -use std::ops::Bound; use std::sync::{Arc, LazyLock}; use bytes::Bytes; @@ -42,7 +41,6 @@ use risingwave_pb::hummock::PbVnodeWatermark; use crate::error::{StorageError, StorageResult}; use crate::hummock::CachePolicy; use crate::monitor::{MonitoredStateStore, MonitoredStorageMetrics}; -use crate::storage_value::StorageValue; pub trait StaticSendSync = Send + Sync + 'static; @@ -59,9 +57,7 @@ impl IterItem for StateStoreReadLogItem { } pub trait StateStoreIter: Send { - fn try_next( - &mut self, - ) -> impl Future>>> + Send + '_; + fn try_next(&mut self) -> impl StorageFuture<'_, Option>>; } pub fn to_owned_item((key, value): StateStoreKeyedRowRef<'_>) -> StorageResult { @@ -238,11 +234,22 @@ pub struct ReadLogOptions { } pub trait StateStoreReadChangeLogIter = StateStoreIter + Send + 'static; +pub trait StorageFuture<'a, T> = Future> + Send + 'a; + +pub trait StateStoreReadLog: StaticSendSync { + type ChangeLogIter: StateStoreReadChangeLogIter; + + fn iter_log( + &self, + epoch_range: (u64, u64), + key_range: TableKeyRange, + options: ReadLogOptions, + ) -> impl StorageFuture<'_, Self::ChangeLogIter>; +} pub trait StateStoreRead: StaticSendSync { type Iter: StateStoreReadIter; type RevIter: StateStoreReadIter; - type ChangeLogIter: StateStoreReadChangeLogIter; /// Point gets a value from the state store. /// The result is based on a snapshot corresponding to the given `epoch`. @@ -252,7 +259,7 @@ pub trait StateStoreRead: StaticSendSync { key: TableKey, epoch: u64, read_options: ReadOptions, - ) -> impl Future>> + Send + '_; + ) -> impl StorageFuture<'_, Option>; /// Point gets a value from the state store. /// The result is based on a snapshot corresponding to the given `epoch`. 
@@ -262,7 +269,7 @@ pub trait StateStoreRead: StaticSendSync { key: TableKey, epoch: u64, read_options: ReadOptions, - ) -> impl Future>> + Send + '_ { + ) -> impl StorageFuture<'_, Option> { self.get_keyed_row(key, epoch, read_options) .map_ok(|v| v.map(|(_, v)| v)) } @@ -277,21 +284,14 @@ pub trait StateStoreRead: StaticSendSync { key_range: TableKeyRange, epoch: u64, read_options: ReadOptions, - ) -> impl Future> + Send + '_; + ) -> impl StorageFuture<'_, Self::Iter>; fn rev_iter( &self, key_range: TableKeyRange, epoch: u64, read_options: ReadOptions, - ) -> impl Future> + Send + '_; - - fn iter_log( - &self, - epoch_range: (u64, u64), - key_range: TableKeyRange, - options: ReadLogOptions, - ) -> impl Future> + Send + '_; + ) -> impl StorageFuture<'_, Self::RevIter>; } pub trait StateStoreReadExt: StaticSendSync { @@ -308,7 +308,7 @@ pub trait StateStoreReadExt: StaticSendSync { epoch: u64, limit: Option, read_options: ReadOptions, - ) -> impl Future>> + Send + '_; + ) -> impl StorageFuture<'_, Vec>; } impl StateStoreReadExt for S { @@ -333,29 +333,6 @@ impl StateStoreReadExt for S { } } -pub trait StateStoreWrite: StaticSendSync { - /// Writes a batch to storage. The batch should be: - /// * Ordered. KV pairs will be directly written to the table, so it must be ordered. - /// * Locally unique. There should not be two or more operations on the same key in one write - /// batch. - /// - /// Ingests a batch of data into the state store. One write batch should never contain operation - /// on the same key. e.g. Put(233, x) then Delete(233). - /// An epoch should be provided to ingest a write batch. It is served as: - /// - A handle to represent an atomic write session. All ingested write batches associated with - /// the same `Epoch` have the all-or-nothing semantics, meaning that partial changes are not - /// queryable and will be rolled back if instructed. - /// - A version of a kv pair. kv pair associated with larger `Epoch` is guaranteed to be newer - /// then kv pair with smaller `Epoch`. Currently this version is only used to derive the - /// per-key modification history (e.g. in compaction), not across different keys. - fn ingest_batch( - &self, - kv_pairs: Vec<(TableKey, StorageValue)>, - delete_ranges: Vec<(Bound, Bound)>, - write_options: WriteOptions, - ) -> StorageResult; -} - #[derive(Clone)] pub struct TryWaitEpochOptions { pub table_id: TableId, @@ -384,7 +361,7 @@ impl From for TracedTryWaitEpochOptions { } } -pub trait StateStore: StateStoreRead + StaticSendSync + Clone { +pub trait StateStore: StateStoreRead + StateStoreReadLog + StaticSendSync + Clone { type Local: LocalStateStore; /// If epoch is `Committed`, we will wait until the epoch is committed and its data is ready to @@ -393,7 +370,7 @@ pub trait StateStore: StateStoreRead + StaticSendSync + Clone { &self, epoch: HummockReadEpoch, options: TryWaitEpochOptions, - ) -> impl Future> + Send + '_; + ) -> impl StorageFuture<'_, ()>; /// Creates a [`MonitoredStateStore`] from this state store, with given `stats`. 
fn monitored(self, storage_metrics: Arc) -> MonitoredStateStore { @@ -416,7 +393,7 @@ pub trait LocalStateStore: StaticSendSync { &self, key: TableKey, read_options: ReadOptions, - ) -> impl Future>> + Send + '_; + ) -> impl StorageFuture<'_, Option>; /// Opens and returns an iterator for given `prefix_hint` and `full_key_range` /// Internally, `prefix_hint` will be used to for checking `bloom_filter` and @@ -427,13 +404,13 @@ pub trait LocalStateStore: StaticSendSync { &self, key_range: TableKeyRange, read_options: ReadOptions, - ) -> impl Future>> + Send + '_; + ) -> impl StorageFuture<'_, Self::Iter<'_>>; fn rev_iter( &self, key_range: TableKeyRange, read_options: ReadOptions, - ) -> impl Future>> + Send + '_; + ) -> impl StorageFuture<'_, Self::RevIter<'_>>; /// Get last persisted watermark for a given vnode. fn get_table_watermark(&self, vnode: VirtualNode) -> Option; @@ -450,9 +427,9 @@ pub trait LocalStateStore: StaticSendSync { /// than the given `epoch` will be deleted. fn delete(&mut self, key: TableKey, old_val: Bytes) -> StorageResult<()>; - fn flush(&mut self) -> impl Future> + Send + '_; + fn flush(&mut self) -> impl StorageFuture<'_, usize>; - fn try_flush(&mut self) -> impl Future> + Send + '_; + fn try_flush(&mut self) -> impl StorageFuture<'_, ()>; fn epoch(&self) -> u64; fn is_dirty(&self) -> bool; @@ -463,7 +440,7 @@ pub trait LocalStateStore: StaticSendSync { /// In some cases like replicated state table, state table may not be empty initially, /// as such we need to wait for `epoch.prev` checkpoint to complete, /// hence this interface is made async. - fn init(&mut self, opts: InitOptions) -> impl Future> + Send + '_; + fn init(&mut self, opts: InitOptions) -> impl StorageFuture<'_, ()>; /// Updates the monotonically increasing write epoch to `new_epoch`. /// All writes after this function is called will be tagged with `new_epoch`. 
In other words, diff --git a/src/storage/src/store_impl.rs b/src/storage/src/store_impl.rs index 87883911c58e2..6488e9c3f23b6 100644 --- a/src/storage/src/store_impl.rs +++ b/src/storage/src/store_impl.rs @@ -228,7 +228,7 @@ pub mod verify { use std::fmt::Debug; use std::future::Future; use std::marker::PhantomData; - use std::ops::{Bound, Deref}; + use std::ops::Deref; use std::sync::Arc; use bytes::Bytes; @@ -240,7 +240,6 @@ pub mod verify { use crate::error::StorageResult; use crate::hummock::HummockStorage; - use crate::storage_value::StorageValue; use crate::store::*; use crate::store_impl::AsHummock; @@ -263,6 +262,7 @@ pub mod verify { } } + #[derive(Clone)] pub struct VerifyStateStore { pub actual: A, pub expected: Option, @@ -276,7 +276,6 @@ pub mod verify { } impl StateStoreRead for VerifyStateStore { - type ChangeLogIter = impl StateStoreReadChangeLogIter; type Iter = impl StateStoreReadIter; type RevIter = impl StateStoreReadIter; @@ -342,6 +341,10 @@ pub mod verify { Ok(verify_iter::(actual, expected)) } } + } + + impl StateStoreReadLog for VerifyStateStore { + type ChangeLogIter = impl StateStoreReadChangeLogIter; async fn iter_log( &self, @@ -392,36 +395,6 @@ pub mod verify { } } - impl StateStoreWrite for VerifyStateStore { - fn ingest_batch( - &self, - kv_pairs: Vec<(TableKey, StorageValue)>, - delete_ranges: Vec<(Bound, Bound)>, - write_options: WriteOptions, - ) -> StorageResult { - let actual = self.actual.ingest_batch( - kv_pairs.clone(), - delete_ranges.clone(), - write_options.clone(), - ); - if let Some(expected) = &self.expected { - let expected = expected.ingest_batch(kv_pairs, delete_ranges, write_options); - assert_eq!(actual.is_err(), expected.is_err()); - } - actual - } - } - - impl Clone for VerifyStateStore { - fn clone(&self) -> Self { - Self { - actual: self.actual.clone(), - expected: self.expected.clone(), - _phantom: PhantomData, - } - } - } - impl LocalStateStore for VerifyStateStore { type Iter<'a> = impl StateStoreIter + 'a; type RevIter<'a> = impl StateStoreIter + 'a; @@ -806,7 +779,7 @@ impl AsHummock for SledStateStore { } #[cfg(debug_assertions)] -pub mod boxed_state_store { +mod boxed_state_store { use std::future::Future; use std::ops::{Deref, DerefMut}; use std::sync::Arc; @@ -873,7 +846,10 @@ pub mod boxed_state_store { epoch: u64, read_options: ReadOptions, ) -> StorageResult; + } + #[async_trait::async_trait] + pub trait DynamicDispatchedStateStoreReadLog: StaticSendSync { async fn iter_log( &self, epoch_range: (u64, u64), @@ -912,7 +888,10 @@ pub mod boxed_state_store { self.rev_iter(key_range, epoch, read_options).await?, )) } + } + #[async_trait::async_trait] + impl DynamicDispatchedStateStoreReadLog for S { async fn iter_log( &self, epoch_range: (u64, u64), @@ -1159,7 +1138,6 @@ pub mod boxed_state_store { pub type BoxDynamicDispatchedStateStore = Box; impl StateStoreRead for BoxDynamicDispatchedStateStore { - type ChangeLogIter = BoxStateStoreReadChangeLogIter; type Iter = BoxStateStoreReadIter; type RevIter = BoxStateStoreReadIter; @@ -1189,6 +1167,10 @@ pub mod boxed_state_store { ) -> impl Future> + '_ { self.deref().rev_iter(key_range, epoch, read_options) } + } + + impl StateStoreReadLog for BoxDynamicDispatchedStateStore { + type ChangeLogIter = BoxStateStoreReadChangeLogIter; fn iter_log( &self, @@ -1201,7 +1183,11 @@ pub mod boxed_state_store { } pub trait DynamicDispatchedStateStore: - DynClone + DynamicDispatchedStateStoreRead + DynamicDispatchedStateStoreExt + AsHummock + DynClone + + DynamicDispatchedStateStoreRead + + 
DynamicDispatchedStateStoreReadLog + + DynamicDispatchedStateStoreExt + + AsHummock { } @@ -1214,7 +1200,11 @@ pub mod boxed_state_store { } impl< - S: DynClone + DynamicDispatchedStateStoreRead + DynamicDispatchedStateStoreExt + AsHummock, + S: DynClone + + DynamicDispatchedStateStoreRead + + DynamicDispatchedStateStoreReadLog + + DynamicDispatchedStateStoreExt + + AsHummock, > DynamicDispatchedStateStore for S { } diff --git a/src/stream/src/common/log_store_impl/kv_log_store/reader.rs b/src/stream/src/common/log_store_impl/kv_log_store/reader.rs index c27cf711a3ad0..20c8e7fbcb576 100644 --- a/src/stream/src/common/log_store_impl/kv_log_store/reader.rs +++ b/src/stream/src/common/log_store_impl/kv_log_store/reader.rs @@ -674,54 +674,65 @@ impl LogReader for KvLogStoreReader { #[cfg(test)] mod tests { - use std::ops::Bound::Unbounded; + use std::collections::{Bound, HashSet}; use bytes::Bytes; use itertools::Itertools; - use risingwave_common::util::epoch::test_epoch; - use risingwave_hummock_sdk::key::TableKey; + use risingwave_common::hash::VirtualNode; + use risingwave_common::util::epoch::{test_epoch, EpochExt}; + use risingwave_hummock_sdk::key::{prefixed_range_with_vnode, KeyPayloadType, TableKey}; + use risingwave_hummock_test::local_state_store_test_utils::LocalStateStoreTestExt; + use risingwave_hummock_test::test_utils::prepare_hummock_test_env; use risingwave_storage::hummock::iterator::test_utils::{ iterator_test_table_key_of, iterator_test_value_of, }; - use risingwave_storage::memory::MemoryStateStore; - use risingwave_storage::storage_value::StorageValue; - use risingwave_storage::store::{ReadOptions, StateStoreRead, StateStoreWrite, WriteOptions}; - use risingwave_storage::StateStoreIter; + use risingwave_storage::store::{ + LocalStateStore, NewLocalOptions, ReadOptions, SealCurrentEpochOptions, StateStoreRead, + }; + use risingwave_storage::{StateStore, StateStoreIter}; use crate::common::log_store_impl::kv_log_store::reader::AutoRebuildStateStoreReadIter; use crate::common::log_store_impl::kv_log_store::test_utils::TEST_TABLE_ID; #[tokio::test] async fn test_auto_rebuild_iter() { - let state_store = MemoryStateStore::new(); + let test_env = prepare_hummock_test_env().await; + test_env.register_table_id(TEST_TABLE_ID).await; + let mut state_store = test_env + .storage + .new_local(NewLocalOptions::for_test(TEST_TABLE_ID)) + .await; + let epoch = test_epoch(1); + test_env + .storage + .start_epoch(epoch, HashSet::from_iter([TEST_TABLE_ID])); + state_store.init_for_test(epoch).await.unwrap(); let key_count = 100; let pairs = (0..key_count) .map(|i| { let key = iterator_test_table_key_of(i); let value = iterator_test_value_of(i); - (TableKey(Bytes::from(key)), StorageValue::new_put(value)) + (TableKey(Bytes::from(key)), Bytes::from(value)) }) .collect_vec(); - let epoch = test_epoch(1); - state_store - .ingest_batch( - pairs.clone(), - vec![], - WriteOptions { - epoch, - table_id: TEST_TABLE_ID, - }, - ) - .unwrap(); + for (key, value) in &pairs { + state_store + .insert(key.clone(), value.clone(), None) + .unwrap(); + } + state_store.flush().await.unwrap(); + state_store.seal_current_epoch(epoch.next_epoch(), SealCurrentEpochOptions::for_test()); + test_env.commit_epoch(epoch).await; + let state_store = test_env.storage.clone(); async fn validate( - mut kv_iter: impl Iterator, StorageValue)>, + mut kv_iter: impl Iterator, Bytes)>, mut iter: impl StateStoreIter, ) { while let Some((key, value)) = iter.try_next().await.unwrap() { let (k, v) = kv_iter.next().unwrap(); 
assert_eq!(key.user_key.table_key, k.to_ref()); - assert_eq!(v.user_value.as_deref(), Some(value)); + assert_eq!(v.as_ref(), value); } assert!(kv_iter.next().is_none()); } @@ -730,10 +741,17 @@ mod tests { table_id: TEST_TABLE_ID, ..Default::default() }; + let key_range = prefixed_range_with_vnode( + ( + Bound::::Unbounded, + Bound::::Unbounded, + ), + VirtualNode::ZERO, + ); let kv_iter = pairs.clone().into_iter(); let iter = state_store - .iter((Unbounded, Unbounded), epoch, read_options.clone()) + .iter(key_range.clone(), epoch, read_options.clone()) .await .unwrap(); validate(kv_iter, iter).await; @@ -755,7 +773,7 @@ mod tests { false } }, - (Unbounded, Unbounded), + key_range.clone(), epoch, read_options, ) From 1bc6bea26cd2c103b827837b007f3a1d5e2d6067 Mon Sep 17 00:00:00 2001 From: Bohan Zhang Date: Wed, 8 Jan 2025 16:28:41 +0800 Subject: [PATCH 07/21] feat: batching telemetry event request avoid too many requests (#20000) Co-authored-by: tabversion --- proto/telemetry.proto | 7 +++-- src/common/src/telemetry/report.rs | 37 ++++++++++++++++++++++++--- src/common/telemetry_event/src/lib.rs | 35 +++++++++++++++++-------- 3 files changed, 63 insertions(+), 16 deletions(-) diff --git a/proto/telemetry.proto b/proto/telemetry.proto index 06e161506db15..162ab80648aba 100644 --- a/proto/telemetry.proto +++ b/proto/telemetry.proto @@ -7,8 +7,7 @@ option go_package = "risingwavelabs.com/risingwave/proto/telemetry"; enum MetaBackend { META_BACKEND_UNSPECIFIED = 0; META_BACKEND_MEMORY = 1; - reserved 2; - reserved "META_BACKEND_ETCD"; + META_BACKEND_ETCD = 2; META_BACKEND_RDB = 3; } @@ -167,3 +166,7 @@ message EventMessage { // mark the event is a test message bool is_test = 11; } + +message BatchEventMessage { + repeated EventMessage events = 1; +} diff --git a/src/common/src/telemetry/report.rs b/src/common/src/telemetry/report.rs index d4b1fd27335df..f4737038863b0 100644 --- a/src/common/src/telemetry/report.rs +++ b/src/common/src/telemetry/report.rs @@ -14,13 +14,18 @@ use std::sync::Arc; -use risingwave_telemetry_event::get_telemetry_risingwave_cloud_uuid; +use risingwave_pb::telemetry::PbEventMessage; pub use risingwave_telemetry_event::{ - current_timestamp, post_telemetry_report_pb, TELEMETRY_REPORT_URL, TELEMETRY_TRACKING_ID, + current_timestamp, do_telemetry_event_report, post_telemetry_report_pb, + TELEMETRY_EVENT_REPORT_INTERVAL, TELEMETRY_REPORT_URL, TELEMETRY_TRACKING_ID, +}; +use risingwave_telemetry_event::{ + get_telemetry_risingwave_cloud_uuid, TELEMETRY_EVENT_REPORT_STASH_SIZE, + TELEMETRY_EVENT_REPORT_TX, }; use tokio::sync::oneshot::Sender; use tokio::task::JoinHandle; -use tokio::time::{interval, Duration}; +use tokio::time::{interval as tokio_interval_fn, Duration}; use uuid::Uuid; use super::{Result, TELEMETRY_REPORT_INTERVAL}; @@ -60,9 +65,13 @@ where let begin_time = std::time::Instant::now(); let session_id = Uuid::new_v4().to_string(); - let mut interval = interval(Duration::from_secs(TELEMETRY_REPORT_INTERVAL)); + let mut interval = tokio_interval_fn(Duration::from_secs(TELEMETRY_REPORT_INTERVAL)); interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + let mut event_interval = + tokio_interval_fn(Duration::from_secs(TELEMETRY_EVENT_REPORT_INTERVAL)); + event_interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + // fetch telemetry tracking_id from the meta node only at the beginning // There is only one case tracking_id updated at the runtime ---- metastore data has been // cleaned. 
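The loop this patch adds to the reporting task (continued just below) batches telemetry events instead of issuing one request per event: events are pushed through an unbounded channel, stashed in memory, and flushed as a single batched request either when the stash reaches `TELEMETRY_EVENT_REPORT_STASH_SIZE` or when the event interval fires. A stand-alone sketch of that size-or-time flush pattern, assuming a tokio runtime; `report` stands in for `do_telemetry_event_report`, and the constants only mirror the values used in the patch:

    use tokio::sync::mpsc;
    use tokio::time::{interval, Duration, MissedTickBehavior};

    const STASH_SIZE: usize = 100;                            // flush when this many events are buffered
    const FLUSH_INTERVAL: Duration = Duration::from_secs(10); // ...or when this much time has passed

    async fn report(stash: &mut Vec<String>) {
        if stash.is_empty() {
            return;
        }
        println!("reporting {} events in one request", stash.len());
        stash.clear(); // the real code drains the stash into one batched protobuf message
    }

    #[tokio::main]
    async fn main() {
        let (tx, mut rx) = mpsc::unbounded_channel::<String>();

        // Producer side: anything holding the sender can enqueue events cheaply.
        tokio::spawn(async move {
            for i in 0..250 {
                let _ = tx.send(format!("event-{i}"));
            }
        });

        let mut ticker = interval(FLUSH_INTERVAL);
        ticker.set_missed_tick_behavior(MissedTickBehavior::Skip);
        let mut stash = Vec::new();

        loop {
            tokio::select! {
                maybe_event = rx.recv() => {
                    match maybe_event {
                        Some(event) => {
                            stash.push(event);
                            if stash.len() >= STASH_SIZE {
                                report(&mut stash).await; // size-triggered flush
                            }
                        }
                        // Channel closed: flush whatever is left and stop.
                        None => { report(&mut stash).await; break; }
                    }
                }
                _ = ticker.tick() => {
                    report(&mut stash).await; // time-triggered flush
                }
            }
        }
    }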
There is no way that metastore has been cleaned but nodes are still running @@ -91,9 +100,29 @@ where ) }); + let (tx, mut event_rx) = tokio::sync::mpsc::unbounded_channel::(); + TELEMETRY_EVENT_REPORT_TX.set(tx).unwrap_or_else(|_| { + tracing::warn!( + "Telemetry failed to set event reporting tx, event reporting will be disabled" + ); + }); + let mut event_stash = Vec::new(); + loop { tokio::select! { _ = interval.tick() => {}, + event = event_rx.recv() => { + debug_assert!(event.is_some()); + event_stash.push(event.unwrap()); + if event_stash.len() >= TELEMETRY_EVENT_REPORT_STASH_SIZE { + do_telemetry_event_report(&mut event_stash).await; + } + continue; + } + _ = event_interval.tick() => { + do_telemetry_event_report(&mut event_stash).await; + continue; + }, _ = &mut shutdown_rx => { tracing::info!("Telemetry exit"); return; diff --git a/src/common/telemetry_event/src/lib.rs b/src/common/telemetry_event/src/lib.rs index 0be5f40e0de1c..18874f4818627 100644 --- a/src/common/telemetry_event/src/lib.rs +++ b/src/common/telemetry_event/src/lib.rs @@ -21,9 +21,11 @@ use std::sync::OnceLock; use prost::Message; use risingwave_pb::telemetry::{ - EventMessage as PbEventMessage, PbTelemetryDatabaseObject, + EventMessage as PbEventMessage, PbBatchEventMessage, PbTelemetryDatabaseObject, TelemetryEventStage as PbTelemetryEventStage, }; +use thiserror_ext::AsReport; +use tokio::sync::mpsc::UnboundedSender; pub use util::*; pub type TelemetryResult = core::result::Result; @@ -32,6 +34,7 @@ pub type TelemetryResult = core::result::Result; pub type TelemetryError = String; pub static TELEMETRY_TRACKING_ID: OnceLock = OnceLock::new(); +pub static TELEMETRY_EVENT_REPORT_TX: OnceLock> = OnceLock::new(); pub const TELEMETRY_REPORT_URL: &str = "https://telemetry.risingwave.dev/api/v2/report"; @@ -42,6 +45,21 @@ pub fn get_telemetry_risingwave_cloud_uuid() -> Option { env::var(TELEMETRY_RISINGWAVE_CLOUD_UUID).ok() } +pub async fn do_telemetry_event_report(event_stash: &mut Vec) { + const TELEMETRY_EVENT_REPORT_TYPE: &str = "events"; // the batch report url + let url = (TELEMETRY_REPORT_URL.to_owned() + "/" + TELEMETRY_EVENT_REPORT_TYPE).to_owned(); + let batch_message = PbBatchEventMessage { + events: std::mem::take(event_stash), + }; + + post_telemetry_report_pb(&url, batch_message.encode_to_vec()) + .await + .unwrap_or_else(|e| tracing::debug!("{}", e)); +} + +pub const TELEMETRY_EVENT_REPORT_INTERVAL: u64 = 10; // 10 seconds +pub const TELEMETRY_EVENT_REPORT_STASH_SIZE: usize = 100; // 100 events to trigger a report action + pub fn report_event_common( event_stage: PbTelemetryEventStage, event_name: &str, @@ -95,15 +113,12 @@ pub fn request_to_telemetry_event( node, is_test, }; - let report_bytes = event.encode_to_vec(); - - tokio::spawn(async move { - const TELEMETRY_EVENT_REPORT_TYPE: &str = "event"; - let url = (TELEMETRY_REPORT_URL.to_owned() + "/" + TELEMETRY_EVENT_REPORT_TYPE).to_owned(); - post_telemetry_report_pb(&url, report_bytes) - .await - .unwrap_or_else(|e| tracing::info!("{}", e)) - }); + + if let Some(tx) = TELEMETRY_EVENT_REPORT_TX.get() { + let _ = tx.send(event).inspect_err(|e| { + tracing::warn!("Failed to send telemetry event queue: {}", e.as_report()) + }); + } } #[cfg(test)] From bb4dec288c5d93cc12c549e56ca7d11b0e827dc3 Mon Sep 17 00:00:00 2001 From: xxchan Date: Wed, 8 Jan 2025 21:50:11 +0800 Subject: [PATCH 08/21] refactor(meta): splits source_manager into smaller mods (#20071) Signed-off-by: xxchan --- src/meta/src/rpc/ddl_controller.rs | 4 +- 
src/meta/src/stream/source_manager.rs | 1131 +---------------- .../stream/source_manager/split_assignment.rs | 830 ++++++++++++ src/meta/src/stream/source_manager/worker.rs | 320 +++++ 4 files changed, 1159 insertions(+), 1126 deletions(-) create mode 100644 src/meta/src/stream/source_manager/split_assignment.rs create mode 100644 src/meta/src/stream/source_manager/worker.rs diff --git a/src/meta/src/rpc/ddl_controller.rs b/src/meta/src/rpc/ddl_controller.rs index 70419e6118b13..5c203df91748f 100644 --- a/src/meta/src/rpc/ddl_controller.rs +++ b/src/meta/src/rpc/ddl_controller.rs @@ -75,7 +75,7 @@ use crate::manager::{ }; use crate::model::{StreamContext, StreamJobFragments, TableParallelism}; use crate::stream::{ - create_source_worker_handle, validate_sink, ActorGraphBuildResult, ActorGraphBuilder, + create_source_worker, validate_sink, ActorGraphBuildResult, ActorGraphBuilder, CompleteStreamFragmentGraph, CreateStreamingJobContext, CreateStreamingJobOption, GlobalStreamManagerRef, ReplaceStreamJobContext, SourceChange, SourceManagerRef, StreamFragmentGraph, @@ -441,7 +441,7 @@ impl DdlController { /// Shared source is handled in [`Self::create_streaming_job`] async fn create_non_shared_source(&self, source: Source) -> MetaResult { - let handle = create_source_worker_handle(&source, self.source_manager.metrics.clone()) + let handle = create_source_worker(&source, self.source_manager.metrics.clone()) .await .context("failed to create source worker")?; diff --git a/src/meta/src/stream/source_manager.rs b/src/meta/src/stream/source_manager.rs index 958ce8a802bfc..6d2ea50d0a5cd 100644 --- a/src/meta/src/stream/source_manager.rs +++ b/src/meta/src/stream/source_manager.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +mod split_assignment; +mod worker; use std::borrow::BorrowMut; use std::cmp::Ordering; use std::collections::hash_map::Entry; @@ -40,6 +42,8 @@ use tokio::sync::{oneshot, Mutex}; use tokio::task::JoinHandle; use tokio::time::MissedTickBehavior; use tokio::{select, time}; +pub use worker::create_source_worker; +use worker::{create_source_worker_async, ConnectorSourceWorkerHandle}; use crate::barrier::{BarrierScheduler, Command}; use crate::manager::MetadataManager; @@ -51,7 +55,7 @@ pub type SourceManagerRef = Arc; pub type SplitAssignment = HashMap>>; pub type ThrottleConfig = HashMap>>; -/// `SourceManager` keeps fetching the latest split metadata from the external source services ([`ConnectorSourceWorker::tick`]), +/// `SourceManager` keeps fetching the latest split metadata from the external source services ([`worker::ConnectorSourceWorker::tick`]), /// and sends a split assignment command if split changes detected ([`Self::tick`]). pub struct SourceManager { pub paused: Mutex<()>, @@ -59,226 +63,6 @@ pub struct SourceManager { core: Mutex, pub metrics: Arc, } - -const MAX_FAIL_CNT: u32 = 10; -const DEFAULT_SOURCE_TICK_TIMEOUT: Duration = Duration::from_secs(10); - -struct SharedSplitMap { - splits: Option>, -} - -type SharedSplitMapRef = Arc>; - -/// `ConnectorSourceWorker` keeps fetching the latest split metadata from the external source service ([`Self::tick`]), -/// and maintains it in `current_splits`. 
-struct ConnectorSourceWorker { - source_id: SourceId, - source_name: String, - current_splits: SharedSplitMapRef, - enumerator: P::SplitEnumerator, - period: Duration, - metrics: Arc, - connector_properties: P, - fail_cnt: u32, - source_is_up: LabelGuardedIntGauge<2>, -} - -fn extract_prop_from_existing_source(source: &Source) -> ConnectorResult { - let options_with_secret = - WithOptionsSecResolved::new(source.with_properties.clone(), source.secret_refs.clone()); - let mut properties = ConnectorProperties::extract(options_with_secret, false)?; - properties.init_from_pb_source(source); - Ok(properties) -} -fn extract_prop_from_new_source(source: &Source) -> ConnectorResult { - let options_with_secret = - WithOptionsSecResolved::new(source.with_properties.clone(), source.secret_refs.clone()); - let mut properties = ConnectorProperties::extract(options_with_secret, true)?; - properties.init_from_pb_source(source); - Ok(properties) -} - -/// Used to create a new `ConnectorSourceWorkerHandle` for a new source. -/// -/// It will call `ConnectorSourceWorker::tick()` to fetch split metadata once before returning. -pub async fn create_source_worker_handle( - source: &Source, - metrics: Arc, -) -> MetaResult { - tracing::info!("spawning new watcher for source {}", source.id); - - let splits = Arc::new(Mutex::new(SharedSplitMap { splits: None })); - let current_splits_ref = splits.clone(); - - let connector_properties = extract_prop_from_new_source(source)?; - let enable_scale_in = connector_properties.enable_drop_split(); - let enable_adaptive_splits = connector_properties.enable_adaptive_splits(); - let (sync_call_tx, sync_call_rx) = tokio::sync::mpsc::unbounded_channel(); - let handle = dispatch_source_prop!(connector_properties, prop, { - let mut worker = ConnectorSourceWorker::create( - source, - *prop, - DEFAULT_SOURCE_WORKER_TICK_INTERVAL, - current_splits_ref.clone(), - metrics, - ) - .await?; - - // if fail to fetch meta info, will refuse to create source - - // todo: make the timeout configurable, longer than `properties.sync.call.timeout` - // in kafka - tokio::time::timeout(DEFAULT_SOURCE_TICK_TIMEOUT, worker.tick()) - .await - .ok() - .with_context(|| { - format!( - "failed to fetch meta info for source {}, timeout {:?}", - source.id, DEFAULT_SOURCE_TICK_TIMEOUT - ) - })??; - - tokio::spawn(async move { worker.run(sync_call_rx).await }) - }); - Ok(ConnectorSourceWorkerHandle { - handle, - sync_call_tx, - splits, - enable_drop_split: enable_scale_in, - enable_adaptive_splits, - }) -} - -const DEFAULT_SOURCE_WORKER_TICK_INTERVAL: Duration = Duration::from_secs(30); - -impl ConnectorSourceWorker
{ - /// Recreate the `SplitEnumerator` to establish a new connection to the external source service. - async fn refresh(&mut self) -> MetaResult<()> { - let enumerator = P::SplitEnumerator::new( - self.connector_properties.clone(), - Arc::new(SourceEnumeratorContext { - metrics: self.metrics.source_enumerator_metrics.clone(), - info: SourceEnumeratorInfo { - source_id: self.source_id as u32, - }, - }), - ) - .await - .context("failed to create SplitEnumerator")?; - self.enumerator = enumerator; - self.fail_cnt = 0; - tracing::info!("refreshed source enumerator: {}", self.source_name); - Ok(()) - } - - /// On creation, connection to the external source service will be established, but `splits` - /// will not be updated until `tick` is called. - pub async fn create( - source: &Source, - connector_properties: P, - period: Duration, - splits: Arc>, - metrics: Arc, - ) -> MetaResult { - let enumerator = P::SplitEnumerator::new( - connector_properties.clone(), - Arc::new(SourceEnumeratorContext { - metrics: metrics.source_enumerator_metrics.clone(), - info: SourceEnumeratorInfo { - source_id: source.id, - }, - }), - ) - .await - .context("failed to create SplitEnumerator")?; - - let source_is_up = metrics - .source_is_up - .with_guarded_label_values(&[source.id.to_string().as_str(), &source.name]); - - Ok(Self { - source_id: source.id as SourceId, - source_name: source.name.clone(), - current_splits: splits, - enumerator, - period, - metrics, - connector_properties, - fail_cnt: 0, - source_is_up, - }) - } - - pub async fn run( - &mut self, - mut sync_call_rx: UnboundedReceiver>>, - ) { - let mut interval = time::interval(self.period); - interval.set_missed_tick_behavior(MissedTickBehavior::Skip); - loop { - select! { - biased; - tx = sync_call_rx.borrow_mut().recv() => { - if let Some(tx) = tx { - let _ = tx.send(self.tick().await); - } - } - _ = interval.tick() => { - if self.fail_cnt > MAX_FAIL_CNT { - if let Err(e) = self.refresh().await { - tracing::error!(error = %e.as_report(), "error happened when refresh from connector source worker"); - } - } - if let Err(e) = self.tick().await { - tracing::error!(error = %e.as_report(), "error happened when tick from connector source worker"); - } - } - } - } - } - - /// Uses [`SplitEnumerator`] to fetch the latest split metadata from the external source service. - async fn tick(&mut self) -> MetaResult<()> { - let source_is_up = |res: i64| { - self.source_is_up.set(res); - }; - let splits = self.enumerator.list_splits().await.inspect_err(|_| { - source_is_up(0); - self.fail_cnt += 1; - })?; - source_is_up(1); - self.fail_cnt = 0; - let mut current_splits = self.current_splits.lock().await; - current_splits.splits.replace( - splits - .into_iter() - .map(|split| (split.id(), P::Split::into(split))) - .collect(), - ); - - Ok(()) - } -} - -/// Handle for a running [`ConnectorSourceWorker`]. 
-pub struct ConnectorSourceWorkerHandle { - handle: JoinHandle<()>, - sync_call_tx: UnboundedSender>>, - splits: SharedSplitMapRef, - enable_drop_split: bool, - enable_adaptive_splits: bool, -} - -impl ConnectorSourceWorkerHandle { - async fn discovered_splits(&self) -> Option> { - self.splits.lock().await.splits.clone() - } - - pub fn get_enable_adaptive_splits(&self) -> bool { - self.enable_adaptive_splits - } -} - pub struct SourceManagerCore { metadata_manager: MetadataManager, @@ -300,36 +84,6 @@ pub struct SourceManagerRunningInfo { pub actor_splits: HashMap>, } -async fn handle_discover_splits( - handle: &ConnectorSourceWorkerHandle, - source_id: SourceId, - actors: &HashSet, -) -> MetaResult, SplitImpl>> { - let Some(mut discovered_splits) = handle.discovered_splits().await else { - tracing::info!( - "The discover loop for source {} is not ready yet; we'll wait for the next run", - source_id - ); - return Ok(BTreeMap::new()); - }; - if discovered_splits.is_empty() { - tracing::warn!("No splits discovered for source {}", source_id); - } - - if handle.enable_adaptive_splits { - // Connector supporting adaptive splits returns just one split, and we need to make the number of splits equal to the number of actors in this fragment. - // Because we Risingwave consume the splits statelessly and we do not need to keep the id internally, we always use actor_id as split_id. - // And prev splits record should be dropped via CN. - - debug_assert!(handle.enable_drop_split); - debug_assert!(discovered_splits.len() == 1); - discovered_splits = - fill_adaptive_split(discovered_splits.values().next().unwrap(), actors)?; - } - - Ok(discovered_splits) -} - impl SourceManagerCore { fn new( metadata_manager: MetadataManager, @@ -347,117 +101,6 @@ impl SourceManagerCore { } } - /// Checks whether the external source metadata has changed, - /// and re-assigns splits if there's a diff. - /// - /// `self.actor_splits` will not be updated. It will be updated by `Self::apply_source_change`, - /// after the mutation barrier has been collected. 
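The worker and handle removed above are moved into `source_manager/worker.rs`, but the shape is unchanged: a spawned task refreshes the split list into a shared map on a timer, and the handle keeps the `JoinHandle` plus a channel of `oneshot` senders so callers can force a refresh and wait for its result. A condensed sketch of that worker/handle pattern, assuming a tokio runtime; `list_splits`, `WorkerHandle` and the `String` error type are placeholders for the connector-specific pieces:

    use std::sync::Arc;
    use tokio::sync::{mpsc, oneshot, Mutex};
    use tokio::time::{interval, Duration};

    // Shared state the worker keeps fresh and the handle reads.
    type SharedSplits = Arc<Mutex<Option<Vec<String>>>>;

    struct WorkerHandle {
        splits: SharedSplits,
        sync_call_tx: mpsc::UnboundedSender<oneshot::Sender<Result<(), String>>>,
        handle: tokio::task::JoinHandle<()>,
    }

    // Stand-in for `SplitEnumerator::list_splits`.
    async fn list_splits() -> Result<Vec<String>, String> {
        Ok(vec!["split-0".to_owned(), "split-1".to_owned()])
    }

    fn spawn_worker() -> WorkerHandle {
        let splits: SharedSplits = Arc::new(Mutex::new(None));
        let shared = splits.clone();
        let (sync_call_tx, mut sync_call_rx) =
            mpsc::unbounded_channel::<oneshot::Sender<Result<(), String>>>();

        let handle = tokio::spawn(async move {
            let mut ticker = interval(Duration::from_secs(30));
            // One "tick": fetch the latest splits and publish them to the shared map.
            let tick = |shared: SharedSplits| async move {
                let discovered = list_splits().await?;
                *shared.lock().await = Some(discovered);
                Ok::<_, String>(())
            };
            loop {
                tokio::select! {
                    // On-demand refresh requested through the handle.
                    Some(tx) = sync_call_rx.recv() => { let _ = tx.send(tick(shared.clone()).await); }
                    // Periodic background refresh.
                    _ = ticker.tick() => { let _ = tick(shared.clone()).await; }
                }
            }
        });

        WorkerHandle { splits, sync_call_tx, handle }
    }

    #[tokio::main]
    async fn main() {
        let worker = spawn_worker();
        // Force one refresh and wait for it, like `allocate_splits` does on a cold cache.
        let (tx, rx) = oneshot::channel();
        worker.sync_call_tx.send(tx).unwrap();
        rx.await.unwrap().unwrap();
        let snapshot = worker.splits.lock().await.clone();
        println!("splits = {snapshot:?}");
        worker.handle.abort();
    }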
- async fn reassign_splits(&self) -> MetaResult> { - let mut split_assignment: SplitAssignment = HashMap::new(); - - 'loop_source: for (source_id, handle) in &self.managed_sources { - let source_fragment_ids = match self.source_fragments.get(source_id) { - Some(fragment_ids) if !fragment_ids.is_empty() => fragment_ids, - _ => { - continue; - } - }; - let backfill_fragment_ids = self.backfill_fragments.get(source_id); - - 'loop_fragment: for &fragment_id in source_fragment_ids { - let actors = match self - .metadata_manager - .get_running_actors_of_fragment(fragment_id) - .await - { - Ok(actors) => { - if actors.is_empty() { - tracing::warn!("No actors found for fragment {}", fragment_id); - continue 'loop_fragment; - } - actors - } - Err(err) => { - tracing::warn!(error = %err.as_report(), "Failed to get the actor of the fragment, maybe the fragment doesn't exist anymore"); - continue 'loop_fragment; - } - }; - - let discovered_splits = handle_discover_splits(handle, *source_id, &actors).await?; - if discovered_splits.is_empty() { - // The discover loop for this source is not ready yet; we'll wait for the next run - continue 'loop_source; - } - - let prev_actor_splits: HashMap<_, _> = actors - .into_iter() - .map(|actor_id| { - ( - actor_id, - self.actor_splits - .get(&actor_id) - .cloned() - .unwrap_or_default(), - ) - }) - .collect(); - - if let Some(new_assignment) = reassign_splits( - fragment_id, - prev_actor_splits, - &discovered_splits, - SplitDiffOptions { - enable_scale_in: handle.enable_drop_split, - enable_adaptive: handle.enable_adaptive_splits, - }, - ) { - split_assignment.insert(fragment_id, new_assignment); - } - } - - if let Some(backfill_fragment_ids) = backfill_fragment_ids { - // align splits for backfill fragments with its upstream source fragment - for (fragment_id, upstream_fragment_id) in backfill_fragment_ids { - let Some(upstream_assignment) = split_assignment.get(upstream_fragment_id) - else { - // upstream fragment unchanged, do not update backfill fragment too - continue; - }; - let actors = match self - .metadata_manager - .get_running_actors_for_source_backfill(*fragment_id) - .await - { - Ok(actors) => { - if actors.is_empty() { - tracing::warn!("No actors found for fragment {}", fragment_id); - continue; - } - actors - } - Err(err) => { - tracing::warn!(error = %err.as_report(),"Failed to get the actor of the fragment, maybe the fragment doesn't exist anymore"); - continue; - } - }; - split_assignment.insert( - *fragment_id, - align_splits( - actors, - upstream_assignment, - *fragment_id, - *upstream_fragment_id, - )?, - ); - } - } - } - - self.metadata_manager - .split_fragment_map_by_database(split_assignment) - .await - } - /// Updates states after all kinds of source change. pub fn apply_source_change(&mut self, source_change: SourceChange) { let mut added_source_fragments = Default::default(); @@ -593,203 +236,6 @@ impl SourceManagerCore { } } -/// Note: the `PartialEq` and `Ord` impl just compares the number of splits. -#[derive(Debug)] -struct ActorSplitsAssignment { - actor_id: ActorId, - splits: Vec, -} - -impl Eq for ActorSplitsAssignment {} - -impl PartialEq for ActorSplitsAssignment { - fn eq(&self, other: &Self) -> bool { - self.splits.len() == other.splits.len() - } -} - -impl PartialOrd for ActorSplitsAssignment { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl Ord for ActorSplitsAssignment { - fn cmp(&self, other: &Self) -> Ordering { - // Note: this is reversed order, to make BinaryHeap a min heap. 
- other.splits.len().cmp(&self.splits.len()) - } -} - -#[derive(Debug)] -struct SplitDiffOptions { - enable_scale_in: bool, - - /// For most connectors, this should be false. When enabled, RisingWave will not track any progress. - enable_adaptive: bool, -} - -#[allow(clippy::derivable_impls)] -impl Default for SplitDiffOptions { - fn default() -> Self { - SplitDiffOptions { - enable_scale_in: false, - enable_adaptive: false, - } - } -} - -/// Reassigns splits if there are new splits or dropped splits, -/// i.e., `actor_splits` and `discovered_splits` differ, or actors are rescheduled. -/// -/// The existing splits will remain unmoved in their currently assigned actor. -/// -/// If an actor has an upstream actor, it should be a backfill executor, -/// and its splits should be aligned with the upstream actor. **`reassign_splits` should not be used in this case. -/// Use [`align_splits`] instead.** -/// -/// - `fragment_id`: just for logging -/// -/// ## Different connectors' behavior of split change -/// -/// ### Kafka and Pulsar -/// They only support increasing the number of splits via adding new empty splits. -/// Old data is not moved. -/// -/// ### Kinesis -/// It supports *pairwise* shard split and merge. -/// -/// In both cases, old data remain in the old shard(s) and the old shard is still available. -/// New data are routed to the new shard(s). -/// After the retention period has expired, the old shard will become `EXPIRED` and isn't -/// listed any more. In other words, the total number of shards will first increase and then decrease. -/// -/// See also: -/// - [Kinesis resharding doc](https://docs.aws.amazon.com/streams/latest/dev/kinesis-using-sdk-java-after-resharding.html#kinesis-using-sdk-java-resharding-data-routing) -/// - An example of how the shards can be like: -fn reassign_splits( - fragment_id: FragmentId, - actor_splits: HashMap>, - discovered_splits: &BTreeMap, - opts: SplitDiffOptions, -) -> Option>> -where - T: SplitMetaData + Clone, -{ - // if no actors, return - if actor_splits.is_empty() { - return None; - } - - let prev_split_ids: HashSet<_> = actor_splits - .values() - .flat_map(|splits| splits.iter().map(SplitMetaData::id)) - .collect(); - - tracing::trace!(fragment_id, prev_split_ids = ?prev_split_ids, "previous splits"); - tracing::trace!(fragment_id, prev_split_ids = ?discovered_splits.keys(), "discovered splits"); - - let discovered_split_ids: HashSet<_> = discovered_splits.keys().cloned().collect(); - - let dropped_splits: HashSet<_> = prev_split_ids - .difference(&discovered_split_ids) - .cloned() - .collect(); - - if !dropped_splits.is_empty() { - if opts.enable_scale_in { - tracing::info!(fragment_id, dropped_spltis = ?dropped_splits, "new dropped splits"); - } else { - tracing::warn!(fragment_id, dropped_spltis = ?dropped_splits, "split dropping happened, but it is not allowed"); - } - } - - let new_discovered_splits: BTreeSet<_> = discovered_split_ids - .into_iter() - .filter(|split_id| !prev_split_ids.contains(split_id)) - .collect(); - - if opts.enable_scale_in || opts.enable_adaptive { - // if we support scale in, no more splits are discovered, and no splits are dropped, return - // we need to check if discovered_split_ids is empty, because if it is empty, we need to - // handle the case of scale in to zero (like deleting all objects from s3) - if dropped_splits.is_empty() - && new_discovered_splits.is_empty() - && !discovered_splits.is_empty() - { - return None; - } - } else { - // if we do not support scale in, and no more splits are discovered, 
return - if new_discovered_splits.is_empty() && !discovered_splits.is_empty() { - return None; - } - } - - tracing::info!(fragment_id, new_discovered_splits = ?new_discovered_splits, "new discovered splits"); - - let mut heap = BinaryHeap::with_capacity(actor_splits.len()); - - for (actor_id, mut splits) in actor_splits { - if opts.enable_scale_in || opts.enable_adaptive { - splits.retain(|split| !dropped_splits.contains(&split.id())); - } - - heap.push(ActorSplitsAssignment { actor_id, splits }) - } - - for split_id in new_discovered_splits { - // ActorSplitsAssignment's Ord is reversed, so this is min heap, i.e., - // we get the assignment with the least splits here. - - // Note: If multiple actors have the same number of splits, it will be randomly picked. - // When the number of source actors is larger than the number of splits, - // It's possible that the assignment is uneven. - // e.g., https://github.com/risingwavelabs/risingwave/issues/14324#issuecomment-1875033158 - // TODO: We should make the assignment rack-aware to make sure it's even. - let mut peek_ref = heap.peek_mut().unwrap(); - peek_ref - .splits - .push(discovered_splits.get(&split_id).cloned().unwrap()); - } - - Some( - heap.into_iter() - .map(|ActorSplitsAssignment { actor_id, splits }| (actor_id, splits)) - .collect(), - ) -} - -/// Assign splits to a new set of actors, according to existing assignment. -/// -/// illustration: -/// ```text -/// upstream new -/// actor x1 [split 1, split2] -> actor y1 [split 1, split2] -/// actor x2 [split 3] -> actor y2 [split 3] -/// ... -/// ``` -fn align_splits( - // (actor_id, upstream_actor_id) - aligned_actors: impl IntoIterator, - existing_assignment: &HashMap>, - fragment_id: FragmentId, - upstream_source_fragment_id: FragmentId, -) -> anyhow::Result>> { - aligned_actors - .into_iter() - .map(|(actor_id, upstream_actor_id)| { - let Some(splits) = existing_assignment.get(&upstream_actor_id) else { - return Err(anyhow::anyhow!("upstream assignment not found, fragment_id: {fragment_id}, upstream_fragment_id: {upstream_source_fragment_id}, actor_id: {actor_id}, upstream_assignment: {existing_assignment:?}, upstream_actor_id: {upstream_actor_id:?}")); - }; - Ok(( - actor_id, - splits.clone(), - )) - }) - .collect() -} - impl SourceManager { const DEFAULT_SOURCE_TICK_INTERVAL: Duration = Duration::from_secs(10); @@ -802,7 +248,7 @@ impl SourceManager { { let sources = metadata_manager.list_sources().await?; for source in sources { - Self::create_source_worker_async(source, &mut managed_sources, metrics.clone())? + create_source_worker_async(source, &mut managed_sources, metrics.clone())? } } @@ -904,283 +350,12 @@ impl SourceManager { core.apply_source_change(source_change); } - /// Migrates splits from previous actors to the new actors for a rescheduled fragment. - /// - /// Very occasionally split removal may happen during scaling, in which case we need to - /// use the old splits for reallocation instead of the latest splits (which may be missing), - /// so that we can resolve the split removal in the next command. 
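In the removed `reassign_splits` above (moved into `split_assignment.rs`), already-assigned splits stay on their current actor, and each newly discovered split is pushed onto whichever actor currently holds the fewest, using a `BinaryHeap` whose reversed `Ord` turns it into a min-heap. A small self-contained sketch of that balancing step, using `std::cmp::Reverse` instead of a hand-written `Ord`; actor ids and split names are arbitrary placeholders:

    use std::cmp::Reverse;
    use std::collections::{BinaryHeap, HashMap};

    /// Give each new split to the actor that currently owns the fewest splits.
    fn assign_new_splits(
        mut actor_splits: HashMap<u32, Vec<String>>,
        new_splits: Vec<String>,
    ) -> HashMap<u32, Vec<String>> {
        // `Reverse` on (len, actor_id) makes the max-heap behave as a min-heap,
        // mirroring the reversed `Ord` on `ActorSplitsAssignment`.
        let mut heap: BinaryHeap<Reverse<(usize, u32)>> = actor_splits
            .iter()
            .map(|(actor, splits)| Reverse((splits.len(), *actor)))
            .collect();

        for split in new_splits {
            let Reverse((len, actor)) = heap.pop().expect("at least one actor");
            actor_splits.get_mut(&actor).unwrap().push(split);
            heap.push(Reverse((len + 1, actor)));
        }
        actor_splits
    }

    fn main() {
        let actors = HashMap::from([(1, vec!["s0".to_owned()]), (2, vec![]), (3, vec![])]);
        let assigned = assign_new_splits(actors, vec!["s1".into(), "s2".into(), "s3".into()]);
        for (actor, splits) in &assigned {
            println!("actor {actor}: {splits:?}");
        }
    }

In the real code ties between equally loaded actors are broken arbitrarily (the sketch breaks them by actor id), which is why the comment above warns the assignment can be uneven when there are more actors than splits.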
- pub async fn migrate_splits_for_source_actors( - &self, - fragment_id: FragmentId, - prev_actor_ids: &[ActorId], - curr_actor_ids: &[ActorId], - ) -> MetaResult>> { - let core = self.core.lock().await; - - let prev_splits = prev_actor_ids - .iter() - .flat_map(|actor_id| core.actor_splits.get(actor_id).unwrap()) - .map(|split| (split.id(), split.clone())) - .collect(); - - let empty_actor_splits = curr_actor_ids - .iter() - .map(|actor_id| (*actor_id, vec![])) - .collect(); - - let diff = reassign_splits( - fragment_id, - empty_actor_splits, - &prev_splits, - // pre-allocate splits is the first time getting splits and it does not have scale-in scene - SplitDiffOptions::default(), - ) - .unwrap_or_default(); - - Ok(diff) - } - - /// Migrates splits from previous actors to the new actors for a rescheduled fragment. - pub fn migrate_splits_for_backfill_actors( - &self, - fragment_id: FragmentId, - upstream_source_fragment_id: FragmentId, - curr_actor_ids: &[ActorId], - fragment_actor_splits: &HashMap>>, - no_shuffle_upstream_actor_map: &HashMap>, - ) -> MetaResult>> { - // align splits for backfill fragments with its upstream source fragment - let actors = no_shuffle_upstream_actor_map - .iter() - .filter(|(id, _)| curr_actor_ids.contains(id)) - .map(|(id, upstream_fragment_actors)| { - ( - *id, - *upstream_fragment_actors - .get(&upstream_source_fragment_id) - .unwrap(), - ) - }); - let upstream_assignment = fragment_actor_splits - .get(&upstream_source_fragment_id) - .unwrap(); - tracing::info!( - fragment_id, - upstream_source_fragment_id, - ?upstream_assignment, - "migrate_splits_for_backfill_actors" - ); - Ok(align_splits( - actors, - upstream_assignment, - fragment_id, - upstream_source_fragment_id, - )?) - } - - /// Allocates splits to actors for a newly created source executor. - pub async fn allocate_splits(&self, job_id: &TableId) -> MetaResult { - let core = self.core.lock().await; - let table_fragments = core - .metadata_manager - .get_job_fragments_by_id(job_id) - .await?; - - let source_fragments = table_fragments.stream_source_fragments(); - - let mut assigned = HashMap::new(); - - 'loop_source: for (source_id, fragments) in source_fragments { - let handle = core - .managed_sources - .get(&source_id) - .with_context(|| format!("could not find source {}", source_id))?; - - if handle.splits.lock().await.splits.is_none() { - // force refresh source - let (tx, rx) = oneshot::channel(); - handle - .sync_call_tx - .send(tx) - .ok() - .context("failed to send sync call")?; - rx.await - .ok() - .context("failed to receive sync call response")??; - } - - for fragment_id in fragments { - let empty_actor_splits: HashMap> = table_fragments - .fragments - .get(&fragment_id) - .unwrap() - .actors - .iter() - .map(|actor| (actor.actor_id, vec![])) - .collect(); - let actor_hashset: HashSet = empty_actor_splits.keys().cloned().collect(); - let splits = handle_discover_splits(handle, source_id, &actor_hashset).await?; - if splits.is_empty() { - tracing::warn!("no splits detected for source {}", source_id); - continue 'loop_source; - } - - if let Some(diff) = reassign_splits( - fragment_id, - empty_actor_splits, - &splits, - SplitDiffOptions::default(), - ) { - assigned.insert(fragment_id, diff); - } - } - } - - Ok(assigned) - } - - /// Allocates splits to actors for replace source job. 
- pub async fn allocate_splits_for_replace_source( - &self, - job_id: &TableId, - merge_updates: &Vec, - ) -> MetaResult { - tracing::debug!(?merge_updates, "allocate_splits_for_replace_source"); - if merge_updates.is_empty() { - // no existing downstream. We can just re-allocate splits arbitrarily. - return self.allocate_splits(job_id).await; - } - - let core = self.core.lock().await; - let table_fragments = core - .metadata_manager - .get_job_fragments_by_id(job_id) - .await?; - - let source_fragments = table_fragments.stream_source_fragments(); - assert_eq!( - source_fragments.len(), - 1, - "replace source job should only have one source" - ); - let (_source_id, fragments) = source_fragments.into_iter().next().unwrap(); - assert_eq!( - fragments.len(), - 1, - "replace source job should only have one fragment" - ); - let fragment_id = fragments.into_iter().next().unwrap(); - - debug_assert!( - !merge_updates.is_empty() - && merge_updates.iter().all(|merge_update| { - merge_update.upstream_fragment_id == merge_updates[0].upstream_fragment_id - && merge_update.new_upstream_fragment_id == Some(fragment_id) - }), - "merge update should only replace one fragment: {:?}", - merge_updates - ); - let prev_fragment_id = merge_updates[0].upstream_fragment_id; - // Here we align the new source executor to backfill executors - // - // old_source => new_source backfill_1 - // actor_x1 => actor_y1 -----┬------>actor_a1 - // actor_x2 => actor_y2 -----┼-┬---->actor_a2 - // │ │ - // │ │ backfill_2 - // └─┼---->actor_b1 - // └---->actor_b2 - // - // Note: we can choose any backfill actor to align here. - // We use `HashMap` to dedup. - let aligned_actors: HashMap = merge_updates - .iter() - .map(|merge_update| { - assert_eq!(merge_update.added_upstream_actor_id.len(), 1); - // Note: removed_upstream_actor_id is not set for replace job, so we can't use it. - assert_eq!(merge_update.removed_upstream_actor_id.len(), 0); - ( - merge_update.added_upstream_actor_id[0], - merge_update.actor_id, - ) - }) - .collect(); - let assignment = align_splits( - aligned_actors.into_iter(), - &core.actor_splits, - fragment_id, - prev_fragment_id, - )?; - Ok(HashMap::from([(fragment_id, assignment)])) - } - - /// Allocates splits to actors for a newly created `SourceBackfill` executor. - /// - /// Unlike [`Self::allocate_splits`], which creates a new assignment, - /// this method aligns the splits for backfill fragments with its upstream source fragment ([`align_splits`]). 
- pub async fn allocate_splits_for_backfill( - &self, - table_id: &TableId, - // dispatchers from SourceExecutor to SourceBackfillExecutor - dispatchers: &HashMap>, - ) -> MetaResult { - let core = self.core.lock().await; - let table_fragments = core - .metadata_manager - .get_job_fragments_by_id(table_id) - .await?; - - let source_backfill_fragments = table_fragments.source_backfill_fragments()?; - - let mut assigned = HashMap::new(); - - for (_source_id, fragments) in source_backfill_fragments { - for (fragment_id, upstream_source_fragment_id) in fragments { - let upstream_actors = core - .metadata_manager - .get_running_actors_of_fragment(upstream_source_fragment_id) - .await?; - let mut backfill_actors = vec![]; - for upstream_actor in upstream_actors { - if let Some(dispatchers) = dispatchers.get(&upstream_actor) { - let err = || { - anyhow::anyhow!( - "source backfill fragment's upstream fragment should have one dispatcher, fragment_id: {fragment_id}, upstream_fragment_id: {upstream_source_fragment_id}, upstream_actor: {upstream_actor}, dispatchers: {dispatchers:?}", - fragment_id = fragment_id, - upstream_source_fragment_id = upstream_source_fragment_id, - upstream_actor = upstream_actor, - dispatchers = dispatchers - ) - }; - if dispatchers.len() != 1 || dispatchers[0].downstream_actor_id.len() != 1 { - return Err(err().into()); - } - - backfill_actors - .push((dispatchers[0].downstream_actor_id[0], upstream_actor)); - } - } - assigned.insert( - fragment_id, - align_splits( - backfill_actors, - &core.actor_splits, - fragment_id, - upstream_source_fragment_id, - )?, - ); - } - } - - Ok(assigned) - } - /// create and register connector worker for source. pub async fn register_source(&self, source: &Source) -> MetaResult<()> { tracing::debug!("register_source: {}", source.get_id()); let mut core = self.core.lock().await; if let Entry::Vacant(e) = core.managed_sources.entry(source.get_id() as _) { - let handle = create_source_worker_handle(source, self.metrics.clone()) + let handle = create_source_worker(source, self.metrics.clone()) .await .context("failed to create source worker")?; e.insert(handle); @@ -1204,66 +379,6 @@ impl SourceManager { } } - /// Used on startup ([`Self::new`]). Failed sources will not block meta startup. 
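`create_source_worker_async` (removed below, now in `worker.rs`) is the startup path: rather than blocking meta boot on an unreachable external system, it spawns a task that retries worker creation on a fixed interval and only then enters the run loop. A reduced sketch of that retry-in-the-background shape, assuming a tokio runtime; `try_create_worker` and the interval value are placeholders:

    use tokio::time::{interval, Duration, MissedTickBehavior};

    // Stand-in for `ConnectorSourceWorker::create`, which can fail while the
    // external source service is unreachable.
    async fn try_create_worker(source_id: u32) -> Result<String, String> {
        Ok(format!("worker-for-source-{source_id}"))
    }

    /// Startup path: return immediately and keep retrying in the background
    /// until the worker can be created, then run it.
    fn create_source_worker_async(source_id: u32) -> tokio::task::JoinHandle<()> {
        tokio::spawn(async move {
            let mut ticker = interval(Duration::from_secs(10));
            ticker.set_missed_tick_behavior(MissedTickBehavior::Skip);
            let worker = loop {
                ticker.tick().await;
                match try_create_worker(source_id).await {
                    Ok(worker) => break worker,
                    Err(e) => eprintln!("failed to create source worker, retrying: {e}"),
                }
            };
            println!("{worker} is running");
            // ...the real worker would now enter its tick loop.
        })
    }

    #[tokio::main]
    async fn main() {
        let handle = create_source_worker_async(42);
        handle.await.unwrap();
    }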
- fn create_source_worker_async( - source: Source, - managed_sources: &mut HashMap, - metrics: Arc, - ) -> MetaResult<()> { - tracing::info!("spawning new watcher for source {}", source.id); - - let splits = Arc::new(Mutex::new(SharedSplitMap { splits: None })); - let current_splits_ref = splits.clone(); - let source_id = source.id; - - let connector_properties = extract_prop_from_existing_source(&source)?; - - let enable_drop_split = connector_properties.enable_drop_split(); - let enable_adaptive_splits = connector_properties.enable_adaptive_splits(); - let (sync_call_tx, sync_call_rx) = tokio::sync::mpsc::unbounded_channel(); - let handle = tokio::spawn(async move { - let mut ticker = time::interval(Self::DEFAULT_SOURCE_TICK_INTERVAL); - ticker.set_missed_tick_behavior(MissedTickBehavior::Skip); - - dispatch_source_prop!(connector_properties, prop, { - let mut worker = loop { - ticker.tick().await; - - match ConnectorSourceWorker::create( - &source, - prop.deref().clone(), - DEFAULT_SOURCE_WORKER_TICK_INTERVAL, - current_splits_ref.clone(), - metrics.clone(), - ) - .await - { - Ok(worker) => { - break worker; - } - Err(e) => { - tracing::warn!(error = %e.as_report(), "failed to create source worker"); - } - } - }; - - worker.run(sync_call_rx).await - }); - }); - - managed_sources.insert( - source_id as SourceId, - ConnectorSourceWorkerHandle { - handle, - sync_call_tx, - splits, - enable_drop_split, - enable_adaptive_splits, - }, - ); - Ok(()) - } - pub async fn list_assignments(&self) -> HashMap> { let core = self.core.lock().await; core.actor_splits.clone() @@ -1384,235 +499,3 @@ pub fn build_actor_split_impls( }) .collect() } - -#[cfg(test)] -mod tests { - use std::collections::{BTreeMap, HashMap, HashSet}; - - use risingwave_common::types::JsonbVal; - use risingwave_connector::error::ConnectorResult; - use risingwave_connector::source::{SplitId, SplitMetaData}; - use serde::{Deserialize, Serialize}; - - use crate::model::{ActorId, FragmentId}; - use crate::stream::source_manager::{reassign_splits, SplitDiffOptions}; - - #[derive(Debug, Copy, Clone, Serialize, Deserialize)] - struct TestSplit { - id: u32, - } - - impl SplitMetaData for TestSplit { - fn id(&self) -> SplitId { - format!("{}", self.id).into() - } - - fn encode_to_json(&self) -> JsonbVal { - serde_json::to_value(*self).unwrap().into() - } - - fn restore_from_json(value: JsonbVal) -> ConnectorResult { - serde_json::from_value(value.take()).map_err(Into::into) - } - - fn update_offset(&mut self, _last_read_offset: String) -> ConnectorResult<()> { - Ok(()) - } - } - - fn check_all_splits( - discovered_splits: &BTreeMap, - diff: &HashMap>, - ) { - let mut split_ids: HashSet<_> = discovered_splits.keys().cloned().collect(); - - for splits in diff.values() { - for split in splits { - assert!(split_ids.remove(&split.id())) - } - } - - assert!(split_ids.is_empty()); - } - - #[test] - fn test_drop_splits() { - let mut actor_splits: HashMap = HashMap::new(); - actor_splits.insert(0, vec![TestSplit { id: 0 }, TestSplit { id: 1 }]); - actor_splits.insert(1, vec![TestSplit { id: 2 }, TestSplit { id: 3 }]); - actor_splits.insert(2, vec![TestSplit { id: 4 }, TestSplit { id: 5 }]); - - let mut prev_split_to_actor = HashMap::new(); - for (actor_id, splits) in &actor_splits { - for split in splits { - prev_split_to_actor.insert(split.id(), *actor_id); - } - } - - let discovered_splits: BTreeMap = (1..5) - .map(|i| { - let split = TestSplit { id: i }; - (split.id(), split) - }) - .collect(); - - let opts = SplitDiffOptions { - 
enable_scale_in: true, - enable_adaptive: false, - }; - - let prev_split_ids: HashSet<_> = actor_splits - .values() - .flat_map(|splits| splits.iter().map(|split| split.id())) - .collect(); - - let diff = reassign_splits( - FragmentId::default(), - actor_splits, - &discovered_splits, - opts, - ) - .unwrap(); - check_all_splits(&discovered_splits, &diff); - - let mut after_split_to_actor = HashMap::new(); - for (actor_id, splits) in &diff { - for split in splits { - after_split_to_actor.insert(split.id(), *actor_id); - } - } - - let discovered_split_ids: HashSet<_> = discovered_splits.keys().cloned().collect(); - - let retained_split_ids: HashSet<_> = - prev_split_ids.intersection(&discovered_split_ids).collect(); - - for retained_split_id in retained_split_ids { - assert_eq!( - prev_split_to_actor.get(retained_split_id), - after_split_to_actor.get(retained_split_id) - ) - } - } - - #[test] - fn test_drop_splits_to_empty() { - let mut actor_splits: HashMap = HashMap::new(); - actor_splits.insert(0, vec![TestSplit { id: 0 }]); - - let discovered_splits: BTreeMap = BTreeMap::new(); - - let opts = SplitDiffOptions { - enable_scale_in: true, - enable_adaptive: false, - }; - - let diff = reassign_splits( - FragmentId::default(), - actor_splits, - &discovered_splits, - opts, - ) - .unwrap(); - - assert!(!diff.is_empty()) - } - - #[test] - fn test_reassign_splits() { - let actor_splits = HashMap::new(); - let discovered_splits: BTreeMap = BTreeMap::new(); - assert!(reassign_splits( - FragmentId::default(), - actor_splits, - &discovered_splits, - Default::default() - ) - .is_none()); - - let actor_splits = (0..3).map(|i| (i, vec![])).collect(); - let discovered_splits: BTreeMap = BTreeMap::new(); - let diff = reassign_splits( - FragmentId::default(), - actor_splits, - &discovered_splits, - Default::default(), - ) - .unwrap(); - assert_eq!(diff.len(), 3); - for splits in diff.values() { - assert!(splits.is_empty()) - } - - let actor_splits = (0..3).map(|i| (i, vec![])).collect(); - let discovered_splits: BTreeMap = (0..3) - .map(|i| { - let split = TestSplit { id: i }; - (split.id(), split) - }) - .collect(); - - let diff = reassign_splits( - FragmentId::default(), - actor_splits, - &discovered_splits, - Default::default(), - ) - .unwrap(); - assert_eq!(diff.len(), 3); - for splits in diff.values() { - assert_eq!(splits.len(), 1); - } - - check_all_splits(&discovered_splits, &diff); - - let actor_splits = (0..3).map(|i| (i, vec![TestSplit { id: i }])).collect(); - let discovered_splits: BTreeMap = (0..5) - .map(|i| { - let split = TestSplit { id: i }; - (split.id(), split) - }) - .collect(); - - let diff = reassign_splits( - FragmentId::default(), - actor_splits, - &discovered_splits, - Default::default(), - ) - .unwrap(); - assert_eq!(diff.len(), 3); - for splits in diff.values() { - let len = splits.len(); - assert!(len == 1 || len == 2); - } - - check_all_splits(&discovered_splits, &diff); - - let mut actor_splits: HashMap> = - (0..3).map(|i| (i, vec![TestSplit { id: i }])).collect(); - actor_splits.insert(3, vec![]); - actor_splits.insert(4, vec![]); - - let discovered_splits: BTreeMap = (0..5) - .map(|i| { - let split = TestSplit { id: i }; - (split.id(), split) - }) - .collect(); - - let diff = reassign_splits( - FragmentId::default(), - actor_splits, - &discovered_splits, - Default::default(), - ) - .unwrap(); - assert_eq!(diff.len(), 5); - for splits in diff.values() { - assert_eq!(splits.len(), 1); - } - - check_all_splits(&discovered_splits, &diff); - } -} diff --git 
a/src/meta/src/stream/source_manager/split_assignment.rs b/src/meta/src/stream/source_manager/split_assignment.rs new file mode 100644 index 0000000000000..e36c1198bfc96 --- /dev/null +++ b/src/meta/src/stream/source_manager/split_assignment.rs @@ -0,0 +1,830 @@ +// Copyright 2025 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::*; + +impl SourceManager { + /// Migrates splits from previous actors to the new actors for a rescheduled fragment. + /// + /// Very occasionally split removal may happen during scaling, in which case we need to + /// use the old splits for reallocation instead of the latest splits (which may be missing), + /// so that we can resolve the split removal in the next command. + pub async fn migrate_splits_for_source_actors( + &self, + fragment_id: FragmentId, + prev_actor_ids: &[ActorId], + curr_actor_ids: &[ActorId], + ) -> MetaResult>> { + let core = self.core.lock().await; + + let prev_splits = prev_actor_ids + .iter() + .flat_map(|actor_id| core.actor_splits.get(actor_id).unwrap()) + .map(|split| (split.id(), split.clone())) + .collect(); + + let empty_actor_splits = curr_actor_ids + .iter() + .map(|actor_id| (*actor_id, vec![])) + .collect(); + + let diff = reassign_splits( + fragment_id, + empty_actor_splits, + &prev_splits, + // pre-allocate splits is the first time getting splits and it does not have scale-in scene + SplitDiffOptions::default(), + ) + .unwrap_or_default(); + + Ok(diff) + } + + /// Migrates splits from previous actors to the new actors for a rescheduled fragment. + pub fn migrate_splits_for_backfill_actors( + &self, + fragment_id: FragmentId, + upstream_source_fragment_id: FragmentId, + curr_actor_ids: &[ActorId], + fragment_actor_splits: &HashMap>>, + no_shuffle_upstream_actor_map: &HashMap>, + ) -> MetaResult>> { + // align splits for backfill fragments with its upstream source fragment + let actors = no_shuffle_upstream_actor_map + .iter() + .filter(|(id, _)| curr_actor_ids.contains(id)) + .map(|(id, upstream_fragment_actors)| { + ( + *id, + *upstream_fragment_actors + .get(&upstream_source_fragment_id) + .unwrap(), + ) + }); + let upstream_assignment = fragment_actor_splits + .get(&upstream_source_fragment_id) + .unwrap(); + tracing::info!( + fragment_id, + upstream_source_fragment_id, + ?upstream_assignment, + "migrate_splits_for_backfill_actors" + ); + Ok(align_splits( + actors, + upstream_assignment, + fragment_id, + upstream_source_fragment_id, + )?) + } + + /// Allocates splits to actors for a newly created source executor. 
+ pub async fn allocate_splits(&self, job_id: &TableId) -> MetaResult { + let core = self.core.lock().await; + let table_fragments = core + .metadata_manager + .get_job_fragments_by_id(job_id) + .await?; + + let source_fragments = table_fragments.stream_source_fragments(); + + let mut assigned = HashMap::new(); + + 'loop_source: for (source_id, fragments) in source_fragments { + let handle = core + .managed_sources + .get(&source_id) + .with_context(|| format!("could not find source {}", source_id))?; + + if handle.splits.lock().await.splits.is_none() { + // force refresh source + let (tx, rx) = oneshot::channel(); + handle + .sync_call_tx + .send(tx) + .ok() + .context("failed to send sync call")?; + rx.await + .ok() + .context("failed to receive sync call response")??; + } + + for fragment_id in fragments { + let empty_actor_splits: HashMap> = table_fragments + .fragments + .get(&fragment_id) + .unwrap() + .actors + .iter() + .map(|actor| (actor.actor_id, vec![])) + .collect(); + let actor_hashset: HashSet = empty_actor_splits.keys().cloned().collect(); + let splits = handle.discovered_splits(source_id, &actor_hashset).await?; + if splits.is_empty() { + tracing::warn!("no splits detected for source {}", source_id); + continue 'loop_source; + } + + if let Some(diff) = reassign_splits( + fragment_id, + empty_actor_splits, + &splits, + SplitDiffOptions::default(), + ) { + assigned.insert(fragment_id, diff); + } + } + } + + Ok(assigned) + } + + /// Allocates splits to actors for replace source job. + pub async fn allocate_splits_for_replace_source( + &self, + job_id: &TableId, + merge_updates: &Vec, + ) -> MetaResult { + tracing::debug!(?merge_updates, "allocate_splits_for_replace_source"); + if merge_updates.is_empty() { + // no existing downstream. We can just re-allocate splits arbitrarily. + return self.allocate_splits(job_id).await; + } + + let core = self.core.lock().await; + let table_fragments = core + .metadata_manager + .get_job_fragments_by_id(job_id) + .await?; + + let source_fragments = table_fragments.stream_source_fragments(); + assert_eq!( + source_fragments.len(), + 1, + "replace source job should only have one source" + ); + let (_source_id, fragments) = source_fragments.into_iter().next().unwrap(); + assert_eq!( + fragments.len(), + 1, + "replace source job should only have one fragment" + ); + let fragment_id = fragments.into_iter().next().unwrap(); + + debug_assert!( + !merge_updates.is_empty() + && merge_updates.iter().all(|merge_update| { + merge_update.upstream_fragment_id == merge_updates[0].upstream_fragment_id + && merge_update.new_upstream_fragment_id == Some(fragment_id) + }), + "merge update should only replace one fragment: {:?}", + merge_updates + ); + let prev_fragment_id = merge_updates[0].upstream_fragment_id; + // Here we align the new source executor to backfill executors + // + // old_source => new_source backfill_1 + // actor_x1 => actor_y1 -----┬------>actor_a1 + // actor_x2 => actor_y2 -----┼-┬---->actor_a2 + // │ │ + // │ │ backfill_2 + // └─┼---->actor_b1 + // └---->actor_b2 + // + // Note: we can choose any backfill actor to align here. + // We use `HashMap` to dedup. + let aligned_actors: HashMap = merge_updates + .iter() + .map(|merge_update| { + assert_eq!(merge_update.added_upstream_actor_id.len(), 1); + // Note: removed_upstream_actor_id is not set for replace job, so we can't use it. 
+ assert_eq!(merge_update.removed_upstream_actor_id.len(), 0); + ( + merge_update.added_upstream_actor_id[0], + merge_update.actor_id, + ) + }) + .collect(); + let assignment = align_splits( + aligned_actors.into_iter(), + &core.actor_splits, + fragment_id, + prev_fragment_id, + )?; + Ok(HashMap::from([(fragment_id, assignment)])) + } + + /// Allocates splits to actors for a newly created `SourceBackfill` executor. + /// + /// Unlike [`Self::allocate_splits`], which creates a new assignment, + /// this method aligns the splits for backfill fragments with its upstream source fragment ([`align_splits`]). + pub async fn allocate_splits_for_backfill( + &self, + table_id: &TableId, + // dispatchers from SourceExecutor to SourceBackfillExecutor + dispatchers: &HashMap>, + ) -> MetaResult { + let core = self.core.lock().await; + let table_fragments = core + .metadata_manager + .get_job_fragments_by_id(table_id) + .await?; + + let source_backfill_fragments = table_fragments.source_backfill_fragments()?; + + let mut assigned = HashMap::new(); + + for (_source_id, fragments) in source_backfill_fragments { + for (fragment_id, upstream_source_fragment_id) in fragments { + let upstream_actors = core + .metadata_manager + .get_running_actors_of_fragment(upstream_source_fragment_id) + .await?; + let mut backfill_actors = vec![]; + for upstream_actor in upstream_actors { + if let Some(dispatchers) = dispatchers.get(&upstream_actor) { + let err = || { + anyhow::anyhow!( + "source backfill fragment's upstream fragment should have one dispatcher, fragment_id: {fragment_id}, upstream_fragment_id: {upstream_source_fragment_id}, upstream_actor: {upstream_actor}, dispatchers: {dispatchers:?}", + fragment_id = fragment_id, + upstream_source_fragment_id = upstream_source_fragment_id, + upstream_actor = upstream_actor, + dispatchers = dispatchers + ) + }; + if dispatchers.len() != 1 || dispatchers[0].downstream_actor_id.len() != 1 { + return Err(err().into()); + } + + backfill_actors + .push((dispatchers[0].downstream_actor_id[0], upstream_actor)); + } + } + assigned.insert( + fragment_id, + align_splits( + backfill_actors, + &core.actor_splits, + fragment_id, + upstream_source_fragment_id, + )?, + ); + } + } + + Ok(assigned) + } +} + +impl SourceManagerCore { + /// Checks whether the external source metadata has changed, + /// and re-assigns splits if there's a diff. + /// + /// `self.actor_splits` will not be updated. It will be updated by `Self::apply_source_change`, + /// after the mutation barrier has been collected. 
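At its core the periodic check is a set difference between the split ids currently assigned and the ids just discovered: unseen ids are new work, and disappeared ids are only honoured when the connector allows dropping splits. The helper below is a hedged sketch of that decision only; `split_diff`, the string ids and the boolean flag stand in for `SplitDiffOptions` and the real split types.

use std::collections::HashSet;

// Illustrative decision helper: `None` means "keep the current assignment",
// `Some((added, dropped))` means a re-assignment is needed.
fn split_diff(
    prev: &HashSet<String>,
    discovered: &HashSet<String>,
    enable_drop_split: bool,
) -> Option<(Vec<String>, Vec<String>)> {
    let added: Vec<_> = discovered.difference(prev).cloned().collect();
    let dropped: Vec<_> = prev.difference(discovered).cloned().collect();
    // Disappeared splits are only acted on when the connector allows dropping
    // them (scale-in); otherwise the real code just logs a warning.
    let dropped = if enable_drop_split { dropped } else { Vec::new() };
    if added.is_empty() && dropped.is_empty() && !discovered.is_empty() {
        None
    } else {
        Some((added, dropped))
    }
}

When such a helper returns `None` the fragment keeps its current assignment; otherwise the added ids would be pushed onto the least-loaded actors as in the earlier sketch.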
+ pub async fn reassign_splits(&self) -> MetaResult> { + let mut split_assignment: SplitAssignment = HashMap::new(); + + 'loop_source: for (source_id, handle) in &self.managed_sources { + let source_fragment_ids = match self.source_fragments.get(source_id) { + Some(fragment_ids) if !fragment_ids.is_empty() => fragment_ids, + _ => { + continue; + } + }; + let backfill_fragment_ids = self.backfill_fragments.get(source_id); + + 'loop_fragment: for &fragment_id in source_fragment_ids { + let actors = match self + .metadata_manager + .get_running_actors_of_fragment(fragment_id) + .await + { + Ok(actors) => { + if actors.is_empty() { + tracing::warn!("No actors found for fragment {}", fragment_id); + continue 'loop_fragment; + } + actors + } + Err(err) => { + tracing::warn!(error = %err.as_report(), "Failed to get the actor of the fragment, maybe the fragment doesn't exist anymore"); + continue 'loop_fragment; + } + }; + + let discovered_splits = handle.discovered_splits(*source_id, &actors).await?; + if discovered_splits.is_empty() { + // The discover loop for this source is not ready yet; we'll wait for the next run + continue 'loop_source; + } + + let prev_actor_splits: HashMap<_, _> = actors + .into_iter() + .map(|actor_id| { + ( + actor_id, + self.actor_splits + .get(&actor_id) + .cloned() + .unwrap_or_default(), + ) + }) + .collect(); + + if let Some(new_assignment) = reassign_splits( + fragment_id, + prev_actor_splits, + &discovered_splits, + SplitDiffOptions { + enable_scale_in: handle.enable_drop_split, + enable_adaptive: handle.enable_adaptive_splits, + }, + ) { + split_assignment.insert(fragment_id, new_assignment); + } + } + + if let Some(backfill_fragment_ids) = backfill_fragment_ids { + // align splits for backfill fragments with its upstream source fragment + for (fragment_id, upstream_fragment_id) in backfill_fragment_ids { + let Some(upstream_assignment) = split_assignment.get(upstream_fragment_id) + else { + // upstream fragment unchanged, do not update backfill fragment too + continue; + }; + let actors = match self + .metadata_manager + .get_running_actors_for_source_backfill(*fragment_id) + .await + { + Ok(actors) => { + if actors.is_empty() { + tracing::warn!("No actors found for fragment {}", fragment_id); + continue; + } + actors + } + Err(err) => { + tracing::warn!(error = %err.as_report(),"Failed to get the actor of the fragment, maybe the fragment doesn't exist anymore"); + continue; + } + }; + split_assignment.insert( + *fragment_id, + align_splits( + actors, + upstream_assignment, + *fragment_id, + *upstream_fragment_id, + )?, + ); + } + } + } + + self.metadata_manager + .split_fragment_map_by_database(split_assignment) + .await + } +} + +/// Reassigns splits if there are new splits or dropped splits, +/// i.e., `actor_splits` and `discovered_splits` differ, or actors are rescheduled. +/// +/// The existing splits will remain unmoved in their currently assigned actor. +/// +/// If an actor has an upstream actor, it should be a backfill executor, +/// and its splits should be aligned with the upstream actor. **`reassign_splits` should not be used in this case. +/// Use [`align_splits`] instead.** +/// +/// - `fragment_id`: just for logging +/// +/// ## Different connectors' behavior of split change +/// +/// ### Kafka and Pulsar +/// They only support increasing the number of splits via adding new empty splits. +/// Old data is not moved. +/// +/// ### Kinesis +/// It supports *pairwise* shard split and merge. 
+/// +/// In both cases, old data remain in the old shard(s) and the old shard is still available. +/// New data are routed to the new shard(s). +/// After the retention period has expired, the old shard will become `EXPIRED` and isn't +/// listed any more. In other words, the total number of shards will first increase and then decrease. +/// +/// See also: +/// - [Kinesis resharding doc](https://docs.aws.amazon.com/streams/latest/dev/kinesis-using-sdk-java-after-resharding.html#kinesis-using-sdk-java-resharding-data-routing) +/// - An example of how the shards can be like: +fn reassign_splits( + fragment_id: FragmentId, + actor_splits: HashMap>, + discovered_splits: &BTreeMap, + opts: SplitDiffOptions, +) -> Option>> +where + T: SplitMetaData + Clone, +{ + // if no actors, return + if actor_splits.is_empty() { + return None; + } + + let prev_split_ids: HashSet<_> = actor_splits + .values() + .flat_map(|splits| splits.iter().map(SplitMetaData::id)) + .collect(); + + tracing::trace!(fragment_id, prev_split_ids = ?prev_split_ids, "previous splits"); + tracing::trace!(fragment_id, prev_split_ids = ?discovered_splits.keys(), "discovered splits"); + + let discovered_split_ids: HashSet<_> = discovered_splits.keys().cloned().collect(); + + let dropped_splits: HashSet<_> = prev_split_ids + .difference(&discovered_split_ids) + .cloned() + .collect(); + + if !dropped_splits.is_empty() { + if opts.enable_scale_in { + tracing::info!(fragment_id, dropped_spltis = ?dropped_splits, "new dropped splits"); + } else { + tracing::warn!(fragment_id, dropped_spltis = ?dropped_splits, "split dropping happened, but it is not allowed"); + } + } + + let new_discovered_splits: BTreeSet<_> = discovered_split_ids + .into_iter() + .filter(|split_id| !prev_split_ids.contains(split_id)) + .collect(); + + if opts.enable_scale_in || opts.enable_adaptive { + // if we support scale in, no more splits are discovered, and no splits are dropped, return + // we need to check if discovered_split_ids is empty, because if it is empty, we need to + // handle the case of scale in to zero (like deleting all objects from s3) + if dropped_splits.is_empty() + && new_discovered_splits.is_empty() + && !discovered_splits.is_empty() + { + return None; + } + } else { + // if we do not support scale in, and no more splits are discovered, return + if new_discovered_splits.is_empty() && !discovered_splits.is_empty() { + return None; + } + } + + tracing::info!(fragment_id, new_discovered_splits = ?new_discovered_splits, "new discovered splits"); + + let mut heap = BinaryHeap::with_capacity(actor_splits.len()); + + for (actor_id, mut splits) in actor_splits { + if opts.enable_scale_in || opts.enable_adaptive { + splits.retain(|split| !dropped_splits.contains(&split.id())); + } + + heap.push(ActorSplitsAssignment { actor_id, splits }) + } + + for split_id in new_discovered_splits { + // ActorSplitsAssignment's Ord is reversed, so this is min heap, i.e., + // we get the assignment with the least splits here. + + // Note: If multiple actors have the same number of splits, it will be randomly picked. + // When the number of source actors is larger than the number of splits, + // It's possible that the assignment is uneven. + // e.g., https://github.com/risingwavelabs/risingwave/issues/14324#issuecomment-1875033158 + // TODO: We should make the assignment rack-aware to make sure it's even. 
+ let mut peek_ref = heap.peek_mut().unwrap(); + peek_ref + .splits + .push(discovered_splits.get(&split_id).cloned().unwrap()); + } + + Some( + heap.into_iter() + .map(|ActorSplitsAssignment { actor_id, splits }| (actor_id, splits)) + .collect(), + ) +} + +/// Assign splits to a new set of actors, according to existing assignment. +/// +/// illustration: +/// ```text +/// upstream new +/// actor x1 [split 1, split2] -> actor y1 [split 1, split2] +/// actor x2 [split 3] -> actor y2 [split 3] +/// ... +/// ``` +fn align_splits( + // (actor_id, upstream_actor_id) + aligned_actors: impl IntoIterator, + existing_assignment: &HashMap>, + fragment_id: FragmentId, + upstream_source_fragment_id: FragmentId, +) -> anyhow::Result>> { + aligned_actors + .into_iter() + .map(|(actor_id, upstream_actor_id)| { + let Some(splits) = existing_assignment.get(&upstream_actor_id) else { + return Err(anyhow::anyhow!("upstream assignment not found, fragment_id: {fragment_id}, upstream_fragment_id: {upstream_source_fragment_id}, actor_id: {actor_id}, upstream_assignment: {existing_assignment:?}, upstream_actor_id: {upstream_actor_id:?}")); + }; + Ok(( + actor_id, + splits.clone(), + )) + }) + .collect() +} + +/// Note: the `PartialEq` and `Ord` impl just compares the number of splits. +#[derive(Debug)] +struct ActorSplitsAssignment { + actor_id: ActorId, + splits: Vec, +} + +impl Eq for ActorSplitsAssignment {} + +impl PartialEq for ActorSplitsAssignment { + fn eq(&self, other: &Self) -> bool { + self.splits.len() == other.splits.len() + } +} + +impl PartialOrd for ActorSplitsAssignment { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for ActorSplitsAssignment { + fn cmp(&self, other: &Self) -> Ordering { + // Note: this is reversed order, to make BinaryHeap a min heap. + other.splits.len().cmp(&self.splits.len()) + } +} + +#[derive(Debug)] +pub struct SplitDiffOptions { + pub enable_scale_in: bool, + + /// For most connectors, this should be false. When enabled, RisingWave will not track any progress. 
+ pub enable_adaptive: bool, +} + +#[allow(clippy::derivable_impls)] +impl Default for SplitDiffOptions { + fn default() -> Self { + SplitDiffOptions { + enable_scale_in: false, + enable_adaptive: false, + } + } +} + +#[cfg(test)] +mod tests { + use std::collections::{BTreeMap, HashMap, HashSet}; + + use risingwave_common::types::JsonbVal; + use risingwave_connector::error::ConnectorResult; + use risingwave_connector::source::{SplitId, SplitMetaData}; + use serde::{Deserialize, Serialize}; + + use super::*; + use crate::model::{ActorId, FragmentId}; + + #[derive(Debug, Copy, Clone, Serialize, Deserialize)] + struct TestSplit { + id: u32, + } + + impl SplitMetaData for TestSplit { + fn id(&self) -> SplitId { + format!("{}", self.id).into() + } + + fn encode_to_json(&self) -> JsonbVal { + serde_json::to_value(*self).unwrap().into() + } + + fn restore_from_json(value: JsonbVal) -> ConnectorResult { + serde_json::from_value(value.take()).map_err(Into::into) + } + + fn update_offset(&mut self, _last_read_offset: String) -> ConnectorResult<()> { + Ok(()) + } + } + + fn check_all_splits( + discovered_splits: &BTreeMap, + diff: &HashMap>, + ) { + let mut split_ids: HashSet<_> = discovered_splits.keys().cloned().collect(); + + for splits in diff.values() { + for split in splits { + assert!(split_ids.remove(&split.id())) + } + } + + assert!(split_ids.is_empty()); + } + + #[test] + fn test_drop_splits() { + let mut actor_splits: HashMap = HashMap::new(); + actor_splits.insert(0, vec![TestSplit { id: 0 }, TestSplit { id: 1 }]); + actor_splits.insert(1, vec![TestSplit { id: 2 }, TestSplit { id: 3 }]); + actor_splits.insert(2, vec![TestSplit { id: 4 }, TestSplit { id: 5 }]); + + let mut prev_split_to_actor = HashMap::new(); + for (actor_id, splits) in &actor_splits { + for split in splits { + prev_split_to_actor.insert(split.id(), *actor_id); + } + } + + let discovered_splits: BTreeMap = (1..5) + .map(|i| { + let split = TestSplit { id: i }; + (split.id(), split) + }) + .collect(); + + let opts = SplitDiffOptions { + enable_scale_in: true, + enable_adaptive: false, + }; + + let prev_split_ids: HashSet<_> = actor_splits + .values() + .flat_map(|splits| splits.iter().map(|split| split.id())) + .collect(); + + let diff = reassign_splits( + FragmentId::default(), + actor_splits, + &discovered_splits, + opts, + ) + .unwrap(); + check_all_splits(&discovered_splits, &diff); + + let mut after_split_to_actor = HashMap::new(); + for (actor_id, splits) in &diff { + for split in splits { + after_split_to_actor.insert(split.id(), *actor_id); + } + } + + let discovered_split_ids: HashSet<_> = discovered_splits.keys().cloned().collect(); + + let retained_split_ids: HashSet<_> = + prev_split_ids.intersection(&discovered_split_ids).collect(); + + for retained_split_id in retained_split_ids { + assert_eq!( + prev_split_to_actor.get(retained_split_id), + after_split_to_actor.get(retained_split_id) + ) + } + } + + #[test] + fn test_drop_splits_to_empty() { + let mut actor_splits: HashMap = HashMap::new(); + actor_splits.insert(0, vec![TestSplit { id: 0 }]); + + let discovered_splits: BTreeMap = BTreeMap::new(); + + let opts = SplitDiffOptions { + enable_scale_in: true, + enable_adaptive: false, + }; + + let diff = reassign_splits( + FragmentId::default(), + actor_splits, + &discovered_splits, + opts, + ) + .unwrap(); + + assert!(!diff.is_empty()) + } + + #[test] + fn test_reassign_splits() { + let actor_splits = HashMap::new(); + let discovered_splits: BTreeMap = BTreeMap::new(); + assert!(reassign_splits( + 
FragmentId::default(), + actor_splits, + &discovered_splits, + Default::default() + ) + .is_none()); + + let actor_splits = (0..3).map(|i| (i, vec![])).collect(); + let discovered_splits: BTreeMap = BTreeMap::new(); + let diff = reassign_splits( + FragmentId::default(), + actor_splits, + &discovered_splits, + Default::default(), + ) + .unwrap(); + assert_eq!(diff.len(), 3); + for splits in diff.values() { + assert!(splits.is_empty()) + } + + let actor_splits = (0..3).map(|i| (i, vec![])).collect(); + let discovered_splits: BTreeMap = (0..3) + .map(|i| { + let split = TestSplit { id: i }; + (split.id(), split) + }) + .collect(); + + let diff = reassign_splits( + FragmentId::default(), + actor_splits, + &discovered_splits, + Default::default(), + ) + .unwrap(); + assert_eq!(diff.len(), 3); + for splits in diff.values() { + assert_eq!(splits.len(), 1); + } + + check_all_splits(&discovered_splits, &diff); + + let actor_splits = (0..3).map(|i| (i, vec![TestSplit { id: i }])).collect(); + let discovered_splits: BTreeMap = (0..5) + .map(|i| { + let split = TestSplit { id: i }; + (split.id(), split) + }) + .collect(); + + let diff = reassign_splits( + FragmentId::default(), + actor_splits, + &discovered_splits, + Default::default(), + ) + .unwrap(); + assert_eq!(diff.len(), 3); + for splits in diff.values() { + let len = splits.len(); + assert!(len == 1 || len == 2); + } + + check_all_splits(&discovered_splits, &diff); + + let mut actor_splits: HashMap> = + (0..3).map(|i| (i, vec![TestSplit { id: i }])).collect(); + actor_splits.insert(3, vec![]); + actor_splits.insert(4, vec![]); + + let discovered_splits: BTreeMap = (0..5) + .map(|i| { + let split = TestSplit { id: i }; + (split.id(), split) + }) + .collect(); + + let diff = reassign_splits( + FragmentId::default(), + actor_splits, + &discovered_splits, + Default::default(), + ) + .unwrap(); + assert_eq!(diff.len(), 5); + for splits in diff.values() { + assert_eq!(splits.len(), 1); + } + + check_all_splits(&discovered_splits, &diff); + } +} diff --git a/src/meta/src/stream/source_manager/worker.rs b/src/meta/src/stream/source_manager/worker.rs new file mode 100644 index 0000000000000..6f6d647ffb78f --- /dev/null +++ b/src/meta/src/stream/source_manager/worker.rs @@ -0,0 +1,320 @@ +// Copyright 2025 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::*; + +const MAX_FAIL_CNT: u32 = 10; +const DEFAULT_SOURCE_TICK_TIMEOUT: Duration = Duration::from_secs(10); + +pub struct SharedSplitMap { + pub splits: Option>, +} + +type SharedSplitMapRef = Arc>; + +/// `ConnectorSourceWorker` keeps fetching the latest split metadata from the external source service ([`Self::tick`]), +/// and maintains it in `current_splits`. 
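The fetch-and-maintain behaviour described above boils down to a small loop pattern: wake on a fixed interval, but also serve on-demand refresh requests over a channel so callers can block until fresh metadata is available. A condensed sketch, assuming the `tokio` and `anyhow` crates and a placeholder `fetch_splits` in place of the real `SplitEnumerator::list_splits` call and cache update:

use std::time::Duration;
use tokio::sync::{mpsc, oneshot};
use tokio::time::MissedTickBehavior;

// Placeholder for the real enumerator call plus `current_splits` update.
async fn fetch_splits() -> anyhow::Result<()> {
    Ok(())
}

// Refresh split metadata on a fixed interval, but also serve on-demand
// "sync" requests so a caller can force a refresh and await its result.
async fn run_loop(mut sync_rx: mpsc::UnboundedReceiver<oneshot::Sender<anyhow::Result<()>>>) {
    let mut interval = tokio::time::interval(Duration::from_secs(30));
    interval.set_missed_tick_behavior(MissedTickBehavior::Skip);
    loop {
        tokio::select! {
            biased;
            Some(tx) = sync_rx.recv() => {
                let _ = tx.send(fetch_splits().await);
            }
            _ = interval.tick() => {
                if let Err(e) = fetch_splits().await {
                    eprintln!("periodic split discovery failed: {e:#}");
                }
            }
        }
    }
}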
+pub struct ConnectorSourceWorker { + source_id: SourceId, + source_name: String, + current_splits: SharedSplitMapRef, + enumerator: P::SplitEnumerator, + period: Duration, + metrics: Arc, + connector_properties: P, + fail_cnt: u32, + source_is_up: LabelGuardedIntGauge<2>, +} + +fn extract_prop_from_existing_source(source: &Source) -> ConnectorResult { + let options_with_secret = + WithOptionsSecResolved::new(source.with_properties.clone(), source.secret_refs.clone()); + let mut properties = ConnectorProperties::extract(options_with_secret, false)?; + properties.init_from_pb_source(source); + Ok(properties) +} +fn extract_prop_from_new_source(source: &Source) -> ConnectorResult { + let options_with_secret = + WithOptionsSecResolved::new(source.with_properties.clone(), source.secret_refs.clone()); + let mut properties = ConnectorProperties::extract(options_with_secret, true)?; + properties.init_from_pb_source(source); + Ok(properties) +} + +/// Used to create a new [`ConnectorSourceWorkerHandle`] for a new source. +/// +/// It will call [`ConnectorSourceWorker::tick()`] to fetch split metadata once before returning. +pub async fn create_source_worker( + source: &Source, + metrics: Arc, +) -> MetaResult { + tracing::info!("spawning new watcher for source {}", source.id); + + let splits = Arc::new(Mutex::new(SharedSplitMap { splits: None })); + let current_splits_ref = splits.clone(); + + let connector_properties = extract_prop_from_new_source(source)?; + let enable_scale_in = connector_properties.enable_drop_split(); + let enable_adaptive_splits = connector_properties.enable_adaptive_splits(); + let (sync_call_tx, sync_call_rx) = tokio::sync::mpsc::unbounded_channel(); + let handle = dispatch_source_prop!(connector_properties, prop, { + let mut worker = ConnectorSourceWorker::create( + source, + *prop, + DEFAULT_SOURCE_WORKER_TICK_INTERVAL, + current_splits_ref.clone(), + metrics, + ) + .await?; + + // if fail to fetch meta info, will refuse to create source + + // todo: make the timeout configurable, longer than `properties.sync.call.timeout` + // in kafka + tokio::time::timeout(DEFAULT_SOURCE_TICK_TIMEOUT, worker.tick()) + .await + .ok() + .with_context(|| { + format!( + "failed to fetch meta info for source {}, timeout {:?}", + source.id, DEFAULT_SOURCE_TICK_TIMEOUT + ) + })??; + + tokio::spawn(async move { worker.run(sync_call_rx).await }) + }); + Ok(ConnectorSourceWorkerHandle { + handle, + sync_call_tx, + splits, + enable_drop_split: enable_scale_in, + enable_adaptive_splits, + }) +} + +/// Used on startup ([`SourceManager::new`]). Failed sources will not block meta startup. 
+pub fn create_source_worker_async( + source: Source, + managed_sources: &mut HashMap, + metrics: Arc, +) -> MetaResult<()> { + tracing::info!("spawning new watcher for source {}", source.id); + + let splits = Arc::new(Mutex::new(SharedSplitMap { splits: None })); + let current_splits_ref = splits.clone(); + let source_id = source.id; + + let connector_properties = extract_prop_from_existing_source(&source)?; + + let enable_drop_split = connector_properties.enable_drop_split(); + let enable_adaptive_splits = connector_properties.enable_adaptive_splits(); + let (sync_call_tx, sync_call_rx) = tokio::sync::mpsc::unbounded_channel(); + let handle = tokio::spawn(async move { + let mut ticker = time::interval(DEFAULT_SOURCE_WORKER_TICK_INTERVAL); + ticker.set_missed_tick_behavior(MissedTickBehavior::Skip); + + dispatch_source_prop!(connector_properties, prop, { + let mut worker = loop { + ticker.tick().await; + + match ConnectorSourceWorker::create( + &source, + prop.deref().clone(), + DEFAULT_SOURCE_WORKER_TICK_INTERVAL, + current_splits_ref.clone(), + metrics.clone(), + ) + .await + { + Ok(worker) => { + break worker; + } + Err(e) => { + tracing::warn!(error = %e.as_report(), "failed to create source worker"); + } + } + }; + + worker.run(sync_call_rx).await + }); + }); + + managed_sources.insert( + source_id as SourceId, + ConnectorSourceWorkerHandle { + handle, + sync_call_tx, + splits, + enable_drop_split, + enable_adaptive_splits, + }, + ); + Ok(()) +} + +const DEFAULT_SOURCE_WORKER_TICK_INTERVAL: Duration = Duration::from_secs(30); + +impl ConnectorSourceWorker
<P: SourceProperties>
{ + /// Recreate the `SplitEnumerator` to establish a new connection to the external source service. + async fn refresh(&mut self) -> MetaResult<()> { + let enumerator = P::SplitEnumerator::new( + self.connector_properties.clone(), + Arc::new(SourceEnumeratorContext { + metrics: self.metrics.source_enumerator_metrics.clone(), + info: SourceEnumeratorInfo { + source_id: self.source_id as u32, + }, + }), + ) + .await + .context("failed to create SplitEnumerator")?; + self.enumerator = enumerator; + self.fail_cnt = 0; + tracing::info!("refreshed source enumerator: {}", self.source_name); + Ok(()) + } + + /// On creation, connection to the external source service will be established, but `splits` + /// will not be updated until `tick` is called. + pub async fn create( + source: &Source, + connector_properties: P, + period: Duration, + splits: Arc>, + metrics: Arc, + ) -> MetaResult { + let enumerator = P::SplitEnumerator::new( + connector_properties.clone(), + Arc::new(SourceEnumeratorContext { + metrics: metrics.source_enumerator_metrics.clone(), + info: SourceEnumeratorInfo { + source_id: source.id, + }, + }), + ) + .await + .context("failed to create SplitEnumerator")?; + + let source_is_up = metrics + .source_is_up + .with_guarded_label_values(&[source.id.to_string().as_str(), &source.name]); + + Ok(Self { + source_id: source.id as SourceId, + source_name: source.name.clone(), + current_splits: splits, + enumerator, + period, + metrics, + connector_properties, + fail_cnt: 0, + source_is_up, + }) + } + + pub async fn run( + &mut self, + mut sync_call_rx: UnboundedReceiver>>, + ) { + let mut interval = time::interval(self.period); + interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + loop { + select! { + biased; + tx = sync_call_rx.borrow_mut().recv() => { + if let Some(tx) = tx { + let _ = tx.send(self.tick().await); + } + } + _ = interval.tick() => { + if self.fail_cnt > MAX_FAIL_CNT { + if let Err(e) = self.refresh().await { + tracing::error!(error = %e.as_report(), "error happened when refresh from connector source worker"); + } + } + if let Err(e) = self.tick().await { + tracing::error!(error = %e.as_report(), "error happened when tick from connector source worker"); + } + } + } + } + } + + /// Uses [`SplitEnumerator`] to fetch the latest split metadata from the external source service. + async fn tick(&mut self) -> MetaResult<()> { + let source_is_up = |res: i64| { + self.source_is_up.set(res); + }; + let splits = self.enumerator.list_splits().await.inspect_err(|_| { + source_is_up(0); + self.fail_cnt += 1; + })?; + source_is_up(1); + self.fail_cnt = 0; + let mut current_splits = self.current_splits.lock().await; + current_splits.splits.replace( + splits + .into_iter() + .map(|split| (split.id(), P::Split::into(split))) + .collect(), + ); + + Ok(()) + } +} + +/// Handle for a running [`ConnectorSourceWorker`]. 
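One detail of the handle worth calling out is the adaptive-splits path of `discovered_splits`: connectors that support adaptive splits report a single template split, which is fanned out to one split per actor, with the actor id reused as the split id so the split count always matches the fragment's parallelism. A rough stand-alone illustration, using a hypothetical `fill_adaptive` with plain strings rather than `SplitImpl`:

use std::collections::{BTreeMap, HashSet};

// The connector reports one template split; clone it once per actor and use
// the actor id as the split id so the split count always matches the
// fragment's parallelism.
fn fill_adaptive(template: &str, actors: &HashSet<u32>) -> BTreeMap<String, String> {
    actors
        .iter()
        .map(|actor_id| (actor_id.to_string(), template.to_owned()))
        .collect()
}

Because the split id is derived from the actor id, rescheduling the fragment simply regenerates the map for the new actor set; no per-split state needs to migrate.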
+pub struct ConnectorSourceWorkerHandle { + pub handle: JoinHandle<()>, + pub sync_call_tx: UnboundedSender>>, + pub splits: SharedSplitMapRef, + pub enable_drop_split: bool, + pub enable_adaptive_splits: bool, +} + +impl ConnectorSourceWorkerHandle { + pub fn get_enable_adaptive_splits(&self) -> bool { + self.enable_adaptive_splits + } + + pub async fn discovered_splits( + &self, + source_id: SourceId, + actors: &HashSet, + ) -> MetaResult, SplitImpl>> { + let Some(mut discovered_splits) = self.splits.lock().await.splits.clone() else { + tracing::info!( + "The discover loop for source {} is not ready yet; we'll wait for the next run", + source_id + ); + return Ok(BTreeMap::new()); + }; + if discovered_splits.is_empty() { + tracing::warn!("No splits discovered for source {}", source_id); + } + + if self.enable_adaptive_splits { + // Connector supporting adaptive splits returns just one split, and we need to make the number of splits equal to the number of actors in this fragment. + // Because we Risingwave consume the splits statelessly and we do not need to keep the id internally, we always use actor_id as split_id. + // And prev splits record should be dropped via CN. + + debug_assert!(self.enable_drop_split); + debug_assert!(discovered_splits.len() == 1); + discovered_splits = + fill_adaptive_split(discovered_splits.values().next().unwrap(), actors)?; + } + + Ok(discovered_splits) + } +} From 9d7698545b3038befed43be7262d7d61132030b6 Mon Sep 17 00:00:00 2001 From: Li0k Date: Thu, 9 Jan 2025 13:27:31 +0800 Subject: [PATCH 09/21] refactor(storage): remove option of new log (#20055) --- src/meta/src/hummock/manager/checkpoint.rs | 2 +- src/storage/hummock_sdk/src/change_log.rs | 18 +++---- .../compaction_group/hummock_version_ext.rs | 2 +- .../hummock_sdk/src/frontend_version.rs | 48 +++++++++++-------- src/storage/hummock_sdk/src/time_travel.rs | 18 +++---- src/storage/hummock_sdk/src/version.rs | 6 +-- 6 files changed, 51 insertions(+), 43 deletions(-) diff --git a/src/meta/src/hummock/manager/checkpoint.rs b/src/meta/src/hummock/manager/checkpoint.rs index 5f9664b64a011..bb47248686129 100644 --- a/src/meta/src/hummock/manager/checkpoint.rs +++ b/src/meta/src/hummock/manager/checkpoint.rs @@ -179,7 +179,7 @@ impl HummockManager { .change_log_delta .values() .flat_map(|change_log| { - let new_log = change_log.new_log.as_ref().unwrap(); + let new_log = &change_log.new_log; new_log .new_value .iter() diff --git a/src/storage/hummock_sdk/src/change_log.rs b/src/storage/hummock_sdk/src/change_log.rs index 5a7bf0143c764..5a5c4a647ecbe 100644 --- a/src/storage/hummock_sdk/src/change_log.rs +++ b/src/storage/hummock_sdk/src/change_log.rs @@ -190,11 +190,11 @@ pub fn build_table_change_log_delta<'a>( TableId::new(table_id), ChangeLogDelta { truncate_epoch, - new_log: Some(EpochNewChangeLog { + new_log: EpochNewChangeLog { new_value: vec![], old_value: vec![], epochs: epochs.clone(), - }), + }, }, ) }) @@ -203,7 +203,7 @@ pub fn build_table_change_log_delta<'a>( for table_id in &sst.table_ids { match table_change_log.get_mut(&TableId::new(*table_id)) { Some(log) => { - log.new_log.as_mut().unwrap().old_value.push(sst.clone()); + log.new_log.old_value.push(sst.clone()); } None => { warn!(table_id, ?sst, "old value sst contains non-log-store table"); @@ -214,7 +214,7 @@ pub fn build_table_change_log_delta<'a>( for sst in new_value_ssts { for table_id in &sst.table_ids { if let Some(log) = table_change_log.get_mut(&TableId::new(*table_id)) { - log.new_log.as_mut().unwrap().new_value.push(sst.clone()); 
+ log.new_log.new_value.push(sst.clone()); } } } @@ -224,7 +224,7 @@ pub fn build_table_change_log_delta<'a>( #[derive(Debug, PartialEq, Clone)] pub struct ChangeLogDeltaCommon { pub truncate_epoch: u64, - pub new_log: Option>, + pub new_log: EpochNewChangeLogCommon, } pub type ChangeLogDelta = ChangeLogDeltaCommon; @@ -236,7 +236,7 @@ where fn from(val: &ChangeLogDeltaCommon) -> Self { Self { truncate_epoch: val.truncate_epoch, - new_log: val.new_log.as_ref().map(|a| a.into()), + new_log: Some((&val.new_log).into()), } } } @@ -248,7 +248,7 @@ where fn from(val: &PbChangeLogDelta) -> Self { Self { truncate_epoch: val.truncate_epoch, - new_log: val.new_log.as_ref().map(|a| a.into()), + new_log: val.new_log.as_ref().unwrap().into(), } } } @@ -260,7 +260,7 @@ where fn from(val: ChangeLogDeltaCommon) -> Self { Self { truncate_epoch: val.truncate_epoch, - new_log: val.new_log.map(|a| a.into()), + new_log: Some(val.new_log.into()), } } } @@ -272,7 +272,7 @@ where fn from(val: PbChangeLogDelta) -> Self { Self { truncate_epoch: val.truncate_epoch, - new_log: val.new_log.map(|a| a.into()), + new_log: val.new_log.unwrap().into(), } } } diff --git a/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs b/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs index ee316f75ffd65..eb4bb30e69dc3 100644 --- a/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs +++ b/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs @@ -693,7 +693,7 @@ impl HummockVersion { changed_table_info: &HashMap>, ) { for (table_id, change_log_delta) in change_log_delta { - let new_change_log = change_log_delta.new_log.as_ref().unwrap(); + let new_change_log = &change_log_delta.new_log; match table_change_log.entry(*table_id) { Entry::Occupied(entry) => { let change_log = entry.into_mut(); diff --git a/src/storage/hummock_sdk/src/frontend_version.rs b/src/storage/hummock_sdk/src/frontend_version.rs index 4840b402a292b..11eb045efbe22 100644 --- a/src/storage/hummock_sdk/src/frontend_version.rs +++ b/src/storage/hummock_sdk/src/frontend_version.rs @@ -151,14 +151,12 @@ impl FrontendHummockVersionDelta { *table_id, ChangeLogDeltaCommon { truncate_epoch: change_log_delta.truncate_epoch, - new_log: change_log_delta.new_log.as_ref().map(|new_log| { - EpochNewChangeLogCommon { - // Here we need to determine if value is null but don't care what the value is, so we fill him in using `()` - new_value: vec![(); new_log.new_value.len()], - old_value: vec![(); new_log.old_value.len()], - epochs: new_log.epochs.clone(), - } - }), + new_log: EpochNewChangeLogCommon { + // Here we need to determine if value is null but don't care what the value is, so we fill him in using `()` + new_value: vec![(); change_log_delta.new_log.new_value.len()], + old_value: vec![(); change_log_delta.new_log.old_value.len()], + epochs: change_log_delta.new_log.epochs.clone(), + }, }, ) }) @@ -187,11 +185,17 @@ impl FrontendHummockVersionDelta { ( table_id.table_id, PbChangeLogDelta { - new_log: delta.new_log.as_ref().map(|new_log| PbEpochNewChangeLog { + new_log: Some(PbEpochNewChangeLog { // Here we need to determine if value is null but don't care what the value is, so we fill him in using `PbSstableInfo::default()` - old_value: vec![PbSstableInfo::default(); new_log.old_value.len()], - new_value: vec![PbSstableInfo::default(); new_log.new_value.len()], - epochs: new_log.epochs.clone(), + old_value: vec![ + PbSstableInfo::default(); + delta.new_log.old_value.len() + ], + new_value: vec![ + 
PbSstableInfo::default(); + delta.new_log.new_value.len() + ], + epochs: delta.new_log.epochs.clone(), }), truncate_epoch: delta.truncate_epoch, }, @@ -228,14 +232,18 @@ impl FrontendHummockVersionDelta { TableId::new(*table_id), ChangeLogDeltaCommon { truncate_epoch: change_log_delta.truncate_epoch, - new_log: change_log_delta.new_log.as_ref().map(|new_log| { - EpochNewChangeLogCommon { - // Here we need to determine if value is null but don't care what the value is, so we fill him in using `()` - new_value: vec![(); new_log.new_value.len()], - old_value: vec![(); new_log.old_value.len()], - epochs: new_log.epochs.clone(), - } - }), + new_log: change_log_delta + .new_log + .as_ref() + .map(|new_log| { + EpochNewChangeLogCommon { + // Here we need to determine if value is null but don't care what the value is, so we fill him in using `()` + new_value: vec![(); new_log.new_value.len()], + old_value: vec![(); new_log.old_value.len()], + epochs: new_log.epochs.clone(), + } + }) + .unwrap(), }, ) }) diff --git a/src/storage/hummock_sdk/src/time_travel.rs b/src/storage/hummock_sdk/src/time_travel.rs index 0a651260b3789..9f628808909ff 100644 --- a/src/storage/hummock_sdk/src/time_travel.rs +++ b/src/storage/hummock_sdk/src/time_travel.rs @@ -191,15 +191,15 @@ impl From<(&HummockVersionDelta, &HashSet)> for IncompleteHummockV } debug_assert!(log_delta .new_log - .as_ref() - .map(|d| { - d.new_value.iter().chain(d.old_value.iter()).all(|s| { - s.table_ids - .iter() - .any(|tid| time_travel_table_ids.contains(tid)) - }) - }) - .unwrap_or(true)); + .new_value + .iter() + .chain(log_delta.new_log.old_value.iter()) + .all(|s| { + s.table_ids + .iter() + .any(|tid| time_travel_table_ids.contains(tid)) + })); + Some((*table_id, PbChangeLogDelta::from(log_delta).into())) }) .collect(), diff --git a/src/storage/hummock_sdk/src/version.rs b/src/storage/hummock_sdk/src/version.rs index 09e96860cc839..4c90e6cae47f1 100644 --- a/src/storage/hummock_sdk/src/version.rs +++ b/src/storage/hummock_sdk/src/version.rs @@ -570,7 +570,7 @@ where }) .chain(self.change_log_delta.values().flat_map(|delta| { // TODO: optimization: strip table change log - let new_log = delta.new_log.as_ref().unwrap(); + let new_log = &delta.new_log; new_log.new_value.iter().chain(new_log.old_value.iter()) })) } @@ -623,8 +623,8 @@ where ( TableId::new(*table_id), ChangeLogDeltaCommon { - new_log: log_delta.new_log.as_ref().map(Into::into), truncate_epoch: log_delta.truncate_epoch, + new_log: log_delta.new_log.as_ref().unwrap().into(), }, ) }) @@ -752,7 +752,7 @@ where ( TableId::new(*table_id), ChangeLogDeltaCommon { - new_log: log_delta.new_log.clone().map(Into::into), + new_log: log_delta.new_log.clone().unwrap().into(), truncate_epoch: log_delta.truncate_epoch, }, ) From 63b20efbfd126707e66d217e69ba6157124a6743 Mon Sep 17 00:00:00 2001 From: Richard Chien Date: Thu, 9 Jan 2025 13:40:48 +0800 Subject: [PATCH 10/21] fix(frontend): set `if_not_exists` to false when normalizing `CREATE VIEW` (#20073) Signed-off-by: Richard Chien --- src/frontend/src/handler/mod.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/frontend/src/handler/mod.rs b/src/frontend/src/handler/mod.rs index 446187335b06a..bd559b9245fc7 100644 --- a/src/frontend/src/handler/mod.rs +++ b/src/frontend/src/handler/mod.rs @@ -198,8 +198,13 @@ impl HandlerArgs { fn normalize_sql(stmt: &Statement) -> String { let mut stmt = stmt.clone(); match &mut stmt { - Statement::CreateView { or_replace, .. 
} => { + Statement::CreateView { + or_replace, + if_not_exists, + .. + } => { *or_replace = false; + *if_not_exists = false; } Statement::CreateTable { or_replace, From 14823be7a808a5fdb34df262dbed1c75e31d316f Mon Sep 17 00:00:00 2001 From: Dylan Date: Thu, 9 Jan 2025 13:55:42 +0800 Subject: [PATCH 11/21] fix: change iceberg glue catalog default implemantation to java (#20075) --- src/connector/src/connector_common/iceberg/mod.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/connector/src/connector_common/iceberg/mod.rs b/src/connector/src/connector_common/iceberg/mod.rs index ff53d80e89598..6964e03a5b99f 100644 --- a/src/connector/src/connector_common/iceberg/mod.rs +++ b/src/connector/src/connector_common/iceberg/mod.rs @@ -387,7 +387,7 @@ impl IcebergCommon { let catalog = iceberg_catalog_rest::RestCatalog::new(config); Ok(Arc::new(catalog)) } - "glue" => { + "glue_rust" => { let mut iceberg_configs = HashMap::new(); // glue if let Some(region) = &self.region { @@ -427,7 +427,10 @@ impl IcebergCommon { Ok(Arc::new(catalog)) } catalog_type - if catalog_type == "hive" || catalog_type == "jdbc" || catalog_type == "rest" => + if catalog_type == "hive" + || catalog_type == "jdbc" + || catalog_type == "rest" + || catalog_type == "glue" => { // Create java catalog let (file_io_props, java_catalog_props) = @@ -436,6 +439,7 @@ impl IcebergCommon { "hive" => "org.apache.iceberg.hive.HiveCatalog", "jdbc" => "org.apache.iceberg.jdbc.JdbcCatalog", "rest" => "org.apache.iceberg.rest.RESTCatalog", + "glue" => "org.apache.iceberg.aws.glue.GlueCatalog", _ => unreachable!(), }; From 934db16dbf71dd5ba1fbd69142ec0f0d3737c183 Mon Sep 17 00:00:00 2001 From: Xinhao Xu <84456268+xxhZs@users.noreply.github.com> Date: Thu, 9 Jan 2025 14:49:25 +0800 Subject: [PATCH 12/21] feat(sink): support force_append_only for es sink (#19919) --- .../sink/elasticsearch/elasticsearch_sink.slt | 3 +++ .../elasticsearch_opensearch/elasticsearch.rs | 3 +++ .../elasticsearch_converter.rs | 15 +++++++++---- .../elasticsearch_opensearch_client.rs | 5 ++++- .../elasticsearch_opensearch_config.rs | 2 ++ .../elasticsearch_opensearch_formatter.rs | 21 +++++++++++++++++-- .../elasticsearch_opensearch/opensearch.rs | 3 +++ src/connector/src/sink/remote.rs | 1 + src/connector/with_options_sink.yaml | 3 +++ 9 files changed, 49 insertions(+), 7 deletions(-) diff --git a/e2e_test/sink/elasticsearch/elasticsearch_sink.slt b/e2e_test/sink/elasticsearch/elasticsearch_sink.slt index 842e3f3303438..953bc694d744a 100644 --- a/e2e_test/sink/elasticsearch/elasticsearch_sink.slt +++ b/e2e_test/sink/elasticsearch/elasticsearch_sink.slt @@ -22,6 +22,7 @@ CREATE TABLE test_route ( statement ok CREATE SINK test_route_sink from test_route WITH ( + type = 'upsert', connector = 'elasticsearch', index = 'test_route', url = 'http://elasticsearch:9200', @@ -32,6 +33,7 @@ CREATE SINK test_route_sink from test_route WITH ( statement ok CREATE SINK s7 from t7 WITH ( + type = 'upsert', connector = 'elasticsearch', index = 'test', url = 'http://elasticsearch:9200', @@ -41,6 +43,7 @@ CREATE SINK s7 from t7 WITH ( statement ok CREATE SINK s8 from t7 WITH ( + type = 'upsert', connector = 'elasticsearch', index = 'test1', primary_key = 'v1,v3', diff --git a/src/connector/src/sink/elasticsearch_opensearch/elasticsearch.rs b/src/connector/src/sink/elasticsearch_opensearch/elasticsearch.rs index 1f0ac754fcad2..fd4cbed31698d 100644 --- a/src/connector/src/sink/elasticsearch_opensearch/elasticsearch.rs +++ 
b/src/connector/src/sink/elasticsearch_opensearch/elasticsearch.rs @@ -28,6 +28,7 @@ pub struct ElasticSearchSink { config: ElasticSearchOpenSearchConfig, schema: Schema, pk_indices: Vec, + is_append_only: bool, } #[async_trait] @@ -41,6 +42,7 @@ impl TryFrom for ElasticSearchSink { config, schema, pk_indices: param.downstream_pk, + is_append_only: param.sink_type.is_append_only(), }) } } @@ -64,6 +66,7 @@ impl Sink for ElasticSearchSink { self.schema.clone(), self.pk_indices.clone(), Self::SINK_NAME, + self.is_append_only, )? .into_log_sinker(self.config.concurrent_requests)) } diff --git a/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_converter.rs b/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_converter.rs index dc19c99261aa7..ba2571ff2a130 100644 --- a/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_converter.rs +++ b/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_converter.rs @@ -39,6 +39,7 @@ impl StreamChunkConverter { schema: Schema, pk_indices: &Vec, properties: &BTreeMap, + is_append_only: bool, ) -> Result { if is_remote_es_sink(sink_name) { let index_column = properties @@ -71,6 +72,7 @@ impl StreamChunkConverter { index_column, index, routing_column, + is_append_only, )?)) } else { Ok(StreamChunkConverter::Other) @@ -79,13 +81,14 @@ impl StreamChunkConverter { pub fn convert_chunk(&self, chunk: StreamChunk) -> Result { match self { - StreamChunkConverter::Es(es) => es.convert_chunk(chunk), + StreamChunkConverter::Es(es) => es.convert_chunk(chunk, es.is_append_only), StreamChunkConverter::Other => Ok(chunk), } } } pub struct EsStreamChunkConverter { formatter: ElasticSearchOpenSearchFormatter, + is_append_only: bool, } impl EsStreamChunkConverter { pub fn new( @@ -95,6 +98,7 @@ impl EsStreamChunkConverter { index_column: Option, index: Option, routing_column: Option, + is_append_only: bool, ) -> Result { let formatter = ElasticSearchOpenSearchFormatter::new( pk_indices, @@ -104,10 +108,13 @@ impl EsStreamChunkConverter { index, routing_column, )?; - Ok(Self { formatter }) + Ok(Self { + formatter, + is_append_only, + }) } - fn convert_chunk(&self, chunk: StreamChunk) -> Result { + fn convert_chunk(&self, chunk: StreamChunk, is_append_only: bool) -> Result { let mut ops = Vec::with_capacity(chunk.capacity()); let mut id_string_builder = ::new(chunk.capacity()); @@ -117,7 +124,7 @@ impl EsStreamChunkConverter { ::new(chunk.capacity()); let mut routing_builder = ::new(chunk.capacity()); - for build_bulk_para in self.formatter.convert_chunk(chunk)? { + for build_bulk_para in self.formatter.convert_chunk(chunk, is_append_only)? 
{ let BuildBulkPara { key, value, diff --git a/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_opensearch_client.rs b/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_opensearch_client.rs index 5369fcdee6aab..eb02d1310b650 100644 --- a/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_opensearch_client.rs +++ b/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_opensearch_client.rs @@ -160,6 +160,7 @@ pub struct ElasticSearchOpenSearchSinkWriter { client: Arc, formatter: ElasticSearchOpenSearchFormatter, config: ElasticSearchOpenSearchConfig, + is_append_only: bool, } impl ElasticSearchOpenSearchSinkWriter { @@ -168,6 +169,7 @@ impl ElasticSearchOpenSearchSinkWriter { schema: Schema, pk_indices: Vec, connector: &str, + is_append_only: bool, ) -> Result { let client = Arc::new(config.build_client(connector)?); let formatter = ElasticSearchOpenSearchFormatter::new( @@ -182,6 +184,7 @@ impl ElasticSearchOpenSearchSinkWriter { client, formatter, config, + is_append_only, }) } } @@ -202,7 +205,7 @@ impl AsyncTruncateSinkWriter for ElasticSearchOpenSearchSinkWriter { let mut bulks: Vec = Vec::with_capacity(chunk_capacity); let mut bulks_size = 0; - for build_bulk_para in self.formatter.convert_chunk(chunk)? { + for build_bulk_para in self.formatter.convert_chunk(chunk, self.is_append_only)? { let BuildBulkPara { key, value, diff --git a/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_opensearch_config.rs b/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_opensearch_config.rs index e2907301d78a9..24f79dd84030b 100644 --- a/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_opensearch_config.rs +++ b/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_opensearch_config.rs @@ -77,6 +77,8 @@ pub struct ElasticSearchOpenSearchConfig { #[serde_as(as = "DisplayFromStr")] #[serde(default = "default_concurrent_requests")] pub concurrent_requests: usize, + + pub r#type: String, } fn default_retry_on_conflict() -> i32 { diff --git a/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_opensearch_formatter.rs b/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_opensearch_formatter.rs index 660e26d578834..3cfda0601b617 100644 --- a/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_opensearch_formatter.rs +++ b/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_opensearch_formatter.rs @@ -113,7 +113,11 @@ impl ElasticSearchOpenSearchFormatter { }) } - pub fn convert_chunk(&self, chunk: StreamChunk) -> Result> { + pub fn convert_chunk( + &self, + chunk: StreamChunk, + is_append_only: bool, + ) -> Result> { let mut result_vec = Vec::with_capacity(chunk.capacity()); for (op, rows) in chunk.rows() { let index = if let Some(index_column) = self.index_column { @@ -157,6 +161,11 @@ impl ElasticSearchOpenSearchFormatter { }); } Op::Delete => { + if is_append_only { + return Err(SinkError::ElasticSearchOpenSearch(anyhow!( + "`Delete` operation is not supported in `append_only` mode" + ))); + } let key = self.key_encoder.encode(rows)?; let mem_size_b = std::mem::size_of_val(&key); result_vec.push(BuildBulkPara { @@ -167,7 +176,15 @@ impl ElasticSearchOpenSearchFormatter { routing_column, }); } - Op::UpdateDelete => continue, + Op::UpdateDelete => { + if is_append_only { + return Err(SinkError::ElasticSearchOpenSearch(anyhow!( + "`UpdateDelete` operation is not supported in `append_only` mode" + ))); + } else { + continue; + } + } } } Ok(result_vec) diff --git 
a/src/connector/src/sink/elasticsearch_opensearch/opensearch.rs b/src/connector/src/sink/elasticsearch_opensearch/opensearch.rs index b0df3c8ca2345..bf9b79f6a8850 100644 --- a/src/connector/src/sink/elasticsearch_opensearch/opensearch.rs +++ b/src/connector/src/sink/elasticsearch_opensearch/opensearch.rs @@ -30,6 +30,7 @@ pub struct OpenSearchSink { config: ElasticSearchOpenSearchConfig, schema: Schema, pk_indices: Vec, + is_append_only: bool, } #[async_trait] @@ -43,6 +44,7 @@ impl TryFrom for OpenSearchSink { config, schema, pk_indices: param.downstream_pk, + is_append_only: param.sink_type.is_append_only(), }) } } @@ -69,6 +71,7 @@ impl Sink for OpenSearchSink { self.schema.clone(), self.pk_indices.clone(), Self::SINK_NAME, + self.is_append_only, )? .into_log_sinker(self.config.concurrent_requests)) } diff --git a/src/connector/src/sink/remote.rs b/src/connector/src/sink/remote.rs index 26683966f566b..582a5c861bb5c 100644 --- a/src/connector/src/sink/remote.rs +++ b/src/connector/src/sink/remote.rs @@ -301,6 +301,7 @@ impl RemoteLogSinker { sink_param.schema(), &sink_param.downstream_pk, &sink_param.properties, + sink_param.sink_type.is_append_only(), )?, }) } diff --git a/src/connector/with_options_sink.yaml b/src/connector/with_options_sink.yaml index d7291c8ecc61d..819e597fec448 100644 --- a/src/connector/with_options_sink.yaml +++ b/src/connector/with_options_sink.yaml @@ -331,6 +331,9 @@ ElasticSearchOpenSearchConfig: - name: concurrent_requests field_type: usize required: true + - name: r#type + field_type: String + required: true FsConfig: fields: - name: fs.path From 09e59e43dd7630f02664af8f8fb6672554351530 Mon Sep 17 00:00:00 2001 From: Richard Chien Date: Thu, 9 Jan 2025 16:57:02 +0800 Subject: [PATCH 13/21] feat(udf): support `CREATE FUNCTION/AGGREGATE IF NOT EXISTS` (#20079) Signed-off-by: Richard Chien --- e2e_test/udf/create_and_drop.slt | 115 ++++++++++++++++++ e2e_test/udf/drop_function.slt | 52 -------- src/frontend/src/handler/create_aggregate.rs | 23 ++-- src/frontend/src/handler/create_function.rs | 23 ++-- .../src/handler/create_sql_function.rs | 23 ++-- src/frontend/src/handler/mod.rs | 5 + src/frontend/src/session.rs | 39 ++++++ src/sqlparser/src/ast/mod.rs | 16 ++- src/sqlparser/src/parser.rs | 6 + src/sqlparser/tests/sqlparser_postgres.rs | 57 ++++++++- 10 files changed, 265 insertions(+), 94 deletions(-) create mode 100644 e2e_test/udf/create_and_drop.slt delete mode 100644 e2e_test/udf/drop_function.slt diff --git a/e2e_test/udf/create_and_drop.slt b/e2e_test/udf/create_and_drop.slt new file mode 100644 index 0000000000000..7b31dba16fdbd --- /dev/null +++ b/e2e_test/udf/create_and_drop.slt @@ -0,0 +1,115 @@ +# https://github.com/risingwavelabs/risingwave/issues/17263 + +statement ok +create table t (a int, b int); + +statement ok +create function add(a int, b int) returns int language python as $$ +def add(a, b): + return a+b +$$; + +statement error function with name add\(integer,integer\) exists +create function add(int, int) returns int language sql as $$select $1 + $2$$; + +statement ok +create function if not exists add(int, int) returns int language sql as $$select $1 + $2$$; + +statement ok +create function add_v2(int, int) returns int language sql as $$select $1 + $2$$; + +statement ok +create aggregate mysum(value int) returns int language python as $$ +def create_state(): + return 0 +def accumulate(state, value): + return state + value +def finish(state): + return state +$$; + +statement error function with name mysum\(integer\) exists +create 
aggregate mysum(value int) returns int language python as $$ +def create_state(): + return 0 +def accumulate(state, value): + return state + value +def finish(state): + return state +$$; + +statement ok +create aggregate if not exists mysum(value int) returns int language python as $$ +def create_state(): + return 0 +def accumulate(state, value): + return state + value +def finish(state): + return state +$$; + +statement ok +create materialized view mv as select add(a, b) + add_v2(a, b) as c from t; + +statement ok +create materialized view mv2 as select mysum(a) as s from t; + +statement error function used by 1 other objects +drop function add; + +statement error function used by 1 other objects +drop function if exists add; + +statement error function used by 1 other objects +drop function add_v2; + +statement error function used by 1 other objects +drop aggregate mysum; + +statement ok +drop materialized view mv; + +statement ok +drop materialized view mv2; + +statement ok +drop function add; + +statement error function not found +drop function add; + +statement ok +drop function if exists add; + +statement ok +drop function add_v2; + +statement ok +drop function if exists add_v2; + +statement ok +drop aggregate mysum; + +statement ok +drop aggregate if exists mysum; + +statement ok +create function add(a int, b int) returns int language python as $$ +def add(a, b): + return a+b +$$; + +statement ok +create sink s as select add(a, b) as c from t with (connector = 'blackhole'); + +statement error function used by 1 other objects +drop function add; + +statement ok +drop sink s; + +statement ok +drop function add; + +statement ok +drop table t; diff --git a/e2e_test/udf/drop_function.slt b/e2e_test/udf/drop_function.slt deleted file mode 100644 index ffe4e0eea481f..0000000000000 --- a/e2e_test/udf/drop_function.slt +++ /dev/null @@ -1,52 +0,0 @@ -# https://github.com/risingwavelabs/risingwave/issues/17263 - -statement ok -create table t (a int, b int); - -statement ok -create function add(a int, b int) returns int language python as $$ -def add(a, b): - return a+b -$$; - -statement ok -create function add_v2(INT, INT) returns int language sql as $$select $1 + $2$$; - -statement ok -create materialized view mv as select add(a, b) + add_v2(a, b) as c from t; - -statement error function used by 1 other objects -drop function add; - -statement error function used by 1 other objects -drop function add_v2; - -statement ok -drop materialized view mv; - -statement ok -drop function add; - -statement ok -drop function add_v2; - -statement ok -create function add(a int, b int) returns int language python as $$ -def add(a, b): - return a+b -$$; - -statement ok -create sink s as select add(a, b) as c from t with (connector = 'blackhole'); - -statement error function used by 1 other objects -drop function add; - -statement ok -drop sink s; - -statement ok -drop function add; - -statement ok -drop table t; diff --git a/src/frontend/src/handler/create_aggregate.rs b/src/frontend/src/handler/create_aggregate.rs index 32f326db9b1d9..85ba343fef408 100644 --- a/src/frontend/src/handler/create_aggregate.rs +++ b/src/frontend/src/handler/create_aggregate.rs @@ -13,6 +13,7 @@ // limitations under the License. 
use anyhow::Context; +use either::Either; use risingwave_common::catalog::FunctionId; use risingwave_expr::sig::{CreateFunctionOptions, UdfKind}; use risingwave_pb::catalog::function::{AggregateFunction, Kind}; @@ -20,12 +21,12 @@ use risingwave_pb::catalog::Function; use risingwave_sqlparser::ast::DataType as AstDataType; use super::*; -use crate::catalog::CatalogError; use crate::{bind_data_type, Binder}; pub async fn handle_create_aggregate( handler_args: HandlerArgs, or_replace: bool, + if_not_exists: bool, name: ObjectName, args: Vec, returns: AstDataType, @@ -74,20 +75,18 @@ pub async fn handle_create_aggregate( // resolve database and schema id let session = &handler_args.session; let db_name = &session.database(); - let (schema_name, function_name) = Binder::resolve_schema_qualified_name(db_name, name)?; + let (schema_name, function_name) = + Binder::resolve_schema_qualified_name(db_name, name.clone())?; let (database_id, schema_id) = session.get_database_and_schema_id_for_create(schema_name)?; // check if the function exists in the catalog - if (session.env().catalog_reader().read_guard()) - .get_schema_by_id(&database_id, &schema_id)? - .get_function_by_name_args(&function_name, &arg_types) - .is_some() - { - let name = format!( - "{function_name}({})", - arg_types.iter().map(|t| t.to_string()).join(",") - ); - return Err(CatalogError::Duplicated("function", name).into()); + if let Either::Right(resp) = session.check_function_name_duplicated( + StatementType::CREATE_FUNCTION, + name, + &arg_types, + if_not_exists, + )? { + return Ok(resp); } let link = match ¶ms.using { diff --git a/src/frontend/src/handler/create_function.rs b/src/frontend/src/handler/create_function.rs index b87d3c90a3488..c212eaebb56f4 100644 --- a/src/frontend/src/handler/create_function.rs +++ b/src/frontend/src/handler/create_function.rs @@ -13,6 +13,7 @@ // limitations under the License. use anyhow::Context; +use either::Either; use risingwave_common::catalog::FunctionId; use risingwave_common::types::StructType; use risingwave_expr::sig::{CreateFunctionOptions, UdfKind}; @@ -20,13 +21,13 @@ use risingwave_pb::catalog::function::{Kind, ScalarFunction, TableFunction}; use risingwave_pb::catalog::Function; use super::*; -use crate::catalog::CatalogError; use crate::{bind_data_type, Binder}; pub async fn handle_create_function( handler_args: HandlerArgs, or_replace: bool, temporary: bool, + if_not_exists: bool, name: ObjectName, args: Option>, returns: Option, @@ -107,20 +108,18 @@ pub async fn handle_create_function( // resolve database and schema id let session = &handler_args.session; let db_name = &session.database(); - let (schema_name, function_name) = Binder::resolve_schema_qualified_name(db_name, name)?; + let (schema_name, function_name) = + Binder::resolve_schema_qualified_name(db_name, name.clone())?; let (database_id, schema_id) = session.get_database_and_schema_id_for_create(schema_name)?; // check if the function exists in the catalog - if (session.env().catalog_reader().read_guard()) - .get_schema_by_id(&database_id, &schema_id)? - .get_function_by_name_args(&function_name, &arg_types) - .is_some() - { - let name = format!( - "{function_name}({})", - arg_types.iter().map(|t| t.to_string()).join(",") - ); - return Err(CatalogError::Duplicated("function", name).into()); + if let Either::Right(resp) = session.check_function_name_duplicated( + StatementType::CREATE_FUNCTION, + name, + &arg_types, + if_not_exists, + )? 
{ + return Ok(resp); } let link = match ¶ms.using { diff --git a/src/frontend/src/handler/create_sql_function.rs b/src/frontend/src/handler/create_sql_function.rs index 4725b37ab6511..71a31ce5173cc 100644 --- a/src/frontend/src/handler/create_sql_function.rs +++ b/src/frontend/src/handler/create_sql_function.rs @@ -14,6 +14,7 @@ use std::collections::HashMap; +use either::Either; use fancy_regex::Regex; use risingwave_common::catalog::FunctionId; use risingwave_common::types::{DataType, StructType}; @@ -23,7 +24,6 @@ use risingwave_sqlparser::parser::{Parser, ParserError}; use super::*; use crate::binder::UdfContext; -use crate::catalog::CatalogError; use crate::expr::{Expr, ExprImpl, Literal}; use crate::{bind_data_type, Binder}; @@ -122,6 +122,7 @@ pub async fn handle_create_sql_function( handler_args: HandlerArgs, or_replace: bool, temporary: bool, + if_not_exists: bool, name: ObjectName, args: Option>, returns: Option, @@ -214,20 +215,18 @@ pub async fn handle_create_sql_function( // resolve database and schema id let session = &handler_args.session; let db_name = &session.database(); - let (schema_name, function_name) = Binder::resolve_schema_qualified_name(db_name, name)?; + let (schema_name, function_name) = + Binder::resolve_schema_qualified_name(db_name, name.clone())?; let (database_id, schema_id) = session.get_database_and_schema_id_for_create(schema_name)?; // check if function exists - if (session.env().catalog_reader().read_guard()) - .get_schema_by_id(&database_id, &schema_id)? - .get_function_by_name_args(&function_name, &arg_types) - .is_some() - { - let name = format!( - "{function_name}({})", - arg_types.iter().map(|t| t.to_string()).join(",") - ); - return Err(CatalogError::Duplicated("function", name).into()); + if let Either::Right(resp) = session.check_function_name_duplicated( + StatementType::CREATE_FUNCTION, + name, + &arg_types, + if_not_exists, + )? 
{ + return Ok(resp); } // Parse function body here diff --git a/src/frontend/src/handler/mod.rs b/src/frontend/src/handler/mod.rs index bd559b9245fc7..4245e66c3034a 100644 --- a/src/frontend/src/handler/mod.rs +++ b/src/frontend/src/handler/mod.rs @@ -278,6 +278,7 @@ pub async fn handle( Statement::CreateFunction { or_replace, temporary, + if_not_exists, name, args, returns, @@ -298,6 +299,7 @@ pub async fn handle( handler_args, or_replace, temporary, + if_not_exists, name, args, returns, @@ -310,6 +312,7 @@ pub async fn handle( handler_args, or_replace, temporary, + if_not_exists, name, args, returns, @@ -320,6 +323,7 @@ pub async fn handle( } Statement::CreateAggregate { or_replace, + if_not_exists, name, args, returns, @@ -329,6 +333,7 @@ pub async fn handle( create_aggregate::handle_create_aggregate( handler_args, or_replace, + if_not_exists, name, args, returns, diff --git a/src/frontend/src/session.rs b/src/frontend/src/session.rs index fe03af7c0c786..7f99d91d6e0d5 100644 --- a/src/frontend/src/session.rs +++ b/src/frontend/src/session.rs @@ -22,6 +22,7 @@ use std::time::{Duration, Instant}; use anyhow::anyhow; use bytes::Bytes; use either::Either; +use itertools::Itertools; use parking_lot::{Mutex, RwLock, RwLockReadGuard}; use pgwire::error::{PsqlError, PsqlResult}; use pgwire::net::{Address, AddressRef}; @@ -961,6 +962,44 @@ impl SessionImpl { .map_err(RwError::from) } + pub fn check_function_name_duplicated( + &self, + stmt_type: StatementType, + name: ObjectName, + arg_types: &[DataType], + if_not_exists: bool, + ) -> Result> { + let db_name = &self.database(); + let (schema_name, function_name) = Binder::resolve_schema_qualified_name(db_name, name)?; + let (database_id, schema_id) = self.get_database_and_schema_id_for_create(schema_name)?; + + let catalog_reader = self.env().catalog_reader().read_guard(); + if catalog_reader + .get_schema_by_id(&database_id, &schema_id)? + .get_function_by_name_args(&function_name, arg_types) + .is_some() + { + let full_name = format!( + "{function_name}({})", + arg_types.iter().map(|t| t.to_string()).join(",") + ); + if if_not_exists { + Ok(Either::Right( + PgResponse::builder(stmt_type) + .notice(format!( + "function \"{}\" already exists, skipping", + full_name + )) + .into(), + )) + } else { + Err(CatalogError::Duplicated("function", full_name).into()) + } + } else { + Ok(Either::Left(())) + } + } + /// Also check if the user has the privilege to create in the schema. 
pub fn get_database_and_schema_id_for_create( &self, diff --git a/src/sqlparser/src/ast/mod.rs b/src/sqlparser/src/ast/mod.rs index 16496b71c97eb..98f8599ccf89a 100644 --- a/src/sqlparser/src/ast/mod.rs +++ b/src/sqlparser/src/ast/mod.rs @@ -1349,6 +1349,7 @@ pub enum Statement { CreateFunction { or_replace: bool, temporary: bool, + if_not_exists: bool, name: ObjectName, args: Option>, returns: Option, @@ -1361,6 +1362,7 @@ pub enum Statement { /// Postgres: CreateAggregate { or_replace: bool, + if_not_exists: bool, name: ObjectName, args: Vec, returns: DataType, @@ -1768,6 +1770,7 @@ impl fmt::Display for Statement { Statement::CreateFunction { or_replace, temporary, + if_not_exists, name, args, returns, @@ -1776,9 +1779,10 @@ impl fmt::Display for Statement { } => { write!( f, - "CREATE {or_replace}{temp}FUNCTION {name}", + "CREATE {or_replace}{temp}FUNCTION {if_not_exists}{name}", temp = if *temporary { "TEMPORARY " } else { "" }, or_replace = if *or_replace { "OR REPLACE " } else { "" }, + if_not_exists = if *if_not_exists { "IF NOT EXISTS " } else { "" }, )?; if let Some(args) = args { write!(f, "({})", display_comma_separated(args))?; @@ -1792,6 +1796,7 @@ impl fmt::Display for Statement { } Statement::CreateAggregate { or_replace, + if_not_exists, name, args, returns, @@ -1800,8 +1805,9 @@ impl fmt::Display for Statement { } => { write!( f, - "CREATE {or_replace}AGGREGATE {name}", + "CREATE {or_replace}AGGREGATE {if_not_exists}{name}", or_replace = if *or_replace { "OR REPLACE " } else { "" }, + if_not_exists = if *if_not_exists { "IF NOT EXISTS " } else { "" }, )?; write!(f, "({})", display_comma_separated(args))?; write!(f, " RETURNS {}", returns)?; @@ -3551,8 +3557,9 @@ mod tests { #[test] fn test_create_function_display() { let create_function = Statement::CreateFunction { - temporary: false, or_replace: false, + temporary: false, + if_not_exists: false, name: ObjectName(vec![Ident::new_unchecked("foo")]), args: Some(vec![OperateFunctionArg::unnamed(DataType::Int)]), returns: Some(CreateFunctionReturns::Value(DataType::Int)), @@ -3573,8 +3580,9 @@ mod tests { format!("{}", create_function) ); let create_function = Statement::CreateFunction { - temporary: false, or_replace: false, + temporary: false, + if_not_exists: false, name: ObjectName(vec![Ident::new_unchecked("foo")]), args: Some(vec![OperateFunctionArg::unnamed(DataType::Int)]), returns: Some(CreateFunctionReturns::Value(DataType::Int)), diff --git a/src/sqlparser/src/parser.rs b/src/sqlparser/src/parser.rs index 9eb3d9e439967..2df0183cf5c5a 100644 --- a/src/sqlparser/src/parser.rs +++ b/src/sqlparser/src/parser.rs @@ -2210,6 +2210,8 @@ impl Parser<'_> { or_replace: bool, temporary: bool, ) -> PResult { + impl_parse_to!(if_not_exists => [Keyword::IF, Keyword::NOT, Keyword::EXISTS], self); + let name = self.parse_object_name()?; self.expect_token(&Token::LParen)?; let args = if self.peek_token().token == Token::RParen { @@ -2248,6 +2250,7 @@ impl Parser<'_> { Ok(Statement::CreateFunction { or_replace, temporary, + if_not_exists, name, args, returns: return_type, @@ -2257,6 +2260,8 @@ impl Parser<'_> { } fn parse_create_aggregate(&mut self, or_replace: bool) -> PResult { + impl_parse_to!(if_not_exists => [Keyword::IF, Keyword::NOT, Keyword::EXISTS], self); + let name = self.parse_object_name()?; self.expect_token(&Token::LParen)?; let args = self.parse_comma_separated(Parser::parse_function_arg)?; @@ -2270,6 +2275,7 @@ impl Parser<'_> { Ok(Statement::CreateAggregate { or_replace, + if_not_exists, name, args, returns, diff 
--git a/src/sqlparser/tests/sqlparser_postgres.rs b/src/sqlparser/tests/sqlparser_postgres.rs index 7acf6d29b4444..1466a9024a6d5 100644 --- a/src/sqlparser/tests/sqlparser_postgres.rs +++ b/src/sqlparser/tests/sqlparser_postgres.rs @@ -753,6 +753,7 @@ fn parse_create_function() { Statement::CreateFunction { or_replace: false, temporary: false, + if_not_exists: false, name: ObjectName(vec![Ident::new_unchecked("add")]), args: Some(vec![ OperateFunctionArg::unnamed(DataType::Int), @@ -777,6 +778,7 @@ fn parse_create_function() { Statement::CreateFunction { or_replace: false, temporary: false, + if_not_exists: false, name: ObjectName(vec![Ident::new_unchecked("sub")]), args: Some(vec![ OperateFunctionArg::unnamed(DataType::Int), @@ -801,6 +803,7 @@ fn parse_create_function() { Statement::CreateFunction { or_replace: false, temporary: false, + if_not_exists: false, name: ObjectName(vec![Ident::new_unchecked("return_test")]), args: Some(vec![ OperateFunctionArg::unnamed(DataType::Int), @@ -826,6 +829,7 @@ fn parse_create_function() { Statement::CreateFunction { or_replace: true, temporary: false, + if_not_exists: false, name: ObjectName(vec![Ident::new_unchecked("add")]), args: Some(vec![ OperateFunctionArg::with_name("a", DataType::Int), @@ -851,12 +855,14 @@ fn parse_create_function() { } ); - let sql = "CREATE FUNCTION unnest(a INT[]) RETURNS TABLE (x INT) LANGUAGE SQL RETURN a"; + let sql = + "CREATE TEMPORARY FUNCTION unnest(a INT[]) RETURNS TABLE (x INT) LANGUAGE SQL RETURN a"; assert_eq!( verified_stmt(sql), Statement::CreateFunction { or_replace: false, - temporary: false, + temporary: true, + if_not_exists: false, name: ObjectName(vec![Ident::new_unchecked("unnest")]), args: Some(vec![OperateFunctionArg::with_name( "a", @@ -874,6 +880,32 @@ fn parse_create_function() { with_options: Default::default(), } ); + + let sql = + "CREATE FUNCTION IF NOT EXISTS add(INT, INT) RETURNS INT LANGUAGE SQL IMMUTABLE AS 'select $1 + $2;'"; + assert_eq!( + verified_stmt(sql), + Statement::CreateFunction { + or_replace: false, + temporary: false, + if_not_exists: true, + name: ObjectName(vec![Ident::new_unchecked("add")]), + args: Some(vec![ + OperateFunctionArg::unnamed(DataType::Int), + OperateFunctionArg::unnamed(DataType::Int), + ]), + returns: Some(CreateFunctionReturns::Value(DataType::Int)), + params: CreateFunctionBody { + language: Some("SQL".into()), + behavior: Some(FunctionBehavior::Immutable), + as_: Some(FunctionDefinition::SingleQuotedDef( + "select $1 + $2;".into() + )), + ..Default::default() + }, + with_options: Default::default(), + } + ); } #[test] @@ -884,6 +916,27 @@ fn parse_create_aggregate() { verified_stmt(sql), Statement::CreateAggregate { or_replace: true, + if_not_exists: false, + name: ObjectName(vec![Ident::new_unchecked("sum")]), + args: vec![OperateFunctionArg::unnamed(DataType::Int)], + returns: DataType::BigInt, + append_only: true, + params: CreateFunctionBody { + language: Some("python".into()), + as_: Some(FunctionDefinition::SingleQuotedDef("sum".into())), + using: Some(CreateFunctionUsing::Link("xxx".into())), + ..Default::default() + }, + } + ); + + let sql = + "CREATE AGGREGATE IF NOT EXISTS sum(INT) RETURNS BIGINT APPEND ONLY LANGUAGE python AS 'sum' USING LINK 'xxx'"; + assert_eq!( + verified_stmt(sql), + Statement::CreateAggregate { + or_replace: false, + if_not_exists: true, name: ObjectName(vec![Ident::new_unchecked("sum")]), args: vec![OperateFunctionArg::unnamed(DataType::Int)], returns: DataType::BigInt, From 3619d775a56a9b8eb74fe5e0c9323c4cf10d76a3 Mon 
Sep 17 00:00:00 2001 From: Dylan Date: Thu, 9 Jan 2025 17:58:55 +0800 Subject: [PATCH 14/21] fix(iceberg): fix iceberg time travel timezone (#20085) --- src/frontend/src/handler/explain.rs | 1 + src/frontend/src/handler/query.rs | 1 + .../src/scheduler/distributed/query.rs | 1 + src/frontend/src/scheduler/plan_fragmenter.rs | 62 +++++++++++++++---- 4 files changed, 53 insertions(+), 12 deletions(-) diff --git a/src/frontend/src/handler/explain.rs b/src/frontend/src/handler/explain.rs index e5d20ce289724..5f55f9aac378c 100644 --- a/src/frontend/src/handler/explain.rs +++ b/src/frontend/src/handler/explain.rs @@ -220,6 +220,7 @@ async fn do_handle_explain( worker_node_manager_reader, session.env().catalog_reader().clone(), session.config().batch_parallelism().0, + session.config().timezone().to_owned(), plan.clone(), )?); batch_plan_fragmenter_fmt = if explain_format == ExplainFormat::Dot { diff --git a/src/frontend/src/handler/query.rs b/src/frontend/src/handler/query.rs index 4f4b1f2187aa0..ec5a3ee393d40 100644 --- a/src/frontend/src/handler/query.rs +++ b/src/frontend/src/handler/query.rs @@ -373,6 +373,7 @@ pub fn gen_batch_plan_fragmenter( worker_node_manager_reader, session.env().catalog_reader().clone(), session.config().batch_parallelism().0, + session.config().timezone().to_owned(), plan, )?; diff --git a/src/frontend/src/scheduler/distributed/query.rs b/src/frontend/src/scheduler/distributed/query.rs index a843aa1b59fe4..cf6319c894a0e 100644 --- a/src/frontend/src/scheduler/distributed/query.rs +++ b/src/frontend/src/scheduler/distributed/query.rs @@ -734,6 +734,7 @@ pub(crate) mod tests { worker_node_selector, catalog_reader, None, + "UTC".to_owned(), batch_exchange_node.clone(), ) .unwrap(); diff --git a/src/frontend/src/scheduler/plan_fragmenter.rs b/src/frontend/src/scheduler/plan_fragmenter.rs index 76f5428348003..3949f1f2d9f5d 100644 --- a/src/frontend/src/scheduler/plan_fragmenter.rs +++ b/src/frontend/src/scheduler/plan_fragmenter.rs @@ -20,6 +20,7 @@ use std::sync::Arc; use anyhow::anyhow; use async_recursion::async_recursion; +use chrono::{MappedLocalTime, TimeZone}; use enum_as_inner::EnumAsInner; use futures::TryStreamExt; use iceberg::expr::Predicate as IcebergPredicate; @@ -33,6 +34,7 @@ use risingwave_common::bitmap::{Bitmap, BitmapBuilder}; use risingwave_common::catalog::{Schema, TableDesc}; use risingwave_common::hash::table_distribution::TableDistribution; use risingwave_common::hash::{WorkerSlotId, WorkerSlotMapping}; +use risingwave_common::types::Timestamptz; use risingwave_common::util::scan_range::ScanRange; use risingwave_connector::source::filesystem::opendal_source::opendal_enumerator::OpendalEnumerator; use risingwave_connector::source::filesystem::opendal_source::{ @@ -145,6 +147,7 @@ pub struct BatchPlanFragmenter { catalog_reader: CatalogReader, batch_parallelism: usize, + timezone: String, stage_graph_builder: Option, stage_graph: Option, @@ -163,6 +166,7 @@ impl BatchPlanFragmenter { worker_node_manager: WorkerNodeSelector, catalog_reader: CatalogReader, batch_parallelism: Option, + timezone: String, batch_node: PlanRef, ) -> SchedulerResult { // if batch_parallelism is None, it means no limit, we will use the available nodes count as @@ -186,6 +190,7 @@ impl BatchPlanFragmenter { worker_node_manager, catalog_reader, batch_parallelism, + timezone, stage_graph_builder: Some(StageGraphBuilder::new(batch_parallelism)), stage_graph: None, }; @@ -311,7 +316,11 @@ impl SourceScanInfo { Self::Incomplete(fetch_info) } - pub async fn complete(self, 
batch_parallelism: usize) -> SchedulerResult { + pub async fn complete( + self, + batch_parallelism: usize, + timezone: String, + ) -> SchedulerResult { let fetch_info = match self { SourceScanInfo::Incomplete(fetch_info) => fetch_info, SourceScanInfo::Complete(_) => { @@ -382,15 +391,36 @@ impl SourceScanInfo { Some(AsOf::VersionString(_)) => { bail!("Unsupported version string in iceberg time travel") } - Some(AsOf::TimestampString(ts)) => Some( - speedate::DateTime::parse_str_rfc3339(&ts) - .map(|t| { - IcebergTimeTravelInfo::TimestampMs( - t.timestamp_tz() * 1000 + t.time.microsecond as i64 / 1000, - ) - }) - .map_err(|_e| anyhow!("fail to parse timestamp"))?, - ), + Some(AsOf::TimestampString(ts)) => { + let date_time = speedate::DateTime::parse_str_rfc3339(&ts) + .map_err(|_e| anyhow!("fail to parse timestamp"))?; + let timestamp = if date_time.time.tz_offset.is_none() { + // If the input does not specify a time zone, use the time zone set by the "SET TIME ZONE" command. + let tz = + Timestamptz::lookup_time_zone(&timezone).map_err(|e| anyhow!(e))?; + match tz.with_ymd_and_hms( + date_time.date.year.into(), + date_time.date.month.into(), + date_time.date.day.into(), + date_time.time.hour.into(), + date_time.time.minute.into(), + date_time.time.second.into(), + ) { + MappedLocalTime::Single(d) => Ok(d.timestamp()), + MappedLocalTime::Ambiguous(_, _) | MappedLocalTime::None => { + Err(anyhow!(format!( + "failed to parse the timestamp {ts} with the specified time zone {tz}" + ))) + } + }? + } else { + date_time.timestamp_tz() + }; + + Some(IcebergTimeTravelInfo::TimestampMs( + timestamp * 1000 + date_time.time.microsecond as i64 / 1000, + )) + } Some(AsOf::ProcessTime) | Some(AsOf::ProcessTimeWithInterval(_)) => { unreachable!() } @@ -731,6 +761,7 @@ impl StageGraph { self, catalog_reader: &CatalogReader, worker_node_manager: &WorkerNodeSelector, + timezone: String, ) -> SchedulerResult { let mut complete_stages = HashMap::new(); self.complete_stage( @@ -739,6 +770,7 @@ impl StageGraph { &mut complete_stages, catalog_reader, worker_node_manager, + timezone, ) .await?; Ok(StageGraph { @@ -758,6 +790,7 @@ impl StageGraph { complete_stages: &mut HashMap, catalog_reader: &CatalogReader, worker_node_manager: &WorkerNodeSelector, + timezone: String, ) -> SchedulerResult<()> { let parallelism = if stage.parallelism.is_some() { // If the stage has parallelism, it means it's a complete stage. @@ -772,7 +805,7 @@ impl StageGraph { .as_ref() .unwrap() .clone() - .complete(self.batch_parallelism) + .complete(self.batch_parallelism, timezone.to_owned()) .await?; // For batch reading file source, the number of files involved is typically large. 
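The hunk above converts an AS OF timestamp for Iceberg time travel: when the input string carries no UTC offset, it is interpreted in the session time zone before being turned into epoch milliseconds, and ambiguous or nonexistent local times (around DST transitions) are rejected. The standalone sketch below shows the same chrono pattern outside RisingWave. It is a minimal sketch under assumptions: it calls chrono and chrono-tz directly instead of the Timestamptz helper used in the patch, the function name and the "Asia/Shanghai" example are illustrative, and LocalResult is the enum that newer chrono releases also expose under the MappedLocalTime alias seen above.

    use chrono::{LocalResult, TimeZone};
    use chrono_tz::Tz;

    // Interpret a wall-clock time (no UTC offset) in a named IANA time zone and
    // return epoch milliseconds, refusing to guess across DST gaps or overlaps.
    fn wall_clock_to_epoch_ms(
        tz_name: &str,
        ymd: (i32, u32, u32),
        hms: (u32, u32, u32),
    ) -> Result<i64, String> {
        let tz: Tz = tz_name
            .parse()
            .map_err(|e| format!("invalid time zone {tz_name}: {e}"))?;
        match tz.with_ymd_and_hms(ymd.0, ymd.1, ymd.2, hms.0, hms.1, hms.2) {
            // Exactly one instant corresponds to this local time.
            LocalResult::Single(dt) => Ok(dt.timestamp_millis()),
            // The local time is skipped or repeated around a DST transition.
            LocalResult::Ambiguous(_, _) | LocalResult::None => Err(format!(
                "local time {ymd:?} {hms:?} is ambiguous or nonexistent in {tz_name}"
            )),
        }
    }

    fn main() {
        // 2025-01-09 17:58:55 read as Asia/Shanghai wall-clock time (UTC+8).
        println!(
            "{:?}",
            wall_clock_to_epoch_ms("Asia/Shanghai", (2025, 1, 9), (17, 58, 55))
        );
    }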
@@ -842,6 +875,7 @@ impl StageGraph { complete_stages, catalog_reader, worker_node_manager, + timezone.to_owned(), ) .await?; } @@ -935,7 +969,11 @@ impl BatchPlanFragmenter { pub async fn generate_complete_query(self) -> SchedulerResult { let stage_graph = self.stage_graph.unwrap(); let new_stage_graph = stage_graph - .complete(&self.catalog_reader, &self.worker_node_manager) + .complete( + &self.catalog_reader, + &self.worker_node_manager, + self.timezone.to_owned(), + ) .await?; Ok(Query { query_id: self.query_id, From 87eac4f4d92f04fed4d52ae3f13567a77a248455 Mon Sep 17 00:00:00 2001 From: Shanicky Chen Date: Thu, 9 Jan 2025 18:56:56 +0800 Subject: [PATCH 15/21] fix: wrong streaming_parallelism when default_parallelism is set to fixed (#20072) Signed-off-by: Shanicky Chen --- src/meta/src/controller/streaming_job.rs | 8 ++- src/tests/simulation/src/cluster.rs | 32 +++++++++++ .../integration_tests/default_parallelism.rs | 56 +++++++++++++++++++ .../tests/integration_tests/main.rs | 1 + 4 files changed, 94 insertions(+), 3 deletions(-) create mode 100644 src/tests/simulation/tests/integration_tests/default_parallelism.rs diff --git a/src/meta/src/controller/streaming_job.rs b/src/meta/src/controller/streaming_job.rs index f30e20b50f0b8..8676b2583227c 100644 --- a/src/meta/src/controller/streaming_job.rs +++ b/src/meta/src/controller/streaming_job.rs @@ -17,6 +17,7 @@ use std::num::NonZeroUsize; use anyhow::anyhow; use itertools::Itertools; +use risingwave_common::config::DefaultParallelism; use risingwave_common::hash::VnodeCountCompat; use risingwave_common::util::column_index_mapping::ColIndexMapping; use risingwave_common::util::stream_graph_visitor::visit_stream_node; @@ -119,9 +120,10 @@ impl CatalogController { let txn = inner.db.begin().await?; let create_type = streaming_job.create_type(); - let streaming_parallelism = match parallelism { - None => StreamingParallelism::Adaptive, - Some(n) => StreamingParallelism::Fixed(n.parallelism as _), + let streaming_parallelism = match (parallelism, self.env.opts.default_parallelism) { + (None, DefaultParallelism::Full) => StreamingParallelism::Adaptive, + (None, DefaultParallelism::Default(n)) => StreamingParallelism::Fixed(n.get()), + (Some(n), _) => StreamingParallelism::Fixed(n.parallelism as _), }; ensure_user_id(streaming_job.owner() as _, &txn).await?; diff --git a/src/tests/simulation/src/cluster.rs b/src/tests/simulation/src/cluster.rs index 21f113b7b9858..a3fd28bb54293 100644 --- a/src/tests/simulation/src/cluster.rs +++ b/src/tests/simulation/src/cluster.rs @@ -14,6 +14,7 @@ #![cfg_attr(not(madsim), allow(unused_imports))] +use std::cmp::max; use std::collections::HashMap; use std::future::Future; use std::io::Write; @@ -206,6 +207,37 @@ metrics_level = "Disabled" } } + pub fn for_default_parallelism(default_parallelism: usize) -> Self { + let config_path = { + let mut file = + tempfile::NamedTempFile::new().expect("failed to create temp config file"); + + let config_data = format!( + r#" +[server] +telemetry_enabled = false +metrics_level = "Disabled" +[meta] +default_parallelism = {default_parallelism} +"# + ) + .to_owned(); + file.write_all(config_data.as_bytes()) + .expect("failed to write config file"); + file.into_temp_path() + }; + + Configuration { + config_path: ConfigPath::Temp(config_path.into()), + frontend_nodes: 1, + compute_nodes: 1, + meta_nodes: 1, + compactor_nodes: 1, + compute_node_cores: default_parallelism * 2, + per_session_queries: vec![].into(), + } + } + /// Returns the config for backfill test. 
pub fn for_backfill() -> Self { // Embed the config file and create a temporary file at runtime. The file will be deleted diff --git a/src/tests/simulation/tests/integration_tests/default_parallelism.rs b/src/tests/simulation/tests/integration_tests/default_parallelism.rs new file mode 100644 index 0000000000000..2cb79d5c3a8cf --- /dev/null +++ b/src/tests/simulation/tests/integration_tests/default_parallelism.rs @@ -0,0 +1,56 @@ +// Copyright 2025 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::Result; +use risingwave_simulation::cluster::{Cluster, Configuration}; +use risingwave_simulation::utils::AssertResult; + +#[tokio::test] +async fn test_no_default_parallelism() -> Result<()> { + let mut cluster = Cluster::start(Configuration::default()).await?; + + let mut session = cluster.start_session(); + + session.run("create table t(v int);").await?; + session + .run("select parallelism from rw_streaming_parallelism where name = 't';") + .await? + .assert_result_eq("ADAPTIVE"); + + Ok(()) +} + +#[tokio::test] +async fn test_default_parallelism() -> Result<()> { + let mut cluster = Cluster::start(Configuration::for_default_parallelism(2)).await?; + + let mut session = cluster.start_session(); + + session.run("create table t(v int);").await?; + session + .run("select parallelism from rw_streaming_parallelism where name = 't';") + .await? + .assert_result_eq("FIXED(2)"); + + session + .run("alter table t set parallelism = adaptive;") + .await?; + + session + .run("select parallelism from rw_streaming_parallelism where name = 't';") + .await? 
+ .assert_result_eq("ADAPTIVE"); + + Ok(()) +} diff --git a/src/tests/simulation/tests/integration_tests/main.rs b/src/tests/simulation/tests/integration_tests/main.rs index 8244a0bb48f27..ad8e854a30e5f 100644 --- a/src/tests/simulation/tests/integration_tests/main.rs +++ b/src/tests/simulation/tests/integration_tests/main.rs @@ -29,4 +29,5 @@ mod storage; mod throttle; mod compaction; +mod default_parallelism; mod utils; From 8b5cabac7ba209efacfc05eb92086d83a45309c8 Mon Sep 17 00:00:00 2001 From: zwang28 <70626450+zwang28@users.noreply.github.com> Date: Thu, 9 Jan 2025 19:20:55 +0800 Subject: [PATCH 16/21] refactor: log deleted objects (#20086) --- src/meta/src/hummock/manager/gc.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/meta/src/hummock/manager/gc.rs b/src/meta/src/hummock/manager/gc.rs index 7bd7034ae1f99..eba07b7002244 100644 --- a/src/meta/src/hummock/manager/gc.rs +++ b/src/meta/src/hummock/manager/gc.rs @@ -535,7 +535,7 @@ impl HummockManager { break; } let delete_batch: HashSet<_> = objects_to_delete.drain(..batch_size).collect(); - tracing::debug!(?delete_batch, "Attempt to delete objects."); + tracing::info!(?delete_batch, "Attempt to delete objects."); let deleted_object_ids = delete_batch.clone(); self.gc_manager .delete_objects(delete_batch.into_iter()) From e6e9ace249c3eb9a78c5c900c47ea47dc0dd5c82 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 10 Jan 2025 03:05:55 +0000 Subject: [PATCH 17/21] chore(deps): Bump anstream from 0.6.4 to 0.6.18 in /integration_tests/feature-store/simulator (#20022) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: xxchan --- .../feature-store/simulator/Cargo.lock | 114 +++++++++++++++--- 1 file changed, 97 insertions(+), 17 deletions(-) diff --git a/integration_tests/feature-store/simulator/Cargo.lock b/integration_tests/feature-store/simulator/Cargo.lock index 79c7d3cad88b0..1049a14ba248f 100644 --- a/integration_tests/feature-store/simulator/Cargo.lock +++ b/integration_tests/feature-store/simulator/Cargo.lock @@ -19,15 +19,16 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "anstream" -version = "0.6.4" +version = "0.6.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ab91ebe16eb252986481c5b62f6098f3b698a45e34b5b98200cf20dd2484a44" +checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", + "is_terminal_polyfill", "utf8parse", ] @@ -52,17 +53,17 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b" dependencies = [ - "windows-sys", + "windows-sys 0.48.0", ] [[package]] name = "anstyle-wincon" -version = "3.0.1" +version = "3.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0699d10d2f4d628a98ee7b57b289abbc98ff3bad977cb3152709d4bf2330628" +checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125" dependencies = [ "anstyle", - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -508,6 +509,12 @@ dependencies = [ "hashbrown 0.14.3", ] +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + [[package]] name = "itertools" version = "0.11.0" @@ -564,7 +571,7 @@ checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" dependencies = [ "libc", "wasi", - "windows-sys", + "windows-sys 0.48.0", ] [[package]] @@ -797,7 +804,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4031e820eb552adee9295814c0ced9e5cf38ddf1e8b7d566d6de8e2538ea989e" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.48.0", ] [[package]] @@ -837,7 +844,7 @@ dependencies = [ "pin-project-lite", "socket2 0.5.4", "tokio-macros", - "windows-sys", + "windows-sys 0.48.0", ] [[package]] @@ -1037,7 +1044,16 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows-targets", + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", ] [[package]] @@ -1046,13 +1062,29 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", ] [[package]] @@ -1061,38 +1093,86 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + [[package]] name = "windows_aarch64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + [[package]] name = "windows_i686_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = 
"windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + [[package]] name = "windows_i686_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + [[package]] name = "windows_x86_64_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + [[package]] name = "windows_x86_64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" From 340a167d142899a9705725ac84782c5d276aabcc Mon Sep 17 00:00:00 2001 From: August Date: Fri, 10 Jan 2025 18:54:12 +0800 Subject: [PATCH 18/21] fix: synchronize the index on the mview when changing the owner or schema (#20093) --- e2e_test/ddl/alter_owner.slt | 18 +++++++ e2e_test/ddl/alter_set_schema.slt | 12 ++++- src/meta/src/controller/catalog/alter_op.rs | 58 +++++++++------------ 3 files changed, 53 insertions(+), 35 deletions(-) diff --git a/e2e_test/ddl/alter_owner.slt b/e2e_test/ddl/alter_owner.slt index 11df2419f538d..37b4a94073ccf 100644 --- a/e2e_test/ddl/alter_owner.slt +++ b/e2e_test/ddl/alter_owner.slt @@ -67,6 +67,9 @@ v user1 statement ok CREATE MATERIALIZED VIEW mv AS SELECT v1, (t.v2).v1 AS v21 FROM t; +statement ok +CREATE INDEX mv_idx ON mv(v1); + statement ok ALTER MATERIALIZED VIEW mv OWNER TO user1; @@ -85,6 +88,21 @@ WHERE ---- mv user1 +query TT +SELECT + pg_class.relname AS rel_name, + pg_roles.rolname AS owner +FROM + pg_class + JOIN pg_namespace ON pg_namespace.oid = pg_class.relnamespace + JOIN pg_roles ON pg_roles.oid = pg_class.relowner +WHERE + pg_namespace.nspname NOT LIKE 'pg_%' + AND pg_namespace.nspname != 'information_schema' + AND pg_class.relname = 'mv_idx'; +---- +mv_idx user1 + statement ok CREATE SOURCE src (v INT) WITH ( connector = 'datagen', diff --git a/e2e_test/ddl/alter_set_schema.slt b/e2e_test/ddl/alter_set_schema.slt index db0f479c85c05..3f32cd732e879 100644 --- a/e2e_test/ddl/alter_set_schema.slt +++ b/e2e_test/ddl/alter_set_schema.slt @@ -23,12 +23,21 @@ CREATE INDEX test_index1 ON test_table(u); statement ok CREATE INDEX test_index2 ON test_table(v); +statement ok +CREATE MATERIALIZED VIEW test_mv AS SELECT u FROM test_table; + 
+statement ok +CREATE INDEX test_mv_index ON test_mv(u); + statement ok ALTER TABLE test_table SET SCHEMA public; statement ok ALTER TABLE test_table SET SCHEMA test_schema; +statement ok +ALTER MATERIALIZED VIEW test_mv SET SCHEMA test_schema; + query TT SELECT tablename, schemaname FROM pg_tables WHERE schemaname = 'test_schema'; ---- @@ -39,6 +48,7 @@ SELECT indexname, schemaname FROM pg_indexes WHERE schemaname = 'test_schema'; ---- test_index1 test_schema test_index2 test_schema +test_mv_index test_schema statement ok CREATE SOURCE test_source (v INT) WITH ( @@ -104,7 +114,7 @@ statement ok DROP SOURCE test_schema.test_source; statement ok -DROP TABLE test_schema.test_table; +DROP TABLE test_schema.test_table cascade; statement ok DROP SCHEMA test_schema; diff --git a/src/meta/src/controller/catalog/alter_op.rs b/src/meta/src/controller/catalog/alter_op.rs index 6bdbee653bb69..43a9843ec05e1 100644 --- a/src/meta/src/controller/catalog/alter_op.rs +++ b/src/meta/src/controller/catalog/alter_op.rs @@ -331,20 +331,15 @@ impl CatalogController { } // indexes. - let (index_ids, mut table_ids): (Vec, Vec) = - if table.table_type == TableType::Table { - Index::find() - .select_only() - .columns([index::Column::IndexId, index::Column::IndexTableId]) - .filter(index::Column::PrimaryTableId.eq(object_id)) - .into_tuple::<(IndexId, TableId)>() - .all(&txn) - .await? - .into_iter() - .unzip() - } else { - (vec![], vec![]) - }; + let (index_ids, mut table_ids): (Vec, Vec) = Index::find() + .select_only() + .columns([index::Column::IndexId, index::Column::IndexTableId]) + .filter(index::Column::PrimaryTableId.eq(object_id)) + .into_tuple::<(IndexId, TableId)>() + .all(&txn) + .await? + .into_iter() + .unzip(); relations.push(PbRelationInfo::Table(ObjectModel(table, obj).into())); // internal tables. @@ -501,8 +496,7 @@ impl CatalogController { .await? .ok_or_else(|| MetaError::catalog_id_not_found("table", object_id))?; check_relation_name_duplicate(&table.name, database_id, new_schema, &txn).await?; - let (associated_src_id, table_type) = - (table.optional_associated_source_id, table.table_type); + let associated_src_id = table.optional_associated_source_id; let mut obj = obj.into_active_model(); obj.schema_id = Set(Some(new_schema)); @@ -531,24 +525,20 @@ impl CatalogController { let (index_ids, (index_names, mut table_ids)): ( Vec, (Vec, Vec), - ) = if table_type == TableType::Table { - Index::find() - .select_only() - .columns([ - index::Column::IndexId, - index::Column::Name, - index::Column::IndexTableId, - ]) - .filter(index::Column::PrimaryTableId.eq(object_id)) - .into_tuple::<(IndexId, String, TableId)>() - .all(&txn) - .await? - .into_iter() - .map(|(id, name, t_id)| (id, (name, t_id))) - .unzip() - } else { - (vec![], (vec![], vec![])) - }; + ) = Index::find() + .select_only() + .columns([ + index::Column::IndexId, + index::Column::Name, + index::Column::IndexTableId, + ]) + .filter(index::Column::PrimaryTableId.eq(object_id)) + .into_tuple::<(IndexId, String, TableId)>() + .all(&txn) + .await? + .into_iter() + .map(|(id, name, t_id)| (id, (name, t_id))) + .unzip(); // internal tables. 
let internal_tables: Vec = Table::find() From 4c384402b6c0ca3bd1804319d4b2baabc1df654e Mon Sep 17 00:00:00 2001 From: Li0k Date: Sat, 11 Jan 2025 03:38:19 +0800 Subject: [PATCH 19/21] =?UTF-8?q?fix(storage):=20Remove=20logic=20for=20de?= =?UTF-8?q?leting=20objects=20in=20truncate=5Ftime=5Ftrav=E2=80=A6=20(#201?= =?UTF-8?q?01)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/meta/src/hummock/manager/time_travel.rs | 22 ++------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/src/meta/src/hummock/manager/time_travel.rs b/src/meta/src/hummock/manager/time_travel.rs index 721d90e860310..e9886519c0485 100644 --- a/src/meta/src/hummock/manager/time_travel.rs +++ b/src/meta/src/hummock/manager/time_travel.rs @@ -108,18 +108,8 @@ impl HummockManager { txn.commit().await?; return Ok(()); }; - let ( - latest_valid_version_id, - latest_valid_version_sst_ids, - latest_valid_version_object_ids, - ) = { - ( - latest_valid_version.id, - latest_valid_version.get_sst_ids(), - latest_valid_version.get_object_ids(), - ) - }; - let mut object_ids_to_delete: HashSet<_> = HashSet::default(); + let (latest_valid_version_id, latest_valid_version_sst_ids) = + { (latest_valid_version.id, latest_valid_version.get_sst_ids()) }; let version_ids_to_delete: Vec = hummock_time_travel_version::Entity::find() .select_only() @@ -163,8 +153,6 @@ impl HummockManager { .filter(hummock_sstable_info::Column::SstId.is_in(sst_ids_to_delete)) .exec(&txn) .await?; - let new_object_ids = delta_to_delete.newly_added_object_ids(); - object_ids_to_delete.extend(&new_object_ids - &latest_valid_version_object_ids); tracing::debug!( delta_id = delta_to_delete.id.to_u64(), "delete {} rows from hummock_sstable_info", @@ -194,8 +182,6 @@ impl HummockManager { .filter(hummock_sstable_info::Column::SstId.is_in(sst_ids_to_delete)) .exec(&txn) .await?; - let new_object_ids = prev_version.get_object_ids(); - object_ids_to_delete.extend(&new_object_ids - &latest_valid_version_object_ids); tracing::debug!( prev_version_id, "delete {} rows from hummock_sstable_info", @@ -203,10 +189,6 @@ impl HummockManager { ); next_version_sst_ids = sst_ids; } - if !object_ids_to_delete.is_empty() { - self.gc_manager - .add_may_delete_object_ids(object_ids_to_delete.into_iter()); - } let res = hummock_time_travel_version::Entity::delete_many() .filter( From 80b3fd04a8b918e57b1be12cc34b98be20724484 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 11 Jan 2025 12:56:17 +0800 Subject: [PATCH 20/21] chore(deps): Bump the all group across 1 directory with 44 updates (#20037) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Eric Fu --- .../risingwave-sink-deltalake/pom.xml | 2 +- .../risingwave-sink-iceberg/pom.xml | 2 +- java/pom.xml | 26 +++++++++---------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/java/connector-node/risingwave-sink-deltalake/pom.xml b/java/connector-node/risingwave-sink-deltalake/pom.xml index c1918a4855c29..ea62a6a703a13 100644 --- a/java/connector-node/risingwave-sink-deltalake/pom.xml +++ b/java/connector-node/risingwave-sink-deltalake/pom.xml @@ -43,7 +43,7 @@ io.delta delta-standalone_2.12 - 3.2.1 + 3.3.0 org.apache.parquet diff --git a/java/connector-node/risingwave-sink-iceberg/pom.xml b/java/connector-node/risingwave-sink-iceberg/pom.xml index 49500676896b3..b4d2e87e404b1 100644 --- 
a/java/connector-node/risingwave-sink-iceberg/pom.xml +++ b/java/connector-node/risingwave-sink-iceberg/pom.xml @@ -16,7 +16,7 @@ risingwave-sink-iceberg - 1.7.0 + 1.7.1 11 11 true diff --git a/java/pom.xml b/java/pom.xml index c717cfc87a2e0..1a6060947f68c 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -65,16 +65,16 @@ UTF-8 3.25.5 - 1.68.1 - 2.24.2 + 1.69.0 + 2.24.3 2.0.16 2.6.2.Final - 2.18.1 + 2.18.2 3.4.1 1.20.4 - 2.29.20 - 12.0.15 - 1.14.4 + 2.29.45 + 12.0.16 + 1.15.0 @@ -107,7 +107,7 @@ org.apache.commons commons-text - 1.12.0 + 1.13.0 commons-io @@ -175,7 +175,7 @@ org.elasticsearch.client elasticsearch-rest-high-level-client - 7.17.25 + 7.17.26 org.opensearch.client @@ -219,7 +219,7 @@ org.assertj assertj-core - 3.26.3 + 3.27.2 test @@ -295,7 +295,7 @@ com.google.guava guava - 33.3.1-jre + 33.4.0-jre org.apache.httpcomponents @@ -310,7 +310,7 @@ org.xerial sqlite-jdbc - 3.47.0.0 + 3.47.2.0 software.amazon.awssdk @@ -398,7 +398,7 @@ com.puppycrawl.tools checkstyle - 10.20.1 + 10.21.1 @@ -491,7 +491,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.11.1 + 3.11.2 attach-javadocs From 326ec191614bfc34075604a9783de7a029cb36dc Mon Sep 17 00:00:00 2001 From: zwang28 <70626450+zwang28@users.noreply.github.com> Date: Sat, 11 Jan 2025 15:22:23 +0800 Subject: [PATCH 21/21] fix(meta): fix time travel GC bug (#20108) --- src/meta/src/hummock/manager/gc.rs | 35 ++++++--------------- src/meta/src/hummock/manager/time_travel.rs | 24 ++++++++++++-- src/meta/src/hummock/manager/versioning.rs | 26 +++++++++++++++ 3 files changed, 57 insertions(+), 28 deletions(-) diff --git a/src/meta/src/hummock/manager/gc.rs b/src/meta/src/hummock/manager/gc.rs index eba07b7002244..f423e1995b499 100644 --- a/src/meta/src/hummock/manager/gc.rs +++ b/src/meta/src/hummock/manager/gc.rs @@ -14,7 +14,6 @@ use std::cmp; use std::collections::HashSet; -use std::ops::Bound::{Excluded, Included}; use std::ops::DerefMut; use std::sync::atomic::{AtomicBool, Ordering}; use std::time::{Duration, SystemTime}; @@ -223,30 +222,8 @@ impl HummockManager { ) -> Result> { // This lock ensures `commit_epoch` and `report_compat_task` can see the latest GC history during sanity check. let versioning = self.versioning.read().await; - let tracked_object_ids: HashSet = { - let context_info = self.context_info.read().await; - // object ids in checkpoint version - let mut tracked_object_ids = versioning.checkpoint.version.get_object_ids(); - // add object ids added between checkpoint version and current version - for (_, delta) in versioning.hummock_version_deltas.range(( - Excluded(versioning.checkpoint.version.id), - Included(versioning.current_version.id), - )) { - tracked_object_ids.extend(delta.newly_added_object_ids()); - } - // add stale object ids before the checkpoint version - let min_pinned_version_id = context_info.min_pinned_version_id(); - tracked_object_ids.extend( - versioning - .checkpoint - .stale_objects - .iter() - .filter(|(version_id, _)| **version_id >= min_pinned_version_id) - .flat_map(|(_, objects)| objects.id.iter()) - .cloned(), - ); - tracked_object_ids - }; + let tracked_object_ids: HashSet = versioning + .get_tracked_object_ids(self.context_info.read().await.min_pinned_version_id()); let to_delete = object_ids.filter(|object_id| !tracked_object_ids.contains(object_id)); self.write_gc_history(to_delete.clone()).await?; Ok(to_delete.collect()) @@ -556,9 +533,15 @@ impl HummockManager { }; // Objects pinned by either meta backup or time travel should be filtered out. 
let backup_pinned: HashSet<_> = backup_manager.list_pinned_ssts(); + // The version_pinned is obtained after the candidate object_ids for deletion, which is new enough for filtering purpose. + let version_pinned = { + let versioning = self.versioning.read().await; + versioning + .get_tracked_object_ids(self.context_info.read().await.min_pinned_version_id()) + }; let object_ids = object_ids .into_iter() - .filter(|s| !backup_pinned.contains(s)); + .filter(|s| !version_pinned.contains(s) && !backup_pinned.contains(s)); let object_ids = self.filter_out_objects_by_time_travel(object_ids).await?; // Retry is not necessary. Full GC will handle these objects eventually. self.delete_objects(object_ids.into_iter().collect()) diff --git a/src/meta/src/hummock/manager/time_travel.rs b/src/meta/src/hummock/manager/time_travel.rs index e9886519c0485..a31fe4d82fafb 100644 --- a/src/meta/src/hummock/manager/time_travel.rs +++ b/src/meta/src/hummock/manager/time_travel.rs @@ -108,8 +108,18 @@ impl HummockManager { txn.commit().await?; return Ok(()); }; - let (latest_valid_version_id, latest_valid_version_sst_ids) = - { (latest_valid_version.id, latest_valid_version.get_sst_ids()) }; + let ( + latest_valid_version_id, + latest_valid_version_sst_ids, + latest_valid_version_object_ids, + ) = { + ( + latest_valid_version.id, + latest_valid_version.get_sst_ids(), + latest_valid_version.get_object_ids(), + ) + }; + let mut object_ids_to_delete: HashSet<_> = HashSet::default(); let version_ids_to_delete: Vec = hummock_time_travel_version::Entity::find() .select_only() @@ -153,6 +163,8 @@ impl HummockManager { .filter(hummock_sstable_info::Column::SstId.is_in(sst_ids_to_delete)) .exec(&txn) .await?; + let new_object_ids = delta_to_delete.newly_added_object_ids(); + object_ids_to_delete.extend(&new_object_ids - &latest_valid_version_object_ids); tracing::debug!( delta_id = delta_to_delete.id.to_u64(), "delete {} rows from hummock_sstable_info", @@ -182,6 +194,8 @@ impl HummockManager { .filter(hummock_sstable_info::Column::SstId.is_in(sst_ids_to_delete)) .exec(&txn) .await?; + let new_object_ids = prev_version.get_object_ids(); + object_ids_to_delete.extend(&new_object_ids - &latest_valid_version_object_ids); tracing::debug!( prev_version_id, "delete {} rows from hummock_sstable_info", @@ -189,6 +203,12 @@ impl HummockManager { ); next_version_sst_ids = sst_ids; } + if !object_ids_to_delete.is_empty() { + // IMPORTANT: object_ids_to_delete may include objects that are still being used by SSTs not included in time travel metadata. + // So it's crucial to filter out those objects before actually deleting them, i.e. when using `try_take_may_delete_object_ids`. + self.gc_manager + .add_may_delete_object_ids(object_ids_to_delete.into_iter()); + } let res = hummock_time_travel_version::Entity::delete_many() .filter( diff --git a/src/meta/src/hummock/manager/versioning.rs b/src/meta/src/hummock/manager/versioning.rs index bdd86a4f3b803..07d99992e10b1 100644 --- a/src/meta/src/hummock/manager/versioning.rs +++ b/src/meta/src/hummock/manager/versioning.rs @@ -13,6 +13,7 @@ // limitations under the License. 
use std::cmp; +use std::collections::Bound::{Excluded, Included}; use std::collections::{BTreeMap, HashMap, HashSet}; use itertools::Itertools; @@ -82,6 +83,31 @@ impl Versioning { pub(super) fn mark_next_time_travel_version_snapshot(&mut self) { self.time_travel_snapshot_interval_counter = u64::MAX; } + + pub fn get_tracked_object_ids( + &self, + min_pinned_version_id: HummockVersionId, + ) -> HashSet { + // object ids in checkpoint version + let mut tracked_object_ids = self.checkpoint.version.get_object_ids(); + // add object ids added between checkpoint version and current version + for (_, delta) in self.hummock_version_deltas.range(( + Excluded(self.checkpoint.version.id), + Included(self.current_version.id), + )) { + tracked_object_ids.extend(delta.newly_added_object_ids()); + } + // add stale object ids before the checkpoint version + tracked_object_ids.extend( + self.checkpoint + .stale_objects + .iter() + .filter(|(version_id, _)| **version_id >= min_pinned_version_id) + .flat_map(|(_, objects)| objects.id.iter()) + .cloned(), + ); + tracked_object_ids + } } impl HummockManager {
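The hummock GC changes in this series (removing object deletion from time-travel truncation and the final time-travel GC fix) converge on one rule: an object id may be deleted only after it has been filtered against everything that can still reference it, namely objects tracked by the checkpoint and newer version deltas, objects pinned by meta backups, and objects recorded in time-travel metadata. The sketch below is a minimal model of that set filtering with plain integer ids and std HashSets; the function name and signature are illustrative assumptions, not RisingWave APIs.

    use std::collections::HashSet;

    // An object id survives GC filtering only if no tracked version, backup,
    // or time-travel snapshot still references it.
    fn filter_deletable(
        candidates: impl IntoIterator<Item = u64>,
        version_tracked: &HashSet<u64>,
        backup_pinned: &HashSet<u64>,
        time_travel_pinned: &HashSet<u64>,
    ) -> Vec<u64> {
        candidates
            .into_iter()
            .filter(|id| {
                !version_tracked.contains(id)
                    && !backup_pinned.contains(id)
                    && !time_travel_pinned.contains(id)
            })
            .collect()
    }

    fn main() {
        let tracked: HashSet<u64> = HashSet::from([1, 2]);
        let backup: HashSet<u64> = HashSet::from([3]);
        let time_travel: HashSet<u64> = HashSet::from([4]);
        // Only object 5 is unreferenced everywhere, so only it may be deleted.
        assert_eq!(
            filter_deletable([1, 3, 4, 5], &tracked, &backup, &time_travel),
            vec![5]
        );
    }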