From ef4f61414db721df8f6121f5dc53993a7562b67a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 8 Jan 2025 03:08:17 +0000 Subject: [PATCH 01/21] chore(deps): Bump downcast-rs from 1.2.0 to 2.0.1 (#20063) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 16 +++++++++++----- src/expr/core/Cargo.toml | 2 +- src/frontend/Cargo.toml | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 74687e9ee8bdb..43645d8b8b745 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4195,9 +4195,15 @@ checksum = "1435fa1053d8b2fbbe9be7e97eca7f33d37b28409959813daefc1446a14247f1" [[package]] name = "downcast-rs" -version = "1.2.0" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75b325c5dbd37f80359721ad39aca5a29fb04c89279657cffdda8736d0c0b9d2" + +[[package]] +name = "downcast-rs" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ea835d29036a4087793836fa931b08837ad5e957da9e23886b29586fb9b6650" +checksum = "ea8a8b81cacc08888170eef4d13b775126db426d0b348bee9d18c2c1eaf123cf" [[package]] name = "duct" @@ -7146,7 +7152,7 @@ dependencies = [ "async-task", "bincode 1.3.3", "bytes", - "downcast-rs", + "downcast-rs 1.2.1", "futures-util", "lazy_static", "libc", @@ -11118,7 +11124,7 @@ dependencies = [ "await-tree", "chrono", "const-currying", - "downcast-rs", + "downcast-rs 2.0.1", "easy-ext", "educe", "either", @@ -11230,7 +11236,7 @@ dependencies = [ "bytes", "chrono", "clap", - "downcast-rs", + "downcast-rs 2.0.1", "dyn-clone", "easy-ext", "educe", diff --git a/src/expr/core/Cargo.toml b/src/expr/core/Cargo.toml index 16db4ea8f2691..f9be0b84c36d7 100644 --- a/src/expr/core/Cargo.toml +++ b/src/expr/core/Cargo.toml @@ -19,7 +19,7 @@ chrono = { version = "0.4", default-features = false, features = [ "std", ] } const-currying = "0.0.4" -downcast-rs = "1.2" +downcast-rs = "2.0" easy-ext = "1" educe = "0.6" either = "1" diff --git a/src/frontend/Cargo.toml b/src/frontend/Cargo.toml index 9dccfb56380df..50af8afef3240 100644 --- a/src/frontend/Cargo.toml +++ b/src/frontend/Cargo.toml @@ -21,7 +21,7 @@ bk-tree = "0.5.0" bytes = "1" chrono = { version = "0.4", default-features = false } clap = { workspace = true } -downcast-rs = "1.2" +downcast-rs = "2.0" dyn-clone = "1.0.14" easy-ext = "1" educe = "0.6" From f89c1b71cd0aa25a5cd378572c8777a17340f1cc Mon Sep 17 00:00:00 2001 From: xxchan Date: Wed, 8 Jan 2025 11:30:41 +0800 Subject: [PATCH 02/21] build(deps): unpatch prost (#20066) Signed-off-by: xxchan --- Cargo.lock | 128 +++++++++++++++++++++-------------------------------- Cargo.toml | 2 - 2 files changed, 50 insertions(+), 80 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 43645d8b8b745..e2547112a44e3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -562,8 +562,8 @@ dependencies = [ "bytes", "futures", "paste", - "prost 0.13.1", - "prost-types 0.13.1", + "prost 0.13.4", + "prost-types 0.13.4", "tokio", "tonic", ] @@ -2651,8 +2651,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "86ed14aa9c9f927213c6e4f3ef75faaad3406134efe84ba2cb7983431d5f0931" dependencies = [ "futures-core", - "prost 0.13.1", - "prost-types 0.13.1", + "prost 0.13.4", + "prost-types 0.13.4", "tonic", "tracing-core", ] @@ -2670,8 +2670,8 @@ dependencies = [ "hdrhistogram", "humantime", "hyper-util", - "prost 0.13.1", - "prost-types 0.13.1", + "prost 
0.13.4", + "prost-types 0.13.4", "serde", "serde_json", "thread_local", @@ -5536,8 +5536,8 @@ version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0edfdfb507593d47605b3bb2fb36628b391e3d397e520b85852dea2412c8e2d1" dependencies = [ - "prost 0.13.1", - "prost-types 0.13.1", + "prost 0.13.4", + "prost-types 0.13.4", "tonic", ] @@ -5564,7 +5564,7 @@ dependencies = [ "google-cloud-gax", "google-cloud-googleapis", "google-cloud-token", - "prost-types 0.13.1", + "prost-types 0.13.4", "thiserror 1.0.63", "tokio", "tokio-util", @@ -8307,7 +8307,7 @@ dependencies = [ "opentelemetry", "opentelemetry-proto", "opentelemetry_sdk", - "prost 0.13.1", + "prost 0.13.4", "thiserror 1.0.63", "tokio", "tonic", @@ -8321,7 +8321,7 @@ checksum = "30ee9f20bff9c984511a02f082dc8ede839e4a9bf15cc2487c8d6fea5ad850d9" dependencies = [ "opentelemetry", "opentelemetry_sdk", - "prost 0.13.1", + "prost 0.13.4", "tonic", ] @@ -8417,7 +8417,7 @@ dependencies = [ "datasize", "hex", "itertools 0.13.0", - "prost 0.13.1", + "prost 0.13.4", "rust-embed", "schnellru", "serde", @@ -8716,8 +8716,8 @@ checksum = "6eea3058763d6e656105d1403cb04e0a41b7bbac6362d413e7c33be0c32279c9" dependencies = [ "heck 0.5.0", "itertools 0.13.0", - "prost 0.13.1", - "prost-types 0.13.1", + "prost 0.13.4", + "prost-types 0.13.4", ] [[package]] @@ -9401,20 +9401,11 @@ dependencies = [ "prost-derive 0.12.6", ] -[[package]] -name = "prost" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e13db3d3fde688c61e2446b4d843bc27a7e8af269a69440c0308021dc92333cc" -dependencies = [ - "bytes", - "prost-derive 0.13.1", -] - [[package]] name = "prost" version = "0.13.4" -source = "git+https://github.com/xxchan/prost.git?rev=0eb1c7b09976cf6b5231e4b8d87bb5908ae6a163#0eb1c7b09976cf6b5231e4b8d87bb5908ae6a163" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c0fef6c4230e4ccf618a35c59d7ede15dea37de8427500f50aff708806e42ec" dependencies = [ "bytes", "prost-derive 0.13.4", @@ -9445,10 +9436,11 @@ dependencies = [ [[package]] name = "prost-build" version = "0.13.4" -source = "git+https://github.com/xxchan/prost.git?rev=0eb1c7b09976cf6b5231e4b8d87bb5908ae6a163#0eb1c7b09976cf6b5231e4b8d87bb5908ae6a163" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0f3e5beed80eb580c68e2c600937ac2c4eedabdfd5ef1e5b7ea4f3fba84497b" dependencies = [ "heck 0.5.0", - "itertools 0.13.0", + "itertools 0.10.5", "log", "multimap 0.10.0", "once_cell", @@ -9487,26 +9479,14 @@ dependencies = [ "syn 2.0.87", ] -[[package]] -name = "prost-derive" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18bec9b0adc4eba778b33684b7ba3e7137789434769ee3ce3930463ef904cfca" -dependencies = [ - "anyhow", - "itertools 0.13.0", - "proc-macro2", - "quote", - "syn 2.0.87", -] - [[package]] name = "prost-derive" version = "0.13.4" -source = "git+https://github.com/xxchan/prost.git?rev=0eb1c7b09976cf6b5231e4b8d87bb5908ae6a163#0eb1c7b09976cf6b5231e4b8d87bb5908ae6a163" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "157c5a9d7ea5c2ed2d9fb8f495b64759f7816c7eaea54ba3978f0d63000162e3" dependencies = [ "anyhow", - "itertools 0.13.0", + "itertools 0.10.5", "proc-macro2", "quote", "syn 2.0.87", @@ -9531,8 +9511,8 @@ dependencies = [ "logos", "miette", "once_cell", - "prost 0.13.1", - "prost-types 0.13.1", + "prost 0.13.4", + "prost-types 0.13.4", "serde", "serde-value", ] @@ -9546,19 +9526,11 @@ dependencies = [ 
"prost 0.11.9", ] -[[package]] -name = "prost-types" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cee5168b05f49d4b0ca581206eb14a7b22fafd963efe729ac48eb03266e25cc2" -dependencies = [ - "prost 0.13.1", -] - [[package]] name = "prost-types" version = "0.13.4" -source = "git+https://github.com/xxchan/prost.git?rev=0eb1c7b09976cf6b5231e4b8d87bb5908ae6a163#0eb1c7b09976cf6b5231e4b8d87bb5908ae6a163" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc2f1e56baa61e93533aebc21af4d2134b70f66275e0fcdf3cbe43d77ff7e8fc" dependencies = [ "prost 0.13.4", ] @@ -9577,9 +9549,9 @@ checksum = "873f359bdecdfe6e353752f97cb9ee69368df55b16363ed2216da85e03232a58" dependencies = [ "bytes", "miette", - "prost 0.13.1", + "prost 0.13.4", "prost-reflect", - "prost-types 0.13.1", + "prost-types 0.13.4", "protox-parse", "thiserror 1.0.63", ] @@ -9592,7 +9564,7 @@ checksum = "a3a462d115462c080ae000c29a47f0b3985737e5d3a995fcdbcaa5c782068dde" dependencies = [ "logos", "miette", - "prost-types 0.13.1", + "prost-types 0.13.4", "thiserror 1.0.63", ] @@ -10345,7 +10317,7 @@ dependencies = [ "bytes", "itertools 0.13.0", "parking_lot 0.12.1", - "prost 0.13.1", + "prost 0.13.4", "risingwave_common", "risingwave_hummock_sdk", "risingwave_meta_model", @@ -10586,7 +10558,7 @@ dependencies = [ "pretty_assertions", "procfs 0.17.0", "prometheus", - "prost 0.13.1", + "prost 0.13.4", "rand", "regex", "reqwest 0.12.4", @@ -10735,7 +10707,7 @@ dependencies = [ "bincode 1.3.3", "hex", "parking_lot 0.12.1", - "prost 0.13.1", + "prost 0.13.4", "risingwave_pb", "serde", "thiserror 1.0.63", @@ -10798,7 +10770,7 @@ dependencies = [ "jsonbb", "madsim-tokio", "madsim-tonic", - "prost 0.13.1", + "prost 0.13.4", "risingwave_common", "risingwave_common_heap_profiling", "risingwave_common_service", @@ -10832,7 +10804,7 @@ dependencies = [ "maplit", "pprof", "prometheus", - "prost 0.13.1", + "prost 0.13.4", "rand", "risingwave_batch", "risingwave_batch_executors", @@ -10938,9 +10910,9 @@ dependencies = [ "pretty_assertions", "proc-macro2", "prometheus", - "prost 0.13.1", + "prost 0.13.4", "prost-reflect", - "prost-types 0.13.1", + "prost-types 0.13.4", "pulsar", "quote", "rand", @@ -11010,10 +10982,10 @@ dependencies = [ "jsonschema-transpiler", "madsim-tokio", "num-bigint", - "prost 0.13.1", + "prost 0.13.4", "prost-build 0.13.4", "prost-reflect", - "prost-types 0.13.1", + "prost-types 0.13.4", "protox", "risingwave_common", "risingwave_pb", @@ -11042,7 +11014,7 @@ dependencies = [ "itertools 0.13.0", "madsim-tokio", "madsim-tonic", - "prost 0.13.1", + "prost 0.13.4", "regex", "risingwave_common", "risingwave_connector", @@ -11269,7 +11241,7 @@ dependencies = [ "pretty-xmlish", "pretty_assertions", "prometheus", - "prost 0.13.1", + "prost 0.13.4", "quick-xml 0.37.1", "rand", "risingwave_batch", @@ -11380,7 +11352,7 @@ dependencies = [ "madsim-tokio", "mockall", "parking_lot 0.12.1", - "prost 0.13.1", + "prost 0.13.4", "risingwave_common", "risingwave_hummock_sdk", "risingwave_pb", @@ -11399,7 +11371,7 @@ dependencies = [ "futures", "jni", "madsim-tokio", - "prost 0.13.1", + "prost 0.13.4", "risingwave_common", "risingwave_expr", "risingwave_hummock_sdk", @@ -11427,7 +11399,7 @@ dependencies = [ "jni", "madsim-tokio", "paste", - "prost 0.13.1", + "prost 0.13.4", "risingwave_common", "risingwave_pb", "thiserror 1.0.63", @@ -11497,7 +11469,7 @@ dependencies = [ "parking_lot 0.12.1", "prometheus", "prometheus-http-query", - "prost 0.13.1", + "prost 0.13.4", "rand", 
"risingwave_backup", "risingwave_common", @@ -11558,7 +11530,7 @@ dependencies = [ name = "risingwave_meta_model" version = "2.3.0-alpha" dependencies = [ - "prost 0.13.1", + "prost 0.13.4", "risingwave_common", "risingwave_pb", "sea-orm", @@ -11676,7 +11648,7 @@ dependencies = [ "madsim-tonic-build", "pbjson", "pbjson-build", - "prost 0.13.1", + "prost 0.13.4", "prost-build 0.13.4", "prost-helpers", "risingwave_error", @@ -11942,7 +11914,7 @@ dependencies = [ "parking_lot 0.12.1", "procfs 0.17.0", "prometheus", - "prost 0.13.1", + "prost 0.13.4", "rand", "risingwave_backup", "risingwave_common", @@ -12012,7 +11984,7 @@ dependencies = [ "prehash", "pretty_assertions", "prometheus", - "prost 0.13.1", + "prost 0.13.4", "rand", "risingwave_common", "risingwave_common_estimate_size", @@ -12050,7 +12022,7 @@ version = "2.3.0-alpha" dependencies = [ "jsonbb", "madsim-tokio", - "prost 0.13.1", + "prost 0.13.4", "reqwest 0.12.4", "risingwave_pb", "thiserror-ext", @@ -14607,7 +14579,7 @@ dependencies = [ "hyper-util", "percent-encoding", "pin-project", - "prost 0.13.1", + "prost 0.13.4", "rustls-pemfile 2.2.0", "socket2 0.5.6", "tokio", diff --git a/Cargo.toml b/Cargo.toml index 17a6d1ff82670..771601e38e8a0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -379,8 +379,6 @@ tokio-postgres = { git = "https://github.com/madsim-rs/rust-postgres.git", rev = sqlx = { git = "https://github.com/madsim-rs/sqlx.git", rev = "3efe6d0065963db2a2b7f30dee69f647e28dec81" } # patch to remove preserve_order from serde_json bson = { git = "https://github.com/risingwavelabs/bson-rust", rev = "e5175ec" } -# TODO: unpatch after PR merged https://github.com/tokio-rs/prost/pull/1210 -prost-build = { git = "https://github.com/xxchan/prost.git", rev = "0eb1c7b09976cf6b5231e4b8d87bb5908ae6a163" } [workspace.metadata.dylint] libraries = [{ path = "./lints" }] From 212addb20c86255293aa81f3e32d41db84b365cc Mon Sep 17 00:00:00 2001 From: Bugen Zhao Date: Wed, 8 Jan 2025 12:54:50 +0800 Subject: [PATCH 03/21] fix(ci): increase timeout for e2e source test in main-cron (#20069) --- ci/workflows/main-cron.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/workflows/main-cron.yml b/ci/workflows/main-cron.yml index b12d473c679de..2261f6dd49548 100644 --- a/ci/workflows/main-cron.yml +++ b/ci/workflows/main-cron.yml @@ -190,7 +190,7 @@ steps: <<: *docker-compose-common run: source-test-env - ./ci/plugins/upload-failure-logs - timeout_in_minutes: 15 + timeout_in_minutes: 20 retry: *auto-retry - label: "end-to-end sink test ({{matrix.backend}} backend)" From f7e5a667ff3d2fcbe04e8631357509663d9625a3 Mon Sep 17 00:00:00 2001 From: Takahiro Ebato Date: Wed, 8 Jan 2025 14:09:37 +0900 Subject: [PATCH 04/21] fix(sqlparser): make LIMIT and OFFSET order flexible (#20026) --- src/sqlparser/src/parser.rs | 20 ++++++++++---------- src/sqlparser/tests/testdata/select.yaml | 22 ++++++++++++++++++++++ 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/src/sqlparser/src/parser.rs b/src/sqlparser/src/parser.rs index e81c06371d617..9eb3d9e439967 100644 --- a/src/sqlparser/src/parser.rs +++ b/src/sqlparser/src/parser.rs @@ -4248,17 +4248,17 @@ impl Parser<'_> { vec![] }; - let limit = if self.parse_keyword(Keyword::LIMIT) { - self.parse_limit()? - } else { - None - }; + let mut limit = None; + let mut offset = None; + for _x in 0..2 { + if limit.is_none() && self.parse_keyword(Keyword::LIMIT) { + limit = self.parse_limit()? + } - let offset = if self.parse_keyword(Keyword::OFFSET) { - Some(self.parse_offset()?) 
- } else { - None - }; + if offset.is_none() && self.parse_keyword(Keyword::OFFSET) { + offset = Some(self.parse_offset()?) + } + } let fetch = if self.parse_keyword(Keyword::FETCH) { if limit.is_some() { diff --git a/src/sqlparser/tests/testdata/select.yaml b/src/sqlparser/tests/testdata/select.yaml index 72de88bd1dfb4..78f7c6a001dbd 100644 --- a/src/sqlparser/tests/testdata/select.yaml +++ b/src/sqlparser/tests/testdata/select.yaml @@ -237,3 +237,25 @@ SELECT a0 + b0 FROM a_log, "B_log" formatted_sql: WITH a_log AS changelog from a, "B_log" AS changelog from public."B" SELECT a0 + b0 FROM a_log, "B_log" formatted_ast: 'Query(Query { with: Some(With { recursive: false, cte_tables: [Cte { alias: TableAlias { name: Ident { value: "a_log", quote_style: None }, columns: [] }, cte_inner: ChangeLog(ObjectName([Ident { value: "a", quote_style: None }])) }, Cte { alias: TableAlias { name: Ident { value: "B_log", quote_style: Some(''"'') }, columns: [] }, cte_inner: ChangeLog(ObjectName([Ident { value: "public", quote_style: None }, Ident { value: "B", quote_style: Some(''"'') }])) }] }), body: Select(Select { distinct: All, projection: [UnnamedExpr(BinaryOp { left: Identifier(Ident { value: "a0", quote_style: None }), op: Plus, right: Identifier(Ident { value: "b0", quote_style: None }) })], from: [TableWithJoins { relation: Table { name: ObjectName([Ident { value: "a_log", quote_style: None }]), alias: None, as_of: None }, joins: [] }, TableWithJoins { relation: Table { name: ObjectName([Ident { value: "B_log", quote_style: Some(''"'') }]), alias: None, as_of: None }, joins: [] }], lateral_views: [], selection: None, group_by: [], having: None }), order_by: [], limit: None, offset: None, fetch: None })' +- input: SELECT * FROM t LIMIT 1 + formatted_sql: SELECT * FROM t LIMIT 1 + formatted_ast: 'Query(Query { with: None, body: Select(Select { distinct: All, projection: [Wildcard(None)], from: [TableWithJoins { relation: Table { name: ObjectName([Ident { value: "t", quote_style: None }]), alias: None, as_of: None }, joins: [] }], lateral_views: [], selection: None, group_by: [], having: None }), order_by: [], limit: Some(Value(Number("1"))), offset: None, fetch: None })' +- input: SELECT * FROM t OFFSET 1 + formatted_sql: SELECT * FROM t OFFSET 1 + formatted_ast: 'Query(Query { with: None, body: Select(Select { distinct: All, projection: [Wildcard(None)], from: [TableWithJoins { relation: Table { name: ObjectName([Ident { value: "t", quote_style: None }]), alias: None, as_of: None }, joins: [] }], lateral_views: [], selection: None, group_by: [], having: None }), order_by: [], limit: None, offset: Some("1"), fetch: None })' +- input: SELECT * FROM t LIMIT 1 OFFSET 2 + formatted_sql: SELECT * FROM t LIMIT 1 OFFSET 2 + formatted_ast: 'Query(Query { with: None, body: Select(Select { distinct: All, projection: [Wildcard(None)], from: [TableWithJoins { relation: Table { name: ObjectName([Ident { value: "t", quote_style: None }]), alias: None, as_of: None }, joins: [] }], lateral_views: [], selection: None, group_by: [], having: None }), order_by: [], limit: Some(Value(Number("1"))), offset: Some("2"), fetch: None })' +- input: SELECT * FROM t OFFSET 2 LIMIT 1 + formatted_sql: SELECT * FROM t LIMIT 1 OFFSET 2 + formatted_ast: 'Query(Query { with: None, body: Select(Select { distinct: All, projection: [Wildcard(None)], from: [TableWithJoins { relation: Table { name: ObjectName([Ident { value: "t", quote_style: None }]), alias: None, as_of: None }, joins: [] }], lateral_views: [], selection: None, 
group_by: [], having: None }), order_by: [], limit: Some(Value(Number("1"))), offset: Some("2"), fetch: None })' +- input: SELECT * FROM t LIMIT 1 LIMIT 2 + error_msg: |- + sql parser error: expected end of statement, found: LIMIT + LINE 1: SELECT * FROM t LIMIT 1 LIMIT 2 + ^ +- input: SELECT * FROM t OFFSET 1 OFFSET 2 + error_msg: |- + sql parser error: expected end of statement, found: OFFSET + LINE 1: SELECT * FROM t OFFSET 1 OFFSET 2 + ^ From 1ac43758611096ad4c953bc48b7e601e50b9877f Mon Sep 17 00:00:00 2001 From: Noel Kwan <47273164+kwannoel@users.noreply.github.com> Date: Wed, 8 Jan 2025 13:35:27 +0800 Subject: [PATCH 05/21] bench(iceberg): add iceberg predicate pushdown bench (#20047) --- ci/scripts/e2e-iceberg-sink-v2-test.sh | 3 + .../iceberg/benches/predicate_pushdown.slt | 65 +++++++++++++++++++ .../iceberg/benches/predicate_pushdown.toml | 11 ++++ .../predicate_pushdown/filter.slt.part | 4 ++ .../predicate_pushdown/point_get.slt.part | 4 ++ 5 files changed, 87 insertions(+) create mode 100644 e2e_test/iceberg/benches/predicate_pushdown.slt create mode 100644 e2e_test/iceberg/benches/predicate_pushdown.toml create mode 100644 e2e_test/iceberg/benches/predicate_pushdown/filter.slt.part create mode 100644 e2e_test/iceberg/benches/predicate_pushdown/point_get.slt.part diff --git a/ci/scripts/e2e-iceberg-sink-v2-test.sh b/ci/scripts/e2e-iceberg-sink-v2-test.sh index d1ebbce419055..4825e4ab11b1a 100755 --- a/ci/scripts/e2e-iceberg-sink-v2-test.sh +++ b/ci/scripts/e2e-iceberg-sink-v2-test.sh @@ -35,6 +35,7 @@ echo "--- preparing iceberg" cd e2e_test/iceberg bash ./start_spark_connect_server.sh +echo "--- Running tests" # Don't remove the `--quiet` option since poetry has a bug when printing output, see # https://github.com/python-poetry/poetry/issues/3412 poetry update --quiet @@ -52,6 +53,8 @@ poetry run python main.py -t ./test_case/iceberg_source_all_delete.toml poetry run python main.py -t ./test_case/iceberg_source_explain_for_delete.toml poetry run python main.py -t ./test_case/iceberg_predicate_pushdown.toml +echo "--- Running benchmarks" +poetry run python main.py -t ./benches/predicate_pushdown.toml echo "--- Kill cluster" cd ../../ diff --git a/e2e_test/iceberg/benches/predicate_pushdown.slt b/e2e_test/iceberg/benches/predicate_pushdown.slt new file mode 100644 index 0000000000000..355b0858db9c1 --- /dev/null +++ b/e2e_test/iceberg/benches/predicate_pushdown.slt @@ -0,0 +1,65 @@ +statement ok +CREATE TABLE t1 (i1 int, i2 varchar, i3 varchar); + +statement ok +INSERT INTO t1 select key, 'some long string of text', 'another long string of text' from generate_series(1, 1000000) as key; + +statement ok +CREATE SINK sink1 AS select * from t1 WITH ( + connector = 'iceberg', + type = 'append-only', + force_append_only = 'true', + database.name = 'demo_db', + table.name = 't1', + catalog.name = 'demo', + catalog.type = 'storage', + warehouse.path = 's3a://icebergdata/demo', + s3.endpoint = 'http://127.0.0.1:9301', + s3.region = 'us-east-1', + s3.access.key = 'hummockadmin', + s3.secret.key = 'hummockadmin', + commit_checkpoint_interval = 1, + create_table_if_not_exists = 'true' +); + +statement ok +CREATE SOURCE iceberg_t1_source +WITH ( + connector = 'iceberg', + s3.endpoint = 'http://127.0.0.1:9301', + s3.region = 'us-east-1', + s3.access.key = 'hummockadmin', + s3.secret.key = 'hummockadmin', + s3.path.style.access = 'true', + catalog.type = 'storage', + warehouse.path = 's3a://icebergdata/demo', + database.name = 'demo_db', + table.name = 't1', +); + +statement ok +flush; + 
+query I +select count(*) from iceberg_t1_source; +---- +1000000 + +# warmup +include ./predicate_pushdown/point_get.slt.part +# bench +include ./predicate_pushdown/point_get.slt.part + +# warmup +include ./predicate_pushdown/filter.slt.part +# bench +include ./predicate_pushdown/filter.slt.part + +statement ok +DROP SINK sink1; + +statement ok +DROP SOURCE iceberg_t1_source; + +statement ok +DROP TABLE t1 cascade; \ No newline at end of file diff --git a/e2e_test/iceberg/benches/predicate_pushdown.toml b/e2e_test/iceberg/benches/predicate_pushdown.toml new file mode 100644 index 0000000000000..018bfd0cdb6d7 --- /dev/null +++ b/e2e_test/iceberg/benches/predicate_pushdown.toml @@ -0,0 +1,11 @@ +init_sqls = [ + 'CREATE SCHEMA IF NOT EXISTS demo_db', + 'DROP TABLE IF EXISTS demo_db.t1', +] + +slt = 'benches/predicate_pushdown.slt' + +drop_sqls = [ + 'DROP TABLE IF EXISTS demo_db.t1', + 'DROP SCHEMA IF EXISTS demo_db', +] diff --git a/e2e_test/iceberg/benches/predicate_pushdown/filter.slt.part b/e2e_test/iceberg/benches/predicate_pushdown/filter.slt.part new file mode 100644 index 0000000000000..897bd4966182d --- /dev/null +++ b/e2e_test/iceberg/benches/predicate_pushdown/filter.slt.part @@ -0,0 +1,4 @@ +query I +select count(*) from iceberg_t1_source where i1 > 1001 and i1 < 1110; +---- +108 \ No newline at end of file diff --git a/e2e_test/iceberg/benches/predicate_pushdown/point_get.slt.part b/e2e_test/iceberg/benches/predicate_pushdown/point_get.slt.part new file mode 100644 index 0000000000000..6051155902a8b --- /dev/null +++ b/e2e_test/iceberg/benches/predicate_pushdown/point_get.slt.part @@ -0,0 +1,4 @@ +query I +select * from iceberg_t1_source where i1 = 100000; +---- + 100000 some long string of text another long string of text \ No newline at end of file From 52532c7ec6dabad2105146f873c2f95c9c8ea704 Mon Sep 17 00:00:00 2001 From: William Wen <44139337+wenym1@users.noreply.github.com> Date: Wed, 8 Jan 2025 16:01:53 +0800 Subject: [PATCH 06/21] refactor(storage): remove StateStoreWrite and extract separate StateStoreReadLog trait (#20051) --- .../src/hummock/iterator/change_log.rs | 6 +- .../src/hummock/store/hummock_storage.rs | 7 +- .../hummock/store/local_hummock_storage.rs | 16 - src/storage/src/hummock/utils.rs | 91 ++++- src/storage/src/mem_table.rs | 361 +----------------- src/storage/src/memory.rs | 284 +++++++++++++- src/storage/src/monitor/monitored_store.rs | 5 +- src/storage/src/monitor/traced_store.rs | 5 +- src/storage/src/panic_store.rs | 18 +- src/storage/src/store.rs | 75 ++-- src/storage/src/store_impl.rs | 64 ++-- .../log_store_impl/kv_log_store/reader.rs | 66 ++-- 12 files changed, 483 insertions(+), 515 deletions(-) diff --git a/src/storage/src/hummock/iterator/change_log.rs b/src/storage/src/hummock/iterator/change_log.rs index a10981c35fe16..0fd227020a527 100644 --- a/src/storage/src/hummock/iterator/change_log.rs +++ b/src/storage/src/hummock/iterator/change_log.rs @@ -545,10 +545,10 @@ mod tests { use crate::mem_table::{KeyOp, MemTable, MemTableHummockIterator, MemTableStore}; use crate::memory::MemoryStateStore; use crate::store::{ - ChangeLogValue, NewLocalOptions, OpConsistencyLevel, ReadLogOptions, StateStoreIter, - StateStoreRead, CHECK_BYTES_EQUAL, + ChangeLogValue, NewLocalOptions, OpConsistencyLevel, ReadLogOptions, StateStoreReadLog, + CHECK_BYTES_EQUAL, }; - use crate::StateStore; + use crate::{StateStore, StateStoreIter}; #[tokio::test] async fn test_empty() { diff --git a/src/storage/src/hummock/store/hummock_storage.rs 
b/src/storage/src/hummock/store/hummock_storage.rs index 0caae17c6c204..07aff03b4e804 100644 --- a/src/storage/src/hummock/store/hummock_storage.rs +++ b/src/storage/src/hummock/store/hummock_storage.rs @@ -79,7 +79,7 @@ impl Drop for HummockStorageShutdownGuard { } /// `HummockStorage` is the entry point of the Hummock state store backend. -/// It implements the `StateStore` and `StateStoreRead` traits but not the `StateStoreWrite` trait +/// It implements the `StateStore` and `StateStoreRead` traits but without any write method /// since all writes should be done via `LocalHummockStorage` to ensure the single writer property /// of hummock. `LocalHummockStorage` instance can be created via `new_local` call. /// Hummock is the state store backend. @@ -589,7 +589,6 @@ impl HummockStorage { } impl StateStoreRead for HummockStorage { - type ChangeLogIter = ChangeLogIterator; type Iter = HummockStorageIterator; type RevIter = HummockStorageRevIterator; @@ -635,6 +634,10 @@ impl StateStoreRead for HummockStorage { ); self.rev_iter_inner(key_range, epoch, read_options) } +} + +impl StateStoreReadLog for HummockStorage { + type ChangeLogIter = ChangeLogIterator; async fn iter_log( &self, diff --git a/src/storage/src/hummock/store/local_hummock_storage.rs b/src/storage/src/hummock/store/local_hummock_storage.rs index 989eb878b0798..69952838e9829 100644 --- a/src/storage/src/hummock/store/local_hummock_storage.rs +++ b/src/storage/src/hummock/store/local_hummock_storage.rs @@ -32,7 +32,6 @@ use super::version::{StagingData, VersionUpdate}; use crate::error::StorageResult; use crate::hummock::event_handler::hummock_event_handler::HummockEventSender; use crate::hummock::event_handler::{HummockEvent, HummockReadVersionRef, LocalInstanceGuard}; -use crate::hummock::iterator::change_log::ChangeLogIterator; use crate::hummock::iterator::{ Backward, BackwardUserIterator, ConcatIteratorInner, Forward, HummockIteratorUnion, IteratorFactory, MergeIterator, UserIterator, @@ -244,7 +243,6 @@ impl LocalHummockStorage { } impl StateStoreRead for LocalHummockStorage { - type ChangeLogIter = ChangeLogIterator; type Iter = HummockStorageIterator; type RevIter = HummockStorageRevIterator; @@ -279,20 +277,6 @@ impl StateStoreRead for LocalHummockStorage { self.rev_iter_flushed(key_range, epoch, read_options) .instrument(tracing::trace_span!("hummock_rev_iter")) } - - async fn iter_log( - &self, - epoch_range: (u64, u64), - key_range: TableKeyRange, - options: ReadLogOptions, - ) -> StorageResult { - let version = self.read_version.read().committed().clone(); - let iter = self - .hummock_version_reader - .iter_log(version, epoch_range, key_range, options) - .await?; - Ok(iter) - } } impl LocalStateStore for LocalHummockStorage { diff --git a/src/storage/src/hummock/utils.rs b/src/storage/src/hummock/utils.rs index 6f42260bcc9c1..7f30f63d83e19 100644 --- a/src/storage/src/hummock/utils.rs +++ b/src/storage/src/hummock/utils.rs @@ -23,9 +23,11 @@ use std::time::{Duration, Instant}; use bytes::Bytes; use foyer::CacheHint; +use futures::{pin_mut, Stream, StreamExt}; use parking_lot::Mutex; use risingwave_common::catalog::{TableId, TableOption}; use risingwave_common::config::StorageMemoryConfig; +use risingwave_expr::codegen::try_stream; use risingwave_hummock_sdk::can_concat; use risingwave_hummock_sdk::compaction_group::StateTableId; use risingwave_hummock_sdk::key::{ @@ -35,12 +37,12 @@ use risingwave_hummock_sdk::sstable_info::SstableInfo; use tokio::sync::oneshot::{channel, Receiver, Sender}; use 
super::{HummockError, HummockResult, SstableStoreRef}; -use crate::error::StorageResult; +use crate::error::{StorageError, StorageResult}; use crate::hummock::local_version::pinned_version::PinnedVersion; use crate::hummock::CachePolicy; use crate::mem_table::{KeyOp, MemTableError}; use crate::monitor::MemoryCollector; -use crate::store::{OpConsistencyLevel, ReadOptions, StateStoreRead}; +use crate::store::{OpConsistencyLevel, ReadOptions, StateStoreKeyedRow, StateStoreRead}; pub fn range_overlap( search_key_range: &R, @@ -716,6 +718,91 @@ impl MemoryCollector for HummockMemoryCollector { } } +#[try_stream(ok = StateStoreKeyedRow, error = StorageError)] +pub(crate) async fn merge_stream<'a>( + mem_table_iter: impl Iterator, &'a KeyOp)> + 'a, + inner_stream: impl Stream> + 'static, + table_id: TableId, + epoch: u64, + rev: bool, +) { + let inner_stream = inner_stream.peekable(); + pin_mut!(inner_stream); + + let mut mem_table_iter = mem_table_iter.fuse().peekable(); + + loop { + match (inner_stream.as_mut().peek().await, mem_table_iter.peek()) { + (None, None) => break, + // The mem table side has come to an end, return data from the shared storage. + (Some(_), None) => { + let (key, value) = inner_stream.next().await.unwrap()?; + yield (key, value) + } + // The stream side has come to an end, return data from the mem table. + (None, Some(_)) => { + let (key, key_op) = mem_table_iter.next().unwrap(); + match key_op { + KeyOp::Insert(value) | KeyOp::Update((_, value)) => { + yield (FullKey::new(table_id, key.clone(), epoch), value.clone()) + } + _ => {} + } + } + (Some(Ok((inner_key, _))), Some((mem_table_key, _))) => { + debug_assert_eq!(inner_key.user_key.table_id, table_id); + let mut ret = inner_key.user_key.table_key.cmp(mem_table_key); + if rev { + ret = ret.reverse(); + } + match ret { + Ordering::Less => { + // yield data from storage + let (key, value) = inner_stream.next().await.unwrap()?; + yield (key, value); + } + Ordering::Equal => { + // both memtable and storage contain the key, so we advance both + // iterators and return the data in memory. + + let (_, key_op) = mem_table_iter.next().unwrap(); + let (key, old_value_in_inner) = inner_stream.next().await.unwrap()?; + match key_op { + KeyOp::Insert(value) => { + yield (key.clone(), value.clone()); + } + KeyOp::Delete(_) => {} + KeyOp::Update((old_value, new_value)) => { + debug_assert!(old_value == &old_value_in_inner); + + yield (key, new_value.clone()); + } + } + } + Ordering::Greater => { + // yield data from mem table + let (key, key_op) = mem_table_iter.next().unwrap(); + + match key_op { + KeyOp::Insert(value) => { + yield (FullKey::new(table_id, key.clone(), epoch), value.clone()); + } + KeyOp::Delete(_) => {} + KeyOp::Update(_) => unreachable!( + "memtable update should always be paired with a storage key" + ), + } + } + } + } + (Some(Err(_)), Some(_)) => { + // Throw the error. + return Err(inner_stream.next().await.unwrap().unwrap_err()); + } + } + } +} + #[cfg(test)] mod tests { use std::future::{poll_fn, Future}; diff --git a/src/storage/src/mem_table.rs b/src/storage/src/mem_table.rs index 2694f1433c0fa..61807517a20a4 100644 --- a/src/storage/src/mem_table.rs +++ b/src/storage/src/mem_table.rs @@ -12,37 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::cmp::Ordering; use std::collections::btree_map::Entry; use std::collections::BTreeMap; -use std::future::Future; -use std::ops::Bound::{Excluded, Included, Unbounded}; +use std::ops::Bound::{Included, Unbounded}; use std::ops::RangeBounds; -use std::sync::Arc; use bytes::Bytes; -use futures::{pin_mut, Stream, StreamExt}; -use futures_async_stream::try_stream; -use itertools::Itertools; -use risingwave_common::bitmap::Bitmap; -use risingwave_common::catalog::{TableId, TableOption}; -use risingwave_common::hash::{VirtualNode, VnodeBitmapExt}; use risingwave_common_estimate_size::{EstimateSize, KvSize}; -use risingwave_hummock_sdk::key::{prefixed_range_with_vnode, FullKey, TableKey, TableKeyRange}; -use risingwave_hummock_sdk::table_watermark::WatermarkDirection; +use risingwave_hummock_sdk::key::TableKey; use thiserror::Error; use thiserror_ext::AsReport; use tracing::error; -use crate::error::{StorageError, StorageResult}; use crate::hummock::iterator::{Backward, Forward, FromRustIterator, RustIteratorBuilder}; use crate::hummock::shared_buffer::shared_buffer_batch::{SharedBufferBatch, SharedBufferBatchId}; -use crate::hummock::utils::{ - do_delete_sanity_check, do_insert_sanity_check, do_update_sanity_check, sanity_check_enabled, -}; +use crate::hummock::utils::sanity_check_enabled; use crate::hummock::value::HummockValue; use crate::row_serde::value_serde::ValueRowSerde; -use crate::storage_value::StorageValue; use crate::store::*; pub type ImmutableMemtable = SharedBufferBatch; @@ -427,347 +413,6 @@ impl KeyOp { } } -#[try_stream(ok = StateStoreKeyedRow, error = StorageError)] -pub(crate) async fn merge_stream<'a>( - mem_table_iter: impl Iterator, &'a KeyOp)> + 'a, - inner_stream: impl Stream> + 'static, - table_id: TableId, - epoch: u64, - rev: bool, -) { - let inner_stream = inner_stream.peekable(); - pin_mut!(inner_stream); - - let mut mem_table_iter = mem_table_iter.fuse().peekable(); - - loop { - match (inner_stream.as_mut().peek().await, mem_table_iter.peek()) { - (None, None) => break, - // The mem table side has come to an end, return data from the shared storage. - (Some(_), None) => { - let (key, value) = inner_stream.next().await.unwrap()?; - yield (key, value) - } - // The stream side has come to an end, return data from the mem table. - (None, Some(_)) => { - let (key, key_op) = mem_table_iter.next().unwrap(); - match key_op { - KeyOp::Insert(value) | KeyOp::Update((_, value)) => { - yield (FullKey::new(table_id, key.clone(), epoch), value.clone()) - } - _ => {} - } - } - (Some(Ok((inner_key, _))), Some((mem_table_key, _))) => { - debug_assert_eq!(inner_key.user_key.table_id, table_id); - let mut ret = inner_key.user_key.table_key.cmp(mem_table_key); - if rev { - ret = ret.reverse(); - } - match ret { - Ordering::Less => { - // yield data from storage - let (key, value) = inner_stream.next().await.unwrap()?; - yield (key, value); - } - Ordering::Equal => { - // both memtable and storage contain the key, so we advance both - // iterators and return the data in memory. 
- - let (_, key_op) = mem_table_iter.next().unwrap(); - let (key, old_value_in_inner) = inner_stream.next().await.unwrap()?; - match key_op { - KeyOp::Insert(value) => { - yield (key.clone(), value.clone()); - } - KeyOp::Delete(_) => {} - KeyOp::Update((old_value, new_value)) => { - debug_assert!(old_value == &old_value_in_inner); - - yield (key, new_value.clone()); - } - } - } - Ordering::Greater => { - // yield data from mem table - let (key, key_op) = mem_table_iter.next().unwrap(); - - match key_op { - KeyOp::Insert(value) => { - yield (FullKey::new(table_id, key.clone(), epoch), value.clone()); - } - KeyOp::Delete(_) => {} - KeyOp::Update(_) => unreachable!( - "memtable update should always be paired with a storage key" - ), - } - } - } - } - (Some(Err(_)), Some(_)) => { - // Throw the error. - return Err(inner_stream.next().await.unwrap().unwrap_err()); - } - } - } -} - -pub struct MemtableLocalStateStore { - mem_table: MemTable, - inner: S, - - epoch: Option, - - table_id: TableId, - op_consistency_level: OpConsistencyLevel, - table_option: TableOption, - vnodes: Arc, -} - -impl MemtableLocalStateStore { - pub fn new(inner: S, option: NewLocalOptions) -> Self { - Self { - inner, - mem_table: MemTable::new(option.op_consistency_level.clone()), - epoch: None, - table_id: option.table_id, - op_consistency_level: option.op_consistency_level, - table_option: option.table_option, - vnodes: option.vnodes, - } - } - - pub fn inner(&self) -> &S { - &self.inner - } -} - -impl LocalStateStore for MemtableLocalStateStore { - type Iter<'a> = impl StateStoreIter + 'a; - type RevIter<'a> = impl StateStoreIter + 'a; - - async fn get( - &self, - key: TableKey, - read_options: ReadOptions, - ) -> StorageResult> { - match self.mem_table.buffer.get(&key) { - None => self.inner.get(key, self.epoch(), read_options).await, - Some(op) => match op { - KeyOp::Insert(value) | KeyOp::Update((_, value)) => Ok(Some(value.clone())), - KeyOp::Delete(_) => Ok(None), - }, - } - } - - #[allow(clippy::manual_async_fn)] - fn iter( - &self, - key_range: TableKeyRange, - read_options: ReadOptions, - ) -> impl Future>> + Send + '_ { - async move { - let iter = self - .inner - .iter(key_range.clone(), self.epoch(), read_options) - .await?; - Ok(FromStreamStateStoreIter::new(Box::pin(merge_stream( - self.mem_table.iter(key_range), - iter.into_stream(to_owned_item), - self.table_id, - self.epoch(), - false, - )))) - } - } - - #[allow(clippy::manual_async_fn)] - fn rev_iter( - &self, - key_range: TableKeyRange, - read_options: ReadOptions, - ) -> impl Future>> + Send + '_ { - async move { - let iter = self - .inner - .rev_iter(key_range.clone(), self.epoch(), read_options) - .await?; - Ok(FromStreamStateStoreIter::new(Box::pin(merge_stream( - self.mem_table.rev_iter(key_range), - iter.into_stream(to_owned_item), - self.table_id, - self.epoch(), - true, - )))) - } - } - - fn insert( - &mut self, - key: TableKey, - new_val: Bytes, - old_val: Option, - ) -> StorageResult<()> { - match old_val { - None => self.mem_table.insert(key, new_val)?, - Some(old_val) => self.mem_table.update(key, old_val, new_val)?, - }; - Ok(()) - } - - fn delete(&mut self, key: TableKey, old_val: Bytes) -> StorageResult<()> { - Ok(self.mem_table.delete(key, old_val)?) - } - - async fn flush(&mut self) -> StorageResult { - let buffer = self.mem_table.drain().into_parts(); - let mut kv_pairs = Vec::with_capacity(buffer.len()); - for (key, key_op) in buffer { - match key_op { - // Currently, some executors do not strictly comply with these semantics. 
As - // a workaround you may call disable the check by initializing the - // state store with `op_consistency_level=Inconsistent`. - KeyOp::Insert(value) => { - if sanity_check_enabled() { - do_insert_sanity_check( - &key, - &value, - &self.inner, - self.epoch(), - self.table_id, - self.table_option, - &self.op_consistency_level, - ) - .await?; - } - kv_pairs.push((key, StorageValue::new_put(value))); - } - KeyOp::Delete(old_value) => { - if sanity_check_enabled() { - do_delete_sanity_check( - &key, - &old_value, - &self.inner, - self.epoch(), - self.table_id, - self.table_option, - &self.op_consistency_level, - ) - .await?; - } - kv_pairs.push((key, StorageValue::new_delete())); - } - KeyOp::Update((old_value, new_value)) => { - if sanity_check_enabled() { - do_update_sanity_check( - &key, - &old_value, - &new_value, - &self.inner, - self.epoch(), - self.table_id, - self.table_option, - &self.op_consistency_level, - ) - .await?; - } - kv_pairs.push((key, StorageValue::new_put(new_value))); - } - } - } - self.inner.ingest_batch( - kv_pairs, - vec![], - WriteOptions { - epoch: self.epoch(), - table_id: self.table_id, - }, - ) - } - - fn epoch(&self) -> u64 { - self.epoch.expect("should have set the epoch") - } - - fn is_dirty(&self) -> bool { - self.mem_table.is_dirty() - } - - #[allow(clippy::unused_async)] - async fn init(&mut self, options: InitOptions) -> StorageResult<()> { - assert!( - self.epoch.replace(options.epoch.curr).is_none(), - "epoch in local state store of table id {:?} is init for more than once", - self.table_id - ); - - Ok(()) - } - - fn seal_current_epoch(&mut self, next_epoch: u64, opts: SealCurrentEpochOptions) { - assert!(!self.is_dirty()); - if let Some(value_checker) = opts.switch_op_consistency_level { - self.mem_table.op_consistency_level.update(&value_checker); - } - let prev_epoch = self - .epoch - .replace(next_epoch) - .expect("should have init epoch before seal the first epoch"); - assert!( - next_epoch > prev_epoch, - "new epoch {} should be greater than current epoch: {}", - next_epoch, - prev_epoch - ); - if let Some((direction, watermarks)) = opts.table_watermarks { - let delete_ranges = watermarks - .iter() - .flat_map(|vnode_watermark| { - let inner_range = match direction { - WatermarkDirection::Ascending => { - (Unbounded, Excluded(vnode_watermark.watermark().clone())) - } - WatermarkDirection::Descending => { - (Excluded(vnode_watermark.watermark().clone()), Unbounded) - } - }; - vnode_watermark - .vnode_bitmap() - .iter_vnodes() - .map(move |vnode| { - let (start, end) = - prefixed_range_with_vnode(inner_range.clone(), vnode); - (start.map(|key| key.0.clone()), end.map(|key| key.0.clone())) - }) - }) - .collect_vec(); - if let Err(e) = self.inner.ingest_batch( - Vec::new(), - delete_ranges, - WriteOptions { - epoch: self.epoch(), - table_id: self.table_id, - }, - ) { - error!(error = %e.as_report(), "failed to write delete ranges of table watermark"); - } - } - } - - async fn try_flush(&mut self) -> StorageResult<()> { - Ok(()) - } - - fn update_vnode_bitmap(&mut self, vnodes: Arc) -> Arc { - std::mem::replace(&mut self.vnodes, vnodes) - } - - fn get_table_watermark(&self, _vnode: VirtualNode) -> Option { - // TODO: may store the written table watermark and have a correct implementation - None - } -} - #[cfg(test)] mod tests { use bytes::{BufMut, Bytes, BytesMut}; diff --git a/src/storage/src/memory.rs b/src/storage/src/memory.rs index 5f4a631494113..bd388fc47222b 100644 --- a/src/storage/src/memory.rs +++ b/src/storage/src/memory.rs @@ -14,18 
+14,31 @@ use std::cmp::Ordering; use std::collections::{BTreeMap, BTreeSet, VecDeque}; +use std::future::Future; use std::ops::Bound::{Excluded, Included, Unbounded}; use std::ops::{Bound, RangeBounds}; use std::sync::{Arc, LazyLock}; use bytes::Bytes; +use itertools::Itertools; use parking_lot::RwLock; -use risingwave_common::catalog::TableId; -use risingwave_hummock_sdk::key::{FullKey, TableKey, TableKeyRange, UserKey}; +use risingwave_common::bitmap::Bitmap; +use risingwave_common::catalog::{TableId, TableOption}; +use risingwave_common::hash::{VirtualNode, VnodeBitmapExt}; +use risingwave_hummock_sdk::key::{ + prefixed_range_with_vnode, FullKey, TableKey, TableKeyRange, UserKey, +}; +use risingwave_hummock_sdk::table_watermark::WatermarkDirection; use risingwave_hummock_sdk::{HummockEpoch, HummockReadEpoch}; +use thiserror_ext::AsReport; +use tracing::error; use crate::error::StorageResult; -use crate::mem_table::MemtableLocalStateStore; +use crate::hummock::utils::{ + do_delete_sanity_check, do_insert_sanity_check, do_update_sanity_check, merge_stream, + sanity_check_enabled, +}; +use crate::mem_table::{KeyOp, MemTable}; use crate::storage_value::StorageValue; use crate::store::*; @@ -603,7 +616,6 @@ impl RangeKvStateStore { } impl StateStoreRead for RangeKvStateStore { - type ChangeLogIter = RangeKvStateStoreChangeLogIter; type Iter = RangeKvStateStoreIter; type RevIter = RangeKvStateStoreRevIter; @@ -663,6 +675,10 @@ impl StateStoreRead for RangeKvStateStore { true, )) } +} + +impl StateStoreReadLog for RangeKvStateStore { + type ChangeLogIter = RangeKvStateStoreChangeLogIter; async fn iter_log( &self, @@ -692,8 +708,8 @@ impl StateStoreRead for RangeKvStateStore { } } -impl StateStoreWrite for RangeKvStateStore { - fn ingest_batch( +impl RangeKvStateStore { + pub(crate) fn ingest_batch( &self, mut kv_pairs: Vec<(TableKey, StorageValue)>, delete_ranges: Vec<(Bound, Bound)>, @@ -735,7 +751,7 @@ impl StateStoreWrite for RangeKvStateStore { } impl StateStore for RangeKvStateStore { - type Local = MemtableLocalStateStore; + type Local = RangeKvLocalStateStore; #[allow(clippy::unused_async)] async fn try_wait_epoch( @@ -748,7 +764,259 @@ impl StateStore for RangeKvStateStore { } async fn new_local(&self, option: NewLocalOptions) -> Self::Local { - MemtableLocalStateStore::new(self.clone(), option) + RangeKvLocalStateStore::new(self.clone(), option) + } +} + +pub struct RangeKvLocalStateStore { + mem_table: MemTable, + inner: RangeKvStateStore, + + epoch: Option, + + table_id: TableId, + op_consistency_level: OpConsistencyLevel, + table_option: TableOption, + vnodes: Arc, +} + +impl RangeKvLocalStateStore { + pub fn new(inner: RangeKvStateStore, option: NewLocalOptions) -> Self { + Self { + inner, + mem_table: MemTable::new(option.op_consistency_level.clone()), + epoch: None, + table_id: option.table_id, + op_consistency_level: option.op_consistency_level, + table_option: option.table_option, + vnodes: option.vnodes, + } + } +} + +impl LocalStateStore for RangeKvLocalStateStore { + type Iter<'a> = impl StateStoreIter + 'a; + type RevIter<'a> = impl StateStoreIter + 'a; + + async fn get( + &self, + key: TableKey, + read_options: ReadOptions, + ) -> StorageResult> { + match self.mem_table.buffer.get(&key) { + None => self.inner.get(key, self.epoch(), read_options).await, + Some(op) => match op { + KeyOp::Insert(value) | KeyOp::Update((_, value)) => Ok(Some(value.clone())), + KeyOp::Delete(_) => Ok(None), + }, + } + } + + #[allow(clippy::manual_async_fn)] + fn iter( + &self, + key_range: 
TableKeyRange, + read_options: ReadOptions, + ) -> impl Future>> + Send + '_ { + async move { + let iter = self + .inner + .iter(key_range.clone(), self.epoch(), read_options) + .await?; + Ok(FromStreamStateStoreIter::new(Box::pin(merge_stream( + self.mem_table.iter(key_range), + iter.into_stream(to_owned_item), + self.table_id, + self.epoch(), + false, + )))) + } + } + + #[allow(clippy::manual_async_fn)] + fn rev_iter( + &self, + key_range: TableKeyRange, + read_options: ReadOptions, + ) -> impl Future>> + Send + '_ { + async move { + let iter = self + .inner + .rev_iter(key_range.clone(), self.epoch(), read_options) + .await?; + Ok(FromStreamStateStoreIter::new(Box::pin(merge_stream( + self.mem_table.rev_iter(key_range), + iter.into_stream(to_owned_item), + self.table_id, + self.epoch(), + true, + )))) + } + } + + fn insert( + &mut self, + key: TableKey, + new_val: Bytes, + old_val: Option, + ) -> StorageResult<()> { + match old_val { + None => self.mem_table.insert(key, new_val)?, + Some(old_val) => self.mem_table.update(key, old_val, new_val)?, + }; + Ok(()) + } + + fn delete(&mut self, key: TableKey, old_val: Bytes) -> StorageResult<()> { + Ok(self.mem_table.delete(key, old_val)?) + } + + async fn flush(&mut self) -> StorageResult { + let buffer = self.mem_table.drain().into_parts(); + let mut kv_pairs = Vec::with_capacity(buffer.len()); + for (key, key_op) in buffer { + match key_op { + // Currently, some executors do not strictly comply with these semantics. As + // a workaround you may call disable the check by initializing the + // state store with `op_consistency_level=Inconsistent`. + KeyOp::Insert(value) => { + if sanity_check_enabled() { + do_insert_sanity_check( + &key, + &value, + &self.inner, + self.epoch(), + self.table_id, + self.table_option, + &self.op_consistency_level, + ) + .await?; + } + kv_pairs.push((key, StorageValue::new_put(value))); + } + KeyOp::Delete(old_value) => { + if sanity_check_enabled() { + do_delete_sanity_check( + &key, + &old_value, + &self.inner, + self.epoch(), + self.table_id, + self.table_option, + &self.op_consistency_level, + ) + .await?; + } + kv_pairs.push((key, StorageValue::new_delete())); + } + KeyOp::Update((old_value, new_value)) => { + if sanity_check_enabled() { + do_update_sanity_check( + &key, + &old_value, + &new_value, + &self.inner, + self.epoch(), + self.table_id, + self.table_option, + &self.op_consistency_level, + ) + .await?; + } + kv_pairs.push((key, StorageValue::new_put(new_value))); + } + } + } + self.inner.ingest_batch( + kv_pairs, + vec![], + WriteOptions { + epoch: self.epoch(), + table_id: self.table_id, + }, + ) + } + + fn epoch(&self) -> u64 { + self.epoch.expect("should have set the epoch") + } + + fn is_dirty(&self) -> bool { + self.mem_table.is_dirty() + } + + #[allow(clippy::unused_async)] + async fn init(&mut self, options: InitOptions) -> StorageResult<()> { + assert!( + self.epoch.replace(options.epoch.curr).is_none(), + "epoch in local state store of table id {:?} is init for more than once", + self.table_id + ); + + Ok(()) + } + + fn seal_current_epoch(&mut self, next_epoch: u64, opts: SealCurrentEpochOptions) { + assert!(!self.is_dirty()); + if let Some(value_checker) = opts.switch_op_consistency_level { + self.mem_table.op_consistency_level.update(&value_checker); + } + let prev_epoch = self + .epoch + .replace(next_epoch) + .expect("should have init epoch before seal the first epoch"); + assert!( + next_epoch > prev_epoch, + "new epoch {} should be greater than current epoch: {}", + next_epoch, + 
prev_epoch + ); + if let Some((direction, watermarks)) = opts.table_watermarks { + let delete_ranges = watermarks + .iter() + .flat_map(|vnode_watermark| { + let inner_range = match direction { + WatermarkDirection::Ascending => { + (Unbounded, Excluded(vnode_watermark.watermark().clone())) + } + WatermarkDirection::Descending => { + (Excluded(vnode_watermark.watermark().clone()), Unbounded) + } + }; + vnode_watermark + .vnode_bitmap() + .iter_vnodes() + .map(move |vnode| { + let (start, end) = + prefixed_range_with_vnode(inner_range.clone(), vnode); + (start.map(|key| key.0.clone()), end.map(|key| key.0.clone())) + }) + }) + .collect_vec(); + if let Err(e) = self.inner.ingest_batch( + Vec::new(), + delete_ranges, + WriteOptions { + epoch: self.epoch(), + table_id: self.table_id, + }, + ) { + error!(error = %e.as_report(), "failed to write delete ranges of table watermark"); + } + } + } + + async fn try_flush(&mut self) -> StorageResult<()> { + Ok(()) + } + + fn update_vnode_bitmap(&mut self, vnodes: Arc) -> Arc { + std::mem::replace(&mut self.vnodes, vnodes) + } + + fn get_table_watermark(&self, _vnode: VirtualNode) -> Option { + // TODO: may store the written table watermark and have a correct implementation + None } } diff --git a/src/storage/src/monitor/monitored_store.rs b/src/storage/src/monitor/monitored_store.rs index 3e795f72ab47a..96a225cc22635 100644 --- a/src/storage/src/monitor/monitored_store.rs +++ b/src/storage/src/monitor/monitored_store.rs @@ -175,7 +175,6 @@ impl MonitoredStateStore { } impl StateStoreRead for MonitoredStateStore { - type ChangeLogIter = impl StateStoreReadChangeLogIter; type Iter = impl StateStoreReadIter; type RevIter = impl StateStoreReadIter; @@ -217,6 +216,10 @@ impl StateStoreRead for MonitoredStateStore { self.inner.rev_iter(key_range, epoch, read_options), ) } +} + +impl StateStoreReadLog for MonitoredStateStore { + type ChangeLogIter = impl StateStoreReadChangeLogIter; fn iter_log( &self, diff --git a/src/storage/src/monitor/traced_store.rs b/src/storage/src/monitor/traced_store.rs index 4fa24d425953b..fc4aecc6c4054 100644 --- a/src/storage/src/monitor/traced_store.rs +++ b/src/storage/src/monitor/traced_store.rs @@ -285,7 +285,6 @@ impl StateStore for TracedStateStore { } impl StateStoreRead for TracedStateStore { - type ChangeLogIter = impl StateStoreReadChangeLogIter; type Iter = impl StateStoreReadIter; type RevIter = impl StateStoreReadIter; @@ -336,6 +335,10 @@ impl StateStoreRead for TracedStateStore { ); self.traced_iter(self.inner.rev_iter(key_range, epoch, read_options), span) } +} + +impl StateStoreReadLog for TracedStateStore { + type ChangeLogIter = impl StateStoreReadChangeLogIter; fn iter_log( &self, diff --git a/src/storage/src/panic_store.rs b/src/storage/src/panic_store.rs index 1ca087696cc6c..1f04281a38a11 100644 --- a/src/storage/src/panic_store.rs +++ b/src/storage/src/panic_store.rs @@ -13,7 +13,6 @@ // limitations under the License. use std::marker::PhantomData; -use std::ops::Bound; use std::sync::Arc; use bytes::Bytes; @@ -23,7 +22,6 @@ use risingwave_hummock_sdk::key::{TableKey, TableKeyRange}; use risingwave_hummock_sdk::HummockReadEpoch; use crate::error::StorageResult; -use crate::storage_value::StorageValue; use crate::store::*; /// A panic state store. 
If a workload is fully in-memory, we can use this state store to @@ -32,7 +30,6 @@ use crate::store::*; pub struct PanicStateStore; impl StateStoreRead for PanicStateStore { - type ChangeLogIter = PanicStateStoreIter; type Iter = PanicStateStoreIter; type RevIter = PanicStateStoreIter; @@ -65,6 +62,10 @@ impl StateStoreRead for PanicStateStore { ) -> StorageResult { panic!("should not read from the state store!"); } +} + +impl StateStoreReadLog for PanicStateStore { + type ChangeLogIter = PanicStateStoreIter; async fn iter_log( &self, @@ -76,17 +77,6 @@ impl StateStoreRead for PanicStateStore { } } -impl StateStoreWrite for PanicStateStore { - fn ingest_batch( - &self, - _kv_pairs: Vec<(TableKey, StorageValue)>, - _delete_ranges: Vec<(Bound, Bound)>, - _write_options: WriteOptions, - ) -> StorageResult { - panic!("should not write to the state store!"); - } -} - impl LocalStateStore for PanicStateStore { type Iter<'a> = PanicStateStoreIter; type RevIter<'a> = PanicStateStoreIter; diff --git a/src/storage/src/store.rs b/src/storage/src/store.rs index da8fdf206ad5d..531ca629d4d65 100644 --- a/src/storage/src/store.rs +++ b/src/storage/src/store.rs @@ -17,7 +17,6 @@ use std::default::Default; use std::fmt::{Debug, Formatter}; use std::future::Future; use std::marker::PhantomData; -use std::ops::Bound; use std::sync::{Arc, LazyLock}; use bytes::Bytes; @@ -42,7 +41,6 @@ use risingwave_pb::hummock::PbVnodeWatermark; use crate::error::{StorageError, StorageResult}; use crate::hummock::CachePolicy; use crate::monitor::{MonitoredStateStore, MonitoredStorageMetrics}; -use crate::storage_value::StorageValue; pub trait StaticSendSync = Send + Sync + 'static; @@ -59,9 +57,7 @@ impl IterItem for StateStoreReadLogItem { } pub trait StateStoreIter: Send { - fn try_next( - &mut self, - ) -> impl Future>>> + Send + '_; + fn try_next(&mut self) -> impl StorageFuture<'_, Option>>; } pub fn to_owned_item((key, value): StateStoreKeyedRowRef<'_>) -> StorageResult { @@ -238,11 +234,22 @@ pub struct ReadLogOptions { } pub trait StateStoreReadChangeLogIter = StateStoreIter + Send + 'static; +pub trait StorageFuture<'a, T> = Future> + Send + 'a; + +pub trait StateStoreReadLog: StaticSendSync { + type ChangeLogIter: StateStoreReadChangeLogIter; + + fn iter_log( + &self, + epoch_range: (u64, u64), + key_range: TableKeyRange, + options: ReadLogOptions, + ) -> impl StorageFuture<'_, Self::ChangeLogIter>; +} pub trait StateStoreRead: StaticSendSync { type Iter: StateStoreReadIter; type RevIter: StateStoreReadIter; - type ChangeLogIter: StateStoreReadChangeLogIter; /// Point gets a value from the state store. /// The result is based on a snapshot corresponding to the given `epoch`. @@ -252,7 +259,7 @@ pub trait StateStoreRead: StaticSendSync { key: TableKey, epoch: u64, read_options: ReadOptions, - ) -> impl Future>> + Send + '_; + ) -> impl StorageFuture<'_, Option>; /// Point gets a value from the state store. /// The result is based on a snapshot corresponding to the given `epoch`. 
@@ -262,7 +269,7 @@ pub trait StateStoreRead: StaticSendSync { key: TableKey, epoch: u64, read_options: ReadOptions, - ) -> impl Future>> + Send + '_ { + ) -> impl StorageFuture<'_, Option> { self.get_keyed_row(key, epoch, read_options) .map_ok(|v| v.map(|(_, v)| v)) } @@ -277,21 +284,14 @@ pub trait StateStoreRead: StaticSendSync { key_range: TableKeyRange, epoch: u64, read_options: ReadOptions, - ) -> impl Future> + Send + '_; + ) -> impl StorageFuture<'_, Self::Iter>; fn rev_iter( &self, key_range: TableKeyRange, epoch: u64, read_options: ReadOptions, - ) -> impl Future> + Send + '_; - - fn iter_log( - &self, - epoch_range: (u64, u64), - key_range: TableKeyRange, - options: ReadLogOptions, - ) -> impl Future> + Send + '_; + ) -> impl StorageFuture<'_, Self::RevIter>; } pub trait StateStoreReadExt: StaticSendSync { @@ -308,7 +308,7 @@ pub trait StateStoreReadExt: StaticSendSync { epoch: u64, limit: Option, read_options: ReadOptions, - ) -> impl Future>> + Send + '_; + ) -> impl StorageFuture<'_, Vec>; } impl StateStoreReadExt for S { @@ -333,29 +333,6 @@ impl StateStoreReadExt for S { } } -pub trait StateStoreWrite: StaticSendSync { - /// Writes a batch to storage. The batch should be: - /// * Ordered. KV pairs will be directly written to the table, so it must be ordered. - /// * Locally unique. There should not be two or more operations on the same key in one write - /// batch. - /// - /// Ingests a batch of data into the state store. One write batch should never contain operation - /// on the same key. e.g. Put(233, x) then Delete(233). - /// An epoch should be provided to ingest a write batch. It is served as: - /// - A handle to represent an atomic write session. All ingested write batches associated with - /// the same `Epoch` have the all-or-nothing semantics, meaning that partial changes are not - /// queryable and will be rolled back if instructed. - /// - A version of a kv pair. kv pair associated with larger `Epoch` is guaranteed to be newer - /// then kv pair with smaller `Epoch`. Currently this version is only used to derive the - /// per-key modification history (e.g. in compaction), not across different keys. - fn ingest_batch( - &self, - kv_pairs: Vec<(TableKey, StorageValue)>, - delete_ranges: Vec<(Bound, Bound)>, - write_options: WriteOptions, - ) -> StorageResult; -} - #[derive(Clone)] pub struct TryWaitEpochOptions { pub table_id: TableId, @@ -384,7 +361,7 @@ impl From for TracedTryWaitEpochOptions { } } -pub trait StateStore: StateStoreRead + StaticSendSync + Clone { +pub trait StateStore: StateStoreRead + StateStoreReadLog + StaticSendSync + Clone { type Local: LocalStateStore; /// If epoch is `Committed`, we will wait until the epoch is committed and its data is ready to @@ -393,7 +370,7 @@ pub trait StateStore: StateStoreRead + StaticSendSync + Clone { &self, epoch: HummockReadEpoch, options: TryWaitEpochOptions, - ) -> impl Future> + Send + '_; + ) -> impl StorageFuture<'_, ()>; /// Creates a [`MonitoredStateStore`] from this state store, with given `stats`. 
fn monitored(self, storage_metrics: Arc) -> MonitoredStateStore { @@ -416,7 +393,7 @@ pub trait LocalStateStore: StaticSendSync { &self, key: TableKey, read_options: ReadOptions, - ) -> impl Future>> + Send + '_; + ) -> impl StorageFuture<'_, Option>; /// Opens and returns an iterator for given `prefix_hint` and `full_key_range` /// Internally, `prefix_hint` will be used to for checking `bloom_filter` and @@ -427,13 +404,13 @@ pub trait LocalStateStore: StaticSendSync { &self, key_range: TableKeyRange, read_options: ReadOptions, - ) -> impl Future>> + Send + '_; + ) -> impl StorageFuture<'_, Self::Iter<'_>>; fn rev_iter( &self, key_range: TableKeyRange, read_options: ReadOptions, - ) -> impl Future>> + Send + '_; + ) -> impl StorageFuture<'_, Self::RevIter<'_>>; /// Get last persisted watermark for a given vnode. fn get_table_watermark(&self, vnode: VirtualNode) -> Option; @@ -450,9 +427,9 @@ pub trait LocalStateStore: StaticSendSync { /// than the given `epoch` will be deleted. fn delete(&mut self, key: TableKey, old_val: Bytes) -> StorageResult<()>; - fn flush(&mut self) -> impl Future> + Send + '_; + fn flush(&mut self) -> impl StorageFuture<'_, usize>; - fn try_flush(&mut self) -> impl Future> + Send + '_; + fn try_flush(&mut self) -> impl StorageFuture<'_, ()>; fn epoch(&self) -> u64; fn is_dirty(&self) -> bool; @@ -463,7 +440,7 @@ pub trait LocalStateStore: StaticSendSync { /// In some cases like replicated state table, state table may not be empty initially, /// as such we need to wait for `epoch.prev` checkpoint to complete, /// hence this interface is made async. - fn init(&mut self, opts: InitOptions) -> impl Future> + Send + '_; + fn init(&mut self, opts: InitOptions) -> impl StorageFuture<'_, ()>; /// Updates the monotonically increasing write epoch to `new_epoch`. /// All writes after this function is called will be tagged with `new_epoch`. 
In other words, diff --git a/src/storage/src/store_impl.rs b/src/storage/src/store_impl.rs index 87883911c58e2..6488e9c3f23b6 100644 --- a/src/storage/src/store_impl.rs +++ b/src/storage/src/store_impl.rs @@ -228,7 +228,7 @@ pub mod verify { use std::fmt::Debug; use std::future::Future; use std::marker::PhantomData; - use std::ops::{Bound, Deref}; + use std::ops::Deref; use std::sync::Arc; use bytes::Bytes; @@ -240,7 +240,6 @@ pub mod verify { use crate::error::StorageResult; use crate::hummock::HummockStorage; - use crate::storage_value::StorageValue; use crate::store::*; use crate::store_impl::AsHummock; @@ -263,6 +262,7 @@ pub mod verify { } } + #[derive(Clone)] pub struct VerifyStateStore { pub actual: A, pub expected: Option, @@ -276,7 +276,6 @@ pub mod verify { } impl StateStoreRead for VerifyStateStore { - type ChangeLogIter = impl StateStoreReadChangeLogIter; type Iter = impl StateStoreReadIter; type RevIter = impl StateStoreReadIter; @@ -342,6 +341,10 @@ pub mod verify { Ok(verify_iter::(actual, expected)) } } + } + + impl StateStoreReadLog for VerifyStateStore { + type ChangeLogIter = impl StateStoreReadChangeLogIter; async fn iter_log( &self, @@ -392,36 +395,6 @@ pub mod verify { } } - impl StateStoreWrite for VerifyStateStore { - fn ingest_batch( - &self, - kv_pairs: Vec<(TableKey, StorageValue)>, - delete_ranges: Vec<(Bound, Bound)>, - write_options: WriteOptions, - ) -> StorageResult { - let actual = self.actual.ingest_batch( - kv_pairs.clone(), - delete_ranges.clone(), - write_options.clone(), - ); - if let Some(expected) = &self.expected { - let expected = expected.ingest_batch(kv_pairs, delete_ranges, write_options); - assert_eq!(actual.is_err(), expected.is_err()); - } - actual - } - } - - impl Clone for VerifyStateStore { - fn clone(&self) -> Self { - Self { - actual: self.actual.clone(), - expected: self.expected.clone(), - _phantom: PhantomData, - } - } - } - impl LocalStateStore for VerifyStateStore { type Iter<'a> = impl StateStoreIter + 'a; type RevIter<'a> = impl StateStoreIter + 'a; @@ -806,7 +779,7 @@ impl AsHummock for SledStateStore { } #[cfg(debug_assertions)] -pub mod boxed_state_store { +mod boxed_state_store { use std::future::Future; use std::ops::{Deref, DerefMut}; use std::sync::Arc; @@ -873,7 +846,10 @@ pub mod boxed_state_store { epoch: u64, read_options: ReadOptions, ) -> StorageResult; + } + #[async_trait::async_trait] + pub trait DynamicDispatchedStateStoreReadLog: StaticSendSync { async fn iter_log( &self, epoch_range: (u64, u64), @@ -912,7 +888,10 @@ pub mod boxed_state_store { self.rev_iter(key_range, epoch, read_options).await?, )) } + } + #[async_trait::async_trait] + impl DynamicDispatchedStateStoreReadLog for S { async fn iter_log( &self, epoch_range: (u64, u64), @@ -1159,7 +1138,6 @@ pub mod boxed_state_store { pub type BoxDynamicDispatchedStateStore = Box; impl StateStoreRead for BoxDynamicDispatchedStateStore { - type ChangeLogIter = BoxStateStoreReadChangeLogIter; type Iter = BoxStateStoreReadIter; type RevIter = BoxStateStoreReadIter; @@ -1189,6 +1167,10 @@ pub mod boxed_state_store { ) -> impl Future> + '_ { self.deref().rev_iter(key_range, epoch, read_options) } + } + + impl StateStoreReadLog for BoxDynamicDispatchedStateStore { + type ChangeLogIter = BoxStateStoreReadChangeLogIter; fn iter_log( &self, @@ -1201,7 +1183,11 @@ pub mod boxed_state_store { } pub trait DynamicDispatchedStateStore: - DynClone + DynamicDispatchedStateStoreRead + DynamicDispatchedStateStoreExt + AsHummock + DynClone + + DynamicDispatchedStateStoreRead + + 
DynamicDispatchedStateStoreReadLog + + DynamicDispatchedStateStoreExt + + AsHummock { } @@ -1214,7 +1200,11 @@ pub mod boxed_state_store { } impl< - S: DynClone + DynamicDispatchedStateStoreRead + DynamicDispatchedStateStoreExt + AsHummock, + S: DynClone + + DynamicDispatchedStateStoreRead + + DynamicDispatchedStateStoreReadLog + + DynamicDispatchedStateStoreExt + + AsHummock, > DynamicDispatchedStateStore for S { } diff --git a/src/stream/src/common/log_store_impl/kv_log_store/reader.rs b/src/stream/src/common/log_store_impl/kv_log_store/reader.rs index c27cf711a3ad0..20c8e7fbcb576 100644 --- a/src/stream/src/common/log_store_impl/kv_log_store/reader.rs +++ b/src/stream/src/common/log_store_impl/kv_log_store/reader.rs @@ -674,54 +674,65 @@ impl LogReader for KvLogStoreReader { #[cfg(test)] mod tests { - use std::ops::Bound::Unbounded; + use std::collections::{Bound, HashSet}; use bytes::Bytes; use itertools::Itertools; - use risingwave_common::util::epoch::test_epoch; - use risingwave_hummock_sdk::key::TableKey; + use risingwave_common::hash::VirtualNode; + use risingwave_common::util::epoch::{test_epoch, EpochExt}; + use risingwave_hummock_sdk::key::{prefixed_range_with_vnode, KeyPayloadType, TableKey}; + use risingwave_hummock_test::local_state_store_test_utils::LocalStateStoreTestExt; + use risingwave_hummock_test::test_utils::prepare_hummock_test_env; use risingwave_storage::hummock::iterator::test_utils::{ iterator_test_table_key_of, iterator_test_value_of, }; - use risingwave_storage::memory::MemoryStateStore; - use risingwave_storage::storage_value::StorageValue; - use risingwave_storage::store::{ReadOptions, StateStoreRead, StateStoreWrite, WriteOptions}; - use risingwave_storage::StateStoreIter; + use risingwave_storage::store::{ + LocalStateStore, NewLocalOptions, ReadOptions, SealCurrentEpochOptions, StateStoreRead, + }; + use risingwave_storage::{StateStore, StateStoreIter}; use crate::common::log_store_impl::kv_log_store::reader::AutoRebuildStateStoreReadIter; use crate::common::log_store_impl::kv_log_store::test_utils::TEST_TABLE_ID; #[tokio::test] async fn test_auto_rebuild_iter() { - let state_store = MemoryStateStore::new(); + let test_env = prepare_hummock_test_env().await; + test_env.register_table_id(TEST_TABLE_ID).await; + let mut state_store = test_env + .storage + .new_local(NewLocalOptions::for_test(TEST_TABLE_ID)) + .await; + let epoch = test_epoch(1); + test_env + .storage + .start_epoch(epoch, HashSet::from_iter([TEST_TABLE_ID])); + state_store.init_for_test(epoch).await.unwrap(); let key_count = 100; let pairs = (0..key_count) .map(|i| { let key = iterator_test_table_key_of(i); let value = iterator_test_value_of(i); - (TableKey(Bytes::from(key)), StorageValue::new_put(value)) + (TableKey(Bytes::from(key)), Bytes::from(value)) }) .collect_vec(); - let epoch = test_epoch(1); - state_store - .ingest_batch( - pairs.clone(), - vec![], - WriteOptions { - epoch, - table_id: TEST_TABLE_ID, - }, - ) - .unwrap(); + for (key, value) in &pairs { + state_store + .insert(key.clone(), value.clone(), None) + .unwrap(); + } + state_store.flush().await.unwrap(); + state_store.seal_current_epoch(epoch.next_epoch(), SealCurrentEpochOptions::for_test()); + test_env.commit_epoch(epoch).await; + let state_store = test_env.storage.clone(); async fn validate( - mut kv_iter: impl Iterator, StorageValue)>, + mut kv_iter: impl Iterator, Bytes)>, mut iter: impl StateStoreIter, ) { while let Some((key, value)) = iter.try_next().await.unwrap() { let (k, v) = kv_iter.next().unwrap(); 
assert_eq!(key.user_key.table_key, k.to_ref()); - assert_eq!(v.user_value.as_deref(), Some(value)); + assert_eq!(v.as_ref(), value); } assert!(kv_iter.next().is_none()); } @@ -730,10 +741,17 @@ mod tests { table_id: TEST_TABLE_ID, ..Default::default() }; + let key_range = prefixed_range_with_vnode( + ( + Bound::::Unbounded, + Bound::::Unbounded, + ), + VirtualNode::ZERO, + ); let kv_iter = pairs.clone().into_iter(); let iter = state_store - .iter((Unbounded, Unbounded), epoch, read_options.clone()) + .iter(key_range.clone(), epoch, read_options.clone()) .await .unwrap(); validate(kv_iter, iter).await; @@ -755,7 +773,7 @@ mod tests { false } }, - (Unbounded, Unbounded), + key_range.clone(), epoch, read_options, ) From 1bc6bea26cd2c103b827837b007f3a1d5e2d6067 Mon Sep 17 00:00:00 2001 From: Bohan Zhang Date: Wed, 8 Jan 2025 16:28:41 +0800 Subject: [PATCH 07/21] feat: batching telemetry event request avoid too many requests (#20000) Co-authored-by: tabversion --- proto/telemetry.proto | 7 +++-- src/common/src/telemetry/report.rs | 37 ++++++++++++++++++++++++--- src/common/telemetry_event/src/lib.rs | 35 +++++++++++++++++-------- 3 files changed, 63 insertions(+), 16 deletions(-) diff --git a/proto/telemetry.proto b/proto/telemetry.proto index 06e161506db15..162ab80648aba 100644 --- a/proto/telemetry.proto +++ b/proto/telemetry.proto @@ -7,8 +7,7 @@ option go_package = "risingwavelabs.com/risingwave/proto/telemetry"; enum MetaBackend { META_BACKEND_UNSPECIFIED = 0; META_BACKEND_MEMORY = 1; - reserved 2; - reserved "META_BACKEND_ETCD"; + META_BACKEND_ETCD = 2; META_BACKEND_RDB = 3; } @@ -167,3 +166,7 @@ message EventMessage { // mark the event is a test message bool is_test = 11; } + +message BatchEventMessage { + repeated EventMessage events = 1; +} diff --git a/src/common/src/telemetry/report.rs b/src/common/src/telemetry/report.rs index d4b1fd27335df..f4737038863b0 100644 --- a/src/common/src/telemetry/report.rs +++ b/src/common/src/telemetry/report.rs @@ -14,13 +14,18 @@ use std::sync::Arc; -use risingwave_telemetry_event::get_telemetry_risingwave_cloud_uuid; +use risingwave_pb::telemetry::PbEventMessage; pub use risingwave_telemetry_event::{ - current_timestamp, post_telemetry_report_pb, TELEMETRY_REPORT_URL, TELEMETRY_TRACKING_ID, + current_timestamp, do_telemetry_event_report, post_telemetry_report_pb, + TELEMETRY_EVENT_REPORT_INTERVAL, TELEMETRY_REPORT_URL, TELEMETRY_TRACKING_ID, +}; +use risingwave_telemetry_event::{ + get_telemetry_risingwave_cloud_uuid, TELEMETRY_EVENT_REPORT_STASH_SIZE, + TELEMETRY_EVENT_REPORT_TX, }; use tokio::sync::oneshot::Sender; use tokio::task::JoinHandle; -use tokio::time::{interval, Duration}; +use tokio::time::{interval as tokio_interval_fn, Duration}; use uuid::Uuid; use super::{Result, TELEMETRY_REPORT_INTERVAL}; @@ -60,9 +65,13 @@ where let begin_time = std::time::Instant::now(); let session_id = Uuid::new_v4().to_string(); - let mut interval = interval(Duration::from_secs(TELEMETRY_REPORT_INTERVAL)); + let mut interval = tokio_interval_fn(Duration::from_secs(TELEMETRY_REPORT_INTERVAL)); interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + let mut event_interval = + tokio_interval_fn(Duration::from_secs(TELEMETRY_EVENT_REPORT_INTERVAL)); + event_interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + // fetch telemetry tracking_id from the meta node only at the beginning // There is only one case tracking_id updated at the runtime ---- metastore data has been // cleaned. 
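The loop this patch adds to the reporting task (continued just below) batches telemetry events instead of issuing one request per event: events are pushed through an unbounded channel, stashed in memory, and flushed as a single batched request either when the stash reaches `TELEMETRY_EVENT_REPORT_STASH_SIZE` or when the event interval fires. A stand-alone sketch of that size-or-time flush pattern, assuming a tokio runtime; `report` stands in for `do_telemetry_event_report`, and the constants only mirror the values used in the patch:

    use tokio::sync::mpsc;
    use tokio::time::{interval, Duration, MissedTickBehavior};

    const STASH_SIZE: usize = 100;                            // flush when this many events are buffered
    const FLUSH_INTERVAL: Duration = Duration::from_secs(10); // ...or when this much time has passed

    async fn report(stash: &mut Vec<String>) {
        if stash.is_empty() {
            return;
        }
        println!("reporting {} events in one request", stash.len());
        stash.clear(); // the real code drains the stash into one batched protobuf message
    }

    #[tokio::main]
    async fn main() {
        let (tx, mut rx) = mpsc::unbounded_channel::<String>();

        // Producer side: anything holding the sender can enqueue events cheaply.
        tokio::spawn(async move {
            for i in 0..250 {
                let _ = tx.send(format!("event-{i}"));
            }
        });

        let mut ticker = interval(FLUSH_INTERVAL);
        ticker.set_missed_tick_behavior(MissedTickBehavior::Skip);
        let mut stash = Vec::new();

        loop {
            tokio::select! {
                maybe_event = rx.recv() => {
                    match maybe_event {
                        Some(event) => {
                            stash.push(event);
                            if stash.len() >= STASH_SIZE {
                                report(&mut stash).await; // size-triggered flush
                            }
                        }
                        // Channel closed: flush whatever is left and stop.
                        None => { report(&mut stash).await; break; }
                    }
                }
                _ = ticker.tick() => {
                    report(&mut stash).await; // time-triggered flush
                }
            }
        }
    }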
There is no way that metastore has been cleaned but nodes are still running @@ -91,9 +100,29 @@ where ) }); + let (tx, mut event_rx) = tokio::sync::mpsc::unbounded_channel::(); + TELEMETRY_EVENT_REPORT_TX.set(tx).unwrap_or_else(|_| { + tracing::warn!( + "Telemetry failed to set event reporting tx, event reporting will be disabled" + ); + }); + let mut event_stash = Vec::new(); + loop { tokio::select! { _ = interval.tick() => {}, + event = event_rx.recv() => { + debug_assert!(event.is_some()); + event_stash.push(event.unwrap()); + if event_stash.len() >= TELEMETRY_EVENT_REPORT_STASH_SIZE { + do_telemetry_event_report(&mut event_stash).await; + } + continue; + } + _ = event_interval.tick() => { + do_telemetry_event_report(&mut event_stash).await; + continue; + }, _ = &mut shutdown_rx => { tracing::info!("Telemetry exit"); return; diff --git a/src/common/telemetry_event/src/lib.rs b/src/common/telemetry_event/src/lib.rs index 0be5f40e0de1c..18874f4818627 100644 --- a/src/common/telemetry_event/src/lib.rs +++ b/src/common/telemetry_event/src/lib.rs @@ -21,9 +21,11 @@ use std::sync::OnceLock; use prost::Message; use risingwave_pb::telemetry::{ - EventMessage as PbEventMessage, PbTelemetryDatabaseObject, + EventMessage as PbEventMessage, PbBatchEventMessage, PbTelemetryDatabaseObject, TelemetryEventStage as PbTelemetryEventStage, }; +use thiserror_ext::AsReport; +use tokio::sync::mpsc::UnboundedSender; pub use util::*; pub type TelemetryResult = core::result::Result; @@ -32,6 +34,7 @@ pub type TelemetryResult = core::result::Result; pub type TelemetryError = String; pub static TELEMETRY_TRACKING_ID: OnceLock = OnceLock::new(); +pub static TELEMETRY_EVENT_REPORT_TX: OnceLock> = OnceLock::new(); pub const TELEMETRY_REPORT_URL: &str = "https://telemetry.risingwave.dev/api/v2/report"; @@ -42,6 +45,21 @@ pub fn get_telemetry_risingwave_cloud_uuid() -> Option { env::var(TELEMETRY_RISINGWAVE_CLOUD_UUID).ok() } +pub async fn do_telemetry_event_report(event_stash: &mut Vec) { + const TELEMETRY_EVENT_REPORT_TYPE: &str = "events"; // the batch report url + let url = (TELEMETRY_REPORT_URL.to_owned() + "/" + TELEMETRY_EVENT_REPORT_TYPE).to_owned(); + let batch_message = PbBatchEventMessage { + events: std::mem::take(event_stash), + }; + + post_telemetry_report_pb(&url, batch_message.encode_to_vec()) + .await + .unwrap_or_else(|e| tracing::debug!("{}", e)); +} + +pub const TELEMETRY_EVENT_REPORT_INTERVAL: u64 = 10; // 10 seconds +pub const TELEMETRY_EVENT_REPORT_STASH_SIZE: usize = 100; // 100 events to trigger a report action + pub fn report_event_common( event_stage: PbTelemetryEventStage, event_name: &str, @@ -95,15 +113,12 @@ pub fn request_to_telemetry_event( node, is_test, }; - let report_bytes = event.encode_to_vec(); - - tokio::spawn(async move { - const TELEMETRY_EVENT_REPORT_TYPE: &str = "event"; - let url = (TELEMETRY_REPORT_URL.to_owned() + "/" + TELEMETRY_EVENT_REPORT_TYPE).to_owned(); - post_telemetry_report_pb(&url, report_bytes) - .await - .unwrap_or_else(|e| tracing::info!("{}", e)) - }); + + if let Some(tx) = TELEMETRY_EVENT_REPORT_TX.get() { + let _ = tx.send(event).inspect_err(|e| { + tracing::warn!("Failed to send telemetry event queue: {}", e.as_report()) + }); + } } #[cfg(test)] From bb4dec288c5d93cc12c549e56ca7d11b0e827dc3 Mon Sep 17 00:00:00 2001 From: xxchan Date: Wed, 8 Jan 2025 21:50:11 +0800 Subject: [PATCH 08/21] refactor(meta): splits source_manager into smaller mods (#20071) Signed-off-by: xxchan --- src/meta/src/rpc/ddl_controller.rs | 4 +- 
src/meta/src/stream/source_manager.rs | 1131 +---------------- .../stream/source_manager/split_assignment.rs | 830 ++++++++++++ src/meta/src/stream/source_manager/worker.rs | 320 +++++ 4 files changed, 1159 insertions(+), 1126 deletions(-) create mode 100644 src/meta/src/stream/source_manager/split_assignment.rs create mode 100644 src/meta/src/stream/source_manager/worker.rs diff --git a/src/meta/src/rpc/ddl_controller.rs b/src/meta/src/rpc/ddl_controller.rs index 70419e6118b13..5c203df91748f 100644 --- a/src/meta/src/rpc/ddl_controller.rs +++ b/src/meta/src/rpc/ddl_controller.rs @@ -75,7 +75,7 @@ use crate::manager::{ }; use crate::model::{StreamContext, StreamJobFragments, TableParallelism}; use crate::stream::{ - create_source_worker_handle, validate_sink, ActorGraphBuildResult, ActorGraphBuilder, + create_source_worker, validate_sink, ActorGraphBuildResult, ActorGraphBuilder, CompleteStreamFragmentGraph, CreateStreamingJobContext, CreateStreamingJobOption, GlobalStreamManagerRef, ReplaceStreamJobContext, SourceChange, SourceManagerRef, StreamFragmentGraph, @@ -441,7 +441,7 @@ impl DdlController { /// Shared source is handled in [`Self::create_streaming_job`] async fn create_non_shared_source(&self, source: Source) -> MetaResult { - let handle = create_source_worker_handle(&source, self.source_manager.metrics.clone()) + let handle = create_source_worker(&source, self.source_manager.metrics.clone()) .await .context("failed to create source worker")?; diff --git a/src/meta/src/stream/source_manager.rs b/src/meta/src/stream/source_manager.rs index 958ce8a802bfc..6d2ea50d0a5cd 100644 --- a/src/meta/src/stream/source_manager.rs +++ b/src/meta/src/stream/source_manager.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +mod split_assignment; +mod worker; use std::borrow::BorrowMut; use std::cmp::Ordering; use std::collections::hash_map::Entry; @@ -40,6 +42,8 @@ use tokio::sync::{oneshot, Mutex}; use tokio::task::JoinHandle; use tokio::time::MissedTickBehavior; use tokio::{select, time}; +pub use worker::create_source_worker; +use worker::{create_source_worker_async, ConnectorSourceWorkerHandle}; use crate::barrier::{BarrierScheduler, Command}; use crate::manager::MetadataManager; @@ -51,7 +55,7 @@ pub type SourceManagerRef = Arc; pub type SplitAssignment = HashMap>>; pub type ThrottleConfig = HashMap>>; -/// `SourceManager` keeps fetching the latest split metadata from the external source services ([`ConnectorSourceWorker::tick`]), +/// `SourceManager` keeps fetching the latest split metadata from the external source services ([`worker::ConnectorSourceWorker::tick`]), /// and sends a split assignment command if split changes detected ([`Self::tick`]). pub struct SourceManager { pub paused: Mutex<()>, @@ -59,226 +63,6 @@ pub struct SourceManager { core: Mutex, pub metrics: Arc, } - -const MAX_FAIL_CNT: u32 = 10; -const DEFAULT_SOURCE_TICK_TIMEOUT: Duration = Duration::from_secs(10); - -struct SharedSplitMap { - splits: Option>, -} - -type SharedSplitMapRef = Arc>; - -/// `ConnectorSourceWorker` keeps fetching the latest split metadata from the external source service ([`Self::tick`]), -/// and maintains it in `current_splits`. 
-struct ConnectorSourceWorker { - source_id: SourceId, - source_name: String, - current_splits: SharedSplitMapRef, - enumerator: P::SplitEnumerator, - period: Duration, - metrics: Arc, - connector_properties: P, - fail_cnt: u32, - source_is_up: LabelGuardedIntGauge<2>, -} - -fn extract_prop_from_existing_source(source: &Source) -> ConnectorResult { - let options_with_secret = - WithOptionsSecResolved::new(source.with_properties.clone(), source.secret_refs.clone()); - let mut properties = ConnectorProperties::extract(options_with_secret, false)?; - properties.init_from_pb_source(source); - Ok(properties) -} -fn extract_prop_from_new_source(source: &Source) -> ConnectorResult { - let options_with_secret = - WithOptionsSecResolved::new(source.with_properties.clone(), source.secret_refs.clone()); - let mut properties = ConnectorProperties::extract(options_with_secret, true)?; - properties.init_from_pb_source(source); - Ok(properties) -} - -/// Used to create a new `ConnectorSourceWorkerHandle` for a new source. -/// -/// It will call `ConnectorSourceWorker::tick()` to fetch split metadata once before returning. -pub async fn create_source_worker_handle( - source: &Source, - metrics: Arc, -) -> MetaResult { - tracing::info!("spawning new watcher for source {}", source.id); - - let splits = Arc::new(Mutex::new(SharedSplitMap { splits: None })); - let current_splits_ref = splits.clone(); - - let connector_properties = extract_prop_from_new_source(source)?; - let enable_scale_in = connector_properties.enable_drop_split(); - let enable_adaptive_splits = connector_properties.enable_adaptive_splits(); - let (sync_call_tx, sync_call_rx) = tokio::sync::mpsc::unbounded_channel(); - let handle = dispatch_source_prop!(connector_properties, prop, { - let mut worker = ConnectorSourceWorker::create( - source, - *prop, - DEFAULT_SOURCE_WORKER_TICK_INTERVAL, - current_splits_ref.clone(), - metrics, - ) - .await?; - - // if fail to fetch meta info, will refuse to create source - - // todo: make the timeout configurable, longer than `properties.sync.call.timeout` - // in kafka - tokio::time::timeout(DEFAULT_SOURCE_TICK_TIMEOUT, worker.tick()) - .await - .ok() - .with_context(|| { - format!( - "failed to fetch meta info for source {}, timeout {:?}", - source.id, DEFAULT_SOURCE_TICK_TIMEOUT - ) - })??; - - tokio::spawn(async move { worker.run(sync_call_rx).await }) - }); - Ok(ConnectorSourceWorkerHandle { - handle, - sync_call_tx, - splits, - enable_drop_split: enable_scale_in, - enable_adaptive_splits, - }) -} - -const DEFAULT_SOURCE_WORKER_TICK_INTERVAL: Duration = Duration::from_secs(30); - -impl ConnectorSourceWorker
{ - /// Recreate the `SplitEnumerator` to establish a new connection to the external source service. - async fn refresh(&mut self) -> MetaResult<()> { - let enumerator = P::SplitEnumerator::new( - self.connector_properties.clone(), - Arc::new(SourceEnumeratorContext { - metrics: self.metrics.source_enumerator_metrics.clone(), - info: SourceEnumeratorInfo { - source_id: self.source_id as u32, - }, - }), - ) - .await - .context("failed to create SplitEnumerator")?; - self.enumerator = enumerator; - self.fail_cnt = 0; - tracing::info!("refreshed source enumerator: {}", self.source_name); - Ok(()) - } - - /// On creation, connection to the external source service will be established, but `splits` - /// will not be updated until `tick` is called. - pub async fn create( - source: &Source, - connector_properties: P, - period: Duration, - splits: Arc>, - metrics: Arc, - ) -> MetaResult { - let enumerator = P::SplitEnumerator::new( - connector_properties.clone(), - Arc::new(SourceEnumeratorContext { - metrics: metrics.source_enumerator_metrics.clone(), - info: SourceEnumeratorInfo { - source_id: source.id, - }, - }), - ) - .await - .context("failed to create SplitEnumerator")?; - - let source_is_up = metrics - .source_is_up - .with_guarded_label_values(&[source.id.to_string().as_str(), &source.name]); - - Ok(Self { - source_id: source.id as SourceId, - source_name: source.name.clone(), - current_splits: splits, - enumerator, - period, - metrics, - connector_properties, - fail_cnt: 0, - source_is_up, - }) - } - - pub async fn run( - &mut self, - mut sync_call_rx: UnboundedReceiver>>, - ) { - let mut interval = time::interval(self.period); - interval.set_missed_tick_behavior(MissedTickBehavior::Skip); - loop { - select! { - biased; - tx = sync_call_rx.borrow_mut().recv() => { - if let Some(tx) = tx { - let _ = tx.send(self.tick().await); - } - } - _ = interval.tick() => { - if self.fail_cnt > MAX_FAIL_CNT { - if let Err(e) = self.refresh().await { - tracing::error!(error = %e.as_report(), "error happened when refresh from connector source worker"); - } - } - if let Err(e) = self.tick().await { - tracing::error!(error = %e.as_report(), "error happened when tick from connector source worker"); - } - } - } - } - } - - /// Uses [`SplitEnumerator`] to fetch the latest split metadata from the external source service. - async fn tick(&mut self) -> MetaResult<()> { - let source_is_up = |res: i64| { - self.source_is_up.set(res); - }; - let splits = self.enumerator.list_splits().await.inspect_err(|_| { - source_is_up(0); - self.fail_cnt += 1; - })?; - source_is_up(1); - self.fail_cnt = 0; - let mut current_splits = self.current_splits.lock().await; - current_splits.splits.replace( - splits - .into_iter() - .map(|split| (split.id(), P::Split::into(split))) - .collect(), - ); - - Ok(()) - } -} - -/// Handle for a running [`ConnectorSourceWorker`]. 
-pub struct ConnectorSourceWorkerHandle { - handle: JoinHandle<()>, - sync_call_tx: UnboundedSender>>, - splits: SharedSplitMapRef, - enable_drop_split: bool, - enable_adaptive_splits: bool, -} - -impl ConnectorSourceWorkerHandle { - async fn discovered_splits(&self) -> Option> { - self.splits.lock().await.splits.clone() - } - - pub fn get_enable_adaptive_splits(&self) -> bool { - self.enable_adaptive_splits - } -} - pub struct SourceManagerCore { metadata_manager: MetadataManager, @@ -300,36 +84,6 @@ pub struct SourceManagerRunningInfo { pub actor_splits: HashMap>, } -async fn handle_discover_splits( - handle: &ConnectorSourceWorkerHandle, - source_id: SourceId, - actors: &HashSet, -) -> MetaResult, SplitImpl>> { - let Some(mut discovered_splits) = handle.discovered_splits().await else { - tracing::info!( - "The discover loop for source {} is not ready yet; we'll wait for the next run", - source_id - ); - return Ok(BTreeMap::new()); - }; - if discovered_splits.is_empty() { - tracing::warn!("No splits discovered for source {}", source_id); - } - - if handle.enable_adaptive_splits { - // Connector supporting adaptive splits returns just one split, and we need to make the number of splits equal to the number of actors in this fragment. - // Because we Risingwave consume the splits statelessly and we do not need to keep the id internally, we always use actor_id as split_id. - // And prev splits record should be dropped via CN. - - debug_assert!(handle.enable_drop_split); - debug_assert!(discovered_splits.len() == 1); - discovered_splits = - fill_adaptive_split(discovered_splits.values().next().unwrap(), actors)?; - } - - Ok(discovered_splits) -} - impl SourceManagerCore { fn new( metadata_manager: MetadataManager, @@ -347,117 +101,6 @@ impl SourceManagerCore { } } - /// Checks whether the external source metadata has changed, - /// and re-assigns splits if there's a diff. - /// - /// `self.actor_splits` will not be updated. It will be updated by `Self::apply_source_change`, - /// after the mutation barrier has been collected. 
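The worker and handle removed above are moved into `source_manager/worker.rs`, but the shape is unchanged: a spawned task refreshes the split list into a shared map on a timer, and the handle keeps the `JoinHandle` plus a channel of `oneshot` senders so callers can force a refresh and wait for its result. A condensed sketch of that worker/handle pattern, assuming a tokio runtime; `list_splits`, `WorkerHandle` and the `String` error type are placeholders for the connector-specific pieces:

    use std::sync::Arc;
    use tokio::sync::{mpsc, oneshot, Mutex};
    use tokio::time::{interval, Duration};

    // Shared state the worker keeps fresh and the handle reads.
    type SharedSplits = Arc<Mutex<Option<Vec<String>>>>;

    struct WorkerHandle {
        splits: SharedSplits,
        sync_call_tx: mpsc::UnboundedSender<oneshot::Sender<Result<(), String>>>,
        handle: tokio::task::JoinHandle<()>,
    }

    // Stand-in for `SplitEnumerator::list_splits`.
    async fn list_splits() -> Result<Vec<String>, String> {
        Ok(vec!["split-0".to_owned(), "split-1".to_owned()])
    }

    fn spawn_worker() -> WorkerHandle {
        let splits: SharedSplits = Arc::new(Mutex::new(None));
        let shared = splits.clone();
        let (sync_call_tx, mut sync_call_rx) =
            mpsc::unbounded_channel::<oneshot::Sender<Result<(), String>>>();

        let handle = tokio::spawn(async move {
            let mut ticker = interval(Duration::from_secs(30));
            // One "tick": fetch the latest splits and publish them to the shared map.
            let tick = |shared: SharedSplits| async move {
                let discovered = list_splits().await?;
                *shared.lock().await = Some(discovered);
                Ok::<_, String>(())
            };
            loop {
                tokio::select! {
                    // On-demand refresh requested through the handle.
                    Some(tx) = sync_call_rx.recv() => { let _ = tx.send(tick(shared.clone()).await); }
                    // Periodic background refresh.
                    _ = ticker.tick() => { let _ = tick(shared.clone()).await; }
                }
            }
        });

        WorkerHandle { splits, sync_call_tx, handle }
    }

    #[tokio::main]
    async fn main() {
        let worker = spawn_worker();
        // Force one refresh and wait for it, like `allocate_splits` does on a cold cache.
        let (tx, rx) = oneshot::channel();
        worker.sync_call_tx.send(tx).unwrap();
        rx.await.unwrap().unwrap();
        let snapshot = worker.splits.lock().await.clone();
        println!("splits = {snapshot:?}");
        worker.handle.abort();
    }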
- async fn reassign_splits(&self) -> MetaResult> { - let mut split_assignment: SplitAssignment = HashMap::new(); - - 'loop_source: for (source_id, handle) in &self.managed_sources { - let source_fragment_ids = match self.source_fragments.get(source_id) { - Some(fragment_ids) if !fragment_ids.is_empty() => fragment_ids, - _ => { - continue; - } - }; - let backfill_fragment_ids = self.backfill_fragments.get(source_id); - - 'loop_fragment: for &fragment_id in source_fragment_ids { - let actors = match self - .metadata_manager - .get_running_actors_of_fragment(fragment_id) - .await - { - Ok(actors) => { - if actors.is_empty() { - tracing::warn!("No actors found for fragment {}", fragment_id); - continue 'loop_fragment; - } - actors - } - Err(err) => { - tracing::warn!(error = %err.as_report(), "Failed to get the actor of the fragment, maybe the fragment doesn't exist anymore"); - continue 'loop_fragment; - } - }; - - let discovered_splits = handle_discover_splits(handle, *source_id, &actors).await?; - if discovered_splits.is_empty() { - // The discover loop for this source is not ready yet; we'll wait for the next run - continue 'loop_source; - } - - let prev_actor_splits: HashMap<_, _> = actors - .into_iter() - .map(|actor_id| { - ( - actor_id, - self.actor_splits - .get(&actor_id) - .cloned() - .unwrap_or_default(), - ) - }) - .collect(); - - if let Some(new_assignment) = reassign_splits( - fragment_id, - prev_actor_splits, - &discovered_splits, - SplitDiffOptions { - enable_scale_in: handle.enable_drop_split, - enable_adaptive: handle.enable_adaptive_splits, - }, - ) { - split_assignment.insert(fragment_id, new_assignment); - } - } - - if let Some(backfill_fragment_ids) = backfill_fragment_ids { - // align splits for backfill fragments with its upstream source fragment - for (fragment_id, upstream_fragment_id) in backfill_fragment_ids { - let Some(upstream_assignment) = split_assignment.get(upstream_fragment_id) - else { - // upstream fragment unchanged, do not update backfill fragment too - continue; - }; - let actors = match self - .metadata_manager - .get_running_actors_for_source_backfill(*fragment_id) - .await - { - Ok(actors) => { - if actors.is_empty() { - tracing::warn!("No actors found for fragment {}", fragment_id); - continue; - } - actors - } - Err(err) => { - tracing::warn!(error = %err.as_report(),"Failed to get the actor of the fragment, maybe the fragment doesn't exist anymore"); - continue; - } - }; - split_assignment.insert( - *fragment_id, - align_splits( - actors, - upstream_assignment, - *fragment_id, - *upstream_fragment_id, - )?, - ); - } - } - } - - self.metadata_manager - .split_fragment_map_by_database(split_assignment) - .await - } - /// Updates states after all kinds of source change. pub fn apply_source_change(&mut self, source_change: SourceChange) { let mut added_source_fragments = Default::default(); @@ -593,203 +236,6 @@ impl SourceManagerCore { } } -/// Note: the `PartialEq` and `Ord` impl just compares the number of splits. -#[derive(Debug)] -struct ActorSplitsAssignment { - actor_id: ActorId, - splits: Vec, -} - -impl Eq for ActorSplitsAssignment {} - -impl PartialEq for ActorSplitsAssignment { - fn eq(&self, other: &Self) -> bool { - self.splits.len() == other.splits.len() - } -} - -impl PartialOrd for ActorSplitsAssignment { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl Ord for ActorSplitsAssignment { - fn cmp(&self, other: &Self) -> Ordering { - // Note: this is reversed order, to make BinaryHeap a min heap. 
- other.splits.len().cmp(&self.splits.len()) - } -} - -#[derive(Debug)] -struct SplitDiffOptions { - enable_scale_in: bool, - - /// For most connectors, this should be false. When enabled, RisingWave will not track any progress. - enable_adaptive: bool, -} - -#[allow(clippy::derivable_impls)] -impl Default for SplitDiffOptions { - fn default() -> Self { - SplitDiffOptions { - enable_scale_in: false, - enable_adaptive: false, - } - } -} - -/// Reassigns splits if there are new splits or dropped splits, -/// i.e., `actor_splits` and `discovered_splits` differ, or actors are rescheduled. -/// -/// The existing splits will remain unmoved in their currently assigned actor. -/// -/// If an actor has an upstream actor, it should be a backfill executor, -/// and its splits should be aligned with the upstream actor. **`reassign_splits` should not be used in this case. -/// Use [`align_splits`] instead.** -/// -/// - `fragment_id`: just for logging -/// -/// ## Different connectors' behavior of split change -/// -/// ### Kafka and Pulsar -/// They only support increasing the number of splits via adding new empty splits. -/// Old data is not moved. -/// -/// ### Kinesis -/// It supports *pairwise* shard split and merge. -/// -/// In both cases, old data remain in the old shard(s) and the old shard is still available. -/// New data are routed to the new shard(s). -/// After the retention period has expired, the old shard will become `EXPIRED` and isn't -/// listed any more. In other words, the total number of shards will first increase and then decrease. -/// -/// See also: -/// - [Kinesis resharding doc](https://docs.aws.amazon.com/streams/latest/dev/kinesis-using-sdk-java-after-resharding.html#kinesis-using-sdk-java-resharding-data-routing) -/// - An example of how the shards can be like: -fn reassign_splits( - fragment_id: FragmentId, - actor_splits: HashMap>, - discovered_splits: &BTreeMap, - opts: SplitDiffOptions, -) -> Option>> -where - T: SplitMetaData + Clone, -{ - // if no actors, return - if actor_splits.is_empty() { - return None; - } - - let prev_split_ids: HashSet<_> = actor_splits - .values() - .flat_map(|splits| splits.iter().map(SplitMetaData::id)) - .collect(); - - tracing::trace!(fragment_id, prev_split_ids = ?prev_split_ids, "previous splits"); - tracing::trace!(fragment_id, prev_split_ids = ?discovered_splits.keys(), "discovered splits"); - - let discovered_split_ids: HashSet<_> = discovered_splits.keys().cloned().collect(); - - let dropped_splits: HashSet<_> = prev_split_ids - .difference(&discovered_split_ids) - .cloned() - .collect(); - - if !dropped_splits.is_empty() { - if opts.enable_scale_in { - tracing::info!(fragment_id, dropped_spltis = ?dropped_splits, "new dropped splits"); - } else { - tracing::warn!(fragment_id, dropped_spltis = ?dropped_splits, "split dropping happened, but it is not allowed"); - } - } - - let new_discovered_splits: BTreeSet<_> = discovered_split_ids - .into_iter() - .filter(|split_id| !prev_split_ids.contains(split_id)) - .collect(); - - if opts.enable_scale_in || opts.enable_adaptive { - // if we support scale in, no more splits are discovered, and no splits are dropped, return - // we need to check if discovered_split_ids is empty, because if it is empty, we need to - // handle the case of scale in to zero (like deleting all objects from s3) - if dropped_splits.is_empty() - && new_discovered_splits.is_empty() - && !discovered_splits.is_empty() - { - return None; - } - } else { - // if we do not support scale in, and no more splits are discovered, 
return - if new_discovered_splits.is_empty() && !discovered_splits.is_empty() { - return None; - } - } - - tracing::info!(fragment_id, new_discovered_splits = ?new_discovered_splits, "new discovered splits"); - - let mut heap = BinaryHeap::with_capacity(actor_splits.len()); - - for (actor_id, mut splits) in actor_splits { - if opts.enable_scale_in || opts.enable_adaptive { - splits.retain(|split| !dropped_splits.contains(&split.id())); - } - - heap.push(ActorSplitsAssignment { actor_id, splits }) - } - - for split_id in new_discovered_splits { - // ActorSplitsAssignment's Ord is reversed, so this is min heap, i.e., - // we get the assignment with the least splits here. - - // Note: If multiple actors have the same number of splits, it will be randomly picked. - // When the number of source actors is larger than the number of splits, - // It's possible that the assignment is uneven. - // e.g., https://github.com/risingwavelabs/risingwave/issues/14324#issuecomment-1875033158 - // TODO: We should make the assignment rack-aware to make sure it's even. - let mut peek_ref = heap.peek_mut().unwrap(); - peek_ref - .splits - .push(discovered_splits.get(&split_id).cloned().unwrap()); - } - - Some( - heap.into_iter() - .map(|ActorSplitsAssignment { actor_id, splits }| (actor_id, splits)) - .collect(), - ) -} - -/// Assign splits to a new set of actors, according to existing assignment. -/// -/// illustration: -/// ```text -/// upstream new -/// actor x1 [split 1, split2] -> actor y1 [split 1, split2] -/// actor x2 [split 3] -> actor y2 [split 3] -/// ... -/// ``` -fn align_splits( - // (actor_id, upstream_actor_id) - aligned_actors: impl IntoIterator, - existing_assignment: &HashMap>, - fragment_id: FragmentId, - upstream_source_fragment_id: FragmentId, -) -> anyhow::Result>> { - aligned_actors - .into_iter() - .map(|(actor_id, upstream_actor_id)| { - let Some(splits) = existing_assignment.get(&upstream_actor_id) else { - return Err(anyhow::anyhow!("upstream assignment not found, fragment_id: {fragment_id}, upstream_fragment_id: {upstream_source_fragment_id}, actor_id: {actor_id}, upstream_assignment: {existing_assignment:?}, upstream_actor_id: {upstream_actor_id:?}")); - }; - Ok(( - actor_id, - splits.clone(), - )) - }) - .collect() -} - impl SourceManager { const DEFAULT_SOURCE_TICK_INTERVAL: Duration = Duration::from_secs(10); @@ -802,7 +248,7 @@ impl SourceManager { { let sources = metadata_manager.list_sources().await?; for source in sources { - Self::create_source_worker_async(source, &mut managed_sources, metrics.clone())? + create_source_worker_async(source, &mut managed_sources, metrics.clone())? } } @@ -904,283 +350,12 @@ impl SourceManager { core.apply_source_change(source_change); } - /// Migrates splits from previous actors to the new actors for a rescheduled fragment. - /// - /// Very occasionally split removal may happen during scaling, in which case we need to - /// use the old splits for reallocation instead of the latest splits (which may be missing), - /// so that we can resolve the split removal in the next command. 
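In the removed `reassign_splits` above (moved into `split_assignment.rs`), already-assigned splits stay on their current actor, and each newly discovered split is pushed onto whichever actor currently holds the fewest, using a `BinaryHeap` whose reversed `Ord` turns it into a min-heap. A small self-contained sketch of that balancing step, using `std::cmp::Reverse` instead of a hand-written `Ord`; actor ids and split names are arbitrary placeholders:

    use std::cmp::Reverse;
    use std::collections::{BinaryHeap, HashMap};

    /// Give each new split to the actor that currently owns the fewest splits.
    fn assign_new_splits(
        mut actor_splits: HashMap<u32, Vec<String>>,
        new_splits: Vec<String>,
    ) -> HashMap<u32, Vec<String>> {
        // `Reverse` on (len, actor_id) makes the max-heap behave as a min-heap,
        // mirroring the reversed `Ord` on `ActorSplitsAssignment`.
        let mut heap: BinaryHeap<Reverse<(usize, u32)>> = actor_splits
            .iter()
            .map(|(actor, splits)| Reverse((splits.len(), *actor)))
            .collect();

        for split in new_splits {
            let Reverse((len, actor)) = heap.pop().expect("at least one actor");
            actor_splits.get_mut(&actor).unwrap().push(split);
            heap.push(Reverse((len + 1, actor)));
        }
        actor_splits
    }

    fn main() {
        let actors = HashMap::from([(1, vec!["s0".to_owned()]), (2, vec![]), (3, vec![])]);
        let assigned = assign_new_splits(actors, vec!["s1".into(), "s2".into(), "s3".into()]);
        for (actor, splits) in &assigned {
            println!("actor {actor}: {splits:?}");
        }
    }

In the real code ties between equally loaded actors are broken arbitrarily (the sketch breaks them by actor id), which is why the comment above warns the assignment can be uneven when there are more actors than splits.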
- pub async fn migrate_splits_for_source_actors( - &self, - fragment_id: FragmentId, - prev_actor_ids: &[ActorId], - curr_actor_ids: &[ActorId], - ) -> MetaResult>> { - let core = self.core.lock().await; - - let prev_splits = prev_actor_ids - .iter() - .flat_map(|actor_id| core.actor_splits.get(actor_id).unwrap()) - .map(|split| (split.id(), split.clone())) - .collect(); - - let empty_actor_splits = curr_actor_ids - .iter() - .map(|actor_id| (*actor_id, vec![])) - .collect(); - - let diff = reassign_splits( - fragment_id, - empty_actor_splits, - &prev_splits, - // pre-allocate splits is the first time getting splits and it does not have scale-in scene - SplitDiffOptions::default(), - ) - .unwrap_or_default(); - - Ok(diff) - } - - /// Migrates splits from previous actors to the new actors for a rescheduled fragment. - pub fn migrate_splits_for_backfill_actors( - &self, - fragment_id: FragmentId, - upstream_source_fragment_id: FragmentId, - curr_actor_ids: &[ActorId], - fragment_actor_splits: &HashMap>>, - no_shuffle_upstream_actor_map: &HashMap>, - ) -> MetaResult>> { - // align splits for backfill fragments with its upstream source fragment - let actors = no_shuffle_upstream_actor_map - .iter() - .filter(|(id, _)| curr_actor_ids.contains(id)) - .map(|(id, upstream_fragment_actors)| { - ( - *id, - *upstream_fragment_actors - .get(&upstream_source_fragment_id) - .unwrap(), - ) - }); - let upstream_assignment = fragment_actor_splits - .get(&upstream_source_fragment_id) - .unwrap(); - tracing::info!( - fragment_id, - upstream_source_fragment_id, - ?upstream_assignment, - "migrate_splits_for_backfill_actors" - ); - Ok(align_splits( - actors, - upstream_assignment, - fragment_id, - upstream_source_fragment_id, - )?) - } - - /// Allocates splits to actors for a newly created source executor. - pub async fn allocate_splits(&self, job_id: &TableId) -> MetaResult { - let core = self.core.lock().await; - let table_fragments = core - .metadata_manager - .get_job_fragments_by_id(job_id) - .await?; - - let source_fragments = table_fragments.stream_source_fragments(); - - let mut assigned = HashMap::new(); - - 'loop_source: for (source_id, fragments) in source_fragments { - let handle = core - .managed_sources - .get(&source_id) - .with_context(|| format!("could not find source {}", source_id))?; - - if handle.splits.lock().await.splits.is_none() { - // force refresh source - let (tx, rx) = oneshot::channel(); - handle - .sync_call_tx - .send(tx) - .ok() - .context("failed to send sync call")?; - rx.await - .ok() - .context("failed to receive sync call response")??; - } - - for fragment_id in fragments { - let empty_actor_splits: HashMap> = table_fragments - .fragments - .get(&fragment_id) - .unwrap() - .actors - .iter() - .map(|actor| (actor.actor_id, vec![])) - .collect(); - let actor_hashset: HashSet = empty_actor_splits.keys().cloned().collect(); - let splits = handle_discover_splits(handle, source_id, &actor_hashset).await?; - if splits.is_empty() { - tracing::warn!("no splits detected for source {}", source_id); - continue 'loop_source; - } - - if let Some(diff) = reassign_splits( - fragment_id, - empty_actor_splits, - &splits, - SplitDiffOptions::default(), - ) { - assigned.insert(fragment_id, diff); - } - } - } - - Ok(assigned) - } - - /// Allocates splits to actors for replace source job. 
- pub async fn allocate_splits_for_replace_source( - &self, - job_id: &TableId, - merge_updates: &Vec, - ) -> MetaResult { - tracing::debug!(?merge_updates, "allocate_splits_for_replace_source"); - if merge_updates.is_empty() { - // no existing downstream. We can just re-allocate splits arbitrarily. - return self.allocate_splits(job_id).await; - } - - let core = self.core.lock().await; - let table_fragments = core - .metadata_manager - .get_job_fragments_by_id(job_id) - .await?; - - let source_fragments = table_fragments.stream_source_fragments(); - assert_eq!( - source_fragments.len(), - 1, - "replace source job should only have one source" - ); - let (_source_id, fragments) = source_fragments.into_iter().next().unwrap(); - assert_eq!( - fragments.len(), - 1, - "replace source job should only have one fragment" - ); - let fragment_id = fragments.into_iter().next().unwrap(); - - debug_assert!( - !merge_updates.is_empty() - && merge_updates.iter().all(|merge_update| { - merge_update.upstream_fragment_id == merge_updates[0].upstream_fragment_id - && merge_update.new_upstream_fragment_id == Some(fragment_id) - }), - "merge update should only replace one fragment: {:?}", - merge_updates - ); - let prev_fragment_id = merge_updates[0].upstream_fragment_id; - // Here we align the new source executor to backfill executors - // - // old_source => new_source backfill_1 - // actor_x1 => actor_y1 -----┬------>actor_a1 - // actor_x2 => actor_y2 -----┼-┬---->actor_a2 - // │ │ - // │ │ backfill_2 - // └─┼---->actor_b1 - // └---->actor_b2 - // - // Note: we can choose any backfill actor to align here. - // We use `HashMap` to dedup. - let aligned_actors: HashMap = merge_updates - .iter() - .map(|merge_update| { - assert_eq!(merge_update.added_upstream_actor_id.len(), 1); - // Note: removed_upstream_actor_id is not set for replace job, so we can't use it. - assert_eq!(merge_update.removed_upstream_actor_id.len(), 0); - ( - merge_update.added_upstream_actor_id[0], - merge_update.actor_id, - ) - }) - .collect(); - let assignment = align_splits( - aligned_actors.into_iter(), - &core.actor_splits, - fragment_id, - prev_fragment_id, - )?; - Ok(HashMap::from([(fragment_id, assignment)])) - } - - /// Allocates splits to actors for a newly created `SourceBackfill` executor. - /// - /// Unlike [`Self::allocate_splits`], which creates a new assignment, - /// this method aligns the splits for backfill fragments with its upstream source fragment ([`align_splits`]). 
- pub async fn allocate_splits_for_backfill( - &self, - table_id: &TableId, - // dispatchers from SourceExecutor to SourceBackfillExecutor - dispatchers: &HashMap>, - ) -> MetaResult { - let core = self.core.lock().await; - let table_fragments = core - .metadata_manager - .get_job_fragments_by_id(table_id) - .await?; - - let source_backfill_fragments = table_fragments.source_backfill_fragments()?; - - let mut assigned = HashMap::new(); - - for (_source_id, fragments) in source_backfill_fragments { - for (fragment_id, upstream_source_fragment_id) in fragments { - let upstream_actors = core - .metadata_manager - .get_running_actors_of_fragment(upstream_source_fragment_id) - .await?; - let mut backfill_actors = vec![]; - for upstream_actor in upstream_actors { - if let Some(dispatchers) = dispatchers.get(&upstream_actor) { - let err = || { - anyhow::anyhow!( - "source backfill fragment's upstream fragment should have one dispatcher, fragment_id: {fragment_id}, upstream_fragment_id: {upstream_source_fragment_id}, upstream_actor: {upstream_actor}, dispatchers: {dispatchers:?}", - fragment_id = fragment_id, - upstream_source_fragment_id = upstream_source_fragment_id, - upstream_actor = upstream_actor, - dispatchers = dispatchers - ) - }; - if dispatchers.len() != 1 || dispatchers[0].downstream_actor_id.len() != 1 { - return Err(err().into()); - } - - backfill_actors - .push((dispatchers[0].downstream_actor_id[0], upstream_actor)); - } - } - assigned.insert( - fragment_id, - align_splits( - backfill_actors, - &core.actor_splits, - fragment_id, - upstream_source_fragment_id, - )?, - ); - } - } - - Ok(assigned) - } - /// create and register connector worker for source. pub async fn register_source(&self, source: &Source) -> MetaResult<()> { tracing::debug!("register_source: {}", source.get_id()); let mut core = self.core.lock().await; if let Entry::Vacant(e) = core.managed_sources.entry(source.get_id() as _) { - let handle = create_source_worker_handle(source, self.metrics.clone()) + let handle = create_source_worker(source, self.metrics.clone()) .await .context("failed to create source worker")?; e.insert(handle); @@ -1204,66 +379,6 @@ impl SourceManager { } } - /// Used on startup ([`Self::new`]). Failed sources will not block meta startup. 
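`create_source_worker_async` (removed below, now in `worker.rs`) is the startup path: rather than blocking meta boot on an unreachable external system, it spawns a task that retries worker creation on a fixed interval and only then enters the run loop. A reduced sketch of that retry-in-the-background shape, assuming a tokio runtime; `try_create_worker` and the interval value are placeholders:

    use tokio::time::{interval, Duration, MissedTickBehavior};

    // Stand-in for `ConnectorSourceWorker::create`, which can fail while the
    // external source service is unreachable.
    async fn try_create_worker(source_id: u32) -> Result<String, String> {
        Ok(format!("worker-for-source-{source_id}"))
    }

    /// Startup path: return immediately and keep retrying in the background
    /// until the worker can be created, then run it.
    fn create_source_worker_async(source_id: u32) -> tokio::task::JoinHandle<()> {
        tokio::spawn(async move {
            let mut ticker = interval(Duration::from_secs(10));
            ticker.set_missed_tick_behavior(MissedTickBehavior::Skip);
            let worker = loop {
                ticker.tick().await;
                match try_create_worker(source_id).await {
                    Ok(worker) => break worker,
                    Err(e) => eprintln!("failed to create source worker, retrying: {e}"),
                }
            };
            println!("{worker} is running");
            // ...the real worker would now enter its tick loop.
        })
    }

    #[tokio::main]
    async fn main() {
        let handle = create_source_worker_async(42);
        handle.await.unwrap();
    }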
- fn create_source_worker_async( - source: Source, - managed_sources: &mut HashMap, - metrics: Arc, - ) -> MetaResult<()> { - tracing::info!("spawning new watcher for source {}", source.id); - - let splits = Arc::new(Mutex::new(SharedSplitMap { splits: None })); - let current_splits_ref = splits.clone(); - let source_id = source.id; - - let connector_properties = extract_prop_from_existing_source(&source)?; - - let enable_drop_split = connector_properties.enable_drop_split(); - let enable_adaptive_splits = connector_properties.enable_adaptive_splits(); - let (sync_call_tx, sync_call_rx) = tokio::sync::mpsc::unbounded_channel(); - let handle = tokio::spawn(async move { - let mut ticker = time::interval(Self::DEFAULT_SOURCE_TICK_INTERVAL); - ticker.set_missed_tick_behavior(MissedTickBehavior::Skip); - - dispatch_source_prop!(connector_properties, prop, { - let mut worker = loop { - ticker.tick().await; - - match ConnectorSourceWorker::create( - &source, - prop.deref().clone(), - DEFAULT_SOURCE_WORKER_TICK_INTERVAL, - current_splits_ref.clone(), - metrics.clone(), - ) - .await - { - Ok(worker) => { - break worker; - } - Err(e) => { - tracing::warn!(error = %e.as_report(), "failed to create source worker"); - } - } - }; - - worker.run(sync_call_rx).await - }); - }); - - managed_sources.insert( - source_id as SourceId, - ConnectorSourceWorkerHandle { - handle, - sync_call_tx, - splits, - enable_drop_split, - enable_adaptive_splits, - }, - ); - Ok(()) - } - pub async fn list_assignments(&self) -> HashMap> { let core = self.core.lock().await; core.actor_splits.clone() @@ -1384,235 +499,3 @@ pub fn build_actor_split_impls( }) .collect() } - -#[cfg(test)] -mod tests { - use std::collections::{BTreeMap, HashMap, HashSet}; - - use risingwave_common::types::JsonbVal; - use risingwave_connector::error::ConnectorResult; - use risingwave_connector::source::{SplitId, SplitMetaData}; - use serde::{Deserialize, Serialize}; - - use crate::model::{ActorId, FragmentId}; - use crate::stream::source_manager::{reassign_splits, SplitDiffOptions}; - - #[derive(Debug, Copy, Clone, Serialize, Deserialize)] - struct TestSplit { - id: u32, - } - - impl SplitMetaData for TestSplit { - fn id(&self) -> SplitId { - format!("{}", self.id).into() - } - - fn encode_to_json(&self) -> JsonbVal { - serde_json::to_value(*self).unwrap().into() - } - - fn restore_from_json(value: JsonbVal) -> ConnectorResult { - serde_json::from_value(value.take()).map_err(Into::into) - } - - fn update_offset(&mut self, _last_read_offset: String) -> ConnectorResult<()> { - Ok(()) - } - } - - fn check_all_splits( - discovered_splits: &BTreeMap, - diff: &HashMap>, - ) { - let mut split_ids: HashSet<_> = discovered_splits.keys().cloned().collect(); - - for splits in diff.values() { - for split in splits { - assert!(split_ids.remove(&split.id())) - } - } - - assert!(split_ids.is_empty()); - } - - #[test] - fn test_drop_splits() { - let mut actor_splits: HashMap = HashMap::new(); - actor_splits.insert(0, vec![TestSplit { id: 0 }, TestSplit { id: 1 }]); - actor_splits.insert(1, vec![TestSplit { id: 2 }, TestSplit { id: 3 }]); - actor_splits.insert(2, vec![TestSplit { id: 4 }, TestSplit { id: 5 }]); - - let mut prev_split_to_actor = HashMap::new(); - for (actor_id, splits) in &actor_splits { - for split in splits { - prev_split_to_actor.insert(split.id(), *actor_id); - } - } - - let discovered_splits: BTreeMap = (1..5) - .map(|i| { - let split = TestSplit { id: i }; - (split.id(), split) - }) - .collect(); - - let opts = SplitDiffOptions { - 
enable_scale_in: true, - enable_adaptive: false, - }; - - let prev_split_ids: HashSet<_> = actor_splits - .values() - .flat_map(|splits| splits.iter().map(|split| split.id())) - .collect(); - - let diff = reassign_splits( - FragmentId::default(), - actor_splits, - &discovered_splits, - opts, - ) - .unwrap(); - check_all_splits(&discovered_splits, &diff); - - let mut after_split_to_actor = HashMap::new(); - for (actor_id, splits) in &diff { - for split in splits { - after_split_to_actor.insert(split.id(), *actor_id); - } - } - - let discovered_split_ids: HashSet<_> = discovered_splits.keys().cloned().collect(); - - let retained_split_ids: HashSet<_> = - prev_split_ids.intersection(&discovered_split_ids).collect(); - - for retained_split_id in retained_split_ids { - assert_eq!( - prev_split_to_actor.get(retained_split_id), - after_split_to_actor.get(retained_split_id) - ) - } - } - - #[test] - fn test_drop_splits_to_empty() { - let mut actor_splits: HashMap = HashMap::new(); - actor_splits.insert(0, vec![TestSplit { id: 0 }]); - - let discovered_splits: BTreeMap = BTreeMap::new(); - - let opts = SplitDiffOptions { - enable_scale_in: true, - enable_adaptive: false, - }; - - let diff = reassign_splits( - FragmentId::default(), - actor_splits, - &discovered_splits, - opts, - ) - .unwrap(); - - assert!(!diff.is_empty()) - } - - #[test] - fn test_reassign_splits() { - let actor_splits = HashMap::new(); - let discovered_splits: BTreeMap = BTreeMap::new(); - assert!(reassign_splits( - FragmentId::default(), - actor_splits, - &discovered_splits, - Default::default() - ) - .is_none()); - - let actor_splits = (0..3).map(|i| (i, vec![])).collect(); - let discovered_splits: BTreeMap = BTreeMap::new(); - let diff = reassign_splits( - FragmentId::default(), - actor_splits, - &discovered_splits, - Default::default(), - ) - .unwrap(); - assert_eq!(diff.len(), 3); - for splits in diff.values() { - assert!(splits.is_empty()) - } - - let actor_splits = (0..3).map(|i| (i, vec![])).collect(); - let discovered_splits: BTreeMap = (0..3) - .map(|i| { - let split = TestSplit { id: i }; - (split.id(), split) - }) - .collect(); - - let diff = reassign_splits( - FragmentId::default(), - actor_splits, - &discovered_splits, - Default::default(), - ) - .unwrap(); - assert_eq!(diff.len(), 3); - for splits in diff.values() { - assert_eq!(splits.len(), 1); - } - - check_all_splits(&discovered_splits, &diff); - - let actor_splits = (0..3).map(|i| (i, vec![TestSplit { id: i }])).collect(); - let discovered_splits: BTreeMap = (0..5) - .map(|i| { - let split = TestSplit { id: i }; - (split.id(), split) - }) - .collect(); - - let diff = reassign_splits( - FragmentId::default(), - actor_splits, - &discovered_splits, - Default::default(), - ) - .unwrap(); - assert_eq!(diff.len(), 3); - for splits in diff.values() { - let len = splits.len(); - assert!(len == 1 || len == 2); - } - - check_all_splits(&discovered_splits, &diff); - - let mut actor_splits: HashMap> = - (0..3).map(|i| (i, vec![TestSplit { id: i }])).collect(); - actor_splits.insert(3, vec![]); - actor_splits.insert(4, vec![]); - - let discovered_splits: BTreeMap = (0..5) - .map(|i| { - let split = TestSplit { id: i }; - (split.id(), split) - }) - .collect(); - - let diff = reassign_splits( - FragmentId::default(), - actor_splits, - &discovered_splits, - Default::default(), - ) - .unwrap(); - assert_eq!(diff.len(), 5); - for splits in diff.values() { - assert_eq!(splits.len(), 1); - } - - check_all_splits(&discovered_splits, &diff); - } -} diff --git 
a/src/meta/src/stream/source_manager/split_assignment.rs b/src/meta/src/stream/source_manager/split_assignment.rs new file mode 100644 index 0000000000000..e36c1198bfc96 --- /dev/null +++ b/src/meta/src/stream/source_manager/split_assignment.rs @@ -0,0 +1,830 @@ +// Copyright 2025 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::*; + +impl SourceManager { + /// Migrates splits from previous actors to the new actors for a rescheduled fragment. + /// + /// Very occasionally split removal may happen during scaling, in which case we need to + /// use the old splits for reallocation instead of the latest splits (which may be missing), + /// so that we can resolve the split removal in the next command. + pub async fn migrate_splits_for_source_actors( + &self, + fragment_id: FragmentId, + prev_actor_ids: &[ActorId], + curr_actor_ids: &[ActorId], + ) -> MetaResult>> { + let core = self.core.lock().await; + + let prev_splits = prev_actor_ids + .iter() + .flat_map(|actor_id| core.actor_splits.get(actor_id).unwrap()) + .map(|split| (split.id(), split.clone())) + .collect(); + + let empty_actor_splits = curr_actor_ids + .iter() + .map(|actor_id| (*actor_id, vec![])) + .collect(); + + let diff = reassign_splits( + fragment_id, + empty_actor_splits, + &prev_splits, + // pre-allocate splits is the first time getting splits and it does not have scale-in scene + SplitDiffOptions::default(), + ) + .unwrap_or_default(); + + Ok(diff) + } + + /// Migrates splits from previous actors to the new actors for a rescheduled fragment. + pub fn migrate_splits_for_backfill_actors( + &self, + fragment_id: FragmentId, + upstream_source_fragment_id: FragmentId, + curr_actor_ids: &[ActorId], + fragment_actor_splits: &HashMap>>, + no_shuffle_upstream_actor_map: &HashMap>, + ) -> MetaResult>> { + // align splits for backfill fragments with its upstream source fragment + let actors = no_shuffle_upstream_actor_map + .iter() + .filter(|(id, _)| curr_actor_ids.contains(id)) + .map(|(id, upstream_fragment_actors)| { + ( + *id, + *upstream_fragment_actors + .get(&upstream_source_fragment_id) + .unwrap(), + ) + }); + let upstream_assignment = fragment_actor_splits + .get(&upstream_source_fragment_id) + .unwrap(); + tracing::info!( + fragment_id, + upstream_source_fragment_id, + ?upstream_assignment, + "migrate_splits_for_backfill_actors" + ); + Ok(align_splits( + actors, + upstream_assignment, + fragment_id, + upstream_source_fragment_id, + )?) + } + + /// Allocates splits to actors for a newly created source executor. 
+ pub async fn allocate_splits(&self, job_id: &TableId) -> MetaResult { + let core = self.core.lock().await; + let table_fragments = core + .metadata_manager + .get_job_fragments_by_id(job_id) + .await?; + + let source_fragments = table_fragments.stream_source_fragments(); + + let mut assigned = HashMap::new(); + + 'loop_source: for (source_id, fragments) in source_fragments { + let handle = core + .managed_sources + .get(&source_id) + .with_context(|| format!("could not find source {}", source_id))?; + + if handle.splits.lock().await.splits.is_none() { + // force refresh source + let (tx, rx) = oneshot::channel(); + handle + .sync_call_tx + .send(tx) + .ok() + .context("failed to send sync call")?; + rx.await + .ok() + .context("failed to receive sync call response")??; + } + + for fragment_id in fragments { + let empty_actor_splits: HashMap> = table_fragments + .fragments + .get(&fragment_id) + .unwrap() + .actors + .iter() + .map(|actor| (actor.actor_id, vec![])) + .collect(); + let actor_hashset: HashSet = empty_actor_splits.keys().cloned().collect(); + let splits = handle.discovered_splits(source_id, &actor_hashset).await?; + if splits.is_empty() { + tracing::warn!("no splits detected for source {}", source_id); + continue 'loop_source; + } + + if let Some(diff) = reassign_splits( + fragment_id, + empty_actor_splits, + &splits, + SplitDiffOptions::default(), + ) { + assigned.insert(fragment_id, diff); + } + } + } + + Ok(assigned) + } + + /// Allocates splits to actors for replace source job. + pub async fn allocate_splits_for_replace_source( + &self, + job_id: &TableId, + merge_updates: &Vec, + ) -> MetaResult { + tracing::debug!(?merge_updates, "allocate_splits_for_replace_source"); + if merge_updates.is_empty() { + // no existing downstream. We can just re-allocate splits arbitrarily. + return self.allocate_splits(job_id).await; + } + + let core = self.core.lock().await; + let table_fragments = core + .metadata_manager + .get_job_fragments_by_id(job_id) + .await?; + + let source_fragments = table_fragments.stream_source_fragments(); + assert_eq!( + source_fragments.len(), + 1, + "replace source job should only have one source" + ); + let (_source_id, fragments) = source_fragments.into_iter().next().unwrap(); + assert_eq!( + fragments.len(), + 1, + "replace source job should only have one fragment" + ); + let fragment_id = fragments.into_iter().next().unwrap(); + + debug_assert!( + !merge_updates.is_empty() + && merge_updates.iter().all(|merge_update| { + merge_update.upstream_fragment_id == merge_updates[0].upstream_fragment_id + && merge_update.new_upstream_fragment_id == Some(fragment_id) + }), + "merge update should only replace one fragment: {:?}", + merge_updates + ); + let prev_fragment_id = merge_updates[0].upstream_fragment_id; + // Here we align the new source executor to backfill executors + // + // old_source => new_source backfill_1 + // actor_x1 => actor_y1 -----┬------>actor_a1 + // actor_x2 => actor_y2 -----┼-┬---->actor_a2 + // │ │ + // │ │ backfill_2 + // └─┼---->actor_b1 + // └---->actor_b2 + // + // Note: we can choose any backfill actor to align here. + // We use `HashMap` to dedup. + let aligned_actors: HashMap = merge_updates + .iter() + .map(|merge_update| { + assert_eq!(merge_update.added_upstream_actor_id.len(), 1); + // Note: removed_upstream_actor_id is not set for replace job, so we can't use it. 
+ assert_eq!(merge_update.removed_upstream_actor_id.len(), 0); + ( + merge_update.added_upstream_actor_id[0], + merge_update.actor_id, + ) + }) + .collect(); + let assignment = align_splits( + aligned_actors.into_iter(), + &core.actor_splits, + fragment_id, + prev_fragment_id, + )?; + Ok(HashMap::from([(fragment_id, assignment)])) + } + + /// Allocates splits to actors for a newly created `SourceBackfill` executor. + /// + /// Unlike [`Self::allocate_splits`], which creates a new assignment, + /// this method aligns the splits for backfill fragments with its upstream source fragment ([`align_splits`]). + pub async fn allocate_splits_for_backfill( + &self, + table_id: &TableId, + // dispatchers from SourceExecutor to SourceBackfillExecutor + dispatchers: &HashMap>, + ) -> MetaResult { + let core = self.core.lock().await; + let table_fragments = core + .metadata_manager + .get_job_fragments_by_id(table_id) + .await?; + + let source_backfill_fragments = table_fragments.source_backfill_fragments()?; + + let mut assigned = HashMap::new(); + + for (_source_id, fragments) in source_backfill_fragments { + for (fragment_id, upstream_source_fragment_id) in fragments { + let upstream_actors = core + .metadata_manager + .get_running_actors_of_fragment(upstream_source_fragment_id) + .await?; + let mut backfill_actors = vec![]; + for upstream_actor in upstream_actors { + if let Some(dispatchers) = dispatchers.get(&upstream_actor) { + let err = || { + anyhow::anyhow!( + "source backfill fragment's upstream fragment should have one dispatcher, fragment_id: {fragment_id}, upstream_fragment_id: {upstream_source_fragment_id}, upstream_actor: {upstream_actor}, dispatchers: {dispatchers:?}", + fragment_id = fragment_id, + upstream_source_fragment_id = upstream_source_fragment_id, + upstream_actor = upstream_actor, + dispatchers = dispatchers + ) + }; + if dispatchers.len() != 1 || dispatchers[0].downstream_actor_id.len() != 1 { + return Err(err().into()); + } + + backfill_actors + .push((dispatchers[0].downstream_actor_id[0], upstream_actor)); + } + } + assigned.insert( + fragment_id, + align_splits( + backfill_actors, + &core.actor_splits, + fragment_id, + upstream_source_fragment_id, + )?, + ); + } + } + + Ok(assigned) + } +} + +impl SourceManagerCore { + /// Checks whether the external source metadata has changed, + /// and re-assigns splits if there's a diff. + /// + /// `self.actor_splits` will not be updated. It will be updated by `Self::apply_source_change`, + /// after the mutation barrier has been collected. 
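At its core the periodic check is a set difference between the split ids currently assigned and the ids just discovered: unseen ids are new work, and disappeared ids are only honoured when the connector allows dropping splits. The helper below is a hedged sketch of that decision only; `split_diff`, the string ids and the boolean flag stand in for `SplitDiffOptions` and the real split types.

use std::collections::HashSet;

// Illustrative decision helper: `None` means "keep the current assignment",
// `Some((added, dropped))` means a re-assignment is needed.
fn split_diff(
    prev: &HashSet<String>,
    discovered: &HashSet<String>,
    enable_drop_split: bool,
) -> Option<(Vec<String>, Vec<String>)> {
    let added: Vec<_> = discovered.difference(prev).cloned().collect();
    let dropped: Vec<_> = prev.difference(discovered).cloned().collect();
    // Disappeared splits are only acted on when the connector allows dropping
    // them (scale-in); otherwise the real code just logs a warning.
    let dropped = if enable_drop_split { dropped } else { Vec::new() };
    if added.is_empty() && dropped.is_empty() && !discovered.is_empty() {
        None
    } else {
        Some((added, dropped))
    }
}

When such a helper returns `None` the fragment keeps its current assignment; otherwise the added ids would be pushed onto the least-loaded actors as in the earlier sketch.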
+ pub async fn reassign_splits(&self) -> MetaResult> { + let mut split_assignment: SplitAssignment = HashMap::new(); + + 'loop_source: for (source_id, handle) in &self.managed_sources { + let source_fragment_ids = match self.source_fragments.get(source_id) { + Some(fragment_ids) if !fragment_ids.is_empty() => fragment_ids, + _ => { + continue; + } + }; + let backfill_fragment_ids = self.backfill_fragments.get(source_id); + + 'loop_fragment: for &fragment_id in source_fragment_ids { + let actors = match self + .metadata_manager + .get_running_actors_of_fragment(fragment_id) + .await + { + Ok(actors) => { + if actors.is_empty() { + tracing::warn!("No actors found for fragment {}", fragment_id); + continue 'loop_fragment; + } + actors + } + Err(err) => { + tracing::warn!(error = %err.as_report(), "Failed to get the actor of the fragment, maybe the fragment doesn't exist anymore"); + continue 'loop_fragment; + } + }; + + let discovered_splits = handle.discovered_splits(*source_id, &actors).await?; + if discovered_splits.is_empty() { + // The discover loop for this source is not ready yet; we'll wait for the next run + continue 'loop_source; + } + + let prev_actor_splits: HashMap<_, _> = actors + .into_iter() + .map(|actor_id| { + ( + actor_id, + self.actor_splits + .get(&actor_id) + .cloned() + .unwrap_or_default(), + ) + }) + .collect(); + + if let Some(new_assignment) = reassign_splits( + fragment_id, + prev_actor_splits, + &discovered_splits, + SplitDiffOptions { + enable_scale_in: handle.enable_drop_split, + enable_adaptive: handle.enable_adaptive_splits, + }, + ) { + split_assignment.insert(fragment_id, new_assignment); + } + } + + if let Some(backfill_fragment_ids) = backfill_fragment_ids { + // align splits for backfill fragments with its upstream source fragment + for (fragment_id, upstream_fragment_id) in backfill_fragment_ids { + let Some(upstream_assignment) = split_assignment.get(upstream_fragment_id) + else { + // upstream fragment unchanged, do not update backfill fragment too + continue; + }; + let actors = match self + .metadata_manager + .get_running_actors_for_source_backfill(*fragment_id) + .await + { + Ok(actors) => { + if actors.is_empty() { + tracing::warn!("No actors found for fragment {}", fragment_id); + continue; + } + actors + } + Err(err) => { + tracing::warn!(error = %err.as_report(),"Failed to get the actor of the fragment, maybe the fragment doesn't exist anymore"); + continue; + } + }; + split_assignment.insert( + *fragment_id, + align_splits( + actors, + upstream_assignment, + *fragment_id, + *upstream_fragment_id, + )?, + ); + } + } + } + + self.metadata_manager + .split_fragment_map_by_database(split_assignment) + .await + } +} + +/// Reassigns splits if there are new splits or dropped splits, +/// i.e., `actor_splits` and `discovered_splits` differ, or actors are rescheduled. +/// +/// The existing splits will remain unmoved in their currently assigned actor. +/// +/// If an actor has an upstream actor, it should be a backfill executor, +/// and its splits should be aligned with the upstream actor. **`reassign_splits` should not be used in this case. +/// Use [`align_splits`] instead.** +/// +/// - `fragment_id`: just for logging +/// +/// ## Different connectors' behavior of split change +/// +/// ### Kafka and Pulsar +/// They only support increasing the number of splits via adding new empty splits. +/// Old data is not moved. +/// +/// ### Kinesis +/// It supports *pairwise* shard split and merge. 
+/// +/// In both cases, old data remain in the old shard(s) and the old shard is still available. +/// New data are routed to the new shard(s). +/// After the retention period has expired, the old shard will become `EXPIRED` and isn't +/// listed any more. In other words, the total number of shards will first increase and then decrease. +/// +/// See also: +/// - [Kinesis resharding doc](https://docs.aws.amazon.com/streams/latest/dev/kinesis-using-sdk-java-after-resharding.html#kinesis-using-sdk-java-resharding-data-routing) +/// - An example of how the shards can be like: +fn reassign_splits( + fragment_id: FragmentId, + actor_splits: HashMap>, + discovered_splits: &BTreeMap, + opts: SplitDiffOptions, +) -> Option>> +where + T: SplitMetaData + Clone, +{ + // if no actors, return + if actor_splits.is_empty() { + return None; + } + + let prev_split_ids: HashSet<_> = actor_splits + .values() + .flat_map(|splits| splits.iter().map(SplitMetaData::id)) + .collect(); + + tracing::trace!(fragment_id, prev_split_ids = ?prev_split_ids, "previous splits"); + tracing::trace!(fragment_id, prev_split_ids = ?discovered_splits.keys(), "discovered splits"); + + let discovered_split_ids: HashSet<_> = discovered_splits.keys().cloned().collect(); + + let dropped_splits: HashSet<_> = prev_split_ids + .difference(&discovered_split_ids) + .cloned() + .collect(); + + if !dropped_splits.is_empty() { + if opts.enable_scale_in { + tracing::info!(fragment_id, dropped_spltis = ?dropped_splits, "new dropped splits"); + } else { + tracing::warn!(fragment_id, dropped_spltis = ?dropped_splits, "split dropping happened, but it is not allowed"); + } + } + + let new_discovered_splits: BTreeSet<_> = discovered_split_ids + .into_iter() + .filter(|split_id| !prev_split_ids.contains(split_id)) + .collect(); + + if opts.enable_scale_in || opts.enable_adaptive { + // if we support scale in, no more splits are discovered, and no splits are dropped, return + // we need to check if discovered_split_ids is empty, because if it is empty, we need to + // handle the case of scale in to zero (like deleting all objects from s3) + if dropped_splits.is_empty() + && new_discovered_splits.is_empty() + && !discovered_splits.is_empty() + { + return None; + } + } else { + // if we do not support scale in, and no more splits are discovered, return + if new_discovered_splits.is_empty() && !discovered_splits.is_empty() { + return None; + } + } + + tracing::info!(fragment_id, new_discovered_splits = ?new_discovered_splits, "new discovered splits"); + + let mut heap = BinaryHeap::with_capacity(actor_splits.len()); + + for (actor_id, mut splits) in actor_splits { + if opts.enable_scale_in || opts.enable_adaptive { + splits.retain(|split| !dropped_splits.contains(&split.id())); + } + + heap.push(ActorSplitsAssignment { actor_id, splits }) + } + + for split_id in new_discovered_splits { + // ActorSplitsAssignment's Ord is reversed, so this is min heap, i.e., + // we get the assignment with the least splits here. + + // Note: If multiple actors have the same number of splits, it will be randomly picked. + // When the number of source actors is larger than the number of splits, + // It's possible that the assignment is uneven. + // e.g., https://github.com/risingwavelabs/risingwave/issues/14324#issuecomment-1875033158 + // TODO: We should make the assignment rack-aware to make sure it's even. 
+ let mut peek_ref = heap.peek_mut().unwrap(); + peek_ref + .splits + .push(discovered_splits.get(&split_id).cloned().unwrap()); + } + + Some( + heap.into_iter() + .map(|ActorSplitsAssignment { actor_id, splits }| (actor_id, splits)) + .collect(), + ) +} + +/// Assign splits to a new set of actors, according to existing assignment. +/// +/// illustration: +/// ```text +/// upstream new +/// actor x1 [split 1, split2] -> actor y1 [split 1, split2] +/// actor x2 [split 3] -> actor y2 [split 3] +/// ... +/// ``` +fn align_splits( + // (actor_id, upstream_actor_id) + aligned_actors: impl IntoIterator, + existing_assignment: &HashMap>, + fragment_id: FragmentId, + upstream_source_fragment_id: FragmentId, +) -> anyhow::Result>> { + aligned_actors + .into_iter() + .map(|(actor_id, upstream_actor_id)| { + let Some(splits) = existing_assignment.get(&upstream_actor_id) else { + return Err(anyhow::anyhow!("upstream assignment not found, fragment_id: {fragment_id}, upstream_fragment_id: {upstream_source_fragment_id}, actor_id: {actor_id}, upstream_assignment: {existing_assignment:?}, upstream_actor_id: {upstream_actor_id:?}")); + }; + Ok(( + actor_id, + splits.clone(), + )) + }) + .collect() +} + +/// Note: the `PartialEq` and `Ord` impl just compares the number of splits. +#[derive(Debug)] +struct ActorSplitsAssignment { + actor_id: ActorId, + splits: Vec, +} + +impl Eq for ActorSplitsAssignment {} + +impl PartialEq for ActorSplitsAssignment { + fn eq(&self, other: &Self) -> bool { + self.splits.len() == other.splits.len() + } +} + +impl PartialOrd for ActorSplitsAssignment { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for ActorSplitsAssignment { + fn cmp(&self, other: &Self) -> Ordering { + // Note: this is reversed order, to make BinaryHeap a min heap. + other.splits.len().cmp(&self.splits.len()) + } +} + +#[derive(Debug)] +pub struct SplitDiffOptions { + pub enable_scale_in: bool, + + /// For most connectors, this should be false. When enabled, RisingWave will not track any progress. 
+ pub enable_adaptive: bool, +} + +#[allow(clippy::derivable_impls)] +impl Default for SplitDiffOptions { + fn default() -> Self { + SplitDiffOptions { + enable_scale_in: false, + enable_adaptive: false, + } + } +} + +#[cfg(test)] +mod tests { + use std::collections::{BTreeMap, HashMap, HashSet}; + + use risingwave_common::types::JsonbVal; + use risingwave_connector::error::ConnectorResult; + use risingwave_connector::source::{SplitId, SplitMetaData}; + use serde::{Deserialize, Serialize}; + + use super::*; + use crate::model::{ActorId, FragmentId}; + + #[derive(Debug, Copy, Clone, Serialize, Deserialize)] + struct TestSplit { + id: u32, + } + + impl SplitMetaData for TestSplit { + fn id(&self) -> SplitId { + format!("{}", self.id).into() + } + + fn encode_to_json(&self) -> JsonbVal { + serde_json::to_value(*self).unwrap().into() + } + + fn restore_from_json(value: JsonbVal) -> ConnectorResult { + serde_json::from_value(value.take()).map_err(Into::into) + } + + fn update_offset(&mut self, _last_read_offset: String) -> ConnectorResult<()> { + Ok(()) + } + } + + fn check_all_splits( + discovered_splits: &BTreeMap, + diff: &HashMap>, + ) { + let mut split_ids: HashSet<_> = discovered_splits.keys().cloned().collect(); + + for splits in diff.values() { + for split in splits { + assert!(split_ids.remove(&split.id())) + } + } + + assert!(split_ids.is_empty()); + } + + #[test] + fn test_drop_splits() { + let mut actor_splits: HashMap = HashMap::new(); + actor_splits.insert(0, vec![TestSplit { id: 0 }, TestSplit { id: 1 }]); + actor_splits.insert(1, vec![TestSplit { id: 2 }, TestSplit { id: 3 }]); + actor_splits.insert(2, vec![TestSplit { id: 4 }, TestSplit { id: 5 }]); + + let mut prev_split_to_actor = HashMap::new(); + for (actor_id, splits) in &actor_splits { + for split in splits { + prev_split_to_actor.insert(split.id(), *actor_id); + } + } + + let discovered_splits: BTreeMap = (1..5) + .map(|i| { + let split = TestSplit { id: i }; + (split.id(), split) + }) + .collect(); + + let opts = SplitDiffOptions { + enable_scale_in: true, + enable_adaptive: false, + }; + + let prev_split_ids: HashSet<_> = actor_splits + .values() + .flat_map(|splits| splits.iter().map(|split| split.id())) + .collect(); + + let diff = reassign_splits( + FragmentId::default(), + actor_splits, + &discovered_splits, + opts, + ) + .unwrap(); + check_all_splits(&discovered_splits, &diff); + + let mut after_split_to_actor = HashMap::new(); + for (actor_id, splits) in &diff { + for split in splits { + after_split_to_actor.insert(split.id(), *actor_id); + } + } + + let discovered_split_ids: HashSet<_> = discovered_splits.keys().cloned().collect(); + + let retained_split_ids: HashSet<_> = + prev_split_ids.intersection(&discovered_split_ids).collect(); + + for retained_split_id in retained_split_ids { + assert_eq!( + prev_split_to_actor.get(retained_split_id), + after_split_to_actor.get(retained_split_id) + ) + } + } + + #[test] + fn test_drop_splits_to_empty() { + let mut actor_splits: HashMap = HashMap::new(); + actor_splits.insert(0, vec![TestSplit { id: 0 }]); + + let discovered_splits: BTreeMap = BTreeMap::new(); + + let opts = SplitDiffOptions { + enable_scale_in: true, + enable_adaptive: false, + }; + + let diff = reassign_splits( + FragmentId::default(), + actor_splits, + &discovered_splits, + opts, + ) + .unwrap(); + + assert!(!diff.is_empty()) + } + + #[test] + fn test_reassign_splits() { + let actor_splits = HashMap::new(); + let discovered_splits: BTreeMap = BTreeMap::new(); + assert!(reassign_splits( + 
FragmentId::default(), + actor_splits, + &discovered_splits, + Default::default() + ) + .is_none()); + + let actor_splits = (0..3).map(|i| (i, vec![])).collect(); + let discovered_splits: BTreeMap = BTreeMap::new(); + let diff = reassign_splits( + FragmentId::default(), + actor_splits, + &discovered_splits, + Default::default(), + ) + .unwrap(); + assert_eq!(diff.len(), 3); + for splits in diff.values() { + assert!(splits.is_empty()) + } + + let actor_splits = (0..3).map(|i| (i, vec![])).collect(); + let discovered_splits: BTreeMap = (0..3) + .map(|i| { + let split = TestSplit { id: i }; + (split.id(), split) + }) + .collect(); + + let diff = reassign_splits( + FragmentId::default(), + actor_splits, + &discovered_splits, + Default::default(), + ) + .unwrap(); + assert_eq!(diff.len(), 3); + for splits in diff.values() { + assert_eq!(splits.len(), 1); + } + + check_all_splits(&discovered_splits, &diff); + + let actor_splits = (0..3).map(|i| (i, vec![TestSplit { id: i }])).collect(); + let discovered_splits: BTreeMap = (0..5) + .map(|i| { + let split = TestSplit { id: i }; + (split.id(), split) + }) + .collect(); + + let diff = reassign_splits( + FragmentId::default(), + actor_splits, + &discovered_splits, + Default::default(), + ) + .unwrap(); + assert_eq!(diff.len(), 3); + for splits in diff.values() { + let len = splits.len(); + assert!(len == 1 || len == 2); + } + + check_all_splits(&discovered_splits, &diff); + + let mut actor_splits: HashMap> = + (0..3).map(|i| (i, vec![TestSplit { id: i }])).collect(); + actor_splits.insert(3, vec![]); + actor_splits.insert(4, vec![]); + + let discovered_splits: BTreeMap = (0..5) + .map(|i| { + let split = TestSplit { id: i }; + (split.id(), split) + }) + .collect(); + + let diff = reassign_splits( + FragmentId::default(), + actor_splits, + &discovered_splits, + Default::default(), + ) + .unwrap(); + assert_eq!(diff.len(), 5); + for splits in diff.values() { + assert_eq!(splits.len(), 1); + } + + check_all_splits(&discovered_splits, &diff); + } +} diff --git a/src/meta/src/stream/source_manager/worker.rs b/src/meta/src/stream/source_manager/worker.rs new file mode 100644 index 0000000000000..6f6d647ffb78f --- /dev/null +++ b/src/meta/src/stream/source_manager/worker.rs @@ -0,0 +1,320 @@ +// Copyright 2025 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use super::*; + +const MAX_FAIL_CNT: u32 = 10; +const DEFAULT_SOURCE_TICK_TIMEOUT: Duration = Duration::from_secs(10); + +pub struct SharedSplitMap { + pub splits: Option>, +} + +type SharedSplitMapRef = Arc>; + +/// `ConnectorSourceWorker` keeps fetching the latest split metadata from the external source service ([`Self::tick`]), +/// and maintains it in `current_splits`. 
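The fetch-and-maintain behaviour described above boils down to a small loop pattern: wake on a fixed interval, but also serve on-demand refresh requests over a channel so callers can block until fresh metadata is available. A condensed sketch, assuming the `tokio` and `anyhow` crates and a placeholder `fetch_splits` in place of the real `SplitEnumerator::list_splits` call and cache update:

use std::time::Duration;
use tokio::sync::{mpsc, oneshot};
use tokio::time::MissedTickBehavior;

// Placeholder for the real enumerator call plus `current_splits` update.
async fn fetch_splits() -> anyhow::Result<()> {
    Ok(())
}

// Refresh split metadata on a fixed interval, but also serve on-demand
// "sync" requests so a caller can force a refresh and await its result.
async fn run_loop(mut sync_rx: mpsc::UnboundedReceiver<oneshot::Sender<anyhow::Result<()>>>) {
    let mut interval = tokio::time::interval(Duration::from_secs(30));
    interval.set_missed_tick_behavior(MissedTickBehavior::Skip);
    loop {
        tokio::select! {
            biased;
            Some(tx) = sync_rx.recv() => {
                let _ = tx.send(fetch_splits().await);
            }
            _ = interval.tick() => {
                if let Err(e) = fetch_splits().await {
                    eprintln!("periodic split discovery failed: {e:#}");
                }
            }
        }
    }
}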
+pub struct ConnectorSourceWorker { + source_id: SourceId, + source_name: String, + current_splits: SharedSplitMapRef, + enumerator: P::SplitEnumerator, + period: Duration, + metrics: Arc, + connector_properties: P, + fail_cnt: u32, + source_is_up: LabelGuardedIntGauge<2>, +} + +fn extract_prop_from_existing_source(source: &Source) -> ConnectorResult { + let options_with_secret = + WithOptionsSecResolved::new(source.with_properties.clone(), source.secret_refs.clone()); + let mut properties = ConnectorProperties::extract(options_with_secret, false)?; + properties.init_from_pb_source(source); + Ok(properties) +} +fn extract_prop_from_new_source(source: &Source) -> ConnectorResult { + let options_with_secret = + WithOptionsSecResolved::new(source.with_properties.clone(), source.secret_refs.clone()); + let mut properties = ConnectorProperties::extract(options_with_secret, true)?; + properties.init_from_pb_source(source); + Ok(properties) +} + +/// Used to create a new [`ConnectorSourceWorkerHandle`] for a new source. +/// +/// It will call [`ConnectorSourceWorker::tick()`] to fetch split metadata once before returning. +pub async fn create_source_worker( + source: &Source, + metrics: Arc, +) -> MetaResult { + tracing::info!("spawning new watcher for source {}", source.id); + + let splits = Arc::new(Mutex::new(SharedSplitMap { splits: None })); + let current_splits_ref = splits.clone(); + + let connector_properties = extract_prop_from_new_source(source)?; + let enable_scale_in = connector_properties.enable_drop_split(); + let enable_adaptive_splits = connector_properties.enable_adaptive_splits(); + let (sync_call_tx, sync_call_rx) = tokio::sync::mpsc::unbounded_channel(); + let handle = dispatch_source_prop!(connector_properties, prop, { + let mut worker = ConnectorSourceWorker::create( + source, + *prop, + DEFAULT_SOURCE_WORKER_TICK_INTERVAL, + current_splits_ref.clone(), + metrics, + ) + .await?; + + // if fail to fetch meta info, will refuse to create source + + // todo: make the timeout configurable, longer than `properties.sync.call.timeout` + // in kafka + tokio::time::timeout(DEFAULT_SOURCE_TICK_TIMEOUT, worker.tick()) + .await + .ok() + .with_context(|| { + format!( + "failed to fetch meta info for source {}, timeout {:?}", + source.id, DEFAULT_SOURCE_TICK_TIMEOUT + ) + })??; + + tokio::spawn(async move { worker.run(sync_call_rx).await }) + }); + Ok(ConnectorSourceWorkerHandle { + handle, + sync_call_tx, + splits, + enable_drop_split: enable_scale_in, + enable_adaptive_splits, + }) +} + +/// Used on startup ([`SourceManager::new`]). Failed sources will not block meta startup. 
+pub fn create_source_worker_async( + source: Source, + managed_sources: &mut HashMap, + metrics: Arc, +) -> MetaResult<()> { + tracing::info!("spawning new watcher for source {}", source.id); + + let splits = Arc::new(Mutex::new(SharedSplitMap { splits: None })); + let current_splits_ref = splits.clone(); + let source_id = source.id; + + let connector_properties = extract_prop_from_existing_source(&source)?; + + let enable_drop_split = connector_properties.enable_drop_split(); + let enable_adaptive_splits = connector_properties.enable_adaptive_splits(); + let (sync_call_tx, sync_call_rx) = tokio::sync::mpsc::unbounded_channel(); + let handle = tokio::spawn(async move { + let mut ticker = time::interval(DEFAULT_SOURCE_WORKER_TICK_INTERVAL); + ticker.set_missed_tick_behavior(MissedTickBehavior::Skip); + + dispatch_source_prop!(connector_properties, prop, { + let mut worker = loop { + ticker.tick().await; + + match ConnectorSourceWorker::create( + &source, + prop.deref().clone(), + DEFAULT_SOURCE_WORKER_TICK_INTERVAL, + current_splits_ref.clone(), + metrics.clone(), + ) + .await + { + Ok(worker) => { + break worker; + } + Err(e) => { + tracing::warn!(error = %e.as_report(), "failed to create source worker"); + } + } + }; + + worker.run(sync_call_rx).await + }); + }); + + managed_sources.insert( + source_id as SourceId, + ConnectorSourceWorkerHandle { + handle, + sync_call_tx, + splits, + enable_drop_split, + enable_adaptive_splits, + }, + ); + Ok(()) +} + +const DEFAULT_SOURCE_WORKER_TICK_INTERVAL: Duration = Duration::from_secs(30); + +impl ConnectorSourceWorker
<P: SourceProperties>
{ + /// Recreate the `SplitEnumerator` to establish a new connection to the external source service. + async fn refresh(&mut self) -> MetaResult<()> { + let enumerator = P::SplitEnumerator::new( + self.connector_properties.clone(), + Arc::new(SourceEnumeratorContext { + metrics: self.metrics.source_enumerator_metrics.clone(), + info: SourceEnumeratorInfo { + source_id: self.source_id as u32, + }, + }), + ) + .await + .context("failed to create SplitEnumerator")?; + self.enumerator = enumerator; + self.fail_cnt = 0; + tracing::info!("refreshed source enumerator: {}", self.source_name); + Ok(()) + } + + /// On creation, connection to the external source service will be established, but `splits` + /// will not be updated until `tick` is called. + pub async fn create( + source: &Source, + connector_properties: P, + period: Duration, + splits: Arc>, + metrics: Arc, + ) -> MetaResult { + let enumerator = P::SplitEnumerator::new( + connector_properties.clone(), + Arc::new(SourceEnumeratorContext { + metrics: metrics.source_enumerator_metrics.clone(), + info: SourceEnumeratorInfo { + source_id: source.id, + }, + }), + ) + .await + .context("failed to create SplitEnumerator")?; + + let source_is_up = metrics + .source_is_up + .with_guarded_label_values(&[source.id.to_string().as_str(), &source.name]); + + Ok(Self { + source_id: source.id as SourceId, + source_name: source.name.clone(), + current_splits: splits, + enumerator, + period, + metrics, + connector_properties, + fail_cnt: 0, + source_is_up, + }) + } + + pub async fn run( + &mut self, + mut sync_call_rx: UnboundedReceiver>>, + ) { + let mut interval = time::interval(self.period); + interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + loop { + select! { + biased; + tx = sync_call_rx.borrow_mut().recv() => { + if let Some(tx) = tx { + let _ = tx.send(self.tick().await); + } + } + _ = interval.tick() => { + if self.fail_cnt > MAX_FAIL_CNT { + if let Err(e) = self.refresh().await { + tracing::error!(error = %e.as_report(), "error happened when refresh from connector source worker"); + } + } + if let Err(e) = self.tick().await { + tracing::error!(error = %e.as_report(), "error happened when tick from connector source worker"); + } + } + } + } + } + + /// Uses [`SplitEnumerator`] to fetch the latest split metadata from the external source service. + async fn tick(&mut self) -> MetaResult<()> { + let source_is_up = |res: i64| { + self.source_is_up.set(res); + }; + let splits = self.enumerator.list_splits().await.inspect_err(|_| { + source_is_up(0); + self.fail_cnt += 1; + })?; + source_is_up(1); + self.fail_cnt = 0; + let mut current_splits = self.current_splits.lock().await; + current_splits.splits.replace( + splits + .into_iter() + .map(|split| (split.id(), P::Split::into(split))) + .collect(), + ); + + Ok(()) + } +} + +/// Handle for a running [`ConnectorSourceWorker`]. 
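One detail of the handle worth calling out is the adaptive-splits path of `discovered_splits`: connectors that support adaptive splits report a single template split, which is fanned out to one split per actor, with the actor id reused as the split id so the split count always matches the fragment's parallelism. A rough stand-alone illustration, using a hypothetical `fill_adaptive` with plain strings rather than `SplitImpl`:

use std::collections::{BTreeMap, HashSet};

// The connector reports one template split; clone it once per actor and use
// the actor id as the split id so the split count always matches the
// fragment's parallelism.
fn fill_adaptive(template: &str, actors: &HashSet<u32>) -> BTreeMap<String, String> {
    actors
        .iter()
        .map(|actor_id| (actor_id.to_string(), template.to_owned()))
        .collect()
}

Because the split id is derived from the actor id, rescheduling the fragment simply regenerates the map for the new actor set; no per-split state needs to migrate.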
+pub struct ConnectorSourceWorkerHandle { + pub handle: JoinHandle<()>, + pub sync_call_tx: UnboundedSender>>, + pub splits: SharedSplitMapRef, + pub enable_drop_split: bool, + pub enable_adaptive_splits: bool, +} + +impl ConnectorSourceWorkerHandle { + pub fn get_enable_adaptive_splits(&self) -> bool { + self.enable_adaptive_splits + } + + pub async fn discovered_splits( + &self, + source_id: SourceId, + actors: &HashSet, + ) -> MetaResult, SplitImpl>> { + let Some(mut discovered_splits) = self.splits.lock().await.splits.clone() else { + tracing::info!( + "The discover loop for source {} is not ready yet; we'll wait for the next run", + source_id + ); + return Ok(BTreeMap::new()); + }; + if discovered_splits.is_empty() { + tracing::warn!("No splits discovered for source {}", source_id); + } + + if self.enable_adaptive_splits { + // Connector supporting adaptive splits returns just one split, and we need to make the number of splits equal to the number of actors in this fragment. + // Because we Risingwave consume the splits statelessly and we do not need to keep the id internally, we always use actor_id as split_id. + // And prev splits record should be dropped via CN. + + debug_assert!(self.enable_drop_split); + debug_assert!(discovered_splits.len() == 1); + discovered_splits = + fill_adaptive_split(discovered_splits.values().next().unwrap(), actors)?; + } + + Ok(discovered_splits) + } +} From 9d7698545b3038befed43be7262d7d61132030b6 Mon Sep 17 00:00:00 2001 From: Li0k Date: Thu, 9 Jan 2025 13:27:31 +0800 Subject: [PATCH 09/21] refactor(storage): remove option of new log (#20055) --- src/meta/src/hummock/manager/checkpoint.rs | 2 +- src/storage/hummock_sdk/src/change_log.rs | 18 +++---- .../compaction_group/hummock_version_ext.rs | 2 +- .../hummock_sdk/src/frontend_version.rs | 48 +++++++++++-------- src/storage/hummock_sdk/src/time_travel.rs | 18 +++---- src/storage/hummock_sdk/src/version.rs | 6 +-- 6 files changed, 51 insertions(+), 43 deletions(-) diff --git a/src/meta/src/hummock/manager/checkpoint.rs b/src/meta/src/hummock/manager/checkpoint.rs index 5f9664b64a011..bb47248686129 100644 --- a/src/meta/src/hummock/manager/checkpoint.rs +++ b/src/meta/src/hummock/manager/checkpoint.rs @@ -179,7 +179,7 @@ impl HummockManager { .change_log_delta .values() .flat_map(|change_log| { - let new_log = change_log.new_log.as_ref().unwrap(); + let new_log = &change_log.new_log; new_log .new_value .iter() diff --git a/src/storage/hummock_sdk/src/change_log.rs b/src/storage/hummock_sdk/src/change_log.rs index 5a7bf0143c764..5a5c4a647ecbe 100644 --- a/src/storage/hummock_sdk/src/change_log.rs +++ b/src/storage/hummock_sdk/src/change_log.rs @@ -190,11 +190,11 @@ pub fn build_table_change_log_delta<'a>( TableId::new(table_id), ChangeLogDelta { truncate_epoch, - new_log: Some(EpochNewChangeLog { + new_log: EpochNewChangeLog { new_value: vec![], old_value: vec![], epochs: epochs.clone(), - }), + }, }, ) }) @@ -203,7 +203,7 @@ pub fn build_table_change_log_delta<'a>( for table_id in &sst.table_ids { match table_change_log.get_mut(&TableId::new(*table_id)) { Some(log) => { - log.new_log.as_mut().unwrap().old_value.push(sst.clone()); + log.new_log.old_value.push(sst.clone()); } None => { warn!(table_id, ?sst, "old value sst contains non-log-store table"); @@ -214,7 +214,7 @@ pub fn build_table_change_log_delta<'a>( for sst in new_value_ssts { for table_id in &sst.table_ids { if let Some(log) = table_change_log.get_mut(&TableId::new(*table_id)) { - log.new_log.as_mut().unwrap().new_value.push(sst.clone()); 
+ log.new_log.new_value.push(sst.clone()); } } } @@ -224,7 +224,7 @@ pub fn build_table_change_log_delta<'a>( #[derive(Debug, PartialEq, Clone)] pub struct ChangeLogDeltaCommon { pub truncate_epoch: u64, - pub new_log: Option>, + pub new_log: EpochNewChangeLogCommon, } pub type ChangeLogDelta = ChangeLogDeltaCommon; @@ -236,7 +236,7 @@ where fn from(val: &ChangeLogDeltaCommon) -> Self { Self { truncate_epoch: val.truncate_epoch, - new_log: val.new_log.as_ref().map(|a| a.into()), + new_log: Some((&val.new_log).into()), } } } @@ -248,7 +248,7 @@ where fn from(val: &PbChangeLogDelta) -> Self { Self { truncate_epoch: val.truncate_epoch, - new_log: val.new_log.as_ref().map(|a| a.into()), + new_log: val.new_log.as_ref().unwrap().into(), } } } @@ -260,7 +260,7 @@ where fn from(val: ChangeLogDeltaCommon) -> Self { Self { truncate_epoch: val.truncate_epoch, - new_log: val.new_log.map(|a| a.into()), + new_log: Some(val.new_log.into()), } } } @@ -272,7 +272,7 @@ where fn from(val: PbChangeLogDelta) -> Self { Self { truncate_epoch: val.truncate_epoch, - new_log: val.new_log.map(|a| a.into()), + new_log: val.new_log.unwrap().into(), } } } diff --git a/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs b/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs index ee316f75ffd65..eb4bb30e69dc3 100644 --- a/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs +++ b/src/storage/hummock_sdk/src/compaction_group/hummock_version_ext.rs @@ -693,7 +693,7 @@ impl HummockVersion { changed_table_info: &HashMap>, ) { for (table_id, change_log_delta) in change_log_delta { - let new_change_log = change_log_delta.new_log.as_ref().unwrap(); + let new_change_log = &change_log_delta.new_log; match table_change_log.entry(*table_id) { Entry::Occupied(entry) => { let change_log = entry.into_mut(); diff --git a/src/storage/hummock_sdk/src/frontend_version.rs b/src/storage/hummock_sdk/src/frontend_version.rs index 4840b402a292b..11eb045efbe22 100644 --- a/src/storage/hummock_sdk/src/frontend_version.rs +++ b/src/storage/hummock_sdk/src/frontend_version.rs @@ -151,14 +151,12 @@ impl FrontendHummockVersionDelta { *table_id, ChangeLogDeltaCommon { truncate_epoch: change_log_delta.truncate_epoch, - new_log: change_log_delta.new_log.as_ref().map(|new_log| { - EpochNewChangeLogCommon { - // Here we need to determine if value is null but don't care what the value is, so we fill him in using `()` - new_value: vec![(); new_log.new_value.len()], - old_value: vec![(); new_log.old_value.len()], - epochs: new_log.epochs.clone(), - } - }), + new_log: EpochNewChangeLogCommon { + // Here we need to determine if value is null but don't care what the value is, so we fill him in using `()` + new_value: vec![(); change_log_delta.new_log.new_value.len()], + old_value: vec![(); change_log_delta.new_log.old_value.len()], + epochs: change_log_delta.new_log.epochs.clone(), + }, }, ) }) @@ -187,11 +185,17 @@ impl FrontendHummockVersionDelta { ( table_id.table_id, PbChangeLogDelta { - new_log: delta.new_log.as_ref().map(|new_log| PbEpochNewChangeLog { + new_log: Some(PbEpochNewChangeLog { // Here we need to determine if value is null but don't care what the value is, so we fill him in using `PbSstableInfo::default()` - old_value: vec![PbSstableInfo::default(); new_log.old_value.len()], - new_value: vec![PbSstableInfo::default(); new_log.new_value.len()], - epochs: new_log.epochs.clone(), + old_value: vec![ + PbSstableInfo::default(); + delta.new_log.old_value.len() + ], + new_value: vec![ + 
PbSstableInfo::default(); + delta.new_log.new_value.len() + ], + epochs: delta.new_log.epochs.clone(), }), truncate_epoch: delta.truncate_epoch, }, @@ -228,14 +232,18 @@ impl FrontendHummockVersionDelta { TableId::new(*table_id), ChangeLogDeltaCommon { truncate_epoch: change_log_delta.truncate_epoch, - new_log: change_log_delta.new_log.as_ref().map(|new_log| { - EpochNewChangeLogCommon { - // Here we need to determine if value is null but don't care what the value is, so we fill him in using `()` - new_value: vec![(); new_log.new_value.len()], - old_value: vec![(); new_log.old_value.len()], - epochs: new_log.epochs.clone(), - } - }), + new_log: change_log_delta + .new_log + .as_ref() + .map(|new_log| { + EpochNewChangeLogCommon { + // Here we need to determine if value is null but don't care what the value is, so we fill him in using `()` + new_value: vec![(); new_log.new_value.len()], + old_value: vec![(); new_log.old_value.len()], + epochs: new_log.epochs.clone(), + } + }) + .unwrap(), }, ) }) diff --git a/src/storage/hummock_sdk/src/time_travel.rs b/src/storage/hummock_sdk/src/time_travel.rs index 0a651260b3789..9f628808909ff 100644 --- a/src/storage/hummock_sdk/src/time_travel.rs +++ b/src/storage/hummock_sdk/src/time_travel.rs @@ -191,15 +191,15 @@ impl From<(&HummockVersionDelta, &HashSet)> for IncompleteHummockV } debug_assert!(log_delta .new_log - .as_ref() - .map(|d| { - d.new_value.iter().chain(d.old_value.iter()).all(|s| { - s.table_ids - .iter() - .any(|tid| time_travel_table_ids.contains(tid)) - }) - }) - .unwrap_or(true)); + .new_value + .iter() + .chain(log_delta.new_log.old_value.iter()) + .all(|s| { + s.table_ids + .iter() + .any(|tid| time_travel_table_ids.contains(tid)) + })); + Some((*table_id, PbChangeLogDelta::from(log_delta).into())) }) .collect(), diff --git a/src/storage/hummock_sdk/src/version.rs b/src/storage/hummock_sdk/src/version.rs index 09e96860cc839..4c90e6cae47f1 100644 --- a/src/storage/hummock_sdk/src/version.rs +++ b/src/storage/hummock_sdk/src/version.rs @@ -570,7 +570,7 @@ where }) .chain(self.change_log_delta.values().flat_map(|delta| { // TODO: optimization: strip table change log - let new_log = delta.new_log.as_ref().unwrap(); + let new_log = &delta.new_log; new_log.new_value.iter().chain(new_log.old_value.iter()) })) } @@ -623,8 +623,8 @@ where ( TableId::new(*table_id), ChangeLogDeltaCommon { - new_log: log_delta.new_log.as_ref().map(Into::into), truncate_epoch: log_delta.truncate_epoch, + new_log: log_delta.new_log.as_ref().unwrap().into(), }, ) }) @@ -752,7 +752,7 @@ where ( TableId::new(*table_id), ChangeLogDeltaCommon { - new_log: log_delta.new_log.clone().map(Into::into), + new_log: log_delta.new_log.clone().unwrap().into(), truncate_epoch: log_delta.truncate_epoch, }, ) From 63b20efbfd126707e66d217e69ba6157124a6743 Mon Sep 17 00:00:00 2001 From: Richard Chien Date: Thu, 9 Jan 2025 13:40:48 +0800 Subject: [PATCH 10/21] fix(frontend): set `if_not_exists` to false when normalizing `CREATE VIEW` (#20073) Signed-off-by: Richard Chien --- src/frontend/src/handler/mod.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/frontend/src/handler/mod.rs b/src/frontend/src/handler/mod.rs index 446187335b06a..bd559b9245fc7 100644 --- a/src/frontend/src/handler/mod.rs +++ b/src/frontend/src/handler/mod.rs @@ -198,8 +198,13 @@ impl HandlerArgs { fn normalize_sql(stmt: &Statement) -> String { let mut stmt = stmt.clone(); match &mut stmt { - Statement::CreateView { or_replace, .. 
} => { + Statement::CreateView { + or_replace, + if_not_exists, + .. + } => { *or_replace = false; + *if_not_exists = false; } Statement::CreateTable { or_replace, From 14823be7a808a5fdb34df262dbed1c75e31d316f Mon Sep 17 00:00:00 2001 From: Dylan Date: Thu, 9 Jan 2025 13:55:42 +0800 Subject: [PATCH 11/21] fix: change iceberg glue catalog default implemantation to java (#20075) --- src/connector/src/connector_common/iceberg/mod.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/connector/src/connector_common/iceberg/mod.rs b/src/connector/src/connector_common/iceberg/mod.rs index ff53d80e89598..6964e03a5b99f 100644 --- a/src/connector/src/connector_common/iceberg/mod.rs +++ b/src/connector/src/connector_common/iceberg/mod.rs @@ -387,7 +387,7 @@ impl IcebergCommon { let catalog = iceberg_catalog_rest::RestCatalog::new(config); Ok(Arc::new(catalog)) } - "glue" => { + "glue_rust" => { let mut iceberg_configs = HashMap::new(); // glue if let Some(region) = &self.region { @@ -427,7 +427,10 @@ impl IcebergCommon { Ok(Arc::new(catalog)) } catalog_type - if catalog_type == "hive" || catalog_type == "jdbc" || catalog_type == "rest" => + if catalog_type == "hive" + || catalog_type == "jdbc" + || catalog_type == "rest" + || catalog_type == "glue" => { // Create java catalog let (file_io_props, java_catalog_props) = @@ -436,6 +439,7 @@ impl IcebergCommon { "hive" => "org.apache.iceberg.hive.HiveCatalog", "jdbc" => "org.apache.iceberg.jdbc.JdbcCatalog", "rest" => "org.apache.iceberg.rest.RESTCatalog", + "glue" => "org.apache.iceberg.aws.glue.GlueCatalog", _ => unreachable!(), }; From 934db16dbf71dd5ba1fbd69142ec0f0d3737c183 Mon Sep 17 00:00:00 2001 From: Xinhao Xu <84456268+xxhZs@users.noreply.github.com> Date: Thu, 9 Jan 2025 14:49:25 +0800 Subject: [PATCH 12/21] feat(sink): support force_append_only for es sink (#19919) --- .../sink/elasticsearch/elasticsearch_sink.slt | 3 +++ .../elasticsearch_opensearch/elasticsearch.rs | 3 +++ .../elasticsearch_converter.rs | 15 +++++++++---- .../elasticsearch_opensearch_client.rs | 5 ++++- .../elasticsearch_opensearch_config.rs | 2 ++ .../elasticsearch_opensearch_formatter.rs | 21 +++++++++++++++++-- .../elasticsearch_opensearch/opensearch.rs | 3 +++ src/connector/src/sink/remote.rs | 1 + src/connector/with_options_sink.yaml | 3 +++ 9 files changed, 49 insertions(+), 7 deletions(-) diff --git a/e2e_test/sink/elasticsearch/elasticsearch_sink.slt b/e2e_test/sink/elasticsearch/elasticsearch_sink.slt index 842e3f3303438..953bc694d744a 100644 --- a/e2e_test/sink/elasticsearch/elasticsearch_sink.slt +++ b/e2e_test/sink/elasticsearch/elasticsearch_sink.slt @@ -22,6 +22,7 @@ CREATE TABLE test_route ( statement ok CREATE SINK test_route_sink from test_route WITH ( + type = 'upsert', connector = 'elasticsearch', index = 'test_route', url = 'http://elasticsearch:9200', @@ -32,6 +33,7 @@ CREATE SINK test_route_sink from test_route WITH ( statement ok CREATE SINK s7 from t7 WITH ( + type = 'upsert', connector = 'elasticsearch', index = 'test', url = 'http://elasticsearch:9200', @@ -41,6 +43,7 @@ CREATE SINK s7 from t7 WITH ( statement ok CREATE SINK s8 from t7 WITH ( + type = 'upsert', connector = 'elasticsearch', index = 'test1', primary_key = 'v1,v3', diff --git a/src/connector/src/sink/elasticsearch_opensearch/elasticsearch.rs b/src/connector/src/sink/elasticsearch_opensearch/elasticsearch.rs index 1f0ac754fcad2..fd4cbed31698d 100644 --- a/src/connector/src/sink/elasticsearch_opensearch/elasticsearch.rs +++ 
b/src/connector/src/sink/elasticsearch_opensearch/elasticsearch.rs @@ -28,6 +28,7 @@ pub struct ElasticSearchSink { config: ElasticSearchOpenSearchConfig, schema: Schema, pk_indices: Vec, + is_append_only: bool, } #[async_trait] @@ -41,6 +42,7 @@ impl TryFrom for ElasticSearchSink { config, schema, pk_indices: param.downstream_pk, + is_append_only: param.sink_type.is_append_only(), }) } } @@ -64,6 +66,7 @@ impl Sink for ElasticSearchSink { self.schema.clone(), self.pk_indices.clone(), Self::SINK_NAME, + self.is_append_only, )? .into_log_sinker(self.config.concurrent_requests)) } diff --git a/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_converter.rs b/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_converter.rs index dc19c99261aa7..ba2571ff2a130 100644 --- a/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_converter.rs +++ b/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_converter.rs @@ -39,6 +39,7 @@ impl StreamChunkConverter { schema: Schema, pk_indices: &Vec, properties: &BTreeMap, + is_append_only: bool, ) -> Result { if is_remote_es_sink(sink_name) { let index_column = properties @@ -71,6 +72,7 @@ impl StreamChunkConverter { index_column, index, routing_column, + is_append_only, )?)) } else { Ok(StreamChunkConverter::Other) @@ -79,13 +81,14 @@ impl StreamChunkConverter { pub fn convert_chunk(&self, chunk: StreamChunk) -> Result { match self { - StreamChunkConverter::Es(es) => es.convert_chunk(chunk), + StreamChunkConverter::Es(es) => es.convert_chunk(chunk, es.is_append_only), StreamChunkConverter::Other => Ok(chunk), } } } pub struct EsStreamChunkConverter { formatter: ElasticSearchOpenSearchFormatter, + is_append_only: bool, } impl EsStreamChunkConverter { pub fn new( @@ -95,6 +98,7 @@ impl EsStreamChunkConverter { index_column: Option, index: Option, routing_column: Option, + is_append_only: bool, ) -> Result { let formatter = ElasticSearchOpenSearchFormatter::new( pk_indices, @@ -104,10 +108,13 @@ impl EsStreamChunkConverter { index, routing_column, )?; - Ok(Self { formatter }) + Ok(Self { + formatter, + is_append_only, + }) } - fn convert_chunk(&self, chunk: StreamChunk) -> Result { + fn convert_chunk(&self, chunk: StreamChunk, is_append_only: bool) -> Result { let mut ops = Vec::with_capacity(chunk.capacity()); let mut id_string_builder = ::new(chunk.capacity()); @@ -117,7 +124,7 @@ impl EsStreamChunkConverter { ::new(chunk.capacity()); let mut routing_builder = ::new(chunk.capacity()); - for build_bulk_para in self.formatter.convert_chunk(chunk)? { + for build_bulk_para in self.formatter.convert_chunk(chunk, is_append_only)? 
{ let BuildBulkPara { key, value, diff --git a/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_opensearch_client.rs b/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_opensearch_client.rs index 5369fcdee6aab..eb02d1310b650 100644 --- a/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_opensearch_client.rs +++ b/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_opensearch_client.rs @@ -160,6 +160,7 @@ pub struct ElasticSearchOpenSearchSinkWriter { client: Arc, formatter: ElasticSearchOpenSearchFormatter, config: ElasticSearchOpenSearchConfig, + is_append_only: bool, } impl ElasticSearchOpenSearchSinkWriter { @@ -168,6 +169,7 @@ impl ElasticSearchOpenSearchSinkWriter { schema: Schema, pk_indices: Vec, connector: &str, + is_append_only: bool, ) -> Result { let client = Arc::new(config.build_client(connector)?); let formatter = ElasticSearchOpenSearchFormatter::new( @@ -182,6 +184,7 @@ impl ElasticSearchOpenSearchSinkWriter { client, formatter, config, + is_append_only, }) } } @@ -202,7 +205,7 @@ impl AsyncTruncateSinkWriter for ElasticSearchOpenSearchSinkWriter { let mut bulks: Vec = Vec::with_capacity(chunk_capacity); let mut bulks_size = 0; - for build_bulk_para in self.formatter.convert_chunk(chunk)? { + for build_bulk_para in self.formatter.convert_chunk(chunk, self.is_append_only)? { let BuildBulkPara { key, value, diff --git a/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_opensearch_config.rs b/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_opensearch_config.rs index e2907301d78a9..24f79dd84030b 100644 --- a/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_opensearch_config.rs +++ b/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_opensearch_config.rs @@ -77,6 +77,8 @@ pub struct ElasticSearchOpenSearchConfig { #[serde_as(as = "DisplayFromStr")] #[serde(default = "default_concurrent_requests")] pub concurrent_requests: usize, + + pub r#type: String, } fn default_retry_on_conflict() -> i32 { diff --git a/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_opensearch_formatter.rs b/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_opensearch_formatter.rs index 660e26d578834..3cfda0601b617 100644 --- a/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_opensearch_formatter.rs +++ b/src/connector/src/sink/elasticsearch_opensearch/elasticsearch_opensearch_formatter.rs @@ -113,7 +113,11 @@ impl ElasticSearchOpenSearchFormatter { }) } - pub fn convert_chunk(&self, chunk: StreamChunk) -> Result> { + pub fn convert_chunk( + &self, + chunk: StreamChunk, + is_append_only: bool, + ) -> Result> { let mut result_vec = Vec::with_capacity(chunk.capacity()); for (op, rows) in chunk.rows() { let index = if let Some(index_column) = self.index_column { @@ -157,6 +161,11 @@ impl ElasticSearchOpenSearchFormatter { }); } Op::Delete => { + if is_append_only { + return Err(SinkError::ElasticSearchOpenSearch(anyhow!( + "`Delete` operation is not supported in `append_only` mode" + ))); + } let key = self.key_encoder.encode(rows)?; let mem_size_b = std::mem::size_of_val(&key); result_vec.push(BuildBulkPara { @@ -167,7 +176,15 @@ impl ElasticSearchOpenSearchFormatter { routing_column, }); } - Op::UpdateDelete => continue, + Op::UpdateDelete => { + if is_append_only { + return Err(SinkError::ElasticSearchOpenSearch(anyhow!( + "`UpdateDelete` operation is not supported in `append_only` mode" + ))); + } else { + continue; + } + } } } Ok(result_vec) diff --git 
a/src/connector/src/sink/elasticsearch_opensearch/opensearch.rs b/src/connector/src/sink/elasticsearch_opensearch/opensearch.rs index b0df3c8ca2345..bf9b79f6a8850 100644 --- a/src/connector/src/sink/elasticsearch_opensearch/opensearch.rs +++ b/src/connector/src/sink/elasticsearch_opensearch/opensearch.rs @@ -30,6 +30,7 @@ pub struct OpenSearchSink { config: ElasticSearchOpenSearchConfig, schema: Schema, pk_indices: Vec, + is_append_only: bool, } #[async_trait] @@ -43,6 +44,7 @@ impl TryFrom for OpenSearchSink { config, schema, pk_indices: param.downstream_pk, + is_append_only: param.sink_type.is_append_only(), }) } } @@ -69,6 +71,7 @@ impl Sink for OpenSearchSink { self.schema.clone(), self.pk_indices.clone(), Self::SINK_NAME, + self.is_append_only, )? .into_log_sinker(self.config.concurrent_requests)) } diff --git a/src/connector/src/sink/remote.rs b/src/connector/src/sink/remote.rs index 26683966f566b..582a5c861bb5c 100644 --- a/src/connector/src/sink/remote.rs +++ b/src/connector/src/sink/remote.rs @@ -301,6 +301,7 @@ impl RemoteLogSinker { sink_param.schema(), &sink_param.downstream_pk, &sink_param.properties, + sink_param.sink_type.is_append_only(), )?, }) } diff --git a/src/connector/with_options_sink.yaml b/src/connector/with_options_sink.yaml index d7291c8ecc61d..819e597fec448 100644 --- a/src/connector/with_options_sink.yaml +++ b/src/connector/with_options_sink.yaml @@ -331,6 +331,9 @@ ElasticSearchOpenSearchConfig: - name: concurrent_requests field_type: usize required: true + - name: r#type + field_type: String + required: true FsConfig: fields: - name: fs.path From 09e59e43dd7630f02664af8f8fb6672554351530 Mon Sep 17 00:00:00 2001 From: Richard Chien Date: Thu, 9 Jan 2025 16:57:02 +0800 Subject: [PATCH 13/21] feat(udf): support `CREATE FUNCTION/AGGREGATE IF NOT EXISTS` (#20079) Signed-off-by: Richard Chien --- e2e_test/udf/create_and_drop.slt | 115 ++++++++++++++++++ e2e_test/udf/drop_function.slt | 52 -------- src/frontend/src/handler/create_aggregate.rs | 23 ++-- src/frontend/src/handler/create_function.rs | 23 ++-- .../src/handler/create_sql_function.rs | 23 ++-- src/frontend/src/handler/mod.rs | 5 + src/frontend/src/session.rs | 39 ++++++ src/sqlparser/src/ast/mod.rs | 16 ++- src/sqlparser/src/parser.rs | 6 + src/sqlparser/tests/sqlparser_postgres.rs | 57 ++++++++- 10 files changed, 265 insertions(+), 94 deletions(-) create mode 100644 e2e_test/udf/create_and_drop.slt delete mode 100644 e2e_test/udf/drop_function.slt diff --git a/e2e_test/udf/create_and_drop.slt b/e2e_test/udf/create_and_drop.slt new file mode 100644 index 0000000000000..7b31dba16fdbd --- /dev/null +++ b/e2e_test/udf/create_and_drop.slt @@ -0,0 +1,115 @@ +# https://github.com/risingwavelabs/risingwave/issues/17263 + +statement ok +create table t (a int, b int); + +statement ok +create function add(a int, b int) returns int language python as $$ +def add(a, b): + return a+b +$$; + +statement error function with name add\(integer,integer\) exists +create function add(int, int) returns int language sql as $$select $1 + $2$$; + +statement ok +create function if not exists add(int, int) returns int language sql as $$select $1 + $2$$; + +statement ok +create function add_v2(int, int) returns int language sql as $$select $1 + $2$$; + +statement ok +create aggregate mysum(value int) returns int language python as $$ +def create_state(): + return 0 +def accumulate(state, value): + return state + value +def finish(state): + return state +$$; + +statement error function with name mysum\(integer\) exists +create 
aggregate mysum(value int) returns int language python as $$ +def create_state(): + return 0 +def accumulate(state, value): + return state + value +def finish(state): + return state +$$; + +statement ok +create aggregate if not exists mysum(value int) returns int language python as $$ +def create_state(): + return 0 +def accumulate(state, value): + return state + value +def finish(state): + return state +$$; + +statement ok +create materialized view mv as select add(a, b) + add_v2(a, b) as c from t; + +statement ok +create materialized view mv2 as select mysum(a) as s from t; + +statement error function used by 1 other objects +drop function add; + +statement error function used by 1 other objects +drop function if exists add; + +statement error function used by 1 other objects +drop function add_v2; + +statement error function used by 1 other objects +drop aggregate mysum; + +statement ok +drop materialized view mv; + +statement ok +drop materialized view mv2; + +statement ok +drop function add; + +statement error function not found +drop function add; + +statement ok +drop function if exists add; + +statement ok +drop function add_v2; + +statement ok +drop function if exists add_v2; + +statement ok +drop aggregate mysum; + +statement ok +drop aggregate if exists mysum; + +statement ok +create function add(a int, b int) returns int language python as $$ +def add(a, b): + return a+b +$$; + +statement ok +create sink s as select add(a, b) as c from t with (connector = 'blackhole'); + +statement error function used by 1 other objects +drop function add; + +statement ok +drop sink s; + +statement ok +drop function add; + +statement ok +drop table t; diff --git a/e2e_test/udf/drop_function.slt b/e2e_test/udf/drop_function.slt deleted file mode 100644 index ffe4e0eea481f..0000000000000 --- a/e2e_test/udf/drop_function.slt +++ /dev/null @@ -1,52 +0,0 @@ -# https://github.com/risingwavelabs/risingwave/issues/17263 - -statement ok -create table t (a int, b int); - -statement ok -create function add(a int, b int) returns int language python as $$ -def add(a, b): - return a+b -$$; - -statement ok -create function add_v2(INT, INT) returns int language sql as $$select $1 + $2$$; - -statement ok -create materialized view mv as select add(a, b) + add_v2(a, b) as c from t; - -statement error function used by 1 other objects -drop function add; - -statement error function used by 1 other objects -drop function add_v2; - -statement ok -drop materialized view mv; - -statement ok -drop function add; - -statement ok -drop function add_v2; - -statement ok -create function add(a int, b int) returns int language python as $$ -def add(a, b): - return a+b -$$; - -statement ok -create sink s as select add(a, b) as c from t with (connector = 'blackhole'); - -statement error function used by 1 other objects -drop function add; - -statement ok -drop sink s; - -statement ok -drop function add; - -statement ok -drop table t; diff --git a/src/frontend/src/handler/create_aggregate.rs b/src/frontend/src/handler/create_aggregate.rs index 32f326db9b1d9..85ba343fef408 100644 --- a/src/frontend/src/handler/create_aggregate.rs +++ b/src/frontend/src/handler/create_aggregate.rs @@ -13,6 +13,7 @@ // limitations under the License. 
use anyhow::Context; +use either::Either; use risingwave_common::catalog::FunctionId; use risingwave_expr::sig::{CreateFunctionOptions, UdfKind}; use risingwave_pb::catalog::function::{AggregateFunction, Kind}; @@ -20,12 +21,12 @@ use risingwave_pb::catalog::Function; use risingwave_sqlparser::ast::DataType as AstDataType; use super::*; -use crate::catalog::CatalogError; use crate::{bind_data_type, Binder}; pub async fn handle_create_aggregate( handler_args: HandlerArgs, or_replace: bool, + if_not_exists: bool, name: ObjectName, args: Vec, returns: AstDataType, @@ -74,20 +75,18 @@ pub async fn handle_create_aggregate( // resolve database and schema id let session = &handler_args.session; let db_name = &session.database(); - let (schema_name, function_name) = Binder::resolve_schema_qualified_name(db_name, name)?; + let (schema_name, function_name) = + Binder::resolve_schema_qualified_name(db_name, name.clone())?; let (database_id, schema_id) = session.get_database_and_schema_id_for_create(schema_name)?; // check if the function exists in the catalog - if (session.env().catalog_reader().read_guard()) - .get_schema_by_id(&database_id, &schema_id)? - .get_function_by_name_args(&function_name, &arg_types) - .is_some() - { - let name = format!( - "{function_name}({})", - arg_types.iter().map(|t| t.to_string()).join(",") - ); - return Err(CatalogError::Duplicated("function", name).into()); + if let Either::Right(resp) = session.check_function_name_duplicated( + StatementType::CREATE_FUNCTION, + name, + &arg_types, + if_not_exists, + )? { + return Ok(resp); } let link = match ¶ms.using { diff --git a/src/frontend/src/handler/create_function.rs b/src/frontend/src/handler/create_function.rs index b87d3c90a3488..c212eaebb56f4 100644 --- a/src/frontend/src/handler/create_function.rs +++ b/src/frontend/src/handler/create_function.rs @@ -13,6 +13,7 @@ // limitations under the License. use anyhow::Context; +use either::Either; use risingwave_common::catalog::FunctionId; use risingwave_common::types::StructType; use risingwave_expr::sig::{CreateFunctionOptions, UdfKind}; @@ -20,13 +21,13 @@ use risingwave_pb::catalog::function::{Kind, ScalarFunction, TableFunction}; use risingwave_pb::catalog::Function; use super::*; -use crate::catalog::CatalogError; use crate::{bind_data_type, Binder}; pub async fn handle_create_function( handler_args: HandlerArgs, or_replace: bool, temporary: bool, + if_not_exists: bool, name: ObjectName, args: Option>, returns: Option, @@ -107,20 +108,18 @@ pub async fn handle_create_function( // resolve database and schema id let session = &handler_args.session; let db_name = &session.database(); - let (schema_name, function_name) = Binder::resolve_schema_qualified_name(db_name, name)?; + let (schema_name, function_name) = + Binder::resolve_schema_qualified_name(db_name, name.clone())?; let (database_id, schema_id) = session.get_database_and_schema_id_for_create(schema_name)?; // check if the function exists in the catalog - if (session.env().catalog_reader().read_guard()) - .get_schema_by_id(&database_id, &schema_id)? - .get_function_by_name_args(&function_name, &arg_types) - .is_some() - { - let name = format!( - "{function_name}({})", - arg_types.iter().map(|t| t.to_string()).join(",") - ); - return Err(CatalogError::Duplicated("function", name).into()); + if let Either::Right(resp) = session.check_function_name_duplicated( + StatementType::CREATE_FUNCTION, + name, + &arg_types, + if_not_exists, + )? 
{ + return Ok(resp); } let link = match ¶ms.using { diff --git a/src/frontend/src/handler/create_sql_function.rs b/src/frontend/src/handler/create_sql_function.rs index 4725b37ab6511..71a31ce5173cc 100644 --- a/src/frontend/src/handler/create_sql_function.rs +++ b/src/frontend/src/handler/create_sql_function.rs @@ -14,6 +14,7 @@ use std::collections::HashMap; +use either::Either; use fancy_regex::Regex; use risingwave_common::catalog::FunctionId; use risingwave_common::types::{DataType, StructType}; @@ -23,7 +24,6 @@ use risingwave_sqlparser::parser::{Parser, ParserError}; use super::*; use crate::binder::UdfContext; -use crate::catalog::CatalogError; use crate::expr::{Expr, ExprImpl, Literal}; use crate::{bind_data_type, Binder}; @@ -122,6 +122,7 @@ pub async fn handle_create_sql_function( handler_args: HandlerArgs, or_replace: bool, temporary: bool, + if_not_exists: bool, name: ObjectName, args: Option>, returns: Option, @@ -214,20 +215,18 @@ pub async fn handle_create_sql_function( // resolve database and schema id let session = &handler_args.session; let db_name = &session.database(); - let (schema_name, function_name) = Binder::resolve_schema_qualified_name(db_name, name)?; + let (schema_name, function_name) = + Binder::resolve_schema_qualified_name(db_name, name.clone())?; let (database_id, schema_id) = session.get_database_and_schema_id_for_create(schema_name)?; // check if function exists - if (session.env().catalog_reader().read_guard()) - .get_schema_by_id(&database_id, &schema_id)? - .get_function_by_name_args(&function_name, &arg_types) - .is_some() - { - let name = format!( - "{function_name}({})", - arg_types.iter().map(|t| t.to_string()).join(",") - ); - return Err(CatalogError::Duplicated("function", name).into()); + if let Either::Right(resp) = session.check_function_name_duplicated( + StatementType::CREATE_FUNCTION, + name, + &arg_types, + if_not_exists, + )? 
{ + return Ok(resp); } // Parse function body here diff --git a/src/frontend/src/handler/mod.rs b/src/frontend/src/handler/mod.rs index bd559b9245fc7..4245e66c3034a 100644 --- a/src/frontend/src/handler/mod.rs +++ b/src/frontend/src/handler/mod.rs @@ -278,6 +278,7 @@ pub async fn handle( Statement::CreateFunction { or_replace, temporary, + if_not_exists, name, args, returns, @@ -298,6 +299,7 @@ pub async fn handle( handler_args, or_replace, temporary, + if_not_exists, name, args, returns, @@ -310,6 +312,7 @@ pub async fn handle( handler_args, or_replace, temporary, + if_not_exists, name, args, returns, @@ -320,6 +323,7 @@ pub async fn handle( } Statement::CreateAggregate { or_replace, + if_not_exists, name, args, returns, @@ -329,6 +333,7 @@ pub async fn handle( create_aggregate::handle_create_aggregate( handler_args, or_replace, + if_not_exists, name, args, returns, diff --git a/src/frontend/src/session.rs b/src/frontend/src/session.rs index fe03af7c0c786..7f99d91d6e0d5 100644 --- a/src/frontend/src/session.rs +++ b/src/frontend/src/session.rs @@ -22,6 +22,7 @@ use std::time::{Duration, Instant}; use anyhow::anyhow; use bytes::Bytes; use either::Either; +use itertools::Itertools; use parking_lot::{Mutex, RwLock, RwLockReadGuard}; use pgwire::error::{PsqlError, PsqlResult}; use pgwire::net::{Address, AddressRef}; @@ -961,6 +962,44 @@ impl SessionImpl { .map_err(RwError::from) } + pub fn check_function_name_duplicated( + &self, + stmt_type: StatementType, + name: ObjectName, + arg_types: &[DataType], + if_not_exists: bool, + ) -> Result> { + let db_name = &self.database(); + let (schema_name, function_name) = Binder::resolve_schema_qualified_name(db_name, name)?; + let (database_id, schema_id) = self.get_database_and_schema_id_for_create(schema_name)?; + + let catalog_reader = self.env().catalog_reader().read_guard(); + if catalog_reader + .get_schema_by_id(&database_id, &schema_id)? + .get_function_by_name_args(&function_name, arg_types) + .is_some() + { + let full_name = format!( + "{function_name}({})", + arg_types.iter().map(|t| t.to_string()).join(",") + ); + if if_not_exists { + Ok(Either::Right( + PgResponse::builder(stmt_type) + .notice(format!( + "function \"{}\" already exists, skipping", + full_name + )) + .into(), + )) + } else { + Err(CatalogError::Duplicated("function", full_name).into()) + } + } else { + Ok(Either::Left(())) + } + } + /// Also check if the user has the privilege to create in the schema. 
pub fn get_database_and_schema_id_for_create( &self, diff --git a/src/sqlparser/src/ast/mod.rs b/src/sqlparser/src/ast/mod.rs index 16496b71c97eb..98f8599ccf89a 100644 --- a/src/sqlparser/src/ast/mod.rs +++ b/src/sqlparser/src/ast/mod.rs @@ -1349,6 +1349,7 @@ pub enum Statement { CreateFunction { or_replace: bool, temporary: bool, + if_not_exists: bool, name: ObjectName, args: Option>, returns: Option, @@ -1361,6 +1362,7 @@ pub enum Statement { /// Postgres: CreateAggregate { or_replace: bool, + if_not_exists: bool, name: ObjectName, args: Vec, returns: DataType, @@ -1768,6 +1770,7 @@ impl fmt::Display for Statement { Statement::CreateFunction { or_replace, temporary, + if_not_exists, name, args, returns, @@ -1776,9 +1779,10 @@ impl fmt::Display for Statement { } => { write!( f, - "CREATE {or_replace}{temp}FUNCTION {name}", + "CREATE {or_replace}{temp}FUNCTION {if_not_exists}{name}", temp = if *temporary { "TEMPORARY " } else { "" }, or_replace = if *or_replace { "OR REPLACE " } else { "" }, + if_not_exists = if *if_not_exists { "IF NOT EXISTS " } else { "" }, )?; if let Some(args) = args { write!(f, "({})", display_comma_separated(args))?; @@ -1792,6 +1796,7 @@ impl fmt::Display for Statement { } Statement::CreateAggregate { or_replace, + if_not_exists, name, args, returns, @@ -1800,8 +1805,9 @@ impl fmt::Display for Statement { } => { write!( f, - "CREATE {or_replace}AGGREGATE {name}", + "CREATE {or_replace}AGGREGATE {if_not_exists}{name}", or_replace = if *or_replace { "OR REPLACE " } else { "" }, + if_not_exists = if *if_not_exists { "IF NOT EXISTS " } else { "" }, )?; write!(f, "({})", display_comma_separated(args))?; write!(f, " RETURNS {}", returns)?; @@ -3551,8 +3557,9 @@ mod tests { #[test] fn test_create_function_display() { let create_function = Statement::CreateFunction { - temporary: false, or_replace: false, + temporary: false, + if_not_exists: false, name: ObjectName(vec![Ident::new_unchecked("foo")]), args: Some(vec![OperateFunctionArg::unnamed(DataType::Int)]), returns: Some(CreateFunctionReturns::Value(DataType::Int)), @@ -3573,8 +3580,9 @@ mod tests { format!("{}", create_function) ); let create_function = Statement::CreateFunction { - temporary: false, or_replace: false, + temporary: false, + if_not_exists: false, name: ObjectName(vec![Ident::new_unchecked("foo")]), args: Some(vec![OperateFunctionArg::unnamed(DataType::Int)]), returns: Some(CreateFunctionReturns::Value(DataType::Int)), diff --git a/src/sqlparser/src/parser.rs b/src/sqlparser/src/parser.rs index 9eb3d9e439967..2df0183cf5c5a 100644 --- a/src/sqlparser/src/parser.rs +++ b/src/sqlparser/src/parser.rs @@ -2210,6 +2210,8 @@ impl Parser<'_> { or_replace: bool, temporary: bool, ) -> PResult { + impl_parse_to!(if_not_exists => [Keyword::IF, Keyword::NOT, Keyword::EXISTS], self); + let name = self.parse_object_name()?; self.expect_token(&Token::LParen)?; let args = if self.peek_token().token == Token::RParen { @@ -2248,6 +2250,7 @@ impl Parser<'_> { Ok(Statement::CreateFunction { or_replace, temporary, + if_not_exists, name, args, returns: return_type, @@ -2257,6 +2260,8 @@ impl Parser<'_> { } fn parse_create_aggregate(&mut self, or_replace: bool) -> PResult { + impl_parse_to!(if_not_exists => [Keyword::IF, Keyword::NOT, Keyword::EXISTS], self); + let name = self.parse_object_name()?; self.expect_token(&Token::LParen)?; let args = self.parse_comma_separated(Parser::parse_function_arg)?; @@ -2270,6 +2275,7 @@ impl Parser<'_> { Ok(Statement::CreateAggregate { or_replace, + if_not_exists, name, args, returns, diff 
--git a/src/sqlparser/tests/sqlparser_postgres.rs b/src/sqlparser/tests/sqlparser_postgres.rs index 7acf6d29b4444..1466a9024a6d5 100644 --- a/src/sqlparser/tests/sqlparser_postgres.rs +++ b/src/sqlparser/tests/sqlparser_postgres.rs @@ -753,6 +753,7 @@ fn parse_create_function() { Statement::CreateFunction { or_replace: false, temporary: false, + if_not_exists: false, name: ObjectName(vec![Ident::new_unchecked("add")]), args: Some(vec![ OperateFunctionArg::unnamed(DataType::Int), @@ -777,6 +778,7 @@ fn parse_create_function() { Statement::CreateFunction { or_replace: false, temporary: false, + if_not_exists: false, name: ObjectName(vec![Ident::new_unchecked("sub")]), args: Some(vec![ OperateFunctionArg::unnamed(DataType::Int), @@ -801,6 +803,7 @@ fn parse_create_function() { Statement::CreateFunction { or_replace: false, temporary: false, + if_not_exists: false, name: ObjectName(vec![Ident::new_unchecked("return_test")]), args: Some(vec![ OperateFunctionArg::unnamed(DataType::Int), @@ -826,6 +829,7 @@ fn parse_create_function() { Statement::CreateFunction { or_replace: true, temporary: false, + if_not_exists: false, name: ObjectName(vec![Ident::new_unchecked("add")]), args: Some(vec![ OperateFunctionArg::with_name("a", DataType::Int), @@ -851,12 +855,14 @@ fn parse_create_function() { } ); - let sql = "CREATE FUNCTION unnest(a INT[]) RETURNS TABLE (x INT) LANGUAGE SQL RETURN a"; + let sql = + "CREATE TEMPORARY FUNCTION unnest(a INT[]) RETURNS TABLE (x INT) LANGUAGE SQL RETURN a"; assert_eq!( verified_stmt(sql), Statement::CreateFunction { or_replace: false, - temporary: false, + temporary: true, + if_not_exists: false, name: ObjectName(vec![Ident::new_unchecked("unnest")]), args: Some(vec![OperateFunctionArg::with_name( "a", @@ -874,6 +880,32 @@ fn parse_create_function() { with_options: Default::default(), } ); + + let sql = + "CREATE FUNCTION IF NOT EXISTS add(INT, INT) RETURNS INT LANGUAGE SQL IMMUTABLE AS 'select $1 + $2;'"; + assert_eq!( + verified_stmt(sql), + Statement::CreateFunction { + or_replace: false, + temporary: false, + if_not_exists: true, + name: ObjectName(vec![Ident::new_unchecked("add")]), + args: Some(vec![ + OperateFunctionArg::unnamed(DataType::Int), + OperateFunctionArg::unnamed(DataType::Int), + ]), + returns: Some(CreateFunctionReturns::Value(DataType::Int)), + params: CreateFunctionBody { + language: Some("SQL".into()), + behavior: Some(FunctionBehavior::Immutable), + as_: Some(FunctionDefinition::SingleQuotedDef( + "select $1 + $2;".into() + )), + ..Default::default() + }, + with_options: Default::default(), + } + ); } #[test] @@ -884,6 +916,27 @@ fn parse_create_aggregate() { verified_stmt(sql), Statement::CreateAggregate { or_replace: true, + if_not_exists: false, + name: ObjectName(vec![Ident::new_unchecked("sum")]), + args: vec![OperateFunctionArg::unnamed(DataType::Int)], + returns: DataType::BigInt, + append_only: true, + params: CreateFunctionBody { + language: Some("python".into()), + as_: Some(FunctionDefinition::SingleQuotedDef("sum".into())), + using: Some(CreateFunctionUsing::Link("xxx".into())), + ..Default::default() + }, + } + ); + + let sql = + "CREATE AGGREGATE IF NOT EXISTS sum(INT) RETURNS BIGINT APPEND ONLY LANGUAGE python AS 'sum' USING LINK 'xxx'"; + assert_eq!( + verified_stmt(sql), + Statement::CreateAggregate { + or_replace: false, + if_not_exists: true, name: ObjectName(vec![Ident::new_unchecked("sum")]), args: vec![OperateFunctionArg::unnamed(DataType::Int)], returns: DataType::BigInt, From 3619d775a56a9b8eb74fe5e0c9323c4cf10d76a3 Mon 
Sep 17 00:00:00 2001 From: Dylan Date: Thu, 9 Jan 2025 17:58:55 +0800 Subject: [PATCH 14/21] fix(iceberg): fix iceberg time travel timezone (#20085) --- src/frontend/src/handler/explain.rs | 1 + src/frontend/src/handler/query.rs | 1 + .../src/scheduler/distributed/query.rs | 1 + src/frontend/src/scheduler/plan_fragmenter.rs | 62 +++++++++++++++---- 4 files changed, 53 insertions(+), 12 deletions(-) diff --git a/src/frontend/src/handler/explain.rs b/src/frontend/src/handler/explain.rs index e5d20ce289724..5f55f9aac378c 100644 --- a/src/frontend/src/handler/explain.rs +++ b/src/frontend/src/handler/explain.rs @@ -220,6 +220,7 @@ async fn do_handle_explain( worker_node_manager_reader, session.env().catalog_reader().clone(), session.config().batch_parallelism().0, + session.config().timezone().to_owned(), plan.clone(), )?); batch_plan_fragmenter_fmt = if explain_format == ExplainFormat::Dot { diff --git a/src/frontend/src/handler/query.rs b/src/frontend/src/handler/query.rs index 4f4b1f2187aa0..ec5a3ee393d40 100644 --- a/src/frontend/src/handler/query.rs +++ b/src/frontend/src/handler/query.rs @@ -373,6 +373,7 @@ pub fn gen_batch_plan_fragmenter( worker_node_manager_reader, session.env().catalog_reader().clone(), session.config().batch_parallelism().0, + session.config().timezone().to_owned(), plan, )?; diff --git a/src/frontend/src/scheduler/distributed/query.rs b/src/frontend/src/scheduler/distributed/query.rs index a843aa1b59fe4..cf6319c894a0e 100644 --- a/src/frontend/src/scheduler/distributed/query.rs +++ b/src/frontend/src/scheduler/distributed/query.rs @@ -734,6 +734,7 @@ pub(crate) mod tests { worker_node_selector, catalog_reader, None, + "UTC".to_owned(), batch_exchange_node.clone(), ) .unwrap(); diff --git a/src/frontend/src/scheduler/plan_fragmenter.rs b/src/frontend/src/scheduler/plan_fragmenter.rs index 76f5428348003..3949f1f2d9f5d 100644 --- a/src/frontend/src/scheduler/plan_fragmenter.rs +++ b/src/frontend/src/scheduler/plan_fragmenter.rs @@ -20,6 +20,7 @@ use std::sync::Arc; use anyhow::anyhow; use async_recursion::async_recursion; +use chrono::{MappedLocalTime, TimeZone}; use enum_as_inner::EnumAsInner; use futures::TryStreamExt; use iceberg::expr::Predicate as IcebergPredicate; @@ -33,6 +34,7 @@ use risingwave_common::bitmap::{Bitmap, BitmapBuilder}; use risingwave_common::catalog::{Schema, TableDesc}; use risingwave_common::hash::table_distribution::TableDistribution; use risingwave_common::hash::{WorkerSlotId, WorkerSlotMapping}; +use risingwave_common::types::Timestamptz; use risingwave_common::util::scan_range::ScanRange; use risingwave_connector::source::filesystem::opendal_source::opendal_enumerator::OpendalEnumerator; use risingwave_connector::source::filesystem::opendal_source::{ @@ -145,6 +147,7 @@ pub struct BatchPlanFragmenter { catalog_reader: CatalogReader, batch_parallelism: usize, + timezone: String, stage_graph_builder: Option, stage_graph: Option, @@ -163,6 +166,7 @@ impl BatchPlanFragmenter { worker_node_manager: WorkerNodeSelector, catalog_reader: CatalogReader, batch_parallelism: Option, + timezone: String, batch_node: PlanRef, ) -> SchedulerResult { // if batch_parallelism is None, it means no limit, we will use the available nodes count as @@ -186,6 +190,7 @@ impl BatchPlanFragmenter { worker_node_manager, catalog_reader, batch_parallelism, + timezone, stage_graph_builder: Some(StageGraphBuilder::new(batch_parallelism)), stage_graph: None, }; @@ -311,7 +316,11 @@ impl SourceScanInfo { Self::Incomplete(fetch_info) } - pub async fn complete(self, 
batch_parallelism: usize) -> SchedulerResult { + pub async fn complete( + self, + batch_parallelism: usize, + timezone: String, + ) -> SchedulerResult { let fetch_info = match self { SourceScanInfo::Incomplete(fetch_info) => fetch_info, SourceScanInfo::Complete(_) => { @@ -382,15 +391,36 @@ impl SourceScanInfo { Some(AsOf::VersionString(_)) => { bail!("Unsupported version string in iceberg time travel") } - Some(AsOf::TimestampString(ts)) => Some( - speedate::DateTime::parse_str_rfc3339(&ts) - .map(|t| { - IcebergTimeTravelInfo::TimestampMs( - t.timestamp_tz() * 1000 + t.time.microsecond as i64 / 1000, - ) - }) - .map_err(|_e| anyhow!("fail to parse timestamp"))?, - ), + Some(AsOf::TimestampString(ts)) => { + let date_time = speedate::DateTime::parse_str_rfc3339(&ts) + .map_err(|_e| anyhow!("fail to parse timestamp"))?; + let timestamp = if date_time.time.tz_offset.is_none() { + // If the input does not specify a time zone, use the time zone set by the "SET TIME ZONE" command. + let tz = + Timestamptz::lookup_time_zone(&timezone).map_err(|e| anyhow!(e))?; + match tz.with_ymd_and_hms( + date_time.date.year.into(), + date_time.date.month.into(), + date_time.date.day.into(), + date_time.time.hour.into(), + date_time.time.minute.into(), + date_time.time.second.into(), + ) { + MappedLocalTime::Single(d) => Ok(d.timestamp()), + MappedLocalTime::Ambiguous(_, _) | MappedLocalTime::None => { + Err(anyhow!(format!( + "failed to parse the timestamp {ts} with the specified time zone {tz}" + ))) + } + }? + } else { + date_time.timestamp_tz() + }; + + Some(IcebergTimeTravelInfo::TimestampMs( + timestamp * 1000 + date_time.time.microsecond as i64 / 1000, + )) + } Some(AsOf::ProcessTime) | Some(AsOf::ProcessTimeWithInterval(_)) => { unreachable!() } @@ -731,6 +761,7 @@ impl StageGraph { self, catalog_reader: &CatalogReader, worker_node_manager: &WorkerNodeSelector, + timezone: String, ) -> SchedulerResult { let mut complete_stages = HashMap::new(); self.complete_stage( @@ -739,6 +770,7 @@ impl StageGraph { &mut complete_stages, catalog_reader, worker_node_manager, + timezone, ) .await?; Ok(StageGraph { @@ -758,6 +790,7 @@ impl StageGraph { complete_stages: &mut HashMap, catalog_reader: &CatalogReader, worker_node_manager: &WorkerNodeSelector, + timezone: String, ) -> SchedulerResult<()> { let parallelism = if stage.parallelism.is_some() { // If the stage has parallelism, it means it's a complete stage. @@ -772,7 +805,7 @@ impl StageGraph { .as_ref() .unwrap() .clone() - .complete(self.batch_parallelism) + .complete(self.batch_parallelism, timezone.to_owned()) .await?; // For batch reading file source, the number of files involved is typically large. 
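The hunk above converts an AS OF timestamp for Iceberg time travel: when the input string carries no UTC offset, it is interpreted in the session time zone before being turned into epoch milliseconds, and ambiguous or nonexistent local times (around DST transitions) are rejected. The standalone sketch below shows the same chrono pattern outside RisingWave. It is a minimal sketch under assumptions: it calls chrono and chrono-tz directly instead of the Timestamptz helper used in the patch, the function name and the "Asia/Shanghai" example are illustrative, and LocalResult is the enum that newer chrono releases also expose under the MappedLocalTime alias seen above.

    use chrono::{LocalResult, TimeZone};
    use chrono_tz::Tz;

    // Interpret a wall-clock time (no UTC offset) in a named IANA time zone and
    // return epoch milliseconds, refusing to guess across DST gaps or overlaps.
    fn wall_clock_to_epoch_ms(
        tz_name: &str,
        ymd: (i32, u32, u32),
        hms: (u32, u32, u32),
    ) -> Result<i64, String> {
        let tz: Tz = tz_name
            .parse()
            .map_err(|e| format!("invalid time zone {tz_name}: {e}"))?;
        match tz.with_ymd_and_hms(ymd.0, ymd.1, ymd.2, hms.0, hms.1, hms.2) {
            // Exactly one instant corresponds to this local time.
            LocalResult::Single(dt) => Ok(dt.timestamp_millis()),
            // The local time is skipped or repeated around a DST transition.
            LocalResult::Ambiguous(_, _) | LocalResult::None => Err(format!(
                "local time {ymd:?} {hms:?} is ambiguous or nonexistent in {tz_name}"
            )),
        }
    }

    fn main() {
        // 2025-01-09 17:58:55 read as Asia/Shanghai wall-clock time (UTC+8).
        println!(
            "{:?}",
            wall_clock_to_epoch_ms("Asia/Shanghai", (2025, 1, 9), (17, 58, 55))
        );
    }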
@@ -842,6 +875,7 @@ impl StageGraph { complete_stages, catalog_reader, worker_node_manager, + timezone.to_owned(), ) .await?; } @@ -935,7 +969,11 @@ impl BatchPlanFragmenter { pub async fn generate_complete_query(self) -> SchedulerResult { let stage_graph = self.stage_graph.unwrap(); let new_stage_graph = stage_graph - .complete(&self.catalog_reader, &self.worker_node_manager) + .complete( + &self.catalog_reader, + &self.worker_node_manager, + self.timezone.to_owned(), + ) .await?; Ok(Query { query_id: self.query_id, From 87eac4f4d92f04fed4d52ae3f13567a77a248455 Mon Sep 17 00:00:00 2001 From: Shanicky Chen Date: Thu, 9 Jan 2025 18:56:56 +0800 Subject: [PATCH 15/21] fix: wrong streaming_parallelism when default_parallelism is set to fixed (#20072) Signed-off-by: Shanicky Chen --- src/meta/src/controller/streaming_job.rs | 8 ++- src/tests/simulation/src/cluster.rs | 32 +++++++++++ .../integration_tests/default_parallelism.rs | 56 +++++++++++++++++++ .../tests/integration_tests/main.rs | 1 + 4 files changed, 94 insertions(+), 3 deletions(-) create mode 100644 src/tests/simulation/tests/integration_tests/default_parallelism.rs diff --git a/src/meta/src/controller/streaming_job.rs b/src/meta/src/controller/streaming_job.rs index f30e20b50f0b8..8676b2583227c 100644 --- a/src/meta/src/controller/streaming_job.rs +++ b/src/meta/src/controller/streaming_job.rs @@ -17,6 +17,7 @@ use std::num::NonZeroUsize; use anyhow::anyhow; use itertools::Itertools; +use risingwave_common::config::DefaultParallelism; use risingwave_common::hash::VnodeCountCompat; use risingwave_common::util::column_index_mapping::ColIndexMapping; use risingwave_common::util::stream_graph_visitor::visit_stream_node; @@ -119,9 +120,10 @@ impl CatalogController { let txn = inner.db.begin().await?; let create_type = streaming_job.create_type(); - let streaming_parallelism = match parallelism { - None => StreamingParallelism::Adaptive, - Some(n) => StreamingParallelism::Fixed(n.parallelism as _), + let streaming_parallelism = match (parallelism, self.env.opts.default_parallelism) { + (None, DefaultParallelism::Full) => StreamingParallelism::Adaptive, + (None, DefaultParallelism::Default(n)) => StreamingParallelism::Fixed(n.get()), + (Some(n), _) => StreamingParallelism::Fixed(n.parallelism as _), }; ensure_user_id(streaming_job.owner() as _, &txn).await?; diff --git a/src/tests/simulation/src/cluster.rs b/src/tests/simulation/src/cluster.rs index 21f113b7b9858..a3fd28bb54293 100644 --- a/src/tests/simulation/src/cluster.rs +++ b/src/tests/simulation/src/cluster.rs @@ -14,6 +14,7 @@ #![cfg_attr(not(madsim), allow(unused_imports))] +use std::cmp::max; use std::collections::HashMap; use std::future::Future; use std::io::Write; @@ -206,6 +207,37 @@ metrics_level = "Disabled" } } + pub fn for_default_parallelism(default_parallelism: usize) -> Self { + let config_path = { + let mut file = + tempfile::NamedTempFile::new().expect("failed to create temp config file"); + + let config_data = format!( + r#" +[server] +telemetry_enabled = false +metrics_level = "Disabled" +[meta] +default_parallelism = {default_parallelism} +"# + ) + .to_owned(); + file.write_all(config_data.as_bytes()) + .expect("failed to write config file"); + file.into_temp_path() + }; + + Configuration { + config_path: ConfigPath::Temp(config_path.into()), + frontend_nodes: 1, + compute_nodes: 1, + meta_nodes: 1, + compactor_nodes: 1, + compute_node_cores: default_parallelism * 2, + per_session_queries: vec![].into(), + } + } + /// Returns the config for backfill test. 
pub fn for_backfill() -> Self { // Embed the config file and create a temporary file at runtime. The file will be deleted diff --git a/src/tests/simulation/tests/integration_tests/default_parallelism.rs b/src/tests/simulation/tests/integration_tests/default_parallelism.rs new file mode 100644 index 0000000000000..2cb79d5c3a8cf --- /dev/null +++ b/src/tests/simulation/tests/integration_tests/default_parallelism.rs @@ -0,0 +1,56 @@ +// Copyright 2025 RisingWave Labs +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::Result; +use risingwave_simulation::cluster::{Cluster, Configuration}; +use risingwave_simulation::utils::AssertResult; + +#[tokio::test] +async fn test_no_default_parallelism() -> Result<()> { + let mut cluster = Cluster::start(Configuration::default()).await?; + + let mut session = cluster.start_session(); + + session.run("create table t(v int);").await?; + session + .run("select parallelism from rw_streaming_parallelism where name = 't';") + .await? + .assert_result_eq("ADAPTIVE"); + + Ok(()) +} + +#[tokio::test] +async fn test_default_parallelism() -> Result<()> { + let mut cluster = Cluster::start(Configuration::for_default_parallelism(2)).await?; + + let mut session = cluster.start_session(); + + session.run("create table t(v int);").await?; + session + .run("select parallelism from rw_streaming_parallelism where name = 't';") + .await? + .assert_result_eq("FIXED(2)"); + + session + .run("alter table t set parallelism = adaptive;") + .await?; + + session + .run("select parallelism from rw_streaming_parallelism where name = 't';") + .await? 
+ .assert_result_eq("ADAPTIVE"); + + Ok(()) +} diff --git a/src/tests/simulation/tests/integration_tests/main.rs b/src/tests/simulation/tests/integration_tests/main.rs index 8244a0bb48f27..ad8e854a30e5f 100644 --- a/src/tests/simulation/tests/integration_tests/main.rs +++ b/src/tests/simulation/tests/integration_tests/main.rs @@ -29,4 +29,5 @@ mod storage; mod throttle; mod compaction; +mod default_parallelism; mod utils; From 8b5cabac7ba209efacfc05eb92086d83a45309c8 Mon Sep 17 00:00:00 2001 From: zwang28 <70626450+zwang28@users.noreply.github.com> Date: Thu, 9 Jan 2025 19:20:55 +0800 Subject: [PATCH 16/21] refactor: log deleted objects (#20086) --- src/meta/src/hummock/manager/gc.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/meta/src/hummock/manager/gc.rs b/src/meta/src/hummock/manager/gc.rs index 7bd7034ae1f99..eba07b7002244 100644 --- a/src/meta/src/hummock/manager/gc.rs +++ b/src/meta/src/hummock/manager/gc.rs @@ -535,7 +535,7 @@ impl HummockManager { break; } let delete_batch: HashSet<_> = objects_to_delete.drain(..batch_size).collect(); - tracing::debug!(?delete_batch, "Attempt to delete objects."); + tracing::info!(?delete_batch, "Attempt to delete objects."); let deleted_object_ids = delete_batch.clone(); self.gc_manager .delete_objects(delete_batch.into_iter()) From e6e9ace249c3eb9a78c5c900c47ea47dc0dd5c82 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 10 Jan 2025 03:05:55 +0000 Subject: [PATCH 17/21] chore(deps): Bump anstream from 0.6.4 to 0.6.18 in /integration_tests/feature-store/simulator (#20022) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: xxchan --- .../feature-store/simulator/Cargo.lock | 114 +++++++++++++++--- 1 file changed, 97 insertions(+), 17 deletions(-) diff --git a/integration_tests/feature-store/simulator/Cargo.lock b/integration_tests/feature-store/simulator/Cargo.lock index 79c7d3cad88b0..1049a14ba248f 100644 --- a/integration_tests/feature-store/simulator/Cargo.lock +++ b/integration_tests/feature-store/simulator/Cargo.lock @@ -19,15 +19,16 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "anstream" -version = "0.6.4" +version = "0.6.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ab91ebe16eb252986481c5b62f6098f3b698a45e34b5b98200cf20dd2484a44" +checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", + "is_terminal_polyfill", "utf8parse", ] @@ -52,17 +53,17 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b" dependencies = [ - "windows-sys", + "windows-sys 0.48.0", ] [[package]] name = "anstyle-wincon" -version = "3.0.1" +version = "3.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0699d10d2f4d628a98ee7b57b289abbc98ff3bad977cb3152709d4bf2330628" +checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125" dependencies = [ "anstyle", - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -508,6 +509,12 @@ dependencies = [ "hashbrown 0.14.3", ] +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + [[package]] name = "itertools" version = "0.11.0" @@ -564,7 +571,7 @@ checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" dependencies = [ "libc", "wasi", - "windows-sys", + "windows-sys 0.48.0", ] [[package]] @@ -797,7 +804,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4031e820eb552adee9295814c0ced9e5cf38ddf1e8b7d566d6de8e2538ea989e" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.48.0", ] [[package]] @@ -837,7 +844,7 @@ dependencies = [ "pin-project-lite", "socket2 0.5.4", "tokio-macros", - "windows-sys", + "windows-sys 0.48.0", ] [[package]] @@ -1037,7 +1044,16 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows-targets", + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", ] [[package]] @@ -1046,13 +1062,29 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", ] [[package]] @@ -1061,38 +1093,86 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + [[package]] name = "windows_aarch64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + [[package]] name = "windows_i686_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = 
"windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + [[package]] name = "windows_i686_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + [[package]] name = "windows_x86_64_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + [[package]] name = "windows_x86_64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" From 340a167d142899a9705725ac84782c5d276aabcc Mon Sep 17 00:00:00 2001 From: August Date: Fri, 10 Jan 2025 18:54:12 +0800 Subject: [PATCH 18/21] fix: synchronize the index on the mview when changing the owner or schema (#20093) --- e2e_test/ddl/alter_owner.slt | 18 +++++++ e2e_test/ddl/alter_set_schema.slt | 12 ++++- src/meta/src/controller/catalog/alter_op.rs | 58 +++++++++------------ 3 files changed, 53 insertions(+), 35 deletions(-) diff --git a/e2e_test/ddl/alter_owner.slt b/e2e_test/ddl/alter_owner.slt index 11df2419f538d..37b4a94073ccf 100644 --- a/e2e_test/ddl/alter_owner.slt +++ b/e2e_test/ddl/alter_owner.slt @@ -67,6 +67,9 @@ v user1 statement ok CREATE MATERIALIZED VIEW mv AS SELECT v1, (t.v2).v1 AS v21 FROM t; +statement ok +CREATE INDEX mv_idx ON mv(v1); + statement ok ALTER MATERIALIZED VIEW mv OWNER TO user1; @@ -85,6 +88,21 @@ WHERE ---- mv user1 +query TT +SELECT + pg_class.relname AS rel_name, + pg_roles.rolname AS owner +FROM + pg_class + JOIN pg_namespace ON pg_namespace.oid = pg_class.relnamespace + JOIN pg_roles ON pg_roles.oid = pg_class.relowner +WHERE + pg_namespace.nspname NOT LIKE 'pg_%' + AND pg_namespace.nspname != 'information_schema' + AND pg_class.relname = 'mv_idx'; +---- +mv_idx user1 + statement ok CREATE SOURCE src (v INT) WITH ( connector = 'datagen', diff --git a/e2e_test/ddl/alter_set_schema.slt b/e2e_test/ddl/alter_set_schema.slt index db0f479c85c05..3f32cd732e879 100644 --- a/e2e_test/ddl/alter_set_schema.slt +++ b/e2e_test/ddl/alter_set_schema.slt @@ -23,12 +23,21 @@ CREATE INDEX test_index1 ON test_table(u); statement ok CREATE INDEX test_index2 ON test_table(v); +statement ok +CREATE MATERIALIZED VIEW test_mv AS SELECT u FROM test_table; + 
+statement ok +CREATE INDEX test_mv_index ON test_mv(u); + statement ok ALTER TABLE test_table SET SCHEMA public; statement ok ALTER TABLE test_table SET SCHEMA test_schema; +statement ok +ALTER MATERIALIZED VIEW test_mv SET SCHEMA test_schema; + query TT SELECT tablename, schemaname FROM pg_tables WHERE schemaname = 'test_schema'; ---- @@ -39,6 +48,7 @@ SELECT indexname, schemaname FROM pg_indexes WHERE schemaname = 'test_schema'; ---- test_index1 test_schema test_index2 test_schema +test_mv_index test_schema statement ok CREATE SOURCE test_source (v INT) WITH ( @@ -104,7 +114,7 @@ statement ok DROP SOURCE test_schema.test_source; statement ok -DROP TABLE test_schema.test_table; +DROP TABLE test_schema.test_table cascade; statement ok DROP SCHEMA test_schema; diff --git a/src/meta/src/controller/catalog/alter_op.rs b/src/meta/src/controller/catalog/alter_op.rs index 6bdbee653bb69..43a9843ec05e1 100644 --- a/src/meta/src/controller/catalog/alter_op.rs +++ b/src/meta/src/controller/catalog/alter_op.rs @@ -331,20 +331,15 @@ impl CatalogController { } // indexes. - let (index_ids, mut table_ids): (Vec, Vec) = - if table.table_type == TableType::Table { - Index::find() - .select_only() - .columns([index::Column::IndexId, index::Column::IndexTableId]) - .filter(index::Column::PrimaryTableId.eq(object_id)) - .into_tuple::<(IndexId, TableId)>() - .all(&txn) - .await? - .into_iter() - .unzip() - } else { - (vec![], vec![]) - }; + let (index_ids, mut table_ids): (Vec, Vec) = Index::find() + .select_only() + .columns([index::Column::IndexId, index::Column::IndexTableId]) + .filter(index::Column::PrimaryTableId.eq(object_id)) + .into_tuple::<(IndexId, TableId)>() + .all(&txn) + .await? + .into_iter() + .unzip(); relations.push(PbRelationInfo::Table(ObjectModel(table, obj).into())); // internal tables. @@ -501,8 +496,7 @@ impl CatalogController { .await? .ok_or_else(|| MetaError::catalog_id_not_found("table", object_id))?; check_relation_name_duplicate(&table.name, database_id, new_schema, &txn).await?; - let (associated_src_id, table_type) = - (table.optional_associated_source_id, table.table_type); + let associated_src_id = table.optional_associated_source_id; let mut obj = obj.into_active_model(); obj.schema_id = Set(Some(new_schema)); @@ -531,24 +525,20 @@ impl CatalogController { let (index_ids, (index_names, mut table_ids)): ( Vec, (Vec, Vec), - ) = if table_type == TableType::Table { - Index::find() - .select_only() - .columns([ - index::Column::IndexId, - index::Column::Name, - index::Column::IndexTableId, - ]) - .filter(index::Column::PrimaryTableId.eq(object_id)) - .into_tuple::<(IndexId, String, TableId)>() - .all(&txn) - .await? - .into_iter() - .map(|(id, name, t_id)| (id, (name, t_id))) - .unzip() - } else { - (vec![], (vec![], vec![])) - }; + ) = Index::find() + .select_only() + .columns([ + index::Column::IndexId, + index::Column::Name, + index::Column::IndexTableId, + ]) + .filter(index::Column::PrimaryTableId.eq(object_id)) + .into_tuple::<(IndexId, String, TableId)>() + .all(&txn) + .await? + .into_iter() + .map(|(id, name, t_id)| (id, (name, t_id))) + .unzip(); // internal tables. 
let internal_tables: Vec = Table::find() From 4c384402b6c0ca3bd1804319d4b2baabc1df654e Mon Sep 17 00:00:00 2001 From: Li0k Date: Sat, 11 Jan 2025 03:38:19 +0800 Subject: [PATCH 19/21] =?UTF-8?q?fix(storage):=20Remove=20logic=20for=20de?= =?UTF-8?q?leting=20objects=20in=20truncate=5Ftime=5Ftrav=E2=80=A6=20(#201?= =?UTF-8?q?01)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/meta/src/hummock/manager/time_travel.rs | 22 ++------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/src/meta/src/hummock/manager/time_travel.rs b/src/meta/src/hummock/manager/time_travel.rs index 721d90e860310..e9886519c0485 100644 --- a/src/meta/src/hummock/manager/time_travel.rs +++ b/src/meta/src/hummock/manager/time_travel.rs @@ -108,18 +108,8 @@ impl HummockManager { txn.commit().await?; return Ok(()); }; - let ( - latest_valid_version_id, - latest_valid_version_sst_ids, - latest_valid_version_object_ids, - ) = { - ( - latest_valid_version.id, - latest_valid_version.get_sst_ids(), - latest_valid_version.get_object_ids(), - ) - }; - let mut object_ids_to_delete: HashSet<_> = HashSet::default(); + let (latest_valid_version_id, latest_valid_version_sst_ids) = + { (latest_valid_version.id, latest_valid_version.get_sst_ids()) }; let version_ids_to_delete: Vec = hummock_time_travel_version::Entity::find() .select_only() @@ -163,8 +153,6 @@ impl HummockManager { .filter(hummock_sstable_info::Column::SstId.is_in(sst_ids_to_delete)) .exec(&txn) .await?; - let new_object_ids = delta_to_delete.newly_added_object_ids(); - object_ids_to_delete.extend(&new_object_ids - &latest_valid_version_object_ids); tracing::debug!( delta_id = delta_to_delete.id.to_u64(), "delete {} rows from hummock_sstable_info", @@ -194,8 +182,6 @@ impl HummockManager { .filter(hummock_sstable_info::Column::SstId.is_in(sst_ids_to_delete)) .exec(&txn) .await?; - let new_object_ids = prev_version.get_object_ids(); - object_ids_to_delete.extend(&new_object_ids - &latest_valid_version_object_ids); tracing::debug!( prev_version_id, "delete {} rows from hummock_sstable_info", @@ -203,10 +189,6 @@ impl HummockManager { ); next_version_sst_ids = sst_ids; } - if !object_ids_to_delete.is_empty() { - self.gc_manager - .add_may_delete_object_ids(object_ids_to_delete.into_iter()); - } let res = hummock_time_travel_version::Entity::delete_many() .filter( From 80b3fd04a8b918e57b1be12cc34b98be20724484 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 11 Jan 2025 12:56:17 +0800 Subject: [PATCH 20/21] chore(deps): Bump the all group across 1 directory with 44 updates (#20037) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Eric Fu --- .../risingwave-sink-deltalake/pom.xml | 2 +- .../risingwave-sink-iceberg/pom.xml | 2 +- java/pom.xml | 26 +++++++++---------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/java/connector-node/risingwave-sink-deltalake/pom.xml b/java/connector-node/risingwave-sink-deltalake/pom.xml index c1918a4855c29..ea62a6a703a13 100644 --- a/java/connector-node/risingwave-sink-deltalake/pom.xml +++ b/java/connector-node/risingwave-sink-deltalake/pom.xml @@ -43,7 +43,7 @@ io.delta delta-standalone_2.12 - 3.2.1 + 3.3.0 org.apache.parquet diff --git a/java/connector-node/risingwave-sink-iceberg/pom.xml b/java/connector-node/risingwave-sink-iceberg/pom.xml index 49500676896b3..b4d2e87e404b1 100644 --- 
a/java/connector-node/risingwave-sink-iceberg/pom.xml +++ b/java/connector-node/risingwave-sink-iceberg/pom.xml @@ -16,7 +16,7 @@ risingwave-sink-iceberg - 1.7.0 + 1.7.1 11 11 true diff --git a/java/pom.xml b/java/pom.xml index c717cfc87a2e0..1a6060947f68c 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -65,16 +65,16 @@ UTF-8 3.25.5 - 1.68.1 - 2.24.2 + 1.69.0 + 2.24.3 2.0.16 2.6.2.Final - 2.18.1 + 2.18.2 3.4.1 1.20.4 - 2.29.20 - 12.0.15 - 1.14.4 + 2.29.45 + 12.0.16 + 1.15.0 @@ -107,7 +107,7 @@ org.apache.commons commons-text - 1.12.0 + 1.13.0 commons-io @@ -175,7 +175,7 @@ org.elasticsearch.client elasticsearch-rest-high-level-client - 7.17.25 + 7.17.26 org.opensearch.client @@ -219,7 +219,7 @@ org.assertj assertj-core - 3.26.3 + 3.27.2 test @@ -295,7 +295,7 @@ com.google.guava guava - 33.3.1-jre + 33.4.0-jre org.apache.httpcomponents @@ -310,7 +310,7 @@ org.xerial sqlite-jdbc - 3.47.0.0 + 3.47.2.0 software.amazon.awssdk @@ -398,7 +398,7 @@ com.puppycrawl.tools checkstyle - 10.20.1 + 10.21.1 @@ -491,7 +491,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.11.1 + 3.11.2 attach-javadocs From 326ec191614bfc34075604a9783de7a029cb36dc Mon Sep 17 00:00:00 2001 From: zwang28 <70626450+zwang28@users.noreply.github.com> Date: Sat, 11 Jan 2025 15:22:23 +0800 Subject: [PATCH 21/21] fix(meta): fix time travel GC bug (#20108) --- src/meta/src/hummock/manager/gc.rs | 35 ++++++--------------- src/meta/src/hummock/manager/time_travel.rs | 24 ++++++++++++-- src/meta/src/hummock/manager/versioning.rs | 26 +++++++++++++++ 3 files changed, 57 insertions(+), 28 deletions(-) diff --git a/src/meta/src/hummock/manager/gc.rs b/src/meta/src/hummock/manager/gc.rs index eba07b7002244..f423e1995b499 100644 --- a/src/meta/src/hummock/manager/gc.rs +++ b/src/meta/src/hummock/manager/gc.rs @@ -14,7 +14,6 @@ use std::cmp; use std::collections::HashSet; -use std::ops::Bound::{Excluded, Included}; use std::ops::DerefMut; use std::sync::atomic::{AtomicBool, Ordering}; use std::time::{Duration, SystemTime}; @@ -223,30 +222,8 @@ impl HummockManager { ) -> Result> { // This lock ensures `commit_epoch` and `report_compat_task` can see the latest GC history during sanity check. let versioning = self.versioning.read().await; - let tracked_object_ids: HashSet = { - let context_info = self.context_info.read().await; - // object ids in checkpoint version - let mut tracked_object_ids = versioning.checkpoint.version.get_object_ids(); - // add object ids added between checkpoint version and current version - for (_, delta) in versioning.hummock_version_deltas.range(( - Excluded(versioning.checkpoint.version.id), - Included(versioning.current_version.id), - )) { - tracked_object_ids.extend(delta.newly_added_object_ids()); - } - // add stale object ids before the checkpoint version - let min_pinned_version_id = context_info.min_pinned_version_id(); - tracked_object_ids.extend( - versioning - .checkpoint - .stale_objects - .iter() - .filter(|(version_id, _)| **version_id >= min_pinned_version_id) - .flat_map(|(_, objects)| objects.id.iter()) - .cloned(), - ); - tracked_object_ids - }; + let tracked_object_ids: HashSet = versioning + .get_tracked_object_ids(self.context_info.read().await.min_pinned_version_id()); let to_delete = object_ids.filter(|object_id| !tracked_object_ids.contains(object_id)); self.write_gc_history(to_delete.clone()).await?; Ok(to_delete.collect()) @@ -556,9 +533,15 @@ impl HummockManager { }; // Objects pinned by either meta backup or time travel should be filtered out. 
let backup_pinned: HashSet<_> = backup_manager.list_pinned_ssts(); + // The version_pinned is obtained after the candidate object_ids for deletion, which is new enough for filtering purpose. + let version_pinned = { + let versioning = self.versioning.read().await; + versioning + .get_tracked_object_ids(self.context_info.read().await.min_pinned_version_id()) + }; let object_ids = object_ids .into_iter() - .filter(|s| !backup_pinned.contains(s)); + .filter(|s| !version_pinned.contains(s) && !backup_pinned.contains(s)); let object_ids = self.filter_out_objects_by_time_travel(object_ids).await?; // Retry is not necessary. Full GC will handle these objects eventually. self.delete_objects(object_ids.into_iter().collect()) diff --git a/src/meta/src/hummock/manager/time_travel.rs b/src/meta/src/hummock/manager/time_travel.rs index e9886519c0485..a31fe4d82fafb 100644 --- a/src/meta/src/hummock/manager/time_travel.rs +++ b/src/meta/src/hummock/manager/time_travel.rs @@ -108,8 +108,18 @@ impl HummockManager { txn.commit().await?; return Ok(()); }; - let (latest_valid_version_id, latest_valid_version_sst_ids) = - { (latest_valid_version.id, latest_valid_version.get_sst_ids()) }; + let ( + latest_valid_version_id, + latest_valid_version_sst_ids, + latest_valid_version_object_ids, + ) = { + ( + latest_valid_version.id, + latest_valid_version.get_sst_ids(), + latest_valid_version.get_object_ids(), + ) + }; + let mut object_ids_to_delete: HashSet<_> = HashSet::default(); let version_ids_to_delete: Vec = hummock_time_travel_version::Entity::find() .select_only() @@ -153,6 +163,8 @@ impl HummockManager { .filter(hummock_sstable_info::Column::SstId.is_in(sst_ids_to_delete)) .exec(&txn) .await?; + let new_object_ids = delta_to_delete.newly_added_object_ids(); + object_ids_to_delete.extend(&new_object_ids - &latest_valid_version_object_ids); tracing::debug!( delta_id = delta_to_delete.id.to_u64(), "delete {} rows from hummock_sstable_info", @@ -182,6 +194,8 @@ impl HummockManager { .filter(hummock_sstable_info::Column::SstId.is_in(sst_ids_to_delete)) .exec(&txn) .await?; + let new_object_ids = prev_version.get_object_ids(); + object_ids_to_delete.extend(&new_object_ids - &latest_valid_version_object_ids); tracing::debug!( prev_version_id, "delete {} rows from hummock_sstable_info", @@ -189,6 +203,12 @@ impl HummockManager { ); next_version_sst_ids = sst_ids; } + if !object_ids_to_delete.is_empty() { + // IMPORTANT: object_ids_to_delete may include objects that are still being used by SSTs not included in time travel metadata. + // So it's crucial to filter out those objects before actually deleting them, i.e. when using `try_take_may_delete_object_ids`. + self.gc_manager + .add_may_delete_object_ids(object_ids_to_delete.into_iter()); + } let res = hummock_time_travel_version::Entity::delete_many() .filter( diff --git a/src/meta/src/hummock/manager/versioning.rs b/src/meta/src/hummock/manager/versioning.rs index bdd86a4f3b803..07d99992e10b1 100644 --- a/src/meta/src/hummock/manager/versioning.rs +++ b/src/meta/src/hummock/manager/versioning.rs @@ -13,6 +13,7 @@ // limitations under the License. 
use std::cmp; +use std::collections::Bound::{Excluded, Included}; use std::collections::{BTreeMap, HashMap, HashSet}; use itertools::Itertools; @@ -82,6 +83,31 @@ impl Versioning { pub(super) fn mark_next_time_travel_version_snapshot(&mut self) { self.time_travel_snapshot_interval_counter = u64::MAX; } + + pub fn get_tracked_object_ids( + &self, + min_pinned_version_id: HummockVersionId, + ) -> HashSet { + // object ids in checkpoint version + let mut tracked_object_ids = self.checkpoint.version.get_object_ids(); + // add object ids added between checkpoint version and current version + for (_, delta) in self.hummock_version_deltas.range(( + Excluded(self.checkpoint.version.id), + Included(self.current_version.id), + )) { + tracked_object_ids.extend(delta.newly_added_object_ids()); + } + // add stale object ids before the checkpoint version + tracked_object_ids.extend( + self.checkpoint + .stale_objects + .iter() + .filter(|(version_id, _)| **version_id >= min_pinned_version_id) + .flat_map(|(_, objects)| objects.id.iter()) + .cloned(), + ); + tracked_object_ids + } } impl HummockManager {
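The hummock GC changes in this series (removing object deletion from time-travel truncation and the final time-travel GC fix) converge on one rule: an object id may be deleted only after it has been filtered against everything that can still reference it, namely objects tracked by the checkpoint and newer version deltas, objects pinned by meta backups, and objects recorded in time-travel metadata. The sketch below is a minimal model of that set filtering with plain integer ids and std HashSets; the function name and signature are illustrative assumptions, not RisingWave APIs.

    use std::collections::HashSet;

    // An object id survives GC filtering only if no tracked version, backup,
    // or time-travel snapshot still references it.
    fn filter_deletable(
        candidates: impl IntoIterator<Item = u64>,
        version_tracked: &HashSet<u64>,
        backup_pinned: &HashSet<u64>,
        time_travel_pinned: &HashSet<u64>,
    ) -> Vec<u64> {
        candidates
            .into_iter()
            .filter(|id| {
                !version_tracked.contains(id)
                    && !backup_pinned.contains(id)
                    && !time_travel_pinned.contains(id)
            })
            .collect()
    }

    fn main() {
        let tracked: HashSet<u64> = HashSet::from([1, 2]);
        let backup: HashSet<u64> = HashSet::from([3]);
        let time_travel: HashSet<u64> = HashSet::from([4]);
        // Only object 5 is unreferenced everywhere, so only it may be deleted.
        assert_eq!(
            filter_deletable([1, 3, 4, 5], &tracked, &backup, &time_travel),
            vec![5]
        );
    }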