Merge remote-tracking branch 'origin' into tab/drop-table-connector
tabversion committed Jan 7, 2025
2 parents f008d79 + f530c10 commit 6d2f2d5
Showing 52 changed files with 2,101 additions and 735 deletions.
274 changes: 59 additions & 215 deletions Cargo.lock

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion Cargo.toml
@@ -377,7 +377,6 @@ tokio-postgres = { git = "https://github.com/madsim-rs/rust-postgres.git", rev =
# sqlx version: v0.8.2
# patch diffs: https://github.com/madsim-rs/sqlx/pull/4
sqlx = { git = "https://github.com/madsim-rs/sqlx.git", rev = "3efe6d0065963db2a2b7f30dee69f647e28dec81" }
-futures-timer = { git = "https://github.com/madsim-rs/futures-timer.git", rev = "05b33b4" }
# patch to remove preserve_order from serde_json
bson = { git = "https://github.com/risingwavelabs/bson-rust", rev = "e5175ec" }
# TODO: unpatch after PR merged https://github.com/tokio-rs/prost/pull/1210
6 changes: 6 additions & 0 deletions ci/scripts/check.sh
@@ -23,6 +23,12 @@ configure_static_openssl
echo "--- Run trailing spaces check"
scripts/check/check-trailing-spaces.sh

echo "--- Check protobuf code format && Lint protobuf"
cd proto
buf format -d --exit-code
buf lint
cd ..

echo "--- Rust cargo-sort check"
cargo sort --check --workspace --grouped

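Note: buf format -d --exit-code prints a diff for any protobuf file that is not already formatted and exits non-zero, failing this CI step early; buf lint then checks the sources against the lint rules configured for the proto module.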
5 changes: 5 additions & 0 deletions ci/scripts/common.sh
@@ -20,6 +20,11 @@ export RW_SECRET_STORE_PRIVATE_KEY_HEX="0123456789abcdef0123456789abcdef"
unset LANG

function dump_diagnose_info() {
ret=$?
if [ $ret -eq 0 ]; then
exit 0
fi

echo "^^^ +++"
echo "--- Failed to run command! Dumping diagnose info..."
if [ -f .risingwave/config/risedev-env ]; then
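The new guard makes dump_diagnose_info a no-op when the script exits cleanly. A minimal sketch of the intended flow, assuming the function is registered as an EXIT trap (the registration is outside this hunk and is an assumption here):

function dump_diagnose_info() {
  ret=$?                        # exit status of the failing command/script
  if [ $ret -eq 0 ]; then
    exit 0                      # clean exit: skip the diagnostics below
  fi
  echo "^^^ +++"
  echo "--- Failed to run command! Dumping diagnose info..."
  # ... dump .risingwave/config/risedev-env, logs, etc.
}
trap dump_diagnose_info EXIT    # assumed wiring, not shown in this diff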
2 changes: 1 addition & 1 deletion ci/scripts/e2e-kafka-sink-test.sh
@@ -148,7 +148,7 @@ rpk topic delete test-rw-sink-debezium

# test different encoding
echo "preparing confluent schema registry"
-python3 -m pip install --break-system-packages requests confluent-kafka
+python3 -m pip install --break-system-packages -r e2e_test/requirements.txt

echo "testing protobuf"
risedev slt 'e2e_test/sink/kafka/protobuf.slt'
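The inline package list (previously requests and confluent-kafka) moves into a shared e2e_test/requirements.txt; presumably that file pins at least those two packages so every e2e script installs the same versions.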
11 changes: 0 additions & 11 deletions ci/scripts/misc-check.sh

This file was deleted.

7 changes: 6 additions & 1 deletion ci/scripts/run-backfill-tests.sh
@@ -34,7 +34,7 @@ else
RUNTIME_CLUSTER_PROFILE='ci-backfill-3cn-1fe-with-monitoring'
MINIO_RATE_LIMIT_CLUSTER_PROFILE='ci-backfill-3cn-1fe-with-monitoring-and-minio-rate-limit'
fi
export RUST_LOG="info,risingwave_stream=info,risingwave_batch=info,risingwave_storage=info" \
export RUST_LOG="info,risingwave_stream=info,risingwave_stream::executor::backfill=debug,risingwave_batch=info,risingwave_storage=info,risingwave_meta::barrier=debug" \

run_sql_file() {
psql -h localhost -p 4566 -d dev -U root -f "$@"
@@ -304,6 +304,11 @@ test_snapshot_backfill() {

wait

+TEST_NAME=nexmark_q3 sqllogictest -p 4566 -d dev 'e2e_test/backfill/snapshot_backfill/scale.slt' &
+TEST_NAME=nexmark_q7 sqllogictest -p 4566 -d dev 'e2e_test/backfill/snapshot_backfill/scale.slt' &
+
+wait

TEST_NAME=nexmark_q3 sqllogictest -p 4566 -d dev 'e2e_test/backfill/snapshot_backfill/drop_mv.slt' &
TEST_NAME=nexmark_q7 sqllogictest -p 4566 -d dev 'e2e_test/backfill/snapshot_backfill/drop_mv.slt' &

15 changes: 0 additions & 15 deletions ci/workflows/pull-request.yml
@@ -656,21 +656,6 @@ steps:
cancel_on_build_failing: true
retry: *auto-retry

- label: "misc check"
command: "ci/scripts/misc-check.sh"
if: |
!(build.pull_request.labels includes "ci/pr/run-selected") && build.env("CI_STEPS") == null
|| build.pull_request.labels includes "ci/run-misc-check"
|| build.env("CI_STEPS") =~ /(^|,)misc-check(,|$$)/
plugins:
- docker-compose#v5.5.0:
run: rw-build-env
config: ci/docker-compose.yml
- shellcheck#v1.2.0:
files: ./**/*.sh
timeout_in_minutes: 5
retry: *auto-retry

# The following jobs are triggered only when PR has corresponding labels.

# Generates cpu flamegraph env
2 changes: 1 addition & 1 deletion docker/dashboards/risingwave-dev-dashboard.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docker/dashboards/risingwave-user-dashboard.json

Large diffs are not rendered by default.

2 changes: 0 additions & 2 deletions docs/dev/src/tests/intro.md
@@ -16,8 +16,6 @@ RisingWave requires all code to pass fmt, clippy, sort and hakari checks. Run th
./risedev c # Run all checks. Shortcut for ./risedev check
```

-There are also some miscellaneous checks. See `ci/scripts/misc-check.sh`.

## Unit and integration tests

RiseDev runs unit tests with cargo-nextest. To run unit tests:
15 changes: 15 additions & 0 deletions e2e_test/backfill/snapshot_backfill/scale.slt
@@ -0,0 +1,15 @@
control substitution on

statement ok
alter materialized view ${TEST_NAME}_mv set parallelism to 1;

sleep 3s

include ./check_data_equal.slt.part

statement ok
alter materialized view ${TEST_NAME}_mv set parallelism to 4;

sleep 3s

include ./check_data_equal.slt.part
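This new test exercises rescaling of a snapshot-backfilled materialized view: it alters the MV's parallelism to 1 and then to 4, re-running the shared data-equality check after each change. run-backfill-tests.sh above launches it once per nexmark query through the TEST_NAME substitution.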
55 changes: 55 additions & 0 deletions e2e_test/s3/file_sink.py
@@ -62,6 +62,46 @@ def do_test(config, file_num, item_num_per_file, prefix):
def _table():
return 's3_test_parquet'

print("test table function file scan")
cur.execute(f'''
SELECT
id,
name,
sex,
mark,
test_int,
test_int8,
test_uint8,
test_uint16,
test_uint32,
test_uint64,
test_float_16,
test_real,
test_double_precision,
test_varchar,
test_bytea,
test_date,
test_time,
test_timestamp_s,
test_timestamp_ms,
test_timestamp_us,
test_timestamp_ns,
test_timestamptz_s,
test_timestamptz_ms,
test_timestamptz_us,
test_timestamptz_ns
FROM file_scan(
'parquet',
's3',
'http://127.0.0.1:9301',
'hummockadmin',
'hummockadmin',
's3://hummock001/test_file_scan/test_file_scan.parquet'
);''')
result = cur.fetchone()
assert result[0] == 0, f'file scan assertion failed: the first column is {result[0]}, expect 0.'

print("file scan test pass")
# Execute a SELECT statement
cur.execute(f'''CREATE TABLE {_table()}(
id bigint primary key,
@@ -491,6 +531,21 @@ def _assert_greater(field, got, expect):
_s3(idx),
_local(idx)
)
# put parquet file to test table function file scan
if data:
first_file_data = data[0]
first_table = pa.Table.from_pandas(pd.DataFrame(first_file_data))

first_file_name = f"test_file_scan.parquet"
first_file_path = f"test_file_scan/{first_file_name}"

pq.write_table(first_table, "data_0.parquet")

client.fput_object(
"hummock001",
first_file_path,
"data_0.parquet"
)

# do test
do_test(config, FILE_NUM, ITEM_NUM_PER_FILE, run_id)
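As used in the query above, file_scan takes six positional arguments in order: the file format ('parquet'), the storage type ('s3'), the endpoint URL, the access key, the secret key, and the object path. The test only asserts that the first column of the first returned row is 0.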
4 changes: 2 additions & 2 deletions e2e_test/source_inline/pubsub/prepare-data.rs
@@ -2,8 +2,8 @@
---cargo
[dependencies]
anyhow = "1"
google-cloud-googleapis = { version = "0.13", features = ["pubsub"] }
google-cloud-pubsub = "0.25"
google-cloud-googleapis = { version = "0.16", features = ["pubsub"] }
google-cloud-pubsub = "0.30"
tokio = { version = "0.2", package = "madsim-tokio", features = [
"rt",
"rt-multi-thread",
108 changes: 108 additions & 0 deletions grafana/risingwave-dev-dashboard.dashboard.py
@@ -4661,6 +4661,113 @@ def section_udf(outer_panels):
)
]

def section_alert_overview(panels):
return [
panels.row("Alert Overview"),
panels.timeseries_count(
"Alerts",
"""Alerts in the system group by type:
- Too Many Barriers: there are too many uncommitted barriers generated. This means the streaming graph is stuck or under heavy load. Check 'Barrier Latency' panel.
- Recovery Triggered: cluster recovery is triggered. Check 'Errors by Type' / 'Node Count' panels.
- Lagging Version: the checkpointed or pinned version id is lagging behind the current version id. Check 'Hummock Manager' section in dev dashboard.
- Lagging Compaction: there are too many ssts in L0. This can be caused by compactor failure or lag of compactor resource. Check 'Compaction' section in dev dashboard, and take care of the type of 'Commit Flush Bytes' and 'Compaction Throughput', whether the throughput is too low.
- Lagging Vacuum: there are too many stale files waiting to be cleaned. This can be caused by compactor failure or lag of compactor resource. Check 'Compaction' section in dev dashboard.
- Abnormal Meta Cache Memory: the meta cache memory usage is too large, exceeding the expected 10 percent.
- Abnormal Block Cache Memory: the block cache memory usage is too large, exceeding the expected 10 percent.
- Abnormal Uploading Memory Usage: uploading memory is more than 70 percent of the expected, and is about to spill.
- Write Stall: Compaction cannot keep up. Stall foreground write, Check 'Compaction' section in dev dashboard.
- Abnormal Version Size: the size of the version is too large, exceeding the expected 300MB. Check 'Hummock Manager' section in dev dashboard.
- Abnormal Delta Log Number: the number of delta logs is too large, exceeding the expected 5000. Check 'Hummock Manager' and `Compaction` section in dev dashboard and take care of the type of 'Compaction Success Count', whether the number of trivial-move tasks spiking.
- Abnormal Pending Event Number: the number of pending events is too large, exceeding the expected 10000000. Check 'Hummock Write' section in dev dashboard and take care of the 'Event handle latency', whether the time consumed exceeds the barrier latency.
- Abnormal Object Storage Failure: the number of object storage failures is too large, exceeding the expected 50. Check 'Object Storage' section in dev dashboard and take care of the 'Object Storage Failure Rate', whether the rate is too high.
""",
[
panels.target(
f"{metric('all_barrier_nums')} >= bool 200",
"Too Many Barriers {{database_id}}",
),
panels.target(
f"sum(rate({metric('recovery_latency_count')}[$__rate_interval])) > bool 0 + sum({metric('recovery_failure_cnt')}) > bool 0",
"Recovery Triggered",
),
panels.target(
f"(({metric('storage_current_version_id')} - {metric('storage_checkpoint_version_id')}) >= bool 100) + "
+ f"(({metric('storage_current_version_id')} - {metric('storage_min_pinned_version_id')}) >= bool 100)",
"Lagging Version",
),
panels.target(
f"sum(label_replace({metric('storage_level_total_file_size')}, 'L0', 'L0', 'level_index', '.*_L0') unless "
+ f"{metric('storage_level_total_file_size')}) by (L0) >= bool 52428800",
"Lagging Compaction",
),
panels.target(
f"{metric('storage_stale_object_count')} >= bool 200",
"Lagging Vacuum",
),
panels.target(
f"{metric('state_store_meta_cache_usage_ratio')} >= bool 1.1",
"Abnormal Meta Cache Memory",
),
panels.target(
f"{metric('state_store_block_cache_usage_ratio')} >= bool 1.1",
"Abnormal Block Cache Memory",
),
panels.target(
f"{metric('state_store_uploading_memory_usage_ratio')} >= bool 0.7",
"Abnormal Uploading Memory Usage",
),
panels.target(
f"{metric('storage_write_stop_compaction_groups')} > bool 0",
"Write Stall",
),
panels.target(
f"{metric('storage_version_size')} >= bool 314572800",
"Abnormal Version Size",
),
panels.target(
f"{metric('storage_delta_log_count')} >= bool 5000",
"Abnormal Delta Log Number",
),
panels.target(
f"{metric('state_store_event_handler_pending_event')} >= bool 10000000",
"Abnormal Pending Event Number",
),
panels.target(
f"{metric('object_store_failure_count')} >= bool 50",
"Abnormal Object Storage Failure",
),
],
["last"],
),
panels.timeseries_count(
"Errors",
"Errors in the system group by type",
[
panels.target(
f"sum({metric('user_compute_error')}) by (error_type, executor_name, fragment_id)",
"{{error_type}} @ {{executor_name}} (fragment_id={{fragment_id}})",
),
panels.target(
f"sum({metric('user_source_error')}) by (error_type, source_id, source_name, fragment_id)",
"{{error_type}} @ {{source_name}} (source_id={{source_id}} fragment_id={{fragment_id}})",
),
panels.target(
f"sum({metric('user_sink_error')}) by (error_type, sink_id, sink_name, fragment_id)",
"{{error_type}} @ {{sink_name}} (sink_id={{sink_id}} fragment_id={{fragment_id}})",
),
panels.target(
f"{metric('source_status_is_up')} == 0",
"source error: source_id={{source_id}}, source_name={{source_name}} @ {{%s}}"
% NODE_LABEL,
),
panels.target(
f"sum(rate({metric('object_store_failure_count')}[$__rate_interval])) by ({NODE_LABEL}, {COMPONENT_LABEL}, type)",
"remote storage error {{type}}: {{%s}} @ {{%s}}"
% (COMPONENT_LABEL, NODE_LABEL),
),
],
),
]

templating_list = []
if dynamic_source_enabled:
@@ -4840,5 +4947,6 @@ def section_udf(outer_panels):
*section_network_connection(panels),
*section_iceberg_metrics(panels),
*section_udf(panels),
*section_alert_overview(panels),
],
).auto_panel_ids()
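All of the alert expressions above rely on PromQL's bool modifier: a comparison such as ">= bool 200" returns 0 or 1 instead of filtering series away, so each alert target renders as a flat 0/1 line that flips to 1 while its condition holds.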
2 changes: 1 addition & 1 deletion grafana/risingwave-dev-dashboard.json

Large diffs are not rendered by default.

33 changes: 24 additions & 9 deletions grafana/risingwave-user-dashboard.dashboard.py
@@ -131,13 +131,16 @@ def section_overview(panels):
- Too Many Barriers: there are too many uncommitted barriers generated. This means the streaming graph is stuck or under heavy load. Check the 'Barrier Latency' panel.
- Recovery Triggered: cluster recovery is triggered. Check the 'Errors by Type' / 'Node Count' panels.
- Lagging Version: the checkpointed or pinned version id is lagging behind the current version id. Check the 'Hummock Manager' section in the dev dashboard.
-- Lagging Epoch: the pinned or safe epoch is lagging behind the current max committed epoch. Check 'Hummock Manager' section in dev dashboard.
-- Lagging Compaction: there are too many files in L0. This can be caused by compactor failure or lag of compactor resource. Check 'Compaction' section in dev dashboard.
+- Lagging Compaction: there are too many SSTs in L0. This can be caused by compactor failure or insufficient compactor resources. Check the 'Compaction' section in the dev dashboard, and compare 'Commit Flush Bytes' with 'Compaction Throughput' to see whether throughput is too low.
- Lagging Vacuum: there are too many stale files waiting to be cleaned. This can be caused by compactor failure or insufficient compactor resources. Check the 'Compaction' section in the dev dashboard.
- Abnormal Meta Cache Memory: the meta cache memory usage is too large, exceeding the expected 10 percent.
- Abnormal Block Cache Memory: the block cache memory usage is too large, exceeding the expected 10 percent.
- Abnormal Uploading Memory Usage: uploading memory usage is more than 70 percent of expected and is about to spill.
-- Write Stall: Compaction cannot keep up. Stall foreground write.
+- Write Stall: compaction cannot keep up, so foreground writes are stalled. Check the 'Compaction' section in the dev dashboard.
+- Abnormal Version Size: the size of the version is too large, exceeding the expected 300MB. Check the 'Hummock Manager' section in the dev dashboard.
+- Abnormal Delta Log Number: the number of delta logs is too large, exceeding the expected 5000. Check the 'Hummock Manager' and 'Compaction' sections in the dev dashboard, and watch 'Compaction Success Count' for a spike in trivial-move tasks.
+- Abnormal Pending Event Number: the number of pending events is too large, exceeding the expected 10000000. Check the 'Hummock Write' section in the dev dashboard and see whether 'Event handle latency' exceeds the barrier latency.
+- Abnormal Object Storage Failure: the number of object storage failures is too large, exceeding the expected 50. Check the 'Object Storage' section in the dev dashboard and see whether 'Object Storage Failure Rate' is too high.
""",
[
panels.target(
@@ -154,12 +157,8 @@ def section_overview(panels):
"Lagging Version",
),
-panels.target(
-f"(({metric('storage_max_committed_epoch')} - {metric('storage_min_pinned_epoch')}) >= bool 6553600000 unless + {metric('storage_min_pinned_epoch')} == 0)",
-"Lagging Epoch",
-),
panels.target(
-f"sum(label_replace({metric('storage_level_sst_num')}, 'L0', 'L0', 'level_index', '.*_L0') unless "
-+ f"{metric('storage_level_sst_num')}) by (L0) >= bool 200",
+f"sum(label_replace({metric('storage_level_total_file_size')}, 'L0', 'L0', 'level_index', '.*_L0') unless "
++ f"{metric('storage_level_total_file_size')}) by (L0) >= bool 52428800",
"Lagging Compaction",
),
panels.target(
@@ -182,6 +181,22 @@
f"{metric('storage_write_stop_compaction_groups')} > bool 0",
"Write Stall",
),
+panels.target(
+f"{metric('storage_version_size')} >= bool 314572800",
+"Abnormal Version Size",
+),
+panels.target(
+f"{metric('storage_delta_log_count')} >= bool 5000",
+"Abnormal Delta Log Number",
+),
+panels.target(
+f"{metric('state_store_event_handler_pending_event')} >= bool 10000000",
+"Abnormal Pending Event Number",
+),
+panels.target(
+f"{metric('object_store_failure_count')} >= bool 50",
+"Abnormal Object Storage Failure",
+),
],
["last"],
),
2 changes: 1 addition & 1 deletion grafana/risingwave-user-dashboard.json

Large diffs are not rendered by default.

(Remaining file diffs not shown.)
