Commit

WIP
anton-bobkov committed Feb 10, 2025
1 parent 8b9ae32 commit 283b0d4
Showing 43 changed files with 321 additions and 0 deletions.
192 changes: 192 additions & 0 deletions ydb/docs/en/core/troubleshooting/examples/overloaded-shard-1.md
@@ -0,0 +1,192 @@
# Overloaded shard example

You were notified that your system started taking too long to process user requests.

## Initial problem

Let's take a look at the **Latency** diagrams in the [DB overview](../../reference/observability/metrics/grafana-dashboards.md#dboverview) Grafana dashboard to see if the problem has to do with the {{ ydb-short-name }} cluster:


![DB Overview > Latencies > R tx server latency percentiles](_assets/overloaded-shard-1/incident-grafana-latency-percentiles.png)

![DB Overview > Latencies > Read only tx server latency](_assets/overloaded-shard-1/incident-grafana-latencies.png)

Indeed, the latencies have increased. Now we need to localize the problem.

## Diagnostics

Let's find out why the latencies have increased. Perhaps the workload has grown? Here is the **Requests** diagram from the **API details** section of the [DB overview](../../reference/observability/metrics/grafana-dashboards.md#dboverview) Grafana dashboard:

![API details](./_assets/overloaded-shard-1/incident-grafana-api-section-requests.png)

<!--![API details](./_assets/overloaded-shard-1/incident-grafana-api-section-request-size.png)
![API details](./_assets/overloaded-shard-1/incident-grafana-api-section-response-size.png)-->

The number of user requests has definitely increased. But can {{ ydb-short-name }} handle the increased load without additional hardware resources?

The CPU load has also increased, as you can see on the **CPU by execution pool** diagram:

![CPU](./_assets/overloaded-shard-1/incident-grafana-cpu-by-execution-pool.png)

{% cut "See the details on the CPU Grafana dashboard" %}

If we take a look at the **CPU** Grafana dashboard, we can see that CPU usage has increased in the user pool and in the interconnect pool:

![CPU](./_assets/overloaded-shard-1/incident-grafana-cpu-dashboard-user-pool-by-actors.png)

![CPU](./_assets/overloaded-shard-1/incident-grafana-cpu-dashboard-ic-pool.png)

![CPU](./_assets/overloaded-shard-1/incident-grafana-cpu-dashboard-ic-pool-by-host.png)

{% endcut %}

We can also see the overall CPU usage on the **Diagnostics** tab of the [Embedded UI](../../reference/embedded-ui/index.md):

![CPU diagnostics](./_assets/overloaded-shard-1/incident-ui-cpu-usage.png)

It looks like the {{ ydb-short-name }} cluster is not utilizing all of its CPU capacity.

If we look at the **DataShard** and **DataShard details** sections of the [DB overview](../../reference/observability/metrics/grafana-dashboards.md#dboverview) Grafana dashboard, we can see that after the load on the cluster increased, one of its data shards got overloaded.

![Throughput](./_assets/overloaded-shard-1/incident-grafana-throughput-rows.png)

![Shard distribution by load](./_assets/overloaded-shard-1/incident-grafana-shard-distribution-by-workload.png)

![Overloaded shard](./_assets/overloaded-shard-1/incident-grafana-overloaded-shards.png)

To determine what table the overloaded data shard is processing, let's open the **Diagnostics > Top shards** tab in the Embedded UI:

![Diagnostics > shards](./_assets/overloaded-shard-1/incident-ui-top-shards.png)

We can see that one of the data shards processing queries for the `kv_test` table is 67% loaded.

Let's take a look at the `kv_test` table on the **Info** tab:

![kv_test table info](./_assets/overloaded-shard-1/incident-ui-table-info.png)

{% note warning %}

The `kv_test` table was created with partitioning by load disabled, so it has only one partition.

This means that a single data shard processes all requests to this table, and a data shard can execute only one request at a time. This is really bad practice.

{% endnote %}

## Solution

We should enable automatic partitioning by load for the `kv_test` table:

1. In the Embedded UI, select the database.
2. Open the **Query** tab.
3. Run the following query (a verification sketch follows this list):

    ```sql
    ALTER TABLE kv_test SET (
        AUTO_PARTITIONING_BY_LOAD = ENABLED
    );
    ```
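
To confirm that the change has taken effect, you can also check the partitions and their load directly from a query. The following is a minimal sketch: it assumes the `.sys/partition_stats` system view and the column names used below are available in your {{ ydb-short-name }} version.

```sql
-- Hedged sketch: assumes the `.sys/partition_stats` system view and these
-- columns (Path, PartIdx, CPUCores, DataSize, RowCount) exist in your version.
SELECT
    Path,      -- table path
    PartIdx,   -- partition (data shard) index
    CPUCores,  -- CPU consumed by the partition, in cores
    DataSize,  -- partition size in bytes
    RowCount   -- number of rows in the partition
FROM `.sys/partition_stats`
ORDER BY CPUCores DESC
LIMIT 10;
```

This lists the ten hottest partitions in the database. Once partitioning by load kicks in, `kv_test` should appear with more than one partition and no single partition consuming most of the CPU.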

## Aftermath

As soon as we enabled automatic partitioning for the `kv_test` table, the overloaded data shard split in two.

![shard distribution by load](./_assets/overloaded-shard-1/aftermath-grafana-shard-distribution-by-workload.png)

![overloaded shard count](./_assets/overloaded-shard-1/aftermath-grafana-overloaded-shards.png)

Two data shards now process queries to the `kv_test` table, and neither of them is overloaded:

![overloaded shard count](./_assets/overloaded-shard-1/aftermath-ui-top-shards.png)

Let's make sure the latencies are back to normal:

![final latency percentiles](./_assets/overloaded-shard-1/aftermath-grafana-latency-percentiles.png)

![final latencies](./_assets/overloaded-shard-1/aftermath-grafana-latencies.png)

The latencies are almost as low as they were before the load increased. We did not add any hardware resources; we only enabled partitioning by load.

## Testbed

### Topology

For this example, we used a {{ ydb-short-name }} cluster consisting of three servers running Ubuntu 22.04 LTS.

```mermaid
flowchart
subgraph client[Client VM]
cli(YDB CLI)
end
client-->cluster
subgraph cluster["YDB Cluster"]
direction TB
subgraph VM1["VM 1"]
node1(YDB database node 1)
node2(YDB database node 2)
node3(YDB database node 3)
node4(YDB storage node 1)
end
subgraph VM2["VM 2"]
node5(YDB database node 1)
node6(YDB database node 2)
node7(YDB database node 3)
node8(YDB storage node 1)
end
subgraph VM3["VM 3"]
node9(YDB database node 1)
node10(YDB database node 2)
node11(YDB database node 3)
node12(YDB storage node 1)
end
end
classDef storage-node fill:#D0FEFE
classDef database-node fill:#98FB98
class node4,node8,node12 storage-node
class node1,node2,node3,node5,node6,node7,node9,node10,node11 database-node
```

### Hardware configuration

Each virtual machine has the following computing resources:

- Platform: Intel Broadwell
- Guaranteed vCPU performance: 100%
- vCPU: 28
- RAM: 32 GB

### Test

The load on the {{ ydb-short-name }} cluster was generated with the `ydb workload` CLI command. For more information, see [{#T}](../../reference/ydb-cli/commands/workload/index.md).

We performed the following steps:

1. Initialize the tables for the workload test:

    ```shell
    ydb workload kv init --min-partitions 1 --auto-partition 0
    ```

    We deliberately disabled automatic partitioning for the created tables by using the `--min-partitions 1 --auto-partition 0` options.

1. Emulate the standard workload on the {{ ydb-short-name }} cluster:

    ```shell
    ydb workload kv run select -s 600 -t 100
    ```

    We ran a simple load that uses the {{ ydb-short-name }} database as a key-value store. Specifically, we used the `select` load to generate SELECT queries that return rows based on an exact match of the primary key.

    The `-t 100` parameter runs the test in 100 threads.

1. Overload the {{ ydb-short-name }} cluster:

    ```shell
    ydb workload kv run select -s 1200 -t 250
    ```

    To simulate the overload, as soon as the first test ended, we ran the same load test in 250 threads, emulating a 2.5× increase in workload. The whole sequence is sketched as a single script after this list.
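
For convenience, here is the same sequence as one script. This is a sketch only: it assumes the `ydb` CLI is installed and that an active profile (or explicit `--endpoint`/`--database` options) points at the test cluster; the commands themselves are exactly the ones from the steps above.

```shell
#!/usr/bin/env bash
# Sketch of the test sequence above. Assumes an active `ydb` CLI profile
# (or explicit --endpoint/--database options) pointing at the test cluster.
set -euo pipefail

# 1. Create the kv workload tables with automatic partitioning disabled.
ydb workload kv init --min-partitions 1 --auto-partition 0

# 2. Baseline load: SELECT queries by primary key, 100 threads for 600 seconds.
ydb workload kv run select -s 600 -t 100

# 3. Overload: the same test with 250 threads for 1200 seconds.
ydb workload kv run select -s 1200 -t 250
```
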
124 changes: 124 additions & 0 deletions ydb/docs/en/core/troubleshooting/examples/overloaded-shard.md
@@ -0,0 +1,124 @@
# Overloaded shard example

You were notified that your system started taking too long to process user requests.

## Initial problem

Let's take a look at the **Latency** diagrams in the [DB overview](../../reference/observability/metrics/grafana-dashboards.md#dboverview) Grafana dashboard to see if the problem has to do with the {{ ydb-short-name }} cluster:


![DB Overview > Latencies > RW tx server latency](_assets/overloaded-shard/incident-rw-latency.png)

![DB Overview > Latencies > RW tx server latency percentiles](_assets/overloaded-shard/incident-latency-percentiles.png)

Indeed, the latencies have increased. Now we need to localize the problem.

## Diagnostics

Let's find out why the latencies have increased. Perhaps the workload has grown? Here is the **API details** section of the [DB overview](../../reference/observability/metrics/grafana-dashboards.md#dboverview) Grafana dashboard:

![API details](./_assets/overloaded-shard/dboverview-api-details.png)

The number of user requests has definitely increased. But can {{ ydb-short-name }} handle the increased load without additional hardware resources? See the **CPU** Grafana dashboard:

![CPU](./_assets/overloaded-shard/incident-cpu-dashboard.png)

We can also see the overall CPU usage on the **Diagnostics** tab of the [Embedded UI](../../reference/embedded-ui/index.md):

![CPU diagnostics](./_assets/overloaded-shard/incient-diagnostics-cpu.png)

It looks like the {{ ydb-short-name }} cluster is not utilizing all of its CPU capacity.

If we look at the **DataShard** section of the [DB overview](../../reference/observability/metrics/grafana-dashboards.md#dboverview) Grafana dashboard, we can see that after the load on the cluster increased, one of its data shards became overloaded.

![Throughput](./_assets/overloaded-shard/incident-datashard-throughput.png)

![Overloaded shard](./_assets/overloaded-shard/incident-datashard-overloaded.png)

To determine what table the overloaded data shard is processing, let's open the **Diagnostics > Top shards** tab in the Embedded UI:

![Diagnostics > shards](./_assets/overloaded-shard/incident-top-shards.png)

We can see that one of the data shards processing queries for the `stock` table is 94% loaded.

Let's take a look at the `stock` table on the **Info** tab:

![stock table info](./_assets/overloaded-shard/incident-stock-table-info.png)

{% note warning %}

The `stock` table was created with partitioning by size and by load disabled, so it has only one partition.

This means that a single data shard processes all requests to this table, and a data shard can execute only one request at a time. This is really bad practice.

{% endnote %}
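
When you create tables yourself (here the tables are created by the `ydb workload` tool), you can enable automatic partitioning right away. Below is a hypothetical sketch — the table name and columns are illustrative only and do not match the actual `stock` schema — showing the relevant settings:

```sql
-- Hypothetical example: the table name and columns are illustrative only;
-- this is not the schema created by `ydb workload stock init`.
CREATE TABLE products_demo (
    product_id Uint64,
    title      Utf8,
    quantity   Int64,
    PRIMARY KEY (product_id)
) WITH (
    AUTO_PARTITIONING_BY_SIZE = ENABLED,  -- split partitions that grow too large
    AUTO_PARTITIONING_BY_LOAD = ENABLED   -- split partitions that become CPU hot spots
);
```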

## Solution

We should enable partitioning by size and by load for the `stock` table:

1. In the Embedded UI, select the database.
2. Open the **Query** tab.
3. Run the following query (a verification sketch follows this list):

    ```sql
    ALTER TABLE stock SET (
        AUTO_PARTITIONING_BY_SIZE = ENABLED,
        AUTO_PARTITIONING_BY_LOAD = ENABLED
    );
    ```
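
To check that the new settings were applied and to watch the number of partitions grow, you can describe the table from the CLI. The `ydb scheme describe` command exists in the {{ ydb-short-name }} CLI, but treat the options below as assumptions — their names and availability may differ between CLI versions:

```shell
# Hedged sketch: the --stats and --partition-stats options are assumptions
# that may vary between CLI versions. Assumes an active profile pointing
# at the database.

# Table description, including partitioning settings and partition count.
ydb scheme describe stock --stats

# Per-partition statistics, if supported by your CLI version.
ydb scheme describe stock --partition-stats
```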

## Aftermath

As soon as we enabled automatic partitioning for the `stock` table, the overloaded data shard started splitting.

![shard distribution by load](./_assets/overloaded-shard/aftermath-shard-distribution-by-load.png)

Within five minutes, the number of data shards processing the table stabilizes. Multiple data shards now process queries to the `stock` table, and none of them are overloaded:

![overloaded shard count](./_assets/overloaded-shard/aftermath-datashard-overloaded.png)

![final latency percentiles](./_assets/overloaded-shard/aftermath-latency-percentiles.png)
![final latencies](./_assets/overloaded-shard/aftermath-latencies.png)

## Testbed

For the example, we used a {{ ydb-short-name }} cluster consisting of three servers running Ubuntu 22.04 LTS.

Each server has the following hardware configuration:

- Platform: Intel Broadwell
- Guaranteed vCPU performance: 100%
- vCPU: 16
- RAM: 32 GB

The load on the {{ ydb-short-name }} cluster was generated with the `ydb workload` CLI command. For more information, see [{#T}](../../reference/ydb-cli/commands/workload/index.md).

We performed the following steps:

1. Initialize the tables for the workload test:

    ```shell
    ydb workload stock init -p 1000 -q 10000 -o 1000 --min-partitions 1 --auto-partition 0
    ```

    We deliberately disabled automatic partitioning for the created tables by using the `--min-partitions 1 --auto-partition 0` options.

1. Emulate the standard workload on the {{ ydb-short-name }} cluster:

    ```shell
    ydb workload stock run put-rand-order -s 3200 -p 1000 -t 50
    ```

    We ran the stock workload, which simulates an online store warehouse. The `put-rand-order` load test generates a random order and processes it: for example, a customer creates and pays for an order of two products, the order and product data is written to the database, product availability is checked, and stock quantities are decreased. This creates a mixed read-write load.

    The `-t 50` parameter runs the test in 50 threads.


1. Overload the {{ ydb-short-name }} cluster:

    ```shell
    ydb workload stock run put-rand-order -s 3200 -p 1000 -t 200
    ```

    To simulate the overload, we ran another instance of the same load test in 200 threads while the previous test was still running, bringing the total number of threads to 250. The whole sequence is sketched as a single script after this list.
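
Here is the same sequence sketched as a single script. It assumes an active `ydb` CLI profile pointing at the test cluster, and the 600-second delay before starting the second instance is purely illustrative — the text above only says that it started while the first test was still running.

```shell
#!/usr/bin/env bash
# Sketch of the test sequence above. Assumes an active `ydb` CLI profile
# pointing at the test cluster. The sleep duration is illustrative.
set -euo pipefail

# 1. Create the stock workload tables with automatic partitioning disabled.
ydb workload stock init -p 1000 -q 10000 -o 1000 --min-partitions 1 --auto-partition 0

# 2. Baseline load: 50 threads, launched in the background.
ydb workload stock run put-rand-order -s 3200 -p 1000 -t 50 &
baseline_pid=$!

# 3. While the baseline is still running, start 200 more threads,
#    bringing the total to 250.
sleep 600
ydb workload stock run put-rand-order -s 3200 -p 1000 -t 200

# Wait for the baseline instance to finish.
wait "$baseline_pid"
```
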
5 changes: 5 additions & 0 deletions ydb/docs/en/core/troubleshooting/toc_p.yaml
@@ -4,3 +4,8 @@ items:
include:
mode: link
path: performance/toc_p.yaml
- name: Diagnostic examples
items:
- name: Example 1
href: examples/overloaded-shard-1.md
