From f1db5917b58685562c74eb561f3ba943672cd080 Mon Sep 17 00:00:00 2001 From: Sven Schliesing Date: Thu, 23 Jan 2025 09:29:58 +0100 Subject: [PATCH] [sophora-server] Adjust alert "SophoraServerAPISlow" (#141) * [sophora-server] Adjust alert "SophoraServerAPISlow" * [sophora-server] Use variable in runbook for alert "SophoraServerAPISlow" --- charts/sophora-server/Chart.yaml | 2 +- charts/sophora-server/alerting-runbook.md | 4 ++-- charts/sophora-server/templates/prometheusrule.yaml | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/charts/sophora-server/Chart.yaml b/charts/sophora-server/Chart.yaml index 4f0ecf4..368b9c0 100644 --- a/charts/sophora-server/Chart.yaml +++ b/charts/sophora-server/Chart.yaml @@ -15,7 +15,7 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 2.5.2 +version: 2.6.0 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to diff --git a/charts/sophora-server/alerting-runbook.md b/charts/sophora-server/alerting-runbook.md index 67db37e..3c15e7a 100644 --- a/charts/sophora-server/alerting-runbook.md +++ b/charts/sophora-server/alerting-runbook.md @@ -21,7 +21,7 @@ This document is a reference to the alerts this Helm chart can fire. **Severity:** high -**Summary:** The API of the server exhibits a response time exceeding 300ms for more than 15 minutes at the 95th percentile. +**Summary:** The API of the server exhibits a response time exceeding ${threshold} for more than 15 minutes at the 95th percentile. **Remediation steps:** @@ -105,4 +105,4 @@ This document is a reference to the alerts this Helm chart can fire. * Check if the primary server is running * Check the logs of the server * Check the logs of the primary server -* Check whether there are any network issues \ No newline at end of file +* Check whether there are any network issues diff --git a/charts/sophora-server/templates/prometheusrule.yaml b/charts/sophora-server/templates/prometheusrule.yaml index 7f0f8fb..8544ce3 100644 --- a/charts/sophora-server/templates/prometheusrule.yaml +++ b/charts/sophora-server/templates/prometheusrule.yaml @@ -21,12 +21,12 @@ spec: runbook_url: 'https://github.com/subshell/helm-charts/blob/main/charts/sophora-server/alerting-runbook.md' - alert: SophoraServerAPISlow for: 15m - expr: 'histogram_quantile(0.95, sum(rate(sophora_server_contentmanager_call_duration_seconds_bucket{job="{{ include "sophora-server.fullname" . }}"}[1m])) by (pod, le)) > 0.3' + expr: 'histogram_quantile(0.95, sum(rate(sophora_server_contentmanager_call_duration_seconds_bucket{job="{{ include "sophora-server.fullname" . }}"}[1m])) by (pod, le)) > 0.5' labels: severity: high annotations: summary: Sophora Server API is slow - description: The API of the server "{{`{{ $labels.pod }}`}}" exhibits a response time exceeding 300ms for more than 15 minutes at the 95th percentile. + description: The API of the server "{{`{{ $labels.pod }}`}}" exhibits a response time exceeding 500ms for more than 15 minutes at the 95th percentile. runbook_url: 'https://github.com/subshell/helm-charts/blob/main/charts/sophora-server/alerting-runbook.md' - alert: SophoraServerAsyncEventQueueBlocked for: 10m