Skip to content

Commit

Permalink
chore: [PAYMCLOUD-174] Update alert descriptions and configurations (#2570)
Browse files Browse the repository at this point in the history

Update alert descriptions and configurations

Revised the alert descriptions to be more consistent with module names. Adjusted alert severity, window durations, and evaluation frequencies to improve monitoring accuracy and responsiveness.

Signed-off-by: Fabio Felici <[email protected]>
  • Loading branch information
ffppa authored Nov 21, 2024
1 parent 57e3be2 commit 3b07083
Showing 1 changed file with 11 additions and 11 deletions.
22 changes: 11 additions & 11 deletions src/aks-platform/99_locals.tf
Original file line number Diff line number Diff line change
Expand Up @@ -34,17 +34,17 @@ locals {
aks_logs_alerts = {
pods_failed = {
display_name = "${module.aks.name}-POD-FAILED"
description = "Detect if there is any pod failed"
description = "${module.aks.name} POD FAILED"
query = <<-KQL
KubePodInventory
| where TimeGenerated > ago(15m)
| where PodStatus == "Failed"
| project TimeGenerated, ClusterName, Namespace, Name, PodStatus
| summarize count() by PodStatus, Namespace
KQL
severity = 1
window_duration = "PT30M"
evaluation_frequency = "PT15M"
severity = 2
window_duration = "PT15M"
evaluation_frequency = "PT5M"
operator = "GreaterThan"
threshold = 1
time_aggregation_method = "Average"
Expand All @@ -65,7 +65,7 @@ locals {
}
pods_ready = {
display_name = "${module.aks.name}-POD-READY"
description = "Detect pods percentage is over defined threshold"
description = "${module.aks.name} POD Ready under threshold"
query = <<-KQL
KubePodInventory
| where TimeGenerated > ago(15m)
Expand All @@ -91,7 +91,7 @@ locals {
}
pods_oomkilled = {
display_name = "${module.aks.name}-POD-OMMKILLED"
description = "Detect if any pod is OOMKilled"
description = "${module.aks.name} POD OOMKilled"
query = <<-KQL
KubePodInventory
| where PodStatus != "running"
Expand Down Expand Up @@ -131,7 +131,7 @@ locals {
}
pods_restart = {
display_name = "${module.aks.name}-POD-RESTART-COUNT"
description = "Detect if any pod was restarted abnormally"
description = "${module.aks.name} POD Restarted multiple times"
query = <<-KQL
KubePodInventory
| where ContainerRestartCount > 0
Expand All @@ -141,8 +141,8 @@ locals {
| summarize any(RestartCount) by Namespace
KQL
severity = 2
window_duration = "PT30M"
evaluation_frequency = "PT15M"
window_duration = "PT15M"
evaluation_frequency = "PT5M"
operator = "GreaterThan"
threshold = 5
time_aggregation_method = "Average"
Expand All @@ -165,7 +165,7 @@ locals {
}
pods_cpu = {
display_name = "${module.aks.name}-POD-CPU-USAGE"
description = "Detect if any pod has High CPU Usage"
description = "${module.aks.name} POD High CPU Usage"
query = <<-KQL
let endDateTime = now();
let startDateTime = ago(1h);
Expand Down Expand Up @@ -220,7 +220,7 @@ locals {
}
pods_memory = {
display_name = "${module.aks.name}-POD-MEM-USAGE"
description = "Detect if any pod has High Memory Usage"
description = "${module.aks.name} POD High Memory Usage"
query = <<-KQL
let endDateTime = now();
let startDateTime = ago(1h);
Expand Down

0 comments on commit 3b07083

Please sign in to comment.