diff --git a/src/aks-platform/02_aks.tf b/src/aks-platform/02_aks.tf
index 407b91c07..8094c2ff9 100644
--- a/src/aks-platform/02_aks.tf
+++ b/src/aks-platform/02_aks.tf
@@ -7,7 +7,7 @@ resource "azurerm_resource_group" "aks_rg" {
module "aks" {
- source = "git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_cluster?ref=v8.54.0"
+ source = "git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_cluster?ref=v8.58.0"
name = local.aks_name
location = var.location
@@ -76,8 +76,8 @@ module "aks" {
addon_azure_key_vault_secrets_provider_enabled = true
addon_azure_pod_identity_enabled = true
- alerts_enabled = var.aks_alerts_enabled
- custom_metric_alerts = local.aks_metrics_alerts
+ alerts_enabled = var.aks_alerts_enabled
+ custom_logs_alerts = local.aks_logs_alerts
# takes a list and replaces any elements that are lists with a
# flattened sequence of the list contents.
diff --git a/src/aks-platform/99_locals.tf b/src/aks-platform/99_locals.tf
index e9f323030..cfe75075a 100644
--- a/src/aks-platform/99_locals.tf
+++ b/src/aks-platform/99_locals.tf
@@ -31,215 +31,248 @@ locals {
vnet_core_resource_group_name = "${local.product}-vnet-rg"
vnet_core_name = "${local.product}-vnet"
- aks_metrics_alerts = {
- node_cpu = {
- aggregation = "Average"
- metric_namespace = "Insights.Container/nodes"
- metric_name = "cpuUsagePercentage"
- operator = "GreaterThan"
- threshold = 80
- frequency = "PT15M"
- window_size = "PT1H"
- skip_metric_validation = var.skip_metric_validation
+ aks_logs_alerts = {
+ pods_failed = { # Log-query alert: counts KubePodInventory rows with PodStatus "Failed", grouped per namespace
+ display_name = "${module.aks.name}-POD-FAILED"
+ description = "Detect if there is any pod failed"
+ query = <<-KQL
+ KubePodInventory
+ | where TimeGenerated > ago(15m)
+ | where PodStatus == "Failed"
+ | project TimeGenerated, ClusterName, Namespace, Name, PodStatus
+ | summarize count() by PodStatus, Namespace
+ KQL
+ severity = 1
+ window_duration = "PT30M" # NOTE(review): the query itself only keeps rows from the last 15m (ago(15m)), narrower than this 30-minute window
+ evaluation_frequency = "PT15M"
+ operator = "GreaterThan"
+ threshold = 1 # with GreaterThan the measured value must exceed 1; a value of exactly 1 does not match
+ time_aggregation_method = "Average"
+ resource_id_column = "PodStatus" # NOTE(review): after the filter this column only holds "Failed", not a resource ID — confirm this is what the module expects
+ metric_measure_column = "count_" # column produced by the unnamed count() in the query's summarize
dimension = [
{
- name = "host"
+ name = "Namespace"
operator = "Include"
values = ["*"]
}
- ],
+ ]
+ minimum_failing_periods_to_trigger_alert = 1
+ number_of_evaluation_periods = 1
+ auto_mitigation_enabled = true
+ skip_query_validation = true
+
}
- node_memory = {
- aggregation = "Average"
- metric_namespace = "Insights.Container/nodes"
- metric_name = "memoryWorkingSetPercentage"
- operator = "GreaterThan"
- threshold = 80
- frequency = "PT15M"
- window_size = "PT1H"
- skip_metric_validation = var.skip_metric_validation
- dimension = [
- {
- name = "host"
- operator = "Include"
- values = ["*"]
- }
- ],
+ pods_ready = { # Log-query alert: percentage of KubePodInventory rows in "Running" status over the last 15m
+ display_name = "${module.aks.name}-POD-READY"
+ description = "Detect pods percentage is over defined threshold"
+ query = <<-KQL
+ KubePodInventory
+ | where TimeGenerated > ago(15m)
+ | summarize TotalPodCount = count(), RunningPodCount = countif(PodStatus == "Running")
+ | extend RunningPodPercentage = (todouble(RunningPodCount) / todouble(TotalPodCount)) * 100
+ | where TotalPodCount > 0
+ | project RunningPodPercentage, TotalPodCount, RunningPodCount
+ | summarize any(RunningPodPercentage)
+ KQL
+ severity = 1
+ window_duration = "PT30M"
+ evaluation_frequency = "PT15M"
+ operator = "LessThan" # the query must not pre-filter on the percentage: this LessThan/80 condition is the only comparison, otherwise low values never reach it
+ threshold = 80
+ time_aggregation_method = "Average"
+ resource_id_column = "RunningPodPercentage" # NOTE(review): the final summarize only emits any_RunningPodPercentage, and the value is a number rather than a resource ID — confirm
+ metric_measure_column = "any_RunningPodPercentage" # column produced by the unnamed any(RunningPodPercentage)
+ dimension = [] # single cluster-wide value, no per-dimension split
+ minimum_failing_periods_to_trigger_alert = 1
+ number_of_evaluation_periods = 1
+ auto_mitigation_enabled = true
+ skip_query_validation = true
}
- node_disk = {
- aggregation = "Average"
- metric_namespace = "Insights.Container/nodes"
- metric_name = "DiskUsedPercentage"
- operator = "GreaterThan"
- threshold = 80
- frequency = "PT15M"
- window_size = "PT1H"
- skip_metric_validation = var.skip_metric_validation
+ pods_oomkilled = { # Log-query alert: KubePodInventory rows whose ContainerLastStatus reports reason "OOMKilled"
+ display_name = "${module.aks.name}-POD-OOMKILLED"
+ description = "Detect if any pod is OOMKilled"
+ query = <<-KQL
+ KubePodInventory
+ | where PodStatus != "running"
+ | extend ContainerLastStatusJSON = parse_json(ContainerLastStatus)
+ | extend FinishedAt = todatetime(ContainerLastStatusJSON.finishedAt)
+ | where ContainerLastStatusJSON.reason == "OOMKilled"
+ | distinct PodUid, Namespace, ControllerName, ContainerLastStatus, FinishedAt
+ | order by FinishedAt asc
+ KQL
+ severity = 3
+ window_duration = "PT15M" # the query carries no TimeGenerated filter of its own
+ evaluation_frequency = "PT5M"
+ operator = "GreaterThan"
+ threshold = 1 # NOTE(review): with GreaterThan + Count a single matching row does not exceed the threshold — confirm against the "any pod" description
+ time_aggregation_method = "Count"
+ resource_id_column = "ControllerName" # NOTE(review): ControllerName is a controller name, not a resource ID — confirm this is what the module expects
+ metric_measure_column = null # rows are counted, so no numeric measure column is named
dimension = [
{
- name = "host"
+ name = "ControllerName"
operator = "Include"
values = ["*"]
},
{
- name = "device"
- operator = "Include"
- values = ["*"]
- }
- ],
- }
- node_not_ready = {
- aggregation = "Average"
- metric_namespace = "Insights.Container/nodes"
- metric_name = "nodesCount"
- operator = "GreaterThan"
- threshold = 0
- frequency = "PT15M"
- window_size = "PT1H"
- skip_metric_validation = var.skip_metric_validation
- dimension = [
- {
- name = "status"
- operator = "Include"
- values = ["NotReady"]
- }
- ],
- }
- pods_failed = {
- aggregation = "Average"
- metric_namespace = "Insights.Container/pods"
- metric_name = "podCount"
- operator = "GreaterThan"
- threshold = 0
- frequency = "PT15M"
- window_size = "PT1H"
- skip_metric_validation = var.skip_metric_validation
- dimension = [
- {
- name = "phase"
- operator = "Include"
- values = ["Failed"]
- }
- ]
- }
- pods_ready = {
- aggregation = "Average"
- metric_namespace = "Insights.Container/pods"
- metric_name = "PodReadyPercentage"
- operator = "LessThan"
- threshold = 80
- frequency = "PT15M"
- window_size = "PT1H"
- skip_metric_validation = var.skip_metric_validation
- dimension = [
- {
- name = "Kubernetes namespace"
- operator = "Include"
+ name = "Namespace"
+ operator = "Exclude"
values = [
- "aca",
- "afm",
- "apiconfig",
- "bizevents",
- "ecommerce",
- "elastic-system",
- "fdr",
- "gps",
- "mock",
- "nodo",
- "nodo-cron",
- "qi",
- "receipts",
- "selfcare",
- "shared",
- "wallet",
+ "kube-system",
+ "default"
]
}
]
+ minimum_failing_periods_to_trigger_alert = 1
+ number_of_evaluation_periods = 1
+ auto_mitigation_enabled = true
+ skip_query_validation = true
}
- container_cpu = {
- aggregation = "Average"
- metric_namespace = "Insights.Container/containers"
- metric_name = "cpuExceededPercentage"
- operator = "GreaterThan"
- threshold = 95
- frequency = "PT15M"
- window_size = "PT1H"
- skip_metric_validation = var.skip_metric_validation
+ pods_restart = { # Log-query alert: summed ContainerRestartCount from KubePodInventory, reduced to one value per namespace
+ display_name = "${module.aks.name}-POD-RESTART-COUNT"
+ description = "Detect if any pod was restarted abnormally"
+ query = <<-KQL
+ KubePodInventory
+ | where ContainerRestartCount > 0
+ | summarize RestartCount = sum(ContainerRestartCount) by bin(TimeGenerated, 1m), Namespace, Name, _ResourceId
+ | where RestartCount > 0
+ | project TimeGenerated, Namespace, Name, RestartCount, _ResourceId
+ | summarize any(RestartCount) by Namespace
+ KQL
+ severity = 2
+ window_duration = "PT30M"
+ evaluation_frequency = "PT15M"
+ operator = "GreaterThan"
+ threshold = 5
+ time_aggregation_method = "Average"
+ resource_id_column = "RestartCount" # NOTE(review): the final summarize emits only Namespace and any_RestartCount, so no RestartCount column (nor _ResourceId) reaches the alert — confirm
+ metric_measure_column = "any_RestartCount" # column produced by the unnamed any(RestartCount)
dimension = [
{
- name = "Kubernetes namespace"
- operator = "Include"
+ name = "Namespace"
+ operator = "Exclude"
values = [
- "aca",
- "afm",
- "apiconfig",
- "bizevents",
- "ecommerce",
- "elastic-system",
- "fdr",
- "gps",
- "mock",
- "nodo",
- "nodo-cron",
- "qi",
- "receipts",
- "selfcare",
- "shared",
- "wallet",
+ "kube-system",
+ "default"
]
},
]
+ minimum_failing_periods_to_trigger_alert = 1
+ number_of_evaluation_periods = 1
+ auto_mitigation_enabled = true
+ skip_query_validation = true
}
- container_memory = {
- aggregation = "Average"
- metric_namespace = "Insights.Container/containers"
- metric_name = "memoryWorkingSetExceededPercentage"
- operator = "GreaterThan"
- threshold = 95
- frequency = "PT15M"
- window_size = "PT1H"
- skip_metric_validation = var.skip_metric_validation
- dimension = [
- {
- name = "Kubernetes namespace"
- operator = "Include"
- values = ["*"]
- },
- ]
- }
- container_oom = {
- aggregation = "Average"
- metric_namespace = "Insights.Container/pods"
- metric_name = "oomKilledContainerCount"
- operator = "GreaterThan"
- threshold = 0
- frequency = "PT15M"
- window_size = "PT1H"
- skip_metric_validation = var.skip_metric_validation
+ pods_cpu = { # Log-query alert: container CPU usage as a percentage of its CPU limit (Perf cpuUsageNanoCores / cpuLimitNanoCores), averaged per 1m bin
+ display_name = "${module.aks.name}-POD-CPU-USAGE"
+ description = "Detect if any pod has High CPU Usage"
+ query = <<-KQL
+ let endDateTime = now();
+ let startDateTime = ago(1h);
+ let trendBinSize = 1m;
+ let capacityCounterName = 'cpuLimitNanoCores';
+ let usageCounterName = 'cpuUsageNanoCores';
+ let clusterName = '${module.aks.name}';
+ KubePodInventory
+ | where TimeGenerated < endDateTime
+ | where TimeGenerated >= startDateTime
+ | where ClusterName == clusterName
+ | extend InstanceName = strcat(ClusterId, '/', ContainerName)
+ | distinct Computer, InstanceName, ContainerName, ControllerName
+ | join hint.strategy=shuffle (
+ Perf
+ | where TimeGenerated < endDateTime
+ | where TimeGenerated >= startDateTime
+ | where ObjectName == 'K8SContainer'
+ | where CounterName == capacityCounterName
+ | summarize LimitValue = max(CounterValue) by Computer, InstanceName, bin(TimeGenerated, trendBinSize)
+ | project Computer, InstanceName, LimitStartTime = TimeGenerated, LimitEndTime = TimeGenerated + trendBinSize, LimitValue
+ ) on Computer, InstanceName
+ | join kind=inner hint.strategy=shuffle (
+ Perf
+ | where TimeGenerated < endDateTime + trendBinSize
+ | where TimeGenerated >= startDateTime - trendBinSize
+ | where ObjectName == 'K8SContainer'
+ | where CounterName == usageCounterName
+ | project Computer, InstanceName, UsageValue = CounterValue, TimeGenerated
+ ) on Computer, InstanceName
+ | where TimeGenerated >= LimitStartTime and TimeGenerated < LimitEndTime
+ | project Computer, ControllerName, ContainerName, TimeGenerated, UsagePercent = UsageValue * 100.0 / LimitValue
+ | summarize AggValue = avg(UsagePercent) by bin(TimeGenerated, trendBinSize) , ContainerName, ControllerName
+ KQL
+ severity = 2
+ window_duration = "PT15M" # NOTE(review): the query's own look-back is ago(1h), wider than this 15-minute window
+ evaluation_frequency = "PT5M"
+ operator = "GreaterThan"
+ threshold = 95 # compared against AggValue, the averaged usage-to-limit percentage
+ time_aggregation_method = "Average"
+ metric_measure_column = "AggValue" # named in the query's final summarize
dimension = [
{
- name = "Kubernetes namespace"
+ name = "ControllerName"
operator = "Include"
values = ["*"]
- },
+ }
]
+ minimum_failing_periods_to_trigger_alert = 1
+ number_of_evaluation_periods = 1
+ auto_mitigation_enabled = true # NOTE(review): unlike pods_failed/pods_ready/pods_oomkilled/pods_restart, this entry sets neither resource_id_column nor skip_query_validation — confirm intentional
}
- container_restart = {
- aggregation = "Average"
- metric_namespace = "Insights.Container/pods"
- metric_name = "restartingContainerCount"
- operator = "GreaterThan"
- threshold = 0
- frequency = "PT15M"
- window_size = "PT1H"
- skip_metric_validation = var.skip_metric_validation
+ pods_memory = { # Log-query alert: container memory usage as a percentage of its memory limit (Perf memoryRssBytes / memoryLimitBytes), averaged per 1m bin
+ display_name = "${module.aks.name}-POD-MEM-USAGE"
+ description = "Detect if any pod has High Memory Usage"
+ query = <<-KQL
+ let endDateTime = now();
+ let startDateTime = ago(1h);
+ let trendBinSize = 1m;
+ let capacityCounterName = 'memoryLimitBytes';
+ let usageCounterName = 'memoryRssBytes';
+ let clusterName = '${module.aks.name}';
+ KubePodInventory
+ | where TimeGenerated < endDateTime
+ | where TimeGenerated >= startDateTime
+ | where ClusterName == clusterName
+ | extend InstanceName = strcat(ClusterId, '/', ContainerName)
+ | where ContainerName !contains "microservice-chart"
+ | distinct Computer, InstanceName, ContainerName, ControllerName
+ | join hint.strategy=shuffle (
+ Perf
+ | where TimeGenerated < endDateTime
+ | where TimeGenerated >= startDateTime
+ | where ObjectName == 'K8SContainer'
+ | where CounterName == capacityCounterName
+ | summarize LimitValue = max(CounterValue) by Computer, InstanceName, bin(TimeGenerated, trendBinSize)
+ | project Computer, InstanceName, LimitStartTime = TimeGenerated, LimitEndTime = TimeGenerated + trendBinSize, LimitValue
+ ) on Computer, InstanceName
+ | join kind=inner hint.strategy=shuffle (
+ Perf
+ | where TimeGenerated < endDateTime + trendBinSize
+ | where TimeGenerated >= startDateTime - trendBinSize
+ | where ObjectName == 'K8SContainer'
+ | where CounterName == usageCounterName
+ | project Computer, InstanceName, UsageValue = CounterValue, TimeGenerated
+ ) on Computer, InstanceName
+ | where TimeGenerated >= LimitStartTime and TimeGenerated < LimitEndTime
+ | project Computer, ControllerName, ContainerName, TimeGenerated, UsagePercent = UsageValue * 100.0 / LimitValue
+ | summarize AggValue = avg(UsagePercent) by bin(TimeGenerated, trendBinSize) , ContainerName, ControllerName
+ KQL
+ severity = 2
+ window_duration = "PT15M" # NOTE(review): the query's own look-back is ago(1h), wider than this 15-minute window
+ evaluation_frequency = "PT5M"
+ operator = "GreaterThan"
+ threshold = 90 # compared against AggValue, the averaged usage-to-limit percentage
+ time_aggregation_method = "Average"
+ metric_measure_column = "AggValue" # named in the query's final summarize
dimension = [
{
- name = "Kubernetes namespace"
+ name = "ControllerName"
operator = "Include"
values = ["*"]
- },
+ }
]
+ minimum_failing_periods_to_trigger_alert = 1
+ number_of_evaluation_periods = 1
+ auto_mitigation_enabled = true # NOTE(review): the query skips containers whose name contains "microservice-chart" (pods_cpu has no such filter), and this entry sets neither resource_id_column nor skip_query_validation — confirm intentional
}
}
}
diff --git a/src/aks-platform/README.md b/src/aks-platform/README.md
index 8b23a906b..a2101f84c 100644
--- a/src/aks-platform/README.md
+++ b/src/aks-platform/README.md
@@ -15,7 +15,7 @@
| Name | Source | Version |
|------|--------|---------|
-| [aks](#module\_aks) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_cluster | v8.54.0 |
+| [aks](#module\_aks) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_cluster | v8.58.0 |
| [aks\_snet](#module\_aks\_snet) | git::https://github.com/pagopa/terraform-azurerm-v3.git//subnet | v8.53.0 |
| [keda\_pod\_identity](#module\_keda\_pod\_identity) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_pod_identity | v8.53.0 |
| [monitoring\_pod\_identity](#module\_monitoring\_pod\_identity) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_pod_identity | v8.53.0 |