diff --git a/src/aks-platform/02_aks.tf b/src/aks-platform/02_aks.tf
index 407b91c07..8094c2ff9 100644
--- a/src/aks-platform/02_aks.tf
+++ b/src/aks-platform/02_aks.tf
@@ -7,7 +7,7 @@ resource "azurerm_resource_group" "aks_rg" {
 
 
 module "aks" {
-  source = "git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_cluster?ref=v8.54.0"
+  source = "git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_cluster?ref=v8.58.0"
 
   name     = local.aks_name
   location = var.location
@@ -76,8 +76,8 @@ module "aks" {
   addon_azure_key_vault_secrets_provider_enabled = true
   addon_azure_pod_identity_enabled               = true
 
-  alerts_enabled       = var.aks_alerts_enabled
-  custom_metric_alerts = local.aks_metrics_alerts
+  alerts_enabled     = var.aks_alerts_enabled
+  custom_logs_alerts = local.aks_logs_alerts
 
   # takes a list and replaces any elements that are lists with a
   # flattened sequence of the list contents.
diff --git a/src/aks-platform/99_locals.tf b/src/aks-platform/99_locals.tf
index e9f323030..cfe75075a 100644
--- a/src/aks-platform/99_locals.tf
+++ b/src/aks-platform/99_locals.tf
@@ -31,215 +31,248 @@ locals {
   vnet_core_resource_group_name = "${local.product}-vnet-rg"
   vnet_core_name                = "${local.product}-vnet"
 
-  aks_metrics_alerts = {
-    node_cpu = {
-      aggregation            = "Average"
-      metric_namespace       = "Insights.Container/nodes"
-      metric_name            = "cpuUsagePercentage"
-      operator               = "GreaterThan"
-      threshold              = 80
-      frequency              = "PT15M"
-      window_size            = "PT1H"
-      skip_metric_validation = var.skip_metric_validation
+  aks_logs_alerts = { # log-query (KQL) alert definitions passed to module.aks as custom_logs_alerts
+    pods_failed = {
+      display_name = "${module.aks.name}-POD-FAILED"
+      description  = "Detect if there is any pod failed"
+      query        = <<-KQL
+        KubePodInventory
+        | where TimeGenerated > ago(15m)
+        | where PodStatus == "Failed"
+        | project TimeGenerated, ClusterName, Namespace, Name, PodStatus
+        | summarize count() by PodStatus, Namespace
+      KQL
+      severity                = 1
+      window_duration         = "PT30M"
+      evaluation_frequency    = "PT15M"
+      operator                = "GreaterThan"
+      threshold               = 1
+      time_aggregation_method = "Average"
+      resource_id_column      = "PodStatus"
+      metric_measure_column   = "count_"
       dimension = [
         {
-          name     = "host"
+          name     = "Namespace"
           operator = "Include"
           values   = ["*"]
         }
-      ],
+      ]
+      minimum_failing_periods_to_trigger_alert = 1
+      number_of_evaluation_periods             = 1
+      auto_mitigation_enabled                  = true
+      skip_query_validation                    = true
+
     }
-    node_memory = {
-      aggregation            = "Average"
-      metric_namespace       = "Insights.Container/nodes"
-      metric_name            = "memoryWorkingSetPercentage"
-      operator               = "GreaterThan"
-      threshold              = 80
-      frequency              = "PT15M"
-      window_size            = "PT1H"
-      skip_metric_validation = var.skip_metric_validation
-      dimension = [
-        {
-          name     = "host"
-          operator = "Include"
-          values   = ["*"]
-        }
-      ],
+    pods_ready = {
+      display_name = "${module.aks.name}-POD-READY"
+      description  = "Detect pods percentage is over defined threshold"
+      query        = <<-KQL
+        KubePodInventory
+        | where TimeGenerated > ago(15m)
+        | summarize TotalPodCount = count(), RunningPodCount = countif(PodStatus == "Running")
+        | extend RunningPodPercentage = (todouble(RunningPodCount) / todouble(TotalPodCount)) * 100
+        | where RunningPodPercentage < 80 // keep only unhealthy samples: a "> 80" filter can never satisfy the LessThan 80 condition below
+        | project RunningPodPercentage, TotalPodCount, RunningPodCount
+        | summarize any(RunningPodPercentage)
+      KQL
+      severity                = 1
+      window_duration         = "PT30M"
+      evaluation_frequency    = "PT15M"
+      operator                = "LessThan"
+      threshold               = 80
+      time_aggregation_method = "Average"
+      resource_id_column      = "RunningPodPercentage" # NOTE(review): the final summarize only emits any_RunningPodPercentage - confirm this column is usable
+      metric_measure_column   = "any_RunningPodPercentage"
+      dimension               = []
+      minimum_failing_periods_to_trigger_alert = 1
+      number_of_evaluation_periods             = 1
+      auto_mitigation_enabled                  = true
+      skip_query_validation                    = true
     }
-    node_disk = {
-      aggregation            = "Average"
-      metric_namespace       = "Insights.Container/nodes"
-      metric_name            = "DiskUsedPercentage"
-      operator               = "GreaterThan"
-      threshold              = 80
-      frequency              = "PT15M"
-      window_size            = "PT1H"
-      skip_metric_validation = var.skip_metric_validation
+    pods_oomkilled = {
+      display_name = "${module.aks.name}-POD-OOMKILLED"
+      description  = "Detect if any pod is OOMKilled"
+      query        = <<-KQL
+        KubePodInventory
+        | where PodStatus != "running" // NOTE(review): != is case-sensitive, so a "Running" status is not excluded - confirm intent
+        | extend ContainerLastStatusJSON = parse_json(ContainerLastStatus)
+        | extend FinishedAt = todatetime(ContainerLastStatusJSON.finishedAt)
+        | where ContainerLastStatusJSON.reason == "OOMKilled"
+        | distinct PodUid, Namespace, ControllerName, ContainerLastStatus, FinishedAt
+        | order by FinishedAt asc
+      KQL
+      severity                = 3
+      window_duration         = "PT15M"
+      evaluation_frequency    = "PT5M"
+      operator                = "GreaterThan"
+      threshold               = 0 # one distinct OOMKilled row must already fire ("any pod"); "> 1" ignored a single kill
+      time_aggregation_method = "Count"
+      resource_id_column      = "ControllerName"
+      metric_measure_column   = null
       dimension = [
         {
-          name     = "host"
+          name     = "ControllerName"
           operator = "Include"
           values   = ["*"]
         },
         {
-          name     = "device"
-          operator = "Include"
-          values   = ["*"]
-        }
-      ],
-    }
-    node_not_ready = {
-      aggregation            = "Average"
-      metric_namespace       = "Insights.Container/nodes"
-      metric_name            = "nodesCount"
-      operator               = "GreaterThan"
-      threshold              = 0
-      frequency              = "PT15M"
-      window_size            = "PT1H"
-      skip_metric_validation = var.skip_metric_validation
-      dimension = [
-        {
-          name     = "status"
-          operator = "Include"
-          values   = ["NotReady"]
-        }
-      ],
-    }
-    pods_failed = {
-      aggregation            = "Average"
-      metric_namespace       = "Insights.Container/pods"
-      metric_name            = "podCount"
-      operator               = "GreaterThan"
-      threshold              = 0
-      frequency              = "PT15M"
-      window_size            = "PT1H"
-      skip_metric_validation = var.skip_metric_validation
-      dimension = [
-        {
-          name     = "phase"
-          operator = "Include"
-          values   = ["Failed"]
-        }
-      ]
-    }
-    pods_ready = {
-      aggregation            = "Average"
-      metric_namespace       = "Insights.Container/pods"
-      metric_name            = "PodReadyPercentage"
-      operator               = "LessThan"
-      threshold              = 80
-      frequency              = "PT15M"
-      window_size            = "PT1H"
-      skip_metric_validation = var.skip_metric_validation
-      dimension = [
-        {
-          name     = "Kubernetes namespace"
-          operator = "Include"
+          name     = "Namespace"
+          operator = "Exclude"
           values = [
-            "aca",
-            "afm",
-            "apiconfig",
-            "bizevents",
-            "ecommerce",
-            "elastic-system",
-            "fdr",
-            "gps",
-            "mock",
-            "nodo",
-            "nodo-cron",
-            "qi",
-            "receipts",
-            "selfcare",
-            "shared",
-            "wallet",
+            "kube-system",
+            "default"
           ]
         }
       ]
+      minimum_failing_periods_to_trigger_alert = 1
+      number_of_evaluation_periods             = 1
+      auto_mitigation_enabled                  = true
+      skip_query_validation                    = true
     }
-    container_cpu = {
-      aggregation            = "Average"
-      metric_namespace       = "Insights.Container/containers"
-      metric_name            = "cpuExceededPercentage"
-      operator               = "GreaterThan"
-      threshold              = 95
-      frequency              = "PT15M"
-      window_size            = "PT1H"
-      skip_metric_validation = var.skip_metric_validation
+    pods_restart = {
+      display_name = "${module.aks.name}-POD-RESTART-COUNT"
+      description  = "Detect if any pod was restarted abnormally"
+      query        = <<-KQL
+        KubePodInventory
+        | where ContainerRestartCount > 0
+        | summarize RestartCount = sum(ContainerRestartCount) by bin(TimeGenerated, 1m), Namespace, Name, _ResourceId
+        | where RestartCount > 0
+        | project TimeGenerated, Namespace, Name, RestartCount, _ResourceId
+        | summarize any(RestartCount) by Namespace
+      KQL
+      severity                = 2
+      window_duration         = "PT30M"
+      evaluation_frequency    = "PT15M"
+      operator                = "GreaterThan"
+      threshold               = 5
+      time_aggregation_method = "Average"
+      resource_id_column      = "RestartCount" # NOTE(review): the final summarize emits any_RestartCount and Namespace only - confirm this column is usable
+      metric_measure_column   = "any_RestartCount"
       dimension = [
         {
-          name     = "Kubernetes namespace"
-          operator = "Include"
+          name     = "Namespace"
+          operator = "Exclude"
           values = [
-            "aca",
-            "afm",
-            "apiconfig",
-            "bizevents",
-            "ecommerce",
-            "elastic-system",
-            "fdr",
-            "gps",
-            "mock",
-            "nodo",
-            "nodo-cron",
-            "qi",
-            "receipts",
-            "selfcare",
-            "shared",
-            "wallet",
+            "kube-system",
+            "default"
           ]
         },
       ]
+      minimum_failing_periods_to_trigger_alert = 1
+      number_of_evaluation_periods             = 1
+      auto_mitigation_enabled                  = true
+      skip_query_validation                    = true
     }
-    container_memory = {
-      aggregation            = "Average"
-      metric_namespace       = "Insights.Container/containers"
-      metric_name            = "memoryWorkingSetExceededPercentage"
-      operator               = "GreaterThan"
-      threshold              = 95
-      frequency              = "PT15M"
-      window_size            = "PT1H"
-      skip_metric_validation = var.skip_metric_validation
-      dimension = [
-        {
-          name     = "Kubernetes namespace"
-          operator = "Include"
-          values   = ["*"]
-        },
-      ]
-    }
-    container_oom = {
-      aggregation            = "Average"
-      metric_namespace       = "Insights.Container/pods"
-      metric_name            = "oomKilledContainerCount"
-      operator               = "GreaterThan"
-      threshold              = 0
-      frequency              = "PT15M"
-      window_size            = "PT1H"
-      skip_metric_validation = var.skip_metric_validation
+    pods_cpu = {
+      display_name = "${module.aks.name}-POD-CPU-USAGE"
+      description  = "Detect if any pod has High CPU Usage"
+      query        = <<-KQL
+        let endDateTime = now();
+        let startDateTime = ago(1h);
+        let trendBinSize = 1m;
+        let capacityCounterName = 'cpuLimitNanoCores';
+        let usageCounterName = 'cpuUsageNanoCores';
+        let clusterName = '${module.aks.name}';
+        KubePodInventory
+        | where TimeGenerated < endDateTime
+        | where TimeGenerated >= startDateTime
+        | where ClusterName == clusterName
+        | extend InstanceName = strcat(ClusterId, '/', ContainerName)
+        | distinct Computer, InstanceName, ContainerName, ControllerName
+        | join hint.strategy=shuffle (
+            Perf
+            | where TimeGenerated < endDateTime
+            | where TimeGenerated >= startDateTime
+            | where ObjectName == 'K8SContainer'
+            | where CounterName == capacityCounterName
+            | summarize LimitValue = max(CounterValue) by Computer, InstanceName, bin(TimeGenerated, trendBinSize)
+            | project Computer, InstanceName, LimitStartTime = TimeGenerated, LimitEndTime = TimeGenerated + trendBinSize, LimitValue
+        ) on Computer, InstanceName
+        | join kind=inner hint.strategy=shuffle (
+            Perf
+            | where TimeGenerated < endDateTime + trendBinSize
+            | where TimeGenerated >= startDateTime - trendBinSize
+            | where ObjectName == 'K8SContainer'
+            | where CounterName == usageCounterName
+            | project Computer, InstanceName, UsageValue = CounterValue, TimeGenerated
+        ) on Computer, InstanceName
+        | where TimeGenerated >= LimitStartTime and TimeGenerated < LimitEndTime
+        | project Computer, ControllerName, ContainerName, TimeGenerated, UsagePercent = UsageValue * 100.0 / LimitValue
+        | summarize AggValue = avg(UsagePercent) by bin(TimeGenerated, trendBinSize) , ContainerName, ControllerName
+      KQL
+      severity                = 2
+      window_duration         = "PT15M"
+      evaluation_frequency    = "PT5M"
+      operator                = "GreaterThan"
+      threshold               = 95
+      time_aggregation_method = "Average"
+      metric_measure_column   = "AggValue"
       dimension = [
         {
-          name     = "Kubernetes namespace"
+          name     = "ControllerName"
           operator = "Include"
           values   = ["*"]
-        },
+        }
       ]
+      minimum_failing_periods_to_trigger_alert = 1
+      number_of_evaluation_periods             = 1
+      auto_mitigation_enabled                  = true
     }
-    container_restart = {
-      aggregation            = "Average"
-      metric_namespace       = "Insights.Container/pods"
-      metric_name            = "restartingContainerCount"
-      operator               = "GreaterThan"
-      threshold              = 0
-      frequency              = "PT15M"
-      window_size            = "PT1H"
-      skip_metric_validation = var.skip_metric_validation
+    pods_memory = {
+      display_name = "${module.aks.name}-POD-MEM-USAGE"
+      description  = "Detect if any pod has High Memory Usage"
+      query        = <<-KQL
+        let endDateTime = now();
+        let startDateTime = ago(1h);
+        let trendBinSize = 1m;
+        let capacityCounterName = 'memoryLimitBytes';
+        let usageCounterName = 'memoryRssBytes';
+        let clusterName = '${module.aks.name}';
+        KubePodInventory
+        | where TimeGenerated < endDateTime
+        | where TimeGenerated >= startDateTime
+        | where ClusterName == clusterName
+        | extend InstanceName = strcat(ClusterId, '/', ContainerName)
+        | where ContainerName !contains "microservice-chart"
+        | distinct Computer, InstanceName, ContainerName, ControllerName
+        | join hint.strategy=shuffle (
+            Perf
+            | where TimeGenerated < endDateTime
+            | where TimeGenerated >= startDateTime
+            | where ObjectName == 'K8SContainer'
+            | where CounterName == capacityCounterName
+            | summarize LimitValue = max(CounterValue) by Computer, InstanceName, bin(TimeGenerated, trendBinSize)
+            | project Computer, InstanceName, LimitStartTime = TimeGenerated, LimitEndTime = TimeGenerated + trendBinSize, LimitValue
+        ) on Computer, InstanceName
+        | join kind=inner hint.strategy=shuffle (
+            Perf
+            | where TimeGenerated < endDateTime + trendBinSize
+            | where TimeGenerated >= startDateTime - trendBinSize
+            | where ObjectName == 'K8SContainer'
+            | where CounterName == usageCounterName
+            | project Computer, InstanceName, UsageValue = CounterValue, TimeGenerated
+        ) on Computer, InstanceName
+        | where TimeGenerated >= LimitStartTime and TimeGenerated < LimitEndTime
+        | project Computer, ControllerName, ContainerName, TimeGenerated, UsagePercent = UsageValue * 100.0 / LimitValue
+        | summarize AggValue = avg(UsagePercent) by bin(TimeGenerated, trendBinSize) , ContainerName, ControllerName
+      KQL
+      severity                = 2
+      window_duration         = "PT15M"
+      evaluation_frequency    = "PT5M"
+      operator                = "GreaterThan"
+      threshold               = 90
+      time_aggregation_method = "Average"
+      metric_measure_column   = "AggValue"
       dimension = [
         {
-          name     = "Kubernetes namespace"
+          name     = "ControllerName"
           operator = "Include"
           values   = ["*"]
-        },
+        }
       ]
+      minimum_failing_periods_to_trigger_alert = 1
+      number_of_evaluation_periods             = 1
+      auto_mitigation_enabled                  = true
     }
   }
 }
diff --git a/src/aks-platform/README.md b/src/aks-platform/README.md
index 8b23a906b..a2101f84c 100644
--- a/src/aks-platform/README.md
+++ b/src/aks-platform/README.md
@@ -15,7 +15,7 @@
 
 | Name | Source | Version |
 |------|--------|---------|
-| [aks](#module\_aks) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_cluster | v8.54.0 |
+| [aks](#module\_aks) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_cluster | v8.58.0 |
 | [aks\_snet](#module\_aks\_snet) | git::https://github.com/pagopa/terraform-azurerm-v3.git//subnet | v8.53.0 |
 | [keda\_pod\_identity](#module\_keda\_pod\_identity) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_pod_identity | v8.53.0 |
 | [monitoring\_pod\_identity](#module\_monitoring\_pod\_identity) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_pod_identity | v8.53.0 |