diff --git a/src/aks-leonardo/00_monitor.tf b/src/aks-leonardo/00_monitor.tf
index 4acef53a6..1a1772d5c 100644
--- a/src/aks-leonardo/00_monitor.tf
+++ b/src/aks-leonardo/00_monitor.tf
@@ -43,3 +43,9 @@ data "azurerm_monitor_action_group" "email" {
   name                = local.monitor_action_group_email_name
 }
 
+# Opsgenie action group, looked up only when env_short is "p" (count is 0 otherwise).
+data "azurerm_monitor_action_group" "opsgenie" {
+  count               = var.env_short == "p" ? 1 : 0
+  resource_group_name = var.monitor_resource_group_name
+  name                = local.monitor_action_group_opsgenie_name
+}
diff --git a/src/aks-leonardo/03_aks_0.tf b/src/aks-leonardo/03_aks_0.tf
index fe2bf46cf..f7359b2fb 100644
--- a/src/aks-leonardo/03_aks_0.tf
+++ b/src/aks-leonardo/03_aks_0.tf
@@ -6,7 +6,7 @@ resource "azurerm_resource_group" "rg_aks" {
 }
 
 module "aks_leonardo" {
-  source = "git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_cluster?ref=v8.55.0"
+  source = "git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_cluster?ref=v8.58.0"
 
   name     = local.aks_cluster_name
   location = var.location
@@ -56,19 +56,30 @@ module "aks_leonardo" {
   addon_azure_key_vault_secrets_provider_enabled = true
   addon_azure_pod_identity_enabled               = true
 
-  alerts_enabled       = var.aks_alerts_enabled
-  custom_metric_alerts = local.aks_metrics_alerts
-
-  action = [
-    {
-      action_group_id    = data.azurerm_monitor_action_group.slack.id
-      webhook_properties = null
-    },
-    {
-      action_group_id    = data.azurerm_monitor_action_group.email.id
-      webhook_properties = null
-    }
-  ]
+  alerts_enabled = var.aks_alerts_enabled
+  # custom_metric_alerts = local.aks_metrics_alerts
+  custom_logs_alerts = local.aks_logs_alerts
+
+  # Slack and email are always notified. Opsgenie is appended under the same
+  # condition that gates its data source (env_short == "p"), so index [0] always exists.
+  action = flatten([
+    [
+      {
+        action_group_id    = data.azurerm_monitor_action_group.slack.id
+        webhook_properties = null
+      },
+      {
+        action_group_id    = data.azurerm_monitor_action_group.email.id
+        webhook_properties = null
+      }
+    ],
+    (var.env_short == "p" ? [
+      {
+        action_group_id    = data.azurerm_monitor_action_group.opsgenie[0].id
+        webhook_properties = null
+      }
+    ] : [])
+  ])
 
   microsoft_defender_log_analytics_workspace_id = var.env == "prod" ? data.azurerm_log_analytics_workspace.log_analytics_italy.id : null
 
diff --git a/src/aks-leonardo/99_locals.tf b/src/aks-leonardo/99_locals.tf
index 3c209339f..517576d3f 100644
--- a/src/aks-leonardo/99_locals.tf
+++ b/src/aks-leonardo/99_locals.tf
@@ -26,179 +26,261 @@ locals {
   monitor_log_analytics_workspace_name = var.env_short == "d" ? "${local.product}-law" : "${local.product_location}-core-law"
   monitor_appinsights_name             = var.env_short == "d" ? "${local.product}-appinsights" : "${local.product_location}-core-appinsights"
 
-  monitor_action_group_slack_name = "SlackPagoPA"
-  monitor_action_group_email_name = "PagoPA"
-  alert_action_group_ita_name     = "${var.prefix}${var.env_short}ita"
-  alert_action_group_error_name   = "${var.prefix}${var.env_short}error"
+  monitor_action_group_slack_name    = "SlackPagoPA"
+  monitor_action_group_email_name    = "PagoPA"
+  monitor_action_group_opsgenie_name = "InfraOpsgenie"
+  alert_action_group_ita_name        = "${var.prefix}${var.env_short}ita"
+  alert_action_group_error_name      = "${var.prefix}${var.env_short}error"
 
   kv_italy_name    = "pagopa-${var.env_short}-itn-core-kv"
   kv_italy_rg_name = "pagopa-${var.env_short}-itn-core-sec-rg"
 
-  aks_metrics_alerts = {
-    node_cpu = {
-      aggregation      = "Average"
-      metric_namespace = "Insights.Container/nodes"
-      metric_name      = "cpuUsagePercentage"
-      operator         = "GreaterThan"
-      threshold        = 80
-      frequency        = "PT15M"
-      window_size      = "PT1H"
-      dimension = [
-        {
-          name     = "host"
-          operator = "Include"
-          values   = ["*"]
-        }
-      ],
-    }
-    node_memory = {
-      aggregation      = "Average"
-      metric_namespace = "Insights.Container/nodes"
-      metric_name      = "memoryWorkingSetPercentage"
-      operator         = "GreaterThan"
-      threshold        = 80
-      frequency        = "PT15M"
-      window_size      = "PT1H"
-      dimension = [
-        {
-          name     = "host"
-          operator = "Include"
-          values   = ["*"]
-        }
-      ],
-    }
-    node_disk = {
-      aggregation      = "Average"
-      metric_namespace = "Insights.Container/nodes"
-      metric_name      = "DiskUsedPercentage"
-      operator         = "GreaterThan"
-      threshold        = 80
-      frequency        = "PT15M"
-      window_size      = "PT1H"
-      dimension = [
-        {
-          name     = "host"
-          operator = "Include"
-          values   = ["*"]
-        },
-        {
-          name     = "device"
-          operator = "Include"
-          values   = ["*"]
-        }
-      ],
-    }
-    node_not_ready = {
-      aggregation      = "Average"
-      metric_namespace = "Insights.Container/nodes"
-      metric_name      = "nodesCount"
-      operator         = "GreaterThan"
-      threshold        = 0
-      frequency        = "PT15M"
-      window_size      = "PT1H"
-      dimension = [
-        {
-          name     = "status"
-          operator = "Include"
-          values   = ["NotReady"]
-        }
-      ],
-    }
+  # Log-query (KQL) alert definitions, passed to module.aks_leonardo as custom_logs_alerts.
+  aks_logs_alerts = {
     pods_failed = {
-      aggregation      = "Average"
-      metric_namespace = "Insights.Container/pods"
-      metric_name      = "podCount"
-      operator         = "GreaterThan"
-      threshold        = 0
-      frequency        = "PT15M"
-      window_size      = "PT1H"
+      display_name            = "${module.aks_leonardo.name}-POD-FAILED"
+      description             = "Detect if there is any pod failed"
+      query                   = <<-KQL
+        KubePodInventory
+        | where TimeGenerated > ago(15m)
+        | where PodStatus == "Failed"
+        | project TimeGenerated, ClusterName, Namespace, Name, PodStatus
+        | summarize count() by PodStatus, Namespace
+      KQL
+      severity                = 1
+      window_duration         = "PT30M"
+      evaluation_frequency    = "PT15M"
+      operator                = "GreaterThan"
+      threshold               = 1
+      time_aggregation_method = "Average"
+      resource_id_column      = "PodStatus"
+      metric_measure_column   = "count_"
       dimension = [
         {
-          name     = "phase"
+          name     = "Namespace"
           operator = "Include"
-          values   = ["Failed"]
+          values   = ["*"]
         }
       ]
+      minimum_failing_periods_to_trigger_alert = 1
+      number_of_evaluation_periods             = 1
+      auto_mitigation_enabled                  = true
+      skip_query_validation                    = true
+
     }
     pods_ready = {
-      aggregation      = "Average"
-      metric_namespace = "Insights.Container/pods"
-      metric_name      = "PodReadyPercentage"
-      operator         = "LessThan"
-      threshold        = 80
-      frequency        = "PT15M"
-      window_size      = "PT1H"
-      dimension = [
-        {
-          name     = "Kubernetes namespace"
-          operator = "Include"
-          values   = ["*"]
-        }
-      ]
+      display_name = "${module.aks_leonardo.name}-POD-READY"
+      description  = "Detect if the percentage of running pods falls below the defined threshold"
+      # The query only computes the percentage; the LessThan/threshold pair
+      # below is what decides when the alert fires.
+      query                                    = <<-KQL
+        KubePodInventory
+        | where TimeGenerated > ago(15m)
+        | summarize TotalPodCount = count(), RunningPodCount = countif(PodStatus == "Running")
+        | extend RunningPodPercentage = (todouble(RunningPodCount) / todouble(TotalPodCount)) * 100
+        | project RunningPodPercentage, TotalPodCount, RunningPodCount
+        | summarize any(RunningPodPercentage)
+      KQL
+      severity                                 = 1
+      window_duration                          = "PT30M"
+      evaluation_frequency                     = "PT15M"
+      operator                                 = "LessThan"
+      threshold                                = 80
+      time_aggregation_method                  = "Average"
+      resource_id_column                       = "RunningPodPercentage"
+      metric_measure_column                    = "any_RunningPodPercentage"
+      dimension                                = []
+      minimum_failing_periods_to_trigger_alert = 1
+      number_of_evaluation_periods             = 1
+      auto_mitigation_enabled                  = true
+      skip_query_validation                    = true
     }
-    container_cpu = {
-      aggregation      = "Average"
-      metric_namespace = "Insights.Container/containers"
-      metric_name      = "cpuExceededPercentage"
-      operator         = "GreaterThan"
-      threshold        = 95
-      frequency        = "PT15M"
-      window_size      = "PT1H"
+    pods_oomkilled = {
+      display_name = "${module.aks_leonardo.name}-POD-OOMKILLED"
+      description  = "Detect if any pod is OOMKilled"
+      # NOTE(review): KQL `!=` is case-sensitive and the other queries here match
+      # PodStatus "Running"/"Failed", so `!= "running"` probably filters nothing - confirm intent.
+      query                   = <<-KQL
+        KubePodInventory
+        | where PodStatus != "running"
+        | extend ContainerLastStatusJSON = parse_json(ContainerLastStatus)
+        | extend FinishedAt = todatetime(ContainerLastStatusJSON.finishedAt)
+        | where ContainerLastStatusJSON.reason == "OOMKilled"
+        | distinct PodUid, Namespace, ControllerName, ContainerLastStatus, FinishedAt
+        | order by FinishedAt asc
+      KQL
+      severity                = 3
+      window_duration         = "PT15M"
+      evaluation_frequency    = "PT5M"
+      operator                = "GreaterThan"
+      threshold               = 1
+      time_aggregation_method = "Count"
+      resource_id_column      = "ControllerName"
+      metric_measure_column   = null
       dimension = [
         {
-          name     = "Kubernetes namespace"
+          name     = "ControllerName"
           operator = "Include"
           values   = ["*"]
         },
+        {
+          name     = "Namespace"
+          operator = "Exclude"
+          values = [
+            "kube-system",
+            "default"
+          ]
+        }
       ]
+      minimum_failing_periods_to_trigger_alert = 1
+      number_of_evaluation_periods             = 1
+      auto_mitigation_enabled                  = true
+      skip_query_validation                    = true
     }
-    container_memory = {
-      aggregation      = "Average"
-      metric_namespace = "Insights.Container/containers"
-      metric_name      = "memoryWorkingSetExceededPercentage"
-      operator         = "GreaterThan"
-      threshold        = 95
-      frequency        = "PT15M"
-      window_size      = "PT1H"
+    pods_restart = {
+      display_name            = "${module.aks_leonardo.name}-POD-RESTART-COUNT"
+      description             = "Detect if any pod was restarted abnormally"
+      query                   = <<-KQL
+        KubePodInventory
+        | where ContainerRestartCount > 0
+        | summarize RestartCount = sum(ContainerRestartCount) by bin(TimeGenerated, 1m), Namespace, Name, _ResourceId
+        | where RestartCount > 0
+        | project TimeGenerated, Namespace, Name, RestartCount, _ResourceId
+        | summarize any(RestartCount) by Namespace
+      KQL
+      severity                = 2
+      window_duration         = "PT30M"
+      evaluation_frequency    = "PT15M"
+      operator                = "GreaterThan"
+      threshold               = 5
+      time_aggregation_method = "Average"
+      resource_id_column      = "RestartCount"
+      metric_measure_column   = "any_RestartCount"
       dimension = [
         {
-          name     = "Kubernetes namespace"
-          operator = "Include"
-          values   = ["*"]
+          name     = "Namespace"
+          operator = "Exclude"
+          values = [
+            "kube-system",
+            "default"
+          ]
         },
       ]
+      minimum_failing_periods_to_trigger_alert = 1
+      number_of_evaluation_periods             = 1
+      auto_mitigation_enabled                  = true
+      skip_query_validation                    = true
     }
-    container_oom = {
-      aggregation      = "Average"
-      metric_namespace = "Insights.Container/pods"
-      metric_name      = "oomKilledContainerCount"
-      operator         = "GreaterThan"
-      threshold        = 0
-      frequency        = "PT15M"
-      window_size      = "PT1H"
+    pods_cpu = {
+      display_name            = "${module.aks_leonardo.name}-POD-CPU-USAGE"
+      description             = "Detect if any pod has High CPU Usage"
+      query                   = <<-KQL
+        let endDateTime = now();
+        let startDateTime = ago(1h);
+        let trendBinSize = 1m;
+        let capacityCounterName = 'cpuLimitNanoCores';
+        let usageCounterName = 'cpuUsageNanoCores';
+        let clusterName = '${module.aks_leonardo.name}';
+        KubePodInventory
+        | where TimeGenerated < endDateTime
+        | where TimeGenerated >= startDateTime
+        | where ClusterName == clusterName
+        | extend InstanceName = strcat(ClusterId, '/', ContainerName)
+        | distinct Computer, InstanceName, ContainerName, ControllerName
+        | join hint.strategy=shuffle (
+            Perf
+            | where TimeGenerated < endDateTime
+            | where TimeGenerated >= startDateTime
+            | where ObjectName == 'K8SContainer'
+            | where CounterName == capacityCounterName
+            | summarize LimitValue = max(CounterValue) by Computer, InstanceName, bin(TimeGenerated, trendBinSize)
+            | project Computer, InstanceName, LimitStartTime = TimeGenerated, LimitEndTime = TimeGenerated + trendBinSize, LimitValue
+        ) on Computer, InstanceName
+        | join kind=inner hint.strategy=shuffle (
+            Perf
+            | where TimeGenerated < endDateTime + trendBinSize
+            | where TimeGenerated >= startDateTime - trendBinSize
+            | where ObjectName == 'K8SContainer'
+            | where CounterName == usageCounterName
+            | project Computer, InstanceName, UsageValue = CounterValue, TimeGenerated
+        ) on Computer, InstanceName
+        | where TimeGenerated >= LimitStartTime and TimeGenerated < LimitEndTime
+        | project Computer, ControllerName, ContainerName, TimeGenerated, UsagePercent = UsageValue * 100.0 / LimitValue
+        | summarize AggValue = avg(UsagePercent) by bin(TimeGenerated, trendBinSize) , ContainerName, ControllerName
+      KQL
+      severity                = 2
+      window_duration         = "PT15M"
+      evaluation_frequency    = "PT5M"
+      operator                = "GreaterThan"
+      threshold               = 95
+      time_aggregation_method = "Average"
+      metric_measure_column   = "AggValue"
       dimension = [
         {
-          name     = "Kubernetes namespace"
+          name     = "ControllerName"
           operator = "Include"
           values   = ["*"]
-        },
+        }
       ]
+      minimum_failing_periods_to_trigger_alert = 1
+      number_of_evaluation_periods             = 1
+      auto_mitigation_enabled                  = true
     }
-    container_restart = {
-      aggregation      = "Average"
-      metric_namespace = "Insights.Container/pods"
-      metric_name      = "restartingContainerCount"
-      operator         = "GreaterThan"
-      threshold        = 0
-      frequency        = "PT15M"
-      window_size      = "PT1H"
+    pods_memory = {
+      display_name            = "${module.aks_leonardo.name}-POD-MEM-USAGE"
+      description             = "Detect if any pod has High Memory Usage"
+      query                   = <<-KQL
+        let endDateTime = now();
+        let startDateTime = ago(1h);
+        let trendBinSize = 1m;
+        let capacityCounterName = 'memoryLimitBytes';
+        let usageCounterName = 'memoryRssBytes';
+        let clusterName = '${module.aks_leonardo.name}';
+        KubePodInventory
+        | where TimeGenerated < endDateTime
+        | where TimeGenerated >= startDateTime
+        | where ClusterName == clusterName
+        | extend InstanceName = strcat(ClusterId, '/', ContainerName)
+        | where ContainerName !contains "microservice-chart"
+        | distinct Computer, InstanceName, ContainerName, ControllerName
+        | join hint.strategy=shuffle (
+            Perf
+            | where TimeGenerated < endDateTime
+            | where TimeGenerated >= startDateTime
+            | where ObjectName == 'K8SContainer'
+            | where CounterName == capacityCounterName
+            | summarize LimitValue = max(CounterValue) by Computer, InstanceName, bin(TimeGenerated, trendBinSize)
+            | project Computer, InstanceName, LimitStartTime = TimeGenerated, LimitEndTime = TimeGenerated + trendBinSize, LimitValue
+        ) on Computer, InstanceName
+        | join kind=inner hint.strategy=shuffle (
+            Perf
+            | where TimeGenerated < endDateTime + trendBinSize
+            | where TimeGenerated >= startDateTime - trendBinSize
+            | where ObjectName == 'K8SContainer'
+            | where CounterName == usageCounterName
+            | project Computer, InstanceName, UsageValue = CounterValue, TimeGenerated
+        ) on Computer, InstanceName
+        | where TimeGenerated >= LimitStartTime and TimeGenerated < LimitEndTime
+        | project Computer, ControllerName, ContainerName, TimeGenerated, UsagePercent = UsageValue * 100.0 / LimitValue
+        | summarize AggValue = avg(UsagePercent) by bin(TimeGenerated, trendBinSize) , ContainerName, ControllerName
+      KQL
+      severity                = 2
+      window_duration         = "PT15M"
+      evaluation_frequency    = "PT5M"
+      operator                = "GreaterThan"
+      threshold               = 90
+      time_aggregation_method = "Average"
+      metric_measure_column   = "AggValue"
       dimension = [
         {
-          name     = "Kubernetes namespace"
+          name     = "ControllerName"
           operator = "Include"
           values   = ["*"]
-        },
+        }
       ]
+      minimum_failing_periods_to_trigger_alert = 1
+      number_of_evaluation_periods             = 1
+      auto_mitigation_enabled                  = true
     }
   }
 }
diff --git a/src/aks-leonardo/README.md b/src/aks-leonardo/README.md
index 2eaa9b7ca..e54260d24 100644
--- a/src/aks-leonardo/README.md
+++ b/src/aks-leonardo/README.md
@@ -40,7 +40,7 @@ Re-enable all the resource, commented before to complete the procedure
 
 | Name | Source | Version |
 |------|--------|---------|
-| [aks\_leonardo](#module\_aks\_leonardo) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_cluster | v8.55.0 |
+| [aks\_leonardo](#module\_aks\_leonardo) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_cluster | v8.58.0 |
 | [aks\_prometheus\_install](#module\_aks\_prometheus\_install) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_prometheus_install | v8.17.1 |
 | [aks\_storage\_class](#module\_aks\_storage\_class) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_storage_class | v8.17.1 |
 | [elastic\_agent](#module\_elastic\_agent) | git::https://github.com/pagopa/terraform-azurerm-v3.git//elastic_agent | v8.50.0 |
@@ -90,6 +90,7 @@ Re-enable all the resource, commented before to complete the procedure
 | [azurerm_log_analytics_workspace.log_analytics](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/log_analytics_workspace) | data source |
 | [azurerm_log_analytics_workspace.log_analytics_italy](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/log_analytics_workspace) | data source |
 | [azurerm_monitor_action_group.email](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/monitor_action_group) | data source |
+| [azurerm_monitor_action_group.opsgenie](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/monitor_action_group) | data source |
 | [azurerm_monitor_action_group.slack](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/monitor_action_group) | data source |
 | [azurerm_public_ip.pip_aks_outboud](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/public_ip) | data source |
 | [azurerm_resource_group.monitor_italy_rg](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/resource_group) | data source |