diff --git a/src/aks-leonardo/00_monitor.tf b/src/aks-leonardo/00_monitor.tf
index 4acef53a6..1a1772d5c 100644
--- a/src/aks-leonardo/00_monitor.tf
+++ b/src/aks-leonardo/00_monitor.tf
@@ -43,3 +43,8 @@ data "azurerm_monitor_action_group" "email" {
name = local.monitor_action_group_email_name
}
+# Opsgenie action group lookup. It is only performed when var.env_short is "p";
+# in every other environment the data source has zero instances, so any
+# reference to it must tolerate an empty list.
+data "azurerm_monitor_action_group" "opsgenie" {
+ count = var.env_short == "p" ? 1 : 0
+ resource_group_name = var.monitor_resource_group_name
+ name = local.monitor_action_group_opsgenie_name
+}
diff --git a/src/aks-leonardo/03_aks_0.tf b/src/aks-leonardo/03_aks_0.tf
index fe2bf46cf..f7359b2fb 100644
--- a/src/aks-leonardo/03_aks_0.tf
+++ b/src/aks-leonardo/03_aks_0.tf
@@ -6,7 +6,7 @@ resource "azurerm_resource_group" "rg_aks" {
}
module "aks_leonardo" {
- source = "git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_cluster?ref=v8.55.0"
+ source = "git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_cluster?ref=v8.58.0"
name = local.aks_cluster_name
location = var.location
@@ -56,19 +56,28 @@ module "aks_leonardo" {
addon_azure_key_vault_secrets_provider_enabled = true
addon_azure_pod_identity_enabled = true
- alerts_enabled = var.aks_alerts_enabled
- custom_metric_alerts = local.aks_metrics_alerts
-
- action = [
- {
- action_group_id = data.azurerm_monitor_action_group.slack.id
- webhook_properties = null
- },
- {
- action_group_id = data.azurerm_monitor_action_group.email.id
- webhook_properties = null
- }
- ]
+ alerts_enabled = var.aks_alerts_enabled
+ # custom_metric_alerts is no longer passed: the metric-based map
+ # (local.aks_metrics_alerts) was replaced by the log-query based rules below.
+ custom_logs_alerts = local.aks_logs_alerts
+
+ # Slack and email are always notified. The Opsgenie entry is built from the
+ # data source instances themselves, so it is present exactly when that data
+ # source exists (count = 1) and an empty list is never indexed, even if the
+ # env / env_short variables were to disagree.
+ action = flatten([
+   [
+     {
+       action_group_id    = data.azurerm_monitor_action_group.slack.id
+       webhook_properties = null
+     },
+     {
+       action_group_id    = data.azurerm_monitor_action_group.email.id
+       webhook_properties = null
+     }
+   ],
+   [
+     for opsgenie in data.azurerm_monitor_action_group.opsgenie : {
+       action_group_id    = opsgenie.id
+       webhook_properties = null
+     }
+   ]
+ ])
microsoft_defender_log_analytics_workspace_id = var.env == "prod" ? data.azurerm_log_analytics_workspace.log_analytics_italy.id : null
diff --git a/src/aks-leonardo/99_locals.tf b/src/aks-leonardo/99_locals.tf
index 3c209339f..517576d3f 100644
--- a/src/aks-leonardo/99_locals.tf
+++ b/src/aks-leonardo/99_locals.tf
@@ -26,179 +26,257 @@ locals {
monitor_log_analytics_workspace_name = var.env_short == "d" ? "${local.product}-law" : "${local.product_location}-core-law"
monitor_appinsights_name = var.env_short == "d" ? "${local.product}-appinsights" : "${local.product_location}-core-appinsights"
- monitor_action_group_slack_name = "SlackPagoPA"
- monitor_action_group_email_name = "PagoPA"
- alert_action_group_ita_name = "${var.prefix}${var.env_short}ita"
- alert_action_group_error_name = "${var.prefix}${var.env_short}error"
+ monitor_action_group_slack_name = "SlackPagoPA"
+ monitor_action_group_email_name = "PagoPA"
+ monitor_action_group_opsgenie_name = "InfraOpsgenie"
+ alert_action_group_ita_name = "${var.prefix}${var.env_short}ita"
+ alert_action_group_error_name = "${var.prefix}${var.env_short}error"
kv_italy_name = "pagopa-${var.env_short}-itn-core-kv"
kv_italy_rg_name = "pagopa-${var.env_short}-itn-core-sec-rg"
- aks_metrics_alerts = {
- node_cpu = {
- aggregation = "Average"
- metric_namespace = "Insights.Container/nodes"
- metric_name = "cpuUsagePercentage"
- operator = "GreaterThan"
- threshold = 80
- frequency = "PT15M"
- window_size = "PT1H"
- dimension = [
- {
- name = "host"
- operator = "Include"
- values = ["*"]
- }
- ],
- }
- node_memory = {
- aggregation = "Average"
- metric_namespace = "Insights.Container/nodes"
- metric_name = "memoryWorkingSetPercentage"
- operator = "GreaterThan"
- threshold = 80
- frequency = "PT15M"
- window_size = "PT1H"
- dimension = [
- {
- name = "host"
- operator = "Include"
- values = ["*"]
- }
- ],
- }
- node_disk = {
- aggregation = "Average"
- metric_namespace = "Insights.Container/nodes"
- metric_name = "DiskUsedPercentage"
- operator = "GreaterThan"
- threshold = 80
- frequency = "PT15M"
- window_size = "PT1H"
- dimension = [
- {
- name = "host"
- operator = "Include"
- values = ["*"]
- },
- {
- name = "device"
- operator = "Include"
- values = ["*"]
- }
- ],
- }
- node_not_ready = {
- aggregation = "Average"
- metric_namespace = "Insights.Container/nodes"
- metric_name = "nodesCount"
- operator = "GreaterThan"
- threshold = 0
- frequency = "PT15M"
- window_size = "PT1H"
- dimension = [
- {
- name = "status"
- operator = "Include"
- values = ["NotReady"]
- }
- ],
- }
+ aks_logs_alerts = {
pods_failed = {
- aggregation = "Average"
- metric_namespace = "Insights.Container/pods"
- metric_name = "podCount"
- operator = "GreaterThan"
- threshold = 0
- frequency = "PT15M"
- window_size = "PT1H"
+ # Log alert on pods reported as Failed.
+ # The query counts KubePodInventory rows (not distinct pods) with
+ # PodStatus == "Failed" in the last 15 minutes, grouped by PodStatus and Namespace.
+ display_name = "${module.aks_leonardo.name}-POD-FAILED"
+ description = "Detect if there is any pod failed"
+ query = <<-KQL
+ KubePodInventory
+ | where TimeGenerated > ago(15m)
+ | where PodStatus == "Failed"
+ | project TimeGenerated, ClusterName, Namespace, Name, PodStatus
+ | summarize count() by PodStatus, Namespace
+ KQL
+ severity = 1
+ window_duration = "PT30M"
+ evaluation_frequency = "PT15M"
+ operator = "GreaterThan"
+ # NOTE(review): fires only when the measured value is strictly greater than 1;
+ # confirm this matches the "any pod failed" intent in the description.
+ threshold = 1
+ time_aggregation_method = "Average"
+ # NOTE(review): "PodStatus" holds a status string, not an Azure resource ID;
+ # verify what the module expects for resource_id_column.
+ resource_id_column = "PodStatus"
+ # "count_" is the column name produced by the unnamed count() in the query.
+ metric_measure_column = "count_"
dimension = [
{
- name = "phase"
+ name = "Namespace"
operator = "Include"
- values = ["Failed"]
+ values = ["*"]
}
]
+ minimum_failing_periods_to_trigger_alert = 1
+ number_of_evaluation_periods = 1
+ auto_mitigation_enabled = true
+ skip_query_validation = true
+
}
pods_ready = {
- aggregation = "Average"
- metric_namespace = "Insights.Container/pods"
- metric_name = "PodReadyPercentage"
- operator = "LessThan"
- threshold = 80
- frequency = "PT15M"
- window_size = "PT1H"
- dimension = [
- {
- name = "Kubernetes namespace"
- operator = "Include"
- values = ["*"]
- }
- ]
+ # Log alert on the share of pods in "Running" state over the last 15 minutes.
+ display_name = "${module.aks_leonardo.name}-POD-READY"
+ description = "Detect when the percentage of running pods drops below the defined threshold"
+ # The query must return the percentage unconditionally: the comparison with
+ # `threshold` is performed by the alert rule (operator = LessThan). Keeping only
+ # rows above 80 inside the query would leave nothing for a "LessThan 80" rule
+ # to match, so the alert could never fire.
+ query = <<-KQL
+ KubePodInventory
+ | where TimeGenerated > ago(15m)
+ | summarize TotalPodCount = count(), RunningPodCount = countif(PodStatus == "Running")
+ | extend RunningPodPercentage = (todouble(RunningPodCount) / todouble(TotalPodCount)) * 100
+ | project RunningPodPercentage, TotalPodCount, RunningPodCount
+ | summarize any(RunningPodPercentage)
+ KQL
+ severity = 1
+ window_duration = "PT30M"
+ evaluation_frequency = "PT15M"
+ operator = "LessThan"
+ threshold = 80
+ time_aggregation_method = "Average"
+ # NOTE(review): this column holds a percentage, not an Azure resource ID, and
+ # is renamed by the final summarize; verify what the module expects here.
+ resource_id_column = "RunningPodPercentage"
+ # Column name produced by the unnamed any(RunningPodPercentage) aggregation.
+ metric_measure_column = "any_RunningPodPercentage"
+ dimension = []
+ minimum_failing_periods_to_trigger_alert = 1
+ number_of_evaluation_periods = 1
+ auto_mitigation_enabled = true
+ skip_query_validation = true
}
- container_cpu = {
- aggregation = "Average"
- metric_namespace = "Insights.Container/containers"
- metric_name = "cpuExceededPercentage"
- operator = "GreaterThan"
- threshold = 95
- frequency = "PT15M"
- window_size = "PT1H"
+ pods_oomkilled = {
+ # Log alert on containers whose last recorded termination reason is "OOMKilled".
+ # Display name spelling fixed (was "OMMKILLED") so the rule can be found by
+ # searching for the standard term.
+ display_name = "${module.aks_leonardo.name}-POD-OOMKILLED"
+ description = "Detect if any pod is OOMKilled"
+ # NOTE(review): KQL `!=` is case-sensitive and the other queries in this map
+ # compare against "Running", so `PodStatus != "running"` presumably filters
+ # nothing. Left unchanged on purpose; confirm whether running pods should be
+ # excluded before altering it.
+ query = <<-KQL
+ KubePodInventory
+ | where PodStatus != "running"
+ | extend ContainerLastStatusJSON = parse_json(ContainerLastStatus)
+ | extend FinishedAt = todatetime(ContainerLastStatusJSON.finishedAt)
+ | where ContainerLastStatusJSON.reason == "OOMKilled"
+ | distinct PodUid, Namespace, ControllerName, ContainerLastStatus, FinishedAt
+ | order by FinishedAt asc
+ KQL
+ severity = 3
+ window_duration = "PT15M"
+ evaluation_frequency = "PT5M"
+ operator = "GreaterThan"
+ # NOTE(review): with a row Count, "GreaterThan 1" needs at least two distinct
+ # rows in the window; confirm this matches the "any pod" intent.
+ threshold = 1
+ time_aggregation_method = "Count"
+ # NOTE(review): ControllerName is a controller name, not an Azure resource ID;
+ # verify what the module expects for resource_id_column.
+ resource_id_column = "ControllerName"
+ # No measure column: the rule counts result rows.
+ metric_measure_column = null
dimension = [
{
- name = "Kubernetes namespace"
+ name = "ControllerName"
operator = "Include"
values = ["*"]
},
+ {
+ name = "Namespace"
+ operator = "Exclude"
+ values = [
+ "kube-system",
+ "default"
+ ]
+ }
]
+ minimum_failing_periods_to_trigger_alert = 1
+ number_of_evaluation_periods = 1
+ auto_mitigation_enabled = true
+ skip_query_validation = true
}
- container_memory = {
- aggregation = "Average"
- metric_namespace = "Insights.Container/containers"
- metric_name = "memoryWorkingSetExceededPercentage"
- operator = "GreaterThan"
- threshold = 95
- frequency = "PT15M"
- window_size = "PT1H"
+ pods_restart = {
+ # Log alert on container restarts.
+ # The query sums ContainerRestartCount per pod in 1-minute bins, then keeps one
+ # arbitrary (any) RestartCount value per Namespace as the measured value.
+ display_name = "${module.aks_leonardo.name}-POD-RESTART-COUNT"
+ description = "Detect if any pod was restarted abnormally"
+ query = <<-KQL
+ KubePodInventory
+ | where ContainerRestartCount > 0
+ | summarize RestartCount = sum(ContainerRestartCount) by bin(TimeGenerated, 1m), Namespace, Name, _ResourceId
+ | where RestartCount > 0
+ | project TimeGenerated, Namespace, Name, RestartCount, _ResourceId
+ | summarize any(RestartCount) by Namespace
+ KQL
+ severity = 2
+ window_duration = "PT30M"
+ evaluation_frequency = "PT15M"
+ operator = "GreaterThan"
+ threshold = 5
+ time_aggregation_method = "Average"
+ # NOTE(review): the final summarize outputs only Namespace and any_RestartCount,
+ # so a column named "RestartCount" is not in the result; verify what the module
+ # expects for resource_id_column.
+ resource_id_column = "RestartCount"
+ # Column name produced by the unnamed any(RestartCount) aggregation.
+ metric_measure_column = "any_RestartCount"
dimension = [
{
- name = "Kubernetes namespace"
- operator = "Include"
- values = ["*"]
+ name = "Namespace"
+ operator = "Exclude"
+ values = [
+ "kube-system",
+ "default"
+ ]
},
]
+ minimum_failing_periods_to_trigger_alert = 1
+ number_of_evaluation_periods = 1
+ auto_mitigation_enabled = true
+ skip_query_validation = true
}
- container_oom = {
- aggregation = "Average"
- metric_namespace = "Insights.Container/pods"
- metric_name = "oomKilledContainerCount"
- operator = "GreaterThan"
- threshold = 0
- frequency = "PT15M"
- window_size = "PT1H"
+ pods_cpu = {
+ # Log alert on container CPU usage relative to its CPU limit.
+ # The query joins KubePodInventory with the Perf counters cpuLimitNanoCores and
+ # cpuUsageNanoCores for this cluster, computes usage as a percentage of the
+ # limit, and averages it per 1-minute bin, ContainerName and ControllerName.
+ # NOTE(review): unlike the sibling entries above, this one sets neither
+ # resource_id_column nor skip_query_validation; confirm the module treats
+ # both as optional.
+ display_name = "${module.aks_leonardo.name}-POD-CPU-USAGE"
+ description = "Detect if any pod has High CPU Usage"
+ # NOTE(review): the query looks back 1h (startDateTime) while window_duration
+ # below is PT15M; confirm which range is effectively evaluated.
+ query = <<-KQL
+ let endDateTime = now();
+ let startDateTime = ago(1h);
+ let trendBinSize = 1m;
+ let capacityCounterName = 'cpuLimitNanoCores';
+ let usageCounterName = 'cpuUsageNanoCores';
+ let clusterName = '${module.aks_leonardo.name}';
+ KubePodInventory
+ | where TimeGenerated < endDateTime
+ | where TimeGenerated >= startDateTime
+ | where ClusterName == clusterName
+ | extend InstanceName = strcat(ClusterId, '/', ContainerName)
+ | distinct Computer, InstanceName, ContainerName, ControllerName
+ | join hint.strategy=shuffle (
+ Perf
+ | where TimeGenerated < endDateTime
+ | where TimeGenerated >= startDateTime
+ | where ObjectName == 'K8SContainer'
+ | where CounterName == capacityCounterName
+ | summarize LimitValue = max(CounterValue) by Computer, InstanceName, bin(TimeGenerated, trendBinSize)
+ | project Computer, InstanceName, LimitStartTime = TimeGenerated, LimitEndTime = TimeGenerated + trendBinSize, LimitValue
+ ) on Computer, InstanceName
+ | join kind=inner hint.strategy=shuffle (
+ Perf
+ | where TimeGenerated < endDateTime + trendBinSize
+ | where TimeGenerated >= startDateTime - trendBinSize
+ | where ObjectName == 'K8SContainer'
+ | where CounterName == usageCounterName
+ | project Computer, InstanceName, UsageValue = CounterValue, TimeGenerated
+ ) on Computer, InstanceName
+ | where TimeGenerated >= LimitStartTime and TimeGenerated < LimitEndTime
+ | project Computer, ControllerName, ContainerName, TimeGenerated, UsagePercent = UsageValue * 100.0 / LimitValue
+ | summarize AggValue = avg(UsagePercent) by bin(TimeGenerated, trendBinSize) , ContainerName, ControllerName
+ KQL
+ severity = 2
+ window_duration = "PT15M"
+ evaluation_frequency = "PT5M"
+ operator = "GreaterThan"
+ # Percentage of the container CPU limit (AggValue is usage * 100 / limit).
+ threshold = 95
+ time_aggregation_method = "Average"
+ metric_measure_column = "AggValue"
dimension = [
{
- name = "Kubernetes namespace"
+ name = "ControllerName"
operator = "Include"
values = ["*"]
- },
+ }
]
+ minimum_failing_periods_to_trigger_alert = 1
+ number_of_evaluation_periods = 1
+ auto_mitigation_enabled = true
}
- container_restart = {
- aggregation = "Average"
- metric_namespace = "Insights.Container/pods"
- metric_name = "restartingContainerCount"
- operator = "GreaterThan"
- threshold = 0
- frequency = "PT15M"
- window_size = "PT1H"
+ pods_memory = {
+ # Log alert on container memory usage relative to its memory limit.
+ # The query joins KubePodInventory with the Perf counters memoryLimitBytes and
+ # memoryRssBytes for this cluster, computes usage as a percentage of the limit,
+ # and averages it per 1-minute bin, ContainerName and ControllerName.
+ # Containers whose name contains "microservice-chart" are excluded by the query.
+ # NOTE(review): unlike the sibling entries above, this one sets neither
+ # resource_id_column nor skip_query_validation; confirm the module treats
+ # both as optional.
+ display_name = "${module.aks_leonardo.name}-POD-MEM-USAGE"
+ description = "Detect if any pod has High Memory Usage"
+ # NOTE(review): the query looks back 1h (startDateTime) while window_duration
+ # below is PT15M; confirm which range is effectively evaluated.
+ query = <<-KQL
+ let endDateTime = now();
+ let startDateTime = ago(1h);
+ let trendBinSize = 1m;
+ let capacityCounterName = 'memoryLimitBytes';
+ let usageCounterName = 'memoryRssBytes';
+ let clusterName = '${module.aks_leonardo.name}';
+ KubePodInventory
+ | where TimeGenerated < endDateTime
+ | where TimeGenerated >= startDateTime
+ | where ClusterName == clusterName
+ | extend InstanceName = strcat(ClusterId, '/', ContainerName)
+ | where ContainerName !contains "microservice-chart"
+ | distinct Computer, InstanceName, ContainerName, ControllerName
+ | join hint.strategy=shuffle (
+ Perf
+ | where TimeGenerated < endDateTime
+ | where TimeGenerated >= startDateTime
+ | where ObjectName == 'K8SContainer'
+ | where CounterName == capacityCounterName
+ | summarize LimitValue = max(CounterValue) by Computer, InstanceName, bin(TimeGenerated, trendBinSize)
+ | project Computer, InstanceName, LimitStartTime = TimeGenerated, LimitEndTime = TimeGenerated + trendBinSize, LimitValue
+ ) on Computer, InstanceName
+ | join kind=inner hint.strategy=shuffle (
+ Perf
+ | where TimeGenerated < endDateTime + trendBinSize
+ | where TimeGenerated >= startDateTime - trendBinSize
+ | where ObjectName == 'K8SContainer'
+ | where CounterName == usageCounterName
+ | project Computer, InstanceName, UsageValue = CounterValue, TimeGenerated
+ ) on Computer, InstanceName
+ | where TimeGenerated >= LimitStartTime and TimeGenerated < LimitEndTime
+ | project Computer, ControllerName, ContainerName, TimeGenerated, UsagePercent = UsageValue * 100.0 / LimitValue
+ | summarize AggValue = avg(UsagePercent) by bin(TimeGenerated, trendBinSize) , ContainerName, ControllerName
+ KQL
+ severity = 2
+ window_duration = "PT15M"
+ evaluation_frequency = "PT5M"
+ operator = "GreaterThan"
+ # Percentage of the container memory limit (AggValue is usage * 100 / limit).
+ threshold = 90
+ time_aggregation_method = "Average"
+ metric_measure_column = "AggValue"
dimension = [
{
- name = "Kubernetes namespace"
+ name = "ControllerName"
operator = "Include"
values = ["*"]
- },
+ }
]
+ minimum_failing_periods_to_trigger_alert = 1
+ number_of_evaluation_periods = 1
+ auto_mitigation_enabled = true
}
}
}
diff --git a/src/aks-leonardo/README.md b/src/aks-leonardo/README.md
index 2eaa9b7ca..e54260d24 100644
--- a/src/aks-leonardo/README.md
+++ b/src/aks-leonardo/README.md
@@ -40,7 +40,7 @@ Re-enable all the resource, commented before to complete the procedure
| Name | Source | Version |
|------|--------|---------|
-| [aks\_leonardo](#module\_aks\_leonardo) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_cluster | v8.55.0 |
+| [aks\_leonardo](#module\_aks\_leonardo) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_cluster | v8.58.0 |
| [aks\_prometheus\_install](#module\_aks\_prometheus\_install) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_prometheus_install | v8.17.1 |
| [aks\_storage\_class](#module\_aks\_storage\_class) | git::https://github.com/pagopa/terraform-azurerm-v3.git//kubernetes_storage_class | v8.17.1 |
| [elastic\_agent](#module\_elastic\_agent) | git::https://github.com/pagopa/terraform-azurerm-v3.git//elastic_agent | v8.50.0 |
@@ -90,6 +90,7 @@ Re-enable all the resource, commented before to complete the procedure
| [azurerm_log_analytics_workspace.log_analytics](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/log_analytics_workspace) | data source |
| [azurerm_log_analytics_workspace.log_analytics_italy](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/log_analytics_workspace) | data source |
| [azurerm_monitor_action_group.email](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/monitor_action_group) | data source |
+| [azurerm_monitor_action_group.opsgenie](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/monitor_action_group) | data source |
| [azurerm_monitor_action_group.slack](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/monitor_action_group) | data source |
| [azurerm_public_ip.pip_aks_outboud](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/public_ip) | data source |
| [azurerm_resource_group.monitor_italy_rg](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/resource_group) | data source |