diff --git a/.github/workflows/aro-hcp-dev-env-cd.yml b/.github/workflows/aro-hcp-dev-env-cd.yml index 32ce86a7b..739ca0dff 100644 --- a/.github/workflows/aro-hcp-dev-env-cd.yml +++ b/.github/workflows/aro-hcp-dev-env-cd.yml @@ -110,6 +110,14 @@ --parameters currentUserId="${GITHUB_ACTOR}" \ --parameters regionalDNSSubdomain="${REGION}" + # metrics infrastructure + az deployment group create \ + --name "metrics-infra-${GITHUB_RUN_ID}" \ + --resource-group "${REGIONAL_RESOURCEGROUP}" \ + --template-file modules/metrics/metrics.bicep \ + --parameters configurations/mvp-metrics.bicepparam \ + --parameters globalResourceGroup="${REGIONAL_RESOURCEGROUP}" + deploy_service_cluster_rg: if: github.event.pull_request.merged == true needs: @@ -146,7 +154,6 @@ --template-file templates/svc-cluster.bicep \ --parameters configurations/mvp-svc-cluster.bicepparam \ --parameters currentUserId="${GITHUB_ACTOR}" \ - --parameters azureMonitorWorkspaceResourceId=$(az monitor account show --resource-group ${REGIONAL_RESOURCEGROUP} --name aro-hcp-monitor --query id -o tsv) \ --parameters regionalResourceGroup="${REGIONAL_RESOURCEGROUP}" SVC_CLUSTER_NAME=$(az deployment group show --resource-group "${SC_RESOURCEGROUP}" --name "svc-cluster-${GITHUB_RUN_ID}" --output tsv --query properties.outputs.aksClusterName.value) @@ -164,6 +171,14 @@ --parameters kvNames="['${SVC_KV_NAME}']" \ --parameters githubActionsPrincipalID=${{ secrets.GHA_PRINCIPAL_ID }} + # enable aks metrics: read the monitor workspace / Grafana IDs from this run's metrics-infra deployment outputs + AZ_MONITOR_RESOURCE_ID=$(az deployment group show --resource-group "${REGIONAL_RESOURCEGROUP}" --name "metrics-infra-${GITHUB_RUN_ID}" --output tsv --query properties.outputs.monitorId.value) + GRAFANA_RESOURCE_ID=$(az deployment group show --resource-group "${REGIONAL_RESOURCEGROUP}" --name "metrics-infra-${GITHUB_RUN_ID}" --output tsv --query properties.outputs.grafanaId.value) + + az aks update --enable-azure-monitor-metrics --name "${SVC_CLUSTER_NAME}" --resource-group "${SC_RESOURCEGROUP}" \ + --azure-monitor-workspace-resource-id 
"${AZ_MONITOR_RESOURCE_ID}" \ + --grafana-resource-id "${GRAFANA_RESOURCE_ID}" + deploy_management_cluster_rg: if: github.event.pull_request.merged == true needs: @@ -200,7 +215,6 @@ --template-file templates/mgmt-cluster.bicep \ --parameters configurations/mvp-mgmt-cluster.bicepparam \ --parameters currentUserId="${GITHUB_ACTOR}" \ - --parameters azureMonitorWorkspaceResourceId=$(az monitor account show --resource-group ${REGIONAL_RESOURCEGROUP} --name aro-hcp-monitor --query id -o tsv) \ --parameters regionalResourceGroup="${REGIONAL_RESOURCEGROUP}" MGMT_CLUSTER_NAME=$(az deployment group show --resource-group "${MC_RESOURCEGROUP}" --name "mgmt-cluster-${GITHUB_RUN_ID}" --output tsv --query properties.outputs.aksClusterName.value) @@ -214,6 +228,14 @@ --parameters grantCosmosAccess=false \ --parameters githubActionsPrincipalID=${{ secrets.GHA_PRINCIPAL_ID }} + # enable aks metrics: read the monitor workspace / Grafana IDs from this run's metrics-infra deployment outputs + AZ_MONITOR_RESOURCE_ID=$(az deployment group show --resource-group "${REGIONAL_RESOURCEGROUP}" --name "metrics-infra-${GITHUB_RUN_ID}" --output tsv --query properties.outputs.monitorId.value) + GRAFANA_RESOURCE_ID=$(az deployment group show --resource-group "${REGIONAL_RESOURCEGROUP}" --name "metrics-infra-${GITHUB_RUN_ID}" --output tsv --query properties.outputs.grafanaId.value) + + az aks update --enable-azure-monitor-metrics --name "${MGMT_CLUSTER_NAME}" --resource-group "${MC_RESOURCEGROUP}" \ + --azure-monitor-workspace-resource-id "${AZ_MONITOR_RESOURCE_ID}" \ + --grafana-resource-id "${GRAFANA_RESOURCE_ID}" + build_push_frontend: permissions: id-token: 'write' diff --git a/.github/workflows/bicep-what-if.yml b/.github/workflows/bicep-what-if.yml index 73dd62048..3306115bb 100644 --- a/.github/workflows/bicep-what-if.yml +++ b/.github/workflows/bicep-what-if.yml @@ -70,7 +70,6 @@ jobs: --template-file templates/svc-cluster.bicep \ --parameters configurations/mvp-svc-cluster.bicepparam \ --parameters currentUserId="${GITHUB_ACTOR}" \ - --parameters azureMonitorWorkspaceResourceId=$(az monitor account show 
--resource-group ${REGIONAL_RESOURCEGROUP} --name aro-hcp-monitor --query id -o tsv) \ --parameters regionalResourceGroup="${REGIONAL_RESOURCEGROUP}" # management cluster @@ -80,5 +79,12 @@ jobs: --template-file templates/mgmt-cluster.bicep \ --parameters configurations/mvp-mgmt-cluster.bicepparam \ --parameters currentUserId="${GITHUB_ACTOR}" \ - --parameters azureMonitorWorkspaceResourceId=$(az monitor account show --resource-group ${REGIONAL_RESOURCEGROUP} --name aro-hcp-monitor --query id -o tsv) \ --parameters regionalResourceGroup="${REGIONAL_RESOURCEGROUP}" + + # metrics infrastructure + az deployment group what-if \ + --name "metrics-infra-${GITHUB_RUN_ID}" \ + --resource-group "${REGIONAL_RESOURCEGROUP}" \ + --template-file modules/metrics/metrics.bicep \ + --parameters configurations/mvp-metrics.bicepparam \ + --parameters globalResourceGroup="${REGIONAL_RESOURCEGROUP}" diff --git a/dev-infrastructure/Makefile b/dev-infrastructure/Makefile index f0b40b5c1..fa28bf101 100644 --- a/dev-infrastructure/Makefile +++ b/dev-infrastructure/Makefile @@ -103,7 +103,6 @@ endif configurations/$(AKSCONFIG).bicepparam \ --parameters \ currentUserId=$(CURRENTUSER) \ - azureMonitorWorkspaceResourceId=$(shell az monitor account show --resource-group $(REGIONAL_RESOURCEGROUP) --name aro-hcp-monitor --query id -o tsv) \ regionalResourceGroup=$(REGIONAL_RESOURCEGROUP) .PHONY: cluster @@ -156,6 +155,28 @@ endif @scripts/cs-miwi-pg-connect.sh $(RESOURCEGROUP) "maestro-pg-" "maestro-server" "maestro" "maestro" .PHONY: maestro-miwi-pg-connect +metrics-infra: regionalRg + CURRENTUSER=$(CURRENTUSER) az deployment group create \ + --name "metrics-infra" \ + --resource-group $(REGIONAL_RESOURCEGROUP) \ + --template-file modules/metrics/metrics.bicep \ + $(PROMPT_TO_CONFIRM) \ + --parameters \ + globalResourceGroup=$(REGIONAL_RESOURCEGROUP) \ + --parameters configurations/dev-metrics.bicepparam +.PHONY: metrics-infra + +enable-aks-metrics: metrics-infra +ifndef AKSCONFIG + $(error 
"Must set AKSCONFIG") +endif + az aks update --enable-azure-monitor-metrics \ + --resource-group $(RESOURCEGROUP) \ + --name aro-hcp-$(AKSCONFIG) \ + --azure-monitor-workspace-resource-id $(shell az deployment group show --resource-group $(REGIONAL_RESOURCEGROUP) --name metrics-infra --output tsv --query properties.outputs.monitorId.value) \ + --grafana-resource-id $(shell az deployment group show --resource-group $(REGIONAL_RESOURCEGROUP) --name metrics-infra --output tsv --query properties.outputs.grafanaId.value) +.PHONY: enable-aks-metrics + aks.kubeconfig: ifndef AKSCONFIG $(error "Must set AKSCONFIG") diff --git a/dev-infrastructure/configurations/dev-metrics.bicepparam b/dev-infrastructure/configurations/dev-metrics.bicepparam new file mode 100644 index 000000000..d048c8e59 --- /dev/null +++ b/dev-infrastructure/configurations/dev-metrics.bicepparam @@ -0,0 +1,7 @@ +using '../modules/metrics/metrics.bicep' + +param grafanaName = take('aro-hcp-grafana-${uniqueString(readEnvironmentVariable('CURRENTUSER', ''))}', 23) +param msiName = 'aro-hcp-metrics-msi-${take(uniqueString(readEnvironmentVariable('CURRENTUSER', '')), 5)}' + +// overridden in the Makefile (metrics-infra target passes globalResourceGroup) +param globalResourceGroup = '' diff --git a/dev-infrastructure/configurations/mgmt-cluster.bicepparam b/dev-infrastructure/configurations/mgmt-cluster.bicepparam index 23c193b63..d5fdfd047 100644 --- a/dev-infrastructure/configurations/mgmt-cluster.bicepparam +++ b/dev-infrastructure/configurations/mgmt-cluster.bicepparam @@ -27,4 +27,3 @@ param acrPullResourceGroups = ['global'] // These parameters are always overriden in the Makefile param currentUserId = '' param regionalResourceGroup = '' -param azureMonitorWorkspaceResourceId = '' diff --git a/dev-infrastructure/configurations/mvp-metrics.bicepparam b/dev-infrastructure/configurations/mvp-metrics.bicepparam new file mode 100644 index 000000000..0878e73e9 --- /dev/null +++ b/dev-infrastructure/configurations/mvp-metrics.bicepparam @@ -0,0 +1,7 @@ +using 
'../modules/metrics/metrics.bicep' + +param grafanaName = 'aro-hcp-grafana' +param msiName = 'aro-hcp-metrics-msi' + +// overridden at deploy time via --parameters globalResourceGroup=... +param globalResourceGroup = '' diff --git a/dev-infrastructure/configurations/mvp-mgmt-cluster.bicepparam b/dev-infrastructure/configurations/mvp-mgmt-cluster.bicepparam index 407bbc65f..3eff4b763 100644 --- a/dev-infrastructure/configurations/mvp-mgmt-cluster.bicepparam +++ b/dev-infrastructure/configurations/mvp-mgmt-cluster.bicepparam @@ -27,4 +27,3 @@ param acrPullResourceGroups = [regionalResourceGroup, 'global'] // These parameters are always overridden in the Makefile param currentUserId = '' param regionalResourceGroup = '' -param azureMonitorWorkspaceResourceId = '' diff --git a/dev-infrastructure/configurations/mvp-region.bicepparam b/dev-infrastructure/configurations/mvp-region.bicepparam index d8bfbfdf3..9d58a7a44 100644 --- a/dev-infrastructure/configurations/mvp-region.bicepparam +++ b/dev-infrastructure/configurations/mvp-region.bicepparam @@ -11,8 +11,5 @@ param maestroKeyVaultName = 'maestro-kv-aro-hcp-dev' param maestroEventGridNamespacesName = 'maestro-eventgrid-aro-hcp-dev' param maestroEventGridMaxClientSessionsPerAuthName = 4 -// metrics -param grafanaName = 'aro-hcp-grafana' - // This parameter is always overriden in the Makefile param currentUserId = '' diff --git a/dev-infrastructure/configurations/mvp-svc-cluster.bicepparam b/dev-infrastructure/configurations/mvp-svc-cluster.bicepparam index 5e6eb10f5..facce2787 100644 --- a/dev-infrastructure/configurations/mvp-svc-cluster.bicepparam +++ b/dev-infrastructure/configurations/mvp-svc-cluster.bicepparam @@ -35,4 +35,3 @@ param imageSyncAcrResourceGroupNames = ['global'] // These parameters are always overridden in the Makefile param currentUserId = '' param regionalResourceGroup = '' -param azureMonitorWorkspaceResourceId = '' diff --git a/dev-infrastructure/configurations/region.bicepparam b/dev-infrastructure/configurations/region.bicepparam index 
b9f6d5cb1..b679aea1c 100644 --- a/dev-infrastructure/configurations/region.bicepparam +++ b/dev-infrastructure/configurations/region.bicepparam @@ -9,8 +9,5 @@ param maestroKeyVaultName = take('maestro-kv-${uniqueString(currentUserId)}', 24 param maestroEventGridNamespacesName = take('maestro-eg-${uniqueString(currentUserId)}', 24) param maestroEventGridMaxClientSessionsPerAuthName = 4 -// metrics -param grafanaName = take('aro-hcp-grafana-${uniqueString(currentUserId)}', 23) - // These parameters are always overriden in the Makefile param currentUserId = '' diff --git a/dev-infrastructure/configurations/svc-cluster.bicepparam b/dev-infrastructure/configurations/svc-cluster.bicepparam index ecf65584d..d830f9bd9 100644 --- a/dev-infrastructure/configurations/svc-cluster.bicepparam +++ b/dev-infrastructure/configurations/svc-cluster.bicepparam @@ -36,4 +36,3 @@ param clustersServiceAcrResourceGroupNames = ['global'] // These parameters are always overriden in the Makefile param currentUserId = '' param regionalResourceGroup = '' -param azureMonitorWorkspaceResourceId = '' diff --git a/dev-infrastructure/modules/aks-cluster-base.bicep b/dev-infrastructure/modules/aks-cluster-base.bicep index 9b3c76323..f20d2d5f7 100644 --- a/dev-infrastructure/modules/aks-cluster-base.bicep +++ b/dev-infrastructure/modules/aks-cluster-base.bicep @@ -47,11 +47,6 @@ param userOsDiskSizeGB int = 32 param acrPullResourceGroups array = [] -// Metric Params -param azureMonitorWorkspaceResourceId string -param metricLabelsAllowlist string = '' -param metricAnnotationsAllowList string = '' - @description('Perform cryptographic operations using keys. 
Only works for key vaults that use the Azure role-based access control permission model.') var keyVaultCryptoUserId = subscriptionResourceId( 'Microsoft.Authorization/roleDefinitions', @@ -337,15 +332,6 @@ resource aksCluster 'Microsoft.ContainerService/managedClusters@2024-04-02-previ nodeOSUpgradeChannel: 'NodeImage' upgradeChannel: 'patch' } - azureMonitorProfile: { - metrics: { - enabled: true - kubeStateMetrics: { - metricLabelsAllowlist: metricLabelsAllowlist - metricAnnotationsAllowList: metricAnnotationsAllowList - } - } - } disableLocalAccounts: true dnsPrefix: dnsPrefix enableRBAC: true @@ -471,16 +457,6 @@ resource uami_fedcred 'Microsoft.ManagedIdentity/userAssignedIdentities/federate } ] -module aksMetrics '../modules/metrics/aks-azure-monitor-metrics.bicep' = { - name: 'aks-metrics-${aksClusterName}' - params: { - azureMonitorWorkspaceResourceId: azureMonitorWorkspaceResourceId - azureMonitorWorkspaceLocation: location - clusterResourceId: aksCluster.id - clusterLocation: aksCluster.location - } -} - // Outputs output userAssignedIdentities array = [ for i in range(0, length(workloadIdentities)): { diff --git a/dev-infrastructure/modules/metrics/Alerts.bicep b/dev-infrastructure/modules/metrics/Alerts.bicep deleted file mode 100644 index 37698da12..000000000 --- a/dev-infrastructure/modules/metrics/Alerts.bicep +++ /dev/null @@ -1,28 +0,0 @@ -// This template is copied from https://dev.azure.com/msazure/AzureRedHatOpenShift/_git/ARO-Pipelines?path=/metrics/infra/Templates/Alerts.bicep -// Ideally this template is consumed from ACR. 
- -param azureMonitoring string - -resource prometheusRuleGroups 'Microsoft.AlertsManagement/prometheusRuleGroups@2023-03-01' = { - name: 'hcp-prometheus-rules' - location: resourceGroup().location - properties: { - rules: [ - { - // Copy from https://github.com/Azure/prometheus-collector/blob/main/AddonBicepTemplate/recommendedMetricAlerts.bicep - alert: 'KubePodCrashLooping' - expression: 'max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics"}[5m]) >= 1' - for: 'PT15M' - enabled: true - severity: 4 - resolveConfiguration: { - autoResolved: true - timeToResolve: 'PT10M' - } - } - ] - scopes: [ - azureMonitoring - ] - } -} diff --git a/dev-infrastructure/modules/metrics/Metrics.bicep b/dev-infrastructure/modules/metrics/Metrics.bicep deleted file mode 100644 index 7bdd88cb7..000000000 --- a/dev-infrastructure/modules/metrics/Metrics.bicep +++ /dev/null @@ -1,53 +0,0 @@ -// This template is copied from https://dev.azure.com/msazure/AzureRedHatOpenShift/_git/ARO-Pipelines?path=/metrics/infra/Templates/Metrics.bicep -// Ideally this template is consumed from ACR. 
- -param grafanaName string - -resource monitor 'Microsoft.Monitor/accounts@2023-04-03' = { - name: 'aro-hcp-monitor' - location: resourceGroup().location - properties: { - publicNetworkAccess: 'Enabled' - } -} - -resource grafana 'Microsoft.Dashboard/grafana@2023-09-01' = { - name: grafanaName - location: resourceGroup().location - sku: { - name: 'Standard' - } - identity: { - type: 'SystemAssigned' - } - properties: { - grafanaMajorVersion: '10' - grafanaIntegrations: { - azureMonitorWorkspaceIntegrations: [ - { - azureMonitorWorkspaceResourceId: monitor.id - } - ] - } - } -} - -// Assign the Monitoring Data Reader role to the Azure Managed Grafana system-assigned managed identity at the workspace scope -var dataReader = 'b0d8363b-8ddd-447d-831f-62ca05bff136' - -resource roleAssignment 'Microsoft.Authorization/roleAssignments@2022-04-01' = { - name: guid(monitor.id, grafana.id, dataReader) - scope: monitor - properties: { - principalId: grafana.identity.principalId - principalType: 'ServicePrincipal' - roleDefinitionId: subscriptionResourceId('Microsoft.Authorization/roleDefinitions', dataReader) - } -} - -module alerts 'Alerts.bicep' = { - name: 'alerts' - params: { - azureMonitoring: monitor.id - } -} diff --git a/dev-infrastructure/modules/metrics/aks-azure-monitor-metrics.bicep b/dev-infrastructure/modules/metrics/aks-azure-monitor-metrics.bicep deleted file mode 100644 index 281d63dd4..000000000 --- a/dev-infrastructure/modules/metrics/aks-azure-monitor-metrics.bicep +++ /dev/null @@ -1,236 +0,0 @@ -param azureMonitorWorkspaceResourceId string -param azureMonitorWorkspaceLocation string -param clusterResourceId string -param clusterLocation string - -var clusterName = split(clusterResourceId, '/')[8] -var dceName = take('MSProm-${azureMonitorWorkspaceLocation}-${clusterName}', 44) -var dcrName = take('MSProm-${azureMonitorWorkspaceLocation}-${clusterName}', 44) -var dcraName = 'MSProm-${clusterLocation}-${clusterName}' -var nodeRecordingRuleGroupPrefix = 
'NodeRecordingRulesRuleGroup-' -var nodeRecordingRuleGroupName = '${nodeRecordingRuleGroupPrefix}${clusterName}' -var nodeRecordingRuleGroupDescription = 'Node Recording Rules RuleGroup' -var kubernetesRecordingRuleGrouPrefix = 'KubernetesRecordingRulesRuleGroup-' -var kubernetesRecordingRuleGroupName = '${kubernetesRecordingRuleGrouPrefix}${clusterName}' -var kubernetesRecordingRuleGroupDescription = 'Kubernetes Recording Rules RuleGroup' - -resource dce 'Microsoft.Insights/dataCollectionEndpoints@2022-06-01' = { - name: dceName - location: azureMonitorWorkspaceLocation - kind: 'Linux' - properties: {} -} - -resource dcr 'Microsoft.Insights/dataCollectionRules@2022-06-01' = { - name: dcrName - location: azureMonitorWorkspaceLocation - kind: 'Linux' - properties: { - dataCollectionEndpointId: dce.id - dataFlows: [ - { - destinations: [ - 'MonitoringAccount1' - ] - streams: [ - 'Microsoft-PrometheusMetrics' - ] - } - ] - dataSources: { - prometheusForwarder: [ - { - name: 'PrometheusDataSource' - streams: [ - 'Microsoft-PrometheusMetrics' - ] - labelIncludeFilter: {} - } - ] - } - description: 'DCR for Azure Monitor Metrics Profile (Managed Prometheus)' - destinations: { - monitoringAccounts: [ - { - accountResourceId: azureMonitorWorkspaceResourceId - name: 'MonitoringAccount1' - } - ] - } - } -} - -resource aksCluster 'Microsoft.ContainerService/managedClusters@2024-03-02-preview' existing = { - name: clusterName -} - -resource azuremonitormetrics_dcra_clusterResourceId 'Microsoft.Insights/dataCollectionRuleAssociations@2022-06-01' = { - name: '${clusterName}-${dcraName}' - scope: aksCluster - properties: { - description: 'Association of data collection rule. Deleting this association will break the data collection for this AKS Cluster.' 
- dataCollectionRuleId: dcr.id - } -} - -// default recording rules from https://github.com/Azure/prometheus-collector/blob/main/AddonBicepTemplate/FullAzureMonitorMetricsProfile.bicep -resource nodeRecordingRuleGroup 'Microsoft.AlertsManagement/prometheusRuleGroups@2023-03-01' = { - name: nodeRecordingRuleGroupName - location: azureMonitorWorkspaceLocation - properties: { - description: nodeRecordingRuleGroupDescription - scopes: [azureMonitorWorkspaceResourceId, clusterResourceId] - enabled: true - clusterName: clusterName - interval: 'PT1M' - rules: [ - { - record: 'instance:node_num_cpu:sum' - expression: 'count without (cpu, mode) ( node_cpu_seconds_total{job="node",mode="idle"})' - } - { - record: 'instance:node_cpu_utilisation:rate5m' - expression: '1 - avg without (cpu) ( sum without (mode) (rate(node_cpu_seconds_total{job="node", mode=~"idle|iowait|steal"}[5m])))' - } - { - record: 'instance:node_load1_per_cpu:ratio' - expression: '( node_load1{job="node"}/ instance:node_num_cpu:sum{job="node"})' - } - { - record: 'instance:node_memory_utilisation:ratio' - expression: '1 - ( ( node_memory_MemAvailable_bytes{job="node"} or ( node_memory_Buffers_bytes{job="node"} + node_memory_Cached_bytes{job="node"} + node_memory_MemFree_bytes{job="node"} + node_memory_Slab_bytes{job="node"} ) )/ node_memory_MemTotal_bytes{job="node"})' - } - { - record: 'instance:node_vmstat_pgmajfault:rate5m' - expression: 'rate(node_vmstat_pgmajfault{job="node"}[5m])' - } - { - record: 'instance_device:node_disk_io_time_seconds:rate5m' - expression: 'rate(node_disk_io_time_seconds_total{job="node", device!=""}[5m])' - } - { - record: 'instance_device:node_disk_io_time_weighted_seconds:rate5m' - expression: 'rate(node_disk_io_time_weighted_seconds_total{job="node", device!=""}[5m])' - } - { - record: 'instance:node_network_receive_bytes_excluding_lo:rate5m' - expression: 'sum without (device) ( rate(node_network_receive_bytes_total{job="node", device!="lo"}[5m]))' - } - { - record: 
'instance:node_network_transmit_bytes_excluding_lo:rate5m' - expression: 'sum without (device) ( rate(node_network_transmit_bytes_total{job="node", device!="lo"}[5m]))' - } - { - record: 'instance:node_network_receive_drop_excluding_lo:rate5m' - expression: 'sum without (device) ( rate(node_network_receive_drop_total{job="node", device!="lo"}[5m]))' - } - { - record: 'instance:node_network_transmit_drop_excluding_lo:rate5m' - expression: 'sum without (device) ( rate(node_network_transmit_drop_total{job="node", device!="lo"}[5m]))' - } - ] - } -} - -// default recording rules from https://github.com/Azure/prometheus-collector/blob/main/AddonBicepTemplate/FullAzureMonitorMetricsProfile.bicep -resource kubernetesRecordingRuleGroup 'Microsoft.AlertsManagement/prometheusRuleGroups@2023-03-01' = { - name: kubernetesRecordingRuleGroupName - location: azureMonitorWorkspaceLocation - properties: { - description: kubernetesRecordingRuleGroupDescription - scopes: [azureMonitorWorkspaceResourceId, clusterResourceId] - enabled: true - clusterName: clusterName - interval: 'PT1M' - rules: [ - { - record: 'node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate' - expression: 'sum by (cluster, namespace, pod, container) ( irate(container_cpu_usage_seconds_total{job="cadvisor", image!=""}[5m])) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}))' - } - { - record: 'node_namespace_pod_container:container_memory_working_set_bytes' - expression: 'container_memory_working_set_bytes{job="cadvisor", image!=""}* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=""}))' - } - { - record: 'node_namespace_pod_container:container_memory_rss' - expression: 'container_memory_rss{job="cadvisor", image!=""}* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) 
(kube_pod_info{node!=""}))' - } - { - record: 'node_namespace_pod_container:container_memory_cache' - expression: 'container_memory_cache{job="cadvisor", image!=""}* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=""}))' - } - { - record: 'node_namespace_pod_container:container_memory_swap' - expression: 'container_memory_swap{job="cadvisor", image!=""}* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, max by(namespace, pod, node) (kube_pod_info{node!=""}))' - } - { - record: 'cluster:namespace:pod_memory:active:kube_pod_container_resource_requests' - expression: 'kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)group_left() max by (namespace, pod, cluster) ( (kube_pod_status_phase{phase=~"Pending|Running"} == 1))' - } - { - record: 'namespace_memory:kube_pod_container_resource_requests:sum' - expression: 'sum by (namespace, cluster) ( sum by (namespace, pod, cluster) ( max by (namespace, pod, container, cluster) ( kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) ))' - } - { - record: 'cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests' - expression: 'kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)group_left() max by (namespace, pod, cluster) ( (kube_pod_status_phase{phase=~"Pending|Running"} == 1))' - } - { - record: 'namespace_cpu:kube_pod_container_resource_requests:sum' - expression: 'sum by (namespace, cluster) ( sum by (namespace, pod, cluster) ( max by (namespace, pod, container, cluster) ( kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( 
kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) ))' - } - { - record: 'cluster:namespace:pod_memory:active:kube_pod_container_resource_limits' - expression: 'kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster)group_left() max by (namespace, pod, cluster) ( (kube_pod_status_phase{phase=~"Pending|Running"} == 1))' - } - { - record: 'namespace_memory:kube_pod_container_resource_limits:sum' - expression: 'sum by (namespace, cluster) ( sum by (namespace, pod, cluster) ( max by (namespace, pod, container, cluster) ( kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) ))' - } - { - record: 'cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits' - expression: 'kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster)group_left() max by (namespace, pod, cluster) ( (kube_pod_status_phase{phase=~"Pending|Running"} == 1) )' - } - { - record: 'namespace_cpu:kube_pod_container_resource_limits:sum' - expression: 'sum by (namespace, cluster) ( sum by (namespace, pod, cluster) ( max by (namespace, pod, container, cluster) ( kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) ))' - } - { - record: 'namespace_workload_pod:kube_pod_owner:relabel' - expression: 'max by (cluster, namespace, workload, pod) ( label_replace( label_replace( kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, "replicaset", "$1", "owner_name", "(.*)" ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) ( 1, max by (replicaset, namespace, owner_name) ( kube_replicaset_owner{job="kube-state-metrics"} ) ), "workload", "$1", 
"owner_name", "(.*)" ))' - labels: { - workload_type: 'deployment' - } - } - { - record: 'namespace_workload_pod:kube_pod_owner:relabel' - expression: 'max by (cluster, namespace, workload, pod) ( label_replace( kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, "workload", "$1", "owner_name", "(.*)" ))' - labels: { - workload_type: 'daemonset' - } - } - { - record: 'namespace_workload_pod:kube_pod_owner:relabel' - expression: 'max by (cluster, namespace, workload, pod) ( label_replace( kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, "workload", "$1", "owner_name", "(.*)" ))' - labels: { - workload_type: 'statefulset' - } - } - { - record: 'namespace_workload_pod:kube_pod_owner:relabel' - expression: 'max by (cluster, namespace, workload, pod) ( label_replace( kube_pod_owner{job="kube-state-metrics", owner_kind="Job"}, "workload", "$1", "owner_name", "(.*)" ))' - labels: { - workload_type: 'job' - } - } - { - record: ':node_memory_MemAvailable_bytes:sum' - expression: 'sum( node_memory_MemAvailable_bytes{job="node"} or ( node_memory_Buffers_bytes{job="node"} + node_memory_Cached_bytes{job="node"} + node_memory_MemFree_bytes{job="node"} + node_memory_Slab_bytes{job="node"} )) by (cluster)' - } - { - record: 'cluster:node_cpu:ratio_rate5m' - expression: 'sum(rate(node_cpu_seconds_total{job="node",mode!="idle",mode!="iowait",mode!="steal"}[5m])) by (cluster) /count(sum(node_cpu_seconds_total{job="node"}) by (cluster, instance, cpu)) by (cluster)' - } - ] - } -} diff --git a/dev-infrastructure/modules/metrics/metrics.bicep b/dev-infrastructure/modules/metrics/metrics.bicep new file mode 100644 index 000000000..dae2572a4 --- /dev/null +++ b/dev-infrastructure/modules/metrics/metrics.bicep @@ -0,0 +1,42 @@ +// this module is only used in dev +@description('Captures logged in users UID') +param currentUserId string = '' + +@description('Metrics global resource group name') +param globalResourceGroup string + +@description('Metrics global 
MSI name') +param msiName string = 'metrics-admin-${take(uniqueString(currentUserId), 4)}' // truncate only the unique suffix so names stay distinct per user + +@description('Metrics global Grafana name') +param grafanaName string = take('aro-hcp-grafana-${uniqueString(currentUserId)}', 23) + +var grafanaAdmin = { + principalId: '366b619c-e72e-4278-8aaf-9af7851c601f' // aro-hcp-engineering + principalType: 'group' +} + +module grafana 'br:arointacr.azurecr.io/grafana.bicep:metrics.20240814.1' = { + name: 'grafana' + params: { + msiName: msiName + grafanaName: grafanaName + grafanaAdmin: grafanaAdmin + } +} + +module monitor 'br:arointacr.azurecr.io/monitor.bicep:metrics.20240814.2' = { + name: 'monitor' + params: { + globalResourceGroup: globalResourceGroup + msiName: msiName + grafanaName: grafanaName + } + dependsOn: [ + grafana + ] +} + +output msiId string = monitor.outputs.msiId +output grafanaId string = monitor.outputs.grafanaId +output monitorId string = monitor.outputs.monitorId diff --git a/dev-infrastructure/templates/mgmt-cluster.bicep b/dev-infrastructure/templates/mgmt-cluster.bicep index 19d8889ef..4603de568 100644 --- a/dev-infrastructure/templates/mgmt-cluster.bicep +++ b/dev-infrastructure/templates/mgmt-cluster.bicep @@ -80,9 +80,6 @@ param regionalDNSSubdomain string = empty(currentUserId) @description('The resource group that hosts the regional zone') param regionalResourceGroup string -@description('The resource id of the regional Azure Monitor Workspace') -param azureMonitorWorkspaceResourceId string - func isValidMaestroConsumerName(input string) bool => length(input) <= 90 && contains(input, '[^a-zA-Z0-9_-]') == false // Tags the resource group @@ -133,7 +130,6 @@ module mgmtCluster '../modules/aks-cluster-base.bicep' = { systemAgentMinCount: systemAgentMinCount systemAgentMaxCount: systemAgentMaxCount systemAgentVMSize: systemAgentVMSize - azureMonitorWorkspaceResourceId: azureMonitorWorkspaceResourceId } } diff --git a/dev-infrastructure/templates/region.bicep b/dev-infrastructure/templates/region.bicep 
index 68b74ea7f..0ca81eecb 100644 --- a/dev-infrastructure/templates/region.bicep +++ b/dev-infrastructure/templates/region.bicep @@ -30,9 +30,6 @@ param regionalDNSSubdomain string = empty(currentUserId) ? location : '${location}-${take(uniqueString(currentUserId), 5)}' -@description('A unique resource name for the Azure Managed Grafana instance no longer than 23 characters.') -param grafanaName string - // Tags the resource group resource subscriptionTags 'Microsoft.Resources/tags@2024-03-01' = { name: 'default' @@ -78,14 +75,3 @@ module maestroInfra '../modules/maestro/maestro-infra.bicep' = { kvCertOfficerManagedIdentityName: maestroKeyVaultCertOfficerMSIName } } - -// -// M E T R I C S -// - -module metricsInfra '../modules/metrics/Metrics.bicep' = { - name: 'metrics-infra' - params: { - grafanaName: grafanaName - } -} diff --git a/dev-infrastructure/templates/svc-cluster.bicep b/dev-infrastructure/templates/svc-cluster.bicep index 968f0f35c..ae93b6d9f 100644 --- a/dev-infrastructure/templates/svc-cluster.bicep +++ b/dev-infrastructure/templates/svc-cluster.bicep @@ -102,9 +102,6 @@ param imageSyncAcrResourceGroupNames array = [] @description('Clusters Service ACR RG names') param clustersServiceAcrResourceGroupNames array = [] -@description('The resource id of the regional Azure Monitor Workspace') -param azureMonitorWorkspaceResourceId string - // Tags the resource group resource subscriptionTags 'Microsoft.Resources/tags@2024-03-01' = { name: 'default' @@ -158,7 +155,6 @@ module svcCluster '../modules/aks-cluster-base.bicep' = { aksKeyVaultName: aksKeyVaultName deployUserAgentPool: true acrPullResourceGroups: acrPullResourceGroups - azureMonitorWorkspaceResourceId: azureMonitorWorkspaceResourceId } }