From af24536038935dec7e0a2e3aac9027217faf7efd Mon Sep 17 00:00:00 2001 From: Hyunsoo Kim Date: Tue, 6 Feb 2024 13:53:57 -0500 Subject: [PATCH 01/20] support nvidia gpu metrics and update test configs --- .../appsignals_and_eks_config.json | 3 +- .../appsignals_and_eks_config.yaml | 1 + .../appsignals_and_k8s_config.json | 3 +- .../appsignals_and_k8s_config.yaml | 1 + .../base_container_insights_config.json | 3 +- .../base_container_insights_config.yaml | 1 + .../emf_and_kubernetes_config.json | 3 +- .../emf_and_kubernetes_config.yaml | 1 + .../emf_and_kubernetes_with_gpu_config.conf | 27 + .../emf_and_kubernetes_with_gpu_config.json | 19 + .../emf_and_kubernetes_with_gpu_config.yaml | 707 ++++++++++++++++++ .../kubernetes_on_prem_config.json | 3 +- .../kubernetes_on_prem_config.yaml | 1 + .../sampleConfig/log_ecs_metric_only.yaml | 1 + .../logs_and_kubernetes_config.json | 3 +- .../logs_and_kubernetes_config.yaml | 1 + translator/tocwconfig/tocwconfig_test.go | 11 + translator/translate/otel/common/common.go | 1 + .../otel/exporter/awsemf/kubernetes.go | 49 ++ .../metricstransformprocessor/translator.go | 81 +- .../awscontainerinsight/translator.go | 2 + 21 files changed, 914 insertions(+), 8 deletions(-) create mode 100644 translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.conf create mode 100644 translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.json create mode 100644 translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml diff --git a/translator/tocwconfig/sampleConfig/appsignals_and_eks_config.json b/translator/tocwconfig/sampleConfig/appsignals_and_eks_config.json index 5aa31646d1..fdd2c73048 100644 --- a/translator/tocwconfig/sampleConfig/appsignals_and_eks_config.json +++ b/translator/tocwconfig/sampleConfig/appsignals_and_eks_config.json @@ -19,7 +19,8 @@ "cluster_name": "TestCluster", "metrics_collection_interval": 30, "disable_metric_extraction": true, - "enhanced_container_insights": false + 
"enhanced_container_insights": false, + "gpu_metrics": false } }, "force_flush_interval": 5, diff --git a/translator/tocwconfig/sampleConfig/appsignals_and_eks_config.yaml b/translator/tocwconfig/sampleConfig/appsignals_and_eks_config.yaml index 6b54e0320a..c6a54768ff 100644 --- a/translator/tocwconfig/sampleConfig/appsignals_and_eks_config.yaml +++ b/translator/tocwconfig/sampleConfig/appsignals_and_eks_config.yaml @@ -633,6 +633,7 @@ receivers: resource_arn: "" role_arn: "" shared_credentials_file: [] + gpu_metrics: false otlp/app_signals: protocols: grpc: diff --git a/translator/tocwconfig/sampleConfig/appsignals_and_k8s_config.json b/translator/tocwconfig/sampleConfig/appsignals_and_k8s_config.json index 2e517be541..7be14c83c7 100644 --- a/translator/tocwconfig/sampleConfig/appsignals_and_k8s_config.json +++ b/translator/tocwconfig/sampleConfig/appsignals_and_k8s_config.json @@ -15,7 +15,8 @@ "cluster_name": "TestCluster", "metrics_collection_interval": 30, "disable_metric_extraction": true, - "enhanced_container_insights": false + "enhanced_container_insights": false, + "gpu_metrics": false } }, "force_flush_interval": 5, diff --git a/translator/tocwconfig/sampleConfig/appsignals_and_k8s_config.yaml b/translator/tocwconfig/sampleConfig/appsignals_and_k8s_config.yaml index 8c33dbac40..60c3cf46f5 100644 --- a/translator/tocwconfig/sampleConfig/appsignals_and_k8s_config.yaml +++ b/translator/tocwconfig/sampleConfig/appsignals_and_k8s_config.yaml @@ -633,6 +633,7 @@ receivers: resource_arn: "" role_arn: "" shared_credentials_file: [] + gpu_metrics: false otlp/app_signals: protocols: grpc: diff --git a/translator/tocwconfig/sampleConfig/base_container_insights_config.json b/translator/tocwconfig/sampleConfig/base_container_insights_config.json index 3c69c7cc65..f089538b21 100644 --- a/translator/tocwconfig/sampleConfig/base_container_insights_config.json +++ b/translator/tocwconfig/sampleConfig/base_container_insights_config.json @@ -10,7 +10,8 @@ "cluster_name": 
"TestCluster", "metrics_collection_interval": 30, "disable_metric_extraction": true, - "prefer_full_pod_name": true + "prefer_full_pod_name": true, + "gpu_metrics": false } }, "force_flush_interval": 5, diff --git a/translator/tocwconfig/sampleConfig/base_container_insights_config.yaml b/translator/tocwconfig/sampleConfig/base_container_insights_config.yaml index 2eeb3aea7d..89e6375f87 100644 --- a/translator/tocwconfig/sampleConfig/base_container_insights_config.yaml +++ b/translator/tocwconfig/sampleConfig/base_container_insights_config.yaml @@ -201,6 +201,7 @@ receivers: resource_arn: "" role_arn: "" shared_credentials_file: [] + gpu_metrics: false tcplog/emf_logs: attributes: {} encoding: utf-8 diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.json b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.json index 8e1ffdbbf9..c1c6807811 100644 --- a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.json +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.json @@ -10,7 +10,8 @@ "cluster_name": "TestCluster", "metrics_collection_interval": 30, "disable_metric_extraction": true, - "enhanced_container_insights": true + "enhanced_container_insights": true, + "gpu_metrics": false } }, "force_flush_interval": 5, diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml index 471108a453..1557628b44 100644 --- a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml @@ -469,6 +469,7 @@ receivers: role_arn: "" shared_credentials_file: - /root/.aws/credentials + gpu_metrics: false tcplog/emf_logs: attributes: {} encoding: utf-8 diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.conf b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.conf new file mode 100644 index 0000000000..007bb60efb --- 
/dev/null +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.conf @@ -0,0 +1,27 @@ +[agent] + collection_jitter = "0s" + debug = false + flush_interval = "1s" + flush_jitter = "0s" + hostname = "host_name_from_env" + interval = "60s" + logfile = "" + logtarget = "lumberjack" + metric_batch_size = 1000 + metric_buffer_limit = 10000 + omit_hostname = false + precision = "" + quiet = false + round_interval = false + +[inputs] + +[outputs] + + [[outputs.cloudwatchlogs]] + endpoint_override = "https://fake_endpoint" + force_flush_interval = "5s" + log_stream_name = "host_name_from_env" + region = "us-east-1" + +[processors] diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.json b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.json new file mode 100644 index 0000000000..8e1ffdbbf9 --- /dev/null +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.json @@ -0,0 +1,19 @@ +{ + "agent": { + "region": "us-east-1" + }, + "logs": { + "metrics_collected": { + "emf": { + }, + "kubernetes": { + "cluster_name": "TestCluster", + "metrics_collection_interval": 30, + "disable_metric_extraction": true, + "enhanced_container_insights": true + } + }, + "force_flush_interval": 5, + "endpoint_override":"https://fake_endpoint" + } +} diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml new file mode 100644 index 0000000000..94d2aaf310 --- /dev/null +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml @@ -0,0 +1,707 @@ +connectors: {} +exporters: + awscloudwatchlogs/emf_logs: + certificate_file_path: "" + emf_only: true + endpoint: https://fake_endpoint + imds_retries: 2 + local_mode: false + log_group_name: emf/logs/default + log_retention: 0 + log_stream_name: host_name_from_env + max_retries: 2 + middleware: agenthealth/logs + no_verify_ssl: false + 
num_workers: 8 + profile: default + proxy_address: "" + raw_log: true + region: us-east-1 + request_timeout_seconds: 30 + resource_arn: "" + retry_on_failure: + enabled: true + initial_interval: 5s + max_elapsed_time: 5m0s + max_interval: 30s + multiplier: 1.5 + randomization_factor: 0.5 + role_arn: "" + sending_queue: + enabled: true + num_consumers: 1 + queue_size: 1000 + storage: null + shared_credentials_file: + - /root/.aws/credentials + awsemf/containerinsights: + certificate_file_path: "" + detailed_metrics: false + dimension_rollup_option: NoDimensionRollup + disable_metric_extraction: true + eks_fargate_container_insights_enabled: false + endpoint: https://fake_endpoint + enhanced_container_insights: true + imds_retries: 2 + local_mode: false + log_group_name: /aws/containerinsights/{ClusterName}/performance + log_retention: 0 + log_stream_name: '{NodeName}' + max_retries: 2 + metric_declarations: + - dimensions: + - - ClusterName + - - ClusterName + - ContainerName + - FullPodName + - Namespace + - PodName + - - ClusterName + - ContainerName + - Namespace + - PodName + label_matchers: [] + metric_name_selectors: + - container_cpu_utilization + - container_cpu_utilization_over_container_limit + - container_cpu_limit + - container_cpu_request + - container_memory_utilization + - container_memory_utilization_over_container_limit + - container_memory_failures_total + - container_memory_limit + - container_memory_request + - container_filesystem_usage + - container_filesystem_available + - container_filesystem_utilization + - dimensions: + - - ClusterName + - Namespace + - PodName + - - ClusterName + - - ClusterName + - Namespace + - Service + - - ClusterName + - Namespace + - - ClusterName + - FullPodName + - Namespace + - PodName + label_matchers: [] + metric_name_selectors: + - pod_cpu_utilization + - pod_memory_utilization + - pod_network_rx_bytes + - pod_network_tx_bytes + - pod_cpu_utilization_over_pod_limit + - pod_memory_utilization_over_pod_limit + - 
dimensions: + - - ClusterName + - FullPodName + - Namespace + - PodName + - - ClusterName + - Namespace + - PodName + - - ClusterName + - Namespace + - - ClusterName + label_matchers: [] + metric_name_selectors: + - pod_interface_network_rx_dropped + - pod_interface_network_tx_dropped + - dimensions: + - - ClusterName + - Namespace + - PodName + - - ClusterName + - - ClusterName + - FullPodName + - Namespace + - PodName + - - ClusterName + - Namespace + - Service + label_matchers: [] + metric_name_selectors: + - pod_cpu_reserved_capacity + - pod_memory_reserved_capacity + - pod_number_of_container_restarts + - pod_number_of_containers + - pod_number_of_running_containers + - pod_status_ready + - pod_status_scheduled + - pod_status_running + - pod_status_pending + - pod_status_failed + - pod_status_unknown + - pod_status_succeeded + - pod_memory_request + - pod_memory_limit + - pod_cpu_limit + - pod_cpu_request + - pod_container_status_running + - pod_container_status_terminated + - pod_container_status_waiting + - pod_container_status_waiting_reason_crash_loop_back_off + - pod_container_status_waiting_reason_image_pull_error + - pod_container_status_waiting_reason_start_error + - pod_container_status_waiting_reason_create_container_error + - pod_container_status_waiting_reason_create_container_config_error + - pod_container_status_terminated_reason_oom_killed + - dimensions: + - - ClusterName + - InstanceId + - NodeName + - - ClusterName + label_matchers: [] + metric_name_selectors: + - node_cpu_utilization + - node_memory_utilization + - node_network_total_bytes + - node_cpu_reserved_capacity + - node_memory_reserved_capacity + - node_number_of_running_pods + - node_number_of_running_containers + - node_cpu_usage_total + - node_cpu_limit + - node_memory_working_set + - node_memory_limit + - node_status_condition_ready + - node_status_condition_disk_pressure + - node_status_condition_memory_pressure + - node_status_condition_pid_pressure + - 
node_status_condition_network_unavailable + - node_status_condition_unknown + - node_status_capacity_pods + - node_status_allocatable_pods + - dimensions: + - - ClusterName + - InstanceId + - NodeName + - - ClusterName + label_matchers: [] + metric_name_selectors: + - node_interface_network_rx_dropped + - node_interface_network_tx_dropped + - node_diskio_io_service_bytes_total + - node_diskio_io_serviced_total + - dimensions: + - - ClusterName + - InstanceId + - NodeName + - - ClusterName + label_matchers: [] + metric_name_selectors: + - node_filesystem_utilization + - node_filesystem_inodes + - node_filesystem_inodes_free + - dimensions: + - - ClusterName + - Namespace + - Service + - - ClusterName + label_matchers: [] + metric_name_selectors: + - service_number_of_running_pods + - dimensions: + - - ClusterName + - Namespace + - PodName + - - ClusterName + label_matchers: [] + metric_name_selectors: + - replicas_desired + - replicas_ready + - status_replicas_available + - status_replicas_unavailable + - dimensions: + - - ClusterName + - Namespace + - PodName + - - ClusterName + label_matchers: [] + metric_name_selectors: + - daemonset_status_number_available + - daemonset_status_number_unavailable + - dimensions: + - - ClusterName + - Namespace + - - ClusterName + label_matchers: [] + metric_name_selectors: + - namespace_number_of_running_pods + - dimensions: + - - ClusterName + label_matchers: [] + metric_name_selectors: + - cluster_node_count + - cluster_failed_node_count + - cluster_number_of_running_pods + - dimensions: + - - ClusterName + - endpoint + - - ClusterName + label_matchers: [] + metric_name_selectors: + - apiserver_storage_size_bytes + - apiserver_storage_db_total_size_in_bytes + - etcd_db_total_size_in_bytes + - dimensions: + - - ClusterName + - resource + - - ClusterName + label_matchers: [] + metric_name_selectors: + - apiserver_storage_list_duration_seconds + - apiserver_longrunning_requests + - apiserver_storage_objects + - dimensions: + - - 
ClusterName + - verb + - - ClusterName + label_matchers: [] + metric_name_selectors: + - apiserver_request_duration_seconds + - rest_client_request_duration_seconds + - dimensions: + - - ClusterName + - code + - verb + - - ClusterName + label_matchers: [] + metric_name_selectors: + - apiserver_request_total + - apiserver_request_total_5xx + - dimensions: + - - ClusterName + - operation + - - ClusterName + label_matchers: [] + metric_name_selectors: + - apiserver_admission_controller_admission_duration_seconds + - apiserver_admission_step_admission_duration_seconds + - etcd_request_duration_seconds + - dimensions: + - - ClusterName + - code + - method + - - ClusterName + label_matchers: [] + metric_name_selectors: + - rest_client_requests_total + - dimensions: + - - ClusterName + - request_kind + - - ClusterName + label_matchers: [] + metric_name_selectors: + - apiserver_current_inflight_requests + - apiserver_current_inqueue_requests + - dimensions: + - - ClusterName + - name + - - ClusterName + label_matchers: [] + metric_name_selectors: + - apiserver_admission_webhook_admission_duration_seconds + - dimensions: + - - ClusterName + - group + - - ClusterName + label_matchers: [] + metric_name_selectors: + - apiserver_requested_deprecated_apis + - dimensions: + - - ClusterName + - reason + - - ClusterName + label_matchers: [] + metric_name_selectors: + - apiserver_flowcontrol_rejected_requests_total + - dimensions: + - - ClusterName + - priority_level + - - ClusterName + label_matchers: [] + metric_name_selectors: + - apiserver_flowcontrol_request_concurrency_limit + - dimensions: + - - ClusterName + - FullPodName + - Namespace + - PodName + - UUID + - - ClusterName + - Namespace + - PodName + - UUID + - - ClusterName + - Namespace + - Service + - - ClusterName + label_matchers: [] + metric_name_selectors: + - pod_gpu_utilization + - pod_gpu_utilization_memory + - pod_gpu_memory_total + - pod_gpu_memory_used + - pod_gpu_power_draw + - pod_gpu_temperature + - 
dimensions: + - - ClusterName + - InstanceId + - NodeName + - UUID + - - ClusterName + - InstanceId + - NodeName + - - ClusterName + label_matchers: [] + metric_name_selectors: + - node_gpu_utilization + - node_gpu_utilization_memory + - node_gpu_memory_total + - node_gpu_memory_used + - node_gpu_power_draw + - node_gpu_temperature + - dimensions: + - - ClusterName + - InstanceId + - NodeName + - - ClusterName + label_matchers: [] + metric_name_selectors: + - node_gpu_total + - dimensions: + - - ClusterName + label_matchers: [] + metric_name_selectors: + - cluster_gpu_total + metric_descriptors: + - metric_name: apiserver_admission_controller_admission_duration_seconds + overwrite: true + unit: Seconds + - metric_name: apiserver_admission_step_admission_duration_seconds + overwrite: true + unit: Seconds + - metric_name: apiserver_admission_webhook_admission_duration_seconds + overwrite: true + unit: Seconds + - metric_name: apiserver_current_inflight_requests + overwrite: true + unit: Count + - metric_name: apiserver_current_inqueue_requests + overwrite: true + unit: Count + - metric_name: apiserver_flowcontrol_rejected_requests_total + overwrite: true + unit: Count + - metric_name: apiserver_flowcontrol_request_concurrency_limit + overwrite: true + unit: Count + - metric_name: apiserver_longrunning_requests + overwrite: true + unit: Count + - metric_name: apiserver_request_duration_seconds + overwrite: true + unit: Seconds + - metric_name: apiserver_request_total + overwrite: true + unit: Count + - metric_name: apiserver_request_total_5xx + overwrite: true + unit: Count + - metric_name: apiserver_requested_deprecated_apis + overwrite: true + unit: Count + - metric_name: apiserver_storage_objects + overwrite: true + unit: Count + - metric_name: etcd_request_duration_seconds + overwrite: true + unit: Seconds + - metric_name: apiserver_storage_list_duration_seconds + overwrite: true + unit: Seconds + - metric_name: apiserver_storage_db_total_size_in_bytes + 
overwrite: true + unit: Bytes + - metric_name: apiserver_storage_size_bytes + overwrite: true + unit: Bytes + - metric_name: etcd_db_total_size_in_bytes + overwrite: true + unit: Bytes + - metric_name: rest_client_request_duration_seconds + overwrite: true + unit: Seconds + - metric_name: rest_client_requests_total + overwrite: true + unit: Count + middleware: agenthealth/logs + namespace: ContainerInsights + no_verify_ssl: false + num_workers: 8 + output_destination: cloudwatch + parse_json_encoded_attr_values: + - Sources + - kubernetes + profile: default + proxy_address: "" + region: us-east-1 + request_timeout_seconds: 30 + resource_arn: "" + resource_to_telemetry_conversion: + enabled: true + retain_initial_value_of_delta_metric: false + role_arn: "" + shared_credentials_file: + - /root/.aws/credentials + version: "0" +extensions: + agenthealth/logs: + is_usage_data_enabled: true + stats: + operations: + - PutLogEvents +processors: + batch/containerinsights: + metadata_cardinality_limit: 1000 + metadata_keys: [] + send_batch_max_size: 0 + send_batch_size: 8192 + timeout: 5s + batch/emf_logs: + metadata_cardinality_limit: 1000 + metadata_keys: [] + send_batch_max_size: 0 + send_batch_size: 8192 + timeout: 5s + metricstransform/containerinsights: + transforms: + - action: insert + aggregation_type: "" + experimental_match_labels: + code: ^5.* + group_resource_labels: {} + include: apiserver_request_total + match_type: regexp + new_name: apiserver_request_total_5xx + operations: [] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: { } + group_resource_labels: { } + include: DCGM_FI_DEV_GPU_UTIL + match_type: "" + new_name: pod_gpu_utilization + operations: [ ] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: { } + group_resource_labels: { } + include: DCGM_FI_DEV_GPU_UTIL + match_type: "" + new_name: node_gpu_utilization + operations: [ ] + submatch_case: "" + - action: insert + 
aggregation_type: "" + experimental_match_labels: { } + group_resource_labels: { } + include: DCGM_FI_DEV_MEM_COPY_UTIL + match_type: "" + new_name: pod_gpu_utilization_memory + operations: [ ] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: { } + group_resource_labels: { } + include: DCGM_FI_DEV_MEM_COPY_UTIL + match_type: "" + new_name: node_gpu_utilization_memory + operations: [ ] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: { } + group_resource_labels: { } + include: DCGM_FI_DEV_FB_USED + match_type: "" + new_name: pod_gpu_memory_used + operations: [ ] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: { } + group_resource_labels: { } + include: DCGM_FI_DEV_FB_USED + match_type: "" + new_name: node_gpu_memory_used + operations: [ ] + submatch_case: "" + - action: insert + aggregation_type: "sum" + experimental_match_labels: { } + group_resource_labels: { } + include: ^DCGM_FI_DEV_FB_(USED|FREE)$ + match_type: "regexp" + new_name: pod_gpu_memory_total + operations: [ ] + submatch_case: "" + - action: insert + aggregation_type: "sum" + experimental_match_labels: { } + group_resource_labels: { } + include: ^DCGM_FI_DEV_FB_(USED|FREE)$ + match_type: "regexp" + new_name: node_gpu_memory_total + operations: [ ] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: { } + group_resource_labels: { } + include: DCGM_FI_DEV_GPU_TEMP + match_type: "" + new_name: pod_gpu_temperature + operations: [ ] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: { } + group_resource_labels: { } + include: DCGM_FI_DEV_GPU_TEMP + match_type: "" + new_name: node_gpu_temperature + operations: [ ] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: { } + group_resource_labels: { } + include: DCGM_FI_DEV_POWER_USAGE + match_type: "" + new_name: 
pod_gpu_power_draw + operations: [ ] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: { } + group_resource_labels: { } + include: DCGM_FI_DEV_POWER_USAGE + match_type: "" + new_name: node_gpu_power_draw + operations: [ ] + submatch_case: "" + +receivers: + awscontainerinsightreceiver: + add_container_name_metric_label: true + add_full_pod_name_metric_label: true + add_service_as_attribute: true + certificate_file_path: "" + cluster_name: TestCluster + collection_interval: 30s + container_orchestrator: eks + enable_control_plane_metrics: true + endpoint: "" + imds_retries: 2 + leader_lock_name: cwagent-clusterleader + leader_lock_using_config_map_only: true + local_mode: false + max_retries: 0 + no_verify_ssl: false + num_workers: 0 + prefer_full_pod_name: true + profile: default + proxy_address: "" + region: us-east-1 + request_timeout_seconds: 0 + resource_arn: "" + role_arn: "" + shared_credentials_file: + - /root/.aws/credentials + gpu_metrics: true + tcplog/emf_logs: + attributes: {} + encoding: utf-8 + id: tcp_input + listen_address: 0.0.0.0:25888 + operators: [] + output: [] + resource: {} + retry_on_failure: + enabled: false + initial_interval: 0s + max_elapsed_time: 0s + max_interval: 0s + storage: null + type: tcp_input + udplog/emf_logs: + attributes: {} + encoding: utf-8 + id: udp_input + listen_address: 0.0.0.0:25888 + multiline: + line_end_pattern: .^ + line_start_pattern: "" + omit_pattern: false + operators: [] + output: [] + resource: {} + retry_on_failure: + enabled: false + initial_interval: 0s + max_elapsed_time: 0s + max_interval: 0s + storage: null + type: udp_input +service: + extensions: + - agenthealth/logs + pipelines: + logs/emf_logs: + exporters: + - awscloudwatchlogs/emf_logs + processors: + - batch/emf_logs + receivers: + - tcplog/emf_logs + - udplog/emf_logs + metrics/containerinsights: + exporters: + - awsemf/containerinsights + processors: + - metricstransform/containerinsights + - 
batch/containerinsights + receivers: + - awscontainerinsightreceiver + telemetry: + logs: + development: false + disable_caller: false + disable_stacktrace: false + encoding: console + error_output_paths: [] + initial_fields: {} + level: info + output_paths: [] + sampling: + enabled: true + initial: 2 + thereafter: 500 + tick: 10s + metrics: + address: "" + level: None + readers: [] + resource: {} + traces: + processors: [] + propagators: [] diff --git a/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.json b/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.json index cd10578c71..6109027d6f 100644 --- a/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.json +++ b/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.json @@ -8,7 +8,8 @@ "cluster_name": "TestCluster", "metrics_collection_interval": 30, "disable_metric_extraction": true, - "enhanced_container_insights": true + "enhanced_container_insights": true, + "gpu_metrics": false } }, "force_flush_interval": 5, diff --git a/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml b/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml index 05a1102801..91e90916b1 100644 --- a/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml +++ b/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml @@ -429,6 +429,7 @@ receivers: role_arn: "" shared_credentials_file: - fake-path + gpu_metrics: false service: extensions: - agenthealth/logs diff --git a/translator/tocwconfig/sampleConfig/log_ecs_metric_only.yaml b/translator/tocwconfig/sampleConfig/log_ecs_metric_only.yaml index d0361dc1d1..10ba5b9138 100644 --- a/translator/tocwconfig/sampleConfig/log_ecs_metric_only.yaml +++ b/translator/tocwconfig/sampleConfig/log_ecs_metric_only.yaml @@ -143,6 +143,7 @@ receivers: resource_arn: "" role_arn: "" shared_credentials_file: [] + gpu_metrics: true tcplog/emf_logs: attributes: {} encoding: utf-8 diff --git 
a/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.json b/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.json index 5f5fe1b8d6..8ad16d0886 100644 --- a/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.json +++ b/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.json @@ -9,7 +9,8 @@ "kubernetes": { "cluster_name": "TestCluster", "metrics_collection_interval": 30, - "enhanced_container_insights": true + "enhanced_container_insights": true, + "gpu_metrics": false } }, "logs_collected": { diff --git a/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml b/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml index 3b3c535990..48b06f125b 100644 --- a/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml +++ b/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml @@ -466,6 +466,7 @@ receivers: resource_arn: "" role_arn: "" shared_credentials_file: [] + gpu_metrics: false tcplog/emf_logs: attributes: {} encoding: utf-8 diff --git a/translator/tocwconfig/tocwconfig_test.go b/translator/tocwconfig/tocwconfig_test.go index 42712d5e92..2bf65e1a22 100644 --- a/translator/tocwconfig/tocwconfig_test.go +++ b/translator/tocwconfig/tocwconfig_test.go @@ -116,6 +116,17 @@ func TestEmfAndKubernetesConfig(t *testing.T) { checkTranslation(t, "emf_and_kubernetes_config", "darwin", nil, "") } +func TestEmfAndKubernetesWithGpuConfig(t *testing.T) { + resetContext(t) + readCommonConfig(t, "./sampleConfig/commonConfig/withCredentials.toml") + context.CurrentContext().SetRunInContainer(true) + t.Setenv(config.HOST_NAME, "host_name_from_env") + t.Setenv(config.HOST_IP, "127.0.0.1") + expectedEnvVars := map[string]string{} + checkTranslation(t, "emf_and_kubernetes_with_gpu_config", "linux", expectedEnvVars, "") + checkTranslation(t, "emf_and_kubernetes_with_gpu_config", "darwin", nil, "") +} + func TestKubernetesModeOnPremiseConfig(t *testing.T) { resetContext(t) 
context.CurrentContext().SetRunInContainer(true) diff --git a/translator/translate/otel/common/common.go b/translator/translate/otel/common/common.go index 501ce34a41..3b12656df7 100644 --- a/translator/translate/otel/common/common.go +++ b/translator/translate/otel/common/common.go @@ -45,6 +45,7 @@ const ( ContainerInsightsMetricGranularity = "metric_granularity" // replaced with enhanced_container_insights EnhancedContainerInsights = "enhanced_container_insights" PreferFullPodName = "prefer_full_pod_name" + EnableGpuMetric = "gpu_metrics" Console = "console" DiskIOKey = "diskio" NetKey = "net" diff --git a/translator/translate/otel/exporter/awsemf/kubernetes.go b/translator/translate/otel/exporter/awsemf/kubernetes.go index 564b0c2c84..a1ebe6ed8d 100644 --- a/translator/translate/otel/exporter/awsemf/kubernetes.go +++ b/translator/translate/otel/exporter/awsemf/kubernetes.go @@ -4,6 +4,7 @@ package awsemf import ( + "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/common" "github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awsemfexporter" "go.opentelemetry.io/collector/confmap" @@ -46,6 +47,9 @@ func setKubernetesMetricDeclaration(conf *confmap.Conf, cfg *awsemfexporter.Conf // Setup control plane metrics kubernetesMetricDeclarations = append(kubernetesMetricDeclarations, getControlPlaneMetricDeclarations(conf)...) + // Setup GPU metrics + kubernetesMetricDeclarations = append(kubernetesMetricDeclarations, getGPUMetricDeclarations(conf)...) 
+ cfg.MetricDeclarations = kubernetesMetricDeclarations cfg.MetricDescriptors = getControlPlaneMetricDescriptors(conf) @@ -457,3 +461,48 @@ func getControlPlaneMetricDescriptors(conf *confmap.Conf) []awsemfexporter.Metri return []awsemfexporter.MetricDescriptor{} } + +func getGPUMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.MetricDeclaration { + var metricDeclarations []*awsemfexporter.MetricDeclaration + EnableGpuMetric := common.GetOrDefaultBool(conf, common.ConfigKey(common.LogsKey, common.MetricsCollectedKey, common.KubernetesKey, common.EnableGpuMetric), true) + enhancedContainerInsightsEnabled := awscontainerinsight.EnhancedContainerInsightsEnabled(conf) + if EnableGpuMetric && enhancedContainerInsightsEnabled { + metricDeclarations = append(metricDeclarations, []*awsemfexporter.MetricDeclaration{ + { + Dimensions: [][]string{{"Namespace", "ClusterName", "FullPodName", "PodName", "UUID"}, {"Namespace", "ClusterName", "PodName", "UUID"}, {"Namespace", "ClusterName", "Service"}, {"ClusterName"}}, + MetricNameSelectors: []string{ + "pod_gpu_utilization", + "pod_gpu_utilization_memory", + "pod_gpu_memory_total", + "pod_gpu_memory_used", + "pod_gpu_power_draw", + "pod_gpu_temperature", + }, + }, + { + Dimensions: [][]string{{"ClusterName", "NodeName", "InstanceId", "UUID"}, {"ClusterName", "NodeName", "InstanceId"}, {"ClusterName"}}, + MetricNameSelectors: []string{ + "node_gpu_utilization", + "node_gpu_utilization_memory", + "node_gpu_memory_total", + "node_gpu_memory_used", + "node_gpu_power_draw", + "node_gpu_temperature", + }, + }, + { + Dimensions: [][]string{{"ClusterName", "NodeName", "InstanceId"}, {"ClusterName"}}, + MetricNameSelectors: []string{ + "node_gpu_total", + }, + }, + { + Dimensions: [][]string{{"ClusterName"}}, + MetricNameSelectors: []string{ + "cluster_gpu_total", + }, + }, + }...) 
+ } + return metricDeclarations +} diff --git a/translator/translate/otel/processor/metricstransformprocessor/translator.go b/translator/translate/otel/processor/metricstransformprocessor/translator.go index 29979b6965..6e343750c9 100644 --- a/translator/translate/otel/processor/metricstransformprocessor/translator.go +++ b/translator/translate/otel/processor/metricstransformprocessor/translator.go @@ -31,14 +31,87 @@ func (t *translator) ID() component.ID { func (t *translator) Translate(conf *confmap.Conf) (component.Config, error) { cfg := t.factory.CreateDefaultConfig().(*metricstransformprocessor.Config) - c := confmap.NewFromStringMap(map[string]interface{}{ - "transforms": map[string]interface{}{ + transformRules := []map[string]interface{}{ + { "include": "apiserver_request_total", "match_type": "regexp", "experimental_match_labels": map[string]string{"code": "^5.*"}, "action": "insert", "new_name": "apiserver_request_total_5xx", }, + } + + if isGpuEnabled(conf) { + transformRules = append(transformRules, []map[string]interface{}{ + { + "include": "DCGM_FI_DEV_GPU_UTIL", + "action": "insert", + "new_name": "pod_gpu_utilization", + }, + { + "include": "DCGM_FI_DEV_GPU_UTIL", + "action": "insert", + "new_name": "node_gpu_utilization", + }, + { + "include": "DCGM_FI_DEV_MEM_COPY_UTIL", + "action": "insert", + "new_name": "pod_gpu_utilization_memory", + }, + { + "include": "DCGM_FI_DEV_MEM_COPY_UTIL", + "action": "insert", + "new_name": "node_gpu_utilization_memory", + }, + { + "include": "DCGM_FI_DEV_FB_USED", + "action": "insert", + "new_name": "pod_gpu_memory_used", + }, + { + "include": "DCGM_FI_DEV_FB_USED", + "action": "insert", + "new_name": "node_gpu_memory_used", + }, + { + "include": "^DCGM_FI_DEV_FB_(USED|FREE)$", + "action": "insert", + "new_name": "pod_gpu_memory_total", + "aggregation_type": "sum", + "match_type": "regexp", + }, + { + "include": "^DCGM_FI_DEV_FB_(USED|FREE)$", + "action": "insert", + "new_name": "node_gpu_memory_total", + 
"aggregation_type": "sum", + "match_type": "regexp", + }, + { + "include": "DCGM_FI_DEV_GPU_TEMP", + "action": "insert", + "new_name": "pod_gpu_temperature", + }, + { + "include": "DCGM_FI_DEV_GPU_TEMP", + "action": "insert", + "new_name": "node_gpu_temperature", + }, + { + "include": "DCGM_FI_DEV_POWER_USAGE", + "action": "insert", + "new_name": "pod_gpu_power_draw", + }, + { + "include": "DCGM_FI_DEV_POWER_USAGE", + "action": "insert", + "new_name": "node_gpu_power_draw", + }, + }...) + } + + c := confmap.NewFromStringMap(map[string]interface{}{ + "transforms": transformRules, }) if err := c.Unmarshal(&cfg); err != nil { return nil, fmt.Errorf("unable to unmarshal into metricstransform config: %w", err) @@ -46,3 +119,7 @@ func (t *translator) Translate(conf *confmap.Conf) (component.Config, error) { return cfg, nil } + +func isGpuEnabled(conf *confmap.Conf) bool { + return common.GetOrDefaultBool(conf, common.ConfigKey(common.LogsKey, common.MetricsCollectedKey, common.KubernetesKey, common.EnableGpuMetric), true) +} diff --git a/translator/translate/otel/receiver/awscontainerinsight/translator.go b/translator/translate/otel/receiver/awscontainerinsight/translator.go index 8c58c86fce..e7b8a228a6 100644 --- a/translator/translate/otel/receiver/awscontainerinsight/translator.go +++ b/translator/translate/otel/receiver/awscontainerinsight/translator.go @@ -108,9 +108,11 @@ func (t *translator) Translate(conf *confmap.Conf) (component.Config, error) { cfg.PrefFullPodName = true cfg.EnableControlPlaneMetrics = true } + } cfg.PrefFullPodName = cfg.PrefFullPodName || common.GetOrDefaultBool(conf, common.ConfigKey(common.LogsKey, common.MetricsCollectedKey, common.KubernetesKey, common.PreferFullPodName), false) + cfg.EnableGpuMetric = cfg.EnableGpuMetric || common.GetOrDefaultBool(conf, common.ConfigKey(common.LogsKey, common.MetricsCollectedKey, common.KubernetesKey, common.EnableGpuMetric), true) return cfg, nil } From 90ef122958e21fe328e6723de702bb62a3509c91 Mon Sep 
17 00:00:00 2001 From: Hyunsoo Kim Date: Tue, 13 Feb 2024 09:54:15 -0500 Subject: [PATCH 02/20] add gpu processor and update metric declarations for gpu --- internal/containerinsightscommon/const.go | 7 + plugins/processors/gpu/config.go | 21 ++ plugins/processors/gpu/factory.go | 56 ++++ plugins/processors/gpu/gpudecorator.go | 158 ++++++++++ plugins/processors/gpu/logtypeattribute.go | 47 +++ plugins/processors/gpu/metriccombiner.go | 116 +++++++ service/defaultcomponents/components.go | 2 + service/defaultcomponents/components_test.go | 3 +- .../emf_and_kubernetes_config.yaml | 3 + .../emf_and_kubernetes_with_gpu_config.yaml | 284 +++++++++++------- .../kubernetes_on_prem_config.yaml | 3 + .../logs_and_kubernetes_config.yaml | 3 + .../otel/exporter/awsemf/kubernetes.go | 48 ++- .../otel/exporter/awsemf/translator.go | 6 + .../pipeline/containerinsights/translator.go | 3 +- .../otel/processor/gpu/translator.go | 34 +++ .../metricstransformprocessor/translator.go | 203 ++++++++----- 17 files changed, 810 insertions(+), 187 deletions(-) create mode 100644 plugins/processors/gpu/config.go create mode 100644 plugins/processors/gpu/factory.go create mode 100644 plugins/processors/gpu/gpudecorator.go create mode 100644 plugins/processors/gpu/logtypeattribute.go create mode 100644 plugins/processors/gpu/metriccombiner.go create mode 100644 translator/translate/otel/processor/gpu/translator.go diff --git a/internal/containerinsightscommon/const.go b/internal/containerinsightscommon/const.go index 0b9dc01672..167cf311df 100644 --- a/internal/containerinsightscommon/const.go +++ b/internal/containerinsightscommon/const.go @@ -72,6 +72,13 @@ const ( DiskIOWrite = "Write" DiskIOTotal = "Total" + GpuUtilization = "gpu_utilization" + GpuMemUtilization = "gpu_utilization_memory" + GpuMemUsed = "gpu_memory_used" + GpuMemTotal = "gpu_memory_total" + GpuTemperature = "gpu_temperature" + GpuPowerDraw = "gpu_power_draw" + TypeCluster = "Cluster" TypeClusterService = "ClusterService" 
TypeClusterNamespace = "ClusterNamespace" diff --git a/plugins/processors/gpu/config.go b/plugins/processors/gpu/config.go new file mode 100644 index 0000000000..7f9198d75b --- /dev/null +++ b/plugins/processors/gpu/config.go @@ -0,0 +1,21 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +package gpu + +import ( + "go.opentelemetry.io/collector/component" +) + +type Config struct { + DropOriginalMetrics bool `mapstructure:"drop_original_metrics"` +} + +// Verify Config implements Processor interface. +var _ component.Config = (*Config)(nil) + +// Validate does not check for unsupported dimension key-value pairs, because those +// get silently dropped and ignored during translation. +func (cfg *Config) Validate() error { + return nil +} diff --git a/plugins/processors/gpu/factory.go b/plugins/processors/gpu/factory.go new file mode 100644 index 0000000000..8864cf47c0 --- /dev/null +++ b/plugins/processors/gpu/factory.go @@ -0,0 +1,56 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: MIT + +package gpu + +import ( + "context" + "fmt" + + "go.opentelemetry.io/collector/component" + "go.opentelemetry.io/collector/consumer" + "go.opentelemetry.io/collector/processor" + "go.opentelemetry.io/collector/processor/processorhelper" +) + +const ( + TypeStr = "gpu" + stability = component.StabilityLevelBeta +) + +var processorCapabilities = consumer.Capabilities{MutatesData: true} + +func NewFactory() processor.Factory { + return processor.NewFactory( + TypeStr, + createDefaultConfig, + processor.WithMetrics(createMetricsProcessor, stability)) +} + +func createDefaultConfig() component.Config { + return &Config{} +} + +func createMetricsProcessor( + ctx context.Context, + set processor.CreateSettings, + cfg component.Config, + nextConsumer consumer.Metrics, +) (processor.Metrics, error) { + processorConfig, ok := cfg.(*Config) + if !ok { + return nil, fmt.Errorf("configuration parsing error") + } + + metricsProcessor := newDecorator(processorConfig, set.Logger) + + return processorhelper.NewMetricsProcessor( + ctx, + set, + cfg, + nextConsumer, + metricsProcessor.processMetrics, + processorhelper.WithCapabilities(processorCapabilities), + processorhelper.WithStart(metricsProcessor.Start), + processorhelper.WithShutdown(metricsProcessor.Shutdown)) +} diff --git a/plugins/processors/gpu/gpudecorator.go b/plugins/processors/gpu/gpudecorator.go new file mode 100644 index 0000000000..acd73d2fc7 --- /dev/null +++ b/plugins/processors/gpu/gpudecorator.go @@ -0,0 +1,158 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: MIT + +package gpu + +import ( + "context" + "strings" + + "github.com/aws/amazon-cloudwatch-agent/internal/containerinsightscommon" + "go.opentelemetry.io/collector/component" + "go.opentelemetry.io/collector/pdata/pcommon" + "go.opentelemetry.io/collector/pdata/pmetric" + "go.uber.org/zap" +) + +const ( + gpuMetric = "_gpu_" +) + +var metricDuplicateTypes = []string{ + containerinsightscommon.TypeContainer, + containerinsightscommon.TypePod, + containerinsightscommon.TypeNode, +} + +var renameMapForDcgm = map[string]string{ + "DCGM_FI_DEV_GPU_UTIL": containerinsightscommon.GpuUtilization, + "DCGM_FI_DEV_MEM_COPY_UTIL": containerinsightscommon.GpuMemUtilization, + "DCGM_FI_DEV_FB_USED": containerinsightscommon.GpuMemUsed, + "DCGM_FI_DEV_FB_TOTAL": containerinsightscommon.GpuMemTotal, + "DCGM_FI_DEV_GPU_TEMP": containerinsightscommon.GpuTemperature, + "DCGM_FI_DEV_POWER_USAGE": containerinsightscommon.GpuPowerDraw, +} + +type metricMutationRule struct { + sources []string + target string + removeOriginal bool +} + +type metricMutator interface { + Process(ms pmetric.Metrics) error +} + +type attributeMutator interface { + Process(m pmetric.Metric, attrs pcommon.Map, removeOriginal bool) error +} + +type decorator struct { + *Config + logger *zap.Logger + cancelFunc context.CancelFunc + shutdownC chan bool + started bool + attributeMutators []attributeMutator + metricMutators []metricMutator +} + +func newDecorator(config *Config, logger *zap.Logger) *decorator { + _, cancel := context.WithCancel(context.Background()) + d := &decorator{ + Config: config, + logger: logger, + cancelFunc: cancel, + } + return d +} + +func (d *decorator) processMetrics(ctx context.Context, md pmetric.Metrics) (pmetric.Metrics, error) { + if !d.started { + return pmetric.NewMetrics(), nil + } + + for _, metricMutator := range d.metricMutators { + // crate memory total + metricMutator.Process(md) + } + + rms := md.ResourceMetrics() + for i := 0; i < rms.Len(); 
i++ { + rs := rms.At(i) + ilms := rs.ScopeMetrics() + for j := 0; j < ilms.Len(); j++ { + ils := ilms.At(j) + metrics := ils.Metrics() + d.normalize(ctx, metrics) + for k := 0; k < metrics.Len(); k++ { + m := metrics.At(k) + d.processMetricAttributes(ctx, m) + } + } + } + return md, nil +} + +func (d *decorator) normalize(_ context.Context, metrics pmetric.MetricSlice) { + // duplicate metrics for metric types by normalizing names + orgLen := metrics.Len() + for i := 0; i < orgLen; i++ { + metric := metrics.At(i) + if newName, ok := renameMapForDcgm[metric.Name()]; ok { + for _, dt := range metricDuplicateTypes { + newMetric := pmetric.NewMetric() + metric.CopyTo(newMetric) + newMetric.SetName(containerinsightscommon.MetricName(dt, newName)) + newMetric.MoveTo(metrics.AppendEmpty()) + } + } + } +} + +func (d *decorator) processMetricAttributes(_ context.Context, m pmetric.Metric) { + if !strings.Contains(m.Name(), gpuMetric) { + return + } + + switch m.Type() { + case pmetric.MetricTypeGauge: + dps := m.Gauge().DataPoints() + for i := 0; i < dps.Len(); i++ { + for _, mutator := range d.attributeMutators { + err := mutator.Process(m, dps.At(i).Attributes(), false) + if err != nil { + d.logger.Debug("failed to process attributes", zap.Error(err)) + } + } + } + case pmetric.MetricTypeSum: + dps := m.Sum().DataPoints() + for i := 0; i < dps.Len(); i++ { + for _, mutator := range d.attributeMutators { + err := mutator.Process(m, dps.At(i).Attributes(), false) + if err != nil { + d.logger.Debug("failed to process attributes", zap.Error(err)) + } + } + } + default: + d.logger.Debug("Ignore unknown metric type", zap.String("type", m.Type().String())) + } +} + +func (d *decorator) Shutdown(context.Context) error { + close(d.shutdownC) + d.cancelFunc() + return nil +} + +func (d *decorator) Start(ctx context.Context, _ component.Host) error { + d.shutdownC = make(chan bool) + logTypeMutator := NewLogTypeAttribute(d.logger) + d.attributeMutators = 
[]attributeMutator{logTypeMutator} + metricCombiner := NewMetricCombiner(d.logger, metricMutationRule{sources: []string{"DCGM_FI_DEV_FB_USED", "DCGM_FI_DEV_FB_FREE"}, target: "DCGM_FI_DEV_FB_TOTAL"}) + d.metricMutators = []metricMutator{metricCombiner} + d.started = true + return nil +} diff --git a/plugins/processors/gpu/logtypeattribute.go b/plugins/processors/gpu/logtypeattribute.go new file mode 100644 index 0000000000..8c8b0f85ef --- /dev/null +++ b/plugins/processors/gpu/logtypeattribute.go @@ -0,0 +1,47 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +package gpu + +import ( + "strings" + + "github.com/aws/amazon-cloudwatch-agent/internal/containerinsightscommon" + "go.opentelemetry.io/collector/pdata/pcommon" + "go.opentelemetry.io/collector/pdata/pmetric" + "go.uber.org/zap" +) + +type logTypeAttribute struct { + logger *zap.Logger +} + +func NewLogTypeAttribute(logger *zap.Logger) *logTypeAttribute { + return &logTypeAttribute{ + logger: logger, + } +} + +func (an *logTypeAttribute) Process(m pmetric.Metric, attributes pcommon.Map, removeOriginal bool) error { + an.addLogTypeAttribute(m, attributes) + return nil +} + +// NOTE: There are additional metric types (PodGpu and NodeGpu) that get applied in the emf exporter. 
+// Those 2 metric types handled by emf exporter are used only for dimensions sets that include "GpuDevice" +func (an *logTypeAttribute) addLogTypeAttribute(m pmetric.Metric, attributes pcommon.Map) { + logType := "" + switch strings.Split(m.Name(), "_")[0] { + case "container": + logType = containerinsightscommon.TypeContainer + case "pod": + logType = containerinsightscommon.TypePod + case "node": + logType = containerinsightscommon.TypeNode + case "cluster": + logType = containerinsightscommon.TypeCluster + default: + an.logger.Warn("metric name is either empty or not a supported type") + } + attributes.PutStr("Type", logType) +} diff --git a/plugins/processors/gpu/metriccombiner.go b/plugins/processors/gpu/metriccombiner.go new file mode 100644 index 0000000000..d14ada92e8 --- /dev/null +++ b/plugins/processors/gpu/metriccombiner.go @@ -0,0 +1,116 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +package gpu + +import ( + "go.opentelemetry.io/collector/pdata/pmetric" + "go.uber.org/zap" +) + +type metricCombiner struct { + logger *zap.Logger + rule metricMutationRule +} + +func NewMetricCombiner(logger *zap.Logger, rule metricMutationRule) *metricCombiner { + return &metricCombiner{ + logger: logger, + rule: rule, + } +} + +// basic idea/code is from metricsgenerationprocessor [BETA] https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/processor/metricsgenerationprocessor/README.md +func (mm *metricCombiner) Process(ms pmetric.Metrics) error { + rms := ms.ResourceMetrics() + for i := 0; i < rms.Len(); i++ { + rm := rms.At(i) + nameToMetricMap := mm.getNameToMetricMap(rm) + + from2Val := float64(0) + from1, ok := nameToMetricMap[mm.rule.sources[0]] + if !ok { + mm.logger.Debug("Missing first metric", zap.String("metric_name", mm.rule.sources[0])) + continue + } + from2, ok := nameToMetricMap[mm.rule.sources[1]] + if !ok { + mm.logger.Debug("Missing second metric", 
zap.String("metric_name", mm.rule.sources[1])) + continue + } + from2Val = mm.getMetricValue(from2) + mm.generateMetrics(rm, mm.rule.target, from1.Name(), from1.Unit(), from2Val) + } + return nil +} + +func (mm *metricCombiner) getNameToMetricMap(rm pmetric.ResourceMetrics) map[string]pmetric.Metric { + ilms := rm.ScopeMetrics() + metricMap := make(map[string]pmetric.Metric) + + for i := 0; i < ilms.Len(); i++ { + ilm := ilms.At(i) + metricSlice := ilm.Metrics() + for j := 0; j < metricSlice.Len(); j++ { + metric := metricSlice.At(j) + metricMap[metric.Name()] = metric + } + } + return metricMap +} + +func (mm *metricCombiner) getMetricValue(metric pmetric.Metric) float64 { + if metric.Type() == pmetric.MetricTypeGauge { + dataPoints := metric.Gauge().DataPoints() + if dataPoints.Len() > 0 { + switch dataPoints.At(0).ValueType() { + case pmetric.NumberDataPointValueTypeDouble: + return dataPoints.At(0).DoubleValue() + case pmetric.NumberDataPointValueTypeInt: + return float64(dataPoints.At(0).IntValue()) + } + } + return 0 + } + return 0 +} + +// generateMetrics creates a new metric based on the given rule and add it to the Resource Metric. +// The value for newly calculated metrics is always a floting point number and the dataType is set +// as MetricTypeDoubleGauge. 
+func (mm *metricCombiner) generateMetrics(rm pmetric.ResourceMetrics, newName string, f1name string, unit string, f2val float64) { + ilms := rm.ScopeMetrics() + for i := 0; i < ilms.Len(); i++ { + ilm := ilms.At(i) + metricSlice := ilm.Metrics() + for j := 0; j < metricSlice.Len(); j++ { + metric := metricSlice.At(j) + if metric.Name() == f1name { + newMetric := ilm.Metrics().AppendEmpty() + newMetric.SetName(newName) + newMetric.SetUnit(unit) + newMetric.SetEmptyGauge() + mm.addDoubleGaugeDataPoints(metric, newMetric, f2val) + } + } + } +} + +func (mm *metricCombiner) addDoubleGaugeDataPoints(from pmetric.Metric, to pmetric.Metric, m2val float64) { + dataPoints := from.Gauge().DataPoints() + for i := 0; i < dataPoints.Len(); i++ { + from := dataPoints.At(i) + var val float64 + switch from.ValueType() { + case pmetric.NumberDataPointValueTypeDouble: + val = from.DoubleValue() + case pmetric.NumberDataPointValueTypeInt: + val = float64(from.IntValue()) + } + + newDp := to.Gauge().DataPoints().AppendEmpty() + from.CopyTo(newDp) + value := val + m2val + newDp.SetDoubleValue(value) + } +} diff --git a/service/defaultcomponents/components.go b/service/defaultcomponents/components.go index 602371fa40..509b1f0184 100644 --- a/service/defaultcomponents/components.go +++ b/service/defaultcomponents/components.go @@ -4,6 +4,7 @@ package defaultcomponents import ( + "github.com/aws/amazon-cloudwatch-agent/plugins/processors/gpu" "github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awscloudwatchlogsexporter" "github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awsemfexporter" "github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awsxrayexporter" @@ -53,6 +54,7 @@ func Factories() (otelcol.Factories, error) { metricstransformprocessor.NewFactory(), resourcedetectionprocessor.NewFactory(), transformprocessor.NewFactory(), + gpu.NewFactory(), ); err != nil { return otelcol.Factories{}, err } diff --git 
a/service/defaultcomponents/components_test.go b/service/defaultcomponents/components_test.go index 17369775b4..ab3f2b63f4 100644 --- a/service/defaultcomponents/components_test.go +++ b/service/defaultcomponents/components_test.go @@ -11,7 +11,7 @@ import ( const ( receiversCount = 5 - processorCount = 7 + processorCount = 8 exportersCount = 5 extensionsCount = 2 ) @@ -35,6 +35,7 @@ func TestComponents(t *testing.T) { assert.NotNil(t, processors["ec2tagger"]) assert.NotNil(t, processors["metricstransform"]) assert.NotNil(t, processors["transform"]) + assert.NotNil(t, processors["gpu"]) exporters := factories.Exporters assert.Len(t, exporters, exportersCount) diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml index 1557628b44..056d1ca312 100644 --- a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml @@ -442,6 +442,8 @@ processors: new_name: apiserver_request_total_5xx operations: [] submatch_case: "" + gpu/containerinsights: + drop_original_metrics: false receivers: awscontainerinsightreceiver: add_container_name_metric_label: true @@ -522,6 +524,7 @@ service: processors: - metricstransform/containerinsights - batch/containerinsights + - gpu/containerinsights receivers: - awscontainerinsightreceiver telemetry: diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml index 94d2aaf310..5b46c4dfe7 100644 --- a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml @@ -329,18 +329,66 @@ exporters: - apiserver_flowcontrol_request_concurrency_limit - dimensions: - - ClusterName + - - ClusterName + - ContainerName + - Namespace + - PodName + - - ClusterName + - ContainerName - 
FullPodName - Namespace - PodName - - UUID + label_matchers: [] + metric_name_selectors: + - container_gpu_utilization + - container_gpu_utilization_memory + - container_gpu_memory_total + - container_gpu_memory_used + - container_gpu_power_draw + - container_gpu_temperature + - dimensions: - - ClusterName + - ContainerName + - FullPodName + - GpuDevice - Namespace - PodName - - UUID + label_matchers: [] + metric_name_selectors: + - container_gpu_utilization + - container_gpu_utilization_memory + - container_gpu_memory_total + - container_gpu_memory_used + - container_gpu_power_draw + - container_gpu_temperature + - dimensions: + - - ClusterName + - - ClusterName + - Namespace - - ClusterName - Namespace - Service - - ClusterName + - Namespace + - PodName + - - ClusterName + - FullPodName + - Namespace + - PodName + label_matchers: [] + metric_name_selectors: + - pod_gpu_utilization + - pod_gpu_utilization_memory + - pod_gpu_memory_total + - pod_gpu_memory_used + - pod_gpu_power_draw + - pod_gpu_temperature + - dimensions: + - - ClusterName + - FullPodName + - GpuDevice + - Namespace + - PodName label_matchers: [] metric_name_selectors: - pod_gpu_utilization @@ -350,14 +398,23 @@ exporters: - pod_gpu_power_draw - pod_gpu_temperature - dimensions: + - - ClusterName - - ClusterName - InstanceId - NodeName - - UUID + label_matchers: [] + metric_name_selectors: + - node_gpu_utilization + - node_gpu_utilization_memory + - node_gpu_memory_total + - node_gpu_memory_used + - node_gpu_power_draw + - node_gpu_temperature + - dimensions: - - ClusterName + - GpuDevice - InstanceId - NodeName - - - ClusterName label_matchers: [] metric_name_selectors: - node_gpu_utilization @@ -491,114 +548,116 @@ processors: new_name: apiserver_request_total_5xx operations: [] submatch_case: "" - - action: insert - aggregation_type: "" - experimental_match_labels: { } - group_resource_labels: { } - include: DCGM_FI_DEV_GPU_UTIL - match_type: "" - new_name: pod_gpu_utilization - operations: [ ] 
- submatch_case: "" - - action: insert - aggregation_type: "" - experimental_match_labels: { } - group_resource_labels: { } - include: DCGM_FI_DEV_GPU_UTIL - match_type: "" - new_name: node_gpu_utilization - operations: [ ] - submatch_case: "" - - action: insert - aggregation_type: "" - experimental_match_labels: { } - group_resource_labels: { } - include: DCGM_FI_DEV_MEM_COPY_UTIL - match_type: "" - new_name: pod_gpu_utilization_memory - operations: [ ] - submatch_case: "" - - action: insert - aggregation_type: "" - experimental_match_labels: { } - group_resource_labels: { } - include: DCGM_FI_DEV_MEM_COPY_UTIL - match_type: "" - new_name: node_gpu_utilization_memory - operations: [ ] - submatch_case: "" - - action: insert - aggregation_type: "" - experimental_match_labels: { } - group_resource_labels: { } - include: DCGM_FI_DEV_FB_USED - match_type: "" - new_name: pod_gpu_memory_used - operations: [ ] - submatch_case: "" - - action: insert - aggregation_type: "" - experimental_match_labels: { } - group_resource_labels: { } - include: DCGM_FI_DEV_FB_USED - match_type: "" - new_name: node_gpu_memory_used - operations: [ ] - submatch_case: "" - - action: insert - aggregation_type: "sum" - experimental_match_labels: { } - group_resource_labels: { } - include: ^DCGM_FI_DEV_FB_(USED|FREE)$ - match_type: "regexp" - new_name: pod_gpu_memory_total - operations: [ ] - submatch_case: "" - - action: insert - aggregation_type: "sum" - experimental_match_labels: { } - group_resource_labels: { } - include: ^DCGM_FI_DEV_FB_(USED|FREE)$ - match_type: "regexp" - new_name: node_gpu_memory_total - operations: [ ] - submatch_case: "" - - action: insert - aggregation_type: "" - experimental_match_labels: { } - group_resource_labels: { } - include: DCGM_FI_DEV_GPU_TEMP - match_type: "" - new_name: pod_gpu_temperature - operations: [ ] - submatch_case: "" - - action: insert - aggregation_type: "" - experimental_match_labels: { } - group_resource_labels: { } - include: 
DCGM_FI_DEV_GPU_TEMP - match_type: "" - new_name: node_gpu_temperature - operations: [ ] - submatch_case: "" - - action: insert - aggregation_type: "" - experimental_match_labels: { } - group_resource_labels: { } - include: DCGM_FI_DEV_POWER_USAGE - match_type: "" - new_name: pod_gpu_power_draw - operations: [ ] - submatch_case: "" - - action: insert - aggregation_type: "" - experimental_match_labels: { } - group_resource_labels: { } - include: DCGM_FI_DEV_POWER_USAGE - match_type: "" - new_name: node_gpu_power_draw - operations: [ ] - submatch_case: "" +# - action: insert +# aggregation_type: "" +# experimental_match_labels: { } +# group_resource_labels: { } +# include: DCGM_FI_DEV_GPU_UTIL +# match_type: "" +# new_name: pod_gpu_utilization +# operations: [ ] +# submatch_case: "" +# - action: insert +# aggregation_type: "" +# experimental_match_labels: { } +# group_resource_labels: { } +# include: DCGM_FI_DEV_GPU_UTIL +# match_type: "" +# new_name: node_gpu_utilization +# operations: [ ] +# submatch_case: "" +# - action: insert +# aggregation_type: "" +# experimental_match_labels: { } +# group_resource_labels: { } +# include: DCGM_FI_DEV_MEM_COPY_UTIL +# match_type: "" +# new_name: pod_gpu_utilization_memory +# operations: [ ] +# submatch_case: "" +# - action: insert +# aggregation_type: "" +# experimental_match_labels: { } +# group_resource_labels: { } +# include: DCGM_FI_DEV_MEM_COPY_UTIL +# match_type: "" +# new_name: node_gpu_utilization_memory +# operations: [ ] +# submatch_case: "" +# - action: insert +# aggregation_type: "" +# experimental_match_labels: { } +# group_resource_labels: { } +# include: DCGM_FI_DEV_FB_USED +# match_type: "" +# new_name: pod_gpu_memory_used +# operations: [ ] +# submatch_case: "" +# - action: insert +# aggregation_type: "" +# experimental_match_labels: { } +# group_resource_labels: { } +# include: DCGM_FI_DEV_FB_USED +# match_type: "" +# new_name: node_gpu_memory_used +# operations: [ ] +# submatch_case: "" +# - action: insert +# 
aggregation_type: "sum" +# experimental_match_labels: { } +# group_resource_labels: { } +# include: ^DCGM_FI_DEV_FB_(USED|FREE)$ +# match_type: "regexp" +# new_name: pod_gpu_memory_total +# operations: [ ] +# submatch_case: "" +# - action: insert +# aggregation_type: "sum" +# experimental_match_labels: { } +# group_resource_labels: { } +# include: ^DCGM_FI_DEV_FB_(USED|FREE)$ +# match_type: "regexp" +# new_name: node_gpu_memory_total +# operations: [ ] +# submatch_case: "" +# - action: insert +# aggregation_type: "" +# experimental_match_labels: { } +# group_resource_labels: { } +# include: DCGM_FI_DEV_GPU_TEMP +# match_type: "" +# new_name: pod_gpu_temperature +# operations: [ ] +# submatch_case: "" +# - action: insert +# aggregation_type: "" +# experimental_match_labels: { } +# group_resource_labels: { } +# include: DCGM_FI_DEV_GPU_TEMP +# match_type: "" +# new_name: node_gpu_temperature +# operations: [ ] +# submatch_case: "" +# - action: insert +# aggregation_type: "" +# experimental_match_labels: { } +# group_resource_labels: { } +# include: DCGM_FI_DEV_POWER_USAGE +# match_type: "" +# new_name: pod_gpu_power_draw +# operations: [ ] +# submatch_case: "" +# - action: insert +# aggregation_type: "" +# experimental_match_labels: { } +# group_resource_labels: { } +# include: DCGM_FI_DEV_POWER_USAGE +# match_type: "" +# new_name: node_gpu_power_draw +# operations: [ ] +# submatch_case: "" + gpu/containerinsights: + drop_original_metrics: false receivers: awscontainerinsightreceiver: @@ -680,6 +739,7 @@ service: processors: - metricstransform/containerinsights - batch/containerinsights + - gpu/containerinsights receivers: - awscontainerinsightreceiver telemetry: diff --git a/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml b/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml index 91e90916b1..b176d8f927 100644 --- a/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml +++ 
b/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml @@ -402,6 +402,8 @@ processors: new_name: apiserver_request_total_5xx operations: [] submatch_case: "" + gpu/containerinsights: + drop_original_metrics: false receivers: awscontainerinsightreceiver: add_container_name_metric_label: true @@ -440,6 +442,7 @@ service: processors: - metricstransform/containerinsights - batch/containerinsights + - gpu/containerinsights receivers: - awscontainerinsightreceiver telemetry: diff --git a/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml b/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml index 48b06f125b..14bd028c06 100644 --- a/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml +++ b/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml @@ -440,6 +440,8 @@ processors: new_name: apiserver_request_total_5xx operations: [] submatch_case: "" + gpu/containerinsights: + drop_original_metrics: false receivers: awscontainerinsightreceiver: add_container_name_metric_label: true @@ -519,6 +521,7 @@ service: processors: - metricstransform/containerinsights - batch/containerinsights + - gpu/containerinsights receivers: - awscontainerinsightreceiver telemetry: diff --git a/translator/translate/otel/exporter/awsemf/kubernetes.go b/translator/translate/otel/exporter/awsemf/kubernetes.go index a1ebe6ed8d..e7f8a2a0dc 100644 --- a/translator/translate/otel/exporter/awsemf/kubernetes.go +++ b/translator/translate/otel/exporter/awsemf/kubernetes.go @@ -469,7 +469,40 @@ func getGPUMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.MetricDeclar if EnableGpuMetric && enhancedContainerInsightsEnabled { metricDeclarations = append(metricDeclarations, []*awsemfexporter.MetricDeclaration{ { - Dimensions: [][]string{{"Namespace", "ClusterName", "FullPodName", "PodName", "UUID"}, {"Namespace", "ClusterName", "PodName", "UUID"}, {"Namespace", "ClusterName", "Service"}, {"ClusterName"}}, + Dimensions: 
[][]string{{"ClusterName"}, {"ClusterName", "Namespace", "PodName", "ContainerName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName"}}, + MetricNameSelectors: []string{ + "container_gpu_utilization", + "container_gpu_utilization_memory", + "container_gpu_memory_total", + "container_gpu_memory_used", + "container_gpu_power_draw", + "container_gpu_temperature", + }, + }, + { + Dimensions: [][]string{{"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName", "GpuDevice"}}, + MetricNameSelectors: []string{ + "container_gpu_utilization", + "container_gpu_utilization_memory", + "container_gpu_memory_total", + "container_gpu_memory_used", + "container_gpu_power_draw", + "container_gpu_temperature", + }, + }, + { + Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace"}, {"ClusterName", "Namespace", "Service"}, {"ClusterName", "Namespace", "PodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName"}}, + MetricNameSelectors: []string{ + "pod_gpu_utilization", + "pod_gpu_utilization_memory", + "pod_gpu_memory_total", + "pod_gpu_memory_used", + "pod_gpu_power_draw", + "pod_gpu_temperature", + }, + }, + { + Dimensions: [][]string{{"ClusterName", "Namespace", "PodName", "FullPodName", "GpuDevice"}}, MetricNameSelectors: []string{ "pod_gpu_utilization", "pod_gpu_utilization_memory", @@ -480,7 +513,18 @@ func getGPUMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.MetricDeclar }, }, { - Dimensions: [][]string{{"ClusterName", "NodeName", "InstanceId", "UUID"}, {"ClusterName", "NodeName", "InstanceId"}, {"ClusterName"}}, + Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "NodeName", "InstanceId"}}, + MetricNameSelectors: []string{ + "node_gpu_utilization", + "node_gpu_utilization_memory", + "node_gpu_memory_total", + "node_gpu_memory_used", + "node_gpu_power_draw", + "node_gpu_temperature", + }, + }, + { + Dimensions: [][]string{{"ClusterName", "NodeName", "InstanceId", "GpuDevice"}}, MetricNameSelectors: 
[]string{ "node_gpu_utilization", "node_gpu_utilization_memory", diff --git a/translator/translate/otel/exporter/awsemf/translator.go b/translator/translate/otel/exporter/awsemf/translator.go index bafd70cbef..93b98b0933 100644 --- a/translator/translate/otel/exporter/awsemf/translator.go +++ b/translator/translate/otel/exporter/awsemf/translator.go @@ -191,6 +191,8 @@ func setKubernetesFields(conf *confmap.Conf, cfg *awsemfexporter.Config) error { cfg.EnhancedContainerInsights = true } + setEnableGpuMetrics(kubernetesBasePathKey, conf, cfg) + return nil } @@ -228,3 +230,7 @@ func setPrometheusFields(conf *confmap.Conf, cfg *awsemfexporter.Config) error { func setDisableMetricExtraction(baseKey string, conf *confmap.Conf, cfg *awsemfexporter.Config) { cfg.DisableMetricExtraction = common.GetOrDefaultBool(conf, common.ConfigKey(baseKey, common.DisableMetricExtraction), false) } + +func setEnableGpuMetrics(baseKey string, conf *confmap.Conf, cfg *awsemfexporter.Config) { + cfg.EnableGpuMetric = common.GetOrDefaultBool(conf, common.ConfigKey(baseKey, common.EnableGpuMetric), true) +} diff --git a/translator/translate/otel/pipeline/containerinsights/translator.go b/translator/translate/otel/pipeline/containerinsights/translator.go index ba5fb093e2..a4073f9144 100644 --- a/translator/translate/otel/pipeline/containerinsights/translator.go +++ b/translator/translate/otel/pipeline/containerinsights/translator.go @@ -6,6 +6,7 @@ package containerinsights import ( "fmt" + "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/processor/gpu" "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/confmap" @@ -52,7 +53,7 @@ func (t *translator) Translate(conf *confmap.Conf) (*common.ComponentTranslators if enhancedContainerInsightsEnabled { return &common.ComponentTranslators{ Receivers: common.NewTranslatorMap(awscontainerinsight.NewTranslator()), - Processors: common.NewTranslatorMap(metricstransformprocessor.NewTranslatorWithName(pipelineName), 
batchprocessor.NewTranslatorWithNameAndSection(pipelineName, common.LogsKey)), // EKS & ECS CI sit under metrics_collected in "logs" + Processors: common.NewTranslatorMap(metricstransformprocessor.NewTranslatorWithName(pipelineName), batchprocessor.NewTranslatorWithNameAndSection(pipelineName, common.LogsKey), gpu.NewTranslatorWithName(pipelineName)), // EKS & ECS CI sit under metrics_collected in "logs" Exporters: common.NewTranslatorMap(awsemf.NewTranslatorWithName(pipelineName)), Extensions: common.NewTranslatorMap(agenthealth.NewTranslator(component.DataTypeLogs, []string{agenthealth.OperationPutLogEvents})), }, nil diff --git a/translator/translate/otel/processor/gpu/translator.go b/translator/translate/otel/processor/gpu/translator.go new file mode 100644 index 0000000000..0fe5998d5c --- /dev/null +++ b/translator/translate/otel/processor/gpu/translator.go @@ -0,0 +1,34 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +package gpu + +import ( + "github.com/aws/amazon-cloudwatch-agent/plugins/processors/gpu" + "go.opentelemetry.io/collector/component" + "go.opentelemetry.io/collector/confmap" + "go.opentelemetry.io/collector/processor" + + "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/common" +) + +type translator struct { + name string + factory processor.Factory +} + +var _ common.Translator[component.Config] = (*translator)(nil) + +func NewTranslatorWithName(name string) common.Translator[component.Config] { + return &translator{name, gpu.NewFactory()} +} + +func (t *translator) ID() component.ID { + return component.NewIDWithName(t.factory.Type(), t.name) +} + +func (t *translator) Translate(conf *confmap.Conf) (component.Config, error) { + cfg := t.factory.CreateDefaultConfig().(*gpu.Config) + cfg.DropOriginalMetrics = false + return cfg, nil +} diff --git a/translator/translate/otel/processor/metricstransformprocessor/translator.go 
b/translator/translate/otel/processor/metricstransformprocessor/translator.go index 6e343750c9..2d43a7842f 100644 --- a/translator/translate/otel/processor/metricstransformprocessor/translator.go +++ b/translator/translate/otel/processor/metricstransformprocessor/translator.go @@ -41,74 +41,135 @@ func (t *translator) Translate(conf *confmap.Conf) (component.Config, error) { }, } - if isGpuEnabled(conf) { - transformRules = append(transformRules, []map[string]interface{}{ - { - "include": "DCGM_FI_DEV_GPU_UTIL", - "action": "insert", - "new_name": "pod_gpu_utilization", - }, - { - "include": "DCGM_FI_DEV_GPU_UTIL", - "action": "insert", - "new_name": "node_gpu_utilization", - }, - { - "include": "DCGM_FI_DEV_MEM_COPY_UTIL", - "action": "insert", - "new_name": "pod_gpu_utilization_memory", - }, - { - "include": "DCGM_FI_DEV_MEM_COPY_UTIL", - "action": "insert", - "new_name": "node_gpu_utilization_memory", - }, - { - "include": "DCGM_FI_DEV_FB_USED", - "action": "insert", - "new_name": "pod_gpu_memory_used", - }, - { - "include": "DCGM_FI_DEV_FB_USED", - "action": "insert", - "new_name": "node_gpu_memory_used", - }, - { - "include": "^DCGM_FI_DEV_FB_(USED|FREE)$", - "action": "insert", - "new_name": "pod_gpu_memory_total", - "aggregation_type": "sum", - "match_type": "regexp", - }, - { - "include": "^DCGM_FI_DEV_FB_(USED|FREE)$", - "action": "insert", - "new_name": "node_gpu_memory_total", - "aggregation_type": "sum", - "match_type": "regexp", - }, - { - "include": "DCGM_FI_DEV_GPU_TEMP", - "action": "insert", - "new_name": "pod_gpu_temperature", - }, - { - "include": "DCGM_FI_DEV_GPU_TEMP", - "action": "insert", - "new_name": "node_gpu_temperature", - }, - { - "include": "DCGM_FI_DEV_POWER_USAGE", - "action": "insert", - "new_name": "pod_gpu_power_draw", - }, - { - "include": "DCGM_FI_DEV_POWER_USAGE", - "action": "insert", - "new_name": "node_gpu_power_draw", - }, - }...) 
- } + //if isGpuEnabled(conf) { + // gpuTransformRules := []map[string]interface{}{ + // { + // "include": "DCGM_FI_DEV_GPU_UTIL", + // "action": "insert", + // "new_name": "container_gpu_utilization", + // }, + // { + // "include": "DCGM_FI_DEV_GPU_UTIL", + // "action": "insert", + // "new_name": "pod_gpu_utilization", + // }, + // { + // "include": "DCGM_FI_DEV_GPU_UTIL", + // "action": "insert", + // "new_name": "node_gpu_utilization", + // }, + // { + // "include": "DCGM_FI_DEV_MEM_COPY_UTIL", + // "action": "insert", + // "new_name": "container_gpu_utilization_memory", + // }, + // { + // "include": "DCGM_FI_DEV_MEM_COPY_UTIL", + // "action": "insert", + // "new_name": "pod_gpu_utilization_memory", + // }, + // { + // "include": "DCGM_FI_DEV_MEM_COPY_UTIL", + // "action": "insert", + // "new_name": "node_gpu_utilization_memory", + // }, + // { + // "include": "DCGM_FI_DEV_FB_USED", + // "action": "insert", + // "new_name": "container_gpu_memory_used", + // }, + // { + // "include": "DCGM_FI_DEV_FB_USED", + // "action": "insert", + // "new_name": "pod_gpu_memory_used", + // }, + // { + // "include": "DCGM_FI_DEV_FB_USED", + // "action": "insert", + // "new_name": "node_gpu_memory_used", + // }, + // { + // "include": "DCGM_FI_DEV_FB_TOTAL", + // "action": "insert", + // "new_name": "container_gpu_memory_total", + // }, + // { + // "include": "DCGM_FI_DEV_FB_TOTAL", + // "action": "insert", + // "new_name": "pod_gpu_memory_total", + // }, + // { + // "include": "DCGM_FI_DEV_FB_TOTAL", + // "action": "insert", + // "new_name": "node_gpu_memory_total", + // }, + // //{ + // // "include": "^DCGM_FI_DEV_FB_(USED|FREE)$", + // // "action": "combine", + // // "new_name": "pod_gpu_memory_total", + // // "aggregation_type": "sum", + // // "match_type": "regexp", + // //}, + // //{ + // // "include": "^DCGM_FI_DEV_FB_(USED|FREE)$", + // // "action": "combine", + // // "new_name": "node_gpu_memory_total", + // // "aggregation_type": "sum", + // // "match_type": "regexp", 
+ // //}, + // { + // "include": "DCGM_FI_DEV_GPU_TEMP", + // "action": "insert", + // "new_name": "cotainer_gpu_temperature", + // }, + // { + // "include": "DCGM_FI_DEV_GPU_TEMP", + // "action": "insert", + // "new_name": "pod_gpu_temperature", + // }, + // { + // "include": "DCGM_FI_DEV_GPU_TEMP", + // "action": "insert", + // "new_name": "node_gpu_temperature", + // }, + // { + // "include": "DCGM_FI_DEV_POWER_USAGE", + // "action": "insert", + // "new_name": "container_gpu_power_draw", + // }, + // { + // "include": "DCGM_FI_DEV_POWER_USAGE", + // "action": "insert", + // "new_name": "pod_gpu_power_draw", + // }, + // { + // "include": "DCGM_FI_DEV_POWER_USAGE", + // "action": "insert", + // "new_name": "node_gpu_power_draw", + // }, + // } + // + // for _, rule := range gpuTransformRules { + // logType := "" + // metricName := rule["new_name"].(string) + // if strings.HasPrefix(metricName, "container_") { + // logType = "Node" + // } else if strings.HasPrefix(metricName, "node_") { + // logType = "Node" + // } else if strings.HasPrefix(metricName, "cluster_") { + // logType = "Cluster" + // } else { + // logType = "Pod" + // } + // rule["operations"] = map[string]interface{}{ + // "action": "add_label", + // "new_label": containerinsightscommon.MetricType, + // "new_value": logType, + // } + // } + // + // transformRules = append(transformRules, gpuTransformRules...) 
+ //} c := confmap.NewFromStringMap(map[string]interface{}{ "transforms": transformRules, @@ -120,6 +181,6 @@ func (t *translator) Translate(conf *confmap.Conf) (component.Config, error) { return cfg, nil } -func isGpuEnabled(conf *confmap.Conf) bool { - return common.GetOrDefaultBool(conf, common.ConfigKey(common.LogsKey, common.MetricsCollectedKey, common.KubernetesKey, common.EnableGpuMetric), true) -} +//func isGpuEnabled(conf *confmap.Conf) bool { +// return common.GetOrDefaultBool(conf, common.ConfigKey(common.LogsKey, common.MetricsCollectedKey, common.KubernetesKey, common.EnableGpuMetric), true) +//} From 91e818215756e9d5b80f05d6c55508b56eeedfa6 Mon Sep 17 00:00:00 2001 From: Hyunsoo Kim Date: Fri, 16 Feb 2024 12:25:06 -0500 Subject: [PATCH 03/20] use metric transformer to handle gpu metrics and labels & clean up --- internal/containerinsightscommon/const.go | 1 + plugins/processors/gpu/factory.go | 2 +- plugins/processors/gpu/gpudecorator.go | 158 -------------- plugins/processors/gpu/logtypeattribute.go | 25 ++- plugins/processors/gpu/metriccombiner.go | 116 ---------- plugins/processors/gpu/processor.go | 111 ++++++++++ .../otel/exporter/awsemf/kubernetes.go | 43 +--- .../otel/exporter/awsemf/translator.go | 6 - .../metricstransformprocessor/translator.go | 204 +++++++----------- 9 files changed, 215 insertions(+), 451 deletions(-) delete mode 100644 plugins/processors/gpu/gpudecorator.go delete mode 100644 plugins/processors/gpu/metriccombiner.go create mode 100644 plugins/processors/gpu/processor.go diff --git a/internal/containerinsightscommon/const.go b/internal/containerinsightscommon/const.go index 167cf311df..aea5cb97d5 100644 --- a/internal/containerinsightscommon/const.go +++ b/internal/containerinsightscommon/const.go @@ -78,6 +78,7 @@ const ( GpuMemTotal = "gpu_memory_total" GpuTemperature = "gpu_temperature" GpuPowerDraw = "gpu_power_draw" + GpuFanSpeed = "gpu_fan_speed" TypeCluster = "Cluster" TypeClusterService = "ClusterService" diff 
--git a/plugins/processors/gpu/factory.go b/plugins/processors/gpu/factory.go index 8864cf47c0..9c7c61b21a 100644 --- a/plugins/processors/gpu/factory.go +++ b/plugins/processors/gpu/factory.go @@ -42,7 +42,7 @@ func createMetricsProcessor( return nil, fmt.Errorf("configuration parsing error") } - metricsProcessor := newDecorator(processorConfig, set.Logger) + metricsProcessor := newGpuProcessor(processorConfig, set.Logger) return processorhelper.NewMetricsProcessor( ctx, diff --git a/plugins/processors/gpu/gpudecorator.go b/plugins/processors/gpu/gpudecorator.go deleted file mode 100644 index acd73d2fc7..0000000000 --- a/plugins/processors/gpu/gpudecorator.go +++ /dev/null @@ -1,158 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: MIT - -package gpu - -import ( - "context" - "strings" - - "github.com/aws/amazon-cloudwatch-agent/internal/containerinsightscommon" - "go.opentelemetry.io/collector/component" - "go.opentelemetry.io/collector/pdata/pcommon" - "go.opentelemetry.io/collector/pdata/pmetric" - "go.uber.org/zap" -) - -const ( - gpuMetric = "_gpu_" -) - -var metricDuplicateTypes = []string{ - containerinsightscommon.TypeContainer, - containerinsightscommon.TypePod, - containerinsightscommon.TypeNode, -} - -var renameMapForDcgm = map[string]string{ - "DCGM_FI_DEV_GPU_UTIL": containerinsightscommon.GpuUtilization, - "DCGM_FI_DEV_MEM_COPY_UTIL": containerinsightscommon.GpuMemUtilization, - "DCGM_FI_DEV_FB_USED": containerinsightscommon.GpuMemUsed, - "DCGM_FI_DEV_FB_TOTAL": containerinsightscommon.GpuMemTotal, - "DCGM_FI_DEV_GPU_TEMP": containerinsightscommon.GpuTemperature, - "DCGM_FI_DEV_POWER_USAGE": containerinsightscommon.GpuPowerDraw, -} - -type metricMutationRule struct { - sources []string - target string - removeOriginal bool -} - -type metricMutator interface { - Process(ms pmetric.Metrics) error -} - -type attributeMutator interface { - Process(m pmetric.Metric, attrs pcommon.Map, 
removeOriginal bool) error -} - -type decorator struct { - *Config - logger *zap.Logger - cancelFunc context.CancelFunc - shutdownC chan bool - started bool - attributeMutators []attributeMutator - metricMutators []metricMutator -} - -func newDecorator(config *Config, logger *zap.Logger) *decorator { - _, cancel := context.WithCancel(context.Background()) - d := &decorator{ - Config: config, - logger: logger, - cancelFunc: cancel, - } - return d -} - -func (d *decorator) processMetrics(ctx context.Context, md pmetric.Metrics) (pmetric.Metrics, error) { - if !d.started { - return pmetric.NewMetrics(), nil - } - - for _, metricMutator := range d.metricMutators { - // crate memory total - metricMutator.Process(md) - } - - rms := md.ResourceMetrics() - for i := 0; i < rms.Len(); i++ { - rs := rms.At(i) - ilms := rs.ScopeMetrics() - for j := 0; j < ilms.Len(); j++ { - ils := ilms.At(j) - metrics := ils.Metrics() - d.normalize(ctx, metrics) - for k := 0; k < metrics.Len(); k++ { - m := metrics.At(k) - d.processMetricAttributes(ctx, m) - } - } - } - return md, nil -} - -func (d *decorator) normalize(_ context.Context, metrics pmetric.MetricSlice) { - // duplicate metrics for metric types by normalizing names - orgLen := metrics.Len() - for i := 0; i < orgLen; i++ { - metric := metrics.At(i) - if newName, ok := renameMapForDcgm[metric.Name()]; ok { - for _, dt := range metricDuplicateTypes { - newMetric := pmetric.NewMetric() - metric.CopyTo(newMetric) - newMetric.SetName(containerinsightscommon.MetricName(dt, newName)) - newMetric.MoveTo(metrics.AppendEmpty()) - } - } - } -} - -func (d *decorator) processMetricAttributes(_ context.Context, m pmetric.Metric) { - if !strings.Contains(m.Name(), gpuMetric) { - return - } - - switch m.Type() { - case pmetric.MetricTypeGauge: - dps := m.Gauge().DataPoints() - for i := 0; i < dps.Len(); i++ { - for _, mutator := range d.attributeMutators { - err := mutator.Process(m, dps.At(i).Attributes(), false) - if err != nil { - 
d.logger.Debug("failed to process attributes", zap.Error(err)) - } - } - } - case pmetric.MetricTypeSum: - dps := m.Sum().DataPoints() - for i := 0; i < dps.Len(); i++ { - for _, mutator := range d.attributeMutators { - err := mutator.Process(m, dps.At(i).Attributes(), false) - if err != nil { - d.logger.Debug("failed to process attributes", zap.Error(err)) - } - } - } - default: - d.logger.Debug("Ignore unknown metric type", zap.String("type", m.Type().String())) - } -} - -func (d *decorator) Shutdown(context.Context) error { - close(d.shutdownC) - d.cancelFunc() - return nil -} - -func (d *decorator) Start(ctx context.Context, _ component.Host) error { - d.shutdownC = make(chan bool) - logTypeMutator := NewLogTypeAttribute(d.logger) - d.attributeMutators = []attributeMutator{logTypeMutator} - metricCombiner := NewMetricCombiner(d.logger, metricMutationRule{sources: []string{"DCGM_FI_DEV_FB_USED", "DCGM_FI_DEV_FB_FREE"}, target: "DCGM_FI_DEV_FB_TOTAL"}) - d.metricMutators = []metricMutator{metricCombiner} - d.started = true - return nil -} diff --git a/plugins/processors/gpu/logtypeattribute.go b/plugins/processors/gpu/logtypeattribute.go index 8c8b0f85ef..36fc2808b5 100644 --- a/plugins/processors/gpu/logtypeattribute.go +++ b/plugins/processors/gpu/logtypeattribute.go @@ -12,6 +12,18 @@ import ( "go.uber.org/zap" ) +const logTypeSuffix = "GPU" + +var defaultGpuLabels = []string{ + "ClusterName", + "Namespace", + "Service", + "ContainerName", + "FullPodName", + "PodName", + "GpuDevice", +} + type logTypeAttribute struct { logger *zap.Logger } @@ -23,7 +35,8 @@ func NewLogTypeAttribute(logger *zap.Logger) *logTypeAttribute { } func (an *logTypeAttribute) Process(m pmetric.Metric, attributes pcommon.Map, removeOriginal bool) error { - an.addLogTypeAttribute(m, attributes) + //an.addLogTypeAttribute(m, attributes) + an.addDefaultAttributes(m, attributes) return nil } @@ -43,5 +56,13 @@ func (an *logTypeAttribute) addLogTypeAttribute(m pmetric.Metric, attributes pco 
default: an.logger.Warn("metric name is either empty or not a supported type") } - attributes.PutStr("Type", logType) + attributes.PutStr("Type", logType+logTypeSuffix) +} + +func (an *logTypeAttribute) addDefaultAttributes(m pmetric.Metric, attributes pcommon.Map) { + for _, k := range defaultGpuLabels { + if _, ok := attributes.Get(k); !ok { + attributes.PutStr(k, "") + } + } } diff --git a/plugins/processors/gpu/metriccombiner.go b/plugins/processors/gpu/metriccombiner.go deleted file mode 100644 index d14ada92e8..0000000000 --- a/plugins/processors/gpu/metriccombiner.go +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: MIT - -package gpu - -import ( - "go.opentelemetry.io/collector/pdata/pmetric" - "go.uber.org/zap" -) - -type metricCombiner struct { - logger *zap.Logger - rule metricMutationRule -} - -func NewMetricCombiner(logger *zap.Logger, rule metricMutationRule) *metricCombiner { - return &metricCombiner{ - logger: logger, - rule: rule, - } -} - -// basic idea/code is from metricsgenerationprocessor [BETA] https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/processor/metricsgenerationprocessor/README.md -func (mm *metricCombiner) Process(ms pmetric.Metrics) error { - rms := ms.ResourceMetrics() - for i := 0; i < rms.Len(); i++ { - rm := rms.At(i) - nameToMetricMap := mm.getNameToMetricMap(rm) - - from2Val := float64(0) - from1, ok := nameToMetricMap[mm.rule.sources[0]] - if !ok { - mm.logger.Debug("Missing first metric", zap.String("metric_name", mm.rule.sources[0])) - continue - } - from2, ok := nameToMetricMap[mm.rule.sources[1]] - if !ok { - mm.logger.Debug("Missing second metric", zap.String("metric_name", mm.rule.sources[1])) - continue - } - from2Val = mm.getMetricValue(from2) - mm.generateMetrics(rm, mm.rule.target, from1.Name(), from1.Unit(), from2Val) - } - return nil -} - -func (mm *metricCombiner) getNameToMetricMap(rm 
pmetric.ResourceMetrics) map[string]pmetric.Metric { - ilms := rm.ScopeMetrics() - metricMap := make(map[string]pmetric.Metric) - - for i := 0; i < ilms.Len(); i++ { - ilm := ilms.At(i) - metricSlice := ilm.Metrics() - for j := 0; j < metricSlice.Len(); j++ { - metric := metricSlice.At(j) - metricMap[metric.Name()] = metric - } - } - return metricMap -} - -func (mm *metricCombiner) getMetricValue(metric pmetric.Metric) float64 { - if metric.Type() == pmetric.MetricTypeGauge { - dataPoints := metric.Gauge().DataPoints() - if dataPoints.Len() > 0 { - switch dataPoints.At(0).ValueType() { - case pmetric.NumberDataPointValueTypeDouble: - return dataPoints.At(0).DoubleValue() - case pmetric.NumberDataPointValueTypeInt: - return float64(dataPoints.At(0).IntValue()) - } - } - return 0 - } - return 0 -} - -// generateMetrics creates a new metric based on the given rule and add it to the Resource Metric. -// The value for newly calculated metrics is always a floting point number and the dataType is set -// as MetricTypeDoubleGauge. 
-func (mm *metricCombiner) generateMetrics(rm pmetric.ResourceMetrics, newName string, f1name string, unit string, f2val float64) { - ilms := rm.ScopeMetrics() - for i := 0; i < ilms.Len(); i++ { - ilm := ilms.At(i) - metricSlice := ilm.Metrics() - for j := 0; j < metricSlice.Len(); j++ { - metric := metricSlice.At(j) - if metric.Name() == f1name { - newMetric := ilm.Metrics().AppendEmpty() - newMetric.SetName(newName) - newMetric.SetUnit(unit) - newMetric.SetEmptyGauge() - mm.addDoubleGaugeDataPoints(metric, newMetric, f2val) - } - } - } -} - -func (mm *metricCombiner) addDoubleGaugeDataPoints(from pmetric.Metric, to pmetric.Metric, m2val float64) { - dataPoints := from.Gauge().DataPoints() - for i := 0; i < dataPoints.Len(); i++ { - from := dataPoints.At(i) - var val float64 - switch from.ValueType() { - case pmetric.NumberDataPointValueTypeDouble: - val = from.DoubleValue() - case pmetric.NumberDataPointValueTypeInt: - val = float64(from.IntValue()) - } - - newDp := to.Gauge().DataPoints().AppendEmpty() - from.CopyTo(newDp) - value := val + m2val - newDp.SetDoubleValue(value) - } -} diff --git a/plugins/processors/gpu/processor.go b/plugins/processors/gpu/processor.go new file mode 100644 index 0000000000..5b33c2120f --- /dev/null +++ b/plugins/processors/gpu/processor.go @@ -0,0 +1,111 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: MIT + +package gpu + +import ( + "context" + "strings" + + "github.com/aws/amazon-cloudwatch-agent/internal/containerinsightscommon" + "go.opentelemetry.io/collector/component" + "go.opentelemetry.io/collector/pdata/pcommon" + "go.opentelemetry.io/collector/pdata/pmetric" + "go.uber.org/zap" +) + +const ( + gpuMetric = "_gpu_" +) + +var renameMapForDcgm = map[string]string{ + "DCGM_FI_DEV_GPU_UTIL": containerinsightscommon.GpuUtilization, + "DCGM_FI_DEV_FB_USED_PERCENT": containerinsightscommon.GpuMemUtilization, + "DCGM_FI_DEV_FB_USED": containerinsightscommon.GpuMemUsed, + "DCGM_FI_DEV_FB_TOTAL": containerinsightscommon.GpuMemTotal, + "DCGM_FI_DEV_GPU_TEMP": containerinsightscommon.GpuTemperature, + "DCGM_FI_DEV_POWER_USAGE": containerinsightscommon.GpuPowerDraw, + // "DCGM_FI_DEV_FAN_SPEED": containerinsightscommon.GpuFanSpeed, +} + +type gpuprocessor struct { + *Config + logger *zap.Logger + cancelFunc context.CancelFunc + shutdownC chan bool + started bool +} + +func newGpuProcessor(config *Config, logger *zap.Logger) *gpuprocessor { + _, cancel := context.WithCancel(context.Background()) + d := &gpuprocessor{ + Config: config, + logger: logger, + cancelFunc: cancel, + } + return d +} + +func (d *gpuprocessor) processMetrics(ctx context.Context, md pmetric.Metrics) (pmetric.Metrics, error) { + if !d.started { + return pmetric.NewMetrics(), nil + } + + rms := md.ResourceMetrics() + for i := 0; i < rms.Len(); i++ { + rs := rms.At(i) + ilms := rs.ScopeMetrics() + for j := 0; j < ilms.Len(); j++ { + ils := ilms.At(j) + metrics := ils.Metrics() + for k := 0; k < metrics.Len(); k++ { + m := metrics.At(k) + d.processMetricAttributes(ctx, m) + } + } + } + return md, nil +} + +func (d *gpuprocessor) processMetricAttributes(_ context.Context, m pmetric.Metric) { + // only decorate GPU metrics + // another option is to separate GPU of its own pipeline to minimize extra processing of metrics + if !strings.Contains(m.Name(), gpuMetric) { + 
return + } + + switch m.Type() { + case pmetric.MetricTypeGauge: + dps := m.Gauge().DataPoints() + for i := 0; i < dps.Len(); i++ { + addDefaultAttributes(dps.At(i).Attributes()) + } + case pmetric.MetricTypeSum: + dps := m.Sum().DataPoints() + for i := 0; i < dps.Len(); i++ { + addDefaultAttributes(dps.At(i).Attributes()) + } + default: + d.logger.Debug("Ignore unknown metric type", zap.String("type", m.Type().String())) + } +} + +func addDefaultAttributes(attributes pcommon.Map) { + for _, k := range defaultGpuLabels { + if _, ok := attributes.Get(k); !ok { + attributes.PutStr(k, "") + } + } +} + +func (d *gpuprocessor) Shutdown(context.Context) error { + close(d.shutdownC) + d.cancelFunc() + return nil +} + +func (d *gpuprocessor) Start(ctx context.Context, _ component.Host) error { + d.shutdownC = make(chan bool) + d.started = true + return nil +} diff --git a/translator/translate/otel/exporter/awsemf/kubernetes.go b/translator/translate/otel/exporter/awsemf/kubernetes.go index e7f8a2a0dc..a17ec9a14f 100644 --- a/translator/translate/otel/exporter/awsemf/kubernetes.go +++ b/translator/translate/otel/exporter/awsemf/kubernetes.go @@ -469,7 +469,7 @@ func getGPUMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.MetricDeclar if EnableGpuMetric && enhancedContainerInsightsEnabled { metricDeclarations = append(metricDeclarations, []*awsemfexporter.MetricDeclaration{ { - Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace", "PodName", "ContainerName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName"}}, + Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace", "PodName", "ContainerName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName", "GpuDevice"}}, MetricNameSelectors: []string{ "container_gpu_utilization", "container_gpu_utilization_memory", @@ -480,29 +480,7 @@ func getGPUMetricDeclarations(conf *confmap.Conf) 
[]*awsemfexporter.MetricDeclar }, }, { - Dimensions: [][]string{{"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName", "GpuDevice"}}, - MetricNameSelectors: []string{ - "container_gpu_utilization", - "container_gpu_utilization_memory", - "container_gpu_memory_total", - "container_gpu_memory_used", - "container_gpu_power_draw", - "container_gpu_temperature", - }, - }, - { - Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace"}, {"ClusterName", "Namespace", "Service"}, {"ClusterName", "Namespace", "PodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName"}}, - MetricNameSelectors: []string{ - "pod_gpu_utilization", - "pod_gpu_utilization_memory", - "pod_gpu_memory_total", - "pod_gpu_memory_used", - "pod_gpu_power_draw", - "pod_gpu_temperature", - }, - }, - { - Dimensions: [][]string{{"ClusterName", "Namespace", "PodName", "FullPodName", "GpuDevice"}}, + Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace"}, {"ClusterName", "Namespace", "Service"}, {"ClusterName", "Namespace", "PodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "GpuDevice"}}, MetricNameSelectors: []string{ "pod_gpu_utilization", "pod_gpu_utilization_memory", @@ -513,18 +491,7 @@ func getGPUMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.MetricDeclar }, }, { - Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "NodeName", "InstanceId"}}, - MetricNameSelectors: []string{ - "node_gpu_utilization", - "node_gpu_utilization_memory", - "node_gpu_memory_total", - "node_gpu_memory_used", - "node_gpu_power_draw", - "node_gpu_temperature", - }, - }, - { - Dimensions: [][]string{{"ClusterName", "NodeName", "InstanceId", "GpuDevice"}}, + Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "NodeName", "InstanceId"}, {"ClusterName", "NodeName", "InstanceId", "GpuDevice"}}, MetricNameSelectors: []string{ "node_gpu_utilization", "node_gpu_utilization_memory", @@ -532,17 +499,21 
@@ func getGPUMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.MetricDeclar "node_gpu_memory_used", "node_gpu_power_draw", "node_gpu_temperature", + "node_gpu_fan_speed", }, }, { Dimensions: [][]string{{"ClusterName", "NodeName", "InstanceId"}, {"ClusterName"}}, MetricNameSelectors: []string{ "node_gpu_total", + "node_gpu_request", + "node_gpu_limit", }, }, { Dimensions: [][]string{{"ClusterName"}}, MetricNameSelectors: []string{ + "cluster_gpu_request", "cluster_gpu_total", }, }, diff --git a/translator/translate/otel/exporter/awsemf/translator.go b/translator/translate/otel/exporter/awsemf/translator.go index 93b98b0933..bafd70cbef 100644 --- a/translator/translate/otel/exporter/awsemf/translator.go +++ b/translator/translate/otel/exporter/awsemf/translator.go @@ -191,8 +191,6 @@ func setKubernetesFields(conf *confmap.Conf, cfg *awsemfexporter.Config) error { cfg.EnhancedContainerInsights = true } - setEnableGpuMetrics(kubernetesBasePathKey, conf, cfg) - return nil } @@ -230,7 +228,3 @@ func setPrometheusFields(conf *confmap.Conf, cfg *awsemfexporter.Config) error { func setDisableMetricExtraction(baseKey string, conf *confmap.Conf, cfg *awsemfexporter.Config) { cfg.DisableMetricExtraction = common.GetOrDefaultBool(conf, common.ConfigKey(baseKey, common.DisableMetricExtraction), false) } - -func setEnableGpuMetrics(baseKey string, conf *confmap.Conf, cfg *awsemfexporter.Config) { - cfg.EnableGpuMetric = common.GetOrDefaultBool(conf, common.ConfigKey(baseKey, common.EnableGpuMetric), true) -} diff --git a/translator/translate/otel/processor/metricstransformprocessor/translator.go b/translator/translate/otel/processor/metricstransformprocessor/translator.go index 2d43a7842f..8ea70d3135 100644 --- a/translator/translate/otel/processor/metricstransformprocessor/translator.go +++ b/translator/translate/otel/processor/metricstransformprocessor/translator.go @@ -6,6 +6,7 @@ package metricstransformprocessor import ( "fmt" + 
"github.com/aws/amazon-cloudwatch-agent/internal/containerinsightscommon" "github.com/open-telemetry/opentelemetry-collector-contrib/processor/metricstransformprocessor" "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/confmap" @@ -14,6 +15,34 @@ import ( "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/common" ) +const gpuLogSuffix = "GPU" + +var metricDuplicateTypes = []string{ + containerinsightscommon.TypeContainer, + containerinsightscommon.TypePod, + containerinsightscommon.TypeNode, +} + +var defaultGpuLabels = []string{ + "ClusterName", + "Namespace", + "Service", + "ContainerName", + "FullPodName", + "PodName", + "GpuDevice", +} + +var renameMapForDcgm = map[string]string{ + "DCGM_FI_DEV_GPU_UTIL": containerinsightscommon.GpuUtilization, + "DCGM_FI_DEV_FB_USED_PERCENT": containerinsightscommon.GpuMemUtilization, + "DCGM_FI_DEV_FB_USED": containerinsightscommon.GpuMemUsed, + "DCGM_FI_DEV_FB_TOTAL": containerinsightscommon.GpuMemTotal, + "DCGM_FI_DEV_GPU_TEMP": containerinsightscommon.GpuTemperature, + "DCGM_FI_DEV_POWER_USAGE": containerinsightscommon.GpuPowerDraw, + // "DCGM_FI_DEV_FAN_SPEED": containerinsightscommon.GpuFanSpeed, +} + type translator struct { name string factory processor.Factory @@ -41,135 +70,46 @@ func (t *translator) Translate(conf *confmap.Conf) (component.Config, error) { }, } - //if isGpuEnabled(conf) { - // gpuTransformRules := []map[string]interface{}{ - // { - // "include": "DCGM_FI_DEV_GPU_UTIL", - // "action": "insert", - // "new_name": "container_gpu_utilization", - // }, - // { - // "include": "DCGM_FI_DEV_GPU_UTIL", - // "action": "insert", - // "new_name": "pod_gpu_utilization", - // }, - // { - // "include": "DCGM_FI_DEV_GPU_UTIL", - // "action": "insert", - // "new_name": "node_gpu_utilization", - // }, - // { - // "include": "DCGM_FI_DEV_MEM_COPY_UTIL", - // "action": "insert", - // "new_name": "container_gpu_utilization_memory", - // }, - // { - // "include": 
"DCGM_FI_DEV_MEM_COPY_UTIL", - // "action": "insert", - // "new_name": "pod_gpu_utilization_memory", - // }, - // { - // "include": "DCGM_FI_DEV_MEM_COPY_UTIL", - // "action": "insert", - // "new_name": "node_gpu_utilization_memory", - // }, - // { - // "include": "DCGM_FI_DEV_FB_USED", - // "action": "insert", - // "new_name": "container_gpu_memory_used", - // }, - // { - // "include": "DCGM_FI_DEV_FB_USED", - // "action": "insert", - // "new_name": "pod_gpu_memory_used", - // }, - // { - // "include": "DCGM_FI_DEV_FB_USED", - // "action": "insert", - // "new_name": "node_gpu_memory_used", - // }, - // { - // "include": "DCGM_FI_DEV_FB_TOTAL", - // "action": "insert", - // "new_name": "container_gpu_memory_total", - // }, - // { - // "include": "DCGM_FI_DEV_FB_TOTAL", - // "action": "insert", - // "new_name": "pod_gpu_memory_total", - // }, - // { - // "include": "DCGM_FI_DEV_FB_TOTAL", - // "action": "insert", - // "new_name": "node_gpu_memory_total", - // }, - // //{ - // // "include": "^DCGM_FI_DEV_FB_(USED|FREE)$", - // // "action": "combine", - // // "new_name": "pod_gpu_memory_total", - // // "aggregation_type": "sum", - // // "match_type": "regexp", - // //}, - // //{ - // // "include": "^DCGM_FI_DEV_FB_(USED|FREE)$", - // // "action": "combine", - // // "new_name": "node_gpu_memory_total", - // // "aggregation_type": "sum", - // // "match_type": "regexp", - // //}, - // { - // "include": "DCGM_FI_DEV_GPU_TEMP", - // "action": "insert", - // "new_name": "cotainer_gpu_temperature", - // }, - // { - // "include": "DCGM_FI_DEV_GPU_TEMP", - // "action": "insert", - // "new_name": "pod_gpu_temperature", - // }, - // { - // "include": "DCGM_FI_DEV_GPU_TEMP", - // "action": "insert", - // "new_name": "node_gpu_temperature", - // }, - // { - // "include": "DCGM_FI_DEV_POWER_USAGE", - // "action": "insert", - // "new_name": "container_gpu_power_draw", - // }, - // { - // "include": "DCGM_FI_DEV_POWER_USAGE", - // "action": "insert", - // "new_name": 
"pod_gpu_power_draw", - // }, - // { - // "include": "DCGM_FI_DEV_POWER_USAGE", - // "action": "insert", - // "new_name": "node_gpu_power_draw", - // }, - // } - // - // for _, rule := range gpuTransformRules { - // logType := "" - // metricName := rule["new_name"].(string) - // if strings.HasPrefix(metricName, "container_") { - // logType = "Node" - // } else if strings.HasPrefix(metricName, "node_") { - // logType = "Node" - // } else if strings.HasPrefix(metricName, "cluster_") { - // logType = "Cluster" - // } else { - // logType = "Pod" - // } - // rule["operations"] = map[string]interface{}{ - // "action": "add_label", - // "new_label": containerinsightscommon.MetricType, - // "new_value": logType, - // } - // } - // - // transformRules = append(transformRules, gpuTransformRules...) - //} + if isGpuEnabled(conf) { + var operations []map[string]interface{} + // appends DCGM metric transform rules for each metric type (container/pod/node) with following format: + // { + // "include": "DCGM_FI_DEV_GPU_UTIL", + // "action": "insert", + // "new_name": "container_gpu_utilization", + // "operations": [ + // { + // "action": "add_label", + // "new_label": "Type", + // "new_value": "ContainerGPU", + // }, + // ... 
+ // ] + // }, + for old, new := range renameMapForDcgm { + for _, t := range metricDuplicateTypes { + // convert decimals to percent + if new == containerinsightscommon.GpuMemUtilization { + operations = append(operations, map[string]interface{}{ + "action": "experimental_scale_value", + "experimental_scale": 100, + }) + } + transformRules = append(transformRules, map[string]interface{}{ + "include": old, + "action": "insert", + "new_name": containerinsightscommon.MetricName(t, new), + "operations": append([]map[string]interface{}{ + { + "action": "add_label", + "new_label": containerinsightscommon.MetricType, + "new_value": t + gpuLogSuffix, + }, + }, operations...), + }) + } + } + } c := confmap.NewFromStringMap(map[string]interface{}{ "transforms": transformRules, @@ -181,6 +121,6 @@ func (t *translator) Translate(conf *confmap.Conf) (component.Config, error) { return cfg, nil } -//func isGpuEnabled(conf *confmap.Conf) bool { -// return common.GetOrDefaultBool(conf, common.ConfigKey(common.LogsKey, common.MetricsCollectedKey, common.KubernetesKey, common.EnableGpuMetric), true) -//} +func isGpuEnabled(conf *confmap.Conf) bool { + return common.GetOrDefaultBool(conf, common.ConfigKey(common.LogsKey, common.MetricsCollectedKey, common.KubernetesKey, common.EnableGpuMetric), true) +} From c652722436818f735fce935cf928dfb4e1005732 Mon Sep 17 00:00:00 2001 From: Hyunsoo Kim Date: Fri, 16 Feb 2024 15:49:13 -0500 Subject: [PATCH 04/20] update metric trasformer rules & remove unused funcs/files --- plugins/processors/gpu/config.go | 4 +- plugins/processors/gpu/config_test.go | 19 + plugins/processors/gpu/factory_test.go | 45 ++ plugins/processors/gpu/logtypeattribute.go | 68 --- plugins/processors/gpu/processor.go | 18 +- plugins/processors/gpu/processor_test.go | 108 ++++ service/defaultcomponents/components.go | 3 +- .../emf_and_kubernetes_config.yaml | 3 +- .../emf_and_kubernetes_with_gpu_config.yaml | 544 +++++++++++++----- .../kubernetes_on_prem_config.yaml | 3 +- 
.../logs_and_kubernetes_config.yaml | 3 +- .../otel/exporter/awsemf/kubernetes.go | 3 +- .../otel/exporter/awsemf/translator_test.go | 30 + .../pipeline/containerinsights/translator.go | 3 +- .../containerinsights/translator_test.go | 2 +- .../otel/processor/gpu/translator.go | 1 - .../metricstransformprocessor/translator.go | 20 +- 17 files changed, 624 insertions(+), 253 deletions(-) create mode 100644 plugins/processors/gpu/config_test.go create mode 100644 plugins/processors/gpu/factory_test.go delete mode 100644 plugins/processors/gpu/logtypeattribute.go create mode 100644 plugins/processors/gpu/processor_test.go diff --git a/plugins/processors/gpu/config.go b/plugins/processors/gpu/config.go index 7f9198d75b..b72cbdc39c 100644 --- a/plugins/processors/gpu/config.go +++ b/plugins/processors/gpu/config.go @@ -7,9 +7,7 @@ import ( "go.opentelemetry.io/collector/component" ) -type Config struct { - DropOriginalMetrics bool `mapstructure:"drop_original_metrics"` -} +type Config struct{} // Verify Config implements Processor interface. var _ component.Config = (*Config)(nil) diff --git a/plugins/processors/gpu/config_test.go b/plugins/processors/gpu/config_test.go new file mode 100644 index 0000000000..db3918301a --- /dev/null +++ b/plugins/processors/gpu/config_test.go @@ -0,0 +1,19 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: MIT + +package gpu + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "go.opentelemetry.io/collector/component" + "go.opentelemetry.io/collector/confmap" +) + +func TestUnmarshalDefaultConfig(t *testing.T) { + factory := NewFactory() + cfg := factory.CreateDefaultConfig() + assert.NoError(t, component.UnmarshalConfig(confmap.New(), cfg)) + assert.Equal(t, factory.CreateDefaultConfig(), cfg) +} diff --git a/plugins/processors/gpu/factory_test.go b/plugins/processors/gpu/factory_test.go new file mode 100644 index 0000000000..bae457d92b --- /dev/null +++ b/plugins/processors/gpu/factory_test.go @@ -0,0 +1,45 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +package gpu + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.opentelemetry.io/collector/component" + "go.opentelemetry.io/collector/component/componenttest" + "go.opentelemetry.io/collector/consumer/consumertest" + "go.opentelemetry.io/collector/processor/processortest" +) + +func TestCreateDefaultConfig(t *testing.T) { + factory := NewFactory() + require.NotNil(t, factory) + + cfg := factory.CreateDefaultConfig() + assert.NotNil(t, cfg, "failed to create default config") + assert.NoError(t, componenttest.CheckConfigStruct(cfg)) +} + +func TestCreateProcessor(t *testing.T) { + factory := NewFactory() + require.NotNil(t, factory) + + cfg := factory.CreateDefaultConfig() + setting := processortest.NewNopCreateSettings() + + tProcessor, err := factory.CreateTracesProcessor(context.Background(), setting, cfg, consumertest.NewNop()) + assert.Equal(t, err, component.ErrDataTypeIsNotSupported) + assert.Nil(t, tProcessor) + + mProcessor, err := factory.CreateMetricsProcessor(context.Background(), setting, cfg, consumertest.NewNop()) + assert.NoError(t, err) + assert.NotNil(t, mProcessor) + + lProcessor, err := 
factory.CreateLogsProcessor(context.Background(), setting, cfg, consumertest.NewNop()) + assert.Equal(t, err, component.ErrDataTypeIsNotSupported) + assert.Nil(t, lProcessor) +} diff --git a/plugins/processors/gpu/logtypeattribute.go b/plugins/processors/gpu/logtypeattribute.go deleted file mode 100644 index 36fc2808b5..0000000000 --- a/plugins/processors/gpu/logtypeattribute.go +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: MIT - -package gpu - -import ( - "strings" - - "github.com/aws/amazon-cloudwatch-agent/internal/containerinsightscommon" - "go.opentelemetry.io/collector/pdata/pcommon" - "go.opentelemetry.io/collector/pdata/pmetric" - "go.uber.org/zap" -) - -const logTypeSuffix = "GPU" - -var defaultGpuLabels = []string{ - "ClusterName", - "Namespace", - "Service", - "ContainerName", - "FullPodName", - "PodName", - "GpuDevice", -} - -type logTypeAttribute struct { - logger *zap.Logger -} - -func NewLogTypeAttribute(logger *zap.Logger) *logTypeAttribute { - return &logTypeAttribute{ - logger: logger, - } -} - -func (an *logTypeAttribute) Process(m pmetric.Metric, attributes pcommon.Map, removeOriginal bool) error { - //an.addLogTypeAttribute(m, attributes) - an.addDefaultAttributes(m, attributes) - return nil -} - -// NOTE: There are additional metric types (PodGpu and NodeGpu) that get applied in the emf exporter. 
-// Those 2 metric types handled by emf exporter are used only for dimensions sets that include "GpuDevice" -func (an *logTypeAttribute) addLogTypeAttribute(m pmetric.Metric, attributes pcommon.Map) { - logType := "" - switch strings.Split(m.Name(), "_")[0] { - case "container": - logType = containerinsightscommon.TypeContainer - case "pod": - logType = containerinsightscommon.TypePod - case "node": - logType = containerinsightscommon.TypeNode - case "cluster": - logType = containerinsightscommon.TypeCluster - default: - an.logger.Warn("metric name is either empty or not a supported type") - } - attributes.PutStr("Type", logType+logTypeSuffix) -} - -func (an *logTypeAttribute) addDefaultAttributes(m pmetric.Metric, attributes pcommon.Map) { - for _, k := range defaultGpuLabels { - if _, ok := attributes.Get(k); !ok { - attributes.PutStr(k, "") - } - } -} diff --git a/plugins/processors/gpu/processor.go b/plugins/processors/gpu/processor.go index 5b33c2120f..f3c7412bc3 100644 --- a/plugins/processors/gpu/processor.go +++ b/plugins/processors/gpu/processor.go @@ -7,7 +7,6 @@ import ( "context" "strings" - "github.com/aws/amazon-cloudwatch-agent/internal/containerinsightscommon" "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/pdata/pcommon" "go.opentelemetry.io/collector/pdata/pmetric" @@ -18,14 +17,14 @@ const ( gpuMetric = "_gpu_" ) -var renameMapForDcgm = map[string]string{ - "DCGM_FI_DEV_GPU_UTIL": containerinsightscommon.GpuUtilization, - "DCGM_FI_DEV_FB_USED_PERCENT": containerinsightscommon.GpuMemUtilization, - "DCGM_FI_DEV_FB_USED": containerinsightscommon.GpuMemUsed, - "DCGM_FI_DEV_FB_TOTAL": containerinsightscommon.GpuMemTotal, - "DCGM_FI_DEV_GPU_TEMP": containerinsightscommon.GpuTemperature, - "DCGM_FI_DEV_POWER_USAGE": containerinsightscommon.GpuPowerDraw, - // "DCGM_FI_DEV_FAN_SPEED": containerinsightscommon.GpuFanSpeed, +var defaultGpuLabels = []string{ + "ClusterName", + "Namespace", + "Service", + "ContainerName", + 
"FullPodName", + "PodName", + "GpuDevice", } type gpuprocessor struct { @@ -90,6 +89,7 @@ func (d *gpuprocessor) processMetricAttributes(_ context.Context, m pmetric.Metr } } +// adds empty string for default attributes since prometheus drops them during relabeling process func addDefaultAttributes(attributes pcommon.Map) { for _, k := range defaultGpuLabels { if _, ok := attributes.Get(k); !ok { diff --git a/plugins/processors/gpu/processor_test.go b/plugins/processors/gpu/processor_test.go new file mode 100644 index 0000000000..eda521093a --- /dev/null +++ b/plugins/processors/gpu/processor_test.go @@ -0,0 +1,108 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +package gpu + +import ( + "context" + "testing" + + "github.com/grafana/regexp" + "github.com/stretchr/testify/assert" + "go.opentelemetry.io/collector/pdata/pmetric" + "go.uber.org/zap" +) + +var normalizedNameRegex = regexp.MustCompile("^(container|pod|node)_gpu_[a-z_]+$") + +func TestProcessMetrics(t *testing.T) { + logger, _ := zap.NewDevelopment() + gp := &gpuprocessor{ + logger: logger, + Config: createDefaultConfig().(*Config), + } + ctx := context.Background() + gp.Start(ctx, nil) + + testcases := map[string]struct { + metrics pmetric.Metrics + want map[string]string + }{ + "keepExisting": { + metrics: generateMetrics(map[string]string{ + "ClusterName": "cluster", + "Namespace": "namespace", + "Service": "service", + "ContainerName": "container", + "FullPodName": "fullpod", + "PodName": "pod", + "GpuDevice": "gpu", + }), + want: map[string]string{ + "ClusterName": "cluster", + "Namespace": "namespace", + "Service": "service", + "ContainerName": "container", + "FullPodName": "fullpod", + "PodName": "pod", + "GpuDevice": "gpu", + }, + }, + "addMissing": { + metrics: generateMetrics(map[string]string{ + "ClusterName": "cluster", + "Namespace": "namespace", + "Service": "service", + "ContainerName": "container", + "FullPodName": "fullpod", + 
}), + want: map[string]string{ + "ClusterName": "cluster", + "Namespace": "namespace", + "Service": "service", + "ContainerName": "container", + "FullPodName": "fullpod", + "PodName": "", + "GpuDevice": "", + }, + }, + "addAll": { + metrics: generateMetrics(map[string]string{}), + want: map[string]string{ + "ClusterName": "", + "Namespace": "", + "Service": "", + "ContainerName": "", + "FullPodName": "", + "PodName": "", + "GpuDevice": "", + }, + }, + } + + for _, tc := range testcases { + ms, _ := gp.processMetrics(ctx, tc.metrics) + attrs := ms.ResourceMetrics().At(0).ScopeMetrics().At(0).Metrics().At(0).Gauge().DataPoints().At(0).Attributes() + assert.Equal(t, len(defaultGpuLabels), attrs.Len()) + for k, v := range tc.want { + got, ok := attrs.Get(k) + assert.True(t, ok) + assert.Equal(t, v, got.Str()) + } + } +} + +func generateMetrics(dimensions map[string]string) pmetric.Metrics { + md := pmetric.NewMetrics() + + m := md.ResourceMetrics().AppendEmpty().ScopeMetrics().AppendEmpty().Metrics().AppendEmpty() + m.SetName("test" + gpuMetric) + gauge := m.SetEmptyGauge().DataPoints().AppendEmpty() + gauge.SetIntValue(10) + + for k, v := range dimensions { + gauge.Attributes().PutStr(k, v) + } + + return md +} diff --git a/service/defaultcomponents/components.go b/service/defaultcomponents/components.go index 509b1f0184..180e0b3f3c 100644 --- a/service/defaultcomponents/components.go +++ b/service/defaultcomponents/components.go @@ -4,7 +4,6 @@ package defaultcomponents import ( - "github.com/aws/amazon-cloudwatch-agent/plugins/processors/gpu" "github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awscloudwatchlogsexporter" "github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awsemfexporter" "github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awsxrayexporter" @@ -26,6 +25,8 @@ import ( "go.opentelemetry.io/collector/receiver" "go.opentelemetry.io/collector/receiver/otlpreceiver" + 
"github.com/aws/amazon-cloudwatch-agent/plugins/processors/gpu" + "github.com/aws/amazon-cloudwatch-agent/extension/agenthealth" "github.com/aws/amazon-cloudwatch-agent/plugins/outputs/cloudwatch" "github.com/aws/amazon-cloudwatch-agent/plugins/processors/awsappsignals" diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml index 056d1ca312..cbd26cbc21 100644 --- a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml @@ -442,8 +442,7 @@ processors: new_name: apiserver_request_total_5xx operations: [] submatch_case: "" - gpu/containerinsights: - drop_original_metrics: false + gpu/containerinsights: {} receivers: awscontainerinsightreceiver: add_container_name_metric_label: true diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml index 5b46c4dfe7..ee094aba31 100644 --- a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml @@ -338,22 +338,13 @@ exporters: - FullPodName - Namespace - PodName - label_matchers: [] - metric_name_selectors: - - container_gpu_utilization - - container_gpu_utilization_memory - - container_gpu_memory_total - - container_gpu_memory_used - - container_gpu_power_draw - - container_gpu_temperature - - dimensions: - - ClusterName - ContainerName - FullPodName - GpuDevice - Namespace - PodName - label_matchers: [] + label_matchers: [ ] metric_name_selectors: - container_gpu_utilization - container_gpu_utilization_memory @@ -375,21 +366,12 @@ exporters: - FullPodName - Namespace - PodName - label_matchers: [] - metric_name_selectors: - - pod_gpu_utilization - - pod_gpu_utilization_memory - - pod_gpu_memory_total - - pod_gpu_memory_used - - pod_gpu_power_draw - - 
pod_gpu_temperature - - dimensions: - - ClusterName - FullPodName - GpuDevice - Namespace - PodName - label_matchers: [] + label_matchers: [ ] metric_name_selectors: - pod_gpu_utilization - pod_gpu_utilization_memory @@ -402,20 +384,11 @@ exporters: - - ClusterName - InstanceId - NodeName - label_matchers: [] - metric_name_selectors: - - node_gpu_utilization - - node_gpu_utilization_memory - - node_gpu_memory_total - - node_gpu_memory_used - - node_gpu_power_draw - - node_gpu_temperature - - dimensions: - - ClusterName - GpuDevice - InstanceId - NodeName - label_matchers: [] + label_matchers: [ ] metric_name_selectors: - node_gpu_utilization - node_gpu_utilization_memory @@ -423,18 +396,22 @@ exporters: - node_gpu_memory_used - node_gpu_power_draw - node_gpu_temperature + - node_gpu_fan_speed - dimensions: - - ClusterName - InstanceId - NodeName - - ClusterName - label_matchers: [] + label_matchers: [ ] metric_name_selectors: - node_gpu_total + - node_gpu_request + - node_gpu_limit - dimensions: - - ClusterName - label_matchers: [] + label_matchers: [ ] metric_name_selectors: + - cluster_gpu_request - cluster_gpu_total metric_descriptors: - metric_name: apiserver_admission_controller_admission_duration_seconds @@ -537,127 +514,390 @@ processors: send_batch_size: 8192 timeout: 5s metricstransform/containerinsights: - transforms: - - action: insert + transforms: + - action: insert + aggregation_type: "" + experimental_match_labels: + code: ^5.* + group_resource_labels: { } + include: apiserver_request_total + match_type: regexp + new_name: apiserver_request_total_5xx + operations: [ ] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: { } + group_resource_labels: { } + include: DCGM_FI_DEV_FB_USED + match_type: "" + new_name: container_gpu_memory_used + operations: + - action: add_label + aggregated_values: [ ] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [ ] + label_value: "" + new_label: Type + 
new_value: ContainerGPU + value_actions: [ ] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: { } + group_resource_labels: { } + include: DCGM_FI_DEV_FB_USED + match_type: "" + new_name: pod_gpu_memory_used + operations: + - action: add_label + aggregated_values: [ ] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [ ] + label_value: "" + new_label: Type + new_value: PodGPU + value_actions: [ ] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: { } + group_resource_labels: { } + include: DCGM_FI_DEV_FB_USED + match_type: "" + new_name: node_gpu_memory_used + operations: + - action: add_label + aggregated_values: [ ] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [ ] + label_value: "" + new_label: Type + new_value: NodeGPU + value_actions: [ ] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: { } + group_resource_labels: { } + include: DCGM_FI_DEV_FB_TOTAL + match_type: "" + new_name: container_gpu_memory_total + operations: + - action: add_label + aggregated_values: [ ] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [ ] + label_value: "" + new_label: Type + new_value: ContainerGPU + value_actions: [ ] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: { } + group_resource_labels: { } + include: DCGM_FI_DEV_FB_TOTAL + match_type: "" + new_name: pod_gpu_memory_total + operations: + - action: add_label + aggregated_values: [ ] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [ ] + label_value: "" + new_label: Type + new_value: PodGPU + value_actions: [ ] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: { } + group_resource_labels: { } + include: DCGM_FI_DEV_FB_TOTAL + match_type: "" + new_name: node_gpu_memory_total + operations: + - action: add_label + aggregated_values: 
[ ] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [ ] + label_value: "" + new_label: Type + new_value: NodeGPU + value_actions: [ ] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: { } + group_resource_labels: { } + include: DCGM_FI_DEV_GPU_TEMP + match_type: "" + new_name: container_gpu_temperature + operations: + - action: add_label + aggregated_values: [ ] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [ ] + label_value: "" + new_label: Type + new_value: ContainerGPU + value_actions: [ ] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: { } + group_resource_labels: { } + include: DCGM_FI_DEV_GPU_TEMP + match_type: "" + new_name: pod_gpu_temperature + operations: + - action: add_label + aggregated_values: [ ] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [ ] + label_value: "" + new_label: Type + new_value: PodGPU + value_actions: [ ] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: { } + group_resource_labels: { } + include: DCGM_FI_DEV_GPU_TEMP + match_type: "" + new_name: node_gpu_temperature + operations: + - action: add_label + aggregated_values: [ ] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [ ] + label_value: "" + new_label: Type + new_value: NodeGPU + value_actions: [ ] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: { } + group_resource_labels: { } + include: DCGM_FI_DEV_POWER_USAGE + match_type: "" + new_name: container_gpu_power_draw + operations: + - action: add_label + aggregated_values: [ ] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [ ] + label_value: "" + new_label: Type + new_value: ContainerGPU + value_actions: [ ] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: { } + group_resource_labels: { } + include: 
DCGM_FI_DEV_POWER_USAGE + match_type: "" + new_name: pod_gpu_power_draw + operations: + - action: add_label + aggregated_values: [ ] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [ ] + label_value: "" + new_label: Type + new_value: PodGPU + value_actions: [ ] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: { } + group_resource_labels: { } + include: DCGM_FI_DEV_POWER_USAGE + match_type: "" + new_name: node_gpu_power_draw + operations: + - action: add_label + aggregated_values: [ ] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [ ] + label_value: "" + new_label: Type + new_value: NodeGPU + value_actions: [ ] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: { } + group_resource_labels: { } + include: DCGM_FI_DEV_GPU_UTIL + match_type: "" + new_name: container_gpu_utilization + operations: + - action: add_label + aggregated_values: [ ] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [ ] + label_value: "" + new_label: Type + new_value: ContainerGPU + value_actions: [ ] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: { } + group_resource_labels: { } + include: DCGM_FI_DEV_GPU_UTIL + match_type: "" + new_name: pod_gpu_utilization + operations: + - action: add_label + aggregated_values: [ ] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [ ] + label_value: "" + new_label: Type + new_value: PodGPU + value_actions: [ ] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: { } + group_resource_labels: { } + include: DCGM_FI_DEV_GPU_UTIL + match_type: "" + new_name: node_gpu_utilization + operations: + - action: add_label + aggregated_values: [ ] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [ ] + label_value: "" + new_label: Type + new_value: NodeGPU + value_actions: [ ] + submatch_case: "" 
+ - action: insert + aggregation_type: "" + experimental_match_labels: { } + group_resource_labels: { } + include: DCGM_FI_DEV_FB_USED_PERCENT + match_type: "" + new_name: container_gpu_utilization_memory + operations: + - action: add_label + aggregated_values: [ ] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [ ] + label_value: "" + new_label: Type + new_value: ContainerGPU + value_actions: [ ] + - action: experimental_scale_value + aggregated_values: [ ] + aggregation_type: "" + experimental_scale: 100 + label: "" + label_set: [ ] + label_value: "" + new_label: "" + new_value: "" + value_actions: [ ] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: { } + group_resource_labels: { } + include: DCGM_FI_DEV_FB_USED_PERCENT + match_type: "" + new_name: pod_gpu_utilization_memory + operations: + - action: add_label + aggregated_values: [ ] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [ ] + label_value: "" + new_label: Type + new_value: PodGPU + value_actions: [ ] + - action: experimental_scale_value + aggregated_values: [ ] + aggregation_type: "" + experimental_scale: 100 + label: "" + label_set: [ ] + label_value: "" + new_label: "" + new_value: "" + value_actions: [ ] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: { } + group_resource_labels: { } + include: DCGM_FI_DEV_FB_USED_PERCENT + match_type: "" + new_name: node_gpu_utilization_memory + operations: + - action: add_label + aggregated_values: [ ] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [ ] + label_value: "" + new_label: Type + new_value: NodeGPU + value_actions: [ ] + - action: experimental_scale_value + aggregated_values: [ ] aggregation_type: "" - experimental_match_labels: - code: ^5.* - group_resource_labels: {} - include: apiserver_request_total - match_type: regexp - new_name: apiserver_request_total_5xx - operations: [] - submatch_case: 
"" -# - action: insert -# aggregation_type: "" -# experimental_match_labels: { } -# group_resource_labels: { } -# include: DCGM_FI_DEV_GPU_UTIL -# match_type: "" -# new_name: pod_gpu_utilization -# operations: [ ] -# submatch_case: "" -# - action: insert -# aggregation_type: "" -# experimental_match_labels: { } -# group_resource_labels: { } -# include: DCGM_FI_DEV_GPU_UTIL -# match_type: "" -# new_name: node_gpu_utilization -# operations: [ ] -# submatch_case: "" -# - action: insert -# aggregation_type: "" -# experimental_match_labels: { } -# group_resource_labels: { } -# include: DCGM_FI_DEV_MEM_COPY_UTIL -# match_type: "" -# new_name: pod_gpu_utilization_memory -# operations: [ ] -# submatch_case: "" -# - action: insert -# aggregation_type: "" -# experimental_match_labels: { } -# group_resource_labels: { } -# include: DCGM_FI_DEV_MEM_COPY_UTIL -# match_type: "" -# new_name: node_gpu_utilization_memory -# operations: [ ] -# submatch_case: "" -# - action: insert -# aggregation_type: "" -# experimental_match_labels: { } -# group_resource_labels: { } -# include: DCGM_FI_DEV_FB_USED -# match_type: "" -# new_name: pod_gpu_memory_used -# operations: [ ] -# submatch_case: "" -# - action: insert -# aggregation_type: "" -# experimental_match_labels: { } -# group_resource_labels: { } -# include: DCGM_FI_DEV_FB_USED -# match_type: "" -# new_name: node_gpu_memory_used -# operations: [ ] -# submatch_case: "" -# - action: insert -# aggregation_type: "sum" -# experimental_match_labels: { } -# group_resource_labels: { } -# include: ^DCGM_FI_DEV_FB_(USED|FREE)$ -# match_type: "regexp" -# new_name: pod_gpu_memory_total -# operations: [ ] -# submatch_case: "" -# - action: insert -# aggregation_type: "sum" -# experimental_match_labels: { } -# group_resource_labels: { } -# include: ^DCGM_FI_DEV_FB_(USED|FREE)$ -# match_type: "regexp" -# new_name: node_gpu_memory_total -# operations: [ ] -# submatch_case: "" -# - action: insert -# aggregation_type: "" -# experimental_match_labels: { } 
-# group_resource_labels: { } -# include: DCGM_FI_DEV_GPU_TEMP -# match_type: "" -# new_name: pod_gpu_temperature -# operations: [ ] -# submatch_case: "" -# - action: insert -# aggregation_type: "" -# experimental_match_labels: { } -# group_resource_labels: { } -# include: DCGM_FI_DEV_GPU_TEMP -# match_type: "" -# new_name: node_gpu_temperature -# operations: [ ] -# submatch_case: "" -# - action: insert -# aggregation_type: "" -# experimental_match_labels: { } -# group_resource_labels: { } -# include: DCGM_FI_DEV_POWER_USAGE -# match_type: "" -# new_name: pod_gpu_power_draw -# operations: [ ] -# submatch_case: "" -# - action: insert -# aggregation_type: "" -# experimental_match_labels: { } -# group_resource_labels: { } -# include: DCGM_FI_DEV_POWER_USAGE -# match_type: "" -# new_name: node_gpu_power_draw -# operations: [ ] -# submatch_case: "" - gpu/containerinsights: - drop_original_metrics: false + experimental_scale: 100 + label: "" + label_set: [ ] + label_value: "" + new_label: "" + new_value: "" + value_actions: [ ] + submatch_case: "" + gpu/containerinsights: {} receivers: awscontainerinsightreceiver: diff --git a/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml b/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml index b176d8f927..f51e46904a 100644 --- a/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml +++ b/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml @@ -402,8 +402,7 @@ processors: new_name: apiserver_request_total_5xx operations: [] submatch_case: "" - gpu/containerinsights: - drop_original_metrics: false + gpu/containerinsights: {} receivers: awscontainerinsightreceiver: add_container_name_metric_label: true diff --git a/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml b/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml index 14bd028c06..bf63e010e9 100644 --- a/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml +++ 
b/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml @@ -440,8 +440,7 @@ processors: new_name: apiserver_request_total_5xx operations: [] submatch_case: "" - gpu/containerinsights: - drop_original_metrics: false + gpu/containerinsights: {} receivers: awscontainerinsightreceiver: add_container_name_metric_label: true diff --git a/translator/translate/otel/exporter/awsemf/kubernetes.go b/translator/translate/otel/exporter/awsemf/kubernetes.go index a17ec9a14f..2a6a812c1f 100644 --- a/translator/translate/otel/exporter/awsemf/kubernetes.go +++ b/translator/translate/otel/exporter/awsemf/kubernetes.go @@ -4,10 +4,11 @@ package awsemf import ( - "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/common" "github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awsemfexporter" "go.opentelemetry.io/collector/confmap" + "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/common" + "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/receiver/awscontainerinsight" ) diff --git a/translator/translate/otel/exporter/awsemf/translator_test.go b/translator/translate/otel/exporter/awsemf/translator_test.go index d32babc147..7489ec2ba1 100644 --- a/translator/translate/otel/exporter/awsemf/translator_test.go +++ b/translator/translate/otel/exporter/awsemf/translator_test.go @@ -399,6 +399,36 @@ func TestTranslator(t *testing.T) { Dimensions: [][]string{{"ClusterName", "priority_level"}, {"ClusterName"}}, MetricNameSelectors: []string{"apiserver_flowcontrol_request_concurrency_limit"}, }, + { + Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace", "PodName", "ContainerName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName", "GpuDevice"}}, + MetricNameSelectors: []string{ + "container_gpu_utilization", "container_gpu_utilization_memory", "container_gpu_memory_total", "container_gpu_memory_used", 
"container_gpu_power_draw", "container_gpu_temperature", + }, + }, + { + Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace"}, {"ClusterName", "Namespace", "Service"}, {"ClusterName", "Namespace", "PodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "GpuDevice"}}, + MetricNameSelectors: []string{ + "pod_gpu_utilization", "pod_gpu_utilization_memory", "pod_gpu_memory_total", "pod_gpu_memory_used", "pod_gpu_power_draw", "pod_gpu_temperature", + }, + }, + { + Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "NodeName", "InstanceId"}, {"ClusterName", "NodeName", "InstanceId", "GpuDevice"}}, + MetricNameSelectors: []string{ + "node_gpu_utilization", "node_gpu_utilization_memory", "node_gpu_memory_total", "node_gpu_memory_used", "node_gpu_power_draw", "node_gpu_temperature", "node_gpu_fan_speed", + }, + }, + { + Dimensions: [][]string{{"ClusterName", "NodeName", "InstanceId"}, {"ClusterName"}}, + MetricNameSelectors: []string{ + "node_gpu_total", "node_gpu_request", "node_gpu_limit", + }, + }, + { + Dimensions: [][]string{{"ClusterName"}}, + MetricNameSelectors: []string{ + "cluster_gpu_request", "cluster_gpu_total", + }, + }, }, "metric_descriptors": []awsemfexporter.MetricDescriptor{ { diff --git a/translator/translate/otel/pipeline/containerinsights/translator.go b/translator/translate/otel/pipeline/containerinsights/translator.go index a4073f9144..67a706f526 100644 --- a/translator/translate/otel/pipeline/containerinsights/translator.go +++ b/translator/translate/otel/pipeline/containerinsights/translator.go @@ -6,10 +6,11 @@ package containerinsights import ( "fmt" - "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/processor/gpu" "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/confmap" + "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/processor/gpu" + 
"github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/common" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/exporter/awsemf" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/extension/agenthealth" diff --git a/translator/translate/otel/pipeline/containerinsights/translator_test.go b/translator/translate/otel/pipeline/containerinsights/translator_test.go index 14a721e6b2..70f723903f 100644 --- a/translator/translate/otel/pipeline/containerinsights/translator_test.go +++ b/translator/translate/otel/pipeline/containerinsights/translator_test.go @@ -81,7 +81,7 @@ func TestTranslator(t *testing.T) { want: &want{ pipelineType: "metrics/containerinsights", receivers: []string{"awscontainerinsightreceiver"}, - processors: []string{"metricstransform/containerinsights", "batch/containerinsights"}, + processors: []string{"metricstransform/containerinsights", "batch/containerinsights", "gpu/containerinsights"}, exporters: []string{"awsemf/containerinsights"}, extensions: []string{"agenthealth/logs"}, }, diff --git a/translator/translate/otel/processor/gpu/translator.go b/translator/translate/otel/processor/gpu/translator.go index 0fe5998d5c..b1c4f89633 100644 --- a/translator/translate/otel/processor/gpu/translator.go +++ b/translator/translate/otel/processor/gpu/translator.go @@ -29,6 +29,5 @@ func (t *translator) ID() component.ID { func (t *translator) Translate(conf *confmap.Conf) (component.Config, error) { cfg := t.factory.CreateDefaultConfig().(*gpu.Config) - cfg.DropOriginalMetrics = false return cfg, nil } diff --git a/translator/translate/otel/processor/metricstransformprocessor/translator.go b/translator/translate/otel/processor/metricstransformprocessor/translator.go index 8ea70d3135..8d498706d5 100644 --- a/translator/translate/otel/processor/metricstransformprocessor/translator.go +++ b/translator/translate/otel/processor/metricstransformprocessor/translator.go @@ -6,12 +6,13 @@ package metricstransformprocessor 
import ( "fmt" - "github.com/aws/amazon-cloudwatch-agent/internal/containerinsightscommon" "github.com/open-telemetry/opentelemetry-collector-contrib/processor/metricstransformprocessor" "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/confmap" "go.opentelemetry.io/collector/processor" + "github.com/aws/amazon-cloudwatch-agent/internal/containerinsightscommon" + "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/common" ) @@ -40,7 +41,6 @@ var renameMapForDcgm = map[string]string{ "DCGM_FI_DEV_FB_TOTAL": containerinsightscommon.GpuMemTotal, "DCGM_FI_DEV_GPU_TEMP": containerinsightscommon.GpuTemperature, "DCGM_FI_DEV_POWER_USAGE": containerinsightscommon.GpuPowerDraw, - // "DCGM_FI_DEV_FAN_SPEED": containerinsightscommon.GpuFanSpeed, } type translator struct { @@ -71,7 +71,6 @@ func (t *translator) Translate(conf *confmap.Conf) (component.Config, error) { } if isGpuEnabled(conf) { - var operations []map[string]interface{} // appends DCGM metric transform rules for each metric type (container/pod/node) with following format: // { // "include": "DCGM_FI_DEV_GPU_UTIL", @@ -87,14 +86,15 @@ func (t *translator) Translate(conf *confmap.Conf) (component.Config, error) { // ] // }, for old, new := range renameMapForDcgm { + var operations []map[string]interface{} + // convert decimals to percent + if new == containerinsightscommon.GpuMemUtilization { + operations = append(operations, map[string]interface{}{ + "action": "experimental_scale_value", + "experimental_scale": 100, + }) + } for _, t := range metricDuplicateTypes { - // convert decimals to percent - if new == containerinsightscommon.GpuMemUtilization { - operations = append(operations, map[string]interface{}{ - "action": "experimental_scale_value", - "experimental_scale": 100, - }) - } transformRules = append(transformRules, map[string]interface{}{ "include": old, "action": "insert", From 10f9a3364d3d347c14d0d67e40b1f753636f1c6f Mon Sep 17 00:00:00 2001 From: Hyunsoo Kim Date: 
Fri, 16 Feb 2024 16:01:37 -0500 Subject: [PATCH 05/20] fix lint --- service/defaultcomponents/components.go | 3 +-- translator/translate/otel/exporter/awsemf/kubernetes.go | 1 - .../translate/otel/pipeline/containerinsights/translator.go | 3 +-- translator/translate/otel/processor/gpu/translator.go | 2 +- .../otel/processor/metricstransformprocessor/translator.go | 1 - 5 files changed, 3 insertions(+), 7 deletions(-) diff --git a/service/defaultcomponents/components.go b/service/defaultcomponents/components.go index 180e0b3f3c..0d14c9232f 100644 --- a/service/defaultcomponents/components.go +++ b/service/defaultcomponents/components.go @@ -25,12 +25,11 @@ import ( "go.opentelemetry.io/collector/receiver" "go.opentelemetry.io/collector/receiver/otlpreceiver" - "github.com/aws/amazon-cloudwatch-agent/plugins/processors/gpu" - "github.com/aws/amazon-cloudwatch-agent/extension/agenthealth" "github.com/aws/amazon-cloudwatch-agent/plugins/outputs/cloudwatch" "github.com/aws/amazon-cloudwatch-agent/plugins/processors/awsappsignals" "github.com/aws/amazon-cloudwatch-agent/plugins/processors/ec2tagger" + "github.com/aws/amazon-cloudwatch-agent/plugins/processors/gpu" ) func Factories() (otelcol.Factories, error) { diff --git a/translator/translate/otel/exporter/awsemf/kubernetes.go b/translator/translate/otel/exporter/awsemf/kubernetes.go index 2a6a812c1f..4f76a7566d 100644 --- a/translator/translate/otel/exporter/awsemf/kubernetes.go +++ b/translator/translate/otel/exporter/awsemf/kubernetes.go @@ -8,7 +8,6 @@ import ( "go.opentelemetry.io/collector/confmap" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/common" - "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/receiver/awscontainerinsight" ) diff --git a/translator/translate/otel/pipeline/containerinsights/translator.go b/translator/translate/otel/pipeline/containerinsights/translator.go index 67a706f526..29dec1b817 100644 --- 
a/translator/translate/otel/pipeline/containerinsights/translator.go +++ b/translator/translate/otel/pipeline/containerinsights/translator.go @@ -9,12 +9,11 @@ import ( "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/confmap" - "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/processor/gpu" - "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/common" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/exporter/awsemf" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/extension/agenthealth" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/processor/batchprocessor" + "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/processor/gpu" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/processor/metricstransformprocessor" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/receiver/awscontainerinsight" ) diff --git a/translator/translate/otel/processor/gpu/translator.go b/translator/translate/otel/processor/gpu/translator.go index b1c4f89633..3f542a469c 100644 --- a/translator/translate/otel/processor/gpu/translator.go +++ b/translator/translate/otel/processor/gpu/translator.go @@ -4,11 +4,11 @@ package gpu import ( - "github.com/aws/amazon-cloudwatch-agent/plugins/processors/gpu" "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/confmap" "go.opentelemetry.io/collector/processor" + "github.com/aws/amazon-cloudwatch-agent/plugins/processors/gpu" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/common" ) diff --git a/translator/translate/otel/processor/metricstransformprocessor/translator.go b/translator/translate/otel/processor/metricstransformprocessor/translator.go index 8d498706d5..e607e80826 100644 --- a/translator/translate/otel/processor/metricstransformprocessor/translator.go +++ b/translator/translate/otel/processor/metricstransformprocessor/translator.go @@ -12,7 +12,6 @@ import 
( "go.opentelemetry.io/collector/processor" "github.com/aws/amazon-cloudwatch-agent/internal/containerinsightscommon" - "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/common" ) From 1bbf610adaef7e604703a9a958b825799d402ef1 Mon Sep 17 00:00:00 2001 From: Hyunsoo Kim Date: Tue, 20 Feb 2024 10:56:51 -0500 Subject: [PATCH 06/20] remove gpu processor --- plugins/processors/gpu/config.go | 19 --- plugins/processors/gpu/config_test.go | 19 --- plugins/processors/gpu/factory.go | 56 --------- plugins/processors/gpu/factory_test.go | 45 ------- plugins/processors/gpu/processor.go | 111 ------------------ plugins/processors/gpu/processor_test.go | 108 ----------------- service/defaultcomponents/components.go | 2 - .../emf_and_kubernetes_config.yaml | 2 - .../emf_and_kubernetes_with_gpu_config.yaml | 2 - .../kubernetes_on_prem_config.yaml | 2 - .../logs_and_kubernetes_config.yaml | 2 - .../pipeline/containerinsights/translator.go | 3 +- .../containerinsights/translator_test.go | 2 +- .../otel/processor/gpu/translator.go | 33 ------ 14 files changed, 2 insertions(+), 404 deletions(-) delete mode 100644 plugins/processors/gpu/config.go delete mode 100644 plugins/processors/gpu/config_test.go delete mode 100644 plugins/processors/gpu/factory.go delete mode 100644 plugins/processors/gpu/factory_test.go delete mode 100644 plugins/processors/gpu/processor.go delete mode 100644 plugins/processors/gpu/processor_test.go delete mode 100644 translator/translate/otel/processor/gpu/translator.go diff --git a/plugins/processors/gpu/config.go b/plugins/processors/gpu/config.go deleted file mode 100644 index b72cbdc39c..0000000000 --- a/plugins/processors/gpu/config.go +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: MIT - -package gpu - -import ( - "go.opentelemetry.io/collector/component" -) - -type Config struct{} - -// Verify Config implements Processor interface. 
-var _ component.Config = (*Config)(nil) - -// Validate does not check for unsupported dimension key-value pairs, because those -// get silently dropped and ignored during translation. -func (cfg *Config) Validate() error { - return nil -} diff --git a/plugins/processors/gpu/config_test.go b/plugins/processors/gpu/config_test.go deleted file mode 100644 index db3918301a..0000000000 --- a/plugins/processors/gpu/config_test.go +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: MIT - -package gpu - -import ( - "testing" - - "github.com/stretchr/testify/assert" - "go.opentelemetry.io/collector/component" - "go.opentelemetry.io/collector/confmap" -) - -func TestUnmarshalDefaultConfig(t *testing.T) { - factory := NewFactory() - cfg := factory.CreateDefaultConfig() - assert.NoError(t, component.UnmarshalConfig(confmap.New(), cfg)) - assert.Equal(t, factory.CreateDefaultConfig(), cfg) -} diff --git a/plugins/processors/gpu/factory.go b/plugins/processors/gpu/factory.go deleted file mode 100644 index 9c7c61b21a..0000000000 --- a/plugins/processors/gpu/factory.go +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: MIT - -package gpu - -import ( - "context" - "fmt" - - "go.opentelemetry.io/collector/component" - "go.opentelemetry.io/collector/consumer" - "go.opentelemetry.io/collector/processor" - "go.opentelemetry.io/collector/processor/processorhelper" -) - -const ( - TypeStr = "gpu" - stability = component.StabilityLevelBeta -) - -var processorCapabilities = consumer.Capabilities{MutatesData: true} - -func NewFactory() processor.Factory { - return processor.NewFactory( - TypeStr, - createDefaultConfig, - processor.WithMetrics(createMetricsProcessor, stability)) -} - -func createDefaultConfig() component.Config { - return &Config{} -} - -func createMetricsProcessor( - ctx context.Context, - set processor.CreateSettings, - cfg component.Config, - nextConsumer consumer.Metrics, -) (processor.Metrics, error) { - processorConfig, ok := cfg.(*Config) - if !ok { - return nil, fmt.Errorf("configuration parsing error") - } - - metricsProcessor := newGpuProcessor(processorConfig, set.Logger) - - return processorhelper.NewMetricsProcessor( - ctx, - set, - cfg, - nextConsumer, - metricsProcessor.processMetrics, - processorhelper.WithCapabilities(processorCapabilities), - processorhelper.WithStart(metricsProcessor.Start), - processorhelper.WithShutdown(metricsProcessor.Shutdown)) -} diff --git a/plugins/processors/gpu/factory_test.go b/plugins/processors/gpu/factory_test.go deleted file mode 100644 index bae457d92b..0000000000 --- a/plugins/processors/gpu/factory_test.go +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: MIT - -package gpu - -import ( - "context" - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - "go.opentelemetry.io/collector/component" - "go.opentelemetry.io/collector/component/componenttest" - "go.opentelemetry.io/collector/consumer/consumertest" - "go.opentelemetry.io/collector/processor/processortest" -) - -func TestCreateDefaultConfig(t *testing.T) { - factory := NewFactory() - require.NotNil(t, factory) - - cfg := factory.CreateDefaultConfig() - assert.NotNil(t, cfg, "failed to create default config") - assert.NoError(t, componenttest.CheckConfigStruct(cfg)) -} - -func TestCreateProcessor(t *testing.T) { - factory := NewFactory() - require.NotNil(t, factory) - - cfg := factory.CreateDefaultConfig() - setting := processortest.NewNopCreateSettings() - - tProcessor, err := factory.CreateTracesProcessor(context.Background(), setting, cfg, consumertest.NewNop()) - assert.Equal(t, err, component.ErrDataTypeIsNotSupported) - assert.Nil(t, tProcessor) - - mProcessor, err := factory.CreateMetricsProcessor(context.Background(), setting, cfg, consumertest.NewNop()) - assert.NoError(t, err) - assert.NotNil(t, mProcessor) - - lProcessor, err := factory.CreateLogsProcessor(context.Background(), setting, cfg, consumertest.NewNop()) - assert.Equal(t, err, component.ErrDataTypeIsNotSupported) - assert.Nil(t, lProcessor) -} diff --git a/plugins/processors/gpu/processor.go b/plugins/processors/gpu/processor.go deleted file mode 100644 index f3c7412bc3..0000000000 --- a/plugins/processors/gpu/processor.go +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: MIT - -package gpu - -import ( - "context" - "strings" - - "go.opentelemetry.io/collector/component" - "go.opentelemetry.io/collector/pdata/pcommon" - "go.opentelemetry.io/collector/pdata/pmetric" - "go.uber.org/zap" -) - -const ( - gpuMetric = "_gpu_" -) - -var defaultGpuLabels = []string{ - "ClusterName", - "Namespace", - "Service", - "ContainerName", - "FullPodName", - "PodName", - "GpuDevice", -} - -type gpuprocessor struct { - *Config - logger *zap.Logger - cancelFunc context.CancelFunc - shutdownC chan bool - started bool -} - -func newGpuProcessor(config *Config, logger *zap.Logger) *gpuprocessor { - _, cancel := context.WithCancel(context.Background()) - d := &gpuprocessor{ - Config: config, - logger: logger, - cancelFunc: cancel, - } - return d -} - -func (d *gpuprocessor) processMetrics(ctx context.Context, md pmetric.Metrics) (pmetric.Metrics, error) { - if !d.started { - return pmetric.NewMetrics(), nil - } - - rms := md.ResourceMetrics() - for i := 0; i < rms.Len(); i++ { - rs := rms.At(i) - ilms := rs.ScopeMetrics() - for j := 0; j < ilms.Len(); j++ { - ils := ilms.At(j) - metrics := ils.Metrics() - for k := 0; k < metrics.Len(); k++ { - m := metrics.At(k) - d.processMetricAttributes(ctx, m) - } - } - } - return md, nil -} - -func (d *gpuprocessor) processMetricAttributes(_ context.Context, m pmetric.Metric) { - // only decorate GPU metrics - // another option is to separate GPU of its own pipeline to minimize extra processing of metrics - if !strings.Contains(m.Name(), gpuMetric) { - return - } - - switch m.Type() { - case pmetric.MetricTypeGauge: - dps := m.Gauge().DataPoints() - for i := 0; i < dps.Len(); i++ { - addDefaultAttributes(dps.At(i).Attributes()) - } - case pmetric.MetricTypeSum: - dps := m.Sum().DataPoints() - for i := 0; i < dps.Len(); i++ { - addDefaultAttributes(dps.At(i).Attributes()) - } - default: - d.logger.Debug("Ignore unknown metric type", zap.String("type", m.Type().String())) - } -} - -// adds 
empty string for default attributes since prometheus drops them during relabeling process -func addDefaultAttributes(attributes pcommon.Map) { - for _, k := range defaultGpuLabels { - if _, ok := attributes.Get(k); !ok { - attributes.PutStr(k, "") - } - } -} - -func (d *gpuprocessor) Shutdown(context.Context) error { - close(d.shutdownC) - d.cancelFunc() - return nil -} - -func (d *gpuprocessor) Start(ctx context.Context, _ component.Host) error { - d.shutdownC = make(chan bool) - d.started = true - return nil -} diff --git a/plugins/processors/gpu/processor_test.go b/plugins/processors/gpu/processor_test.go deleted file mode 100644 index eda521093a..0000000000 --- a/plugins/processors/gpu/processor_test.go +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: MIT - -package gpu - -import ( - "context" - "testing" - - "github.com/grafana/regexp" - "github.com/stretchr/testify/assert" - "go.opentelemetry.io/collector/pdata/pmetric" - "go.uber.org/zap" -) - -var normalizedNameRegex = regexp.MustCompile("^(container|pod|node)_gpu_[a-z_]+$") - -func TestProcessMetrics(t *testing.T) { - logger, _ := zap.NewDevelopment() - gp := &gpuprocessor{ - logger: logger, - Config: createDefaultConfig().(*Config), - } - ctx := context.Background() - gp.Start(ctx, nil) - - testcases := map[string]struct { - metrics pmetric.Metrics - want map[string]string - }{ - "keepExisting": { - metrics: generateMetrics(map[string]string{ - "ClusterName": "cluster", - "Namespace": "namespace", - "Service": "service", - "ContainerName": "container", - "FullPodName": "fullpod", - "PodName": "pod", - "GpuDevice": "gpu", - }), - want: map[string]string{ - "ClusterName": "cluster", - "Namespace": "namespace", - "Service": "service", - "ContainerName": "container", - "FullPodName": "fullpod", - "PodName": "pod", - "GpuDevice": "gpu", - }, - }, - "addMissing": { - metrics: generateMetrics(map[string]string{ - "ClusterName": 
"cluster", - "Namespace": "namespace", - "Service": "service", - "ContainerName": "container", - "FullPodName": "fullpod", - }), - want: map[string]string{ - "ClusterName": "cluster", - "Namespace": "namespace", - "Service": "service", - "ContainerName": "container", - "FullPodName": "fullpod", - "PodName": "", - "GpuDevice": "", - }, - }, - "addAll": { - metrics: generateMetrics(map[string]string{}), - want: map[string]string{ - "ClusterName": "", - "Namespace": "", - "Service": "", - "ContainerName": "", - "FullPodName": "", - "PodName": "", - "GpuDevice": "", - }, - }, - } - - for _, tc := range testcases { - ms, _ := gp.processMetrics(ctx, tc.metrics) - attrs := ms.ResourceMetrics().At(0).ScopeMetrics().At(0).Metrics().At(0).Gauge().DataPoints().At(0).Attributes() - assert.Equal(t, len(defaultGpuLabels), attrs.Len()) - for k, v := range tc.want { - got, ok := attrs.Get(k) - assert.True(t, ok) - assert.Equal(t, v, got.Str()) - } - } -} - -func generateMetrics(dimensions map[string]string) pmetric.Metrics { - md := pmetric.NewMetrics() - - m := md.ResourceMetrics().AppendEmpty().ScopeMetrics().AppendEmpty().Metrics().AppendEmpty() - m.SetName("test" + gpuMetric) - gauge := m.SetEmptyGauge().DataPoints().AppendEmpty() - gauge.SetIntValue(10) - - for k, v := range dimensions { - gauge.Attributes().PutStr(k, v) - } - - return md -} diff --git a/service/defaultcomponents/components.go b/service/defaultcomponents/components.go index 0d14c9232f..602371fa40 100644 --- a/service/defaultcomponents/components.go +++ b/service/defaultcomponents/components.go @@ -29,7 +29,6 @@ import ( "github.com/aws/amazon-cloudwatch-agent/plugins/outputs/cloudwatch" "github.com/aws/amazon-cloudwatch-agent/plugins/processors/awsappsignals" "github.com/aws/amazon-cloudwatch-agent/plugins/processors/ec2tagger" - "github.com/aws/amazon-cloudwatch-agent/plugins/processors/gpu" ) func Factories() (otelcol.Factories, error) { @@ -54,7 +53,6 @@ func Factories() (otelcol.Factories, error) { 
metricstransformprocessor.NewFactory(), resourcedetectionprocessor.NewFactory(), transformprocessor.NewFactory(), - gpu.NewFactory(), ); err != nil { return otelcol.Factories{}, err } diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml index cbd26cbc21..1557628b44 100644 --- a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml @@ -442,7 +442,6 @@ processors: new_name: apiserver_request_total_5xx operations: [] submatch_case: "" - gpu/containerinsights: {} receivers: awscontainerinsightreceiver: add_container_name_metric_label: true @@ -523,7 +522,6 @@ service: processors: - metricstransform/containerinsights - batch/containerinsights - - gpu/containerinsights receivers: - awscontainerinsightreceiver telemetry: diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml index ee094aba31..6cdb314e54 100644 --- a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml @@ -897,7 +897,6 @@ processors: new_value: "" value_actions: [ ] submatch_case: "" - gpu/containerinsights: {} receivers: awscontainerinsightreceiver: @@ -979,7 +978,6 @@ service: processors: - metricstransform/containerinsights - batch/containerinsights - - gpu/containerinsights receivers: - awscontainerinsightreceiver telemetry: diff --git a/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml b/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml index f51e46904a..91e90916b1 100644 --- a/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml +++ b/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml @@ -402,7 +402,6 @@ processors: new_name: apiserver_request_total_5xx 
operations: [] submatch_case: "" - gpu/containerinsights: {} receivers: awscontainerinsightreceiver: add_container_name_metric_label: true @@ -441,7 +440,6 @@ service: processors: - metricstransform/containerinsights - batch/containerinsights - - gpu/containerinsights receivers: - awscontainerinsightreceiver telemetry: diff --git a/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml b/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml index bf63e010e9..48b06f125b 100644 --- a/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml +++ b/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml @@ -440,7 +440,6 @@ processors: new_name: apiserver_request_total_5xx operations: [] submatch_case: "" - gpu/containerinsights: {} receivers: awscontainerinsightreceiver: add_container_name_metric_label: true @@ -520,7 +519,6 @@ service: processors: - metricstransform/containerinsights - batch/containerinsights - - gpu/containerinsights receivers: - awscontainerinsightreceiver telemetry: diff --git a/translator/translate/otel/pipeline/containerinsights/translator.go b/translator/translate/otel/pipeline/containerinsights/translator.go index 29dec1b817..ba5fb093e2 100644 --- a/translator/translate/otel/pipeline/containerinsights/translator.go +++ b/translator/translate/otel/pipeline/containerinsights/translator.go @@ -13,7 +13,6 @@ import ( "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/exporter/awsemf" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/extension/agenthealth" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/processor/batchprocessor" - "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/processor/gpu" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/processor/metricstransformprocessor" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/receiver/awscontainerinsight" ) @@ -53,7 +52,7 @@ func (t *translator) 
Translate(conf *confmap.Conf) (*common.ComponentTranslators if enhancedContainerInsightsEnabled { return &common.ComponentTranslators{ Receivers: common.NewTranslatorMap(awscontainerinsight.NewTranslator()), - Processors: common.NewTranslatorMap(metricstransformprocessor.NewTranslatorWithName(pipelineName), batchprocessor.NewTranslatorWithNameAndSection(pipelineName, common.LogsKey), gpu.NewTranslatorWithName(pipelineName)), // EKS & ECS CI sit under metrics_collected in "logs" + Processors: common.NewTranslatorMap(metricstransformprocessor.NewTranslatorWithName(pipelineName), batchprocessor.NewTranslatorWithNameAndSection(pipelineName, common.LogsKey)), // EKS & ECS CI sit under metrics_collected in "logs" Exporters: common.NewTranslatorMap(awsemf.NewTranslatorWithName(pipelineName)), Extensions: common.NewTranslatorMap(agenthealth.NewTranslator(component.DataTypeLogs, []string{agenthealth.OperationPutLogEvents})), }, nil diff --git a/translator/translate/otel/pipeline/containerinsights/translator_test.go b/translator/translate/otel/pipeline/containerinsights/translator_test.go index 70f723903f..14a721e6b2 100644 --- a/translator/translate/otel/pipeline/containerinsights/translator_test.go +++ b/translator/translate/otel/pipeline/containerinsights/translator_test.go @@ -81,7 +81,7 @@ func TestTranslator(t *testing.T) { want: &want{ pipelineType: "metrics/containerinsights", receivers: []string{"awscontainerinsightreceiver"}, - processors: []string{"metricstransform/containerinsights", "batch/containerinsights", "gpu/containerinsights"}, + processors: []string{"metricstransform/containerinsights", "batch/containerinsights"}, exporters: []string{"awsemf/containerinsights"}, extensions: []string{"agenthealth/logs"}, }, diff --git a/translator/translate/otel/processor/gpu/translator.go b/translator/translate/otel/processor/gpu/translator.go deleted file mode 100644 index 3f542a469c..0000000000 --- a/translator/translate/otel/processor/gpu/translator.go +++ /dev/null 
@@ -1,33 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: MIT - -package gpu - -import ( - "go.opentelemetry.io/collector/component" - "go.opentelemetry.io/collector/confmap" - "go.opentelemetry.io/collector/processor" - - "github.com/aws/amazon-cloudwatch-agent/plugins/processors/gpu" - "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/common" -) - -type translator struct { - name string - factory processor.Factory -} - -var _ common.Translator[component.Config] = (*translator)(nil) - -func NewTranslatorWithName(name string) common.Translator[component.Config] { - return &translator{name, gpu.NewFactory()} -} - -func (t *translator) ID() component.ID { - return component.NewIDWithName(t.factory.Type(), t.name) -} - -func (t *translator) Translate(conf *confmap.Conf) (component.Config, error) { - cfg := t.factory.CreateDefaultConfig().(*gpu.Config) - return cfg, nil -} From e413c9537cb2a09962aa0cce2235d908063a72f2 Mon Sep 17 00:00:00 2001 From: Hyunsoo Kim Date: Sat, 24 Feb 2024 20:05:39 -0500 Subject: [PATCH 07/20] Revert "remove gpu processor" This reverts commit 7a6784b24ef6a222f7f79ee1ad4b74909d3ccc2d. 
--- plugins/processors/gpu/config.go | 19 +++ plugins/processors/gpu/config_test.go | 19 +++ plugins/processors/gpu/factory.go | 56 +++++++++ plugins/processors/gpu/factory_test.go | 45 +++++++ plugins/processors/gpu/processor.go | 111 ++++++++++++++++++ plugins/processors/gpu/processor_test.go | 108 +++++++++++++++++ service/defaultcomponents/components.go | 2 + .../emf_and_kubernetes_config.yaml | 2 + .../emf_and_kubernetes_with_gpu_config.yaml | 2 + .../kubernetes_on_prem_config.yaml | 2 + .../logs_and_kubernetes_config.yaml | 2 + .../pipeline/containerinsights/translator.go | 3 +- .../containerinsights/translator_test.go | 2 +- .../otel/processor/gpu/translator.go | 33 ++++++ 14 files changed, 404 insertions(+), 2 deletions(-) create mode 100644 plugins/processors/gpu/config.go create mode 100644 plugins/processors/gpu/config_test.go create mode 100644 plugins/processors/gpu/factory.go create mode 100644 plugins/processors/gpu/factory_test.go create mode 100644 plugins/processors/gpu/processor.go create mode 100644 plugins/processors/gpu/processor_test.go create mode 100644 translator/translate/otel/processor/gpu/translator.go diff --git a/plugins/processors/gpu/config.go b/plugins/processors/gpu/config.go new file mode 100644 index 0000000000..b72cbdc39c --- /dev/null +++ b/plugins/processors/gpu/config.go @@ -0,0 +1,19 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +package gpu + +import ( + "go.opentelemetry.io/collector/component" +) + +type Config struct{} + +// Verify Config implements Processor interface. +var _ component.Config = (*Config)(nil) + +// Validate does not check for unsupported dimension key-value pairs, because those +// get silently dropped and ignored during translation. 
+func (cfg *Config) Validate() error { + return nil +} diff --git a/plugins/processors/gpu/config_test.go b/plugins/processors/gpu/config_test.go new file mode 100644 index 0000000000..db3918301a --- /dev/null +++ b/plugins/processors/gpu/config_test.go @@ -0,0 +1,19 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +package gpu + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "go.opentelemetry.io/collector/component" + "go.opentelemetry.io/collector/confmap" +) + +func TestUnmarshalDefaultConfig(t *testing.T) { + factory := NewFactory() + cfg := factory.CreateDefaultConfig() + assert.NoError(t, component.UnmarshalConfig(confmap.New(), cfg)) + assert.Equal(t, factory.CreateDefaultConfig(), cfg) +} diff --git a/plugins/processors/gpu/factory.go b/plugins/processors/gpu/factory.go new file mode 100644 index 0000000000..9c7c61b21a --- /dev/null +++ b/plugins/processors/gpu/factory.go @@ -0,0 +1,56 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: MIT + +package gpu + +import ( + "context" + "fmt" + + "go.opentelemetry.io/collector/component" + "go.opentelemetry.io/collector/consumer" + "go.opentelemetry.io/collector/processor" + "go.opentelemetry.io/collector/processor/processorhelper" +) + +const ( + TypeStr = "gpu" + stability = component.StabilityLevelBeta +) + +var processorCapabilities = consumer.Capabilities{MutatesData: true} + +func NewFactory() processor.Factory { + return processor.NewFactory( + TypeStr, + createDefaultConfig, + processor.WithMetrics(createMetricsProcessor, stability)) +} + +func createDefaultConfig() component.Config { + return &Config{} +} + +func createMetricsProcessor( + ctx context.Context, + set processor.CreateSettings, + cfg component.Config, + nextConsumer consumer.Metrics, +) (processor.Metrics, error) { + processorConfig, ok := cfg.(*Config) + if !ok { + return nil, fmt.Errorf("configuration parsing error") + } + + metricsProcessor := newGpuProcessor(processorConfig, set.Logger) + + return processorhelper.NewMetricsProcessor( + ctx, + set, + cfg, + nextConsumer, + metricsProcessor.processMetrics, + processorhelper.WithCapabilities(processorCapabilities), + processorhelper.WithStart(metricsProcessor.Start), + processorhelper.WithShutdown(metricsProcessor.Shutdown)) +} diff --git a/plugins/processors/gpu/factory_test.go b/plugins/processors/gpu/factory_test.go new file mode 100644 index 0000000000..bae457d92b --- /dev/null +++ b/plugins/processors/gpu/factory_test.go @@ -0,0 +1,45 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: MIT + +package gpu + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.opentelemetry.io/collector/component" + "go.opentelemetry.io/collector/component/componenttest" + "go.opentelemetry.io/collector/consumer/consumertest" + "go.opentelemetry.io/collector/processor/processortest" +) + +func TestCreateDefaultConfig(t *testing.T) { + factory := NewFactory() + require.NotNil(t, factory) + + cfg := factory.CreateDefaultConfig() + assert.NotNil(t, cfg, "failed to create default config") + assert.NoError(t, componenttest.CheckConfigStruct(cfg)) +} + +func TestCreateProcessor(t *testing.T) { + factory := NewFactory() + require.NotNil(t, factory) + + cfg := factory.CreateDefaultConfig() + setting := processortest.NewNopCreateSettings() + + tProcessor, err := factory.CreateTracesProcessor(context.Background(), setting, cfg, consumertest.NewNop()) + assert.Equal(t, err, component.ErrDataTypeIsNotSupported) + assert.Nil(t, tProcessor) + + mProcessor, err := factory.CreateMetricsProcessor(context.Background(), setting, cfg, consumertest.NewNop()) + assert.NoError(t, err) + assert.NotNil(t, mProcessor) + + lProcessor, err := factory.CreateLogsProcessor(context.Background(), setting, cfg, consumertest.NewNop()) + assert.Equal(t, err, component.ErrDataTypeIsNotSupported) + assert.Nil(t, lProcessor) +} diff --git a/plugins/processors/gpu/processor.go b/plugins/processors/gpu/processor.go new file mode 100644 index 0000000000..f3c7412bc3 --- /dev/null +++ b/plugins/processors/gpu/processor.go @@ -0,0 +1,111 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: MIT + +package gpu + +import ( + "context" + "strings" + + "go.opentelemetry.io/collector/component" + "go.opentelemetry.io/collector/pdata/pcommon" + "go.opentelemetry.io/collector/pdata/pmetric" + "go.uber.org/zap" +) + +const ( + gpuMetric = "_gpu_" +) + +var defaultGpuLabels = []string{ + "ClusterName", + "Namespace", + "Service", + "ContainerName", + "FullPodName", + "PodName", + "GpuDevice", +} + +type gpuprocessor struct { + *Config + logger *zap.Logger + cancelFunc context.CancelFunc + shutdownC chan bool + started bool +} + +func newGpuProcessor(config *Config, logger *zap.Logger) *gpuprocessor { + _, cancel := context.WithCancel(context.Background()) + d := &gpuprocessor{ + Config: config, + logger: logger, + cancelFunc: cancel, + } + return d +} + +func (d *gpuprocessor) processMetrics(ctx context.Context, md pmetric.Metrics) (pmetric.Metrics, error) { + if !d.started { + return pmetric.NewMetrics(), nil + } + + rms := md.ResourceMetrics() + for i := 0; i < rms.Len(); i++ { + rs := rms.At(i) + ilms := rs.ScopeMetrics() + for j := 0; j < ilms.Len(); j++ { + ils := ilms.At(j) + metrics := ils.Metrics() + for k := 0; k < metrics.Len(); k++ { + m := metrics.At(k) + d.processMetricAttributes(ctx, m) + } + } + } + return md, nil +} + +func (d *gpuprocessor) processMetricAttributes(_ context.Context, m pmetric.Metric) { + // only decorate GPU metrics + // another option is to separate GPU of its own pipeline to minimize extra processing of metrics + if !strings.Contains(m.Name(), gpuMetric) { + return + } + + switch m.Type() { + case pmetric.MetricTypeGauge: + dps := m.Gauge().DataPoints() + for i := 0; i < dps.Len(); i++ { + addDefaultAttributes(dps.At(i).Attributes()) + } + case pmetric.MetricTypeSum: + dps := m.Sum().DataPoints() + for i := 0; i < dps.Len(); i++ { + addDefaultAttributes(dps.At(i).Attributes()) + } + default: + d.logger.Debug("Ignore unknown metric type", zap.String("type", m.Type().String())) + } +} + +// adds 
empty string for default attributes since prometheus drops them during relabeling process +func addDefaultAttributes(attributes pcommon.Map) { + for _, k := range defaultGpuLabels { + if _, ok := attributes.Get(k); !ok { + attributes.PutStr(k, "") + } + } +} + +func (d *gpuprocessor) Shutdown(context.Context) error { + close(d.shutdownC) + d.cancelFunc() + return nil +} + +func (d *gpuprocessor) Start(ctx context.Context, _ component.Host) error { + d.shutdownC = make(chan bool) + d.started = true + return nil +} diff --git a/plugins/processors/gpu/processor_test.go b/plugins/processors/gpu/processor_test.go new file mode 100644 index 0000000000..eda521093a --- /dev/null +++ b/plugins/processors/gpu/processor_test.go @@ -0,0 +1,108 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +package gpu + +import ( + "context" + "testing" + + "github.com/grafana/regexp" + "github.com/stretchr/testify/assert" + "go.opentelemetry.io/collector/pdata/pmetric" + "go.uber.org/zap" +) + +var normalizedNameRegex = regexp.MustCompile("^(container|pod|node)_gpu_[a-z_]+$") + +func TestProcessMetrics(t *testing.T) { + logger, _ := zap.NewDevelopment() + gp := &gpuprocessor{ + logger: logger, + Config: createDefaultConfig().(*Config), + } + ctx := context.Background() + gp.Start(ctx, nil) + + testcases := map[string]struct { + metrics pmetric.Metrics + want map[string]string + }{ + "keepExisting": { + metrics: generateMetrics(map[string]string{ + "ClusterName": "cluster", + "Namespace": "namespace", + "Service": "service", + "ContainerName": "container", + "FullPodName": "fullpod", + "PodName": "pod", + "GpuDevice": "gpu", + }), + want: map[string]string{ + "ClusterName": "cluster", + "Namespace": "namespace", + "Service": "service", + "ContainerName": "container", + "FullPodName": "fullpod", + "PodName": "pod", + "GpuDevice": "gpu", + }, + }, + "addMissing": { + metrics: generateMetrics(map[string]string{ + "ClusterName": 
"cluster", + "Namespace": "namespace", + "Service": "service", + "ContainerName": "container", + "FullPodName": "fullpod", + }), + want: map[string]string{ + "ClusterName": "cluster", + "Namespace": "namespace", + "Service": "service", + "ContainerName": "container", + "FullPodName": "fullpod", + "PodName": "", + "GpuDevice": "", + }, + }, + "addAll": { + metrics: generateMetrics(map[string]string{}), + want: map[string]string{ + "ClusterName": "", + "Namespace": "", + "Service": "", + "ContainerName": "", + "FullPodName": "", + "PodName": "", + "GpuDevice": "", + }, + }, + } + + for _, tc := range testcases { + ms, _ := gp.processMetrics(ctx, tc.metrics) + attrs := ms.ResourceMetrics().At(0).ScopeMetrics().At(0).Metrics().At(0).Gauge().DataPoints().At(0).Attributes() + assert.Equal(t, len(defaultGpuLabels), attrs.Len()) + for k, v := range tc.want { + got, ok := attrs.Get(k) + assert.True(t, ok) + assert.Equal(t, v, got.Str()) + } + } +} + +func generateMetrics(dimensions map[string]string) pmetric.Metrics { + md := pmetric.NewMetrics() + + m := md.ResourceMetrics().AppendEmpty().ScopeMetrics().AppendEmpty().Metrics().AppendEmpty() + m.SetName("test" + gpuMetric) + gauge := m.SetEmptyGauge().DataPoints().AppendEmpty() + gauge.SetIntValue(10) + + for k, v := range dimensions { + gauge.Attributes().PutStr(k, v) + } + + return md +} diff --git a/service/defaultcomponents/components.go b/service/defaultcomponents/components.go index 602371fa40..0d14c9232f 100644 --- a/service/defaultcomponents/components.go +++ b/service/defaultcomponents/components.go @@ -29,6 +29,7 @@ import ( "github.com/aws/amazon-cloudwatch-agent/plugins/outputs/cloudwatch" "github.com/aws/amazon-cloudwatch-agent/plugins/processors/awsappsignals" "github.com/aws/amazon-cloudwatch-agent/plugins/processors/ec2tagger" + "github.com/aws/amazon-cloudwatch-agent/plugins/processors/gpu" ) func Factories() (otelcol.Factories, error) { @@ -53,6 +54,7 @@ func Factories() (otelcol.Factories, error) { 
metricstransformprocessor.NewFactory(), resourcedetectionprocessor.NewFactory(), transformprocessor.NewFactory(), + gpu.NewFactory(), ); err != nil { return otelcol.Factories{}, err } diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml index 1557628b44..cbd26cbc21 100644 --- a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml @@ -442,6 +442,7 @@ processors: new_name: apiserver_request_total_5xx operations: [] submatch_case: "" + gpu/containerinsights: {} receivers: awscontainerinsightreceiver: add_container_name_metric_label: true @@ -522,6 +523,7 @@ service: processors: - metricstransform/containerinsights - batch/containerinsights + - gpu/containerinsights receivers: - awscontainerinsightreceiver telemetry: diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml index 6cdb314e54..ee094aba31 100644 --- a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml @@ -897,6 +897,7 @@ processors: new_value: "" value_actions: [ ] submatch_case: "" + gpu/containerinsights: {} receivers: awscontainerinsightreceiver: @@ -978,6 +979,7 @@ service: processors: - metricstransform/containerinsights - batch/containerinsights + - gpu/containerinsights receivers: - awscontainerinsightreceiver telemetry: diff --git a/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml b/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml index 91e90916b1..f51e46904a 100644 --- a/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml +++ b/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml @@ -402,6 +402,7 @@ processors: new_name: apiserver_request_total_5xx 
operations: [] submatch_case: "" + gpu/containerinsights: {} receivers: awscontainerinsightreceiver: add_container_name_metric_label: true @@ -440,6 +441,7 @@ service: processors: - metricstransform/containerinsights - batch/containerinsights + - gpu/containerinsights receivers: - awscontainerinsightreceiver telemetry: diff --git a/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml b/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml index 48b06f125b..bf63e010e9 100644 --- a/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml +++ b/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml @@ -440,6 +440,7 @@ processors: new_name: apiserver_request_total_5xx operations: [] submatch_case: "" + gpu/containerinsights: {} receivers: awscontainerinsightreceiver: add_container_name_metric_label: true @@ -519,6 +520,7 @@ service: processors: - metricstransform/containerinsights - batch/containerinsights + - gpu/containerinsights receivers: - awscontainerinsightreceiver telemetry: diff --git a/translator/translate/otel/pipeline/containerinsights/translator.go b/translator/translate/otel/pipeline/containerinsights/translator.go index ba5fb093e2..29dec1b817 100644 --- a/translator/translate/otel/pipeline/containerinsights/translator.go +++ b/translator/translate/otel/pipeline/containerinsights/translator.go @@ -13,6 +13,7 @@ import ( "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/exporter/awsemf" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/extension/agenthealth" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/processor/batchprocessor" + "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/processor/gpu" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/processor/metricstransformprocessor" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/receiver/awscontainerinsight" ) @@ -52,7 +53,7 @@ func (t *translator) 
Translate(conf *confmap.Conf) (*common.ComponentTranslators if enhancedContainerInsightsEnabled { return &common.ComponentTranslators{ Receivers: common.NewTranslatorMap(awscontainerinsight.NewTranslator()), - Processors: common.NewTranslatorMap(metricstransformprocessor.NewTranslatorWithName(pipelineName), batchprocessor.NewTranslatorWithNameAndSection(pipelineName, common.LogsKey)), // EKS & ECS CI sit under metrics_collected in "logs" + Processors: common.NewTranslatorMap(metricstransformprocessor.NewTranslatorWithName(pipelineName), batchprocessor.NewTranslatorWithNameAndSection(pipelineName, common.LogsKey), gpu.NewTranslatorWithName(pipelineName)), // EKS & ECS CI sit under metrics_collected in "logs" Exporters: common.NewTranslatorMap(awsemf.NewTranslatorWithName(pipelineName)), Extensions: common.NewTranslatorMap(agenthealth.NewTranslator(component.DataTypeLogs, []string{agenthealth.OperationPutLogEvents})), }, nil diff --git a/translator/translate/otel/pipeline/containerinsights/translator_test.go b/translator/translate/otel/pipeline/containerinsights/translator_test.go index 14a721e6b2..70f723903f 100644 --- a/translator/translate/otel/pipeline/containerinsights/translator_test.go +++ b/translator/translate/otel/pipeline/containerinsights/translator_test.go @@ -81,7 +81,7 @@ func TestTranslator(t *testing.T) { want: &want{ pipelineType: "metrics/containerinsights", receivers: []string{"awscontainerinsightreceiver"}, - processors: []string{"metricstransform/containerinsights", "batch/containerinsights"}, + processors: []string{"metricstransform/containerinsights", "batch/containerinsights", "gpu/containerinsights"}, exporters: []string{"awsemf/containerinsights"}, extensions: []string{"agenthealth/logs"}, }, diff --git a/translator/translate/otel/processor/gpu/translator.go b/translator/translate/otel/processor/gpu/translator.go new file mode 100644 index 0000000000..3f542a469c --- /dev/null +++ b/translator/translate/otel/processor/gpu/translator.go @@ 
-0,0 +1,33 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +package gpu + +import ( + "go.opentelemetry.io/collector/component" + "go.opentelemetry.io/collector/confmap" + "go.opentelemetry.io/collector/processor" + + "github.com/aws/amazon-cloudwatch-agent/plugins/processors/gpu" + "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/common" +) + +type translator struct { + name string + factory processor.Factory +} + +var _ common.Translator[component.Config] = (*translator)(nil) + +func NewTranslatorWithName(name string) common.Translator[component.Config] { + return &translator{name, gpu.NewFactory()} +} + +func (t *translator) ID() component.ID { + return component.NewIDWithName(t.factory.Type(), t.name) +} + +func (t *translator) Translate(conf *confmap.Conf) (component.Config, error) { + cfg := t.factory.CreateDefaultConfig().(*gpu.Config) + return cfg, nil +} From ecc28b6104c1fc8a7ade356e5b44e4c547f37d58 Mon Sep 17 00:00:00 2001 From: Hyunsoo Kim Date: Mon, 26 Feb 2024 16:01:43 -0500 Subject: [PATCH 08/20] bring gpu processor back to filter attributes --- internal/containerinsightscommon/const.go | 6 +- plugins/processors/gpu/processor.go | 127 +++++++++++++++--- plugins/processors/gpu/processor_test.go | 86 ++++++------ .../emf_and_kubernetes_with_gpu_config.yaml | 12 +- .../otel/exporter/awsemf/kubernetes.go | 6 +- .../otel/exporter/awsemf/translator_test.go | 6 +- .../metricstransformprocessor/translator.go | 5 + 7 files changed, 170 insertions(+), 78 deletions(-) diff --git a/internal/containerinsightscommon/const.go b/internal/containerinsightscommon/const.go index aea5cb97d5..62a20a48ee 100644 --- a/internal/containerinsightscommon/const.go +++ b/internal/containerinsightscommon/const.go @@ -73,12 +73,14 @@ const ( DiskIOTotal = "Total" GpuUtilization = "gpu_utilization" - GpuMemUtilization = "gpu_utilization_memory" + GpuMemUtilization = "gpu_memory_utilization" GpuMemUsed = 
"gpu_memory_used" GpuMemTotal = "gpu_memory_total" GpuTemperature = "gpu_temperature" GpuPowerDraw = "gpu_power_draw" - GpuFanSpeed = "gpu_fan_speed" + GpuRequest = "gpu_request" + GpuLimit = "gpu_limit" + GpuTotal = "gpu_total" TypeCluster = "Cluster" TypeClusterService = "ClusterService" diff --git a/plugins/processors/gpu/processor.go b/plugins/processors/gpu/processor.go index f3c7412bc3..3c2493bd41 100644 --- a/plugins/processors/gpu/processor.go +++ b/plugins/processors/gpu/processor.go @@ -5,6 +5,7 @@ package gpu import ( "context" + "encoding/json" "strings" "go.opentelemetry.io/collector/component" @@ -14,17 +15,39 @@ import ( ) const ( - gpuMetric = "_gpu_" + gpuMetric = "_gpu_" + gpuContainerMetricPrefix = "container_" + gpuPodMetricPrefix = "pod_" + gpuNodeMetricPrefix = "node_" ) -var defaultGpuLabels = []string{ - "ClusterName", - "Namespace", - "Service", - "ContainerName", - "FullPodName", - "PodName", - "GpuDevice", +var podContainerMetricLabels = map[string]map[string]interface{}{ + "ClusterName": nil, + "FullPodName": nil, + "PodName": nil, + "InstanceId": nil, + "InstanceType": nil, + "NodeName": nil, + "Timestamp": nil, + "Type": nil, + "Version": nil, + "Namespace": nil, + "Sources": nil, + "UUID": nil, + "kubernetes": nil, +} + +var nodeMetricLabels = map[string]map[string]interface{}{ + "ClusterName": nil, + "InstanceId": nil, + "InstanceType": nil, + "NodeName": nil, + "Timestamp": nil, + "Type": nil, + "Version": nil, + "kubernetes": { + "host": nil, + }, } type gpuprocessor struct { @@ -73,27 +96,87 @@ func (d *gpuprocessor) processMetricAttributes(_ context.Context, m pmetric.Metr return } + var labels map[string]map[string]interface{} + if strings.HasPrefix(m.Name(), gpuNodeMetricPrefix) { + labels = nodeMetricLabels + } else if strings.HasPrefix(m.Name(), gpuContainerMetricPrefix) { + labels = podContainerMetricLabels + labels["kubernetes"] = map[string]interface{}{ + "container_name": nil, + "containerd": nil, + "host": nil, + 
"labels": nil, + "pod_id": nil, + "pod_name": nil, + "pod_owners": nil, + "namespace": nil, + } + } else if strings.HasPrefix(m.Name(), gpuPodMetricPrefix) { + labels = podContainerMetricLabels + labels["kubernetes"] = map[string]interface{}{ + "host": nil, + "labels": nil, + "pod_id": nil, + "pod_name": nil, + "pod_owners": nil, + "namespace": nil, + } + } + + var dps pmetric.NumberDataPointSlice switch m.Type() { case pmetric.MetricTypeGauge: - dps := m.Gauge().DataPoints() - for i := 0; i < dps.Len(); i++ { - addDefaultAttributes(dps.At(i).Attributes()) - } + dps = m.Gauge().DataPoints() case pmetric.MetricTypeSum: - dps := m.Sum().DataPoints() - for i := 0; i < dps.Len(); i++ { - addDefaultAttributes(dps.At(i).Attributes()) - } + dps = m.Sum().DataPoints() default: d.logger.Debug("Ignore unknown metric type", zap.String("type", m.Type().String())) } + + for i := 0; i < dps.Len(); i++ { + d.filterAttributes(dps.At(i).Attributes(), labels) + } } -// adds empty string for default attributes since prometheus drops them during relabeling process -func addDefaultAttributes(attributes pcommon.Map) { - for _, k := range defaultGpuLabels { - if _, ok := attributes.Get(k); !ok { - attributes.PutStr(k, "") +func (d *gpuprocessor) filterAttributes(attributes pcommon.Map, labels map[string]map[string]interface{}) { + if len(labels) < 1 { + return + } + // remove labels that are no in the keep list + attributes.RemoveIf(func(k string, _ pcommon.Value) bool { + if _, ok := labels[k]; !ok { + return true + } + return false + }) + + // if a label has child level filter list, that means the label is map type + // only handles map type since there are currently only map and value types with GPU + for lk, ls := range labels { + if len(ls) < 1 { + continue + } + if av, ok := attributes.Get(lk); ok { + // decode json formatted string value into a map then encode again after filtering elements + var blob map[string]json.RawMessage + strVal := av.Str() + err := 
json.Unmarshal([]byte(strVal), &blob) + if err != nil { + d.logger.Warn("gpuprocessor: failed to unmarshal label", zap.String("label", lk)) + continue + } + newBlob := make(map[string]json.RawMessage) + for bkey, bval := range blob { + if _, ok := ls[bkey]; ok { + newBlob[bkey] = bval + } + } + bytes, err := json.Marshal(newBlob) + if err != nil { + d.logger.Warn("gpuprocessor: failed to marshall label", zap.String("label", lk)) + continue + } + attributes.PutStr(lk, string(bytes)) } } } diff --git a/plugins/processors/gpu/processor_test.go b/plugins/processors/gpu/processor_test.go index eda521093a..f5a8bcaf4a 100644 --- a/plugins/processors/gpu/processor_test.go +++ b/plugins/processors/gpu/processor_test.go @@ -26,64 +26,66 @@ func TestProcessMetrics(t *testing.T) { testcases := map[string]struct { metrics pmetric.Metrics + labels map[string]map[string]interface{} want map[string]string }{ - "keepExisting": { - metrics: generateMetrics(map[string]string{ - "ClusterName": "cluster", - "Namespace": "namespace", - "Service": "service", - "ContainerName": "container", - "FullPodName": "fullpod", - "PodName": "pod", - "GpuDevice": "gpu", + "nonNode": { + metrics: generateMetrics("prefix", map[string]string{ + "ClusterName": "cluster", }), + labels: map[string]map[string]interface{}{}, want: map[string]string{ - "ClusterName": "cluster", - "Namespace": "namespace", - "Service": "service", - "ContainerName": "container", - "FullPodName": "fullpod", - "PodName": "pod", - "GpuDevice": "gpu", + "ClusterName": "cluster", }, }, - "addMissing": { - metrics: generateMetrics(map[string]string{ - "ClusterName": "cluster", - "Namespace": "namespace", - "Service": "service", - "ContainerName": "container", - "FullPodName": "fullpod", + "nodeDropSimple": { + metrics: generateMetrics("node", map[string]string{ + "ClusterName": "cluster", + "Drop": "val", }), + labels: map[string]map[string]interface{}{ + "ClusterName": {}, + }, + want: map[string]string{ + "ClusterName": "cluster", 
+ }, + }, + "nodeDropJson": { + metrics: generateMetrics("node", map[string]string{ + "ClusterName": "cluster", + "kubernetes": "{\"a\":\"1\",\"b\":\"2\"}", + }), + labels: map[string]map[string]interface{}{ + "ClusterName": {}, + "kubernetes": {"a": map[string]interface{}{}}, + }, want: map[string]string{ - "ClusterName": "cluster", - "Namespace": "namespace", - "Service": "service", - "ContainerName": "container", - "FullPodName": "fullpod", - "PodName": "", - "GpuDevice": "", + "ClusterName": "cluster", + "kubernetes": "{\"a\":\"1\"}", }, }, - "addAll": { - metrics: generateMetrics(map[string]string{}), + "nodeDropMixed": { + metrics: generateMetrics("node", map[string]string{ + "ClusterName": "cluster", + "Drop": "val", + "kubernetes": "{\"a\":\"1\",\"b\":\"2\"}", + }), + labels: map[string]map[string]interface{}{ + "ClusterName": {}, + "kubernetes": {"a": map[string]interface{}{}}, + }, want: map[string]string{ - "ClusterName": "", - "Namespace": "", - "Service": "", - "ContainerName": "", - "FullPodName": "", - "PodName": "", - "GpuDevice": "", + "ClusterName": "cluster", + "kubernetes": "{\"a\":\"1\"}", }, }, } for _, tc := range testcases { + nodeMetricLabels = tc.labels ms, _ := gp.processMetrics(ctx, tc.metrics) attrs := ms.ResourceMetrics().At(0).ScopeMetrics().At(0).Metrics().At(0).Gauge().DataPoints().At(0).Attributes() - assert.Equal(t, len(defaultGpuLabels), attrs.Len()) + assert.Equal(t, len(tc.want), attrs.Len()) for k, v := range tc.want { got, ok := attrs.Get(k) assert.True(t, ok) @@ -92,11 +94,11 @@ func TestProcessMetrics(t *testing.T) { } } -func generateMetrics(dimensions map[string]string) pmetric.Metrics { +func generateMetrics(prefix string, dimensions map[string]string) pmetric.Metrics { md := pmetric.NewMetrics() m := md.ResourceMetrics().AppendEmpty().ScopeMetrics().AppendEmpty().Metrics().AppendEmpty() - m.SetName("test" + gpuMetric) + m.SetName(prefix + gpuMetric) gauge := m.SetEmptyGauge().DataPoints().AppendEmpty() 
gauge.SetIntValue(10) diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml index ee094aba31..17533dd94d 100644 --- a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml @@ -347,7 +347,7 @@ exporters: label_matchers: [ ] metric_name_selectors: - container_gpu_utilization - - container_gpu_utilization_memory + - container_gpu_memory_utilization - container_gpu_memory_total - container_gpu_memory_used - container_gpu_power_draw @@ -374,7 +374,7 @@ exporters: label_matchers: [ ] metric_name_selectors: - pod_gpu_utilization - - pod_gpu_utilization_memory + - pod_gpu_memory_utilization - pod_gpu_memory_total - pod_gpu_memory_used - pod_gpu_power_draw @@ -391,7 +391,7 @@ exporters: label_matchers: [ ] metric_name_selectors: - node_gpu_utilization - - node_gpu_utilization_memory + - node_gpu_memory_utilization - node_gpu_memory_total - node_gpu_memory_used - node_gpu_power_draw @@ -816,7 +816,7 @@ processors: group_resource_labels: { } include: DCGM_FI_DEV_FB_USED_PERCENT match_type: "" - new_name: container_gpu_utilization_memory + new_name: container_gpu_memory_utilization operations: - action: add_label aggregated_values: [ ] @@ -845,7 +845,7 @@ processors: group_resource_labels: { } include: DCGM_FI_DEV_FB_USED_PERCENT match_type: "" - new_name: pod_gpu_utilization_memory + new_name: pod_gpu_memory_utilization operations: - action: add_label aggregated_values: [ ] @@ -874,7 +874,7 @@ processors: group_resource_labels: { } include: DCGM_FI_DEV_FB_USED_PERCENT match_type: "" - new_name: node_gpu_utilization_memory + new_name: node_gpu_memory_utilization operations: - action: add_label aggregated_values: [ ] diff --git a/translator/translate/otel/exporter/awsemf/kubernetes.go b/translator/translate/otel/exporter/awsemf/kubernetes.go index 
4f76a7566d..6e2b0bb791 100644 --- a/translator/translate/otel/exporter/awsemf/kubernetes.go +++ b/translator/translate/otel/exporter/awsemf/kubernetes.go @@ -472,7 +472,7 @@ func getGPUMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.MetricDeclar Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace", "PodName", "ContainerName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName", "GpuDevice"}}, MetricNameSelectors: []string{ "container_gpu_utilization", - "container_gpu_utilization_memory", + "container_gpu_memory_utilization", "container_gpu_memory_total", "container_gpu_memory_used", "container_gpu_power_draw", @@ -483,7 +483,7 @@ func getGPUMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.MetricDeclar Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace"}, {"ClusterName", "Namespace", "Service"}, {"ClusterName", "Namespace", "PodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "GpuDevice"}}, MetricNameSelectors: []string{ "pod_gpu_utilization", - "pod_gpu_utilization_memory", + "pod_gpu_memory_utilization", "pod_gpu_memory_total", "pod_gpu_memory_used", "pod_gpu_power_draw", @@ -494,7 +494,7 @@ func getGPUMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.MetricDeclar Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "NodeName", "InstanceId"}, {"ClusterName", "NodeName", "InstanceId", "GpuDevice"}}, MetricNameSelectors: []string{ "node_gpu_utilization", - "node_gpu_utilization_memory", + "node_gpu_memory_utilization", "node_gpu_memory_total", "node_gpu_memory_used", "node_gpu_power_draw", diff --git a/translator/translate/otel/exporter/awsemf/translator_test.go b/translator/translate/otel/exporter/awsemf/translator_test.go index 7489ec2ba1..43005df927 100644 --- a/translator/translate/otel/exporter/awsemf/translator_test.go +++ 
b/translator/translate/otel/exporter/awsemf/translator_test.go @@ -402,19 +402,19 @@ func TestTranslator(t *testing.T) { { Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace", "PodName", "ContainerName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName", "GpuDevice"}}, MetricNameSelectors: []string{ - "container_gpu_utilization", "container_gpu_utilization_memory", "container_gpu_memory_total", "container_gpu_memory_used", "container_gpu_power_draw", "container_gpu_temperature", + "container_gpu_utilization", "container_gpu_memory_utilization", "container_gpu_memory_total", "container_gpu_memory_used", "container_gpu_power_draw", "container_gpu_temperature", }, }, { Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace"}, {"ClusterName", "Namespace", "Service"}, {"ClusterName", "Namespace", "PodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "GpuDevice"}}, MetricNameSelectors: []string{ - "pod_gpu_utilization", "pod_gpu_utilization_memory", "pod_gpu_memory_total", "pod_gpu_memory_used", "pod_gpu_power_draw", "pod_gpu_temperature", + "pod_gpu_utilization", "pod_gpu_memory_utilization", "pod_gpu_memory_total", "pod_gpu_memory_used", "pod_gpu_power_draw", "pod_gpu_temperature", }, }, { Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "NodeName", "InstanceId"}, {"ClusterName", "NodeName", "InstanceId", "GpuDevice"}}, MetricNameSelectors: []string{ - "node_gpu_utilization", "node_gpu_utilization_memory", "node_gpu_memory_total", "node_gpu_memory_used", "node_gpu_power_draw", "node_gpu_temperature", "node_gpu_fan_speed", + "node_gpu_utilization", "node_gpu_memory_utilization", "node_gpu_memory_total", "node_gpu_memory_used", "node_gpu_power_draw", "node_gpu_temperature", "node_gpu_fan_speed", }, }, { diff --git 
a/translator/translate/otel/processor/metricstransformprocessor/translator.go b/translator/translate/otel/processor/metricstransformprocessor/translator.go index e607e80826..855bfa93d0 100644 --- a/translator/translate/otel/processor/metricstransformprocessor/translator.go +++ b/translator/translate/otel/processor/metricstransformprocessor/translator.go @@ -92,6 +92,11 @@ func (t *translator) Translate(conf *confmap.Conf) (component.Config, error) { "action": "experimental_scale_value", "experimental_scale": 100, }) + } else if new == containerinsightscommon.GpuMemTotal || new == containerinsightscommon.GpuMemUsed { + operations = append(operations, map[string]interface{}{ + "action": "experimental_scale_value", + "experimental_scale": 1024 * 1024, + }) } for _, t := range metricDuplicateTypes { transformRules = append(transformRules, map[string]interface{}{ From 3f42b765f60271863ff4f774c35c131282056f98 Mon Sep 17 00:00:00 2001 From: Hyunsoo Kim Date: Mon, 26 Feb 2024 22:25:22 -0500 Subject: [PATCH 09/20] update test --- .../emf_and_kubernetes_with_gpu_config.yaml | 2061 +++++++++-------- 1 file changed, 1060 insertions(+), 1001 deletions(-) diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml index 17533dd94d..2c7b9fe03c 100644 --- a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml @@ -1,1007 +1,1066 @@ connectors: {} exporters: - awscloudwatchlogs/emf_logs: - certificate_file_path: "" - emf_only: true - endpoint: https://fake_endpoint - imds_retries: 2 - local_mode: false - log_group_name: emf/logs/default - log_retention: 0 - log_stream_name: host_name_from_env - max_retries: 2 - middleware: agenthealth/logs - no_verify_ssl: false - num_workers: 8 - profile: default - proxy_address: "" - raw_log: true - region: us-east-1 - request_timeout_seconds: 
30 - resource_arn: "" - retry_on_failure: - enabled: true - initial_interval: 5s - max_elapsed_time: 5m0s - max_interval: 30s - multiplier: 1.5 - randomization_factor: 0.5 - role_arn: "" - sending_queue: - enabled: true - num_consumers: 1 - queue_size: 1000 - storage: null - shared_credentials_file: - - /root/.aws/credentials - awsemf/containerinsights: - certificate_file_path: "" - detailed_metrics: false - dimension_rollup_option: NoDimensionRollup - disable_metric_extraction: true - eks_fargate_container_insights_enabled: false - endpoint: https://fake_endpoint - enhanced_container_insights: true - imds_retries: 2 - local_mode: false - log_group_name: /aws/containerinsights/{ClusterName}/performance - log_retention: 0 - log_stream_name: '{NodeName}' - max_retries: 2 - metric_declarations: - - dimensions: - - - ClusterName - - - ClusterName - - ContainerName - - FullPodName - - Namespace - - PodName - - - ClusterName - - ContainerName - - Namespace - - PodName - label_matchers: [] - metric_name_selectors: - - container_cpu_utilization - - container_cpu_utilization_over_container_limit - - container_cpu_limit - - container_cpu_request - - container_memory_utilization - - container_memory_utilization_over_container_limit - - container_memory_failures_total - - container_memory_limit - - container_memory_request - - container_filesystem_usage - - container_filesystem_available - - container_filesystem_utilization - - dimensions: - - - ClusterName - - Namespace - - PodName - - - ClusterName - - - ClusterName - - Namespace - - Service - - - ClusterName - - Namespace - - - ClusterName - - FullPodName - - Namespace - - PodName - label_matchers: [] - metric_name_selectors: - - pod_cpu_utilization - - pod_memory_utilization - - pod_network_rx_bytes - - pod_network_tx_bytes - - pod_cpu_utilization_over_pod_limit - - pod_memory_utilization_over_pod_limit - - dimensions: - - - ClusterName - - FullPodName - - Namespace - - PodName - - - ClusterName - - Namespace - - PodName - 
- - ClusterName - - Namespace - - - ClusterName - label_matchers: [] - metric_name_selectors: - - pod_interface_network_rx_dropped - - pod_interface_network_tx_dropped - - dimensions: - - - ClusterName - - Namespace - - PodName - - - ClusterName - - - ClusterName - - FullPodName - - Namespace - - PodName - - - ClusterName - - Namespace - - Service - label_matchers: [] - metric_name_selectors: - - pod_cpu_reserved_capacity - - pod_memory_reserved_capacity - - pod_number_of_container_restarts - - pod_number_of_containers - - pod_number_of_running_containers - - pod_status_ready - - pod_status_scheduled - - pod_status_running - - pod_status_pending - - pod_status_failed - - pod_status_unknown - - pod_status_succeeded - - pod_memory_request - - pod_memory_limit - - pod_cpu_limit - - pod_cpu_request - - pod_container_status_running - - pod_container_status_terminated - - pod_container_status_waiting - - pod_container_status_waiting_reason_crash_loop_back_off - - pod_container_status_waiting_reason_image_pull_error - - pod_container_status_waiting_reason_start_error - - pod_container_status_waiting_reason_create_container_error - - pod_container_status_waiting_reason_create_container_config_error - - pod_container_status_terminated_reason_oom_killed - - dimensions: - - - ClusterName - - InstanceId - - NodeName - - - ClusterName - label_matchers: [] - metric_name_selectors: - - node_cpu_utilization - - node_memory_utilization - - node_network_total_bytes - - node_cpu_reserved_capacity - - node_memory_reserved_capacity - - node_number_of_running_pods - - node_number_of_running_containers - - node_cpu_usage_total - - node_cpu_limit - - node_memory_working_set - - node_memory_limit - - node_status_condition_ready - - node_status_condition_disk_pressure - - node_status_condition_memory_pressure - - node_status_condition_pid_pressure - - node_status_condition_network_unavailable - - node_status_condition_unknown - - node_status_capacity_pods - - node_status_allocatable_pods - 
- dimensions: - - - ClusterName - - InstanceId - - NodeName - - - ClusterName - label_matchers: [] - metric_name_selectors: - - node_interface_network_rx_dropped - - node_interface_network_tx_dropped - - node_diskio_io_service_bytes_total - - node_diskio_io_serviced_total - - dimensions: - - - ClusterName - - InstanceId - - NodeName - - - ClusterName - label_matchers: [] - metric_name_selectors: - - node_filesystem_utilization - - node_filesystem_inodes - - node_filesystem_inodes_free - - dimensions: - - - ClusterName - - Namespace - - Service - - - ClusterName - label_matchers: [] - metric_name_selectors: - - service_number_of_running_pods - - dimensions: - - - ClusterName - - Namespace - - PodName - - - ClusterName - label_matchers: [] - metric_name_selectors: - - replicas_desired - - replicas_ready - - status_replicas_available - - status_replicas_unavailable - - dimensions: - - - ClusterName - - Namespace - - PodName - - - ClusterName - label_matchers: [] - metric_name_selectors: - - daemonset_status_number_available - - daemonset_status_number_unavailable - - dimensions: - - - ClusterName - - Namespace - - - ClusterName - label_matchers: [] - metric_name_selectors: - - namespace_number_of_running_pods - - dimensions: - - - ClusterName - label_matchers: [] - metric_name_selectors: - - cluster_node_count - - cluster_failed_node_count - - cluster_number_of_running_pods - - dimensions: - - - ClusterName - - endpoint - - - ClusterName - label_matchers: [] - metric_name_selectors: - - apiserver_storage_size_bytes - - apiserver_storage_db_total_size_in_bytes - - etcd_db_total_size_in_bytes - - dimensions: - - - ClusterName - - resource - - - ClusterName - label_matchers: [] - metric_name_selectors: - - apiserver_storage_list_duration_seconds - - apiserver_longrunning_requests - - apiserver_storage_objects - - dimensions: - - - ClusterName - - verb - - - ClusterName - label_matchers: [] - metric_name_selectors: - - apiserver_request_duration_seconds - - 
rest_client_request_duration_seconds - - dimensions: - - - ClusterName - - code - - verb - - - ClusterName - label_matchers: [] - metric_name_selectors: - - apiserver_request_total - - apiserver_request_total_5xx - - dimensions: - - - ClusterName - - operation - - - ClusterName - label_matchers: [] - metric_name_selectors: - - apiserver_admission_controller_admission_duration_seconds - - apiserver_admission_step_admission_duration_seconds - - etcd_request_duration_seconds - - dimensions: - - - ClusterName - - code - - method - - - ClusterName - label_matchers: [] - metric_name_selectors: - - rest_client_requests_total - - dimensions: - - - ClusterName - - request_kind - - - ClusterName - label_matchers: [] - metric_name_selectors: - - apiserver_current_inflight_requests - - apiserver_current_inqueue_requests - - dimensions: - - - ClusterName - - name - - - ClusterName - label_matchers: [] - metric_name_selectors: - - apiserver_admission_webhook_admission_duration_seconds - - dimensions: - - - ClusterName - - group - - - ClusterName - label_matchers: [] - metric_name_selectors: - - apiserver_requested_deprecated_apis - - dimensions: - - - ClusterName - - reason - - - ClusterName - label_matchers: [] - metric_name_selectors: - - apiserver_flowcontrol_rejected_requests_total - - dimensions: - - - ClusterName - - priority_level - - - ClusterName - label_matchers: [] - metric_name_selectors: - - apiserver_flowcontrol_request_concurrency_limit - - dimensions: - - - ClusterName - - - ClusterName - - ContainerName - - Namespace - - PodName - - - ClusterName - - ContainerName - - FullPodName - - Namespace - - PodName - - - ClusterName - - ContainerName - - FullPodName - - GpuDevice - - Namespace - - PodName - label_matchers: [ ] - metric_name_selectors: - - container_gpu_utilization - - container_gpu_memory_utilization - - container_gpu_memory_total - - container_gpu_memory_used - - container_gpu_power_draw - - container_gpu_temperature - - dimensions: - - - ClusterName - - 
- ClusterName - - Namespace - - - ClusterName - - Namespace - - Service - - - ClusterName - - Namespace - - PodName - - - ClusterName - - FullPodName - - Namespace - - PodName - - - ClusterName - - FullPodName - - GpuDevice - - Namespace - - PodName - label_matchers: [ ] - metric_name_selectors: - - pod_gpu_utilization - - pod_gpu_memory_utilization - - pod_gpu_memory_total - - pod_gpu_memory_used - - pod_gpu_power_draw - - pod_gpu_temperature - - dimensions: - - - ClusterName - - - ClusterName - - InstanceId - - NodeName - - - ClusterName - - GpuDevice - - InstanceId - - NodeName - label_matchers: [ ] - metric_name_selectors: - - node_gpu_utilization - - node_gpu_memory_utilization - - node_gpu_memory_total - - node_gpu_memory_used - - node_gpu_power_draw - - node_gpu_temperature - - node_gpu_fan_speed - - dimensions: - - - ClusterName - - InstanceId - - NodeName - - - ClusterName - label_matchers: [ ] - metric_name_selectors: - - node_gpu_total - - node_gpu_request - - node_gpu_limit - - dimensions: - - - ClusterName - label_matchers: [ ] - metric_name_selectors: - - cluster_gpu_request - - cluster_gpu_total - metric_descriptors: - - metric_name: apiserver_admission_controller_admission_duration_seconds - overwrite: true - unit: Seconds - - metric_name: apiserver_admission_step_admission_duration_seconds - overwrite: true - unit: Seconds - - metric_name: apiserver_admission_webhook_admission_duration_seconds - overwrite: true - unit: Seconds - - metric_name: apiserver_current_inflight_requests - overwrite: true - unit: Count - - metric_name: apiserver_current_inqueue_requests - overwrite: true - unit: Count - - metric_name: apiserver_flowcontrol_rejected_requests_total - overwrite: true - unit: Count - - metric_name: apiserver_flowcontrol_request_concurrency_limit - overwrite: true - unit: Count - - metric_name: apiserver_longrunning_requests - overwrite: true - unit: Count - - metric_name: apiserver_request_duration_seconds - overwrite: true - unit: Seconds - - 
metric_name: apiserver_request_total - overwrite: true - unit: Count - - metric_name: apiserver_request_total_5xx - overwrite: true - unit: Count - - metric_name: apiserver_requested_deprecated_apis - overwrite: true - unit: Count - - metric_name: apiserver_storage_objects - overwrite: true - unit: Count - - metric_name: etcd_request_duration_seconds - overwrite: true - unit: Seconds - - metric_name: apiserver_storage_list_duration_seconds - overwrite: true - unit: Seconds - - metric_name: apiserver_storage_db_total_size_in_bytes - overwrite: true - unit: Bytes - - metric_name: apiserver_storage_size_bytes - overwrite: true - unit: Bytes - - metric_name: etcd_db_total_size_in_bytes - overwrite: true - unit: Bytes - - metric_name: rest_client_request_duration_seconds - overwrite: true - unit: Seconds - - metric_name: rest_client_requests_total - overwrite: true - unit: Count - middleware: agenthealth/logs - namespace: ContainerInsights - no_verify_ssl: false - num_workers: 8 - output_destination: cloudwatch - parse_json_encoded_attr_values: - - Sources - - kubernetes - profile: default - proxy_address: "" - region: us-east-1 - request_timeout_seconds: 30 - resource_arn: "" - resource_to_telemetry_conversion: - enabled: true - retain_initial_value_of_delta_metric: false - role_arn: "" - shared_credentials_file: - - /root/.aws/credentials - version: "0" + awscloudwatchlogs/emf_logs: + certificate_file_path: "" + emf_only: true + endpoint: https://fake_endpoint + imds_retries: 2 + local_mode: false + log_group_name: emf/logs/default + log_retention: 0 + log_stream_name: host_name_from_env + max_retries: 2 + middleware: agenthealth/logs + no_verify_ssl: false + num_workers: 8 + profile: default + proxy_address: "" + raw_log: true + region: us-east-1 + request_timeout_seconds: 30 + resource_arn: "" + retry_on_failure: + enabled: true + initial_interval: 5s + max_elapsed_time: 5m0s + max_interval: 30s + multiplier: 1.5 + randomization_factor: 0.5 + role_arn: "" + 
sending_queue: + enabled: true + num_consumers: 1 + queue_size: 1000 + storage: null + shared_credentials_file: + - /root/.aws/credentials + awsemf/containerinsights: + certificate_file_path: "" + detailed_metrics: false + dimension_rollup_option: NoDimensionRollup + disable_metric_extraction: true + eks_fargate_container_insights_enabled: false + endpoint: https://fake_endpoint + enhanced_container_insights: true + imds_retries: 2 + local_mode: false + log_group_name: /aws/containerinsights/{ClusterName}/performance + log_retention: 0 + log_stream_name: '{NodeName}' + max_retries: 2 + metric_declarations: + - dimensions: + - - ClusterName + - - ClusterName + - ContainerName + - FullPodName + - Namespace + - PodName + - - ClusterName + - ContainerName + - Namespace + - PodName + label_matchers: [] + metric_name_selectors: + - container_cpu_utilization + - container_cpu_utilization_over_container_limit + - container_cpu_limit + - container_cpu_request + - container_memory_utilization + - container_memory_utilization_over_container_limit + - container_memory_failures_total + - container_memory_limit + - container_memory_request + - container_filesystem_usage + - container_filesystem_available + - container_filesystem_utilization + - dimensions: + - - ClusterName + - Namespace + - PodName + - - ClusterName + - - ClusterName + - Namespace + - Service + - - ClusterName + - Namespace + - - ClusterName + - FullPodName + - Namespace + - PodName + label_matchers: [] + metric_name_selectors: + - pod_cpu_utilization + - pod_memory_utilization + - pod_network_rx_bytes + - pod_network_tx_bytes + - pod_cpu_utilization_over_pod_limit + - pod_memory_utilization_over_pod_limit + - dimensions: + - - ClusterName + - FullPodName + - Namespace + - PodName + - - ClusterName + - Namespace + - PodName + - - ClusterName + - Namespace + - - ClusterName + label_matchers: [] + metric_name_selectors: + - pod_interface_network_rx_dropped + - pod_interface_network_tx_dropped + - dimensions: + - 
- ClusterName + - Namespace + - PodName + - - ClusterName + - - ClusterName + - FullPodName + - Namespace + - PodName + - - ClusterName + - Namespace + - Service + label_matchers: [] + metric_name_selectors: + - pod_cpu_reserved_capacity + - pod_memory_reserved_capacity + - pod_number_of_container_restarts + - pod_number_of_containers + - pod_number_of_running_containers + - pod_status_ready + - pod_status_scheduled + - pod_status_running + - pod_status_pending + - pod_status_failed + - pod_status_unknown + - pod_status_succeeded + - pod_memory_request + - pod_memory_limit + - pod_cpu_limit + - pod_cpu_request + - pod_container_status_running + - pod_container_status_terminated + - pod_container_status_waiting + - pod_container_status_waiting_reason_crash_loop_back_off + - pod_container_status_waiting_reason_image_pull_error + - pod_container_status_waiting_reason_start_error + - pod_container_status_waiting_reason_create_container_error + - pod_container_status_waiting_reason_create_container_config_error + - pod_container_status_terminated_reason_oom_killed + - dimensions: + - - ClusterName + - InstanceId + - NodeName + - - ClusterName + label_matchers: [] + metric_name_selectors: + - node_cpu_utilization + - node_memory_utilization + - node_network_total_bytes + - node_cpu_reserved_capacity + - node_memory_reserved_capacity + - node_number_of_running_pods + - node_number_of_running_containers + - node_cpu_usage_total + - node_cpu_limit + - node_memory_working_set + - node_memory_limit + - node_status_condition_ready + - node_status_condition_disk_pressure + - node_status_condition_memory_pressure + - node_status_condition_pid_pressure + - node_status_condition_network_unavailable + - node_status_condition_unknown + - node_status_capacity_pods + - node_status_allocatable_pods + - dimensions: + - - ClusterName + - InstanceId + - NodeName + - - ClusterName + label_matchers: [] + metric_name_selectors: + - node_interface_network_rx_dropped + - 
node_interface_network_tx_dropped + - node_diskio_io_service_bytes_total + - node_diskio_io_serviced_total + - dimensions: + - - ClusterName + - InstanceId + - NodeName + - - ClusterName + label_matchers: [] + metric_name_selectors: + - node_filesystem_utilization + - node_filesystem_inodes + - node_filesystem_inodes_free + - dimensions: + - - ClusterName + - Namespace + - Service + - - ClusterName + label_matchers: [] + metric_name_selectors: + - service_number_of_running_pods + - dimensions: + - - ClusterName + - Namespace + - PodName + - - ClusterName + label_matchers: [] + metric_name_selectors: + - replicas_desired + - replicas_ready + - status_replicas_available + - status_replicas_unavailable + - dimensions: + - - ClusterName + - Namespace + - PodName + - - ClusterName + label_matchers: [] + metric_name_selectors: + - daemonset_status_number_available + - daemonset_status_number_unavailable + - dimensions: + - - ClusterName + - Namespace + - - ClusterName + label_matchers: [] + metric_name_selectors: + - namespace_number_of_running_pods + - dimensions: + - - ClusterName + label_matchers: [] + metric_name_selectors: + - cluster_node_count + - cluster_failed_node_count + - cluster_number_of_running_pods + - dimensions: + - - ClusterName + - endpoint + - - ClusterName + label_matchers: [] + metric_name_selectors: + - apiserver_storage_size_bytes + - apiserver_storage_db_total_size_in_bytes + - etcd_db_total_size_in_bytes + - dimensions: + - - ClusterName + - resource + - - ClusterName + label_matchers: [] + metric_name_selectors: + - apiserver_storage_list_duration_seconds + - apiserver_longrunning_requests + - apiserver_storage_objects + - dimensions: + - - ClusterName + - verb + - - ClusterName + label_matchers: [] + metric_name_selectors: + - apiserver_request_duration_seconds + - rest_client_request_duration_seconds + - dimensions: + - - ClusterName + - code + - verb + - - ClusterName + label_matchers: [] + metric_name_selectors: + - apiserver_request_total 
+ - apiserver_request_total_5xx + - dimensions: + - - ClusterName + - operation + - - ClusterName + label_matchers: [] + metric_name_selectors: + - apiserver_admission_controller_admission_duration_seconds + - apiserver_admission_step_admission_duration_seconds + - etcd_request_duration_seconds + - dimensions: + - - ClusterName + - code + - method + - - ClusterName + label_matchers: [] + metric_name_selectors: + - rest_client_requests_total + - dimensions: + - - ClusterName + - request_kind + - - ClusterName + label_matchers: [] + metric_name_selectors: + - apiserver_current_inflight_requests + - apiserver_current_inqueue_requests + - dimensions: + - - ClusterName + - name + - - ClusterName + label_matchers: [] + metric_name_selectors: + - apiserver_admission_webhook_admission_duration_seconds + - dimensions: + - - ClusterName + - group + - - ClusterName + label_matchers: [] + metric_name_selectors: + - apiserver_requested_deprecated_apis + - dimensions: + - - ClusterName + - reason + - - ClusterName + label_matchers: [] + metric_name_selectors: + - apiserver_flowcontrol_rejected_requests_total + - dimensions: + - - ClusterName + - priority_level + - - ClusterName + label_matchers: [] + metric_name_selectors: + - apiserver_flowcontrol_request_concurrency_limit + - dimensions: + - - ClusterName + - - ClusterName + - ContainerName + - Namespace + - PodName + - - ClusterName + - ContainerName + - FullPodName + - Namespace + - PodName + - - ClusterName + - ContainerName + - FullPodName + - GpuDevice + - Namespace + - PodName + label_matchers: [] + metric_name_selectors: + - container_gpu_utilization + - container_gpu_memory_utilization + - container_gpu_memory_total + - container_gpu_memory_used + - container_gpu_power_draw + - container_gpu_temperature + - dimensions: + - - ClusterName + - - ClusterName + - Namespace + - - ClusterName + - Namespace + - Service + - - ClusterName + - Namespace + - PodName + - - ClusterName + - FullPodName + - Namespace + - PodName + - - 
ClusterName + - FullPodName + - GpuDevice + - Namespace + - PodName + label_matchers: [] + metric_name_selectors: + - pod_gpu_utilization + - pod_gpu_memory_utilization + - pod_gpu_memory_total + - pod_gpu_memory_used + - pod_gpu_power_draw + - pod_gpu_temperature + - dimensions: + - - ClusterName + - - ClusterName + - InstanceId + - NodeName + - - ClusterName + - GpuDevice + - InstanceId + - NodeName + label_matchers: [] + metric_name_selectors: + - node_gpu_utilization + - node_gpu_memory_utilization + - node_gpu_memory_total + - node_gpu_memory_used + - node_gpu_power_draw + - node_gpu_temperature + - node_gpu_fan_speed + - dimensions: + - - ClusterName + - InstanceId + - NodeName + - - ClusterName + label_matchers: [] + metric_name_selectors: + - node_gpu_total + - node_gpu_request + - node_gpu_limit + - dimensions: + - - ClusterName + label_matchers: [] + metric_name_selectors: + - cluster_gpu_request + - cluster_gpu_total + metric_descriptors: + - metric_name: apiserver_admission_controller_admission_duration_seconds + overwrite: true + unit: Seconds + - metric_name: apiserver_admission_step_admission_duration_seconds + overwrite: true + unit: Seconds + - metric_name: apiserver_admission_webhook_admission_duration_seconds + overwrite: true + unit: Seconds + - metric_name: apiserver_current_inflight_requests + overwrite: true + unit: Count + - metric_name: apiserver_current_inqueue_requests + overwrite: true + unit: Count + - metric_name: apiserver_flowcontrol_rejected_requests_total + overwrite: true + unit: Count + - metric_name: apiserver_flowcontrol_request_concurrency_limit + overwrite: true + unit: Count + - metric_name: apiserver_longrunning_requests + overwrite: true + unit: Count + - metric_name: apiserver_request_duration_seconds + overwrite: true + unit: Seconds + - metric_name: apiserver_request_total + overwrite: true + unit: Count + - metric_name: apiserver_request_total_5xx + overwrite: true + unit: Count + - metric_name: 
apiserver_requested_deprecated_apis + overwrite: true + unit: Count + - metric_name: apiserver_storage_objects + overwrite: true + unit: Count + - metric_name: etcd_request_duration_seconds + overwrite: true + unit: Seconds + - metric_name: apiserver_storage_list_duration_seconds + overwrite: true + unit: Seconds + - metric_name: apiserver_storage_db_total_size_in_bytes + overwrite: true + unit: Bytes + - metric_name: apiserver_storage_size_bytes + overwrite: true + unit: Bytes + - metric_name: etcd_db_total_size_in_bytes + overwrite: true + unit: Bytes + - metric_name: rest_client_request_duration_seconds + overwrite: true + unit: Seconds + - metric_name: rest_client_requests_total + overwrite: true + unit: Count + middleware: agenthealth/logs + namespace: ContainerInsights + no_verify_ssl: false + num_workers: 8 + output_destination: cloudwatch + parse_json_encoded_attr_values: + - Sources + - kubernetes + profile: default + proxy_address: "" + region: us-east-1 + request_timeout_seconds: 30 + resource_arn: "" + resource_to_telemetry_conversion: + enabled: true + retain_initial_value_of_delta_metric: false + role_arn: "" + shared_credentials_file: + - /root/.aws/credentials + version: "0" extensions: - agenthealth/logs: - is_usage_data_enabled: true - stats: - operations: - - PutLogEvents + agenthealth/logs: + is_usage_data_enabled: true + stats: + operations: + - PutLogEvents processors: - batch/containerinsights: - metadata_cardinality_limit: 1000 - metadata_keys: [] - send_batch_max_size: 0 - send_batch_size: 8192 - timeout: 5s - batch/emf_logs: - metadata_cardinality_limit: 1000 - metadata_keys: [] - send_batch_max_size: 0 - send_batch_size: 8192 - timeout: 5s - metricstransform/containerinsights: - transforms: - - action: insert - aggregation_type: "" - experimental_match_labels: - code: ^5.* - group_resource_labels: { } - include: apiserver_request_total - match_type: regexp - new_name: apiserver_request_total_5xx - operations: [ ] - submatch_case: "" - - 
action: insert - aggregation_type: "" - experimental_match_labels: { } - group_resource_labels: { } - include: DCGM_FI_DEV_FB_USED - match_type: "" - new_name: container_gpu_memory_used - operations: - - action: add_label - aggregated_values: [ ] - aggregation_type: "" - experimental_scale: 0 - label: "" - label_set: [ ] - label_value: "" - new_label: Type - new_value: ContainerGPU - value_actions: [ ] - submatch_case: "" - - action: insert - aggregation_type: "" - experimental_match_labels: { } - group_resource_labels: { } - include: DCGM_FI_DEV_FB_USED - match_type: "" - new_name: pod_gpu_memory_used - operations: - - action: add_label - aggregated_values: [ ] - aggregation_type: "" - experimental_scale: 0 - label: "" - label_set: [ ] - label_value: "" - new_label: Type - new_value: PodGPU - value_actions: [ ] - submatch_case: "" - - action: insert - aggregation_type: "" - experimental_match_labels: { } - group_resource_labels: { } - include: DCGM_FI_DEV_FB_USED - match_type: "" - new_name: node_gpu_memory_used - operations: - - action: add_label - aggregated_values: [ ] - aggregation_type: "" - experimental_scale: 0 - label: "" - label_set: [ ] - label_value: "" - new_label: Type - new_value: NodeGPU - value_actions: [ ] - submatch_case: "" - - action: insert - aggregation_type: "" - experimental_match_labels: { } - group_resource_labels: { } - include: DCGM_FI_DEV_FB_TOTAL - match_type: "" - new_name: container_gpu_memory_total - operations: - - action: add_label - aggregated_values: [ ] - aggregation_type: "" - experimental_scale: 0 - label: "" - label_set: [ ] - label_value: "" - new_label: Type - new_value: ContainerGPU - value_actions: [ ] - submatch_case: "" - - action: insert - aggregation_type: "" - experimental_match_labels: { } - group_resource_labels: { } - include: DCGM_FI_DEV_FB_TOTAL - match_type: "" - new_name: pod_gpu_memory_total - operations: - - action: add_label - aggregated_values: [ ] - aggregation_type: "" - experimental_scale: 0 - label: 
"" - label_set: [ ] - label_value: "" - new_label: Type - new_value: PodGPU - value_actions: [ ] - submatch_case: "" - - action: insert - aggregation_type: "" - experimental_match_labels: { } - group_resource_labels: { } - include: DCGM_FI_DEV_FB_TOTAL - match_type: "" - new_name: node_gpu_memory_total - operations: - - action: add_label - aggregated_values: [ ] - aggregation_type: "" - experimental_scale: 0 - label: "" - label_set: [ ] - label_value: "" - new_label: Type - new_value: NodeGPU - value_actions: [ ] - submatch_case: "" - - action: insert - aggregation_type: "" - experimental_match_labels: { } - group_resource_labels: { } - include: DCGM_FI_DEV_GPU_TEMP - match_type: "" - new_name: container_gpu_temperature - operations: - - action: add_label - aggregated_values: [ ] - aggregation_type: "" - experimental_scale: 0 - label: "" - label_set: [ ] - label_value: "" - new_label: Type - new_value: ContainerGPU - value_actions: [ ] - submatch_case: "" - - action: insert - aggregation_type: "" - experimental_match_labels: { } - group_resource_labels: { } - include: DCGM_FI_DEV_GPU_TEMP - match_type: "" - new_name: pod_gpu_temperature - operations: - - action: add_label - aggregated_values: [ ] - aggregation_type: "" - experimental_scale: 0 - label: "" - label_set: [ ] - label_value: "" - new_label: Type - new_value: PodGPU - value_actions: [ ] - submatch_case: "" - - action: insert - aggregation_type: "" - experimental_match_labels: { } - group_resource_labels: { } - include: DCGM_FI_DEV_GPU_TEMP - match_type: "" - new_name: node_gpu_temperature - operations: - - action: add_label - aggregated_values: [ ] - aggregation_type: "" - experimental_scale: 0 - label: "" - label_set: [ ] - label_value: "" - new_label: Type - new_value: NodeGPU - value_actions: [ ] - submatch_case: "" - - action: insert - aggregation_type: "" - experimental_match_labels: { } - group_resource_labels: { } - include: DCGM_FI_DEV_POWER_USAGE - match_type: "" - new_name: 
container_gpu_power_draw - operations: - - action: add_label - aggregated_values: [ ] - aggregation_type: "" - experimental_scale: 0 - label: "" - label_set: [ ] - label_value: "" - new_label: Type - new_value: ContainerGPU - value_actions: [ ] - submatch_case: "" - - action: insert - aggregation_type: "" - experimental_match_labels: { } - group_resource_labels: { } - include: DCGM_FI_DEV_POWER_USAGE - match_type: "" - new_name: pod_gpu_power_draw - operations: - - action: add_label - aggregated_values: [ ] - aggregation_type: "" - experimental_scale: 0 - label: "" - label_set: [ ] - label_value: "" - new_label: Type - new_value: PodGPU - value_actions: [ ] - submatch_case: "" - - action: insert - aggregation_type: "" - experimental_match_labels: { } - group_resource_labels: { } - include: DCGM_FI_DEV_POWER_USAGE - match_type: "" - new_name: node_gpu_power_draw - operations: - - action: add_label - aggregated_values: [ ] - aggregation_type: "" - experimental_scale: 0 - label: "" - label_set: [ ] - label_value: "" - new_label: Type - new_value: NodeGPU - value_actions: [ ] - submatch_case: "" - - action: insert - aggregation_type: "" - experimental_match_labels: { } - group_resource_labels: { } - include: DCGM_FI_DEV_GPU_UTIL - match_type: "" - new_name: container_gpu_utilization - operations: - - action: add_label - aggregated_values: [ ] - aggregation_type: "" - experimental_scale: 0 - label: "" - label_set: [ ] - label_value: "" - new_label: Type - new_value: ContainerGPU - value_actions: [ ] - submatch_case: "" - - action: insert - aggregation_type: "" - experimental_match_labels: { } - group_resource_labels: { } - include: DCGM_FI_DEV_GPU_UTIL - match_type: "" - new_name: pod_gpu_utilization - operations: - - action: add_label - aggregated_values: [ ] - aggregation_type: "" - experimental_scale: 0 - label: "" - label_set: [ ] - label_value: "" - new_label: Type - new_value: PodGPU - value_actions: [ ] - submatch_case: "" - - action: insert - aggregation_type: 
"" - experimental_match_labels: { } - group_resource_labels: { } - include: DCGM_FI_DEV_GPU_UTIL - match_type: "" - new_name: node_gpu_utilization - operations: - - action: add_label - aggregated_values: [ ] - aggregation_type: "" - experimental_scale: 0 - label: "" - label_set: [ ] - label_value: "" - new_label: Type - new_value: NodeGPU - value_actions: [ ] - submatch_case: "" - - action: insert - aggregation_type: "" - experimental_match_labels: { } - group_resource_labels: { } - include: DCGM_FI_DEV_FB_USED_PERCENT - match_type: "" - new_name: container_gpu_memory_utilization - operations: - - action: add_label - aggregated_values: [ ] - aggregation_type: "" - experimental_scale: 0 - label: "" - label_set: [ ] - label_value: "" - new_label: Type - new_value: ContainerGPU - value_actions: [ ] - - action: experimental_scale_value - aggregated_values: [ ] - aggregation_type: "" - experimental_scale: 100 - label: "" - label_set: [ ] - label_value: "" - new_label: "" - new_value: "" - value_actions: [ ] - submatch_case: "" - - action: insert - aggregation_type: "" - experimental_match_labels: { } - group_resource_labels: { } - include: DCGM_FI_DEV_FB_USED_PERCENT - match_type: "" - new_name: pod_gpu_memory_utilization - operations: - - action: add_label - aggregated_values: [ ] - aggregation_type: "" - experimental_scale: 0 - label: "" - label_set: [ ] - label_value: "" - new_label: Type - new_value: PodGPU - value_actions: [ ] - - action: experimental_scale_value - aggregated_values: [ ] - aggregation_type: "" - experimental_scale: 100 - label: "" - label_set: [ ] - label_value: "" - new_label: "" - new_value: "" - value_actions: [ ] - submatch_case: "" - - action: insert - aggregation_type: "" - experimental_match_labels: { } - group_resource_labels: { } - include: DCGM_FI_DEV_FB_USED_PERCENT - match_type: "" - new_name: node_gpu_memory_utilization - operations: - - action: add_label - aggregated_values: [ ] - aggregation_type: "" - experimental_scale: 0 - label: 
"" - label_set: [ ] - label_value: "" - new_label: Type - new_value: NodeGPU - value_actions: [ ] - - action: experimental_scale_value - aggregated_values: [ ] - aggregation_type: "" - experimental_scale: 100 - label: "" - label_set: [ ] - label_value: "" - new_label: "" - new_value: "" - value_actions: [ ] - submatch_case: "" - gpu/containerinsights: {} - + batch/containerinsights: + metadata_cardinality_limit: 1000 + metadata_keys: [] + send_batch_max_size: 0 + send_batch_size: 8192 + timeout: 5s + batch/emf_logs: + metadata_cardinality_limit: 1000 + metadata_keys: [] + send_batch_max_size: 0 + send_batch_size: 8192 + timeout: 5s + gpu/containerinsights: {} + metricstransform/containerinsights: + transforms: + - action: insert + aggregation_type: "" + experimental_match_labels: + code: ^5.* + group_resource_labels: {} + include: apiserver_request_total + match_type: regexp + new_name: apiserver_request_total_5xx + operations: [] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: {} + group_resource_labels: {} + include: DCGM_FI_DEV_FB_USED_PERCENT + match_type: "" + new_name: container_gpu_memory_utilization + operations: + - action: add_label + aggregated_values: [] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [] + label_value: "" + new_label: Type + new_value: ContainerGPU + value_actions: [] + - action: experimental_scale_value + aggregated_values: [] + aggregation_type: "" + experimental_scale: 100 + label: "" + label_set: [] + label_value: "" + new_label: "" + new_value: "" + value_actions: [] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: {} + group_resource_labels: {} + include: DCGM_FI_DEV_FB_USED_PERCENT + match_type: "" + new_name: pod_gpu_memory_utilization + operations: + - action: add_label + aggregated_values: [] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [] + label_value: "" + new_label: Type + new_value: 
PodGPU + value_actions: [] + - action: experimental_scale_value + aggregated_values: [] + aggregation_type: "" + experimental_scale: 100 + label: "" + label_set: [] + label_value: "" + new_label: "" + new_value: "" + value_actions: [] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: {} + group_resource_labels: {} + include: DCGM_FI_DEV_FB_USED_PERCENT + match_type: "" + new_name: node_gpu_memory_utilization + operations: + - action: add_label + aggregated_values: [] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [] + label_value: "" + new_label: Type + new_value: NodeGPU + value_actions: [] + - action: experimental_scale_value + aggregated_values: [] + aggregation_type: "" + experimental_scale: 100 + label: "" + label_set: [] + label_value: "" + new_label: "" + new_value: "" + value_actions: [] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: {} + group_resource_labels: {} + include: DCGM_FI_DEV_FB_USED + match_type: "" + new_name: container_gpu_memory_used + operations: + - action: add_label + aggregated_values: [] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [] + label_value: "" + new_label: Type + new_value: ContainerGPU + value_actions: [] + - action: experimental_scale_value + aggregated_values: [] + aggregation_type: "" + experimental_scale: 1.048576e+06 + label: "" + label_set: [] + label_value: "" + new_label: "" + new_value: "" + value_actions: [] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: {} + group_resource_labels: {} + include: DCGM_FI_DEV_FB_USED + match_type: "" + new_name: pod_gpu_memory_used + operations: + - action: add_label + aggregated_values: [] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [] + label_value: "" + new_label: Type + new_value: PodGPU + value_actions: [] + - action: experimental_scale_value + aggregated_values: [] + 
aggregation_type: "" + experimental_scale: 1.048576e+06 + label: "" + label_set: [] + label_value: "" + new_label: "" + new_value: "" + value_actions: [] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: {} + group_resource_labels: {} + include: DCGM_FI_DEV_FB_USED + match_type: "" + new_name: node_gpu_memory_used + operations: + - action: add_label + aggregated_values: [] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [] + label_value: "" + new_label: Type + new_value: NodeGPU + value_actions: [] + - action: experimental_scale_value + aggregated_values: [] + aggregation_type: "" + experimental_scale: 1.048576e+06 + label: "" + label_set: [] + label_value: "" + new_label: "" + new_value: "" + value_actions: [] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: {} + group_resource_labels: {} + include: DCGM_FI_DEV_FB_TOTAL + match_type: "" + new_name: container_gpu_memory_total + operations: + - action: add_label + aggregated_values: [] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [] + label_value: "" + new_label: Type + new_value: ContainerGPU + value_actions: [] + - action: experimental_scale_value + aggregated_values: [] + aggregation_type: "" + experimental_scale: 1.048576e+06 + label: "" + label_set: [] + label_value: "" + new_label: "" + new_value: "" + value_actions: [] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: {} + group_resource_labels: {} + include: DCGM_FI_DEV_FB_TOTAL + match_type: "" + new_name: pod_gpu_memory_total + operations: + - action: add_label + aggregated_values: [] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [] + label_value: "" + new_label: Type + new_value: PodGPU + value_actions: [] + - action: experimental_scale_value + aggregated_values: [] + aggregation_type: "" + experimental_scale: 1.048576e+06 + label: "" + label_set: [] + 
label_value: "" + new_label: "" + new_value: "" + value_actions: [] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: {} + group_resource_labels: {} + include: DCGM_FI_DEV_FB_TOTAL + match_type: "" + new_name: node_gpu_memory_total + operations: + - action: add_label + aggregated_values: [] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [] + label_value: "" + new_label: Type + new_value: NodeGPU + value_actions: [] + - action: experimental_scale_value + aggregated_values: [] + aggregation_type: "" + experimental_scale: 1.048576e+06 + label: "" + label_set: [] + label_value: "" + new_label: "" + new_value: "" + value_actions: [] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: {} + group_resource_labels: {} + include: DCGM_FI_DEV_GPU_TEMP + match_type: "" + new_name: container_gpu_temperature + operations: + - action: add_label + aggregated_values: [] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [] + label_value: "" + new_label: Type + new_value: ContainerGPU + value_actions: [] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: {} + group_resource_labels: {} + include: DCGM_FI_DEV_GPU_TEMP + match_type: "" + new_name: pod_gpu_temperature + operations: + - action: add_label + aggregated_values: [] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [] + label_value: "" + new_label: Type + new_value: PodGPU + value_actions: [] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: {} + group_resource_labels: {} + include: DCGM_FI_DEV_GPU_TEMP + match_type: "" + new_name: node_gpu_temperature + operations: + - action: add_label + aggregated_values: [] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [] + label_value: "" + new_label: Type + new_value: NodeGPU + value_actions: [] + submatch_case: "" + - action: insert + 
aggregation_type: "" + experimental_match_labels: {} + group_resource_labels: {} + include: DCGM_FI_DEV_POWER_USAGE + match_type: "" + new_name: container_gpu_power_draw + operations: + - action: add_label + aggregated_values: [] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [] + label_value: "" + new_label: Type + new_value: ContainerGPU + value_actions: [] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: {} + group_resource_labels: {} + include: DCGM_FI_DEV_POWER_USAGE + match_type: "" + new_name: pod_gpu_power_draw + operations: + - action: add_label + aggregated_values: [] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [] + label_value: "" + new_label: Type + new_value: PodGPU + value_actions: [] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: {} + group_resource_labels: {} + include: DCGM_FI_DEV_POWER_USAGE + match_type: "" + new_name: node_gpu_power_draw + operations: + - action: add_label + aggregated_values: [] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [] + label_value: "" + new_label: Type + new_value: NodeGPU + value_actions: [] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: {} + group_resource_labels: {} + include: DCGM_FI_DEV_GPU_UTIL + match_type: "" + new_name: container_gpu_utilization + operations: + - action: add_label + aggregated_values: [] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [] + label_value: "" + new_label: Type + new_value: ContainerGPU + value_actions: [] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: {} + group_resource_labels: {} + include: DCGM_FI_DEV_GPU_UTIL + match_type: "" + new_name: pod_gpu_utilization + operations: + - action: add_label + aggregated_values: [] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [] + label_value: 
"" + new_label: Type + new_value: PodGPU + value_actions: [] + submatch_case: "" + - action: insert + aggregation_type: "" + experimental_match_labels: {} + group_resource_labels: {} + include: DCGM_FI_DEV_GPU_UTIL + match_type: "" + new_name: node_gpu_utilization + operations: + - action: add_label + aggregated_values: [] + aggregation_type: "" + experimental_scale: 0 + label: "" + label_set: [] + label_value: "" + new_label: Type + new_value: NodeGPU + value_actions: [] + submatch_case: "" receivers: - awscontainerinsightreceiver: - add_container_name_metric_label: true - add_full_pod_name_metric_label: true - add_service_as_attribute: true - certificate_file_path: "" - cluster_name: TestCluster - collection_interval: 30s - container_orchestrator: eks - enable_control_plane_metrics: true - endpoint: "" - imds_retries: 2 - leader_lock_name: cwagent-clusterleader - leader_lock_using_config_map_only: true - local_mode: false - max_retries: 0 - no_verify_ssl: false - num_workers: 0 - prefer_full_pod_name: true - profile: default - proxy_address: "" - region: us-east-1 - request_timeout_seconds: 0 - resource_arn: "" - role_arn: "" - shared_credentials_file: - - /root/.aws/credentials - gpu_metrics: true - tcplog/emf_logs: - attributes: {} - encoding: utf-8 - id: tcp_input - listen_address: 0.0.0.0:25888 - operators: [] - output: [] - resource: {} - retry_on_failure: - enabled: false - initial_interval: 0s - max_elapsed_time: 0s - max_interval: 0s - storage: null - type: tcp_input - udplog/emf_logs: - attributes: {} - encoding: utf-8 - id: udp_input - listen_address: 0.0.0.0:25888 - multiline: - line_end_pattern: .^ - line_start_pattern: "" - omit_pattern: false - operators: [] - output: [] - resource: {} - retry_on_failure: - enabled: false - initial_interval: 0s - max_elapsed_time: 0s - max_interval: 0s - storage: null - type: udp_input + awscontainerinsightreceiver: + add_container_name_metric_label: true + add_full_pod_name_metric_label: true + 
add_service_as_attribute: true + certificate_file_path: "" + cluster_name: TestCluster + collection_interval: 30s + container_orchestrator: eks + enable_control_plane_metrics: true + endpoint: "" + gpu_metrics: true + imds_retries: 2 + leader_lock_name: cwagent-clusterleader + leader_lock_using_config_map_only: true + local_mode: false + max_retries: 0 + no_verify_ssl: false + num_workers: 0 + prefer_full_pod_name: true + profile: default + proxy_address: "" + region: us-east-1 + request_timeout_seconds: 0 + resource_arn: "" + role_arn: "" + shared_credentials_file: + - /root/.aws/credentials + tcplog/emf_logs: + attributes: {} + encoding: utf-8 + id: tcp_input + listen_address: 0.0.0.0:25888 + operators: [] + output: [] + resource: {} + retry_on_failure: + enabled: false + initial_interval: 0s + max_elapsed_time: 0s + max_interval: 0s + storage: null + type: tcp_input + udplog/emf_logs: + attributes: {} + encoding: utf-8 + id: udp_input + listen_address: 0.0.0.0:25888 + multiline: + line_end_pattern: .^ + line_start_pattern: "" + omit_pattern: false + operators: [] + output: [] + resource: {} + retry_on_failure: + enabled: false + initial_interval: 0s + max_elapsed_time: 0s + max_interval: 0s + storage: null + type: udp_input service: - extensions: - - agenthealth/logs - pipelines: - logs/emf_logs: - exporters: - - awscloudwatchlogs/emf_logs - processors: - - batch/emf_logs - receivers: - - tcplog/emf_logs - - udplog/emf_logs - metrics/containerinsights: - exporters: - - awsemf/containerinsights - processors: - - metricstransform/containerinsights - - batch/containerinsights - - gpu/containerinsights - receivers: - - awscontainerinsightreceiver - telemetry: - logs: - development: false - disable_caller: false - disable_stacktrace: false - encoding: console - error_output_paths: [] - initial_fields: {} - level: info - output_paths: [] - sampling: - enabled: true - initial: 2 - thereafter: 500 - tick: 10s - metrics: - address: "" - level: None - readers: [] - 
resource: {} - traces: - processors: [] - propagators: [] + extensions: + - agenthealth/logs + pipelines: + logs/emf_logs: + exporters: + - awscloudwatchlogs/emf_logs + processors: + - batch/emf_logs + receivers: + - tcplog/emf_logs + - udplog/emf_logs + metrics/containerinsights: + exporters: + - awsemf/containerinsights + processors: + - metricstransform/containerinsights + - batch/containerinsights + - gpu/containerinsights + receivers: + - awscontainerinsightreceiver + telemetry: + logs: + development: false + disable_caller: false + disable_stacktrace: false + encoding: console + error_output_paths: [] + initial_fields: {} + level: info + output_paths: [] + sampling: + enabled: true + initial: 2 + thereafter: 500 + tick: 10s + metrics: + address: "" + level: None + readers: [] + resource: {} + traces: + processors: [] + propagators: [] From 887a9d7fab002a0548120ca007d850293842ddbb Mon Sep 17 00:00:00 2001 From: Hyunsoo Kim Date: Tue, 27 Feb 2024 15:13:36 -0500 Subject: [PATCH 10/20] rename gpu processor package to gpuattributes and address comments --- .../{gpu => gpuattributes}/config.go | 2 +- .../{gpu => gpuattributes}/config_test.go | 2 +- .../{gpu => gpuattributes}/factory.go | 6 +-- .../{gpu => gpuattributes}/factory_test.go | 2 +- .../{gpu => gpuattributes}/processor.go | 40 +++++-------------- .../{gpu => gpuattributes}/processor_test.go | 2 +- service/defaultcomponents/components.go | 4 +- .../otel/processor/gpu/translator.go | 6 +-- .../metricstransformprocessor/translator.go | 10 ----- 9 files changed, 22 insertions(+), 52 deletions(-) rename plugins/processors/{gpu => gpuattributes}/config.go (95%) rename plugins/processors/{gpu => gpuattributes}/config_test.go (95%) rename plugins/processors/{gpu => gpuattributes}/factory.go (86%) rename plugins/processors/{gpu => gpuattributes}/factory_test.go (98%) rename plugins/processors/{gpu => gpuattributes}/processor.go (83%) rename plugins/processors/{gpu => gpuattributes}/processor_test.go (99%) diff 
--git a/plugins/processors/gpu/config.go b/plugins/processors/gpuattributes/config.go similarity index 95% rename from plugins/processors/gpu/config.go rename to plugins/processors/gpuattributes/config.go index b72cbdc39c..6dfd340d45 100644 --- a/plugins/processors/gpu/config.go +++ b/plugins/processors/gpuattributes/config.go @@ -1,7 +1,7 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: MIT -package gpu +package gpuattributes import ( "go.opentelemetry.io/collector/component" diff --git a/plugins/processors/gpu/config_test.go b/plugins/processors/gpuattributes/config_test.go similarity index 95% rename from plugins/processors/gpu/config_test.go rename to plugins/processors/gpuattributes/config_test.go index db3918301a..50f76e3ac8 100644 --- a/plugins/processors/gpu/config_test.go +++ b/plugins/processors/gpuattributes/config_test.go @@ -1,7 +1,7 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: MIT -package gpu +package gpuattributes import ( "testing" diff --git a/plugins/processors/gpu/factory.go b/plugins/processors/gpuattributes/factory.go similarity index 86% rename from plugins/processors/gpu/factory.go rename to plugins/processors/gpuattributes/factory.go index 9c7c61b21a..aae0a144b9 100644 --- a/plugins/processors/gpu/factory.go +++ b/plugins/processors/gpuattributes/factory.go @@ -1,7 +1,7 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: MIT -package gpu +package gpuattributes import ( "context" @@ -50,7 +50,5 @@ func createMetricsProcessor( cfg, nextConsumer, metricsProcessor.processMetrics, - processorhelper.WithCapabilities(processorCapabilities), - processorhelper.WithStart(metricsProcessor.Start), - processorhelper.WithShutdown(metricsProcessor.Shutdown)) + processorhelper.WithCapabilities(processorCapabilities)) } diff --git a/plugins/processors/gpu/factory_test.go b/plugins/processors/gpuattributes/factory_test.go similarity index 98% rename from plugins/processors/gpu/factory_test.go rename to plugins/processors/gpuattributes/factory_test.go index bae457d92b..7fd46aca74 100644 --- a/plugins/processors/gpu/factory_test.go +++ b/plugins/processors/gpuattributes/factory_test.go @@ -1,7 +1,7 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: MIT -package gpu +package gpuattributes import ( "context" diff --git a/plugins/processors/gpu/processor.go b/plugins/processors/gpuattributes/processor.go similarity index 83% rename from plugins/processors/gpu/processor.go rename to plugins/processors/gpuattributes/processor.go index 3c2493bd41..9a6066040c 100644 --- a/plugins/processors/gpu/processor.go +++ b/plugins/processors/gpuattributes/processor.go @@ -1,14 +1,13 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: MIT -package gpu +package gpuattributes import ( "context" "encoding/json" "strings" - "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/pdata/pcommon" "go.opentelemetry.io/collector/pdata/pmetric" "go.uber.org/zap" @@ -34,6 +33,8 @@ var podContainerMetricLabels = map[string]map[string]interface{}{ "Namespace": nil, "Sources": nil, "UUID": nil, + "Service": nil, + "GpuDevice": nil, "kubernetes": nil, } @@ -52,26 +53,18 @@ var nodeMetricLabels = map[string]map[string]interface{}{ type gpuprocessor struct { *Config - logger *zap.Logger - cancelFunc context.CancelFunc - shutdownC chan bool - started bool + logger *zap.Logger } func newGpuProcessor(config *Config, logger *zap.Logger) *gpuprocessor { - _, cancel := context.WithCancel(context.Background()) d := &gpuprocessor{ - Config: config, - logger: logger, - cancelFunc: cancel, + Config: config, + logger: logger, } return d } -func (d *gpuprocessor) processMetrics(ctx context.Context, md pmetric.Metrics) (pmetric.Metrics, error) { - if !d.started { - return pmetric.NewMetrics(), nil - } +func (d *gpuprocessor) processMetrics(_ context.Context, md pmetric.Metrics) (pmetric.Metrics, error) { rms := md.ResourceMetrics() for i := 0; i < rms.Len(); i++ { @@ -82,14 +75,14 @@ func (d *gpuprocessor) processMetrics(ctx context.Context, md pmetric.Metrics) ( metrics := ils.Metrics() for k := 0; k < metrics.Len(); k++ { m := metrics.At(k) - d.processMetricAttributes(ctx, m) + d.processMetricAttributes(m) } } } return md, nil } -func (d *gpuprocessor) processMetricAttributes(_ context.Context, m pmetric.Metric) { +func (d *gpuprocessor) processMetricAttributes(m pmetric.Metric) { // only decorate GPU metrics // another option is to separate GPU of its own pipeline to minimize extra processing of metrics if !strings.Contains(m.Name(), gpuMetric) { @@ -101,6 +94,7 @@ func (d *gpuprocessor) processMetricAttributes(_ context.Context, m pmetric.Metr labels = nodeMetricLabels } else 
if strings.HasPrefix(m.Name(), gpuContainerMetricPrefix) { labels = podContainerMetricLabels + labels["ContainerName"] = nil labels["kubernetes"] = map[string]interface{}{ "container_name": nil, "containerd": nil, @@ -139,7 +133,7 @@ func (d *gpuprocessor) processMetricAttributes(_ context.Context, m pmetric.Metr } func (d *gpuprocessor) filterAttributes(attributes pcommon.Map, labels map[string]map[string]interface{}) { - if len(labels) < 1 { + if len(labels) == 0 { return } // remove labels that are no in the keep list @@ -180,15 +174,3 @@ func (d *gpuprocessor) filterAttributes(attributes pcommon.Map, labels map[strin } } } - -func (d *gpuprocessor) Shutdown(context.Context) error { - close(d.shutdownC) - d.cancelFunc() - return nil -} - -func (d *gpuprocessor) Start(ctx context.Context, _ component.Host) error { - d.shutdownC = make(chan bool) - d.started = true - return nil -} diff --git a/plugins/processors/gpu/processor_test.go b/plugins/processors/gpuattributes/processor_test.go similarity index 99% rename from plugins/processors/gpu/processor_test.go rename to plugins/processors/gpuattributes/processor_test.go index f5a8bcaf4a..7b4d73887f 100644 --- a/plugins/processors/gpu/processor_test.go +++ b/plugins/processors/gpuattributes/processor_test.go @@ -1,7 +1,7 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: MIT -package gpu +package gpuattributes import ( "context" diff --git a/service/defaultcomponents/components.go b/service/defaultcomponents/components.go index 0d14c9232f..2d575aed4e 100644 --- a/service/defaultcomponents/components.go +++ b/service/defaultcomponents/components.go @@ -29,7 +29,7 @@ import ( "github.com/aws/amazon-cloudwatch-agent/plugins/outputs/cloudwatch" "github.com/aws/amazon-cloudwatch-agent/plugins/processors/awsappsignals" "github.com/aws/amazon-cloudwatch-agent/plugins/processors/ec2tagger" - "github.com/aws/amazon-cloudwatch-agent/plugins/processors/gpu" + "github.com/aws/amazon-cloudwatch-agent/plugins/processors/gpuattributes" ) func Factories() (otelcol.Factories, error) { @@ -54,7 +54,7 @@ func Factories() (otelcol.Factories, error) { metricstransformprocessor.NewFactory(), resourcedetectionprocessor.NewFactory(), transformprocessor.NewFactory(), - gpu.NewFactory(), + gpuattributes.NewFactory(), ); err != nil { return otelcol.Factories{}, err } diff --git a/translator/translate/otel/processor/gpu/translator.go b/translator/translate/otel/processor/gpu/translator.go index 3f542a469c..ab7538ee0f 100644 --- a/translator/translate/otel/processor/gpu/translator.go +++ b/translator/translate/otel/processor/gpu/translator.go @@ -8,7 +8,7 @@ import ( "go.opentelemetry.io/collector/confmap" "go.opentelemetry.io/collector/processor" - "github.com/aws/amazon-cloudwatch-agent/plugins/processors/gpu" + "github.com/aws/amazon-cloudwatch-agent/plugins/processors/gpuattributes" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/common" ) @@ -20,7 +20,7 @@ type translator struct { var _ common.Translator[component.Config] = (*translator)(nil) func NewTranslatorWithName(name string) common.Translator[component.Config] { - return &translator{name, gpu.NewFactory()} + return &translator{name, gpuattributes.NewFactory()} } func (t *translator) ID() component.ID { @@ -28,6 +28,6 @@ func (t *translator) ID() 
component.ID { } func (t *translator) Translate(conf *confmap.Conf) (component.Config, error) { - cfg := t.factory.CreateDefaultConfig().(*gpu.Config) + cfg := t.factory.CreateDefaultConfig().(*gpuattributes.Config) return cfg, nil } diff --git a/translator/translate/otel/processor/metricstransformprocessor/translator.go b/translator/translate/otel/processor/metricstransformprocessor/translator.go index 855bfa93d0..fe1eced9fb 100644 --- a/translator/translate/otel/processor/metricstransformprocessor/translator.go +++ b/translator/translate/otel/processor/metricstransformprocessor/translator.go @@ -23,16 +23,6 @@ var metricDuplicateTypes = []string{ containerinsightscommon.TypeNode, } -var defaultGpuLabels = []string{ - "ClusterName", - "Namespace", - "Service", - "ContainerName", - "FullPodName", - "PodName", - "GpuDevice", -} - var renameMapForDcgm = map[string]string{ "DCGM_FI_DEV_GPU_UTIL": containerinsightscommon.GpuUtilization, "DCGM_FI_DEV_FB_USED_PERCENT": containerinsightscommon.GpuMemUtilization, From e6d95b8c4a726f109840135c5716b2dbdfead914 Mon Sep 17 00:00:00 2001 From: Hyunsoo Kim Date: Tue, 27 Feb 2024 15:21:27 -0500 Subject: [PATCH 11/20] remove start from test --- plugins/processors/gpuattributes/processor_test.go | 1 - 1 file changed, 1 deletion(-) diff --git a/plugins/processors/gpuattributes/processor_test.go b/plugins/processors/gpuattributes/processor_test.go index 7b4d73887f..2b8b3c1728 100644 --- a/plugins/processors/gpuattributes/processor_test.go +++ b/plugins/processors/gpuattributes/processor_test.go @@ -22,7 +22,6 @@ func TestProcessMetrics(t *testing.T) { Config: createDefaultConfig().(*Config), } ctx := context.Background() - gp.Start(ctx, nil) testcases := map[string]struct { metrics pmetric.Metrics From 5a11adb837089faa69613bfd8d363b80febbd130 Mon Sep 17 00:00:00 2001 From: Hyunsoo Kim Date: Thu, 29 Feb 2024 16:32:39 -0500 Subject: [PATCH 12/20] update feature toggle flag to accelerated_compute_metrics use constant variables use 
slices for label filtering stop adding gpuattributes processor when it's turned off --- internal/containerinsightscommon/const.go | 21 ++- internal/containerinsightscommon/k8sconst.go | 3 + internal/containerinsightscommon/util.go | 22 +-- .../ecsdecorator/ecsdecorator_test.go | 10 +- plugins/processors/ecsdecorator/metricRule.go | 4 +- .../ecsdecorator/metricRule_test.go | 6 +- plugins/processors/gpuattributes/factory.go | 4 +- plugins/processors/gpuattributes/processor.go | 154 ++++++++++-------- .../gpuattributes/processor_test.go | 31 +--- .../structuredlogsadapter/metricruletagger.go | 4 +- .../metricruletagger_test.go | 8 +- .../appsignals_and_eks_config.json | 2 +- .../appsignals_and_eks_config.yaml | 2 +- .../appsignals_and_k8s_config.json | 2 +- .../appsignals_and_k8s_config.yaml | 2 +- .../base_container_insights_config.json | 2 +- .../base_container_insights_config.yaml | 2 +- .../emf_and_kubernetes_config.json | 2 +- .../emf_and_kubernetes_config.yaml | 2 +- .../emf_and_kubernetes_with_gpu_config.yaml | 2 +- .../kubernetes_on_prem_config.json | 2 +- .../kubernetes_on_prem_config.yaml | 2 +- .../sampleConfig/log_ecs_metric_only.yaml | 2 +- .../logs_and_kubernetes_config.json | 2 +- .../logs_and_kubernetes_config.yaml | 2 +- translator/translate/otel/common/common.go | 2 +- .../otel/exporter/awsemf/kubernetes.go | 4 +- .../pipeline/containerinsights/translator.go | 7 +- .../metricstransformprocessor/translator.go | 21 +-- .../awscontainerinsight/translator.go | 2 +- .../{granularity.go => utils.go} | 4 + 31 files changed, 172 insertions(+), 163 deletions(-) rename translator/translate/otel/receiver/awscontainerinsight/{granularity.go => utils.go} (77%) diff --git a/internal/containerinsightscommon/const.go b/internal/containerinsightscommon/const.go index 62a20a48ee..4922050949 100644 --- a/internal/containerinsightscommon/const.go +++ b/internal/containerinsightscommon/const.go @@ -4,15 +4,19 @@ package containerinsightscommon const ( - InstanceId =
"InstanceId" GoPSUtilProcDirEnv = "HOST_PROC" - MinTimeDiff = 50 * 1000 // We assume 50 micro-seconds is the minimal gap between two collected data sample to be valid to calculate delta - ClusterNameKey = "ClusterName" - NodeNameKey = "NodeName" + MinTimeDiff = 50 * 1000 // We assume 50 micro-seconds is the minimal gap between two collected data sample to be valid to calculate delta - MetricType = "Type" - SourcesKey = "Sources" + ClusterNameKey = "ClusterName" + NodeNameKey = "NodeName" // Attribute names + InstanceIdKey = "InstanceIdKey" + InstanceTypeKey = "InstanceType" + AutoScalingGroupNameKey = "AutoScalingGroupName" + VersionKey = "Version" + MetricType = "Type" + SourcesKey = "Sources" + GpuDeviceKey = "GpuDevice" // metric collected CpuTotal = "cpu_usage_total" @@ -81,6 +85,7 @@ const ( GpuRequest = "gpu_request" GpuLimit = "gpu_limit" GpuTotal = "gpu_total" + GpuUniqueId = "UUID" TypeCluster = "Cluster" TypeClusterService = "ClusterService" @@ -96,6 +101,10 @@ const ( TypeNodeNet = "NodeNet" TypeInstanceDiskIO = "InstanceDiskIO" TypeNodeDiskIO = "NodeDiskIO" + TypeGpuContainer = "ContainerGPU" + TypeGpuPod = "PodGPU" + TypeGpuNode = "NodeGPU" + TypeGpuCluster = "ClusterGPU" TypePod = "Pod" TypePodNet = "PodNet" diff --git a/internal/containerinsightscommon/k8sconst.go b/internal/containerinsightscommon/k8sconst.go index b975824d49..c8423665fa 100644 --- a/internal/containerinsightscommon/k8sconst.go +++ b/internal/containerinsightscommon/k8sconst.go @@ -12,11 +12,14 @@ const ( Kubernetes = "kubernetes" K8sNamespace = "Namespace" PodIdKey = "PodId" + FullPodNameKey = "FullPodName" PodNameKey = "PodName" K8sPodNameKey = "K8sPodName" ContainerNamekey = "ContainerName" ContainerIdkey = "ContainerId" PodOwnersKey = "PodOwners" + HostKey = "host" + K8sKey = "kubernetes" RunningPodCount = "number_of_running_pods" RunningContainerCount = "number_of_running_containers" diff --git a/internal/containerinsightscommon/util.go 
b/internal/containerinsightscommon/util.go index aa5d7c3e68..352bbcb0a8 100644 --- a/internal/containerinsightscommon/util.go +++ b/internal/containerinsightscommon/util.go @@ -34,35 +34,23 @@ func MetricName(mType string, name string) string { namespace := "namespace_" switch mType { - case TypeInstance: - prefix = instancePrefix - case TypeInstanceFS: - prefix = instancePrefix - case TypeInstanceDiskIO: + case TypeInstance, TypeInstanceFS, TypeInstanceDiskIO: prefix = instancePrefix case TypeInstanceNet: prefix = instanceNetPrefix - case TypeNode: - prefix = nodePrefix - case TypeNodeFS: - prefix = nodePrefix - case TypeNodeDiskIO: + case TypeNode, TypeNodeFS, TypeNodeDiskIO, TypeGpuNode: prefix = nodePrefix case TypeNodeNet: prefix = nodeNetPrefix - case TypePod: + case TypePod, TypeGpuPod: prefix = podPrefix case TypePodNet: prefix = podNetPrefix - case TypeContainer: - prefix = containerPrefix - case TypeContainerDiskIO: - prefix = containerPrefix - case TypeContainerFS: + case TypeContainer, TypeContainerDiskIO, TypeContainerFS, TypeGpuContainer: prefix = containerPrefix case TypeService: prefix = service - case TypeCluster: + case TypeCluster, TypeGpuCluster: prefix = cluster case K8sNamespace: prefix = namespace diff --git a/plugins/processors/ecsdecorator/ecsdecorator_test.go b/plugins/processors/ecsdecorator/ecsdecorator_test.go index f68be5e8e4..203c49933a 100644 --- a/plugins/processors/ecsdecorator/ecsdecorator_test.go +++ b/plugins/processors/ecsdecorator/ecsdecorator_test.go @@ -15,7 +15,7 @@ import ( ) func TestTagMetricSourceForTypeInstance(t *testing.T) { - tags := map[string]string{MetricType: TypeInstance, InstanceId: "TestEC2InstanceId", ContainerInstanceIdKey: "TestContainerInstanceId", ClusterNameKey: "TestClusterName"} + tags := map[string]string{MetricType: TypeInstance, InstanceIdKey: "TestEC2InstanceId", ContainerInstanceIdKey: "TestContainerInstanceId", ClusterNameKey: "TestClusterName"} fields := 
map[string]interface{}{MetricName(TypeInstance, CpuUtilization): 0, MetricName(TypeInstance, MemUtilization): 0, MetricName(TypeInstance, NetTotalBytes): 0, MetricName(TypeInstance, CpuReservedCapacity): 0, MetricName(TypeInstance, MemReservedCapacity): 0, MetricName(TypeInstance, RunningTaskCount): 0, MetricName(TypeInstance, CpuTotal): 0, @@ -29,7 +29,7 @@ func TestTagMetricSourceForTypeInstance(t *testing.T) { } func TestTagMetricSourceForTypeInstanceFS(t *testing.T) { - tags := map[string]string{MetricType: TypeInstanceFS, InstanceId: "TestEC2InstanceId", ContainerInstanceIdKey: "TestContainerInstanceId", ClusterNameKey: "TestClusterName"} + tags := map[string]string{MetricType: TypeInstanceFS, InstanceIdKey: "TestEC2InstanceId", ContainerInstanceIdKey: "TestContainerInstanceId", ClusterNameKey: "TestClusterName"} fields := map[string]interface{}{MetricName(TypeInstance, CpuUtilization): 0, MetricName(TypeInstance, MemUtilization): 0, MetricName(TypeInstance, NetTotalBytes): 0, MetricName(TypeInstance, CpuReservedCapacity): 0, MetricName(TypeInstance, MemReservedCapacity): 0, MetricName(TypeInstance, RunningTaskCount): 0, MetricName(TypeInstance, CpuTotal): 0, @@ -43,7 +43,7 @@ func TestTagMetricSourceForTypeInstanceFS(t *testing.T) { } func TestTagMetricSourceForTypeInstanceNet(t *testing.T) { - tags := map[string]string{MetricType: TypeInstanceNet, InstanceId: "TestEC2InstanceId", ContainerInstanceIdKey: "TestContainerInstanceId", ClusterNameKey: "TestClusterName"} + tags := map[string]string{MetricType: TypeInstanceNet, InstanceIdKey: "TestEC2InstanceId", ContainerInstanceIdKey: "TestContainerInstanceId", ClusterNameKey: "TestClusterName"} fields := map[string]interface{}{MetricName(TypeInstance, CpuUtilization): 0, MetricName(TypeInstance, MemUtilization): 0, MetricName(TypeInstance, NetTotalBytes): 0, MetricName(TypeInstance, CpuReservedCapacity): 0, MetricName(TypeInstance, MemReservedCapacity): 0, MetricName(TypeInstance, RunningTaskCount): 0, 
MetricName(TypeInstance, CpuTotal): 0, @@ -57,7 +57,7 @@ func TestTagMetricSourceForTypeInstanceNet(t *testing.T) { } func TestTagMetricSourceForTypeInstanceDiskIO(t *testing.T) { - tags := map[string]string{MetricType: TypeInstanceDiskIO, InstanceId: "TestEC2InstanceId", ContainerInstanceIdKey: "TestContainerInstanceId", ClusterNameKey: "TestClusterName"} + tags := map[string]string{MetricType: TypeInstanceDiskIO, InstanceIdKey: "TestEC2InstanceId", ContainerInstanceIdKey: "TestContainerInstanceId", ClusterNameKey: "TestClusterName"} fields := map[string]interface{}{MetricName(TypeInstance, CpuUtilization): 0, MetricName(TypeInstance, MemUtilization): 0, MetricName(TypeInstance, NetTotalBytes): 0, MetricName(TypeInstance, CpuReservedCapacity): 0, MetricName(TypeInstance, MemReservedCapacity): 0, MetricName(TypeInstance, RunningTaskCount): 0, MetricName(TypeInstance, CpuTotal): 0, @@ -71,7 +71,7 @@ func TestTagMetricSourceForTypeInstanceDiskIO(t *testing.T) { } func TestTagLogGroup(t *testing.T) { - tags := map[string]string{MetricType: TypeInstance, InstanceId: "TestEC2InstanceId", ContainerInstanceIdKey: "TestContainerInstanceId", ClusterNameKey: "TestClusterName"} + tags := map[string]string{MetricType: TypeInstance, InstanceIdKey: "TestEC2InstanceId", ContainerInstanceIdKey: "TestContainerInstanceId", ClusterNameKey: "TestClusterName"} fields := map[string]interface{}{MetricName(TypeInstance, CpuUtilization): 0, MetricName(TypeInstance, MemUtilization): 0, MetricName(TypeInstance, NetTotalBytes): 0, MetricName(TypeInstance, CpuReservedCapacity): 0, MetricName(TypeInstance, MemReservedCapacity): 0, MetricName(TypeInstance, RunningTaskCount): 0, MetricName(TypeInstance, CpuTotal): 0, diff --git a/plugins/processors/ecsdecorator/metricRule.go b/plugins/processors/ecsdecorator/metricRule.go index bbaa0e72e4..1cc2a08958 100644 --- a/plugins/processors/ecsdecorator/metricRule.go +++ b/plugins/processors/ecsdecorator/metricRule.go @@ -24,7 +24,7 @@ var nodeMetricRules 
= []structuredlogscommon.MetricRule{ {Unit: Percent, Name: MetricName(TypeInstance, MemReservedCapacity)}, {Unit: BytesPerSec, Name: MetricName(TypeInstance, NetTotalBytes)}, {Unit: Count, Name: MetricName(TypeInstance, RunningTaskCount)}}, - DimensionSets: [][]string{{ContainerInstanceIdKey, InstanceId, ClusterNameKey}}, + DimensionSets: [][]string{{ContainerInstanceIdKey, InstanceIdKey, ClusterNameKey}}, Namespace: cloudwatchNamespace, }, { @@ -48,7 +48,7 @@ var nodeFSMetricRules = []structuredlogscommon.MetricRule{ { Metrics: []structuredlogscommon.MetricAttr{ {Unit: Percent, Name: MetricName(TypeInstanceFS, FSUtilization)}}, - DimensionSets: [][]string{{ContainerInstanceIdKey, InstanceId, ClusterNameKey}, {ClusterNameKey}}, + DimensionSets: [][]string{{ContainerInstanceIdKey, InstanceIdKey, ClusterNameKey}, {ClusterNameKey}}, Namespace: cloudwatchNamespace, }, } diff --git a/plugins/processors/ecsdecorator/metricRule_test.go b/plugins/processors/ecsdecorator/metricRule_test.go index 650a1b429e..2cd3bf6942 100644 --- a/plugins/processors/ecsdecorator/metricRule_test.go +++ b/plugins/processors/ecsdecorator/metricRule_test.go @@ -17,7 +17,7 @@ import ( ) func TestNodeFull(t *testing.T) { - tags := map[string]string{MetricType: TypeInstance, InstanceId: "TestEC2InstanceId", ContainerInstanceIdKey: "TestContainerInstanceId", ClusterNameKey: "TestClusterName"} + tags := map[string]string{MetricType: TypeInstance, InstanceIdKey: "TestEC2InstanceId", ContainerInstanceIdKey: "TestContainerInstanceId", ClusterNameKey: "TestClusterName"} fields := map[string]interface{}{MetricName(TypeInstance, CpuUtilization): 0, MetricName(TypeInstance, MemUtilization): 0, MetricName(TypeInstance, NetTotalBytes): 0, MetricName(TypeInstance, CpuReservedCapacity): 0, MetricName(TypeInstance, MemReservedCapacity): 0, MetricName(TypeInstance, RunningTaskCount): 0, MetricName(TypeInstance, CpuTotal): 0, @@ -31,7 +31,7 @@ func TestNodeFull(t *testing.T) { } func 
TestNodeLackOfCpuUtilization(t *testing.T) { - tags := map[string]string{MetricType: TypeInstance, InstanceId: "TestEC2InstanceId", ContainerInstanceIdKey: "TestContainerInstanceId", ClusterNameKey: "TestClusterName"} + tags := map[string]string{MetricType: TypeInstance, InstanceIdKey: "TestEC2InstanceId", ContainerInstanceIdKey: "TestContainerInstanceId", ClusterNameKey: "TestClusterName"} fields := map[string]interface{}{MetricName(TypeInstance, MemUtilization): 0, MetricName(TypeInstance, NetTotalBytes): 0, MetricName(TypeInstance, CpuReservedCapacity): 0, MetricName(TypeInstance, MemReservedCapacity): 0, MetricName(TypeInstance, RunningTaskCount): 0, MetricName(TypeInstance, CpuTotal): 0, @@ -64,7 +64,7 @@ func TestNodeLackOfInstanceId(t *testing.T) { } func TestNodeFSFull(t *testing.T) { - tags := map[string]string{MetricType: TypeInstanceFS, InstanceId: "TestEC2InstanceId", ContainerInstanceIdKey: "TestContainerInstanceId", ClusterNameKey: "TestClusterName"} + tags := map[string]string{MetricType: TypeInstanceFS, InstanceIdKey: "TestEC2InstanceId", ContainerInstanceIdKey: "TestContainerInstanceId", ClusterNameKey: "TestClusterName"} fields := map[string]interface{}{MetricName(TypeInstanceFS, FSUtilization): 0} m := metric.New("test", tags, fields, time.Now()) new(ECSDecorator).tagMetricRule(m) diff --git a/plugins/processors/gpuattributes/factory.go b/plugins/processors/gpuattributes/factory.go index aae0a144b9..826174e290 100644 --- a/plugins/processors/gpuattributes/factory.go +++ b/plugins/processors/gpuattributes/factory.go @@ -14,7 +14,7 @@ import ( ) const ( - TypeStr = "gpu" + TypeStr = "gpuattributes" stability = component.StabilityLevelBeta ) @@ -42,7 +42,7 @@ func createMetricsProcessor( return nil, fmt.Errorf("configuration parsing error") } - metricsProcessor := newGpuProcessor(processorConfig, set.Logger) + metricsProcessor := newGpuAttributesProcessor(processorConfig, set.Logger) return processorhelper.NewMetricsProcessor( ctx, diff --git 
a/plugins/processors/gpuattributes/processor.go b/plugins/processors/gpuattributes/processor.go index 9a6066040c..c9e8a90eb6 100644 --- a/plugins/processors/gpuattributes/processor.go +++ b/plugins/processors/gpuattributes/processor.go @@ -8,64 +8,85 @@ import ( "encoding/json" "strings" + "github.com/aws/amazon-cloudwatch-agent/internal/containerinsightscommon" "go.opentelemetry.io/collector/pdata/pcommon" "go.opentelemetry.io/collector/pdata/pmetric" "go.uber.org/zap" ) const ( - gpuMetric = "_gpu_" + gpuMetricIdentifier = "_gpu_" gpuContainerMetricPrefix = "container_" gpuPodMetricPrefix = "pod_" gpuNodeMetricPrefix = "node_" ) -var podContainerMetricLabels = map[string]map[string]interface{}{ - "ClusterName": nil, - "FullPodName": nil, - "PodName": nil, - "InstanceId": nil, - "InstanceType": nil, - "NodeName": nil, - "Timestamp": nil, - "Type": nil, - "Version": nil, - "Namespace": nil, - "Sources": nil, - "UUID": nil, - "Service": nil, - "GpuDevice": nil, - "kubernetes": nil, +// schemas at each resource level +// - Container Schema +// - ClusterName +// - ClusterName, Namespace, PodName, ContainerName +// - ClusterName, Namespace, PodName, FullPodName, ContainerName +// - ClusterName, Namespace, PodName, FullPodName, ContainerName, GpuDevice +// +// - Pod +// - ClusterName +// - ClusterName, Namespace +// - ClusterName, Namespace, Service +// - ClusterName, Namespace, PodName +// - ClusterName, Namespace, PodName, FullPodName +// - ClusterName, Namespace, PodName, FullPodName, GpuDevice +// +// - Node +// - ClusterName +// - ClusterName, InstanceIdKey, NodeName +// - ClusterName, InstanceIdKey, NodeName, GpuDevice + +var commonLabels = []string{ + containerinsightscommon.ClusterNameKey, + containerinsightscommon.InstanceIdKey, + containerinsightscommon.GpuDeviceKey, + containerinsightscommon.MetricType, + containerinsightscommon.NodeNameKey, + containerinsightscommon.VersionKey, + containerinsightscommon.SourcesKey, + containerinsightscommon.Timestamp, } -var 
nodeMetricLabels = map[string]map[string]interface{}{ - "ClusterName": nil, - "InstanceId": nil, - "InstanceType": nil, - "NodeName": nil, - "Timestamp": nil, - "Type": nil, - "Version": nil, - "kubernetes": { - "host": nil, - }, +var podAndContainerLabels = []string{ + containerinsightscommon.K8sNamespace, + containerinsightscommon.FullPodNameKey, + containerinsightscommon.PodNameKey, + containerinsightscommon.TypeService, + containerinsightscommon.GpuUniqueId, } -type gpuprocessor struct { +var containerK8sBlobLabels = []string{ + "container_name", + "containerd", +} +var podK8sBlobLabels = []string{ + "host", + "labels", + "pod_id", + "pod_name", + "pod_owners", + "namespace", +} + +type gpuAttributesProcessor struct { *Config logger *zap.Logger } -func newGpuProcessor(config *Config, logger *zap.Logger) *gpuprocessor { - d := &gpuprocessor{ +func newGpuAttributesProcessor(config *Config, logger *zap.Logger) *gpuAttributesProcessor { + d := &gpuAttributesProcessor{ Config: config, logger: logger, } return d } -func (d *gpuprocessor) processMetrics(_ context.Context, md pmetric.Metrics) (pmetric.Metrics, error) { - +func (d *gpuAttributesProcessor) processMetrics(_ context.Context, md pmetric.Metrics) (pmetric.Metrics, error) { rms := md.ResourceMetrics() for i := 0; i < rms.Len(); i++ { rs := rms.At(i) @@ -82,39 +103,36 @@ func (d *gpuprocessor) processMetrics(_ context.Context, md pmetric.Metrics) (pm return md, nil } -func (d *gpuprocessor) processMetricAttributes(m pmetric.Metric) { +func (d *gpuAttributesProcessor) processMetricAttributes(m pmetric.Metric) { // only decorate GPU metrics - // another option is to separate GPU of its own pipeline to minimize extra processing of metrics - if !strings.Contains(m.Name(), gpuMetric) { + if !strings.Contains(m.Name(), gpuMetricIdentifier) { return } - var labels map[string]map[string]interface{} - if strings.HasPrefix(m.Name(), gpuNodeMetricPrefix) { - labels = nodeMetricLabels - } else if 
strings.HasPrefix(m.Name(), gpuContainerMetricPrefix) { - labels = podContainerMetricLabels - labels["ContainerName"] = nil - labels["kubernetes"] = map[string]interface{}{ - "container_name": nil, - "containerd": nil, - "host": nil, - "labels": nil, - "pod_id": nil, - "pod_name": nil, - "pod_owners": nil, - "namespace": nil, - } + var labels []string + labels = append(labels, commonLabels...) + k8sBlobLabels := []string{containerinsightscommon.HostKey} + if strings.HasPrefix(m.Name(), gpuContainerMetricPrefix) { + labels = append(labels, podAndContainerLabels...) + labels = append(labels, containerinsightscommon.ContainerNamekey) + k8sBlobLabels = append(k8sBlobLabels, containerK8sBlobLabels...) + k8sBlobLabels = append(k8sBlobLabels, podK8sBlobLabels...) } else if strings.HasPrefix(m.Name(), gpuPodMetricPrefix) { - labels = podContainerMetricLabels - labels["kubernetes"] = map[string]interface{}{ - "host": nil, - "labels": nil, - "pod_id": nil, - "pod_name": nil, - "pod_owners": nil, - "namespace": nil, - } + labels = append(labels, podAndContainerLabels...) + k8sBlobLabels = append(k8sBlobLabels, podK8sBlobLabels...) 
+ } + + labelFilter := map[string]map[string]interface{}{} + for _, attr := range labels { + labelFilter[attr] = nil + } + + k8sBlobMap := map[string]interface{}{} + for _, attr := range k8sBlobLabels { + k8sBlobMap[attr] = nil + } + if len(k8sBlobMap) > 0 { + labelFilter[containerinsightscommon.K8sKey] = k8sBlobMap } var dps pmetric.NumberDataPointSlice @@ -124,19 +142,19 @@ func (d *gpuprocessor) processMetricAttributes(m pmetric.Metric) { case pmetric.MetricTypeSum: dps = m.Sum().DataPoints() default: - d.logger.Debug("Ignore unknown metric type", zap.String("type", m.Type().String())) + d.logger.Debug("Ignore unknown metric type", zap.String(containerinsightscommon.MetricType, m.Type().String())) } for i := 0; i < dps.Len(); i++ { - d.filterAttributes(dps.At(i).Attributes(), labels) + d.filterAttributes(dps.At(i).Attributes(), labelFilter) } } -func (d *gpuprocessor) filterAttributes(attributes pcommon.Map, labels map[string]map[string]interface{}) { +func (d *gpuAttributesProcessor) filterAttributes(attributes pcommon.Map, labels map[string]map[string]interface{}) { if len(labels) == 0 { return } - // remove labels that are no in the keep list + // remove labels that are not in the keep list attributes.RemoveIf(func(k string, _ pcommon.Value) bool { if _, ok := labels[k]; !ok { return true @@ -147,7 +165,7 @@ func (d *gpuprocessor) filterAttributes(attributes pcommon.Map, labels map[strin // if a label has child level filter list, that means the label is map type // only handles map type since there are currently only map and value types with GPU for lk, ls := range labels { - if len(ls) < 1 { + if len(ls) == 0 { continue } if av, ok := attributes.Get(lk); ok { @@ -156,7 +174,7 @@ func (d *gpuprocessor) filterAttributes(attributes pcommon.Map, labels map[strin strVal := av.Str() err := json.Unmarshal([]byte(strVal), &blob) if err != nil { - d.logger.Warn("gpuprocessor: failed to unmarshal label", zap.String("label", lk)) + 
d.logger.Warn("gpuAttributesProcessor: failed to unmarshal label", zap.String("label", lk)) continue } newBlob := make(map[string]json.RawMessage) @@ -167,7 +185,7 @@ func (d *gpuprocessor) filterAttributes(attributes pcommon.Map, labels map[strin } bytes, err := json.Marshal(newBlob) if err != nil { - d.logger.Warn("gpuprocessor: failed to marshall label", zap.String("label", lk)) + d.logger.Warn("gpuAttributesProcessor: failed to marshall label", zap.String("label", lk)) continue } attributes.PutStr(lk, string(bytes)) diff --git a/plugins/processors/gpuattributes/processor_test.go b/plugins/processors/gpuattributes/processor_test.go index 2b8b3c1728..25fec605be 100644 --- a/plugins/processors/gpuattributes/processor_test.go +++ b/plugins/processors/gpuattributes/processor_test.go @@ -17,22 +17,21 @@ var normalizedNameRegex = regexp.MustCompile("^(container|pod|node)_gpu_[a-z_]+$ func TestProcessMetrics(t *testing.T) { logger, _ := zap.NewDevelopment() - gp := &gpuprocessor{ + gp := &gpuAttributesProcessor{ logger: logger, Config: createDefaultConfig().(*Config), } ctx := context.Background() testcases := map[string]struct { - metrics pmetric.Metrics - labels map[string]map[string]interface{} - want map[string]string + resource string + metrics pmetric.Metrics + want map[string]string }{ "nonNode": { metrics: generateMetrics("prefix", map[string]string{ "ClusterName": "cluster", }), - labels: map[string]map[string]interface{}{}, want: map[string]string{ "ClusterName": "cluster", }, @@ -42,9 +41,6 @@ func TestProcessMetrics(t *testing.T) { "ClusterName": "cluster", "Drop": "val", }), - labels: map[string]map[string]interface{}{ - "ClusterName": {}, - }, want: map[string]string{ "ClusterName": "cluster", }, @@ -52,36 +48,27 @@ func TestProcessMetrics(t *testing.T) { "nodeDropJson": { metrics: generateMetrics("node", map[string]string{ "ClusterName": "cluster", - "kubernetes": "{\"a\":\"1\",\"b\":\"2\"}", + "kubernetes": "{\"host\":\"test\"}", }), - labels: 
map[string]map[string]interface{}{ - "ClusterName": {}, - "kubernetes": {"a": map[string]interface{}{}}, - }, want: map[string]string{ "ClusterName": "cluster", - "kubernetes": "{\"a\":\"1\"}", + "kubernetes": "{\"host\":\"test\"}", }, }, "nodeDropMixed": { metrics: generateMetrics("node", map[string]string{ "ClusterName": "cluster", "Drop": "val", - "kubernetes": "{\"a\":\"1\",\"b\":\"2\"}", + "kubernetes": "{\"host\":\"test\",\"b\":\"2\"}", }), - labels: map[string]map[string]interface{}{ - "ClusterName": {}, - "kubernetes": {"a": map[string]interface{}{}}, - }, want: map[string]string{ "ClusterName": "cluster", - "kubernetes": "{\"a\":\"1\"}", + "kubernetes": "{\"host\":\"test\"}", }, }, } for _, tc := range testcases { - nodeMetricLabels = tc.labels ms, _ := gp.processMetrics(ctx, tc.metrics) attrs := ms.ResourceMetrics().At(0).ScopeMetrics().At(0).Metrics().At(0).Gauge().DataPoints().At(0).Attributes() assert.Equal(t, len(tc.want), attrs.Len()) @@ -97,7 +84,7 @@ func generateMetrics(prefix string, dimensions map[string]string) pmetric.Metric md := pmetric.NewMetrics() m := md.ResourceMetrics().AppendEmpty().ScopeMetrics().AppendEmpty().Metrics().AppendEmpty() - m.SetName(prefix + gpuMetric) + m.SetName(prefix + gpuMetricIdentifier) gauge := m.SetEmptyGauge().DataPoints().AppendEmpty() gauge.SetIntValue(10) diff --git a/plugins/processors/k8sdecorator/structuredlogsadapter/metricruletagger.go b/plugins/processors/k8sdecorator/structuredlogsadapter/metricruletagger.go index 66613547ab..d44a428a5e 100644 --- a/plugins/processors/k8sdecorator/structuredlogsadapter/metricruletagger.go +++ b/plugins/processors/k8sdecorator/structuredlogsadapter/metricruletagger.go @@ -28,7 +28,7 @@ var nodeMetricRules = []structuredlogscommon.MetricRule{ {Unit: Percent, Name: MetricName(TypeNode, MemReservedCapacity)}, {Unit: Count, Name: MetricName(TypeNode, RunningPodCount)}, {Unit: Count, Name: MetricName(TypeNode, RunningContainerCount)}}, - DimensionSets: 
[][]string{{NodeNameKey, InstanceId, ClusterNameKey}}, + DimensionSets: [][]string{{NodeNameKey, InstanceIdKey, ClusterNameKey}}, Namespace: cloudwatchNamespace, }, { @@ -80,7 +80,7 @@ var nodeFSMetricRules = []structuredlogscommon.MetricRule{ { Metrics: []structuredlogscommon.MetricAttr{ {Unit: Percent, Name: MetricName(TypeNodeFS, FSUtilization)}}, - DimensionSets: [][]string{{NodeNameKey, InstanceId, ClusterNameKey}, {ClusterNameKey}}, + DimensionSets: [][]string{{NodeNameKey, InstanceIdKey, ClusterNameKey}, {ClusterNameKey}}, Namespace: cloudwatchNamespace, }, } diff --git a/plugins/processors/k8sdecorator/structuredlogsadapter/metricruletagger_test.go b/plugins/processors/k8sdecorator/structuredlogsadapter/metricruletagger_test.go index df48517ca3..778eaf4325 100644 --- a/plugins/processors/k8sdecorator/structuredlogsadapter/metricruletagger_test.go +++ b/plugins/processors/k8sdecorator/structuredlogsadapter/metricruletagger_test.go @@ -17,7 +17,7 @@ import ( ) func TestNodeFull(t *testing.T) { - tags := map[string]string{MetricType: TypeNode, NodeNameKey: "TestNodeName", ClusterNameKey: "TestClusterName", InstanceId: "i-123"} + tags := map[string]string{MetricType: TypeNode, NodeNameKey: "TestNodeName", ClusterNameKey: "TestClusterName", InstanceIdKey: "i-123"} fields := map[string]interface{}{MetricName(TypeNode, CpuUtilization): 0, MetricName(TypeNode, MemUtilization): 0, MetricName(TypeNode, NetTotalBytes): 0, MetricName(TypeNode, CpuReservedCapacity): 0, MetricName(TypeNode, MemReservedCapacity): 0, MetricName(TypeNode, RunningPodCount): 0, MetricName(TypeNode, RunningContainerCount): 0, MetricName(TypeNode, CpuTotal): 0, @@ -31,7 +31,7 @@ func TestNodeFull(t *testing.T) { } func TestNodeLackOfCpuUtilization(t *testing.T) { - tags := map[string]string{MetricType: TypeNode, NodeNameKey: "TestNodeName", ClusterNameKey: "TestClusterName", InstanceId: "i-123"} + tags := map[string]string{MetricType: TypeNode, NodeNameKey: "TestNodeName", ClusterNameKey: 
"TestClusterName", InstanceIdKey: "i-123"} fields := map[string]interface{}{MetricName(TypeNode, MemUtilization): 0, MetricName(TypeNode, NetTotalBytes): 0, MetricName(TypeNode, CpuReservedCapacity): 0, MetricName(TypeNode, MemReservedCapacity): 0, MetricName(TypeNode, RunningPodCount): 0, MetricName(TypeNode, RunningContainerCount): 0, MetricName(TypeNode, CpuTotal): 0, @@ -48,7 +48,7 @@ func TestNodeLackOfCpuUtilization(t *testing.T) { } func TestNodeLackOfNodeNameKey(t *testing.T) { - tags := map[string]string{MetricType: TypeNode, ClusterNameKey: "TestClusterName", InstanceId: "i-123"} + tags := map[string]string{MetricType: TypeNode, ClusterNameKey: "TestClusterName", InstanceIdKey: "i-123"} fields := map[string]interface{}{MetricName(TypeNode, CpuUtilization): 0, MetricName(TypeNode, MemUtilization): 0, MetricName(TypeNode, NetTotalBytes): 0, MetricName(TypeNode, CpuReservedCapacity): 0, MetricName(TypeNode, MemReservedCapacity): 0, MetricName(TypeNode, RunningPodCount): 0, MetricName(TypeNode, RunningContainerCount): 0, MetricName(TypeNode, CpuTotal): 0, @@ -91,7 +91,7 @@ func TestPodFullLackOfService(t *testing.T) { } func TestNodeFSFull(t *testing.T) { - tags := map[string]string{MetricType: TypeNodeFS, NodeNameKey: "TestNodeName", ClusterNameKey: "TestClusterName", InstanceId: "i-123"} + tags := map[string]string{MetricType: TypeNodeFS, NodeNameKey: "TestNodeName", ClusterNameKey: "TestClusterName", InstanceIdKey: "i-123"} fields := map[string]interface{}{MetricName(TypeNodeFS, FSUtilization): 0} m := metric.New("test", tags, fields, time.Now()) TagMetricRule(m) diff --git a/translator/tocwconfig/sampleConfig/appsignals_and_eks_config.json b/translator/tocwconfig/sampleConfig/appsignals_and_eks_config.json index fdd2c73048..e289b8adae 100644 --- a/translator/tocwconfig/sampleConfig/appsignals_and_eks_config.json +++ b/translator/tocwconfig/sampleConfig/appsignals_and_eks_config.json @@ -20,7 +20,7 @@ "metrics_collection_interval": 30, 
"disable_metric_extraction": true, "enhanced_container_insights": false, - "gpu_metrics": false + "accelerated_compute_metrics": false } }, "force_flush_interval": 5, diff --git a/translator/tocwconfig/sampleConfig/appsignals_and_eks_config.yaml b/translator/tocwconfig/sampleConfig/appsignals_and_eks_config.yaml index c6a54768ff..1497105046 100644 --- a/translator/tocwconfig/sampleConfig/appsignals_and_eks_config.yaml +++ b/translator/tocwconfig/sampleConfig/appsignals_and_eks_config.yaml @@ -633,7 +633,7 @@ receivers: resource_arn: "" role_arn: "" shared_credentials_file: [] - gpu_metrics: false + accelerated_compute_metrics: false otlp/app_signals: protocols: grpc: diff --git a/translator/tocwconfig/sampleConfig/appsignals_and_k8s_config.json b/translator/tocwconfig/sampleConfig/appsignals_and_k8s_config.json index 7be14c83c7..f89476d589 100644 --- a/translator/tocwconfig/sampleConfig/appsignals_and_k8s_config.json +++ b/translator/tocwconfig/sampleConfig/appsignals_and_k8s_config.json @@ -16,7 +16,7 @@ "metrics_collection_interval": 30, "disable_metric_extraction": true, "enhanced_container_insights": false, - "gpu_metrics": false + "accelerated_compute_metrics": false } }, "force_flush_interval": 5, diff --git a/translator/tocwconfig/sampleConfig/appsignals_and_k8s_config.yaml b/translator/tocwconfig/sampleConfig/appsignals_and_k8s_config.yaml index 60c3cf46f5..3309ee62b6 100644 --- a/translator/tocwconfig/sampleConfig/appsignals_and_k8s_config.yaml +++ b/translator/tocwconfig/sampleConfig/appsignals_and_k8s_config.yaml @@ -633,7 +633,7 @@ receivers: resource_arn: "" role_arn: "" shared_credentials_file: [] - gpu_metrics: false + accelerated_compute_metrics: false otlp/app_signals: protocols: grpc: diff --git a/translator/tocwconfig/sampleConfig/base_container_insights_config.json b/translator/tocwconfig/sampleConfig/base_container_insights_config.json index f089538b21..510cb41463 100644 --- 
a/translator/tocwconfig/sampleConfig/base_container_insights_config.json +++ b/translator/tocwconfig/sampleConfig/base_container_insights_config.json @@ -11,7 +11,7 @@ "metrics_collection_interval": 30, "disable_metric_extraction": true, "prefer_full_pod_name": true, - "gpu_metrics": false + "accelerated_compute_metrics": false } }, "force_flush_interval": 5, diff --git a/translator/tocwconfig/sampleConfig/base_container_insights_config.yaml b/translator/tocwconfig/sampleConfig/base_container_insights_config.yaml index 89e6375f87..af1e74489b 100644 --- a/translator/tocwconfig/sampleConfig/base_container_insights_config.yaml +++ b/translator/tocwconfig/sampleConfig/base_container_insights_config.yaml @@ -201,7 +201,7 @@ receivers: resource_arn: "" role_arn: "" shared_credentials_file: [] - gpu_metrics: false + accelerated_compute_metrics: false tcplog/emf_logs: attributes: {} encoding: utf-8 diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.json b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.json index c1c6807811..5581444869 100644 --- a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.json +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.json @@ -11,7 +11,7 @@ "metrics_collection_interval": 30, "disable_metric_extraction": true, "enhanced_container_insights": true, - "gpu_metrics": false + "accelerated_compute_metrics": false } }, "force_flush_interval": 5, diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml index cbd26cbc21..5ef38a2444 100644 --- a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml @@ -470,7 +470,7 @@ receivers: role_arn: "" shared_credentials_file: - /root/.aws/credentials - gpu_metrics: false + accelerated_compute_metrics: false tcplog/emf_logs: attributes: {} encoding: utf-8 diff --git 
a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml index 2c7b9fe03c..47706f1636 100644 --- a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml @@ -969,7 +969,7 @@ receivers: container_orchestrator: eks enable_control_plane_metrics: true endpoint: "" - gpu_metrics: true + accelerated_compute_metrics: true imds_retries: 2 leader_lock_name: cwagent-clusterleader leader_lock_using_config_map_only: true diff --git a/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.json b/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.json index 6109027d6f..1552a1b7b4 100644 --- a/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.json +++ b/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.json @@ -9,7 +9,7 @@ "metrics_collection_interval": 30, "disable_metric_extraction": true, "enhanced_container_insights": true, - "gpu_metrics": false + "accelerated_compute_metrics": false } }, "force_flush_interval": 5, diff --git a/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml b/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml index f51e46904a..b8a9c4fd91 100644 --- a/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml +++ b/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml @@ -430,7 +430,7 @@ receivers: role_arn: "" shared_credentials_file: - fake-path - gpu_metrics: false + accelerated_compute_metrics: false service: extensions: - agenthealth/logs diff --git a/translator/tocwconfig/sampleConfig/log_ecs_metric_only.yaml b/translator/tocwconfig/sampleConfig/log_ecs_metric_only.yaml index 10ba5b9138..5309a114c1 100644 --- a/translator/tocwconfig/sampleConfig/log_ecs_metric_only.yaml +++ b/translator/tocwconfig/sampleConfig/log_ecs_metric_only.yaml @@ -143,7 +143,7 @@ receivers: 
resource_arn: "" role_arn: "" shared_credentials_file: [] - gpu_metrics: true + accelerated_compute_metrics: true tcplog/emf_logs: attributes: {} encoding: utf-8 diff --git a/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.json b/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.json index 8ad16d0886..eed2cbb8ac 100644 --- a/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.json +++ b/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.json @@ -10,7 +10,7 @@ "cluster_name": "TestCluster", "metrics_collection_interval": 30, "enhanced_container_insights": true, - "gpu_metrics": false + "accelerated_compute_metrics": false } }, "logs_collected": { diff --git a/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml b/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml index bf63e010e9..f7281de8d5 100644 --- a/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml +++ b/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml @@ -467,7 +467,7 @@ receivers: resource_arn: "" role_arn: "" shared_credentials_file: [] - gpu_metrics: false + accelerated_compute_metrics: false tcplog/emf_logs: attributes: {} encoding: utf-8 diff --git a/translator/translate/otel/common/common.go b/translator/translate/otel/common/common.go index 3b12656df7..083f0d1e38 100644 --- a/translator/translate/otel/common/common.go +++ b/translator/translate/otel/common/common.go @@ -45,7 +45,7 @@ const ( ContainerInsightsMetricGranularity = "metric_granularity" // replaced with enhanced_container_insights EnhancedContainerInsights = "enhanced_container_insights" PreferFullPodName = "prefer_full_pod_name" - EnableGpuMetric = "gpu_metrics" + EnableAcceleratedComputingMetric = "accelerated_compute_metrics" Console = "console" DiskIOKey = "diskio" NetKey = "net" diff --git a/translator/translate/otel/exporter/awsemf/kubernetes.go b/translator/translate/otel/exporter/awsemf/kubernetes.go index 
6e2b0bb791..f72a32ef73 100644 --- a/translator/translate/otel/exporter/awsemf/kubernetes.go +++ b/translator/translate/otel/exporter/awsemf/kubernetes.go @@ -7,7 +7,6 @@ import ( "github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awsemfexporter" "go.opentelemetry.io/collector/confmap" - "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/common" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/receiver/awscontainerinsight" ) @@ -464,9 +463,8 @@ func getControlPlaneMetricDescriptors(conf *confmap.Conf) []awsemfexporter.Metri func getGPUMetricDeclarations(conf *confmap.Conf) []*awsemfexporter.MetricDeclaration { var metricDeclarations []*awsemfexporter.MetricDeclaration - EnableGpuMetric := common.GetOrDefaultBool(conf, common.ConfigKey(common.LogsKey, common.MetricsCollectedKey, common.KubernetesKey, common.EnableGpuMetric), true) enhancedContainerInsightsEnabled := awscontainerinsight.EnhancedContainerInsightsEnabled(conf) - if EnableGpuMetric && enhancedContainerInsightsEnabled { + if awscontainerinsight.AcceleratedComputeMetricsEnabled(conf) && enhancedContainerInsightsEnabled { metricDeclarations = append(metricDeclarations, []*awsemfexporter.MetricDeclaration{ { Dimensions: [][]string{{"ClusterName"}, {"ClusterName", "Namespace", "PodName", "ContainerName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName"}, {"ClusterName", "Namespace", "PodName", "FullPodName", "ContainerName", "GpuDevice"}}, diff --git a/translator/translate/otel/pipeline/containerinsights/translator.go b/translator/translate/otel/pipeline/containerinsights/translator.go index 29dec1b817..e1b83d8cd5 100644 --- a/translator/translate/otel/pipeline/containerinsights/translator.go +++ b/translator/translate/otel/pipeline/containerinsights/translator.go @@ -51,9 +51,14 @@ func (t *translator) Translate(conf *confmap.Conf) (*common.ComponentTranslators // Append the metricstransformprocessor only if enhanced container insights 
is enabled enhancedContainerInsightsEnabled := awscontainerinsight.EnhancedContainerInsightsEnabled(conf) if enhancedContainerInsightsEnabled { + processors := common.NewTranslatorMap(metricstransformprocessor.NewTranslatorWithName(pipelineName), batchprocessor.NewTranslatorWithNameAndSection(pipelineName, common.LogsKey)) + acceleratedComputeMetricsEnabled := awscontainerinsight.AcceleratedComputeMetricsEnabled(conf) + if acceleratedComputeMetricsEnabled { + processors.Set(gpu.NewTranslatorWithName(pipelineName)) + } return &common.ComponentTranslators{ Receivers: common.NewTranslatorMap(awscontainerinsight.NewTranslator()), - Processors: common.NewTranslatorMap(metricstransformprocessor.NewTranslatorWithName(pipelineName), batchprocessor.NewTranslatorWithNameAndSection(pipelineName, common.LogsKey), gpu.NewTranslatorWithName(pipelineName)), // EKS & ECS CI sit under metrics_collected in "logs" + Processors: processors, // EKS & ECS CI sit under metrics_collected in "logs" Exporters: common.NewTranslatorMap(awsemf.NewTranslatorWithName(pipelineName)), Extensions: common.NewTranslatorMap(agenthealth.NewTranslator(component.DataTypeLogs, []string{agenthealth.OperationPutLogEvents})), }, nil diff --git a/translator/translate/otel/processor/metricstransformprocessor/translator.go b/translator/translate/otel/processor/metricstransformprocessor/translator.go index fe1eced9fb..8604de207f 100644 --- a/translator/translate/otel/processor/metricstransformprocessor/translator.go +++ b/translator/translate/otel/processor/metricstransformprocessor/translator.go @@ -6,6 +6,7 @@ package metricstransformprocessor import ( "fmt" + "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/receiver/awscontainerinsight" "github.com/open-telemetry/opentelemetry-collector-contrib/processor/metricstransformprocessor" "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/confmap" @@ -18,9 +19,9 @@ import ( const gpuLogSuffix = "GPU" var metricDuplicateTypes = 
[]string{ - containerinsightscommon.TypeContainer, - containerinsightscommon.TypePod, - containerinsightscommon.TypeNode, + containerinsightscommon.TypeGpuContainer, + containerinsightscommon.TypeGpuPod, + containerinsightscommon.TypeGpuNode, } var renameMapForDcgm = map[string]string{ @@ -59,19 +60,19 @@ func (t *translator) Translate(conf *confmap.Conf) (component.Config, error) { }, } - if isGpuEnabled(conf) { + if awscontainerinsight.AcceleratedComputeMetricsEnabled(conf) { // appends DCGM metric transform rules for each metric type (container/pod/node) with following format: // { // "include": "DCGM_FI_DEV_GPU_UTIL", // "action": "insert", // "new_name": "container_gpu_utilization", - // "operations": [ + // "operations": [ // { // "action": "add_label", - // "new_label": "Type", + // "new_label": "Type", // "new_value": "ContainerGPU", // }, - // ... + // ... // ] // }, for old, new := range renameMapForDcgm { @@ -97,7 +98,7 @@ func (t *translator) Translate(conf *confmap.Conf) (component.Config, error) { { "action": "add_label", "new_label": containerinsightscommon.MetricType, - "new_value": t + gpuLogSuffix, + "new_value": t, }, }, operations...), }) @@ -114,7 +115,3 @@ func (t *translator) Translate(conf *confmap.Conf) (component.Config, error) { return cfg, nil } - -func isGpuEnabled(conf *confmap.Conf) bool { - return common.GetOrDefaultBool(conf, common.ConfigKey(common.LogsKey, common.MetricsCollectedKey, common.KubernetesKey, common.EnableGpuMetric), true) -} diff --git a/translator/translate/otel/receiver/awscontainerinsight/translator.go b/translator/translate/otel/receiver/awscontainerinsight/translator.go index e7b8a228a6..94360518a6 100644 --- a/translator/translate/otel/receiver/awscontainerinsight/translator.go +++ b/translator/translate/otel/receiver/awscontainerinsight/translator.go @@ -112,7 +112,7 @@ func (t *translator) Translate(conf *confmap.Conf) (component.Config, error) { } cfg.PrefFullPodName = cfg.PrefFullPodName || 
common.GetOrDefaultBool(conf, common.ConfigKey(common.LogsKey, common.MetricsCollectedKey, common.KubernetesKey, common.PreferFullPodName), false) - cfg.EnableGpuMetric = cfg.EnableGpuMetric || common.GetOrDefaultBool(conf, common.ConfigKey(common.LogsKey, common.MetricsCollectedKey, common.KubernetesKey, common.EnableGpuMetric), true) + cfg.EnableAcceleratedComputeMetrics = cfg.EnableAcceleratedComputeMetrics || AcceleratedComputeMetricsEnabled(conf) return cfg, nil } diff --git a/translator/translate/otel/receiver/awscontainerinsight/granularity.go b/translator/translate/otel/receiver/awscontainerinsight/utils.go similarity index 77% rename from translator/translate/otel/receiver/awscontainerinsight/granularity.go rename to translator/translate/otel/receiver/awscontainerinsight/utils.go index e0d364081b..6d5a7e3ff9 100644 --- a/translator/translate/otel/receiver/awscontainerinsight/granularity.go +++ b/translator/translate/otel/receiver/awscontainerinsight/utils.go @@ -23,3 +23,7 @@ func EnhancedContainerInsightsEnabled(conf *confmap.Conf) bool { } return isSet } + +func AcceleratedComputeMetricsEnabled(conf *confmap.Conf) bool { + return common.GetOrDefaultBool(conf, common.ConfigKey(common.LogsKey, common.MetricsCollectedKey, common.KubernetesKey, common.EnableAcceleratedComputingMetric), true) +} From c47375d560a95f8a1c4408052de972cba22404c8 Mon Sep 17 00:00:00 2001 From: Hyunsoo Kim Date: Thu, 29 Feb 2024 22:00:02 -0500 Subject: [PATCH 13/20] prebuild gpu attribute filter lists update test cases update feature toggle variable name --- internal/containerinsightscommon/const.go | 2 +- plugins/processors/gpuattributes/processor.go | 48 +++++++++---------- .../gpuattributes/processor_test.go | 4 +- .../emf_and_kubernetes_config.yaml | 2 - .../emf_and_kubernetes_with_gpu_config.yaml | 4 +- .../kubernetes_on_prem_config.yaml | 2 - .../logs_and_kubernetes_config.yaml | 2 - translator/translate/otel/common/common.go | 2 +- .../pipeline/containerinsights/translator.go 
| 3 +- .../containerinsights/translator_test.go | 2 +- .../receiver/awscontainerinsight/utils.go | 2 +- 11 files changed, 35 insertions(+), 38 deletions(-) diff --git a/internal/containerinsightscommon/const.go b/internal/containerinsightscommon/const.go index 4922050949..fd534396f7 100644 --- a/internal/containerinsightscommon/const.go +++ b/internal/containerinsightscommon/const.go @@ -10,7 +10,7 @@ const ( ClusterNameKey = "ClusterName" NodeNameKey = "NodeName" // Attribute names - InstanceIdKey = "InstanceIdKey" + InstanceIdKey = "InstanceId" InstanceTypeKey = "InstanceType" AutoScalingGroupNameKey = "AutoScalingGroupName" VersionKey = "Version" diff --git a/plugins/processors/gpuattributes/processor.go b/plugins/processors/gpuattributes/processor.go index c9e8a90eb6..cd5137453e 100644 --- a/plugins/processors/gpuattributes/processor.go +++ b/plugins/processors/gpuattributes/processor.go @@ -40,8 +40,7 @@ const ( // - ClusterName // - ClusterName, InstanceIdKey, NodeName // - ClusterName, InstanceIdKey, NodeName, GpuDevice - -var commonLabels = []string{ +var nodeLabels = []string{ containerinsightscommon.ClusterNameKey, containerinsightscommon.InstanceIdKey, containerinsightscommon.GpuDeviceKey, @@ -51,27 +50,30 @@ var commonLabels = []string{ containerinsightscommon.SourcesKey, containerinsightscommon.Timestamp, } - -var podAndContainerLabels = []string{ +var podLabels = append([]string{ containerinsightscommon.K8sNamespace, containerinsightscommon.FullPodNameKey, containerinsightscommon.PodNameKey, containerinsightscommon.TypeService, containerinsightscommon.GpuUniqueId, -} +}, nodeLabels...) +var containerLabels = append([]string{ + containerinsightscommon.ContainerNamekey, +}, podLabels...) 
-var containerK8sBlobLabels = []string{ - "container_name", - "containerd", -} -var podK8sBlobLabels = []string{ +var nodeK8sLabels = []string{containerinsightscommon.HostKey} +var podK8sLabels = append([]string{ "host", "labels", "pod_id", "pod_name", "pod_owners", "namespace", -} +}, nodeK8sLabels...) +var containerK8sLabels = append([]string{ + "container_name", + "containerd", +}, podK8sLabels...) type gpuAttributesProcessor struct { *Config @@ -109,24 +111,22 @@ func (d *gpuAttributesProcessor) processMetricAttributes(m pmetric.Metric) { return } - var labels []string - labels = append(labels, commonLabels...) - k8sBlobLabels := []string{containerinsightscommon.HostKey} + var labels, k8sBlobLabels []string if strings.HasPrefix(m.Name(), gpuContainerMetricPrefix) { - labels = append(labels, podAndContainerLabels...) - labels = append(labels, containerinsightscommon.ContainerNamekey) - k8sBlobLabels = append(k8sBlobLabels, containerK8sBlobLabels...) - k8sBlobLabels = append(k8sBlobLabels, podK8sBlobLabels...) + labels = containerLabels + k8sBlobLabels = containerK8sLabels } else if strings.HasPrefix(m.Name(), gpuPodMetricPrefix) { - labels = append(labels, podAndContainerLabels...) - k8sBlobLabels = append(k8sBlobLabels, podK8sBlobLabels...) 
+ labels = podLabels + k8sBlobLabels = podK8sLabels + } else if strings.HasPrefix(m.Name(), gpuNodeMetricPrefix) { + labels = nodeLabels + k8sBlobLabels = nodeK8sLabels } labelFilter := map[string]map[string]interface{}{} for _, attr := range labels { labelFilter[attr] = nil } - k8sBlobMap := map[string]interface{}{} for _, attr := range k8sBlobLabels { k8sBlobMap[attr] = nil @@ -156,10 +156,10 @@ func (d *gpuAttributesProcessor) filterAttributes(attributes pcommon.Map, labels } // remove labels that are not in the keep list attributes.RemoveIf(func(k string, _ pcommon.Value) bool { - if _, ok := labels[k]; !ok { - return true + if _, ok := labels[k]; ok { + return false } - return false + return true }) // if a label has child level filter list, that means the label is map type diff --git a/plugins/processors/gpuattributes/processor_test.go b/plugins/processors/gpuattributes/processor_test.go index 25fec605be..c96b2178e6 100644 --- a/plugins/processors/gpuattributes/processor_test.go +++ b/plugins/processors/gpuattributes/processor_test.go @@ -5,6 +5,7 @@ package gpuattributes import ( "context" + "fmt" "testing" "github.com/grafana/regexp" @@ -68,7 +69,8 @@ func TestProcessMetrics(t *testing.T) { }, } - for _, tc := range testcases { + for tname, tc := range testcases { + fmt.Printf("running %s\n", tname) ms, _ := gp.processMetrics(ctx, tc.metrics) attrs := ms.ResourceMetrics().At(0).ScopeMetrics().At(0).Metrics().At(0).Gauge().DataPoints().At(0).Attributes() assert.Equal(t, len(tc.want), attrs.Len()) diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml index 5ef38a2444..a93fb7fc4d 100644 --- a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_config.yaml @@ -442,7 +442,6 @@ processors: new_name: apiserver_request_total_5xx operations: [] submatch_case: "" - gpu/containerinsights: {} receivers: 
awscontainerinsightreceiver: add_container_name_metric_label: true @@ -523,7 +522,6 @@ service: processors: - metricstransform/containerinsights - batch/containerinsights - - gpu/containerinsights receivers: - awscontainerinsightreceiver telemetry: diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml index 47706f1636..2fe3980845 100644 --- a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml @@ -513,7 +513,7 @@ processors: send_batch_max_size: 0 send_batch_size: 8192 timeout: 5s - gpu/containerinsights: {} + gpuattributes/containerinsights: {} metricstransform/containerinsights: transforms: - action: insert @@ -1038,7 +1038,7 @@ service: processors: - metricstransform/containerinsights - batch/containerinsights - - gpu/containerinsights + - gpuattributes/containerinsights receivers: - awscontainerinsightreceiver telemetry: diff --git a/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml b/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml index b8a9c4fd91..8ed36b21d0 100644 --- a/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml +++ b/translator/tocwconfig/sampleConfig/kubernetes_on_prem_config.yaml @@ -402,7 +402,6 @@ processors: new_name: apiserver_request_total_5xx operations: [] submatch_case: "" - gpu/containerinsights: {} receivers: awscontainerinsightreceiver: add_container_name_metric_label: true @@ -441,7 +440,6 @@ service: processors: - metricstransform/containerinsights - batch/containerinsights - - gpu/containerinsights receivers: - awscontainerinsightreceiver telemetry: diff --git a/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml b/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml index f7281de8d5..7c7f742159 100644 --- 
a/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml +++ b/translator/tocwconfig/sampleConfig/logs_and_kubernetes_config.yaml @@ -440,7 +440,6 @@ processors: new_name: apiserver_request_total_5xx operations: [] submatch_case: "" - gpu/containerinsights: {} receivers: awscontainerinsightreceiver: add_container_name_metric_label: true @@ -520,7 +519,6 @@ service: processors: - metricstransform/containerinsights - batch/containerinsights - - gpu/containerinsights receivers: - awscontainerinsightreceiver telemetry: diff --git a/translator/translate/otel/common/common.go b/translator/translate/otel/common/common.go index 083f0d1e38..61cfba9d03 100644 --- a/translator/translate/otel/common/common.go +++ b/translator/translate/otel/common/common.go @@ -45,7 +45,7 @@ const ( ContainerInsightsMetricGranularity = "metric_granularity" // replaced with enhanced_container_insights EnhancedContainerInsights = "enhanced_container_insights" PreferFullPodName = "prefer_full_pod_name" - EnableAcceleratedComputingMetric = "accelerated_compute_metrics" + EnableAcceleratedComputeMetric = "accelerated_compute_metrics" Console = "console" DiskIOKey = "diskio" NetKey = "net" diff --git a/translator/translate/otel/pipeline/containerinsights/translator.go b/translator/translate/otel/pipeline/containerinsights/translator.go index e1b83d8cd5..4e5169ffac 100644 --- a/translator/translate/otel/pipeline/containerinsights/translator.go +++ b/translator/translate/otel/pipeline/containerinsights/translator.go @@ -51,11 +51,12 @@ func (t *translator) Translate(conf *confmap.Conf) (*common.ComponentTranslators // Append the metricstransformprocessor only if enhanced container insights is enabled enhancedContainerInsightsEnabled := awscontainerinsight.EnhancedContainerInsightsEnabled(conf) if enhancedContainerInsightsEnabled { - processors := common.NewTranslatorMap(metricstransformprocessor.NewTranslatorWithName(pipelineName), 
batchprocessor.NewTranslatorWithNameAndSection(pipelineName, common.LogsKey)) + processors := common.NewTranslatorMap(metricstransformprocessor.NewTranslatorWithName(pipelineName)) acceleratedComputeMetricsEnabled := awscontainerinsight.AcceleratedComputeMetricsEnabled(conf) if acceleratedComputeMetricsEnabled { processors.Set(gpu.NewTranslatorWithName(pipelineName)) } + processors.Set(batchprocessor.NewTranslatorWithNameAndSection(pipelineName, common.LogsKey)) return &common.ComponentTranslators{ Receivers: common.NewTranslatorMap(awscontainerinsight.NewTranslator()), Processors: processors, // EKS & ECS CI sit under metrics_collected in "logs" diff --git a/translator/translate/otel/pipeline/containerinsights/translator_test.go b/translator/translate/otel/pipeline/containerinsights/translator_test.go index 70f723903f..68445b5901 100644 --- a/translator/translate/otel/pipeline/containerinsights/translator_test.go +++ b/translator/translate/otel/pipeline/containerinsights/translator_test.go @@ -81,7 +81,7 @@ func TestTranslator(t *testing.T) { want: &want{ pipelineType: "metrics/containerinsights", receivers: []string{"awscontainerinsightreceiver"}, - processors: []string{"metricstransform/containerinsights", "batch/containerinsights", "gpu/containerinsights"}, + processors: []string{"metricstransform/containerinsights", "batch/containerinsights", "gpuattributes/containerinsights"}, exporters: []string{"awsemf/containerinsights"}, extensions: []string{"agenthealth/logs"}, }, diff --git a/translator/translate/otel/receiver/awscontainerinsight/utils.go b/translator/translate/otel/receiver/awscontainerinsight/utils.go index 6d5a7e3ff9..721951b056 100644 --- a/translator/translate/otel/receiver/awscontainerinsight/utils.go +++ b/translator/translate/otel/receiver/awscontainerinsight/utils.go @@ -25,5 +25,5 @@ func EnhancedContainerInsightsEnabled(conf *confmap.Conf) bool { } func AcceleratedComputeMetricsEnabled(conf *confmap.Conf) bool { - return 
common.GetOrDefaultBool(conf, common.ConfigKey(common.LogsKey, common.MetricsCollectedKey, common.KubernetesKey, common.EnableAcceleratedComputingMetric), true) + return common.GetOrDefaultBool(conf, common.ConfigKey(common.LogsKey, common.MetricsCollectedKey, common.KubernetesKey, common.EnableAcceleratedComputeMetric), true) } From 8037d9bf47fd8d85d56afd7e1d620d9190e62b6c Mon Sep 17 00:00:00 2001 From: Hyunsoo Kim Date: Thu, 29 Feb 2024 22:03:15 -0500 Subject: [PATCH 14/20] fix format --- plugins/processors/gpuattributes/processor.go | 3 ++- .../otel/processor/metricstransformprocessor/translator.go | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/plugins/processors/gpuattributes/processor.go b/plugins/processors/gpuattributes/processor.go index cd5137453e..38e2a392dd 100644 --- a/plugins/processors/gpuattributes/processor.go +++ b/plugins/processors/gpuattributes/processor.go @@ -8,10 +8,11 @@ import ( "encoding/json" "strings" - "github.com/aws/amazon-cloudwatch-agent/internal/containerinsightscommon" "go.opentelemetry.io/collector/pdata/pcommon" "go.opentelemetry.io/collector/pdata/pmetric" "go.uber.org/zap" + + "github.com/aws/amazon-cloudwatch-agent/internal/containerinsightscommon" ) const ( diff --git a/translator/translate/otel/processor/metricstransformprocessor/translator.go b/translator/translate/otel/processor/metricstransformprocessor/translator.go index 8604de207f..316cd7221a 100644 --- a/translator/translate/otel/processor/metricstransformprocessor/translator.go +++ b/translator/translate/otel/processor/metricstransformprocessor/translator.go @@ -6,12 +6,13 @@ package metricstransformprocessor import ( "fmt" - "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/receiver/awscontainerinsight" "github.com/open-telemetry/opentelemetry-collector-contrib/processor/metricstransformprocessor" "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/confmap" "go.opentelemetry.io/collector/processor" + 
"github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/receiver/awscontainerinsight" + "github.com/aws/amazon-cloudwatch-agent/internal/containerinsightscommon" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/common" ) From 312a86bef15c8798b0c76c216d2db52b1684d1c9 Mon Sep 17 00:00:00 2001 From: Hyunsoo Kim Date: Thu, 29 Feb 2024 22:13:26 -0500 Subject: [PATCH 15/20] update test --- .../otel/pipeline/containerinsights/translator_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/translator/translate/otel/pipeline/containerinsights/translator_test.go b/translator/translate/otel/pipeline/containerinsights/translator_test.go index 68445b5901..d3a4bb67a1 100644 --- a/translator/translate/otel/pipeline/containerinsights/translator_test.go +++ b/translator/translate/otel/pipeline/containerinsights/translator_test.go @@ -81,7 +81,7 @@ func TestTranslator(t *testing.T) { want: &want{ pipelineType: "metrics/containerinsights", receivers: []string{"awscontainerinsightreceiver"}, - processors: []string{"metricstransform/containerinsights", "batch/containerinsights", "gpuattributes/containerinsights"}, + processors: []string{"metricstransform/containerinsights", "gpuattributes/containerinsights", "batch/containerinsights"}, exporters: []string{"awsemf/containerinsights"}, extensions: []string{"agenthealth/logs"}, }, From 6452bcd45b0fbf13748b1feb8faaf812e226c7dc Mon Sep 17 00:00:00 2001 From: Hyunsoo Kim Date: Thu, 29 Feb 2024 23:15:36 -0500 Subject: [PATCH 16/20] prepopulate label filter --- plugins/processors/gpuattributes/processor.go | 117 ++++++++++-------- 1 file changed, 65 insertions(+), 52 deletions(-) diff --git a/plugins/processors/gpuattributes/processor.go b/plugins/processors/gpuattributes/processor.go index 38e2a392dd..6d54eece81 100644 --- a/plugins/processors/gpuattributes/processor.go +++ b/plugins/processors/gpuattributes/processor.go @@ -41,40 +41,68 @@ const ( // - ClusterName // - ClusterName, 
InstanceIdKey, NodeName // - ClusterName, InstanceIdKey, NodeName, GpuDevice -var nodeLabels = []string{ - containerinsightscommon.ClusterNameKey, - containerinsightscommon.InstanceIdKey, - containerinsightscommon.GpuDeviceKey, - containerinsightscommon.MetricType, - containerinsightscommon.NodeNameKey, - containerinsightscommon.VersionKey, - containerinsightscommon.SourcesKey, - containerinsightscommon.Timestamp, +var containerLabelFilter = map[string]map[string]interface{}{ + containerinsightscommon.ClusterNameKey: nil, + containerinsightscommon.InstanceIdKey: nil, + containerinsightscommon.GpuDeviceKey: nil, + containerinsightscommon.MetricType: nil, + containerinsightscommon.NodeNameKey: nil, + containerinsightscommon.K8sNamespace: nil, + containerinsightscommon.FullPodNameKey: nil, + containerinsightscommon.PodNameKey: nil, + containerinsightscommon.TypeService: nil, + containerinsightscommon.GpuUniqueId: nil, + containerinsightscommon.ContainerNamekey: nil, + containerinsightscommon.VersionKey: nil, + containerinsightscommon.SourcesKey: nil, + containerinsightscommon.Timestamp: nil, + containerinsightscommon.K8sKey: { + containerinsightscommon.HostKey: nil, + "labels": nil, + "pod_id": nil, + "pod_name": nil, + "pod_owners": nil, + "namespace": nil, + "container_name": nil, + "containerd": nil, + }, +} +var podLabelFilter = map[string]map[string]interface{}{ + containerinsightscommon.ClusterNameKey: nil, + containerinsightscommon.InstanceIdKey: nil, + containerinsightscommon.GpuDeviceKey: nil, + containerinsightscommon.MetricType: nil, + containerinsightscommon.NodeNameKey: nil, + containerinsightscommon.K8sNamespace: nil, + containerinsightscommon.FullPodNameKey: nil, + containerinsightscommon.PodNameKey: nil, + containerinsightscommon.TypeService: nil, + containerinsightscommon.GpuUniqueId: nil, + containerinsightscommon.VersionKey: nil, + containerinsightscommon.SourcesKey: nil, + containerinsightscommon.Timestamp: nil, + containerinsightscommon.K8sKey: { 
+ containerinsightscommon.HostKey: nil, + "labels": nil, + "pod_id": nil, + "pod_name": nil, + "pod_owners": nil, + "namespace": nil, + }, +} +var nodeLabelFilter = map[string]map[string]interface{}{ + containerinsightscommon.ClusterNameKey: nil, + containerinsightscommon.InstanceIdKey: nil, + containerinsightscommon.GpuDeviceKey: nil, + containerinsightscommon.MetricType: nil, + containerinsightscommon.NodeNameKey: nil, + containerinsightscommon.VersionKey: nil, + containerinsightscommon.SourcesKey: nil, + containerinsightscommon.Timestamp: nil, + containerinsightscommon.K8sKey: { + containerinsightscommon.HostKey: nil, + }, } -var podLabels = append([]string{ - containerinsightscommon.K8sNamespace, - containerinsightscommon.FullPodNameKey, - containerinsightscommon.PodNameKey, - containerinsightscommon.TypeService, - containerinsightscommon.GpuUniqueId, -}, nodeLabels...) -var containerLabels = append([]string{ - containerinsightscommon.ContainerNamekey, -}, podLabels...) - -var nodeK8sLabels = []string{containerinsightscommon.HostKey} -var podK8sLabels = append([]string{ - "host", - "labels", - "pod_id", - "pod_name", - "pod_owners", - "namespace", -}, nodeK8sLabels...) -var containerK8sLabels = append([]string{ - "container_name", - "containerd", -}, podK8sLabels...) 
type gpuAttributesProcessor struct { *Config @@ -112,28 +140,13 @@ func (d *gpuAttributesProcessor) processMetricAttributes(m pmetric.Metric) { return } - var labels, k8sBlobLabels []string + labelFilter := map[string]map[string]interface{}{} if strings.HasPrefix(m.Name(), gpuContainerMetricPrefix) { - labels = containerLabels - k8sBlobLabels = containerK8sLabels + labelFilter = containerLabelFilter } else if strings.HasPrefix(m.Name(), gpuPodMetricPrefix) { - labels = podLabels - k8sBlobLabels = podK8sLabels + labelFilter = podLabelFilter } else if strings.HasPrefix(m.Name(), gpuNodeMetricPrefix) { - labels = nodeLabels - k8sBlobLabels = nodeK8sLabels - } - - labelFilter := map[string]map[string]interface{}{} - for _, attr := range labels { - labelFilter[attr] = nil - } - k8sBlobMap := map[string]interface{}{} - for _, attr := range k8sBlobLabels { - k8sBlobMap[attr] = nil - } - if len(k8sBlobMap) > 0 { - labelFilter[containerinsightscommon.K8sKey] = k8sBlobMap + labelFilter = nodeLabelFilter } var dps pmetric.NumberDataPointSlice From c70fefc284cefd874ad035ad5cd0c54f0902f108 Mon Sep 17 00:00:00 2001 From: Hyunsoo Kim Date: Thu, 29 Feb 2024 23:19:52 -0500 Subject: [PATCH 17/20] format --- .../otel/processor/metricstransformprocessor/translator.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/translator/translate/otel/processor/metricstransformprocessor/translator.go b/translator/translate/otel/processor/metricstransformprocessor/translator.go index 316cd7221a..8d7ed7abb6 100644 --- a/translator/translate/otel/processor/metricstransformprocessor/translator.go +++ b/translator/translate/otel/processor/metricstransformprocessor/translator.go @@ -6,15 +6,14 @@ package metricstransformprocessor import ( "fmt" - "github.com/open-telemetry/opentelemetry-collector-contrib/processor/metricstransformprocessor" "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/confmap" "go.opentelemetry.io/collector/processor" - 
"github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/receiver/awscontainerinsight" - "github.com/aws/amazon-cloudwatch-agent/internal/containerinsightscommon" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/common" + "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/receiver/awscontainerinsight" + "github.com/open-telemetry/opentelemetry-collector-contrib/processor/metricstransformprocessor" ) const gpuLogSuffix = "GPU" From 7803297d4856e15001007588b8516047fd88570a Mon Sep 17 00:00:00 2001 From: Hyunsoo Kim Date: Fri, 1 Mar 2024 10:26:17 -0500 Subject: [PATCH 18/20] format --- .../otel/processor/metricstransformprocessor/translator.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/translator/translate/otel/processor/metricstransformprocessor/translator.go b/translator/translate/otel/processor/metricstransformprocessor/translator.go index 8d7ed7abb6..ff0996b6ca 100644 --- a/translator/translate/otel/processor/metricstransformprocessor/translator.go +++ b/translator/translate/otel/processor/metricstransformprocessor/translator.go @@ -6,6 +6,7 @@ package metricstransformprocessor import ( "fmt" + "github.com/open-telemetry/opentelemetry-collector-contrib/processor/metricstransformprocessor" "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/confmap" "go.opentelemetry.io/collector/processor" @@ -13,7 +14,6 @@ import ( "github.com/aws/amazon-cloudwatch-agent/internal/containerinsightscommon" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/common" "github.com/aws/amazon-cloudwatch-agent/translator/translate/otel/receiver/awscontainerinsight" - "github.com/open-telemetry/opentelemetry-collector-contrib/processor/metricstransformprocessor" ) const gpuLogSuffix = "GPU" From 1d1fef02663e4fb0bf0aaeecdf63cc0c92a79e23 Mon Sep 17 00:00:00 2001 From: Hyunsoo Kim Date: Fri, 1 Mar 2024 17:24:03 -0500 Subject: [PATCH 19/20] update otel contrib --- go.mod | 44 
++++++++++++++---------------- go.sum | 84 ++++++++++++++++++++++++++-------------------------------- 2 files changed, 58 insertions(+), 70 deletions(-) diff --git a/go.mod b/go.mod index f3f2d2ff0f..8beac47726 100644 --- a/go.mod +++ b/go.mod @@ -7,38 +7,38 @@ replace github.com/influxdata/telegraf => github.com/aws/telegraf v0.10.2-0.2023 // Replace with https://github.com/amazon-contributing/opentelemetry-collector-contrib, there are no requirements for all receivers/processors/exporters // to be all replaced since there are some changes that will always be from upstream replace ( - github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awscloudwatchlogsexporter => github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awscloudwatchlogsexporter v0.0.0-20240301150504-be26c7745633 - github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awsemfexporter => github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsemfexporter v0.0.0-20240301150504-be26c7745633 - github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awsxrayexporter => github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsxrayexporter v0.0.0-20240301150504-be26c7745633 + github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awscloudwatchlogsexporter => github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awscloudwatchlogsexporter v0.0.0-20240301221517-9cb314e7d27b + github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awsemfexporter => github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsemfexporter v0.0.0-20240301221517-9cb314e7d27b + github.com/open-telemetry/opentelemetry-collector-contrib/exporter/awsxrayexporter => github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsxrayexporter v0.0.0-20240301221517-9cb314e7d27b ) -replace github.com/open-telemetry/opentelemetry-collector-contrib/extension/awsproxy => 
github.com/amazon-contributing/opentelemetry-collector-contrib/extension/awsproxy v0.0.0-20240301150504-be26c7745633 +replace github.com/open-telemetry/opentelemetry-collector-contrib/extension/awsproxy => github.com/amazon-contributing/opentelemetry-collector-contrib/extension/awsproxy v0.0.0-20240301221517-9cb314e7d27b replace ( - github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/awsutil => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/awsutil v0.0.0-20240301150504-be26c7745633 - github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/containerinsight => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/containerinsight v0.0.0-20240301150504-be26c7745633 - github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/cwlogs => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/cwlogs v0.0.0-20240301150504-be26c7745633 - github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/k8s => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/k8s v0.0.0-20240301150504-be26c7745633 - github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/proxy => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/proxy v0.0.0-20240301150504-be26c7745633 - github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/xray => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/xray v0.0.0-20240301150504-be26c7745633 + github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/awsutil => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/awsutil v0.0.0-20240301221517-9cb314e7d27b + github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/containerinsight => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/containerinsight v0.0.0-20240301221517-9cb314e7d27b + 
github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/cwlogs => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/cwlogs v0.0.0-20240301221517-9cb314e7d27b + github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/k8s => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/k8s v0.0.0-20240301221517-9cb314e7d27b + github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/proxy => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/proxy v0.0.0-20240301221517-9cb314e7d27b + github.com/open-telemetry/opentelemetry-collector-contrib/internal/aws/xray => github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/xray v0.0.0-20240301221517-9cb314e7d27b ) replace ( - github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza => github.com/amazon-contributing/opentelemetry-collector-contrib/pkg/stanza v0.0.0-20240301150504-be26c7745633 + github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza => github.com/amazon-contributing/opentelemetry-collector-contrib/pkg/stanza v0.0.0-20240301221517-9cb314e7d27b // Replace with contrib to revert upstream change https://github.com/open-telemetry/opentelemetry-collector-contrib/pull/20519 - github.com/open-telemetry/opentelemetry-collector-contrib/pkg/translator/prometheus => github.com/amazon-contributing/opentelemetry-collector-contrib/pkg/translator/prometheus v0.0.0-20240301150504-be26c7745633 + github.com/open-telemetry/opentelemetry-collector-contrib/pkg/translator/prometheus => github.com/amazon-contributing/opentelemetry-collector-contrib/pkg/translator/prometheus v0.0.0-20240301221517-9cb314e7d27b ) -replace github.com/open-telemetry/opentelemetry-collector-contrib/processor/resourcedetectionprocessor => github.com/amazon-contributing/opentelemetry-collector-contrib/processor/resourcedetectionprocessor v0.0.0-20240301150504-be26c7745633 +replace 
github.com/open-telemetry/opentelemetry-collector-contrib/processor/resourcedetectionprocessor => github.com/amazon-contributing/opentelemetry-collector-contrib/processor/resourcedetectionprocessor v0.0.0-20240301221517-9cb314e7d27b replace ( - github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver => github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver v0.0.0-20240301150504-be26c7745633 - github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awsxrayreceiver => github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awsxrayreceiver v0.0.0-20240301150504-be26c7745633 - github.com/open-telemetry/opentelemetry-collector-contrib/receiver/prometheusreceiver => github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/prometheusreceiver v0.0.0-20240301150504-be26c7745633 + github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver => github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver v0.0.0-20240301221517-9cb314e7d27b + github.com/open-telemetry/opentelemetry-collector-contrib/receiver/awsxrayreceiver => github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awsxrayreceiver v0.0.0-20240301221517-9cb314e7d27b + github.com/open-telemetry/opentelemetry-collector-contrib/receiver/prometheusreceiver => github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/prometheusreceiver v0.0.0-20240301221517-9cb314e7d27b ) // Omit fields in HTTPClientSettings. 
Pending fix for https://github.com/open-telemetry/opentelemetry-collector/issues/8627 -replace go.opentelemetry.io/collector/config/confighttp => github.com/amazon-contributing/opentelemetry-collector-contrib/config/confighttp v0.0.0-20240301150504-be26c7745633 +replace go.opentelemetry.io/collector/config/confighttp => github.com/amazon-contributing/opentelemetry-collector-contrib/config/confighttp v0.0.0-20240301221517-9cb314e7d27b // Temporary fix, pending PR https://github.com/shirou/gopsutil/pull/957 replace github.com/shirou/gopsutil/v3 => github.com/aws/telegraf/patches/gopsutil/v3 v3.0.0-20231109213610-a8c21c54a2be // indirect @@ -93,7 +93,7 @@ replace github.com/aws/aws-sdk-go => github.com/aws/aws-sdk-go v1.48.6 require ( github.com/BurntSushi/toml v1.3.2 github.com/Jeffail/gabs v1.4.0 - github.com/amazon-contributing/opentelemetry-collector-contrib/extension/awsmiddleware v0.0.0-20240301150504-be26c7745633 + github.com/amazon-contributing/opentelemetry-collector-contrib/extension/awsmiddleware v0.0.0-20240301221517-9cb314e7d27b github.com/aws/aws-sdk-go v1.48.6 github.com/aws/aws-sdk-go-v2/service/cloudwatch v1.30.2 github.com/bigkevmcd/go-configparser v0.0.0-20200217161103-d137835d2579 @@ -103,6 +103,7 @@ require ( github.com/gobwas/glob v0.2.3 github.com/google/go-cmp v0.6.0 github.com/google/uuid v1.4.0 + github.com/grafana/regexp v0.0.0-20221122212121-6b5c0a4cb7fd github.com/hashicorp/golang-lru v1.0.2 github.com/influxdata/telegraf v0.0.0-00010101000000-000000000000 github.com/influxdata/wlog v0.0.0-20160411224016-7c63b0a71ef8 @@ -183,12 +184,11 @@ require ( github.com/Azure/go-autorest/tracing v0.6.0 // indirect github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.20.0 // indirect github.com/Microsoft/go-winio v0.6.1 // indirect - github.com/Microsoft/hcsshim v0.11.4 // indirect github.com/Showmax/go-fqdn v1.0.0 // indirect github.com/alecthomas/participle v0.4.1 // indirect github.com/alecthomas/participle/v2 v2.1.0 // 
indirect github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137 // indirect - github.com/amazon-contributing/opentelemetry-collector-contrib/override/aws v0.0.0-20240301150504-be26c7745633 // indirect + github.com/amazon-contributing/opentelemetry-collector-contrib/override/aws v0.0.0-20240301221517-9cb314e7d27b // indirect github.com/antchfx/jsonquery v1.1.5 // indirect github.com/antchfx/xmlquery v1.3.9 // indirect github.com/antchfx/xpath v1.2.0 // indirect @@ -211,9 +211,7 @@ require ( github.com/checkpoint-restore/go-criu/v5 v5.3.0 // indirect github.com/cilium/ebpf v0.9.1 // indirect github.com/cncf/xds/go v0.0.0-20230607035331-e9ce68804cb4 // indirect - github.com/containerd/cgroups v1.1.0 // indirect github.com/containerd/console v1.0.3 // indirect - github.com/containerd/containerd v1.7.7 // indirect github.com/containerd/ttrpc v1.2.2 // indirect github.com/coreos/go-semver v0.3.0 // indirect github.com/coreos/go-systemd/v22 v22.5.0 // indirect @@ -259,7 +257,6 @@ require ( github.com/gophercloud/gophercloud v1.7.0 // indirect github.com/gorilla/websocket v1.5.0 // indirect github.com/gosnmp/gosnmp v1.34.0 // indirect - github.com/grafana/regexp v0.0.0-20221122212121-6b5c0a4cb7fd // indirect github.com/grpc-ecosystem/grpc-gateway/v2 v2.18.1 // indirect github.com/hashicorp/consul/api v1.25.1 // indirect github.com/hashicorp/cronexpr v1.1.2 // indirect @@ -424,7 +421,6 @@ require ( gopkg.in/yaml.v2 v2.4.0 // indirect k8s.io/klog v1.0.0 // indirect k8s.io/kube-openapi v0.0.0-20230717233707-2695361300d9 // indirect - k8s.io/kubelet v0.28.3 // indirect k8s.io/utils v0.0.0-20230711102312-30195339c3c7 // indirect sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect sigs.k8s.io/structured-merge-diff/v4 v4.3.0 // indirect diff --git a/go.sum b/go.sum index b33a6a24a5..e104a0946c 100644 --- a/go.sum +++ b/go.sum @@ -103,8 +103,6 @@ github.com/Knetic/govaluate v3.0.1-0.20171022003610-9aa49832a739+incompatible/go github.com/Mellanox/rdmamap 
v0.0.0-20191106181932-7c3c4763a6ee h1:atI/FFjXh6hIVlPE1Jup9m8N4B9q/OSbMUe2EBahs+w= github.com/Microsoft/go-winio v0.6.1 h1:9/kr64B9VUZrLm5YYwbGtUJnMgqWVOdUAXu6Migciow= github.com/Microsoft/go-winio v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM= -github.com/Microsoft/hcsshim v0.11.4 h1:68vKo2VN8DE9AdN4tnkWnmdhqdbpUFM8OF3Airm7fz8= -github.com/Microsoft/hcsshim v0.11.4/go.mod h1:smjE4dvqPX9Zldna+t5FG3rnoHhaB7QYxPRqGcpAD9w= github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ= github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= github.com/PuerkitoBio/purell v1.1.1/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= @@ -136,44 +134,44 @@ github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d/go.mod h1:rBZYJk5 github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137 h1:s6gZFSlWYmbqAuRjVTiNNhvNRfY2Wxp9nhfyel4rklc= github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137/go.mod h1:OMCwj8VM1Kc9e19TLln2VL61YJF0x1XFtfdL4JdbSyE= github.com/aliyun/alibaba-cloud-sdk-go v1.61.1483 h1:J8HaD+Zpfi1gcel3HCKpoHHEsrcuRrZlSnx7R9SCf5I= -github.com/amazon-contributing/opentelemetry-collector-contrib/config/confighttp v0.0.0-20240301150504-be26c7745633 h1:e3DwgRDdP9phdp6k9twVcu2Ny0vfAzc3Rwl0eMOfq0Y= -github.com/amazon-contributing/opentelemetry-collector-contrib/config/confighttp v0.0.0-20240301150504-be26c7745633/go.mod h1:3sU3HgF5wc32CVljnzGo4Fn/9+T0N1Z6tCJyKdW2MvM= -github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awscloudwatchlogsexporter v0.0.0-20240301150504-be26c7745633 h1:+FvOYuT7AviGaOo0ZMUeJw5JPKR9/u9kPbbhgPLcpNY= -github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awscloudwatchlogsexporter v0.0.0-20240301150504-be26c7745633/go.mod h1:9L23Mib5WjvuWRMmLyZrH+OJyeDz0fEZ9e2ummzZlgU= -github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsemfexporter 
v0.0.0-20240301150504-be26c7745633 h1:O0R9PN9rFJoza6wNfAy6EPPPqBC794mgzFMMUf1NmWw= -github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsemfexporter v0.0.0-20240301150504-be26c7745633/go.mod h1:wTGyQcKa708Ci78kpp+YNr4budNC5QRQ2tiD7LXFRwY= -github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsxrayexporter v0.0.0-20240301150504-be26c7745633 h1:Apvmp24bx02iA7obU01JjeRSRIKM18XJfMA/FI8FYqs= -github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsxrayexporter v0.0.0-20240301150504-be26c7745633/go.mod h1:n5I9WMdSLqf1nngS88/Os8Ts2xF+jd0RxGVfEP46Xsk= -github.com/amazon-contributing/opentelemetry-collector-contrib/extension/awsmiddleware v0.0.0-20240301150504-be26c7745633 h1:BvCQAUlMhOOeFQZ+WrEfqtEllIcCyZkG9vE4MAL6f40= -github.com/amazon-contributing/opentelemetry-collector-contrib/extension/awsmiddleware v0.0.0-20240301150504-be26c7745633/go.mod h1:5JOe6ISApVHBIsZuLb8ppaY06ujDcHCxYJE5wCymNoI= -github.com/amazon-contributing/opentelemetry-collector-contrib/extension/awsproxy v0.0.0-20240301150504-be26c7745633 h1:0GMY6w5wgABApvm6IniaB+oCXKOMfYNokpVzPV6AcKw= -github.com/amazon-contributing/opentelemetry-collector-contrib/extension/awsproxy v0.0.0-20240301150504-be26c7745633/go.mod h1:9EAh+a7cph0PYSAW5xTv9HGMdWopom2dFUJd1AgwMNc= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/awsutil v0.0.0-20240301150504-be26c7745633 h1:hEdUY7ESfD1CludrZDhTz+Z/1zZNfUkrXW8GtBhankY= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/awsutil v0.0.0-20240301150504-be26c7745633/go.mod h1:Mpe0DhouTXYDk/DyFDyQbjKpVxRTqahohQT5lidS2jY= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/containerinsight v0.0.0-20240301150504-be26c7745633 h1:8RCYHGBzAHOtBhXVuabNmMtPYgnWXW5oe27r40kO/fg= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/containerinsight v0.0.0-20240301150504-be26c7745633/go.mod 
h1:Xg5sUWQEuVshBnsZB7wxGDLf5DfzAqnFZGVbMHvoaj8= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/cwlogs v0.0.0-20240301150504-be26c7745633 h1:pkrJVynW4UiM5LaGMRPSp9KscZBfV/qw19z1AnsUGhc= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/cwlogs v0.0.0-20240301150504-be26c7745633/go.mod h1:YNWxiYSLiIxOmw5hmA6jSnK8KVyru1JLX1OQeUCtCf8= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/k8s v0.0.0-20240301150504-be26c7745633 h1:nRbxbNWWnmT7UUQPV6bWLD862c0ByonLLbROJ5+LNg0= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/k8s v0.0.0-20240301150504-be26c7745633/go.mod h1:Yv2d1E/dUBNVintLtLz8zE8RahV5m5Mo5HOnn4s19Sg= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/proxy v0.0.0-20240301150504-be26c7745633 h1:XKc3VbC43aGRd8TRCKWLJBJvdvUH5PW6cDwkanI68JU= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/proxy v0.0.0-20240301150504-be26c7745633/go.mod h1:UIrDCIdIOYvs4RAPPEz5p4nSKVw0aFLa5BrKo5ww0Is= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/xray v0.0.0-20240301150504-be26c7745633 h1:kzy07FFx4oMDxWKS1JEj0XJzsR1ogGIivQCKJ7dQYTQ= -github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/xray v0.0.0-20240301150504-be26c7745633/go.mod h1:D6lmFdWbpYRttNnfkZJSs2ZZcAUGfe96/Vrm8tiK3Tw= -github.com/amazon-contributing/opentelemetry-collector-contrib/override/aws v0.0.0-20240301150504-be26c7745633 h1:4FWWbVLVFSz4HqLaGqzM8V85G1gENX5mSmjtiMs2AZE= -github.com/amazon-contributing/opentelemetry-collector-contrib/override/aws v0.0.0-20240301150504-be26c7745633/go.mod h1:t/hYoRTnlPuRjh8y0BwVGgNvNIXpU2QJME5YVppUUHQ= -github.com/amazon-contributing/opentelemetry-collector-contrib/pkg/stanza v0.0.0-20240301150504-be26c7745633 h1:htAyOtDBxSOE/328XbE00TrxDFnNzhxL8tjmDTjjwbY= -github.com/amazon-contributing/opentelemetry-collector-contrib/pkg/stanza 
v0.0.0-20240301150504-be26c7745633/go.mod h1:7uCHpcHoawJsqoyPLxaFROWsZXPSF6/op3Hmw4pV4WE= -github.com/amazon-contributing/opentelemetry-collector-contrib/pkg/translator/prometheus v0.0.0-20240301150504-be26c7745633 h1:azbupjp6V8CIUbqMAWXsGkt7CKgu+T3KSxTKjO0fSNg= -github.com/amazon-contributing/opentelemetry-collector-contrib/pkg/translator/prometheus v0.0.0-20240301150504-be26c7745633/go.mod h1:HXv8nyJ+RUHGLZMPbaPFnWKonYWNTJfZ9ZUudqtwudw= -github.com/amazon-contributing/opentelemetry-collector-contrib/processor/resourcedetectionprocessor v0.0.0-20240301150504-be26c7745633 h1:Gm82XcMdwpsIjAodU5UW6A8FX2NLlXYLf8rdZx767vQ= -github.com/amazon-contributing/opentelemetry-collector-contrib/processor/resourcedetectionprocessor v0.0.0-20240301150504-be26c7745633/go.mod h1:L4/sIFbml9J28qd8i2aJGztmCg2FCCtfTdwikpu4JEs= -github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver v0.0.0-20240301150504-be26c7745633 h1:rI/OO0byMwR7Iaux4p6LVEE6V2OK1iNP50EM/psgaZI= -github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver v0.0.0-20240301150504-be26c7745633/go.mod h1:Pxs4/jrWY1ePIHp7wp1UMdnSWqEBLjeaxpgf5ZW+LMk= -github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awsxrayreceiver v0.0.0-20240301150504-be26c7745633 h1:uLAWP3xoDK8/4Aa4lbuAwCGL5ORYVOUdOa2Udn2HSQ0= -github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awsxrayreceiver v0.0.0-20240301150504-be26c7745633/go.mod h1:4IMYeZjU+IgZdXHuiLOIVtdp42lrMjk+rtlQpENeGSM= -github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/prometheusreceiver v0.0.0-20240301150504-be26c7745633 h1:YDhcQ+p+yMBcJF3vTNQ0ebpiXuGdvc0PwAdWN9nDQkk= -github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/prometheusreceiver v0.0.0-20240301150504-be26c7745633/go.mod h1:fnNxw30DVmpiS3tt1nUETZH3g/boGnBLx7+hYwYd9EU= +github.com/amazon-contributing/opentelemetry-collector-contrib/config/confighttp 
v0.0.0-20240301221517-9cb314e7d27b h1:JbyxYa3c8PQRpQZpEekKvWMTk2iSiT/M79WikF7vQQQ= +github.com/amazon-contributing/opentelemetry-collector-contrib/config/confighttp v0.0.0-20240301221517-9cb314e7d27b/go.mod h1:3sU3HgF5wc32CVljnzGo4Fn/9+T0N1Z6tCJyKdW2MvM= +github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awscloudwatchlogsexporter v0.0.0-20240301221517-9cb314e7d27b h1:ejIXrN48i5YDshcb+Kwy5IPgwfHvkUXMfdxzjbvVOMU= +github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awscloudwatchlogsexporter v0.0.0-20240301221517-9cb314e7d27b/go.mod h1:9L23Mib5WjvuWRMmLyZrH+OJyeDz0fEZ9e2ummzZlgU= +github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsemfexporter v0.0.0-20240301221517-9cb314e7d27b h1:82/jRNSmc+EzC63bGqfjFDSsQjPWU8OwxoJd9jpIJqI= +github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsemfexporter v0.0.0-20240301221517-9cb314e7d27b/go.mod h1:wTGyQcKa708Ci78kpp+YNr4budNC5QRQ2tiD7LXFRwY= +github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsxrayexporter v0.0.0-20240301221517-9cb314e7d27b h1:Q8OMVsAYFNVBZaxDXj2ZudGWZLeqx8kddMlAGY3GWIw= +github.com/amazon-contributing/opentelemetry-collector-contrib/exporter/awsxrayexporter v0.0.0-20240301221517-9cb314e7d27b/go.mod h1:n5I9WMdSLqf1nngS88/Os8Ts2xF+jd0RxGVfEP46Xsk= +github.com/amazon-contributing/opentelemetry-collector-contrib/extension/awsmiddleware v0.0.0-20240301221517-9cb314e7d27b h1:HC4A222OXGHCZBqeYgF3Gn7b4Slhbqdq9eb3xqmfiyE= +github.com/amazon-contributing/opentelemetry-collector-contrib/extension/awsmiddleware v0.0.0-20240301221517-9cb314e7d27b/go.mod h1:5JOe6ISApVHBIsZuLb8ppaY06ujDcHCxYJE5wCymNoI= +github.com/amazon-contributing/opentelemetry-collector-contrib/extension/awsproxy v0.0.0-20240301221517-9cb314e7d27b h1:zVFqhr3y62aiZLtYQhUNekx9FOZjt9d5vXZm/zNq9Wc= +github.com/amazon-contributing/opentelemetry-collector-contrib/extension/awsproxy v0.0.0-20240301221517-9cb314e7d27b/go.mod 
h1:9EAh+a7cph0PYSAW5xTv9HGMdWopom2dFUJd1AgwMNc= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/awsutil v0.0.0-20240301221517-9cb314e7d27b h1:6Zpwt33dpmekzVpel7D/QazzIQE6I9FSN75e5FTkVYE= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/awsutil v0.0.0-20240301221517-9cb314e7d27b/go.mod h1:Mpe0DhouTXYDk/DyFDyQbjKpVxRTqahohQT5lidS2jY= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/containerinsight v0.0.0-20240301221517-9cb314e7d27b h1:FCzUDb9pW2vnoqoTeGjsx/kGP2qIpESCJfvR6OF+j+U= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/containerinsight v0.0.0-20240301221517-9cb314e7d27b/go.mod h1:Xg5sUWQEuVshBnsZB7wxGDLf5DfzAqnFZGVbMHvoaj8= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/cwlogs v0.0.0-20240301221517-9cb314e7d27b h1:xNzkwWOjoLUiEzeOEZx/mtpnEXLh08eAjPyCCO7NreU= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/cwlogs v0.0.0-20240301221517-9cb314e7d27b/go.mod h1:YNWxiYSLiIxOmw5hmA6jSnK8KVyru1JLX1OQeUCtCf8= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/k8s v0.0.0-20240301221517-9cb314e7d27b h1:l3uTt3PUxGpypKwU0YudxWILtybWpCoijoWW0c4FdhY= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/k8s v0.0.0-20240301221517-9cb314e7d27b/go.mod h1:RNHBmikDFzPQ8GbL/UtLxiJ7xqbK5RrFsfUSnIjJJlE= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/proxy v0.0.0-20240301221517-9cb314e7d27b h1:3N0egSY+PxUT/wPil7ccBH3eayLuZVuTTfLIAXJHElI= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/proxy v0.0.0-20240301221517-9cb314e7d27b/go.mod h1:UIrDCIdIOYvs4RAPPEz5p4nSKVw0aFLa5BrKo5ww0Is= +github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/xray v0.0.0-20240301221517-9cb314e7d27b h1:6yMzkipSRWcMpQzyrgdLUw6da8nKn5+4/75zpcP8ibg= 
+github.com/amazon-contributing/opentelemetry-collector-contrib/internal/aws/xray v0.0.0-20240301221517-9cb314e7d27b/go.mod h1:D6lmFdWbpYRttNnfkZJSs2ZZcAUGfe96/Vrm8tiK3Tw= +github.com/amazon-contributing/opentelemetry-collector-contrib/override/aws v0.0.0-20240301221517-9cb314e7d27b h1:96/K8X5jsdpzbJqTqbOuSR04zjlvrtUvxdxlxT4u9iM= +github.com/amazon-contributing/opentelemetry-collector-contrib/override/aws v0.0.0-20240301221517-9cb314e7d27b/go.mod h1:t/hYoRTnlPuRjh8y0BwVGgNvNIXpU2QJME5YVppUUHQ= +github.com/amazon-contributing/opentelemetry-collector-contrib/pkg/stanza v0.0.0-20240301221517-9cb314e7d27b h1:Mlj7UkmzRGqoQAAfqN74BTtZzXans4a6X/Lveu76ydE= +github.com/amazon-contributing/opentelemetry-collector-contrib/pkg/stanza v0.0.0-20240301221517-9cb314e7d27b/go.mod h1:7uCHpcHoawJsqoyPLxaFROWsZXPSF6/op3Hmw4pV4WE= +github.com/amazon-contributing/opentelemetry-collector-contrib/pkg/translator/prometheus v0.0.0-20240301221517-9cb314e7d27b h1:yOA5poXBCcbcchcGRQYEL5pHUrwqJlZKz7Hs9J7mIP4= +github.com/amazon-contributing/opentelemetry-collector-contrib/pkg/translator/prometheus v0.0.0-20240301221517-9cb314e7d27b/go.mod h1:HXv8nyJ+RUHGLZMPbaPFnWKonYWNTJfZ9ZUudqtwudw= +github.com/amazon-contributing/opentelemetry-collector-contrib/processor/resourcedetectionprocessor v0.0.0-20240301221517-9cb314e7d27b h1:HSD0iQwUJuRO1iXMxxZQ80WZF3Re/Y+ctm11kujMpr8= +github.com/amazon-contributing/opentelemetry-collector-contrib/processor/resourcedetectionprocessor v0.0.0-20240301221517-9cb314e7d27b/go.mod h1:L4/sIFbml9J28qd8i2aJGztmCg2FCCtfTdwikpu4JEs= +github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver v0.0.0-20240301221517-9cb314e7d27b h1:BpHUBQvrBwqD/sl+4pus6Jk2q+vPQNk2Y9d4AAJGXd8= +github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awscontainerinsightreceiver v0.0.0-20240301221517-9cb314e7d27b/go.mod h1:Cs6HCNy7eQcEkyqAoX8HRqkRrExKA31Sc7DIOJKoGRQ= 
+github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awsxrayreceiver v0.0.0-20240301221517-9cb314e7d27b h1:JFW+DadLtUQC1LiuFLn7EQpOkxsf7fTRmTJeGS1/lc4= +github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/awsxrayreceiver v0.0.0-20240301221517-9cb314e7d27b/go.mod h1:4IMYeZjU+IgZdXHuiLOIVtdp42lrMjk+rtlQpENeGSM= +github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/prometheusreceiver v0.0.0-20240301221517-9cb314e7d27b h1:I3Rws21xhtovJPhB60wSLzmhaN7ASWvOyO2vIysc0U8= +github.com/amazon-contributing/opentelemetry-collector-contrib/receiver/prometheusreceiver v0.0.0-20240301221517-9cb314e7d27b/go.mod h1:fnNxw30DVmpiS3tt1nUETZH3g/boGnBLx7+hYwYd9EU= github.com/amir/raidman v0.0.0-20170415203553-1ccc43bfb9c9 h1:FXrPTd8Rdlc94dKccl7KPmdmIbVh/OjelJ8/vgMRzcQ= github.com/andybalholm/brotli v1.0.4 h1:V7DdXeJtZscaqfNuAdSRuRFzuiKlHSC/Zh3zl9qY3JY= github.com/antchfx/jsonquery v1.1.5 h1:1YWrNFYCcIuJPIjFeOP5b6TXbLSUYY8qqxWbuZOB1qE= @@ -295,12 +293,8 @@ github.com/cncf/udpa/go v0.0.0-20201120205902-5459f2c99403/go.mod h1:WmhPx2Nbnht github.com/cncf/xds/go v0.0.0-20210312221358-fbca930ec8ed/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= github.com/cncf/xds/go v0.0.0-20230607035331-e9ce68804cb4 h1:/inchEIKaYC1Akx+H+gqO04wryn5h75LSazbRlnya1k= github.com/cncf/xds/go v0.0.0-20230607035331-e9ce68804cb4/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= -github.com/containerd/cgroups v1.1.0 h1:v8rEWFl6EoqHB+swVNjVoCJE8o3jX7e8nqBGPLaDFBM= -github.com/containerd/cgroups v1.1.0/go.mod h1:6ppBcbh/NOOUU+dMKrykgaBnK9lCIBxHqJDGwsa1mIw= github.com/containerd/console v1.0.3 h1:lIr7SlA5PxZyMV30bDW0MGbiOPXwc63yRuCP0ARubLw= github.com/containerd/console v1.0.3/go.mod h1:7LqA/THxQ86k76b8c/EMSiaJ3h1eZkMkXar0TQ1gf3U= -github.com/containerd/containerd v1.7.7 h1:QOC2K4A42RQpcrZyptP6z9EJZnlHfHJUfZrAAHe15q4= -github.com/containerd/containerd v1.7.7/go.mod h1:3c4XZv6VeT9qgf9GMTxNTMFxGJrGpI2vz1yk4ye+YY8= github.com/containerd/ttrpc v1.2.2 
h1:9vqZr0pxwOF5koz6N0N3kJ0zDHokrcPxIR/ZR2YFtOs= github.com/containerd/ttrpc v1.2.2/go.mod h1:sIT6l32Ph/H9cvnJsfXM5drIVzTr5A2flTf1G5tYZak= github.com/containerd/typeurl v1.0.2 h1:Chlt8zIieDbzQFzXzAeBEF92KhExuE4p9p92/QmY7aY= @@ -1808,8 +1802,6 @@ k8s.io/klog/v2 v2.100.1/go.mod h1:y1WjHnz7Dj687irZUWR/WLkLc5N1YHtjLdmgWjndZn0= k8s.io/kube-openapi v0.0.0-20210305001622-591a79e4bda7/go.mod h1:wXW5VT87nVfh/iLV8FpR2uDvrFyomxbtb1KivDbvPTE= k8s.io/kube-openapi v0.0.0-20230717233707-2695361300d9 h1:LyMgNKD2P8Wn1iAwQU5OhxCKlKJy0sHc+PcDwFB24dQ= k8s.io/kube-openapi v0.0.0-20230717233707-2695361300d9/go.mod h1:wZK2AVp1uHCp4VamDVgBP2COHZjqD1T68Rf0CM3YjSM= -k8s.io/kubelet v0.28.3 h1:bp/uIf1R5F61BlFvFtzc4PDEiK7TtFcw3wFJlc0V0LM= -k8s.io/kubelet v0.28.3/go.mod h1:E3NHYbp/v45Ao6AD0EOZnqO3L0R6Haks6Nm0+bnFwtU= k8s.io/utils v0.0.0-20201110183641-67b214c5f920/go.mod h1:jPW/WVKK9YHAvNhRxK0md/EJ228hCsBRufyofKtW8HA= k8s.io/utils v0.0.0-20230711102312-30195339c3c7 h1:ZgnF1KZsYxWIifwSNZFZgNtWE89WI5yiP5WwlfDoIyc= k8s.io/utils v0.0.0-20230711102312-30195339c3c7/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= From cf0292f58df5321b0b23161bceeed24c630a668d Mon Sep 17 00:00:00 2001 From: Hyunsoo Kim Date: Fri, 1 Mar 2024 17:40:37 -0500 Subject: [PATCH 20/20] fix test --- .../sampleConfig/emf_and_kubernetes_with_gpu_config.yaml | 9 ++++++--- translator/tocwconfig/tocwconfig_test.go | 1 + 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml index 2fe3980845..f4498809a6 100644 --- a/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml +++ b/translator/tocwconfig/sampleConfig/emf_and_kubernetes_with_gpu_config.yaml @@ -5,7 +5,7 @@ exporters: emf_only: true endpoint: https://fake_endpoint imds_retries: 2 - local_mode: false + local_mode: true log_group_name: emf/logs/default log_retention: 0 log_stream_name: 
host_name_from_env @@ -43,7 +43,7 @@ exporters: endpoint: https://fake_endpoint enhanced_container_insights: true imds_retries: 2 - local_mode: false + local_mode: true log_group_name: /aws/containerinsights/{ClusterName}/performance log_retention: 0 log_stream_name: '{NodeName}' @@ -500,6 +500,9 @@ extensions: stats: operations: - PutLogEvents + usage_flags: + mode: OP + region_type: ACJ processors: batch/containerinsights: metadata_cardinality_limit: 1000 @@ -973,7 +976,7 @@ receivers: imds_retries: 2 leader_lock_name: cwagent-clusterleader leader_lock_using_config_map_only: true - local_mode: false + local_mode: true max_retries: 0 no_verify_ssl: false num_workers: 0 diff --git a/translator/tocwconfig/tocwconfig_test.go b/translator/tocwconfig/tocwconfig_test.go index 8780167d8c..dd09244354 100644 --- a/translator/tocwconfig/tocwconfig_test.go +++ b/translator/tocwconfig/tocwconfig_test.go @@ -127,6 +127,7 @@ func TestEmfAndKubernetesWithGpuConfig(t *testing.T) { resetContext(t) readCommonConfig(t, "./sampleConfig/commonConfig/withCredentials.toml") context.CurrentContext().SetRunInContainer(true) + context.CurrentContext().SetMode(config.ModeOnPremise) t.Setenv(config.HOST_NAME, "host_name_from_env") t.Setenv(config.HOST_IP, "127.0.0.1") expectedEnvVars := map[string]string{}