From 2728c1914ef7c37820bfa03b16375809096e0867 Mon Sep 17 00:00:00 2001
From: Hyunsoo Kim <884273+movence@users.noreply.github.com>
Date: Wed, 29 May 2024 11:15:05 -0400
Subject: [PATCH] Add gpu count metrics including limit, request and total
 counts (#214)

---
 internal/aws/containerinsight/const.go        |   8 +
 internal/aws/containerinsight/utils.go        |  19 +-
 internal/aws/containerinsight/utils_test.go   |  37 +++-
 internal/aws/k8s/k8sclient/node.go            |  53 +++--
 internal/aws/k8s/k8sclient/node_info.go       |  12 +-
 internal/aws/k8s/k8sclient/node_test.go       | 190 ++++++++++++++++++
 internal/aws/k8s/k8sclient/pod.go             |   9 +
 internal/aws/k8s/k8sclient/pod_info.go        |   7 +
 internal/aws/k8s/k8sclient/pod_test.go        |  48 ++++-
 internal/aws/k8s/k8sutil/util.go              |   9 +
 internal/aws/k8s/k8sutil/util_test.go         |   7 +
 .../internal/k8sapiserver/k8sapiserver.go     |  99 ++++++++-
 .../k8sapiserver/k8sapiserver_test.go         |  50 ++++-
 .../internal/k8sapiserver/utils.go            |   2 +
 .../awscontainerinsightreceiver/receiver.go   |   2 +-
 15 files changed, 514 insertions(+), 38 deletions(-)

diff --git a/internal/aws/containerinsight/const.go b/internal/aws/containerinsight/const.go
index a5551aeb0d25..161e5f0244df 100644
--- a/internal/aws/containerinsight/const.go
+++ b/internal/aws/containerinsight/const.go
@@ -140,6 +140,10 @@ const (
 	EfaRxDropped          = "rx_dropped"
 	EfaTxBytes            = "tx_bytes"
 
+	GpuLimit   = "gpu_limit"
+	GpuTotal   = "gpu_total"
+	GpuRequest = "gpu_request"
+
 	// Define the metric types
 	TypeCluster            = "Cluster"
 	TypeClusterService     = "ClusterService"
@@ -319,5 +323,9 @@ func init() {
 		EfaRxBytes:            UnitBytesPerSec,
 		EfaRxDropped:          UnitCountPerSec,
 		EfaTxBytes:            UnitBytesPerSec,
+
+		GpuLimit:   UnitCount,
+		GpuTotal:   UnitCount,
+		GpuRequest: UnitCount,
 	}
 }
diff --git a/internal/aws/containerinsight/utils.go b/internal/aws/containerinsight/utils.go
index 7e87d77692bf..4c0b6fa15d0d 100644
--- a/internal/aws/containerinsight/utils.go
+++ b/internal/aws/containerinsight/utils.go
@@ -139,7 +139,7 @@ func getPrefixByMetricType(mType string) string {
 		prefix = nodeNetPrefix
 	case TypeNodeEFA:
 		prefix = nodeEfaPrefix
-	case TypePod:
+	case TypePod, TypePodGPU:
 		prefix = podPrefix
 	case TypePodNet:
 		prefix = podNetPrefix
@@ -259,23 +259,24 @@ func ConvertToOTLPMetrics(fields map[string]any, tags map[string]string, logger
 	for key, value := range fields {
 		metric := RemovePrefix(metricType, key)
 		unit := GetUnitForMetric(metric)
+		scopeMetric := ilms.AppendEmpty()
 		switch t := value.(type) {
 		case int:
-			intGauge(ilms.AppendEmpty(), key, unit, int64(t), timestamp)
+			intGauge(scopeMetric, key, unit, int64(t), timestamp)
 		case int32:
-			intGauge(ilms.AppendEmpty(), key, unit, int64(t), timestamp)
+			intGauge(scopeMetric, key, unit, int64(t), timestamp)
 		case int64:
-			intGauge(ilms.AppendEmpty(), key, unit, t, timestamp)
+			intGauge(scopeMetric, key, unit, t, timestamp)
 		case uint:
-			intGauge(ilms.AppendEmpty(), key, unit, int64(t), timestamp)
+			intGauge(scopeMetric, key, unit, int64(t), timestamp)
 		case uint32:
-			intGauge(ilms.AppendEmpty(), key, unit, int64(t), timestamp)
+			intGauge(scopeMetric, key, unit, int64(t), timestamp)
 		case uint64:
-			intGauge(ilms.AppendEmpty(), key, unit, int64(t), timestamp)
+			intGauge(scopeMetric, key, unit, int64(t), timestamp)
 		case float32:
-			doubleGauge(ilms.AppendEmpty(), key, unit, float64(t), timestamp)
+			doubleGauge(scopeMetric, key, unit, float64(t), timestamp)
 		case float64:
-			doubleGauge(ilms.AppendEmpty(), key, unit, t, timestamp)
+			doubleGauge(scopeMetric, key, unit, t, timestamp)
 		default:
 			valueType := fmt.Sprintf("%T", value)
 			logger.Warn("Detected unexpected field", zap.String("key", key), zap.Any("value", value), zap.String("value type", valueType))
diff --git a/internal/aws/containerinsight/utils_test.go b/internal/aws/containerinsight/utils_test.go
index 80b2a45be656..9ff78aa77a7b 100644
--- a/internal/aws/containerinsight/utils_test.go
+++ b/internal/aws/containerinsight/utils_test.go
@@ -214,7 +214,8 @@ func TestConvertToOTLPMetricsForInvalidMetrics(t *testing.T) {
 	md = ConvertToOTLPMetrics(fields, tags, zap.NewNop())
 	rm := md.ResourceMetrics().At(0)
 	ilms := rm.ScopeMetrics()
-	assert.Equal(t, 0, ilms.Len())
+	assert.Equal(t, 1, ilms.Len())
+	assert.Equal(t, 0, ilms.At(0).Metrics().Len())
 }
 
 func TestConvertToOTLPMetricsForClusterMetrics(t *testing.T) {
@@ -912,3 +913,37 @@ func TestConvertToOTLPMetricsForPodEfaMetrics(t *testing.T) {
 	md = ConvertToOTLPMetrics(fields, tags, zap.NewNop())
 	checkMetricsAreExpected(t, md, fields, tags, expectedUnits)
 }
+
+func TestConvertToOTLPMetricsForAcceleratorCountMetrics(t *testing.T) {
+	var fields map[string]any
+	var expectedUnits map[string]string
+	var tags map[string]string
+	var md pmetric.Metrics
+	now := time.Now()
+	timestamp := strconv.FormatInt(now.UnixNano(), 10)
+
+	fields = map[string]any{
+		"pod_gpu_limit":   int64(3),
+		"pod_gpu_total":   int64(3),
+		"pod_gpu_request": int64(3),
+	}
+	expectedUnits = map[string]string{
+		"pod_gpu_limit":   UnitCount,
+		"pod_gpu_total":   UnitCount,
+		"pod_gpu_request": UnitCount,
+	}
+	tags = map[string]string{
+		"ClusterName":   "eks-aoc",
+		"InstanceId":    "i-01bf9fb097cbf3205",
+		"InstanceType":  "t2.xlarge",
+		"Namespace":     "amazon-cloudwatch",
+		"NodeName":      "ip-192-168-12-170.ec2.internal",
+		"PodName":       "cloudwatch-agent",
+		"ContainerName": "cloudwatch-agent",
+		"Type":          "PodGPU",
+		"Version":       "0",
+		"Timestamp":     timestamp,
+	}
+	md = ConvertToOTLPMetrics(fields, tags, zap.NewNop())
+	checkMetricsAreExpected(t, md, fields, tags, expectedUnits)
+}
diff --git a/internal/aws/k8s/k8sclient/node.go b/internal/aws/k8s/k8sclient/node.go
index 8682417c590d..9b9fbb52166b 100644
--- a/internal/aws/k8s/k8sclient/node.go
+++ b/internal/aws/k8s/k8sclient/node.go
@@ -18,6 +18,11 @@ import (
 	"k8s.io/client-go/tools/cache"
 )
 
+const (
+	instanceTypeLabelKey     = "node.kubernetes.io/instance-type"
+	instanceTypeLabelKeyBeta = "beta.kubernetes.io/instance-type"
+)
+
 // This needs to be reviewed for newer versions of k8s.
 var failedNodeConditions = map[v1.NodeConditionType]bool{
 	v1.NodeMemoryPressure:     true,
@@ -27,6 +32,7 @@ var failedNodeConditions = map[v1.NodeConditionType]bool{
 }
 
 type NodeClient interface {
+	NodeInfos() map[string]*NodeInfo
 	// Get the number of failed nodes for current cluster
 	ClusterFailedNodeCount() int
 	// Get the number of nodes for current cluster
@@ -72,6 +78,7 @@ type nodeClient struct {
 	captureNodeLevelInfo bool
 
 	mu                     sync.RWMutex
+	nodeInfos              map[string]*NodeInfo
 	clusterFailedNodeCount int
 	clusterNodeCount       int
 	nodeToCapacityMap      map[string]v1.ResourceList
@@ -79,6 +86,15 @@ type nodeClient struct {
 	nodeToConditionsMap    map[string]map[v1.NodeConditionType]v1.ConditionStatus
 }
 
+func (c *nodeClient) NodeInfos() map[string]*NodeInfo {
+	if c.store.GetResetRefreshStatus() {
+		c.refresh()
+	}
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+	return c.nodeInfos
+}
+
 func (c *nodeClient) ClusterFailedNodeCount() int {
 	if c.store.GetResetRefreshStatus() {
 		c.refresh()
@@ -145,24 +161,26 @@ func (c *nodeClient) refresh() {
 	nodeToAllocatableMap := make(map[string]v1.ResourceList)
 	nodeToConditionsMap := make(map[string]map[v1.NodeConditionType]v1.ConditionStatus)
 
+	nodeInfos := map[string]*NodeInfo{}
 	for _, obj := range objsList {
-		node := obj.(*nodeInfo)
+		node := obj.(*NodeInfo)
+		nodeInfos[node.Name] = node
 
 		if c.captureNodeLevelInfo {
-			nodeToCapacityMap[node.name] = node.capacity
-			nodeToAllocatableMap[node.name] = node.allocatable
+			nodeToCapacityMap[node.Name] = node.Capacity
+			nodeToAllocatableMap[node.Name] = node.Allocatable
 			conditionsMap := make(map[v1.NodeConditionType]v1.ConditionStatus)
-			for _, condition := range node.conditions {
+			for _, condition := range node.Conditions {
 				conditionsMap[condition.Type] = condition.Status
 			}
-			nodeToConditionsMap[node.name] = conditionsMap
+			nodeToConditionsMap[node.Name] = conditionsMap
 		}
 		clusterNodeCountNew++
 
 		failed := false
 
 	Loop:
-		for _, condition := range node.conditions {
+		for _, condition := range node.Conditions {
 			if _, ok := failedNodeConditions[condition.Type]; ok {
 				// match the failedNodeConditions type we care about
 				if condition.Status != v1.ConditionFalse {
@@ -178,6 +196,7 @@ func (c *nodeClient) refresh() {
 		}
 	}
 
+	c.nodeInfos = nodeInfos
 	c.clusterFailedNodeCount = clusterFailedNodeCountNew
 	c.clusterNodeCount = clusterNodeCountNew
 	c.nodeToCapacityMap = nodeToCapacityMap
@@ -222,13 +241,23 @@ func transformFuncNode(obj any) (any, error) {
 	if !ok {
 		return nil, fmt.Errorf("input obj %v is not Node type", obj)
 	}
-	info := new(nodeInfo)
-	info.name = node.Name
-	info.capacity = node.Status.Capacity
-	info.allocatable = node.Status.Allocatable
-	info.conditions = []*NodeCondition{}
+	info := new(NodeInfo)
+	info.Name = node.Name
+	info.Capacity = node.Status.Capacity
+	info.Allocatable = node.Status.Allocatable
+	info.Conditions = []*NodeCondition{}
+	info.ProviderId = node.Spec.ProviderID
+	if instanceType, ok := node.Labels[instanceTypeLabelKey]; ok {
+		info.InstanceType = instanceType
+	} else {
+		// fallback for compatibility with k8s versions older than v1.17
+		// https://kubernetes.io/docs/reference/labels-annotations-taints/#beta-kubernetes-io-instance-type-deprecated
+		if instanceType, ok := node.Labels[instanceTypeLabelKeyBeta]; ok {
+			info.InstanceType = instanceType
+		}
+	}
 	for _, condition := range node.Status.Conditions {
-		info.conditions = append(info.conditions, &NodeCondition{
+		info.Conditions = append(info.Conditions, &NodeCondition{
 			Type:   condition.Type,
 			Status: condition.Status,
 		})
diff --git a/internal/aws/k8s/k8sclient/node_info.go b/internal/aws/k8s/k8sclient/node_info.go
index 6b1462adadcd..5fc419c552b9 100644
--- a/internal/aws/k8s/k8sclient/node_info.go
+++ b/internal/aws/k8s/k8sclient/node_info.go
@@ -7,11 +7,13 @@ import (
 	v1 "k8s.io/api/core/v1"
 )
 
-type nodeInfo struct {
-	name        string
-	conditions  []*NodeCondition
-	capacity    v1.ResourceList
-	allocatable v1.ResourceList
+type NodeInfo struct {
+	Name         string
+	Conditions   []*NodeCondition
+	Capacity     v1.ResourceList
+	Allocatable  v1.ResourceList
+	ProviderId   string
+	InstanceType string
 }
 
 type NodeCondition struct {
diff --git a/internal/aws/k8s/k8sclient/node_test.go b/internal/aws/k8s/k8sclient/node_test.go
index c90394548165..a00cddb654e3 100644
--- a/internal/aws/k8s/k8sclient/node_test.go
+++ b/internal/aws/k8s/k8sclient/node_test.go
@@ -36,6 +36,7 @@ var nodeArray = []any{
 				"failure-domain.beta.kubernetes.io/region": "eu-west-1",
 				"failure-domain.beta.kubernetes.io/zone":   "eu-west-1c",
 				"kubernetes.io/hostname":                   "ip-192-168-200-63.eu-west-1.compute.internal",
+				"node.kubernetes.io/instance-type":         "t3.medium",
 			},
 			Annotations: map[string]string{
 				"node.alpha.kubernetes.io/ttl":                           "0",
@@ -112,6 +113,9 @@ var nodeArray = []any{
 				Architecture:            "amd64",
 			},
 		},
+		Spec: v1.NodeSpec{
+			ProviderID: "aws:///eu-west-1c/i-09087f37a14b9ded1",
+		},
 	},
 	&v1.Node{
 		ObjectMeta: metav1.ObjectMeta{
@@ -132,6 +136,7 @@ var nodeArray = []any{
 				"kubernetes.io/hostname":                   "ip-192-168-76-61.eu-west-1.compute.internal",
 				"kubernetes.io/arch":                       "amd64",
 				"beta.kubernetes.io/instance-type":         "t3.medium",
+				"node.kubernetes.io/instance-type":         "t3.medium",
 			},
 			Annotations: map[string]string{
 				"node.alpha.kubernetes.io/ttl":                           "0",
@@ -208,6 +213,9 @@ var nodeArray = []any{
 				Architecture:            "amd64",
 			},
 		},
+		Spec: v1.NodeSpec{
+			ProviderID: "aws:///eu-west-1a/i-09087f37a14b9ded2",
+		},
 	},
 	&v1.Node{
 		ObjectMeta: metav1.ObjectMeta{
@@ -304,6 +312,9 @@ var nodeArray = []any{
 				Architecture:            "amd64",
 			},
 		},
+		Spec: v1.NodeSpec{
+			ProviderID: "aws:///eu-west-1b/i-09087f37a14b9ded3",
+		},
 	},
 }
 
@@ -322,6 +333,95 @@ func TestNodeClient(t *testing.T) {
 				"nodeToCapacityMap":      map[string]v1.ResourceList{},                             // Node level info is not captured by default
 				"nodeToAllocatableMap":   map[string]v1.ResourceList{},                             // Node level info is not captured by default
 				"nodeToConditionsMap":    map[string]map[v1.NodeConditionType]v1.ConditionStatus{}, // Node level info is not captured by default
+				"nodeInfos": []*NodeInfo{
+					{
+						Name: "ip-192-168-200-63.eu-west-1.compute.internal",
+						Conditions: []*NodeCondition{
+							{
+								Type:   v1.NodeConditionType("MemoryPressure"),
+								Status: v1.ConditionFalse,
+							},
+							{
+								Type:   v1.NodeConditionType("DiskPressure"),
+								Status: v1.ConditionFalse,
+							},
+							{
+								Type:   v1.NodeConditionType("PIDPressure"),
+								Status: v1.ConditionFalse,
+							},
+							{
+								Type:   v1.NodeConditionType("Ready"),
+								Status: v1.ConditionTrue,
+							},
+						},
+						Allocatable: v1.ResourceList{
+							v1.ResourcePods: *resource.NewQuantity(5, resource.DecimalSI),
+						},
+						Capacity: v1.ResourceList{
+							v1.ResourcePods: *resource.NewQuantity(5, resource.DecimalSI),
+						},
+						ProviderId:   "aws:///eu-west-1c/i-09087f37a14b9ded1",
+						InstanceType: "t3.medium",
+					},
+					{
+						Name: "ip-192-168-76-61.eu-west-1.compute.internal",
+						Conditions: []*NodeCondition{
+							{
+								Type:   v1.NodeConditionType("MemoryPressure"),
+								Status: v1.ConditionFalse,
+							},
+							{
+								Type:   v1.NodeConditionType("DiskPressure"),
+								Status: v1.ConditionFalse,
+							},
+							{
+								Type:   v1.NodeConditionType("PIDPressure"),
+								Status: v1.ConditionFalse,
+							},
+							{
+								Type:   v1.NodeConditionType("Ready"),
+								Status: v1.ConditionTrue,
+							},
+						},
+						Allocatable: v1.ResourceList{
+							v1.ResourcePods: *resource.NewQuantity(10, resource.DecimalSI),
+						},
+						Capacity: v1.ResourceList{
+							v1.ResourcePods: *resource.NewQuantity(10, resource.DecimalSI),
+						},
+						ProviderId:   "aws:///eu-west-1a/i-09087f37a14b9ded2",
+						InstanceType: "t3.medium",
+					},
+					{
+						Name: "ip-192-168-153-1.eu-west-1.compute.internal",
+						Conditions: []*NodeCondition{
+							{
+								Type:   v1.NodeConditionType("MemoryPressure"),
+								Status: v1.ConditionFalse,
+							},
+							{
+								Type:   v1.NodeConditionType("DiskPressure"),
+								Status: v1.ConditionTrue,
+							},
+							{
+								Type:   v1.NodeConditionType("PIDPressure"),
+								Status: v1.ConditionFalse,
+							},
+							{
+								Type:   v1.NodeConditionType("Ready"),
+								Status: v1.ConditionFalse,
+							},
+						},
+						Allocatable: v1.ResourceList{
+							v1.ResourcePods: *resource.NewQuantity(1, resource.DecimalSI),
+						},
+						Capacity: v1.ResourceList{
+							v1.ResourcePods: *resource.NewQuantity(5, resource.DecimalSI),
+						},
+						ProviderId:   "aws:///eu-west-1b/i-09087f37a14b9ded3",
+						InstanceType: "t3.medium",
+					},
+				},
 			},
 		},
 		"CaptureNodeLevelInfo": {
@@ -374,6 +474,95 @@ func TestNodeClient(t *testing.T) {
 						"Ready":          "False",
 					},
 				},
+				"nodeInfos": []*NodeInfo{
+					{
+						Name: "ip-192-168-200-63.eu-west-1.compute.internal",
+						Conditions: []*NodeCondition{
+							{
+								Type:   v1.NodeConditionType("MemoryPressure"),
+								Status: v1.ConditionFalse,
+							},
+							{
+								Type:   v1.NodeConditionType("DiskPressure"),
+								Status: v1.ConditionFalse,
+							},
+							{
+								Type:   v1.NodeConditionType("PIDPressure"),
+								Status: v1.ConditionFalse,
+							},
+							{
+								Type:   v1.NodeConditionType("Ready"),
+								Status: v1.ConditionTrue,
+							},
+						},
+						Allocatable: v1.ResourceList{
+							v1.ResourcePods: *resource.NewQuantity(5, resource.DecimalSI),
+						},
+						Capacity: v1.ResourceList{
+							v1.ResourcePods: *resource.NewQuantity(5, resource.DecimalSI),
+						},
+						ProviderId:   "aws:///eu-west-1c/i-09087f37a14b9ded1",
+						InstanceType: "t3.medium",
+					},
+					{
+						Name: "ip-192-168-76-61.eu-west-1.compute.internal",
+						Conditions: []*NodeCondition{
+							{
+								Type:   v1.NodeConditionType("MemoryPressure"),
+								Status: v1.ConditionFalse,
+							},
+							{
+								Type:   v1.NodeConditionType("DiskPressure"),
+								Status: v1.ConditionFalse,
+							},
+							{
+								Type:   v1.NodeConditionType("PIDPressure"),
+								Status: v1.ConditionFalse,
+							},
+							{
+								Type:   v1.NodeConditionType("Ready"),
+								Status: v1.ConditionTrue,
+							},
+						},
+						Allocatable: v1.ResourceList{
+							v1.ResourcePods: *resource.NewQuantity(10, resource.DecimalSI),
+						},
+						Capacity: v1.ResourceList{
+							v1.ResourcePods: *resource.NewQuantity(10, resource.DecimalSI),
+						},
+						ProviderId:   "aws:///eu-west-1a/i-09087f37a14b9ded2",
+						InstanceType: "t3.medium",
+					},
+					{
+						Name: "ip-192-168-153-1.eu-west-1.compute.internal",
+						Conditions: []*NodeCondition{
+							{
+								Type:   v1.NodeConditionType("MemoryPressure"),
+								Status: v1.ConditionFalse,
+							},
+							{
+								Type:   v1.NodeConditionType("DiskPressure"),
+								Status: v1.ConditionTrue,
+							},
+							{
+								Type:   v1.NodeConditionType("PIDPressure"),
+								Status: v1.ConditionFalse,
+							},
+							{
+								Type:   v1.NodeConditionType("Ready"),
+								Status: v1.ConditionFalse,
+							},
+						},
+						Allocatable: v1.ResourceList{
+							v1.ResourcePods: *resource.NewQuantity(1, resource.DecimalSI),
+						},
+						Capacity: v1.ResourceList{
+							v1.ResourcePods: *resource.NewQuantity(5, resource.DecimalSI),
+						},
+						ProviderId:   "aws:///eu-west-1b/i-09087f37a14b9ded3",
+						InstanceType: "t3.medium",
+					},
+				},
 			},
 		},
 	}
@@ -388,6 +577,7 @@ func TestNodeClient(t *testing.T) {
 			require.Equal(t, testCase.want["nodeToCapacityMap"], client.NodeToCapacityMap())
 			require.Equal(t, testCase.want["nodeToAllocatableMap"], client.NodeToAllocatableMap())
 			require.Equal(t, testCase.want["nodeToConditionsMap"], client.NodeToConditionsMap())
+			require.EqualValues(t, testCase.want["nodeInfos"], client.NodeInfos())
 
 			client.shutdown()
 			assert.True(t, client.stopped)
diff --git a/internal/aws/k8s/k8sclient/pod.go b/internal/aws/k8s/k8sclient/pod.go
index 626c9d77a193..4f82aac7c106 100644
--- a/internal/aws/k8s/k8sclient/pod.go
+++ b/internal/aws/k8s/k8sclient/pod.go
@@ -128,6 +128,15 @@ func transformFuncPod(obj any) (any, error) {
 	info.OwnerReferences = pod.OwnerReferences
 	info.Phase = pod.Status.Phase
 	info.Conditions = pod.Status.Conditions
+	containerInfos := make([]*ContainerInfo, 0)
+	for _, container := range pod.Spec.Containers {
+		containerInfos = append(containerInfos, &ContainerInfo{
+			Name:      container.Name,
+			Resources: container.Resources,
+		})
+	}
+	info.Containers = containerInfos
+	info.NodeName = pod.Spec.NodeName
 	return info, nil
 }
 
diff --git a/internal/aws/k8s/k8sclient/pod_info.go b/internal/aws/k8s/k8sclient/pod_info.go
index 14e3eee13008..a9dfe6a3ce2c 100644
--- a/internal/aws/k8s/k8sclient/pod_info.go
+++ b/internal/aws/k8s/k8sclient/pod_info.go
@@ -16,4 +16,11 @@ type PodInfo struct {
 	OwnerReferences []metaV1.OwnerReference
 	Phase           v1.PodPhase
 	Conditions      []v1.PodCondition
+	NodeName        string
+	Containers      []*ContainerInfo
+}
+
+type ContainerInfo struct {
+	Name      string
+	Resources v1.ResourceRequirements
 }
diff --git a/internal/aws/k8s/k8sclient/pod_test.go b/internal/aws/k8s/k8sclient/pod_test.go
index 9e0e8bdcf60a..b0af59f84844 100644
--- a/internal/aws/k8s/k8sclient/pod_test.go
+++ b/internal/aws/k8s/k8sclient/pod_test.go
@@ -11,6 +11,7 @@ import (
 	"github.com/stretchr/testify/assert"
 	"go.uber.org/zap"
 	v1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/client-go/kubernetes/fake"
 )
@@ -192,8 +193,8 @@ func TestTransformFuncPod(t *testing.T) {
 	assert.Error(t, err)
 }
 
-func TestPodClient_PodNameToPodMap(t *testing.T) {
-	skip(t, "Flaky test - See https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/11078")
+func TestPodClient_PodInfos(t *testing.T) {
+	//skip(t, "Flaky test - See https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/11078")
 	setOption := podSyncCheckerOption(&mockReflectorSyncChecker{})
 
 	samplePodArray := []any{
@@ -203,10 +204,29 @@ func TestPodClient_PodNameToPodMap(t *testing.T) {
 				Name:      "kube-proxy-csm88",
 				Namespace: "kube-system",
 				SelfLink:  "/api/v1/namespaces/kube-system/pods/kube-proxy-csm88",
+				Labels: map[string]string{
+					"key": "value",
+				},
 			},
 			Status: v1.PodStatus{
 				Phase: "Running",
 			},
+			Spec: v1.PodSpec{
+				NodeName: "node-1",
+				Containers: []v1.Container{
+					{
+						Name: "container-1",
+						Resources: v1.ResourceRequirements{
+							Limits: v1.ResourceList{
+								"nvidia.com/gpu": resource.MustParse("1"),
+							},
+							Requests: v1.ResourceList{
+								"nvidia.com/gpu": resource.MustParse("1"),
+							},
+						},
+					},
+				},
+			},
 		},
 	}
 
@@ -220,13 +240,29 @@ func TestPodClient_PodNameToPodMap(t *testing.T) {
 			Name:      "kube-proxy-csm88",
 			Namespace: "kube-system",
 			UID:       "bc5f5839-f62e-44b9-a79e-af250d92dcb1",
-			Labels:    map[string]string{},
-			Phase:     v1.PodRunning,
+			Labels: map[string]string{
+				"key": "value",
+			},
+			Phase:    v1.PodRunning,
+			NodeName: "node-1",
+			Containers: []*ContainerInfo{
+				{
+					Name: "container-1",
+					Resources: v1.ResourceRequirements{
+						Limits: v1.ResourceList{
+							"nvidia.com/gpu": resource.MustParse("1"),
+						},
+						Requests: v1.ResourceList{
+							"nvidia.com/gpu": resource.MustParse("1"),
+						},
+					},
+				},
+			},
 		},
 	}
 
-	resultMap := client.PodInfos()
-	assert.Equal(t, expectedArray, resultMap)
+	results := client.PodInfos()
+	assert.Equal(t, expectedArray, results)
 	client.shutdown()
 	assert.True(t, client.stopped)
 }
diff --git a/internal/aws/k8s/k8sutil/util.go b/internal/aws/k8s/k8sutil/util.go
index 2264446eacfb..d6e115194d4b 100644
--- a/internal/aws/k8s/k8sutil/util.go
+++ b/internal/aws/k8s/k8sutil/util.go
@@ -5,6 +5,7 @@ package k8sutil // import "github.com/open-telemetry/opentelemetry-collector-con
 
 import (
 	"fmt"
+	"strings"
 )
 
 // CreatePodKey concatenates namespace and podName to get a pod key
@@ -22,3 +23,11 @@ func CreateContainerKey(namespace, podName, containerName string) string {
 	}
 	return fmt.Sprintf("namespace:%s,podName:%s,containerName:%s", namespace, podName, containerName)
 }
+
+// ParseInstanceIdFromProviderId parses EC2 instance id from node's provider id which has format of aws:///<subnet>/<instanceId>
+func ParseInstanceIdFromProviderId(providerId string) string {
+	if providerId == "" || !strings.HasPrefix(providerId, "aws://") {
+		return ""
+	}
+	return providerId[strings.LastIndex(providerId, "/")+1:]
+}
diff --git a/internal/aws/k8s/k8sutil/util_test.go b/internal/aws/k8s/k8sutil/util_test.go
index 16734f8513bc..045497194ead 100644
--- a/internal/aws/k8s/k8sutil/util_test.go
+++ b/internal/aws/k8s/k8sutil/util_test.go
@@ -22,3 +22,10 @@ func TestCreateContainerKey(t *testing.T) {
 	assert.Equal(t, "", CreateContainerKey("default", "", "testContainer"))
 	assert.Equal(t, "", CreateContainerKey("default", "testPod", ""))
 }
+
+func TestParseInstanceIdFromProviderId(t *testing.T) {
+	assert.Equal(t, "i-0b00e07ccd388f915", ParseInstanceIdFromProviderId("aws:///us-west-2b/i-0b00e07ccd388f915"))
+	assert.Equal(t, "i-0b00e07ccd388f915", ParseInstanceIdFromProviderId("aws:///us-east-1c/i-0b00e07ccd388f915"))
+	assert.Equal(t, "", ParseInstanceIdFromProviderId(":///us-east-1c/i-0b00e07ccd388f915"))
+	assert.Equal(t, "", ParseInstanceIdFromProviderId(""))
+}
diff --git a/receiver/awscontainerinsightreceiver/internal/k8sapiserver/k8sapiserver.go b/receiver/awscontainerinsightreceiver/internal/k8sapiserver/k8sapiserver.go
index cc9a17ef801d..c14d10ed5005 100644
--- a/receiver/awscontainerinsightreceiver/internal/k8sapiserver/k8sapiserver.go
+++ b/receiver/awscontainerinsightreceiver/internal/k8sapiserver/k8sapiserver.go
@@ -16,6 +16,7 @@ import (
 	"go.opentelemetry.io/collector/pdata/pmetric"
 	"go.uber.org/zap"
 	v1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/watch"
 	"k8s.io/client-go/kubernetes"
@@ -61,6 +62,7 @@ type K8sAPIServer struct {
 	leaderElection            *LeaderElection
 	addFullPodNameMetricLabel bool
 	includeEnhancedMetrics    bool
+	enableAcceleratedMetrics  bool
 }
 
 type clusterNameProvider interface {
@@ -70,7 +72,7 @@ type clusterNameProvider interface {
 type Option func(*K8sAPIServer)
 
 // NewK8sAPIServer creates a k8sApiServer which can generate cluster-level metrics
-func NewK8sAPIServer(cnp clusterNameProvider, logger *zap.Logger, leaderElection *LeaderElection, addFullPodNameMetricLabel bool, includeEnhancedMetrics bool, options ...Option) (*K8sAPIServer, error) {
+func NewK8sAPIServer(cnp clusterNameProvider, logger *zap.Logger, leaderElection *LeaderElection, addFullPodNameMetricLabel bool, includeEnhancedMetrics bool, enableAcceleratedMetrics bool, options ...Option) (*K8sAPIServer, error) {
 
 	k := &K8sAPIServer{
 		logger:                    logger,
@@ -78,6 +80,7 @@ func NewK8sAPIServer(cnp clusterNameProvider, logger *zap.Logger, leaderElection
 		leaderElection:            leaderElection,
 		addFullPodNameMetricLabel: addFullPodNameMetricLabel,
 		includeEnhancedMetrics:    includeEnhancedMetrics,
+		enableAcceleratedMetrics:  enableAcceleratedMetrics,
 	}
 
 	for _, opt := range options {
@@ -125,6 +128,9 @@ func (k *K8sAPIServer) GetMetrics() []pmetric.Metrics {
 	result = append(result, k.getStatefulSetMetrics(clusterName, timestampNs)...)
 	result = append(result, k.getReplicaSetMetrics(clusterName, timestampNs)...)
 	result = append(result, k.getPendingPodStatusMetrics(clusterName, timestampNs)...)
+	if k.enableAcceleratedMetrics {
+		result = append(result, k.getAcceleratorCountMetrics(clusterName, timestampNs)...)
+	}
 
 	return result
 }
@@ -351,7 +357,7 @@ func (k *K8sAPIServer) getPendingPodStatusMetrics(clusterName, timestampNs strin
 			}
 
 			attributes[ci.PodStatus] = string(v1.PodPending)
-			attributes["k8s.node.name"] = "pending"
+			attributes["k8s.node.name"] = pendingNodeName
 
 			kubernetesBlob := map[string]any{}
 			k.getKubernetesBlob(podInfo, kubernetesBlob, attributes)
@@ -442,6 +448,95 @@ func (k *K8sAPIServer) getKubernetesBlob(pod *k8sclient.PodInfo, kubernetesBlob
 	}
 }
 
+func (k *K8sAPIServer) getAcceleratorCountMetrics(clusterName, timestampNs string) []pmetric.Metrics {
+	var metrics []pmetric.Metrics
+	podsList := k.leaderElection.podClient.PodInfos()
+	nodeInfos := k.leaderElection.nodeClient.NodeInfos()
+	podKeyToServiceNamesMap := k.leaderElection.epClient.PodKeyToServiceNames()
+	for _, podInfo := range podsList {
+		// only care for pending and running pods
+		if podInfo.Phase != v1.PodPending && podInfo.Phase != v1.PodRunning {
+			continue
+		}
+
+		fields := map[string]any{}
+
+		var podLimit, podRequest, podTotal int64
+		var gpuAllocated bool
+		for _, container := range podInfo.Containers {
+			// check if at least 1 container is using gpu to add count metrics for a pod
+			_, found := container.Resources.Limits[resourceSpecNvidiaGpuKey]
+			gpuAllocated = gpuAllocated || found
+
+			if len(container.Resources.Limits) == 0 {
+				continue
+			}
+			podRequest += container.Resources.Requests.Name(resourceSpecNvidiaGpuKey, resource.DecimalExponent).Value()
+			limit := container.Resources.Limits.Name(resourceSpecNvidiaGpuKey, resource.DecimalExponent).Value()
+			// still counting pending pods to get total # of limit from spec
+			podLimit += limit
+
+			// sum of running pods only. a pod will be stuck in pending state when there is less or no gpu available than limit value
+			// e.g. limit=2,available=1 - *_gpu_limit=2, *_gpu_request=2, *_gpu_total=0
+			// request value is optional and must be equal to limit https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/
+			if podInfo.Phase == v1.PodRunning {
+				podTotal += limit
+			}
+		}
+		// skip adding gpu count metrics when none of containers has gpu resource allocated
+		if !gpuAllocated {
+			continue
+		}
+		// add pod level count metrics here then metricstransformprocessor will duplicate to node/cluster level metrics
+		fields[ci.MetricName(ci.TypePod, ci.GpuLimit)] = podLimit
+		fields[ci.MetricName(ci.TypePod, ci.GpuRequest)] = podRequest
+		fields[ci.MetricName(ci.TypePod, ci.GpuTotal)] = podTotal
+
+		attributes := map[string]string{
+			ci.ClusterNameKey:        clusterName,
+			ci.MetricType:            ci.TypePodGPU,
+			ci.Timestamp:             timestampNs,
+			ci.AttributeK8sNamespace: podInfo.Namespace,
+			ci.Version:               "0",
+		}
+
+		podKey := k8sutil.CreatePodKey(podInfo.Namespace, podInfo.Name)
+		if serviceList, ok := podKeyToServiceNamesMap[podKey]; ok {
+			if len(serviceList) > 0 {
+				attributes[ci.TypeService] = serviceList[0]
+			}
+		}
+
+		kubernetesBlob := map[string]any{}
+		k.getKubernetesBlob(podInfo, kubernetesBlob, attributes)
+		if podInfo.NodeName != "" {
+			// decorate with instance ID and type attributes which become dimensions for node_gpu_* metrics
+			attributes[ci.NodeNameKey] = podInfo.NodeName
+			kubernetesBlob["host"] = podInfo.NodeName
+			if nodeInfo, ok := nodeInfos[podInfo.NodeName]; ok {
+				attributes[ci.InstanceID] = k8sutil.ParseInstanceIdFromProviderId(nodeInfo.ProviderId)
+				attributes[ci.InstanceType] = nodeInfo.InstanceType
+			}
+		} else {
+			// fallback when node name is not available
+			attributes[ci.NodeNameKey] = pendingNodeName
+			kubernetesBlob["host"] = pendingNodeName
+		}
+		if len(kubernetesBlob) > 0 {
+			kubernetesInfo, err := json.Marshal(kubernetesBlob)
+			if err != nil {
+				k.logger.Warn("Error parsing kubernetes blob for pod metrics")
+			} else {
+				attributes[ci.AttributeKubernetes] = string(kubernetesInfo)
+			}
+		}
+		attributes[ci.SourcesKey] = "[\"apiserver\"]"
+		md := ci.ConvertToOTLPMetrics(fields, attributes, k.logger)
+		metrics = append(metrics, md)
+	}
+	return metrics
+}
+
 // Shutdown stops the k8sApiServer
 func (k *K8sAPIServer) Shutdown() error {
 	if k.cancel != nil {
diff --git a/receiver/awscontainerinsightreceiver/internal/k8sapiserver/k8sapiserver_test.go b/receiver/awscontainerinsightreceiver/internal/k8sapiserver/k8sapiserver_test.go
index d4d27fbaea00..92149566450c 100644
--- a/receiver/awscontainerinsightreceiver/internal/k8sapiserver/k8sapiserver_test.go
+++ b/receiver/awscontainerinsightreceiver/internal/k8sapiserver/k8sapiserver_test.go
@@ -14,6 +14,7 @@ import (
 	"go.opentelemetry.io/collector/pdata/pmetric"
 	"go.uber.org/zap"
 	v1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/watch"
 	"k8s.io/client-go/kubernetes"
@@ -138,6 +139,12 @@ func (client *MockClient) PodInfos() []*k8sclient.PodInfo {
 	return args.Get(0).([]*k8sclient.PodInfo)
 }
 
+// k8sclient.NodeClient
+func (client *MockClient) NodeInfos() map[string]*k8sclient.NodeInfo {
+	args := client.Called()
+	return args.Get(0).(map[string]*k8sclient.NodeInfo)
+}
+
 // k8sclient.NodeClient
 func (client *MockClient) ClusterFailedNodeCount() int {
 	args := client.Called()
@@ -224,7 +231,7 @@ func (m mockClusterNameProvider) GetClusterName() string {
 }
 
 func TestK8sAPIServer_New(t *testing.T) {
-	k8sAPIServer, err := NewK8sAPIServer(mockClusterNameProvider{}, zap.NewNop(), nil, false, false)
+	k8sAPIServer, err := NewK8sAPIServer(mockClusterNameProvider{}, zap.NewNop(), nil, false, false, false)
 	assert.Nil(t, k8sAPIServer)
 	assert.NotNil(t, err)
 }
@@ -304,11 +311,43 @@ func TestK8sAPIServer_GetMetrics(t *testing.T) {
 			UID:       "bc5f5839-f62e-44b9-a79e-af250d92dcb1",
 			Phase:     v1.PodPending,
 		},
+		{
+			Name:      "gpu-burn-6dcbd994fb-9fn8w",
+			Namespace: "amazon-cloudwatch",
+			NodeName:  "ip-192-168-57-23.us-west-2.compute.internal",
+			UID:       "bc5f5839-f62e-44b9-a79e-af250d92dcb1",
+			Phase:     v1.PodRunning,
+			Containers: []*k8sclient.ContainerInfo{
+				{
+					Name: "container-1",
+					Resources: v1.ResourceRequirements{
+						Limits: v1.ResourceList{
+							resourceSpecNvidiaGpuKey: resource.MustParse("2"),
+						},
+					},
+				},
+			},
+		},
 	})
 	mockClient.On("PodKeyToServiceNames").Return(map[string][]string{
 		"namespace:kube-system,podName:coredns-7554568866-26jdf": {"kube-dns"},
 		"namespace:kube-system,podName:coredns-7554568866-shwn6": {"kube-dns"},
 	})
+	mockClient.On("NodeInfos").Return(map[string]*k8sclient.NodeInfo{
+		"ip-192-168-57-23.us-west-2.compute.internal": {
+			Name: "ip-192-168-57-23.us-west-2.compute.internal",
+			Conditions: []*k8sclient.NodeCondition{
+				{
+					Type:   v1.NodeReady,
+					Status: v1.ConditionTrue,
+				},
+			},
+			Capacity:     map[v1.ResourceName]resource.Quantity{},
+			Allocatable:  map[v1.ResourceName]resource.Quantity{},
+			ProviderId:   "aws:///us-west-2/i-abcdef123456789",
+			InstanceType: "g4dn-12xl",
+		},
+	})
 
 	leaderElection := &LeaderElection{
 		k8sClient:         &mockK8sClient{},
@@ -326,7 +365,7 @@ func TestK8sAPIServer_GetMetrics(t *testing.T) {
 
 	t.Setenv("HOST_NAME", hostName)
 	t.Setenv("K8S_NAMESPACE", "namespace")
-	k8sAPIServer, err := NewK8sAPIServer(mockClusterNameProvider{}, zap.NewNop(), leaderElection, true, true)
+	k8sAPIServer, err := NewK8sAPIServer(mockClusterNameProvider{}, zap.NewNop(), leaderElection, true, true, true)
 
 	assert.NotNil(t, k8sAPIServer)
 	assert.Nil(t, err)
@@ -401,6 +440,13 @@ func TestK8sAPIServer_GetMetrics(t *testing.T) {
 			assert.Equal(t, "kube-system", getStringAttrVal(metric, ci.AttributeK8sNamespace))
 			assert.Equal(t, "Pending", getStringAttrVal(metric, "pod_status"))
 			assert.Equal(t, "Pod", getStringAttrVal(metric, ci.MetricType))
+		case ci.TypePodGPU:
+			assertMetricValueEqual(t, metric, "pod_gpu_limit", int64(2))
+			assertMetricValueEqual(t, metric, "pod_gpu_total", int64(2))
+			assert.Equal(t, "amazon-cloudwatch", getStringAttrVal(metric, ci.AttributeK8sNamespace))
+			assert.Equal(t, ci.TypePodGPU, getStringAttrVal(metric, ci.MetricType))
+			assert.Equal(t, "i-abcdef123456789", getStringAttrVal(metric, ci.InstanceID))
+			assert.Equal(t, "g4dn-12xl", getStringAttrVal(metric, ci.InstanceType))
 		default:
 			assert.Fail(t, "Unexpected metric type: "+metricType)
 		}
diff --git a/receiver/awscontainerinsightreceiver/internal/k8sapiserver/utils.go b/receiver/awscontainerinsightreceiver/internal/k8sapiserver/utils.go
index a2efa312805d..c6a6293630fd 100644
--- a/receiver/awscontainerinsightreceiver/internal/k8sapiserver/utils.go
+++ b/receiver/awscontainerinsightreceiver/internal/k8sapiserver/utils.go
@@ -19,6 +19,8 @@ const (
 	splitRegexStr              = "\\.|-"
 	KubeProxy                  = "kube-proxy"
 	cronJobAllowedString       = "0123456789"
+	resourceSpecNvidiaGpuKey   = "nvidia.com/gpu"
+	pendingNodeName            = "pending"
 )
 
 var (
diff --git a/receiver/awscontainerinsightreceiver/receiver.go b/receiver/awscontainerinsightreceiver/receiver.go
index b34aaa82be48..79b463f40209 100644
--- a/receiver/awscontainerinsightreceiver/receiver.go
+++ b/receiver/awscontainerinsightreceiver/receiver.go
@@ -110,7 +110,7 @@ func (acir *awsContainerInsightReceiver) Start(ctx context.Context, host compone
 				return err
 			}
 
-			acir.k8sapiserver, err = k8sapiserver.NewK8sAPIServer(hostinfo, acir.settings.Logger, leaderElection, acir.config.AddFullPodNameMetricLabel, acir.config.EnableControlPlaneMetrics)
+			acir.k8sapiserver, err = k8sapiserver.NewK8sAPIServer(hostinfo, acir.settings.Logger, leaderElection, acir.config.AddFullPodNameMetricLabel, acir.config.EnableControlPlaneMetrics, acir.config.EnableAcceleratedComputeMetrics)
 			if err != nil {
 				return err
 			}