Skip to content

Commit

Permalink
Move GPU request, limit, usage and reserved_capacity metrics into pod…
Browse files Browse the repository at this point in the history
… and node stores
  • Loading branch information
sky333999 committed Aug 14, 2024
1 parent e08453f commit 9c631d4
Show file tree
Hide file tree
Showing 22 changed files with 193 additions and 527 deletions.
14 changes: 8 additions & 6 deletions internal/aws/containerinsight/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,9 +141,10 @@ const (
EfaRxDropped = "rx_dropped"
EfaTxBytes = "tx_bytes"

GpuLimit = "gpu_limit"
GpuTotal = "gpu_total"
GpuRequest = "gpu_request"
GpuLimit = "gpu_limit"
GpuUsageTotal = "gpu_usage_total"
GpuRequest = "gpu_request"
GpuReservedCapacity = "gpu_reserved_capacity"

// Define the metric types
TypeCluster = "Cluster"
Expand Down Expand Up @@ -325,8 +326,9 @@ func init() {
EfaRxDropped: UnitCountPerSec,
EfaTxBytes: UnitBytesPerSec,

GpuLimit: UnitCount,
GpuTotal: UnitCount,
GpuRequest: UnitCount,
GpuLimit: UnitCount,
GpuUsageTotal: UnitCount,
GpuRequest: UnitCount,
GpuReservedCapacity: UnitPercent,
}
}
50 changes: 16 additions & 34 deletions internal/aws/containerinsight/utils_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,10 @@ func TestConvertToOTLPMetricsForNodeMetrics(t *testing.T) {
"node_network_tx_packets": 37.802748111760124,
"node_number_of_running_containers": int32(7),
"node_number_of_running_pods": int64(7),
"node_gpu_request": int32(7),
"node_gpu_limit": int32(7),
"node_gpu_usage_total": int32(7),
"node_gpu_reserved_capacity": 3.0093851356081194,
}
expectedUnits = map[string]string{
"node_cpu_limit": "",
Expand Down Expand Up @@ -467,6 +471,10 @@ func TestConvertToOTLPMetricsForNodeMetrics(t *testing.T) {
"node_network_tx_packets": UnitCountPerSec,
"node_number_of_running_containers": UnitCount,
"node_number_of_running_pods": UnitCount,
"node_gpu_request": UnitCount,
"node_gpu_limit": UnitCount,
"node_gpu_usage_total": UnitCount,
"node_gpu_reserved_capacity": UnitPercent,
}
tags = map[string]string{
"AutoScalingGroupName": "eks-a6bb9db9-267c-401c-db55-df8ef645b06f",
Expand Down Expand Up @@ -716,6 +724,10 @@ func TestConvertToOTLPMetricsForPodMetrics(t *testing.T) {
"pod_status_unknown": 0,
"pod_status_ready": 1,
"pod_status_scheduled": 1,
"pod_gpu_request": 1,
"pod_gpu_limit": 1,
"pod_gpu_usage_total": 1,
"pod_gpu_reserved_capacity": 2.3677681271483983,
}
expectedUnits = map[string]string{
"pod_cpu_limit": "",
Expand Down Expand Up @@ -762,6 +774,10 @@ func TestConvertToOTLPMetricsForPodMetrics(t *testing.T) {
"pod_status_unknown": UnitCount,
"pod_status_ready": UnitCount,
"pod_status_scheduled": UnitCount,
"pod_gpu_request": UnitCount,
"pod_gpu_limit": UnitCount,
"pod_gpu_usage_total": UnitCount,
"pod_gpu_reserved_capacity": UnitPercent,
}
tags = map[string]string{
"ClusterName": "eks-aoc",
Expand Down Expand Up @@ -913,37 +929,3 @@ func TestConvertToOTLPMetricsForPodEfaMetrics(t *testing.T) {
md = ConvertToOTLPMetrics(fields, tags, zap.NewNop())
checkMetricsAreExpected(t, md, fields, tags, expectedUnits)
}

func TestConvertToOTLPMetricsForAcceleratorCountMetrics(t *testing.T) {
var fields map[string]any
var expectedUnits map[string]string
var tags map[string]string
var md pmetric.Metrics
now := time.Now()
timestamp := strconv.FormatInt(now.UnixNano(), 10)

fields = map[string]any{
"pod_gpu_limit": int64(3),
"pod_gpu_total": int64(3),
"pod_gpu_request": int64(3),
}
expectedUnits = map[string]string{
"pod_gpu_limit": UnitCount,
"pod_gpu_total": UnitCount,
"pod_gpu_request": UnitCount,
}
tags = map[string]string{
"ClusterName": "eks-aoc",
"InstanceId": "i-01bf9fb097cbf3205",
"InstanceType": "t2.xlarge",
"Namespace": "amazon-cloudwatch",
"NodeName": "ip-192-168-12-170.ec2.internal",
"PodName": "cloudwatch-agent",
"ContainerName": "cloudwatch-agent",
"Type": "PodGPU",
"Version": "0",
"Timestamp": timestamp,
}
md = ConvertToOTLPMetrics(fields, tags, zap.NewNop())
checkMetricsAreExpected(t, md, fields, tags, expectedUnits)
}
53 changes: 12 additions & 41 deletions internal/aws/k8s/k8sclient/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,6 @@ import (
"k8s.io/client-go/tools/cache"
)

const (
instanceTypeLabelKey = "node.kubernetes.io/instance-type"
instanceTypeLabelKeyBeta = "beta.kubernetes.io/instance-type"
)

// This needs to be reviewed for newer versions of k8s.
var failedNodeConditions = map[v1.NodeConditionType]bool{
v1.NodeMemoryPressure: true,
Expand All @@ -32,7 +27,6 @@ var failedNodeConditions = map[v1.NodeConditionType]bool{
}

type NodeClient interface {
NodeInfos() map[string]*NodeInfo
// Get the number of failed nodes for current cluster
ClusterFailedNodeCount() int
// Get the number of nodes for current cluster
Expand Down Expand Up @@ -78,23 +72,13 @@ type nodeClient struct {
captureNodeLevelInfo bool

mu sync.RWMutex
nodeInfos map[string]*NodeInfo
clusterFailedNodeCount int
clusterNodeCount int
nodeToCapacityMap map[string]v1.ResourceList
nodeToAllocatableMap map[string]v1.ResourceList
nodeToConditionsMap map[string]map[v1.NodeConditionType]v1.ConditionStatus
}

func (c *nodeClient) NodeInfos() map[string]*NodeInfo {
if c.store.GetResetRefreshStatus() {
c.refresh()
}
c.mu.RLock()
defer c.mu.RUnlock()
return c.nodeInfos
}

func (c *nodeClient) ClusterFailedNodeCount() int {
if c.store.GetResetRefreshStatus() {
c.refresh()
Expand Down Expand Up @@ -161,26 +145,24 @@ func (c *nodeClient) refresh() {
nodeToAllocatableMap := make(map[string]v1.ResourceList)
nodeToConditionsMap := make(map[string]map[v1.NodeConditionType]v1.ConditionStatus)

nodeInfos := map[string]*NodeInfo{}
for _, obj := range objsList {
node := obj.(*NodeInfo)
nodeInfos[node.Name] = node
node := obj.(*nodeInfo)

if c.captureNodeLevelInfo {
nodeToCapacityMap[node.Name] = node.Capacity
nodeToAllocatableMap[node.Name] = node.Allocatable
nodeToCapacityMap[node.name] = node.capacity
nodeToAllocatableMap[node.name] = node.allocatable
conditionsMap := make(map[v1.NodeConditionType]v1.ConditionStatus)
for _, condition := range node.Conditions {
for _, condition := range node.conditions {
conditionsMap[condition.Type] = condition.Status
}
nodeToConditionsMap[node.Name] = conditionsMap
nodeToConditionsMap[node.name] = conditionsMap
}
clusterNodeCountNew++

failed := false

Loop:
for _, condition := range node.Conditions {
for _, condition := range node.conditions {
if _, ok := failedNodeConditions[condition.Type]; ok {
// match the failedNodeConditions type we care about
if condition.Status != v1.ConditionFalse {
Expand All @@ -196,7 +178,6 @@ func (c *nodeClient) refresh() {
}
}

c.nodeInfos = nodeInfos
c.clusterFailedNodeCount = clusterFailedNodeCountNew
c.clusterNodeCount = clusterNodeCountNew
c.nodeToCapacityMap = nodeToCapacityMap
Expand Down Expand Up @@ -241,23 +222,13 @@ func transformFuncNode(obj any) (any, error) {
if !ok {
return nil, fmt.Errorf("input obj %v is not Node type", obj)
}
info := new(NodeInfo)
info.Name = node.Name
info.Capacity = node.Status.Capacity
info.Allocatable = node.Status.Allocatable
info.Conditions = []*NodeCondition{}
info.ProviderID = node.Spec.ProviderID
if instanceType, ok := node.Labels[instanceTypeLabelKey]; ok {
info.InstanceType = instanceType
} else {
// fallback for compatibility with k8s versions older than v1.17
// https://kubernetes.io/docs/reference/labels-annotations-taints/#beta-kubernetes-io-instance-type-deprecated
if instanceType, ok := node.Labels[instanceTypeLabelKeyBeta]; ok {
info.InstanceType = instanceType
}
}
info := new(nodeInfo)
info.name = node.Name
info.capacity = node.Status.Capacity
info.allocatable = node.Status.Allocatable
info.conditions = []*NodeCondition{}
for _, condition := range node.Status.Conditions {
info.Conditions = append(info.Conditions, &NodeCondition{
info.conditions = append(info.conditions, &NodeCondition{
Type: condition.Type,
Status: condition.Status,
})
Expand Down
12 changes: 5 additions & 7 deletions internal/aws/k8s/k8sclient/node_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,11 @@ import (
v1 "k8s.io/api/core/v1"
)

type NodeInfo struct {
Name string
Conditions []*NodeCondition
Capacity v1.ResourceList
Allocatable v1.ResourceList
ProviderID string
InstanceType string
type nodeInfo struct {
name string
conditions []*NodeCondition
capacity v1.ResourceList
allocatable v1.ResourceList
}

type NodeCondition struct {
Expand Down
Loading

0 comments on commit 9c631d4

Please sign in to comment.