Skip to content

Commit

Permalink
cadvisor: Fix pod detection for containerd aws#188
Browse files Browse the repository at this point in the history
  • Loading branch information
pingleig committed Mar 22, 2021
1 parent fbdd619 commit 8784c3c
Show file tree
Hide file tree
Showing 9 changed files with 57 additions and 35 deletions.
3 changes: 3 additions & 0 deletions internal/containerinsightscommon/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,4 +92,7 @@ const (
TypeContainer = "Container"
TypeContainerFS = "ContainerFS"
TypeContainerDiskIO = "ContainerDiskIO"
// Special type for pause container, introduced in https://github.com/aws/amazon-cloudwatch-agent/issues/188
// because containerd does not set container name pause container name to POD like docker does.
TypeInfraContainer = "InfraContainer"
)
43 changes: 23 additions & 20 deletions plugins/inputs/cadvisor/container_info_processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,32 +98,42 @@ func processContainer(info *cinfo.ContainerInfo, detailMode bool, containerOrche
if !detailMode {
return result, pKey
}

if len(info.Spec.Labels) == 0 {
log.Printf("W! no label found from container spec, is containerd socket mounted? https://github.com/aws/amazon-cloudwatch-agent/issues/188")
}
// Only a container has all these three labels set.
containerName := info.Spec.Labels[containerNameLable]
namespace := info.Spec.Labels[namespaceLable]
podName := info.Spec.Labels[podNameLable]
podId := info.Spec.Labels[podIdLable]
if containerName == "" || namespace == "" || podName == "" {
// NOTE: containerName can be empty for pause container on containerd
// https://github.com/containerd/cri/issues/922#issuecomment-423729537
if namespace == "" || podName == "" {
return result, pKey
}

// Pod's cgroup path is parent for a container.
// contianer name: /kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod04d39715_075e_4c7c_b128_67f7897c05b7.slice/docker-57b3dabd69b94beb462244a0c15c244b509adad0940cdcc67ca079b8208ec1f2.scope
// container name: /kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod04d39715_075e_4c7c_b128_67f7897c05b7.slice/docker-57b3dabd69b94beb462244a0c15c244b509adad0940cdcc67ca079b8208ec1f2.scope
// pod name: /kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod04d39715_075e_4c7c_b128_67f7897c05b7.slice/
podPath := path.Dir(info.Name)
pKey = &podKey{cgroupPath: podPath, podName: podName, podId: podId, namespace: namespace}

tags[PodIdKey] = podId
tags[K8sPodNameKey] = podName
tags[K8sNamespace] = namespace
if containerName != infraContainerName {

switch containerName {
// For docker, pause container name is set to POD while containerd does not set it.
// See https://github.com/aws/amazon-cloudwatch-agent/issues/188
case "", infraContainerName:
// NOTE: the pod here is only used by NetMetricExtractor,
// other pod info like CPU, Mem are dealt within in processPod.
containerType = TypeInfraContainer
default:
tags[ContainerNamekey] = containerName
tags[ContainerIdkey] = path.Base(info.Name)
containerType = TypeContainer
} else {
// NOTE: the pod here is only used by NetMetricExtractor,
// other pod info like CPU, Mem are dealt within in processPod.
containerType = TypePod
}
} else {
containerType = TypeNode
Expand All @@ -146,15 +156,18 @@ func processContainer(info *cinfo.ContainerInfo, detailMode bool, containerOrche
return result, pKey
}

// processPod is almost identical as processContainer. We got this second loop because pod detection relies
// on inspecting labels from containers in processContainer. cgroup path for detected pods are saved in podKeys.
// We may not get container before pod when looping all returned cgroup paths so we use a two pass solution
// in processContainers.
func processPod(info *cinfo.ContainerInfo, podKeys map[string]podKey) []*extractors.CAdvisorMetric {
var result []*extractors.CAdvisorMetric
if isContainerInContainer(info.Name) {
log.Printf("D! drop metric because it's nested container, name %s", info.Name)
return result
}

podKey := getPodKey(info, podKeys)
if podKey == nil {
podKey, ok := podKeys[info.Name]
if !ok {
return result
}

Expand All @@ -177,16 +190,6 @@ func processPod(info *cinfo.ContainerInfo, podKeys map[string]podKey) []*extract
return result
}

func getPodKey(info *cinfo.ContainerInfo, podKeys map[string]podKey) *podKey {
key := info.Name

if v, ok := podKeys[key]; ok {
return &v
}

return nil
}

// Check if it's a container running inside container, caller will drop the metric when return value is true.
// The validation is based on ContainerReference.Name, which is essentially cgroup path.
// The first version is from https://github.com/aws/amazon-cloudwatch-agent/commit/e8daa5f5926c5a5f38e0ceb746c141be463e11e4#diff-599185154c116b295172b56311729990d20672f6659500870997c018ce072100
Expand Down
6 changes: 4 additions & 2 deletions plugins/inputs/cadvisor/extractors/cpu_extractor.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ func (c *CpuMetricExtractor) recordPreviousInfo(info *cInfo.ContainerInfo) {

func (c *CpuMetricExtractor) GetValue(info *cInfo.ContainerInfo, containerType string) []*CAdvisorMetric {
var metrics []*CAdvisorMetric
if info.Spec.Labels[containerNameLable] == infraContainerName {
// Skip infra container and handle node, pod, other containers in pod
if containerType == TypeInfraContainer {
return metrics
}

Expand All @@ -41,6 +42,7 @@ func (c *CpuMetricExtractor) GetValue(info *cInfo.ContainerInfo, containerType s

if deltaCTimeInNano > MinTimeDiff {
metric := newCadvisorMetric(containerType)
metric.cgroupPath = info.Name

metric.fields[MetricName(containerType, CpuTotal)] = float64(curStats.Cpu.Usage.Total-preStats.Cpu.Usage.Total) / float64(deltaCTimeInNano) * decimalToMillicores
metric.fields[MetricName(containerType, CpuUser)] = float64(curStats.Cpu.Usage.User-preStats.Cpu.Usage.User) / float64(deltaCTimeInNano) * decimalToMillicores
Expand All @@ -59,6 +61,6 @@ func (c *CpuMetricExtractor) CleanUp(now time.Time) {

func NewCpuMetricExtractor() *CpuMetricExtractor {
return &CpuMetricExtractor{
preInfos: mapWithExpiry.NewMapWithExpiry(CleanInteval),
preInfos: mapWithExpiry.NewMapWithExpiry(CleanInterval),
}
}
2 changes: 1 addition & 1 deletion plugins/inputs/cadvisor/extractors/diskio_extractor.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ func (d *DiskIOMetricExtractor) CleanUp(now time.Time) {

func NewDiskIOMetricExtractor() *DiskIOMetricExtractor {
return &DiskIOMetricExtractor{
preInfos: mapWithExpiry.NewMapWithExpiry(CleanInteval),
preInfos: mapWithExpiry.NewMapWithExpiry(CleanInterval),
}
}

Expand Down
20 changes: 14 additions & 6 deletions plugins/inputs/cadvisor/extractors/extractor.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,28 @@ import (
)

const (
containerNameLable = "io.kubernetes.container.name"
// TODO: https://github.com/containerd/cri/issues/922#issuecomment-423729537 the container name can be empty on containerd
infraContainerName = "POD"
containerNameLabel = "io.kubernetes.container.name"
Metrics = "Metrics"
Dimensions = "Dimensions"
CleanInteval = 5 * time.Minute
CleanInterval = 5 * time.Minute
)

type MetricExtractor interface {
HasValue(*cinfo.ContainerInfo) bool
GetValue(*cinfo.ContainerInfo, string) []*CAdvisorMetric
// GetValue normally applies to the following types:
// containerinsightscommon.TypeContainer
// containerinsightscommon.TypePod
// containerinsightscommon.TypeNode
// and ignores:
// containerinsightscommon.TypeInfraContainer
// The only exception is NetMetricExtractor because pod network metrics comes from infra container (i.e. pause).
// See https://www.ianlewis.org/en/almighty-pause-container
GetValue(info *cinfo.ContainerInfo, containerType string) []*CAdvisorMetric
CleanUp(time.Time)
}

type CAdvisorMetric struct {
cgroupPath string // source of the metric for debugging merge conflict
fields map[string]interface{}
tags map[string]string
metricType string
Expand Down Expand Up @@ -68,7 +75,8 @@ func (c *CAdvisorMetric) Merge(src *CAdvisorMetric) {
// If there is any conflict, keep the fields with earlier timestamp
for k, v := range src.fields {
if _, ok := c.fields[k]; ok {
log.Printf("D! metric being merged has conflict in fields, src: %v, dest: %v \n", *src, *c)
log.Printf("D! metric being merged has conflict in fields, path src: %q, dest: %q", src.cgroupPath, c.cgroupPath)
log.Printf("D! metric being merged has conflict in fields, src: %v, dest: %v", *src, *c)
if c.tags[containerinsightscommon.Timestamp] < src.tags[containerinsightscommon.Timestamp] {
continue
}
Expand Down
2 changes: 1 addition & 1 deletion plugins/inputs/cadvisor/extractors/fs_extractor.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ func (f *FileSystemMetricExtractor) HasValue(info *cinfo.ContainerInfo) bool {

func (f *FileSystemMetricExtractor) GetValue(info *cinfo.ContainerInfo, containerType string) []*CAdvisorMetric {
var metrics []*CAdvisorMetric
if containerType == TypePod || info.Spec.Labels[containerNameLable] == infraContainerName {
if containerType == TypePod || containerType == TypeInfraContainer {
return metrics
}

Expand Down
5 changes: 3 additions & 2 deletions plugins/inputs/cadvisor/extractors/mem_extractor.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,12 @@ func (m *MemMetricExtractor) HasValue(info *cinfo.ContainerInfo) bool {

func (m *MemMetricExtractor) GetValue(info *cinfo.ContainerInfo, containerType string) []*CAdvisorMetric {
var metrics []*CAdvisorMetric
if info.Spec.Labels[containerNameLable] == infraContainerName {
if containerType == TypeInfraContainer {
return metrics
}

metric := newCadvisorMetric(containerType)
metric.cgroupPath = info.Name
curStats := GetStats(info)

metric.fields[MetricName(containerType, MemUsage)] = curStats.Memory.Usage
Expand Down Expand Up @@ -64,6 +65,6 @@ func (m *MemMetricExtractor) CleanUp(now time.Time) {

func NewMemMetricExtractor() *MemMetricExtractor {
return &MemMetricExtractor{
preInfos: mapWithExpiry.NewMapWithExpiry(CleanInteval),
preInfos: mapWithExpiry.NewMapWithExpiry(CleanInterval),
}
}
10 changes: 7 additions & 3 deletions plugins/inputs/cadvisor/extractors/net_extractor.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,14 @@ func (n *NetMetricExtractor) HasValue(info *cinfo.ContainerInfo) bool {
func (n *NetMetricExtractor) GetValue(info *cinfo.ContainerInfo, containerType string) []*CAdvisorMetric {
var metrics []*CAdvisorMetric

// Just a protection here, there is no Container level Net metrics
if (containerType == TypePod && info.Spec.Labels[containerNameLable] != infraContainerName) || containerType == TypeContainer {
// Ignore both pod and container because the network metrics comes from InfraContainer.
if containerType == TypePod || containerType == TypeContainer {
return metrics
}
// Rename type to pod so the metric name prefix is pod_
if containerType == TypeInfraContainer {
containerType = TypePod
}

if preInfo, ok := n.preInfos.Get(info.Name); ok {
curStats := GetStats(info)
Expand Down Expand Up @@ -112,7 +116,7 @@ func (n *NetMetricExtractor) CleanUp(now time.Time) {

func NewNetMetricExtractor() *NetMetricExtractor {
return &NetMetricExtractor{
preInfos: mapWithExpiry.NewMapWithExpiry(CleanInteval),
preInfos: mapWithExpiry.NewMapWithExpiry(CleanInterval),
}
}

Expand Down
1 change: 1 addition & 0 deletions plugins/inputs/cadvisor/merger.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package cadvisor

import (
"fmt"

. "github.com/aws/amazon-cloudwatch-agent/internal/containerinsightscommon"
"github.com/aws/amazon-cloudwatch-agent/plugins/inputs/cadvisor/extractors"
)
Expand Down

0 comments on commit 8784c3c

Please sign in to comment.