From 8784c3cdc833b99e8e6136781bbefa44b01e191a Mon Sep 17 00:00:00 2001 From: Pinglei Guo Date: Sun, 21 Mar 2021 21:07:40 -0700 Subject: [PATCH] cadvisor: Fix pod detection for containerd #188 --- internal/containerinsightscommon/const.go | 3 ++ .../cadvisor/container_info_processor.go | 43 ++++++++++--------- .../cadvisor/extractors/cpu_extractor.go | 6 ++- .../cadvisor/extractors/diskio_extractor.go | 2 +- .../inputs/cadvisor/extractors/extractor.go | 20 ++++++--- .../cadvisor/extractors/fs_extractor.go | 2 +- .../cadvisor/extractors/mem_extractor.go | 5 ++- .../cadvisor/extractors/net_extractor.go | 10 +++-- plugins/inputs/cadvisor/merger.go | 1 + 9 files changed, 57 insertions(+), 35 deletions(-) diff --git a/internal/containerinsightscommon/const.go b/internal/containerinsightscommon/const.go index db8084e33a..0b9dc01672 100644 --- a/internal/containerinsightscommon/const.go +++ b/internal/containerinsightscommon/const.go @@ -92,4 +92,7 @@ const ( TypeContainer = "Container" TypeContainerFS = "ContainerFS" TypeContainerDiskIO = "ContainerDiskIO" + // Special type for pause container, introduced in https://github.com/aws/amazon-cloudwatch-agent/issues/188 + // because containerd does not set container name pause container name to POD like docker does. + TypeInfraContainer = "InfraContainer" ) diff --git a/plugins/inputs/cadvisor/container_info_processor.go b/plugins/inputs/cadvisor/container_info_processor.go index 2e8d0741dc..aca152fc57 100644 --- a/plugins/inputs/cadvisor/container_info_processor.go +++ b/plugins/inputs/cadvisor/container_info_processor.go @@ -98,17 +98,23 @@ func processContainer(info *cinfo.ContainerInfo, detailMode bool, containerOrche if !detailMode { return result, pKey } + + if len(info.Spec.Labels) == 0 { + log.Printf("W! no label found from container spec, is containerd socket mounted? https://github.com/aws/amazon-cloudwatch-agent/issues/188") + } // Only a container has all these three labels set. containerName := info.Spec.Labels[containerNameLable] namespace := info.Spec.Labels[namespaceLable] podName := info.Spec.Labels[podNameLable] podId := info.Spec.Labels[podIdLable] - if containerName == "" || namespace == "" || podName == "" { + // NOTE: containerName can be empty for pause container on containerd + // https://github.com/containerd/cri/issues/922#issuecomment-423729537 + if namespace == "" || podName == "" { return result, pKey } // Pod's cgroup path is parent for a container. - // contianer name: /kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod04d39715_075e_4c7c_b128_67f7897c05b7.slice/docker-57b3dabd69b94beb462244a0c15c244b509adad0940cdcc67ca079b8208ec1f2.scope + // container name: /kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod04d39715_075e_4c7c_b128_67f7897c05b7.slice/docker-57b3dabd69b94beb462244a0c15c244b509adad0940cdcc67ca079b8208ec1f2.scope // pod name: /kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod04d39715_075e_4c7c_b128_67f7897c05b7.slice/ podPath := path.Dir(info.Name) pKey = &podKey{cgroupPath: podPath, podName: podName, podId: podId, namespace: namespace} @@ -116,14 +122,18 @@ func processContainer(info *cinfo.ContainerInfo, detailMode bool, containerOrche tags[PodIdKey] = podId tags[K8sPodNameKey] = podName tags[K8sNamespace] = namespace - if containerName != infraContainerName { + + switch containerName { + // For docker, pause container name is set to POD while containerd does not set it. + // See https://github.com/aws/amazon-cloudwatch-agent/issues/188 + case "", infraContainerName: + // NOTE: the pod here is only used by NetMetricExtractor, + // other pod info like CPU, Mem are dealt within in processPod. + containerType = TypeInfraContainer + default: tags[ContainerNamekey] = containerName tags[ContainerIdkey] = path.Base(info.Name) containerType = TypeContainer - } else { - // NOTE: the pod here is only used by NetMetricExtractor, - // other pod info like CPU, Mem are dealt within in processPod. - containerType = TypePod } } else { containerType = TypeNode @@ -146,15 +156,18 @@ func processContainer(info *cinfo.ContainerInfo, detailMode bool, containerOrche return result, pKey } +// processPod is almost identical as processContainer. We got this second loop because pod detection relies +// on inspecting labels from containers in processContainer. cgroup path for detected pods are saved in podKeys. +// We may not get container before pod when looping all returned cgroup paths so we use a two pass solution +// in processContainers. func processPod(info *cinfo.ContainerInfo, podKeys map[string]podKey) []*extractors.CAdvisorMetric { var result []*extractors.CAdvisorMetric if isContainerInContainer(info.Name) { - log.Printf("D! drop metric because it's nested container, name %s", info.Name) return result } - podKey := getPodKey(info, podKeys) - if podKey == nil { + podKey, ok := podKeys[info.Name] + if !ok { return result } @@ -177,16 +190,6 @@ func processPod(info *cinfo.ContainerInfo, podKeys map[string]podKey) []*extract return result } -func getPodKey(info *cinfo.ContainerInfo, podKeys map[string]podKey) *podKey { - key := info.Name - - if v, ok := podKeys[key]; ok { - return &v - } - - return nil -} - // Check if it's a container running inside container, caller will drop the metric when return value is true. // The validation is based on ContainerReference.Name, which is essentially cgroup path. // The first version is from https://github.com/aws/amazon-cloudwatch-agent/commit/e8daa5f5926c5a5f38e0ceb746c141be463e11e4#diff-599185154c116b295172b56311729990d20672f6659500870997c018ce072100 diff --git a/plugins/inputs/cadvisor/extractors/cpu_extractor.go b/plugins/inputs/cadvisor/extractors/cpu_extractor.go index 7c3d7cc685..f5c51b211f 100644 --- a/plugins/inputs/cadvisor/extractors/cpu_extractor.go +++ b/plugins/inputs/cadvisor/extractors/cpu_extractor.go @@ -29,7 +29,8 @@ func (c *CpuMetricExtractor) recordPreviousInfo(info *cInfo.ContainerInfo) { func (c *CpuMetricExtractor) GetValue(info *cInfo.ContainerInfo, containerType string) []*CAdvisorMetric { var metrics []*CAdvisorMetric - if info.Spec.Labels[containerNameLable] == infraContainerName { + // Skip infra container and handle node, pod, other containers in pod + if containerType == TypeInfraContainer { return metrics } @@ -41,6 +42,7 @@ func (c *CpuMetricExtractor) GetValue(info *cInfo.ContainerInfo, containerType s if deltaCTimeInNano > MinTimeDiff { metric := newCadvisorMetric(containerType) + metric.cgroupPath = info.Name metric.fields[MetricName(containerType, CpuTotal)] = float64(curStats.Cpu.Usage.Total-preStats.Cpu.Usage.Total) / float64(deltaCTimeInNano) * decimalToMillicores metric.fields[MetricName(containerType, CpuUser)] = float64(curStats.Cpu.Usage.User-preStats.Cpu.Usage.User) / float64(deltaCTimeInNano) * decimalToMillicores @@ -59,6 +61,6 @@ func (c *CpuMetricExtractor) CleanUp(now time.Time) { func NewCpuMetricExtractor() *CpuMetricExtractor { return &CpuMetricExtractor{ - preInfos: mapWithExpiry.NewMapWithExpiry(CleanInteval), + preInfos: mapWithExpiry.NewMapWithExpiry(CleanInterval), } } diff --git a/plugins/inputs/cadvisor/extractors/diskio_extractor.go b/plugins/inputs/cadvisor/extractors/diskio_extractor.go index 8a9529a9f6..887413a8d4 100644 --- a/plugins/inputs/cadvisor/extractors/diskio_extractor.go +++ b/plugins/inputs/cadvisor/extractors/diskio_extractor.go @@ -90,7 +90,7 @@ func (d *DiskIOMetricExtractor) CleanUp(now time.Time) { func NewDiskIOMetricExtractor() *DiskIOMetricExtractor { return &DiskIOMetricExtractor{ - preInfos: mapWithExpiry.NewMapWithExpiry(CleanInteval), + preInfos: mapWithExpiry.NewMapWithExpiry(CleanInterval), } } diff --git a/plugins/inputs/cadvisor/extractors/extractor.go b/plugins/inputs/cadvisor/extractors/extractor.go index 5e6b20f094..d076f18e11 100644 --- a/plugins/inputs/cadvisor/extractors/extractor.go +++ b/plugins/inputs/cadvisor/extractors/extractor.go @@ -12,21 +12,28 @@ import ( ) const ( - containerNameLable = "io.kubernetes.container.name" - // TODO: https://github.com/containerd/cri/issues/922#issuecomment-423729537 the container name can be empty on containerd - infraContainerName = "POD" + containerNameLabel = "io.kubernetes.container.name" Metrics = "Metrics" Dimensions = "Dimensions" - CleanInteval = 5 * time.Minute + CleanInterval = 5 * time.Minute ) type MetricExtractor interface { HasValue(*cinfo.ContainerInfo) bool - GetValue(*cinfo.ContainerInfo, string) []*CAdvisorMetric + // GetValue normally applies to the following types: + // containerinsightscommon.TypeContainer + // containerinsightscommon.TypePod + // containerinsightscommon.TypeNode + // and ignores: + // containerinsightscommon.TypeInfraContainer + // The only exception is NetMetricExtractor because pod network metrics comes from infra container (i.e. pause). + // See https://www.ianlewis.org/en/almighty-pause-container + GetValue(info *cinfo.ContainerInfo, containerType string) []*CAdvisorMetric CleanUp(time.Time) } type CAdvisorMetric struct { + cgroupPath string // source of the metric for debugging merge conflict fields map[string]interface{} tags map[string]string metricType string @@ -68,7 +75,8 @@ func (c *CAdvisorMetric) Merge(src *CAdvisorMetric) { // If there is any conflict, keep the fields with earlier timestamp for k, v := range src.fields { if _, ok := c.fields[k]; ok { - log.Printf("D! metric being merged has conflict in fields, src: %v, dest: %v \n", *src, *c) + log.Printf("D! metric being merged has conflict in fields, path src: %q, dest: %q", src.cgroupPath, c.cgroupPath) + log.Printf("D! metric being merged has conflict in fields, src: %v, dest: %v", *src, *c) if c.tags[containerinsightscommon.Timestamp] < src.tags[containerinsightscommon.Timestamp] { continue } diff --git a/plugins/inputs/cadvisor/extractors/fs_extractor.go b/plugins/inputs/cadvisor/extractors/fs_extractor.go index ab46604137..f241d0c97e 100644 --- a/plugins/inputs/cadvisor/extractors/fs_extractor.go +++ b/plugins/inputs/cadvisor/extractors/fs_extractor.go @@ -26,7 +26,7 @@ func (f *FileSystemMetricExtractor) HasValue(info *cinfo.ContainerInfo) bool { func (f *FileSystemMetricExtractor) GetValue(info *cinfo.ContainerInfo, containerType string) []*CAdvisorMetric { var metrics []*CAdvisorMetric - if containerType == TypePod || info.Spec.Labels[containerNameLable] == infraContainerName { + if containerType == TypePod || containerType == TypeInfraContainer { return metrics } diff --git a/plugins/inputs/cadvisor/extractors/mem_extractor.go b/plugins/inputs/cadvisor/extractors/mem_extractor.go index f8efdaf022..7e8be186fa 100644 --- a/plugins/inputs/cadvisor/extractors/mem_extractor.go +++ b/plugins/inputs/cadvisor/extractors/mem_extractor.go @@ -25,11 +25,12 @@ func (m *MemMetricExtractor) HasValue(info *cinfo.ContainerInfo) bool { func (m *MemMetricExtractor) GetValue(info *cinfo.ContainerInfo, containerType string) []*CAdvisorMetric { var metrics []*CAdvisorMetric - if info.Spec.Labels[containerNameLable] == infraContainerName { + if containerType == TypeInfraContainer { return metrics } metric := newCadvisorMetric(containerType) + metric.cgroupPath = info.Name curStats := GetStats(info) metric.fields[MetricName(containerType, MemUsage)] = curStats.Memory.Usage @@ -64,6 +65,6 @@ func (m *MemMetricExtractor) CleanUp(now time.Time) { func NewMemMetricExtractor() *MemMetricExtractor { return &MemMetricExtractor{ - preInfos: mapWithExpiry.NewMapWithExpiry(CleanInteval), + preInfos: mapWithExpiry.NewMapWithExpiry(CleanInterval), } } diff --git a/plugins/inputs/cadvisor/extractors/net_extractor.go b/plugins/inputs/cadvisor/extractors/net_extractor.go index 5691f311d5..1e5b956149 100644 --- a/plugins/inputs/cadvisor/extractors/net_extractor.go +++ b/plugins/inputs/cadvisor/extractors/net_extractor.go @@ -39,10 +39,14 @@ func (n *NetMetricExtractor) HasValue(info *cinfo.ContainerInfo) bool { func (n *NetMetricExtractor) GetValue(info *cinfo.ContainerInfo, containerType string) []*CAdvisorMetric { var metrics []*CAdvisorMetric - // Just a protection here, there is no Container level Net metrics - if (containerType == TypePod && info.Spec.Labels[containerNameLable] != infraContainerName) || containerType == TypeContainer { + // Ignore both pod and container because the network metrics comes from InfraContainer. + if containerType == TypePod || containerType == TypeContainer { return metrics } + // Rename type to pod so the metric name prefix is pod_ + if containerType == TypeInfraContainer { + containerType = TypePod + } if preInfo, ok := n.preInfos.Get(info.Name); ok { curStats := GetStats(info) @@ -112,7 +116,7 @@ func (n *NetMetricExtractor) CleanUp(now time.Time) { func NewNetMetricExtractor() *NetMetricExtractor { return &NetMetricExtractor{ - preInfos: mapWithExpiry.NewMapWithExpiry(CleanInteval), + preInfos: mapWithExpiry.NewMapWithExpiry(CleanInterval), } } diff --git a/plugins/inputs/cadvisor/merger.go b/plugins/inputs/cadvisor/merger.go index 8d0a1e8145..f2bf9b751e 100644 --- a/plugins/inputs/cadvisor/merger.go +++ b/plugins/inputs/cadvisor/merger.go @@ -5,6 +5,7 @@ package cadvisor import ( "fmt" + . "github.com/aws/amazon-cloudwatch-agent/internal/containerinsightscommon" "github.com/aws/amazon-cloudwatch-agent/plugins/inputs/cadvisor/extractors" )