From 60ae5fa66807879324325487be0e635916915aad Mon Sep 17 00:00:00 2001 From: Joseph Azevedo Date: Sun, 9 Jan 2022 09:53:13 -0500 Subject: [PATCH] Add support for cgroup v2 (#18) --- CHANGELOG.md | 10 +- README.md | 80 ++-- docs/collecting.md | 2 + docs/collecting_cgroup_v2.md | 55 +++ man/radvisor-run-docker.1.md | 2 +- man/radvisor-run-kubernetes.1.md | 2 +- man/radvisor.1.md | 2 +- src/collection/collectors/all.rs | 12 +- src/collection/collectors/cgroup_v2/files.rs | 45 ++ src/collection/collectors/cgroup_v2/mod.rs | 256 +++++++++++ src/collection/collectors/cgroup_v2/read.rs | 233 +++++++++++ src/collection/collectors/mod.rs | 1 + src/polling/providers/docker.rs | 59 ++- src/polling/providers/kubernetes.rs | 65 ++- src/shared.rs | 1 + src/util/cgroup.rs | 419 ++++++++++++------- src/util/lazy_quantity.rs | 3 + 17 files changed, 1002 insertions(+), 245 deletions(-) create mode 100644 docs/collecting_cgroup_v2.md create mode 100644 src/collection/collectors/cgroup_v2/files.rs create mode 100644 src/collection/collectors/cgroup_v2/mod.rs create mode 100644 src/collection/collectors/cgroup_v2/read.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index e73feea..689ca18 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,13 +9,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 --- -## [1.4.0](https://github.com/elba-docker/radvisor/compare/v1.3.0...v1.4.0) - 2021-02-02 +## [1.4.0](https://github.com/elba-docker/radvisor/compare/v1.3.0...v1.4.0) - 2022-01-09 [![v1.4.0](https://img.shields.io/badge/release-v1.4.0-2bab64)](https://github.com/elba-docker/radvisor/releases/tag/v1.4.0) ### Added - Changed license from the MIT License to the GNU General Public License v3.0 +- Support for [cgroup v2](https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html) was added to the Docker container statistics collection. + - **Motivation**: cgroup v2 handles accounting for writeback I/O better than cgroup v1, and is slowly replacing cgroup v1 as the more modern system. + - **Support**: Docker added support for cgroup v2 in its 20.10 release, and most low-level container runtimes added support for it by the end of 2021. Additionally, cgroup v2 is the default-mounted cgroup version in Ubuntu starting with 21.10, and in most other popular distros starting around 2021. + - **Changes**: + - The schema for the CSV data in the log file is different in cgroup v1 and cgroup v2, since the kernel exposes different statistics for each. + - The `CollectorType` log metadata field was added to distinguish whether a log file contains statistics from `cgroup_v1` or `cgroup_v2`. + - The `Cgroup` and `CgroupDriver` log metadata fields were moved under `CollectorMetadata`. + - (internal) A new abstraction was introduced, `Collector`, which defines a trait that is used to collect resource utilization statistics for a running target. Both `cgroup_v1::Collector` and `cgroup_v2::Collector` implement this trait.
--- diff --git a/README.md b/README.md index c53508f..4d93103 100644 --- a/README.md +++ b/README.md @@ -10,14 +10,14 @@ ### 🐋 Docker -##### `/var/log/radvisor/stats/c0cd2077ec95e1b340e85c2...b_1585108344.log` +##### `/var/log/radvisor/stats/7762ff15c99a2d238f4d26c...1_1641734705.log` ```yaml --- -Version: 1.3.1 +Version: 1.4.0 Provider: docker Metadata: - Created: "2020-10-11T04:22:18Z" + Created: "2022-01-09T13:25:04Z" Command: 'bash -c ''sleep 2s; apt-get update; sleep 2s; DEBIAN_FRONTEND=noninteractive apt-get install -y stress wget; sleep 2s; dd if=/dev/zero of=/tmp/file1 bs=512M count=1 oflag=direct; sleep 2s; stress --cpu 8 --io 4 --vm 4 --vm-bytes 1024M --timeout 10s; sleep 2s; wget "http://ipv4.download.thinkbroadband.com/10MB.zip"; sleep 2s''' Id: 7762ff15c99a2d238f4d26c22b5eda5b97ebc03bd0a711693104dcb6f71fe411 Image: ubuntu @@ -28,46 +28,30 @@ Metadata: Status: Up Less than a second SizeRw: ~ SizeRootFs: ~ -PerfTable: - Delimiter: "," - Columns: - cpu.usage.percpu: - Type: int - Count: 32 - read: - Type: epoch19 +PerfTable: # ... System: OsType: Linux OsRelease: 4.15.0 - Distribution: - Id: ubuntu - IdLike: debian - Name: Ubuntu - PrettyName: Ubuntu 18.04.1 LTS - Version: 18.04.1 LTS (Bionic Beaver) - VersionId: "18.04" - VersionCodename: bionic - CpeName: ~ - BuildId: ~ - Variant: ~ - VariantId: ~ + Distribution: # ... MemoryTotal: 65870408 SwapTotal: 3145724 Hostname: node-0.sandbox.infosphere.emulab.net CpuCount: 32 CpuOnlineCount: 4 CpuSpeed: 1279 -Cgroup: system.slice/docker-7762ff15c99a2d238f4d26c22b5eda5b97ebc03bd0a711693104dcb6f71fe411.scope -CgroupDriver: systemd -PolledAt: 1602390140142271945 -InitializedAt: 1602390140157676566 +CollectorType: cgroup_v2 +CollectorMetadata: + Cgroup: system.slice/docker-7762ff15c99a2d238f4d26c22b5eda5b97ebc03bd0a711693104dcb6f71fe411.scope + CgroupDriver: systemd +PolledAt: 1641734740142271945 +InitializedAt: 1641734740157676566 --- -read,pids.current,pids.max,cpu.usage.total,cpu.usage.system,cpu.usage.user,cpu.usage.percpu,cpu.stat.user,cpu.stat.system,cpu.throttling.periods,cpu.throttling.throttled.count,cpu.throttling.throttled.time,memory.usage.current,memory.usage.max,memory.limit.hard,memory.limit.soft,memory.failcnt,memory.hierarchical_limit.memory,memory.hierarchical_limit.memoryswap,memory.cache,memory.rss.all,memory.rss.huge,memory.mapped,memory.swap,memory.paged.in,memory.paged.out,memory.fault.total,memory.fault.major,memory.anon.inactive,memory.anon.active,memory.file.inactive,memory.file.active,memory.unevictable,blkio.time,blkio.sectors,blkio.service.bytes.read,blkio.service.bytes.write,blkio.service.bytes.sync,blkio.service.bytes.async,blkio.service.ios.read,blkio.service.ios.write,blkio.service.ios.sync,blkio.service.ios.async,blkio.service.time.read,blkio.service.time.write,blkio.service.time.sync,blkio.service.time.async,blkio.queued.read,blkio.queued.write,blkio.queued.sync,blkio.queued.async,blkio.wait.read,blkio.wait.write,blkio.wait.sync,blkio.wait.async,blkio.merged.read,blkio.merged.write,blkio.merged.sync,blkio.merged.async,blkio.throttle.service.bytes.read,blkio.throttle.service.bytes.write,blkio.throttle.service.bytes.sync,blkio.throttle.service.bytes.async,blkio.throttle.service.ios.read,blkio.throttle.service.ios.write,blkio.throttle.service.ios.sync,blkio.throttle.service.ios.async,blkio.bfq.service.bytes.read,blkio.bfq.service.bytes.write,blkio.bfq.service.bytes.sync,blkio.bfq.service.bytes.async,blkio.bfq.service.ios.read,blkio.bfq.service.ios.write,blkio.bfq.service.ios.sync,blkio.bfq.service.ios.async 
-1602390175053135973,18,4915,45675783181,0,45675783181,9719044209 12310201631 11027849186 12618688155 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0,2925,1668,0,0,0,2802823168,3667771392,9223372036854771712,9223372036854771712,0,9223372036854771712,,35323904,2754781184,0,28672,,6837817,6156636,6854722,0,0,2754711552,7380992,27942912,0,2273087370,1306336,0,668844032,662777856,6066176,0,331937,331753,184,0,68057100860,68011971780,45129080,0,0,0,0,0,222907407415,222860666999,46740416,0,32,0,32,0,668844032,662777856,6066176,0,331937,331753,184,,,,,,,, -1602390175103189646,18,4915,45876609757,0,45876610443,9767491855 12362201213 11076227542 12670689833 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0,2938,1676,0,0,0,2968367104,3667771392,9223372036854771712,9223372036854771712,0,9223372036854771712,,35323904,2920067072,0,28672,,6878171,6156636,6895076,0,0,2919976960,7380992,27942912,0,2273087370,1306336,0,668844032,662777856,6066176,0,333749,333565,184,0,68057100860,68011971780,45129080,0,0,0,0,0,222907407415,222860666999,46740416,0,32,0,32,0,668844032,662777856,6066176,0,333750,333566,184,,,,,,,, +read,pids.current,pids.max,cpu.stat/usage_usec,cpu.stat/system_usec,cpu.stat/user_usec,cpu.stat/nr_periods,cpu.stat/nr_throttled,cpu.stat/throttled_usec,memory.current,memory.high,memory.max,memory.stat/anon,memory.stat/file,memory.stat/kernel_stack,memory.stat/pagetables,memory.stat/percpu,memory.stat/sock,memory.stat/shmem,memory.stat/file_mapped,memory.stat/file_dirty,memory.stat/file_writeback,memory.stat/swapcached,memory.stat/inactive_anon,memory.stat/active_anon,memory.stat/inactive_file,memory.stat/active_file,memory.stat/unevictable,memory.stat/pgfault,memory.stat/pgmajfault,io.stat/rbytes,io.stat/wbytes,io.stat/rios,io.stat/wios,io.stat/dbytes,io.stat/dios +1641734705052508079,1,28989,58688,40630,18057,0,0,0,5558272,max,max,405504,3514368,49152,0,0,0,0,2838528,0,0,0,270336,0,1486848,2027520,0,1650,0,3891200,0,58,0,0,0 +# ... ``` -More information about what each column represents can be found in the [docs page](https://github.com/elba-docker/radvisor/blob/master/docs/collecting.md). +More information about what each column represents can be found in the [docs pages](https://github.com/elba-docker/radvisor/blob/master/docs/collecting_cgroup_v2.md) (for information about the columns output when `CollectorType: cgroup_v1`, see [this page instead](https://github.com/elba-docker/radvisor/blob/master/docs/collecting.md)). ### ⚓ Kubernetes @@ -75,7 +59,7 @@ More information about what each column represents can be found in the [docs pag ```yaml --- -Version: 1.3.1 +Version: 1.4.0 Provider: kubernetes Metadata: Uid: 9f0b1893-15e7-442a-966a-b0d19a35fc1c @@ -91,29 +75,11 @@ Metadata: Phase: Running QosClass: BestEffort StartedAt: "2020-03-29T04:32:36Z" -PerfTable: - Delimiter: "," - Columns: - cpu.usage.percpu: - Type: int - Count: 32 - read: - Type: epoch19 +PerfTable: # ... System: OsType: Linux OsRelease: 4.15.0 - Distribution: - Id: ubuntu - IdLike: debian - Name: Ubuntu - PrettyName: Ubuntu 18.04.1 LTS - Version: 18.04.1 LTS (Bionic Beaver) - VersionId: "18.04" - VersionCodename: bionic - CpeName: ~ - BuildId: ~ - Variant: ~ - VariantId: ~ + Distribution: # ...
MemoryTotal: 65870408 SwapTotal: 3145724 Hostname: node-0.sandbox.infosphere.emulab.net @@ -121,13 +87,15 @@ System: CpuOnlineCount: 32 CpuSpeed: 1198 PolledAt: 1585470948008442929 -Cgroup: /kubepods.slice/kubepods-besteffort.slice/kubepods-besteffort-pod9f0b1893_15e7_442a_966a_b0d19a35fc1c.slice -CgroupDriver: systemd +CollectorType: cgroup_v1 +CollectorMetadata: + Cgroup: /kubepods.slice/kubepods-besteffort.slice/kubepods-besteffort-pod9f0b1893_15e7_442a_966a_b0d19a35fc1c.slice + CgroupDriver: systemd InitializedAt: 1585470948030565581 --- read,pids.current,pids.max,cpu.usage.total,cpu.usage.system,cpu.usage.user,cpu.usage.percpu,cpu.stat.user,cpu.stat.system,cpu.throttling.periods,cpu.throttling.throttled.count,cpu.throttling.throttled.time,memory.usage.current,memory.usage.max,memory.limit.hard,memory.limit.soft,memory.failcnt,memory.hierarchical_limit.memory,memory.hierarchical_limit.memoryswap,memory.cache,memory.rss.all,memory.rss.huge,memory.mapped,memory.swap,memory.paged.in,memory.paged.out,memory.fault.total,memory.fault.major,memory.anon.inactive,memory.anon.active,memory.file.inactive,memory.file.active,memory.unevictable,blkio.time,blkio.sectors,blkio.service.bytes.read,blkio.service.bytes.write,blkio.service.bytes.sync,blkio.service.bytes.async,blkio.service.ios.read,blkio.service.ios.write,blkio.service.ios.sync,blkio.service.ios.async,blkio.service.time.read,blkio.service.time.write,blkio.service.time.sync,blkio.service.time.async,blkio.queued.read,blkio.queued.write,blkio.queued.sync,blkio.queued.async,blkio.wait.read,blkio.wait.write,blkio.wait.sync,blkio.wait.async,blkio.merged.read,blkio.merged.write,blkio.merged.sync,blkio.merged.async,blkio.throttle.service.bytes.read,blkio.throttle.service.bytes.write,blkio.throttle.service.bytes.sync,blkio.throttle.service.bytes.async,blkio.throttle.service.ios.read,blkio.throttle.service.ios.write,blkio.throttle.service.ios.sync,blkio.throttle.service.ios.async,blkio.bfq.service.bytes.read,blkio.bfq.service.bytes.write,blkio.bfq.service.bytes.sync,blkio.bfq.service.bytes.async,blkio.bfq.service.ios.read,blkio.bfq.service.ios.write,blkio.bfq.service.ios.sync,blkio.bfq.service.ios.async 1602390175053135973,18,4915,45675783181,0,45675783181,9719044209 12310201631 11027849186 12618688155 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0,2925,1668,0,0,0,2802823168,3667771392,9223372036854771712,9223372036854771712,0,9223372036854771712,,35323904,2754781184,0,28672,,6837817,6156636,6854722,0,0,2754711552,7380992,27942912,0,2273087370,1306336,0,668844032,662777856,6066176,0,331937,331753,184,0,68057100860,68011971780,45129080,0,0,0,0,0,222907407415,222860666999,46740416,0,32,0,32,0,668844032,662777856,6066176,0,331937,331753,184,,,,,,,, -1602390175103189646,18,4915,45876609757,0,45876610443,9767491855 12362201213 11076227542 12670689833 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0,2938,1676,0,0,0,2968367104,3667771392,9223372036854771712,9223372036854771712,0,9223372036854771712,,35323904,2920067072,0,28672,,6878171,6156636,6895076,0,0,2919976960,7380992,27942912,0,2273087370,1306336,0,668844032,662777856,6066176,0,333749,333565,184,0,68057100860,68011971780,45129080,0,0,0,0,0,222907407415,222860666999,46740416,0,32,0,32,0,668844032,662777856,6066176,0,333750,333566,184,,,,,,,, +# ... 
``` ## 📜 Runtime Options Many of the specific details of collection can be controlled via the command line ```console $ radvisor help -radvisor 1.3.1 +radvisor 1.4.0 Joseph Azevedo , Bhanu Garg Monitors container resource utilization with high granularity and low overhead diff --git a/docs/collecting.md b/docs/collecting.md index a8a9e3c..14ce581 100644 --- a/docs/collecting.md +++ b/docs/collecting.md @@ -1,5 +1,7 @@ # Runtime Statistics Collection - cgroup v1 +> **Note**: this document contains information about the cgroup v1 collector implementation. For information about the statistics collection mechanisms used with cgroup v2, see collecting_cgroup_v2.md. + Runtime statistics for each Docker container are taken from the virtual files for each container's cgroup, located at `/sys/fs/cgroup/<subsystem>/docker/<container-id>/file`. More information is available at the Docker wiki: [Runtime metrics](https://docs.docker.com/config/containers/runmetrics/). diff --git a/docs/collecting_cgroup_v2.md b/docs/collecting_cgroup_v2.md new file mode 100644 index 0000000..c4d76da --- /dev/null +++ b/docs/collecting_cgroup_v2.md @@ -0,0 +1,55 @@ +# Runtime Statistics Collection - cgroup v2 + +> **Note**: this document contains information about the cgroup v2 collector implementation. For information about the statistics collection mechanisms used with cgroup v1, see collecting.md. + +Docker prepares individual cgroups for each container, and these are mounted (by default) at `/sys/fs/cgroup/system.slice/docker-<container-id>.scope` when using systemd as the cgroup driver. + +As with cgroup v1, network transfer amounts are out of scope for this tool (even with cgroup v2), since instrumenting network utilization requires an entirely different mechanism than the one used for block (disk) I/O, CPU, and memory. + +## Statistics collected + +The following fields are collected for each log line in the target log files: + +- `read` +- `pids.current` +- `pids.max` +- `cpu.stat/usage_usec` +- `cpu.stat/system_usec` +- `cpu.stat/user_usec` +- `cpu.stat/nr_periods` +- `cpu.stat/nr_throttled` +- `cpu.stat/throttled_usec` +- `memory.current` +- `memory.high` +- `memory.max` +- `memory.stat/anon` +- `memory.stat/file` +- `memory.stat/kernel_stack` +- `memory.stat/pagetables` +- `memory.stat/percpu` +- `memory.stat/sock` +- `memory.stat/shmem` +- `memory.stat/file_mapped` +- `memory.stat/file_dirty` +- `memory.stat/file_writeback` +- `memory.stat/swapcached` +- `memory.stat/inactive_anon` +- `memory.stat/active_anon` +- `memory.stat/inactive_file` +- `memory.stat/active_file` +- `memory.stat/unevictable` +- `memory.stat/pgfault` +- `memory.stat/pgmajfault` +- `io.stat/rbytes` +- `io.stat/wbytes` +- `io.stat/rios` +- `io.stat/wios` +- `io.stat/dbytes` +- `io.stat/dios` + +Most of these fields are straightforward, as they directly correspond to a field in a cgroup accounting file (when in the format of `<file>/<key>`, such as `cpu.stat/usage_usec`). Alternatively, some fields come from cgroup accounting files that contain a single field, such as `pids.current` and `pids.max`. Information about what these fields specifically mean can be found in the [documentation for cgroup v2](https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html). + +The only fields that require discussion are: + +- `read` - this is the timestamp of the log line, as a nanosecond Unix timestamp +- `io.stat/*` - these fields all come from the `io.stat` file, except that the values are added together across all devices to produce a single value for each field.
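+
+For example, reading a container's `io.stat` might look like the following (a made-up two-device example; the device numbers and values are purely illustrative):
+
+```console
+$ cat /sys/fs/cgroup/system.slice/docker-<container-id>.scope/io.stat
+8:0 rbytes=3145728 wbytes=1048576 rios=40 wios=18 dbytes=0 dios=0
+8:16 rbytes=745472 wbytes=0 rios=18 wios=0 dbytes=0 dios=0
+```
+
+Here, rAdvisor would log `io.stat/rbytes` as `3891200` (`3145728 + 745472`), `io.stat/wbytes` as `1048576`, and `io.stat/rios` as `58`, summing each key across both devices.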
diff --git a/man/radvisor-run-docker.1.md b/man/radvisor-run-docker.1.md index 080e18b..8ea2970 100644 --- a/man/radvisor-run-docker.1.md +++ b/man/radvisor-run-docker.1.md @@ -14,7 +14,7 @@ DESCRIPTION =========== **radvisor run docker** runs a collection thread that writes resource statistics to -output CSV files using configurable intervals. While running, it collects statistics for containers by polling the docker daemon to get a list of active running containers (every 1s by default) and using their cgroups to read information on their system resource utilization. +output CSV files using configurable intervals. While running, it collects statistics for containers by polling the docker daemon to get a list of active running containers (every 1s by default) and using their cgroups to read information on their system resource utilization. This works whether the host uses cgroup v1 or cgroup v2, though the individual fields collected will differ. Likely needs to be run as root. diff --git a/man/radvisor-run-kubernetes.1.md b/man/radvisor-run-kubernetes.1.md index 0d03dad..44670f3 100644 --- a/man/radvisor-run-kubernetes.1.md +++ b/man/radvisor-run-kubernetes.1.md @@ -14,7 +14,7 @@ DESCRIPTION =========== **radvisor run kubernetes** runs a collection thread that writes resource statistics to -output CSV files using configurable intervals. While running, it collects statistics for Kubernetes pods, polling the Kubernetes API server to get a list of all active running pods that have been scheduled on the current machine's node, using the cgroup for each pod. +output CSV files using configurable intervals. While running, it collects statistics for Kubernetes pods, polling the Kubernetes API server to get a list of all active running pods that have been scheduled on the current machine's node, using the cgroup for each pod. Note that the Kubernetes command only supports cgroup v1. Needs to be a part of an active cluster and needs to be able to find the Kubernetes config file (or have one specified using **\--kube-config**). diff --git a/man/radvisor.1.md b/man/radvisor.1.md index 313b6e0..9d1753f 100644 --- a/man/radvisor.1.md +++ b/man/radvisor.1.md @@ -21,7 +21,7 @@ Originally developed in Rust as a custom tool to help detect and analyze millibo rAdvisor runs by polling the target *provider* (either the local Docker daemon or the Kubernetes API server) every 1 second to get a list of active, running containers/pods. From this list, rAdvisor runs a collection thread every 50ms to get resource utilization data for each active target -using Linux [`cgroups`](https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/6/html/resource_management_guide/ch01), +using Linux [`cgroups`](https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/6/html/resource_management_guide/ch01) (both v1 and v2), outputting the resultant logs in `/var/log/radvisor/stats`. The primary command is `radvisor run`, which has its own man page at **radvisor-run(1)**.
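+
+rAdvisor detects the enabled cgroup version at runtime: if `/sys/fs/cgroup/cgroup.controllers` exists, the cgroup v2 collector
+is used; otherwise, collection falls back to cgroup v1. An equivalent manual check (a sketch, assuming cgroups are mounted at
+the standard `/sys/fs/cgroup` location):
+
+```console
+$ test -f /sys/fs/cgroup/cgroup.controllers && echo "cgroup v2" || echo "cgroup v1"
+```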
diff --git a/src/collection/collectors/all.rs b/src/collection/collectors/all.rs index 4dd8515..c527dbe 100644 --- a/src/collection/collectors/all.rs +++ b/src/collection/collectors/all.rs @@ -1,41 +1,47 @@ use crate::collection::buffers::WorkingBuffers; -use crate::collection::collectors::{cgroup_v1, Collector, StatWriter}; +use crate::collection::collectors::{cgroup_v1, cgroup_v2, Collector, StatWriter}; use crate::collection::perf_table::TableMetadata; use crate::shared::CollectionMethod; use anyhow::Error; pub enum CollectorImpl { CgroupV1(cgroup_v1::Collector), + CgroupV2(cgroup_v2::Collector), } impl Collector for CollectorImpl { fn metadata(&mut self) -> Option<serde_yaml::Value> { match self { Self::CgroupV1(v1) => v1.metadata(), + Self::CgroupV2(v2) => v2.metadata(), } } fn table_metadata(&mut self) -> TableMetadata { match self { Self::CgroupV1(v1) => v1.table_metadata(), + Self::CgroupV2(v2) => v2.table_metadata(), } } fn get_type(&self) -> &'static str { match self { Self::CgroupV1(v1) => v1.get_type(), + Self::CgroupV2(v2) => v2.get_type(), } } fn init(&mut self) -> Result<(), Error> { match self { Self::CgroupV1(v1) => v1.init(), + Self::CgroupV2(v2) => v2.init(), } } fn write_header(&mut self, writer: &mut StatWriter) -> Result<(), csv::Error> { match self { Self::CgroupV1(v1) => v1.write_header(writer), + Self::CgroupV2(v2) => v2.write_header(writer), } } @@ -46,6 +52,7 @@ impl Collector for CollectorImpl { ) -> Result<(), csv::Error> { match self { Self::CgroupV1(v1) => v1.collect(writer, working_buffers), + Self::CgroupV2(v2) => v2.collect(writer, working_buffers), } } } @@ -56,6 +63,9 @@ impl From<CollectionMethod> for CollectorImpl { CollectionMethod::LinuxCgroupV1(path) => { Self::CgroupV1(cgroup_v1::Collector::new(path)) }, + CollectionMethod::LinuxCgroupV2(path) => { + Self::CgroupV2(cgroup_v2::Collector::new(path)) + }, } } } diff --git a/src/collection/collectors/cgroup_v2/files.rs b/src/collection/collectors/cgroup_v2/files.rs new file mode 100644 index 0000000..abae074 --- /dev/null +++ b/src/collection/collectors/cgroup_v2/files.rs @@ -0,0 +1,45 @@ +use std::fs::File; +use std::path::{Path, PathBuf}; + +const CGROUP_V2_ROOT: &str = "/sys/fs/cgroup"; + +/// File handles re-used for each target that read into the cgroup virtual filesystem +pub struct ProcFileHandles { + pub pids_current: Option<File>, + pub pids_max: Option<File>, + pub cpu_stat: Option<File>, + pub memory_current: Option<File>, + pub memory_high: Option<File>, + pub memory_max: Option<File>, + pub memory_stat: Option<File>, + pub io_stat: Option<File>, +} + +impl ProcFileHandles { + /// Initializes all file handles to the cgroup stat files, utilizing them over the + /// entire timeline of the target monitoring.
If a handle fails to + /// open, the struct field will be None + #[must_use] + pub fn new<C: AsRef<Path>>(cgroup: C) -> Self { + Self { + pids_current: o(&cgroup, "pids.current"), + pids_max: o(&cgroup, "pids.max"), + cpu_stat: o(&cgroup, "cpu.stat"), + memory_current: o(&cgroup, "memory.current"), + memory_high: o(&cgroup, "memory.high"), + memory_max: o(&cgroup, "memory.max"), + memory_stat: o(&cgroup, "memory.stat"), + io_stat: o(&cgroup, "io.stat"), + } + } +} + +/// Opens a stats file in the cgroup v2 virtual filesystem corresponding to the given +/// relative cgroup +#[must_use] +fn o<C: AsRef<Path>>(cgroup: C, file: &str) -> Option<File> { + let mut path: PathBuf = PathBuf::from(CGROUP_V2_ROOT); + path.push(cgroup); + path.push(file); + File::open(path).ok() +} diff --git a/src/collection/collectors/cgroup_v2/mod.rs b/src/collection/collectors/cgroup_v2/mod.rs new file mode 100644 index 0000000..9e53caa --- /dev/null +++ b/src/collection/collectors/cgroup_v2/mod.rs @@ -0,0 +1,256 @@ +mod files; +mod read; + +use crate::collection::buffers::WorkingBuffers; +use crate::collection::collectors::{Collector as CollectorTrait, StatWriter}; +use crate::collection::perf_table::{Column, ColumnType, TableMetadata}; +use crate::util::{self, CgroupDriver, CgroupPath}; +use anyhow::Error; +use csv::ByteRecord; +use files::ProcFileHandles; +use serde::Serialize; +use std::collections::BTreeMap; +use std::path::PathBuf; + +/// Implements `crate::collection::collector::Collector` +/// for cgroup v2-sourced data +pub struct Collector { + cgroup: CgroupPath, + file_handles: Option<ProcFileHandles>, +} + +impl Collector { + pub const fn new(cgroup: CgroupPath) -> Self { + Self { + cgroup, + file_handles: None, + } + } +} + +#[derive(Clone, Debug, PartialEq, Serialize)] +#[serde(rename_all = "PascalCase")] +struct Metadata<'a> { + cgroup: &'a PathBuf, + cgroup_driver: &'a CgroupDriver, +} + +impl CollectorTrait for Collector { + fn metadata(&mut self) -> Option<serde_yaml::Value> { + let metadata = Metadata { + cgroup: &self.cgroup.path, + cgroup_driver: &self.cgroup.driver, + }; + + serde_yaml::to_value(&metadata).ok() + } + + fn table_metadata(&mut self) -> TableMetadata { + let mut columns: BTreeMap<String, Column> = BTreeMap::new(); + // Include metadata on the read (timestamp) column + columns.insert(String::from("read"), Column::Scalar { + r#type: ColumnType::Epoch19, + }); + TableMetadata { + delimiter: ",", + columns, + } + } + + fn get_type(&self) -> &'static str { "cgroup_v2" } + + fn init(&mut self) -> Result<(), Error> { + // Open file handles to all of the stat files in the cgroupfs + let handles = ProcFileHandles::new(&self.cgroup.path); + self.file_handles = Some(handles); + Ok(()) + } + + fn write_header(&mut self, writer: &mut StatWriter) -> Result<(), csv::Error> { + writer.write_byte_record(&HEADER) + } + + fn collect( + &mut self, + writer: &mut StatWriter, + working_buffers: &mut WorkingBuffers, + ) -> Result<(), csv::Error> { + let file_handles = self + .file_handles + .as_ref() + .expect("file handles not yet initialized during collect()"); + + collect_read(working_buffers); + let pids_result = collect_pids(working_buffers, file_handles); + let cpu_result = collect_cpu(working_buffers, file_handles); + let memory_result = collect_memory(working_buffers, file_handles); + let io_result = collect_io(working_buffers, file_handles); + + // If all of the cgroup file reads were empty, + // skip writing the byte record.
+ if pids_result == Err(read::Empty) + && cpu_result == Err(read::Empty) + && memory_result == Err(read::Empty) + && io_result == Err(read::Empty) + { + // Discard the working record + working_buffers.record.clear(); + } else { + let result = writer.write_byte_record(&working_buffers.record); + working_buffers.record.clear(); + result?; + } + + Ok(()) + } +} + +lazy_static::lazy_static! { + /// Static CSV header for the stats collector + static ref HEADER: ByteRecord = ByteRecord::from(get_headers()); +} + +/// Creates the headers for the logfiles +#[allow(clippy::vec_init_then_push)] +fn get_headers() -> Vec<String> { + let mut headers: Vec<String> = vec![]; + // Add read headers + headers.push("read".into()); + // Add pids headers + headers.push("pids.current".into()); + headers.push("pids.max".into()); + // Add cpu headers + for cpu_stat_key in CPU_STAT_KEYS { + headers.push(format!( + "cpu.stat/{}", + String::from_utf8(cpu_stat_key.to_vec()).unwrap() + )); + } + // Add memory headers + headers.push("memory.current".into()); + headers.push("memory.high".into()); + headers.push("memory.max".into()); + for memory_stat_key in MEMORY_STAT_KEYS { + headers.push(format!( + "memory.stat/{}", + String::from_utf8(memory_stat_key.to_vec()).unwrap() + )); + } + // Add io headers + for io_stat_key in IO_STAT_KEYS { + headers.push(format!( + "io.stat/{}", + String::from_utf8(io_stat_key.to_vec()).unwrap() + )); + } + + headers +} + +/// Collects the nanosecond unix timestamp read time +#[inline] +fn collect_read(buffers: &mut WorkingBuffers) { + let nano_ts = util::nano_ts(); + let mut itoa_buffer = itoa::Buffer::new(); + let formatted = itoa_buffer.format(nano_ts); + buffers.record.push_field(formatted.as_bytes()); +} + +/// Collects all stats for the pids controller +/// see <https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html> +#[inline] +fn collect_pids( + buffers: &mut WorkingBuffers, + handles: &ProcFileHandles, +) -> Result<(), read::Empty> { + let pids_current = read::single_value_file(&handles.pids_current, buffers, b"0"); + let pids_max = read::single_value_file(&handles.pids_max, buffers, b"max"); + if pids_current == Err(read::Empty) && pids_max == Err(read::Empty) { + Err(read::Empty) + } else { + Ok(()) + } +} + +/// Keys to read from the cpu.stat file +const CPU_STAT_KEYS: [&[u8]; 6] = [ + b"usage_usec", + b"system_usec", + b"user_usec", + b"nr_periods", + b"nr_throttled", + b"throttled_usec", +]; +const CPU_STAT_DEFAULTS: [&[u8]; 6] = [b"0"; 6]; + +/// Collects all stats for the cpu controller +/// see <https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html> +#[inline] +fn collect_cpu(buffers: &mut WorkingBuffers, handles: &ProcFileHandles) -> Result<(), read::Empty> { + read::flat_keyed_file( + &handles.cpu_stat, + buffers, + &CPU_STAT_KEYS, + &CPU_STAT_DEFAULTS, + ) +} + +/// Keys to read from the memory.stat file +const MEMORY_STAT_KEYS: [&[u8]; 18] = [ + b"anon", + b"file", + b"kernel_stack", + b"pagetables", + b"percpu", + b"sock", + b"shmem", + b"file_mapped", + b"file_dirty", + b"file_writeback", + b"swapcached", + b"inactive_anon", + b"active_anon", + b"inactive_file", + b"active_file", + b"unevictable", + b"pgfault", + b"pgmajfault", +]; +const MEMORY_STAT_DEFAULTS: [&[u8]; 18] = [b"0"; 18]; + +/// Collects all stats for the memory controller +/// see <https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html> +#[inline] +fn collect_memory( + buffers: &mut WorkingBuffers, + handles: &ProcFileHandles, +) -> Result<(), read::Empty> { + let mem_current = read::single_value_file(&handles.memory_current, buffers, b"0"); + let mem_high = read::single_value_file(&handles.memory_high, buffers, b"max"); + let mem_max = read::single_value_file(&handles.memory_max,
buffers, b"max"); + let mem_stat = read::flat_keyed_file( + &handles.memory_stat, + buffers, + &MEMORY_STAT_KEYS, + &MEMORY_STAT_DEFAULTS, + ); + if mem_current == Err(read::Empty) + && mem_high == Err(read::Empty) + && mem_max == Err(read::Empty) + && mem_stat == Err(read::Empty) + { + Err(read::Empty) + } else { + Ok(()) + } +} + +/// Keys to read and get totals for from the io.stat file +const IO_STAT_KEYS: [&[u8]; 6] = [b"rbytes", b"wbytes", b"rios", b"wios", b"dbytes", b"dios"]; + +/// Collects all stats for the io controller +/// see <https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html> +#[inline] +fn collect_io(buffers: &mut WorkingBuffers, handles: &ProcFileHandles) -> Result<(), read::Empty> { + read::io_stat_file(&handles.io_stat, buffers, &IO_STAT_KEYS) +} diff --git a/src/collection/collectors/cgroup_v2/read.rs b/src/collection/collectors/cgroup_v2/read.rs new file mode 100644 index 0000000..708eae1 --- /dev/null +++ b/src/collection/collectors/cgroup_v2/read.rs @@ -0,0 +1,233 @@ +use crate::collection::buffers::WorkingBuffers; +use crate::util::{self, BufferLike, ByteLines, LazyQuantity}; +use std::fs::File; +use std::io::{Read, Seek, SeekFrom}; + +#[derive(Copy, Clone, PartialEq)] +pub struct Empty; + +/// Tries to read the given file handle, +/// and directly write the contents as a field to the next record. +/// If the written field was empty, returns Err(Empty). +pub fn single_value_file( + file: &Option<File>, + buffers: &mut WorkingBuffers, + default: &'static [u8], +) -> Result<(), Empty> { + let content = match read_to_buffer(file, buffers) { + None => &[], + Some(_) => buffers.buffer.trim(), + }; + + let is_empty = util::content_len_raw(content) == 0; + if is_empty { + buffers.record.push_field(default); + } else { + buffers.record.push_field(content); + } + + buffers.buffer.clear(); + + if is_empty { + Err(Empty) + } else { + Ok(()) + } +} + +/// Attempts to read the given file into the buffer, if it exists. +/// If successful, returns Some with the length of the part of the file read. +/// If the file handle wasn't given, or reading was unsuccessful, returns None. +fn read_to_buffer(file: &Option<File>, buffers: &mut WorkingBuffers) -> Option<usize> { + match file { + None => None, + Some(f) => { + let mut file_mut = f; + let result = match file_mut.read(&mut buffers.buffer.b) { + Err(_) => None, + Ok(len) => { + buffers.buffer.len = len; + if len == 0 { + None + } else { + Some(len) + } + }, + }; + // Ignore errors: if seeking fails, then the effect next time will be pushing + // empty buffers to the CSV rows, which lets the other monitoring + // continue + let _result = file_mut.seek(SeekFrom::Start(0)); + result + }, + } +} + +/// Tries to read the given file handle, +/// attempting to find the given keys in the file's contents. +/// The keys' values are written to the row buffer +/// in the same order as the keys slice, +/// and if a value does not exist, the cell is empty. +/// If all of the written values were empty, +/// then Err(Empty) is returned. +pub fn flat_keyed_file<const K: usize>( + file: &Option<File>, + buffers: &mut WorkingBuffers, + keys: &[&'static [u8]; K], + defaults: &[&'static [u8]; K], +) -> Result<(), Empty> { + // Ignore errors: the buffer will just remain empty + // and all of the below processing will result in empty fields. + // It is important to always write K fields, + // so we don't return early. + let _result = read_to_buffer(file, buffers); + + // Create K slices, + // each pointing to a location in the buffer + // where the statistic was found.
+ // After scanning each line, + // the slices will be consumed to add records. + let mut slices: [&[u8]; K] = [&[]; K]; + + let lines = ByteLines::new(&buffers.buffer.b); + for (line, _) in lines { + // Split the buffer by the space in the middle + // to obtain the key and value: + if let Some(space) = util::find_char(line, 0, util::is_space) { + let (key, value) = (&line[..space], &line[(space + 1)..]); + for (i, &target_key) in keys.iter().enumerate() { + if target_key == key { + slices[i] = value; + break; + } + } + } + } + + // Consume each of the slices + let mut all_empty = true; + for (i, slice) in slices.iter().enumerate() { + all_empty = all_empty && slice.is_empty(); + if slice.is_empty() { + buffers.record.push_field(defaults[i]); + } else { + buffers.record.push_field(slice); + } + } + + buffers.buffer.clear(); + + if all_empty { + Err(Empty) + } else { + Ok(()) + } +} + +/// Tries to read the given file handle, +/// attempting to read it in as an IO stats file. +/// In this mode, it will search for the given keys +/// similar to `read::flat_keyed_file`, +/// but it will also sum multiple values for the same key +/// that occur in the IO stats file-specific format, +/// where each line gives stats for a single device. +/// In this way, the written values of this function +/// give the total IO stats over all devices. +/// If all of the written values were 0, +/// then Err(Empty) is returned. +pub fn io_stat_file<const K: usize>( + file: &Option<File>, + buffers: &mut WorkingBuffers, + keys: &[&'static [u8]; K], +) -> Result<(), Empty> { + // Ignore errors: the buffer will just remain empty + // and all of the below processing will result in empty fields. + // It is important to always write K fields, + // so we don't return early. + let _result = read_to_buffer(file, buffers); + + // Create K lazy quantities, + // where each corresponds to the nth key. + // As we scan each line in the stat file, + // we look for values for each key, + // and when found, add the value to the lazy quantity. + // This prevents us from parsing the bytes as an integer + // unless we need to add two values together.
+ let mut quantities = [LazyQuantity::<'_, u64>::default(); K]; + + let lines = ByteLines::new(&buffers.buffer.b); + for (line, _) in lines { + let fields = IoLineFieldIter::new(line); + for (key, value) in fields { + for (i, &target_key) in keys.iter().enumerate() { + if target_key == key { + quantities[i] = quantities[i].plus(value); + break; + } + } + } + } + + // Consume each of the quantities + let mut all_zero = true; + for qty in quantities { + all_zero = all_zero && qty.is_zero(); + qty.write_to_record(&mut buffers.copy_buffer, &mut buffers.record); + } + + buffers.buffer.clear(); + buffers.copy_buffer.clear(); + + if all_zero { + Err(Empty) + } else { + Ok(()) + } +} + +pub struct IoLineFieldIter<'a> { + remainder: &'a [u8], +} + +impl<'a> IoLineFieldIter<'a> { + fn new(line: &'a [u8]) -> Self { + // The first space separates the device from the fields + match util::find_char(line, 0, util::is_space) { + Some(space) => { + let (_dev, fields) = (&line[..space], &line[(space + 1)..]); + Self { remainder: fields } + }, + // Return an empty iterator + None => Self { + remainder: &line[0..0], + }, + } + } +} + +impl<'a> Iterator for IoLineFieldIter<'a> { + type Item = (&'a [u8], &'a [u8]); + fn next(&mut self) -> Option<Self::Item> { + loop { + if self.remainder.is_empty() { + return None; + } + + match util::find_char(self.remainder, 0, util::is_space) { + None => return None, + Some(space) => { + let (pair, rest) = (&self.remainder[..space], &self.remainder[(space + 1)..]); + self.remainder = rest; + + // Try to parse the key=value pair + if let Some(equals) = util::find_char(pair, 0, |c| c == b'=') { + let (key, value) = (&pair[..equals], &pair[(equals + 1)..]); + return Some((key, value)); + } + + // If the current pair was not valid, continue to the next pair + }, + } + } + } +} diff --git a/src/collection/collectors/mod.rs b/src/collection/collectors/mod.rs index 34ed911..483e419 100644 --- a/src/collection/collectors/mod.rs +++ b/src/collection/collectors/mod.rs @@ -1,5 +1,6 @@ mod all; mod cgroup_v1; +mod cgroup_v2; use crate::cli; use crate::collection::buffers::WorkingBuffers; diff --git a/src/polling/providers/docker.rs b/src/polling/providers/docker.rs index 9b0ac22..2689875 100644 --- a/src/polling/providers/docker.rs +++ b/src/polling/providers/docker.rs @@ -2,11 +2,12 @@ use crate::cli::RunCommand; use crate::polling::providers::{InitializationError, Provider}; use crate::shared::{CollectionEvent, CollectionMethod, CollectionTarget}; use crate::shell::Shell; -use crate::util::{self, CgroupManager, CgroupPath, ItemPool}; +use crate::util::{self, CgroupManager, CgroupPath, CgroupSlices, GetCgroupError, ItemPool}; use anyhow::Error; use shiplift::builder::ContainerListOptions; use shiplift::rep::Container; use std::collections::BTreeMap; +use std::path::PathBuf; use std::sync::Arc; use tokio::runtime::Runtime; @@ -50,7 +51,8 @@ impl From for InitializationError { #[derive(Debug)] enum StartCollectionError { MetadataSerializationError(Error), - CgroupNotFound, + CgroupNotFound(PathBuf), + CgroupVersionDetectionFailed, } impl Provider for Docker { @@ -112,11 +114,11 @@ impl Provider for Docker { Err(error) => { let container_display = display(container); match error { - StartCollectionError::CgroupNotFound => { + StartCollectionError::CgroupNotFound(path) => { self.shell().warn(format!( "Could not start collection for container {}: cgroup path \ - could not be constructed or does not exist", - container_display + '{:?}' does not exist on system", + container_display, path, )); },
StartCollectionError::MetadataSerializationError(cause) => { @@ -126,6 +128,14 @@ container_display, cause )); }, + StartCollectionError::CgroupVersionDetectionFailed => { + self.shell().warn(format!( + "Could not start collection for container {}: failed to \ + detect the currently running cgroup version (are cgroups \ + mounted in /sys/fs/cgroup?)", + container_display + )); + }, } // Ignore container and continue initializing the rest @@ -231,27 +241,35 @@ ) -> Result<CollectionMethod, StartCollectionError> { // Only one type of CollectionMethod currently match self.get_cgroup(container) { - Some(cgroup) => Ok(CollectionMethod::LinuxCgroupV1(cgroup)), - None => Err(StartCollectionError::CgroupNotFound), + Ok(cgroup) => match cgroup.version { + util::CgroupVersion::V1 => Ok(CollectionMethod::LinuxCgroupV1(cgroup)), + util::CgroupVersion::V2 => Ok(CollectionMethod::LinuxCgroupV2(cgroup)), + }, + Err(GetCgroupError::VersionDetectionFailed) => { + Err(StartCollectionError::CgroupVersionDetectionFailed) + }, + Err(GetCgroupError::NotFound(path)) => Err(StartCollectionError::CgroupNotFound(path)), + Err(GetCgroupError::CgroupV1NotEnabled) => unreachable!(), } } /// Gets the cgroup path for the given container, printing out a /// message upon the first successful cgroup resolution - fn get_cgroup(&mut self, c: &Container) -> Option<CgroupPath> { - // Determine if the manager had a resolved group beforehand + fn get_cgroup(&mut self, c: &Container) -> Result<CgroupPath, GetCgroupError> { + // Determine if the manager had a resolved version or driver beforehand let had_driver = self.cgroup_manager.driver().is_some(); + let had_version = self.cgroup_manager.version().is_some(); // Container cgroups are under the dockerd parent, and are in leaf - // cgroups by (full) container ID. Cgroup path depends on the driver used: - // according to https://docs.docker.com/engine/reference/commandline/dockerd/#default-cgroup-parent , + // cgroups by (full) container ID. + // Cgroup path depends on the driver used and version enabled. + // According to https://docs.docker.com/engine/reference/commandline/dockerd/#default-cgroup-parent , // "[container cgroups are mounted at] `/docker` for fs cgroup driver and // `system.slice` for systemd cgroup driver."
- let cgroup_option: Option<CgroupPath> = self.cgroup_manager.get_cgroup_divided( - &["system.slice", &format!("docker-{}.scope", &c.id)], - &["docker", &c.id], - false, - ); + let result = self.cgroup_manager.get_cgroup(CgroupSlices { + systemd: &["system.slice", &format!("docker-{}.scope", &c.id)], + cgroupfs: &["docker", &c.id], + }); if !had_driver { if let Some(driver) = self.cgroup_manager.driver() { @@ -260,7 +278,14 @@ } } - cgroup_option + if !had_version { + if let Some(version) = self.cgroup_manager.version() { + self.shell() + .info(format!("Identified {} as cgroup version", version)); + } + } + + result } /// Gets a reference to the current shell diff --git a/src/polling/providers/kubernetes.rs b/src/polling/providers/kubernetes.rs index de77cf1..ba7b8cf 100644 --- a/src/polling/providers/kubernetes.rs +++ b/src/polling/providers/kubernetes.rs @@ -2,7 +2,7 @@ use crate::cli::RunCommand; use crate::polling::providers::{InitializationError, KubernetesOptions, Provider}; use crate::shared::{CollectionEvent, CollectionMethod, CollectionTarget}; use crate::shell::Shell; -use crate::util::{self, CgroupManager, CgroupPath, ItemPool}; +use crate::util::{self, CgroupManager, CgroupPath, CgroupSlices, GetCgroupError, ItemPool}; use anyhow::Error; use gethostname::gethostname; use k8s_openapi::api::core::v1::{Node, Pod}; @@ -121,9 +121,11 @@ impl QualityOfService { #[derive(Debug)] enum StartCollectionError { MetadataSerializationError(Error), - CgroupNotFound, MissingPodUid, FailedQosParse, + CgroupNotFound(PathBuf), + CgroupVersionDetectionFailed, + CgroupV1NotEnabled, } impl StartCollectionError { @@ -135,10 +137,10 @@ .or_else(|| pod.meta().uid.as_deref()) .unwrap_or(NONE_STR); match self { - Self::CgroupNotFound => format!( - "Could not start collection for pod {}: cgroup path could not be constructed or \ - does not exist", - pod_display + Self::CgroupNotFound(path) => format!( - "Could not start collection for pod {}: cgroup path '{:?}' does not exist on \ + system", + pod_display, path, ), Self::MetadataSerializationError(cause) => format!( "Could not start collection for pod {}: failed to serialize pod metadata: {}", @@ -158,6 +160,16 @@ .and_then(|s| s.qos_class.as_deref()) .unwrap_or(NONE_STR) ), + Self::CgroupVersionDetectionFailed => format!( + "Could not start collection for pod {}: failed to detect the currently running \ + cgroup version (are cgroups mounted in /sys/fs/cgroup?)", + pod_display + ), + Self::CgroupV1NotEnabled => format!( + "Could not start collection for pod {}: cgroup v1 isn't enabled (the Kubernetes \ + collector does not support cgroup v2)", + pod_display + ), } } } @@ -394,21 +406,39 @@ // Construct the cgroup path from the UID and QoS class // from the metadata, and make sure it exists/is mounted match self.get_cgroup(uid, qos_class) { - Some(cgroup) => Ok(CollectionMethod::LinuxCgroupV1(cgroup)), - None => Err(StartCollectionError::CgroupNotFound), + Ok(cgroup) => match cgroup.version { + util::CgroupVersion::V1 => Ok(CollectionMethod::LinuxCgroupV1(cgroup)), + util::CgroupVersion::V2 => Ok(CollectionMethod::LinuxCgroupV2(cgroup)), + }, + Err(GetCgroupError::VersionDetectionFailed) => { + Err(StartCollectionError::CgroupVersionDetectionFailed) + }, + Err(GetCgroupError::NotFound(path)) => Err(StartCollectionError::CgroupNotFound(path)), + Err(GetCgroupError::CgroupV1NotEnabled) => { + Err(StartCollectionError::CgroupV1NotEnabled) + }, } } /// Gets the cgroup path for the given
UID and quality of service class, /// printing out a message upon the first successful cgroup resolution - fn get_cgroup(&mut self, uid: &str, qos_class: QualityOfService) -> Option<CgroupPath> { + fn get_cgroup( + &mut self, + uid: &str, + qos_class: QualityOfService, + ) -> Result<CgroupPath, GetCgroupError> { let pod_slice = String::from("pod") + uid; - // Determine if the manager had a resolved group beforehand + // Determine if the manager had a resolved version or driver beforehand let had_driver = self.cgroup_manager.driver().is_some(); + let had_version = self.cgroup_manager.version().is_some(); - let cgroup_option: Option<CgroupPath> = self - .cgroup_manager - .get_cgroup(&[ROOT_CGROUP, qos_class.into(), &pod_slice], true); + let base_cgroup_slices = &[ROOT_CGROUP, qos_class.into(), &pod_slice]; + // Only support cgroup v1 for Kubernetes pods + // (cgroup v2 is untested) + let result = self.cgroup_manager.get_cgroup_v1(CgroupSlices { + cgroupfs: base_cgroup_slices, + systemd: &util::build_systemd_cgroup_hierarchy(base_cgroup_slices), + }); if !had_driver { if let Some(driver) = self.cgroup_manager.driver() { @@ -417,7 +447,14 @@ } } - cgroup_option + if !had_version { + if let Some(version) = self.cgroup_manager.version() { + self.shell() + .info(format!("Identified {} as cgroup version", version)); + } + } + + result } /// Gets the current node's hostname diff --git a/src/shared.rs b/src/shared.rs index c8a7955..783ae19 100644 --- a/src/shared.rs +++ b/src/shared.rs @@ -31,6 +31,7 @@ pub enum CollectionEvent { #[derive(Clone, Debug, PartialEq, Serialize)] pub enum CollectionMethod { LinuxCgroupV1(CgroupPath), + LinuxCgroupV2(CgroupPath), } /// Single container/pod/process/other entity that represents a single target diff --git a/src/util/cgroup.rs b/src/util/cgroup.rs index 5c059c7..cfb603e 100644 --- a/src/util/cgroup.rs +++ b/src/util/cgroup.rs @@ -1,10 +1,9 @@ use serde::Serialize; use std::fmt; -use std::fs; use std::path::{Path, PathBuf}; -/// Docker cgroup driver used to orchestrate moving containers in and out of -/// cgroups +/// Docker cgroup driver used to orchestrate +/// moving containers in and out of cgroups #[derive(Clone, Copy, Debug, PartialEq, Serialize)] #[serde(rename_all = "lowercase")] pub enum CgroupDriver { @@ -21,109 +20,209 @@ impl fmt::Display for CgroupDriver { } } +/// Linux cgroup version +#[derive(Clone, Copy, Debug, PartialEq, Serialize)] +#[serde(rename_all = "lowercase")] +pub enum CgroupVersion { + V1, + V2, +} + +impl fmt::Display for CgroupVersion { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::V1 => write!(f, "v1"), + Self::V2 => write!(f, "v2"), + } + } +} + +pub const CGROUP_V2_CHECK_PATH: &str = "/sys/fs/cgroup/cgroup.controllers"; + +impl CgroupVersion { + fn try_resolve() -> Option<Self> { + if Path::new(CGROUP_V2_CHECK_PATH).exists() { + return Some(Self::V2); + } + + if cgroup_exists(None::<&Path>, Self::V1) { + return Some(Self::V1); + } + + None + } +} + +pub struct CgroupSlices<'c, 's, C, S> +where + C: AsRef<Path>, + S: AsRef<Path>, +{ + pub cgroupfs: &'c [C], + pub systemd: &'s [S], +} + +impl<C, S> CgroupSlices<'_, '_, C, S> +where + C: AsRef<Path>, + S: AsRef<Path>, +{ + #[must_use] + fn pick_and_join(&self, driver: CgroupDriver) -> PathBuf { + match driver { + CgroupDriver::Cgroupfs => join_slices(self.cgroupfs), + CgroupDriver::Systemd => join_slices(self.systemd), + } + } +} + +#[must_use] +fn join_slices(s: &[impl AsRef<Path>]) -> PathBuf { + let mut path_buf = PathBuf::new(); + for s in s { + path_buf.push(s.as_ref()); + } + path_buf +} + /// Encapsulated behavior for
lazy-resolution of Docker cgroup driver (systemd -/// or cgroupfs). Works for cgroups v1 +/// or cgroupfs). Works for cgroup v1 and v2 pub struct CgroupManager { - driver: Option<CgroupDriver>, + driver: Option<CgroupDriver>, + version: Option<CgroupVersion>, } /// Resolved and existing cgroup path constructed from the construction methods /// on `CgroupManager` #[derive(Clone, Debug, PartialEq, Serialize)] pub struct CgroupPath { - pub path: PathBuf, - pub driver: CgroupDriver, + pub path: PathBuf, + pub driver: CgroupDriver, + pub version: CgroupVersion, } impl Default for CgroupManager { fn default() -> Self { Self::new() } } +pub enum GetCgroupError { + CgroupV1NotEnabled, + VersionDetectionFailed, + NotFound(PathBuf), +} + impl CgroupManager { - /// Creates a new cgroup manager with an unknown driver type + /// Creates a new cgroup manager with an unknown driver type and version #[must_use] - pub const fn new() -> Self { Self { driver: None } } - - /// Joins together the given slices to make a target cgroup, performing - /// formatting conversions as necessary to target the current cgroup - /// driver. If no driver is currently set, then tries to detect the - /// current driver by seeing if the resultant formatted cgroup path from - /// any of the drivers currently exists in the cgroup filesystem. This - /// existence check is also performed if the current driver is known; if the - /// cgroup was constructed and exists, returns Some(constructed path), else - /// None. - /// - /// To use the direct slices without transforming their formatting in any - /// way, set construct to false - pub fn get_cgroup(&mut self, slices: &[&str], construct: bool) -> Option<CgroupPath> { - match self.driver { - Some(driver) => { - let path = make(driver, slices, construct); - match cgroup_exists(&path) { - true => Some(CgroupPath { path, driver }), - false => None, - } - }, - None => self - .try_resolve(CgroupDriver::Systemd, slices, construct) - .or_else(|| self.try_resolve(CgroupDriver::Cgroupfs, slices, construct)), + pub const fn new() -> Self { + Self { + driver: None, + version: None, } } - /// Joins together the given slices to make a target cgroup, performing - /// formatting conversions as necessary to target the current cgroup - /// driver. If no driver is currently set, then tries to detect the - /// current driver by seeing if the resultant formatted cgroup path from - /// any of the drivers currently exists in the cgroup filesystem. This - /// existence check is also performed if the current driver is known; if the - /// cgroup was constructed and exists, returns `Some(path)`, else - /// `None` + /// Joins together the given slices to make a target cgroup, + /// selecting the appropriate list of slices depending on the driver. + /// Ensures that the cgroup path exists before returning it. + /// If either the driver or version hasn't been detected yet, + /// then this function also tries to detect them. /// - /// Differs from `get_cgroup` in that it allows for different slices to be - /// specified for each driver - /// - /// To use the direct slices without transforming their formatting in any - /// way, set construct to false - pub fn get_cgroup_divided( + /// Only works if cgroups are enabled, + /// and mounted in the filesystem at `STANDARD_CGROUP_MOUNT_ROOT`; + /// otherwise returns `Err`.
#[allow(clippy::needless_pass_by_value)] + pub fn get_cgroup<C, S>( &mut self, - systemd_slices: &[&str], - cgroupfs_slices: &[&str], - construct: bool, - ) -> Option<CgroupPath> { + slices: CgroupSlices<'_, '_, C, S>, + ) -> Result<CgroupPath, GetCgroupError> + where + C: AsRef<Path>, + S: AsRef<Path>, + { + let version = self + .get_version_or_resolve() + .ok_or(GetCgroupError::VersionDetectionFailed)?; + match self.driver { Some(driver) => { - let path = match driver { - CgroupDriver::Systemd { .. } => make(driver, systemd_slices, construct), - CgroupDriver::Cgroupfs => make(driver, cgroupfs_slices, construct), - }; - match cgroup_exists(&path) { - true => Some(CgroupPath { path, driver }), - false => None, + // Pick the appropriate list of slices for the driver, + // and join them together to make the path. + let path: PathBuf = slices.pick_and_join(driver); + + // Make sure the cgroup exists before returning it + match cgroup_exists(Some(&path), version) { + true => Ok(CgroupPath { + path, + driver, + version, + }), + false => Err(GetCgroupError::NotFound(path)), } }, - None => self - .try_resolve(CgroupDriver::Systemd, systemd_slices, construct) - .or_else(|| self.try_resolve(CgroupDriver::Cgroupfs, cgroupfs_slices, construct)), + None => { + // Try to see if the systemd cgroup exists + let systemd_cgroup = join_slices(slices.systemd); + if cgroup_exists(Some(&systemd_cgroup), version) { + self.driver = Some(CgroupDriver::Systemd); + return Ok(CgroupPath { + path: systemd_cgroup, + driver: CgroupDriver::Systemd, + version, + }); + } + + // Otherwise, try to see if the cgroupfs cgroup exists + let cgroupfs_cgroup = join_slices(slices.cgroupfs); + if cgroup_exists(Some(&cgroupfs_cgroup), version) { + self.driver = Some(CgroupDriver::Cgroupfs); + return Ok(CgroupPath { + path: cgroupfs_cgroup, + driver: CgroupDriver::Cgroupfs, + version, + }); + } + + Err(GetCgroupError::NotFound(systemd_cgroup)) + }, } } - /// Attempts to resolve the cgroup driver, by making the cgroup path for the - /// given driver and then testing whether it exists + /// Joins together the given slices to make a target cgroup, + /// selecting the appropriate list of slices depending on the driver. + /// Ensures that the cgroup path exists before returning it. + /// If either the driver or version hasn't been detected yet, + /// then this function also tries to detect them. /// - /// To use the direct slices without transforming their formatting in any - /// way, set construct to false - fn try_resolve( + /// Only works if cgroup v1 is enabled, + /// and mounted in the filesystem at `STANDARD_CGROUP_MOUNT_ROOT`; + /// otherwise returns `Err`.
+ pub fn get_cgroup_v1<C, S>( &mut self, - driver: CgroupDriver, - slices: &[&str], - construct: bool, - ) -> Option<CgroupPath> { - let path = make(driver, slices, construct); - match cgroup_exists(&path) { - false => None, - true => { - self.driver = Some(driver); - Some(CgroupPath { path, driver }) + slices: CgroupSlices<'_, '_, C, S>, + ) -> Result<CgroupPath, GetCgroupError> + where + C: AsRef<Path>, + S: AsRef<Path>, + { + let version = self + .get_version_or_resolve() + .ok_or(GetCgroupError::VersionDetectionFailed)?; + + if version != CgroupVersion::V1 { + return Err(GetCgroupError::CgroupV1NotEnabled); + } + + self.get_cgroup(slices) + } + + fn get_version_or_resolve(&mut self) -> Option<CgroupVersion> { + match self.version { + Some(version) => Some(version), + None => { + // Attempt to detect the current version + let version = CgroupVersion::try_resolve()?; + self.version = Some(version); + Some(version) }, } } @@ -131,102 +230,116 @@ impl CgroupManager { /// Gets the current resolved driver for the manager #[must_use] pub const fn driver(&self) -> Option<CgroupDriver> { self.driver } -} -/// Constructs a cgroup absolute path according to the style expected by the -/// given driver -/// -/// To use the direct slices without transforming their formatting in any -/// way, set construct to false -#[must_use] -pub fn make(driver: CgroupDriver, slices: &[&str], construct: bool) -> PathBuf { - match driver { - CgroupDriver::Cgroupfs => make_cgroupfs(slices), - CgroupDriver::Systemd => make_systemd(slices, construct), - } + /// Gets the current resolved cgroup version for the manager + #[must_use] + pub const fn version(&self) -> Option<CgroupVersion> { self.version } } -const SYSTEMD_SLICE_SUFFIX: &str = ".slice"; - -/// Converts a vec of slice names such as vec!["kubepods", "burstable", -/// "pod1234-5678"] into a systemd-style cgroup path such as "/kubepods.slice/ -/// kubepods-burstable.slice/kubepods-burstable-pod1234_5678.slice" +/// Converts a vec of slice names such as: +/// ```rs +/// vec!["kubepods", "burstable", "pod1234-5678"] +/// ``` +/// into a systemd-style list of slice names such as: +/// ```rs +/// vec![ +/// "kubepods.slice", +/// "kubepods-burstable.slice", +/// "kubepods-burstable-pod1234_5678.slice", +/// ] +/// ``` /// see [`kubernetes/kubelet/cm/cgroup_manager_linux.go:ToSystemd()`](https://github.com/kubernetes/kubernetes/blob/bb5ed1b79709c865d9aa86008048f19331530041/pkg/kubelet/cm/cgroup_manager_linux.go#L87-L103) -/// -/// To use the direct slices without transforming their formatting in any -/// way, set construct to false -#[must_use] -fn make_systemd(slices: &[&str], construct: bool) -> PathBuf { - if slices.is_empty() || slices.len() == 1 && slices[0].is_empty() { - return PathBuf::from(""); +pub fn build_systemd_cgroup_hierarchy(slices: &[impl AsRef<str>]) -> Vec<String> { + if slices.is_empty() || slices.len() == 1 && slices[0].as_ref().is_empty() { + return vec![]; } - match construct { - false => slices.iter().collect::<PathBuf>(), - true => { - // First, escape systemd slices - let escaped = slices.iter().map(|&s| escape_systemd(s)); - - // Aggregate each slice with all previous to build final path - let mut path: PathBuf = PathBuf::new(); - // Previously accumulated slices like "kubepods-burstable-" - let mut accumulator: String = String::new(); - // Re-usable working buffer - let mut working: String = String::new(); - for slice in escaped { - // Add the current slice to the path - working += &accumulator; - working += &slice; - working += SYSTEMD_SLICE_SUFFIX; - path.push(&working); - working.clear(); - - // Add the current slice to the accumulator - accumulator +=
&slice; - accumulator += "-"; - } + // Aggregate each slice with all previous to build the hierarchy: + // Previously accumulated slices like "kubepods-burstable-" + let mut accumulator: String = String::new(); + let mut hierarchy: Vec<String> = Vec::with_capacity(slices.len()); + for base_slice in slices { + // Escape each slice before processing it + let base_slice = base_slice.as_ref(); + let escaped_slice = escape_systemd(base_slice); - path - }, + // Add the current slice to the list + hierarchy.push(format!("{}{}.slice", &accumulator, &escaped_slice)); + + // Add the current slice to the accumulator + accumulator += &escaped_slice; + accumulator += "-"; } + + hierarchy } /// Escapes a cgroup slice to be in the style of Systemd cgroups /// see [`kubernetes/kubelet/cm/cgroup_manager_linux.go:escapeSystemdCgroupName()`](https://github.com/kubernetes/kubernetes/blob/bb5ed1b79709c865d9aa86008048f19331530041/pkg/kubelet/cm/cgroup_manager_linux.go#L74-L76) #[must_use] -pub fn escape_systemd(slice: &str) -> String { slice.replace("-", "_") } - -/// Converts a vec of slice names such as vec!["kubepods", "burstable", -/// "pod1234-5678"] into a systemd-style cgroup path such as -/// `/kubepods/burstable/pod1234_5678` see [`kubernetes/kubelet/cm/cgroup_manager_linux.go:ToCgroupfs()`](https://github.com/kubernetes/kubernetes/blob/bb5ed1b79709c865d9aa86008048f19331530041/pkg/kubelet/cm/cgroup_manager_linux.go#L116-L118) -#[must_use] -fn make_cgroupfs(slices: &[&str]) -> PathBuf { slices.iter().collect() } +fn escape_systemd(slice: &str) -> String { slice.replace("-", "_") } pub const INVALID_CGROUP_MOUNT_MESSAGE: &str = - "rAdvisor expects cgroups to be mounted in /sys/fs/cgroup. If this is\nthe case, make sure \ - that the 'cpuacct' resource controller has not been disabled."; + "rAdvisor expects cgroups to be enabled and mounted in /sys/fs/cgroup."; -/// Checks if cgroups are mounted in /sys/fs/cgroup and if the cpuacct subsystem -/// is enabled (necessary for proper driver detection) +/// Checks if cgroups are mounted in /sys/fs/cgroup +/// (for both cgroup v1 and v2) #[must_use] -pub fn cgroups_mounted_properly() -> bool { - // Use the raw subsystem directory to see if the expected cgroup hierarchy - // exists - cgroup_exists("") -} +pub fn cgroups_mounted_properly() -> bool { Path::new(STANDARD_CGROUP_MOUNT_ROOT).exists() } + +// From https://man7.org/linux/man-pages/man7/cgroups.7.html +pub const STANDARD_CGROUP_MOUNT_ROOT: &str = "/sys/fs/cgroup"; -pub const LINUX_CGROUP_ROOT: &str = "/sys/fs/cgroup"; +// From https://man7.org/linux/man-pages/man7/cgroups.7.html +pub const CGROUP_V1_SUBSYSTEMS: &[&str] = &[ + // Place the cpuacct subsystem first, + // since it is most likely to exist + // (so it will be checked first in `cgroup_v1_exists`). + "cpuacct", + "cpu", + "cpuset", + "memory", + "devices", + "freezer", + "net_cls", + "blkio", + "perf_event", + "net_prio", + "hugetlb", + "pids", + "rdma", +]; -/// Determines whether the given (absolute) cgroup exists in the virtual -/// filesystem **Note**: fails if cgroups aren't mounted in /sys/fs/cgroup or if -/// the cpuacct subsystem isn't enabled. +/// Determines whether the given (absolute) cgroup +/// exists in the virtual filesystem at the standard mount point.
#[must_use] -fn cgroup_exists<C: AsRef<Path>>(path: C) -> bool { - let mut full_path: PathBuf = [LINUX_CGROUP_ROOT, "cpuacct"].iter().collect(); - full_path.push(path); - match fs::metadata(full_path) { - Err(_) => false, - // As long as it exists and is a directory, assume all is good - Ok(metadata) => metadata.is_dir(), +fn cgroup_exists<C: AsRef<Path>>(path: Option<C>, version: CgroupVersion) -> bool { + match version { + CgroupVersion::V1 => { + // See if any of the cgroup v1 subsystems are mounted + for subsystem in CGROUP_V1_SUBSYSTEMS { + let mut full_path = PathBuf::new(); + full_path.push(STANDARD_CGROUP_MOUNT_ROOT); + full_path.push(subsystem); + if let Some(p) = &path { + full_path.push(p.as_ref()); + } + + if full_path.exists() { + return true; + } + } + }, + CgroupVersion::V2 => { + let mut full_path = PathBuf::new(); + full_path.push(STANDARD_CGROUP_MOUNT_ROOT); + if let Some(p) = path { + full_path.push(p.as_ref()); + } + + return full_path.exists(); + }, } + + false } diff --git a/src/util/lazy_quantity.rs b/src/util/lazy_quantity.rs index 26a3a22..be78cc8 100644 --- a/src/util/lazy_quantity.rs +++ b/src/util/lazy_quantity.rs @@ -62,6 +62,9 @@ impl<'a, T: FromRadix10Checked + SaturatingAdd + Integer + Copy> LazyQuantity<'a record.push_field(working.content()); working.clear(); } + + /// Returns whether the lazy quantity is still zero (i.e. no values have been added to it) + pub fn is_zero(&self) -> bool { matches!(self, Self::Zero) } } impl<'a, T: FromRadix10Checked + SaturatingAdd + Integer + Copy> Default for LazyQuantity<'a, T> {