From 457545be410593dfdc1d03754b7082db5c5cb775 Mon Sep 17 00:00:00 2001 From: Antvirf Date: Mon, 18 Nov 2024 11:24:51 +0000 Subject: [PATCH 1/3] feat: reaping by `POD_STATUS_PHASES` --- README.md | 16 +++++++- rules/pod_status_phase.go | 36 ++++++++++++++++++ rules/pod_status_phase_test.go | 69 ++++++++++++++++++++++++++++++++++ rules/rules.go | 1 + 4 files changed, 121 insertions(+), 1 deletion(-) create mode 100644 rules/pod_status_phase.go create mode 100644 rules/pod_status_phase_test.go diff --git a/README.md b/README.md index c8cf03b..cac9b98 100644 --- a/README.md +++ b/README.md @@ -210,7 +210,7 @@ Note that this will not catch statuses that are describing the entire pod like t Flags a pod for reaping based on the pod status. -Enabled and configured by setting the environment variable `POD_STATUSES` with a coma separated list (no whitespace) of statuses. If the pod status in the specified list of status, the pod will be flagged for reaping. +Enabled and configured by setting the environment variable `POD_STATUSES` with a comma separated list (no whitespace) of statuses. If the pod status in the specified list of status, the pod will be flagged for reaping. Example: @@ -221,6 +221,20 @@ POD_STATUSES=Evicted,Unknown ``` Note that pod status is different than container statuses as it checks the status of the overall pod rather than the status of containers in the pod. The most obvious use case of this if dealing with `Evicted` pods. +### `POD_PHASE_STATUSES` + +Flags a pod for reaping based on it's `pod.Status.Phase` - i.e. `Pending`, `Running`, `Succeeded`, `Failed`, `Unknown`. See [Kubernetes Go client docs on this type here.](https://pkg.go.dev/k8s.io/api/core/v1#PodPhase) + +Enabled and configured by setting the environment variable `POD_PHASE_STATUSES` with a comma-separated list (no whitespaces) of pod status phases. If a pod is in the status phase specified in the list, it will be flagged for reaping. + +Example: + +```sh +# every 10 minutes, kill all pods with status Pending or Failed +SCHEDULE=@every 10m +POD_STATUSES=Pending,Failed +``` + ### `MAX_DURATION` Flags a pod for reaping based on the pods current run duration. diff --git a/rules/pod_status_phase.go b/rules/pod_status_phase.go new file mode 100644 index 0000000..f1799b6 --- /dev/null +++ b/rules/pod_status_phase.go @@ -0,0 +1,36 @@ +package rules + +import ( + "fmt" + "os" + "strings" + + v1 "k8s.io/api/core/v1" +) + +const envPodStatusPhase = "POD_STATUS_PHASES" + +var _ Rule = (*podStatusPhase)(nil) + +type podStatusPhase struct { + reapStatusPhases []string +} + +func (rule *podStatusPhase) load() (bool, string, error) { + value, active := os.LookupEnv(envPodStatusPhase) + if !active { + return false, "", nil + } + rule.reapStatusPhases = strings.Split(value, ",") + return true, fmt.Sprintf("pod status phase in [%s]", value), nil +} + +func (rule *podStatusPhase) ShouldReap(pod v1.Pod) (bool, string) { + status := string(pod.Status.Phase) + for _, reapStatusPhase := range rule.reapStatusPhases { + if status == reapStatusPhase { + return true, fmt.Sprintf("has pod status phase %s", reapStatusPhase) + } + } + return false, "" +} diff --git a/rules/pod_status_phase_test.go b/rules/pod_status_phase_test.go new file mode 100644 index 0000000..6b45079 --- /dev/null +++ b/rules/pod_status_phase_test.go @@ -0,0 +1,69 @@ +package rules + +import ( + "os" + "testing" + + "github.com/stretchr/testify/assert" + v1 "k8s.io/api/core/v1" +) + +func testPodFromPhase(phase v1.PodPhase) v1.Pod { + return v1.Pod{ + Status: v1.PodStatus{ + Phase: phase, + }, + } +} + +func TestPodPhaseLoad(t *testing.T) { + t.Run("load", func(t *testing.T) { + os.Clearenv() + os.Setenv(envPodStatusPhase, "test-phase") + loaded, message, err := (&podStatusPhase{}).load() + assert.NoError(t, err) + assert.Equal(t, "pod status phase in [test-phase]", message) + assert.True(t, loaded) + }) + t.Run("load multiple-statuses", func(t *testing.T) { + os.Clearenv() + os.Setenv(envPodStatusPhase, "test-phase,another-phase") + podStatusPhase := podStatusPhase{} + loaded, message, err := podStatusPhase.load() + assert.NoError(t, err) + assert.Equal(t, "pod status phase in [test-phase,another-phase]", message) + assert.True(t, loaded) + assert.Equal(t, 2, len(podStatusPhase.reapStatusPhases)) + assert.Equal(t, "test-phase", podStatusPhase.reapStatusPhases[0]) + assert.Equal(t, "another-phase", podStatusPhase.reapStatusPhases[1]) + }) + t.Run("no load", func(t *testing.T) { + os.Clearenv() + loaded, message, err := (&podStatusPhase{}).load() + assert.NoError(t, err) + assert.Equal(t, "", message) + assert.False(t, loaded) + }) +} + +func TestPodStatusPhaseShouldReap(t *testing.T) { + t.Run("reap", func(t *testing.T) { + os.Clearenv() + os.Setenv(envPodStatusPhase, "test-phase,another-phase") + podStatusPhase := podStatusPhase{} + podStatusPhase.load() + pod := testPodFromPhase("another-phase") + shouldReap, reason := podStatusPhase.ShouldReap(pod) + assert.True(t, shouldReap) + assert.Regexp(t, ".*another-phase.*", reason) + }) + t.Run("no reap", func(t *testing.T) { + os.Clearenv() + os.Setenv(envPodStatusPhase, "test-phase,another-phase") + podStatusPhase := podStatusPhase{} + podStatusPhase.load() + pod := testPodFromPhase("not-present") + shouldReap, _ := podStatusPhase.ShouldReap(pod) + assert.False(t, shouldReap) + }) +} diff --git a/rules/rules.go b/rules/rules.go index 1887a88..1161eea 100644 --- a/rules/rules.go +++ b/rules/rules.go @@ -33,6 +33,7 @@ func LoadRules() (Rules, error) { &duration{}, &unready{}, &podStatus{}, + &podStatusPhase{}, } // return only the active rules loadedRules := []Rule{} From 02933b3e07f393363217ff85ae8ac760d503b370 Mon Sep 17 00:00:00 2001 From: Antvirf Date: Mon, 18 Nov 2024 11:34:25 +0000 Subject: [PATCH 2/3] docs: fix example --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index cac9b98..f8d3a41 100644 --- a/README.md +++ b/README.md @@ -118,21 +118,21 @@ Acceptable values are positive integers. Negative integers will evaluate to 0 an ### `POD_SORTING_STRATEGY` -Default value: unset (which will use the pod ordering return without specification from the API server). +Default value: unset (which will use the pod ordering return without specification from the API server). Accepted values: - (unset) - use the default ordering from the API server - `random` (case-sensitive) will randomly shuffle the list of pods before killing - `oldest-first` (case-sensitive) will sort pods into oldest-first based on the pods start time. (!! warning below). - `youngest-first` (case-sensitive) will sort pods into youngest-first based on the pods start time (!! warning below) -- `pod-deletion-cost` (case-sensitive) will sort pods based on the [pod deletion cost annotation](https://kubernetes.io/docs/concepts/workloads/controllers/replicaset/#pod-deletion-cost). +- `pod-deletion-cost` (case-sensitive) will sort pods based on the [pod deletion cost annotation](https://kubernetes.io/docs/concepts/workloads/controllers/replicaset/#pod-deletion-cost). !! WARNINGS !! Pod start time is not always defined. In these cases, sorting strategies based on age put pods without start times at the -end of the list. From my experience, this usually happens during a race condition with the pod initially being scheduled, +end of the list. From my experience, this usually happens during a race condition with the pod initially being scheduled, but there may be other cases hidden away. -Using pod-reaper against the kube-system namespace can have some surprising implications. For example, during testing I +Using pod-reaper against the kube-system namespace can have some surprising implications. For example, during testing I found that the kube-schedule was owned by a master node (not a replicaset/daemon-set) and appeared to effectively ignore delete actions. The age returned from `kubectl` was reset, but the actual pod start time was unaffected. As a result of this, I found a looping scenario where the kube scheduler was effectively always the oldest pod. @@ -141,7 +141,7 @@ In examples/pod-sorting-strategy.yml I mitigated this using by excluding on the ## Logging -Pod reaper logs in JSON format using a logrus (https://github.com/sirupsen/logrus). +Pod reaper logs in JSON format using a logrus (https://github.com/sirupsen/logrus). - rule load: customer messages for each rule are logged when the pod-reaper is starting - reap cycle: a message is logged each time the reaper starts a cycle. @@ -208,7 +208,7 @@ Note that this will not catch statuses that are describing the entire pod like t ### `POD_STATUS` -Flags a pod for reaping based on the pod status. +Flags a pod for reaping based on the pod status. Enabled and configured by setting the environment variable `POD_STATUSES` with a comma separated list (no whitespace) of statuses. If the pod status in the specified list of status, the pod will be flagged for reaping. @@ -232,7 +232,7 @@ Example: ```sh # every 10 minutes, kill all pods with status Pending or Failed SCHEDULE=@every 10m -POD_STATUSES=Pending,Failed +POD_PHASE_STATUSES=Pending,Failed ``` ### `MAX_DURATION` From b2f6bab4133e6a5804801988a60594b92c4ad1b4 Mon Sep 17 00:00:00 2001 From: Antvirf Date: Tue, 19 Nov 2024 11:49:23 +0000 Subject: [PATCH 3/3] docs: update changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 00c9b9b..2bd1b8a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # pod-reaper: kills pods dead +### 2.14.0 + +- add `POD_STATUS_PHASES` to reap pods by pod.Status.Phase (Pending, Running, Succeeded, Failed, Unknown) + ### 2.13.0 - the reaper checks now also the initContainerStatus