From 14430e069763985bd4474b13b7c8c51c228ab642 Mon Sep 17 00:00:00 2001 From: fabriziopandini Date: Mon, 18 Nov 2024 19:20:20 +0100 Subject: [PATCH] Refine v1beta2 stale deletion messages --- .../kubeadm/internal/controllers/status.go | 31 +- .../internal/controllers/status_test.go | 4 +- .../machine/machine_controller_status.go | 39 +-- .../machine/machine_controller_status_test.go | 34 ++- .../machinedeployment_status.go | 35 ++- .../machinedeployment_status_test.go | 36 +-- .../machineset_controller_status.go | 35 ++- .../machineset_controller_status_test.go | 268 +++++++++++++----- 8 files changed, 341 insertions(+), 141 deletions(-) diff --git a/controlplane/kubeadm/internal/controllers/status.go b/controlplane/kubeadm/internal/controllers/status.go index 118d355c0606..971ebf31d0ed 100644 --- a/controlplane/kubeadm/internal/controllers/status.go +++ b/controlplane/kubeadm/internal/controllers/status.go @@ -25,6 +25,7 @@ import ( "github.com/pkg/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" @@ -700,9 +701,26 @@ func aggregateStaleMachines(machines collections.Machines) string { } machineNames := []string{} + delayReasons := sets.Set[string]{} for _, machine := range machines { - if !machine.GetDeletionTimestamp().IsZero() && time.Since(machine.GetDeletionTimestamp().Time) > time.Minute*30 { + if !machine.GetDeletionTimestamp().IsZero() && time.Since(machine.GetDeletionTimestamp().Time) > time.Minute*15 { machineNames = append(machineNames, machine.GetName()) + + deletingCondition := v1beta2conditions.Get(machine, clusterv1.MachineDeletingV1Beta2Condition) + if deletingCondition != nil && + deletingCondition.Status == metav1.ConditionTrue && + deletingCondition.Reason == clusterv1.MachineDeletingDrainingNodeV1Beta2Reason && + machine.Status.Deletion != nil && time.Since(machine.Status.Deletion.NodeDrainStartTime.Time) > 5*time.Minute { + if strings.Contains(deletingCondition.Message, "cannot evict pod as it would violate the pod's disruption budget.") { + delayReasons.Insert("PodDisruptionBudgets") + } + if strings.Contains(deletingCondition.Message, "deletionTimestamp set, but still not removed from the Node") { + delayReasons.Insert("Pods not terminating") + } + if strings.Contains(deletingCondition.Message, "failed to evict Pod") { + delayReasons.Insert("Pod eviction errors") + } + } } } @@ -723,7 +741,16 @@ func aggregateStaleMachines(machines collections.Machines) string { } else { message += " are " } - message += "in deletion since more than 30m" + message += "in deletion since more than 15m" + if len(delayReasons) > 0 { + reasonList := []string{} + for _, r := range []string{"PodDisruptionBudgets", "Pods not terminating", "Pod eviction errors"} { + if delayReasons.Has(r) { + reasonList = append(reasonList, r) + } + } + message += fmt.Sprintf(", delay likely due to %s", strings.Join(reasonList, ", ")) + } return message } diff --git a/controlplane/kubeadm/internal/controllers/status_test.go b/controlplane/kubeadm/internal/controllers/status_test.go index 1f3653ea78ed..5a12ae82824d 100644 --- a/controlplane/kubeadm/internal/controllers/status_test.go +++ b/controlplane/kubeadm/internal/controllers/status_test.go @@ -381,7 +381,7 @@ func Test_setScalingDownCondition(t *testing.T) { Status: metav1.ConditionTrue, Reason: controlplanev1.KubeadmControlPlaneScalingDownV1Beta2Reason, Message: "Scaling down from 3 to 1 replicas is blocked because:\n" + - "* Machine m1 is in deletion since more than 30m", + "* Machine m1 is in deletion since more than 15m", }, }, { @@ -402,7 +402,7 @@ func Test_setScalingDownCondition(t *testing.T) { Status: metav1.ConditionTrue, Reason: controlplanev1.KubeadmControlPlaneScalingDownV1Beta2Reason, Message: "Scaling down from 3 to 1 replicas is blocked because:\n" + - "* Machines m1, m2 are in deletion since more than 30m", + "* Machines m1, m2 are in deletion since more than 15m", }, }, { diff --git a/internal/controllers/machine/machine_controller_status.go b/internal/controllers/machine/machine_controller_status.go index 4c5a1101ddbc..be29ff68a7b8 100644 --- a/internal/controllers/machine/machine_controller_status.go +++ b/internal/controllers/machine/machine_controller_status.go @@ -604,28 +604,33 @@ func setReadyCondition(ctx context.Context, machine *clusterv1.Machine) { // message in the summary. // This is also important to ensure we have a limited amount of unique messages across Machines thus allowing to // nicely aggregate Ready conditions from many Machines into the MachinesReady condition of e.g. the MachineSet. -// For the same reason we are only surfacing messages with "more than 30m" instead of using the exact durations. -// 30 minutes is a duration after which we assume it makes sense to emphasize that Node drains and waiting for volume +// For the same reason we are only surfacing messages with "more than 15m" instead of using the exact durations. +// 15 minutes is a duration after which we assume it makes sense to emphasize that Node drains and waiting for volume // detach are still in progress. func calculateDeletingConditionForSummary(machine *clusterv1.Machine) v1beta2conditions.ConditionWithOwnerInfo { deletingCondition := v1beta2conditions.Get(machine, clusterv1.MachineDeletingV1Beta2Condition) - var msg string - switch { - case deletingCondition == nil: - // NOTE: this should never happen given that setDeletingCondition is called before this method and - // it always adds a Deleting condition. - msg = "Machine deletion in progress" - case deletingCondition.Reason == clusterv1.MachineDeletingDrainingNodeV1Beta2Reason && - machine.Status.Deletion != nil && machine.Status.Deletion.NodeDrainStartTime != nil && - time.Since(machine.Status.Deletion.NodeDrainStartTime.Time) > 30*time.Minute: - msg = fmt.Sprintf("Machine deletion in progress, stage: %s (since more than 30m)", deletingCondition.Reason) - case deletingCondition.Reason == clusterv1.MachineDeletingWaitingForVolumeDetachV1Beta2Reason && - machine.Status.Deletion != nil && machine.Status.Deletion.WaitForNodeVolumeDetachStartTime != nil && - time.Since(machine.Status.Deletion.WaitForNodeVolumeDetachStartTime.Time) > 30*time.Minute: - msg = fmt.Sprintf("Machine deletion in progress, stage: %s (since more than 30m)", deletingCondition.Reason) - default: + msg := "Machine deletion in progress" + if deletingCondition != nil { msg = fmt.Sprintf("Machine deletion in progress, stage: %s", deletingCondition.Reason) + if !machine.GetDeletionTimestamp().IsZero() && time.Since(machine.GetDeletionTimestamp().Time) > time.Minute*15 { + msg = fmt.Sprintf("Machine deletion in progress since more than 15m, stage: %s", deletingCondition.Reason) + if deletingCondition.Reason == clusterv1.MachineDeletingDrainingNodeV1Beta2Reason && time.Since(machine.Status.Deletion.NodeDrainStartTime.Time) > 5*time.Minute { + delayReasons := []string{} + if strings.Contains(deletingCondition.Message, "cannot evict pod as it would violate the pod's disruption budget.") { + delayReasons = append(delayReasons, "PodDisruptionBudgets") + } + if strings.Contains(deletingCondition.Message, "deletionTimestamp set, but still not removed from the Node") { + delayReasons = append(delayReasons, "Pods not terminating") + } + if strings.Contains(deletingCondition.Message, "failed to evict Pod") { + delayReasons = append(delayReasons, "Pod eviction errors") + } + if len(delayReasons) > 0 { + msg += fmt.Sprintf(", delay likely due to %s", strings.Join(delayReasons, ", ")) + } + } + } } return v1beta2conditions.ConditionWithOwnerInfo{ diff --git a/internal/controllers/machine/machine_controller_status_test.go b/internal/controllers/machine/machine_controller_status_test.go index edec3d4107d1..d72985347fff 100644 --- a/internal/controllers/machine/machine_controller_status_test.go +++ b/internal/controllers/machine/machine_controller_status_test.go @@ -1515,11 +1515,12 @@ func TestCalculateDeletingConditionForSummary(t *testing.T) { }, }, { - name: "Deleting condition with DrainingNode since more than 30m", + name: "Deleting condition with DrainingNode since more than 15m", machine: &clusterv1.Machine{ ObjectMeta: metav1.ObjectMeta{ - Name: "machine-test", - Namespace: metav1.NamespaceDefault, + Name: "machine-test", + Namespace: metav1.NamespaceDefault, + DeletionTimestamp: &metav1.Time{Time: time.Now().Add(-16 * time.Minute)}, }, Status: clusterv1.MachineStatus{ V1Beta2: &clusterv1.MachineV1Beta2Status{ @@ -1529,19 +1530,15 @@ func TestCalculateDeletingConditionForSummary(t *testing.T) { Status: metav1.ConditionTrue, Reason: clusterv1.MachineDeletingDrainingNodeV1Beta2Reason, Message: `Drain not completed yet (started at 2024-10-09T16:13:59Z): -* Pods with deletionTimestamp that still exist: pod-2-deletionTimestamp-set-1, pod-2-deletionTimestamp-set-2, pod-2-deletionTimestamp-set-3, pod-3-to-trigger-eviction-successfully-1, pod-3-to-trigger-eviction-successfully-2, ... (2 more) -* Pods with eviction failed: - * Cannot evict pod as it would violate the pod's disruption budget. The disruption budget pod-5-pdb needs 20 healthy pods and has 20 currently: pod-5-to-trigger-eviction-pdb-violated-1, pod-5-to-trigger-eviction-pdb-violated-2, pod-5-to-trigger-eviction-pdb-violated-3, ... (3 more) - * some other error 1: pod-6-to-trigger-eviction-some-other-error - * some other error 2: pod-7-to-trigger-eviction-some-other-error - * some other error 3: pod-8-to-trigger-eviction-some-other-error - * some other error 4: pod-9-to-trigger-eviction-some-other-error - * ... (1 more error applying to 1 Pod)`, +* Pods pod-2-deletionTimestamp-set-1, pod-3-to-trigger-eviction-successfully-1: deletionTimestamp set, but still not removed from the Node +* Pod pod-5-to-trigger-eviction-pdb-violated-1: cannot evict pod as it would violate the pod's disruption budget. The disruption budget pod-5-pdb needs 20 healthy pods and has 20 currently +* Pod pod-6-to-trigger-eviction-some-other-error: failed to evict Pod, some other error 1 +After above Pods have been removed from the Node, the following Pods will be evicted: pod-7-eviction-later, pod-8-eviction-later`, }, }, }, Deletion: &clusterv1.MachineDeletionStatus{ - NodeDrainStartTime: &metav1.Time{Time: time.Now().Add(-31 * time.Minute)}, + NodeDrainStartTime: &metav1.Time{Time: time.Now().Add(-6 * time.Minute)}, }, }, }, @@ -1554,16 +1551,17 @@ func TestCalculateDeletingConditionForSummary(t *testing.T) { Type: clusterv1.MachineDeletingV1Beta2Condition, Status: metav1.ConditionTrue, Reason: clusterv1.MachineDeletingV1Beta2Reason, - Message: "Machine deletion in progress, stage: DrainingNode (since more than 30m)", + Message: "Machine deletion in progress since more than 15m, stage: DrainingNode, delay likely due to PodDisruptionBudgets, Pods not terminating, Pod eviction errors", }, }, }, { - name: "Deleting condition with WaitingForVolumeDetach since more than 30m", + name: "Deleting condition with WaitingForVolumeDetach since more than 15m", machine: &clusterv1.Machine{ ObjectMeta: metav1.ObjectMeta{ - Name: "machine-test", - Namespace: metav1.NamespaceDefault, + Name: "machine-test", + Namespace: metav1.NamespaceDefault, + DeletionTimestamp: &metav1.Time{Time: time.Now().Add(-16 * time.Minute)}, }, Status: clusterv1.MachineStatus{ V1Beta2: &clusterv1.MachineV1Beta2Status{ @@ -1577,7 +1575,7 @@ func TestCalculateDeletingConditionForSummary(t *testing.T) { }, }, Deletion: &clusterv1.MachineDeletionStatus{ - WaitForNodeVolumeDetachStartTime: &metav1.Time{Time: time.Now().Add(-31 * time.Minute)}, + WaitForNodeVolumeDetachStartTime: &metav1.Time{Time: time.Now().Add(-6 * time.Minute)}, }, }, }, @@ -1590,7 +1588,7 @@ func TestCalculateDeletingConditionForSummary(t *testing.T) { Type: clusterv1.MachineDeletingV1Beta2Condition, Status: metav1.ConditionTrue, Reason: clusterv1.MachineDeletingV1Beta2Reason, - Message: "Machine deletion in progress, stage: WaitingForVolumeDetach (since more than 30m)", + Message: "Machine deletion in progress since more than 15m, stage: WaitingForVolumeDetach", }, }, }, diff --git a/internal/controllers/machinedeployment/machinedeployment_status.go b/internal/controllers/machinedeployment/machinedeployment_status.go index 85afa15ff173..4af9d6ffb0e5 100644 --- a/internal/controllers/machinedeployment/machinedeployment_status.go +++ b/internal/controllers/machinedeployment/machinedeployment_status.go @@ -25,6 +25,7 @@ import ( "github.com/pkg/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/sets" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -237,7 +238,7 @@ func setScalingDownCondition(_ context.Context, machineDeployment *clusterv1.Mac if getMachinesSucceeded { staleMessage := aggregateStaleMachines(machines) if staleMessage != "" { - message += fmt.Sprintf(" and %s", staleMessage) + message += fmt.Sprintf("\n* %s", staleMessage) } } v1beta2conditions.Set(machineDeployment, metav1.Condition{ @@ -443,7 +444,7 @@ func setDeletingCondition(_ context.Context, machineDeployment *clusterv1.Machin } staleMessage := aggregateStaleMachines(machines) if staleMessage != "" { - message += fmt.Sprintf(" and %s", staleMessage) + message += fmt.Sprintf("\n* %s", staleMessage) } } if len(machines) == 0 && len(machineSets) > 0 { @@ -487,9 +488,26 @@ func aggregateStaleMachines(machines collections.Machines) string { } machineNames := []string{} + delayReasons := sets.Set[string]{} for _, machine := range machines { - if !machine.GetDeletionTimestamp().IsZero() && time.Since(machine.GetDeletionTimestamp().Time) > time.Minute*30 { + if !machine.GetDeletionTimestamp().IsZero() && time.Since(machine.GetDeletionTimestamp().Time) > time.Minute*15 { machineNames = append(machineNames, machine.GetName()) + + deletingCondition := v1beta2conditions.Get(machine, clusterv1.MachineDeletingV1Beta2Condition) + if deletingCondition != nil && + deletingCondition.Status == metav1.ConditionTrue && + deletingCondition.Reason == clusterv1.MachineDeletingDrainingNodeV1Beta2Reason && + machine.Status.Deletion != nil && time.Since(machine.Status.Deletion.NodeDrainStartTime.Time) > 5*time.Minute { + if strings.Contains(deletingCondition.Message, "cannot evict pod as it would violate the pod's disruption budget.") { + delayReasons.Insert("PodDisruptionBudgets") + } + if strings.Contains(deletingCondition.Message, "deletionTimestamp set, but still not removed from the Node") { + delayReasons.Insert("Pods not terminating") + } + if strings.Contains(deletingCondition.Message, "failed to evict Pod") { + delayReasons.Insert("Pod eviction errors") + } + } } } @@ -510,7 +528,16 @@ func aggregateStaleMachines(machines collections.Machines) string { } else { message += " are " } - message += "in deletion since more than 30m" + message += "in deletion since more than 15m" + if len(delayReasons) > 0 { + reasonList := []string{} + for _, r := range []string{"PodDisruptionBudgets", "Pods not terminating", "Pod eviction errors"} { + if delayReasons.Has(r) { + reasonList = append(reasonList, r) + } + } + message += fmt.Sprintf(", delay likely due to %s", strings.Join(reasonList, ", ")) + } return message } diff --git a/internal/controllers/machinedeployment/machinedeployment_status_test.go b/internal/controllers/machinedeployment/machinedeployment_status_test.go index 5de23106ac79..8ce79a0231b6 100644 --- a/internal/controllers/machinedeployment/machinedeployment_status_test.go +++ b/internal/controllers/machinedeployment/machinedeployment_status_test.go @@ -518,10 +518,11 @@ func Test_setScalingDownCondition(t *testing.T) { }, getAndAdoptMachineSetsForDeploymentSucceeded: true, expectCondition: metav1.Condition{ - Type: clusterv1.MachineDeploymentScalingDownV1Beta2Condition, - Status: metav1.ConditionTrue, - Reason: clusterv1.MachineDeploymentScalingDownV1Beta2Reason, - Message: "Scaling down from 2 to 1 replicas and Machine stale-machine-1 is in deletion since more than 30m", + Type: clusterv1.MachineDeploymentScalingDownV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineDeploymentScalingDownV1Beta2Reason, + Message: "Scaling down from 2 to 1 replicas\n" + + "* Machine stale-machine-1 is in deletion since more than 15m", }, }, { @@ -539,10 +540,11 @@ func Test_setScalingDownCondition(t *testing.T) { }, getAndAdoptMachineSetsForDeploymentSucceeded: true, expectCondition: metav1.Condition{ - Type: clusterv1.MachineDeploymentScalingDownV1Beta2Condition, - Status: metav1.ConditionTrue, - Reason: clusterv1.MachineDeploymentScalingDownV1Beta2Reason, - Message: "Scaling down from 4 to 1 replicas and Machines stale-machine-1, stale-machine-2, stale-machine-3 are in deletion since more than 30m", + Type: clusterv1.MachineDeploymentScalingDownV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineDeploymentScalingDownV1Beta2Reason, + Message: "Scaling down from 4 to 1 replicas\n" + + "* Machines stale-machine-1, stale-machine-2, stale-machine-3 are in deletion since more than 15m", }, }, { @@ -562,10 +564,11 @@ func Test_setScalingDownCondition(t *testing.T) { }, getAndAdoptMachineSetsForDeploymentSucceeded: true, expectCondition: metav1.Condition{ - Type: clusterv1.MachineDeploymentScalingDownV1Beta2Condition, - Status: metav1.ConditionTrue, - Reason: clusterv1.MachineDeploymentScalingDownV1Beta2Reason, - Message: "Scaling down from 6 to 1 replicas and Machines stale-machine-1, stale-machine-2, stale-machine-3, ... (2 more) are in deletion since more than 30m", + Type: clusterv1.MachineDeploymentScalingDownV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineDeploymentScalingDownV1Beta2Reason, + Message: "Scaling down from 6 to 1 replicas\n" + + "* Machines stale-machine-1, stale-machine-2, stale-machine-3, ... (2 more) are in deletion since more than 15m", }, }, { @@ -1072,10 +1075,11 @@ func Test_setDeletingCondition(t *testing.T) { }, getMachinesSucceeded: true, expectCondition: metav1.Condition{ - Type: clusterv1.MachineDeploymentDeletingV1Beta2Condition, - Status: metav1.ConditionTrue, - Reason: clusterv1.MachineDeploymentDeletingV1Beta2Reason, - Message: "Deleting 1 Machine and Machine m1 is in deletion since more than 30m", + Type: clusterv1.MachineDeploymentDeletingV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineDeploymentDeletingV1Beta2Reason, + Message: "Deleting 1 Machine\n" + + "* Machine m1 is in deletion since more than 15m", }, }, { diff --git a/internal/controllers/machineset/machineset_controller_status.go b/internal/controllers/machineset/machineset_controller_status.go index fec501adc43b..074f4281fde3 100644 --- a/internal/controllers/machineset/machineset_controller_status.go +++ b/internal/controllers/machineset/machineset_controller_status.go @@ -24,6 +24,7 @@ import ( "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" @@ -190,7 +191,7 @@ func setScalingDownCondition(_ context.Context, ms *clusterv1.MachineSet, machin message := fmt.Sprintf("Scaling down from %d to %d replicas", currentReplicas, desiredReplicas) staleMessage := aggregateStaleMachines(machines) if staleMessage != "" { - message += fmt.Sprintf(" and %s", staleMessage) + message += fmt.Sprintf("\n* %s", staleMessage) } v1beta2conditions.Set(ms, metav1.Condition{ Type: clusterv1.MachineSetScalingDownV1Beta2Condition, @@ -395,7 +396,7 @@ func setDeletingCondition(_ context.Context, machineSet *clusterv1.MachineSet, m } staleMessage := aggregateStaleMachines(machines) if staleMessage != "" { - message += fmt.Sprintf(" and %s", staleMessage) + message += fmt.Sprintf("\n* %s", staleMessage) } } if message == "" { @@ -435,9 +436,26 @@ func aggregateStaleMachines(machines []*clusterv1.Machine) string { } machineNames := []string{} + delayReasons := sets.Set[string]{} for _, machine := range machines { - if !machine.GetDeletionTimestamp().IsZero() && time.Since(machine.GetDeletionTimestamp().Time) > time.Minute*30 { + if !machine.GetDeletionTimestamp().IsZero() && time.Since(machine.GetDeletionTimestamp().Time) > time.Minute*15 { machineNames = append(machineNames, machine.GetName()) + + deletingCondition := v1beta2conditions.Get(machine, clusterv1.MachineDeletingV1Beta2Condition) + if deletingCondition != nil && + deletingCondition.Status == metav1.ConditionTrue && + deletingCondition.Reason == clusterv1.MachineDeletingDrainingNodeV1Beta2Reason && + machine.Status.Deletion != nil && time.Since(machine.Status.Deletion.NodeDrainStartTime.Time) > 5*time.Minute { + if strings.Contains(deletingCondition.Message, "cannot evict pod as it would violate the pod's disruption budget.") { + delayReasons.Insert("PodDisruptionBudgets") + } + if strings.Contains(deletingCondition.Message, "deletionTimestamp set, but still not removed from the Node") { + delayReasons.Insert("Pods not terminating") + } + if strings.Contains(deletingCondition.Message, "failed to evict Pod") { + delayReasons.Insert("Pod eviction errors") + } + } } } @@ -458,7 +476,16 @@ func aggregateStaleMachines(machines []*clusterv1.Machine) string { } else { message += " are " } - message += "in deletion since more than 30m" + message += "in deletion since more than 15m" + if len(delayReasons) > 0 { + reasonList := []string{} + for _, r := range []string{"PodDisruptionBudgets", "Pods not terminating", "Pod eviction errors"} { + if delayReasons.Has(r) { + reasonList = append(reasonList, r) + } + } + message += fmt.Sprintf(", delay likely due to %s", strings.Join(reasonList, ", ")) + } return message } diff --git a/internal/controllers/machineset/machineset_controller_status_test.go b/internal/controllers/machineset/machineset_controller_status_test.go index 6a56713b6e05..0e139d4906f2 100644 --- a/internal/controllers/machineset/machineset_controller_status_test.go +++ b/internal/controllers/machineset/machineset_controller_status_test.go @@ -406,7 +406,7 @@ func Test_setScalingDownCondition(t *testing.T) { name: "scaling down to zero", ms: machineSet, machines: []*clusterv1.Machine{ - newMachine("machine-1"), + fakeMachine("machine-1"), }, getAndAdoptMachinesForMachineSetSucceeded: true, expectCondition: metav1.Condition{ @@ -420,51 +420,54 @@ func Test_setScalingDownCondition(t *testing.T) { name: "scaling down with 1 stale machine", ms: machineSet1Replica, machines: []*clusterv1.Machine{ - newStaleDeletingMachine("stale-machine-1"), - newMachine("machine-2"), + fakeMachine("stale-machine-1", withStaleDeletionTimestamp()), + fakeMachine("machine-2"), }, getAndAdoptMachinesForMachineSetSucceeded: true, expectCondition: metav1.Condition{ - Type: clusterv1.MachineSetScalingDownV1Beta2Condition, - Status: metav1.ConditionTrue, - Reason: clusterv1.MachineSetScalingDownV1Beta2Reason, - Message: "Scaling down from 2 to 1 replicas and Machine stale-machine-1 is in deletion since more than 30m", + Type: clusterv1.MachineSetScalingDownV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineSetScalingDownV1Beta2Reason, + Message: "Scaling down from 2 to 1 replicas\n" + + "* Machine stale-machine-1 is in deletion since more than 15m", }, }, { name: "scaling down with 3 stale machines", ms: machineSet1Replica, machines: []*clusterv1.Machine{ - newStaleDeletingMachine("stale-machine-2"), - newStaleDeletingMachine("stale-machine-1"), - newStaleDeletingMachine("stale-machine-3"), - newMachine("machine-4"), + fakeMachine("stale-machine-2", withStaleDeletionTimestamp()), + fakeMachine("stale-machine-1", withStaleDeletionTimestamp()), + fakeMachine("stale-machine-3", withStaleDeletionTimestamp()), + fakeMachine("machine-4"), }, getAndAdoptMachinesForMachineSetSucceeded: true, expectCondition: metav1.Condition{ - Type: clusterv1.MachineSetScalingDownV1Beta2Condition, - Status: metav1.ConditionTrue, - Reason: clusterv1.MachineSetScalingDownV1Beta2Reason, - Message: "Scaling down from 4 to 1 replicas and Machines stale-machine-1, stale-machine-2, stale-machine-3 are in deletion since more than 30m", + Type: clusterv1.MachineSetScalingDownV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineSetScalingDownV1Beta2Reason, + Message: "Scaling down from 4 to 1 replicas\n" + + "* Machines stale-machine-1, stale-machine-2, stale-machine-3 are in deletion since more than 15m", }, }, { name: "scaling down with 5 stale machines", ms: machineSet1Replica, machines: []*clusterv1.Machine{ - newStaleDeletingMachine("stale-machine-5"), - newStaleDeletingMachine("stale-machine-4"), - newStaleDeletingMachine("stale-machine-2"), - newStaleDeletingMachine("stale-machine-3"), - newStaleDeletingMachine("stale-machine-1"), - newMachine("machine-6"), + fakeMachine("stale-machine-5", withStaleDeletionTimestamp()), + fakeMachine("stale-machine-4", withStaleDeletionTimestamp()), + fakeMachine("stale-machine-2", withStaleDeletionTimestamp()), + fakeMachine("stale-machine-3", withStaleDeletionTimestamp()), + fakeMachine("stale-machine-1", withStaleDeletionTimestamp()), + fakeMachine("machine-6"), }, getAndAdoptMachinesForMachineSetSucceeded: true, expectCondition: metav1.Condition{ - Type: clusterv1.MachineSetScalingDownV1Beta2Condition, - Status: metav1.ConditionTrue, - Reason: clusterv1.MachineSetScalingDownV1Beta2Reason, - Message: "Scaling down from 6 to 1 replicas and Machines stale-machine-1, stale-machine-2, stale-machine-3, ... (2 more) are in deletion since more than 30m", + Type: clusterv1.MachineSetScalingDownV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineSetScalingDownV1Beta2Reason, + Message: "Scaling down from 6 to 1 replicas\n" + + "* Machines stale-machine-1, stale-machine-2, stale-machine-3, ... (2 more) are in deletion since more than 15m", }, }, { @@ -482,7 +485,7 @@ func Test_setScalingDownCondition(t *testing.T) { name: "deleting machineset having 1 replica", ms: deletingMachineSet, machines: []*clusterv1.Machine{ - newMachine("machine-1"), + fakeMachine("machine-1"), }, getAndAdoptMachinesForMachineSetSucceeded: true, expectCondition: metav1.Condition{ @@ -549,7 +552,7 @@ func Test_setMachinesReadyCondition(t *testing.T) { name: "one machine is ready", machineSet: machineSet, machines: []*clusterv1.Machine{ - newMachine("machine-1", readyCondition), + fakeMachine("machine-1", withV1Beta2Condition(readyCondition)), }, getAndAdoptMachinesForMachineSetSucceeded: true, expectCondition: metav1.Condition{ @@ -562,8 +565,8 @@ func Test_setMachinesReadyCondition(t *testing.T) { name: "all machines are ready", machineSet: machineSet, machines: []*clusterv1.Machine{ - newMachine("machine-1", readyCondition), - newMachine("machine-2", readyCondition), + fakeMachine("machine-1", withV1Beta2Condition(readyCondition)), + fakeMachine("machine-2", withV1Beta2Condition(readyCondition)), }, getAndAdoptMachinesForMachineSetSucceeded: true, expectCondition: metav1.Condition{ @@ -576,8 +579,8 @@ func Test_setMachinesReadyCondition(t *testing.T) { name: "one ready, one has nothing reported", machineSet: machineSet, machines: []*clusterv1.Machine{ - newMachine("machine-1", readyCondition), - newMachine("machine-2"), + fakeMachine("machine-1", withV1Beta2Condition(readyCondition)), + fakeMachine("machine-2"), }, getAndAdoptMachinesForMachineSetSucceeded: true, expectCondition: metav1.Condition{ @@ -591,25 +594,25 @@ func Test_setMachinesReadyCondition(t *testing.T) { name: "one ready, one reporting not ready, one reporting unknown, one reporting deleting", machineSet: machineSet, machines: []*clusterv1.Machine{ - newMachine("machine-1", readyCondition), - newMachine("machine-2", metav1.Condition{ + fakeMachine("machine-1", withV1Beta2Condition(readyCondition)), + fakeMachine("machine-2", withV1Beta2Condition(metav1.Condition{ Type: clusterv1.MachineReadyV1Beta2Condition, Status: metav1.ConditionFalse, Reason: "SomeReason", Message: "HealthCheckSucceeded: Some message", - }), - newMachine("machine-3", metav1.Condition{ + })), + fakeMachine("machine-3", withV1Beta2Condition(metav1.Condition{ Type: clusterv1.MachineReadyV1Beta2Condition, Status: metav1.ConditionUnknown, Reason: "SomeUnknownReason", Message: "Some unknown message", - }), - newMachine("machine-4", metav1.Condition{ + })), + fakeMachine("machine-4", withV1Beta2Condition(metav1.Condition{ Type: clusterv1.MachineReadyV1Beta2Condition, Status: metav1.ConditionFalse, Reason: clusterv1.MachineDeletingV1Beta2Reason, Message: "Deleting: Machine deletion in progress, stage: DrainingNode", - }), + })), }, getAndAdoptMachinesForMachineSetSucceeded: true, expectCondition: metav1.Condition{ @@ -673,11 +676,11 @@ func Test_setMachinesUpToDateCondition(t *testing.T) { name: "One machine up-to-date", machineSet: machineSet, machines: []*clusterv1.Machine{ - newMachine("up-to-date-1", metav1.Condition{ + fakeMachine("up-to-date-1", withV1Beta2Condition(metav1.Condition{ Type: clusterv1.MachineUpToDateV1Beta2Condition, Status: metav1.ConditionTrue, Reason: "some-reason-1", - }), + })), }, getAndAdoptMachinesForMachineSetSucceeded: true, expectCondition: metav1.Condition{ @@ -691,12 +694,12 @@ func Test_setMachinesUpToDateCondition(t *testing.T) { name: "One machine unknown", machineSet: machineSet, machines: []*clusterv1.Machine{ - newMachine("unknown-1", metav1.Condition{ + fakeMachine("unknown-1", withV1Beta2Condition(metav1.Condition{ Type: clusterv1.MachineUpToDateV1Beta2Condition, Status: metav1.ConditionUnknown, Reason: "some-unknown-reason-1", Message: "some unknown message", - }), + })), }, getAndAdoptMachinesForMachineSetSucceeded: true, expectCondition: metav1.Condition{ @@ -710,12 +713,12 @@ func Test_setMachinesUpToDateCondition(t *testing.T) { name: "One machine not up-to-date", machineSet: machineSet, machines: []*clusterv1.Machine{ - newMachine("not-up-to-date-machine-1", metav1.Condition{ + fakeMachine("not-up-to-date-machine-1", withV1Beta2Condition(metav1.Condition{ Type: clusterv1.MachineUpToDateV1Beta2Condition, Status: metav1.ConditionFalse, Reason: "some-not-up-to-date-reason", Message: "some not up-to-date message", - }), + })), }, getAndAdoptMachinesForMachineSetSucceeded: true, expectCondition: metav1.Condition{ @@ -729,7 +732,7 @@ func Test_setMachinesUpToDateCondition(t *testing.T) { name: "One machine without up-to-date condition, one new Machines without up-to-date condition", machineSet: machineSet, machines: []*clusterv1.Machine{ - newMachine("no-condition-machine-1"), + fakeMachine("no-condition-machine-1"), fakeMachine("no-condition-machine-2-new", withCreationTimestamp(time.Now().Add(-5*time.Second))), // ignored because it's new }, getAndAdoptMachinesForMachineSetSucceeded: true, @@ -744,30 +747,30 @@ func Test_setMachinesUpToDateCondition(t *testing.T) { name: "Two machines not up-to-date, two up-to-date, two not reported", machineSet: machineSet, machines: []*clusterv1.Machine{ - newMachine("up-to-date-1", metav1.Condition{ + fakeMachine("up-to-date-1", withV1Beta2Condition(metav1.Condition{ Type: clusterv1.MachineUpToDateV1Beta2Condition, Status: metav1.ConditionTrue, Reason: "TestUpToDate", - }), - newMachine("up-to-date-2", metav1.Condition{ + })), + fakeMachine("up-to-date-2", withV1Beta2Condition(metav1.Condition{ Type: clusterv1.MachineUpToDateV1Beta2Condition, Status: metav1.ConditionTrue, Reason: "TestUpToDate", - }), - newMachine("not-up-to-date-machine-1", metav1.Condition{ + })), + fakeMachine("not-up-to-date-machine-1", withV1Beta2Condition(metav1.Condition{ Type: clusterv1.MachineUpToDateV1Beta2Condition, Status: metav1.ConditionFalse, Reason: "TestNotUpToDate", Message: "This is not up-to-date message", - }), - newMachine("not-up-to-date-machine-2", metav1.Condition{ + })), + fakeMachine("not-up-to-date-machine-2", withV1Beta2Condition(metav1.Condition{ Type: clusterv1.MachineUpToDateV1Beta2Condition, Status: metav1.ConditionFalse, Reason: "TestNotUpToDate", Message: "This is not up-to-date message", - }), - newMachine("no-condition-machine-1"), - newMachine("no-condition-machine-2"), + })), + fakeMachine("no-condition-machine-1"), + fakeMachine("no-condition-machine-2"), }, getAndAdoptMachinesForMachineSetSucceeded: true, expectCondition: metav1.Condition{ @@ -954,7 +957,7 @@ func Test_setDeletingCondition(t *testing.T) { machineSet: &clusterv1.MachineSet{ObjectMeta: metav1.ObjectMeta{DeletionTimestamp: &metav1.Time{Time: time.Now()}}}, getAndAdoptMachinesForMachineSetSucceeded: true, machines: []*clusterv1.Machine{ - newMachine("m1"), + fakeMachine("m1"), }, expectCondition: metav1.Condition{ Type: clusterv1.MachineSetDeletingV1Beta2Condition, @@ -968,15 +971,16 @@ func Test_setDeletingCondition(t *testing.T) { machineSet: &clusterv1.MachineSet{ObjectMeta: metav1.ObjectMeta{DeletionTimestamp: &metav1.Time{Time: time.Now()}}}, getAndAdoptMachinesForMachineSetSucceeded: true, machines: []*clusterv1.Machine{ - newStaleDeletingMachine("m1"), - newStaleDeletingMachine("m2"), - newMachine("m3"), + fakeMachine("m1", withStaleDeletionTimestamp()), + fakeMachine("m2", withStaleDeletionTimestamp()), + fakeMachine("m3"), }, expectCondition: metav1.Condition{ - Type: clusterv1.MachineSetDeletingV1Beta2Condition, - Status: metav1.ConditionTrue, - Reason: clusterv1.MachineSetDeletingV1Beta2Reason, - Message: "Deleting 3 Machines and Machines m1, m2 are in deletion since more than 30m", + Type: clusterv1.MachineSetDeletingV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineSetDeletingV1Beta2Reason, + Message: "Deleting 3 Machines\n" + + "* Machines m1, m2 are in deletion since more than 15m", }, }, } @@ -993,25 +997,112 @@ func Test_setDeletingCondition(t *testing.T) { } } -func newMachine(name string, conditions ...metav1.Condition) *clusterv1.Machine { - m := &clusterv1.Machine{ - ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: metav1.NamespaceDefault, +func Test_aggregateStaleMachines(t *testing.T) { + tests := []struct { + name string + machines []*clusterv1.Machine + expectMessage string + }{ + { + name: "Empty when there are no machines", + machines: nil, + expectMessage: "", + }, + { + name: "Empty when there are no deleting machines", + machines: []*clusterv1.Machine{ + fakeMachine("m1"), + fakeMachine("m2"), + fakeMachine("m3"), + }, + expectMessage: "", + }, + { + name: "Empty when there are deleting machines but not yet stale", + machines: []*clusterv1.Machine{ + fakeMachine("m1", withDeletionTimestamp()), + fakeMachine("m2", withDeletionTimestamp()), + fakeMachine("m3"), + }, + expectMessage: "", + }, + { + name: "Report stale machine not draining", + machines: []*clusterv1.Machine{ + fakeMachine("m1", withDeletionTimestamp()), + fakeMachine("m2", withStaleDeletionTimestamp()), + fakeMachine("m3"), + }, + expectMessage: "Machine m2 is in deletion since more than 15m", + }, + { + name: "Report stale machines not draining", + machines: []*clusterv1.Machine{ + fakeMachine("m1", withStaleDeletionTimestamp()), + fakeMachine("m2", withStaleDeletionTimestamp()), + fakeMachine("m3"), + }, + expectMessage: "Machines m1, m2 are in deletion since more than 15m", + }, + { + name: "Does not report details about stale machines draining since less than 5 minutes", + machines: []*clusterv1.Machine{ + fakeMachine("m1", withStaleDeletionTimestamp(), withV1Beta2Condition(metav1.Condition{ + Type: clusterv1.MachineDeletingV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineDeletingDrainingNodeV1Beta2Reason, + Message: `Drain not completed yet (started at 2024-10-09T16:13:59Z): +* Pods pod-2-deletionTimestamp-set-1, pod-3-to-trigger-eviction-successfully-1: deletionTimestamp set, but still not removed from the Node +* Pod pod-5-to-trigger-eviction-pdb-violated-1: cannot evict pod as it would violate the pod's disruption budget. The disruption budget pod-5-pdb needs 20 healthy pods and has 20 currently +* Pod pod-6-to-trigger-eviction-some-other-error: failed to evict Pod, some other error 1 +After above Pods have been removed from the Node, the following Pods will be evicted: pod-7-eviction-later, pod-8-eviction-later`, + })), + fakeMachine("m2", withStaleDeletionTimestamp(), withV1Beta2Condition(metav1.Condition{ + Type: clusterv1.MachineDeletingV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineDeletingDrainingNodeV1Beta2Reason, + Message: `Drain not completed yet (started at 2024-10-09T16:13:59Z): +* Pods pod-2-deletionTimestamp-set-1, pod-3-to-trigger-eviction-successfully-1: deletionTimestamp set, but still not removed from the Node +After above Pods have been removed from the Node, the following Pods will be evicted: pod-7-eviction-later, pod-8-eviction-later`, + })), + fakeMachine("m3"), + }, + expectMessage: "Machines m1, m2 are in deletion since more than 15m", + }, + { + name: "Report details about stale machines draining since more than 5 minutes", + machines: []*clusterv1.Machine{ + fakeMachine("m1", withStaleDeletionTimestamp(), withV1Beta2Condition(metav1.Condition{ + Type: clusterv1.MachineDeletingV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineDeletingDrainingNodeV1Beta2Reason, + Message: `Drain not completed yet (started at 2024-10-09T16:13:59Z): +* Pods pod-2-deletionTimestamp-set-1, pod-3-to-trigger-eviction-successfully-1: deletionTimestamp set, but still not removed from the Node +* Pod pod-5-to-trigger-eviction-pdb-violated-1: cannot evict pod as it would violate the pod's disruption budget. The disruption budget pod-5-pdb needs 20 healthy pods and has 20 currently +* Pod pod-6-to-trigger-eviction-some-other-error: failed to evict Pod, some other error 1 +After above Pods have been removed from the Node, the following Pods will be evicted: pod-7-eviction-later, pod-8-eviction-later`, + }), withStaleDrain()), + fakeMachine("m2", withStaleDeletionTimestamp(), withV1Beta2Condition(metav1.Condition{ + Type: clusterv1.MachineDeletingV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineDeletingDrainingNodeV1Beta2Reason, + Message: `Drain not completed yet (started at 2024-10-09T16:13:59Z): +* Pods pod-2-deletionTimestamp-set-1, pod-3-to-trigger-eviction-successfully-1: deletionTimestamp set, but still not removed from the Node +After above Pods have been removed from the Node, the following Pods will be evicted: pod-7-eviction-later, pod-8-eviction-later`, + }), withStaleDrain()), + fakeMachine("m3"), + }, + expectMessage: "Machines m1, m2 are in deletion since more than 15m, delay likely due to PodDisruptionBudgets, Pods not terminating, Pod eviction errors", }, } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewWithT(t) - for _, condition := range conditions { - v1beta2conditions.Set(m, condition) + got := aggregateStaleMachines(tt.machines) + g.Expect(got).To(Equal(tt.expectMessage)) + }) } - - return m -} - -func newStaleDeletingMachine(name string) *clusterv1.Machine { - m := newMachine(name) - m.DeletionTimestamp = ptr.To(metav1.Time{Time: time.Now().Add(-1 * time.Hour)}) - return m } type fakeMachinesOption func(m *clusterv1.Machine) @@ -1034,6 +1125,27 @@ func withCreationTimestamp(t time.Time) fakeMachinesOption { } } +func withDeletionTimestamp() fakeMachinesOption { + return func(m *clusterv1.Machine) { + m.DeletionTimestamp = ptr.To(metav1.Time{Time: time.Now()}) + } +} + +func withStaleDeletionTimestamp() fakeMachinesOption { + return func(m *clusterv1.Machine) { + m.DeletionTimestamp = ptr.To(metav1.Time{Time: time.Now().Add(-1 * time.Hour)}) + } +} + +func withStaleDrain() fakeMachinesOption { + return func(m *clusterv1.Machine) { + if m.Status.Deletion == nil { + m.Status.Deletion = &clusterv1.MachineDeletionStatus{} + } + m.Status.Deletion.NodeDrainStartTime = ptr.To(metav1.Time{Time: time.Now().Add(-6 * time.Minute)}) + } +} + func withV1Beta2Condition(c metav1.Condition) fakeMachinesOption { return func(m *clusterv1.Machine) { if m.Status.V1Beta2 == nil {