Commit 14430e0: Refine v1beta2 stale deletion messages

fabriziopandini committed Nov 19, 2024 · 1 parent a21faa0
Showing 8 changed files with 341 additions and 141 deletions.
31 changes: 29 additions & 2 deletions controlplane/kubeadm/internal/controllers/status.go
@@ -25,6 +25,7 @@ import (

"github.com/pkg/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/utils/ptr"
ctrl "sigs.k8s.io/controller-runtime"

@@ -700,9 +701,26 @@ func aggregateStaleMachines(machines collections.Machines) string {
}

machineNames := []string{}
+ delayReasons := sets.Set[string]{}
for _, machine := range machines {
- if !machine.GetDeletionTimestamp().IsZero() && time.Since(machine.GetDeletionTimestamp().Time) > time.Minute*30 {
+ if !machine.GetDeletionTimestamp().IsZero() && time.Since(machine.GetDeletionTimestamp().Time) > time.Minute*15 {
machineNames = append(machineNames, machine.GetName())

+ deletingCondition := v1beta2conditions.Get(machine, clusterv1.MachineDeletingV1Beta2Condition)
+ if deletingCondition != nil &&
+ deletingCondition.Status == metav1.ConditionTrue &&
+ deletingCondition.Reason == clusterv1.MachineDeletingDrainingNodeV1Beta2Reason &&
+ machine.Status.Deletion != nil && time.Since(machine.Status.Deletion.NodeDrainStartTime.Time) > 5*time.Minute {
+ if strings.Contains(deletingCondition.Message, "cannot evict pod as it would violate the pod's disruption budget.") {
+ delayReasons.Insert("PodDisruptionBudgets")
+ }
+ if strings.Contains(deletingCondition.Message, "deletionTimestamp set, but still not removed from the Node") {
+ delayReasons.Insert("Pods not terminating")
+ }
+ if strings.Contains(deletingCondition.Message, "failed to evict Pod") {
+ delayReasons.Insert("Pod eviction errors")
+ }
+ }
}
}

@@ -723,7 +741,16 @@ func aggregateStaleMachines(machines collections.Machines) string {
} else {
message += " are "
}
message += "in deletion since more than 30m"
message += "in deletion since more than 15m"
if len(delayReasons) > 0 {
reasonList := []string{}
for _, r := range []string{"PodDisruptionBudgets", "Pods not terminating", "Pod eviction errors"} {
if delayReasons.Has(r) {
reasonList = append(reasonList, r)
}
}
message += fmt.Sprintf(", delay likely due to %s", strings.Join(reasonList, ", "))
}

return message
}
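Note on the hunk above: the delay-reason detection keys off fixed substrings of the Machine's Deleting condition message rather than parsing it. A minimal, self-contained sketch of just that mapping follows; the delayReasonsFor helper and the sample message are illustrative, not part of the commit:

package main

import (
	"fmt"
	"strings"
)

// delayReasonsFor maps well-known substrings of a Deleting condition message
// to the short labels surfaced in aggregated condition messages.
func delayReasonsFor(conditionMessage string) []string {
	reasons := []string{}
	// Fixed iteration order keeps the resulting label list deterministic.
	for _, m := range []struct{ substring, label string }{
		{"cannot evict pod as it would violate the pod's disruption budget.", "PodDisruptionBudgets"},
		{"deletionTimestamp set, but still not removed from the Node", "Pods not terminating"},
		{"failed to evict Pod", "Pod eviction errors"},
	} {
		if strings.Contains(conditionMessage, m.substring) {
			reasons = append(reasons, m.label)
		}
	}
	return reasons
}

func main() {
	msg := `Drain not completed yet (started at 2024-10-09T16:13:59Z):
* Pod pod-5: cannot evict pod as it would violate the pod's disruption budget. The disruption budget pod-5-pdb needs 20 healthy pods and has 20 currently
* Pod pod-6: failed to evict Pod, some other error`
	fmt.Println(delayReasonsFor(msg))
	// Output: [PodDisruptionBudgets Pod eviction errors]
}

Matching on stable substrings keeps the aggregation robust to the variable Pod lists embedded in each drain message.
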
4 changes: 2 additions & 2 deletions controlplane/kubeadm/internal/controllers/status_test.go
@@ -381,7 +381,7 @@ func Test_setScalingDownCondition(t *testing.T) {
Status: metav1.ConditionTrue,
Reason: controlplanev1.KubeadmControlPlaneScalingDownV1Beta2Reason,
Message: "Scaling down from 3 to 1 replicas is blocked because:\n" +
"* Machine m1 is in deletion since more than 30m",
"* Machine m1 is in deletion since more than 15m",
},
},
{
@@ -402,7 +402,7 @@ func Test_setScalingDownCondition(t *testing.T) {
Status: metav1.ConditionTrue,
Reason: controlplanev1.KubeadmControlPlaneScalingDownV1Beta2Reason,
Message: "Scaling down from 3 to 1 replicas is blocked because:\n" +
"* Machines m1, m2 are in deletion since more than 30m",
"* Machines m1, m2 are in deletion since more than 15m",
},
},
{
39 changes: 22 additions & 17 deletions internal/controllers/machine/machine_controller_status.go
@@ -604,28 +604,33 @@ func setReadyCondition(ctx context.Context, machine *clusterv1.Machine) {
// message in the summary.
// This is also important to ensure we have a limited amount of unique messages across Machines thus allowing to
// nicely aggregate Ready conditions from many Machines into the MachinesReady condition of e.g. the MachineSet.
// For the same reason we are only surfacing messages with "more than 30m" instead of using the exact durations.
// 30 minutes is a duration after which we assume it makes sense to emphasize that Node drains and waiting for volume
// For the same reason we are only surfacing messages with "more than 15m" instead of using the exact durations.
// 15 minutes is a duration after which we assume it makes sense to emphasize that Node drains and waiting for volume
// detach are still in progress.
func calculateDeletingConditionForSummary(machine *clusterv1.Machine) v1beta2conditions.ConditionWithOwnerInfo {
deletingCondition := v1beta2conditions.Get(machine, clusterv1.MachineDeletingV1Beta2Condition)

- var msg string
- switch {
- case deletingCondition == nil:
- // NOTE: this should never happen given that setDeletingCondition is called before this method and
- // it always adds a Deleting condition.
- msg = "Machine deletion in progress"
- case deletingCondition.Reason == clusterv1.MachineDeletingDrainingNodeV1Beta2Reason &&
- machine.Status.Deletion != nil && machine.Status.Deletion.NodeDrainStartTime != nil &&
- time.Since(machine.Status.Deletion.NodeDrainStartTime.Time) > 30*time.Minute:
- msg = fmt.Sprintf("Machine deletion in progress, stage: %s (since more than 30m)", deletingCondition.Reason)
- case deletingCondition.Reason == clusterv1.MachineDeletingWaitingForVolumeDetachV1Beta2Reason &&
- machine.Status.Deletion != nil && machine.Status.Deletion.WaitForNodeVolumeDetachStartTime != nil &&
- time.Since(machine.Status.Deletion.WaitForNodeVolumeDetachStartTime.Time) > 30*time.Minute:
- msg = fmt.Sprintf("Machine deletion in progress, stage: %s (since more than 30m)", deletingCondition.Reason)
- default:
+ msg := "Machine deletion in progress"
+ if deletingCondition != nil {
+ msg = fmt.Sprintf("Machine deletion in progress, stage: %s", deletingCondition.Reason)
+ if !machine.GetDeletionTimestamp().IsZero() && time.Since(machine.GetDeletionTimestamp().Time) > time.Minute*15 {
+ msg = fmt.Sprintf("Machine deletion in progress since more than 15m, stage: %s", deletingCondition.Reason)
+ if deletingCondition.Reason == clusterv1.MachineDeletingDrainingNodeV1Beta2Reason && time.Since(machine.Status.Deletion.NodeDrainStartTime.Time) > 5*time.Minute {
+ delayReasons := []string{}
+ if strings.Contains(deletingCondition.Message, "cannot evict pod as it would violate the pod's disruption budget.") {
+ delayReasons = append(delayReasons, "PodDisruptionBudgets")
+ }
+ if strings.Contains(deletingCondition.Message, "deletionTimestamp set, but still not removed from the Node") {
+ delayReasons = append(delayReasons, "Pods not terminating")
+ }
+ if strings.Contains(deletingCondition.Message, "failed to evict Pod") {
+ delayReasons = append(delayReasons, "Pod eviction errors")
+ }
+ if len(delayReasons) > 0 {
+ msg += fmt.Sprintf(", delay likely due to %s", strings.Join(delayReasons, ", "))
+ }
+ }
+ }
+ }

return v1beta2conditions.ConditionWithOwnerInfo{
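The doc comment above states the constraint driving all of these message changes: messages must be identical across Machines so that the MachineSet, MachineDeployment, and KubeadmControlPlane controllers can merge them into a single aggregated condition. A toy illustration of why the coarse "more than 15m" bucket aggregates while exact durations do not; all strings here are illustrative:

package main

import "fmt"

// unique counts distinct messages, standing in for how an aggregator
// groups identical condition messages across many Machines.
func unique(msgs []string) int {
	seen := map[string]struct{}{}
	for _, m := range msgs {
		seen[m] = struct{}{}
	}
	return len(seen)
}

func main() {
	// Exact durations would make every Machine's message unique...
	exact := []string{
		"Machine deletion in progress since 17m32s, stage: DrainingNode",
		"Machine deletion in progress since 21m8s, stage: DrainingNode",
	}
	// ...while the coarse bucket collapses them into one aggregatable message.
	coarse := []string{
		"Machine deletion in progress since more than 15m, stage: DrainingNode",
		"Machine deletion in progress since more than 15m, stage: DrainingNode",
	}
	fmt.Println(unique(exact), unique(coarse)) // Output: 2 1
}
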
34 changes: 16 additions & 18 deletions internal/controllers/machine/machine_controller_status_test.go
@@ -1515,11 +1515,12 @@ func TestCalculateDeletingConditionForSummary(t *testing.T) {
},
},
{
name: "Deleting condition with DrainingNode since more than 30m",
name: "Deleting condition with DrainingNode since more than 15m",
machine: &clusterv1.Machine{
ObjectMeta: metav1.ObjectMeta{
- Name:      "machine-test",
- Namespace: metav1.NamespaceDefault,
+ Name:              "machine-test",
+ Namespace:         metav1.NamespaceDefault,
+ DeletionTimestamp: &metav1.Time{Time: time.Now().Add(-16 * time.Minute)},
},
Status: clusterv1.MachineStatus{
V1Beta2: &clusterv1.MachineV1Beta2Status{
@@ -1529,19 +1530,15 @@ func TestCalculateDeletingConditionForSummary(t *testing.T) {
Status: metav1.ConditionTrue,
Reason: clusterv1.MachineDeletingDrainingNodeV1Beta2Reason,
Message: `Drain not completed yet (started at 2024-10-09T16:13:59Z):
- * Pods with deletionTimestamp that still exist: pod-2-deletionTimestamp-set-1, pod-2-deletionTimestamp-set-2, pod-2-deletionTimestamp-set-3, pod-3-to-trigger-eviction-successfully-1, pod-3-to-trigger-eviction-successfully-2, ... (2 more)
- * Pods with eviction failed:
- * Cannot evict pod as it would violate the pod's disruption budget. The disruption budget pod-5-pdb needs 20 healthy pods and has 20 currently: pod-5-to-trigger-eviction-pdb-violated-1, pod-5-to-trigger-eviction-pdb-violated-2, pod-5-to-trigger-eviction-pdb-violated-3, ... (3 more)
- * some other error 1: pod-6-to-trigger-eviction-some-other-error
- * some other error 2: pod-7-to-trigger-eviction-some-other-error
- * some other error 3: pod-8-to-trigger-eviction-some-other-error
- * some other error 4: pod-9-to-trigger-eviction-some-other-error
- * ... (1 more error applying to 1 Pod)`,
+ * Pods pod-2-deletionTimestamp-set-1, pod-3-to-trigger-eviction-successfully-1: deletionTimestamp set, but still not removed from the Node
+ * Pod pod-5-to-trigger-eviction-pdb-violated-1: cannot evict pod as it would violate the pod's disruption budget. The disruption budget pod-5-pdb needs 20 healthy pods and has 20 currently
+ * Pod pod-6-to-trigger-eviction-some-other-error: failed to evict Pod, some other error 1
+ After above Pods have been removed from the Node, the following Pods will be evicted: pod-7-eviction-later, pod-8-eviction-later`,
},
},
},
Deletion: &clusterv1.MachineDeletionStatus{
NodeDrainStartTime: &metav1.Time{Time: time.Now().Add(-31 * time.Minute)},
NodeDrainStartTime: &metav1.Time{Time: time.Now().Add(-6 * time.Minute)},
},
},
},
@@ -1554,16 +1551,17 @@ func TestCalculateDeletingConditionForSummary(t *testing.T) {
Type: clusterv1.MachineDeletingV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: clusterv1.MachineDeletingV1Beta2Reason,
Message: "Machine deletion in progress, stage: DrainingNode (since more than 30m)",
Message: "Machine deletion in progress since more than 15m, stage: DrainingNode, delay likely due to PodDisruptionBudgets, Pods not terminating, Pod eviction errors",
},
},
},
{
name: "Deleting condition with WaitingForVolumeDetach since more than 30m",
name: "Deleting condition with WaitingForVolumeDetach since more than 15m",
machine: &clusterv1.Machine{
ObjectMeta: metav1.ObjectMeta{
- Name:      "machine-test",
- Namespace: metav1.NamespaceDefault,
+ Name:              "machine-test",
+ Namespace:         metav1.NamespaceDefault,
+ DeletionTimestamp: &metav1.Time{Time: time.Now().Add(-16 * time.Minute)},
},
Status: clusterv1.MachineStatus{
V1Beta2: &clusterv1.MachineV1Beta2Status{
@@ -1577,7 +1575,7 @@ func TestCalculateDeletingConditionForSummary(t *testing.T) {
},
},
Deletion: &clusterv1.MachineDeletionStatus{
- WaitForNodeVolumeDetachStartTime: &metav1.Time{Time: time.Now().Add(-31 * time.Minute)},
+ WaitForNodeVolumeDetachStartTime: &metav1.Time{Time: time.Now().Add(-6 * time.Minute)},
},
},
},
@@ -1590,7 +1588,7 @@ func TestCalculateDeletingConditionForSummary(t *testing.T) {
Type: clusterv1.MachineDeletingV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: clusterv1.MachineDeletingV1Beta2Reason,
Message: "Machine deletion in progress, stage: WaitingForVolumeDetach (since more than 30m)",
Message: "Machine deletion in progress since more than 15m, stage: WaitingForVolumeDetach",
},
},
},
35 changes: 31 additions & 4 deletions internal/controllers/machinedeployment/machinedeployment_status.go
@@ -25,6 +25,7 @@ import (

"github.com/pkg/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/sets"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"

@@ -237,7 +238,7 @@ func setScalingDownCondition(_ context.Context, machineDeployment *clusterv1.Mac
if getMachinesSucceeded {
staleMessage := aggregateStaleMachines(machines)
if staleMessage != "" {
message += fmt.Sprintf(" and %s", staleMessage)
message += fmt.Sprintf("\n* %s", staleMessage)
}
}
v1beta2conditions.Set(machineDeployment, metav1.Condition{
@@ -443,7 +444,7 @@ func setDeletingCondition(_ context.Context, machineDeployment *clusterv1.Machin
}
staleMessage := aggregateStaleMachines(machines)
if staleMessage != "" {
message += fmt.Sprintf(" and %s", staleMessage)
message += fmt.Sprintf("\n* %s", staleMessage)
}
}
if len(machines) == 0 && len(machineSets) > 0 {
@@ -487,9 +488,26 @@ func aggregateStaleMachines(machines collections.Machines) string {
}

machineNames := []string{}
+ delayReasons := sets.Set[string]{}
for _, machine := range machines {
- if !machine.GetDeletionTimestamp().IsZero() && time.Since(machine.GetDeletionTimestamp().Time) > time.Minute*30 {
+ if !machine.GetDeletionTimestamp().IsZero() && time.Since(machine.GetDeletionTimestamp().Time) > time.Minute*15 {
machineNames = append(machineNames, machine.GetName())

+ deletingCondition := v1beta2conditions.Get(machine, clusterv1.MachineDeletingV1Beta2Condition)
+ if deletingCondition != nil &&
+ deletingCondition.Status == metav1.ConditionTrue &&
+ deletingCondition.Reason == clusterv1.MachineDeletingDrainingNodeV1Beta2Reason &&
+ machine.Status.Deletion != nil && time.Since(machine.Status.Deletion.NodeDrainStartTime.Time) > 5*time.Minute {
+ if strings.Contains(deletingCondition.Message, "cannot evict pod as it would violate the pod's disruption budget.") {
+ delayReasons.Insert("PodDisruptionBudgets")
+ }
+ if strings.Contains(deletingCondition.Message, "deletionTimestamp set, but still not removed from the Node") {
+ delayReasons.Insert("Pods not terminating")
+ }
+ if strings.Contains(deletingCondition.Message, "failed to evict Pod") {
+ delayReasons.Insert("Pod eviction errors")
+ }
+ }
}
}

@@ -510,7 +528,16 @@ func aggregateStaleMachines(machines collections.Machines) string {
} else {
message += " are "
}
message += "in deletion since more than 30m"
message += "in deletion since more than 15m"
if len(delayReasons) > 0 {
reasonList := []string{}
for _, r := range []string{"PodDisruptionBudgets", "Pods not terminating", "Pod eviction errors"} {
if delayReasons.Has(r) {
reasonList = append(reasonList, r)
}
}
message += fmt.Sprintf(", delay likely due to %s", strings.Join(reasonList, ", "))
}

return message
}
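Both copies of aggregateStaleMachines (here and in the KubeadmControlPlane controller above, kept in sync by this commit) rely on the same idiom: a sets.Set[string] deduplicates delay reasons across Machines, while a fixed-order slice drives the final iteration, because ranging over the set (a Go map underneath) would randomize the order on every reconcile. A condensed sketch of that idiom, assuming k8s.io/apimachinery is available on the module path; the sample inserts are illustrative:

package main

import (
	"fmt"
	"strings"

	"k8s.io/apimachinery/pkg/util/sets"
)

func main() {
	delayReasons := sets.Set[string]{}
	// Inserts arrive in arbitrary order, possibly repeated across Machines;
	// the set keeps one copy of each reason.
	delayReasons.Insert("Pod eviction errors")
	delayReasons.Insert("PodDisruptionBudgets")
	delayReasons.Insert("PodDisruptionBudgets")

	// Iterate a fixed-order slice so the condition message is deterministic.
	reasonList := []string{}
	for _, r := range []string{"PodDisruptionBudgets", "Pods not terminating", "Pod eviction errors"} {
		if delayReasons.Has(r) {
			reasonList = append(reasonList, r)
		}
	}
	fmt.Printf("delay likely due to %s\n", strings.Join(reasonList, ", "))
	// Output: delay likely due to PodDisruptionBudgets, Pod eviction errors
}
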
@@ -518,10 +518,11 @@ func Test_setScalingDownCondition(t *testing.T) {
},
getAndAdoptMachineSetsForDeploymentSucceeded: true,
expectCondition: metav1.Condition{
- Type:    clusterv1.MachineDeploymentScalingDownV1Beta2Condition,
- Status:  metav1.ConditionTrue,
- Reason:  clusterv1.MachineDeploymentScalingDownV1Beta2Reason,
- Message: "Scaling down from 2 to 1 replicas and Machine stale-machine-1 is in deletion since more than 30m",
+ Type:    clusterv1.MachineDeploymentScalingDownV1Beta2Condition,
+ Status:  metav1.ConditionTrue,
+ Reason:  clusterv1.MachineDeploymentScalingDownV1Beta2Reason,
+ Message: "Scaling down from 2 to 1 replicas\n" +
+ "* Machine stale-machine-1 is in deletion since more than 15m",
},
},
{
@@ -539,10 +540,11 @@ func Test_setScalingDownCondition(t *testing.T) {
},
getAndAdoptMachineSetsForDeploymentSucceeded: true,
expectCondition: metav1.Condition{
- Type:    clusterv1.MachineDeploymentScalingDownV1Beta2Condition,
- Status:  metav1.ConditionTrue,
- Reason:  clusterv1.MachineDeploymentScalingDownV1Beta2Reason,
- Message: "Scaling down from 4 to 1 replicas and Machines stale-machine-1, stale-machine-2, stale-machine-3 are in deletion since more than 30m",
+ Type:    clusterv1.MachineDeploymentScalingDownV1Beta2Condition,
+ Status:  metav1.ConditionTrue,
+ Reason:  clusterv1.MachineDeploymentScalingDownV1Beta2Reason,
+ Message: "Scaling down from 4 to 1 replicas\n" +
+ "* Machines stale-machine-1, stale-machine-2, stale-machine-3 are in deletion since more than 15m",
},
},
{
@@ -562,10 +564,11 @@ func Test_setScalingDownCondition(t *testing.T) {
},
getAndAdoptMachineSetsForDeploymentSucceeded: true,
expectCondition: metav1.Condition{
- Type:    clusterv1.MachineDeploymentScalingDownV1Beta2Condition,
- Status:  metav1.ConditionTrue,
- Reason:  clusterv1.MachineDeploymentScalingDownV1Beta2Reason,
- Message: "Scaling down from 6 to 1 replicas and Machines stale-machine-1, stale-machine-2, stale-machine-3, ... (2 more) are in deletion since more than 30m",
+ Type:    clusterv1.MachineDeploymentScalingDownV1Beta2Condition,
+ Status:  metav1.ConditionTrue,
+ Reason:  clusterv1.MachineDeploymentScalingDownV1Beta2Reason,
+ Message: "Scaling down from 6 to 1 replicas\n" +
+ "* Machines stale-machine-1, stale-machine-2, stale-machine-3, ... (2 more) are in deletion since more than 15m",
},
},
{
@@ -1072,10 +1075,11 @@ func Test_setDeletingCondition(t *testing.T) {
},
getMachinesSucceeded: true,
expectCondition: metav1.Condition{
- Type:    clusterv1.MachineDeploymentDeletingV1Beta2Condition,
- Status:  metav1.ConditionTrue,
- Reason:  clusterv1.MachineDeploymentDeletingV1Beta2Reason,
- Message: "Deleting 1 Machine and Machine m1 is in deletion since more than 30m",
+ Type:    clusterv1.MachineDeploymentDeletingV1Beta2Condition,
+ Status:  metav1.ConditionTrue,
+ Reason:  clusterv1.MachineDeploymentDeletingV1Beta2Reason,
+ Message: "Deleting 1 Machine\n" +
+ "* Machine m1 is in deletion since more than 15m",
},
},
{
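Beyond the 30m-to-15m threshold change, these expectations also pin down a formatting change: stale-Machine details move from an inline " and ..." clause to a "\n* " bullet, matching the list style used by other v1beta2 condition messages. Rendered, the first expectation above reads:

Scaling down from 2 to 1 replicas
* Machine stale-machine-1 is in deletion since more than 15m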