From 037b9e24ffec27a2e9ab045d74409ff3e3aff014 Mon Sep 17 00:00:00 2001 From: Nico Schieder Date: Thu, 30 Jan 2025 02:38:52 +0100 Subject: [PATCH] UPSTREAM: 1663: Recommended leaderelection setting (#1663) Extensive e2e tests revealed that operator-controller might run into leader election timeouts during cluster bootstrap, causing sporadic alerts being generated. This commit uses recommended settings for leaderelection LeaseDuration: 15s -> 137s RenewDeadline: 10s -> 107s RetryPeriod: 2s -> 26s Warning: This will increase potential down-time of catalogd to 163s in the worst case (up from 17s). (LeaseDuration + RetryPeriod) --- catalogd/cmd/catalogd/main.go | 11 +++++++++-- cmd/operator-controller/main.go | 9 ++++++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/catalogd/cmd/catalogd/main.go b/catalogd/cmd/catalogd/main.go index 77698444c..35854aeae 100644 --- a/catalogd/cmd/catalogd/main.go +++ b/catalogd/cmd/catalogd/main.go @@ -42,6 +42,7 @@ import ( _ "k8s.io/client-go/plugin/pkg/client/auth" "k8s.io/klog/v2" "k8s.io/klog/v2/textlogger" + "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" crcache "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/certwatcher" @@ -231,8 +232,14 @@ func main() { HealthProbeBindAddress: probeAddr, LeaderElection: enableLeaderElection, LeaderElectionID: "catalogd-operator-lock", - WebhookServer: webhookServer, - Cache: cacheOptions, + // Recommended Leader Election values + // https://github.com/openshift/enhancements/blob/61581dcd985130357d6e4b0e72b87ee35394bf6e/CONVENTIONS.md#handling-kube-apiserver-disruption + LeaseDuration: ptr.To(137 * time.Second), + RenewDeadline: ptr.To(107 * time.Second), + RetryPeriod: ptr.To(26 * time.Second), + + WebhookServer: webhookServer, + Cache: cacheOptions, }) if err != nil { setupLog.Error(err, "unable to create manager") diff --git a/cmd/operator-controller/main.go b/cmd/operator-controller/main.go index e45a130f7..6ce04026c 100644 --- a/cmd/operator-controller/main.go +++ b/cmd/operator-controller/main.go @@ -40,6 +40,7 @@ import ( _ "k8s.io/client-go/plugin/pkg/client/auth" "k8s.io/klog/v2" "k8s.io/klog/v2/textlogger" + "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" crcache "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/certwatcher" @@ -229,7 +230,13 @@ func main() { HealthProbeBindAddress: probeAddr, LeaderElection: enableLeaderElection, LeaderElectionID: "9c4404e7.operatorframework.io", - Cache: cacheOptions, + // Recommended Leader Election values + // https://github.com/openshift/enhancements/blob/61581dcd985130357d6e4b0e72b87ee35394bf6e/CONVENTIONS.md#handling-kube-apiserver-disruption + LeaseDuration: ptr.To(137 * time.Second), + RenewDeadline: ptr.To(107 * time.Second), + RetryPeriod: ptr.To(26 * time.Second), + + Cache: cacheOptions, // LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily // when the Manager ends. This requires the binary to immediately end when the // Manager is stopped, otherwise, this setting is unsafe. Setting this significantly