From 037b9e24ffec27a2e9ab045d74409ff3e3aff014 Mon Sep 17 00:00:00 2001
From: Nico Schieder <nschieder@redhat.com>
Date: Thu, 30 Jan 2025 02:38:52 +0100
Subject: [PATCH] UPSTREAM: 1663: Recommended leaderelection setting (#1663)

Extensive e2e tests revealed that operator-controller might run into
leader election timeouts during cluster bootstrap, causing sporadic
alerts to be generated.

This commit uses the recommended settings for leader election:
LeaseDuration: 15s -> 137s
RenewDeadline: 10s -> 107s
RetryPeriod:    2s ->  26s

Warning: This will increase the potential downtime of catalogd and
operator-controller to 163s in the worst case (up from 17s), i.e.
LeaseDuration + RetryPeriod.
---
 catalogd/cmd/catalogd/main.go   | 11 +++++++++--
 cmd/operator-controller/main.go |  9 ++++++++-
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/catalogd/cmd/catalogd/main.go b/catalogd/cmd/catalogd/main.go
index 77698444c..35854aeae 100644
--- a/catalogd/cmd/catalogd/main.go
+++ b/catalogd/cmd/catalogd/main.go
@@ -42,6 +42,7 @@ import (
 	_ "k8s.io/client-go/plugin/pkg/client/auth"
 	"k8s.io/klog/v2"
 	"k8s.io/klog/v2/textlogger"
+	"k8s.io/utils/ptr"
 	ctrl "sigs.k8s.io/controller-runtime"
 	crcache "sigs.k8s.io/controller-runtime/pkg/cache"
 	"sigs.k8s.io/controller-runtime/pkg/certwatcher"
@@ -231,8 +232,14 @@ func main() {
 		HealthProbeBindAddress: probeAddr,
 		LeaderElection:         enableLeaderElection,
 		LeaderElectionID:       "catalogd-operator-lock",
-		WebhookServer:          webhookServer,
-		Cache:                  cacheOptions,
+		// Recommended Leader Election values
+		// https://github.com/openshift/enhancements/blob/61581dcd985130357d6e4b0e72b87ee35394bf6e/CONVENTIONS.md#handling-kube-apiserver-disruption
+		LeaseDuration: ptr.To(137 * time.Second),
+		RenewDeadline: ptr.To(107 * time.Second),
+		RetryPeriod:   ptr.To(26 * time.Second),
+
+		WebhookServer: webhookServer,
+		Cache:         cacheOptions,
 	})
 	if err != nil {
 		setupLog.Error(err, "unable to create manager")
diff --git a/cmd/operator-controller/main.go b/cmd/operator-controller/main.go
index e45a130f7..6ce04026c 100644
--- a/cmd/operator-controller/main.go
+++ b/cmd/operator-controller/main.go
@@ -40,6 +40,7 @@ import (
 	_ "k8s.io/client-go/plugin/pkg/client/auth"
 	"k8s.io/klog/v2"
 	"k8s.io/klog/v2/textlogger"
+	"k8s.io/utils/ptr"
 	ctrl "sigs.k8s.io/controller-runtime"
 	crcache "sigs.k8s.io/controller-runtime/pkg/cache"
 	"sigs.k8s.io/controller-runtime/pkg/certwatcher"
@@ -229,7 +230,13 @@ func main() {
 		HealthProbeBindAddress: probeAddr,
 		LeaderElection:         enableLeaderElection,
 		LeaderElectionID:       "9c4404e7.operatorframework.io",
-		Cache:                  cacheOptions,
+		// Recommended Leader Election values
+		// https://github.com/openshift/enhancements/blob/61581dcd985130357d6e4b0e72b87ee35394bf6e/CONVENTIONS.md#handling-kube-apiserver-disruption
+		LeaseDuration: ptr.To(137 * time.Second),
+		RenewDeadline: ptr.To(107 * time.Second),
+		RetryPeriod:   ptr.To(26 * time.Second),
+
+		Cache: cacheOptions,
 		// LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily
 		// when the Manager ends. This requires the binary to immediately end when the
 		// Manager is stopped, otherwise, this setting is unsafe. Setting this significantly