From e81c86f59095e29adb2a2676cff2f927f42b9d86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Hern=C3=A1ndez?= Date: Wed, 15 Sep 2021 10:46:06 +0200 Subject: [PATCH] [EOS-5172] Capsule error 500 when deployed with more than one replica --- CHANGELOG.md | 1 + controllers/secret/tls.go | 40 +++++++++++++++++++++++++++++++++++---- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 54d71754..6075420d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## 0.1.0 (August 19, 2021) +* [EOS-5172] Capsule error 500 when deployed with more than one replica * Adapt to Stratio CICD flow * Add system-user-group as exception of capsule-user-group * Using v0.0.5 tag from upstream as base \ No newline at end of file diff --git a/controllers/secret/tls.go b/controllers/secret/tls.go index a2a41057..513adce0 100644 --- a/controllers/secret/tls.go +++ b/controllers/secret/tls.go @@ -21,12 +21,14 @@ import ( "context" "crypto/x509" "encoding/pem" - "syscall" + "fmt" + "os" "time" "github.com/go-logr/logr" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" @@ -82,7 +84,7 @@ func (r TLSReconciler) Reconcile(ctx context.Context, request ctrl.Request) (ctr r.Log.Info("Missing Capsule TLS certificate") rq = 6 * 30 * 24 * time.Hour - opts := cert.NewCertOpts(time.Now().Add(rq), "capsule-webhook-service.capsule-system.svc") + opts := cert.NewCertOpts(time.Now().Add(rq), fmt.Sprintf("capsule-webhook-service.%s.svc", r.Namespace)) var crt, key *bytes.Buffer crt, key, err = ca.GenerateCertificate(opts) if err != nil { @@ -124,8 +126,38 @@ func (r TLSReconciler) Reconcile(ctx context.Context, request ctrl.Request) (ctr } if instance.Name == tlsSecretName && res == controllerutil.OperationResultUpdated { - r.Log.Info("Capsule TLS certificates 
has been updated, we need to restart the Controller") - _ = syscall.Kill(syscall.Getpid(), syscall.SIGINT) + r.Log.Info("Capsule TLS certificates has been updated, Controller pods must be restarted to load new certificate") + + hostname, _ := os.Hostname() + leaderPod := &corev1.Pod{} + if err = r.Client.Get(ctx, types.NamespacedName{Namespace: os.Getenv("NAMESPACE"), Name: hostname}, leaderPod); err != nil { + r.Log.Error(err, "cannot retrieve the leader Pod, probably running in out of the cluster mode") + + return reconcile.Result{}, nil + } + + podList := &corev1.PodList{} + if err = r.Client.List(ctx, podList, client.MatchingLabels(leaderPod.ObjectMeta.Labels)); err != nil { + r.Log.Error(err, "cannot retrieve list of Capsule pods requiring restart upon TLS update") + + return reconcile.Result{}, nil + } + + for _, p := range podList.Items { + nonLeaderPod := p + // Skipping this Pod, must be deleted at the end + if nonLeaderPod.GetName() == leaderPod.GetName() { + continue + } + + if err = r.Client.Delete(ctx, &nonLeaderPod); err != nil { + r.Log.Error(err, "cannot delete the non-leader Pod due to TLS update") + } + } + + if err = r.Client.Delete(ctx, leaderPod); err != nil { + r.Log.Error(err, "cannot delete the leader Pod due to TLS update") + } } r.Log.Info("Reconciliation completed, processing back in " + rq.String())