From 08e15d3973511fa6f91f6c1a3a8f8f176110b765 Mon Sep 17 00:00:00 2001 From: Maskym Vavilov Date: Thu, 24 Oct 2024 15:22:35 +0100 Subject: [PATCH] report healtchecks into DNSRecord status Signed-off-by: Maskym Vavilov --- api/v1alpha1/conditions.go | 7 ++ internal/controller/dnsrecord_controller.go | 88 +++++++++++++------ internal/controller/dnsrecord_healthchecks.go | 4 +- .../controller/dnsrecord_healthchecks_test.go | 41 ++++++++- 4 files changed, 105 insertions(+), 35 deletions(-) diff --git a/api/v1alpha1/conditions.go b/api/v1alpha1/conditions.go index db246752..6d63524f 100644 --- a/api/v1alpha1/conditions.go +++ b/api/v1alpha1/conditions.go @@ -4,3 +4,10 @@ type ConditionType string type ConditionReason string const ConditionTypeReady ConditionType = "Ready" +const ConditionReasonProviderSuccess ConditionReason = "ProviderSuccess" +const ConditionReasonAwaitingValidation ConditionReason = "AwaitingValidation" + +const ConditionTypeHealthy ConditionType = "Healthy" +const ConditionReasonHealthy ConditionReason = "AllChecksPassed" +const ConditionReasonPartiallyHealthy ConditionReason = "SomeChecksPassed" +const ConditionReasonUnhealthy ConditionReason = "HealthChecksFailed" diff --git a/internal/controller/dnsrecord_controller.go b/internal/controller/dnsrecord_controller.go index 195a9884..5f4bab38 100644 --- a/internal/controller/dnsrecord_controller.go +++ b/internal/controller/dnsrecord_controller.go @@ -125,7 +125,7 @@ func (r *DNSRecordReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( reason := "DNSProviderError" message := fmt.Sprintf("The dns provider could not be loaded: %v", err) setDNSRecordCondition(dnsRecord, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, reason, message) - return r.updateStatus(ctx, previous, dnsRecord, false, err) + return r.updateStatus(ctx, previous, dnsRecord, false, false, err) } if probesEnabled { @@ -174,7 +174,7 @@ func (r *DNSRecordReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( logger.Error(err, "Failed to validate record") setDNSRecordCondition(dnsRecord, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, "ValidationError", fmt.Sprintf("validation of DNSRecord failed: %v", err)) - return r.updateStatus(ctx, previous, dnsRecord, false, err) + return r.updateStatus(ctx, previous, dnsRecord, false, false, err) } //Ensure an Owner ID has been assigned to the record (OwnerID set in the status) @@ -197,14 +197,14 @@ func (r *DNSRecordReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( if err != nil { setDNSRecordCondition(dnsRecord, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, "DNSProviderError", fmt.Sprintf("The dns provider could not be loaded: %v", err)) - return r.updateStatus(ctx, previous, dnsRecord, false, err) + return r.updateStatus(ctx, previous, dnsRecord, false, false, err) } z, err := p.DNSZoneForHost(ctx, dnsRecord.Spec.RootHost) if err != nil { setDNSRecordCondition(dnsRecord, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, "DNSProviderError", fmt.Sprintf("Unable to find suitable zone in provider: %v", provider.SanitizeError(err))) - return r.updateStatus(ctx, previous, dnsRecord, false, err) + return r.updateStatus(ctx, previous, dnsRecord, false, false, err) } //Add zone id/domainName to status @@ -220,16 +220,16 @@ func (r *DNSRecordReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( if err != nil { setDNSRecordCondition(dnsRecord, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, "DNSProviderError", fmt.Sprintf("The dns provider could not be loaded: %v", err)) - return r.updateStatus(ctx, previous, dnsRecord, false, err) + return r.updateStatus(ctx, previous, dnsRecord, false, false, err) } // Publish the record - hadChanges, err := r.publishRecord(ctx, dnsRecord, dnsProvider) + hadChanges, hasHealthyProbes, err := r.publishRecord(ctx, dnsRecord, dnsProvider) if err != nil { logger.Error(err, "Failed to publish record") setDNSRecordCondition(dnsRecord, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, "ProviderError", fmt.Sprintf("The DNS provider failed to ensure the record: %v", provider.SanitizeError(err))) - return r.updateStatus(ctx, previous, dnsRecord, hadChanges, err) + return r.updateStatus(ctx, previous, dnsRecord, hadChanges, hasHealthyProbes, err) } if probesEnabled { @@ -238,7 +238,7 @@ func (r *DNSRecordReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( } } - return r.updateStatus(ctx, previous, dnsRecord, hadChanges, nil) + return r.updateStatus(ctx, previous, dnsRecord, hadChanges, hasHealthyProbes, nil) } // setLogger Updates the given Logger with record/zone metadata from the given DNSRecord. @@ -252,7 +252,7 @@ func (r *DNSRecordReconciler) setLogger(ctx context.Context, logger logr.Logger, return log.IntoContext(ctx, logger), logger } -func (r *DNSRecordReconciler) updateStatus(ctx context.Context, previous, current *v1alpha1.DNSRecord, hadChanges bool, specErr error) (reconcile.Result, error) { +func (r *DNSRecordReconciler) updateStatus(ctx context.Context, previous, current *v1alpha1.DNSRecord, hadChanges, hasHealthyProbes bool, specErr error) (reconcile.Result, error) { var requeueTime time.Duration logger := log.FromContext(ctx) @@ -283,7 +283,6 @@ func (r *DNSRecordReconciler) updateStatus(ctx context.Context, previous, curren logger.V(1).Info("Changes needed on the same generation of record") } requeueTime = randomizedValidationRequeue - setDNSRecordCondition(current, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, "AwaitingValidation", "Awaiting validation") } else { logger.Info("All records are already up to date") // reset the valid for from randomized value to a fixed value once validation succeeds @@ -293,9 +292,10 @@ func (r *DNSRecordReconciler) updateStatus(ctx context.Context, previous, curren // uses current.Status.ValidFor as the last requeue duration. Double it. requeueTime = exponentialRequeueTime(current.Status.ValidFor) } - setDNSRecordCondition(current, string(v1alpha1.ConditionTypeReady), metav1.ConditionTrue, "ProviderSuccess", "Provider ensured the dns record") } + setStatusConditions(current, hadChanges, hasHealthyProbes) + // valid for is always a requeue time current.Status.ValidFor = requeueTime.String() @@ -365,7 +365,7 @@ func (r *DNSRecordReconciler) SetupWithManager(mgr ctrl.Manager, maxRequeue, val func (r *DNSRecordReconciler) deleteRecord(ctx context.Context, dnsRecord *v1alpha1.DNSRecord, dnsProvider provider.Provider) (bool, error) { logger := log.FromContext(ctx) - hadChanges, err := r.applyChanges(ctx, dnsRecord, dnsProvider, true) + hadChanges, _, err := r.applyChanges(ctx, dnsRecord, dnsProvider, true) if err != nil { if strings.Contains(err.Error(), "was not found") || strings.Contains(err.Error(), "notFound") { logger.Info("Record not found in zone, continuing") @@ -382,21 +382,22 @@ func (r *DNSRecordReconciler) deleteRecord(ctx context.Context, dnsRecord *v1alp } // publishRecord publishes record(s) to the DNSPRovider(i.e. route53) zone (dnsRecord.Status.ZoneID). -func (r *DNSRecordReconciler) publishRecord(ctx context.Context, dnsRecord *v1alpha1.DNSRecord, dnsProvider provider.Provider) (bool, error) { +// returns if it had changes, if record is healthy and an error. If had no changes - the healthy bool can be ignored +func (r *DNSRecordReconciler) publishRecord(ctx context.Context, dnsRecord *v1alpha1.DNSRecord, dnsProvider provider.Provider) (bool, bool, error) { logger := log.FromContext(ctx) if prematurely, _ := recordReceivedPrematurely(dnsRecord); prematurely { logger.V(1).Info("Skipping DNSRecord - is still valid") - return false, nil + return false, true, nil } - hadChanges, err := r.applyChanges(ctx, dnsRecord, dnsProvider, false) + hadChanges, isHealthy, err := r.applyChanges(ctx, dnsRecord, dnsProvider, false) if err != nil { - return hadChanges, err + return hadChanges, isHealthy, err } logger.Info("Published DNSRecord to zone") - return hadChanges, nil + return hadChanges, isHealthy, nil } // recordReceivedPrematurely returns true if current reconciliation loop started before @@ -432,6 +433,35 @@ func exponentialRequeueTime(lastRequeueTime string) time.Duration { return newRequeue } +// setStatusConditions sets healthy and ready condition on given DNSRecord +func setStatusConditions(record *v1alpha1.DNSRecord, hadChanges, hasHealthyProbes bool) { + // we get here only when spec err is nil - can trust hadChanges bool + + // give precedence to AwaitingValidation condition + if hadChanges { + setDNSRecordCondition(record, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, string(v1alpha1.ConditionReasonAwaitingValidation), "Awaiting validation") + return + } + + // at least one of the probes is healthy + if hasHealthyProbes { + if len(record.Spec.Endpoints) == len(record.Status.Endpoints) { + // all probes are healthy + setDNSRecordCondition(record, string(v1alpha1.ConditionTypeHealthy), metav1.ConditionTrue, string(v1alpha1.ConditionReasonHealthy), "All healthchecks succeeded") + } else { + // at least one of the probes is healthy + setDNSRecordCondition(record, string(v1alpha1.ConditionTypeHealthy), metav1.ConditionFalse, string(v1alpha1.ConditionReasonPartiallyHealthy), "At least one healthcheck succeeded") + } + // in any case - hostname will resolve so the record is ready + setDNSRecordCondition(record, string(v1alpha1.ConditionTypeReady), metav1.ConditionTrue, string(v1alpha1.ConditionReasonProviderSuccess), "Provider ensured the dns record") + return + } + // none of the probes is healthy - mirror the healthy condition + setDNSRecordCondition(record, string(v1alpha1.ConditionTypeHealthy), metav1.ConditionFalse, string(v1alpha1.ConditionReasonUnhealthy), "None of the healthchecks succeeded") + setDNSRecordCondition(record, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, string(v1alpha1.ConditionReasonUnhealthy), "None of the healthchecks succeeded") + +} + // setDNSRecordCondition adds or updates a given condition in the DNSRecord status. func setDNSRecordCondition(dnsRecord *v1alpha1.DNSRecord, conditionType string, status metav1.ConditionStatus, reason, message string) { cond := metav1.Condition{ @@ -469,7 +499,7 @@ func (r *DNSRecordReconciler) getDNSProvider(ctx context.Context, dnsRecord *v1a // applyChanges creates the Plan and applies it to the registry. Returns true only if the Plan had no errors and there were changes to apply. // The error is nil only if the changes were successfully applied or there were no changes to be made. -func (r *DNSRecordReconciler) applyChanges(ctx context.Context, dnsRecord *v1alpha1.DNSRecord, dnsProvider provider.Provider, isDelete bool) (bool, error) { +func (r *DNSRecordReconciler) applyChanges(ctx context.Context, dnsRecord *v1alpha1.DNSRecord, dnsProvider provider.Provider, isDelete bool) (bool, bool, error) { logger := log.FromContext(ctx) rootDomainName := dnsRecord.Spec.RootHost zoneDomainFilter := externaldnsendpoint.NewDomainFilter([]string{dnsRecord.Status.ZoneDomainName}) @@ -480,13 +510,13 @@ func (r *DNSRecordReconciler) applyChanges(ctx context.Context, dnsRecord *v1alp dnsRecord.Status.OwnerID, txtRegistryCacheInterval, txtRegistryWildcardReplacement, managedDNSRecordTypes, excludeDNSRecordTypes, txtRegistryEncryptEnabled, []byte(txtRegistryEncryptAESKey)) if err != nil { - return false, err + return false, false, err } policyID := "sync" policy, exists := externaldnsplan.Policies[policyID] if !exists { - return false, fmt.Errorf("unknown policy: %s", policyID) + return false, false, fmt.Errorf("unknown policy: %s", policyID) } //If we are deleting set the expected endpoints to an empty array @@ -497,25 +527,25 @@ func (r *DNSRecordReconciler) applyChanges(ctx context.Context, dnsRecord *v1alp //zoneEndpoints = Records in the current dns provider zone zoneEndpoints, err := registry.Records(ctx) if err != nil { - return false, err + return false, false, err } //specEndpoints = Records that this DNSRecord expects to exist specEndpoints, err := registry.AdjustEndpoints(dnsRecord.Spec.Endpoints) if err != nil { - return false, fmt.Errorf("adjusting specEndpoints: %w", err) + return false, false, fmt.Errorf("adjusting specEndpoints: %w", err) } - //healthySpecEndpoints = Records that this DNSRecord expects to exist, that do not have matching unhealthy probes - healthySpecEndpoints, _, err := r.removeUnhealthyEndpoints(ctx, specEndpoints, dnsRecord) + // healthySpecEndpoints = Records that this DNSRecord expects to exist, that do not have matching unhealthy probes + healthySpecEndpoints, hasHealthyEPs, err := r.removeUnhealthyEndpoints(ctx, specEndpoints, dnsRecord) if err != nil { - return false, fmt.Errorf("removing unhealthy specEndpoints: %w", err) + return false, false, fmt.Errorf("removing unhealthy specEndpoints: %w", err) } //statusEndpoints = Records that were created/updated by this DNSRecord last statusEndpoints, err := registry.AdjustEndpoints(dnsRecord.Status.Endpoints) if err != nil { - return false, fmt.Errorf("adjusting statusEndpoints: %w", err) + return false, false, fmt.Errorf("adjusting statusEndpoints: %w", err) } // add related endpoints to the record @@ -534,16 +564,16 @@ func (r *DNSRecordReconciler) applyChanges(ctx context.Context, dnsRecord *v1alp plan = plan.Calculate() if err = plan.Error(); err != nil { - return false, err + return false, hasHealthyEPs, err } dnsRecord.Status.DomainOwners = plan.Owners dnsRecord.Status.Endpoints = healthySpecEndpoints if plan.Changes.HasChanges() { logger.Info("Applying changes") err = registry.ApplyChanges(ctx, plan.Changes) - return true, err + return true, hasHealthyEPs, err } - return false, nil + return false, hasHealthyEPs, nil } // filterEndpoints takes a list of zoneEndpoints and removes from it all endpoints diff --git a/internal/controller/dnsrecord_healthchecks.go b/internal/controller/dnsrecord_healthchecks.go index fe72748d..9e91831e 100644 --- a/internal/controller/dnsrecord_healthchecks.go +++ b/internal/controller/dnsrecord_healthchecks.go @@ -144,11 +144,9 @@ func (r *DNSRecordReconciler) removeUnhealthyEndpoints(ctx context.Context, endp } - healthyEndpoints := *common.ToEndpoints(tree, ptr.To([]*endpoint.Endpoint{})) - // if at least one of the leaf probes was healthy return healthy probes if haveHealthyProbes { - return healthyEndpoints, true, nil + return *common.ToEndpoints(tree, ptr.To([]*endpoint.Endpoint{})), true, nil } // if none of the probes are healthy - don't modify endpoints return endpoints, false, nil diff --git a/internal/controller/dnsrecord_healthchecks_test.go b/internal/controller/dnsrecord_healthchecks_test.go index 7470ad91..5c145211 100644 --- a/internal/controller/dnsrecord_healthchecks_test.go +++ b/internal/controller/dnsrecord_healthchecks_test.go @@ -66,7 +66,7 @@ var _ = Describe("DNSRecordReconciler_HealthChecks", func() { ContainElement(MatchFields(IgnoreExtras, Fields{ "Type": Equal(string(v1alpha1.ConditionTypeReady)), "Status": Equal(metav1.ConditionTrue), - "Reason": Equal("ProviderSuccess"), + "Reason": Equal(string(v1alpha1.ConditionReasonProviderSuccess)), "Message": Equal("Provider ensured the dns record"), })), ) @@ -183,6 +183,18 @@ var _ = Describe("DNSRecordReconciler_HealthChecks", func() { It("Should remove unhealthy endpoints", func() { //Create default test dnsrecord Expect(k8sClient.Create(ctx, dnsRecord)).To(Succeed()) + Eventually(func(g Gomega) { + err := k8sClient.Get(ctx, client.ObjectKeyFromObject(dnsRecord), dnsRecord) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(dnsRecord.Status.Conditions).To( + ContainElement(MatchFields(IgnoreExtras, Fields{ + "Type": Equal(string(v1alpha1.ConditionTypeHealthy)), + "Status": Equal(metav1.ConditionTrue), + "Reason": Equal(string(v1alpha1.ConditionReasonHealthy)), + "Message": Equal("All healthchecks succeeded"), + })), + ) + }, TestTimeoutMedium, time.Second).Should(Succeed()) By("Making one of the probes unhealthy") Eventually(func(g Gomega) { @@ -210,7 +222,7 @@ var _ = Describe("DNSRecordReconciler_HealthChecks", func() { g.Expect(updated).To(BeTrue()) }, TestTimeoutMedium, time.Second).Should(Succeed()) - By("Ensure unhealthy endpoints are gone") + By("Ensure unhealthy endpoints are gone and status is updated") Eventually(func(g Gomega) { Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(dnsRecord), dnsRecord)).To(Succeed()) @@ -233,6 +245,14 @@ var _ = Describe("DNSRecordReconciler_HealthChecks", func() { "ProviderSpecific": Equal(externaldnsendpoint.ProviderSpecific{{Name: "geo-code", Value: "GEO-EU"}}), })), )) + g.Expect(dnsRecord.Status.Conditions).To( + ContainElement(MatchFields(IgnoreExtras, Fields{ + "Type": Equal(string(v1alpha1.ConditionTypeHealthy)), + "Status": Equal(metav1.ConditionFalse), + "Reason": Equal(string(v1alpha1.ConditionReasonPartiallyHealthy)), + "Message": Equal("At least one healthcheck succeeded"), + })), + ) }, TestTimeoutMedium, time.Second).Should(Succeed()) @@ -261,11 +281,26 @@ var _ = Describe("DNSRecordReconciler_HealthChecks", func() { }, TestTimeoutMedium, time.Second).Should(Succeed()) // we don't remove EPs if this leads to empty EPs - By("Ensure endpoints are not changed ") + By("Ensure endpoints are not changed and status is updated") Eventually(func(g Gomega) { record := &v1alpha1.DNSRecord{} Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(dnsRecord), record)).To(Succeed()) g.Expect(dnsRecord.Status.Endpoints).To(BeEquivalentTo(record.Status.Endpoints)) + + g.Expect(record.Status.Conditions).To(ContainElements( + MatchFields(IgnoreExtras, Fields{ + "Type": Equal(string(v1alpha1.ConditionTypeHealthy)), + "Status": Equal(metav1.ConditionFalse), + "Reason": Equal(string(v1alpha1.ConditionReasonUnhealthy)), + "Message": Equal("None of the healthchecks succeeded"), + }), + MatchFields(IgnoreExtras, Fields{ + "Type": Equal(string(v1alpha1.ConditionTypeReady)), + "Status": Equal(metav1.ConditionFalse), + "Reason": Equal(string(v1alpha1.ConditionReasonUnhealthy)), + "Message": Equal("None of the healthchecks succeeded"), + }), + )) }, TestTimeoutMedium, time.Second).Should(Succeed()) })