From 4bfff2eecd2df8ca104f3fa7a245bff96c37c88d Mon Sep 17 00:00:00 2001 From: Maskym Vavilov Date: Thu, 17 Oct 2024 12:13:47 +0100 Subject: [PATCH] report healtchecks into DNSRecord status Signed-off-by: Maskym Vavilov --- api/v1alpha1/conditions.go | 7 ++ internal/controller/dnsrecord_controller.go | 100 +++++++++++++----- .../controller/dnsrecord_controller_test.go | 2 +- internal/controller/dnsrecord_healthchecks.go | 2 +- .../controller/dnsrecord_healthchecks_test.go | 34 +++++- .../geo-dnsrecord-healthchecks.yaml | 5 - 6 files changed, 109 insertions(+), 41 deletions(-) diff --git a/api/v1alpha1/conditions.go b/api/v1alpha1/conditions.go index db246752..6d63524f 100644 --- a/api/v1alpha1/conditions.go +++ b/api/v1alpha1/conditions.go @@ -4,3 +4,10 @@ type ConditionType string type ConditionReason string const ConditionTypeReady ConditionType = "Ready" +const ConditionReasonProviderSuccess ConditionReason = "ProviderSuccess" +const ConditionReasonAwaitingValidation ConditionReason = "AwaitingValidation" + +const ConditionTypeHealthy ConditionType = "Healthy" +const ConditionReasonHealthy ConditionReason = "AllChecksPassed" +const ConditionReasonPartiallyHealthy ConditionReason = "SomeChecksPassed" +const ConditionReasonUnhealthy ConditionReason = "HealthChecksFailed" diff --git a/internal/controller/dnsrecord_controller.go b/internal/controller/dnsrecord_controller.go index 195a9884..789f8232 100644 --- a/internal/controller/dnsrecord_controller.go +++ b/internal/controller/dnsrecord_controller.go @@ -125,7 +125,7 @@ func (r *DNSRecordReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( reason := "DNSProviderError" message := fmt.Sprintf("The dns provider could not be loaded: %v", err) setDNSRecordCondition(dnsRecord, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, reason, message) - return r.updateStatus(ctx, previous, dnsRecord, false, err) + return r.updateStatus(ctx, previous, dnsRecord, false, []string{}, err) } if probesEnabled { @@ -174,7 +174,7 @@ func (r *DNSRecordReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( logger.Error(err, "Failed to validate record") setDNSRecordCondition(dnsRecord, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, "ValidationError", fmt.Sprintf("validation of DNSRecord failed: %v", err)) - return r.updateStatus(ctx, previous, dnsRecord, false, err) + return r.updateStatus(ctx, previous, dnsRecord, false, []string{}, err) } //Ensure an Owner ID has been assigned to the record (OwnerID set in the status) @@ -197,14 +197,14 @@ func (r *DNSRecordReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( if err != nil { setDNSRecordCondition(dnsRecord, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, "DNSProviderError", fmt.Sprintf("The dns provider could not be loaded: %v", err)) - return r.updateStatus(ctx, previous, dnsRecord, false, err) + return r.updateStatus(ctx, previous, dnsRecord, false, []string{}, err) } z, err := p.DNSZoneForHost(ctx, dnsRecord.Spec.RootHost) if err != nil { setDNSRecordCondition(dnsRecord, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, "DNSProviderError", fmt.Sprintf("Unable to find suitable zone in provider: %v", provider.SanitizeError(err))) - return r.updateStatus(ctx, previous, dnsRecord, false, err) + return r.updateStatus(ctx, previous, dnsRecord, false, []string{}, err) } //Add zone id/domainName to status @@ -220,16 +220,16 @@ func (r *DNSRecordReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( if err != nil { setDNSRecordCondition(dnsRecord, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, "DNSProviderError", fmt.Sprintf("The dns provider could not be loaded: %v", err)) - return r.updateStatus(ctx, previous, dnsRecord, false, err) + return r.updateStatus(ctx, previous, dnsRecord, false, []string{}, err) } // Publish the record - hadChanges, err := r.publishRecord(ctx, dnsRecord, dnsProvider) + hadChanges, notHealthyProbes, err := r.publishRecord(ctx, dnsRecord, dnsProvider) if err != nil { logger.Error(err, "Failed to publish record") setDNSRecordCondition(dnsRecord, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, "ProviderError", fmt.Sprintf("The DNS provider failed to ensure the record: %v", provider.SanitizeError(err))) - return r.updateStatus(ctx, previous, dnsRecord, hadChanges, err) + return r.updateStatus(ctx, previous, dnsRecord, hadChanges, notHealthyProbes, err) } if probesEnabled { @@ -238,7 +238,7 @@ func (r *DNSRecordReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( } } - return r.updateStatus(ctx, previous, dnsRecord, hadChanges, nil) + return r.updateStatus(ctx, previous, dnsRecord, hadChanges, notHealthyProbes, nil) } // setLogger Updates the given Logger with record/zone metadata from the given DNSRecord. @@ -252,7 +252,7 @@ func (r *DNSRecordReconciler) setLogger(ctx context.Context, logger logr.Logger, return log.IntoContext(ctx, logger), logger } -func (r *DNSRecordReconciler) updateStatus(ctx context.Context, previous, current *v1alpha1.DNSRecord, hadChanges bool, specErr error) (reconcile.Result, error) { +func (r *DNSRecordReconciler) updateStatus(ctx context.Context, previous, current *v1alpha1.DNSRecord, hadChanges bool, notHealthyProbes []string, specErr error) (reconcile.Result, error) { var requeueTime time.Duration logger := log.FromContext(ctx) @@ -283,7 +283,6 @@ func (r *DNSRecordReconciler) updateStatus(ctx context.Context, previous, curren logger.V(1).Info("Changes needed on the same generation of record") } requeueTime = randomizedValidationRequeue - setDNSRecordCondition(current, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, "AwaitingValidation", "Awaiting validation") } else { logger.Info("All records are already up to date") // reset the valid for from randomized value to a fixed value once validation succeeds @@ -293,9 +292,10 @@ func (r *DNSRecordReconciler) updateStatus(ctx context.Context, previous, curren // uses current.Status.ValidFor as the last requeue duration. Double it. requeueTime = exponentialRequeueTime(current.Status.ValidFor) } - setDNSRecordCondition(current, string(v1alpha1.ConditionTypeReady), metav1.ConditionTrue, "ProviderSuccess", "Provider ensured the dns record") } + setStatusConditions(current, hadChanges, notHealthyProbes) + // valid for is always a requeue time current.Status.ValidFor = requeueTime.String() @@ -365,7 +365,7 @@ func (r *DNSRecordReconciler) SetupWithManager(mgr ctrl.Manager, maxRequeue, val func (r *DNSRecordReconciler) deleteRecord(ctx context.Context, dnsRecord *v1alpha1.DNSRecord, dnsProvider provider.Provider) (bool, error) { logger := log.FromContext(ctx) - hadChanges, err := r.applyChanges(ctx, dnsRecord, dnsProvider, true) + hadChanges, _, err := r.applyChanges(ctx, dnsRecord, dnsProvider, true) if err != nil { if strings.Contains(err.Error(), "was not found") || strings.Contains(err.Error(), "notFound") { logger.Info("Record not found in zone, continuing") @@ -382,21 +382,22 @@ func (r *DNSRecordReconciler) deleteRecord(ctx context.Context, dnsRecord *v1alp } // publishRecord publishes record(s) to the DNSPRovider(i.e. route53) zone (dnsRecord.Status.ZoneID). -func (r *DNSRecordReconciler) publishRecord(ctx context.Context, dnsRecord *v1alpha1.DNSRecord, dnsProvider provider.Provider) (bool, error) { +// returns if it had changes, if record is healthy and an error. If had no changes - the healthy bool can be ignored +func (r *DNSRecordReconciler) publishRecord(ctx context.Context, dnsRecord *v1alpha1.DNSRecord, dnsProvider provider.Provider) (bool, []string, error) { logger := log.FromContext(ctx) if prematurely, _ := recordReceivedPrematurely(dnsRecord); prematurely { logger.V(1).Info("Skipping DNSRecord - is still valid") - return false, nil + return false, []string{}, nil } - hadChanges, err := r.applyChanges(ctx, dnsRecord, dnsProvider, false) + hadChanges, notHealthyProbes, err := r.applyChanges(ctx, dnsRecord, dnsProvider, false) if err != nil { - return hadChanges, err + return hadChanges, notHealthyProbes, err } logger.Info("Published DNSRecord to zone") - return hadChanges, nil + return hadChanges, notHealthyProbes, nil } // recordReceivedPrematurely returns true if current reconciliation loop started before @@ -432,6 +433,47 @@ func exponentialRequeueTime(lastRequeueTime string) time.Duration { return newRequeue } +// setStatusConditions sets healthy and ready condition on given DNSRecord +func setStatusConditions(record *v1alpha1.DNSRecord, hadChanges bool, notHealthyProbes []string) { + // we get here only when spec err is nil - can trust hadChanges bool + + // give precedence to AwaitingValidation condition + if hadChanges { + setDNSRecordCondition(record, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, string(v1alpha1.ConditionReasonAwaitingValidation), "Awaiting validation") + return + } + setDNSRecordCondition(record, string(v1alpha1.ConditionTypeReady), metav1.ConditionTrue, string(v1alpha1.ConditionReasonProviderSuccess), "Provider ensured the dns record") + + // probes are disabled or not defined + if record.Spec.HealthCheck == nil { + return + } + + // we don't have probes yet + if cap(notHealthyProbes) == 0 { + setDNSRecordCondition(record, string(v1alpha1.ConditionTypeHealthy), metav1.ConditionFalse, string(v1alpha1.ConditionReasonUnhealthy), "Probes are creating") + setDNSRecordCondition(record, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, string(v1alpha1.ConditionReasonUnhealthy), "Probes are creating") + return + } + + // we have healthy probes + if len(notHealthyProbes) < cap(notHealthyProbes) { + if len(notHealthyProbes) == 0 { + // all probes are healthy + setDNSRecordCondition(record, string(v1alpha1.ConditionTypeHealthy), metav1.ConditionTrue, string(v1alpha1.ConditionReasonHealthy), "All healthchecks succeeded") + } else { + // at least one of the probes is healthy + setDNSRecordCondition(record, string(v1alpha1.ConditionTypeHealthy), metav1.ConditionFalse, string(v1alpha1.ConditionReasonPartiallyHealthy), fmt.Sprintf("Not healthy addresses: %s", notHealthyProbes)) + } + // in any case - hostname will resolve, so the record is ready (don't change ready condition) + return + } + // none of the probes is healthy (change ready condition to false) + setDNSRecordCondition(record, string(v1alpha1.ConditionTypeHealthy), metav1.ConditionFalse, string(v1alpha1.ConditionReasonUnhealthy), fmt.Sprintf("Not healthy addresses: %s", notHealthyProbes)) + setDNSRecordCondition(record, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, string(v1alpha1.ConditionReasonUnhealthy), "None of the healthchecks succeeded") + +} + // setDNSRecordCondition adds or updates a given condition in the DNSRecord status. func setDNSRecordCondition(dnsRecord *v1alpha1.DNSRecord, conditionType string, status metav1.ConditionStatus, reason, message string) { cond := metav1.Condition{ @@ -469,7 +511,7 @@ func (r *DNSRecordReconciler) getDNSProvider(ctx context.Context, dnsRecord *v1a // applyChanges creates the Plan and applies it to the registry. Returns true only if the Plan had no errors and there were changes to apply. // The error is nil only if the changes were successfully applied or there were no changes to be made. -func (r *DNSRecordReconciler) applyChanges(ctx context.Context, dnsRecord *v1alpha1.DNSRecord, dnsProvider provider.Provider, isDelete bool) (bool, error) { +func (r *DNSRecordReconciler) applyChanges(ctx context.Context, dnsRecord *v1alpha1.DNSRecord, dnsProvider provider.Provider, isDelete bool) (bool, []string, error) { logger := log.FromContext(ctx) rootDomainName := dnsRecord.Spec.RootHost zoneDomainFilter := externaldnsendpoint.NewDomainFilter([]string{dnsRecord.Status.ZoneDomainName}) @@ -480,13 +522,13 @@ func (r *DNSRecordReconciler) applyChanges(ctx context.Context, dnsRecord *v1alp dnsRecord.Status.OwnerID, txtRegistryCacheInterval, txtRegistryWildcardReplacement, managedDNSRecordTypes, excludeDNSRecordTypes, txtRegistryEncryptEnabled, []byte(txtRegistryEncryptAESKey)) if err != nil { - return false, err + return false, []string{}, err } policyID := "sync" policy, exists := externaldnsplan.Policies[policyID] if !exists { - return false, fmt.Errorf("unknown policy: %s", policyID) + return false, []string{}, fmt.Errorf("unknown policy: %s", policyID) } //If we are deleting set the expected endpoints to an empty array @@ -497,25 +539,25 @@ func (r *DNSRecordReconciler) applyChanges(ctx context.Context, dnsRecord *v1alp //zoneEndpoints = Records in the current dns provider zone zoneEndpoints, err := registry.Records(ctx) if err != nil { - return false, err + return false, []string{}, err } //specEndpoints = Records that this DNSRecord expects to exist specEndpoints, err := registry.AdjustEndpoints(dnsRecord.Spec.Endpoints) if err != nil { - return false, fmt.Errorf("adjusting specEndpoints: %w", err) + return false, []string{}, fmt.Errorf("adjusting specEndpoints: %w", err) } - //healthySpecEndpoints = Records that this DNSRecord expects to exist, that do not have matching unhealthy probes - healthySpecEndpoints, _, err := r.removeUnhealthyEndpoints(ctx, specEndpoints, dnsRecord) + // healthySpecEndpoints = Records that this DNSRecord expects to exist, that do not have matching unhealthy probes + healthySpecEndpoints, notHealthyProbes, err := r.removeUnhealthyEndpoints(ctx, specEndpoints, dnsRecord) if err != nil { - return false, fmt.Errorf("removing unhealthy specEndpoints: %w", err) + return false, []string{}, fmt.Errorf("removing unhealthy specEndpoints: %w", err) } //statusEndpoints = Records that were created/updated by this DNSRecord last statusEndpoints, err := registry.AdjustEndpoints(dnsRecord.Status.Endpoints) if err != nil { - return false, fmt.Errorf("adjusting statusEndpoints: %w", err) + return false, []string{}, fmt.Errorf("adjusting statusEndpoints: %w", err) } // add related endpoints to the record @@ -534,16 +576,16 @@ func (r *DNSRecordReconciler) applyChanges(ctx context.Context, dnsRecord *v1alp plan = plan.Calculate() if err = plan.Error(); err != nil { - return false, err + return false, notHealthyProbes, err } dnsRecord.Status.DomainOwners = plan.Owners dnsRecord.Status.Endpoints = healthySpecEndpoints if plan.Changes.HasChanges() { logger.Info("Applying changes") err = registry.ApplyChanges(ctx, plan.Changes) - return true, err + return true, notHealthyProbes, err } - return false, nil + return false, notHealthyProbes, nil } // filterEndpoints takes a list of zoneEndpoints and removes from it all endpoints diff --git a/internal/controller/dnsrecord_controller_test.go b/internal/controller/dnsrecord_controller_test.go index 399bfe51..906fd68f 100644 --- a/internal/controller/dnsrecord_controller_test.go +++ b/internal/controller/dnsrecord_controller_test.go @@ -1,4 +1,4 @@ -//go:build integration2 +//go:build integration /* Copyright 2024. diff --git a/internal/controller/dnsrecord_healthchecks.go b/internal/controller/dnsrecord_healthchecks.go index bef1f037..41de8722 100644 --- a/internal/controller/dnsrecord_healthchecks.go +++ b/internal/controller/dnsrecord_healthchecks.go @@ -103,7 +103,7 @@ func (r *DNSRecordReconciler) ensureProbe(ctx context.Context, generated *v1alph // // If this leads to an empty array of endpoints it: // - Does nothing (prevents NXDomain response) if we already published -// - Returns empty array of nothing is published (prevent from publishing unhealthy EPs) +// - Returns empty array if nothing is published (prevent from publishing unhealthy EPs) // // it returns the list of healthy endpoints, an array of unhealthy addresses and an error func (r *DNSRecordReconciler) removeUnhealthyEndpoints(ctx context.Context, specEndpoints []*endpoint.Endpoint, dnsRecord *v1alpha1.DNSRecord) ([]*endpoint.Endpoint, []string, error) { diff --git a/internal/controller/dnsrecord_healthchecks_test.go b/internal/controller/dnsrecord_healthchecks_test.go index d124836c..82bdc32d 100644 --- a/internal/controller/dnsrecord_healthchecks_test.go +++ b/internal/controller/dnsrecord_healthchecks_test.go @@ -178,9 +178,9 @@ var _ = Describe("DNSRecordReconciler_HealthChecks", func() { g.Expect(dnsRecord.Status.Conditions).To( ContainElement(MatchFields(IgnoreExtras, Fields{ "Type": Equal(string(v1alpha1.ConditionTypeReady)), - "Status": Equal(metav1.ConditionTrue), - "Reason": Equal("ProviderSuccess"), - "Message": Equal("Provider ensured the dns record"), + "Status": Equal(metav1.ConditionFalse), + "Reason": Equal(string(v1alpha1.ConditionReasonUnhealthy)), + "Message": Equal("None of the healthchecks succeeded"), "ObservedGeneration": Equal(dnsRecord.Generation), })), ) @@ -249,7 +249,7 @@ var _ = Describe("DNSRecordReconciler_HealthChecks", func() { g.Expect(updated).To(BeTrue()) }, TestTimeoutMedium, time.Second).Should(Succeed()) - By("Ensure unhealthy endpoints are gone") + By("Ensure unhealthy endpoints are gone and status is updated") Eventually(func(g Gomega) { Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(dnsRecord), dnsRecord)).To(Succeed()) @@ -259,6 +259,14 @@ var _ = Describe("DNSRecordReconciler_HealthChecks", func() { "Targets": ConsistOf("172.32.200.2"), })), )) + g.Expect(dnsRecord.Status.Conditions).To( + ContainElement(MatchFields(IgnoreExtras, Fields{ + "Type": Equal(string(v1alpha1.ConditionTypeHealthy)), + "Status": Equal(metav1.ConditionFalse), + "Reason": Equal(string(v1alpha1.ConditionReasonPartiallyHealthy)), + "Message": Equal("Not healthy addresses: [172.32.200.1]"), + })), + ) }, TestTimeoutMedium, time.Second).Should(Succeed()) @@ -278,6 +286,7 @@ var _ = Describe("DNSRecordReconciler_HealthChecks", func() { if probe.Spec.Address == "172.32.200.2" { probe.Status.Healthy = ptr.To(false) probe.Status.LastCheckedAt = metav1.Now() + probe.Status.ConsecutiveFailures = dnsRecord.Spec.HealthCheck.FailureThreshold + 1 g.Expect(k8sClient.Status().Update(ctx, &probe)).To(Succeed()) updated = true } @@ -287,11 +296,26 @@ var _ = Describe("DNSRecordReconciler_HealthChecks", func() { }, TestTimeoutMedium, time.Second).Should(Succeed()) // we don't remove EPs if this leads to empty EPs - By("Ensure endpoints are not changed ") + By("Ensure endpoints are not changed and status is updated") Eventually(func(g Gomega) { newRecord := &v1alpha1.DNSRecord{} Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(dnsRecord), newRecord)).To(Succeed()) g.Expect(dnsRecord.Status.Endpoints).To(BeEquivalentTo(newRecord.Status.Endpoints)) + + g.Expect(newRecord.Status.Conditions).To(ContainElements( + MatchFields(IgnoreExtras, Fields{ + "Type": Equal(string(v1alpha1.ConditionTypeHealthy)), + "Status": Equal(metav1.ConditionFalse), + "Reason": Equal(string(v1alpha1.ConditionReasonUnhealthy)), + "Message": Equal("Not healthy addresses: [172.32.200.1 172.32.200.2]"), + }), + MatchFields(IgnoreExtras, Fields{ + "Type": Equal(string(v1alpha1.ConditionTypeReady)), + "Status": Equal(metav1.ConditionFalse), + "Reason": Equal(string(v1alpha1.ConditionReasonUnhealthy)), + "Message": Equal("None of the healthchecks succeeded"), + }), + )) }, TestTimeoutMedium, time.Second).Should(Succeed()) }) diff --git a/test/e2e/fixtures/healthcheck_test/geo-dnsrecord-healthchecks.yaml b/test/e2e/fixtures/healthcheck_test/geo-dnsrecord-healthchecks.yaml index 0fd63026..aa6e451a 100644 --- a/test/e2e/fixtures/healthcheck_test/geo-dnsrecord-healthchecks.yaml +++ b/test/e2e/fixtures/healthcheck_test/geo-dnsrecord-healthchecks.yaml @@ -4,11 +4,6 @@ metadata: name: ${testID} namespace: ${testNamespace} spec: - healthCheck: - endpoint: "/health" - port: 80 - protocol: "HTTPS" - failureThreshold: 3 endpoints: - dnsName: 14byhk-2k52h1.klb.${testHostname} recordTTL: 60