diff --git a/api/v1alpha1/conditions.go b/api/v1alpha1/conditions.go index db246752..6d63524f 100644 --- a/api/v1alpha1/conditions.go +++ b/api/v1alpha1/conditions.go @@ -4,3 +4,10 @@ type ConditionType string type ConditionReason string const ConditionTypeReady ConditionType = "Ready" +const ConditionReasonProviderSuccess ConditionReason = "ProviderSuccess" +const ConditionReasonAwaitingValidation ConditionReason = "AwaitingValidation" + +const ConditionTypeHealthy ConditionType = "Healthy" +const ConditionReasonHealthy ConditionReason = "AllChecksPassed" +const ConditionReasonPartiallyHealthy ConditionReason = "SomeChecksPassed" +const ConditionReasonUnhealthy ConditionReason = "HealthChecksFailed" diff --git a/internal/controller/dnsrecord_controller.go b/internal/controller/dnsrecord_controller.go index 195a9884..02c26dae 100644 --- a/internal/controller/dnsrecord_controller.go +++ b/internal/controller/dnsrecord_controller.go @@ -125,7 +125,7 @@ func (r *DNSRecordReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( reason := "DNSProviderError" message := fmt.Sprintf("The dns provider could not be loaded: %v", err) setDNSRecordCondition(dnsRecord, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, reason, message) - return r.updateStatus(ctx, previous, dnsRecord, false, err) + return r.updateStatus(ctx, previous, dnsRecord, false, []string{}, err) } if probesEnabled { @@ -174,7 +174,7 @@ func (r *DNSRecordReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( logger.Error(err, "Failed to validate record") setDNSRecordCondition(dnsRecord, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, "ValidationError", fmt.Sprintf("validation of DNSRecord failed: %v", err)) - return r.updateStatus(ctx, previous, dnsRecord, false, err) + return r.updateStatus(ctx, previous, dnsRecord, false, []string{}, err) } //Ensure an Owner ID has been assigned to the record (OwnerID set in the status) @@ -197,14 +197,14 @@ func (r *DNSRecordReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( if err != nil { setDNSRecordCondition(dnsRecord, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, "DNSProviderError", fmt.Sprintf("The dns provider could not be loaded: %v", err)) - return r.updateStatus(ctx, previous, dnsRecord, false, err) + return r.updateStatus(ctx, previous, dnsRecord, false, []string{}, err) } z, err := p.DNSZoneForHost(ctx, dnsRecord.Spec.RootHost) if err != nil { setDNSRecordCondition(dnsRecord, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, "DNSProviderError", fmt.Sprintf("Unable to find suitable zone in provider: %v", provider.SanitizeError(err))) - return r.updateStatus(ctx, previous, dnsRecord, false, err) + return r.updateStatus(ctx, previous, dnsRecord, false, []string{}, err) } //Add zone id/domainName to status @@ -220,16 +220,16 @@ func (r *DNSRecordReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( if err != nil { setDNSRecordCondition(dnsRecord, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, "DNSProviderError", fmt.Sprintf("The dns provider could not be loaded: %v", err)) - return r.updateStatus(ctx, previous, dnsRecord, false, err) + return r.updateStatus(ctx, previous, dnsRecord, false, []string{}, err) } // Publish the record - hadChanges, err := r.publishRecord(ctx, dnsRecord, dnsProvider) + hadChanges, notHealthyProbes, err := r.publishRecord(ctx, dnsRecord, dnsProvider) if err != nil { logger.Error(err, "Failed to publish record") setDNSRecordCondition(dnsRecord, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, "ProviderError", fmt.Sprintf("The DNS provider failed to ensure the record: %v", provider.SanitizeError(err))) - return r.updateStatus(ctx, previous, dnsRecord, hadChanges, err) + return r.updateStatus(ctx, previous, dnsRecord, hadChanges, notHealthyProbes, err) } if probesEnabled { @@ -238,7 +238,7 @@ func (r *DNSRecordReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( } } - return r.updateStatus(ctx, previous, dnsRecord, hadChanges, nil) + return r.updateStatus(ctx, previous, dnsRecord, hadChanges, notHealthyProbes, nil) } // setLogger Updates the given Logger with record/zone metadata from the given DNSRecord. @@ -252,7 +252,7 @@ func (r *DNSRecordReconciler) setLogger(ctx context.Context, logger logr.Logger, return log.IntoContext(ctx, logger), logger } -func (r *DNSRecordReconciler) updateStatus(ctx context.Context, previous, current *v1alpha1.DNSRecord, hadChanges bool, specErr error) (reconcile.Result, error) { +func (r *DNSRecordReconciler) updateStatus(ctx context.Context, previous, current *v1alpha1.DNSRecord, hadChanges bool, notHealthyProbes []string, specErr error) (reconcile.Result, error) { var requeueTime time.Duration logger := log.FromContext(ctx) @@ -283,7 +283,6 @@ func (r *DNSRecordReconciler) updateStatus(ctx context.Context, previous, curren logger.V(1).Info("Changes needed on the same generation of record") } requeueTime = randomizedValidationRequeue - setDNSRecordCondition(current, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, "AwaitingValidation", "Awaiting validation") } else { logger.Info("All records are already up to date") // reset the valid for from randomized value to a fixed value once validation succeeds @@ -293,9 +292,10 @@ func (r *DNSRecordReconciler) updateStatus(ctx context.Context, previous, curren // uses current.Status.ValidFor as the last requeue duration. Double it. requeueTime = exponentialRequeueTime(current.Status.ValidFor) } - setDNSRecordCondition(current, string(v1alpha1.ConditionTypeReady), metav1.ConditionTrue, "ProviderSuccess", "Provider ensured the dns record") } + setStatusConditions(current, hadChanges, notHealthyProbes) + // valid for is always a requeue time current.Status.ValidFor = requeueTime.String() @@ -365,7 +365,7 @@ func (r *DNSRecordReconciler) SetupWithManager(mgr ctrl.Manager, maxRequeue, val func (r *DNSRecordReconciler) deleteRecord(ctx context.Context, dnsRecord *v1alpha1.DNSRecord, dnsProvider provider.Provider) (bool, error) { logger := log.FromContext(ctx) - hadChanges, err := r.applyChanges(ctx, dnsRecord, dnsProvider, true) + hadChanges, _, err := r.applyChanges(ctx, dnsRecord, dnsProvider, true) if err != nil { if strings.Contains(err.Error(), "was not found") || strings.Contains(err.Error(), "notFound") { logger.Info("Record not found in zone, continuing") @@ -382,21 +382,22 @@ func (r *DNSRecordReconciler) deleteRecord(ctx context.Context, dnsRecord *v1alp } // publishRecord publishes record(s) to the DNSPRovider(i.e. route53) zone (dnsRecord.Status.ZoneID). -func (r *DNSRecordReconciler) publishRecord(ctx context.Context, dnsRecord *v1alpha1.DNSRecord, dnsProvider provider.Provider) (bool, error) { +// returns if it had changes, if record is healthy and an error. If had no changes - the healthy bool can be ignored +func (r *DNSRecordReconciler) publishRecord(ctx context.Context, dnsRecord *v1alpha1.DNSRecord, dnsProvider provider.Provider) (bool, []string, error) { logger := log.FromContext(ctx) if prematurely, _ := recordReceivedPrematurely(dnsRecord); prematurely { logger.V(1).Info("Skipping DNSRecord - is still valid") - return false, nil + return false, []string{}, nil } - hadChanges, err := r.applyChanges(ctx, dnsRecord, dnsProvider, false) + hadChanges, notHealthyProbes, err := r.applyChanges(ctx, dnsRecord, dnsProvider, false) if err != nil { - return hadChanges, err + return hadChanges, notHealthyProbes, err } logger.Info("Published DNSRecord to zone") - return hadChanges, nil + return hadChanges, notHealthyProbes, nil } // recordReceivedPrematurely returns true if current reconciliation loop started before @@ -432,6 +433,40 @@ func exponentialRequeueTime(lastRequeueTime string) time.Duration { return newRequeue } +// setStatusConditions sets healthy and ready condition on given DNSRecord +func setStatusConditions(record *v1alpha1.DNSRecord, hadChanges bool, notHealthyProbes []string) { + // we get here only when spec err is nil - can trust hadChanges bool + + // give precedence to AwaitingValidation condition + if hadChanges { + setDNSRecordCondition(record, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, string(v1alpha1.ConditionReasonAwaitingValidation), "Awaiting validation") + return + } + setDNSRecordCondition(record, string(v1alpha1.ConditionTypeReady), metav1.ConditionTrue, string(v1alpha1.ConditionReasonProviderSuccess), "Provider ensured the dns record") + + // probes are disabled + if cap(notHealthyProbes) == 0 { + return + } + + // we have healthy probes + if len(notHealthyProbes) < cap(notHealthyProbes) { + if len(notHealthyProbes) == 0 { + // all probes are healthy + setDNSRecordCondition(record, string(v1alpha1.ConditionTypeHealthy), metav1.ConditionTrue, string(v1alpha1.ConditionReasonHealthy), "All healthchecks succeeded") + } else { + // at least one of the probes is healthy + setDNSRecordCondition(record, string(v1alpha1.ConditionTypeHealthy), metav1.ConditionFalse, string(v1alpha1.ConditionReasonPartiallyHealthy), fmt.Sprintf("Not healthy addresses: %s", notHealthyProbes)) + } + // in any case - hostname will resolve, so the record is ready (don't change ready condition) + return + } + // none of the probes is healthy (change ready condition to false) + setDNSRecordCondition(record, string(v1alpha1.ConditionTypeHealthy), metav1.ConditionFalse, string(v1alpha1.ConditionReasonUnhealthy), fmt.Sprintf("Not healthy addresses: %s", notHealthyProbes)) + setDNSRecordCondition(record, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, string(v1alpha1.ConditionReasonUnhealthy), "None of the healthchecks succeeded") + +} + // setDNSRecordCondition adds or updates a given condition in the DNSRecord status. func setDNSRecordCondition(dnsRecord *v1alpha1.DNSRecord, conditionType string, status metav1.ConditionStatus, reason, message string) { cond := metav1.Condition{ @@ -469,7 +504,7 @@ func (r *DNSRecordReconciler) getDNSProvider(ctx context.Context, dnsRecord *v1a // applyChanges creates the Plan and applies it to the registry. Returns true only if the Plan had no errors and there were changes to apply. // The error is nil only if the changes were successfully applied or there were no changes to be made. -func (r *DNSRecordReconciler) applyChanges(ctx context.Context, dnsRecord *v1alpha1.DNSRecord, dnsProvider provider.Provider, isDelete bool) (bool, error) { +func (r *DNSRecordReconciler) applyChanges(ctx context.Context, dnsRecord *v1alpha1.DNSRecord, dnsProvider provider.Provider, isDelete bool) (bool, []string, error) { logger := log.FromContext(ctx) rootDomainName := dnsRecord.Spec.RootHost zoneDomainFilter := externaldnsendpoint.NewDomainFilter([]string{dnsRecord.Status.ZoneDomainName}) @@ -480,13 +515,13 @@ func (r *DNSRecordReconciler) applyChanges(ctx context.Context, dnsRecord *v1alp dnsRecord.Status.OwnerID, txtRegistryCacheInterval, txtRegistryWildcardReplacement, managedDNSRecordTypes, excludeDNSRecordTypes, txtRegistryEncryptEnabled, []byte(txtRegistryEncryptAESKey)) if err != nil { - return false, err + return false, []string{}, err } policyID := "sync" policy, exists := externaldnsplan.Policies[policyID] if !exists { - return false, fmt.Errorf("unknown policy: %s", policyID) + return false, []string{}, fmt.Errorf("unknown policy: %s", policyID) } //If we are deleting set the expected endpoints to an empty array @@ -497,25 +532,25 @@ func (r *DNSRecordReconciler) applyChanges(ctx context.Context, dnsRecord *v1alp //zoneEndpoints = Records in the current dns provider zone zoneEndpoints, err := registry.Records(ctx) if err != nil { - return false, err + return false, []string{}, err } //specEndpoints = Records that this DNSRecord expects to exist specEndpoints, err := registry.AdjustEndpoints(dnsRecord.Spec.Endpoints) if err != nil { - return false, fmt.Errorf("adjusting specEndpoints: %w", err) + return false, []string{}, fmt.Errorf("adjusting specEndpoints: %w", err) } - //healthySpecEndpoints = Records that this DNSRecord expects to exist, that do not have matching unhealthy probes - healthySpecEndpoints, _, err := r.removeUnhealthyEndpoints(ctx, specEndpoints, dnsRecord) + // healthySpecEndpoints = Records that this DNSRecord expects to exist, that do not have matching unhealthy probes + healthySpecEndpoints, notHealthyProbes, err := r.removeUnhealthyEndpoints(ctx, specEndpoints, dnsRecord) if err != nil { - return false, fmt.Errorf("removing unhealthy specEndpoints: %w", err) + return false, []string{}, fmt.Errorf("removing unhealthy specEndpoints: %w", err) } //statusEndpoints = Records that were created/updated by this DNSRecord last statusEndpoints, err := registry.AdjustEndpoints(dnsRecord.Status.Endpoints) if err != nil { - return false, fmt.Errorf("adjusting statusEndpoints: %w", err) + return false, []string{}, fmt.Errorf("adjusting statusEndpoints: %w", err) } // add related endpoints to the record @@ -534,16 +569,16 @@ func (r *DNSRecordReconciler) applyChanges(ctx context.Context, dnsRecord *v1alp plan = plan.Calculate() if err = plan.Error(); err != nil { - return false, err + return false, notHealthyProbes, err } dnsRecord.Status.DomainOwners = plan.Owners dnsRecord.Status.Endpoints = healthySpecEndpoints if plan.Changes.HasChanges() { logger.Info("Applying changes") err = registry.ApplyChanges(ctx, plan.Changes) - return true, err + return true, notHealthyProbes, err } - return false, nil + return false, notHealthyProbes, nil } // filterEndpoints takes a list of zoneEndpoints and removes from it all endpoints diff --git a/internal/controller/dnsrecord_healthchecks.go b/internal/controller/dnsrecord_healthchecks.go index 5cae130b..7a710834 100644 --- a/internal/controller/dnsrecord_healthchecks.go +++ b/internal/controller/dnsrecord_healthchecks.go @@ -100,12 +100,14 @@ func (r *DNSRecordReconciler) ensureProbe(ctx context.Context, generated *v1alph // - If the health check CR has insufficient failures - it is healthy // - If the health check CR is deleting - it is healthy // - If the health check is a CNAME and any IP is healthy - the CNAME is healthy -func (r *DNSRecordReconciler) removeUnhealthyEndpoints(ctx context.Context, endpoints []*endpoint.Endpoint, dnsRecord *v1alpha1.DNSRecord) ([]*endpoint.Endpoint, bool, error) { +// +// it returns the list of healthy endpoints, an array of unhealthy addresses and an error +func (r *DNSRecordReconciler) removeUnhealthyEndpoints(ctx context.Context, endpoints []*endpoint.Endpoint, dnsRecord *v1alpha1.DNSRecord) ([]*endpoint.Endpoint, []string, error) { probes := &v1alpha1.DNSHealthCheckProbeList{} // we are deleting or don't have health checks - don't bother if (dnsRecord.DeletionTimestamp != nil && !dnsRecord.DeletionTimestamp.IsZero()) || dnsRecord.Spec.HealthCheck == nil { - return endpoints, true, nil + return endpoints, []string{}, nil } // get all probes owned by this record @@ -116,8 +118,9 @@ func (r *DNSRecordReconciler) removeUnhealthyEndpoints(ctx context.Context, endp Namespace: dnsRecord.Namespace, }) if err != nil { - return nil, false, err + return nil, []string{}, err } + unhealthyAddresses := make([]string, 0, len(probes.Items)) // use adjusted endpoints instead of spec ones tree := common.MakeTreeFromDNSRecord(&v1alpha1.DNSRecord{ @@ -141,21 +144,22 @@ func (r *DNSRecordReconciler) removeUnhealthyEndpoints(ctx context.Context, endp tree.RemoveNode(&common.DNSTreeNode{ Name: probe.Spec.Address, }) + unhealthyAddresses = append(unhealthyAddresses, probe.Spec.Address) } } // if at least one of the leaf probes was healthy return healthy probes if haveHealthyProbes { - return *common.ToEndpoints(tree, ptr.To([]*endpoint.Endpoint{})), true, nil + return *common.ToEndpoints(tree, ptr.To([]*endpoint.Endpoint{})), unhealthyAddresses, nil } // if none of the probes are healthy - don't modify endpoints if dnsRecord.Status.Endpoints != nil && len(dnsRecord.Status.Endpoints) != 0 { // if we already have EPs in the provider, we will use them - return dnsRecord.Status.Endpoints, false, nil + return dnsRecord.Status.Endpoints, unhealthyAddresses, nil } // if we have nothing in provider use adjusted spec endpoints - return endpoints, false, nil + return endpoints, unhealthyAddresses, nil } func buildDesiredProbes(dnsRecord *v1alpha1.DNSRecord, leafs *[]string, allowInsecureCerts bool) []*v1alpha1.DNSHealthCheckProbe { diff --git a/internal/controller/dnsrecord_healthchecks_test.go b/internal/controller/dnsrecord_healthchecks_test.go index 7470ad91..f7cfd426 100644 --- a/internal/controller/dnsrecord_healthchecks_test.go +++ b/internal/controller/dnsrecord_healthchecks_test.go @@ -66,7 +66,7 @@ var _ = Describe("DNSRecordReconciler_HealthChecks", func() { ContainElement(MatchFields(IgnoreExtras, Fields{ "Type": Equal(string(v1alpha1.ConditionTypeReady)), "Status": Equal(metav1.ConditionTrue), - "Reason": Equal("ProviderSuccess"), + "Reason": Equal(string(v1alpha1.ConditionReasonProviderSuccess)), "Message": Equal("Provider ensured the dns record"), })), ) @@ -183,6 +183,18 @@ var _ = Describe("DNSRecordReconciler_HealthChecks", func() { It("Should remove unhealthy endpoints", func() { //Create default test dnsrecord Expect(k8sClient.Create(ctx, dnsRecord)).To(Succeed()) + Eventually(func(g Gomega) { + err := k8sClient.Get(ctx, client.ObjectKeyFromObject(dnsRecord), dnsRecord) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(dnsRecord.Status.Conditions).To( + ContainElement(MatchFields(IgnoreExtras, Fields{ + "Type": Equal(string(v1alpha1.ConditionTypeHealthy)), + "Status": Equal(metav1.ConditionTrue), + "Reason": Equal(string(v1alpha1.ConditionReasonHealthy)), + "Message": Equal("All healthchecks succeeded"), + })), + ) + }, TestTimeoutMedium, time.Second).Should(Succeed()) By("Making one of the probes unhealthy") Eventually(func(g Gomega) { @@ -210,7 +222,7 @@ var _ = Describe("DNSRecordReconciler_HealthChecks", func() { g.Expect(updated).To(BeTrue()) }, TestTimeoutMedium, time.Second).Should(Succeed()) - By("Ensure unhealthy endpoints are gone") + By("Ensure unhealthy endpoints are gone and status is updated") Eventually(func(g Gomega) { Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(dnsRecord), dnsRecord)).To(Succeed()) @@ -233,6 +245,14 @@ var _ = Describe("DNSRecordReconciler_HealthChecks", func() { "ProviderSpecific": Equal(externaldnsendpoint.ProviderSpecific{{Name: "geo-code", Value: "GEO-EU"}}), })), )) + g.Expect(dnsRecord.Status.Conditions).To( + ContainElement(MatchFields(IgnoreExtras, Fields{ + "Type": Equal(string(v1alpha1.ConditionTypeHealthy)), + "Status": Equal(metav1.ConditionFalse), + "Reason": Equal(string(v1alpha1.ConditionReasonPartiallyHealthy)), + "Message": Equal("Not healthy addresses: [172.32.200.1]"), + })), + ) }, TestTimeoutMedium, time.Second).Should(Succeed()) @@ -252,6 +272,7 @@ var _ = Describe("DNSRecordReconciler_HealthChecks", func() { if probe.Spec.Address == "172.32.200.2" { probe.Status.Healthy = ptr.To(false) probe.Status.LastCheckedAt = metav1.Now() + probe.Status.ConsecutiveFailures = dnsRecord.Spec.HealthCheck.FailureThreshold + 1 g.Expect(k8sClient.Status().Update(ctx, &probe)).To(Succeed()) updated = true } @@ -261,11 +282,26 @@ var _ = Describe("DNSRecordReconciler_HealthChecks", func() { }, TestTimeoutMedium, time.Second).Should(Succeed()) // we don't remove EPs if this leads to empty EPs - By("Ensure endpoints are not changed ") + By("Ensure endpoints are not changed and status is updated") Eventually(func(g Gomega) { record := &v1alpha1.DNSRecord{} Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(dnsRecord), record)).To(Succeed()) g.Expect(dnsRecord.Status.Endpoints).To(BeEquivalentTo(record.Status.Endpoints)) + + g.Expect(record.Status.Conditions).To(ContainElements( + MatchFields(IgnoreExtras, Fields{ + "Type": Equal(string(v1alpha1.ConditionTypeHealthy)), + "Status": Equal(metav1.ConditionFalse), + "Reason": Equal(string(v1alpha1.ConditionReasonUnhealthy)), + "Message": Equal("Not healthy addresses: [172.32.200.1 172.32.200.2]"), + }), + MatchFields(IgnoreExtras, Fields{ + "Type": Equal(string(v1alpha1.ConditionTypeReady)), + "Status": Equal(metav1.ConditionFalse), + "Reason": Equal(string(v1alpha1.ConditionReasonUnhealthy)), + "Message": Equal("None of the healthchecks succeeded"), + }), + )) }, TestTimeoutMedium, time.Second).Should(Succeed()) }) diff --git a/test/e2e/fixtures/healthcheck_test/geo-dnsrecord-healthchecks.yaml b/test/e2e/fixtures/healthcheck_test/geo-dnsrecord-healthchecks.yaml index 0fd63026..aa6e451a 100644 --- a/test/e2e/fixtures/healthcheck_test/geo-dnsrecord-healthchecks.yaml +++ b/test/e2e/fixtures/healthcheck_test/geo-dnsrecord-healthchecks.yaml @@ -4,11 +4,6 @@ metadata: name: ${testID} namespace: ${testNamespace} spec: - healthCheck: - endpoint: "/health" - port: 80 - protocol: "HTTPS" - failureThreshold: 3 endpoints: - dnsName: 14byhk-2k52h1.klb.${testHostname} recordTTL: 60