Skip to content

Commit

Permalink
report healtchecks into DNSRecord status
Browse files Browse the repository at this point in the history
Signed-off-by: Maskym Vavilov <[email protected]>
  • Loading branch information
maksymvavilov committed Oct 30, 2024
1 parent 8a96f5f commit 4bfff2e
Show file tree
Hide file tree
Showing 6 changed files with 109 additions and 41 deletions.
7 changes: 7 additions & 0 deletions api/v1alpha1/conditions.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,10 @@ type ConditionType string
type ConditionReason string

const ConditionTypeReady ConditionType = "Ready"
const ConditionReasonProviderSuccess ConditionReason = "ProviderSuccess"
const ConditionReasonAwaitingValidation ConditionReason = "AwaitingValidation"

const ConditionTypeHealthy ConditionType = "Healthy"
const ConditionReasonHealthy ConditionReason = "AllChecksPassed"
const ConditionReasonPartiallyHealthy ConditionReason = "SomeChecksPassed"
const ConditionReasonUnhealthy ConditionReason = "HealthChecksFailed"
100 changes: 71 additions & 29 deletions internal/controller/dnsrecord_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ func (r *DNSRecordReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
reason := "DNSProviderError"
message := fmt.Sprintf("The dns provider could not be loaded: %v", err)
setDNSRecordCondition(dnsRecord, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, reason, message)
return r.updateStatus(ctx, previous, dnsRecord, false, err)
return r.updateStatus(ctx, previous, dnsRecord, false, []string{}, err)
}

if probesEnabled {
Expand Down Expand Up @@ -174,7 +174,7 @@ func (r *DNSRecordReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
logger.Error(err, "Failed to validate record")
setDNSRecordCondition(dnsRecord, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse,
"ValidationError", fmt.Sprintf("validation of DNSRecord failed: %v", err))
return r.updateStatus(ctx, previous, dnsRecord, false, err)
return r.updateStatus(ctx, previous, dnsRecord, false, []string{}, err)
}

//Ensure an Owner ID has been assigned to the record (OwnerID set in the status)
Expand All @@ -197,14 +197,14 @@ func (r *DNSRecordReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
if err != nil {
setDNSRecordCondition(dnsRecord, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse,
"DNSProviderError", fmt.Sprintf("The dns provider could not be loaded: %v", err))
return r.updateStatus(ctx, previous, dnsRecord, false, err)
return r.updateStatus(ctx, previous, dnsRecord, false, []string{}, err)
}

z, err := p.DNSZoneForHost(ctx, dnsRecord.Spec.RootHost)
if err != nil {
setDNSRecordCondition(dnsRecord, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse,
"DNSProviderError", fmt.Sprintf("Unable to find suitable zone in provider: %v", provider.SanitizeError(err)))
return r.updateStatus(ctx, previous, dnsRecord, false, err)
return r.updateStatus(ctx, previous, dnsRecord, false, []string{}, err)
}

//Add zone id/domainName to status
Expand All @@ -220,16 +220,16 @@ func (r *DNSRecordReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
if err != nil {
setDNSRecordCondition(dnsRecord, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse,
"DNSProviderError", fmt.Sprintf("The dns provider could not be loaded: %v", err))
return r.updateStatus(ctx, previous, dnsRecord, false, err)
return r.updateStatus(ctx, previous, dnsRecord, false, []string{}, err)
}

// Publish the record
hadChanges, err := r.publishRecord(ctx, dnsRecord, dnsProvider)
hadChanges, notHealthyProbes, err := r.publishRecord(ctx, dnsRecord, dnsProvider)
if err != nil {
logger.Error(err, "Failed to publish record")
setDNSRecordCondition(dnsRecord, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse,
"ProviderError", fmt.Sprintf("The DNS provider failed to ensure the record: %v", provider.SanitizeError(err)))
return r.updateStatus(ctx, previous, dnsRecord, hadChanges, err)
return r.updateStatus(ctx, previous, dnsRecord, hadChanges, notHealthyProbes, err)
}

if probesEnabled {
Expand All @@ -238,7 +238,7 @@ func (r *DNSRecordReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
}
}

return r.updateStatus(ctx, previous, dnsRecord, hadChanges, nil)
return r.updateStatus(ctx, previous, dnsRecord, hadChanges, notHealthyProbes, nil)
}

// setLogger Updates the given Logger with record/zone metadata from the given DNSRecord.
Expand All @@ -252,7 +252,7 @@ func (r *DNSRecordReconciler) setLogger(ctx context.Context, logger logr.Logger,
return log.IntoContext(ctx, logger), logger
}

func (r *DNSRecordReconciler) updateStatus(ctx context.Context, previous, current *v1alpha1.DNSRecord, hadChanges bool, specErr error) (reconcile.Result, error) {
func (r *DNSRecordReconciler) updateStatus(ctx context.Context, previous, current *v1alpha1.DNSRecord, hadChanges bool, notHealthyProbes []string, specErr error) (reconcile.Result, error) {
var requeueTime time.Duration
logger := log.FromContext(ctx)

Expand Down Expand Up @@ -283,7 +283,6 @@ func (r *DNSRecordReconciler) updateStatus(ctx context.Context, previous, curren
logger.V(1).Info("Changes needed on the same generation of record")
}
requeueTime = randomizedValidationRequeue
setDNSRecordCondition(current, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, "AwaitingValidation", "Awaiting validation")
} else {
logger.Info("All records are already up to date")
// reset the valid for from randomized value to a fixed value once validation succeeds
Expand All @@ -293,9 +292,10 @@ func (r *DNSRecordReconciler) updateStatus(ctx context.Context, previous, curren
// uses current.Status.ValidFor as the last requeue duration. Double it.
requeueTime = exponentialRequeueTime(current.Status.ValidFor)
}
setDNSRecordCondition(current, string(v1alpha1.ConditionTypeReady), metav1.ConditionTrue, "ProviderSuccess", "Provider ensured the dns record")
}

setStatusConditions(current, hadChanges, notHealthyProbes)

// valid for is always a requeue time
current.Status.ValidFor = requeueTime.String()

Expand Down Expand Up @@ -365,7 +365,7 @@ func (r *DNSRecordReconciler) SetupWithManager(mgr ctrl.Manager, maxRequeue, val
func (r *DNSRecordReconciler) deleteRecord(ctx context.Context, dnsRecord *v1alpha1.DNSRecord, dnsProvider provider.Provider) (bool, error) {
logger := log.FromContext(ctx)

hadChanges, err := r.applyChanges(ctx, dnsRecord, dnsProvider, true)
hadChanges, _, err := r.applyChanges(ctx, dnsRecord, dnsProvider, true)
if err != nil {
if strings.Contains(err.Error(), "was not found") || strings.Contains(err.Error(), "notFound") {
logger.Info("Record not found in zone, continuing")
Expand All @@ -382,21 +382,22 @@ func (r *DNSRecordReconciler) deleteRecord(ctx context.Context, dnsRecord *v1alp
}

// publishRecord publishes record(s) to the DNSPRovider(i.e. route53) zone (dnsRecord.Status.ZoneID).
func (r *DNSRecordReconciler) publishRecord(ctx context.Context, dnsRecord *v1alpha1.DNSRecord, dnsProvider provider.Provider) (bool, error) {
// returns if it had changes, if record is healthy and an error. If had no changes - the healthy bool can be ignored
func (r *DNSRecordReconciler) publishRecord(ctx context.Context, dnsRecord *v1alpha1.DNSRecord, dnsProvider provider.Provider) (bool, []string, error) {
logger := log.FromContext(ctx)

if prematurely, _ := recordReceivedPrematurely(dnsRecord); prematurely {
logger.V(1).Info("Skipping DNSRecord - is still valid")
return false, nil
return false, []string{}, nil
}

hadChanges, err := r.applyChanges(ctx, dnsRecord, dnsProvider, false)
hadChanges, notHealthyProbes, err := r.applyChanges(ctx, dnsRecord, dnsProvider, false)
if err != nil {
return hadChanges, err
return hadChanges, notHealthyProbes, err
}
logger.Info("Published DNSRecord to zone")

return hadChanges, nil
return hadChanges, notHealthyProbes, nil
}

// recordReceivedPrematurely returns true if current reconciliation loop started before
Expand Down Expand Up @@ -432,6 +433,47 @@ func exponentialRequeueTime(lastRequeueTime string) time.Duration {
return newRequeue
}

// setStatusConditions sets healthy and ready condition on given DNSRecord
func setStatusConditions(record *v1alpha1.DNSRecord, hadChanges bool, notHealthyProbes []string) {
// we get here only when spec err is nil - can trust hadChanges bool

// give precedence to AwaitingValidation condition
if hadChanges {
setDNSRecordCondition(record, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, string(v1alpha1.ConditionReasonAwaitingValidation), "Awaiting validation")
return
}
setDNSRecordCondition(record, string(v1alpha1.ConditionTypeReady), metav1.ConditionTrue, string(v1alpha1.ConditionReasonProviderSuccess), "Provider ensured the dns record")

// probes are disabled or not defined
if record.Spec.HealthCheck == nil {
return
}

// we don't have probes yet
if cap(notHealthyProbes) == 0 {
setDNSRecordCondition(record, string(v1alpha1.ConditionTypeHealthy), metav1.ConditionFalse, string(v1alpha1.ConditionReasonUnhealthy), "Probes are creating")
setDNSRecordCondition(record, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, string(v1alpha1.ConditionReasonUnhealthy), "Probes are creating")
return
}

// we have healthy probes
if len(notHealthyProbes) < cap(notHealthyProbes) {
if len(notHealthyProbes) == 0 {
// all probes are healthy
setDNSRecordCondition(record, string(v1alpha1.ConditionTypeHealthy), metav1.ConditionTrue, string(v1alpha1.ConditionReasonHealthy), "All healthchecks succeeded")
} else {
// at least one of the probes is healthy
setDNSRecordCondition(record, string(v1alpha1.ConditionTypeHealthy), metav1.ConditionFalse, string(v1alpha1.ConditionReasonPartiallyHealthy), fmt.Sprintf("Not healthy addresses: %s", notHealthyProbes))
}
// in any case - hostname will resolve, so the record is ready (don't change ready condition)
return
}
// none of the probes is healthy (change ready condition to false)
setDNSRecordCondition(record, string(v1alpha1.ConditionTypeHealthy), metav1.ConditionFalse, string(v1alpha1.ConditionReasonUnhealthy), fmt.Sprintf("Not healthy addresses: %s", notHealthyProbes))
setDNSRecordCondition(record, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, string(v1alpha1.ConditionReasonUnhealthy), "None of the healthchecks succeeded")

}

// setDNSRecordCondition adds or updates a given condition in the DNSRecord status.
func setDNSRecordCondition(dnsRecord *v1alpha1.DNSRecord, conditionType string, status metav1.ConditionStatus, reason, message string) {
cond := metav1.Condition{
Expand Down Expand Up @@ -469,7 +511,7 @@ func (r *DNSRecordReconciler) getDNSProvider(ctx context.Context, dnsRecord *v1a

// applyChanges creates the Plan and applies it to the registry. Returns true only if the Plan had no errors and there were changes to apply.
// The error is nil only if the changes were successfully applied or there were no changes to be made.
func (r *DNSRecordReconciler) applyChanges(ctx context.Context, dnsRecord *v1alpha1.DNSRecord, dnsProvider provider.Provider, isDelete bool) (bool, error) {
func (r *DNSRecordReconciler) applyChanges(ctx context.Context, dnsRecord *v1alpha1.DNSRecord, dnsProvider provider.Provider, isDelete bool) (bool, []string, error) {
logger := log.FromContext(ctx)
rootDomainName := dnsRecord.Spec.RootHost
zoneDomainFilter := externaldnsendpoint.NewDomainFilter([]string{dnsRecord.Status.ZoneDomainName})
Expand All @@ -480,13 +522,13 @@ func (r *DNSRecordReconciler) applyChanges(ctx context.Context, dnsRecord *v1alp
dnsRecord.Status.OwnerID, txtRegistryCacheInterval, txtRegistryWildcardReplacement, managedDNSRecordTypes,
excludeDNSRecordTypes, txtRegistryEncryptEnabled, []byte(txtRegistryEncryptAESKey))
if err != nil {
return false, err
return false, []string{}, err
}

policyID := "sync"
policy, exists := externaldnsplan.Policies[policyID]
if !exists {
return false, fmt.Errorf("unknown policy: %s", policyID)
return false, []string{}, fmt.Errorf("unknown policy: %s", policyID)
}

//If we are deleting set the expected endpoints to an empty array
Expand All @@ -497,25 +539,25 @@ func (r *DNSRecordReconciler) applyChanges(ctx context.Context, dnsRecord *v1alp
//zoneEndpoints = Records in the current dns provider zone
zoneEndpoints, err := registry.Records(ctx)
if err != nil {
return false, err
return false, []string{}, err
}

//specEndpoints = Records that this DNSRecord expects to exist
specEndpoints, err := registry.AdjustEndpoints(dnsRecord.Spec.Endpoints)
if err != nil {
return false, fmt.Errorf("adjusting specEndpoints: %w", err)
return false, []string{}, fmt.Errorf("adjusting specEndpoints: %w", err)
}

//healthySpecEndpoints = Records that this DNSRecord expects to exist, that do not have matching unhealthy probes
healthySpecEndpoints, _, err := r.removeUnhealthyEndpoints(ctx, specEndpoints, dnsRecord)
// healthySpecEndpoints = Records that this DNSRecord expects to exist, that do not have matching unhealthy probes
healthySpecEndpoints, notHealthyProbes, err := r.removeUnhealthyEndpoints(ctx, specEndpoints, dnsRecord)
if err != nil {
return false, fmt.Errorf("removing unhealthy specEndpoints: %w", err)
return false, []string{}, fmt.Errorf("removing unhealthy specEndpoints: %w", err)
}

//statusEndpoints = Records that were created/updated by this DNSRecord last
statusEndpoints, err := registry.AdjustEndpoints(dnsRecord.Status.Endpoints)
if err != nil {
return false, fmt.Errorf("adjusting statusEndpoints: %w", err)
return false, []string{}, fmt.Errorf("adjusting statusEndpoints: %w", err)
}

// add related endpoints to the record
Expand All @@ -534,16 +576,16 @@ func (r *DNSRecordReconciler) applyChanges(ctx context.Context, dnsRecord *v1alp

plan = plan.Calculate()
if err = plan.Error(); err != nil {
return false, err
return false, notHealthyProbes, err
}
dnsRecord.Status.DomainOwners = plan.Owners
dnsRecord.Status.Endpoints = healthySpecEndpoints
if plan.Changes.HasChanges() {
logger.Info("Applying changes")
err = registry.ApplyChanges(ctx, plan.Changes)
return true, err
return true, notHealthyProbes, err
}
return false, nil
return false, notHealthyProbes, nil
}

// filterEndpoints takes a list of zoneEndpoints and removes from it all endpoints
Expand Down
2 changes: 1 addition & 1 deletion internal/controller/dnsrecord_controller_test.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
//go:build integration2
//go:build integration

/*
Copyright 2024.
Expand Down
2 changes: 1 addition & 1 deletion internal/controller/dnsrecord_healthchecks.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ func (r *DNSRecordReconciler) ensureProbe(ctx context.Context, generated *v1alph
//
// If this leads to an empty array of endpoints it:
// - Does nothing (prevents NXDomain response) if we already published
// - Returns empty array of nothing is published (prevent from publishing unhealthy EPs)
// - Returns empty array if nothing is published (prevent from publishing unhealthy EPs)
//
// it returns the list of healthy endpoints, an array of unhealthy addresses and an error
func (r *DNSRecordReconciler) removeUnhealthyEndpoints(ctx context.Context, specEndpoints []*endpoint.Endpoint, dnsRecord *v1alpha1.DNSRecord) ([]*endpoint.Endpoint, []string, error) {
Expand Down
34 changes: 29 additions & 5 deletions internal/controller/dnsrecord_healthchecks_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,9 +178,9 @@ var _ = Describe("DNSRecordReconciler_HealthChecks", func() {
g.Expect(dnsRecord.Status.Conditions).To(
ContainElement(MatchFields(IgnoreExtras, Fields{
"Type": Equal(string(v1alpha1.ConditionTypeReady)),
"Status": Equal(metav1.ConditionTrue),
"Reason": Equal("ProviderSuccess"),
"Message": Equal("Provider ensured the dns record"),
"Status": Equal(metav1.ConditionFalse),
"Reason": Equal(string(v1alpha1.ConditionReasonUnhealthy)),
"Message": Equal("None of the healthchecks succeeded"),
"ObservedGeneration": Equal(dnsRecord.Generation),
})),
)
Expand Down Expand Up @@ -249,7 +249,7 @@ var _ = Describe("DNSRecordReconciler_HealthChecks", func() {
g.Expect(updated).To(BeTrue())
}, TestTimeoutMedium, time.Second).Should(Succeed())

By("Ensure unhealthy endpoints are gone")
By("Ensure unhealthy endpoints are gone and status is updated")
Eventually(func(g Gomega) {
Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(dnsRecord), dnsRecord)).To(Succeed())

Expand All @@ -259,6 +259,14 @@ var _ = Describe("DNSRecordReconciler_HealthChecks", func() {
"Targets": ConsistOf("172.32.200.2"),
})),
))
g.Expect(dnsRecord.Status.Conditions).To(
ContainElement(MatchFields(IgnoreExtras, Fields{
"Type": Equal(string(v1alpha1.ConditionTypeHealthy)),
"Status": Equal(metav1.ConditionFalse),
"Reason": Equal(string(v1alpha1.ConditionReasonPartiallyHealthy)),
"Message": Equal("Not healthy addresses: [172.32.200.1]"),
})),
)

}, TestTimeoutMedium, time.Second).Should(Succeed())

Expand All @@ -278,6 +286,7 @@ var _ = Describe("DNSRecordReconciler_HealthChecks", func() {
if probe.Spec.Address == "172.32.200.2" {
probe.Status.Healthy = ptr.To(false)
probe.Status.LastCheckedAt = metav1.Now()
probe.Status.ConsecutiveFailures = dnsRecord.Spec.HealthCheck.FailureThreshold + 1
g.Expect(k8sClient.Status().Update(ctx, &probe)).To(Succeed())
updated = true
}
Expand All @@ -287,11 +296,26 @@ var _ = Describe("DNSRecordReconciler_HealthChecks", func() {
}, TestTimeoutMedium, time.Second).Should(Succeed())

// we don't remove EPs if this leads to empty EPs
By("Ensure endpoints are not changed ")
By("Ensure endpoints are not changed and status is updated")
Eventually(func(g Gomega) {
newRecord := &v1alpha1.DNSRecord{}
Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(dnsRecord), newRecord)).To(Succeed())
g.Expect(dnsRecord.Status.Endpoints).To(BeEquivalentTo(newRecord.Status.Endpoints))

g.Expect(newRecord.Status.Conditions).To(ContainElements(
MatchFields(IgnoreExtras, Fields{
"Type": Equal(string(v1alpha1.ConditionTypeHealthy)),
"Status": Equal(metav1.ConditionFalse),
"Reason": Equal(string(v1alpha1.ConditionReasonUnhealthy)),
"Message": Equal("Not healthy addresses: [172.32.200.1 172.32.200.2]"),
}),
MatchFields(IgnoreExtras, Fields{
"Type": Equal(string(v1alpha1.ConditionTypeReady)),
"Status": Equal(metav1.ConditionFalse),
"Reason": Equal(string(v1alpha1.ConditionReasonUnhealthy)),
"Message": Equal("None of the healthchecks succeeded"),
}),
))
}, TestTimeoutMedium, time.Second).Should(Succeed())
})

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,6 @@ metadata:
name: ${testID}
namespace: ${testNamespace}
spec:
healthCheck:
endpoint: "/health"
port: 80
protocol: "HTTPS"
failureThreshold: 3
endpoints:
- dnsName: 14byhk-2k52h1.klb.${testHostname}
recordTTL: 60
Expand Down

0 comments on commit 4bfff2e

Please sign in to comment.