Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Report healtchecks to status #280

Merged
merged 1 commit into from
Oct 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions api/v1alpha1/conditions.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,10 @@ type ConditionType string
type ConditionReason string

const ConditionTypeReady ConditionType = "Ready"
const ConditionReasonProviderSuccess ConditionReason = "ProviderSuccess"
const ConditionReasonAwaitingValidation ConditionReason = "AwaitingValidation"

const ConditionTypeHealthy ConditionType = "Healthy"
const ConditionReasonHealthy ConditionReason = "AllChecksPassed"
const ConditionReasonPartiallyHealthy ConditionReason = "SomeChecksPassed"
const ConditionReasonUnhealthy ConditionReason = "HealthChecksFailed"
100 changes: 71 additions & 29 deletions internal/controller/dnsrecord_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ func (r *DNSRecordReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
reason := "DNSProviderError"
message := fmt.Sprintf("The dns provider could not be loaded: %v", err)
setDNSRecordCondition(dnsRecord, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, reason, message)
return r.updateStatus(ctx, previous, dnsRecord, false, err)
return r.updateStatus(ctx, previous, dnsRecord, false, []string{}, err)
}

if probesEnabled {
Expand Down Expand Up @@ -174,7 +174,7 @@ func (r *DNSRecordReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
logger.Error(err, "Failed to validate record")
setDNSRecordCondition(dnsRecord, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse,
"ValidationError", fmt.Sprintf("validation of DNSRecord failed: %v", err))
return r.updateStatus(ctx, previous, dnsRecord, false, err)
return r.updateStatus(ctx, previous, dnsRecord, false, []string{}, err)
}

//Ensure an Owner ID has been assigned to the record (OwnerID set in the status)
Expand All @@ -197,14 +197,14 @@ func (r *DNSRecordReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
if err != nil {
setDNSRecordCondition(dnsRecord, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse,
"DNSProviderError", fmt.Sprintf("The dns provider could not be loaded: %v", err))
return r.updateStatus(ctx, previous, dnsRecord, false, err)
return r.updateStatus(ctx, previous, dnsRecord, false, []string{}, err)
}

z, err := p.DNSZoneForHost(ctx, dnsRecord.Spec.RootHost)
if err != nil {
setDNSRecordCondition(dnsRecord, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse,
"DNSProviderError", fmt.Sprintf("Unable to find suitable zone in provider: %v", provider.SanitizeError(err)))
return r.updateStatus(ctx, previous, dnsRecord, false, err)
return r.updateStatus(ctx, previous, dnsRecord, false, []string{}, err)
}

//Add zone id/domainName to status
Expand All @@ -220,7 +220,7 @@ func (r *DNSRecordReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
if err != nil {
setDNSRecordCondition(dnsRecord, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse,
"DNSProviderError", fmt.Sprintf("The dns provider could not be loaded: %v", err))
return r.updateStatus(ctx, previous, dnsRecord, false, err)
return r.updateStatus(ctx, previous, dnsRecord, false, []string{}, err)
}

if probesEnabled {
Expand All @@ -230,15 +230,15 @@ func (r *DNSRecordReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
}
// just check probes here and don't publish
// Publish the record
hadChanges, err := r.publishRecord(ctx, dnsRecord, dnsProvider)
hadChanges, notHealthyProbes, err := r.publishRecord(ctx, dnsRecord, dnsProvider)
if err != nil {
logger.Error(err, "Failed to publish record")
setDNSRecordCondition(dnsRecord, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse,
"ProviderError", fmt.Sprintf("The DNS provider failed to ensure the record: %v", provider.SanitizeError(err)))
return r.updateStatus(ctx, previous, dnsRecord, hadChanges, err)
return r.updateStatus(ctx, previous, dnsRecord, hadChanges, notHealthyProbes, err)
}

return r.updateStatus(ctx, previous, dnsRecord, hadChanges, nil)
return r.updateStatus(ctx, previous, dnsRecord, hadChanges, notHealthyProbes, nil)
}

// setLogger Updates the given Logger with record/zone metadata from the given DNSRecord.
Expand All @@ -252,7 +252,7 @@ func (r *DNSRecordReconciler) setLogger(ctx context.Context, logger logr.Logger,
return log.IntoContext(ctx, logger), logger
}

func (r *DNSRecordReconciler) updateStatus(ctx context.Context, previous, current *v1alpha1.DNSRecord, hadChanges bool, specErr error) (reconcile.Result, error) {
func (r *DNSRecordReconciler) updateStatus(ctx context.Context, previous, current *v1alpha1.DNSRecord, hadChanges bool, notHealthyProbes []string, specErr error) (reconcile.Result, error) {
var requeueTime time.Duration
logger := log.FromContext(ctx)

Expand Down Expand Up @@ -283,7 +283,6 @@ func (r *DNSRecordReconciler) updateStatus(ctx context.Context, previous, curren
logger.V(1).Info("Changes needed on the same generation of record")
}
requeueTime = randomizedValidationRequeue
setDNSRecordCondition(current, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, "AwaitingValidation", "Awaiting validation")
} else {
logger.Info("All records are already up to date")
// reset the valid for from randomized value to a fixed value once validation succeeds
Expand All @@ -293,9 +292,10 @@ func (r *DNSRecordReconciler) updateStatus(ctx context.Context, previous, curren
// uses current.Status.ValidFor as the last requeue duration. Double it.
requeueTime = exponentialRequeueTime(current.Status.ValidFor)
}
setDNSRecordCondition(current, string(v1alpha1.ConditionTypeReady), metav1.ConditionTrue, "ProviderSuccess", "Provider ensured the dns record")
}

setStatusConditions(current, hadChanges, notHealthyProbes)

// valid for is always a requeue time
current.Status.ValidFor = requeueTime.String()

Expand Down Expand Up @@ -365,7 +365,7 @@ func (r *DNSRecordReconciler) SetupWithManager(mgr ctrl.Manager, maxRequeue, val
func (r *DNSRecordReconciler) deleteRecord(ctx context.Context, dnsRecord *v1alpha1.DNSRecord, dnsProvider provider.Provider) (bool, error) {
logger := log.FromContext(ctx)

hadChanges, err := r.applyChanges(ctx, dnsRecord, dnsProvider, true)
hadChanges, _, err := r.applyChanges(ctx, dnsRecord, dnsProvider, true)
if err != nil {
if strings.Contains(err.Error(), "was not found") || strings.Contains(err.Error(), "notFound") {
logger.Info("Record not found in zone, continuing")
Expand All @@ -382,21 +382,22 @@ func (r *DNSRecordReconciler) deleteRecord(ctx context.Context, dnsRecord *v1alp
}

// publishRecord publishes record(s) to the DNSPRovider(i.e. route53) zone (dnsRecord.Status.ZoneID).
func (r *DNSRecordReconciler) publishRecord(ctx context.Context, dnsRecord *v1alpha1.DNSRecord, dnsProvider provider.Provider) (bool, error) {
// returns if it had changes, if record is healthy and an error. If had no changes - the healthy bool can be ignored
func (r *DNSRecordReconciler) publishRecord(ctx context.Context, dnsRecord *v1alpha1.DNSRecord, dnsProvider provider.Provider) (bool, []string, error) {
logger := log.FromContext(ctx)

if prematurely, _ := recordReceivedPrematurely(dnsRecord); prematurely {
logger.V(1).Info("Skipping DNSRecord - is still valid")
return false, nil
return false, []string{}, nil
}

hadChanges, err := r.applyChanges(ctx, dnsRecord, dnsProvider, false)
hadChanges, notHealthyProbes, err := r.applyChanges(ctx, dnsRecord, dnsProvider, false)
if err != nil {
return hadChanges, err
return hadChanges, notHealthyProbes, err
}
logger.Info("Published DNSRecord to zone")

return hadChanges, nil
return hadChanges, notHealthyProbes, nil
}

// recordReceivedPrematurely returns true if current reconciliation loop started before
Expand Down Expand Up @@ -432,6 +433,47 @@ func exponentialRequeueTime(lastRequeueTime string) time.Duration {
return newRequeue
}

// setStatusConditions sets healthy and ready condition on given DNSRecord
func setStatusConditions(record *v1alpha1.DNSRecord, hadChanges bool, notHealthyProbes []string) {
// we get here only when spec err is nil - can trust hadChanges bool

// give precedence to AwaitingValidation condition
if hadChanges {
setDNSRecordCondition(record, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, string(v1alpha1.ConditionReasonAwaitingValidation), "Awaiting validation")
return
}
setDNSRecordCondition(record, string(v1alpha1.ConditionTypeReady), metav1.ConditionTrue, string(v1alpha1.ConditionReasonProviderSuccess), "Provider ensured the dns record")

// probes are disabled or not defined
if record.Spec.HealthCheck == nil {
return
}

// we don't have probes yet
if cap(notHealthyProbes) == 0 {
setDNSRecordCondition(record, string(v1alpha1.ConditionTypeHealthy), metav1.ConditionFalse, string(v1alpha1.ConditionReasonUnhealthy), "Probes are creating")
setDNSRecordCondition(record, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, string(v1alpha1.ConditionReasonUnhealthy), "Probes are creating")
return
}

// we have healthy probes
if len(notHealthyProbes) < cap(notHealthyProbes) {
if len(notHealthyProbes) == 0 {
// all probes are healthy
setDNSRecordCondition(record, string(v1alpha1.ConditionTypeHealthy), metav1.ConditionTrue, string(v1alpha1.ConditionReasonHealthy), "All healthchecks succeeded")
} else {
// at least one of the probes is healthy
setDNSRecordCondition(record, string(v1alpha1.ConditionTypeHealthy), metav1.ConditionFalse, string(v1alpha1.ConditionReasonPartiallyHealthy), fmt.Sprintf("Not healthy addresses: %s", notHealthyProbes))
}
// in any case - hostname will resolve, so the record is ready (don't change ready condition)
return
}
// none of the probes is healthy (change ready condition to false)
setDNSRecordCondition(record, string(v1alpha1.ConditionTypeHealthy), metav1.ConditionFalse, string(v1alpha1.ConditionReasonUnhealthy), fmt.Sprintf("Not healthy addresses: %s", notHealthyProbes))
setDNSRecordCondition(record, string(v1alpha1.ConditionTypeReady), metav1.ConditionFalse, string(v1alpha1.ConditionReasonUnhealthy), "None of the healthchecks succeeded")

}

// setDNSRecordCondition adds or updates a given condition in the DNSRecord status.
func setDNSRecordCondition(dnsRecord *v1alpha1.DNSRecord, conditionType string, status metav1.ConditionStatus, reason, message string) {
cond := metav1.Condition{
Expand Down Expand Up @@ -469,7 +511,7 @@ func (r *DNSRecordReconciler) getDNSProvider(ctx context.Context, dnsRecord *v1a

// applyChanges creates the Plan and applies it to the registry. Returns true only if the Plan had no errors and there were changes to apply.
// The error is nil only if the changes were successfully applied or there were no changes to be made.
func (r *DNSRecordReconciler) applyChanges(ctx context.Context, dnsRecord *v1alpha1.DNSRecord, dnsProvider provider.Provider, isDelete bool) (bool, error) {
func (r *DNSRecordReconciler) applyChanges(ctx context.Context, dnsRecord *v1alpha1.DNSRecord, dnsProvider provider.Provider, isDelete bool) (bool, []string, error) {
logger := log.FromContext(ctx)
rootDomainName := dnsRecord.Spec.RootHost
zoneDomainFilter := externaldnsendpoint.NewDomainFilter([]string{dnsRecord.Status.ZoneDomainName})
Expand All @@ -480,13 +522,13 @@ func (r *DNSRecordReconciler) applyChanges(ctx context.Context, dnsRecord *v1alp
dnsRecord.Status.OwnerID, txtRegistryCacheInterval, txtRegistryWildcardReplacement, managedDNSRecordTypes,
excludeDNSRecordTypes, txtRegistryEncryptEnabled, []byte(txtRegistryEncryptAESKey))
if err != nil {
return false, err
return false, []string{}, err
}

policyID := "sync"
policy, exists := externaldnsplan.Policies[policyID]
if !exists {
return false, fmt.Errorf("unknown policy: %s", policyID)
return false, []string{}, fmt.Errorf("unknown policy: %s", policyID)
}

//If we are deleting set the expected endpoints to an empty array
Expand All @@ -497,25 +539,25 @@ func (r *DNSRecordReconciler) applyChanges(ctx context.Context, dnsRecord *v1alp
//zoneEndpoints = Records in the current dns provider zone
zoneEndpoints, err := registry.Records(ctx)
if err != nil {
return false, err
return false, []string{}, err
}

//specEndpoints = Records that this DNSRecord expects to exist
specEndpoints, err := registry.AdjustEndpoints(dnsRecord.Spec.Endpoints)
if err != nil {
return false, fmt.Errorf("adjusting specEndpoints: %w", err)
return false, []string{}, fmt.Errorf("adjusting specEndpoints: %w", err)
}

//healthySpecEndpoints = Records that this DNSRecord expects to exist, that do not have matching unhealthy probes
healthySpecEndpoints, _, err := r.removeUnhealthyEndpoints(ctx, specEndpoints, dnsRecord)
// healthySpecEndpoints = Records that this DNSRecord expects to exist, that do not have matching unhealthy probes
healthySpecEndpoints, notHealthyProbes, err := r.removeUnhealthyEndpoints(ctx, specEndpoints, dnsRecord)
if err != nil {
return false, fmt.Errorf("removing unhealthy specEndpoints: %w", err)
return false, []string{}, fmt.Errorf("removing unhealthy specEndpoints: %w", err)
}

//statusEndpoints = Records that were created/updated by this DNSRecord last
statusEndpoints, err := registry.AdjustEndpoints(dnsRecord.Status.Endpoints)
if err != nil {
return false, fmt.Errorf("adjusting statusEndpoints: %w", err)
return false, []string{}, fmt.Errorf("adjusting statusEndpoints: %w", err)
}

// add related endpoints to the record
Expand All @@ -534,16 +576,16 @@ func (r *DNSRecordReconciler) applyChanges(ctx context.Context, dnsRecord *v1alp

plan = plan.Calculate()
if err = plan.Error(); err != nil {
return false, err
return false, notHealthyProbes, err
}
dnsRecord.Status.DomainOwners = plan.Owners
dnsRecord.Status.Endpoints = healthySpecEndpoints
if plan.Changes.HasChanges() {
logger.Info("Applying changes")
err = registry.ApplyChanges(ctx, plan.Changes)
return true, err
return true, notHealthyProbes, err
}
return false, nil
return false, notHealthyProbes, nil
}

// filterEndpoints takes a list of zoneEndpoints and removes from it all endpoints
Expand Down
2 changes: 1 addition & 1 deletion internal/controller/dnsrecord_controller_test.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
//go:build integration2
//go:build integration

/*
Copyright 2024.
Expand Down
2 changes: 1 addition & 1 deletion internal/controller/dnsrecord_healthchecks.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ func (r *DNSRecordReconciler) ensureProbe(ctx context.Context, generated *v1alph
//
// If this leads to an empty array of endpoints it:
// - Does nothing (prevents NXDomain response) if we already published
// - Returns empty array of nothing is published (prevent from publishing unhealthy EPs)
// - Returns empty array if nothing is published (prevent from publishing unhealthy EPs)
//
// it returns the list of healthy endpoints, an array of unhealthy addresses and an error
func (r *DNSRecordReconciler) removeUnhealthyEndpoints(ctx context.Context, specEndpoints []*endpoint.Endpoint, dnsRecord *v1alpha1.DNSRecord) ([]*endpoint.Endpoint, []string, error) {
Expand Down
34 changes: 29 additions & 5 deletions internal/controller/dnsrecord_healthchecks_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,9 +178,9 @@ var _ = Describe("DNSRecordReconciler_HealthChecks", func() {
g.Expect(dnsRecord.Status.Conditions).To(
ContainElement(MatchFields(IgnoreExtras, Fields{
"Type": Equal(string(v1alpha1.ConditionTypeReady)),
"Status": Equal(metav1.ConditionTrue),
"Reason": Equal("ProviderSuccess"),
"Message": Equal("Provider ensured the dns record"),
"Status": Equal(metav1.ConditionFalse),
"Reason": Equal(string(v1alpha1.ConditionReasonUnhealthy)),
"Message": Equal("None of the healthchecks succeeded"),
"ObservedGeneration": Equal(dnsRecord.Generation),
})),
)
Expand Down Expand Up @@ -249,7 +249,7 @@ var _ = Describe("DNSRecordReconciler_HealthChecks", func() {
g.Expect(updated).To(BeTrue())
}, TestTimeoutMedium, time.Second).Should(Succeed())

By("Ensure unhealthy endpoints are gone")
By("Ensure unhealthy endpoints are gone and status is updated")
Eventually(func(g Gomega) {
Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(dnsRecord), dnsRecord)).To(Succeed())

Expand All @@ -259,6 +259,14 @@ var _ = Describe("DNSRecordReconciler_HealthChecks", func() {
"Targets": ConsistOf("172.32.200.2"),
})),
))
g.Expect(dnsRecord.Status.Conditions).To(
ContainElement(MatchFields(IgnoreExtras, Fields{
"Type": Equal(string(v1alpha1.ConditionTypeHealthy)),
"Status": Equal(metav1.ConditionFalse),
"Reason": Equal(string(v1alpha1.ConditionReasonPartiallyHealthy)),
"Message": Equal("Not healthy addresses: [172.32.200.1]"),
})),
)

}, TestTimeoutMedium, time.Second).Should(Succeed())

Expand All @@ -278,6 +286,7 @@ var _ = Describe("DNSRecordReconciler_HealthChecks", func() {
if probe.Spec.Address == "172.32.200.2" {
probe.Status.Healthy = ptr.To(false)
probe.Status.LastCheckedAt = metav1.Now()
probe.Status.ConsecutiveFailures = dnsRecord.Spec.HealthCheck.FailureThreshold + 1
g.Expect(k8sClient.Status().Update(ctx, &probe)).To(Succeed())
updated = true
}
Expand All @@ -287,11 +296,26 @@ var _ = Describe("DNSRecordReconciler_HealthChecks", func() {
}, TestTimeoutMedium, time.Second).Should(Succeed())

// we don't remove EPs if this leads to empty EPs
By("Ensure endpoints are not changed ")
By("Ensure endpoints are not changed and status is updated")
Eventually(func(g Gomega) {
newRecord := &v1alpha1.DNSRecord{}
Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(dnsRecord), newRecord)).To(Succeed())
g.Expect(dnsRecord.Status.Endpoints).To(BeEquivalentTo(newRecord.Status.Endpoints))

g.Expect(newRecord.Status.Conditions).To(ContainElements(
MatchFields(IgnoreExtras, Fields{
"Type": Equal(string(v1alpha1.ConditionTypeHealthy)),
"Status": Equal(metav1.ConditionFalse),
"Reason": Equal(string(v1alpha1.ConditionReasonUnhealthy)),
"Message": Equal("Not healthy addresses: [172.32.200.1 172.32.200.2]"),
}),
MatchFields(IgnoreExtras, Fields{
"Type": Equal(string(v1alpha1.ConditionTypeReady)),
"Status": Equal(metav1.ConditionFalse),
"Reason": Equal(string(v1alpha1.ConditionReasonUnhealthy)),
"Message": Equal("None of the healthchecks succeeded"),
}),
))
}, TestTimeoutMedium, time.Second).Should(Succeed())
})

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,6 @@ metadata:
name: ${testID}
namespace: ${testNamespace}
spec:
healthCheck:
endpoint: "/health"
port: 80
protocol: "HTTPS"
failureThreshold: 3
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this will fail e2e now so removing. Will be brought back in a next PR

endpoints:
- dnsName: 14byhk-2k52h1.klb.${testHostname}
recordTTL: 60
Expand Down
Loading