Skip to content

Commit

Permalink
consul: improve reliability of deregistration (#24166)
Browse files Browse the repository at this point in the history
When the local Consul agent receives a deregister request, it performs a
pre-flight check using the locally cached ACL token. The agent then sends the
request upstream to the Consul servers as part of anti-entropy, using its own
token. This requires that the token we use for deregistration is valid even
though that's not the token used to write to the Consul server.

There are several cases where the service identity token might no longer exist
at the time of deregistration:
* A race condition between the sync and destroying the allocation.
* Misconfiguration of the Consul auth method with a TTL.
* Out-of-band destruction of the token.

Additionally, Nomad's sync with Consul returns early if there are any errors,
which means that a single broken token can prevent any other service on the
Nomad agent from being registered or deregistered.

Update Nomad's sync with Consul to use the Nomad agent's own Consul token for
deregistration, regardless of which token the service was registered
with. Accumulate errors from the sync so that they no longer block
deregistration of other services.

Fixes: #20159
  • Loading branch information
tgross authored Oct 11, 2024
1 parent 5bb6d96 commit 4de1665
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 75 deletions.
7 changes: 7 additions & 0 deletions .changelog/24166.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
```release-note:bug
consul: Fixed a bug where broken Consul ACL tokens could block registration and deregistration of services and checks
```

```release-note:bug
consul: Fixed a bug where service deregistration could fail because Consul ACL tokens were revoked during allocation GC
```
122 changes: 57 additions & 65 deletions command/agent/consul/service_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"github.com/armon/go-metrics"
"github.com/hashicorp/consul/api"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-multierror"
"github.com/hashicorp/go-set/v3"
"github.com/hashicorp/nomad/client/serviceregistration"
"github.com/hashicorp/nomad/helper"
Expand Down Expand Up @@ -973,8 +974,7 @@ func (c *ServiceClient) merge(ops *operations) {
func (c *ServiceClient) sync(reason syncReason) error {
c.logger.Trace("execute sync", "reason", reason)

sreg, creg, sdereg, cdereg := 0, 0, 0, 0
var err error
sreg, creg, sdereg, cdereg, fails := 0, 0, 0, 0, 0

// Get the list of all namespaces created so we can iterate them.
namespaces, err := c.namespacesClient.List()
Expand All @@ -1001,8 +1001,10 @@ func (c *ServiceClient) sync(reason syncReason) error {
// de-registering services.
inProbation := time.Now().Before(c.deregisterProbationExpiry)

var mErr *multierror.Error // collect errors for individual services/checks

// Remove Nomad services in Consul but unknown to Nomad.
for id := range servicesInConsul {
for id, service := range servicesInConsul {
if _, ok := c.services[id]; ok {
// Known service, skip
continue
Expand All @@ -1027,38 +1029,16 @@ func (c *ServiceClient) sync(reason syncReason) error {
continue
}

// Get the Consul namespace this service is in.
ns := servicesInConsul[id].Namespace

token := c.getServiceToken(id)

// If this service has a sidecar, we need to remove the sidecar first,
// otherwise Consul will produce a warning and an error when removing
// the parent service.
//
// The sidecar is not tracked on the Nomad side; it was registered
// implicitly through the parent service.
if sidecar := getNomadSidecar(id, servicesInConsul); sidecar != nil {
if err := c.agentAPI.ServiceDeregisterOpts(sidecar.ID,
&api.QueryOptions{Namespace: ns, Token: token},
); err != nil {
metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
return err
}
}

// Remove the unwanted service.
if err := c.agentAPI.ServiceDeregisterOpts(id,
&api.QueryOptions{Namespace: ns, Token: token},
); err != nil {
if isOldNomadService(id) {
// Don't hard-fail on old entries. See #3620
continue
}

// Remove the service and any sidecar; this will return an error early
// if removing the sidecar fails
err := c.syncRemoveService(service.Namespace, id, servicesInConsul)
if err != nil {
metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
return err
mErr = multierror.Append(mErr, err)
fails++
continue
}

sdereg++
metrics.IncrCounter([]string{"client", "consul", "service_deregistrations"}, 1)
}
Expand All @@ -1078,7 +1058,9 @@ func (c *ServiceClient) sync(reason syncReason) error {
Token: token,
}); err != nil {
metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
return err
mErr = multierror.Append(mErr, err)
fails++
continue
}
sreg++
metrics.IncrCounter([]string{"client", "consul", "service_registrations"}, 1)
Expand All @@ -1092,7 +1074,13 @@ func (c *ServiceClient) sync(reason syncReason) error {
nsChecks, err := c.agentAPI.ChecksWithFilterOpts("", &api.QueryOptions{Namespace: normalizeNamespace(namespace)})
if err != nil {
metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
return fmt.Errorf("failed to query Consul checks: %w", err)
err = fmt.Errorf("failed to query Consul checks: %w", err)
if mErr.Len() == 0 {
return err
} else {
mErr = multierror.Append(mErr, err)
return mErr.ErrorOrNil()
}
}
for k, v := range nsChecks {
checksInConsul[k] = v
Expand Down Expand Up @@ -1128,14 +1116,12 @@ func (c *ServiceClient) sync(reason syncReason) error {
// Unknown Nomad managed check; remove. Note: this query has to use the
// Nomad agent's own Consul token, because by definition we don't have
// an associated workload for it
if err := c.agentAPI.CheckDeregisterOpts(id, &api.QueryOptions{Namespace: check.Namespace}); err != nil {
if isOldNomadService(check.ServiceID) {
// Don't hard-fail on old entries.
continue
}

err := c.agentAPI.CheckDeregisterOpts(id, &api.QueryOptions{Namespace: check.Namespace})
if err != nil {
metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
return err
mErr = multierror.Append(mErr, err)
fails++
continue
}
cdereg++
metrics.IncrCounter([]string{"client", "consul", "check_deregistrations"}, 1)
Expand All @@ -1152,16 +1138,39 @@ func (c *ServiceClient) sync(reason syncReason) error {
}
if err := c.agentAPI.CheckRegisterOpts(check, opts); err != nil {
metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
return err
mErr = multierror.Append(mErr, err)
fails++
continue
}
creg++
metrics.IncrCounter([]string{"client", "consul", "check_registrations"}, 1)
}

// Only log if something was actually synced
if sreg > 0 || sdereg > 0 || creg > 0 || cdereg > 0 {
if sreg > 0 || sdereg > 0 || creg > 0 || cdereg > 0 || fails > 0 {
c.logger.Debug("sync complete", "registered_services", sreg, "deregistered_services", sdereg,
"registered_checks", creg, "deregistered_checks", cdereg)
"registered_checks", creg, "deregistered_checks", cdereg, "failures", fails)
}
return mErr.ErrorOrNil()
}

// syncRemoveService removes an unwanted service from Consul. If the service has
// a sidecar, we need to remove the sidecar first, otherwise Consul will produce
// a warning and an error when removing the parent service. So this returns
// early if the sidecar can't be removed.
func (c *ServiceClient) syncRemoveService(ns, id string, servicesInConsul map[string]*api.AgentService) error {
// The sidecar is not tracked on the Nomad side; it was registered
// implicitly through the parent service.
if sidecar := getNomadSidecar(id, servicesInConsul); sidecar != nil {
err := c.agentAPI.ServiceDeregisterOpts(sidecar.ID, &api.QueryOptions{Namespace: ns})
if err != nil {
return err
}
}

err := c.agentAPI.ServiceDeregisterOpts(id, &api.QueryOptions{Namespace: ns})
if err != nil {
return err
}
return nil
}
Expand Down Expand Up @@ -1825,10 +1834,7 @@ func (c *ServiceClient) Shutdown() error {
// Always attempt to deregister Nomad agent Consul entries, even if
// deadline was reached
for id := range c.agentServices.Items() {
opts := &api.QueryOptions{
Token: c.getServiceToken(id),
}
if err := c.agentAPI.ServiceDeregisterOpts(id, opts); err != nil {
if err := c.agentAPI.ServiceDeregisterOpts(id, nil); err != nil {
c.logger.Error("failed deregistering agent service", "service_id", id, "error", err)
}
}
Expand Down Expand Up @@ -1865,10 +1871,8 @@ func (c *ServiceClient) Shutdown() error {
if remainingChecks == nil || checkRemains(id) {
check := remainingChecks[id]
ns := check.Namespace
token := c.getServiceToken(check.ServiceID)

if err := c.agentAPI.CheckDeregisterOpts(id,
&api.QueryOptions{Namespace: ns, Token: token}); err != nil {
&api.QueryOptions{Namespace: ns}); err != nil {
c.logger.Error("failed deregistering agent check", "check_id", id, "error", err)
}
}
Expand Down Expand Up @@ -2043,7 +2047,7 @@ func isNomadAgent(id string) bool {
// service (new or old formats). Agent services return false as independent
// client and server agents may be running on the same machine. #2827
func isNomadService(id string) bool {
return strings.HasPrefix(id, nomadTaskPrefix) || isOldNomadService(id)
return strings.HasPrefix(id, nomadTaskPrefix)
}

// isNomadCheck returns true if the ID matches the pattern of a Nomad managed
Expand All @@ -2052,18 +2056,6 @@ func isNomadCheck(id string) bool {
return strings.HasPrefix(id, nomadCheckPrefix)
}

// isOldNomadService returns true if the ID matches an old pattern managed by
// Nomad.
//
// Pre-0.7.1 task service IDs are of the form:
//
// {nomadServicePrefix}-executor-{ALLOC_ID}-{Service.Name}-{Service.Tags...}
// Example Service ID: _nomad-executor-1234-echo-http-tag1-tag2-tag3
func isOldNomadService(id string) bool {
const prefix = nomadServicePrefix + "-executor"
return strings.HasPrefix(id, prefix)
}

const (
sidecarSuffix = "-sidecar-proxy"
)
Expand Down
11 changes: 1 addition & 10 deletions command/agent/consul/unit_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -891,24 +891,15 @@ func TestIsNomadService(t *testing.T) {
}{
{"_nomad-client-nomad-client-http", false},
{"_nomad-server-nomad-serf", false},

// Pre-0.7.1 style IDs still match
{"_nomad-executor-abc", true},
{"_nomad-executor", true},

// Post-0.7.1 style IDs match
{"_nomad-task-FBBK265QN4TMT25ND4EP42TJVMYJ3HR4", true},

{"not-nomad", false},
{"_nomad", false},
}

for _, test := range tests {
t.Run(test.id, func(t *testing.T) {
actual := isNomadService(test.id)
if actual != test.result {
t.Errorf("%q should be %t but found %t", test.id, test.result, actual)
}
must.Eq(t, test.result, actual)
})
}
}
Expand Down

0 comments on commit 4de1665

Please sign in to comment.