From 4de16659421e1e5e2b23a8b0397bbeb0fd18b645 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Fri, 11 Oct 2024 12:32:23 -0400 Subject: [PATCH] consul: improve reliability of deregistration (#24166) When the local Consul agent receives a deregister request, it performs a pre-flight check using the locally cached ACL token. The agent then sends the request upstream to the Consul servers as part of anti-entropy, using its own token. This requires that the token we use for deregistration is valid even though that's not the token used to write to the Consul server. There are several cases where the service identity token might no longer exist at the time of deregistration: * A race condition between the sync and destroying the allocation. * Misconfiguration of the Consul auth method with a TTL. * Out-of-band destruction of the token. Additionally, Nomad's sync with Consul returns early if there are any errors, which means that a single broken token can prevent any other service on the Nomad agent from being registered or deregistered. Update Nomad's sync with Consul to use the Nomad agent's own Consul token for deregistration, regardless of which token the service was registered with. Accumulate errors from the sync so that they no longer block deregistration of other services. Fixes: https://github.com/hashicorp/nomad/issues/20159 --- .changelog/24166.txt | 7 ++ command/agent/consul/service_client.go | 122 ++++++++++++------------- command/agent/consul/unit_test.go | 11 +-- 3 files changed, 65 insertions(+), 75 deletions(-) create mode 100644 .changelog/24166.txt diff --git a/.changelog/24166.txt b/.changelog/24166.txt new file mode 100644 index 00000000000..ebac6cfe36c --- /dev/null +++ b/.changelog/24166.txt @@ -0,0 +1,7 @@ +```release-note:bug +consul: Fixed a bug where broken Consul ACL tokens could block registration and deregistration of services and checks +``` + +```release-note:bug +consul: Fixed a bug where service deregistration could fail because Consul ACL tokens were revoked during allocation GC +``` diff --git a/command/agent/consul/service_client.go b/command/agent/consul/service_client.go index 973dd3649ef..29904608e05 100644 --- a/command/agent/consul/service_client.go +++ b/command/agent/consul/service_client.go @@ -22,6 +22,7 @@ import ( "github.com/armon/go-metrics" "github.com/hashicorp/consul/api" "github.com/hashicorp/go-hclog" + "github.com/hashicorp/go-multierror" "github.com/hashicorp/go-set/v3" "github.com/hashicorp/nomad/client/serviceregistration" "github.com/hashicorp/nomad/helper" @@ -973,8 +974,7 @@ func (c *ServiceClient) merge(ops *operations) { func (c *ServiceClient) sync(reason syncReason) error { c.logger.Trace("execute sync", "reason", reason) - sreg, creg, sdereg, cdereg := 0, 0, 0, 0 - var err error + sreg, creg, sdereg, cdereg, fails := 0, 0, 0, 0, 0 // Get the list of all namespaces created so we can iterate them. namespaces, err := c.namespacesClient.List() @@ -1001,8 +1001,10 @@ func (c *ServiceClient) sync(reason syncReason) error { // de-registering services. inProbation := time.Now().Before(c.deregisterProbationExpiry) + var mErr *multierror.Error // collect errors for individual services/checks + // Remove Nomad services in Consul but unknown to Nomad. - for id := range servicesInConsul { + for id, service := range servicesInConsul { if _, ok := c.services[id]; ok { // Known service, skip continue @@ -1027,38 +1029,16 @@ func (c *ServiceClient) sync(reason syncReason) error { continue } - // Get the Consul namespace this service is in. - ns := servicesInConsul[id].Namespace - - token := c.getServiceToken(id) - - // If this service has a sidecar, we need to remove the sidecar first, - // otherwise Consul will produce a warning and an error when removing - // the parent service. - // - // The sidecar is not tracked on the Nomad side; it was registered - // implicitly through the parent service. - if sidecar := getNomadSidecar(id, servicesInConsul); sidecar != nil { - if err := c.agentAPI.ServiceDeregisterOpts(sidecar.ID, - &api.QueryOptions{Namespace: ns, Token: token}, - ); err != nil { - metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) - return err - } - } - - // Remove the unwanted service. - if err := c.agentAPI.ServiceDeregisterOpts(id, - &api.QueryOptions{Namespace: ns, Token: token}, - ); err != nil { - if isOldNomadService(id) { - // Don't hard-fail on old entries. See #3620 - continue - } - + // Remove the service and any sidecar; this will return an error early + // if removing the sidecar fails + err := c.syncRemoveService(service.Namespace, id, servicesInConsul) + if err != nil { metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) - return err + mErr = multierror.Append(mErr, err) + fails++ + continue } + sdereg++ metrics.IncrCounter([]string{"client", "consul", "service_deregistrations"}, 1) } @@ -1078,7 +1058,9 @@ func (c *ServiceClient) sync(reason syncReason) error { Token: token, }); err != nil { metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) - return err + mErr = multierror.Append(mErr, err) + fails++ + continue } sreg++ metrics.IncrCounter([]string{"client", "consul", "service_registrations"}, 1) @@ -1092,7 +1074,13 @@ func (c *ServiceClient) sync(reason syncReason) error { nsChecks, err := c.agentAPI.ChecksWithFilterOpts("", &api.QueryOptions{Namespace: normalizeNamespace(namespace)}) if err != nil { metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) - return fmt.Errorf("failed to query Consul checks: %w", err) + err = fmt.Errorf("failed to query Consul checks: %w", err) + if mErr.Len() == 0 { + return err + } else { + mErr = multierror.Append(mErr, err) + return mErr.ErrorOrNil() + } } for k, v := range nsChecks { checksInConsul[k] = v @@ -1128,14 +1116,12 @@ func (c *ServiceClient) sync(reason syncReason) error { // Unknown Nomad managed check; remove. Note: this query has to use the // Nomad agent's own Consul token, because by definition we don't have // an associated workload for it - if err := c.agentAPI.CheckDeregisterOpts(id, &api.QueryOptions{Namespace: check.Namespace}); err != nil { - if isOldNomadService(check.ServiceID) { - // Don't hard-fail on old entries. - continue - } - + err := c.agentAPI.CheckDeregisterOpts(id, &api.QueryOptions{Namespace: check.Namespace}) + if err != nil { metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) - return err + mErr = multierror.Append(mErr, err) + fails++ + continue } cdereg++ metrics.IncrCounter([]string{"client", "consul", "check_deregistrations"}, 1) @@ -1152,16 +1138,39 @@ func (c *ServiceClient) sync(reason syncReason) error { } if err := c.agentAPI.CheckRegisterOpts(check, opts); err != nil { metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) - return err + mErr = multierror.Append(mErr, err) + fails++ + continue } creg++ metrics.IncrCounter([]string{"client", "consul", "check_registrations"}, 1) } // Only log if something was actually synced - if sreg > 0 || sdereg > 0 || creg > 0 || cdereg > 0 { + if sreg > 0 || sdereg > 0 || creg > 0 || cdereg > 0 || fails > 0 { c.logger.Debug("sync complete", "registered_services", sreg, "deregistered_services", sdereg, - "registered_checks", creg, "deregistered_checks", cdereg) + "registered_checks", creg, "deregistered_checks", cdereg, "failures", fails) + } + return mErr.ErrorOrNil() +} + +// syncRemoveService removes an unwanted service from Consul. If the service has +// a sidecar, we need to remove the sidecar first, otherwise Consul will produce +// a warning and an error when removing the parent service. So this returns +// early if the sidecar can't be removed. +func (c *ServiceClient) syncRemoveService(ns, id string, servicesInConsul map[string]*api.AgentService) error { + // The sidecar is not tracked on the Nomad side; it was registered + // implicitly through the parent service. + if sidecar := getNomadSidecar(id, servicesInConsul); sidecar != nil { + err := c.agentAPI.ServiceDeregisterOpts(sidecar.ID, &api.QueryOptions{Namespace: ns}) + if err != nil { + return err + } + } + + err := c.agentAPI.ServiceDeregisterOpts(id, &api.QueryOptions{Namespace: ns}) + if err != nil { + return err } return nil } @@ -1825,10 +1834,7 @@ func (c *ServiceClient) Shutdown() error { // Always attempt to deregister Nomad agent Consul entries, even if // deadline was reached for id := range c.agentServices.Items() { - opts := &api.QueryOptions{ - Token: c.getServiceToken(id), - } - if err := c.agentAPI.ServiceDeregisterOpts(id, opts); err != nil { + if err := c.agentAPI.ServiceDeregisterOpts(id, nil); err != nil { c.logger.Error("failed deregistering agent service", "service_id", id, "error", err) } } @@ -1865,10 +1871,8 @@ func (c *ServiceClient) Shutdown() error { if remainingChecks == nil || checkRemains(id) { check := remainingChecks[id] ns := check.Namespace - token := c.getServiceToken(check.ServiceID) - if err := c.agentAPI.CheckDeregisterOpts(id, - &api.QueryOptions{Namespace: ns, Token: token}); err != nil { + &api.QueryOptions{Namespace: ns}); err != nil { c.logger.Error("failed deregistering agent check", "check_id", id, "error", err) } } @@ -2043,7 +2047,7 @@ func isNomadAgent(id string) bool { // service (new or old formats). Agent services return false as independent // client and server agents may be running on the same machine. #2827 func isNomadService(id string) bool { - return strings.HasPrefix(id, nomadTaskPrefix) || isOldNomadService(id) + return strings.HasPrefix(id, nomadTaskPrefix) } // isNomadCheck returns true if the ID matches the pattern of a Nomad managed @@ -2052,18 +2056,6 @@ func isNomadCheck(id string) bool { return strings.HasPrefix(id, nomadCheckPrefix) } -// isOldNomadService returns true if the ID matches an old pattern managed by -// Nomad. -// -// Pre-0.7.1 task service IDs are of the form: -// -// {nomadServicePrefix}-executor-{ALLOC_ID}-{Service.Name}-{Service.Tags...} -// Example Service ID: _nomad-executor-1234-echo-http-tag1-tag2-tag3 -func isOldNomadService(id string) bool { - const prefix = nomadServicePrefix + "-executor" - return strings.HasPrefix(id, prefix) -} - const ( sidecarSuffix = "-sidecar-proxy" ) diff --git a/command/agent/consul/unit_test.go b/command/agent/consul/unit_test.go index 888a5d2235d..e2d53fce871 100644 --- a/command/agent/consul/unit_test.go +++ b/command/agent/consul/unit_test.go @@ -891,14 +891,7 @@ func TestIsNomadService(t *testing.T) { }{ {"_nomad-client-nomad-client-http", false}, {"_nomad-server-nomad-serf", false}, - - // Pre-0.7.1 style IDs still match - {"_nomad-executor-abc", true}, - {"_nomad-executor", true}, - - // Post-0.7.1 style IDs match {"_nomad-task-FBBK265QN4TMT25ND4EP42TJVMYJ3HR4", true}, - {"not-nomad", false}, {"_nomad", false}, } @@ -906,9 +899,7 @@ func TestIsNomadService(t *testing.T) { for _, test := range tests { t.Run(test.id, func(t *testing.T) { actual := isNomadService(test.id) - if actual != test.result { - t.Errorf("%q should be %t but found %t", test.id, test.result, actual) - } + must.Eq(t, test.result, actual) }) } }