From 54b00498fd888bb48e65cc952cd9035fad28dbfd Mon Sep 17 00:00:00 2001 From: Christian Winther Date: Tue, 16 Jun 2020 23:14:45 +0200 Subject: [PATCH] Removing deadlock in Consul manager The code in `continuouslyAcquireConsulLeadership()` had a logic bomb that could cause any new Consul state changes to not propagate to the reconciler. `continuouslyAcquireConsulLeadership` used to `select` on two channels in an infinite loop * `timer.C`: Periodically would call `acquireConsulLeadership` every 250ms * `m.masterCh`: The intend for the channel was to quickly react to leadership changes and try to acquire it, rather than letting the `timer` trigger the work. This was put in place to decrease recovery time for new leadership election. `acquireConsulLeadership` setups a Consul Lock and tries to acquire the mutex (meaning the Resec process is now leader). It was assumed at the time that `acquireConsulLeadership` would `return` in case it could not acquire the mutex within a reasonable time, and in those cases, the `m.masterCh` would be drained in `continuouslyAcquireConsulLeadership` until the next `timer.C` triggered. But since this is not the case the `m.masterCh` channel would continue to get new writes from `watchConsulMasterService`, until the buffered channel is full, and end up blocking the function - since a full channel will block writes. Responsible for following the IP+Port of the currently elected Resec leader (decided via Consul Lock mutex in `acquireConsulLeadership`). When a leadership change happened, it would update the reconciler state and then write an update to the `m.masterCh` to manually trigger a faster leadership reelection. However, when the writes to `m.masterCh` blocks, it also mean that the reconciler no longer will get any Consul leadership change updates (since they are emitted in the same loop as `m.masterCh` writes that now blocks), so it will be unaware of a new leader having been selected in the Resec culster, causing the process to never recover on its own. Signed-off-by: Christian Winther --- resec/consul/manager.go | 9 --------- resec/consul/new.go | 1 - resec/consul/structs.go | 1 - 3 files changed, 11 deletions(-) diff --git a/resec/consul/manager.go b/resec/consul/manager.go index 81eb854..3cb3e00 100644 --- a/resec/consul/manager.go +++ b/resec/consul/manager.go @@ -40,12 +40,6 @@ func (m *Manager) continuouslyAcquireConsulLeadership() { case <-m.stopCh: return - // if consul master service have changes, immediately try to claim the lock - // since there is a good chance the service changed because the current master - // went away - case <-m.masterCh: - m.acquireConsulLeadership() - // Periodically try to acquire the consul lock case <-timer.C: m.acquireConsulLeadership() @@ -346,9 +340,6 @@ func (m *Manager) watchConsulMasterService() { m.state.MasterPort = master.Service.Port m.emit() - - m.logger.Infof("Saw change in master service. New IP+Port is: %s:%d", m.state.MasterAddr, m.state.MasterPort) - m.masterCh <- true } } } diff --git a/resec/consul/new.go b/resec/consul/new.go index ca24bfe..0838d79 100644 --- a/resec/consul/new.go +++ b/resec/consul/new.go @@ -83,7 +83,6 @@ func NewConnection(c *cli.Context, redisConfig redis.Config) (*Manager, error) { commandCh: make(chan Command, 10), config: consulConfig, logger: log.WithField("system", "consul"), - masterCh: make(chan interface{}, 1), stateCh: make(chan state.Consul, 10), stopCh: make(chan interface{}, 1), state: &state.Consul{ diff --git a/resec/consul/structs.go b/resec/consul/structs.go index 0b20cad..1db8041 100644 --- a/resec/consul/structs.go +++ b/resec/consul/structs.go @@ -17,7 +17,6 @@ type Manager struct { lockCh <-chan struct{} // lock channel used by Consul SDK to notify about changes lockErrorCh <-chan struct{} // lock error channel used by Consul SDK to notify about errors related to the lock logger *log.Entry // logger for the consul connection struct - masterCh chan interface{} // notification channel used to notify the Consul Lock go-routing that the master service changed state *state.Consul // state used by the reconciler stateCh chan state.Consul // state channel used to notify the reconciler of changes stopCh chan interface{} // internal channel used to stop all go-routines when gracefully shutting down