diff --git a/go/common/host/services.go b/go/common/host/services.go index bda314958..9b1dcfbe4 100644 --- a/go/common/host/services.go +++ b/go/common/host/services.go @@ -162,7 +162,7 @@ type EnclaveService interface { // EvictEnclave will remove the enclave from the list of enclaves, it is used when an enclave is unhealthy // - the enclave guardians are responsible for calling this method when they detect an enclave is unhealthy to notify // the service that it should failover if possible - EvictEnclave(enclaveID *common.EnclaveID) + NotifyUnavailable(enclaveID *common.EnclaveID) // SubmitAndBroadcastTx submits an encrypted transaction to the enclave, and broadcasts it to other hosts on the network (in particular, to the sequencer) SubmitAndBroadcastTx(ctx context.Context, encryptedParams common.EncryptedRequest) (*responses.RawTx, error) diff --git a/go/host/enclave/guardian.go b/go/host/enclave/guardian.go index d5212ad00..55af443fc 100644 --- a/go/host/enclave/guardian.go +++ b/go/host/enclave/guardian.go @@ -275,9 +275,9 @@ func (g *Guardian) mainLoop() { g.logger.Trace("mainLoop - enclave status", "status", g.state.GetStatus()) switch g.state.GetStatus() { case Disconnected, Unavailable: - if unavailableCounter > 3 { + // todo make this eviction trigger configurable once we've settled on how it should work + if unavailableCounter > 10 { // enclave has been unavailable for a while, evict it from the HA pool - // todo - @matt - we need to consider more carefully when to evict an enclave g.evictEnclaveFromHAPool() } // nothing to do, we are waiting for the enclave to be available @@ -609,7 +609,6 @@ func (g *Guardian) periodicBatchProduction() { skipBatchIfEmpty := g.maxBatchInterval > g.batchInterval && time.Since(g.lastBatchCreated) < g.maxBatchInterval err := g.enclaveClient.CreateBatch(context.Background(), skipBatchIfEmpty) if err != nil { - // todo: is this too low a bar for failover? Retry first? 
g.logger.Error("Unable to produce batch", log.ErrKey, err) g.evictEnclaveFromHAPool() } @@ -813,16 +812,12 @@ func (g *Guardian) startSequencerProcesses() { go g.periodicBundleSubmission() } -// evictEnclaveFromHAPool evicts a failing enclave from the HA pool and shuts down the guardian. +// evictEnclaveFromHAPool notifies the enclave service that this enclave is unavailable, so that the service can evict it from the HA pool if appropriate. // This is called when the enclave is unrecoverable and we want to notify the host that it should failover if an // alternative enclave is available. func (g *Guardian) evictEnclaveFromHAPool() { - g.logger.Error("Enclave is unrecoverable - requesting to evict it from HA pool") - err := g.Stop() - if err != nil { - g.logger.Error("Error while stopping guardian of failed enclave", log.ErrKey, err) - } - go g.sl.Enclaves().EvictEnclave(g.enclaveID) + g.logger.Warn("Enclave is unavailable - notifying enclave service to evict it from HA pool if necessary") + go g.sl.Enclaves().NotifyUnavailable(g.enclaveID) } func (g *Guardian) getRollupsAndContractAddrTxs(processed common.ProcessedL1Data) ([]*common.L1RollupTx, bool) { diff --git a/go/host/enclave/service.go b/go/host/enclave/service.go index e027e26c4..c35232ccc 100644 --- a/go/host/enclave/service.go +++ b/go/host/enclave/service.go @@ -122,7 +122,15 @@ func (e *Service) GetEnclaveClients() []common.Enclave { return clients } -func (e *Service) EvictEnclave(enclaveID *common.EnclaveID) { +func (e *Service) NotifyUnavailable(enclaveID *common.EnclaveID) { + if len(e.enclaveGuardians) <= 1 { + e.logger.Info("not running in HA mode, no need to evict enclave", log.EnclaveIDKey, enclaveID) + return + } + if *e.activeSequencerID != *enclaveID { + e.logger.Info("Enclave is not the active sequencer, no need to evict yet.", log.EnclaveIDKey, enclaveID) + return + } failedEnclaveIdx := -1 e.haLock.Lock() defer e.haLock.Unlock() diff --git a/tools/walletextension/common/config.go b/tools/walletextension/common/config.go index 5250392ee..f3ebe71ef 100644 --- 
a/tools/walletextension/common/config.go +++ b/tools/walletextension/common/config.go @@ -25,9 +25,9 @@ type Config struct { RateLimitWindow time.Duration RateLimitMaxConcurrentRequests int - InsideEnclave bool // Indicates if the program is running inside an enclave - KeyExchangeURL string - EnableTLS bool - TLSDomain string - EncryptingCertificateEnabled bool + InsideEnclave bool // Indicates if the program is running inside an enclave + KeyExchangeURL string + EnableTLS bool + TLSDomain string + EncryptingCertificateEnabled bool }