Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Enclave block and Batch health checks #1727

Merged
merged 5 commits into from
Dec 29, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions go/common/async/timestamp.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package async

import (
"sync"
"time"
)

// Timestamp is a thread safe timestamp
type Timestamp struct {
lastTimestamp time.Time
mutex sync.RWMutex
}

func NewAsyncTimestamp(lastTimestamp time.Time) *Timestamp {
return &Timestamp{
lastTimestamp: lastTimestamp,
mutex: sync.RWMutex{},
}
}

// Mark sets the timestamp with the current time
func (at *Timestamp) Mark() {
at.mutex.Lock()
defer at.mutex.Unlock()
at.lastTimestamp = time.Now()
}

// LastTimestamp returns the last set timestamp
func (at *Timestamp) LastTimestamp() time.Time {
at.mutex.RLock()
defer at.mutex.RUnlock()

newTimestamp := at.lastTimestamp
return newTimestamp
}
27 changes: 22 additions & 5 deletions go/enclave/components/batch_registry.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,15 @@ import (
"fmt"
"math/big"
"sync"
"time"

"github.com/ethereum/go-ethereum/core/types"
"github.com/ten-protocol/go-ten/go/enclave/storage"

"github.com/ethereum/go-ethereum/core/state"
gethlog "github.com/ethereum/go-ethereum/log"
gethrpc "github.com/ethereum/go-ethereum/rpc"
"github.com/ten-protocol/go-ten/go/common/async"
"github.com/ten-protocol/go-ten/go/common/errutil"
"github.com/ten-protocol/go-ten/go/common/log"
"github.com/ten-protocol/go-ten/go/common/measure"
Expand All @@ -24,8 +26,10 @@ type batchRegistry struct {
logger gethlog.Logger
headBatchSeq *big.Int // keep track of the last executed batch to optimise db access

batchesCallback func(*core.Batch, types.Receipts)
callbackMutex sync.RWMutex
batchesCallback func(*core.Batch, types.Receipts)
callbackMutex sync.RWMutex
healthTimeout time.Duration
lastExecutedBatch *async.Timestamp
}

func NewBatchRegistry(storage storage.Storage, logger gethlog.Logger) BatchRegistry {
Expand All @@ -42,9 +46,11 @@ func NewBatchRegistry(storage storage.Storage, logger gethlog.Logger) BatchRegis
headBatchSeq = headBatch.SeqNo()
}
return &batchRegistry{
storage: storage,
headBatchSeq: headBatchSeq,
logger: logger,
storage: storage,
headBatchSeq: headBatchSeq,
logger: logger,
healthTimeout: time.Minute,
lastExecutedBatch: async.NewAsyncTimestamp(time.Now().Add(-time.Minute)),
}
}

Expand Down Expand Up @@ -75,6 +81,8 @@ func (br *batchRegistry) OnBatchExecuted(batch *core.Batch, receipts types.Recei
if br.batchesCallback != nil {
br.batchesCallback(batch, receipts)
}

br.lastExecutedBatch.Mark()
}

func (br *batchRegistry) HasGenesisBatch() (bool, error) {
Expand Down Expand Up @@ -193,3 +201,12 @@ func (br *batchRegistry) GetBatchAtHeight(height gethrpc.BlockNumber) (*core.Bat
}
return batch, nil
}

func (br *batchRegistry) HealthCheck() (bool, error) {
lastExecutedBatchTime := br.lastExecutedBatch.LastTimestamp()
if time.Now().After(lastExecutedBatchTime.Add(br.healthTimeout)) {
return false, fmt.Errorf("last executed batch was %s ago", time.Since(lastExecutedBatchTime))
}

return true, nil
}
19 changes: 17 additions & 2 deletions go/enclave/components/block_processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@ package components
import (
"errors"
"fmt"
"time"

"github.com/ten-protocol/go-ten/go/common/async"
"github.com/ten-protocol/go-ten/go/enclave/core"

"github.com/ten-protocol/go-ten/go/enclave/gas"
"github.com/ten-protocol/go-ten/go/enclave/storage"

Expand All @@ -27,7 +28,9 @@ type l1BlockProcessor struct {

// we store the l1 head to avoid expensive db access
// the host is responsible to always submitting the head l1 block
currentL1Head *common.L1BlockHash
currentL1Head *common.L1BlockHash
healthTimeout time.Duration
lastIngestedBlock *async.Timestamp
}

func NewBlockProcessor(storage storage.Storage, cc *crosschain.Processors, gasOracle gas.Oracle, logger gethlog.Logger) L1BlockProcessor {
Expand All @@ -48,6 +51,8 @@ func NewBlockProcessor(storage storage.Storage, cc *crosschain.Processors, gasOr
gasOracle: gasOracle,
crossChainProcessors: cc,
currentL1Head: l1BlockHash,
healthTimeout: time.Minute,
lastIngestedBlock: async.NewAsyncTimestamp(time.Now().Add(-time.Minute)),
}
}

Expand Down Expand Up @@ -77,9 +82,19 @@ func (bp *l1BlockProcessor) Process(br *common.BlockAndReceipts) (*BlockIngestio

h := br.Block.Hash()
bp.currentL1Head = &h
bp.lastIngestedBlock.Mark()
return ingestion, nil
}

func (bp *l1BlockProcessor) HealthCheck() (bool, error) {
otherview marked this conversation as resolved.
Show resolved Hide resolved
lastIngestedBlockTime := bp.lastIngestedBlock.LastTimestamp()
if time.Now().After(lastIngestedBlockTime.Add(bp.healthTimeout)) {
return false, fmt.Errorf("last ingested block was %s ago", time.Since(lastIngestedBlockTime))
}

return true, nil
}

func (bp *l1BlockProcessor) tryAndInsertBlock(br *common.BlockAndReceipts) (*BlockIngestionType, error) {
block := br.Block

Expand Down
3 changes: 3 additions & 0 deletions go/enclave/components/interfaces.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ type L1BlockProcessor interface {
Process(br *common.BlockAndReceipts) (*BlockIngestionType, error)
GetHead() (*common.L1Block, error)
GetCrossChainContractAddress() *gethcommon.Address
HealthCheck() (bool, error)
}

// BatchExecutionContext - Contains all of the data that each batch depends on
Expand Down Expand Up @@ -102,6 +103,8 @@ type BatchRegistry interface {
HasGenesisBatch() (bool, error)

HeadBatchSeq() *big.Int

HealthCheck() (bool, error)
}

type RollupProducer interface {
Expand Down
18 changes: 16 additions & 2 deletions go/enclave/enclave.go
Original file line number Diff line number Diff line change
Expand Up @@ -1248,9 +1248,23 @@ func (e *enclaveImpl) HealthCheck() (bool, common.SystemError) {
e.logger.Info("HealthCheck failed for the enclave storage", log.ErrKey, err)
return false, nil
}

// todo (#1148) - enclave healthcheck operations
otherview marked this conversation as resolved.
Show resolved Hide resolved
enclaveHealthy := true
return storageHealthy && enclaveHealthy, nil
l1blockHealthy, err := e.l1BlockProcessor.HealthCheck()
if err != nil {
// simplest iteration, log the error and just return that it's not healthy
e.logger.Info("HealthCheck failed for the l1 block processor", log.ErrKey, err)
return false, nil
}

l2batchHealthy, err := e.registry.HealthCheck()
if err != nil {
// simplest iteration, log the error and just return that it's not healthy
e.logger.Info("HealthCheck failed for the l2 batch registry", log.ErrKey, err)
return false, nil
}

return storageHealthy && l1blockHealthy && l2batchHealthy, nil
Comment on lines +1251 to +1267
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The implementation of the HealthCheck method has been updated to include checks for l1BlockProcessor and registry. Ensure that the error handling and logging are consistent and that the method's behavior aligns with the intended health check logic.

func (e *enclaveImpl) HealthCheck() (bool, common.SystemError) {
	if e.stopControl.IsStopping() {
		return false, responses.ToInternalError(fmt.Errorf("requested HealthCheck with the enclave stopping"))
	}

	// check the storage health
	storageHealthy, err := e.storage.HealthCheck()
	if err != nil {
		e.logger.Info("HealthCheck failed for the enclave storage", log.ErrKey, err)
		return false, nil
	}

	l1blockHealthy, err := e.l1BlockProcessor.HealthCheck()
	if err != nil {
		e.logger.Info("HealthCheck failed for the l1 block processor", log.ErrKey, err)
		return false, nil
	}

	l2batchHealthy, err := e.registry.HealthCheck()
	if err != nil {
		e.logger.Info("HealthCheck failed for the l2 batch registry", log.ErrKey, err)
		return false, nil
	}

	return storageHealthy && l1blockHealthy && l2batchHealthy, nil
}

Consider enhancing the error handling to provide more detailed information about which component failed the health check. Additionally, the log level for health check failures should be more severe than Info, such as Warn or Error, to reflect the critical nature of health check failures.


Committable suggestion

IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation.

Suggested change
// todo (#1148) - enclave healthcheck operations
enclaveHealthy := true
return storageHealthy && enclaveHealthy, nil
l1blockHealthy, err := e.l1BlockProcessor.HealthCheck()
if err != nil {
// simplest iteration, log the error and just return that it's not healthy
e.logger.Info("HealthCheck failed for the l1 block processor", log.ErrKey, err)
return false, nil
}
l2batchHealthy, err := e.registry.HealthCheck()
if err != nil {
// simplest iteration, log the error and just return that it's not healthy
e.logger.Info("HealthCheck failed for the l2 batch registry", log.ErrKey, err)
return false, nil
}
return storageHealthy && l1blockHealthy && l2batchHealthy, nil
// todo (#1148) - enclave healthcheck operations
if e.stopControl.IsStopping() {
return false, responses.ToInternalError(fmt.Errorf("requested HealthCheck with the enclave stopping"))
}
// check the storage health
storageHealthy, err := e.storage.HealthCheck()
if err != nil {
e.logger.Warn("HealthCheck failed for the enclave storage", log.ErrKey, err)
return false, nil
}
l1blockHealthy, err := e.l1BlockProcessor.HealthCheck()
if err != nil {
e.logger.Warn("HealthCheck failed for the l1 block processor", log.ErrKey, err)
return false, nil
}
l2batchHealthy, err := e.registry.HealthCheck()
if err != nil {
e.logger.Warn("HealthCheck failed for the l2 batch registry", log.ErrKey, err)
return false, nil
}
return storageHealthy && l1blockHealthy && l2batchHealthy, nil

Comment on lines 1248 to +1267
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The HealthCheck method has been updated to include additional health checks for l1BlockProcessor and registry. Ensure that the error handling and logging are consistent and that the method's behavior aligns with the intended health check logic.

func (e *enclaveImpl) HealthCheck() (bool, common.SystemError) {
	if e.stopControl.IsStopping() {
		return false, responses.ToInternalError(fmt.Errorf("requested HealthCheck with the enclave stopping"))
	}

	// check the storage health
	storageHealthy, err := e.storage.HealthCheck()
	if err != nil {
		e.logger.Info("HealthCheck failed for the enclave storage", log.ErrKey, err)
		return false, nil
	}

	l1blockHealthy, err := e.l1BlockProcessor.HealthCheck()
	if err != nil {
		e.logger.Info("HealthCheck failed for the l1 block processor", log.ErrKey, err)
		return false, nil
	}

	l2batchHealthy, err := e.registry.HealthCheck()
	if err != nil {
		e.logger.Info("HealthCheck failed for the l2 batch registry", log.ErrKey, err)
		return false, nil
	}

	return storageHealthy && l1blockHealthy && l2batchHealthy, nil
}

Consider enhancing the error handling to provide more detailed information about which component failed the health check. Additionally, the log level for health check failures should be more severe than Info, such as Warn or Error, to reflect the critical nature of health check failures.


Committable suggestion

IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation.

Suggested change
e.logger.Info("HealthCheck failed for the enclave storage", log.ErrKey, err)
return false, nil
}
// todo (#1148) - enclave healthcheck operations
enclaveHealthy := true
return storageHealthy && enclaveHealthy, nil
l1blockHealthy, err := e.l1BlockProcessor.HealthCheck()
if err != nil {
// simplest iteration, log the error and just return that it's not healthy
e.logger.Info("HealthCheck failed for the l1 block processor", log.ErrKey, err)
return false, nil
}
l2batchHealthy, err := e.registry.HealthCheck()
if err != nil {
// simplest iteration, log the error and just return that it's not healthy
e.logger.Info("HealthCheck failed for the l2 batch registry", log.ErrKey, err)
return false, nil
}
return storageHealthy && l1blockHealthy && l2batchHealthy, nil
e.logger.Warn("HealthCheck failed for the enclave storage", log.ErrKey, err)
return false, responses.ToInternalError(err)
}
// todo (#1148) - enclave healthcheck operations
l1blockHealthy, err := e.l1BlockProcessor.HealthCheck()
if err != nil {
e.logger.Warn("HealthCheck failed for the l1 block processor", log.ErrKey, err)
return false, responses.ToInternalError(err)
}
l2batchHealthy, err := e.registry.HealthCheck()
if err != nil {
e.logger.Warn("HealthCheck failed for the l2 batch registry", log.ErrKey, err)
return false, responses.ToInternalError(err)
}
return storageHealthy && l1blockHealthy && l2batchHealthy, nil

The HealthCheck method has been updated to include additional health checks for l1BlockProcessor and registry. Ensure that the error handling and logging are consistent and that the method's behavior aligns with the intended health check logic.

func (e *enclaveImpl) HealthCheck() (bool, common.SystemError) {
	if e.stopControl.IsStopping() {
		return false, responses.ToInternalError(fmt.Errorf("requested HealthCheck with the enclave stopping"))
	}

	// check the storage health
	storageHealthy, err := e.storage.HealthCheck()
	if err != nil {
		e.logger.Info("HealthCheck failed for the enclave storage", log.ErrKey, err)
		return false, nil
	}

	l1blockHealthy, err := e.l1BlockProcessor.HealthCheck()
	if err != nil {
		e.logger.Info("HealthCheck failed for the l1 block processor", log.ErrKey, err)
		return false, nil
	}

	l2batchHealthy, err := e.registry.HealthCheck()
	if err != nil {
		e.logger.Info("HealthCheck failed for the l2 batch registry", log.ErrKey, err)
		return false, nil
	}

	return storageHealthy && l1blockHealthy && l2batchHealthy, nil
}

Consider enhancing the error handling to provide more detailed information about which component failed the health check. Additionally, the log level for health check failures should be more severe than Info, such as Warn or Error, to reflect the critical nature of health check failures.


Committable suggestion

IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation.

Suggested change
e.logger.Info("HealthCheck failed for the enclave storage", log.ErrKey, err)
return false, nil
}
// todo (#1148) - enclave healthcheck operations
enclaveHealthy := true
return storageHealthy && enclaveHealthy, nil
l1blockHealthy, err := e.l1BlockProcessor.HealthCheck()
if err != nil {
// simplest iteration, log the error and just return that it's not healthy
e.logger.Info("HealthCheck failed for the l1 block processor", log.ErrKey, err)
return false, nil
}
l2batchHealthy, err := e.registry.HealthCheck()
if err != nil {
// simplest iteration, log the error and just return that it's not healthy
e.logger.Info("HealthCheck failed for the l2 batch registry", log.ErrKey, err)
return false, nil
}
return storageHealthy && l1blockHealthy && l2batchHealthy, nil
e.logger.Warn("HealthCheck failed for the enclave storage", log.ErrKey, err)
return false, responses.ToInternalError(err)
}
l1blockHealthy, err := e.l1BlockProcessor.HealthCheck()
if err != nil {
e.logger.Warn("HealthCheck failed for the l1 block processor", log.ErrKey, err)
return false, responses.ToInternalError(err)
}
l2batchHealthy, err := e.registry.HealthCheck()
if err != nil {
e.logger.Warn("HealthCheck failed for the l2 batch registry", log.ErrKey, err)
return false, responses.ToInternalError(err)
}
return storageHealthy && l1blockHealthy && l2batchHealthy, nil

}

func (e *enclaveImpl) DebugTraceTransaction(txHash gethcommon.Hash, config *tracers.TraceConfig) (json.RawMessage, common.SystemError) {
Expand Down
8 changes: 6 additions & 2 deletions go/enclave/storage/storage.go
Original file line number Diff line number Diff line change
Expand Up @@ -266,10 +266,14 @@ func (s *storageImpl) HealthCheck() (bool, error) {
defer s.logDuration("HealthCheck", measure.NewStopwatch())
headBatch, err := s.FetchHeadBatch()
if err != nil {
s.logger.Info("HealthCheck failed for enclave storage", log.ErrKey, err)
otherview marked this conversation as resolved.
Show resolved Hide resolved
return false, err
}
return headBatch != nil, nil

if headBatch == nil {
return false, fmt.Errorf("head batch is nil")
}

return true, nil
}

func (s *storageImpl) FetchHeadBatchForBlock(blockHash common.L1BlockHash) (*core.Batch, error) {
Expand Down
5 changes: 5 additions & 0 deletions go/enclave/txpool/txpool_mock_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,11 @@ func (m *mockBatchRegistry) HasGenesisBatch() (bool, error) {
panic("implement me")
}

func (m *mockBatchRegistry) HealthCheck() (bool, error) {
// TODO implement me
panic("implement me")
}
Comment on lines +60 to +63
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The HealthCheck method in mockBatchRegistry is not implemented and will cause a panic if called. Consider implementing the method or updating the TODO comment with more details about the intended implementation.

func (m *mockBatchRegistry) HealthCheck() (bool, error) {
-	// TODO implement me
-	panic("implement me")
+	// TODO: Implement the health check logic or return mock values for testing purposes.
+	// return true, nil // Example mock implementation
}

Committable suggestion

IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation.

Suggested change
func (m *mockBatchRegistry) HealthCheck() (bool, error) {
// TODO implement me
panic("implement me")
}
func (m *mockBatchRegistry) HealthCheck() (bool, error) {
// TODO: Implement the health check logic or return mock values for testing purposes.
// return true, nil // Example mock implementation
}


func (m *mockBatchRegistry) HeadBatchSeq() *big.Int {
return m.currentBatch.SeqNo()
}
Expand Down
12 changes: 10 additions & 2 deletions integration/simulation/simulation.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
"context"
"errors"
"fmt"
"github.com/ten-protocol/go-ten/go/common/retry"

Check failure on line 7 in integration/simulation/simulation.go

View workflow job for this annotation

GitHub Actions / lint

File is not `gofumpt`-ed (gofumpt)
"math/big"
"sync"
"time"

Check failure on line 10 in integration/simulation/simulation.go

View workflow job for this annotation

GitHub Actions / lint

File is not `gofumpt`-ed (gofumpt)

Check failure on line 11 in integration/simulation/simulation.go

View workflow job for this annotation

GitHub Actions / lint

File is not `goimports`-ed (goimports)
"github.com/ethereum/go-ethereum"
"github.com/ethereum/go-ethereum/accounts/abi/bind"
"github.com/ethereum/go-ethereum/core/types"
Expand Down Expand Up @@ -288,8 +289,15 @@

func (s *Simulation) checkHealthStatus() {
for _, client := range s.RPCHandles.ObscuroClients {
if healthy, err := client.Health(); !healthy || err != nil {
panic("Client is not healthy: " + err.Error())
err := retry.Do(func() error {
healthy, err := client.Health()
if !healthy || err != nil {
return fmt.Errorf("client is not healthy: %w", err)
}
return nil
}, retry.NewTimeoutStrategy(30*time.Second, 100*time.Millisecond))
if err != nil {
panic(err)
Comment on lines +292 to +300
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The implementation of the retry mechanism in checkHealthStatus uses a timeout strategy, which is a good practice for network-related operations that might occasionally fail. However, ensure that the panic does not cause the entire application to crash in a production environment. It might be more appropriate to handle the error gracefully and log it for further investigation.

- panic(err)
+ // Log the error and handle it appropriately without causing a system-wide failure.
+ log.Error("Health check failed", "error", err)

Committable suggestion

IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation.

Suggested change
err := retry.Do(func() error {
healthy, err := client.Health()
if !healthy || err != nil {
return fmt.Errorf("client is not healthy: %w", err)
}
return nil
}, retry.NewTimeoutStrategy(30*time.Second, 100*time.Millisecond))
if err != nil {
panic(err)
err := retry.Do(func() error {
healthy, err := client.Health()
if !healthy || err != nil {
return fmt.Errorf("client is not healthy: %w", err)
}
return nil
}, retry.NewTimeoutStrategy(30*time.Second, 100*time.Millisecond))
if err != nil {
// Log the error and handle it appropriately without causing a system-wide failure.
log.Error("Health check failed", "error", err)

}
}
}
Expand Down
Loading