-
Notifications
You must be signed in to change notification settings - Fork 1.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
BCI-2450: Health check for MultiNode to ensure that node is not in syncing state #11765
Changes from 3 commits
2ec068f
09ba32d
d36798f
fcba78f
1b5a54e
265ea7f
902d49a
cd98bac
056bbee
450962c
e866e07
5b428a6
e1fb9d3
31c3fd2
690503f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -224,53 +224,81 @@ func (n *node[CHAIN_ID, HEAD, RPC]) start(startCtx context.Context) { | |
} | ||
n.setState(nodeStateDialed) | ||
|
||
if err := n.verify(startCtx); errors.Is(err, errInvalidChainID) { | ||
n.lfcLog.Errorw("Verify failed: Node has the wrong chain ID", "err", err) | ||
n.declareInvalidChainID() | ||
return | ||
} else if err != nil { | ||
n.lfcLog.Errorw(fmt.Sprintf("Verify failed: %v", err), "err", err) | ||
n.declareUnreachable() | ||
return | ||
} | ||
|
||
n.declareAlive() | ||
state := n.verifyConn(startCtx, n.lfcLog) | ||
n.declareState(state) | ||
} | ||
|
||
// verify checks that all connections to eth nodes match the given chain ID | ||
// verifyChainID checks that connection to the node matches the given chain ID | ||
// Not thread-safe | ||
// Pure verify: does not mutate node "state" field. | ||
func (n *node[CHAIN_ID, HEAD, RPC]) verify(callerCtx context.Context) (err error) { | ||
// Pure verifyChainID: does not mutate node "state" field. | ||
func (n *node[CHAIN_ID, HEAD, RPC]) verifyChainID(callerCtx context.Context, lggr logger.Logger) nodeState { | ||
promPoolRPCNodeVerifies.WithLabelValues(n.chainFamily, n.chainID.String(), n.name).Inc() | ||
promFailed := func() { | ||
promPoolRPCNodeVerifiesFailed.WithLabelValues(n.chainFamily, n.chainID.String(), n.name).Inc() | ||
} | ||
|
||
st := n.State() | ||
switch st { | ||
case nodeStateDialed, nodeStateOutOfSync, nodeStateInvalidChainID: | ||
case nodeStateDialed, nodeStateOutOfSync, nodeStateInvalidChainID, nodeStateSyncing: | ||
default: | ||
panic(fmt.Sprintf("cannot verify node in state %v", st)) | ||
} | ||
|
||
var chainID CHAIN_ID | ||
var err error | ||
if chainID, err = n.rpc.ChainID(callerCtx); err != nil { | ||
promFailed() | ||
return fmt.Errorf("failed to verify chain ID for node %s: %w", n.name, err) | ||
lggr.Errorw("Failed to verify chain ID for node", "err", err, "nodeState", n.State()) | ||
return nodeStateUnreachable | ||
} else if chainID.String() != n.chainID.String() { | ||
promFailed() | ||
return fmt.Errorf( | ||
err = fmt.Errorf( | ||
"rpc ChainID doesn't match local chain ID: RPC ID=%s, local ID=%s, node name=%s: %w", | ||
chainID.String(), | ||
n.chainID.String(), | ||
n.name, | ||
errInvalidChainID, | ||
) | ||
lggr.Errorw("Failed to verify RPC node; remote endpoint returned the wrong chain ID", "err", err, "nodeState", n.State()) | ||
return nodeStateInvalidChainID | ||
} | ||
|
||
promPoolRPCNodeVerifiesSuccess.WithLabelValues(n.chainFamily, n.chainID.String(), n.name).Inc() | ||
|
||
return nil | ||
return nodeStateAlive | ||
} | ||
|
||
// createVerifiedConn - establishes new connection with the RPC and verifies that it's valid: chainID matches, and it's not syncing. | ||
// Returns desired state if one of the verifications fails. Otherwise, returns nodeStateAlive. | ||
func (n *node[CHAIN_ID, HEAD, RPC]) createVerifiedConn(ctx context.Context, lggr logger.Logger) nodeState { | ||
if err := n.rpc.Dial(ctx); err != nil { | ||
n.lfcLog.Errorw("Dial failed: Node is unreachable", "err", err, "nodeState", n.State()) | ||
return nodeStateUnreachable | ||
} | ||
|
||
return n.verifyConn(ctx, lggr) | ||
} | ||
|
||
// verifyConn - verifies that current connection is valid: chainID matches, and it's not syncing. | ||
// Returns desired state if one of the verifications fails. Otherwise, returns nodeStateAlive. | ||
func (n *node[CHAIN_ID, HEAD, RPC]) verifyConn(ctx context.Context, lggr logger.Logger) nodeState { | ||
state := n.verifyChainID(ctx, lggr) | ||
if state != nodeStateAlive { | ||
return state | ||
} | ||
|
||
isSyncing, err := n.rpc.IsSyncing(ctx) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we put an additional config setting here? This will allow us to skip IsSyncing() calls for chains/clients that don't reliably implement this functionality, or have bugs around this behavior. |
||
if err != nil { | ||
lggr.Errorw("Unexpected error while verifying RPC node synchronization status", "err", err, "nodeState", n.State()) | ||
return nodeStateUnreachable | ||
} | ||
|
||
if isSyncing { | ||
lggr.Errorw("Verification failed: Node is syncing", "nodeState", n.State()) | ||
return nodeStateSyncing | ||
} | ||
|
||
return nodeStateAlive | ||
} | ||
|
||
// disconnectAll disconnects all clients connected to the node | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why does this return a state now?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
my initial thought would be an error return from the name of the method
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@samsondav
This way we unify error handling and logging across multiple callers and unify signature with other methods that perform similar actions - verify a new connection.
Previously
verify
was called directly in multiple places:outOfSyncLoop
,unreachableLoop
,invalidChainIDLoop
andstart
. NewsyncingLoop
also needs to check that chainID matches.All the callers had to log the error and do corresponding state transition.
With new
syncing
check we would endup with something like thisInstead we can do this
@poopoothegorilla yes, the naming is not perfect, but I do not have better options. What do you suggest?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yeh i dont think the name is that big of a deal...
I guess one option could be to return both the nodeState and error... that way you could just have the caller handle the logging and you might not have to pass the lggr through all callsIts been a while since i read the PR sorry
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yeah this makes sense