From 61376d21ce52bdf427a7c2503ea246ed2402ccb6 Mon Sep 17 00:00:00 2001 From: Lukasz Zimnoch Date: Mon, 20 May 2024 12:37:55 +0200 Subject: [PATCH] Changes in the inactivity claim trigger Here we introduce the following changes: - Inactivity claims are always issued with a non-empty inactive members set. This is a hard on-chain requirement that was missed so far - Heartbeat signing errors are not counted as consecutive heartbeat inactivity failure. This is because if signing failed, the inactivity claim will likely fail as well. Moreover, in case of signing error the inactive members set cannot be determined which violates the requirement from the first point --- pkg/tbtc/heartbeat.go | 73 +++++++++++++++++++++-------------- pkg/tbtc/heartbeat_test.go | 49 +++++++++++++++++------ pkg/tbtc/signing.go | 22 +++++------ pkg/tbtc/signing_loop.go | 20 ++++++++-- pkg/tbtc/signing_loop_test.go | 56 +++++++++++++++++++-------- 5 files changed, 149 insertions(+), 71 deletions(-) diff --git a/pkg/tbtc/heartbeat.go b/pkg/tbtc/heartbeat.go index 770d055879..c86afd88db 100644 --- a/pkg/tbtc/heartbeat.go +++ b/pkg/tbtc/heartbeat.go @@ -34,8 +34,8 @@ const ( // of 25 blocks is roughly 5 minutes, assuming 12 seconds per block. heartbeatTimeoutSafetyMarginBlocks = 25 // heartbeatSigningMinimumActiveOperators determines the minimum number of - // active operators during signing for a heartbeat to be considered valid. - heartbeatSigningMinimumActiveOperators = 70 + // active members during signing for a heartbeat to be considered valid. + heartbeatSigningMinimumActiveMembers = 70 // heartbeatConsecutiveFailuresThreshold determines the number of consecutive // heartbeat failures required to trigger inactivity operator notification. heartbeatConsecutiveFailureThreshold = 3 @@ -60,7 +60,7 @@ type heartbeatSigningExecutor interface { ctx context.Context, message *big.Int, startBlock uint64, - ) (*tecdsa.Signature, uint32, uint64, error) + ) (*tecdsa.Signature, *signingActivityReport, uint64, error) } // heartbeatInactivityClaimExecutor is an interface meant to decouple the @@ -168,51 +168,69 @@ func (ha *heartbeatAction) execute() error { ) defer cancelHeartbeatSigningCtx() - signature, activeOperatorsCount, _, err := ha.signingExecutor.sign( + signature, activityReport, _, err := ha.signingExecutor.sign( heartbeatSigningCtx, messageToSign, ha.startBlock, ) + if err != nil { + // Do not count this error as heartbeat inactivity failure. If the + // process returned an error here, that likely means the group signing + // threshold was not met. In such a case, the inactivity claim does not + // have a chance for success anyway (it needs the group threshold to + // be met as well). + return fmt.Errorf("heartbeat signing process errored out: [%v]", err) + } - // If there was no error and the number of active operators during signing - // was enough, we can consider the heartbeat procedure as successful. - if err == nil && activeOperatorsCount >= heartbeatSigningMinimumActiveOperators { + // If the number of active members during signing was enough, we can + // consider the heartbeat procedure as successful. + activeMembersCount := len(activityReport.activeMembers) + if activeMembersCount >= heartbeatSigningMinimumActiveMembers { ha.logger.Infof( - "successfully generated signature [%s] for heartbeat message [0x%x]", + "heartbeat generated signature [%s] for message [0x%x]", signature, ha.proposal.Message[:], ) - // Reset the counter for consecutive heartbeat failure. + // Reset the counter of consecutive heartbeat inactivity failures. ha.failureCounter.reset(walletKey) return nil } - // If there was an error or the number of active operators during signing - // was not enough, we must consider the heartbeat procedure as a failure. + // If the number of active members during signing was not enough, we + // must consider the heartbeat procedure as an inactivity failure. ha.logger.Warnf( - "heartbeat failed; [%d/%d] operators participated; the process "+ - "returned [%v] as error", - activeOperatorsCount, - heartbeatSigningMinimumActiveOperators, - err, + "heartbeat generated signature but minimum activity "+ + "threshold was not met ([%d/%d] members participated); "+ + "counting it as inactivity failure", + activeMembersCount, + heartbeatSigningMinimumActiveMembers, ) - // Increment the heartbeat failure counter. + // Increment the heartbeat inactivity failure counter. ha.failureCounter.increment(walletKey) - // If the number of consecutive heartbeat failures does not exceed the - // threshold do not notify about operator inactivity. + // If the number of consecutive heartbeat inactivity failures does not + // exceed the threshold, do not issue an inactivity claim yet. if ha.failureCounter.get(walletKey) < heartbeatConsecutiveFailureThreshold { ha.logger.Warnf( - "leaving without notifying about operator inactivity; current "+ - "heartbeat failure count is [%d]", + "not issuing an inactivity claim yet; current consecutive"+ + "heartbeat inactivity failure count is [%d/%d]", ha.failureCounter.get(walletKey), + heartbeatConsecutiveFailureThreshold, ) return nil } + // This should not happen but check it just in case as inactivity claim + // requires a non-empty inactive members set. + if len(activityReport.inactiveMembers) == 0 { + return fmt.Errorf( + "inactivity claim aborted due to an undetermined set of inactive members", + ) + } + heartbeatInactivityCtx, cancelHeartbeatInactivityCtx := withCancelOnBlock( context.Background(), ha.expiryBlock-heartbeatTimeoutSafetyMarginBlocks, @@ -220,15 +238,14 @@ func (ha *heartbeatAction) execute() error { ) defer cancelHeartbeatInactivityCtx() - // The value of consecutive heartbeat failures exceeds the threshold. - // Proceed with operator inactivity notification. + // The value of consecutive heartbeat inactivity failures exceeds the threshold. + // Proceed with operator inactivity claim. err = ha.inactivityClaimExecutor.claimInactivity( heartbeatInactivityCtx, - // Leave the list of inactive operators empty even if some operators - // were inactive during signing heartbeat. The inactive operators could - // simply be in the process of unstaking and therefore should not be - // punished. - []group.MemberIndex{}, + // It's safe to consider unstaking members as inactive members in the claim. + // Inactive members are set ineligible for on-chain rewards for a certain + // period of time. This is a desired outcome for unstaking members as well. + activityReport.inactiveMembers, true, messageToSign, ) diff --git a/pkg/tbtc/heartbeat_test.go b/pkg/tbtc/heartbeat_test.go index 6946904201..c635659a08 100644 --- a/pkg/tbtc/heartbeat_test.go +++ b/pkg/tbtc/heartbeat_test.go @@ -6,6 +6,7 @@ import ( "encoding/hex" "fmt" "math/big" + "reflect" "testing" "github.com/keep-network/keep-core/internal/testutils" @@ -51,7 +52,7 @@ func TestHeartbeatAction_HappyPath(t *testing.T) { // Set the active operators count to the minimum required value. mockExecutor := &mockHeartbeatSigningExecutor{} - mockExecutor.activeOperatorsCount = heartbeatSigningMinimumActiveOperators + mockExecutor.activeOperatorsCount = heartbeatSigningMinimumActiveMembers inactivityClaimExecutor := &mockInactivityClaimExecutor{} @@ -130,7 +131,7 @@ func TestHeartbeatAction_OperatorUnstaking(t *testing.T) { // Set the active operators count to the minimum required value. mockExecutor := &mockHeartbeatSigningExecutor{} - mockExecutor.activeOperatorsCount = heartbeatSigningMinimumActiveOperators + mockExecutor.activeOperatorsCount = heartbeatSigningMinimumActiveMembers inactivityClaimExecutor := &mockInactivityClaimExecutor{} @@ -193,7 +194,7 @@ func TestHeartbeatAction_Failure_SigningError(t *testing.T) { mockExecutor := &mockHeartbeatSigningExecutor{} mockExecutor.shouldFail = true - mockExecutor.activeOperatorsCount = heartbeatSigningMinimumActiveOperators + mockExecutor.activeOperatorsCount = heartbeatSigningMinimumActiveMembers inactivityClaimExecutor := &mockInactivityClaimExecutor{} @@ -217,14 +218,22 @@ func TestHeartbeatAction_Failure_SigningError(t *testing.T) { // Do not expect the execution to result in an error. Signing error does not // mean the procedure failure. err = action.execute() - if err != nil { - t.Fatal(err) + + expectedError := fmt.Errorf("heartbeat signing process errored out: [oofta]") + if !reflect.DeepEqual(expectedError, err) { + t.Errorf( + "unexpected error\n"+ + "expected: %v\n"+ + "actual: %v\n", + expectedError, + err, + ) } testutils.AssertUintsEqual( t, "heartbeat failure count", - 1, + 0, uint64(heartbeatFailureCounter.get(walletPublicKeyStr)), ) testutils.AssertBigIntsEqual( @@ -264,7 +273,7 @@ func TestHeartbeatAction_Failure_TooFewActiveOperators(t *testing.T) { // Set the active operators count just below the required number. mockExecutor := &mockHeartbeatSigningExecutor{} - mockExecutor.activeOperatorsCount = heartbeatSigningMinimumActiveOperators - 1 + mockExecutor.activeOperatorsCount = heartbeatSigningMinimumActiveMembers - 1 inactivityClaimExecutor := &mockInactivityClaimExecutor{} @@ -344,7 +353,7 @@ func TestHeartbeatAction_Failure_CounterExceeded(t *testing.T) { hostChain.setHeartbeatProposalValidationResult(proposal, true) mockExecutor := &mockHeartbeatSigningExecutor{} - mockExecutor.shouldFail = true + mockExecutor.activeOperatorsCount = heartbeatSigningMinimumActiveMembers - 1 inactivityClaimExecutor := &mockInactivityClaimExecutor{} @@ -424,7 +433,7 @@ func TestHeartbeatAction_Failure_InactivityExecutionFailure(t *testing.T) { hostChain.setHeartbeatProposalValidationResult(proposal, true) mockExecutor := &mockHeartbeatSigningExecutor{} - mockExecutor.shouldFail = true + mockExecutor.activeOperatorsCount = heartbeatSigningMinimumActiveMembers - 1 inactivityClaimExecutor := &mockInactivityClaimExecutor{} inactivityClaimExecutor.shouldFail = true @@ -603,15 +612,31 @@ func (mhse *mockHeartbeatSigningExecutor) sign( ctx context.Context, message *big.Int, startBlock uint64, -) (*tecdsa.Signature, uint32, uint64, error) { +) (*tecdsa.Signature, *signingActivityReport, uint64, error) { mhse.requestedMessage = message mhse.requestedStartBlock = startBlock if mhse.shouldFail { - return nil, 0, 0, fmt.Errorf("oofta") + return nil, nil, 0, fmt.Errorf("oofta") + } + + activeMembers := make([]group.MemberIndex, 0) + inactiveMembers := make([]group.MemberIndex, 0) + + for memberIndex := uint32(1); memberIndex <= 100; memberIndex++ { + if memberIndex <= mhse.activeOperatorsCount { + activeMembers = append(activeMembers, group.MemberIndex(memberIndex)) + } else { + inactiveMembers = append(inactiveMembers, group.MemberIndex(memberIndex)) + } + } + + activityReport := &signingActivityReport{ + activeMembers: activeMembers, + inactiveMembers: inactiveMembers, } - return &tecdsa.Signature{}, mhse.activeOperatorsCount, startBlock + 1, nil + return &tecdsa.Signature{}, activityReport, startBlock + 1, nil } type mockInactivityClaimExecutor struct { diff --git a/pkg/tbtc/signing.go b/pkg/tbtc/signing.go index b4ab2b069e..370e8df583 100644 --- a/pkg/tbtc/signing.go +++ b/pkg/tbtc/signing.go @@ -174,9 +174,9 @@ func (se *signingExecutor) sign( ctx context.Context, message *big.Int, startBlock uint64, -) (*tecdsa.Signature, uint32, uint64, error) { +) (*tecdsa.Signature, *signingActivityReport, uint64, error) { if lockAcquired := se.lock.TryAcquire(1); !lockAcquired { - return nil, 0, 0, errSigningExecutorBusy + return nil, nil, 0, errSigningExecutorBusy } defer se.lock.Release(1) @@ -184,7 +184,7 @@ func (se *signingExecutor) sign( walletPublicKeyBytes, err := marshalPublicKey(wallet.publicKey) if err != nil { - return nil, 0, 0, fmt.Errorf("cannot marshal wallet public key: [%v]", err) + return nil, nil, 0, fmt.Errorf("cannot marshal wallet public key: [%v]", err) } loopTimeoutBlock := startBlock + @@ -198,9 +198,9 @@ func (se *signingExecutor) sign( ) type signingOutcome struct { - signature *tecdsa.Signature - activeMembersCount uint32 - endBlock uint64 + signature *tecdsa.Signature + activityReport *signingActivityReport + endBlock uint64 } wg := sync.WaitGroup{} @@ -367,9 +367,9 @@ func (se *signingExecutor) sign( ) signingOutcomeChan <- &signingOutcome{ - signature: loopResult.result.Signature, - activeMembersCount: loopResult.activeMembersCount, - endBlock: loopResult.latestEndBlock, + signature: loopResult.result.Signature, + activityReport: loopResult.activityReport, + endBlock: loopResult.latestEndBlock, } }(currentSigner) } @@ -386,9 +386,9 @@ func (se *signingExecutor) sign( // signer, that means all signers failed and have not produced a signature. select { case outcome := <-signingOutcomeChan: - return outcome.signature, outcome.activeMembersCount, outcome.endBlock, nil + return outcome.signature, outcome.activityReport, outcome.endBlock, nil default: - return nil, 0, 0, fmt.Errorf("all signers failed") + return nil, nil, 0, fmt.Errorf("all signers failed") } } diff --git a/pkg/tbtc/signing_loop.go b/pkg/tbtc/signing_loop.go index 48f881df03..7e787f1975 100644 --- a/pkg/tbtc/signing_loop.go +++ b/pkg/tbtc/signing_loop.go @@ -140,13 +140,20 @@ type signingAttemptParams struct { // signingAttemptFn represents a function performing a signing attempt. type signingAttemptFn func(*signingAttemptParams) (*signing.Result, uint64, error) +// signingActivityReport holds information about the activity of the signing +// group members during the signing process. +type signingActivityReport struct { + activeMembers []group.MemberIndex + inactiveMembers []group.MemberIndex +} + // signingRetryLoopResult represents the result of the signing retry loop. type signingRetryLoopResult struct { // result is the outcome of the signing process. result *signing.Result - // activeMembersCount is the number of members that participated in the - // signing process. - activeMembersCount uint32 + // activityReport holds information about the activity of the signing + // group members during the signing process. + activityReport *signingActivityReport // latestEndBlock is the block at which the slowest signer of the successful // signing attempt completed signature computation. This block is also // the common end block accepted by all other members of the signing group. @@ -409,9 +416,14 @@ func (srl *signingRetryLoop) start( continue } + activityReport := &signingActivityReport{ + activeMembers: readyMembersIndexes, + inactiveMembers: unreadyMembersIndexes, + } + return &signingRetryLoopResult{ result: result, - activeMembersCount: uint32(len(readyMembersIndexes)), + activityReport: activityReport, latestEndBlock: latestEndBlock, attemptTimeoutBlock: timeoutBlock, }, nil diff --git a/pkg/tbtc/signing_loop_test.go b/pkg/tbtc/signing_loop_test.go index cf3b02e716..93397a9ef2 100644 --- a/pkg/tbtc/signing_loop_test.go +++ b/pkg/tbtc/signing_loop_test.go @@ -100,8 +100,11 @@ func TestSigningRetryLoop(t *testing.T) { }, expectedErr: nil, expectedResult: &signingRetryLoopResult{ - result: testResult, - activeMembersCount: 10, + result: testResult, + activityReport: &signingActivityReport{ + activeMembers: signingGroupMembersIndexes, + inactiveMembers: []group.MemberIndex{}, + }, latestEndBlock: 215, // the end block resolved by the done check phase attemptTimeoutBlock: 236, // start block of the first attempt + 30 }, @@ -151,8 +154,11 @@ func TestSigningRetryLoop(t *testing.T) { }, expectedErr: nil, expectedResult: &signingRetryLoopResult{ - result: testResult, - activeMembersCount: 6, + result: testResult, + activityReport: &signingActivityReport{ + activeMembers: []group.MemberIndex{1, 2, 3, 6, 7, 9}, + inactiveMembers: []group.MemberIndex{4, 5, 8, 10}, + }, latestEndBlock: 215, // the end block resolved by the done check phase attemptTimeoutBlock: 236, // start block of the first attempt + 30 }, @@ -206,8 +212,11 @@ func TestSigningRetryLoop(t *testing.T) { }, expectedErr: nil, expectedResult: &signingRetryLoopResult{ - result: testResult, - activeMembersCount: 10, + result: testResult, + activityReport: &signingActivityReport{ + activeMembers: signingGroupMembersIndexes, + inactiveMembers: []group.MemberIndex{}, + }, latestEndBlock: 260, // the end block resolved by the done check phase attemptTimeoutBlock: 277, // start block of the second attempt + 30 }, @@ -263,8 +272,11 @@ func TestSigningRetryLoop(t *testing.T) { }, expectedErr: nil, expectedResult: &signingRetryLoopResult{ - result: testResult, - activeMembersCount: 10, + result: testResult, + activityReport: &signingActivityReport{ + activeMembers: signingGroupMembersIndexes, + inactiveMembers: []group.MemberIndex{}, + }, latestEndBlock: 260, // the end block resolved by the done check phase attemptTimeoutBlock: 277, // start block of the second attempt + 30 }, @@ -320,8 +332,11 @@ func TestSigningRetryLoop(t *testing.T) { }, expectedErr: nil, expectedResult: &signingRetryLoopResult{ - result: testResult, - activeMembersCount: 10, + result: testResult, + activityReport: &signingActivityReport{ + activeMembers: signingGroupMembersIndexes, + inactiveMembers: []group.MemberIndex{}, + }, latestEndBlock: 260, // the end block resolved by the done check phase attemptTimeoutBlock: 277, // start block of the second attempt + 30 }, @@ -369,8 +384,11 @@ func TestSigningRetryLoop(t *testing.T) { expectedOutgoingDoneChecks: nil, expectedErr: nil, expectedResult: &signingRetryLoopResult{ - result: testResult, - activeMembersCount: 10, + result: testResult, + activityReport: &signingActivityReport{ + activeMembers: signingGroupMembersIndexes, + inactiveMembers: []group.MemberIndex{}, + }, latestEndBlock: 260, // the end block resolved by the done check phase attemptTimeoutBlock: 277, // start block of the second attempt + 30 }, @@ -441,8 +459,11 @@ func TestSigningRetryLoop(t *testing.T) { }, expectedErr: nil, expectedResult: &signingRetryLoopResult{ - result: testResult, - activeMembersCount: 10, + result: testResult, + activityReport: &signingActivityReport{ + activeMembers: signingGroupMembersIndexes, + inactiveMembers: []group.MemberIndex{}, + }, latestEndBlock: 260, // the end block resolved by the done check phase attemptTimeoutBlock: 277, // start block of the second attempt + 30 }, @@ -547,8 +568,11 @@ func TestSigningRetryLoop(t *testing.T) { }, expectedErr: nil, expectedResult: &signingRetryLoopResult{ - result: testResult, - activeMembersCount: 10, + result: testResult, + activityReport: &signingActivityReport{ + activeMembers: signingGroupMembersIndexes, + inactiveMembers: []group.MemberIndex{}, + }, latestEndBlock: 260, // the end block resolved by the done check phase attemptTimeoutBlock: 277, // start block of the second attempt + 30 },