From 76a1157cd9e15d406e4a2e07ba70a823af97a55d Mon Sep 17 00:00:00 2001
From: ziggie
Date: Wed, 24 Jan 2024 00:23:10 +0000
Subject: [PATCH 1/2] chain: PrunedBlockDispatcher bugfix.

In case we fail the request via the neutrino block peer fetcher, we make
sure we fail all dependent `GetBlock` calls and remove the block request
completely so that a new request for the same block hash can be made.
---
 chain/bitcoind_conn.go                |  28 ++++-
 chain/pruned_block_dispatcher.go      |  90 ++++++++++++++--
 chain/pruned_block_dispatcher_test.go | 143 ++++++++++++++++++++++----
 3 files changed, 231 insertions(+), 30 deletions(-)

diff --git a/chain/bitcoind_conn.go b/chain/bitcoind_conn.go
index 88edf1b05a..8a580dbb5c 100644
--- a/chain/bitcoind_conn.go
+++ b/chain/bitcoind_conn.go
@@ -469,10 +469,16 @@ func (c *BitcoindConn) GetBlock(hash *chainhash.Hash) (*wire.MsgBlock, error) {
 		return nil, err
 	}
 
+	// cancelChan is needed in case a block request with the same hash is
+	// already registered and fails via the returned errChan. Because we
+	// don't register a new request in that case, this cancelChan is used
+	// to signal the failure of the dependent block request.
+	cancelChan := make(chan error, 1)
+
 	// Now that we know the block has been pruned for sure, request it from
 	// our backend peers.
 	blockChan, errChan := c.prunedBlockDispatcher.Query(
-		[]*chainhash.Hash{hash},
+		[]*chainhash.Hash{hash}, cancelChan,
 	)
 
 	for {
@@ -482,12 +488,32 @@ func (c *BitcoindConn) GetBlock(hash *chainhash.Hash) (*wire.MsgBlock, error) {
 
 		case err := <-errChan:
 			if err != nil {
+				// An error was returned for this block
+				// request, so we make sure to remove its
+				// hash from the list of queried blocks.
+				// Moreover, because duplicate requests do
+				// not register a new query but only add a
+				// reply channel to the pending request, we
+				// need to cancel all dependent `GetBlock`
+				// calls via the cancel channel when the
+				// fetching of the block was NOT
+				// successful.
+				c.prunedBlockDispatcher.CancelRequest(
+					*hash, err,
+				)
+
 				return nil, err
 			}
 
 			// errChan fired before blockChan with a nil error, wait
 			// for the block now.
 
+		// The cancelChan is only used when there is already a pending
+		// block request for this hash and that block request fails via
+		// the error channel above.
+		case err := <-cancelChan:
+			return nil, err
+
 		case <-c.quit:
 			return nil, ErrBitcoindClientShuttingDown
 		}
diff --git a/chain/pruned_block_dispatcher.go b/chain/pruned_block_dispatcher.go
index 9c2d5e6168..915c595500 100644
--- a/chain/pruned_block_dispatcher.go
+++ b/chain/pruned_block_dispatcher.go
@@ -143,7 +143,14 @@ type PrunedBlockDispatcher struct {
 	// NOTE: The blockMtx lock must always be held when accessing this
 	// field.
 	blocksQueried map[chainhash.Hash][]chan *wire.MsgBlock
-	blockMtx      sync.Mutex
+
+	// blockQueryCancel signals the cancellation of a `GetBlock` request.
+	//
+	// NOTE: The blockMtx lock must always be held when accessing this
+	// field.
+	blockQueryCancel map[chainhash.Hash][]chan<- error
+
+	blockMtx sync.Mutex
 
 	// currentPeers represents the set of peers we're currently connected
 	// to. Each peer found here will have a worker spawned within the
@@ -198,12 +205,13 @@ func NewPrunedBlockDispatcher(cfg *PrunedBlockDispatcherConfig) (
 			NewWorker: query.NewWorker,
 			Ranking:   query.NewPeerRanking(),
 		}),
-		blocksQueried:  make(map[chainhash.Hash][]chan *wire.MsgBlock),
-		currentPeers:   make(map[string]*peer.Peer),
-		bannedPeers:    make(map[string]struct{}),
-		peersConnected: peersConnected,
-		timeSource:     blockchain.NewMedianTime(),
-		quit:           make(chan struct{}),
+		blocksQueried:    make(map[chainhash.Hash][]chan *wire.MsgBlock),
+		blockQueryCancel: make(map[chainhash.Hash][]chan<- error),
+		currentPeers:     make(map[string]*peer.Peer),
+		bannedPeers:      make(map[string]struct{}),
+		peersConnected:   peersConnected,
+		timeSource:       blockchain.NewMedianTime(),
+		quit:             make(chan struct{}),
 	}, nil
 }
 
@@ -502,9 +510,10 @@ func (d *PrunedBlockDispatcher) banPeer(peer string) {
 
 // Query submits a request to query the information of the given blocks.
 func (d *PrunedBlockDispatcher) Query(blocks []*chainhash.Hash,
+	cancelChan chan<- error,
 	opts ...query.QueryOption) (<-chan *wire.MsgBlock, <-chan error) {
 
-	reqs, blockChan, err := d.newRequest(blocks)
+	reqs, blockChan, err := d.newRequest(blocks, cancelChan)
 	if err != nil {
 		errChan := make(chan error, 1)
 		errChan <- err
@@ -515,14 +524,18 @@ func (d *PrunedBlockDispatcher) Query(blocks []*chainhash.Hash,
 	if len(reqs) > 0 {
 		errChan = d.workManager.Query(reqs, opts...)
 	}
+
 	return blockChan, errChan
 }
 
 // newRequest construct a new query request for the given blocks to submit to
 // the internal workManager. A channel is also returned through which the
 // requested blocks are sent through.
-func (d *PrunedBlockDispatcher) newRequest(blocks []*chainhash.Hash) (
-	[]*query.Request, <-chan *wire.MsgBlock, error) {
+//
+// NOTE: The cancelChan must be buffered.
+func (d *PrunedBlockDispatcher) newRequest(blocks []*chainhash.Hash,
+	cancelChan chan<- error) ([]*query.Request, <-chan *wire.MsgBlock,
+	error) {
 
 	// Make sure the channel is buffered enough to handle all blocks.
 	blockChan := make(chan *wire.MsgBlock, len(blocks))
@@ -550,6 +563,10 @@ func (d *PrunedBlockDispatcher) newRequest(blocks []*chainhash.Hash) (
 		} else {
 			log.Debugf("Received new request for pending query of "+
 				"block %v", *block)
+
+			d.blockQueryCancel[*block] = append(
+				d.blockQueryCancel[*block], cancelChan,
+			)
 		}
 
 		d.blocksQueried[*block] = append(
@@ -614,6 +631,8 @@ func (d *PrunedBlockDispatcher) handleResp(req, resp wire.Message,
 			Finished: false,
 		}
 	}
 
+	copyblockChans := make([]chan *wire.MsgBlock, len(blockChans))
+	copy(copyblockChans, blockChans)
 	err := blockchain.CheckBlockSanity(
 		btcutil.NewBlock(block), d.cfg.ChainParams.PowLimit,
@@ -667,7 +686,7 @@ func (d *PrunedBlockDispatcher) handleResp(req, resp wire.Message,
 	go func() {
 		defer d.wg.Done()
 
-		for _, blockChan := range blockChans {
+		for _, blockChan := range copyblockChans {
 			select {
 			case blockChan <- block:
 			case <-d.quit:
@@ -678,3 +697,52 @@ func (d *PrunedBlockDispatcher) handleResp(req, resp wire.Message,
 
 	return progress
 }
+
+// CancelRequest removes all information regarding a failed block request.
+// When, for example, the peer disconnects or the request times out, we make
+// sure all related information is deleted so that a new request for this
+// block can be registered. It also cancels all dependent goroutines.
+func (d *PrunedBlockDispatcher) CancelRequest(blockHash chainhash.Hash,
+	err error) {
+
+	// failDependant is a helper function which fails all dependent
+	// goroutines via their cancel channels.
+	failDependant := func(cancelChans []chan<- error) {
+		defer d.wg.Done()
+
+		for _, cancel := range cancelChans {
+			select {
+			case cancel <- err:
+			case <-d.quit:
+				return
+			}
+		}
+	}
+
+	d.blockMtx.Lock()
+
+	// Before removing the block hash we get the cancelChans which were
+	// registered for block requests that already had a pending request
+	// in flight.
+	cancelChans, ok := d.blockQueryCancel[blockHash]
+	var copycancelChans []chan<- error
+	if ok {
+		copycancelChans = make([]chan<- error, len(cancelChans))
+		copy(copycancelChans, cancelChans)
+	}
+
+	// Remove all data related to this block request to make sure the same
+	// block can be registered again in the future.
+	delete(d.blocksQueried, blockHash)
+	delete(d.blockQueryCancel, blockHash)
+
+	d.blockMtx.Unlock()
+
+	// In case there are goroutines depending on this block request we
+	// make sure we cancel them. We run this in a separate goroutine so we
+	// don't block the initial request.
+	if ok {
+		d.wg.Add(1)
+		go failDependant(copycancelChans)
+	}
+}
diff --git a/chain/pruned_block_dispatcher_test.go b/chain/pruned_block_dispatcher_test.go
index b7f801d8b5..990d0d3a63 100644
--- a/chain/pruned_block_dispatcher_test.go
+++ b/chain/pruned_block_dispatcher_test.go
@@ -15,6 +15,7 @@ import (
 	"github.com/btcsuite/btcd/chaincfg/chainhash"
 	"github.com/btcsuite/btcd/peer"
 	"github.com/btcsuite/btcd/wire"
+	"github.com/lightninglabs/neutrino/query"
 	"github.com/lightningnetwork/lnd/ticker"
 	"github.com/stretchr/testify/require"
 )
@@ -258,12 +259,18 @@ func (h *prunedBlockDispatcherHarness) newPeer() *peer.Peer {
 }
 
 // query requests the given blocks from the PrunedBlockDispatcher.
-func (h *prunedBlockDispatcherHarness) query(blocks []*chainhash.Hash) (
-	<-chan *wire.MsgBlock, <-chan error) {
+func (h *prunedBlockDispatcherHarness) query(blocks []*chainhash.Hash,
+	opts ...query.QueryOption) (
+	<-chan *wire.MsgBlock, <-chan error, <-chan error) {
 
 	h.t.Helper()
 
-	blockChan, errChan := h.dispatcher.Query(blocks)
+	// cancelChan will receive an error in case the dependent block
+	// request fails. This is used for block requests that already have a
+	// pending request registered when that request fails.
+	cancelChan := make(chan error, 1)
+
+	blockChan, errChan := h.dispatcher.Query(blocks, cancelChan, opts...)
 	select {
 	case err := <-errChan:
 		require.NoError(h.t, err)
@@ -274,7 +281,7 @@ func (h *prunedBlockDispatcherHarness) query(blocks []*chainhash.Hash) (
 		h.blocksQueried[*block]++
 	}
 
-	return blockChan, errChan
+	return blockChan, errChan, cancelChan
 }
 
 // disablePeerReplies prevents the query peer from replying.
@@ -363,7 +370,7 @@ func (h *prunedBlockDispatcherHarness) assertPeerQueried() {
 // assertPeerReplied asserts that the query peer replies with a block the
 // PrunedBlockDispatcher queried for.
 func (h *prunedBlockDispatcherHarness) assertPeerReplied(
-	blockChan <-chan *wire.MsgBlock, errChan <-chan error,
+	blockChan <-chan *wire.MsgBlock, errChan, cancelChan <-chan error,
 	expectCompletionSignal bool) {
 
 	h.t.Helper()
@@ -385,10 +392,18 @@ func (h *prunedBlockDispatcherHarness) assertPeerReplied(
 			delete(h.blocksQueried, blockHash)
 		}
 
+	// We only check the errChan after a timeout because when a request
+	// was successful a nil error is signaled via the errChan, and this
+	// might happen even before the block is received.
 	case <-time.After(5 * time.Second):
 		select {
 		case err := <-errChan:
 			h.t.Fatalf("received unexpected error send: %v", err)
+
+		case err := <-cancelChan:
+			h.t.Fatalf("received unexpected cancel request with "+
+				"error: %v", err)
+
 		default:
 		}
 		h.t.Fatal("expected reply from peer")
@@ -406,6 +421,38 @@ func (h *prunedBlockDispatcherHarness) assertPeerReplied(
 	}
 }
 
+// assertPeerFailed asserts that the query request fails with an expected
+// error.
+func (h *prunedBlockDispatcherHarness) assertPeerFailed(
+	blockChan <-chan *wire.MsgBlock, errChan, cancelChan <-chan error,
+	expectedErr error) {
+
+	h.t.Helper()
+
+	select {
+	case <-blockChan:
+		h.t.Fatalf("expected no reply from peer")
+
+	case err := <-errChan:
+		require.ErrorIs(h.t, err, expectedErr)
+		for _, hash := range h.hashes {
+			h.dispatcher.CancelRequest(*hash, err)
+			// The corresponding block is deleted from the request
+			// queue in `CancelRequest` so we delete it from the
+			// harness as well.
+			delete(h.blocksQueried, *hash)
+		}
+
+	case err := <-cancelChan:
+		require.ErrorIs(h.t, err, expectedErr)
+
+	case <-time.After(5 * time.Second):
+		h.t.Fatalf("expected the error for the block request: %v",
+			expectedErr)
+	}
+
+}
+
 // assertNoPeerDialed asserts that the PrunedBlockDispatcher hasn't established
 // a new peer connection.
 func (h *prunedBlockDispatcherHarness) assertNoPeerDialed() {
@@ -420,15 +467,21 @@ func (h *prunedBlockDispatcherHarness) assertNoPeerDialed() {
 
 // assertNoReply asserts that the peer hasn't replied to a query.
 func (h *prunedBlockDispatcherHarness) assertNoReply(
-	blockChan <-chan *wire.MsgBlock, errChan <-chan error) {
+	blockChan <-chan *wire.MsgBlock, errChan, cancelChan <-chan error) {
 
 	h.t.Helper()
 
 	select {
 	case block := <-blockChan:
 		h.t.Fatalf("received unexpected block %v", block.BlockHash())
+
 	case err := <-errChan:
 		h.t.Fatalf("received unexpected error send: %v", err)
+
+	case err := <-cancelChan:
+		h.t.Fatalf("received unexpected cancel request with error: %v",
+			err)
+
 	case <-time.After(2 * time.Second):
 	}
 }
@@ -449,16 +502,64 @@ func TestPrunedBlockDispatcherQuerySameBlock(t *testing.T) {
 	// Queue all the block requests one by one.
 	blockChans := make([]<-chan *wire.MsgBlock, 0, numRequests)
 	errChans := make([]<-chan error, 0, numRequests)
+	cancelChans := make([]<-chan error, 0, numRequests)
+
+	for i := 0; i < numRequests; i++ {
+		blockChan, errChan, cancelChan := h.query(h.hashes)
+		blockChans = append(blockChans, blockChan)
+		errChans = append(errChans, errChan)
+		cancelChans = append(cancelChans, cancelChan)
+
+	}
+
+	// We should only see one query.
+	h.assertPeerQueried()
+	for i := 0; i < numRequests; i++ {
+		h.assertPeerReplied(blockChans[i], errChans[i], cancelChans[i],
+			i == 0)
+	}
+}
+
+// TestPrunedBlockDispatcherQueryFailSameBlock tests that when a block request
+// fails, all client requests waiting for the same block are failed as well.
+func TestPrunedBlockDispatcherQueryFailSameBlock(t *testing.T) {
+	t.Parallel()
+
+	const numBlocks = 1
+	const numPeers = 5
+	const numRequests = numBlocks * numPeers
+
+	h := newNetworkBlockTestHarness(t, numBlocks, numPeers, numPeers)
+	h.start()
+	defer h.stop()
+
+	// Queue all the block requests one by one.
+	blockChans := make([]<-chan *wire.MsgBlock, 0, numRequests)
+	errChans := make([]<-chan error, 0, numRequests)
+	cancelChans := make([]<-chan error, 0, numRequests)
+
+	// We want to force a timeout.
+	h.disablePeerReplies()
 
 	for i := 0; i < numRequests; i++ {
-		blockChan, errChan := h.query(h.hashes)
+		// The default number of retries is 2 and is defined in the
+		// neutrino package. We want the request to fail, so we use 1
+		// here. Moreover, the default timeout of a single request is
+		// 2 seconds and currently not configurable, so we make sure
+		// our assertions below do not time out before that.
+		blockChan, errChan, cancelChan := h.query(
+			h.hashes, query.NumRetries(1),
+		)
 		blockChans = append(blockChans, blockChan)
 		errChans = append(errChans, errChan)
+		cancelChans = append(cancelChans, cancelChan)
 	}
 
 	// We should only see one query.
 	h.assertPeerQueried()
 	for i := 0; i < numRequests; i++ {
-		h.assertPeerReplied(blockChans[i], errChans[i], i == 0)
+		h.assertPeerFailed(blockChans[i], errChans[i], cancelChans[i],
+			query.ErrQueryTimeout)
 	}
 }
@@ -476,7 +577,7 @@ func TestPrunedBlockDispatcherMultipleGetData(t *testing.T) {
 	defer h.stop()
 
 	// Request all blocks.
-	blockChan, errChan := h.query(h.hashes)
+	blockChan, errChan, cancelChan := h.query(h.hashes)
 
 	// Since we have more blocks than can fit in a single GetData message,
 	// we should expect multiple queries. For each query, we should expect
@@ -491,7 +592,8 @@ func TestPrunedBlockDispatcherMultipleGetData(t *testing.T) {
 		for j := 0; j < maxRequestInvs; j++ {
 			expectCompletionSignal := blocksRecvd == numBlocks-1
 			h.assertPeerReplied(
-				blockChan, errChan, expectCompletionSignal,
+				blockChan, errChan, cancelChan,
+				expectCompletionSignal,
 			)
 
 			blocksRecvd++
@@ -517,16 +619,21 @@ func TestPrunedBlockDispatcherMultipleQueryPeers(t *testing.T) {
 	// Queue all the block requests one by one.
 	blockChans := make([]<-chan *wire.MsgBlock, 0, numBlocks)
 	errChans := make([]<-chan error, 0, numBlocks)
+	cancelChans := make([]<-chan error, 0, numBlocks)
+
 	for i := 0; i < numBlocks; i++ {
-		blockChan, errChan := h.query(h.hashes[i : i+1])
+		blockChan, errChan, cancelChan := h.query(h.hashes[i : i+1])
 		blockChans = append(blockChans, blockChan)
 		errChans = append(errChans, errChan)
+		cancelChans = append(cancelChans, cancelChan)
+
 	}
 
 	// We should see one query per block.
 	for i := 0; i < numBlocks; i++ {
 		h.assertPeerQueried()
-		h.assertPeerReplied(blockChans[i], errChans[i], true)
+		h.assertPeerReplied(blockChans[i], errChans[i], cancelChans[i],
+			true)
 	}
 }
 
@@ -544,7 +651,7 @@ func TestPrunedBlockDispatcherPeerPoller(t *testing.T) {
 	h.assertNoPeerDialed()
 
 	// We'll then query for a block.
-	blockChan, errChan := h.query(h.hashes)
+	blockChan, errChan, cancelChan := h.query(h.hashes)
 
 	// Refresh our peers. This would dial some peers, but we don't have any
 	// yet.
@@ -563,7 +670,7 @@ func TestPrunedBlockDispatcherPeerPoller(t *testing.T) {
 	// Disconnect our peer and re-enable replies.
 	h.disconnectPeer(peer, false)
 	h.enablePeerReplies()
-	h.assertNoReply(blockChan, errChan)
+	h.assertNoReply(blockChan, errChan, cancelChan)
 
 	// Force a refresh once again. Since the peer has disconnected, a new
 	// connection should be made and the peer should be queried again.
@@ -579,7 +686,7 @@ func TestPrunedBlockDispatcherPeerPoller(t *testing.T) {
 
 	// Now that we know we've connected to the peer, we should be able to
 	// receive their response.
-	h.assertPeerReplied(blockChan, errChan, true)
+	h.assertPeerReplied(blockChan, errChan, cancelChan, true)
 }
 
 // TestPrunedBlockDispatcherInvalidBlock ensures that validation is performed on
@@ -597,9 +704,9 @@ func TestPrunedBlockDispatcherInvalidBlock(t *testing.T) {
 
 	// We'll then query for a block. We shouldn't see a response as the
 	// block should have failed validation.
-	blockChan, errChan := h.query(h.hashes)
+	blockChan, errChan, cancelChan := h.query(h.hashes)
 	h.assertPeerQueried()
-	h.assertNoReply(blockChan, errChan)
+	h.assertNoReply(blockChan, errChan, cancelChan)
 
 	// Since the peer sent us an invalid block, they should have been
 	// disconnected and banned. Refreshing our peers shouldn't result in a
@@ -618,7 +725,7 @@ func TestPrunedBlockDispatcherInvalidBlock(t *testing.T) {
 	h.refreshPeers()
 	h.assertPeerDialed()
 	h.assertPeerQueried()
-	h.assertPeerReplied(blockChan, errChan, true)
+	h.assertPeerReplied(blockChan, errChan, cancelChan, true)
 }
 
 func TestSatisfiesRequiredServices(t *testing.T) {

From 987fbac6386da56bab16a6702190075cc8bf118c Mon Sep 17 00:00:00 2001
From: ziggie
Date: Thu, 25 Jan 2024 14:35:37 +0000
Subject: [PATCH 2/2] chain: refine peer selection logic for pruned nodes.

We discard inbound peers because we don't know their listen port, and we
sort the remaining peers by ping time so that the fastest peers are tried
first.
---
 chain/pruned_block_dispatcher.go | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/chain/pruned_block_dispatcher.go b/chain/pruned_block_dispatcher.go
index 915c595500..bf008e1a31 100644
--- a/chain/pruned_block_dispatcher.go
+++ b/chain/pruned_block_dispatcher.go
@@ -7,6 +7,7 @@ import (
 	"fmt"
 	"math/rand"
 	"net"
+	"sort"
 	"sync"
 	"time"
 
@@ -395,7 +396,25 @@ func (d *PrunedBlockDispatcher) connectToPeer(addr string) (bool, error) {
 // "full-node".
 func filterPeers(peers []btcjson.GetPeerInfoResult) ([]string, error) {
 	var eligible []string // nolint:prealloc
+
+	// First we sort the peers by their measured ping time to choose the
+	// best peers to fetch blocks from.
+	sort.Slice(peers, func(i, j int) bool {
+		return peers[i].PingTime < peers[j].PingTime
+	})
+
 	for _, peer := range peers {
+		// We cannot use the inbound peers here because the referenced
+		// port in the `addr` field is not the listen port for the p2p
+		// connection but a random outgoing port of the peer.
+		if peer.Inbound {
+			log.Debugf("Inbound peer %v not considered for an "+
+				"outbound connection to fetch pruned blocks",
+				peer)
+
+			continue
+		}
+
 		rawServices, err := hex.DecodeString(peer.Services)
 		if err != nil {
 			return nil, err