From 610a9865f820c8f62345567a60e59aad589f61a5 Mon Sep 17 00:00:00 2001 From: Ryan Stout Date: Thu, 11 Jul 2024 11:00:03 -0700 Subject: [PATCH] Log NewReportingPlugin errors for Commit and Exec plugins Also output metrics --- .../ocr2/plugins/ccip/ccipcommit/factory.go | 12 +++++++++++- .../ocr2/plugins/ccip/ccipcommit/factory_test.go | 3 +++ .../services/ocr2/plugins/ccip/ccipexec/factory.go | 12 +++++++++++- .../ocr2/plugins/ccip/ccipexec/factory_test.go | 3 +++ core/services/ocr2/plugins/ccip/metrics.go | 14 ++++++++++++++ 5 files changed, 42 insertions(+), 2 deletions(-) diff --git a/core/services/ocr2/plugins/ccip/ccipcommit/factory.go b/core/services/ocr2/plugins/ccip/ccipcommit/factory.go index 648f62a23a2..fcb6e106d37 100644 --- a/core/services/ocr2/plugins/ccip/ccipcommit/factory.go +++ b/core/services/ocr2/plugins/ccip/ccipcommit/factory.go @@ -86,7 +86,7 @@ func (rf *CommitReportingPluginFactory) NewReportingPlugin(config types.Reportin // retried via RetryUntilSuccess. NewReportingPlugin must return successfully in order for the Commit plugin to // function, hence why we can only keep retrying it until it succeeds. func (rf *CommitReportingPluginFactory) NewReportingPluginFn(config types.ReportingPluginConfig) func() (reportingPluginAndInfo, error) { - return func() (reportingPluginAndInfo, error) { + newReportingPluginFn := func() (reportingPluginAndInfo, error) { ctx := context.Background() // todo: consider adding some timeout destPriceReg, err := rf.config.commitStore.ChangeConfig(ctx, config.OnchainConfig, config.OffchainConfig) @@ -147,4 +147,14 @@ func (rf *CommitReportingPluginFactory) NewReportingPluginFn(config types.Report return reportingPluginAndInfo{plugin, pluginInfo}, nil } + + return func() (reportingPluginAndInfo, error) { + result, err := newReportingPluginFn() + if err != nil { + rf.config.lggr.Errorw("NewReportingPlugin failed", "err", err) + rf.config.metricsCollector.NewReportingPluginError() + } + + return result, err + } } diff --git a/core/services/ocr2/plugins/ccip/ccipcommit/factory_test.go b/core/services/ocr2/plugins/ccip/ccipcommit/factory_test.go index 825026bd17e..37997899519 100644 --- a/core/services/ocr2/plugins/ccip/ccipcommit/factory_test.go +++ b/core/services/ocr2/plugins/ccip/ccipcommit/factory_test.go @@ -11,6 +11,7 @@ import ( "github.com/smartcontractkit/chainlink-common/pkg/types/ccip" "github.com/smartcontractkit/chainlink/v2/core/logger" + ccip2 "github.com/smartcontractkit/chainlink/v2/core/services/ocr2/plugins/ccip" "github.com/smartcontractkit/chainlink/v2/core/services/ocr2/plugins/ccip/internal/ccipdata" ccipdataprovidermocks "github.com/smartcontractkit/chainlink/v2/core/services/ocr2/plugins/ccip/internal/ccipdata/ccipdataprovider/mocks" "github.com/smartcontractkit/chainlink/v2/core/services/ocr2/plugins/ccip/internal/ccipdata/mocks" @@ -24,6 +25,8 @@ import ( // retries a sufficient number of times to get through the transient errors and eventually succeed. func TestNewReportingPluginRetriesUntilSuccess(t *testing.T) { commitConfig := CommitPluginStaticConfig{} + commitConfig.lggr = logger.NullLogger + commitConfig.metricsCollector = ccip2.NoopMetricsCollector // For this unit test, ensure that there is no delay between retries commitConfig.newReportingPluginRetryConfig = ccipdata.RetryConfig{ diff --git a/core/services/ocr2/plugins/ccip/ccipexec/factory.go b/core/services/ocr2/plugins/ccip/ccipexec/factory.go index 1a18793a833..0f8051bde36 100644 --- a/core/services/ocr2/plugins/ccip/ccipexec/factory.go +++ b/core/services/ocr2/plugins/ccip/ccipexec/factory.go @@ -82,7 +82,7 @@ func (rf *ExecutionReportingPluginFactory) NewReportingPlugin(config types.Repor // retried via RetryUntilSuccess. NewReportingPlugin must return successfully in order for the Exec plugin to function, // hence why we can only keep retrying it until it succeeds. func (rf *ExecutionReportingPluginFactory) NewReportingPluginFn(config types.ReportingPluginConfig) func() (reportingPluginAndInfo, error) { - return func() (reportingPluginAndInfo, error) { + newReportingPluginFn := func() (reportingPluginAndInfo, error) { ctx := context.Background() // todo: consider setting a timeout destPriceRegistry, destWrappedNative, err := rf.config.offRampReader.ChangeConfig(ctx, config.OnchainConfig, config.OffchainConfig) @@ -154,4 +154,14 @@ func (rf *ExecutionReportingPluginFactory) NewReportingPluginFn(config types.Rep return reportingPluginAndInfo{plugin, pluginInfo}, nil } + + return func() (reportingPluginAndInfo, error) { + result, err := newReportingPluginFn() + if err != nil { + rf.config.lggr.Errorw("NewReportingPlugin failed", "err", err) + rf.config.metricsCollector.NewReportingPluginError() + } + + return result, err + } } diff --git a/core/services/ocr2/plugins/ccip/ccipexec/factory_test.go b/core/services/ocr2/plugins/ccip/ccipexec/factory_test.go index 7bbb9be0c69..869aa7e332c 100644 --- a/core/services/ocr2/plugins/ccip/ccipexec/factory_test.go +++ b/core/services/ocr2/plugins/ccip/ccipexec/factory_test.go @@ -11,6 +11,7 @@ import ( "github.com/smartcontractkit/chainlink-common/pkg/types/ccip" "github.com/smartcontractkit/chainlink/v2/core/logger" + ccip2 "github.com/smartcontractkit/chainlink/v2/core/services/ocr2/plugins/ccip" "github.com/smartcontractkit/chainlink/v2/core/services/ocr2/plugins/ccip/internal/ccipdata" ccipdataprovidermocks "github.com/smartcontractkit/chainlink/v2/core/services/ocr2/plugins/ccip/internal/ccipdata/ccipdataprovider/mocks" "github.com/smartcontractkit/chainlink/v2/core/services/ocr2/plugins/ccip/internal/ccipdata/mocks" @@ -23,6 +24,8 @@ import ( // retries a sufficient number of times to get through the transient errors and eventually succeed. func TestNewReportingPluginRetriesUntilSuccess(t *testing.T) { execConfig := ExecutionPluginStaticConfig{} + execConfig.lggr = logger.TestLogger(t) + execConfig.metricsCollector = ccip2.NoopMetricsCollector // For this unit test, ensure that there is no delay between retries execConfig.newReportingPluginRetryConfig = ccipdata.RetryConfig{ diff --git a/core/services/ocr2/plugins/ccip/metrics.go b/core/services/ocr2/plugins/ccip/metrics.go index f481b5d447d..9ec9fde316e 100644 --- a/core/services/ocr2/plugins/ccip/metrics.go +++ b/core/services/ocr2/plugins/ccip/metrics.go @@ -20,6 +20,10 @@ var ( Name: "ccip_sequence_number_counter", Help: "Sequence number of the last message processed by the plugin", }, []string{"plugin", "source", "dest", "ocrPhase"}) + newReportingPluginErrorCounter = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "ccip_new_reporting_plugin_error_counter", + Help: "The count of the number of errors when calling NewReportingPlugin", + }, []string{"plugin"}) ) type ocrPhase string @@ -35,6 +39,7 @@ type PluginMetricsCollector interface { NumberOfMessagesBasedOnInterval(phase ocrPhase, seqNrMin, seqNrMax uint64) UnexpiredCommitRoots(count int) SequenceNumber(phase ocrPhase, seqNr uint64) + NewReportingPluginError() } type pluginMetricsCollector struct { @@ -79,6 +84,12 @@ func (p *pluginMetricsCollector) SequenceNumber(phase ocrPhase, seqNr uint64) { Set(float64(seqNr)) } +func (p *pluginMetricsCollector) NewReportingPluginError() { + newReportingPluginErrorCounter. + WithLabelValues(p.pluginName). + Inc() +} + var ( // NoopMetricsCollector is a no-op implementation of PluginMetricsCollector NoopMetricsCollector PluginMetricsCollector = noop{} @@ -97,3 +108,6 @@ func (d noop) UnexpiredCommitRoots(int) { func (d noop) SequenceNumber(ocrPhase, uint64) { } + +func (d noop) NewReportingPluginError() { +}