diff --git a/README.md b/README.md index bd006e4..a62a6e1 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,11 @@ A monitoring tool for Ethereum networks that checks node health and reports issues to Discord. +The checks are run against a Grafana instance, which is configured with a Prometheus datasource. The checks themselves are rather specific to the custom Prometheus metrics ethPandaOps has set up, so your mileage may vary as a public user. + ## Usage -### Docker +### Pulse Check All Clients ```bash docker run -e GRAFANA_SERVICE_TOKEN=your_token \ @@ -15,13 +17,17 @@ docker run -e GRAFANA_SERVICE_TOKEN=your_token \ --network NETWORK_NAME ``` -You can also pass in a target client to scope the checks + notification. This can be done with `--ethereum-cl` or `--ethereum-el`: +### Pulse Check Specific Client + +You can also pass in a target client to scope the checks + notification. + +This can be done with `--ethereum-cl` or `--ethereum-el`: ```bash docker run -e GRAFANA_SERVICE_TOKEN=your_token \ -e DISCORD_BOT_TOKEN=your_token \ -e OPENROUTER_API_KEY=optional_key \ - ethpandaops/panda-pulse:0.0.2 \ + ethpandaops/panda-pulse:latest \ --discord-channel CHANNEL_ID \ --network NETWORK_NAME \ --ethereum-cl CLIENT_NAME @@ -39,5 +45,7 @@ docker run -e GRAFANA_SERVICE_TOKEN=your_token \ - `--network` (required): Network to monitor (e.g., `pectra-devnet-5`) - `--discord-channel` (required): Discord channel ID for notifications -- `--ethereum-cl`: Filter for specific consensus client (default: all) -- `--ethereum-el`: Filter for specific execution client (default: all) +- `--ethereum-cl`: Filter for specific consensus client +- `--ethereum-el`: Filter for specific execution client +- `--grafana-base-url`: Grafana base URL +- `--prometheus-datasource-id`: Prometheus datasource ID diff --git a/cmd/main.go b/cmd/main.go index ef17e4e..589ece6 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -15,27 +15,30 @@ import ( ) const ( - grafanaBaseURL = 
"https://grafana.observability.ethpandaops.io" - prometheusDatasourceID = "UhcO3vy7z" + defaultGrafanaBaseURL = "https://grafana.observability.ethpandaops.io" + defaultPromDatasourceID = "UhcO3vy7z" ) // Config contains the configuration for the panda-pulse tool. type Config struct { - Network string - ConsensusNode string - ExecutionNode string - DiscordChannel string - GrafanaToken string - DiscordToken string - OpenRouterKey string + Network string + ConsensusNode string + ExecutionNode string + DiscordChannel string + GrafanaToken string + DiscordToken string + OpenRouterKey string + GrafanaBaseURL string + PromDatasourceID string } func main() { var cfg Config rootCmd := &cobra.Command{ - Use: "panda-pulse", - Short: "EthPandaOps dev-net monitoring tool", + Use: "panda-pulse", + Short: "EthPandaOps dev-net monitoring tool", + SilenceUsage: true, RunE: func(cmd *cobra.Command, args []string) error { if cfg.GrafanaToken == "" { return fmt.Errorf("GRAFANA_SERVICE_TOKEN environment variable is required") @@ -66,10 +69,12 @@ func main() { }, } - rootCmd.Flags().StringVar(&cfg.Network, "network", "", "Network to monitor (e.g., pectra-devnet-5)") - rootCmd.Flags().StringVar(&cfg.DiscordChannel, "discord-channel", "", "Discord channel to notify") - rootCmd.Flags().StringVar(&cfg.ConsensusNode, "ethereum-cl", checks.ClientTypeAll.String(), "Consensus client to monitor") - rootCmd.Flags().StringVar(&cfg.ExecutionNode, "ethereum-el", checks.ClientTypeAll.String(), "Execution client to monitor") + rootCmd.Flags().StringVar(&cfg.Network, "network", "", "network to monitor (e.g., pectra-devnet-5)") + rootCmd.Flags().StringVar(&cfg.DiscordChannel, "discord-channel", "", "discord channel to notify") + rootCmd.Flags().StringVar(&cfg.ConsensusNode, "ethereum-cl", checks.ClientTypeAll.String(), "consensus client to monitor") + rootCmd.Flags().StringVar(&cfg.ExecutionNode, "ethereum-el", checks.ClientTypeAll.String(), "execution client to monitor") + 
rootCmd.Flags().StringVar(&cfg.GrafanaBaseURL, "grafana-base-url", defaultGrafanaBaseURL, "grafana base URL") + rootCmd.Flags().StringVar(&cfg.PromDatasourceID, "prometheus-datasource-id", defaultPromDatasourceID, "prometheus datasource ID") if err := rootCmd.MarkFlagRequired("network"); err != nil { fmt.Fprintf(os.Stderr, "Error: %v\n", err) @@ -88,8 +93,6 @@ func main() { cfg.OpenRouterKey = os.Getenv("OPENROUTER_API_KEY") if err := rootCmd.Execute(); err != nil { - fmt.Fprintf(os.Stderr, "Error: %v\n", err) - os.Exit(1) } } @@ -99,7 +102,7 @@ func runChecks(cmd *cobra.Command, cfg Config) error { httpClient := &http.Client{Timeout: 30 * time.Second} // Initialize Grafana client. - grafanaClient := grafana.NewClient(grafanaBaseURL, prometheusDatasourceID, cfg.GrafanaToken, httpClient) + grafanaClient := grafana.NewClient(cfg.GrafanaBaseURL, cfg.PromDatasourceID, cfg.GrafanaToken, httpClient) // Initialize Discord notifier. discordNotifier, err := discord.NewNotifier(cfg.DiscordToken, cfg.OpenRouterKey) diff --git a/pkg/checks/checks.go b/pkg/checks/checks.go index 82c7323..5159935 100644 --- a/pkg/checks/checks.go +++ b/pkg/checks/checks.go @@ -2,6 +2,7 @@ package checks import ( "context" + "fmt" "time" ) @@ -84,13 +85,7 @@ func (r *defaultRunner) RunChecks(ctx context.Context, cfg Config) ([]*Result, e result, err := check.Run(ctx, cfg) if err != nil { - result = &Result{ - Name: check.Name(), - Category: check.Category(), - Status: StatusFail, - Description: err.Error(), - Timestamp: time.Now(), - } + return nil, fmt.Errorf("failed to run check %s: %w", check.Name(), err) } results = append(results, result) diff --git a/pkg/checks/cl_peer_count_test.go b/pkg/checks/cl_peer_count_test.go new file mode 100644 index 0000000..62198e7 --- /dev/null +++ b/pkg/checks/cl_peer_count_test.go @@ -0,0 +1,118 @@ +package checks + +import ( + "context" + "testing" + + "github.com/ethpandaops/panda-pulse/pkg/grafana" + "github.com/ethpandaops/panda-pulse/pkg/grafana/mock" + 
"github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" +) + +func TestCLPeerCountCheck_Run(t *testing.T) { + failingResponse := &grafana.QueryResponse{ + Results: grafana.QueryResults{ + PandaPulse: grafana.QueryPandaPulse{ + Frames: []grafana.QueryFrame{ + { + Schema: grafana.QuerySchema{ + Fields: []grafana.QueryField{ + { + Labels: map[string]string{ + "instance": "node1", + "ingress_user": "user1", + }, + }, + }, + }, + Data: grafana.QueryData{ + Values: []interface{}{1.0}, + }, + }, + }, + }, + }, + } + + tests := []struct { + name string + config Config + mockResponse *grafana.QueryResponse + mockError error + expectedStatus Status + expectError bool + }{ + { + name: "all nodes have sufficient peers", + config: Config{ + Network: "mainnet", + ConsensusNode: "lighthouse", + ExecutionNode: "geth", + }, + mockResponse: &grafana.QueryResponse{}, + expectedStatus: StatusOK, + }, + { + name: "nodes with low peer count", + config: Config{ + Network: "mainnet", + ConsensusNode: "lighthouse", + ExecutionNode: "geth", + }, + mockResponse: failingResponse, + expectedStatus: StatusFail, + }, + { + name: "grafana error", + config: Config{ + Network: "mainnet", + ConsensusNode: "lighthouse", + ExecutionNode: "geth", + }, + mockError: assert.AnError, + expectError: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + mockClient := mock.NewMockGrafanaClient(ctrl) + mockClient.EXPECT().Query(gomock.Any(), gomock.Any()).Return(tt.mockResponse, tt.mockError) + + check := NewCLPeerCountCheck(mockClient) + result, err := check.Run(context.Background(), tt.config) + + if tt.expectError { + require.Error(t, err) + + return + } + + require.NoError(t, err) + assert.Equal(t, tt.expectedStatus, result.Status) + assert.NotEmpty(t, result.Description) + assert.NotNil(t, result.Details) + assert.Contains(t, result.Details, "query") + }) + } +} + 
+func TestCLPeerCountCheck_Name(t *testing.T) { + check := NewCLPeerCountCheck(nil) + assert.Equal(t, "Low peer count", check.Name()) +} + +func TestCLPeerCountCheck_Category(t *testing.T) { + check := NewCLPeerCountCheck(nil) + assert.Equal(t, CategorySync, check.Category()) +} + +func TestCLPeerCountCheck_ClientType(t *testing.T) { + check := NewCLPeerCountCheck(nil) + assert.Equal(t, ClientTypeCL, check.ClientType()) +} diff --git a/pkg/checks/cl_sync_test.go b/pkg/checks/cl_sync_test.go new file mode 100644 index 0000000..5cfb8b0 --- /dev/null +++ b/pkg/checks/cl_sync_test.go @@ -0,0 +1,118 @@ +package checks + +import ( + "context" + "testing" + + "github.com/ethpandaops/panda-pulse/pkg/grafana" + "github.com/ethpandaops/panda-pulse/pkg/grafana/mock" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.uber.org/mock/gomock" +) + +func TestCLSyncCheck_Run(t *testing.T) { + failingResponse := &grafana.QueryResponse{ + Results: grafana.QueryResults{ + PandaPulse: grafana.QueryPandaPulse{ + Frames: []grafana.QueryFrame{ + { + Schema: grafana.QuerySchema{ + Fields: []grafana.QueryField{ + { + Labels: map[string]string{ + "instance": "node1", + "ingress_user": "user1", + }, + }, + }, + }, + Data: grafana.QueryData{ + Values: []interface{}{1.0}, + }, + }, + }, + }, + }, + } + + tests := []struct { + name string + config Config + mockResponse *grafana.QueryResponse + mockError error + expectedStatus Status + expectError bool + }{ + { + name: "all nodes synced", + config: Config{ + Network: "mainnet", + ConsensusNode: "lighthouse", + ExecutionNode: "geth", + }, + mockResponse: &grafana.QueryResponse{}, + expectedStatus: StatusOK, + }, + { + name: "nodes not syncing", + config: Config{ + Network: "mainnet", + ConsensusNode: "lighthouse", + ExecutionNode: "geth", + }, + mockResponse: failingResponse, + expectedStatus: StatusFail, + }, + { + name: "grafana error", + config: Config{ + Network: "mainnet", + ConsensusNode: "lighthouse", + 
ExecutionNode: "geth", + }, + mockError: assert.AnError, + expectError: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + mockClient := mock.NewMockGrafanaClient(ctrl) + mockClient.EXPECT().Query(gomock.Any(), gomock.Any()).Return(tt.mockResponse, tt.mockError) + + check := NewCLSyncCheck(mockClient) + result, err := check.Run(context.Background(), tt.config) + + if tt.expectError { + require.Error(t, err) + + return + } + + require.NoError(t, err) + assert.Equal(t, tt.expectedStatus, result.Status) + assert.NotEmpty(t, result.Description) + assert.NotNil(t, result.Details) + assert.Contains(t, result.Details, "query") + }) + } +} + +func TestCLSyncCheck_Name(t *testing.T) { + check := NewCLSyncCheck(nil) + assert.Equal(t, "Node failing to sync", check.Name()) +} + +func TestCLSyncCheck_Category(t *testing.T) { + check := NewCLSyncCheck(nil) + assert.Equal(t, CategorySync, check.Category()) +} + +func TestCLSyncCheck_ClientType(t *testing.T) { + check := NewCLSyncCheck(nil) + assert.Equal(t, ClientTypeCL, check.ClientType()) +} diff --git a/pkg/grafana/client.go b/pkg/grafana/client.go index ee631e3..221c7ee 100644 --- a/pkg/grafana/client.go +++ b/pkg/grafana/client.go @@ -80,7 +80,7 @@ func (c *client) Query(ctx context.Context, query string) (*QueryResponse, error } if resp.StatusCode != http.StatusOK { - return nil, fmt.Errorf("unexpected status code %d: %s", resp.StatusCode, string(body)) + return nil, fmt.Errorf("unexpected status code %d", resp.StatusCode) } var response QueryResponse