feature/BCF-2404-grpc-tracing: grpc tracing in core (#10698)

* feature/BCF-2404-grpc-tracing: Implementing grpc tracing in core * feature/BCF-2404-grpc-tracing: fixing config related tests * feature/BCF-2404-grpc-tracing: removing redundant forwarding of host env vars * feature/BCF-2404-grpc-tracing: adding tracing config to TestOCRv2Basic integration test * feature/BCF-2404-grpc-tracing: running go mod tidy * feature/BCF-2404-grpc-tracing: bumping kuberesolver to v5.1.1 * feature/BCF-2404-grpc-tracing: refactoring and minor cleanup * feature/BCF-2404-grpc-tracing: adding matrix for plugin image to eth-smoke-tests for median loop plugin * feature/BCF-2404-grpc-tracing: WIP adding OTEL collector, tempo, and grafana UI to integration testing workflow * feature/BCF-2404-grpc-tracing: adding build targets for plugin image for local testing and bumping relay * feature/BCF-2404-grpc-tracing: trying tr -d '\r' in secret parsing * feature/BCF-2404-grpc-tracing: connecting test container nodes to custom tracing bridge network * feature/BCF-2404-grpc-tracing: fixing integration-tests runner yaml and updating docker network for telem containers in runner to custom bridge with name tracing * feature/BCF-2404-grpc-tracing: WIP testing docker network * feature/BCF-2404-grpc-tracing: migrating tracing job steps to live under eth smoke tests matrix as github jobs have isolated virtual environments * feature/BCF-2404-grpc-tracing: picking random port on persistent server to allow for multiple PRs to generate traces * bumping zap to v1.26.0 and fixing rebase error * feature/BCF-2404-grpc-tracing: wiring SamplingRatio tracing configuration * feature/BCF-2404-grpc-tracing: updating after loop.SetupTelemetry refactored to loop.NewGRPCOpts and loop.SetupTracing * feature/BCF-2404-grpc-tracing: bumping relay * feature/BCF-2404-grpc-tracing: updating initGlobals to pass tracing setup error for logging through to NewApplication * feature/BCF-2404-grpc-tracing: adding ValidateConfig for Tracing struct in core/config/toml * 
feature/BCF-2404-grpc-tracing: adding validation for Tracing.SamplingRatio TOML config * feature/BCF-2404-grpc-tracing: tracing toml type ValidateConfig was brittle - expanded to accept docker network logical addresses * feature/BCF-2404-grpc-tracing: wiring TracingConfig through NewLoopRegistry by including it in LoopRegistry * feature/BCF-2404-grpc-tracing: refactoring for TOML config types * feature/BCF-2404-grpc-tracing: bumping relay * feature/BCF-2404-grpc-tracing: fix reference to Attributes type in integration-tests module * feature/BCF-2404-grpc-tracing: fixing test * feature/BCF-2404-grpc-tracing: bumping relay * feature/BCF-2404-grpc-tracing: adding coverage * feature/BCF-2404-grpc-tracing: adding coverage and adding toml omit empty tag to Attributes * feature/BCF-2404-grpc-tracing: localhost --> 127.0.01 in reverse forward and adding coverage * feature/BCF-2404-grpc-tracing: adding (legacy) name to eth smoke tests matrix * feature/BCF-2404-grpc-tracing: adding coverage * feature/BCF-2404-grpc-tracing: minor refactoring * feature/BCF-2404-grpc-tracing: updating enable tracing job to work in merge queue * feature/BCF-2404-grpc-tracing: fixing localhost --> 127.0.0.1 in previous commit
smartcontractkit · Oct 20, 2023 · 2930423 · 2930423
1 parent 75b7554
commit 2930423
Show file tree

Hide file tree

Showing 56 changed files with 1,136 additions and 70 deletions.
diff --git a/.github/tracing/README.md b/.github/tracing/README.md
@@ -0,0 +1,5 @@
+# Distributed Tracing
+
+These config files are for an OTEL collector, Grafana Tempo, and a Grafana UI instance to run as containers on the same network.
+
+A localhost client can send gRPC calls to the server. The gRPC server is instrumented with OpenTelemetry traces, which are sent to the OTEL collector and forwarded to the Tempo backend. The Grafana UI can then read the trace data from the Tempo backend.
diff --git a/.github/tracing/grafana-datasources.yaml b/.github/tracing/grafana-datasources.yaml
@@ -0,0 +1,18 @@
+apiVersion: 1
+
+datasources:
+- name: Tempo
+  type: tempo
+  access: proxy
+  orgId: 1
+  url: http://tempo:3200
+  basicAuth: false
+  isDefault: true
+  version: 1
+  editable: false
+  apiVersion: 1
+  uid: tempo
+  jsonData:
+    httpMethod: GET
+    serviceMap:
+      datasourceUid: prometheus
diff --git a/.github/tracing/local-smoke-docker-compose.yaml b/.github/tracing/local-smoke-docker-compose.yaml
@@ -0,0 +1,46 @@
+version: "3"
+services:
+
+  # ... the OpenTelemetry Collector configured to receive traces and export to Tempo ...
+  otel-collector:
+    image: otel/opentelemetry-collector:0.61.0
+    command: [ "--config=/etc/otel-collector.yaml" ]
+    volumes:
+      - ./otel-collector.yaml:/etc/otel-collector.yaml
+    ports:
+      - "4317:4317" # otlp grpc
+    depends_on:
+      - tempo
+    networks:
+      - tracing-network
+
+  # .. Which accepts requests from grafana ...
+  tempo:
+    image: grafana/tempo:latest
+    command: [ "-config.file=/etc/tempo.yaml" ]
+    volumes:
+      - ./tempo.yaml:/etc/tempo.yaml
+      - ./tempo-data:/tmp/tempo
+    ports:
+      - "4317"  # otlp grpc
+    networks:
+      - tracing-network
+
+  grafana:
+    image: grafana/grafana:9.4.3
+    volumes:
+      - ./grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml
+    environment:
+      - GF_AUTH_ANONYMOUS_ENABLED=true
+      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
+      - GF_AUTH_DISABLE_LOGIN_FORM=true
+      - GF_FEATURE_TOGGLES_ENABLE=traceqlEditor
+    ports:
+      - "3000:3000"
+    networks:
+      - tracing-network
+
+networks:
+  tracing-network:
+    name: tracing
+    driver: bridge
diff --git a/.github/tracing/otel-collector.yaml b/.github/tracing/otel-collector.yaml
@@ -0,0 +1,15 @@
+receivers:
+  otlp:
+    protocols:
+      grpc:
+        endpoint: "0.0.0.0:4317"
+exporters:
+  otlp:
+    endpoint: tempo:4317
+    tls:
+      insecure: true
+service:
+  pipelines:
+    traces:
+      receivers: [otlp]
+      exporters: [otlp]
diff --git a/.github/tracing/tempo.yaml b/.github/tracing/tempo.yaml
@@ -0,0 +1,24 @@
+server:
+  http_listen_port: 3200
+
+distributor:
+  receivers:                           
+    otlp:
+      protocols:
+        http:
+        grpc:
+
+ingester:
+  max_block_duration: 5m      # cut the headblock when this much time passes. this is being set for demo purposes and should probably be left alone normally
+
+compactor:
+  compaction:
+    block_retention: 1h       # overall Tempo trace retention. set for demo purposes
+
+storage:
+  trace:
+    backend: local            # backend configuration to use
+    wal:
+      path: /tmp/tempo/wal    # where to store the WAL locally
+    local:
+      path: /tmp/tempo/blocks
diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml
@@ -53,7 +53,6 @@ jobs:
         continue-on-error: true
     outputs:
       src: ${{ steps.changes.outputs.src }}
-
   build-chainlink:
     environment: integration
     permissions:
@@ -254,6 +253,11 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
+        image:
+          - name: (legacy)
+            tag-suffix: ""
+          - name: (plugins)
+            tag-suffix: -plugins
         product:
           - name: cron
             nodes: 1
@@ -296,7 +300,7 @@ jobs:
             os: ubuntu20.04-8cores-32GB
             pyroscope_env: ci-smoke-forwarder-ocr-evm-simulated
     runs-on: ${{ matrix.product.os }}
-    name: ETH Smoke Tests ${{ matrix.product.name }}
+    name: ETH Smoke Tests ${{ matrix.product.name }}${{ matrix.image.tag-suffix }}
     steps:
       - name: Checkout the repo
         uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0
@@ -311,6 +315,97 @@ jobs:
           else
             echo "run_command=./smoke/${{ matrix.product.name }}_test.go" >> "$GITHUB_OUTPUT"
           fi
+      - name: Check for "enable tracing" label
+        id: check-label
+        run: |
+          # A PR can carry several labels and jq prints one name per line, so look for a
+          # whole-line match instead of comparing the entire multi-line string.
+          labels=$(jq -r '.pull_request.labels[]?.name // empty' "$GITHUB_EVENT_PATH")
+
+          if [[ -z "$labels" ]]; then
+            echo "No labels present or labels are null."
+            echo "trace=false" >> "$GITHUB_OUTPUT"
+          elif grep -Fxq "enable tracing" <<< "$labels"; then
+            echo "Enable tracing label found."
+            echo "trace=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "Enable tracing label not found."
+            echo "trace=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Setup Grafana and OpenTelemetry
+        id: docker-setup
+        if: steps.check-label.outputs.trace == 'true' && matrix.product.name == 'ocr2' && matrix.image.tag-suffix == '-plugins'
+        run: |
+          # Create network
+          docker network create --driver bridge tracing
+
+          # Start Grafana
+          cd ./.github/tracing
+          docker run -d --network=tracing --name=grafana -p 3000:3000 -v $PWD/grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml -e GF_AUTH_ANONYMOUS_ENABLED=true -e GF_AUTH_ANONYMOUS_ORG_ROLE=Admin -e GF_AUTH_DISABLE_LOGIN_FORM=true -e GF_FEATURE_TOGGLES_ENABLE=traceqlEditor grafana/grafana:9.4.3
+
+          # Start Tempo (bind-mount sources are absolute $PWD paths, matching the other containers)
+          docker run -d --network=tracing --name=tempo -v $PWD/tempo.yaml:/etc/tempo.yaml -v $PWD/tempo-data:/tmp/tempo grafana/tempo:latest -config.file=/etc/tempo.yaml
+
+          # Start OpenTelemetry Collector
+          docker run -d --network=tracing --name=otel-collector -v $PWD/otel-collector.yaml:/etc/otel-collector.yaml -p 4317:4317 otel/opentelemetry-collector:0.61.0 --config=/etc/otel-collector.yaml
+
+      - name: Generate port
+        id: generate-port
+        env:
+          GITHUB_PR_NUMBER: ${{ github.event.number }}
+        run: |
+          PORT_BASE=3001
+          MAX_PORT=8000 
+  
+          # Use PR number as offset. Given GitHub PRs are incremental, this guarantees uniqueness for at least 5000 PRs.
+          OFFSET=$GITHUB_PR_NUMBER
+          echo "PR Number: $OFFSET"
+
+          # Ensure that we don't exceed the max port
+          if (( OFFSET > (MAX_PORT - PORT_BASE) )); then
+              OFFSET=$((OFFSET % (MAX_PORT - PORT_BASE)))
+          fi
+
+          # Map the offset to the port range
+          REMOTE_PORT=$((PORT_BASE + OFFSET))
+          echo "REMOTE_PORT=$REMOTE_PORT" >> $GITHUB_OUTPUT
+      - name: Reverse SSH Tunneling
+        if: steps.check-label.outputs.trace == 'true' && matrix.product.name == 'ocr2' && matrix.image.tag-suffix == '-plugins'
+        env:
+          TRACING_SSH_KEY: ${{ secrets.TRACING_SSH_KEY }}
+          TRACING_SSH_SERVER: ${{ secrets.TRACING_SSH_SERVER }}
+          REMOTE_PORT: ${{ steps.generate-port.outputs.REMOTE_PORT }}
+        run: |
+            eval $(ssh-agent)
+            echo "test"
+            echo "$TRACING_SSH_KEY" | wc -c
+            echo "$TRACING_SSH_KEY" | tr -d '\r' | wc -c
+            echo "$TRACING_SSH_KEY" | tr -d '\r' | base64 --decode | ssh-add -
+            # f: background process
+            # N: do not execute a remote command
+            # R: remote port forwarding
+            ssh -o StrictHostKeyChecking=no -f -N -R $REMOTE_PORT:127.0.0.1:3000 user-gha@$TRACING_SSH_SERVER
+            echo "To view Grafana locally:"
+            echo "ssh -N -L 8000:localhost:$REMOTE_PORT user-gha@$TRACING_SSH_SERVER"
+            echo "Then visit http://localhost:8000 in a browser."
+            echo "If you are unable to connect, check with the security team that you have access to the tracing server."
+      - name: Show Grafana Logs
+        if: steps.check-label.outputs.trace == 'true' && matrix.product.name == 'ocr2' && matrix.image.tag-suffix == '-plugins'
+        run: |
+            docker logs grafana
+      - name: Show Tempo Logs
+        if: steps.check-label.outputs.trace == 'true' && matrix.product.name == 'ocr2' && matrix.image.tag-suffix == '-plugins'
+        run: |
+            docker logs tempo
+      - name: Show OpenTelemetry Collector Logs
+        if: steps.check-label.outputs.trace == 'true' && matrix.product.name == 'ocr2' && matrix.image.tag-suffix == '-plugins'
+        run: |
+            docker logs otel-collector
+      - name: Set sleep time to use in future steps
+        if: steps.check-label.outputs.trace == 'true' && matrix.product.name == 'ocr2' && matrix.image.tag-suffix == '-plugins'
+        run: |
+          echo "SLEEP_TIME=2400" >> "$GITHUB_ENV"
       ## Run this step when changes that require tests to be run are made
       - name: Run Tests
         if: needs.changes.outputs.src == 'true'
@@ -323,7 +418,7 @@ jobs:
           test_command_to_run: make test_need_operator_assets && cd ./integration-tests && go test -timeout 30m -count=1 -json -test.parallel=${{ matrix.product.nodes }} ${{ steps.build-go-test-command.outputs.run_command }} 2>&1 | tee /tmp/gotest.log | gotestfmt
           test_download_vendor_packages_command: cd ./integration-tests && go mod download
           cl_repo: ${{ env.CHAINLINK_IMAGE }}
-          cl_image_tag: ${{ github.sha }}
+          cl_image_tag: ${{ github.sha }}${{ matrix.image.tag-suffix }}
           aws_registries: ${{ secrets.QA_AWS_ACCOUNT_NUMBER }}
           artifacts_name: ${{ matrix.product.name }}-test-logs
           artifacts_location: ./integration-tests/smoke/logs/
@@ -347,7 +442,6 @@ jobs:
           QA_AWS_REGION: ${{ secrets.QA_AWS_REGION }}
           QA_AWS_ROLE_TO_ASSUME: ${{ secrets.QA_AWS_ROLE_TO_ASSUME }}
           QA_KUBECONFIG: ${{ secrets.QA_KUBECONFIG }}
-
       - name: Collect Metrics
         if: always()
         id: collect-gha-metrics
@@ -358,6 +452,11 @@ jobs:
           this-job-name: ETH Smoke Tests ${{ matrix.product.name }}
           test-results-file: '{"testType":"go","filePath":"/tmp/gotest.log"}'
         continue-on-error: true
+      - name: Keep action running to view traces
+        if: steps.check-label.outputs.trace == 'true' && matrix.product.name == 'ocr2' && matrix.image.tag-suffix == '-plugins'
+        run: |
+          echo "Sleeping for $SLEEP_TIME seconds..."
+          sleep $SLEEP_TIME
 
   ### Used to check the required checks box when the matrix completes
   eth-smoke-tests:

diff --git a/core/chains/evm/config/mocks/chain_scoped_config.go b/core/chains/evm/config/mocks/chain_scoped_config.go
diff --git a/core/cmd/shell.go b/core/cmd/shell.go
@@ -60,12 +60,20 @@ var (
 	grpcOpts        loop.GRPCOpts
 )
 
-func initGlobals(cfg config.Prometheus) {
-	// Avoid double initializations.
+func initGlobals(cfgProm config.Prometheus, cfgTracing config.Tracing) error {
+	// Avoid double initializations, but does not prevent relay methods from being called multiple times.
+	var err error
 	initGlobalsOnce.Do(func() {
-		prometheus = ginprom.New(ginprom.Namespace("service"), ginprom.Token(cfg.AuthToken()))
-		grpcOpts = loop.SetupTelemetry(nil) // default prometheus.Registerer
+		prometheus = ginprom.New(ginprom.Namespace("service"), ginprom.Token(cfgProm.AuthToken()))
+		grpcOpts = loop.NewGRPCOpts(nil) // default prometheus.Registerer
+		err = loop.SetupTracing(loop.TracingConfig{
+			Enabled:         cfgTracing.Enabled(),
+			CollectorTarget: cfgTracing.CollectorTarget(),
+			NodeAttributes:  cfgTracing.Attributes(),
+			SamplingRatio:   cfgTracing.SamplingRatio(),
+		})
 	})
+	return err
 }
 
 var (
@@ -126,7 +134,10 @@ type ChainlinkAppFactory struct{}
 
 // NewApplication returns a new instance of the node with the given config.
 func (n ChainlinkAppFactory) NewApplication(ctx context.Context, cfg chainlink.GeneralConfig, appLggr logger.Logger, db *sqlx.DB) (app chainlink.Application, err error) {
-	initGlobals(cfg.Prometheus())
+	err = initGlobals(cfg.Prometheus(), cfg.Tracing())
+	if err != nil {
+		appLggr.Errorf("Failed to initialize globals: %v", err)
+	}
 
 	err = migrate.SetMigrationENVVars(cfg)
 	if err != nil {
@@ -143,7 +154,7 @@ func (n ChainlinkAppFactory) NewApplication(ctx context.Context, cfg chainlink.G
 
 	dbListener := cfg.Database().Listener()
 	eventBroadcaster := pg.NewEventBroadcaster(cfg.Database().URL(), dbListener.MinReconnectInterval(), dbListener.MaxReconnectDuration(), appLggr, cfg.AppID())
-	loopRegistry := plugins.NewLoopRegistry(appLggr)
+	loopRegistry := plugins.NewLoopRegistry(appLggr, cfg.Tracing())
 
 	// create the relayer-chain interoperators from application configuration
 	relayerFactory := chainlink.RelayerFactory{

diff --git a/core/cmd/shell_local_test.go b/core/cmd/shell_local_test.go
@@ -42,7 +42,7 @@ import (
 func genTestEVMRelayers(t *testing.T, opts evm.ChainRelayExtenderConfig, ks evmrelayer.CSAETHKeystore) *chainlink.CoreRelayerChainInteroperators {
 	f := chainlink.RelayerFactory{
 		Logger:       opts.Logger,
-		LoopRegistry: plugins.NewLoopRegistry(opts.Logger),
+		LoopRegistry: plugins.NewLoopRegistry(opts.Logger, opts.AppConfig.Tracing()),
 	}
 
 	relayers, err := chainlink.NewCoreRelayerChainInteroperators(chainlink.InitEVM(testutils.Context(t), f, chainlink.EVMFactoryConfig{

diff --git a/core/cmd/shell_test.go b/core/cmd/shell_test.go
@@ -345,7 +345,7 @@ func TestNewUserCache(t *testing.T) {
 
 func TestSetupSolanaRelayer(t *testing.T) {
 	lggr := logger.TestLogger(t)
-	reg := plugins.NewLoopRegistry(lggr)
+	reg := plugins.NewLoopRegistry(lggr, nil)
 	ks := mocks.NewSolana(t)
 
 	// config 3 chains but only enable 2 => should only be 2 relayer
@@ -432,7 +432,7 @@ func TestSetupSolanaRelayer(t *testing.T) {
 
 func TestSetupStarkNetRelayer(t *testing.T) {
 	lggr := logger.TestLogger(t)
-	reg := plugins.NewLoopRegistry(lggr)
+	reg := plugins.NewLoopRegistry(lggr, nil)
 	ks := mocks.NewStarkNet(t)
 	// config 3 chains but only enable 2 => should only be 2 relayer
 	nEnabledChains := 2

diff --git a/core/config/app_config.go b/core/config/app_config.go
@@ -53,6 +53,7 @@ type AppConfig interface {
 	TelemetryIngress() TelemetryIngress
 	Threshold() Threshold
 	WebServer() WebServer
+	Tracing() Tracing
 }
 
 type DatabaseBackupMode string

diff --git a/core/config/docs/core.toml b/core/config/docs/core.toml
@@ -540,3 +540,17 @@ InfiniteDepthQueries = false # Default
 # DisableRateLimiting skips ratelimiting on asset requests.
 DisableRateLimiting = false # Default
 
+[Tracing]
+# Enabled turns trace collection on or off. On requires an OTEL Tracing Collector.
+Enabled = false # Default
+# CollectorTarget is the logical address of the OTEL Tracing Collector.
+CollectorTarget = "localhost:4317" # Example
+# NodeID is a unique name for this node, distinguishing it from any other node whose traces are collected.
+NodeID = "NodeID" # Example
+# SamplingRatio is the ratio of traces to sample for this node.
+SamplingRatio = 1.0 # Example
+
+# Tracing.Attributes are user specified key-value pairs to associate in the context of the traces
+[Tracing.Attributes]
+# env is an example user specified key-value pair
+env = "test" # Example