From e5359cad982b909166069cddb9d3ec4ccdfef811 Mon Sep 17 00:00:00 2001 From: Maria Elisabeth Schreiber Date: Fri, 7 Feb 2025 14:16:46 -0700 Subject: [PATCH] docs: update observability/telemetry overview pages (#6563) Co-authored-by: Edward Huang Co-authored-by: Shane Myrick --- .../telemetry/instrumentation/conditions.mdx | 2 + .../telemetry/instrumentation/events.mdx | 2 + .../telemetry/instrumentation/instruments.mdx | 2 + .../telemetry/instrumentation/selectors.mdx | 2 + .../telemetry/instrumentation/spans.mdx | 2 + .../instrumentation/standard-attributes.mdx | 2 + .../instrumentation/standard-instruments.mdx | 2 + .../telemetry/log-exporters/overview.mdx | 2 + .../router/telemetry/log-exporters/stdout.mdx | 2 + .../telemetry/metrics-exporters/datadog.mdx | 2 + .../telemetry/metrics-exporters/dynatrace.mdx | 2 + .../telemetry/metrics-exporters/new-relic.mdx | 2 + .../telemetry/metrics-exporters/otlp.mdx | 2 + .../telemetry/metrics-exporters/overview.mdx | 2 + .../metrics-exporters/prometheus.mdx | 2 + .../telemetry/trace-exporters/datadog.mdx | 2 + .../telemetry/trace-exporters/dynatrace.mdx | 2 + .../telemetry/trace-exporters/jaeger.mdx | 2 + .../telemetry/trace-exporters/new-relic.mdx | 2 + .../router/telemetry/trace-exporters/otlp.mdx | 2 + .../telemetry/trace-exporters/overview.mdx | 2 + .../telemetry/trace-exporters/zipkin.mdx | 2 + .../debugging-client-requests.mdx | 79 ++++ docs/source/routing/observability/index.mdx | 59 +-- .../otel-traces-to-prometheus.mdx | 2 + docs/source/routing/observability/otel.mdx | 2 + .../routing/observability/telemetry.mdx | 364 +++++++++++++----- 27 files changed, 433 insertions(+), 117 deletions(-) create mode 100644 docs/source/routing/observability/debugging-client-requests.mdx diff --git a/docs/source/reference/router/telemetry/instrumentation/conditions.mdx b/docs/source/reference/router/telemetry/instrumentation/conditions.mdx index 79ed89b061..7ff72a34a5 100644 --- 
a/docs/source/reference/router/telemetry/instrumentation/conditions.mdx +++ b/docs/source/reference/router/telemetry/instrumentation/conditions.mdx @@ -2,6 +2,8 @@ title: Conditions subtitle: Set conditions for when events or instruments are triggered description: Set conditions for when events or instruments are triggered in the Apollo GraphOS Router. +context: + - telemetry --- You can set conditions for when an [instrument](/router/configuration/telemetry/instrumentation/instruments) should be mutated or an [event](/router/configuration/telemetry/instrumentation/events) should be triggered. diff --git a/docs/source/reference/router/telemetry/instrumentation/events.mdx b/docs/source/reference/router/telemetry/instrumentation/events.mdx index 6d0b330e0b..f6a93c9db0 100644 --- a/docs/source/reference/router/telemetry/instrumentation/events.mdx +++ b/docs/source/reference/router/telemetry/instrumentation/events.mdx @@ -2,6 +2,8 @@ title: Events subtitle: Capture events from the router's request lifecycle description: Capture standard and custom events from the Apollo GraphOS Router's request lifecycle services. +context: + - telemetry --- import RouterServices from '../../../../../shared/router-lifecycle-services.mdx'; diff --git a/docs/source/reference/router/telemetry/instrumentation/instruments.mdx b/docs/source/reference/router/telemetry/instrumentation/instruments.mdx index 6a98c41c9f..3f6a92ee12 100644 --- a/docs/source/reference/router/telemetry/instrumentation/instruments.mdx +++ b/docs/source/reference/router/telemetry/instrumentation/instruments.mdx @@ -2,6 +2,8 @@ title: Instruments subtitle: Collect measurements with standard and custom instruments description: Create and customize instruments to collect data and report measurements from the Apollo GraphOS Router's request lifecycle services. 
+context: + - telemetry --- import RouterServices from '../../../../../shared/router-lifecycle-services.mdx'; diff --git a/docs/source/reference/router/telemetry/instrumentation/selectors.mdx b/docs/source/reference/router/telemetry/instrumentation/selectors.mdx index aefb405193..0e905747cc 100644 --- a/docs/source/reference/router/telemetry/instrumentation/selectors.mdx +++ b/docs/source/reference/router/telemetry/instrumentation/selectors.mdx @@ -2,6 +2,8 @@ title: Selectors subtitle: Select data from the router pipeline to extract description: Extract and select data from the Apollo GraphOS Router's pipeline services to attach to telemetry. +context: + - telemetry --- import RouterServices from '../../../../../shared/router-lifecycle-services.mdx'; diff --git a/docs/source/reference/router/telemetry/instrumentation/spans.mdx b/docs/source/reference/router/telemetry/instrumentation/spans.mdx index f64a5a5237..4049b14979 100644 --- a/docs/source/reference/router/telemetry/instrumentation/spans.mdx +++ b/docs/source/reference/router/telemetry/instrumentation/spans.mdx @@ -2,6 +2,8 @@ title: Spans subtitle: Add router lifecycle context to traces description: Use spans to add contextual information from the Apollo GraphOS Router or Apollo Router Core to traces displayed by your application performance monitors (APM). 
+context: + - telemetry --- import RouterServices from '../../../../../shared/router-lifecycle-services.mdx'; diff --git a/docs/source/reference/router/telemetry/instrumentation/standard-attributes.mdx b/docs/source/reference/router/telemetry/instrumentation/standard-attributes.mdx index bea6b3927f..d393e3097d 100644 --- a/docs/source/reference/router/telemetry/instrumentation/standard-attributes.mdx +++ b/docs/source/reference/router/telemetry/instrumentation/standard-attributes.mdx @@ -2,6 +2,8 @@ title: OpenTelemetry standard attributes subtitle: Attach standard attributes to router telemetry description: Attach OpenTelemetry (OTel) standard attributes to Apollo GraphOS Router or Apollo Router Core telemetry. +context: + - telemetry --- import RouterServices from '../../../../../shared/router-lifecycle-services.mdx'; diff --git a/docs/source/reference/router/telemetry/instrumentation/standard-instruments.mdx b/docs/source/reference/router/telemetry/instrumentation/standard-instruments.mdx index 714984bb79..bda984e192 100644 --- a/docs/source/reference/router/telemetry/instrumentation/standard-instruments.mdx +++ b/docs/source/reference/router/telemetry/instrumentation/standard-instruments.mdx @@ -2,6 +2,8 @@ title: Router Instruments subtitle: Standard metric instruments for the router's request lifecycle description: Reference of standard metric instruments for the request lifecycle of GraphOS Router and Apollo Router Core. Consumable via the router's metrics exporters. 
+context: + - telemetry --- ## Standard metric instruments diff --git a/docs/source/reference/router/telemetry/log-exporters/overview.mdx b/docs/source/reference/router/telemetry/log-exporters/overview.mdx index bbfe532694..1333013aa2 100644 --- a/docs/source/reference/router/telemetry/log-exporters/overview.mdx +++ b/docs/source/reference/router/telemetry/log-exporters/overview.mdx @@ -2,6 +2,8 @@ title: Router Logging subtitle: Configure logging in the router description: Configure logging in the Apollo GraphOS Router or Apollo Router Core. Set the log level and output format. +context: + - telemetry --- GraphOS Router and Apollo Router Core provide built-in logging to capture records about their activity. diff --git a/docs/source/reference/router/telemetry/log-exporters/stdout.mdx b/docs/source/reference/router/telemetry/log-exporters/stdout.mdx index fddae9b1be..2063eb2024 100644 --- a/docs/source/reference/router/telemetry/log-exporters/stdout.mdx +++ b/docs/source/reference/router/telemetry/log-exporters/stdout.mdx @@ -2,6 +2,8 @@ title: Router Logging to stdout subtitle: Configure logging to stdout description: Configure logging output to stdout in the Apollo GraphOS Router or Apollo Router Core. Format in human-readable text or machine-readable JSON. +context: + - telemetry --- You can configure GraphOS Router or Apollo Router Core logging to be directed to stdout, and its output format can be set to text or JSON. 
diff --git a/docs/source/reference/router/telemetry/metrics-exporters/datadog.mdx b/docs/source/reference/router/telemetry/metrics-exporters/datadog.mdx index 4526ab38d4..dea5e72616 100644 --- a/docs/source/reference/router/telemetry/metrics-exporters/datadog.mdx +++ b/docs/source/reference/router/telemetry/metrics-exporters/datadog.mdx @@ -2,6 +2,8 @@ title: Datadog exporter (via OTLP) subtitle: Configure the Datadog exporter for metrics description: Configure the Datadog exporter for metrics via OpenTelemetry Protocol (OTLP) in the Apollo GraphOS Router or Apollo Router Core. +context: + - telemetry --- Enable and configure the [OTLP exporter](/router/configuration/telemetry/exporters/metrics/otlp) for metrics in the GraphOS Router or Apollo Router Core for use with [Datadog](https://www.datadoghq.com/). diff --git a/docs/source/reference/router/telemetry/metrics-exporters/dynatrace.mdx b/docs/source/reference/router/telemetry/metrics-exporters/dynatrace.mdx index fedb7f9e90..db28900e05 100644 --- a/docs/source/reference/router/telemetry/metrics-exporters/dynatrace.mdx +++ b/docs/source/reference/router/telemetry/metrics-exporters/dynatrace.mdx @@ -2,6 +2,8 @@ title: Dynatrace exporter (via OTLP) subtitle: Configure the Dynatrace exporter for metrics description: Configure the Dynatrace exporter for metrics via OpenTelemetry Protocol (OTLP) in the Apollo Router. +context: + - telemetry --- Enable and configure the [OTLP exporter](/router/configuration/telemetry/exporters/metrics/otlp) for metrics in the Apollo Router for use with [Dynatrace](https://dynatrace.com/). 
diff --git a/docs/source/reference/router/telemetry/metrics-exporters/new-relic.mdx b/docs/source/reference/router/telemetry/metrics-exporters/new-relic.mdx index b22ae19903..57a0a660bf 100644 --- a/docs/source/reference/router/telemetry/metrics-exporters/new-relic.mdx +++ b/docs/source/reference/router/telemetry/metrics-exporters/new-relic.mdx @@ -2,6 +2,8 @@ title: New Relic exporter (via OTLP) subtitle: Configure the New Relic exporter for metrics description: Configure the New Relic exporter for metrics via OpenTelemetry Protocol (OTLP) in the Apollo GraphOS Router or Apollo Router Core. +context: + - telemetry --- Enable and configure the [OTLP exporter](/router/configuration/telemetry/exporters/metrics/otlp) for metrics in the GraphOS Router or Apollo Router Core for use with [New Relic](https://newrelic.com/). diff --git a/docs/source/reference/router/telemetry/metrics-exporters/otlp.mdx b/docs/source/reference/router/telemetry/metrics-exporters/otlp.mdx index 08042c405a..6831abfb8f 100644 --- a/docs/source/reference/router/telemetry/metrics-exporters/otlp.mdx +++ b/docs/source/reference/router/telemetry/metrics-exporters/otlp.mdx @@ -2,6 +2,8 @@ title: OpenTelemetry Protocol (OTLP) exporter subtitle: Configure the OpenTelemetry Protocol (OTLP) exporter for metrics description: Configure the OpenTelemetry Protocol (OTLP) exporter for metrics in the Apollo GraphOS Router or Apollo Router Core. 
+context: + - telemetry --- import BatchProcessorPreamble from '../../../../../shared/batch-processor-preamble.mdx'; import BatchProcessorRef from '../../../../../shared/batch-processor-ref.mdx'; diff --git a/docs/source/reference/router/telemetry/metrics-exporters/overview.mdx b/docs/source/reference/router/telemetry/metrics-exporters/overview.mdx index c269a0f0d1..488baaf3db 100644 --- a/docs/source/reference/router/telemetry/metrics-exporters/overview.mdx +++ b/docs/source/reference/router/telemetry/metrics-exporters/overview.mdx @@ -4,6 +4,8 @@ subtitle: Export router metrics description: Collect and export metrics from the Apollo GraphOS Router or Apollo Router Core for Prometheus, OpenTelemetry Protocol (OTLP), Datadog, and New Relic. redirectFrom: - /technotes/TN0015-router-to-apm-via-opentelemetry/ +context: + - telemetry --- The GraphOS Router and Apollo Router Core support collection of metrics with [OpenTelemetry](https://opentelemetry.io/), with exporters for: diff --git a/docs/source/reference/router/telemetry/metrics-exporters/prometheus.mdx b/docs/source/reference/router/telemetry/metrics-exporters/prometheus.mdx index 00a4323f08..518c31d336 100644 --- a/docs/source/reference/router/telemetry/metrics-exporters/prometheus.mdx +++ b/docs/source/reference/router/telemetry/metrics-exporters/prometheus.mdx @@ -2,6 +2,8 @@ title: Prometheus exporter subtitle: Configure the Prometheus metrics exporter description: Configure the Prometheus metrics exporter endpoint in the Apollo GraphOS Router or Apollo Router Core. +context: + - telemetry --- Enable and configure the [Prometheus](https://www.prometheus.io/) exporter for metrics in the GraphOS Router or Apollo Router Core. 
diff --git a/docs/source/reference/router/telemetry/trace-exporters/datadog.mdx b/docs/source/reference/router/telemetry/trace-exporters/datadog.mdx index 8d306fc6d2..a359423073 100644 --- a/docs/source/reference/router/telemetry/trace-exporters/datadog.mdx +++ b/docs/source/reference/router/telemetry/trace-exporters/datadog.mdx @@ -2,6 +2,8 @@ title: Datadog exporter (via OTLP) subtitle: Configure the Datadog exporter for tracing description: Configure the Datadog exporter for tracing via OpenTelemetry Protocol (OTLP) in the Apollo GraphOS Router or Apollo Router Core. +context: + - telemetry --- import BatchProcessorPreamble from '../../../../../shared/batch-processor-preamble.mdx'; import BatchProcessorRef from '../../../../../shared/batch-processor-ref.mdx'; diff --git a/docs/source/reference/router/telemetry/trace-exporters/dynatrace.mdx b/docs/source/reference/router/telemetry/trace-exporters/dynatrace.mdx index db736561d4..81f5dc8b86 100644 --- a/docs/source/reference/router/telemetry/trace-exporters/dynatrace.mdx +++ b/docs/source/reference/router/telemetry/trace-exporters/dynatrace.mdx @@ -2,6 +2,8 @@ title: Dynatrace exporter (via OTLP) subtitle: Configure the Dynatrace exporter for tracing description: Configure the Dynatrace exporter for tracing via OpenTelemetry Protocol (OTLP) in the Apollo Router. +context: + - telemetry --- Enable and configure the [OTLP exporter](/router/configuration/telemetry/exporters/tracing/otlp) for tracing in the Apollo Router for use with [Dynatrace](https://dynatrace.com/). 
diff --git a/docs/source/reference/router/telemetry/trace-exporters/jaeger.mdx b/docs/source/reference/router/telemetry/trace-exporters/jaeger.mdx index bde89146a0..4d323613f0 100644 --- a/docs/source/reference/router/telemetry/trace-exporters/jaeger.mdx +++ b/docs/source/reference/router/telemetry/trace-exporters/jaeger.mdx @@ -2,6 +2,8 @@ title: Jaeger exporter (via OTLP) subtitle: Configure the Jaeger exporter for tracing description: Configure the Jaeger exporter for tracing via OpenTelemetry Protocol (OTLP) in the Apollo GraphOS Router or Apollo Router Core. +context: + - telemetry --- import BatchProcessorPreamble from "../../../../../shared/batch-processor-preamble.mdx"; diff --git a/docs/source/reference/router/telemetry/trace-exporters/new-relic.mdx b/docs/source/reference/router/telemetry/trace-exporters/new-relic.mdx index 2c1a645d3c..8aa69558c9 100644 --- a/docs/source/reference/router/telemetry/trace-exporters/new-relic.mdx +++ b/docs/source/reference/router/telemetry/trace-exporters/new-relic.mdx @@ -2,6 +2,8 @@ title: New Relic exporter (via OTLP) subtitle: Configure the New Relic exporter for tracing description: Configure the New Relic exporter for tracing via OpenTelemetry Protocol (OTLP) in the Apollo GraphOS Router or Apollo Router Core. +context: + - telemetry --- Enable and configure the [OTLP exporter](/router/configuration/telemetry/exporters/tracing/otlp) for tracing in the GraphOS Router or Apollo Router Core for use with [New Relic](https://newrelic.com/). 
diff --git a/docs/source/reference/router/telemetry/trace-exporters/otlp.mdx b/docs/source/reference/router/telemetry/trace-exporters/otlp.mdx index fa7ff3a39e..432ac0ddaa 100644 --- a/docs/source/reference/router/telemetry/trace-exporters/otlp.mdx +++ b/docs/source/reference/router/telemetry/trace-exporters/otlp.mdx @@ -2,6 +2,8 @@ title: OpenTelemetry Protocol (OTLP) exporter subtitle: Configure the OpenTelemetry Protocol exporter for tracing description: Configure the OpenTelemetry Protocol (OTLP) exporter for tracing in the Apollo GraphOS Router or Apollo Router Core. +context: + - telemetry --- import BatchProcessorPreamble from '../../../../../shared/batch-processor-preamble.mdx'; import BatchProcessorRef from '../../../../../shared/batch-processor-ref.mdx'; diff --git a/docs/source/reference/router/telemetry/trace-exporters/overview.mdx b/docs/source/reference/router/telemetry/trace-exporters/overview.mdx index 76f54d7e4d..48d2bcf1de 100644 --- a/docs/source/reference/router/telemetry/trace-exporters/overview.mdx +++ b/docs/source/reference/router/telemetry/trace-exporters/overview.mdx @@ -2,6 +2,8 @@ title: Router Tracing subtitle: Collect tracing information from the router description: Collect and export tracing information from the Apollo GraphOS Router or Apollo Router Core. Supports OpenTelemetry Protocol (OTLP), Datadog, New Relic, Jaeger, Zipkin. 
+context: + - telemetry --- The GraphOS Router and Apollo Router Core support collection of traces with [OpenTelemetry](https://opentelemetry.io/), with exporters for: diff --git a/docs/source/reference/router/telemetry/trace-exporters/zipkin.mdx b/docs/source/reference/router/telemetry/trace-exporters/zipkin.mdx index b4a732999e..1af24bb6a8 100644 --- a/docs/source/reference/router/telemetry/trace-exporters/zipkin.mdx +++ b/docs/source/reference/router/telemetry/trace-exporters/zipkin.mdx @@ -2,6 +2,8 @@ title: Zipkin exporter subtitle: Configure the Zipkin exporter for tracing description: Enable and configure the Zipkin exporter for tracing in the Apollo GraphOS Router or Apollo Router Core. +context: + - telemetry --- import BatchProcessorPreamble from '../../../../../shared/batch-processor-preamble.mdx'; import BatchProcessorRef from '../../../../../shared/batch-processor-ref.mdx'; diff --git a/docs/source/routing/observability/debugging-client-requests.mdx b/docs/source/routing/observability/debugging-client-requests.mdx new file mode 100644 index 0000000000..6126b3cfaa --- /dev/null +++ b/docs/source/routing/observability/debugging-client-requests.mdx @@ -0,0 +1,79 @@ +--- +title: Debugging Client Requests to GraphOS Router +subtitle: Options for analyzing and debugging incoming requests +description: Learn how to use GraphOS Router telemetry and GraphOS Insights to inspect and debug incoming HTTP client requests. +context: + - telemetry +--- + +By default, the GraphOS Router operates [without generating HTTP request logs or exporting telemetry metrics beyond what it sends to GraphOS](/graphos/routing/observability). +This default minimizes potentially high observability costs that can result from high request volumes. +If you need more data than the default [GraphOS Insights](/graphos/platform/insights), you can configure your router to collect and export additional telemetry. 
+ +## Using GraphOS Insights + +GraphOS Studio lets you analyze data from failed requests, such as GraphQL error messages ([if enabled](/graphos/routing/graphos-reporting#errors)) and the ID of the client making the request. You can also [segment your insights data](/graphos/platform/insights/client-segmentation) based on the client ID. + + + +[Learn how to ensure client IDs are included in all requests.](/graphos/routing/observability/client-id-enforcement) + + + +## Enabling additional telemetry + +You can instrument [router telemetry](/graphos/routing/observability/telemetry) if you need information outside of what's presented in GraphOS Studio to debug client requests. + + + +If you want to debug client requests in your own environment, Apollo recommends first doing so in a non-production environment or using logic to debug on a per-request basis. + + + +### Logging requests + +You can conditionally include request bodies, including GraphQL operations, in your telemetry based on specific [conditions](/graphos/reference/router/telemetry/instrumentation/conditions). Apply these conditions on a router request [event](/graphos/reference/router/telemetry/instrumentation/events) like so: + +```yaml title="router.yaml" +telemetry: + instrumentation: + events: + router: + request: + level: info + condition: # Only log the router request if you sent `x-log-request` with the value `enabled` + eq: + - request_header: x-log-request + - "enabled" +``` + +### Debugging router logs + +By default, the router uses the `info` level for its logging. [Enabling other logging levels](/graphos/reference/router/telemetry/log-exporters/overview) can help debug specific scenarios. Using non-`info` level configurations is only recommended for local or non-production environments. 
+ +## Rhai scripts and coprocessors + +Hooking into the router service layer with either [Rhai scripts](/graphos/routing/customization/rhai) or [coprocessors](/graphos/routing/customization/coprocessor) gives you access to the full HTTP request before processing occurs. You can use either Rhai scripts or coprocessors to add custom logic for what to log and when. + +See the Apollo Solutions ["Hello World" coprocessor](https://github.com/apollosolutions/example-coprocessor-helloworld) for an example of a coprocessor that simply logs the router's payload. + + + +## Alternative cloud services + +If you are deploying the router to a cloud service, you likely already have access to the raw HTTP logs through other services like load balancers. You should be able to find specific client request logs for a particular operation using the operation hash or trace ID. Refer to the docs for your cloud providers for more information. Popular cloud provider links are provided below. + +### Amazon Web Services + +- [AWS CloudWatch Logs](https://docs.aws.amazon.com/AmazonCloudWatch/latest/logs/WhatIsCloudWatchLogs.html) +- [AWS Elastic Load Balancer](https://docs.aws.amazon.com/elasticloadbalancing/latest/application/load-balancer-access-logs.html) + +### Google Cloud Platform + +- [Google Cloud Observability](https://cloud.google.com/logging/docs/log-analytics) +- [Google Cloud Load Balancing](https://cloud.google.com/load-balancing/docs/l7-internal/monitoring) + +### Microsoft Azure + +- [Azure App Service Logging](https://learn.microsoft.com/en-us/azure/app-service/troubleshoot-diagnostic-logs) +- [Azure Load Balancer](https://learn.microsoft.com/en-us/azure/load-balancer/monitor-load-balancer) diff --git a/docs/source/routing/observability/index.mdx b/docs/source/routing/observability/index.mdx index 48d531e69f..5ec8f5dafb 100644 --- a/docs/source/routing/observability/index.mdx +++ b/docs/source/routing/observability/index.mdx @@ -1,39 +1,34 @@ --- -title: Observability with 
GraphOS +title: GraphOS Observability Overview subtitle: Capture and export signals about supergraph health with GraphOS and router telemetry description: Learn how to collect supergraph metrics in order to monitor and optimize your GraphQL usage and performance. Collect raw metrics, insights, and alerts with Apollo GraphOS, GraphOS Studio, and GraphOS Router and Apollo Router Core. redirectFrom: - /federation/performance/monitoring/ --- -Monitoring a supergraph requires gathering metrics about each client, server, subgraph, and router involved in sending or handling requests. Ideally, the entire request pipeline—from client to router to subgraph and back—is instrumented with metrics that can be collected and exported for analysis. +Apollo GraphOS provides the observability signals and tools your team needs to monitor the health and performance of your deployed supergraph. It collects operation metrics from across your supergraph and presents them in its Studio Insights suite to help you visualize and analyze the state of your supergraph. -Apollo GraphOS provides the observability signals and tools your team needs to maintain the health and performance of your deployed supergraphs. Via declarative configuration, GraphOS enables routers to collect GraphQL operation and field metrics and report them back. GraphOS also specifies how to capture metrics on the clients and subgraphs handling operations. +## How observability in GraphOS works -## Understanding runtime health with router telemetry +GraphOS collects its metrics from clients, routers, and subgraphs. By default, GraphOS Router automatically [reports operation and field usage metrics to GraphOS Studio](/graphos/platform/insights/sending-operation-metrics#from-the-apollo-router-or-apollo-server). -Both the GraphOS Router and Apollo Router Core run a request-handling pipeline with multiple stages that starts with receiving requests and ends with sending back responses. 
The continuous operation and throughput of this request pipeline, or "request lifecycle," reflects the health of a running supergraph. Observability of the router request lifecycle is therefore key to understanding the health of a supergraph. +To gain deeper insights into the health of your supergraph, you can configure your GraphOS Router to collect telemetry about requests as they're processed through the pipeline of the router request lifecycle. The router provides both standard and customizable signals. - - -To enable observability, the router supports telemetry that can be added and customized in every stage of the router request lifecycle. You can add logs, metrics, and traces, and you can export them to your application performance monitoring (APM) solution. - -To learn more, go to [Router Telemetry](/graphos/routing/observability/telemetry), then browse the pages in [Router Telemetry](/graphos/reference/router/telemetry/log-exporters/overview) reference docs. +GraphOS supports exporting its collected metrics to various observability tools. GraphOS Studio offers a Datadog integration, and GraphOS Router provides exporters for several observability tools and APMs. -## Automating supergraph metrics collection with GraphOS + -Everything connected to GraphOS—including clients, routers, and subgraphs—can report metrics about GraphQL operations they send and service. GraphOS thus is the hub for collecting operation metrics, and its Studio IDE offers tools to visualize and analyze those operations and their field usage. - -The metrics that GraphOS collects can be forwarded to your APM solution. Apollo offers a [Datadog integration](/graphos/platform/insights/datadog-forwarding) to forward your graph's performance metrics to your Datadog account. +If you're new to observability, check out [OpenTelemetry's observability primer](https://opentelemetry.io/docs/concepts/observability-primer/) to learn core observability concepts. 
+ ## Analyzing metrics and gathering insights with GraphOS -Once the various metrics are collected by GraphOS, you can use the GraphOS Studio UI to visualize and analyze them to understand your supergraph's usage and performance. +Everything connected to GraphOS—including clients, routers, and subgraphs—can report metrics about GraphQL operations they send and service. GraphOS thus is the hub for collecting operation metrics. -- Examine them in the Studio IDE from any variant's **Insights** page and use them to improve your graph's performance. +Once operation and field usage metrics are collected by GraphOS, you can use the GraphOS Studio [**Insights**](/graphos/platform/insights) suite to visualize and analyze them to understand your supergraph's usage and performance. -- Create GraphOS notifications to notify your team about changes to your graph and its performance. +Additionally, you can forward the metrics that GraphOS collects to your APM solution. Apollo offers a [Datadog integration](/graphos/platform/insights/datadog-forwarding) to forward your graph's performance metrics to your Datadog account. @@ -53,15 +48,33 @@ If your organization doesn't currently have an Enterprise plan, you can test out +## Enabling additional runtime telemetry + +Both the GraphOS Router and Apollo Router Core run a request-handling pipeline with multiple stages that starts with receiving requests and ends with sending back responses. The continuous operation and throughput of this request pipeline, or _request lifecycle_, reflects the health of a running supergraph. Observability of the router request lifecycle is therefore key to understanding the health of a supergraph. + + + +To enable observability, the router supports telemetry that can be added and customized in different stages of the router request lifecycle. You can add logs, metrics, and traces and export them to your application performance monitoring (APM) solution. 
+ +To learn more, go to [Router Telemetry](/graphos/routing/observability/telemetry), then browse the pages in the [Router Telemetry reference docs](/graphos/reference/router/telemetry/log-exporters/overview). + + +## Next steps + + + +If you're an enterprise customer looking for more material on this topic, try the [Enterprise best practices: Supergraph observability](https://www.apollographql.com/tutorials/supergraph-observability) course on Odyssey. + +Not an enterprise customer? [Learn about GraphOS for Enterprise.](https://www.apollographql.com/pricing) -## Next steps + -- Learn about metrics collection with [GraphOS Metrics Collection](/graphos/platform/insights/sending-operation-metrics). +- Learn how to use [GraphOS Insights](/graphos/platform/insights/) to monitor and improve your graph's performance. -- Learn about subgraph observability with [Subgraph Observability](/graphos/routing/observability/subgraph-error-inclusion). +- Learn how to [configure router telemetry](/graphos/routing/observability/telemetry). -- Learn about client observability with [Client Observability](/graphos/routing/observability/client-id-enforcement/). +- Learn about [subgraph observability](/graphos/routing/observability/subgraph-error-inclusion). -- Learn how to use insights to improve your graph's performance with [GraphOS Metrics and Insights](/graphos/platform/insights/). +- Learn about [client observability](/graphos/routing/observability/debugging-client-requests). -- Learn how to use notifications with [GraphOS notifications](/graphos/platform/insights/notifications). +- Learn how to enable [GraphOS notifications](/graphos/platform/insights/notifications). 
diff --git a/docs/source/routing/observability/otel-traces-to-prometheus.mdx b/docs/source/routing/observability/otel-traces-to-prometheus.mdx index 08841ed1b0..2cb865ee8a 100644 --- a/docs/source/routing/observability/otel-traces-to-prometheus.mdx +++ b/docs/source/routing/observability/otel-traces-to-prometheus.mdx @@ -7,6 +7,8 @@ published: 2022-06-03 tags: [server, observability] redirectFrom: - /technotes/TN0003-opentelemetry-traces-to-prometheus/ +context: + - telemetry --- diff --git a/docs/source/routing/observability/otel.mdx b/docs/source/routing/observability/otel.mdx index 7c0f3585c0..7096368712 100644 --- a/docs/source/routing/observability/otel.mdx +++ b/docs/source/routing/observability/otel.mdx @@ -3,6 +3,8 @@ title: OpenTelemetry in Apollo Federation sidebar_title: OpenTelemetry subtitle: Configure your federated graph to emit logs, traces, and metrics description: Learn how to configure your federated GraphQL services to generate and process telemetry data, including logs, traces, and metrics. +context: + - telemetry --- [OpenTelemetry](https://opentelemetry.io/) is a collection of open-source tools for generating and processing telemetry data (such as logs, traces, and metrics) from different systems in a generic and consistent way. diff --git a/docs/source/routing/observability/telemetry.mdx b/docs/source/routing/observability/telemetry.mdx index 96ba1b7f37..163a740551 100644 --- a/docs/source/routing/observability/telemetry.mdx +++ b/docs/source/routing/observability/telemetry.mdx @@ -2,14 +2,25 @@ title: Router Telemetry subtitle: Collect observable data to monitor your router and supergraph description: Observe and monitor the health and performance of GraphQL operations in the Apollo GraphOS Router or Apollo Router Core by collecting and exporting telemetry logs, metrics, and traces. 
+context: + - telemetry --- import TelemetryPerformanceNote from '../../../shared/telemetry-performance.mdx'; -In this overview, learn about: -- How GraphOS Router and Apollo Router Core telemetry enable supergraph observability and debuggability -- What data is captured in the router's logs, metrics, and traces -- What exporters are available to provide telemetry to your application performance monitoring (APM) tools +Since the router is the single access point for all traffic to and from your graph, router telemetry is the most comprehensive way to observe your supergraph. By implementing telemetry, you can: + +- Monitor your supergraph's health and performance +- Diagnose issues and deduce root causes +- Optimize resource usage and system reliability + +To understand how router telemetry fits into the broader set of GraphOS observability tooling, see the [observability overview](/graphos/routing/observability). + +## How router telemetry works + +By default, the router doesn't collect or export any telemetry beyond [the operation](/graphos/platform/insights/sending-operation-metrics#from-the-apollo-router-or-apollo-server) and [field usage metrics](/graphos/platform/insights/sending-operation-metrics#from-the-apollo-router) it sends to GraphOS. You configure which additional telemetry data to collect and where to export it via your router's configuration file. + +The router request lifecycle is the primary data source for telemetry data or _signals_. Telemetry signals include _logs_, _metrics_, and _traces_. The section on [router telemetry signals](#router-telemetry-signals) explains these data types and gives basic configuration examples. _Exporters_ are responsible for sending telemetry data to your application performance monitoring (APM) and observability tools for storage, visualization, and analysis. ```mermaid flowchart LR @@ -20,139 +31,304 @@ flowchart LR end apms["APM, agent,
or collector"] - exporters--"native or
OTLP"-->apms - + exporters--"OTLP"-->apms ``` -## Observability through telemetry - -The health of your supergraph is only as good as the health of your router. Because the router is the single entry point to the supergraph, all client requests pass through the [router request lifecycle](/graphos/routing/request-lifecycle). Any issues with the router are likely to affect the handling of all requests to your supergraph. - -Diagnosing your router's health and performance requires it to show observable data about its inner workings. The more observable data you can monitor and analyze, the faster you can identify unhealthy behaviors, deduce root causes, and implement fixes. - -The router provides the necessary data to monitor its health and troubleshoot issues. The router's observability is critical for maintaining a healthy, performant supergraph and minimizing its [mean time to repair (MTTR)](https://en.wikipedia.org/wiki/Mean_time_to_repair). - -## Collect exactly the telemetry you need - -Effective telemetry provides just the right amount and granularity of information to maintain your graph. Too much data can overwhelm your system, for example, with high cardinality metrics. Too little may not provide enough information to debug issues. +### Telemetry exporters + +The router emits telemetry in the industry-standard OpenTelemetry Protocol (OTLP) format and is therefore compatible with many APM tools, including: + +- Prometheus +- OpenTelemetry Collector +- Datadog +- New Relic +- Jaeger +- Zipkin + +### Attributes and selectors + +Attributes and selectors are key-value pairs that add contextual information from the router request lifecycle to telemetry data. You can use attributes and selectors to annotate events, metrics, and spans so they can help you filter and group data in your APMs. + +The router supports a set of standard attributes from [OpenTelemetry semantic conventions](https://opentelemetry.io/docs/specs/semconv/). 
Example attributes include: + +- HTTP status code +- GraphQL operation name +- Subgraph name + +Selectors allow you to define custom data points based on the router's request lifecycle. + +| | Description | +| ----- | ----- | +| **Attribute** | Standard data points that can be attached to spans, instruments, and events. | +| **Selector** | Custom data points extracted from the router's request lifecycle, tailored to specific needs. | + +## Router telemetry signals + +The router supports three signal types for collecting and exporting telemetry: + + + + + + + + + + + + + + + + + + + + + + +
SignalDescription
Logs and events +
    +
  • Capture and export logs in text or JSON format.
  • +
  • Trigger custom events to log critical actions during the router request lifecycle.
  • +
+
Metrics and instruments +
    +
  • Export standard metrics for router operations.
  • +
  • Leverage OpenTelemetry (OTEL) metrics to capture HTTP lifecycle data.
  • +
  • Define custom metrics using attributes and selectors.
  • +
+
Traces and spans +
    +
  • Export traces of router transactions.
  • +
  • Use spans to monitor specific actions within traces and attach attributes or selectors for deeper insights.
  • +
+
+ + +These mechanisms let you collect data about the inner workings of your router and graph and export them accordingly. -Specific events that need to be captured—and the conditions under which they need to be captured—can change as client applications and graphs change. Different environments, such as production and development, can have different observability requirements. - -Router telemetry is customizable to meet the observability needs of different graphs. You can record custom events in different stages of the router request lifecycle and create custom contexts with [attributes](#router-telemetry-attributes) to track a request or response as it flows through the router. You can shape the volume and rate of emitted telemetry, for example, with batched telemetry. +### Logs and events -## Router telemetry types +Logs record events in the router's request lifecycle. Examples of logged events include: -The router collects different types of telemetry, including: +- Information about the router lifecycle +- Warnings about misconfiguration +- Errors that occurred during a request -* [Logs and events](#logs-and-events) -* [Metrics and instruments](#metrics-and-instruments) -* [Traces and spans](#traces-and-spans) +#### Log exporters -These let you collect data about the inner workings of your router and export logs, metrics, and traces to your application performance monitoring (APM) and observability tools. +You can log events to standard output in either text or JSON format. Logs can also be consumed by [logging exporters](/router/configuration/telemetry/exporters/logging/overview) and as part of [spans](/graphos/routing/observability/telemetry#traces-and-spans) via [tracing exporters](/router/configuration/telemetry/exporters/tracing/overview). ```mermaid flowchart LR - subgraph Router - lifecycle("Request Lifecycle
(telemetry sources)") - logs_exporter("Logs
Exporter") - terminal(stdout) - metrics_exporter("Metrics
Exporter") - traces_exporter("Traces
Exporter") - prometheus("Prometheus
Endpoint") - - lifecycle-->logs_exporter-->terminal - lifecycle--->metrics_exporter-->prometheus - lifecycle--->traces_exporter - end - - otlp_apm["OTLP-enabled APM
(e.g. New Relic)"] - zipkin[Zipkin] - datadog[Datadog agent] - apm1[APM] - collector("OpenTelemetry
Collector") - jaeger("Jaeger
(agent or collector)") + Router --"Emits logs in
text or JSON format"--> stdout + stdout --"Exports logs"--> log_store + log_store[("Log store")] +``` +#### Example log configuration - metrics_exporter--"OTLP"--->otlp_apm - metrics_exporter--"OTLP"--->collector - metrics_exporter--"OTLP"--->datadog - traces_exporter--"OTLP"--->jaeger - traces_exporter--"native or
OTLP"-->datadog - traces_exporter--"native"-->jaeger - traces_exporter--"native"--->zipkin - prometheus<--"scrapes"-->apm1 +This configuration snippet enables stdout logging in JSON: +```yaml title="router.yaml" +telemetry: + exporters: + logging: + stdout: + enabled: true + format: json ``` -### Logs and events - -Logs record **events** in the router. Examples of logged events include: +### Metrics and instruments -* Information about the router lifecycle -* Warnings about misconfiguration -* Errors that occurred during a request +Metrics are measurements of the router's behavior that are collected and often analyzed over time to identify trends. Examples of router metrics include the number of incoming HTTP requests and the time spent processing a request. + +Instruments define _how_ to collect and report metrics. Different kinds of instruments include counters, gauges, and histograms. For example, given the metric "number of incoming HTTP requests," a counter records the total number of requests, a histogram captures the distribution of request counts over time, and a gauge provides a snapshot of the current request count at a given moment. + +#### Instrument types + +Metric instruments fall into three categories: + + + + + + + + + + + + + + + + + + + + + + +
Instrument TypeDescription
OTEL instruments + Standard OpenTelemetry instruments around the HTTP lifecycle, including: +
    +
  • The number of HTTP requests by HTTP status
  • +
  • A histogram of HTTP router request duration
  • +
  • The number of active requests in flight
  • +
  • A histogram of request body sizes
  • +
+
Router instruments + Standard instruments for the router request lifecycle, including:
    +
  • Count of GraphQL errors in responses
  • +
  • Time spent loading the schema in seconds
  • +
  • Number of entries in the router's cache
  • +
  • Time spent warming up the query planner queries in seconds
  • +
+
Custom instruments + Custom instruments defined in the router request lifecycle.
+ +#### Example instrument configuration + +This configuration snippet enables OTEL instrumentation for a histogram of request body sizes: + +```yaml title="router.yaml" +telemetry: + instrumentation: + instruments: + router: + http.server.request.body.size: true +``` -Logs can be consumed by [logging exporters](/router/configuration/telemetry/exporters/logging/overview) and as part of [spans](#traces-and-spans) via [tracing exporters](/router/configuration/telemetry/exporters/tracing/overview). +See [Instruments](/router/configuration/telemetry/instrumentation/instruments) for an overview of available instruments and a guide for configuring and customizing instruments. -### Metrics and instruments +#### Metric exporters -Metrics are measurements of the router's behavior that can be exported and monitored. Different kinds of metrics include histograms, gauges, and counts. +In addition to the [operation metrics](/graphos/platform/insights/sending-operation-metrics#from-the-apollo-router-or-apollo-server) and [field usage metrics](/graphos/platform/insights/sending-operation-metrics#from-the-apollo-router) that GraphOS Router sends to GraphOS, you can configure the router with metric exporters for other observability tools and APMs. -Metrics can be consumed by _exporters_. See [Metrics exporters](/router/configuration/telemetry/exporters/metrics/overview) for an overview of supported exporters. +```mermaid +flowchart LR + Router --"OTEL
metrics"--> APM + Router --"Usage/Performance
metrics"--> GraphOS +``` -An individual metric is called an _instrument_. Example instruments of the router include: +This configuration snippet enables exporting metrics to Prometheus: -* Number of received requests -* Histogram of request durations -* Number of in-flight requests +```yaml title="router.yaml" +telemetry: + exporters: + metrics: + prometheus: + enabled: true + listen: 127.0.0.1:9090 + path: /metrics +``` -See [Instruments](/router/configuration/telemetry/instrumentation/instruments) for an overview of available instruments and a guide for configuring and customizing instruments. +Learn more about [sending metrics to Prometheus](/graphos/reference/router/telemetry/metrics-exporters/prometheus) and [metric exporters](/graphos/reference/router/telemetry/metrics-exporters/overview) in general. ### Traces and spans -Traces monitor the flow of a request through the router. A trace is composed of [**spans**](/router/configuration/telemetry/instrumentation/spans). A span captures a request's duration as it flows through the router request lifecycle. Spans may include contextual information about the request, such as the HTTP status code, or the name of the subgraph being queried. +Traces help you monitor the flow of a request through the router. A trace is composed of [spans](/router/configuration/telemetry/instrumentation/spans). A span captures a request's duration as it flows through the router request lifecycle. Spans may include contextual information about the request, such as the HTTP status code or the name of the subgraph being queried. Examples of spans include: -* `router` - Wraps an entire request from the HTTP perspective -* `supergraph` - Wraps a request once GraphQL parsing has taken place -* `subgraph` - Wraps a request to a subgraph. - -Traces are consumed via [tracing exporters](/router/configuration/telemetry/exporters/tracing/overview). 
+- `router` - Wraps an entire request from the HTTP perspective +- `supergraph` - Wraps a request once GraphQL parsing has taken place +- `subgraph` - Wraps a request to a subgraph -## Router telemetry exporters +#### Tracing exporters -The router exports its collected telemetry in formats compatible with industry-standard APM tools. The router supports logging, metrics, and tracing exporters for a variety of tools, including: +If you've enabled federated tracing (also known as FTV1 tracing) in your subgraph libraries, the router [sends field-level traces to GraphOS](/graphos/routing/graphos-reporting#reporting-field-level-traces). Additionally, trace exporters can consume and report traces to your APM. -* Prometheus -* OpenTelemetry Collector -* Datadog -* New Relic -* Jaeger -* Zipkin +```mermaid +flowchart LR + Router --"OTEL
traces"--> APM + Router --"FTV1 Data"--> GraphOS +``` -For more information, see [logging exporters](/router/configuration/telemetry/exporters/logging/overview), [metrics exporters](/router/configuration/telemetry/exporters/metrics/overview), and [tracing exporters](/router/configuration/telemetry/exporters/tracing/overview). +This configuration snippet enables +- setting attributes that Datadog uses to organize its APM view +- exporting traces to a Datadog agent: + +```yaml title="router.yaml" +telemetry: + instrumentation: + spans: + mode: spec_compliant + router: + attributes: + otel.name: router + operation.name: "router" + resource.name: + request_method: true + supergraph: + attributes: + otel.name: supergraph + operation.name: "supergraph" + resource.name: + operation_name: string + subgraph: + attributes: + otel.name: subgraph + operation.name: "subgraph" + resource.name: + subgraph_operation_name: string + exporters: + tracing: + otlp: + enabled: true + endpoint: "${env.DATADOG_AGENT_HOST}:4317" +``` -## Router telemetry attributes +Learn more about [sending traces to Datadog](/graphos/reference/router/telemetry/trace-exporters/datadog) and [trace exporters](/graphos/reference/router/telemetry/trace-exporters/overview) in general. -You can annotate events, metrics, and spans with **attributes**. Attributes are key-value pairs that add contextual information about the router pipeline to telemetry. You can then use these attributes to filter and group data in your APMs. +## Best practices -Example attributes include: +### Collecting exactly the telemetry you need -* HTTP status code -* GraphQL operation name -* Subgraph name +Effective telemetry provides just the right amount and granularity of information to maintain your graph. Too much data can overwhelm your system, for example, with high cardinality metrics. Too little may not provide enough information to debug issues.
-You can use [standard attributes](/router/configuration/telemetry/instrumentation/standard-attributes) or [selectors](/router/configuration/telemetry/instrumentation/selectors) as span attributes. +Specific events that need to be captured—and the conditions under which they need to be captured—can change as client applications and graphs change. Different environments, such as production and development, can have different observability requirements. - +Router telemetry is customizable to meet the observability needs of different graphs. Keep in mind your particular environments' and graphs' requirements when configuring your telemetry. -[Custom attributes for spans](/router/configuration/telemetry/instrumentation/spans/#attributes) require a GraphOS [Dedicated or Enterprise plan](https://www.apollographql.com/pricing#observability). +#### Setting conditions for collecting telemetry - +You can set [conditions](/graphos/reference/router/telemetry/instrumentation/conditions) for instruments and events to only collect telemetry data when necessary. This configuration snippet collects the configured telemetry data only when the `x-req-header` request header is equal to "example-value": -## Best practices +```yaml +eq: + - "example-value" + - request_header: x-req-header +``` +#### Dropping metrics using views + +You can use metric exporters' [`views`](/graphos/reference/router/telemetry/metrics-exporters/overview#views) property with the `drop` aggregation to prevent certain metrics from being sent to your APM.
This configuration snippet removes all instruments that begin with `apollo_router`: + +```yaml title="router.yaml" +telemetry: + exporters: + metrics: + common: + service_name: apollo-router + views: + - name: apollo_router* + aggregation: drop +``` ### Balancing telemetry and router performance + +## Next steps + +Consult the following documentation for details on how to configure the various telemetry mechanisms and exporters: + +- [Log Exporters Overview](/graphos/reference/router/telemetry/log-exporters/overview) +- [Trace Exporters Overview](/graphos/reference/router/telemetry/trace-exporters/overview) +- [Metrics Exporters Overview](/graphos/reference/router/telemetry/metrics-exporters/overview) +- [Attributes and Selectors](/graphos/reference/router/telemetry/instrumentation/selectors) +- [Conditions](/graphos/reference/router/telemetry/instrumentation/conditions)