From 9471770c9039a72de484cbfc11c357c74f4dc8cd Mon Sep 17 00:00:00 2001
From: heydbut
Date: Thu, 25 May 2023 15:34:50 -0400
Subject: [PATCH] Add envoy dashboard

---
Review notes (this area is ignored when the patch is applied and is not part
of the commit message):

* "Listeners Success Rate (Excluding Admin Interface)": the denominator is now
  the unfiltered envoy_http_downstream_rq_completed rate. It was previously
  filtered to envoy_response_code_class == 2, which made the ratio
  (non-4xx/5xx) / (2xx only) instead of a success percentage, and disagreed
  with the "Upstream Response Nxx (% Breakdown)" charts in the same file,
  which all divide by the unfiltered *_rq_completed rate.
* collector-dashboards/otel-collector-envoy-dashboard/main.tf now ends with a
  newline. The blob id on its "index" line predates these two content changes.
* NOTE(review): besides the Envoy module, this patch also wires
  otel-collector-hadoop-dashboard into README.md, main.tf and outputs.tf, but
  it does not create that module directory. Confirm it already exists on the
  target branch; otherwise the root module references a missing source.
* NOTE(review): the chart "Incoming Requests by Release" uses `group_by []`,
  so nothing is broken down by release. Confirm the intended grouping label
  or rename the chart.

 README.md                                     |   2 +
 .../otel-collector-envoy-dashboard/main.tf    | 373 ++++++++++++++++++
 .../otel-collector-envoy-dashboard/outputs.tf |   4 +
 .../variables.tf                              |   4 +
 main.tf                                       |  10 +
 outputs.tf                                    |  10 +
 6 files changed, 403 insertions(+)
 create mode 100644 collector-dashboards/otel-collector-envoy-dashboard/main.tf
 create mode 100644 collector-dashboards/otel-collector-envoy-dashboard/outputs.tf
 create mode 100644 collector-dashboards/otel-collector-envoy-dashboard/variables.tf

diff --git a/README.md b/README.md
index 738a994..6f7d19e 100644
--- a/README.md
+++ b/README.md
@@ -28,9 +28,11 @@ Each resource has an associated module that will create Lightstep dashboards to
 * __OpenTelemetry CouchDB Receiver__ (module: [`otel-collector-couchdbreceiver-dashboard`](https://github.com/lightstep/terraform-opentelemetry-dashboards/tree/main/collector-dashboards/otel-collector-couchdbreceiver-dashboard))
 * __OpenTelemetry Collector__ (module: [`otel-collector-dashboard`](https://github.com/lightstep/terraform-opentelemetry-dashboards/tree/main/collector-dashboards/otel-collector-dashboard))
 * __OpenTelemetry elasticsearchreceiver Integration__ (module: [`otel-collector-elasticsearchreceiver-dashboard`](https://github.com/lightstep/terraform-opentelemetry-dashboards/tree/main/collector-dashboards/otel-collector-elasticsearchreceiver-dashboard))
+* __Envoy - Overview__ (module: [`otel-collector-envoy-dashboard`](https://github.com/lightstep/terraform-opentelemetry-dashboards/tree/main/collector-dashboards/otel-collector-envoy-dashboard))
 * __Flink - Overview__ (module: [`otel-collector-flink-dashboard`](https://github.com/lightstep/terraform-opentelemetry-dashboards/tree/main/collector-dashboards/otel-collector-flink-dashboard))
 * __Fluentd Records__ (module: [`otel-collector-fluentd-dashboard`](https://github.com/lightstep/terraform-opentelemetry-dashboards/tree/main/collector-dashboards/otel-collector-fluentd-dashboard))
 * __Gunicorn - Overview__ (module: [`otel-collector-gunicorn-dashboard`](https://github.com/lightstep/terraform-opentelemetry-dashboards/tree/main/collector-dashboards/otel-collector-gunicorn-dashboard))
+* __OpenTelemetry Hadoop Dashboard__ (module: [`otel-collector-hadoop-dashboard`](https://github.com/lightstep/terraform-opentelemetry-dashboards/tree/main/collector-dashboards/otel-collector-hadoop-dashboard))
 * __OpenTelemetry HBase Dashboard__ (module: [`otel-collector-hbase-dashboard`](https://github.com/lightstep/terraform-opentelemetry-dashboards/tree/main/collector-dashboards/otel-collector-hbase-dashboard))
 * __OpenTelemetry / Host__ (module: [`otel-collector-host-dashboard`](https://github.com/lightstep/terraform-opentelemetry-dashboards/tree/main/collector-dashboards/otel-collector-host-dashboard))
 * __OpenTelemetry / Host Metrics / CPU__ (module: [`otel-collector-hostmetrics-cpu-dashboard`](https://github.com/lightstep/terraform-opentelemetry-dashboards/tree/main/collector-dashboards/otel-collector-hostmetrics-cpu-dashboard))
diff --git a/collector-dashboards/otel-collector-envoy-dashboard/main.tf b/collector-dashboards/otel-collector-envoy-dashboard/main.tf
new file mode 100644
index 0000000..f85f310
--- /dev/null
+++ b/collector-dashboards/otel-collector-envoy-dashboard/main.tf
@@ -0,0 +1,373 @@
+terraform {
+  required_providers {
+    lightstep = {
+      source  = "lightstep/lightstep"
+      version = "~> 1.76.0"
+    }
+  }
+  required_version = ">= v1.0.11"
+}
+
+resource "lightstep_dashboard" "otel_collector_envoy_dashboard" {
+  project_name          = var.lightstep_project
+  dashboard_name        = "Envoy - Overview"
+  dashboard_description = "This dashboard provides a high-level overview of your Envoy cluster so you can monitor its performance and resource usage."
+
+  group {
+    rank            = 3
+    title           = "Listeners"
+    visibility_type = "explicit"
+
+    chart {
+      name   = "Listeners Success Rate (Excluding Admin Interface)"
+      type   = "timeseries"
+      rank   = 0
+      x_pos  = 0
+      y_pos  = 0
+      width  = 16
+      height = 8
+
+      query {
+        query_name   = "((a/b)*100)"
+        display      = "line"
+        hidden       = false
+        query_string = "with\n a = metric envoy_http_downstream_rq_xx | filter ((((envoy_response_code_class != \"4\") && (envoy_response_code_class != 4)) && (envoy_response_code_class != 4.0)) && (((envoy_response_code_class != \"5\") && (envoy_response_code_class != 5)) && (envoy_response_code_class != 5.0))) | rate | group_by [], sum;\n b = metric envoy_http_downstream_rq_completed | rate | group_by [], sum;\njoin (((a / b)*100)), a=0, b=0"
+      }
+    }
+    chart {
+      name   = "Listeners Response Time Percentiles"
+      type   = "timeseries"
+      rank   = 1
+      x_pos  = 16
+      y_pos  = 0
+      width  = 16
+      height = 8
+
+      query {
+        query_name   = "a"
+        display      = "line"
+        hidden       = false
+        query_string = "metric envoy_http_downstream_rq_time | delta | group_by [], sum | point percentile(value, 50.0), percentile(value, 95.0), percentile(value, 99.0), percentile(value, 99.9)"
+      }
+    }
+    chart {
+      name   = "Listener Traffic"
+      type   = "timeseries"
+      rank   = 2
+      x_pos  = 32
+      y_pos  = 0
+      width  = 16
+      height = 8
+
+      query {
+        query_name   = "a"
+        display      = "line"
+        hidden       = false
+        query_string = "metric envoy_http_downstream_rq_completed | rate | group_by [], sum"
+      }
+      query {
+        query_name   = "b"
+        display      = "line"
+        hidden       = false
+        query_string = "metric envoy_http_downstream_cx_tx_bytes_total | rate | group_by [], sum"
+      }
+    }
+    chart {
+      name   = "Requests Rejected By Reason"
+      type   = "timeseries"
+      rank   = 3
+      x_pos  = 0
+      y_pos  = 8
+      width  = 16
+      height = 8
+
+      query {
+        query_name   = "a"
+        display      = "line"
+        hidden       = false
+        query_string = "metric envoy_http_no_route | rate | group_by [], sum"
+      }
+      query {
+        query_name   = "b"
+        display      = "line"
+        hidden       = false
+        query_string = "metric envoy_http_no_cluster | rate | group_by [], sum"
+      }
+      query {
+        query_name   = "c"
+        display      = "line"
+        hidden       = false
+        query_string = "metric envoy_http_downstream_cx_protocol_error | rate | group_by [], sum"
+      }
+    }
+    chart {
+      name   = "Active Connections Per Type and Listener"
+      type   = "timeseries"
+      rank   = 4
+      x_pos  = 16
+      y_pos  = 8
+      width  = 16
+      height = 8
+
+      query {
+        query_name   = "a"
+        display      = "line"
+        hidden       = false
+        query_string = "metric envoy_http_downstream_cx_active | latest | group_by [], sum"
+      }
+      query {
+        query_name   = "b"
+        display      = "line"
+        hidden       = false
+        query_string = "metric envoy_http_downstream_cx_http2_active | latest | group_by [], sum"
+      }
+      query {
+        query_name   = "c"
+        display      = "line"
+        hidden       = false
+        query_string = "metric envoy_http_downstream_cx_http1_active | latest | group_by [], sum"
+      }
+    }
+  }
+  group {
+    rank            = 0
+    title           = ""
+    visibility_type = "implicit"
+  }
+  group {
+    rank            = 1
+    title           = "Overview"
+    visibility_type = "explicit"
+
+    chart {
+      name   = "Incoming Success Rate (Non-5xx Responses)"
+      type   = "timeseries"
+      rank   = 0
+      x_pos  = 0
+      y_pos  = 0
+      width  = 16
+      height = 8
+
+      query {
+        query_name   = "((1-(a/b))*100)"
+        display      = "big_number"
+        hidden       = false
+        query_string = "with\n a = metric envoy_cluster_upstream_rq_xx | filter (((envoy_response_code_class == \"5\") || (envoy_response_code_class == 5)) || (envoy_response_code_class == 5.0)) | rate 5m, 5m | group_by [], mean;\n b = metric envoy_cluster_upstream_rq_completed | rate 5m, 5m | group_by [], mean;\njoin (((1-(a / b))*100)), a=0, b=0 | reduce 5m, mean"
+      }
+    }
+    chart {
+      name   = "Incoming Requests Volume"
+      type   = "timeseries"
+      rank   = 1
+      x_pos  = 16
+      y_pos  = 0
+      width  = 16
+      height = 8
+
+      query {
+        query_name   = "a"
+        display      = "big_number"
+        hidden       = false
+        query_string = "metric envoy_cluster_upstream_rq_xx | rate 5m | group_by [], mean"
+      }
+    }
+    chart {
+      name   = "Incoming Requests by Release"
+      type   = "timeseries"
+      rank   = 2
+      x_pos  = 32
+      y_pos  = 0
+      width  = 16
+      height = 8
+
+      query {
+        query_name   = "a"
+        display      = "big_number"
+        hidden       = false
+        query_string = "metric envoy_cluster_upstream_rq_total | rate 5m | group_by [], mean"
+      }
+    }
+  }
+  group {
+    rank            = 2
+    title           = "Upstream Clusters"
+    visibility_type = "explicit"
+
+    chart {
+      name   = "Upstream Response 2xx (% Breakdown)"
+      type   = "timeseries"
+      rank   = 0
+      x_pos  = 0
+      y_pos  = 0
+      width  = 16
+      height = 8
+
+      query {
+        query_name   = "((a/b)*100)"
+        display      = "bar"
+        hidden       = false
+        query_string = "with\n a = metric envoy_cluster_upstream_rq_xx | filter (((envoy_response_code_class == \"2\") || (envoy_response_code_class == 2)) || (envoy_response_code_class == 2.0)) | rate | group_by [], sum;\n b = metric envoy_cluster_upstream_rq_completed | rate | group_by [], sum;\njoin (((a / b)*100)), a=0, b=0"
+      }
+    }
+    chart {
+      name   = "Upstream Response 3xx (% Breakdown)"
+      type   = "timeseries"
+      rank   = 1
+      x_pos  = 16
+      y_pos  = 0
+      width  = 16
+      height = 8
+
+      query {
+        query_name   = "((a/b)*100)"
+        display      = "bar"
+        hidden       = false
+        query_string = "with\n a = metric envoy_cluster_upstream_rq_xx | filter (((envoy_response_code_class == \"3\") || (envoy_response_code_class == 3)) || (envoy_response_code_class == 3.0)) | rate | group_by [], sum;\n b = metric envoy_cluster_upstream_rq_completed | rate | group_by [], sum;\njoin (((a / b)*100)), a=0, b=0"
+      }
+    }
+    chart {
+      name   = "Upstream Response 4xx (% Breakdown)"
+      type   = "timeseries"
+      rank   = 2
+      x_pos  = 32
+      y_pos  = 0
+      width  = 16
+      height = 8
+
+      query {
+        query_name   = "((a/b)*100)"
+        display      = "bar"
+        hidden       = false
+        query_string = "with\n a = metric envoy_cluster_upstream_rq_xx | filter (((envoy_response_code_class == \"4\") || (envoy_response_code_class == 4)) || (envoy_response_code_class == 4.0)) | rate | group_by [], sum;\n b = metric envoy_cluster_upstream_rq_completed | rate | group_by [], sum;\njoin (((a / b)*100)), a=0, b=0"
+      }
+    }
+    chart {
+      name   = "Upstream Response 5xx (% Breakdown)"
+      type   = "timeseries"
+      rank   = 3
+      x_pos  = 0
+      y_pos  = 8
+      width  = 16
+      height = 8
+
+      query {
+        query_name   = "((a/b)*100)"
+        display      = "bar"
+        hidden       = false
+        query_string = "with\n a = metric envoy_cluster_upstream_rq_xx | filter (((envoy_response_code_class == \"5\") || (envoy_response_code_class == 5)) || (envoy_response_code_class == 5.0)) | rate | group_by [], sum;\n b = metric envoy_cluster_upstream_rq_completed | rate | group_by [], sum;\njoin (((a / b)*100)), a=0, b=0"
+      }
+    }
+    chart {
+      name   = "Upstream Response 2xx (Total Breakdown)"
+      type   = "timeseries"
+      rank   = 4
+      x_pos  = 16
+      y_pos  = 8
+      width  = 16
+      height = 8
+
+      query {
+        query_name   = "a"
+        display      = "bar"
+        hidden       = false
+        query_string = "metric envoy_cluster_upstream_rq_xx | filter (((envoy_response_code_class == \"2\") || (envoy_response_code_class == 2)) || (envoy_response_code_class == 2.0)) | rate | group_by [], sum"
+      }
+    }
+    chart {
+      name   = "Upstream Response 3xx (Total Breakdown)"
+      type   = "timeseries"
+      rank   = 5
+      x_pos  = 32
+      y_pos  = 8
+      width  = 16
+      height = 8
+
+      query {
+        query_name   = "a"
+        display      = "bar"
+        hidden       = false
+        query_string = "metric envoy_cluster_upstream_rq_xx | filter (((envoy_response_code_class == \"3\") || (envoy_response_code_class == 3)) || (envoy_response_code_class == 3.0)) | rate | group_by [], sum"
+      }
+    }
+    chart {
+      name   = "Upstream Response 4xx (Total Breakdown)"
+      type   = "timeseries"
+      rank   = 6
+      x_pos  = 0
+      y_pos  = 16
+      width  = 16
+      height = 8
+
+      query {
+        query_name   = "a"
+        display      = "bar"
+        hidden       = false
+        query_string = "metric envoy_cluster_upstream_rq_xx | filter (((envoy_response_code_class == \"4\") || (envoy_response_code_class == 4)) || (envoy_response_code_class == 4.0)) | rate | group_by [], sum"
+      }
+    }
+    chart {
+      name   = "Upstream Response 5xx (Total Breakdown)"
+      type   = "timeseries"
+      rank   = 7
+      x_pos  = 16
+      y_pos  = 16
+      width  = 16
+      height = 8
+
+      query {
+        query_name   = "a"
+        display      = "bar"
+        hidden       = false
+        query_string = "metric envoy_cluster_upstream_rq_xx | filter (((envoy_response_code_class == \"5\") || (envoy_response_code_class == 5)) || (envoy_response_code_class == 5.0)) | rate | group_by [], sum"
+      }
+    }
+    chart {
+      name   = "Upstream p99 Response Time"
+      type   = "timeseries"
+      rank   = 8
+      x_pos  = 32
+      y_pos  = 16
+      width  = 16
+      height = 8
+
+      query {
+        query_name   = "a"
+        display      = "line"
+        hidden       = false
+        query_string = "metric envoy_cluster_upstream_rq_time | delta | group_by [], sum | point percentile(value, 99.0)"
+      }
+    }
+    chart {
+      name   = "Average Upstream Traffic Rate"
+      type   = "timeseries"
+      rank   = 9
+      x_pos  = 0
+      y_pos  = 24
+      width  = 16
+      height = 8
+
+      query {
+        query_name   = "a"
+        display      = "line"
+        hidden       = false
+        query_string = "metric envoy_cluster_upstream_rq_total | rate | group_by [], sum"
+      }
+    }
+    chart {
+      name   = "Cluster Load Balancer Panics"
+      type   = "timeseries"
+      rank   = 10
+      x_pos  = 16
+      y_pos  = 24
+      width  = 16
+      height = 8
+
+      query {
+        query_name   = "a"
+        display      = "line"
+        hidden       = false
+        query_string = "metric envoy_cluster_lb_healthy_panic | rate | group_by [], sum"
+      }
+    }
+  }
+}
diff --git a/collector-dashboards/otel-collector-envoy-dashboard/outputs.tf b/collector-dashboards/otel-collector-envoy-dashboard/outputs.tf
new file mode 100644
index 0000000..b3dd109
--- /dev/null
+++ b/collector-dashboards/otel-collector-envoy-dashboard/outputs.tf
@@ -0,0 +1,4 @@
+output "dashboard_url" {
+  value       = "https://app.lightstep.com/${var.lightstep_project}/dashboard/${lightstep_dashboard.otel_collector_envoy_dashboard.id}"
+  description = "OpenTelemetry Collector Envoy Metrics Dashboard URL"
+}
diff --git a/collector-dashboards/otel-collector-envoy-dashboard/variables.tf b/collector-dashboards/otel-collector-envoy-dashboard/variables.tf
new file mode 100644
index 0000000..21ee69f
--- /dev/null
+++ b/collector-dashboards/otel-collector-envoy-dashboard/variables.tf
@@ -0,0 +1,4 @@
+variable "lightstep_project" {
+  description = "Name of Lightstep project"
+  type        = string
+}
diff --git a/main.tf b/main.tf
index b1ad40d..c081289 100644
--- a/main.tf
+++ b/main.tf
@@ -74,6 +74,11 @@ module "lightstep_otel_collector_elasticsearchreceiver_dashboard" {
   lightstep_project = var.lightstep_project
 }
 
+module "lightstep_otel_collector_envoy_dashboard" {
+  source            = "./collector-dashboards/otel-collector-envoy-dashboard"
+  lightstep_project = var.lightstep_project
+}
+
 module "lightstep_otel_collector_flink_dashboard" {
   source            = "./collector-dashboards/otel-collector-flink-dashboard"
   lightstep_project = var.lightstep_project
@@ -89,6 +94,11 @@ module "lightstep_otel_collector_gunicorn_dashboard" {
   lightstep_project = var.lightstep_project
 }
 
+module "lightstep_otel_collector_hadoop_dashboard" {
+  source            = "./collector-dashboards/otel-collector-hadoop-dashboard"
+  lightstep_project = var.lightstep_project
+}
+
 module "lightstep_otel_collector_hbase_dashboard" {
   source            = "./collector-dashboards/otel-collector-hbase-dashboard"
   lightstep_project = var.lightstep_project
diff --git a/outputs.tf b/outputs.tf
index 2b8b900..a698c7f 100644
--- a/outputs.tf
+++ b/outputs.tf
@@ -59,6 +59,11 @@ output "lightstep_otel_collector_elasticsearchreceiver_dashboard_url" {
   description = "Lightstep OpenTelemetry OpenTelemetry elasticsearchreceiver Integration Dashboard"
 }
 
+output "lightstep_otel_collector_envoy_dashboard_url" {
+  value       = module.lightstep_otel_collector_envoy_dashboard.dashboard_url
+  description = "Lightstep OpenTelemetry Envoy - Overview Dashboard"
+}
+
 output "lightstep_otel_collector_flink_dashboard_url" {
   value       = module.lightstep_otel_collector_flink_dashboard.dashboard_url
   description = "Lightstep OpenTelemetry Flink - Overview Dashboard"
@@ -74,6 +79,11 @@ output "lightstep_otel_collector_gunicorn_dashboard_url" {
   description = "Lightstep OpenTelemetry Gunicorn - Overview Dashboard"
 }
 
+output "lightstep_otel_collector_hadoop_dashboard_url" {
+  value       = module.lightstep_otel_collector_hadoop_dashboard.dashboard_url
+  description = "Lightstep OpenTelemetry OpenTelemetry Hadoop Dashboard Dashboard"
+}
+
 output "lightstep_otel_collector_hbase_dashboard_url" {
   value       = module.lightstep_otel_collector_hbase_dashboard.dashboard_url
   description = "Lightstep OpenTelemetry OpenTelemetry HBase Dashboard Dashboard"