diff --git a/charts/grafana-sampling/.helmignore b/charts/grafana-sampling/.helmignore new file mode 100644 index 0000000000..0e8a0eb36f --- /dev/null +++ b/charts/grafana-sampling/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/charts/grafana-sampling/Chart.lock b/charts/grafana-sampling/Chart.lock new file mode 100644 index 0000000000..a54564640a --- /dev/null +++ b/charts/grafana-sampling/Chart.lock @@ -0,0 +1,9 @@ +dependencies: +- name: grafana-agent + repository: https://grafana.github.io/helm-charts + version: 0.36.0 +- name: grafana-agent + repository: https://grafana.github.io/helm-charts + version: 0.36.0 +digest: sha256:6d04a55dce2c09c4c250c6453e0d58f7280750bf04fce51027b4e235062413e5 +generated: "2024-03-11T15:41:30.921516-07:00" diff --git a/charts/grafana-sampling/Chart.yaml b/charts/grafana-sampling/Chart.yaml new file mode 100644 index 0000000000..44e076b128 --- /dev/null +++ b/charts/grafana-sampling/Chart.yaml @@ -0,0 +1,18 @@ +apiVersion: v2 +name: grafana-sampling +description: A Helm chart for a layered OTLP tail sampling and metrics generation pipeline. 
+type: application +version: 0.1.0 +appVersion: "v0.40.2" +sources: + - https://github.com/grafana/agent + - https://grafana.com/docs/grafana-cloud/monitor-applications/application-observability/setup/sampling/tail/ +dependencies: + - name: grafana-agent + version: 0.36.0 + repository: https://grafana.github.io/helm-charts + alias: grafana-agent-deployment + - name: grafana-agent + version: 0.36.0 + repository: https://grafana.github.io/helm-charts + alias: grafana-agent-statefulset diff --git a/charts/grafana-sampling/README.md b/charts/grafana-sampling/README.md new file mode 100644 index 0000000000..6ad9785b72 --- /dev/null +++ b/charts/grafana-sampling/README.md @@ -0,0 +1,124 @@ +# grafana-sampling + +![Version: 0.1.0](https://img.shields.io/badge/Version-0.1.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: v0.40.2](https://img.shields.io/badge/AppVersion-v0.40.2-informational?style=flat-square) + +A Helm chart for a layered OTLP tail sampling and metrics generation pipeline. + +This chart deploys the following architecture to your environment: +![Photo of sampling architecture](./sampling-architecture.png) + +Note: by default, only OTLP traces are accepted at the load balancing layer. + +## Chart Repo + +Add the following repo to use the chart: + +```console +helm repo add grafana https://grafana.github.io/helm-charts +``` +## Installing the Chart + +Use the following command to install the chart with the release name `my-release`. Make sure to populate the required values. 
+ +```console +helm install my-release grafana/grafana-sampling --values - <<EOF +grafana-agent-statefulset: + agent: + extraEnv: + - name: GRAFANA_CLOUD_API_KEY + value: <REQUIRED> + - name: GRAFANA_CLOUD_PROMETHEUS_URL + value: <REQUIRED> + - name: GRAFANA_CLOUD_PROMETHEUS_USERNAME + value: <REQUIRED> + - name: GRAFANA_CLOUD_TEMPO_ENDPOINT + value: <REQUIRED> + - name: GRAFANA_CLOUD_TEMPO_USERNAME + value: <REQUIRED> + # This is required for adaptive metric deduplication in Grafana Cloud + - name: POD_UID + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.uid +EOF +``` + +## Uninstalling the Chart + +To uninstall/delete the my-release deployment: + +```console +helm delete my-release +``` + +The command removes all the Kubernetes components associated with the chart and deletes the release. + +## Upgrading + +A major chart version change indicates that there is an incompatible breaking change needing manual actions. + +## Values + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| grafana-agent-deployment.agent.configMap.create | bool | `false` | | +| grafana-agent-deployment.agent.extraPorts[0].name | string | `"otlp-grpc"` | | +| grafana-agent-deployment.agent.extraPorts[0].port | int | `4317` | | +| grafana-agent-deployment.agent.extraPorts[0].protocol | string | `"TCP"` | | +| grafana-agent-deployment.agent.extraPorts[0].targetPort | int | `4317` | | +| grafana-agent-deployment.agent.extraPorts[1].name | string | `"otlp-http"` | | +| grafana-agent-deployment.agent.extraPorts[1].port | int | `4318` | | +| grafana-agent-deployment.agent.extraPorts[1].protocol | string | `"TCP"` | | +| grafana-agent-deployment.agent.extraPorts[1].targetPort | int | `4318` | | +| grafana-agent-deployment.agent.resources.requests.cpu | string | `"1"` | | +| grafana-agent-deployment.agent.resources.requests.memory | string | `"2G"` | | +| grafana-agent-deployment.controller.autoscaling.enabled | bool | `false` | Creates a HorizontalPodAutoscaler for controller type deployment. 
| +| grafana-agent-deployment.controller.autoscaling.maxReplicas | int | `5` | The upper limit for the number of replicas to which the autoscaler can scale up. | +| grafana-agent-deployment.controller.autoscaling.minReplicas | int | `2` | The lower limit for the number of replicas to which the autoscaler can scale down. | +| grafana-agent-deployment.controller.autoscaling.targetCPUUtilizationPercentage | int | `0` | Average CPU utilization across all relevant pods, a percentage of the requested value of the resource for the pods. Setting `targetCPUUtilizationPercentage` to 0 will disable CPU scaling. | +| grafana-agent-deployment.controller.autoscaling.targetMemoryUtilizationPercentage | int | `80` | Average Memory utilization across all relevant pods, a percentage of the requested value of the resource for the pods. Setting `targetMemoryUtilizationPercentage` to 0 will disable Memory scaling. | +| grafana-agent-deployment.controller.replicas | int | `1` | | +| grafana-agent-deployment.controller.type | string | `"deployment"` | | +| grafana-agent-deployment.nameOverride | string | `"deployment"` | Do not change this. 
| +| grafana-agent-statefulset.agent.configMap.create | bool | `false` | | +| grafana-agent-statefulset.agent.extraEnv[0].name | string | `"GRAFANA_CLOUD_API_KEY"` | | +| grafana-agent-statefulset.agent.extraEnv[0].value | string | `""` | | +| grafana-agent-statefulset.agent.extraEnv[1].name | string | `"GRAFANA_CLOUD_PROMETHEUS_URL"` | | +| grafana-agent-statefulset.agent.extraEnv[1].value | string | `""` | | +| grafana-agent-statefulset.agent.extraEnv[2].name | string | `"GRAFANA_CLOUD_PROMETHEUS_USERNAME"` | | +| grafana-agent-statefulset.agent.extraEnv[2].value | string | `""` | | +| grafana-agent-statefulset.agent.extraEnv[3].name | string | `"GRAFANA_CLOUD_TEMPO_ENDPOINT"` | | +| grafana-agent-statefulset.agent.extraEnv[3].value | string | `""` | | +| grafana-agent-statefulset.agent.extraEnv[4].name | string | `"GRAFANA_CLOUD_TEMPO_USERNAME"` | | +| grafana-agent-statefulset.agent.extraEnv[4].value | string | `""` | | +| grafana-agent-statefulset.agent.extraEnv[5].name | string | `"POD_UID"` | | +| grafana-agent-statefulset.agent.extraEnv[5].valueFrom.fieldRef.apiVersion | string | `"v1"` | | +| grafana-agent-statefulset.agent.extraEnv[5].valueFrom.fieldRef.fieldPath | string | `"metadata.uid"` | | +| grafana-agent-statefulset.agent.extraPorts[0].name | string | `"otlp-grpc"` | | +| grafana-agent-statefulset.agent.extraPorts[0].port | int | `4317` | | +| grafana-agent-statefulset.agent.extraPorts[0].protocol | string | `"TCP"` | | +| grafana-agent-statefulset.agent.extraPorts[0].targetPort | int | `4317` | | +| grafana-agent-statefulset.agent.resources.requests.cpu | string | `"1"` | | +| grafana-agent-statefulset.agent.resources.requests.memory | string | `"2G"` | | +| grafana-agent-statefulset.controller.autoscaling.enabled | bool | `false` | Creates a HorizontalPodAutoscaler for controller type deployment. 
| +| grafana-agent-statefulset.controller.autoscaling.maxReplicas | int | `5` | The upper limit for the number of replicas to which the autoscaler can scale up. | +| grafana-agent-statefulset.controller.autoscaling.minReplicas | int | `2` | The lower limit for the number of replicas to which the autoscaler can scale down. | +| grafana-agent-statefulset.controller.autoscaling.targetCPUUtilizationPercentage | int | `0` | Average CPU utilization across all relevant pods, a percentage of the requested value of the resource for the pods. Setting `targetCPUUtilizationPercentage` to 0 will disable CPU scaling. | +| grafana-agent-statefulset.controller.autoscaling.targetMemoryUtilizationPercentage | int | `80` | Average Memory utilization across all relevant pods, a percentage of the requested value of the resource for the pods. Setting `targetMemoryUtilizationPercentage` to 0 will disable Memory scaling. | +| grafana-agent-statefulset.controller.replicas | int | `1` | | +| grafana-agent-statefulset.controller.type | string | `"statefulset"` | | +| grafana-agent-statefulset.nameOverride | string | `"statefulset"` | Do not change this. | +| grafana-agent-statefulset.rbac.create | bool | `false` | | +| grafana-agent-statefulset.service.clusterIP | string | `"None"` | | +| grafana-agent-statefulset.serviceAccount.create | bool | `false` | | +| metricsGeneration.dimensions | list | `["service.namespace","service.version","deployment.environment","k8s.cluster.name"]` | Additional dimensions to add to generated metrics. | +| metricsGeneration.enabled | bool | `true` | Toggle generation of spanmetrics and servicegraph metrics. | +| sampling.decisionWait | string | `"15s"` | Wait time since the first span of a trace before making a sampling decision. | +| sampling.enabled | bool | `true` | Toggle tail sampling. | +| sampling.extraPolicies | string | A policy to sample long requests is added by default. | User-defined policies in river format. 
| +| sampling.failedRequests.percentage | int | `50` | Percentage of failed requests to sample. | +| sampling.failedRequests.sample | bool | `false` | Toggle sampling failed requests. | +| sampling.successfulRequests.percentage | int | `10` | Percentage of successful requests to sample. | +| sampling.successfulRequests.sample | bool | `true` | Toggle sampling successful requests. | + diff --git a/charts/grafana-sampling/README.md.gotmpl b/charts/grafana-sampling/README.md.gotmpl new file mode 100644 index 0000000000..5cae818919 --- /dev/null +++ b/charts/grafana-sampling/README.md.gotmpl @@ -0,0 +1,63 @@ +{{ template "chart.header" . }} + +{{ template "chart.versionBadge" . }}{{ template "chart.typeBadge" . }}{{ template "chart.appVersionBadge" . }} + +{{ template "chart.description" . }} + +This chart deploys the following architecture to your environment: +![Photo of sampling architecture](./sampling-architecture.png) + +Note: by default, only OTLP traces are accepted at the load balancing layer. + + +## Chart Repo + +Add the following repo to use the chart: + +```console +helm repo add grafana https://grafana.github.io/helm-charts +``` +## Installing the Chart + +Use the following command to install the chart with the release name `my-release`. Make sure to populate the required values. + +```console +helm install my-release grafana/grafana-sampling --values - <<EOF +grafana-agent-statefulset: + agent: + extraEnv: + - name: GRAFANA_CLOUD_API_KEY + value: <REQUIRED> + - name: GRAFANA_CLOUD_PROMETHEUS_URL + value: <REQUIRED> + - name: GRAFANA_CLOUD_PROMETHEUS_USERNAME + value: <REQUIRED> + - name: GRAFANA_CLOUD_TEMPO_ENDPOINT + value: <REQUIRED> + - name: GRAFANA_CLOUD_TEMPO_USERNAME + value: <REQUIRED> + # This is required for adaptive metric deduplication in Grafana Cloud + - name: POD_UID + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.uid +EOF +``` + +## Uninstalling the Chart + +To uninstall/delete the my-release deployment: + +```console +helm delete my-release +``` + +The command removes all the Kubernetes components associated with the chart and deletes the release. 
+ +## Upgrading + +A major chart version change indicates that there is an incompatible breaking change needing manual actions. + +{{ template "chart.valuesSection" . }} + diff --git a/charts/grafana-sampling/sampling-architecture.png b/charts/grafana-sampling/sampling-architecture.png new file mode 100644 index 0000000000..ebd068947d Binary files /dev/null and b/charts/grafana-sampling/sampling-architecture.png differ diff --git a/charts/grafana-sampling/templates/_agent_config_deployment.river.txt b/charts/grafana-sampling/templates/_agent_config_deployment.river.txt new file mode 100644 index 0000000000..c78b8e702a --- /dev/null +++ b/charts/grafana-sampling/templates/_agent_config_deployment.river.txt @@ -0,0 +1,5 @@ +{{- define "agent.config.deployment" -}} + {{- include "deployment.receiver.otlp" . }} + {{- include "deployment.processor.batch" . }} + {{- include "deployment.exporter.loadbalancing" . }} +{{- end -}} diff --git a/charts/grafana-sampling/templates/_agent_config_statefulset.river.txt b/charts/grafana-sampling/templates/_agent_config_statefulset.river.txt new file mode 100644 index 0000000000..44e8b7f671 --- /dev/null +++ b/charts/grafana-sampling/templates/_agent_config_statefulset.river.txt @@ -0,0 +1,18 @@ +{{- define "agent.config.statefulset" -}} + {{- include "statefulset.receiver.otlp" . }} + {{- if .Values.metricsGeneration.enabled -}} + {{- include "statefulset.connector.spanmetrics" . }} + {{- include "statefulset.processor.transform.drop_unneeded_resource_attributes" . }} + {{- include "statefulset.processor.transform.use_grafana_metric_names" . }} + {{- include "statefulset.processor.filter" . }} + {{- include "statefulset.connector.servicegraph" . }} + {{- include "statefulset.exporter.prometheus" . }} + {{- include "statefulset.prometheus.remote_write" . }} + {{- end -}} + {{- if .Values.sampling.enabled -}} + {{- include "statefulset.processor.tail_sampling" . }} + {{- end -}} + {{- include "statefulset.processor.batch" . 
}} + {{- include "exporter.otlp" . }} + {{- include "auth.basic" . }} +{{- end -}} diff --git a/charts/grafana-sampling/templates/_helpers.tpl b/charts/grafana-sampling/templates/_helpers.tpl new file mode 100644 index 0000000000..eabc133596 --- /dev/null +++ b/charts/grafana-sampling/templates/_helpers.tpl @@ -0,0 +1,9 @@ +{{/* use the release name as the serviceAccount name for deployment and statefulset agents */}} +{{- define "grafana-agent.serviceAccountName" -}} +{{- default .Release.Name }} +{{- end }} + +{{/* Calculate name of image ID to use for "grafana-agent". */}} +{{- define "grafana-agent.imageId" -}} +{{- printf ":%s" .Chart.AppVersion }} +{{- end }} diff --git a/charts/grafana-sampling/templates/_otelcol_auth_basic.river.txt b/charts/grafana-sampling/templates/_otelcol_auth_basic.river.txt new file mode 100644 index 0000000000..2a34fe93ba --- /dev/null +++ b/charts/grafana-sampling/templates/_otelcol_auth_basic.river.txt @@ -0,0 +1,8 @@ +{{- define "auth.basic" -}} +otelcol.auth.basic "grafana_cloud_tempo" { + // https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.auth.basic/ + username = env("GRAFANA_CLOUD_TEMPO_USERNAME") + password = env("GRAFANA_CLOUD_API_KEY") +} + +{{ end }} diff --git a/charts/grafana-sampling/templates/_otelcol_connector_servicegraph.river.txt b/charts/grafana-sampling/templates/_otelcol_connector_servicegraph.river.txt new file mode 100644 index 0000000000..f3e97c52e3 --- /dev/null +++ b/charts/grafana-sampling/templates/_otelcol_connector_servicegraph.river.txt @@ -0,0 +1,20 @@ +{{- define "statefulset.connector.servicegraph" -}} +otelcol.connector.servicegraph "default" { + // https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.connector.servicegraph/ + dimensions = [ + {{- range $.Values.metricsGeneration.dimensions }} + {{ . 
| quote }}, + {{- end }} + ] + latency_histogram_buckets = ["0s", "0.005s", "0.01s", "0.025s", "0.05s", "0.075s", "0.1s", "0.25s", "0.5s", "0.75s", "1s", "2.5s", "5s", "7.5s", "10s"] + + store { + ttl = "2s" + } + + output { + metrics = [otelcol.processor.batch.default.input] + } +} + +{{ end }} diff --git a/charts/grafana-sampling/templates/_otelcol_connector_spanmetrics.river.txt b/charts/grafana-sampling/templates/_otelcol_connector_spanmetrics.river.txt new file mode 100644 index 0000000000..1c12d34c13 --- /dev/null +++ b/charts/grafana-sampling/templates/_otelcol_connector_spanmetrics.river.txt @@ -0,0 +1,26 @@ +{{- define "statefulset.connector.spanmetrics" -}} +otelcol.connector.spanmetrics "default" { + // https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.connector.spanmetrics/ + {{- range $.Values.metricsGeneration.dimensions }} + dimension { + name = {{ . | quote }} + } + {{- end }} + + namespace = "traces.spanmetrics" + + histogram { + unit = "s" + + explicit { + buckets = ["0s", "0.005s", "0.01s", "0.025s", "0.05s", "0.075s", "0.1s", "0.25s", "0.5s", "0.75s", "1s", "2.5s", "5s", "7.5s", "10s"] + } + } + + output { + metrics = [otelcol.processor.filter.drop_unneeded_span_metrics.input] + } +} + + +{{ end }} diff --git a/charts/grafana-sampling/templates/_otelcol_exporter_loadbalancing.river.txt b/charts/grafana-sampling/templates/_otelcol_exporter_loadbalancing.river.txt new file mode 100644 index 0000000000..f2ec7ef43e --- /dev/null +++ b/charts/grafana-sampling/templates/_otelcol_exporter_loadbalancing.river.txt @@ -0,0 +1,22 @@ +{{- define "deployment.exporter.loadbalancing" -}} +otelcol.exporter.loadbalancing "default" { + // https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.exporter.loadbalancing/ + resolver { + + kubernetes { + service = "{{ .Release.Name }}-statefulset.{{ .Release.Namespace }}" + } + } + + protocol { + otlp { + client { + tls { + insecure = true + } + } + } + } +} + +{{ end }} diff --git 
a/charts/grafana-sampling/templates/_otelcol_exporter_otlp.river.txt b/charts/grafana-sampling/templates/_otelcol_exporter_otlp.river.txt new file mode 100644 index 0000000000..6b1c68285a --- /dev/null +++ b/charts/grafana-sampling/templates/_otelcol_exporter_otlp.river.txt @@ -0,0 +1,10 @@ +{{- define "exporter.otlp" -}} +otelcol.exporter.otlp "grafana_cloud_tempo" { + // https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.exporter.otlp/ + client { + endpoint = env("GRAFANA_CLOUD_TEMPO_ENDPOINT") + auth = otelcol.auth.basic.grafana_cloud_tempo.handler + } +} + +{{ end }} diff --git a/charts/grafana-sampling/templates/_otelcol_exporter_prometheus.river.txt b/charts/grafana-sampling/templates/_otelcol_exporter_prometheus.river.txt new file mode 100644 index 0000000000..9813580b4e --- /dev/null +++ b/charts/grafana-sampling/templates/_otelcol_exporter_prometheus.river.txt @@ -0,0 +1,8 @@ +{{- define "statefulset.exporter.prometheus" -}} +otelcol.exporter.prometheus "grafana_cloud_prometheus" { + // https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.exporter.prometheus/ + add_metric_suffixes = false + forward_to = [prometheus.remote_write.grafana_cloud_prometheus.receiver] +} + +{{ end }} diff --git a/charts/grafana-sampling/templates/_otelcol_processor_batch.river.txt b/charts/grafana-sampling/templates/_otelcol_processor_batch.river.txt new file mode 100644 index 0000000000..4c1f6b58cb --- /dev/null +++ b/charts/grafana-sampling/templates/_otelcol_processor_batch.river.txt @@ -0,0 +1,22 @@ +{{- define "deployment.processor.batch" -}} +otelcol.processor.batch "default" { + // https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.processor.batch/ + output { + traces = [otelcol.exporter.loadbalancing.default.input] + } +} + +{{ end }} + +{{- define "statefulset.processor.batch" -}} +otelcol.processor.batch "default" { + // https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.processor.batch/ 
+ output { + {{ if .Values.metricsGeneration.enabled }}{{/* Forward batched metrics to the Prometheus exporter only when metrics generation is on. Values keys are case-sensitive: values.yaml defines "enabled". */}} + metrics = [otelcol.exporter.prometheus.grafana_cloud_prometheus.input] + {{ end }} + traces = [otelcol.exporter.otlp.grafana_cloud_tempo.input] + } +} + +{{ end }} diff --git a/charts/grafana-sampling/templates/_otelcol_processor_filter.river.txt b/charts/grafana-sampling/templates/_otelcol_processor_filter.river.txt new file mode 100644 index 0000000000..b02087ef7b --- /dev/null +++ b/charts/grafana-sampling/templates/_otelcol_processor_filter.river.txt @@ -0,0 +1,17 @@ +{{- define "statefulset.processor.filter" -}} +otelcol.processor.filter "drop_unneeded_span_metrics" { + // https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.processor.filter/ + error_mode = "ignore" + + metrics { + datapoint = [ + "IsMatch(metric.name, \"traces.spanmetrics.calls|traces.spanmetrics.duration\") and IsMatch(attributes[\"span.kind\"], \"SPAN_KIND_INTERNAL|SPAN_KIND_CLIENT|SPAN_KIND_PRODUCER\")", + ] + } + + output { + metrics = [otelcol.processor.transform.use_grafana_metric_names.input] + } +} + +{{ end }} diff --git a/charts/grafana-sampling/templates/_otelcol_processor_tail_sampling.river.txt b/charts/grafana-sampling/templates/_otelcol_processor_tail_sampling.river.txt new file mode 100644 index 0000000000..0e752a8089 --- /dev/null +++ b/charts/grafana-sampling/templates/_otelcol_processor_tail_sampling.river.txt @@ -0,0 +1,60 @@ +{{- define "statefulset.processor.tail_sampling" -}} +otelcol.processor.tail_sampling "default" { + // https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.processor.tail_sampling/ + + decision_wait = {{ .Values.sampling.decisionWait | quote }} + +{{ if .Values.sampling.successfulRequests.sample }} + policy { + name = "sample-successful-requests" + type = "and" + and { + and_sub_policy { + name = "status-code-policy" + type = "status_code" + status_code { + status_codes = ["OK", "UNSET"] + } + } + and_sub_policy { + name = "probabilistic-policy" + type = 
"probabilistic" + probabilistic { + sampling_percentage = {{ .Values.sampling.successfulRequests.percentage }} + } + } + } + } +{{ end }} + +{{ if .Values.sampling.failedRequests.sample }} + policy { + name = "sample-failed-requests" + type = "and" + and { + and_sub_policy { + name = "status-code-policy" + type = "status_code" + status_code { + status_codes = ["ERROR"] + } + } + and_sub_policy { + name = "probabilistic-policy" + type = "probabilistic" + probabilistic { + sampling_percentage = {{ .Values.sampling.failedRequests.percentage }} + } + } + } + } +{{ end }} + +{{ .Values.sampling.extraPolicies | indent 2 }} + + output { + traces = [otelcol.processor.batch.default.input] + } +} + +{{ end }} diff --git a/charts/grafana-sampling/templates/_otelcol_processor_transform.river.txt b/charts/grafana-sampling/templates/_otelcol_processor_transform.river.txt new file mode 100644 index 0000000000..7963a325aa --- /dev/null +++ b/charts/grafana-sampling/templates/_otelcol_processor_transform.river.txt @@ -0,0 +1,46 @@ +{{- define "statefulset.processor.transform.use_grafana_metric_names" -}} +otelcol.processor.transform "use_grafana_metric_names" { + // https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.processor.transform/ + error_mode = "ignore" + + metric_statements { + context = "metric" + statements = [ + "set(name, \"traces.spanmetrics.latency\") where name == \"traces.spanmetrics.duration\"", + "set(name, \"traces.spanmetrics.calls.total\") where name == \"traces.spanmetrics.calls\"", + ] + } + + output { + metrics = [otelcol.processor.batch.default.input] + } +} + +{{ end }} + +{{- define "statefulset.processor.transform.drop_unneeded_resource_attributes"}} +otelcol.processor.transform "drop_unneeded_resource_attributes" { + // https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.processor.transform/ + error_mode = "ignore" + + trace_statements { + context = "resource" + statements = [ + "delete_key(attributes, 
\"k8s.pod.start_time\")", + "delete_key(attributes, \"os.description\")", + "delete_key(attributes, \"os.type\")", + "delete_key(attributes, \"process.command_args\")", + "delete_key(attributes, \"process.executable.path\")", + "delete_key(attributes, \"process.pid\")", + "delete_key(attributes, \"process.runtime.description\")", + "delete_key(attributes, \"process.runtime.name\")", + "delete_key(attributes, \"process.runtime.version\")", + ] + } + + output { + traces = [otelcol.connector.spanmetrics.default.input] + } +} + +{{ end }} diff --git a/charts/grafana-sampling/templates/_otelcol_receiver_otlp.river.txt b/charts/grafana-sampling/templates/_otelcol_receiver_otlp.river.txt new file mode 100644 index 0000000000..386a604a5a --- /dev/null +++ b/charts/grafana-sampling/templates/_otelcol_receiver_otlp.river.txt @@ -0,0 +1,39 @@ +{{- define "deployment.receiver.otlp" -}} +otelcol.receiver.otlp "default" { + // https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.receiver.otlp/ + + // configures the default grpc endpoint "0.0.0.0:4317" + grpc { } + // configures the default http/protobuf endpoint "0.0.0.0:4318" + http { } + + output { + traces = [otelcol.processor.batch.default.input] + } +} + +{{ end }} + +{{- define "statefulset.receiver.otlp" -}} +otelcol.receiver.otlp "default" { + // https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.receiver.otlp/ + + // configures the default grpc endpoint "0.0.0.0:4317" + grpc { } + + output { + traces = [ + {{ if .Values.sampling.enabled }} + otelcol.processor.tail_sampling.default.input, + {{ else }} + otelcol.processor.batch.default.input, + {{ end }} + {{ if .Values.metricsGeneration.enabled }} + otelcol.connector.servicegraph.default.input, + otelcol.processor.transform.drop_unneeded_resource_attributes.input, + {{ end }} + ] + } +} + +{{ end }} diff --git a/charts/grafana-sampling/templates/_prometheus_remote_write.river.txt 
b/charts/grafana-sampling/templates/_prometheus_remote_write.river.txt new file mode 100644 index 0000000000..1ddee68f17 --- /dev/null +++ b/charts/grafana-sampling/templates/_prometheus_remote_write.river.txt @@ -0,0 +1,20 @@ +{{- define "statefulset.prometheus.remote_write" -}} +prometheus.remote_write "grafana_cloud_prometheus" { + // https://grafana.com/docs/agent/latest/flow/reference/components/prometheus.remote_write/ + endpoint { + url = env("GRAFANA_CLOUD_PROMETHEUS_URL") + + basic_auth { + username = env("GRAFANA_CLOUD_PROMETHEUS_USERNAME") + password = env("GRAFANA_CLOUD_API_KEY") + } + queue_config { + retry_on_http_429 = false + } + } + external_labels = { + "__metrics_gen_instance" = env("POD_UID"), + } +} + +{{ end }} diff --git a/charts/grafana-sampling/templates/configmap_deployment.yaml b/charts/grafana-sampling/templates/configmap_deployment.yaml new file mode 100644 index 0000000000..0255b35e12 --- /dev/null +++ b/charts/grafana-sampling/templates/configmap_deployment.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ .Release.Name }}-deployment + labels: + {{- include "grafana-agent.labels" . | nindent 4 }} +data: + config.river: |- {{- (include "agent.config.deployment" .) | nindent 4 }} diff --git a/charts/grafana-sampling/templates/configmap_statefulset.yaml b/charts/grafana-sampling/templates/configmap_statefulset.yaml new file mode 100644 index 0000000000..2a0a5499f1 --- /dev/null +++ b/charts/grafana-sampling/templates/configmap_statefulset.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ .Release.Name }}-statefulset + labels: + {{- include "grafana-agent.labels" . | nindent 4 }} +data: + config.river: |- {{- (include "agent.config.statefulset" .) 
| nindent 4 }} diff --git a/charts/grafana-sampling/values.yaml b/charts/grafana-sampling/values.yaml new file mode 100644 index 0000000000..71b9ab18c6 --- /dev/null +++ b/charts/grafana-sampling/values.yaml @@ -0,0 +1,140 @@ +metricsGeneration: + # -- Toggle generation of spanmetrics and servicegraph metrics. + enabled: true + # -- Additional dimensions to add to generated metrics. + dimensions: + - service.namespace + - service.version + - deployment.environment + - k8s.cluster.name + +sampling: + # -- Toggle tail sampling. + enabled: true + # -- Wait time since the first span of a trace before making a sampling decision. + decisionWait: 15s + successfulRequests: + # -- Toggle sampling successful requests. + sample: true + # -- Percentage of successful requests to sample. + percentage: 10 + failedRequests: + # -- Toggle sampling failed requests. + sample: false + # -- Percentage of failed requests to sample. + percentage: 50 + # -- User-defined policies in river format. + # @default -- A policy to sample long requests is added by default. + extraPolicies: |- + policy { + name = "sample-long-requests" + type = "and" + and { + and_sub_policy { + name = "latency" + type = "latency" + latency { + threshold_ms = 5000 + } + } + and_sub_policy { + name = "probabilistic-policy" + type = "probabilistic" + probabilistic { + sampling_percentage = 50 + } + } + } + } + +# @ignored Ignore agent deployment +grafana-agent-deployment: + # -- Do not change this. + nameOverride: deployment + controller: + type: deployment + replicas: 1 + autoscaling: + # -- Creates a HorizontalPodAutoscaler for controller type deployment. + enabled: false + # -- The lower limit for the number of replicas to which the autoscaler can scale down. + minReplicas: 2 + # -- The upper limit for the number of replicas to which the autoscaler can scale up. + maxReplicas: 5 + # -- Average CPU utilization across all relevant pods, a percentage of the requested value of the resource for the pods. 
Setting `targetCPUUtilizationPercentage` to 0 will disable CPU scaling. + targetCPUUtilizationPercentage: 0 + # -- Average Memory utilization across all relevant pods, a percentage of the requested value of the resource for the pods. Setting `targetMemoryUtilizationPercentage` to 0 will disable Memory scaling. + targetMemoryUtilizationPercentage: 80 + agent: + # This chart creates the configmaps + configMap: + create: false + resources: + requests: + cpu: "1" + memory: "2G" + extraPorts: + - name: otlp-grpc + port: 4317 + targetPort: 4317 + protocol: TCP + - name: otlp-http + port: 4318 + targetPort: 4318 + protocol: TCP + +# @ignored Ignore agent statefulset +grafana-agent-statefulset: + # -- Do not change this. + nameOverride: statefulset + controller: + type: statefulset + replicas: 1 + autoscaling: + # -- Creates a HorizontalPodAutoscaler for controller type deployment. + enabled: false + # -- The lower limit for the number of replicas to which the autoscaler can scale down. + minReplicas: 2 + # -- The upper limit for the number of replicas to which the autoscaler can scale up. + maxReplicas: 5 + # -- Average CPU utilization across all relevant pods, a percentage of the requested value of the resource for the pods. Setting `targetCPUUtilizationPercentage` to 0 will disable CPU scaling. + targetCPUUtilizationPercentage: 0 + # -- Average Memory utilization across all relevant pods, a percentage of the requested value of the resource for the pods. Setting `targetMemoryUtilizationPercentage` to 0 will disable Memory scaling. 
+ targetMemoryUtilizationPercentage: 80 + service: + clusterIP: None + agent: + extraEnv: + - name: GRAFANA_CLOUD_API_KEY + value: ""  # required; populate before installing + - name: GRAFANA_CLOUD_PROMETHEUS_URL + value: ""  # required; populate before installing + - name: GRAFANA_CLOUD_PROMETHEUS_USERNAME + value: ""  # required; populate before installing + - name: GRAFANA_CLOUD_TEMPO_ENDPOINT + value: ""  # required; populate before installing + - name: GRAFANA_CLOUD_TEMPO_USERNAME + value: ""  # required; populate before installing + # This is required for adaptive metric deduplication in Grafana Cloud + - name: POD_UID + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.uid + # This chart creates the configmaps + configMap: + create: false + resources: + requests: + cpu: "1" + memory: "2G" + extraPorts: + - name: otlp-grpc + port: 4317 + targetPort: 4317 + protocol: TCP + # The statefulset and deployment can share the same serviceAccount and rbac roles + serviceAccount: + create: false + rbac: + create: false