diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 800c0a59b9..e46c9e748e 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -47,14 +47,19 @@ jobs: CR_TOKEN: "${{ secrets.GITHUB_TOKEN }}" CR_SKIP_EXISTING: "true" + - name: Login to GHCR + uses: docker/login-action@v3.0.0 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Push charts to GHCR run: | shopt -s nullglob - for pkg in .cr-release-packages/*; do + for pkg in .cr-release-packages/*.tgz; do if [ -z "${pkg:-}" ]; then break fi - if ! helm push "${pkg}" "oci://ghcr.io/${GITHUB_REPOSITORY_OWNER}/charts"; then - echo '::warning:: helm push failed!' - fi + helm push "${pkg}" "oci://ghcr.io/${GITHUB_REPOSITORY_OWNER}/helm-charts" done diff --git a/charts/agent-operator/Chart.yaml b/charts/agent-operator/Chart.yaml index d9125e9eba..769bc0b7af 100644 --- a/charts/agent-operator/Chart.yaml +++ b/charts/agent-operator/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: grafana-agent-operator description: A Helm chart for Grafana Agent Operator type: application -version: 0.3.19 +version: 0.3.20 appVersion: "0.40.3" home: https://grafana.com/docs/agent/v0.40/ icon: https://raw.githubusercontent.com/grafana/agent/v0.40.3/docs/sources/assets/logo_and_name.png diff --git a/charts/agent-operator/README.md b/charts/agent-operator/README.md index 7a5187eee1..49afcf11e2 100644 --- a/charts/agent-operator/README.md +++ b/charts/agent-operator/README.md @@ -1,6 +1,6 @@ # grafana-agent-operator -![Version: 0.3.19](https://img.shields.io/badge/Version-0.3.19-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 0.40.3](https://img.shields.io/badge/AppVersion-0.40.3-informational?style=flat-square) +![Version: 0.3.20](https://img.shields.io/badge/Version-0.3.20-informational?style=flat-square) ![Type: 
application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 0.40.3](https://img.shields.io/badge/AppVersion-0.40.3-informational?style=flat-square) A Helm chart for Grafana Agent Operator @@ -75,4 +75,7 @@ A major chart version change (like v1.2.3 -> v2.0.0) indicates that there is an | resources | object | `{}` | Resource limits and requests config | | serviceAccount.create | bool | `true` | Toggle to create ServiceAccount | | serviceAccount.name | string | `nil` | Service account name | +| test.image.registry | string | `"docker.io"` | Test image registry | +| test.image.repository | string | `"library/busybox"` | Test image repo | +| test.image.tag | string | `"latest"` | Test image tag | | tolerations | list | `[]` | Tolerations applied to Pods | diff --git a/charts/agent-operator/templates/tests/test-grafanaagent.yaml b/charts/agent-operator/templates/tests/test-grafanaagent.yaml index 9e9d9132de..4001da4e9e 100644 --- a/charts/agent-operator/templates/tests/test-grafanaagent.yaml +++ b/charts/agent-operator/templates/tests/test-grafanaagent.yaml @@ -107,12 +107,12 @@ metadata: spec: containers: - name: busybox - image: busybox + image: "{{ .Values.test.image.registry }}/{{ .Values.test.image.repository }}:{{ .Values.test.image.tag }}" command: ['wget'] args: ['grafana-agent-test-operated:8080/-/healthy'] # Wait for GrafanaAgent CR initContainers: - name: sleep - image: busybox + image: "{{ .Values.test.image.registry }}/{{ .Values.test.image.repository }}:{{ .Values.test.image.tag }}" command: ['sleep', '60'] restartPolicy: Never diff --git a/charts/agent-operator/values.yaml b/charts/agent-operator/values.yaml index 57d3991f65..2c75016249 100644 --- a/charts/agent-operator/values.yaml +++ b/charts/agent-operator/values.yaml @@ -43,6 +43,15 @@ image: # -- Image pull secrets pullSecrets: [] +test: + image: + # -- Test image registry + registry: docker.io + # -- Test image repo + repository: library/busybox + # -- 
Test image tag + tag: latest + # -- hostAliases to add hostAliases: [] # - ip: 1.2.3.4 diff --git a/charts/grafana-sampling/.helmignore b/charts/grafana-sampling/.helmignore new file mode 100644 index 0000000000..0e8a0eb36f --- /dev/null +++ b/charts/grafana-sampling/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/charts/grafana-sampling/Chart.lock b/charts/grafana-sampling/Chart.lock new file mode 100644 index 0000000000..a54564640a --- /dev/null +++ b/charts/grafana-sampling/Chart.lock @@ -0,0 +1,9 @@ +dependencies: +- name: grafana-agent + repository: https://grafana.github.io/helm-charts + version: 0.36.0 +- name: grafana-agent + repository: https://grafana.github.io/helm-charts + version: 0.36.0 +digest: sha256:6d04a55dce2c09c4c250c6453e0d58f7280750bf04fce51027b4e235062413e5 +generated: "2024-03-11T15:41:30.921516-07:00" diff --git a/charts/grafana-sampling/Chart.yaml b/charts/grafana-sampling/Chart.yaml new file mode 100644 index 0000000000..44e076b128 --- /dev/null +++ b/charts/grafana-sampling/Chart.yaml @@ -0,0 +1,18 @@ +apiVersion: v2 +name: grafana-sampling +description: A Helm chart for a layered OTLP tail sampling and metrics generation pipeline. 
+type: application +version: 0.1.0 +appVersion: "v0.40.2" +sources: + - https://github.com/grafana/agent + - https://grafana.com/docs/grafana-cloud/monitor-applications/application-observability/setup/sampling/tail/ +dependencies: + - name: grafana-agent + version: 0.36.0 + repository: https://grafana.github.io/helm-charts + alias: grafana-agent-deployment + - name: grafana-agent + version: 0.36.0 + repository: https://grafana.github.io/helm-charts + alias: grafana-agent-statefulset diff --git a/charts/grafana-sampling/README.md b/charts/grafana-sampling/README.md new file mode 100644 index 0000000000..6ad9785b72 --- /dev/null +++ b/charts/grafana-sampling/README.md @@ -0,0 +1,124 @@ +# grafana-sampling + +![Version: 0.1.0](https://img.shields.io/badge/Version-0.1.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: v0.40.2](https://img.shields.io/badge/AppVersion-v0.40.2-informational?style=flat-square) + +A Helm chart for a layered OTLP tail sampling and metrics generation pipeline. + +This chart deploys the following architecture to your environment: +![Photo of sampling architecture](./sampling-architecture.png) + +Note: by default, only OTLP traces are accepted at the load balancing layer. + +## Chart Repo + +Add the following repo to use the chart: + +```console +helm repo add grafana https://grafana.github.io/helm-charts +``` +## Installing the Chart + +Use the following command to install the chart with the release name `my-release`. Make sure to populate the required values. 
+ +```console +helm install my-release grafana/grafana-sampling --values - <<EOF +grafana-agent-statefulset: + agent: + extraEnv: + - name: GRAFANA_CLOUD_API_KEY + value: <REQUIRED> + - name: GRAFANA_CLOUD_PROMETHEUS_URL + value: <REQUIRED> + - name: GRAFANA_CLOUD_PROMETHEUS_USERNAME + value: <REQUIRED> + - name: GRAFANA_CLOUD_TEMPO_ENDPOINT + value: <REQUIRED> + - name: GRAFANA_CLOUD_TEMPO_USERNAME + value: <REQUIRED> + # This is required for adaptive metric deduplication in Grafana Cloud + - name: POD_UID + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.uid +EOF +``` + +## Uninstalling the Chart + +To uninstall/delete the my-release deployment: + +```console +helm delete my-release +``` + +The command removes all the Kubernetes components associated with the chart and deletes the release. + +## Upgrading + +A major chart version change indicates that there is an incompatible breaking change needing manual actions. + +## Values + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| grafana-agent-deployment.agent.configMap.create | bool | `false` | | +| grafana-agent-deployment.agent.extraPorts[0].name | string | `"otlp-grpc"` | | +| grafana-agent-deployment.agent.extraPorts[0].port | int | `4317` | | +| grafana-agent-deployment.agent.extraPorts[0].protocol | string | `"TCP"` | | +| grafana-agent-deployment.agent.extraPorts[0].targetPort | int | `4317` | | +| grafana-agent-deployment.agent.extraPorts[1].name | string | `"otlp-http"` | | +| grafana-agent-deployment.agent.extraPorts[1].port | int | `4318` | | +| grafana-agent-deployment.agent.extraPorts[1].protocol | string | `"TCP"` | | +| grafana-agent-deployment.agent.extraPorts[1].targetPort | int | `4318` | | +| grafana-agent-deployment.agent.resources.requests.cpu | string | `"1"` | | +| grafana-agent-deployment.agent.resources.requests.memory | string | `"2G"` | | +| grafana-agent-deployment.controller.autoscaling.enabled | bool | `false` | Creates a HorizontalPodAutoscaler for controller type deployment. 
| +| grafana-agent-deployment.controller.autoscaling.maxReplicas | int | `5` | The upper limit for the number of replicas to which the autoscaler can scale up. | +| grafana-agent-deployment.controller.autoscaling.minReplicas | int | `2` | The lower limit for the number of replicas to which the autoscaler can scale down. | +| grafana-agent-deployment.controller.autoscaling.targetCPUUtilizationPercentage | int | `0` | Average CPU utilization across all relevant pods, a percentage of the requested value of the resource for the pods. Setting `targetCPUUtilizationPercentage` to 0 will disable CPU scaling. | +| grafana-agent-deployment.controller.autoscaling.targetMemoryUtilizationPercentage | int | `80` | Average Memory utilization across all relevant pods, a percentage of the requested value of the resource for the pods. Setting `targetMemoryUtilizationPercentage` to 0 will disable Memory scaling. | +| grafana-agent-deployment.controller.replicas | int | `1` | | +| grafana-agent-deployment.controller.type | string | `"deployment"` | | +| grafana-agent-deployment.nameOverride | string | `"deployment"` | Do not change this. 
| +| grafana-agent-statefulset.agent.configMap.create | bool | `false` | | +| grafana-agent-statefulset.agent.extraEnv[0].name | string | `"GRAFANA_CLOUD_API_KEY"` | | +| grafana-agent-statefulset.agent.extraEnv[0].value | string | `""` | | +| grafana-agent-statefulset.agent.extraEnv[1].name | string | `"GRAFANA_CLOUD_PROMETHEUS_URL"` | | +| grafana-agent-statefulset.agent.extraEnv[1].value | string | `""` | | +| grafana-agent-statefulset.agent.extraEnv[2].name | string | `"GRAFANA_CLOUD_PROMETHEUS_USERNAME"` | | +| grafana-agent-statefulset.agent.extraEnv[2].value | string | `""` | | +| grafana-agent-statefulset.agent.extraEnv[3].name | string | `"GRAFANA_CLOUD_TEMPO_ENDPOINT"` | | +| grafana-agent-statefulset.agent.extraEnv[3].value | string | `""` | | +| grafana-agent-statefulset.agent.extraEnv[4].name | string | `"GRAFANA_CLOUD_TEMPO_USERNAME"` | | +| grafana-agent-statefulset.agent.extraEnv[4].value | string | `""` | | +| grafana-agent-statefulset.agent.extraEnv[5].name | string | `"POD_UID"` | | +| grafana-agent-statefulset.agent.extraEnv[5].valueFrom.fieldRef.apiVersion | string | `"v1"` | | +| grafana-agent-statefulset.agent.extraEnv[5].valueFrom.fieldRef.fieldPath | string | `"metadata.uid"` | | +| grafana-agent-statefulset.agent.extraPorts[0].name | string | `"otlp-grpc"` | | +| grafana-agent-statefulset.agent.extraPorts[0].port | int | `4317` | | +| grafana-agent-statefulset.agent.extraPorts[0].protocol | string | `"TCP"` | | +| grafana-agent-statefulset.agent.extraPorts[0].targetPort | int | `4317` | | +| grafana-agent-statefulset.agent.resources.requests.cpu | string | `"1"` | | +| grafana-agent-statefulset.agent.resources.requests.memory | string | `"2G"` | | +| grafana-agent-statefulset.controller.autoscaling.enabled | bool | `false` | Creates a HorizontalPodAutoscaler for controller type deployment. 
| +| grafana-agent-statefulset.controller.autoscaling.maxReplicas | int | `5` | The upper limit for the number of replicas to which the autoscaler can scale up. | +| grafana-agent-statefulset.controller.autoscaling.minReplicas | int | `2` | The lower limit for the number of replicas to which the autoscaler can scale down. | +| grafana-agent-statefulset.controller.autoscaling.targetCPUUtilizationPercentage | int | `0` | Average CPU utilization across all relevant pods, a percentage of the requested value of the resource for the pods. Setting `targetCPUUtilizationPercentage` to 0 will disable CPU scaling. | +| grafana-agent-statefulset.controller.autoscaling.targetMemoryUtilizationPercentage | int | `80` | Average Memory utilization across all relevant pods, a percentage of the requested value of the resource for the pods. Setting `targetMemoryUtilizationPercentage` to 0 will disable Memory scaling. | +| grafana-agent-statefulset.controller.replicas | int | `1` | | +| grafana-agent-statefulset.controller.type | string | `"statefulset"` | | +| grafana-agent-statefulset.nameOverride | string | `"statefulset"` | Do not change this. | +| grafana-agent-statefulset.rbac.create | bool | `false` | | +| grafana-agent-statefulset.service.clusterIP | string | `"None"` | | +| grafana-agent-statefulset.serviceAccount.create | bool | `false` | | +| metricsGeneration.dimensions | list | `["service.namespace","service.version","deployment.environment","k8s.cluster.name"]` | Additional dimensions to add to generated metrics. | +| metricsGeneration.enabled | bool | `true` | Toggle generation of spanmetrics and servicegraph metrics. | +| sampling.decisionWait | string | `"15s"` | Wait time since the first span of a trace before making a sampling decision. | +| sampling.enabled | bool | `true` | Toggle tail sampling. | +| sampling.extraPolicies | string | A policy to sample long requests is added by default. | User-defined policies in river format. 
| +| sampling.failedRequests.percentage | int | `50` | Percentage of failed requests to sample. | +| sampling.failedRequests.sample | bool | `false` | Toggle sampling failed requests. | +| sampling.successfulRequests.percentage | int | `10` | Percentage of successful requests to sample. | +| sampling.successfulRequests.sample | bool | `true` | Toggle sampling successful requests. | + diff --git a/charts/grafana-sampling/README.md.gotmpl b/charts/grafana-sampling/README.md.gotmpl new file mode 100644 index 0000000000..5cae818919 --- /dev/null +++ b/charts/grafana-sampling/README.md.gotmpl @@ -0,0 +1,63 @@ +{{ template "chart.header" . }} + +{{ template "chart.versionBadge" . }}{{ template "chart.typeBadge" . }}{{ template "chart.appVersionBadge" . }} + +{{ template "chart.description" . }} + +This chart deploys the following architecture to your environment: +![Photo of sampling architecture](./sampling-architecture.png) + +Note: by default, only OTLP traces are accepted at the load balancing layer. + + +## Chart Repo + +Add the following repo to use the chart: + +```console +helm repo add grafana https://grafana.github.io/helm-charts +``` +## Installing the Chart + +Use the following command to install the chart with the release name `my-release`. Make sure to populate the required values. + +```console +helm install my-release grafana/grafana-sampling --values - <<EOF +grafana-agent-statefulset: + agent: + extraEnv: + - name: GRAFANA_CLOUD_API_KEY + value: <REQUIRED> + - name: GRAFANA_CLOUD_PROMETHEUS_URL + value: <REQUIRED> + - name: GRAFANA_CLOUD_PROMETHEUS_USERNAME + value: <REQUIRED> + - name: GRAFANA_CLOUD_TEMPO_ENDPOINT + value: <REQUIRED> + - name: GRAFANA_CLOUD_TEMPO_USERNAME + value: <REQUIRED> + # This is required for adaptive metric deduplication in Grafana Cloud + - name: POD_UID + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.uid +EOF +``` + +## Uninstalling the Chart + +To uninstall/delete the my-release deployment: + +```console +helm delete my-release +``` + +The command removes all the Kubernetes components associated with the chart and deletes the release. 
+ +## Upgrading + +A major chart version change indicates that there is an incompatible breaking change needing manual actions. + +{{ template "chart.valuesSection" . }} + diff --git a/charts/grafana-sampling/sampling-architecture.png b/charts/grafana-sampling/sampling-architecture.png new file mode 100644 index 0000000000..ebd068947d Binary files /dev/null and b/charts/grafana-sampling/sampling-architecture.png differ diff --git a/charts/grafana-sampling/templates/_agent_config_deployment.river.txt b/charts/grafana-sampling/templates/_agent_config_deployment.river.txt new file mode 100644 index 0000000000..c78b8e702a --- /dev/null +++ b/charts/grafana-sampling/templates/_agent_config_deployment.river.txt @@ -0,0 +1,5 @@ +{{- define "agent.config.deployment" -}} + {{- include "deployment.receiver.otlp" . }} + {{- include "deployment.processor.batch" . }} + {{- include "deployment.exporter.loadbalancing" . }} +{{- end -}} diff --git a/charts/grafana-sampling/templates/_agent_config_statefulset.river.txt b/charts/grafana-sampling/templates/_agent_config_statefulset.river.txt new file mode 100644 index 0000000000..44e8b7f671 --- /dev/null +++ b/charts/grafana-sampling/templates/_agent_config_statefulset.river.txt @@ -0,0 +1,18 @@ +{{- define "agent.config.statefulset" -}} + {{- include "statefulset.receiver.otlp" . }} + {{- if .Values.metricsGeneration.enabled -}} + {{- include "statefulset.connector.spanmetrics" . }} + {{- include "statefulset.processor.transform.drop_unneeded_resource_attributes" . }} + {{- include "statefulset.processor.transform.use_grafana_metric_names" . }} + {{- include "statefulset.processor.filter" . }} + {{- include "statefulset.connector.servicegraph" . }} + {{- include "statefulset.exporter.prometheus" . }} + {{- include "statefulset.prometheus.remote_write" . }} + {{- end -}} + {{- if .Values.sampling.enabled -}} + {{- include "statefulset.processor.tail_sampling" . }} + {{- end -}} + {{- include "statefulset.processor.batch" . 
}} + {{- include "exporter.otlp" . }} + {{- include "auth.basic" . }} +{{- end -}} diff --git a/charts/grafana-sampling/templates/_helpers.tpl b/charts/grafana-sampling/templates/_helpers.tpl new file mode 100644 index 0000000000..eabc133596 --- /dev/null +++ b/charts/grafana-sampling/templates/_helpers.tpl @@ -0,0 +1,9 @@ +{{/* use the release name as the serviceAccount name for deployment and statefulset agents */}} +{{- define "grafana-agent.serviceAccountName" -}} +{{- default .Release.Name }} +{{- end }} + +{{/* Calculate name of image ID to use for "grafana-agent". */}} +{{- define "grafana-agent.imageId" -}} +{{- printf ":%s" .Chart.AppVersion }} +{{- end }} diff --git a/charts/grafana-sampling/templates/_otelcol_auth_basic.river.txt b/charts/grafana-sampling/templates/_otelcol_auth_basic.river.txt new file mode 100644 index 0000000000..2a34fe93ba --- /dev/null +++ b/charts/grafana-sampling/templates/_otelcol_auth_basic.river.txt @@ -0,0 +1,8 @@ +{{- define "auth.basic" -}} +otelcol.auth.basic "grafana_cloud_tempo" { + // https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.auth.basic/ + username = env("GRAFANA_CLOUD_TEMPO_USERNAME") + password = env("GRAFANA_CLOUD_API_KEY") +} + +{{ end }} diff --git a/charts/grafana-sampling/templates/_otelcol_connector_servicegraph.river.txt b/charts/grafana-sampling/templates/_otelcol_connector_servicegraph.river.txt new file mode 100644 index 0000000000..f3e97c52e3 --- /dev/null +++ b/charts/grafana-sampling/templates/_otelcol_connector_servicegraph.river.txt @@ -0,0 +1,20 @@ +{{- define "statefulset.connector.servicegraph" -}} +otelcol.connector.servicegraph "default" { + // https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.connector.servicegraph/ + dimensions = [ + {{- range $.Values.metricsGeneration.dimensions }} + {{ . 
| quote }}, + {{- end }} + ] + latency_histogram_buckets = ["0s", "0.005s", "0.01s", "0.025s", "0.05s", "0.075s", "0.1s", "0.25s", "0.5s", "0.75s", "1s", "2.5s", "5s", "7.5s", "10s"] + + store { + ttl = "2s" + } + + output { + metrics = [otelcol.processor.batch.default.input] + } +} + +{{ end }} diff --git a/charts/grafana-sampling/templates/_otelcol_connector_spanmetrics.river.txt b/charts/grafana-sampling/templates/_otelcol_connector_spanmetrics.river.txt new file mode 100644 index 0000000000..1c12d34c13 --- /dev/null +++ b/charts/grafana-sampling/templates/_otelcol_connector_spanmetrics.river.txt @@ -0,0 +1,26 @@ +{{- define "statefulset.connector.spanmetrics" -}} +otelcol.connector.spanmetrics "default" { + // https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.connector.spanmetrics/ + {{- range $.Values.metricsGeneration.dimensions }} + dimension { + name = {{ . | quote }} + } + {{- end }} + + namespace = "traces.spanmetrics" + + histogram { + unit = "s" + + explicit { + buckets = ["0s", "0.005s", "0.01s", "0.025s", "0.05s", "0.075s", "0.1s", "0.25s", "0.5s", "0.75s", "1s", "2.5s", "5s", "7.5s", "10s"] + } + } + + output { + metrics = [otelcol.processor.filter.drop_unneeded_span_metrics.input] + } +} + + +{{ end }} diff --git a/charts/grafana-sampling/templates/_otelcol_exporter_loadbalancing.river.txt b/charts/grafana-sampling/templates/_otelcol_exporter_loadbalancing.river.txt new file mode 100644 index 0000000000..f2ec7ef43e --- /dev/null +++ b/charts/grafana-sampling/templates/_otelcol_exporter_loadbalancing.river.txt @@ -0,0 +1,22 @@ +{{- define "deployment.exporter.loadbalancing" -}} +otelcol.exporter.loadbalancing "default" { + // https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.exporter.loadbalancing/ + resolver { + + kubernetes { + service = "{{ .Release.Name }}-statefulset.{{ .Release.Namespace }}" + } + } + + protocol { + otlp { + client { + tls { + insecure = true + } + } + } + } +} + +{{ end }} diff --git 
a/charts/grafana-sampling/templates/_otelcol_exporter_otlp.river.txt b/charts/grafana-sampling/templates/_otelcol_exporter_otlp.river.txt new file mode 100644 index 0000000000..6b1c68285a --- /dev/null +++ b/charts/grafana-sampling/templates/_otelcol_exporter_otlp.river.txt @@ -0,0 +1,10 @@ +{{- define "exporter.otlp" -}} +otelcol.exporter.otlp "grafana_cloud_tempo" { + // https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.exporter.otlp/ + client { + endpoint = env("GRAFANA_CLOUD_TEMPO_ENDPOINT") + auth = otelcol.auth.basic.grafana_cloud_tempo.handler + } +} + +{{ end }} diff --git a/charts/grafana-sampling/templates/_otelcol_exporter_prometheus.river.txt b/charts/grafana-sampling/templates/_otelcol_exporter_prometheus.river.txt new file mode 100644 index 0000000000..9813580b4e --- /dev/null +++ b/charts/grafana-sampling/templates/_otelcol_exporter_prometheus.river.txt @@ -0,0 +1,8 @@ +{{- define "statefulset.exporter.prometheus" -}} +otelcol.exporter.prometheus "grafana_cloud_prometheus" { + // https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.exporter.prometheus/ + add_metric_suffixes = false + forward_to = [prometheus.remote_write.grafana_cloud_prometheus.receiver] +} + +{{ end }} diff --git a/charts/grafana-sampling/templates/_otelcol_processor_batch.river.txt b/charts/grafana-sampling/templates/_otelcol_processor_batch.river.txt new file mode 100644 index 0000000000..4c1f6b58cb --- /dev/null +++ b/charts/grafana-sampling/templates/_otelcol_processor_batch.river.txt @@ -0,0 +1,22 @@ +{{- define "deployment.processor.batch" -}} +otelcol.processor.batch "default" { + // https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.processor.batch/ + output { + traces = [otelcol.exporter.loadbalancing.default.input] + } +} + +{{ end }} + +{{- define "statefulset.processor.batch" -}} +otelcol.processor.batch "default" { + // https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.processor.batch/ 
+ output { + {{ if .Values.metricsGeneration.enabled }}{{/* key is case-sensitive and must match metricsGeneration.enabled in values.yaml */}} + metrics = [otelcol.exporter.prometheus.grafana_cloud_prometheus.input] + {{ end }} + traces = [otelcol.exporter.otlp.grafana_cloud_tempo.input] + } +} + +{{ end }} diff --git a/charts/grafana-sampling/templates/_otelcol_processor_filter.river.txt b/charts/grafana-sampling/templates/_otelcol_processor_filter.river.txt new file mode 100644 index 0000000000..b02087ef7b --- /dev/null +++ b/charts/grafana-sampling/templates/_otelcol_processor_filter.river.txt @@ -0,0 +1,17 @@ +{{- define "statefulset.processor.filter" -}} +otelcol.processor.filter "drop_unneeded_span_metrics" { + // https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.processor.filter/ + error_mode = "ignore" + + metrics { + datapoint = [ + "IsMatch(metric.name, \"traces.spanmetrics.calls|traces.spanmetrics.duration\") and IsMatch(attributes[\"span.kind\"], \"SPAN_KIND_INTERNAL|SPAN_KIND_CLIENT|SPAN_KIND_PRODUCER\")", + ] + } + + output { + metrics = [otelcol.processor.transform.use_grafana_metric_names.input] + } +} + +{{ end }} diff --git a/charts/grafana-sampling/templates/_otelcol_processor_tail_sampling.river.txt b/charts/grafana-sampling/templates/_otelcol_processor_tail_sampling.river.txt new file mode 100644 index 0000000000..0e752a8089 --- /dev/null +++ b/charts/grafana-sampling/templates/_otelcol_processor_tail_sampling.river.txt @@ -0,0 +1,60 @@ +{{- define "statefulset.processor.tail_sampling" -}} +otelcol.processor.tail_sampling "default" { + // https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.processor.tail_sampling/ + + decision_wait = {{ .Values.sampling.decisionWait | quote }} + +{{ if .Values.sampling.successfulRequests.sample }} + policy { + name = "sample-successful-requests" + type = "and" + and { + and_sub_policy { + name = "status-code-policy" + type = "status_code" + status_code { + status_codes = ["OK", "UNSET"] + } + } + and_sub_policy { + name = "probabilistic-policy" + type = 
"probabilistic" + probabilistic { + sampling_percentage = {{ .Values.sampling.successfulRequests.percentage }} + } + } + } + } +{{ end }} + +{{ if .Values.sampling.failedRequests.sample }} + policy { + name = "sample-failed-requests" + type = "and" + and { + and_sub_policy { + name = "status-code-policy" + type = "status_code" + status_code { + status_codes = ["ERROR"] + } + } + and_sub_policy { + name = "probabilistic-policy" + type = "probabilistic" + probabilistic { + sampling_percentage = {{ .Values.sampling.failedRequests.percentage }} + } + } + } + } +{{ end }} + +{{ .Values.sampling.extraPolicies | indent 2 }} + + output { + traces = [otelcol.processor.batch.default.input] + } +} + +{{ end }} diff --git a/charts/grafana-sampling/templates/_otelcol_processor_transform.river.txt b/charts/grafana-sampling/templates/_otelcol_processor_transform.river.txt new file mode 100644 index 0000000000..7963a325aa --- /dev/null +++ b/charts/grafana-sampling/templates/_otelcol_processor_transform.river.txt @@ -0,0 +1,46 @@ +{{- define "statefulset.processor.transform.use_grafana_metric_names" -}} +otelcol.processor.transform "use_grafana_metric_names" { + // https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.processor.transform/ + error_mode = "ignore" + + metric_statements { + context = "metric" + statements = [ + "set(name, \"traces.spanmetrics.latency\") where name == \"traces.spanmetrics.duration\"", + "set(name, \"traces.spanmetrics.calls.total\") where name == \"traces.spanmetrics.calls\"", + ] + } + + output { + metrics = [otelcol.processor.batch.default.input] + } +} + +{{ end }} + +{{- define "statefulset.processor.transform.drop_unneeded_resource_attributes"}} +otelcol.processor.transform "drop_unneeded_resource_attributes" { + // https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.processor.transform/ + error_mode = "ignore" + + trace_statements { + context = "resource" + statements = [ + "delete_key(attributes, 
\"k8s.pod.start_time\")", + "delete_key(attributes, \"os.description\")", + "delete_key(attributes, \"os.type\")", + "delete_key(attributes, \"process.command_args\")", + "delete_key(attributes, \"process.executable.path\")", + "delete_key(attributes, \"process.pid\")", + "delete_key(attributes, \"process.runtime.description\")", + "delete_key(attributes, \"process.runtime.name\")", + "delete_key(attributes, \"process.runtime.version\")", + ] + } + + output { + traces = [otelcol.connector.spanmetrics.default.input] + } +} + +{{ end }} diff --git a/charts/grafana-sampling/templates/_otelcol_receiver_otlp.river.txt b/charts/grafana-sampling/templates/_otelcol_receiver_otlp.river.txt new file mode 100644 index 0000000000..386a604a5a --- /dev/null +++ b/charts/grafana-sampling/templates/_otelcol_receiver_otlp.river.txt @@ -0,0 +1,39 @@ +{{- define "deployment.receiver.otlp" -}} +otelcol.receiver.otlp "default" { + // https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.receiver.otlp/ + + // configures the default grpc endpoint "0.0.0.0:4317" + grpc { } + // configures the default http/protobuf endpoint "0.0.0.0:4318" + http { } + + output { + traces = [otelcol.processor.batch.default.input] + } +} + +{{ end }} + +{{- define "statefulset.receiver.otlp" -}} +otelcol.receiver.otlp "default" { + // https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.receiver.otlp/ + + // configures the default grpc endpoint "0.0.0.0:4317" + grpc { } + + output { + traces = [ + {{ if .Values.sampling.enabled }} + otelcol.processor.tail_sampling.default.input, + {{ else }} + otelcol.processor.batch.default.input, + {{ end }} + {{ if .Values.metricsGeneration.enabled }} + otelcol.connector.servicegraph.default.input, + otelcol.processor.transform.drop_unneeded_resource_attributes.input, + {{ end }} + ] + } +} + +{{ end }} diff --git a/charts/grafana-sampling/templates/_prometheus_remote_write.river.txt 
b/charts/grafana-sampling/templates/_prometheus_remote_write.river.txt new file mode 100644 index 0000000000..1ddee68f17 --- /dev/null +++ b/charts/grafana-sampling/templates/_prometheus_remote_write.river.txt @@ -0,0 +1,20 @@ +{{- define "statefulset.prometheus.remote_write" -}} +prometheus.remote_write "grafana_cloud_prometheus" { + // https://grafana.com/docs/agent/latest/flow/reference/components/prometheus.remote_write/ + endpoint { + url = env("GRAFANA_CLOUD_PROMETHEUS_URL") + + basic_auth { + username = env("GRAFANA_CLOUD_PROMETHEUS_USERNAME") + password = env("GRAFANA_CLOUD_API_KEY") + } + queue_config { + retry_on_http_429 = false + } + } + external_labels = { + "__metrics_gen_instance" = env("POD_UID"), + } +} + +{{ end }} diff --git a/charts/grafana-sampling/templates/configmap_deployment.yaml b/charts/grafana-sampling/templates/configmap_deployment.yaml new file mode 100644 index 0000000000..0255b35e12 --- /dev/null +++ b/charts/grafana-sampling/templates/configmap_deployment.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ .Release.Name }}-deployment + labels: + {{- include "grafana-agent.labels" . | nindent 4 }} +data: + config.river: |- {{- (include "agent.config.deployment" .) | nindent 4 }} diff --git a/charts/grafana-sampling/templates/configmap_statefulset.yaml b/charts/grafana-sampling/templates/configmap_statefulset.yaml new file mode 100644 index 0000000000..2a0a5499f1 --- /dev/null +++ b/charts/grafana-sampling/templates/configmap_statefulset.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ .Release.Name }}-statefulset + labels: + {{- include "grafana-agent.labels" . | nindent 4 }} +data: + config.river: |- {{- (include "agent.config.statefulset" .) 
| nindent 4 }} diff --git a/charts/grafana-sampling/values.yaml b/charts/grafana-sampling/values.yaml new file mode 100644 index 0000000000..71b9ab18c6 --- /dev/null +++ b/charts/grafana-sampling/values.yaml @@ -0,0 +1,140 @@ +metricsGeneration: + # -- Toggle generation of spanmetrics and servicegraph metrics. + enabled: true + # -- Additional dimensions to add to generated metrics. + dimensions: + - service.namespace + - service.version + - deployment.environment + - k8s.cluster.name + +sampling: + # -- Toggle tail sampling. + enabled: true + # -- Wait time since the first span of a trace before making a sampling decision. + decisionWait: 15s + successfulRequests: + # -- Toggle sampling successful requests. + sample: true + # -- Percentage of successful requests to sample. + percentage: 10 + failedRequests: + # -- Toggle sampling failed requests. + sample: false + # -- Percentage of failed requests to sample. + percentage: 50 + # -- User-defined policies in river format. + # @default -- A policy to sample long requests is added by default. + extraPolicies: |- + policy { + name = "sample-long-requests" + type = "and" + and { + and_sub_policy { + name = "latency" + type = "latency" + latency { + threshold_ms = 5000 + } + } + and_sub_policy { + name = "probabilistic-policy" + type = "probabilistic" + probabilistic { + sampling_percentage = 50 + } + } + } + } + +# @ignored Ignore agent deployment +grafana-agent-deployment: + # -- Do not change this. + nameOverride: deployment + controller: + type: deployment + replicas: 1 + autoscaling: + # -- Creates a HorizontalPodAutoscaler for controller type deployment. + enabled: false + # -- The lower limit for the number of replicas to which the autoscaler can scale down. + minReplicas: 2 + # -- The upper limit for the number of replicas to which the autoscaler can scale up. + maxReplicas: 5 + # -- Average CPU utilization across all relevant pods, a percentage of the requested value of the resource for the pods. 
Setting `targetCPUUtilizationPercentage` to 0 will disable CPU scaling. + targetCPUUtilizationPercentage: 0 + # -- Average Memory utilization across all relevant pods, a percentage of the requested value of the resource for the pods. Setting `targetMemoryUtilizationPercentage` to 0 will disable Memory scaling. + targetMemoryUtilizationPercentage: 80 + agent: + # This chart creates the configmaps + configMap: + create: false + resources: + requests: + cpu: "1" + memory: "2G" + extraPorts: + - name: otlp-grpc + port: 4317 + targetPort: 4317 + protocol: TCP + - name: otlp-http + port: 4318 + targetPort: 4318 + protocol: TCP + +# @ignored Ignore agent statefulset +grafana-agent-statefulset: + # -- Do not change this. + nameOverride: statefulset + controller: + type: statefulset + replicas: 1 + autoscaling: + # -- Creates a HorizontalPodAutoscaler for controller type deployment. + enabled: false + # -- The lower limit for the number of replicas to which the autoscaler can scale down. + minReplicas: 2 + # -- The upper limit for the number of replicas to which the autoscaler can scale up. + maxReplicas: 5 + # -- Average CPU utilization across all relevant pods, a percentage of the requested value of the resource for the pods. Setting `targetCPUUtilizationPercentage` to 0 will disable CPU scaling. + targetCPUUtilizationPercentage: 0 + # -- Average Memory utilization across all relevant pods, a percentage of the requested value of the resource for the pods. Setting `targetMemoryUtilizationPercentage` to 0 will disable Memory scaling. 
+ targetMemoryUtilizationPercentage: 80 + service: + clusterIP: None + agent: + extraEnv: + - name: GRAFANA_CLOUD_API_KEY + value: + - name: GRAFANA_CLOUD_PROMETHEUS_URL + value: + - name: GRAFANA_CLOUD_PROMETHEUS_USERNAME + value: + - name: GRAFANA_CLOUD_TEMPO_ENDPOINT + value: + - name: GRAFANA_CLOUD_TEMPO_USERNAME + value: + # This is required for adaptive metric deduplication in Grafana Cloud + - name: POD_UID + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.uid + # This chart creates the configmaps + configMap: + create: false + resources: + requests: + cpu: "1" + memory: "2G" + extraPorts: + - name: otlp-grpc + port: 4317 + targetPort: 4317 + protocol: TCP + # The statefulset and deployment can share the same serviceAccount and rbac roles + serviceAccount: + create: false + rbac: + create: false diff --git a/charts/k8s-monitoring/README.md b/charts/k8s-monitoring/README.md new file mode 100644 index 0000000000..cb8bfad19c --- /dev/null +++ b/charts/k8s-monitoring/README.md @@ -0,0 +1,8 @@ +# Kubernetes Monitoring chart + +The source for the Kubernetes Monitoring Helm chart can be found at +[grafana/k8s-monitoring-helm](https://github.com/grafana/k8s-monitoring-helm). +Releases of the chart are still published to the [Grafana Helm charts](https://grafana.github.io/helm-charts) +repository. + +If you have any issues with this chart, please file them on the [Kubernetes Monitoring Helm chart](https://github.com/grafana/k8s-monitoring-helm) repository. 
diff --git a/charts/loki-distributed/Chart.yaml b/charts/loki-distributed/Chart.yaml index 4535797966..efdd9ead60 100644 --- a/charts/loki-distributed/Chart.yaml +++ b/charts/loki-distributed/Chart.yaml @@ -2,8 +2,8 @@ apiVersion: v2 name: loki-distributed description: Helm chart for Grafana Loki in microservices mode type: application -appVersion: 2.9.4 -version: 0.78.3 +appVersion: 2.9.6 +version: 0.78.5 home: https://grafana.github.io/helm-charts sources: - https://github.com/grafana/loki diff --git a/charts/loki-distributed/README.md b/charts/loki-distributed/README.md index d9fbdd1d1e..c84763a8ff 100644 --- a/charts/loki-distributed/README.md +++ b/charts/loki-distributed/README.md @@ -1,6 +1,6 @@ # loki-distributed -![Version: 0.78.3](https://img.shields.io/badge/Version-0.78.3-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 2.9.4](https://img.shields.io/badge/AppVersion-2.9.4-informational?style=flat-square) +![Version: 0.78.5](https://img.shields.io/badge/Version-0.78.5-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 2.9.6](https://img.shields.io/badge/AppVersion-2.9.6-informational?style=flat-square) Helm chart for Grafana Loki in microservices mode diff --git a/charts/tempo-distributed/Chart.yaml b/charts/tempo-distributed/Chart.yaml index 5f9ec77378..ebf70e588a 100644 --- a/charts/tempo-distributed/Chart.yaml +++ b/charts/tempo-distributed/Chart.yaml @@ -2,8 +2,8 @@ apiVersion: v2 name: tempo-distributed description: Grafana Tempo in MicroService mode type: application -version: 1.8.5 -appVersion: 2.3.1 +version: 1.9.1 +appVersion: 2.4.1 engine: gotpl home: https://grafana.com/docs/tempo/latest/ icon: https://raw.githubusercontent.com/grafana/tempo/master/docs/tempo/website/logo_and_name.png diff --git a/charts/tempo-distributed/README.md 
b/charts/tempo-distributed/README.md index de831de8bb..02f86079f6 100644 --- a/charts/tempo-distributed/README.md +++ b/charts/tempo-distributed/README.md @@ -1,6 +1,6 @@ # tempo-distributed -![Version: 1.8.5](https://img.shields.io/badge/Version-1.8.5-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 2.3.1](https://img.shields.io/badge/AppVersion-2.3.1-informational?style=flat-square) +![Version: 1.9.1](https://img.shields.io/badge/Version-1.9.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 2.4.1](https://img.shields.io/badge/AppVersion-2.4.1-informational?style=flat-square) Grafana Tempo in MicroService mode @@ -244,6 +244,13 @@ The memcached default args are removed and should be provided manually. The sett | adminApi.terminationGracePeriodSeconds | int | `60` | | | adminApi.tolerations | list | `[]` | | | adminApi.topologySpreadConstraints | string | Defaults to allow skew no more then 1 node per AZ | topologySpread for admin-api pods. Passed through `tpl` and, thus, to be configured as string | +| cache.caches[0].memcached.consistent_hash | bool | `true` | | +| cache.caches[0].memcached.host | string | `"{{ include \"tempo.fullname\" . 
}}-memcached"` | | +| cache.caches[0].memcached.service | string | `"memcached-client"` | | +| cache.caches[0].memcached.timeout | string | `"500ms"` | | +| cache.caches[0].roles[0] | string | `"parquet-footer"` | | +| cache.caches[0].roles[1] | string | `"bloom"` | | +| cache.caches[0].roles[2] | string | `"frontend-search"` | | | compactor.config.compaction.block_retention | string | `"48h"` | Duration to keep blocks | | compactor.config.compaction.compacted_block_retention | string | `"1h"` | | | compactor.config.compaction.compaction_cycle | string | `"30s"` | The time between compaction cycles | @@ -319,7 +326,7 @@ The memcached default args are removed and should be provided manually. The sett | distributor.topologySpreadConstraints | string | Defaults to allow skew no more then 1 node per AZ | topologySpread for distributor pods. Passed through `tpl` and, thus, to be configured as string | | enterprise.enabled | bool | `false` | | | enterprise.image.repository | string | `"grafana/enterprise-traces"` | Grafana Enterprise Metrics container image repository. Note: for Grafana Tempo use the value 'image.repository' | -| enterprise.image.tag | string | `"v2.3.2"` | Grafana Enterprise Metrics container image tag. Note: for Grafana Tempo use the value 'image.tag' | +| enterprise.image.tag | string | `"v2.4.0"` | Grafana Enterprise Metrics container image tag. Note: for Grafana Tempo use the value 'image.tag' | | enterpriseFederationFrontend.affinity | string | Hard node and soft zone anti-affinity | Affinity for federation-frontend pods. 
Passed through `tpl` and, thus, to be configured as string | | enterpriseFederationFrontend.autoscaling.enabled | bool | `false` | Enable autoscaling for the federation-frontend | | enterpriseFederationFrontend.autoscaling.maxReplicas | int | `3` | Maximum autoscaling replicas for the federation-frontend | diff --git a/charts/tempo-distributed/values.yaml b/charts/tempo-distributed/values.yaml index f4a66698ed..7a76a25082 100644 --- a/charts/tempo-distributed/values.yaml +++ b/charts/tempo-distributed/values.yaml @@ -1263,6 +1263,8 @@ config: | grpc_server_max_send_msg_size: {{ .Values.server.grpc_server_max_send_msg_size }} http_server_read_timeout: {{ .Values.server.http_server_read_timeout }} http_server_write_timeout: {{ .Values.server.http_server_write_timeout }} + cache: + {{- toYaml .Values.cache | nindent 2}} storage: trace: {{- if .Values.storage.trace.block.version }} @@ -1294,14 +1296,6 @@ config: | path: /var/tempo/traces wal: path: /var/tempo/wal - {{- if .Values.memcached.enabled }} - cache: memcached - memcached: - consistent_hash: true - host: {{ include "tempo.fullname" . }}-memcached - service: memcached-client - timeout: 500ms - {{- end }} # Set Tempo server configuration # Refers to https://grafana.com/docs/tempo/latest/configuration/#server @@ -1321,6 +1315,21 @@ server: # -- Write timeout for HTTP server http_server_write_timeout: 30s +# Use this block to configure caches available throughout the application. +# Multiple caches can be created and assigned roles which determine how they are used by Tempo. +# https://grafana.com/docs/tempo/latest/configuration/#cache +cache: + caches: + - memcached: + host: '{{ include "tempo.fullname" . 
}}-memcached' + service: memcached-client + consistent_hash: true + timeout: 500ms + roles: + - parquet-footer + - bloom + - frontend-search + # To configure a different storage backend instead of local storage: # storage: # trace: @@ -1866,7 +1875,7 @@ enterprise: # -- Grafana Enterprise Metrics container image repository. Note: for Grafana Tempo use the value 'image.repository' repository: grafana/enterprise-traces # -- Grafana Enterprise Metrics container image tag. Note: for Grafana Tempo use the value 'image.tag' - tag: v2.3.2 + tag: v2.4.0 # Note: pullPolicy and optional pullSecrets are set in toplevel 'image' section, not here # In order to use Grafana Enterprise Traces features, you will need to provide the contents of your Grafana Enterprise Traces