Skip to content

Commit

Permalink
Updating Chart to 1.4.0 and adding support for DCGM exporter for Nvid… (
Browse files Browse the repository at this point in the history
  • Loading branch information
mitali-salvi authored Mar 13, 2024
1 parent c884492 commit fc5d13d
Show file tree
Hide file tree
Showing 15 changed files with 358 additions and 46 deletions.
2 changes: 1 addition & 1 deletion charts/amazon-cloudwatch-observability/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
apiVersion: v2
name: amazon-cloudwatch-observability
version: 1.2.1
version: 1.4.0
appVersion: 1.0.0
description: A Helm chart for Amazon CloudWatch Observability
type: application
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.12.0
labels:
app.kubernetes.io/name: amazon-cloudwatch-agent-operator
name: amazoncloudwatchagents.cloudwatch.aws.amazon.com
spec:
group: cloudwatch.aws.amazon.com
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.12.0
labels:
app.kubernetes.io/name: amazon-cloudwatch-agent-operator
name: instrumentations.cloudwatch.aws.amazon.com
spec:
group: cloudwatch.aws.amazon.com
Expand Down
90 changes: 65 additions & 25 deletions charts/amazon-cloudwatch-observability/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -5,35 +5,46 @@ Expand the name of the chart.
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Return a 3-character random alphanumeric suffix for this release.
The suffix is generated once with randAlphaNum and cached in the
.Release.randomiser dict under the key "<release name>_randomiser", so repeated
includes that share the same .Release map return the same string.
NOTE(review): the cache lives on .Release only, so the suffix presumably
changes on every new render (e.g. helm upgrade) - confirm this is intended.
*/}}
{{- define "generate_static_name_randomiser" -}}
{{- if not (index .Release "randomiser") -}}
{{- $_ := set .Release "randomiser" dict -}}
{{- end -}}
{{- $key := printf "%s_%s" .Release.Name "randomiser" -}}
{{- if not (index .Release.randomiser $key) -}}
{{- $_ := set .Release.randomiser $key (randAlphaNum 3) -}}
{{- end -}}
{{- index .Release.randomiser $key -}}
{{- end -}}

{{/*
Name of k8s cluster
*/}}
{{- define "kubernetes-cluster.name" -}}
{{- if empty .Values.clusterName }}
{{- default "" (printf "k8s-cluster-%s" (sha256sum .Release.Name | trunc 7)) }}
{{- else }}
{{- default "" .Values.clusterName }}
{{- end }}
{{- default (printf "k8s-cluster-%s-%s" (sha256sum .Release.Name | trunc 7) (include "generate_static_name_randomiser" .)) .Values.clusterName }}
{{- end }}

{{/*
Helper function to modify cloudwatch-agent config
*/}}
{{- define "cloudwatch-agent.config-modifier" -}}
{{- $configCopy := deepCopy .Values.agent.config }}
{{- $configCopy := deepCopy .Config }}

{{- $agent := pluck "agent" $configCopy | first }}
{{- if and (empty $agent) (empty $agent.region) }}
{{- $agent := set $agent "region" .Values.region }}
{{- $agentRegion := dict "region" .Values.region }}
{{- $agent := set $configCopy "agent" $agentRegion }}
{{- end }}

{{- $appSignals := pluck "app_signals" $configCopy.logs.metrics_collected | first }}
{{- if empty $appSignals.hosted_in }}
{{- if and (hasKey $configCopy.logs.metrics_collected "app_signals") (empty $appSignals.hosted_in) }}
{{- $appSignals := set $appSignals "hosted_in" (include "kubernetes-cluster.name" .) }}
{{- end }}

{{- $containerInsights := pluck "kubernetes" $configCopy.logs.metrics_collected | first }}
{{- if empty $containerInsights.cluster_name }}
{{- if and (hasKey $configCopy.logs.metrics_collected "kubernetes") (empty $containerInsights.cluster_name) }}
{{- $containerInsights := set $containerInsights "cluster_name" (include "kubernetes-cluster.name" .) }}
{{- end }}

Expand All @@ -43,33 +54,26 @@ Helper function to modify cloudwatch-agent config
{{/*
Helper function to modify customer supplied agent config if ContainerInsights or ApplicationSignals is enabled
*/}}
{{- define "cloudwatch-agent.supplied-config" -}}
{{- if or (hasKey .Values.agent.config.logs "app_signals") (and (hasKey .Values.agent.config.logs "metrics_collected") (hasKey .Values.agent.config.logs.metrics_collected "kubernetes")) }}
{{- define "cloudwatch-agent.modify-config" -}}
{{- if and (hasKey .Config "logs") (or (and (hasKey .Config.logs "metrics_collected") (hasKey .Config.logs.metrics_collected "app_signals")) (and (hasKey .Config.logs "metrics_collected") (hasKey .Config.logs.metrics_collected "kubernetes"))) }}
{{- include "cloudwatch-agent.config-modifier" . }}
{{- else }}
{{- default "" .Values.agent.config | toJson | quote }}
{{- default "" .Config | toJson | quote }}
{{- end }}
{{- end }}

{{/*
Helper function to modify default agent config
Name for cloudwatch-agent
*/}}
{{- define "cloudwatch-agent.modify-default-config" -}}
{{- $configCopy := deepCopy .Values.agent.defaultConfig }}
{{- $agentRegion := dict "region" .Values.region }}
{{- $agent := set $configCopy "agent" $agentRegion }}
{{- $appSignals := pluck "app_signals" $configCopy.logs.metrics_collected | first }}
{{- $appSignals := set $appSignals "hosted_in" (include "kubernetes-cluster.name" .) }}
{{- $containerInsights := pluck "kubernetes" $configCopy.logs.metrics_collected | first }}
{{- $containerInsights := set $containerInsights "cluster_name" (include "kubernetes-cluster.name" .) }}
{{- default "" $configCopy | toJson | quote }}
{{- define "cloudwatch-agent.name" -}}
{{- default "cloudwatch-agent" .Values.agent.name }}
{{- end }}

{{/*
Name for cloudwatch-agent
Name for dcgm-exporter
*/}}
{{- define "cloudwatch-agent.name" -}}
{{- default "cloudwatch-agent" .Values.agent.name }}
{{- define "dcgm-exporter.name" -}}
{{- default "dcgm-exporter" .Values.dcgmExporter.name }}
{{- end }}

{{/*
Expand All @@ -88,6 +92,7 @@ Get the current recommended cloudwatch agent image for a region
Get the current recommended cloudwatch agent operator image for a region
*/}}
{{- define "cloudwatch-agent-operator.image" -}}
{{- $region := .Values.region | required ".Values.region is required." -}}
{{- $imageDomain := "" -}}
{{- $imageDomain = index .Values.manager.image.repositoryDomainMap .Values.region -}}
{{- if not $imageDomain -}}
Expand All @@ -100,6 +105,7 @@ Get the current recommended cloudwatch agent operator image for a region
Get the current recommended fluent-bit image for a region
*/}}
{{- define "fluent-bit.image" -}}
{{- $region := .Values.region | required ".Values.region is required." -}}
{{- $imageDomain := "" -}}
{{- $imageDomain = index .Values.containerLogs.fluentBit.image.repositoryDomainMap .Values.region -}}
{{- if not $imageDomain -}}
Expand All @@ -108,6 +114,33 @@ Get the current recommended fluent-bit image for a region
{{- printf "%s/%s:%s" $imageDomain .Values.containerLogs.fluentBit.image.repository .Values.containerLogs.fluentBit.image.tag -}}
{{- end -}}

{{/*
Get the current recommended dcgm-exporter image for a region.
Rendering fails with ".Values.region is required." when .Values.region is unset.
The registry domain is looked up in .Values.dcgmExporter.image.repositoryDomainMap
by region and falls back to the map's "public" entry when the region has no
(or an empty) entry. Output format: <domain>/<repository>:<tag>.
*/}}
{{- define "dcgm-exporter.image" -}}
{{- $region := .Values.region | required ".Values.region is required." -}}
{{- $image := .Values.dcgmExporter.image -}}
{{- /* Use the validated $region for the lookup instead of re-reading .Values.region. */ -}}
{{- $imageDomain := index $image.repositoryDomainMap $region -}}
{{- if not $imageDomain -}}
{{- $imageDomain = $image.repositoryDomainMap.public -}}
{{- end -}}
{{- printf "%s/%s:%s" $imageDomain $image.repository $image.tag -}}
{{- end -}}

{{/*
Build the auto instrumentation java image reference as
<repositoryDomain>/<repository>:<tag>, read from
.Values.manager.autoInstrumentationImage.java
*/}}
{{- define "auto-instrumentation-java.image" -}}
{{- $java := .Values.manager.autoInstrumentationImage.java -}}
{{- printf "%s/%s:%s" $java.repositoryDomain $java.repository $java.tag -}}
{{- end -}}

{{/*
Build the auto instrumentation python image reference as
<repositoryDomain>/<repository>:<tag>, read from
.Values.manager.autoInstrumentationImage.python
*/}}
{{- define "auto-instrumentation-python.image" -}}
{{- $python := .Values.manager.autoInstrumentationImage.python -}}
{{- printf "%s/%s:%s" $python.repositoryDomain $python.repository $python.tag -}}
{{- end -}}

{{/*
Common labels
*/}}
Expand Down Expand Up @@ -149,6 +182,13 @@ Create the name of the service account to use
{{- end }}
{{- end }}

{{/*
Service account name used by the dcgm exporter:
.Values.dcgmExporter.serviceAccount.name when set, otherwise
"dcgm-exporter-service-acct".
*/}}
{{- define "dcgm-exporter.serviceAccountName" -}}
{{- .Values.dcgmExporter.serviceAccount.name | default "dcgm-exporter-service-acct" }}
{{- end }}

{{- define "amazon-cloudwatch-observability.podAnnotations" -}}
{{- if .Values.manager.podAnnotations }}
{{- .Values.manager.podAnnotations | toYaml }}
Expand Down
43 changes: 43 additions & 0 deletions charts/amazon-cloudwatch-observability/templates/certmanager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,46 @@ spec:
selfSigned: { }
{{- end }}
{{- end }}
{{- /*
cert-manager resources for the agent TLS material, rendered only when
.Values.agent.certManager.enabled is set:
  - a Certificate for the dcgm-exporter service DNS names, stored in the
    "amazon-cloudwatch-observability-agent-cert" secret;
  - a self-signed "agent-ca" Issuer, unless .Values.agent.certManager.issuerRef
    supplies an existing issuer, in which case the Certificate references that
    issuer instead;
  - a Secret object with the same name as the Certificate's secretName.
The DNS names are derived from the dcgm-exporter name helper and the release
namespace so they stay correct when either is overridden.
The opening "if" deliberately has no right-hand trim marker so that the "---"
separator starts on its own line after any preceding document.
NOTE(review): the empty Secret shares its name with the Certificate's
secretName - confirm it does not clobber issued data on upgrade.
*/ -}}
{{- if .Values.agent.certManager.enabled }}
---
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  labels:
  {{- include "amazon-cloudwatch-observability.labels" . | nindent 4}}
  name: "amazon-cloudwatch-observability-agent-cert"
  namespace: {{ .Release.Namespace }}
spec:
  dnsNames:
    - {{ printf "%s-service" (include "dcgm-exporter.name" .) | quote }}
    - {{ printf "%s-service.%s.svc" (include "dcgm-exporter.name" .) .Release.Namespace | quote }}
  issuerRef:
    {{- if .Values.agent.certManager.issuerRef }}
    {{- toYaml .Values.agent.certManager.issuerRef | nindent 4 }}
    {{- else }}
    kind: Issuer
    name: "agent-ca"
    {{- end }}
  secretName: "amazon-cloudwatch-observability-agent-cert"
{{- if not .Values.agent.certManager.issuerRef }}
---
apiVersion: cert-manager.io/v1
kind: Issuer
metadata:
  {{- if .Values.agent.certManager.issuerAnnotations }}
  annotations:
  {{- toYaml .Values.agent.certManager.issuerAnnotations | nindent 4 }}
  {{- end }}
  labels:
  {{- include "amazon-cloudwatch-observability.labels" . | nindent 4}}
  name: "agent-ca"
  namespace: {{ .Release.Namespace }}
spec:
  selfSigned: { }
{{- end }}
---
apiVersion: v1
kind: Secret
metadata:
  labels:
  {{- include "amazon-cloudwatch-observability.labels" . | nindent 4}}
  name: "amazon-cloudwatch-observability-agent-cert"
  namespace: {{ .Release.Namespace }}
{{- end }}

Original file line number Diff line number Diff line change
@@ -1,7 +1,23 @@
{{- if .Values.agent.enabled }}
{{- if empty .Values.region }}
{{- fail "region is a required field" }}
{{- end }}
{{- /*
Self-generated TLS material for the agent, used when
.Values.agent.autoGenerateCert.enabled is set and cert-manager is not enabled
for the agent. A CA ("agent-ca") and a certificate signed by it are generated
at render time for the dcgm-exporter service DNS names and written to the
"amazon-cloudwatch-observability-agent-cert" Secret.
Both the CA and the signed certificate take their validity from
.Values.agent.autoGenerateCert.expiryDays, so the two lifetimes match and the
agent certificate no longer depends on the admission webhook settings.
*/ -}}
{{- if and (.Values.agent.autoGenerateCert.enabled) (not .Values.agent.certManager.enabled) -}}
{{- $altNames := list ( printf "%s-service" (include "dcgm-exporter.name" .) ) ( printf "%s-service.%s.svc" (include "dcgm-exporter.name" .) .Release.Namespace ) -}}
{{- $expiryDays := .Values.agent.autoGenerateCert.expiryDays | int -}}
{{- $ca := genCA ("agent-ca") $expiryDays -}}
{{- $cert := genSignedCert ("agent") nil $altNames $expiryDays $ca -}}
apiVersion: v1
kind: Secret
metadata:
  labels:
  {{- include "amazon-cloudwatch-observability.labels" . | nindent 4}}
  name: "amazon-cloudwatch-observability-agent-cert"
  namespace: {{ .Release.Namespace }}
data:
  ca.crt: {{ $ca.Cert | b64enc }}
  tls.crt: {{ $cert.Cert | b64enc }}
  tls.key: {{ $cert.Key | b64enc }}
---
{{- end -}}

{{- $region := .Values.region | required ".Values.region is required." -}}
apiVersion: cloudwatch.aws.amazon.com/v1alpha1
kind: AmazonCloudWatchAgent
metadata:
Expand All @@ -12,9 +28,9 @@ spec:
mode: daemonset
serviceAccount: {{ template "cloudwatch-agent.serviceAccountName" . }}
{{- if .Values.agent.config }}
config: {{ template "cloudwatch-agent.supplied-config" . }}
config: {{ include "cloudwatch-agent.modify-config" (merge (dict "Config" .Values.agent.config) . ) }}
{{- else }}
config: {{ template "cloudwatch-agent.modify-default-config" . }}
config: {{ include "cloudwatch-agent.modify-config" (merge (dict "Config" .Values.agent.defaultConfig) . ) }}
{{- end }}
resources:
requests:
Expand All @@ -41,6 +57,9 @@ spec:
- mountPath: /dev/disk
name: devdisk
readOnly: true
- mountPath: /etc/amazon-cloudwatch-observability-agent-cert
name: agenttls
readOnly: true
volumes:
- name: rootfs
hostPath:
Expand All @@ -60,6 +79,12 @@ spec:
- hostPath:
path: /dev/disk/
name: devdisk
- name: agenttls
secret:
secretName: amazon-cloudwatch-observability-agent-cert
items:
- key: ca.crt
path: tls-ca.crt
env:
- name: K8S_NODE_NAME
valueFrom:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{{- /*
ConfigMap for the dcgm-exporter, named by .Values.dcgmExporter.configmap.
  - dcp-metrics-included.csv: one "<DCGM field>, <type>, <help text>" row per
    metric.
  - web-config.yaml: TLS server settings pointing at server.crt / server.key
    under /etc/amazon-cloudwatch-observability-dcgm-cert.
*/ -}}
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ .Values.dcgmExporter.configmap }}
  namespace: {{ .Release.Namespace }}
data:
  dcp-metrics-included.csv: |
    DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %).
    DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %).
    DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB).
    DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB).
    DCGM_FI_DEV_FB_TOTAL, gauge, Framebuffer memory total (in MiB).
    DCGM_FI_DEV_FB_USED_PERCENT, gauge, Percentage used of Frame Buffer: Used/(Total - Reserved).
    DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C).
    DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C).
    DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W).
  web-config.yaml: |
    tls_server_config:
      cert_file: /etc/amazon-cloudwatch-observability-dcgm-cert/server.crt
      key_file: /etc/amazon-cloudwatch-observability-dcgm-cert/server.key
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
{{- /*
DaemonSet running the dcgm-exporter container.
  - Scheduling: required node affinity on label .Values.gpuNodeLabelKey with
    operator In and values .Values.gpuInstances.
  - Args: one entry per item of .Values.dcgmExporter.arguments; each item is
    quoted so it always renders as a YAML string.
  - Config: the .Values.dcgmExporter.configmap ConfigMap is mounted at
    /etc/dcgm-exporter/ and DCGM_EXPORTER_COLLECTORS points at its
    dcp-metrics-included.csv.
  - TLS: tls.crt / tls.key from the
    "amazon-cloudwatch-observability-agent-cert" secret are mounted read-only
    as server.crt / server.key under
    /etc/amazon-cloudwatch-observability-dcgm-cert.
*/ -}}
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: {{ include "dcgm-exporter.name" . }}
  namespace: {{ .Release.Namespace }}
  labels:
    k8s-app: {{ include "dcgm-exporter.name" . }}
    version: v1
spec:
  selector:
    matchLabels:
      k8s-app: {{ include "dcgm-exporter.name" . }}
  template:
    metadata:
      labels:
        k8s-app: {{ include "dcgm-exporter.name" . }}
        version: v1
    spec:
      serviceAccountName: {{ template "dcgm-exporter.serviceAccountName" . }}
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: {{ .Values.gpuNodeLabelKey }}
                operator: In
                values: {{ .Values.gpuInstances | toYaml | nindent 16 }}
      containers:
      - name: dcgm-exporter
        image: {{ template "dcgm-exporter.image" . }}
        args:
        {{- range $.Values.dcgmExporter.arguments }}
        - {{ . | quote }}
        {{- end }}
        resources:
          requests:
            cpu: 250m
            memory: 128Mi
          limits:
            cpu: 500m
            memory: 250Mi
        env:
        - name: "DCGM_EXPORTER_KUBERNETES"
          value: "true"
        - name: "DCGM_EXPORTER_LISTEN"
          value: "{{ .Values.dcgmExporter.service.address }}"
        - name: "DCGM_EXPORTER_COLLECTORS"
          value: "/etc/dcgm-exporter/dcp-metrics-included.csv"
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        ports:
        - name: "metrics"
          containerPort: {{ .Values.dcgmExporter.service.port }}
        volumeMounts:
        - name: "pod-gpu-resources"
          readOnly: true
          mountPath: "/var/lib/kubelet/pod-resources"
        - name: "dcgm-config"
          mountPath: /etc/dcgm-exporter/
        - mountPath: /etc/amazon-cloudwatch-observability-dcgm-cert
          name: dcgmtls
          readOnly: true
      volumes:
      - name: dcgmtls
        secret:
          secretName: amazon-cloudwatch-observability-agent-cert
          items:
          - key: tls.crt
            path: server.crt
          - key: tls.key
            path: server.key
      - name: "pod-gpu-resources"
        hostPath:
          path: /var/lib/kubelet/pod-resources
      - name: "dcgm-config"
        configMap:
          name: {{ .Values.dcgmExporter.configmap }}
Loading

0 comments on commit fc5d13d

Please sign in to comment.