Skip to content

Commit 842aa57

Browse files
authored
Merge pull request #10 from civo/fix-chart
fix chart
2 parents 241c9db + 208b90e commit 842aa57

11 files changed

+341
-64
lines changed

README.md

+54
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Node Agent
2+
3+
`node-agent` monitors the health of Kubernetes nodes and can automatically restart VM instances when necessary. It triggers a restart under the following conditions:
4+
5+
- A node enters the **NotReady** state.
6+
- The number of available GPUs per node falls below a configured threshold.
7+
8+
9+
## Set Your `civo-node-agent` Secret
10+
11+
```
12+
export CIVO_DESIRED_GPU_COUNT="12"
13+
export CIVO_NODE_POOL_ID="xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxxxxxx"
14+
export CIVO_API_KEY="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
15+
kubectl -n kube-system delete secret civo-node-agent --ignore-not-found
16+
kubectl -n kube-system create secret generic civo-node-agent
17+
kubectl -n kube-system patch secret civo-node-agent -n kube-system --type='merge' \
18+
-p='{"stringData": {"civo-api-key": "'"$CIVO_API_KEY"'", "node-pool-id": "'"$CIVO_NODE_POOL_ID"'", "desired-gpu-count": "'"$CIVO_DESIRED_GPU_COUNT"'"}}'
19+
```
20+
21+
## Install `node-agent` chart
22+
23+
```bash
24+
helm upgrade -n kube-system --install node-agent ./charts
25+
```
26+
27+
## Configuration Details
28+
29+
The following configurations are stored in the `civo-node-agent` secret in the `kube-system` namespace.
30+
31+
`node-pool-id`: The ID of the Kubernetes node pool that you want monitored. To collect this value, go to the [Civo Kubernetes dashboard](https://dashboard.civo.com/kubernetes), select your cluster, and click "copy" next to your pool ID.
32+
33+
`desired-gpu-count`: This value is intended to match the number of GPUs per node. For example, if you had a 2-node cluster with 8 GPUs in total, you would set this value to 4 to represent the number of GPUs per node.
34+
35+
`civo-api-key`: The Civo API key to use when automatically rebooting nodes. To collect this value, go to the [Civo settings security tab](https://dashboard.civo.com/security).
36+
37+
38+
## Temp details until CI is complete
39+
40+
To build the binary for amd64
41+
42+
`CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build .`
43+
44+
To build the docker image for amd64
45+
46+
`docker buildx build --platform linux/amd64 -t johndietz/node-agent:1.4 --push .`
47+
48+
To set the image and tag used by the chart, see the image section of the `values.yaml`
49+
50+
```
51+
image:
52+
repository: johndietz/node-agent
53+
tag: "1.4"
54+
```

charts/.helmignore

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Patterns to ignore when building packages.
2+
# This supports shell glob matching, relative path matching, and
3+
# negation (prefixed with !). Only one pattern per line.
4+
.DS_Store
5+
# Common VCS dirs
6+
.git/
7+
.gitignore
8+
.bzr/
9+
.bzrignore
10+
.hg/
11+
.hgignore
12+
.svn/
13+
# Common backup files
14+
*.swp
15+
*.bak
16+
*.tmp
17+
*.orig
18+
*~
19+
# Various IDEs
20+
.project
21+
.idea/
22+
*.tmproj
23+
.vscode/

charts/Chart.yaml

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
apiVersion: v2
2+
name: node-agent
3+
description: A Helm chart for Kubernetes
4+
5+
# A chart can be either an 'application' or a 'library' chart.
6+
#
7+
# Application charts are a collection of templates that can be packaged into versioned archives
8+
# to be deployed.
9+
#
10+
# Library charts provide useful utilities or functions for the chart developer. They're included as
11+
# a dependency of application charts to inject those utilities and functions into the rendering
12+
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
13+
type: application
14+
15+
# This is the chart version. This version number should be incremented each time you make changes
16+
# to the chart and its templates, including the app version.
17+
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18+
version: 0.1.0-rc.1
19+
20+
# This is the version number of the application being deployed. This version number should be
21+
# incremented each time you make changes to the application. Versions are not expected to
22+
# follow Semantic Versioning. They should reflect the version the application is using.
23+
# It is recommended to use it with quotes.
24+
appVersion: "v0.1.0"

charts/templates/_helpers.tpl

+62
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
{{/*
2+
Expand the name of the chart.
3+
*/}}
4+
{{- define "node-agent.name" -}}
5+
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
6+
{{- end }}
7+
8+
{{/*
9+
Create a default fully qualified app name.
10+
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
11+
If release name contains chart name it will be used as a full name.
12+
*/}}
13+
{{- define "node-agent.fullname" -}}
14+
{{- if .Values.fullnameOverride }}
15+
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
16+
{{- else }}
17+
{{- $name := default .Chart.Name .Values.nameOverride }}
18+
{{- if contains $name .Release.Name }}
19+
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
20+
{{- else }}
21+
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
22+
{{- end }}
23+
{{- end }}
24+
{{- end }}
25+
26+
{{/*
27+
Create chart name and version as used by the chart label.
28+
*/}}
29+
{{- define "node-agent.chart" -}}
30+
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
31+
{{- end }}
32+
33+
{{/*
34+
Common labels
35+
*/}}
36+
{{- define "node-agent.labels" -}}
37+
helm.sh/chart: {{ include "node-agent.chart" . }}
38+
{{ include "node-agent.selectorLabels" . }}
39+
{{- if .Chart.AppVersion }}
40+
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
41+
{{- end }}
42+
app.kubernetes.io/managed-by: {{ .Release.Service }}
43+
{{- end }}
44+
45+
{{/*
46+
Selector labels
47+
*/}}
48+
{{- define "node-agent.selectorLabels" -}}
49+
app.kubernetes.io/name: {{ include "node-agent.name" . }}
50+
app.kubernetes.io/instance: {{ .Release.Name }}
51+
{{- end }}
52+
53+
{{/*
54+
Create the name of the service account to use
55+
*/}}
56+
{{- define "node-agent.serviceAccountName" -}}
57+
{{- if .Values.serviceAccount.create }}
58+
{{- default (include "node-agent.fullname" .) .Values.serviceAccount.name }}
59+
{{- else }}
60+
{{- default "default" .Values.serviceAccount.name }}
61+
{{- end }}
62+
{{- end }}

charts/templates/deployment.yaml

+100
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
apiVersion: apps/v1
2+
kind: Deployment
3+
metadata:
4+
name: {{ .Chart.Name }}
5+
namespace: kube-system
6+
labels:
7+
{{- include "node-agent.labels" . | nindent 4 }}
8+
spec:
9+
{{- if not .Values.autoscaling.enabled }}
10+
replicas: {{ .Values.replicaCount }}
11+
{{- end }}
12+
selector:
13+
matchLabels:
14+
{{- include "node-agent.selectorLabels" . | nindent 6 }}
15+
template:
16+
metadata:
17+
{{- with .Values.podAnnotations }}
18+
annotations:
19+
{{- toYaml . | nindent 8 }}
20+
{{- end }}
21+
labels:
22+
{{- include "node-agent.labels" . | nindent 8 }}
23+
{{- with .Values.podLabels }}
24+
{{- toYaml . | nindent 8 }}
25+
{{- end }}
26+
spec:
27+
{{- with .Values.imagePullSecrets }}
28+
imagePullSecrets:
29+
{{- toYaml . | nindent 8 }}
30+
{{- end }}
31+
serviceAccountName: {{ include "node-agent.serviceAccountName" . }}
32+
{{- with .Values.podSecurityContext }}
33+
securityContext:
34+
{{- toYaml . | nindent 8 }}
35+
{{- end }}
36+
containers:
37+
- name: {{ .Chart.Name }}
38+
env:
39+
- name: CIVO_API_KEY
40+
valueFrom:
41+
secretKeyRef:
42+
name: civo-node-agent
43+
key: civo-api-key
44+
- name: CIVO_API_URL
45+
valueFrom:
46+
secretKeyRef:
47+
name: civo-api-access
48+
key: api-url
49+
- name: CIVO_CLUSTER_ID
50+
valueFrom:
51+
secretKeyRef:
52+
name: civo-api-access
53+
key: cluster-id
54+
- name: CIVO_REGION
55+
valueFrom:
56+
secretKeyRef:
57+
name: civo-api-access
58+
key: region
59+
- name: CIVO_NODE_POOL_ID
60+
valueFrom:
61+
secretKeyRef:
62+
name: civo-node-agent
63+
key: node-pool-id
64+
- name: CIVO_NODE_DESIRED_GPU_COUNT
65+
valueFrom:
66+
secretKeyRef:
67+
name: civo-node-agent
68+
key: desired-gpu-count
69+
{{- with .Values.securityContext }}
70+
securityContext:
71+
{{- toYaml . | nindent 12 }}
72+
{{- end }}
73+
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
74+
imagePullPolicy: {{ .Values.image.pullPolicy }}
75+
{{- with .Values.resources }}
76+
resources:
77+
{{- toYaml . | nindent 12 }}
78+
{{- end }}
79+
{{- with .Values.volumeMounts }}
80+
volumeMounts:
81+
{{- toYaml . | nindent 12 }}
82+
{{- end }}
83+
{{- with .Values.volumes }}
84+
volumes:
85+
{{- toYaml . | nindent 8 }}
86+
{{- end }}
87+
{{- with .Values.nodeSelector }}
88+
nodeSelector:
89+
{{- toYaml . | nindent 8 }}
90+
{{- end }}
91+
{{- with .Values.affinity }}
92+
affinity:
93+
{{- toYaml . | nindent 8 }}
94+
{{- end }}
95+
{{- with .Values.tolerations }}
96+
tolerations:
97+
{{- toYaml . | nindent 8 }}
98+
{{- end }}
99+
100+
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,20 @@
11
apiVersion: rbac.authorization.k8s.io/v1
22
kind: ClusterRole
33
metadata:
4-
name: node-agent
4+
name: {{ .Chart.Name }}
55
rules:
66
- apiGroups: [""]
77
resources: ["nodes"]
88
verbs: ["get", "list", "watch"]
9-
10-
119
---
1210
apiVersion: rbac.authorization.k8s.io/v1
1311
kind: ClusterRoleBinding
1412
metadata:
15-
name: node-agent
13+
name: {{ .Chart.Name }}
1614
subjects:
1715
- kind: ServiceAccount
18-
name: default
19-
namespace: default
16+
name: {{ .Chart.Name }}
17+
namespace: kube-system
2018
roleRef:
2119
kind: ClusterRole
22-
name: node-agent
20+
name: {{ .Chart.Name }}

charts/templates/serviceaccount.yaml

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
{{- if .Values.serviceAccount.create -}}
2+
apiVersion: v1
3+
kind: ServiceAccount
4+
metadata:
5+
name: {{ .Chart.Name }}
6+
namespace: kube-system
7+
labels:
8+
{{- include "node-agent.labels" . | nindent 4 }}
9+
{{- with .Values.serviceAccount.annotations }}
10+
annotations:
11+
{{- toYaml . | nindent 4 }}
12+
{{- end }}
13+
automountServiceAccountToken: {{ .Values.serviceAccount.automount }}
14+
{{- end }}

charts/values.yaml

+48
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
replicaCount: 1
2+
3+
4+
image:
5+
repository: johndietz/node-agent
6+
pullPolicy: IfNotPresent
7+
tag: "1.4"
8+
9+
imagePullSecrets: []
10+
nameOverride: ""
11+
fullnameOverride: ""
12+
13+
serviceAccount:
14+
create: true
15+
automount: true
16+
annotations: {}
17+
name: ""
18+
19+
podAnnotations: {}
20+
podLabels: {}
21+
podSecurityContext: {}
22+
securityContext: {}
23+
resources: {}
24+
25+
26+
autoscaling:
27+
enabled: false
28+
minReplicas: 1
29+
maxReplicas: 100
30+
targetCPUUtilizationPercentage: 80
31+
32+
volumes: []
33+
# - name: foo
34+
# secret:
35+
# secretName: mysecret
36+
# optional: false
37+
38+
# Additional volumeMounts on the output Deployment definition.
39+
volumeMounts: []
40+
# - name: foo
41+
# mountPath: "/etc/foo"
42+
# readOnly: true
43+
44+
nodeSelector: {}
45+
46+
tolerations: []
47+
48+
affinity: {}

pkg/watcher/watcher.go

+10-6
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,6 @@ func NewWatcher(ctx context.Context, apiURL, apiKey, region, clusterID, nodePool
6666
if err != nil {
6767
return nil, fmt.Errorf("CIVO_NODE_DESIRED_GPU_COUNT has an invalid value, %s: %w", nodeDesiredGPUCount, err)
6868
}
69-
if n < 1 {
70-
return nil, fmt.Errorf("CIVO_NODE_DESIRED_GPU_COUNT must be at least 1: %s", nodeDesiredGPUCount)
71-
}
7269

7370
w.nodeDesiredGPUCount = n
7471
w.nodeSelector = &metav1.LabelSelector{
@@ -122,7 +119,7 @@ func (w *watcher) setupCivoClient() error {
122119

123120
client, err := civogo.NewClientWithURL(w.apiKey, w.apiURL, w.region)
124121
if err != nil {
125-
return fmt.Errorf("failed to intiliase civo client: %w", err)
122+
return fmt.Errorf("failed to initialise civo client: %w", err)
126123
}
127124

128125
userAgent := &civogo.Component{
@@ -183,10 +180,17 @@ func isNodeReady(node *corev1.Node) bool {
183180
}
184181

185182
func isNodeDesiredGPU(node *corev1.Node, desired int) bool {
186-
quantity := node.Status.Allocatable[gpuResourceName]
187-
if quantity.IsZero() {
183+
if desired == 0 {
184+
slog.Info("desired gpu count is set to 0", "node", node.GetName())
185+
return true
186+
}
187+
188+
quantity, exists := node.Status.Allocatable[gpuResourceName]
189+
if !exists || quantity.IsZero() {
190+
slog.Info("read allocatable gpus", "node", node.GetName(), "count", quantity.String())
188191
return false
189192
}
193+
190194
gpuCount, ok := quantity.AsInt64()
191195
if !ok {
192196
return false

0 commit comments

Comments
 (0)