From e0dc4745143ee810e418943fa3b557b0566371d8 Mon Sep 17 00:00:00 2001
From: EliseCastle23
Date: Wed, 25 Oct 2023 08:32:27 -1000
Subject: [PATCH] changing the folder name to be plural and added a new node monitor to test if there is a node stuck in the "notready" state.

---
 .../application.yaml                          |  0
 .../{node-monitor => node-monitors}/auth.yaml |  0
 .../cronjob.yaml                              |  0
 .../node-monitors/node-not-ready.yaml         | 50 +++++++++++++++++++
 4 files changed, 50 insertions(+)
 rename kube/services/{node-monitor => node-monitors}/application.yaml (100%)
 rename kube/services/{node-monitor => node-monitors}/auth.yaml (100%)
 rename kube/services/{node-monitor => node-monitors}/cronjob.yaml (100%)
 create mode 100644 kube/services/node-monitors/node-not-ready.yaml

diff --git a/kube/services/node-monitor/application.yaml b/kube/services/node-monitors/application.yaml
similarity index 100%
rename from kube/services/node-monitor/application.yaml
rename to kube/services/node-monitors/application.yaml
diff --git a/kube/services/node-monitor/auth.yaml b/kube/services/node-monitors/auth.yaml
similarity index 100%
rename from kube/services/node-monitor/auth.yaml
rename to kube/services/node-monitors/auth.yaml
diff --git a/kube/services/node-monitor/cronjob.yaml b/kube/services/node-monitors/cronjob.yaml
similarity index 100%
rename from kube/services/node-monitor/cronjob.yaml
rename to kube/services/node-monitors/cronjob.yaml
diff --git a/kube/services/node-monitors/node-not-ready.yaml b/kube/services/node-monitors/node-not-ready.yaml
new file mode 100644
index 000000000..0188369d0
--- /dev/null
+++ b/kube/services/node-monitors/node-not-ready.yaml
@@ -0,0 +1,50 @@
+# Every 5 minutes, list nodes whose Ready condition status is "Unknown" and
+# post their names to the Slack webhook stored in the "global" ConfigMap.
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: node-not-ready-cron
+  namespace: default
+spec:
+  schedule: "*/5 * * * *"
+  jobTemplate:
+    spec:
+      template:
+        metadata:
+          labels:
+            app: gen3job
+        spec:
+          serviceAccountName: node-monitor
+          containers:
+          - name: kubectl
+            image: quay.io/cdis/awshelper
+            env:
+              - name: SLACK_WEBHOOK_URL
+                valueFrom:
+                  configMapKeyRef:
+                    name: global
+                    key: slack_webhook
+
+            command: ["/bin/bash"]
+            args:
+              - "-c"
+              - |
+                #!/bin/sh
+
+                # Get nodes whose Ready condition status is "Unknown"; the
+                # messages below label these "NodeStatusNeverUpdated".
+                NODES=$(kubectl get nodes -o json | jq -r '.items[] | select(.status.conditions[] | select(.type == "Ready" and .status == "Unknown")) | .metadata.name')
+
+                if [ -n "$NODES" ]; then
+                  echo "Nodes reporting 'NodeStatusNeverUpdated', sending an alert:"
+                  echo "$NODES"
+                  # Build the JSON with jq so quotes are escaped and several
+                  # node names (one per line in NODES) still yield valid JSON.
+                  PAYLOAD=$(jq -n --arg nodes "$NODES" \
+                    '{text: ("WARNING: Node `" + ($nodes | split("\n") | join(", ")) + "` is stuck in \"NotReady\"!")}')
+                  # Send alert to Slack
+                  curl -X POST -H 'Content-type: application/json' --data "$PAYLOAD" "$SLACK_WEBHOOK_URL"
+                else
+                  echo "No nodes reporting 'NodeStatusNeverUpdated'"
+                fi
+          restartPolicy: OnFailure