Skip to content

Commit

Permalink
Let's see if this works
Browse files Browse the repository at this point in the history
  • Loading branch information
AidanHilt committed Apr 30, 2024
1 parent 40be00d commit 67a10c9
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 0 deletions.
22 changes: 22 additions & 0 deletions kube/services/workflow-age-monitor/application.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Argo CD Application that deploys the manifests in
# kube/services/workflow-age-monitor/ (everything except this file itself)
# into the in-cluster `default` namespace.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  # Application names must be unique within the argocd namespace, so the
  # name is derived from this service's own directory.
  name: workflow-age-monitor-application
  namespace: argocd
spec:
  destination:
    namespace: default
    server: https://kubernetes.default.svc
  project: default
  source:
    repoURL: https://github.com/uc-cdis/cloud-automation.git
    targetRevision: master
    # Directory containing this Application and the resources it manages.
    path: kube/services/workflow-age-monitor/
    directory:
      # Do not let the Application manage its own definition.
      exclude: "application.yaml"
  syncPolicy:
    automated:
      # Delete resources that were removed from git and revert manual drift.
      prune: true
      selfHeal: true
    syncOptions:
      - CreateNamespace=true
55 changes: 55 additions & 0 deletions kube/services/workflow-age-monitor/argo-workflow-age.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# CronJob that checks the age of Argo workflows every five minutes and posts
# a Slack warning for each one that is still running after THRESHOLD_TIME
# seconds.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: argo-workflow-age
  namespace: default
spec:
  schedule: "*/5 * * * *"
  jobTemplate:
    spec:
      template:
        metadata:
          labels:
            app: gen3job
        spec:
          # Must match the ServiceAccount defined in auth.yaml; it is the
          # identity kubectl uses to list workflows across namespaces.
          serviceAccountName: argo-workflow-monitor
          containers:
            - name: kubectl
              image: quay.io/cdis/awshelper
              env:
                # This is 3 * 3600, or 3 hours
                - name: THRESHOLD_TIME
                  value: "10800"
                # Slack incoming-webhook URL, read from the `global` ConfigMap.
                - name: SLACK_WEBHOOK_URL
                  valueFrom:
                    configMapKeyRef:
                      name: global
                      key: slack_webhook
              command: ["/bin/bash"]
              args:
                - "-c"
                - |
                  #!/bin/bash
                  # Current Unix Epoch time; computed once for the whole run
                  CURRENT_EPOCH=$(date +%s)
                  # List workflows in every namespace, keep only those whose
                  # phase is still Running, and check the age of each one.
                  kubectl get workflows --all-namespaces -o json \
                    | jq -c '.items[] | select(.status.phase == "Running") | {name: .metadata.name, creationTimestamp: .metadata.creationTimestamp}' \
                    | while read -r workflow_info; do
                      WORKFLOW_NAME=$(echo "$workflow_info" | jq -r '.name')
                      CREATION_TIMESTAMP=$(echo "$workflow_info" | jq -r '.creationTimestamp')
                      # Convert creation timestamp to Unix Epoch time
                      CREATION_EPOCH=$(date -d "$CREATION_TIMESTAMP" +%s)
                      # Calculate workflow age in seconds
                      WORKFLOW_AGE=$((CURRENT_EPOCH - CREATION_EPOCH))
                      # Check if workflow age is greater than threshold
                      if [ "$WORKFLOW_AGE" -gt "$THRESHOLD_TIME" ]; then
                        echo "Workflow $WORKFLOW_NAME has been running for over $THRESHOLD_TIME seconds, sending an alert"
                        # Send alert to Slack
                        curl -X POST -H 'Content-type: application/json' --data "{\"text\":\"WARNING: Workflow \`${WORKFLOW_NAME}\` has been running longer than $THRESHOLD_TIME seconds\"}" "$SLACK_WEBHOOK_URL"
                      fi
                    done
          restartPolicy: OnFailure
18 changes: 18 additions & 0 deletions kube/services/workflow-age-monitor/auth.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
---
# ServiceAccount for the workflow age monitor.
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: default
  name: argo-workflow-monitor
---
# Binds the ServiceAccount above to the argo-argo-workflows-view ClusterRole.
# NOTE(review): the ClusterRole is not defined in this file; confirm it
# exists in the target cluster under this exact name.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: argo-workflow-monitor-binding
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: argo-argo-workflows-view
subjects:
  - kind: ServiceAccount
    namespace: default
    name: argo-workflow-monitor

0 comments on commit 67a10c9

Please sign in to comment.