diff --git a/kube/services/workflow-age-monitor/argo-workflow-age.yaml b/kube/services/workflow-age-monitor/argo-workflow-age.yaml
index 0d0c29115..390456a05 100644
--- a/kube/services/workflow-age-monitor/argo-workflow-age.yaml
+++ b/kube/services/workflow-age-monitor/argo-workflow-age.yaml
@@ -34,22 +34,25 @@ spec:
                 # Get all workflows with specific label and check their age
-                kubectl get workflows --all-namespaces -o json | jq -c '.items[] | {name: .metadata.name, creationTimestamp: .metadata.creationTimestamp}' | while read workflow_info; do
+                kubectl get workflows --all-namespaces -o json | jq -c '.items[] | {name: .metadata.name, startedTimestamp: .status.startedAt}' | while read -r workflow_info; do
                   WORKFLOW_NAME=$(echo $workflow_info | jq -r '.name')
-                  CREATION_TIMESTAMP=$(echo $workflow_info | jq -r '.creationTimestamp')
+                  STARTED_TIMESTAMP=$(echo "$workflow_info" | jq -r '.startedTimestamp // empty')
 
-                  # Convert creation timestamp to Unix Epoch time
-                  CREATION_EPOCH=$(date -d "$CREATION_TIMESTAMP" +%s)
+                  # A workflow that has not started yet has no status.startedAt; jq then prints nothing, so skip it
+                  if [ -n "$STARTED_TIMESTAMP" ]; then
+                    # Convert start timestamp to Unix Epoch time
+                    STARTED_EPOCH=$(date -d "$STARTED_TIMESTAMP" +%s)
 
-                  # Get current Unix Epoch time
-                  CURRENT_EPOCH=$(date +%s)
+                    # Get current Unix Epoch time
+                    CURRENT_EPOCH=$(date +%s)
 
-                  # Calculate workflow age in seconds
-                  WORKFLOW_AGE=$(($CURRENT_EPOCH - $CREATION_EPOCH))
+                    # Calculate how long the workflow has been running, in seconds
+                    WORKFLOW_AGE=$((CURRENT_EPOCH - STARTED_EPOCH))
 
-                  # Check if workflow age is greater than threshold
-                  if [ "$WORKFLOW_AGE" -gt "$THRESHOLD_TIME" ]; then
-                    echo "Workflow $WORKFLOW_NAME has been running for over $THRESHOLD_TIME seconds, sending an alert"
-                    # Send alert to Slack
-                    curl -X POST -H 'Content-type: application/json' --data "{\"text\":\"WARNING: Workflow \`${WORKFLOW_NAME}\` has been running longer than $THRESHOLD_TIME seconds\"}" $SLACK_WEBHOOK_URL
+                    # Check if workflow age is greater than threshold
+                    if [ "$WORKFLOW_AGE" -gt "$THRESHOLD_TIME" ]; then
+                      echo "Workflow $WORKFLOW_NAME has been running for over $THRESHOLD_TIME seconds, sending an alert"
+                      # Send alert to Slack
+                      curl -X POST -H 'Content-type: application/json' --data "{\"text\":\"WARNING: Workflow \`${WORKFLOW_NAME}\` has been running longer than $THRESHOLD_TIME seconds\"}" "$SLACK_WEBHOOK_URL"
+                    fi
                   fi
                 done
           restartPolicy: OnFailure