diff --git a/gen3/bin/kube-setup-argo.sh b/gen3/bin/kube-setup-argo.sh
index 677f62257..1a25a98c8 100644
--- a/gen3/bin/kube-setup-argo.sh
+++ b/gen3/bin/kube-setup-argo.sh
@@ -204,6 +204,20 @@ EOF
     aws iam put-role-policy --role-name ${roleName} --policy-name ${internalBucketPolicy} --policy-document file://$internalBucketPolicyFile || true
   fi
 
+  # Create a secret for the slack webhook
+  # yq emits the literal string "null" for a missing key, so that value is rejected as well as an empty one.
+  alarm_webhook=$(g3kubectl get cm global -o yaml | yq .data.slack_alarm_webhook | tr -d '"')
+
+  if [ -z "$alarm_webhook" ] || [ "$alarm_webhook" = "null" ]; then
+    gen3_log_err "Please set a slack_alarm_webhook in the 'global' configmap. This is needed to alert for failed workflows."
+    exit 1
+  fi
+
+  # Delete-then-create so the secret always holds the current webhook; --ignore-not-found avoids an error when it does not exist yet.
+  g3kubectl -n argo delete secret slack-webhook-secret --ignore-not-found
+  g3kubectl -n argo create secret generic "slack-webhook-secret" --from-literal=SLACK_WEBHOOK_URL="$alarm_webhook"
+
+
   ## if new bucket then do the following
   # Get the aws keys from secret
   # Create and attach lifecycle policy
diff --git a/kube/services/argo/values.yaml b/kube/services/argo/values.yaml
index c8178dd2a..eeb2e9e01 100644
--- a/kube/services/argo/values.yaml
+++ b/kube/services/argo/values.yaml
@@ -61,6 +61,22 @@ controller:
   workflowDefaults:
     spec:
       archiveLogs: true
+      onExit: alert-on-timeout
+      templates:
+      # Exit handler: posts to the Slack webhook when a failed node's message is "Step exceeded its deadline".
+      - name: alert-on-timeout
+        script:
+          image: quay.io/cdis/amazonlinux-debug:master
+          command: [sh]
+          envFrom:
+          - secretRef:
+              name: slack-webhook-secret
+          source: |
+            failure_reason=$(echo {{workflow.failures}} | jq 'any(.[]; .message == "Step exceeded its deadline")' )
+            # jq prints "true" or "false"; both are non-empty strings, so compare against "true" explicitly.
+            if [ "$failure_reason" = "true" ]; then
+              curl -X POST -H 'Content-type: application/json' --data "{\"text\":\"ALERT: Workflow {{workflow.name}} has been killed due to timeout\"}" "$SLACK_WEBHOOK_URL"
+            fi
 
   # -- [Node selector]
   nodeSelector: