diff --git a/executor/workflow_manager_sfn.go b/executor/workflow_manager_sfn.go index 486fef3e..5469460d 100644 --- a/executor/workflow_manager_sfn.go +++ b/executor/workflow_manager_sfn.go @@ -392,6 +392,21 @@ func (wm *SFNWorkflowManager) UpdateWorkflowSummary(ctx context.Context, workflo if err := wm.updateWorkflowLastJob(ctx, workflow); err != nil { return err } + failedJob := "" + failedJobResource := "" + if workflow.LastJob != nil { + failedJob = workflow.LastJob.State + if workflow.LastJob.StateResource != nil { + failedJobResource = workflow.LastJob.StateResource.Name + } + } + log.CounterD("workflow-failed", 1, logger.M{ + "workflow-name": workflow.WorkflowDefinition.Name, + "workflow-version": workflow.WorkflowDefinition.Version, + "workflow-id": workflow.ID, + "failed-job-name": failedJob, + "failed-job-resource": failedJobResource, + }) } workflow.Output = aws.StringValue(describeOutput.Output) // use for error or success (TODO: actually this is only sent for success) diff --git a/kvconfig.yml b/kvconfig.yml index 9651fefc..a5aff595 100644 --- a/kvconfig.yml +++ b/kvconfig.yml @@ -12,6 +12,15 @@ routes: dimensions: ["id"] stat_type: "counter" + workflow-job-failed-metric: + matchers: + title: ["workflow-failed"] + output: + type: "alerts" + series: "workflow-manager.job-failed" + dimensions: ["workflow-name", "failed-job-name", "failed-job-resource", "workflow-version"] + stat_type: "counter" + execution-not-found-alert: matchers: title: ["execution-not-found"]