From d346d808b57625a57e6f752c3508385428947b2b Mon Sep 17 00:00:00 2001
From: Jared O'Connell <joconnel@redhat.com>
Date: Fri, 19 Jul 2024 20:23:02 -0400
Subject: [PATCH] Fix several issues in failure cases

Fixed improper locking, paths that could lead to double-resolution, and more.
---
 internal/step/foreach/provider.go     | 34 +++++++---
 internal/step/lifecycle.go            | 14 ++---
 internal/step/plugin/provider.go      | 90 +++++++++++++++++++--------
 internal/step/plugin/provider_test.go |  2 +
 workflow/executor.go                  |  8 ++-
 workflow/workflow.go                  | 30 ++++++---
 workflow/workflow_test.go             | 90 ++++++++++++++++++++++++---
 7 files changed, 207 insertions(+), 61 deletions(-)

diff --git a/internal/step/foreach/provider.go b/internal/step/foreach/provider.go
index f2ded8b4..3ddf9bf9 100644
--- a/internal/step/foreach/provider.go
+++ b/internal/step/foreach/provider.go
@@ -3,6 +3,7 @@ package foreach
 import (
 	"context"
 	"fmt"
+	"go.arcalot.io/dgraph"
 	"reflect"
 	"sync"
 
@@ -46,9 +47,9 @@ var executeLifecycleStage = step.LifecycleStage{
 		"items":    {},
 		"wait_for": {},
 	},
-	NextStages: []string{
-		string(StageIDOutputs),
-		string(StageIDFailed),
+	NextStages: map[string]dgraph.DependencyType{
+		string(StageIDOutputs): dgraph.AndDependency,
+		string(StageIDFailed):  dgraph.CompletionAndDependency,
 	},
 	Fatal: false,
 }
@@ -58,7 +59,7 @@ var outputLifecycleStage = step.LifecycleStage{
 	RunningName:  "output",
 	FinishedName: "output",
 	InputFields:  map[string]struct{}{},
-	NextStages:   []string{},
+	NextStages:   map[string]dgraph.DependencyType{},
 	Fatal:        false,
 }
 var errorLifecycleStage = step.LifecycleStage{
@@ -67,7 +68,7 @@ var errorLifecycleStage = step.LifecycleStage{
 	RunningName:  "processing error",
 	FinishedName: "error",
 	InputFields:  map[string]struct{}{},
-	NextStages:   []string{},
+	NextStages:   map[string]dgraph.DependencyType{},
 	Fatal:        true,
 }
 
@@ -359,6 +360,7 @@ type runningStep struct {
 	inputAvailable     bool
 	inputData          chan []any
 	ctx                context.Context
+	closed             bool
 	wg                 sync.WaitGroup
 	cancel             context.CancelFunc
 	stageChangeHandler step.StageChangeHandler
@@ -368,6 +370,11 @@ type runningStep struct {
 
 func (r *runningStep) ProvideStageInput(stage string, input map[string]any) error {
 	r.lock.Lock()
+	if r.closed {
+		r.logger.Debugf("exiting foreach ProvideStageInput due to step being closed")
+		r.lock.Unlock()
+		return nil
+	}
 	switch stage {
 	case string(StageIDExecute):
 		items := input["items"]
@@ -390,8 +397,8 @@ func (r *runningStep) ProvideStageInput(stage string, input map[string]any) erro
 			r.currentState = step.RunningStepStateRunning
 		}
 		r.inputAvailable = true
+		r.inputData <- input // Send before unlock to ensure that it never gets closed before sending.
 		r.lock.Unlock()
-		r.inputData <- input
 		return nil
 	case string(StageIDOutputs):
 		r.lock.Unlock()
@@ -412,14 +419,19 @@ func (r *runningStep) CurrentStage() string {
 }
 
 func (r *runningStep) State() step.RunningStepState {
-	r.lock.Lock()
+	r.lock.Lock() // TODO: Determine why this gets stuck.
 	defer r.lock.Unlock()
 	return r.currentState
 }
 
 func (r *runningStep) Close() error {
+	r.lock.Lock()
+	r.closed = true
+	r.lock.Unlock()
 	r.cancel()
 	r.wg.Wait()
+	r.logger.Debugf("Closing inputData channel in foreach step provider")
+	close(r.inputData)
 	return nil
 }
 
@@ -431,7 +443,7 @@ func (r *runningStep) ForceClose() error {
 func (r *runningStep) run() {
 	r.wg.Add(1)
 	defer func() {
-		close(r.inputData)
+		r.logger.Debugf("foreach run function done")
 		r.wg.Done()
 	}()
 	waitingForInput := false
@@ -471,7 +483,10 @@ func (r *runningStep) run() {
 			input := input
 			go func() {
 				defer func() {
-					<-sem
+					select {
+					case <-sem:
+					case <-r.ctx.Done(): // Must not deadlock if closed early.
+					}
 					wg.Done()
 				}()
 				r.logger.Debugf("Queuing item %d...", i)
@@ -540,6 +555,7 @@ func (r *runningStep) run() {
 		r.lock.Unlock()
 		r.stageChangeHandler.OnStepComplete(r, previousStage, &outputID, &outputData, &r.wg)
 	case <-r.ctx.Done():
+		r.logger.Debugf("context done")
 		return
 	}
 
diff --git a/internal/step/lifecycle.go b/internal/step/lifecycle.go
index 83495250..700fe2fd 100644
--- a/internal/step/lifecycle.go
+++ b/internal/step/lifecycle.go
@@ -30,10 +30,10 @@ func (l Lifecycle[StageType]) DAG() (dgraph.DirectedGraph[StageType], error) {
 		}
 	}
 	for _, stage := range l.Stages {
-		node := lang.Must2(dag.GetNodeByID(stage.Identifier()))
-		for _, nextStage := range stage.NextStageIDs() {
-			if err := node.Connect(nextStage); err != nil {
-				return nil, fmt.Errorf("failed to connect lifecycle stage %s to %s (%w)", node.ID(), nextStage, err)
+		for nextStage, dependencyType := range stage.NextStageIDs() {
+			nextStageNode := lang.Must2(dag.GetNodeByID(nextStage))
+			if err := nextStageNode.ConnectDependency(stage.Identifier(), dependencyType); err != nil {
+				return nil, fmt.Errorf("failed to connect lifecycle stage %s to %s (%w)", stage.Identifier(), nextStage, err)
 			}
 		}
 	}
@@ -52,7 +52,7 @@ type lifecycleStage interface {
 	// Identifier returns the ID of the stage.
 	Identifier() string
 	// NextStageIDs returns the next stage identifiers.
-	NextStageIDs() []string
+	NextStageIDs() map[string]dgraph.DependencyType
 }
 
 // LifecycleStage is the description of a single stage within a step lifecycle.
@@ -72,7 +72,7 @@ type LifecycleStage struct {
 	// will pause if there is no input available.
 	// It will automatically create a DAG node between the current and the described next stages to ensure
 	// that it is running in order.
-	NextStages []string
+	NextStages map[string]dgraph.DependencyType
 	// Fatal indicates that this stage should be treated as fatal unless handled by the workflow.
 	Fatal bool
 }
@@ -83,7 +83,7 @@ func (l LifecycleStage) Identifier() string {
 }
 
 // NextStageIDs is a helper function that returns the next possible stages.
-func (l LifecycleStage) NextStageIDs() []string {
+func (l LifecycleStage) NextStageIDs() map[string]dgraph.DependencyType {
 	return l.NextStages
 }
 
diff --git a/internal/step/plugin/provider.go b/internal/step/plugin/provider.go
index cfddb7ac..92d84ec3 100644
--- a/internal/step/plugin/provider.go
+++ b/internal/step/plugin/provider.go
@@ -3,6 +3,7 @@ package plugin
 import (
 	"context"
 	"fmt"
+	"go.arcalot.io/dgraph"
 	"go.flow.arcalot.io/pluginsdk/plugin"
 	"reflect"
 	"strings"
@@ -168,8 +169,9 @@ var deployingLifecycleStage = step.LifecycleStage{
 	RunningName:  "deploying",
 	FinishedName: "deployed",
 	InputFields:  map[string]struct{}{string(StageIDDeploy): {}},
-	NextStages: []string{
-		string(StageIDStarting), string(StageIDDeployFailed),
+	NextStages: map[string]dgraph.DependencyType{
+		string(StageIDStarting):     dgraph.AndDependency,
+		string(StageIDDeployFailed): dgraph.CompletionAndDependency,
 	},
 	Fatal: false,
 }
@@ -189,8 +191,10 @@ var enablingLifecycleStage = step.LifecycleStage{
 	InputFields: map[string]struct{}{
 		"enabled": {},
 	},
-	NextStages: []string{
-		string(StageIDStarting), string(StageIDDisabled), string(StageIDCrashed),
+	NextStages: map[string]dgraph.DependencyType{
+		string(StageIDStarting): dgraph.AndDependency,
+		string(StageIDDisabled): dgraph.AndDependency,
+		string(StageIDCrashed):  dgraph.CompletionAndDependency,
 	},
 }
 
@@ -203,8 +207,9 @@ var startingLifecycleStage = step.LifecycleStage{
 		"input":    {},
 		"wait_for": {},
 	},
-	NextStages: []string{
-		string(StageIDRunning), string(StageIDCrashed),
+	NextStages: map[string]dgraph.DependencyType{
+		string(StageIDRunning): dgraph.AndDependency,
+		string(StageIDCrashed): dgraph.CompletionAndDependency,
 	},
 }
 
@@ -214,8 +219,9 @@ var runningLifecycleStage = step.LifecycleStage{
 	RunningName:  "running",
 	FinishedName: "completed",
 	InputFields:  map[string]struct{}{},
-	NextStages: []string{
-		string(StageIDOutput), string(StageIDCrashed),
+	NextStages: map[string]dgraph.DependencyType{
+		string(StageIDOutput):  dgraph.AndDependency,
+		string(StageIDCrashed): dgraph.CompletionAndDependency,
 	},
 }
 
@@ -227,8 +233,10 @@ var cancelledLifecycleStage = step.LifecycleStage{
 	InputFields: map[string]struct{}{
 		"stop_if": {},
 	},
-	NextStages: []string{
-		string(StageIDOutput), string(StageIDCrashed), string(StageIDDeployFailed),
+	NextStages: map[string]dgraph.DependencyType{
+		string(StageIDOutput):       dgraph.AndDependency,
+		string(StageIDCrashed):      dgraph.CompletionAndDependency,
+		string(StageIDDeployFailed): dgraph.CompletionAndDependency,
 	},
 }
 
@@ -902,6 +910,7 @@ func (r *runningStep) closeComponents(closeATP bool) error {
 	r.cancel()
 	r.lock.Lock()
 	if r.closed {
+		r.lock.Unlock()
 		return nil // Already closed
 	}
 	var atpErr error
@@ -941,7 +950,7 @@ func (r *runningStep) run() {
 			r.logger.Warningf("failed to remove deployed container for step %s/%s", r.runID, r.pluginStepID)
 		}
 		r.lock.Unlock()
-		r.transitionToCancelled()
+		r.cancelledEarly()
 		return
 	default:
 		r.container = container
@@ -960,9 +969,6 @@ func (r *runningStep) run() {
 		return
 	}
 
-	// It's enabled, so the disabled stage will not occur.
-	r.stageChangeHandler.OnStepStageFailure(r, string(StageIDDisabled), &r.wg, err)
-
 	if err := r.startStage(container); err != nil {
 		r.startFailed(err)
 		return
@@ -1050,12 +1056,20 @@ func (r *runningStep) enableStage() (bool, error) {
 		&r.wg,
 	)
 
+	var enabled bool
 	select {
-	case enabled := <-r.enabledInput:
-		return enabled, nil
+	case enabled = <-r.enabledInput:
 	case <-r.ctx.Done():
 		return false, fmt.Errorf("step closed while determining enablement status")
 	}
+
+	if enabled {
+		r.lock.Lock()
+		// It's enabled, so the disabled stage will not occur.
+		r.stageChangeHandler.OnStepStageFailure(r, string(StageIDDisabled), &r.wg, fmt.Errorf("step enabled; cannot be disabled anymore"))
+		r.lock.Unlock()
+	}
+	return enabled, nil
 }
 
 func (r *runningStep) startStage(container deployer.Plugin) error {
@@ -1150,9 +1164,6 @@ func (r *runningStep) runStage() error {
 	var result atp.ExecutionResult
 	select {
 	case result = <-r.executionChannel:
-		if result.Error != nil {
-			return result.Error
-		}
 	case <-r.ctx.Done():
 		// In this case, it is being instructed to stop. A signal should have been sent.
 		// Shutdown (with sigterm) the container, then wait for the output (valid or error).
@@ -1170,6 +1181,10 @@ func (r *runningStep) runStage() error {
 
 	}
 
+	if result.Error != nil {
+		return result.Error
+	}
+
 	// Execution complete, move to state running stage outputs, then to state finished stage.
 	r.transitionStage(StageIDOutput, step.RunningStepStateRunning)
 	r.completeStep(r.currentStage, step.RunningStepStateFinished, &result.OutputID, &result.OutputData)
@@ -1178,6 +1193,8 @@ func (r *runningStep) runStage() error {
 }
 
 func (r *runningStep) markStageFailures(firstStage StageID, err error) {
+	r.lock.Lock()
+	defer r.lock.Unlock()
 	switch firstStage {
 	case StageIDEnabling:
 		r.stageChangeHandler.OnStepStageFailure(r, string(StageIDEnabling), &r.wg, err)
@@ -1214,15 +1231,16 @@ func (r *runningStep) deployFailed(err error) {
 	r.markStageFailures(StageIDEnabling, err)
 }
 
-func (r *runningStep) transitionToCancelled() {
+func (r *runningStep) cancelledEarly() {
 	r.logger.Infof("Step %s/%s cancelled", r.runID, r.pluginStepID)
 	// Follow the convention of transitioning to running then finished.
 	r.transitionStage(StageIDCancelled, step.RunningStepStateRunning)
-	// Cancelled currently has no output.
-	r.transitionStage(StageIDCancelled, step.RunningStepStateFinished)
-
 	// This is called after deployment. So everything after deployment cannot occur.
 	err := fmt.Errorf("step %s/%s cancelled", r.runID, r.pluginStepID)
+	// Cancelled currently has no output.
+	// Set it as unresolvable since it's cancelled, and to prevent conflicts with its inputs.
+	r.transitionFromFailedStage(StageIDCancelled, step.RunningStepStateFinished, err)
+
 	// Note: This function is only called if it's cancelled during the deployment phase.
 	// If that changes, the stage IDs marked as failed need to be changed.
 	r.markStageFailures(StageIDEnabling, err)
@@ -1248,12 +1266,11 @@ func (r *runningStep) transitionToDisabled() {
 
 	err := fmt.Errorf("step %s/%s disabled", r.runID, r.pluginStepID)
 	r.markStageFailures(StageIDStarting, err)
-
 }
 
 func (r *runningStep) startFailed(err error) {
 	r.logger.Debugf("Start failed stage for step %s/%s", r.runID, r.pluginStepID)
-	r.transitionStage(StageIDCrashed, step.RunningStepStateRunning)
+	r.transitionFromFailedStage(StageIDCrashed, step.RunningStepStateRunning, err)
 	r.logger.Warningf("Plugin step %s/%s start failed. %v", r.runID, r.pluginStepID, err)
 
 	// Now it's done.
@@ -1288,8 +1305,29 @@ func (r *runningStep) transitionStage(newStage StageID, state step.RunningStepSt
 	r.transitionStageWithOutput(newStage, state, nil, nil)
 }
 
+func (r *runningStep) transitionFromFailedStage(newStage StageID, state step.RunningStepState, err error) {
+	r.lock.Lock()
+	defer r.lock.Unlock()
+	previousStage := string(r.currentStage)
+	r.currentStage = newStage
+	// Don't forget to update this, or else it will behave very oddly.
+	// First running, then finished. You can't skip states.
+	r.state = state
+	r.stageChangeHandler.OnStepStageFailure(
+		r,
+		previousStage,
+		&r.wg,
+		err,
+	)
+}
+
 // TransitionStage transitions the stage to the specified stage, and the state to the specified state.
-func (r *runningStep) transitionStageWithOutput(newStage StageID, state step.RunningStepState, outputID *string, previousStageOutput *any) {
+func (r *runningStep) transitionStageWithOutput(
+	newStage StageID,
+	state step.RunningStepState,
+	outputID *string,
+	previousStageOutput *any,
+) {
 	// A current lack of observability into the atp client prevents
 	// non-fragile testing of this function.
 	r.lock.Lock()
diff --git a/internal/step/plugin/provider_test.go b/internal/step/plugin/provider_test.go
index e946f000..6d458229 100644
--- a/internal/step/plugin/provider_test.go
+++ b/internal/step/plugin/provider_test.go
@@ -518,3 +518,5 @@ func TestProvider_StartFail(t *testing.T) {
 		assert.NoError(t, running.Close())
 	})
 }
+
+// TODO: Add more tests here for the current functions and code paths.
diff --git a/workflow/executor.go b/workflow/executor.go
index 9c1c8816..132b0d23 100644
--- a/workflow/executor.go
+++ b/workflow/executor.go
@@ -413,8 +413,12 @@ func (e *executor) connectStepDependencies(
 			if err != nil {
 				return fmt.Errorf("bug: node for current stage not found (%w)", err)
 			}
-			for _, nextStage := range stage.NextStages {
-				if err := currentStageNode.Connect(GetStageNodeID(stepID, nextStage)); err != nil {
+			for nextStage, dependencyType := range stage.NextStages {
+				nextStageNode, err := dag.GetNodeByID(GetStageNodeID(stepID, nextStage))
+				if err != nil {
+					return fmt.Errorf("bug: node for next stage not found (%w)", err)
+				}
+				if err := nextStageNode.ConnectDependency(currentStageNode.ID(), dependencyType); err != nil {
 					return fmt.Errorf("bug: cannot connect nodes (%w)", err)
 				}
 			}
diff --git a/workflow/workflow.go b/workflow/workflow.go
index d3feae76..a25ed9e6 100644
--- a/workflow/workflow.go
+++ b/workflow/workflow.go
@@ -284,7 +284,13 @@ errGatherLoop:
 	}
 }
 
-func (l *loopState) onStageComplete(stepID string, previousStage *string, previousStageOutputID *string, previousStageOutput *any, wg *sync.WaitGroup) {
+func (l *loopState) onStageComplete(
+	stepID string,
+	previousStage *string,
+	previousStageOutputID *string,
+	previousStageOutput *any,
+	wg *sync.WaitGroup,
+) {
 	l.lock.Lock()
 	defer func() {
 		if previousStage != nil {
@@ -303,10 +309,10 @@ func (l *loopState) onStageComplete(stepID string, previousStage *string, previo
 		l.cancel()
 		return
 	}
-	l.logger.Debugf("Resolving node %q in the DAG", stageNode.ID())
+	l.logger.Debugf("Resolving node %q in the DAG on stage complete", stageNode.ID())
 	if err := stageNode.ResolveNode(dgraph.Resolved); err != nil {
-		l.logger.Errorf("Failed to resolve stage node ID %s (%w)", stageNode.ID(), err)
-		l.recentErrors <- fmt.Errorf("failed to resolve stage node ID %s (%w)", stageNode.ID(), err)
+		errMessage := fmt.Errorf("failed to resolve stage node ID %s (%s)", stageNode.ID(), err.Error())
+		l.recentErrors <- errMessage
 		l.cancel()
 		return
 	}
@@ -320,7 +326,7 @@ func (l *loopState) onStageComplete(stepID string, previousStage *string, previo
 		}
 		// Resolves the node in the DAG. This allows us to know which nodes are
 		// ready for processing due to all dependencies being resolved.
-		l.logger.Debugf("Resolving node %q in the DAG", outputNode.ID())
+		l.logger.Debugf("Resolving output node %q in the DAG", outputNode.ID())
 		if err := outputNode.ResolveNode(dgraph.Resolved); err != nil {
 			l.logger.Errorf("Failed to resolve output node ID %s (%w)", outputNode.ID(), err)
 			l.recentErrors <- fmt.Errorf("failed to resolve output node ID %s (%w)", outputNode.ID(), err)
@@ -371,7 +377,7 @@ func (l *loopState) markOutputsUnresolvable(stepID string, stageID string, skipp
 			l.logger.Debugf("Will mark node %s in the DAG as unresolvable", stepID+"."+stage.ID+"."+stageOutputID)
 			err = unresolvableOutputNode.ResolveNode(dgraph.Unresolvable)
 			if err != nil {
-				l.logger.Errorf("Error while marking node %s in DAG as unresolvable (%s)", unresolvableOutputNode.ID(), err.Error())
+				panic(fmt.Errorf("error while marking node %s in DAG as unresolvable (%s)", unresolvableOutputNode.ID(), err.Error()))
 			}
 		}
 	}
@@ -390,7 +396,13 @@ func (l *loopState) markStageNodeUnresolvable(stepID string, stageID string) {
 	l.logger.Debugf("Will mark node %s in the DAG as unresolvable", stepID+"."+stageID)
 	err = unresolvableOutputNode.ResolveNode(dgraph.Unresolvable)
 	if err != nil {
-		l.logger.Errorf("Error while marking node %s in DAG as unresolvable (%s)", unresolvableOutputNode.ID(), err.Error())
+		panic(
+			fmt.Errorf(
+				"error while marking node %s in DAG as unresolvable (%s)",
+				unresolvableOutputNode.ID(),
+				err.Error(),
+			),
+		)
 	}
 }
 
@@ -404,7 +416,7 @@ func (l *loopState) notifySteps() { //nolint:gocognit
 	// Can include runnable nodes, nodes that cannot be resolved, and nodes that are not for running, like inputs.
 	for nodeID, resolutionStatus := range readyNodes {
 		failed := resolutionStatus == dgraph.Unresolvable
-		l.logger.Debugf("Processing step node %s", nodeID)
+		l.logger.Debugf("Processing step node %s with resolution status %q", nodeID, resolutionStatus)
 		node, err := l.dag.GetNodeByID(nodeID)
 		if err != nil {
 			panic(fmt.Errorf("failed to get node %s (%w)", nodeID, err))
@@ -438,6 +450,8 @@ func (l *loopState) notifySteps() { //nolint:gocognit
 		// untypedInputData stores the resolved data
 		untypedInputData, err := l.resolveExpressions(inputData, l.data)
 		if err != nil {
+			// An error here often indicates a locking issue in a step provider. This would be caused
+			// by the lock not being held when the output was marked resolved.
 			panic(fmt.Errorf("cannot resolve expressions for %s (%w)", nodeID, err))
 		}
 
diff --git a/workflow/workflow_test.go b/workflow/workflow_test.go
index 59f140bd..67aee20b 100644
--- a/workflow/workflow_test.go
+++ b/workflow/workflow_test.go
@@ -54,7 +54,7 @@ func TestOutputFailed(t *testing.T) {
 
 	var typedError *workflow.ErrNoMorePossibleOutputs
 	if !errors.As(err, &typedError) {
-		t.Fatalf("incorrect error type returned: %T", err)
+		t.Fatalf("incorrect error type returned: %T (%s)", err, err)
 	}
 }
 
@@ -756,7 +756,7 @@ func TestMissingInputsFailedDeployment(t *testing.T) {
 	assert.Equals(t, outputID, "")
 	var typedError *workflow.ErrNoMorePossibleOutputs
 	if !errors.As(err, &typedError) {
-		t.Fatalf("incorrect error type returned: %T", err)
+		t.Fatalf("incorrect error type returned: %T (%s)", err, err)
 	}
 }
 
@@ -1218,7 +1218,7 @@ func TestInputCancelledStepWorkflow(t *testing.T) {
 	assert.Equals(t, outputID, "")
 	var typedError *workflow.ErrNoMorePossibleOutputs
 	if !errors.As(err, &typedError) {
-		t.Fatalf("incorrect error type returned: %T", err)
+		t.Fatalf("incorrect error type returned: %T (%s)", err, err)
 	}
 }
 
@@ -1269,7 +1269,7 @@ func TestInputDisabledStepWorkflow(t *testing.T) {
 	assert.Error(t, err)
 	var typedError *workflow.ErrNoMorePossibleOutputs
 	if !errors.As(err, &typedError) {
-		t.Fatalf("incorrect error type returned: %T", err)
+		t.Fatalf("incorrect error type returned: %T (%s)", err, err)
 	}
 }
 
@@ -1426,7 +1426,7 @@ func TestMultiDependencyWorkflowFailureWithDisabling(t *testing.T) {
 	t.Logf("MultiDependency workflow failed purposefully in %d ms", duration.Milliseconds())
 	var typedError *workflow.ErrNoMorePossibleOutputs
 	if !errors.As(err, &typedError) {
-		t.Fatalf("incorrect error type returned: %T", err)
+		t.Fatalf("incorrect error type returned: %T (%s)", err, err)
 	}
 	assert.LessThan(t, duration.Seconds(), 4) // It will take 5 seconds if it fails to fail early.
 }
@@ -1481,7 +1481,7 @@ func TestMultiDependencyWorkflowFailureWithCancellation(t *testing.T) {
 	t.Logf("MultiDependency workflow failed purposefully in %d ms", duration.Milliseconds())
 	var typedError *workflow.ErrNoMorePossibleOutputs
 	if !errors.As(err, &typedError) {
-		t.Fatalf("incorrect error type returned: %T", err)
+		t.Fatalf("incorrect error type returned: %T (%s)", err, err)
 	}
 	assert.LessThan(t, duration.Seconds(), 4) // It will take 5 seconds if it fails to fail early.
 }
@@ -1533,7 +1533,7 @@ func TestMultiDependencyWorkflowFailureWithErrorOut(t *testing.T) {
 	t.Logf("MultiDependency workflow failed purposefully in %d ms", duration.Milliseconds())
 	var typedError *workflow.ErrNoMorePossibleOutputs
 	if !errors.As(err, &typedError) {
-		t.Fatalf("incorrect error type returned: %T", err)
+		t.Fatalf("incorrect error type returned: %T (%s)", err, err)
 	}
 	// If it takes 5 seconds, then there was a deadlock in the client.
 	// If it takes 6 seconds, then it waited for the second step.
@@ -1571,7 +1571,7 @@ steps:
       wait_time_ms: 7000
 outputs:
   workflow-success:
-    simple_wait_output: !expr $.steps.failed_step.outputs
+    failed_step_output: !expr $.steps.failed_step.outputs
     simple_wait_output: !expr $.steps.long_wait.outputs
 `
 
@@ -1593,13 +1593,85 @@ func TestMultiDependencyWorkflowFailureWithDeployFailure(t *testing.T) {
 	t.Logf("MultiDependency workflow failed purposefully in %d ms", duration.Milliseconds())
 	var typedError *workflow.ErrNoMorePossibleOutputs
 	if !errors.As(err, &typedError) {
-		t.Fatalf("incorrect error type returned: %T", err)
+		t.Fatalf("incorrect error type returned: %T (%s)", err, err)
 	}
 	// If it takes 5 seconds, then there was a deadlock in the client.
 	// If it takes 6 seconds, then it waited for the second step.
 	assert.LessThan(t, duration.Milliseconds(), 5500)
 }
 
+var multiDependencyFailureWorkflowWithDoubleFailure = `
+version: v0.2.0
+input:
+  root: WorkflowInput
+  objects:
+    WorkflowInput:
+      id: WorkflowInput
+      properties: {}
+steps:
+  failed_step_A:
+    plugin:
+      src: "n/a"
+      deployment_type: "builtin"
+    step: wait
+    input:
+      wait_time_ms: 0
+    deploy: # This fails first.
+      deployer_name: "test-impl"
+      deploy_time: 0
+      deploy_succeed: !expr 'false'
+  quick_step:
+    plugin:
+      src: "n/a"
+      deployment_type: "builtin"
+    step: wait
+    input:
+      wait_time_ms: 1 # Second, this succeeds, which cancels the second failing step.
+  failed_step_B:
+    plugin:
+      src: "n/a"
+      deployment_type: "builtin"
+    step: wait
+    wait_for: !expr $.steps.failed_step_A.outputs # Makes it unresolvable
+    stop_if: !expr $.steps.quick_step.outputs # Hopefully triggers the second resolution.
+    input:
+      wait_time_ms: 0
+    deploy:
+      deployer_name: "test-impl"
+      deploy_succeed: !expr 'true'
+  long_wait:
+    plugin:
+      src: "n/a"
+      deployment_type: "builtin"
+    step: wait
+    input:
+      wait_time_ms: 100
+outputs:
+  workflow-success:
+    failed_step_output: !expr $.steps.failed_step_A.outputs
+  wait-only: # For the error to be exposed, we need an alternative output that persists beyond the error.
+    wait_output: !expr $.steps.long_wait.outputs
+`
+
+func TestMultiDependencyWorkflowFailureDoubleFailure(t *testing.T) {
+	// Creates a scenario where step B's starting (due to wait-for) depends on step A's outputs,
+	// making A's outputs become unresolvable, while at the same time the step that needs that info (B) crashes.
+	// Transitioning B to crashed resolves starting, so that is in conflict with the unresolvable
+	// state propagated from step A.
+	preparedWorkflow := assert.NoErrorR[workflow.ExecutableWorkflow](t)(
+		getTestImplPreparedWorkflow(t, multiDependencyFailureWorkflowWithDoubleFailure),
+	)
+
+	startTime := time.Now() // Right before execute to not include pre-processing time.
+	_, _, err := preparedWorkflow.Execute(context.Background(), map[string]any{})
+	duration := time.Since(startTime)
+	assert.NoError(t, err)
+	t.Logf("MultiDependency finished in %d ms", duration.Milliseconds())
+}
+
+// TODO: Create a situation where step A's outputs are resolvable, but the context is cancelled,
+//		resulting in a possible conflict with the cancelled step stage DAG node.
+
 func createTestExecutableWorkflow(t *testing.T, workflowStr string, workflowCtx map[string][]byte) (workflow.ExecutableWorkflow, error) {
 	logConfig := log.Config{
 		Level:       log.LevelDebug,