From 827ab2221718ac76222d6c974688a19b661db65d Mon Sep 17 00:00:00 2001 From: Jared O'Connell Date: Wed, 3 Jul 2024 18:55:13 -0400 Subject: [PATCH 01/22] Changes to mark steps unresolvable This commit is missing dependency updates --- internal/step/dummy/provider_test.go | 3 + internal/step/plugin/provider.go | 35 +++- internal/step/plugin/provider_test.go | 12 ++ internal/step/provider.go | 9 + workflow/model.go | 37 ++-- workflow/model_test.go | 10 +- workflow/workflow.go | 118 +++++++++++- workflow/workflow_test.go | 249 +++++++++++++++++++++----- 8 files changed, 393 insertions(+), 80 deletions(-) diff --git a/internal/step/dummy/provider_test.go b/internal/step/dummy/provider_test.go index 622896e4..fd6e0fc1 100644 --- a/internal/step/dummy/provider_test.go +++ b/internal/step/dummy/provider_test.go @@ -14,6 +14,9 @@ type stageChangeHandler struct { message chan string } +func (s *stageChangeHandler) OnStepStageFailure(_ step.RunningStep, _ string, _ *sync.WaitGroup, _ error) { +} + func (s *stageChangeHandler) OnStageChange(_ step.RunningStep, _ *string, _ *string, _ *any, _ string, _ bool, _ *sync.WaitGroup) { } diff --git a/internal/step/plugin/provider.go b/internal/step/plugin/provider.go index 23ad9798..f89f1bfa 100644 --- a/internal/step/plugin/provider.go +++ b/internal/step/plugin/provider.go @@ -292,7 +292,7 @@ func (p *pluginProvider) LoadSchema(inputs map[string]any, _ map[string][]byte) requestedDeploymentType, pluginSource, err) } // Set up the ATP connection - transport := atp.NewClientWithLogger(pluginConnector, p.logger) + transport := atp.NewClientWithLogger(pluginConnector, p.logger.WithLabel("source", "atp-client")) // Read the schema information s, err := transport.ReadSchema() if err != nil { @@ -957,6 +957,9 @@ func (r *runningStep) run() { if !enabled { r.transitionToDisabled() return + } else { + // It's enabled, so the disabled stage will not occur. + r.stageChangeHandler.OnStepStageFailure(r, string(StageIDDisabled), &r.wg, err) } if err := r.startStage(container); err != nil { r.startFailed(err) @@ -1055,7 +1058,7 @@ func (r *runningStep) enableStage() (bool, error) { func (r *runningStep) startStage(container deployer.Plugin) error { r.logger.Debugf("Starting stage for step %s/%s", r.runID, r.pluginStepID) - atpClient := atp.NewClientWithLogger(container, r.logger) + atpClient := atp.NewClientWithLogger(container, r.logger.WithLabel("source", "atp-client")) var inputReceivedEarly bool r.lock.Lock() r.atpClient = atpClient @@ -1118,9 +1121,12 @@ func (r *runningStep) startStage(container deployer.Plugin) error { return fmt.Errorf("schema mismatch between local and remote deployed plugin in step %s/%s, unserializing input failed (%w)", r.runID, r.pluginStepID, err) } + r.wg.Add(1) + // Runs the ATP client in a goroutine in order to wait for it. // On context done, the deployer has 30 seconds before it will error out. go func() { + defer r.wg.Done() result := r.atpClient.Execute( schema.Input{RunID: r.runID, ID: r.pluginStepID, InputData: runInput}, r.signalToStep, @@ -1180,6 +1186,13 @@ func (r *runningStep) deployFailed(err error) { Error: err.Error(), }) r.completeStep(StageIDDeployFailed, step.RunningStepStateFinished, &outputID, &output) + // If deployment fails, enabling, disabled, starting, running, and output cannot occur. + err = fmt.Errorf("deployment failed for step %s/%s", r.runID, r.pluginStepID) + r.stageChangeHandler.OnStepStageFailure(r, string(StageIDEnabling), &r.wg, err) + r.stageChangeHandler.OnStepStageFailure(r, string(StageIDDisabled), &r.wg, err) + r.stageChangeHandler.OnStepStageFailure(r, string(StageIDStarting), &r.wg, err) + r.stageChangeHandler.OnStepStageFailure(r, string(StageIDRunning), &r.wg, err) + r.stageChangeHandler.OnStepStageFailure(r, string(StageIDOutput), &r.wg, err) } func (r *runningStep) transitionToCancelled() { @@ -1188,6 +1201,14 @@ func (r *runningStep) transitionToCancelled() { r.transitionStage(StageIDCancelled, step.RunningStepStateRunning) // Cancelled currently has no output. r.transitionStage(StageIDCancelled, step.RunningStepStateFinished) + + // This is called after deployment. So everything after deployment cannot occur. + err := fmt.Errorf("step %s/%s cancelled", r.runID, r.pluginStepID) + r.stageChangeHandler.OnStepStageFailure(r, string(StageIDEnabling), &r.wg, err) + r.stageChangeHandler.OnStepStageFailure(r, string(StageIDDisabled), &r.wg, err) + r.stageChangeHandler.OnStepStageFailure(r, string(StageIDStarting), &r.wg, err) + r.stageChangeHandler.OnStepStageFailure(r, string(StageIDRunning), &r.wg, err) + r.stageChangeHandler.OnStepStageFailure(r, string(StageIDOutput), &r.wg, err) } func (r *runningStep) transitionToDisabled() { @@ -1207,6 +1228,11 @@ func (r *runningStep) transitionToDisabled() { schema.PointerTo("output"), &disabledOutput, ) + + err := fmt.Errorf("step %s/%s disabled", r.runID, r.pluginStepID) + r.stageChangeHandler.OnStepStageFailure(r, string(StageIDStarting), &r.wg, err) + r.stageChangeHandler.OnStepStageFailure(r, string(StageIDRunning), &r.wg, err) + r.stageChangeHandler.OnStepStageFailure(r, string(StageIDOutput), &r.wg, err) } func (r *runningStep) startFailed(err error) { @@ -1221,6 +1247,9 @@ func (r *runningStep) startFailed(err error) { }) r.completeStep(StageIDCrashed, step.RunningStepStateFinished, &outputID, &output) + + r.stageChangeHandler.OnStepStageFailure(r, string(StageIDRunning), &r.wg, err) + r.stageChangeHandler.OnStepStageFailure(r, string(StageIDOutput), &r.wg, err) } func (r *runningStep) runFailed(err error) { @@ -1234,6 +1263,8 @@ func (r *runningStep) runFailed(err error) { Output: err.Error(), }) r.completeStep(StageIDCrashed, step.RunningStepStateFinished, &outputID, &output) + + r.stageChangeHandler.OnStepStageFailure(r, string(StageIDOutput), &r.wg, err) } // TransitionStage transitions the stage to the specified stage, and the state to the specified state. diff --git a/internal/step/plugin/provider_test.go b/internal/step/plugin/provider_test.go index 84a523e0..e946f000 100644 --- a/internal/step/plugin/provider_test.go +++ b/internal/step/plugin/provider_test.go @@ -46,6 +46,10 @@ func (s *deployFailStageChangeHandler) OnStepComplete( s.message <- message } +func (s *deployFailStageChangeHandler) OnStepStageFailure(_ step.RunningStep, _ string, _ *sync.WaitGroup, _ error) { + +} + type startFailStageChangeHandler struct { message chan string } @@ -79,6 +83,10 @@ func (s *startFailStageChangeHandler) OnStepComplete( s.message <- message } +func (s *startFailStageChangeHandler) OnStepStageFailure(_ step.RunningStep, _ string, _ *sync.WaitGroup, _ error) { + +} + type stageChangeHandler struct { message chan string } @@ -111,6 +119,10 @@ func (s *stageChangeHandler) OnStepComplete( s.message <- message } +func (s *stageChangeHandler) OnStepStageFailure(_ step.RunningStep, _ string, _ *sync.WaitGroup, _ error) { + +} + func TestProvider_MissingDeployer(t *testing.T) { logger := log.New( log.Config{ diff --git a/internal/step/provider.go b/internal/step/provider.go index b8722486..5e45a63c 100644 --- a/internal/step/provider.go +++ b/internal/step/provider.go @@ -56,6 +56,15 @@ type StageChangeHandler interface { previousStageOutput *any, wg *sync.WaitGroup, ) + + // OnStepStageFailure is called when it becomes known that the step's stage will not produce an output. + // The error is optional. + OnStepStageFailure( + step RunningStep, + stage string, + wg *sync.WaitGroup, + err error, + ) } // RunnableStep is a step that already has a schema and can be run. diff --git a/workflow/model.go b/workflow/model.go index 54ccba15..454ce5c7 100644 --- a/workflow/model.go +++ b/workflow/model.go @@ -2,12 +2,10 @@ package workflow import ( "fmt" - "regexp" - "strings" - "go.arcalot.io/dgraph" "go.flow.arcalot.io/engine/internal/step" "go.flow.arcalot.io/pluginsdk/schema" + "regexp" ) // Workflow is the primary data structure describing workflows. @@ -229,30 +227,23 @@ type ErrNoMorePossibleSteps struct { // Error returns an explanation on why the error happened. func (e ErrNoMorePossibleSteps) Error() string { - var outputsUnmetDependencies []string //nolint:prealloc - for _, node := range e.dag.ListNodes() { - if node.Item().Kind != DAGItemKindOutput { - continue - } - var unmetDependencies []string - inbound, err := node.ListInboundConnections() - if err != nil { - panic(fmt.Errorf("failed to fetch output node inbound dependencies (%w)", err)) - } - for i := range inbound { - unmetDependencies = append(unmetDependencies, i) - } - outputsUnmetDependencies = append( - outputsUnmetDependencies, - fmt.Sprintf("%s: %s", node.Item().OutputID, strings.Join(unmetDependencies, ", ")), - ) - } return fmt.Sprintf( - "no steps running, no more executable steps, cannot construct any output (outputs have the following dependencies: %s)", - strings.Join(outputsUnmetDependencies, "; "), + "no steps running, no more executable steps; cannot construct any output." + + " this is the fallback system, indicating a failure of the output resolution system", ) } +// ErrNoMorePossibleOutputs indicates that the workflow has terminated due to it being impossible to resolve an output. +// This means that steps that the output(s) depended on did not have the required results. +type ErrNoMorePossibleOutputs struct { + dag dgraph.DirectedGraph[*DAGItem] +} + +// Error returns an explanation on why the error happened. +func (e ErrNoMorePossibleOutputs) Error() string { + return fmt.Sprintf("all outputs marked as unresolvable") +} + // ErrInvalidState indicates that the workflow failed due to an invalid state. type ErrInvalidState struct { processingSteps int diff --git a/workflow/model_test.go b/workflow/model_test.go index 03556be5..df34f3b7 100644 --- a/workflow/model_test.go +++ b/workflow/model_test.go @@ -6,7 +6,7 @@ import ( "testing" ) -var versionExp = "v0.2.0" +var testVersionExp = "v0.2.0" var inputExp = map[string]any{ "root": "RootObject", "objects": map[string]any{ @@ -24,9 +24,9 @@ var stepsExp = map[string]any{ "input": map[string]any{ "wait_time_ms": 1}}, } -var outputID = "success" +var testExpectedOutputID = "success" var outputsExp = map[string]any{ - outputID: "!expr $.steps.long_wait.outputs", + testExpectedOutputID: "!expr $.steps.long_wait.outputs", } var outputSchemaRootID = "RootObjectOut" var stepOutputSchemaInput = map[string]any{ @@ -42,10 +42,10 @@ var stepOutputSchemaInput = map[string]any{ }}}}}}, } var outputSchemaInput = map[string]any{ - outputID: stepOutputSchemaInput, + testExpectedOutputID: stepOutputSchemaInput, } var workflowSchemaInput = map[string]any{ - "version": versionExp, + "version": testVersionExp, "input": inputExp, "steps": stepsExp, "outputs": outputsExp, diff --git a/workflow/workflow.go b/workflow/workflow.go index 0c2ee7e5..aecc99e9 100644 --- a/workflow/workflow.go +++ b/workflow/workflow.go @@ -72,6 +72,12 @@ func (e *executableWorkflow) Execute(ctx context.Context, serializedInput any) ( // We use an internal cancel function to abort the workflow if something bad happens. ctx, cancel := context.WithCancel(ctx) defer cancel() + outputNodes := make(map[string]dgraph.Node[*DAGItem]) + for _, node := range e.dag.ListNodes() { + if node.Item().Kind == DAGItemKindOutput { + outputNodes[node.ID()] = node + } + } l := &loopState{ logger: e.logger.WithLabel("source", "workflow"), @@ -86,9 +92,11 @@ func (e *executableWorkflow) Execute(ctx context.Context, serializedInput any) ( runningSteps: make(map[string]step.RunningStep, len(e.dag.ListNodes())), outputDataChannel: make(chan outputDataType, 1), outputDone: false, + waitingOutputs: outputNodes, cancel: cancel, workflowContext: e.workflowContext, recentErrors: make(chan error, 20), // Big buffer in case there are lots of subsequent errors. + lifecycles: e.lifecycles, } l.lock.Lock() @@ -141,6 +149,16 @@ func (e *executableWorkflow) Execute(ctx context.Context, serializedInput any) ( } l.onStageComplete(stepID, &previousStage, previousStageOutputID, previousStageOutput, wg) }, + onStepStageFailure: func(step step.RunningStep, stage string, wg *sync.WaitGroup, err error) { + if err == nil { + e.logger.Debugf("Step %q stage %q declared that it will not produce an output", stepID, stage) + } else { + e.logger.Debugf("Step %q stage %q declared that it will not produce an output (%s)", stepID, stage, err.Error()) + } + l.unresolveOutputs(stepID, stage, nil) + l.unresolveStageNode(stepID, stage) + l.notifySteps() + }, } e.logger.Debugf("Launching step %s...", stepID) runningStep, err := runnableStep.Start(e.stepRunData[stepID], stepID, stageHandler) @@ -235,9 +253,12 @@ type loopState struct { runningSteps map[string]step.RunningStep outputDataChannel chan outputDataType outputDone bool - recentErrors chan error - cancel context.CancelFunc - workflowContext map[string][]byte + // waitingOutputs keeps track of all workflow output nodes to know when the workflow fails. + waitingOutputs map[string]dgraph.Node[*DAGItem] + recentErrors chan error + cancel context.CancelFunc + workflowContext map[string][]byte + lifecycles map[string]step.Lifecycle[step.LifecycleStageWithSchema] } // getLastError gathers the last errors. If there are several, it creates a new one that consolidates them. @@ -317,6 +338,9 @@ func (l *loopState) onStageComplete(stepID string, previousStage *string, previo *previousStageOutput, ) } + // Unresolve all alternative output ID nodes. + // Use the lifecycle to find the possible output IDs + l.unresolveOutputs(stepID, *previousStage, previousStageOutputID) // Placing data from the output into the general data structure l.data[WorkflowStepsKey].(map[string]any)[stepID].(map[string]any)[*previousStage] = map[string]any{} @@ -325,6 +349,51 @@ func (l *loopState) onStageComplete(stepID string, previousStage *string, previo l.notifySteps() } +// Marks the outputs of that stage unresolvable. +// Optionally skip an output that should not unresolve. +// Does not (un)resolve the non-output node for that stage. +// To prevent a deadlock, this `notifySteps()` should be called at some point after this is called. +func (l *loopState) unresolveOutputs(stepID string, stageID string, skippedOutput *string) { + stages := l.lifecycles[stepID].Stages + for _, stage := range stages { + if stage.ID != stageID { + continue + } + for stageOutputID, _ := range stage.Outputs { + if skippedOutput != nil && stageOutputID == *skippedOutput { + continue + } + unresolvableOutputNode, err := l.dag.GetNodeByID(GetOutputNodeID(stepID, stage.ID, stageOutputID)) + if err != nil { + l.logger.Warningf("Could not get DAG node %s (%s)", stepID+"."+stage.ID+"."+stageOutputID, err.Error()) + continue + } + l.logger.Debugf("Will unresolve node %s in the DAG", stepID+"."+stage.ID+"."+stageOutputID) + err = unresolvableOutputNode.ResolveNode(dgraph.Unresolvable) + if err != nil { + l.logger.Errorf("Error while unresolving node %s in DAG (%s)", unresolvableOutputNode.ID(), err.Error()) + } + } + } +} + +// unresolves the stage node for the step's stage. +// Does not (un)resolve the outputs of that node. For that, call unresolveOutputs() instead or +// in addition to calling this function. +// To prevent a deadlock, this `notifySteps()` should be called at some point after this is called. +func (l *loopState) unresolveStageNode(stepID string, stageID string) { + unresolvableOutputNode, err := l.dag.GetNodeByID(GetStageNodeID(stepID, stageID)) + if err != nil { + l.logger.Warningf("Could not get DAG node %s (%s)", stepID+"."+stageID, err.Error()) + return + } + l.logger.Debugf("Will unresolve node %s in the DAG", stepID+"."+stageID) + err = unresolvableOutputNode.ResolveNode(dgraph.Unresolvable) + if err != nil { + l.logger.Errorf("Error while unresolving node %s in DAG (%s)", unresolvableOutputNode.ID(), err.Error()) + } +} + // notifySteps is a function we can call to go through all DAG nodes that are marked // ready and provides step inputs based on expressions. // The lock should be acquired by the caller before this is called. @@ -332,24 +401,39 @@ func (l *loopState) notifySteps() { //nolint:gocognit readyNodes := l.dag.PopReadyNodes() l.logger.Debugf("Currently %d DAG nodes are ready. Now processing them.", len(readyNodes)) - // Can include runnable nodes, nodes that cannot be resolved, and nodes that are not for running, like an input. + // Can include runnable nodes, nodes that cannot be resolved, and nodes that are not for running, like inputs. for nodeID, resolutionStatus := range readyNodes { - if resolutionStatus == dgraph.Unresolvable { - l.logger.Debugf("Disregarding node %q with resolution %q", nodeID, resolutionStatus) - continue - } + failed := resolutionStatus == dgraph.Unresolvable l.logger.Debugf("Processing step node %s", nodeID) node, err := l.dag.GetNodeByID(nodeID) if err != nil { panic(fmt.Errorf("failed to get node %s (%w)", nodeID, err)) } nodeItem := node.Item() + if failed { + if nodeItem.Kind == DAGItemKindOutput { + l.logger.Debugf("Output node %s failed", nodeID) + // Check to see if there are any remaining output nodes, and if there aren't, + // cancel the context. + delete(l.waitingOutputs, nodeID) + if len(l.waitingOutputs) == 0 { + l.recentErrors <- &ErrNoMorePossibleOutputs{ + l.dag, + } + l.cancel() + } + } else { + l.logger.Debugf("Disregarding failed node %s with type %s", nodeID, nodeItem.Kind) + } + continue + } // The data structure that the particular node requires. One or more fields. May or may not contain expressions. inputData := nodeItem.Data if inputData == nil { // No input data is needed. This is often the case for input nodes. continue } + // Resolve any expressions in the input data. // untypedInputData stores the resolved data untypedInputData, err := l.resolveExpressions(inputData, l.data) @@ -468,6 +552,7 @@ func (l *loopState) checkForDeadlocks(retries int, wg *sync.WaitGroup) { l.dag, } l.logger.Debugf("DAG:\n%s", l.dag.Mermaid()) + l.logger.Errorf("TERMINATING WORKFLOW; Errors below this error may be due to the early termination") l.cancel() } else { // Retry. There are times when all the steps are in a transition state. @@ -476,7 +561,9 @@ func (l *loopState) checkForDeadlocks(retries int, wg *sync.WaitGroup) { wg.Add(1) go func() { time.Sleep(5 * time.Millisecond) + l.lock.Lock() l.checkForDeadlocks(retries-1, wg) + l.lock.Unlock() wg.Done() }() } @@ -538,6 +625,12 @@ type stageChangeHandler struct { previousStageOutput *any, wg *sync.WaitGroup, ) + onStepStageFailure func( + step step.RunningStep, + stage string, + wg *sync.WaitGroup, + err error, + ) } func (s stageChangeHandler) OnStageChange( @@ -561,3 +654,12 @@ func (s stageChangeHandler) OnStepComplete( ) { s.onStepComplete(step, previousStage, previousStageOutputID, previousStageOutput, wg) } + +func (s stageChangeHandler) OnStepStageFailure( + step step.RunningStep, + stage string, + wg *sync.WaitGroup, + err error, +) { + s.onStepStageFailure(step, stage, wg, err) +} diff --git a/workflow/workflow_test.go b/workflow/workflow_test.go index dc8bb471..cc231f20 100644 --- a/workflow/workflow_test.go +++ b/workflow/workflow_test.go @@ -51,12 +51,44 @@ func TestOutputFailed(t *testing.T) { _, outputData, err := preparedWorkflow.Execute(context.Background(), map[string]any{"name": "Arca Lot"}) assert.Nil(t, outputData) assert.Error(t, err) - var typedError *workflow.ErrNoMorePossibleSteps + + var typedError *workflow.ErrNoMorePossibleOutputs if !errors.As(err, &typedError) { t.Fatalf("incorrect error type returned: %T", err) } } +var simpleValidLiteralInputWaitWorkflowDefinition = ` +version: v0.2.0 +input: + root: RootObject + objects: + RootObject: + id: RootObject + properties: {} +steps: + wait_1: + plugin: + src: "n/a" + deployment_type: "builtin" + step: wait + input: + wait_time_ms: 0 +outputs: + a: + b: !expr $.steps.wait_1.outputs +` + +func TestSimpleValidWaitWorkflow(t *testing.T) { + // Just a single wait + preparedWorkflow := assert.NoErrorR[workflow.ExecutableWorkflow](t)( + getTestImplPreparedWorkflow(t, simpleValidLiteralInputWaitWorkflowDefinition), + ) + outputID, _, err := preparedWorkflow.Execute(context.Background(), map[string]any{}) + assert.NoError(t, err) + assert.Equals(t, outputID, "a") +} + var stepCancellationWorkflowDefinition = ` version: v0.2.0 input: @@ -247,37 +279,6 @@ func TestDeploymentStepCancellation(t *testing.T) { assert.LessThan(t, duration.Milliseconds(), 200) } -var simpleValidLiteralInputWaitWorkflowDefinition = ` -version: v0.2.0 -input: - root: RootObject - objects: - RootObject: - id: RootObject - properties: {} -steps: - wait_1: - plugin: - src: "n/a" - deployment_type: "builtin" - step: wait - input: - wait_time_ms: 0 -outputs: - a: - b: !expr $.steps.wait_1.outputs -` - -func TestSimpleValidWaitWorkflow(t *testing.T) { - // Just a single wait - preparedWorkflow := assert.NoErrorR[workflow.ExecutableWorkflow](t)( - getTestImplPreparedWorkflow(t, simpleValidLiteralInputWaitWorkflowDefinition), - ) - outputID, _, err := preparedWorkflow.Execute(context.Background(), map[string]any{}) - assert.NoError(t, err) - assert.Equals(t, outputID, "a") -} - func TestWithDoubleSerializationDetection(t *testing.T) { preparedWorkflow := assert.NoErrorR[workflow.ExecutableWorkflow](t)( getTestImplPreparedWorkflow(t, simpleValidLiteralInputWaitWorkflowDefinition), @@ -306,7 +307,8 @@ func TestWithDoubleSerializationDetection(t *testing.T) { map[string]any{}, }, } - for _, i := range testIter { + for i, testData := range testIter { + t.Logf("Starting iteration %d", i) errorDetect := util.NewInvalidSerializationDetectorSchema() // Inject the error detector into the object rootObject.PropertiesValue["error_detector"] = schema.NewPropertySchema( @@ -316,10 +318,10 @@ func TestWithDoubleSerializationDetection(t *testing.T) { nil, nil, nil, - i.defaultSpec, + testData.defaultSpec, nil, ) - outputID, _, err := preparedWorkflow.Execute(context.Background(), i.input) + outputID, _, err := preparedWorkflow.Execute(context.Background(), testData.input) assert.NoError(t, err) assert.Equals(t, outputID, "a") // Confirm that, while we did no double-unserializations or double-serializations, @@ -751,7 +753,12 @@ func TestMissingInputsFailedDeployment(t *testing.T) { ) outputID, _, err := preparedWorkflow.Execute(context.Background(), map[string]any{}) assert.Error(t, err) + t.Logf("Test output: %s", err.Error()) assert.Equals(t, outputID, "") + var typedError *workflow.ErrNoMorePossibleOutputs + if !errors.As(err, &typedError) { + t.Fatalf("incorrect error type returned: %T", err) + } } var missingInputsWrongOutputWorkflowDefinition = ` @@ -1190,7 +1197,7 @@ steps: wait_time_ms: 0 stop_if: $.input.step_cancelled outputs: - success: + did-not-cancel: simple_wait_output: !expr $.steps.simple_wait.outputs.success ` @@ -1206,11 +1213,14 @@ func TestInputCancelledStepWorkflow(t *testing.T) { preparedWorkflow := assert.NoErrorR[workflow.ExecutableWorkflow](t)( getTestImplPreparedWorkflow(t, inputCancelledStepWorkflow), ) - _, _, err := preparedWorkflow.Execute(context.Background(), map[string]any{ + outputID, _, err := preparedWorkflow.Execute(context.Background(), map[string]any{ "step_cancelled": true, }) - assert.Error(t, err) - assert.Contains(t, err.Error(), "cannot construct any output") + assert.Equals(t, outputID, "") + var typedError *workflow.ErrNoMorePossibleOutputs + if !errors.As(err, &typedError) { + t.Fatalf("incorrect error type returned: %T", err) + } } var inputDisabledStepWorkflow = ` @@ -1234,7 +1244,7 @@ steps: wait_time_ms: 20 enabled: !expr $.input.step_enabled outputs: - success: + workflow-success: simple_wait_output: !expr $.steps.simple_wait.outputs.success ` @@ -1252,13 +1262,16 @@ func TestInputDisabledStepWorkflow(t *testing.T) { "step_enabled": true, }) assert.NoError(t, err) - assert.Equals(t, outputID, "success") + assert.Equals(t, outputID, "workflow-success") // The workflow should fail with it disabled because the output cannot be resolved. _, _, err = preparedWorkflow.Execute(context.Background(), map[string]any{ "step_enabled": false, }) assert.Error(t, err) - assert.Contains(t, err.Error(), "cannot construct any output") + var typedError *workflow.ErrNoMorePossibleOutputs + if !errors.As(err, &typedError) { + t.Fatalf("incorrect error type returned: %T", err) + } } var dynamicDisabledStepWorkflow = ` @@ -1310,6 +1323,7 @@ func TestDelayedDisabledStepWorkflow(t *testing.T) { outputID, _, err := preparedWorkflow.Execute(context.Background(), map[string]any{ "sleep_time": 20, }) + // TODO: This is failing here unexpectedly. assert.NoError(t, err) assert.Equals(t, outputID, "success") // Fail with a non-20ms input. @@ -1367,6 +1381,157 @@ func TestExpressionWithWhitespace(t *testing.T) { }) } +var multiDependencyFailureWorkflowWithDisabling = ` +version: v0.2.0 +input: + root: WorkflowInput + objects: + WorkflowInput: + id: WorkflowInput + properties: + fail_purposefully: + type: + type_id: bool +steps: + disabled_step: + plugin: + src: "n/a" + deployment_type: "builtin" + step: wait + input: + wait_time_ms: 0 + enabled: !expr '!$.input.fail_purposefully' + long_wait: + plugin: + src: "n/a" + deployment_type: "builtin" + step: wait + input: + wait_time_ms: 5000 +outputs: + workflow-success: + simple_wait_output: !expr $.steps.disabled_step.outputs.success + long_wait_output: !expr $.steps.long_wait.outputs.success +` + +func TestMultiDependencyWorkflowFailureWithDisabling(t *testing.T) { + // Tests failure when one dependency fails immediately, and the other one fails later. + preparedWorkflow := assert.NoErrorR[workflow.ExecutableWorkflow](t)( + getTestImplPreparedWorkflow(t, multiDependencyFailureWorkflowWithDisabling), + ) + startTime := time.Now() // Right before execute to not include pre-processing time. + _, _, err := preparedWorkflow.Execute(context.Background(), map[string]any{ + "fail_purposefully": true, + }) + duration := time.Since(startTime) + assert.Error(t, err) + t.Logf("MultiDependency workflow failed purposefully in %d ms", duration.Milliseconds()) + assert.LessThan(t, duration.Seconds(), 4) // It will take 5 seconds if it fails to fail early. +} + +var multiDependencyFailureWorkflowWithCancellation = ` +version: v0.2.0 +input: + root: WorkflowInput + objects: + WorkflowInput: + id: WorkflowInput + properties: + fail_purposefully: + type: + type_id: bool +steps: + canceled_step: + plugin: + src: "n/a" + deployment_type: "builtin" + step: wait + input: + wait_time_ms: 0 + stop_if: !expr '$.input.fail_purposefully' + deploy: + deployer_name: "test-impl" + deploy_time: 50 # 50 ms of delay to make the cancellation more reliable. + long_wait: + plugin: + src: "n/a" + deployment_type: "builtin" + step: wait + input: + wait_time_ms: 5000 +outputs: + workflow-success: + simple_wait_output: !expr $.steps.canceled_step.outputs.success + long_wait_output: !expr $.steps.long_wait.outputs.success +` + +func TestMultiDependencyWorkflowFailureWithCancellation(t *testing.T) { + // Tests failure when one dependency fails immediately, and the other one fails later. + preparedWorkflow := assert.NoErrorR[workflow.ExecutableWorkflow](t)( + getTestImplPreparedWorkflow(t, multiDependencyFailureWorkflowWithCancellation), + ) + startTime := time.Now() // Right before execute to not include pre-processing time. + _, _, err := preparedWorkflow.Execute(context.Background(), map[string]any{ + "fail_purposefully": true, + }) + duration := time.Since(startTime) + assert.Error(t, err) + t.Logf("MultiDependency workflow failed purposefully in %d ms", duration.Milliseconds()) + assert.LessThan(t, duration.Seconds(), 4) // It will take 5 seconds if it fails to fail early. +} + +var multiDependencyFailureWorkflowWithErrorOut = ` +version: v0.2.0 +input: + root: WorkflowInput + objects: + WorkflowInput: + id: WorkflowInput + properties: + fail_purposefully: + type: + type_id: bool +steps: + disabled_step: + plugin: + src: "n/a" + deployment_type: "builtin" + step: hello + input: + fail: !expr $.input.fail_purposefully + long_wait: + plugin: + src: "n/a" + deployment_type: "builtin" + step: wait + input: + wait_time_ms: 7000 +outputs: + workflow-success: + simple_wait_output: !expr $.steps.disabled_step.outputs.success + simple_wait_output: !expr $.steps.long_wait.outputs.success +` + +func TestMultiDependencyWorkflowFailureWithErrorOut(t *testing.T) { + // Tests failure when one dependency fails immediately, and the other one fails later. + preparedWorkflow := assert.NoErrorR[workflow.ExecutableWorkflow](t)( + getTestImplPreparedWorkflow(t, multiDependencyFailureWorkflowWithErrorOut), + ) + startTime := time.Now() // Right before execute to not include pre-processing time. + _, _, err := preparedWorkflow.Execute(context.Background(), map[string]any{ + "fail_purposefully": true, + }) + duration := time.Since(startTime) + assert.Error(t, err) + t.Logf("MultiDependency workflow failed purposefully in %d ms", duration.Milliseconds()) + // If it takes 5 seconds, then there was a deadlock in the client. + // If it takes 6 seconds, then it waited for the second step. + assert.LessThan(t, duration.Milliseconds(), 5500) +} + +// TODO: Have a version of these tests that depends on outputs instead of outputs.success +// This is required due to them being separate nodes that need to be marked unresolvable. + func createTestExecutableWorkflow(t *testing.T, workflowStr string, workflowCtx map[string][]byte) (workflow.ExecutableWorkflow, error) { logConfig := log.Config{ Level: log.LevelDebug, From 89d366b43c89ad086a74f065acb572d5792d8cf7 Mon Sep 17 00:00:00 2001 From: Jared O'Connell Date: Tue, 9 Jul 2024 16:38:55 -0400 Subject: [PATCH 02/22] Use dependencies with concurrency fixes This is especially important with this branch because it exposed a lot of problems --- go.mod | 6 +++--- go.sum | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/go.mod b/go.mod index 2200f383..4b37a585 100644 --- a/go.mod +++ b/go.mod @@ -4,14 +4,14 @@ go 1.21 require ( go.arcalot.io/assert v1.8.0 - go.arcalot.io/dgraph v1.4.1 + go.arcalot.io/dgraph v1.5.0 go.arcalot.io/lang v1.1.0 go.arcalot.io/log/v2 v2.1.0 go.flow.arcalot.io/deployer v0.6.1 go.flow.arcalot.io/dockerdeployer v0.7.2 go.flow.arcalot.io/expressions v0.4.3 go.flow.arcalot.io/kubernetesdeployer v0.9.3 - go.flow.arcalot.io/pluginsdk v0.12.1 + go.flow.arcalot.io/pluginsdk v0.12.2 go.flow.arcalot.io/podmandeployer v0.11.2 go.flow.arcalot.io/pythondeployer v0.6.1 go.flow.arcalot.io/testdeployer v0.6.1 @@ -29,7 +29,7 @@ require ( github.com/docker/go-units v0.5.0 // indirect github.com/emicklei/go-restful/v3 v3.12.0 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect - github.com/fxamacker/cbor/v2 v2.6.0 // indirect + github.com/fxamacker/cbor/v2 v2.7.0 // indirect github.com/go-logr/logr v1.4.2 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-openapi/jsonpointer v0.21.0 // indirect diff --git a/go.sum b/go.sum index 147337d3..5b464a69 100644 --- a/go.sum +++ b/go.sum @@ -25,8 +25,8 @@ github.com/evanphx/json-patch v4.12.0+incompatible h1:4onqiflcdA9EOZ4RxV643DvftH github.com/evanphx/json-patch v4.12.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= -github.com/fxamacker/cbor/v2 v2.6.0 h1:sU6J2usfADwWlYDAFhZBQ6TnLFBHxgesMrQfQgk1tWA= -github.com/fxamacker/cbor/v2 v2.6.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ= +github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E= +github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= @@ -123,8 +123,8 @@ github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= go.arcalot.io/assert v1.8.0 h1:hGcHMPncQXwQvjj7MbyOu2gg8VIBB00crUJZpeQOjxs= go.arcalot.io/assert v1.8.0/go.mod h1:nNmWPoNUHFyrPkNrD2aASm5yPuAfiWdB/4X7Lw3ykHk= -go.arcalot.io/dgraph v1.4.1 h1:y/lhJ68WzNUDR2BYSk7tZAZhVokts92svcrJLbK4Ebo= -go.arcalot.io/dgraph v1.4.1/go.mod h1:+Kxc81utiihMSmC1/ttSPGLDlWPpvgOpNxSFmIDPxFM= +go.arcalot.io/dgraph v1.5.0 h1:6cGlxLzmmehJoD/nj0Hkql7uh90EU0A0GtZhGYkr28M= +go.arcalot.io/dgraph v1.5.0/go.mod h1:+Kxc81utiihMSmC1/ttSPGLDlWPpvgOpNxSFmIDPxFM= go.arcalot.io/exex v0.2.0 h1:u44pjwPwcH57TF8knhaqVZP/1V/KbnRe//pKzMwDpLw= go.arcalot.io/exex v0.2.0/go.mod h1:5zlFr+7vOQNZKYCNOEDdsad+z/dlvXKs2v4kG+v+bQo= go.arcalot.io/lang v1.1.0 h1:ugglRKpd3qIMkdghAjKJxsziIgHm8QpxrzZPSXoa08I= @@ -139,8 +139,8 @@ go.flow.arcalot.io/expressions v0.4.3 h1:0BRRghutHp0sctsITHe/A1le0yYiJtKNTxm27T+ go.flow.arcalot.io/expressions v0.4.3/go.mod h1:UORX78N4ep71wOzNXdIo/UY+6SdDD0id0mvuRNEQMeM= go.flow.arcalot.io/kubernetesdeployer v0.9.3 h1:XKiqmCqXb6ZLwP5IQTAKS/gJHpq0Ub/yEjCfgAwQF2A= go.flow.arcalot.io/kubernetesdeployer v0.9.3/go.mod h1:DtB6HR7HBt/HA1vME0faIpOQ/lhfBJjL6OAGgT3Bu/Q= -go.flow.arcalot.io/pluginsdk v0.12.1 h1:HlWo1+Fn13u0EoeH9KpcWzdn1miqucNEvj0cirFhcA8= -go.flow.arcalot.io/pluginsdk v0.12.1/go.mod h1:J0RYsfD6g1WKMVSbLGZR/ZJBVdcjt+bOuKoOvnNPpy4= +go.flow.arcalot.io/pluginsdk v0.12.2 h1:ue+Ax1c3Bw0TmnuAEZ4r19D6ZA4dns9rP3NO4cvmTec= +go.flow.arcalot.io/pluginsdk v0.12.2/go.mod h1:U+7mtXo4aT3YNn9fNWZ3b5RBvXttm/iocc2/P4MGa88= go.flow.arcalot.io/podmandeployer v0.11.2 h1:aqrHaNaCXYDREqDJpKhVeVIIZPiPld5SDvnUcc0Tjiw= go.flow.arcalot.io/podmandeployer v0.11.2/go.mod h1:70+9M6eVQa0EEynDZ720P3AEzXt1gZto2lvoMlyjuzo= go.flow.arcalot.io/pythondeployer v0.6.1 h1:IyaA9BVfHJ2fhC+fNfT6VicrtRGFlZOlSaAVGGPwo1E= From bfccd9d9d753b8c7ae2156444484bade28472ec4 Mon Sep 17 00:00:00 2001 From: Jared O'Connell Date: Tue, 9 Jul 2024 17:05:12 -0400 Subject: [PATCH 03/22] Fix linter errors --- internal/step/plugin/provider.go | 7 ++++--- workflow/model.go | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/internal/step/plugin/provider.go b/internal/step/plugin/provider.go index f89f1bfa..59cb6af2 100644 --- a/internal/step/plugin/provider.go +++ b/internal/step/plugin/provider.go @@ -957,10 +957,11 @@ func (r *runningStep) run() { if !enabled { r.transitionToDisabled() return - } else { - // It's enabled, so the disabled stage will not occur. - r.stageChangeHandler.OnStepStageFailure(r, string(StageIDDisabled), &r.wg, err) } + + // It's enabled, so the disabled stage will not occur. + r.stageChangeHandler.OnStepStageFailure(r, string(StageIDDisabled), &r.wg, err) + if err := r.startStage(container); err != nil { r.startFailed(err) return diff --git a/workflow/model.go b/workflow/model.go index 454ce5c7..d5e3adc4 100644 --- a/workflow/model.go +++ b/workflow/model.go @@ -241,7 +241,7 @@ type ErrNoMorePossibleOutputs struct { // Error returns an explanation on why the error happened. func (e ErrNoMorePossibleOutputs) Error() string { - return fmt.Sprintf("all outputs marked as unresolvable") + return "all outputs marked as unresolvable" } // ErrInvalidState indicates that the workflow failed due to an invalid state. From 344edb76867165aea9e60e04706d4979687fc977 Mon Sep 17 00:00:00 2001 From: Jared O'Connell Date: Tue, 9 Jul 2024 17:16:50 -0400 Subject: [PATCH 04/22] Resolve TODOs --- workflow/workflow_test.go | 79 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 73 insertions(+), 6 deletions(-) diff --git a/workflow/workflow_test.go b/workflow/workflow_test.go index cc231f20..478e4793 100644 --- a/workflow/workflow_test.go +++ b/workflow/workflow_test.go @@ -1323,7 +1323,6 @@ func TestDelayedDisabledStepWorkflow(t *testing.T) { outputID, _, err := preparedWorkflow.Execute(context.Background(), map[string]any{ "sleep_time": 20, }) - // TODO: This is failing here unexpectedly. assert.NoError(t, err) assert.Equals(t, outputID, "success") // Fail with a non-20ms input. @@ -1426,6 +1425,10 @@ func TestMultiDependencyWorkflowFailureWithDisabling(t *testing.T) { duration := time.Since(startTime) assert.Error(t, err) t.Logf("MultiDependency workflow failed purposefully in %d ms", duration.Milliseconds()) + var typedError *workflow.ErrNoMorePossibleOutputs + if !errors.As(err, &typedError) { + t.Fatalf("incorrect error type returned: %T", err) + } assert.LessThan(t, duration.Seconds(), 4) // It will take 5 seconds if it fails to fail early. } @@ -1461,7 +1464,7 @@ steps: wait_time_ms: 5000 outputs: workflow-success: - simple_wait_output: !expr $.steps.canceled_step.outputs.success + cancelled_step_output: !expr $.steps.canceled_step.outputs.success long_wait_output: !expr $.steps.long_wait.outputs.success ` @@ -1477,6 +1480,10 @@ func TestMultiDependencyWorkflowFailureWithCancellation(t *testing.T) { duration := time.Since(startTime) assert.Error(t, err) t.Logf("MultiDependency workflow failed purposefully in %d ms", duration.Milliseconds()) + var typedError *workflow.ErrNoMorePossibleOutputs + if !errors.As(err, &typedError) { + t.Fatalf("incorrect error type returned: %T", err) + } assert.LessThan(t, duration.Seconds(), 4) // It will take 5 seconds if it fails to fail early. } @@ -1508,8 +1515,8 @@ steps: wait_time_ms: 7000 outputs: workflow-success: - simple_wait_output: !expr $.steps.disabled_step.outputs.success - simple_wait_output: !expr $.steps.long_wait.outputs.success + disabled_output: !expr $.steps.disabled_step.outputs.success + long_wait_output: !expr $.steps.long_wait.outputs.success ` func TestMultiDependencyWorkflowFailureWithErrorOut(t *testing.T) { @@ -1524,13 +1531,73 @@ func TestMultiDependencyWorkflowFailureWithErrorOut(t *testing.T) { duration := time.Since(startTime) assert.Error(t, err) t.Logf("MultiDependency workflow failed purposefully in %d ms", duration.Milliseconds()) + var typedError *workflow.ErrNoMorePossibleOutputs + if !errors.As(err, &typedError) { + t.Fatalf("incorrect error type returned: %T", err) + } // If it takes 5 seconds, then there was a deadlock in the client. // If it takes 6 seconds, then it waited for the second step. assert.LessThan(t, duration.Milliseconds(), 5500) } -// TODO: Have a version of these tests that depends on outputs instead of outputs.success -// This is required due to them being separate nodes that need to be marked unresolvable. +var multiDependencyFailureWorkflowWithDeployFailure = ` +version: v0.2.0 +input: + root: WorkflowInput + objects: + WorkflowInput: + id: WorkflowInput + properties: + fail_purposefully: + type: + type_id: bool +steps: + failed_step: + plugin: + src: "n/a" + deployment_type: "builtin" + step: wait + input: + wait_time_ms: 0 + deploy: + deployer_name: "test-impl" + deploy_succeed: !expr '!$.input.fail_purposefully' + long_wait: + plugin: + src: "n/a" + deployment_type: "builtin" + step: wait + input: + wait_time_ms: 7000 +outputs: + workflow-success: + simple_wait_output: !expr $.steps.failed_step.outputs + simple_wait_output: !expr $.steps.long_wait.outputs +` + +func TestMultiDependencyWorkflowFailureWithDeployFailure(t *testing.T) { + // Tests failure when one dependency fails immediately, and the other one fails later. + // In this specific test the output depends on the `steps.step_name.outputs` node + // instead of the `steps.step_name.outputs.success` node because they are handled + // differently in the engine. + preparedWorkflow := assert.NoErrorR[workflow.ExecutableWorkflow](t)( + getTestImplPreparedWorkflow(t, multiDependencyFailureWorkflowWithDeployFailure), + ) + startTime := time.Now() // Right before execute to not include pre-processing time. + _, _, err := preparedWorkflow.Execute(context.Background(), map[string]any{ + "fail_purposefully": true, + }) + duration := time.Since(startTime) + assert.Error(t, err) + t.Logf("MultiDependency workflow failed purposefully in %d ms", duration.Milliseconds()) + var typedError *workflow.ErrNoMorePossibleOutputs + if !errors.As(err, &typedError) { + t.Fatalf("incorrect error type returned: %T", err) + } + // If it takes 5 seconds, then there was a deadlock in the client. + // If it takes 6 seconds, then it waited for the second step. + assert.LessThan(t, duration.Milliseconds(), 5500) +} func createTestExecutableWorkflow(t *testing.T, workflowStr string, workflowCtx map[string][]byte) (workflow.ExecutableWorkflow, error) { logConfig := log.Config{ From 98d5313eb2226c9630919d3fd4dae1c9ed14efd7 Mon Sep 17 00:00:00 2001 From: Jared O'Connell Date: Tue, 9 Jul 2024 17:23:16 -0400 Subject: [PATCH 05/22] Fix linter errors --- workflow/workflow.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/workflow.go b/workflow/workflow.go index aecc99e9..72ec3c7d 100644 --- a/workflow/workflow.go +++ b/workflow/workflow.go @@ -359,7 +359,7 @@ func (l *loopState) unresolveOutputs(stepID string, stageID string, skippedOutpu if stage.ID != stageID { continue } - for stageOutputID, _ := range stage.Outputs { + for stageOutputID := range stage.Outputs { if skippedOutput != nil && stageOutputID == *skippedOutput { continue } From 650369c24b557951aee833841dfd5feccfdd31b2 Mon Sep 17 00:00:00 2001 From: Jared O'Connell Date: Wed, 10 Jul 2024 13:52:33 -0400 Subject: [PATCH 06/22] Address review comments --- internal/step/plugin/provider.go | 46 +++++++++++++++++++++----------- workflow/workflow.go | 28 +++++++++---------- 2 files changed, 44 insertions(+), 30 deletions(-) diff --git a/internal/step/plugin/provider.go b/internal/step/plugin/provider.go index 59cb6af2..79cc4bc2 100644 --- a/internal/step/plugin/provider.go +++ b/internal/step/plugin/provider.go @@ -1176,6 +1176,27 @@ func (r *runningStep) runStage() error { return nil } +func (r *runningStep) markStageFailures(firstStage StageID, err error) { + switch firstStage { + case StageIDEnabling: + r.stageChangeHandler.OnStepStageFailure(r, string(StageIDEnabling), &r.wg, err) + fallthrough + case StageIDDisabled: + r.stageChangeHandler.OnStepStageFailure(r, string(StageIDDisabled), &r.wg, err) + fallthrough + case StageIDStarting: + r.stageChangeHandler.OnStepStageFailure(r, string(StageIDStarting), &r.wg, err) + fallthrough + case StageIDRunning: + r.stageChangeHandler.OnStepStageFailure(r, string(StageIDRunning), &r.wg, err) + fallthrough + case StageIDOutput: + r.stageChangeHandler.OnStepStageFailure(r, string(StageIDOutput), &r.wg, err) + default: + panic("unknown StageID") + } +} + func (r *runningStep) deployFailed(err error) { r.logger.Debugf("Deploy failed stage for step %s/%s", r.runID, r.pluginStepID) r.transitionStage(StageIDDeployFailed, step.RunningStepStateRunning) @@ -1189,11 +1210,7 @@ func (r *runningStep) deployFailed(err error) { r.completeStep(StageIDDeployFailed, step.RunningStepStateFinished, &outputID, &output) // If deployment fails, enabling, disabled, starting, running, and output cannot occur. err = fmt.Errorf("deployment failed for step %s/%s", r.runID, r.pluginStepID) - r.stageChangeHandler.OnStepStageFailure(r, string(StageIDEnabling), &r.wg, err) - r.stageChangeHandler.OnStepStageFailure(r, string(StageIDDisabled), &r.wg, err) - r.stageChangeHandler.OnStepStageFailure(r, string(StageIDStarting), &r.wg, err) - r.stageChangeHandler.OnStepStageFailure(r, string(StageIDRunning), &r.wg, err) - r.stageChangeHandler.OnStepStageFailure(r, string(StageIDOutput), &r.wg, err) + r.markStageFailures(StageIDEnabling, err) } func (r *runningStep) transitionToCancelled() { @@ -1205,11 +1222,9 @@ func (r *runningStep) transitionToCancelled() { // This is called after deployment. So everything after deployment cannot occur. err := fmt.Errorf("step %s/%s cancelled", r.runID, r.pluginStepID) - r.stageChangeHandler.OnStepStageFailure(r, string(StageIDEnabling), &r.wg, err) - r.stageChangeHandler.OnStepStageFailure(r, string(StageIDDisabled), &r.wg, err) - r.stageChangeHandler.OnStepStageFailure(r, string(StageIDStarting), &r.wg, err) - r.stageChangeHandler.OnStepStageFailure(r, string(StageIDRunning), &r.wg, err) - r.stageChangeHandler.OnStepStageFailure(r, string(StageIDOutput), &r.wg, err) + // Note: This function is only called if it's cancelled during the deployment phase. + // If that changes, the stage IDs marked as failed need to be changed. + r.markStageFailures(StageIDEnabling, err) } func (r *runningStep) transitionToDisabled() { @@ -1231,9 +1246,8 @@ func (r *runningStep) transitionToDisabled() { ) err := fmt.Errorf("step %s/%s disabled", r.runID, r.pluginStepID) - r.stageChangeHandler.OnStepStageFailure(r, string(StageIDStarting), &r.wg, err) - r.stageChangeHandler.OnStepStageFailure(r, string(StageIDRunning), &r.wg, err) - r.stageChangeHandler.OnStepStageFailure(r, string(StageIDOutput), &r.wg, err) + r.markStageFailures(StageIDStarting, err) + } func (r *runningStep) startFailed(err error) { @@ -1249,8 +1263,8 @@ func (r *runningStep) startFailed(err error) { r.completeStep(StageIDCrashed, step.RunningStepStateFinished, &outputID, &output) - r.stageChangeHandler.OnStepStageFailure(r, string(StageIDRunning), &r.wg, err) - r.stageChangeHandler.OnStepStageFailure(r, string(StageIDOutput), &r.wg, err) + r.markStageFailures(StageIDRunning, err) + } func (r *runningStep) runFailed(err error) { @@ -1265,7 +1279,7 @@ func (r *runningStep) runFailed(err error) { }) r.completeStep(StageIDCrashed, step.RunningStepStateFinished, &outputID, &output) - r.stageChangeHandler.OnStepStageFailure(r, string(StageIDOutput), &r.wg, err) + r.markStageFailures(StageIDOutput, err) } // TransitionStage transitions the stage to the specified stage, and the state to the specified state. diff --git a/workflow/workflow.go b/workflow/workflow.go index 72ec3c7d..51a406e5 100644 --- a/workflow/workflow.go +++ b/workflow/workflow.go @@ -155,8 +155,8 @@ func (e *executableWorkflow) Execute(ctx context.Context, serializedInput any) ( } else { e.logger.Debugf("Step %q stage %q declared that it will not produce an output (%s)", stepID, stage, err.Error()) } - l.unresolveOutputs(stepID, stage, nil) - l.unresolveStageNode(stepID, stage) + l.markOutputsUnresolvable(stepID, stage, nil) + l.markStageNodeUnresolvable(stepID, stage) l.notifySteps() }, } @@ -338,9 +338,9 @@ func (l *loopState) onStageComplete(stepID string, previousStage *string, previo *previousStageOutput, ) } - // Unresolve all alternative output ID nodes. + // Mark all alternative output ID nodes as unresolvable. // Use the lifecycle to find the possible output IDs - l.unresolveOutputs(stepID, *previousStage, previousStageOutputID) + l.markOutputsUnresolvable(stepID, *previousStage, previousStageOutputID) // Placing data from the output into the general data structure l.data[WorkflowStepsKey].(map[string]any)[stepID].(map[string]any)[*previousStage] = map[string]any{} @@ -352,8 +352,8 @@ func (l *loopState) onStageComplete(stepID string, previousStage *string, previo // Marks the outputs of that stage unresolvable. // Optionally skip an output that should not unresolve. // Does not (un)resolve the non-output node for that stage. -// To prevent a deadlock, this `notifySteps()` should be called at some point after this is called. -func (l *loopState) unresolveOutputs(stepID string, stageID string, skippedOutput *string) { +// To prevent a deadlock, `notifySteps()` should be called at some point after this is called. +func (l *loopState) markOutputsUnresolvable(stepID string, stageID string, skippedOutput *string) { stages := l.lifecycles[stepID].Stages for _, stage := range stages { if stage.ID != stageID { @@ -368,29 +368,29 @@ func (l *loopState) unresolveOutputs(stepID string, stageID string, skippedOutpu l.logger.Warningf("Could not get DAG node %s (%s)", stepID+"."+stage.ID+"."+stageOutputID, err.Error()) continue } - l.logger.Debugf("Will unresolve node %s in the DAG", stepID+"."+stage.ID+"."+stageOutputID) + l.logger.Debugf("Will resolve node %s in the DAG as unresolvable", stepID+"."+stage.ID+"."+stageOutputID) err = unresolvableOutputNode.ResolveNode(dgraph.Unresolvable) if err != nil { - l.logger.Errorf("Error while unresolving node %s in DAG (%s)", unresolvableOutputNode.ID(), err.Error()) + l.logger.Errorf("Error while resolving node %s in DAG as unresolvable (%s)", unresolvableOutputNode.ID(), err.Error()) } } } } -// unresolves the stage node for the step's stage. -// Does not (un)resolve the outputs of that node. For that, call unresolveOutputs() instead or +// Marks the stage node for the step's stage as unresolvable. +// Does not (un)resolve the outputs of that node. For that, call markOutputsUnresolvable() instead or // in addition to calling this function. -// To prevent a deadlock, this `notifySteps()` should be called at some point after this is called. -func (l *loopState) unresolveStageNode(stepID string, stageID string) { +// To prevent a deadlock, `notifySteps()` should be called at some point after this is called. +func (l *loopState) markStageNodeUnresolvable(stepID string, stageID string) { unresolvableOutputNode, err := l.dag.GetNodeByID(GetStageNodeID(stepID, stageID)) if err != nil { l.logger.Warningf("Could not get DAG node %s (%s)", stepID+"."+stageID, err.Error()) return } - l.logger.Debugf("Will unresolve node %s in the DAG", stepID+"."+stageID) + l.logger.Debugf("Will resolve node %s in the DAG as unresolvable", stepID+"."+stageID) err = unresolvableOutputNode.ResolveNode(dgraph.Unresolvable) if err != nil { - l.logger.Errorf("Error while unresolving node %s in DAG (%s)", unresolvableOutputNode.ID(), err.Error()) + l.logger.Errorf("Error while resolving node %s in DAG as unresolvable (%s)", unresolvableOutputNode.ID(), err.Error()) } } From 79bea59b9bdf1309ada715cba02518f964390249 Mon Sep 17 00:00:00 2001 From: Jared O'Connell Date: Wed, 10 Jul 2024 15:18:17 -0400 Subject: [PATCH 07/22] Address review comments --- workflow/workflow.go | 8 ++++---- workflow/workflow_test.go | 15 ++++++++------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/workflow/workflow.go b/workflow/workflow.go index 51a406e5..d3feae76 100644 --- a/workflow/workflow.go +++ b/workflow/workflow.go @@ -368,10 +368,10 @@ func (l *loopState) markOutputsUnresolvable(stepID string, stageID string, skipp l.logger.Warningf("Could not get DAG node %s (%s)", stepID+"."+stage.ID+"."+stageOutputID, err.Error()) continue } - l.logger.Debugf("Will resolve node %s in the DAG as unresolvable", stepID+"."+stage.ID+"."+stageOutputID) + l.logger.Debugf("Will mark node %s in the DAG as unresolvable", stepID+"."+stage.ID+"."+stageOutputID) err = unresolvableOutputNode.ResolveNode(dgraph.Unresolvable) if err != nil { - l.logger.Errorf("Error while resolving node %s in DAG as unresolvable (%s)", unresolvableOutputNode.ID(), err.Error()) + l.logger.Errorf("Error while marking node %s in DAG as unresolvable (%s)", unresolvableOutputNode.ID(), err.Error()) } } } @@ -387,10 +387,10 @@ func (l *loopState) markStageNodeUnresolvable(stepID string, stageID string) { l.logger.Warningf("Could not get DAG node %s (%s)", stepID+"."+stageID, err.Error()) return } - l.logger.Debugf("Will resolve node %s in the DAG as unresolvable", stepID+"."+stageID) + l.logger.Debugf("Will mark node %s in the DAG as unresolvable", stepID+"."+stageID) err = unresolvableOutputNode.ResolveNode(dgraph.Unresolvable) if err != nil { - l.logger.Errorf("Error while resolving node %s in DAG as unresolvable (%s)", unresolvableOutputNode.ID(), err.Error()) + l.logger.Errorf("Error while marking node %s in DAG as unresolvable (%s)", unresolvableOutputNode.ID(), err.Error()) } } diff --git a/workflow/workflow_test.go b/workflow/workflow_test.go index 478e4793..dac0b40d 100644 --- a/workflow/workflow_test.go +++ b/workflow/workflow_test.go @@ -308,7 +308,6 @@ func TestWithDoubleSerializationDetection(t *testing.T) { }, } for i, testData := range testIter { - t.Logf("Starting iteration %d", i) errorDetect := util.NewInvalidSerializationDetectorSchema() // Inject the error detector into the object rootObject.PropertiesValue["error_detector"] = schema.NewPropertySchema( @@ -1414,7 +1413,7 @@ outputs: ` func TestMultiDependencyWorkflowFailureWithDisabling(t *testing.T) { - // Tests failure when one dependency fails immediately, and the other one fails later. + // Tests failure when one dependency is disabled immediately, and the other one completes later. preparedWorkflow := assert.NoErrorR[workflow.ExecutableWorkflow](t)( getTestImplPreparedWorkflow(t, multiDependencyFailureWorkflowWithDisabling), ) @@ -1469,7 +1468,7 @@ outputs: ` func TestMultiDependencyWorkflowFailureWithCancellation(t *testing.T) { - // Tests failure when one dependency fails immediately, and the other one fails later. + // Tests failure when one dependency is cancelled immediately, and the other one completes later. preparedWorkflow := assert.NoErrorR[workflow.ExecutableWorkflow](t)( getTestImplPreparedWorkflow(t, multiDependencyFailureWorkflowWithCancellation), ) @@ -1520,7 +1519,8 @@ outputs: ` func TestMultiDependencyWorkflowFailureWithErrorOut(t *testing.T) { - // Tests failure when one dependency fails immediately, and the other one fails later. + // Tests failure when one dependency fails due to the wrong output immediately, and the + // other one completes later. preparedWorkflow := assert.NoErrorR[workflow.ExecutableWorkflow](t)( getTestImplPreparedWorkflow(t, multiDependencyFailureWorkflowWithErrorOut), ) @@ -1576,9 +1576,10 @@ outputs: ` func TestMultiDependencyWorkflowFailureWithDeployFailure(t *testing.T) { - // Tests failure when one dependency fails immediately, and the other one fails later. - // In this specific test the output depends on the `steps.step_name.outputs` node - // instead of the `steps.step_name.outputs.success` node because they are handled + // Tests failure when one dependency fails (due to failed deployment) immediately, + // and the other one fails later. + // In this specific test the output depends on the `steps.failed_step.outputs` node + // instead of the `steps.failed_step.outputs.success` node because they are handled // differently in the engine. preparedWorkflow := assert.NoErrorR[workflow.ExecutableWorkflow](t)( getTestImplPreparedWorkflow(t, multiDependencyFailureWorkflowWithDeployFailure), From b7df1e540f0894fd403d440134ed6d35083ffdf9 Mon Sep 17 00:00:00 2001 From: Jared O'Connell Date: Wed, 10 Jul 2024 15:28:03 -0400 Subject: [PATCH 08/22] Improve comment --- workflow/workflow_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/workflow_test.go b/workflow/workflow_test.go index dac0b40d..c5c65dff 100644 --- a/workflow/workflow_test.go +++ b/workflow/workflow_test.go @@ -1519,7 +1519,7 @@ outputs: ` func TestMultiDependencyWorkflowFailureWithErrorOut(t *testing.T) { - // Tests failure when one dependency fails due to the wrong output immediately, and the + // Tests failure when one dependency fails immediately due to the wrong output, and the // other one completes later. preparedWorkflow := assert.NoErrorR[workflow.ExecutableWorkflow](t)( getTestImplPreparedWorkflow(t, multiDependencyFailureWorkflowWithErrorOut), From 24ccfd53bad8862954b9efe36ae1818df6df56e6 Mon Sep 17 00:00:00 2001 From: Jared O'Connell Date: Wed, 10 Jul 2024 15:40:54 -0400 Subject: [PATCH 09/22] Remove unused variable --- workflow/workflow_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/workflow_test.go b/workflow/workflow_test.go index c5c65dff..59f140bd 100644 --- a/workflow/workflow_test.go +++ b/workflow/workflow_test.go @@ -307,7 +307,7 @@ func TestWithDoubleSerializationDetection(t *testing.T) { map[string]any{}, }, } - for i, testData := range testIter { + for _, testData := range testIter { errorDetect := util.NewInvalidSerializationDetectorSchema() // Inject the error detector into the object rootObject.PropertiesValue["error_detector"] = schema.NewPropertySchema( From 44b07de18214f6ca1ee944b95858264344b91d0f Mon Sep 17 00:00:00 2001 From: Jared O'Connell Date: Wed, 10 Jul 2024 17:48:09 -0400 Subject: [PATCH 10/22] Fix race condition This ensures that the waitgroup properly waits for all steps. There used to be a race condition that caused it to call Wait before Add() --- internal/step/plugin/provider.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/internal/step/plugin/provider.go b/internal/step/plugin/provider.go index 79cc4bc2..cfddb7ac 100644 --- a/internal/step/plugin/provider.go +++ b/internal/step/plugin/provider.go @@ -688,6 +688,7 @@ func (r *runnableStep) Start(input map[string]any, runID string, stageChangeHand runID: runID, } + s.wg.Add(1) // Wait for the run to finish before closing. go s.run() return s, nil @@ -922,8 +923,8 @@ func (r *runningStep) closeComponents(closeATP bool) error { return nil } +// Note: Caller must add 1 to the waitgroup before calling. func (r *runningStep) run() { - r.wg.Add(1) // Wait for the run to finish before closing. defer func() { r.cancel() // Close before WaitGroup done r.wg.Done() // Done. Close may now exit. From d346d808b57625a57e6f752c3508385428947b2b Mon Sep 17 00:00:00 2001 From: Jared O'Connell Date: Fri, 19 Jul 2024 20:23:02 -0400 Subject: [PATCH 11/22] Fix several issues in failure cases Fixed improper locking, paths that could lead to double-resolution, and more. --- internal/step/foreach/provider.go | 34 +++++++--- internal/step/lifecycle.go | 14 ++--- internal/step/plugin/provider.go | 90 +++++++++++++++++++-------- internal/step/plugin/provider_test.go | 2 + workflow/executor.go | 8 ++- workflow/workflow.go | 30 ++++++--- workflow/workflow_test.go | 90 ++++++++++++++++++++++++--- 7 files changed, 207 insertions(+), 61 deletions(-) diff --git a/internal/step/foreach/provider.go b/internal/step/foreach/provider.go index f2ded8b4..3ddf9bf9 100644 --- a/internal/step/foreach/provider.go +++ b/internal/step/foreach/provider.go @@ -3,6 +3,7 @@ package foreach import ( "context" "fmt" + "go.arcalot.io/dgraph" "reflect" "sync" @@ -46,9 +47,9 @@ var executeLifecycleStage = step.LifecycleStage{ "items": {}, "wait_for": {}, }, - NextStages: []string{ - string(StageIDOutputs), - string(StageIDFailed), + NextStages: map[string]dgraph.DependencyType{ + string(StageIDOutputs): dgraph.AndDependency, + string(StageIDFailed): dgraph.CompletionAndDependency, }, Fatal: false, } @@ -58,7 +59,7 @@ var outputLifecycleStage = step.LifecycleStage{ RunningName: "output", FinishedName: "output", InputFields: map[string]struct{}{}, - NextStages: []string{}, + NextStages: map[string]dgraph.DependencyType{}, Fatal: false, } var errorLifecycleStage = step.LifecycleStage{ @@ -67,7 +68,7 @@ var errorLifecycleStage = step.LifecycleStage{ RunningName: "processing error", FinishedName: "error", InputFields: map[string]struct{}{}, - NextStages: []string{}, + NextStages: map[string]dgraph.DependencyType{}, Fatal: true, } @@ -359,6 +360,7 @@ type runningStep struct { inputAvailable bool inputData chan []any ctx context.Context + closed bool wg sync.WaitGroup cancel context.CancelFunc stageChangeHandler step.StageChangeHandler @@ -368,6 +370,11 @@ type runningStep struct { func (r *runningStep) ProvideStageInput(stage string, input map[string]any) error { r.lock.Lock() + if r.closed { + r.logger.Debugf("exiting foreach ProvideStageInput due to step being closed") + r.lock.Unlock() + return nil + } switch stage { case string(StageIDExecute): items := input["items"] @@ -390,8 +397,8 @@ func (r *runningStep) ProvideStageInput(stage string, input map[string]any) erro r.currentState = step.RunningStepStateRunning } r.inputAvailable = true + r.inputData <- input // Send before unlock to ensure that it never gets closed before sending. r.lock.Unlock() - r.inputData <- input return nil case string(StageIDOutputs): r.lock.Unlock() @@ -412,14 +419,19 @@ func (r *runningStep) CurrentStage() string { } func (r *runningStep) State() step.RunningStepState { - r.lock.Lock() + r.lock.Lock() // TODO: Determine why this gets stuck. defer r.lock.Unlock() return r.currentState } func (r *runningStep) Close() error { + r.lock.Lock() + r.closed = true + r.lock.Unlock() r.cancel() r.wg.Wait() + r.logger.Debugf("Closing inputData channel in foreach step provider") + close(r.inputData) return nil } @@ -431,7 +443,7 @@ func (r *runningStep) ForceClose() error { func (r *runningStep) run() { r.wg.Add(1) defer func() { - close(r.inputData) + r.logger.Debugf("foreach run function done") r.wg.Done() }() waitingForInput := false @@ -471,7 +483,10 @@ func (r *runningStep) run() { input := input go func() { defer func() { - <-sem + select { + case <-sem: + case <-r.ctx.Done(): // Must not deadlock if closed early. + } wg.Done() }() r.logger.Debugf("Queuing item %d...", i) @@ -540,6 +555,7 @@ func (r *runningStep) run() { r.lock.Unlock() r.stageChangeHandler.OnStepComplete(r, previousStage, &outputID, &outputData, &r.wg) case <-r.ctx.Done(): + r.logger.Debugf("context done") return } diff --git a/internal/step/lifecycle.go b/internal/step/lifecycle.go index 83495250..700fe2fd 100644 --- a/internal/step/lifecycle.go +++ b/internal/step/lifecycle.go @@ -30,10 +30,10 @@ func (l Lifecycle[StageType]) DAG() (dgraph.DirectedGraph[StageType], error) { } } for _, stage := range l.Stages { - node := lang.Must2(dag.GetNodeByID(stage.Identifier())) - for _, nextStage := range stage.NextStageIDs() { - if err := node.Connect(nextStage); err != nil { - return nil, fmt.Errorf("failed to connect lifecycle stage %s to %s (%w)", node.ID(), nextStage, err) + for nextStage, dependencyType := range stage.NextStageIDs() { + nextStageNode := lang.Must2(dag.GetNodeByID(nextStage)) + if err := nextStageNode.ConnectDependency(stage.Identifier(), dependencyType); err != nil { + return nil, fmt.Errorf("failed to connect lifecycle stage %s to %s (%w)", stage.Identifier(), nextStage, err) } } } @@ -52,7 +52,7 @@ type lifecycleStage interface { // Identifier returns the ID of the stage. Identifier() string // NextStageIDs returns the next stage identifiers. - NextStageIDs() []string + NextStageIDs() map[string]dgraph.DependencyType } // LifecycleStage is the description of a single stage within a step lifecycle. @@ -72,7 +72,7 @@ type LifecycleStage struct { // will pause if there is no input available. // It will automatically create a DAG node between the current and the described next stages to ensure // that it is running in order. - NextStages []string + NextStages map[string]dgraph.DependencyType // Fatal indicates that this stage should be treated as fatal unless handled by the workflow. Fatal bool } @@ -83,7 +83,7 @@ func (l LifecycleStage) Identifier() string { } // NextStageIDs is a helper function that returns the next possible stages. -func (l LifecycleStage) NextStageIDs() []string { +func (l LifecycleStage) NextStageIDs() map[string]dgraph.DependencyType { return l.NextStages } diff --git a/internal/step/plugin/provider.go b/internal/step/plugin/provider.go index cfddb7ac..92d84ec3 100644 --- a/internal/step/plugin/provider.go +++ b/internal/step/plugin/provider.go @@ -3,6 +3,7 @@ package plugin import ( "context" "fmt" + "go.arcalot.io/dgraph" "go.flow.arcalot.io/pluginsdk/plugin" "reflect" "strings" @@ -168,8 +169,9 @@ var deployingLifecycleStage = step.LifecycleStage{ RunningName: "deploying", FinishedName: "deployed", InputFields: map[string]struct{}{string(StageIDDeploy): {}}, - NextStages: []string{ - string(StageIDStarting), string(StageIDDeployFailed), + NextStages: map[string]dgraph.DependencyType{ + string(StageIDStarting): dgraph.AndDependency, + string(StageIDDeployFailed): dgraph.CompletionAndDependency, }, Fatal: false, } @@ -189,8 +191,10 @@ var enablingLifecycleStage = step.LifecycleStage{ InputFields: map[string]struct{}{ "enabled": {}, }, - NextStages: []string{ - string(StageIDStarting), string(StageIDDisabled), string(StageIDCrashed), + NextStages: map[string]dgraph.DependencyType{ + string(StageIDStarting): dgraph.AndDependency, + string(StageIDDisabled): dgraph.AndDependency, + string(StageIDCrashed): dgraph.CompletionAndDependency, }, } @@ -203,8 +207,9 @@ var startingLifecycleStage = step.LifecycleStage{ "input": {}, "wait_for": {}, }, - NextStages: []string{ - string(StageIDRunning), string(StageIDCrashed), + NextStages: map[string]dgraph.DependencyType{ + string(StageIDRunning): dgraph.AndDependency, + string(StageIDCrashed): dgraph.CompletionAndDependency, }, } @@ -214,8 +219,9 @@ var runningLifecycleStage = step.LifecycleStage{ RunningName: "running", FinishedName: "completed", InputFields: map[string]struct{}{}, - NextStages: []string{ - string(StageIDOutput), string(StageIDCrashed), + NextStages: map[string]dgraph.DependencyType{ + string(StageIDOutput): dgraph.AndDependency, + string(StageIDCrashed): dgraph.CompletionAndDependency, }, } @@ -227,8 +233,10 @@ var cancelledLifecycleStage = step.LifecycleStage{ InputFields: map[string]struct{}{ "stop_if": {}, }, - NextStages: []string{ - string(StageIDOutput), string(StageIDCrashed), string(StageIDDeployFailed), + NextStages: map[string]dgraph.DependencyType{ + string(StageIDOutput): dgraph.AndDependency, + string(StageIDCrashed): dgraph.CompletionAndDependency, + string(StageIDDeployFailed): dgraph.CompletionAndDependency, }, } @@ -902,6 +910,7 @@ func (r *runningStep) closeComponents(closeATP bool) error { r.cancel() r.lock.Lock() if r.closed { + r.lock.Unlock() return nil // Already closed } var atpErr error @@ -941,7 +950,7 @@ func (r *runningStep) run() { r.logger.Warningf("failed to remove deployed container for step %s/%s", r.runID, r.pluginStepID) } r.lock.Unlock() - r.transitionToCancelled() + r.cancelledEarly() return default: r.container = container @@ -960,9 +969,6 @@ func (r *runningStep) run() { return } - // It's enabled, so the disabled stage will not occur. - r.stageChangeHandler.OnStepStageFailure(r, string(StageIDDisabled), &r.wg, err) - if err := r.startStage(container); err != nil { r.startFailed(err) return @@ -1050,12 +1056,20 @@ func (r *runningStep) enableStage() (bool, error) { &r.wg, ) + var enabled bool select { - case enabled := <-r.enabledInput: - return enabled, nil + case enabled = <-r.enabledInput: case <-r.ctx.Done(): return false, fmt.Errorf("step closed while determining enablement status") } + + if enabled { + r.lock.Lock() + // It's enabled, so the disabled stage will not occur. + r.stageChangeHandler.OnStepStageFailure(r, string(StageIDDisabled), &r.wg, fmt.Errorf("step enabled; cannot be disabled anymore")) + r.lock.Unlock() + } + return enabled, nil } func (r *runningStep) startStage(container deployer.Plugin) error { @@ -1150,9 +1164,6 @@ func (r *runningStep) runStage() error { var result atp.ExecutionResult select { case result = <-r.executionChannel: - if result.Error != nil { - return result.Error - } case <-r.ctx.Done(): // In this case, it is being instructed to stop. A signal should have been sent. // Shutdown (with sigterm) the container, then wait for the output (valid or error). @@ -1170,6 +1181,10 @@ func (r *runningStep) runStage() error { } + if result.Error != nil { + return result.Error + } + // Execution complete, move to state running stage outputs, then to state finished stage. r.transitionStage(StageIDOutput, step.RunningStepStateRunning) r.completeStep(r.currentStage, step.RunningStepStateFinished, &result.OutputID, &result.OutputData) @@ -1178,6 +1193,8 @@ func (r *runningStep) runStage() error { } func (r *runningStep) markStageFailures(firstStage StageID, err error) { + r.lock.Lock() + defer r.lock.Unlock() switch firstStage { case StageIDEnabling: r.stageChangeHandler.OnStepStageFailure(r, string(StageIDEnabling), &r.wg, err) @@ -1214,15 +1231,16 @@ func (r *runningStep) deployFailed(err error) { r.markStageFailures(StageIDEnabling, err) } -func (r *runningStep) transitionToCancelled() { +func (r *runningStep) cancelledEarly() { r.logger.Infof("Step %s/%s cancelled", r.runID, r.pluginStepID) // Follow the convention of transitioning to running then finished. r.transitionStage(StageIDCancelled, step.RunningStepStateRunning) - // Cancelled currently has no output. - r.transitionStage(StageIDCancelled, step.RunningStepStateFinished) - // This is called after deployment. So everything after deployment cannot occur. err := fmt.Errorf("step %s/%s cancelled", r.runID, r.pluginStepID) + // Cancelled currently has no output. + // Set it as unresolvable since it's cancelled, and to prevent conflicts with its inputs. + r.transitionFromFailedStage(StageIDCancelled, step.RunningStepStateFinished, err) + // Note: This function is only called if it's cancelled during the deployment phase. // If that changes, the stage IDs marked as failed need to be changed. r.markStageFailures(StageIDEnabling, err) @@ -1248,12 +1266,11 @@ func (r *runningStep) transitionToDisabled() { err := fmt.Errorf("step %s/%s disabled", r.runID, r.pluginStepID) r.markStageFailures(StageIDStarting, err) - } func (r *runningStep) startFailed(err error) { r.logger.Debugf("Start failed stage for step %s/%s", r.runID, r.pluginStepID) - r.transitionStage(StageIDCrashed, step.RunningStepStateRunning) + r.transitionFromFailedStage(StageIDCrashed, step.RunningStepStateRunning, err) r.logger.Warningf("Plugin step %s/%s start failed. %v", r.runID, r.pluginStepID, err) // Now it's done. @@ -1288,8 +1305,29 @@ func (r *runningStep) transitionStage(newStage StageID, state step.RunningStepSt r.transitionStageWithOutput(newStage, state, nil, nil) } +func (r *runningStep) transitionFromFailedStage(newStage StageID, state step.RunningStepState, err error) { + r.lock.Lock() + defer r.lock.Unlock() + previousStage := string(r.currentStage) + r.currentStage = newStage + // Don't forget to update this, or else it will behave very oddly. + // First running, then finished. You can't skip states. + r.state = state + r.stageChangeHandler.OnStepStageFailure( + r, + previousStage, + &r.wg, + err, + ) +} + // TransitionStage transitions the stage to the specified stage, and the state to the specified state. -func (r *runningStep) transitionStageWithOutput(newStage StageID, state step.RunningStepState, outputID *string, previousStageOutput *any) { +func (r *runningStep) transitionStageWithOutput( + newStage StageID, + state step.RunningStepState, + outputID *string, + previousStageOutput *any, +) { // A current lack of observability into the atp client prevents // non-fragile testing of this function. r.lock.Lock() diff --git a/internal/step/plugin/provider_test.go b/internal/step/plugin/provider_test.go index e946f000..6d458229 100644 --- a/internal/step/plugin/provider_test.go +++ b/internal/step/plugin/provider_test.go @@ -518,3 +518,5 @@ func TestProvider_StartFail(t *testing.T) { assert.NoError(t, running.Close()) }) } + +// TODO: Add more tests here for the current functions and code paths. diff --git a/workflow/executor.go b/workflow/executor.go index 9c1c8816..132b0d23 100644 --- a/workflow/executor.go +++ b/workflow/executor.go @@ -413,8 +413,12 @@ func (e *executor) connectStepDependencies( if err != nil { return fmt.Errorf("bug: node for current stage not found (%w)", err) } - for _, nextStage := range stage.NextStages { - if err := currentStageNode.Connect(GetStageNodeID(stepID, nextStage)); err != nil { + for nextStage, dependencyType := range stage.NextStages { + nextStageNode, err := dag.GetNodeByID(GetStageNodeID(stepID, nextStage)) + if err != nil { + return fmt.Errorf("bug: node for next stage not found (%w)", err) + } + if err := nextStageNode.ConnectDependency(currentStageNode.ID(), dependencyType); err != nil { return fmt.Errorf("bug: cannot connect nodes (%w)", err) } } diff --git a/workflow/workflow.go b/workflow/workflow.go index d3feae76..a25ed9e6 100644 --- a/workflow/workflow.go +++ b/workflow/workflow.go @@ -284,7 +284,13 @@ errGatherLoop: } } -func (l *loopState) onStageComplete(stepID string, previousStage *string, previousStageOutputID *string, previousStageOutput *any, wg *sync.WaitGroup) { +func (l *loopState) onStageComplete( + stepID string, + previousStage *string, + previousStageOutputID *string, + previousStageOutput *any, + wg *sync.WaitGroup, +) { l.lock.Lock() defer func() { if previousStage != nil { @@ -303,10 +309,10 @@ func (l *loopState) onStageComplete(stepID string, previousStage *string, previo l.cancel() return } - l.logger.Debugf("Resolving node %q in the DAG", stageNode.ID()) + l.logger.Debugf("Resolving node %q in the DAG on stage complete", stageNode.ID()) if err := stageNode.ResolveNode(dgraph.Resolved); err != nil { - l.logger.Errorf("Failed to resolve stage node ID %s (%w)", stageNode.ID(), err) - l.recentErrors <- fmt.Errorf("failed to resolve stage node ID %s (%w)", stageNode.ID(), err) + errMessage := fmt.Errorf("failed to resolve stage node ID %s (%s)", stageNode.ID(), err.Error()) + l.recentErrors <- errMessage l.cancel() return } @@ -320,7 +326,7 @@ func (l *loopState) onStageComplete(stepID string, previousStage *string, previo } // Resolves the node in the DAG. This allows us to know which nodes are // ready for processing due to all dependencies being resolved. - l.logger.Debugf("Resolving node %q in the DAG", outputNode.ID()) + l.logger.Debugf("Resolving output node %q in the DAG", outputNode.ID()) if err := outputNode.ResolveNode(dgraph.Resolved); err != nil { l.logger.Errorf("Failed to resolve output node ID %s (%w)", outputNode.ID(), err) l.recentErrors <- fmt.Errorf("failed to resolve output node ID %s (%w)", outputNode.ID(), err) @@ -371,7 +377,7 @@ func (l *loopState) markOutputsUnresolvable(stepID string, stageID string, skipp l.logger.Debugf("Will mark node %s in the DAG as unresolvable", stepID+"."+stage.ID+"."+stageOutputID) err = unresolvableOutputNode.ResolveNode(dgraph.Unresolvable) if err != nil { - l.logger.Errorf("Error while marking node %s in DAG as unresolvable (%s)", unresolvableOutputNode.ID(), err.Error()) + panic(fmt.Errorf("error while marking node %s in DAG as unresolvable (%s)", unresolvableOutputNode.ID(), err.Error())) } } } @@ -390,7 +396,13 @@ func (l *loopState) markStageNodeUnresolvable(stepID string, stageID string) { l.logger.Debugf("Will mark node %s in the DAG as unresolvable", stepID+"."+stageID) err = unresolvableOutputNode.ResolveNode(dgraph.Unresolvable) if err != nil { - l.logger.Errorf("Error while marking node %s in DAG as unresolvable (%s)", unresolvableOutputNode.ID(), err.Error()) + panic( + fmt.Errorf( + "error while marking node %s in DAG as unresolvable (%s)", + unresolvableOutputNode.ID(), + err.Error(), + ), + ) } } @@ -404,7 +416,7 @@ func (l *loopState) notifySteps() { //nolint:gocognit // Can include runnable nodes, nodes that cannot be resolved, and nodes that are not for running, like inputs. for nodeID, resolutionStatus := range readyNodes { failed := resolutionStatus == dgraph.Unresolvable - l.logger.Debugf("Processing step node %s", nodeID) + l.logger.Debugf("Processing step node %s with resolution status %q", nodeID, resolutionStatus) node, err := l.dag.GetNodeByID(nodeID) if err != nil { panic(fmt.Errorf("failed to get node %s (%w)", nodeID, err)) @@ -438,6 +450,8 @@ func (l *loopState) notifySteps() { //nolint:gocognit // untypedInputData stores the resolved data untypedInputData, err := l.resolveExpressions(inputData, l.data) if err != nil { + // An error here often indicates a locking issue in a step provider. This would be caused + // by the lock not being held when the output was marked resolved. panic(fmt.Errorf("cannot resolve expressions for %s (%w)", nodeID, err)) } diff --git a/workflow/workflow_test.go b/workflow/workflow_test.go index 59f140bd..67aee20b 100644 --- a/workflow/workflow_test.go +++ b/workflow/workflow_test.go @@ -54,7 +54,7 @@ func TestOutputFailed(t *testing.T) { var typedError *workflow.ErrNoMorePossibleOutputs if !errors.As(err, &typedError) { - t.Fatalf("incorrect error type returned: %T", err) + t.Fatalf("incorrect error type returned: %T (%s)", err, err) } } @@ -756,7 +756,7 @@ func TestMissingInputsFailedDeployment(t *testing.T) { assert.Equals(t, outputID, "") var typedError *workflow.ErrNoMorePossibleOutputs if !errors.As(err, &typedError) { - t.Fatalf("incorrect error type returned: %T", err) + t.Fatalf("incorrect error type returned: %T (%s)", err, err) } } @@ -1218,7 +1218,7 @@ func TestInputCancelledStepWorkflow(t *testing.T) { assert.Equals(t, outputID, "") var typedError *workflow.ErrNoMorePossibleOutputs if !errors.As(err, &typedError) { - t.Fatalf("incorrect error type returned: %T", err) + t.Fatalf("incorrect error type returned: %T (%s)", err, err) } } @@ -1269,7 +1269,7 @@ func TestInputDisabledStepWorkflow(t *testing.T) { assert.Error(t, err) var typedError *workflow.ErrNoMorePossibleOutputs if !errors.As(err, &typedError) { - t.Fatalf("incorrect error type returned: %T", err) + t.Fatalf("incorrect error type returned: %T (%s)", err, err) } } @@ -1426,7 +1426,7 @@ func TestMultiDependencyWorkflowFailureWithDisabling(t *testing.T) { t.Logf("MultiDependency workflow failed purposefully in %d ms", duration.Milliseconds()) var typedError *workflow.ErrNoMorePossibleOutputs if !errors.As(err, &typedError) { - t.Fatalf("incorrect error type returned: %T", err) + t.Fatalf("incorrect error type returned: %T (%s)", err, err) } assert.LessThan(t, duration.Seconds(), 4) // It will take 5 seconds if it fails to fail early. } @@ -1481,7 +1481,7 @@ func TestMultiDependencyWorkflowFailureWithCancellation(t *testing.T) { t.Logf("MultiDependency workflow failed purposefully in %d ms", duration.Milliseconds()) var typedError *workflow.ErrNoMorePossibleOutputs if !errors.As(err, &typedError) { - t.Fatalf("incorrect error type returned: %T", err) + t.Fatalf("incorrect error type returned: %T (%s)", err, err) } assert.LessThan(t, duration.Seconds(), 4) // It will take 5 seconds if it fails to fail early. } @@ -1533,7 +1533,7 @@ func TestMultiDependencyWorkflowFailureWithErrorOut(t *testing.T) { t.Logf("MultiDependency workflow failed purposefully in %d ms", duration.Milliseconds()) var typedError *workflow.ErrNoMorePossibleOutputs if !errors.As(err, &typedError) { - t.Fatalf("incorrect error type returned: %T", err) + t.Fatalf("incorrect error type returned: %T (%s)", err, err) } // If it takes 5 seconds, then there was a deadlock in the client. // If it takes 6 seconds, then it waited for the second step. @@ -1571,7 +1571,7 @@ steps: wait_time_ms: 7000 outputs: workflow-success: - simple_wait_output: !expr $.steps.failed_step.outputs + failed_step_output: !expr $.steps.failed_step.outputs simple_wait_output: !expr $.steps.long_wait.outputs ` @@ -1593,13 +1593,85 @@ func TestMultiDependencyWorkflowFailureWithDeployFailure(t *testing.T) { t.Logf("MultiDependency workflow failed purposefully in %d ms", duration.Milliseconds()) var typedError *workflow.ErrNoMorePossibleOutputs if !errors.As(err, &typedError) { - t.Fatalf("incorrect error type returned: %T", err) + t.Fatalf("incorrect error type returned: %T (%s)", err, err) } // If it takes 5 seconds, then there was a deadlock in the client. // If it takes 6 seconds, then it waited for the second step. assert.LessThan(t, duration.Milliseconds(), 5500) } +var multiDependencyFailureWorkflowWithDoubleFailure = ` +version: v0.2.0 +input: + root: WorkflowInput + objects: + WorkflowInput: + id: WorkflowInput + properties: {} +steps: + failed_step_A: + plugin: + src: "n/a" + deployment_type: "builtin" + step: wait + input: + wait_time_ms: 0 + deploy: # This fails first. + deployer_name: "test-impl" + deploy_time: 0 + deploy_succeed: !expr 'false' + quick_step: + plugin: + src: "n/a" + deployment_type: "builtin" + step: wait + input: + wait_time_ms: 1 # Second, this succeeds, which cancels the second failing step. + failed_step_B: + plugin: + src: "n/a" + deployment_type: "builtin" + step: wait + wait_for: !expr $.steps.failed_step_A.outputs # Makes it unresolvable + stop_if: !expr $.steps.quick_step.outputs # Hopefully triggers the second resolution. + input: + wait_time_ms: 0 + deploy: + deployer_name: "test-impl" + deploy_succeed: !expr 'true' + long_wait: + plugin: + src: "n/a" + deployment_type: "builtin" + step: wait + input: + wait_time_ms: 100 +outputs: + workflow-success: + failed_step_output: !expr $.steps.failed_step_A.outputs + wait-only: # For the error to be exposed, we need an alternative output that persists beyond the error. + wait_output: !expr $.steps.long_wait.outputs +` + +func TestMultiDependencyWorkflowFailureDoubleFailure(t *testing.T) { + // Creates a scenario where step B's starting (due to wait-for) depends on step A's outputs, + // making A's outputs become unresolvable, while at the same time the step that needs that info (B) crashes. + // Transitioning B to crashed resolves starting, so that is in conflict with the unresolvable + // state propagated from step A. + preparedWorkflow := assert.NoErrorR[workflow.ExecutableWorkflow](t)( + getTestImplPreparedWorkflow(t, multiDependencyFailureWorkflowWithDoubleFailure), + ) + + startTime := time.Now() // Right before execute to not include pre-processing time. + _, _, err := preparedWorkflow.Execute(context.Background(), map[string]any{}) + duration := time.Since(startTime) + assert.NoError(t, err) + t.Logf("MultiDependency finished in %d ms", duration.Milliseconds()) +} + +// TODO: Create a situation where step A's outputs are resolvable, but the context is cancelled, +// resulting in a possible conflict with the cancelled step stage DAG node. + func createTestExecutableWorkflow(t *testing.T, workflowStr string, workflowCtx map[string][]byte) (workflow.ExecutableWorkflow, error) { logConfig := log.Config{ Level: log.LevelDebug, From 79a65af95011b43196912cdf318f4936fbd1180d Mon Sep 17 00:00:00 2001 From: Jared O'Connell Date: Thu, 25 Jul 2024 16:09:12 -0400 Subject: [PATCH 12/22] Added closed output and more tests --- internal/step/foreach/provider.go | 2 +- internal/step/plugin/provider.go | 361 +++++++++++++++++++++++------- workflow/workflow.go | 153 +++++++++---- workflow/workflow_test.go | 307 +++++++++++++++++++++---- 4 files changed, 658 insertions(+), 165 deletions(-) diff --git a/internal/step/foreach/provider.go b/internal/step/foreach/provider.go index 3ddf9bf9..4abdedf4 100644 --- a/internal/step/foreach/provider.go +++ b/internal/step/foreach/provider.go @@ -419,7 +419,7 @@ func (r *runningStep) CurrentStage() string { } func (r *runningStep) State() step.RunningStepState { - r.lock.Lock() // TODO: Determine why this gets stuck. + r.lock.Lock() defer r.lock.Unlock() return r.currentState } diff --git a/internal/step/plugin/provider.go b/internal/step/plugin/provider.go index 92d84ec3..583be81d 100644 --- a/internal/step/plugin/provider.go +++ b/internal/step/plugin/provider.go @@ -8,9 +8,10 @@ import ( "reflect" "strings" "sync" + "sync/atomic" "time" - log "go.arcalot.io/log/v2" + "go.arcalot.io/log/v2" "go.flow.arcalot.io/deployer" "go.flow.arcalot.io/deployer/registry" "go.flow.arcalot.io/engine/internal/step" @@ -159,6 +160,8 @@ const ( StageIDOutput StageID = "outputs" // StageIDCrashed is a stage that indicates that the plugin has quit unexpectedly. StageIDCrashed StageID = "crashed" + // StageIDClosed is a stage that indicates that the plugin has exited due to workflow termination. + StageIDClosed StageID = "closed" // StageIDStarting is a stage that indicates that the plugin execution has begun. StageIDStarting StageID = "starting" ) @@ -172,6 +175,7 @@ var deployingLifecycleStage = step.LifecycleStage{ NextStages: map[string]dgraph.DependencyType{ string(StageIDStarting): dgraph.AndDependency, string(StageIDDeployFailed): dgraph.CompletionAndDependency, + string(StageIDClosed): dgraph.CompletionAndDependency, }, Fatal: false, } @@ -195,6 +199,7 @@ var enablingLifecycleStage = step.LifecycleStage{ string(StageIDStarting): dgraph.AndDependency, string(StageIDDisabled): dgraph.AndDependency, string(StageIDCrashed): dgraph.CompletionAndDependency, + string(StageIDClosed): dgraph.CompletionAndDependency, }, } @@ -204,12 +209,14 @@ var startingLifecycleStage = step.LifecycleStage{ RunningName: "starting", FinishedName: "started", InputFields: map[string]struct{}{ - "input": {}, - "wait_for": {}, + "input": {}, + "wait_for": {}, + "closure_wait_timeout": {}, }, NextStages: map[string]dgraph.DependencyType{ string(StageIDRunning): dgraph.AndDependency, string(StageIDCrashed): dgraph.CompletionAndDependency, + string(StageIDClosed): dgraph.CompletionAndDependency, }, } @@ -222,6 +229,7 @@ var runningLifecycleStage = step.LifecycleStage{ NextStages: map[string]dgraph.DependencyType{ string(StageIDOutput): dgraph.AndDependency, string(StageIDCrashed): dgraph.CompletionAndDependency, + string(StageIDClosed): dgraph.CompletionAndDependency, }, } @@ -234,9 +242,10 @@ var cancelledLifecycleStage = step.LifecycleStage{ "stop_if": {}, }, NextStages: map[string]dgraph.DependencyType{ - string(StageIDOutput): dgraph.AndDependency, + string(StageIDOutput): dgraph.CompletionAndDependency, string(StageIDCrashed): dgraph.CompletionAndDependency, string(StageIDDeployFailed): dgraph.CompletionAndDependency, + string(StageIDClosed): dgraph.CompletionAndDependency, }, } @@ -260,6 +269,12 @@ var crashedLifecycleStage = step.LifecycleStage{ RunningName: "crashed", FinishedName: "crashed", } +var closedLifecycleStage = step.LifecycleStage{ + ID: string(StageIDClosed), + WaitingName: "closed", + RunningName: "closed", + FinishedName: "closed", +} // Lifecycle returns a lifecycle that contains all plugin lifecycle stages. func (p *pluginProvider) Lifecycle() step.Lifecycle[step.LifecycleStage] { @@ -275,6 +290,7 @@ func (p *pluginProvider) Lifecycle() step.Lifecycle[step.LifecycleStage] { disabledLifecycleStage, finishedLifecycleStage, crashedLifecycleStage, + closedLifecycleStage, }, } } @@ -429,6 +445,24 @@ func (r *runnableStep) DisabledOutputSchema() *schema.StepOutputSchema { ) } +func ClosureTimeoutSchema() *schema.PropertySchema { + return schema.NewPropertySchema( + schema.NewIntSchema(schema.PointerTo(int64(0)), nil, nil), + schema.NewDisplayValue( + schema.PointerTo("closure wait timeout"), + schema.PointerTo("The amount of milliseconds to wait after sending the cancel "+ + "signal on closure before force killing the step."), + nil, + ), + false, + nil, + nil, + nil, + schema.PointerTo("5000"), // Default 5 seconds. + nil, + ) +} + func (r *runnableStep) Lifecycle(input map[string]any) (result step.Lifecycle[step.LifecycleStageWithSchema], err error) { rawStepID, ok := input["step"] if !ok || rawStepID == nil { @@ -576,6 +610,7 @@ func (r *runnableStep) Lifecycle(input map[string]any) (result step.Lifecycle[st nil, nil, ), + "closure_wait_timeout": ClosureTimeoutSchema(), }, Outputs: map[string]*schema.StepOutputSchema{ "started": r.StartedSchema(), @@ -632,6 +667,51 @@ func (r *runnableStep) Lifecycle(input map[string]any) (result step.Lifecycle[st }, }, }, + { + LifecycleStage: closedLifecycleStage, + InputSchema: nil, + Outputs: map[string]*schema.StepOutputSchema{ + "result": { + SchemaValue: schema.NewScopeSchema( + schema.NewObjectSchema( + "ClosedInfo", + map[string]*schema.PropertySchema{ + "cancelled": schema.NewPropertySchema( + schema.NewBoolSchema(), + schema.NewDisplayValue( + schema.PointerTo("cancelled"), + schema.PointerTo("Whether the step was cancelled by stop_if"), + nil, + ), + true, + nil, + nil, + nil, + nil, + nil, + ), + "close_requested": schema.NewPropertySchema( + schema.NewBoolSchema(), + schema.NewDisplayValue( + schema.PointerTo("close requested"), + schema.PointerTo("Whether the step was closed with Close()"), + nil, + ), + true, + nil, + nil, + nil, + nil, + nil, + ), + }, + ), + ), + DisplayValue: nil, + ErrorValue: true, + }, + }, + }, }, }, nil } @@ -682,7 +762,7 @@ func (r *runnableStep) Start(input map[string]any, runID string, stageChangeHand ctx: ctx, cancel: cancel, deployInput: make(chan any, 1), - runInput: make(chan any, 1), + runInput: make(chan runInput, 1), enabledInput: make(chan bool, 1), logger: r.logger, deploymentType: r.deploymentType, @@ -690,9 +770,9 @@ func (r *runnableStep) Start(input map[string]any, runID string, stageChangeHand pluginStepID: stepID, state: step.RunningStepStateStarting, localDeployer: r.localDeployer, - executionChannel: make(chan atp.ExecutionResult), - signalToStep: make(chan schema.Input), - signalFromStep: make(chan schema.Input), + executionChannel: make(chan atp.ExecutionResult, 2), // Buffer in case a result happens after context done. + signalToStep: make(chan schema.Input, 10), + signalFromStep: make(chan schema.Input, 10), runID: runID, } @@ -702,6 +782,11 @@ func (r *runnableStep) Start(input map[string]any, runID string, stageChangeHand return s, nil } +type runInput struct { + stepInputData any + forceCloseTimeoutMS int64 +} + type runningStep struct { deployerRegistry registry.Registry stepSchema schema.Step @@ -710,12 +795,13 @@ type runningStep struct { wg sync.WaitGroup ctx context.Context cancel context.CancelFunc + cancelled bool atpClient atp.Client deployInput chan any deployInputAvailable bool enabledInput chan bool enabledInputAvailable bool - runInput chan any + runInput chan runInput runInputAvailable bool logger log.Logger currentStage StageID @@ -730,7 +816,7 @@ type runningStep struct { executionChannel chan atp.ExecutionResult signalToStep chan schema.Input // Communicates with the ATP client, not other steps. signalFromStep chan schema.Input // Communicates with the ATP client, not other steps. - closed bool + closed atomic.Bool // Store channels for sending pre-calculated signal outputs to other steps? // Store channels for receiving pre-calculated signal inputs from other steps? } @@ -755,8 +841,6 @@ func (r *runningStep) ProvideStageInput(stage string, input map[string]any) erro // and notifySteps function. r.lock.Lock() defer r.lock.Unlock() - r.logger.Debugf("ProvideStageInput START") - defer r.logger.Debugf("ProvideStageInput END") // Checks which stage it is getting input for, and handles it. switch stage { @@ -770,6 +854,8 @@ func (r *runningStep) ProvideStageInput(stage string, input map[string]any) erro return nil case string(StageIDCancelled): return r.provideCancelledInput(input) + case string(StageIDClosed): + return nil case string(StageIDDeployFailed): return nil case string(StageIDCrashed): @@ -840,6 +926,19 @@ func (r *runningStep) provideStartingInput(input map[string]any) error { if _, err := r.stepSchema.Input().Unserialize(input["input"]); err != nil { return err } + + // Now get the other necessary data for running the step + var timeout int64 + if input["closure_wait_timeout"] != nil { + unserializedTimeout, err := ClosureTimeoutSchema().Unserialize(input["closure_wait_timeout"]) + if err != nil { + return err + } + timeout = unserializedTimeout.(int64) + } else { + timeout = 5000 + } + // Make sure we transition the state before unlocking so there are no race conditions. r.runInputAvailable = true @@ -848,7 +947,10 @@ func (r *runningStep) provideStartingInput(input map[string]any) error { // Feed the run step its input over the channel. select { - case r.runInput <- input["input"]: + case r.runInput <- runInput{ + stepInputData: input["input"], + forceCloseTimeoutMS: timeout, + }: default: return fmt.Errorf("unable to provide input to run stage for step %s/%s", r.runID, r.pluginStepID) } @@ -858,12 +960,25 @@ func (r *runningStep) provideStartingInput(input map[string]any) error { func (r *runningStep) provideCancelledInput(input map[string]any) error { // Note: The calling function must have the step mutex locked // Cancel if the step field is present and isn't false - if input["stop_if"] != false && input["stop_if"] != nil { + if input["stop_if"] == nil { + return nil + } + if input["stop_if"] != false { + r.cancelled = true r.cancelStep() } return nil } +func (r *runningStep) hasCancellationHandler() bool { + handler, present := r.stepSchema.SignalHandlers()[plugin.CancellationSignalSchema.ID()] + return present && handler != nil +} + +func (r *runningStep) getCancellationHandler() *schema.SignalSchema { + return r.stepSchema.SignalHandlers()[plugin.CancellationSignalSchema.ID()] +} + // cancelStep gracefully requests cancellation for any stage. // If running, it sends a cancel signal if the plugin supports it. func (r *runningStep) cancelStep() { @@ -872,11 +987,14 @@ func (r *runningStep) cancelStep() { // If it isn't, cancelling the context alone should be enough. if r.currentStage == StageIDRunning { // Verify that the step has a cancel signal - cancelSignal := r.stepSchema.SignalHandlers()[plugin.CancellationSignalSchema.ID()] - if cancelSignal == nil { + if !r.hasCancellationHandler() { r.logger.Errorf("could not cancel step %s/%s. Does not contain cancel signal receiver.", r.runID, r.pluginStepID) - } else if err := plugin.CancellationSignalSchema.DataSchema().ValidateCompatibility(cancelSignal.DataSchema()); err != nil { + } + cancelSignal := r.getCancellationHandler() + if err := plugin.CancellationSignalSchema.DataSchema().ValidateCompatibility(cancelSignal.DataSchema()); err != nil { r.logger.Errorf("validation failed for cancel signal for step %s/%s: %s", r.runID, r.pluginStepID, err) + } else if r.signalToStep == nil { + r.logger.Debugf("signal send channel closed; the step %s/%s likely finished", r.runID, r.pluginStepID) } else { // Validated. Now call the signal. r.signalToStep <- schema.Input{RunID: r.runID, ID: cancelSignal.ID(), InputData: map[any]any{}} @@ -890,26 +1008,56 @@ func (r *runningStep) cancelStep() { // Warning: This means that it won't wait for the ATP client to finish. This is okay if using a deployer that // will stop execution once the deployer closes it. func (r *runningStep) ForceClose() error { - err := r.closeComponents(false) + closedAlready := r.closed.Swap(true) + if closedAlready { + r.wg.Wait() + return nil + } + err := r.forceClose() + if err != nil { + return err + } // Wait for the run to finish to ensure that it's not running after closing. r.wg.Wait() - r.closed = true r.logger.Warningf("Step %s/%s force closed.", r.runID, r.pluginStepID) + return nil +} + +// This is necessary so that the waitgroup's Wait() function is not called by a function whose +// completion is required to end the wait. +func (r *runningStep) forceCloseInternal() error { + closedAlready := r.closed.Swap(true) + if closedAlready { + return nil + } + return r.forceClose() +} + +func (r *runningStep) forceClose() error { + r.closed.Store(true) + r.cancel() + err := r.closeComponents(false) return err } func (r *runningStep) Close() error { + closedAlready := r.closed.Swap(true) + if closedAlready { + r.wg.Wait() + return nil + } + r.cancel() err := r.closeComponents(true) // Wait for the run to finish to ensure that it's not running after closing. r.wg.Wait() - r.closed = true + r.closed.Store(true) return err } func (r *runningStep) closeComponents(closeATP bool) error { r.cancel() r.lock.Lock() - if r.closed { + if r.closed.Load() { r.lock.Unlock() return nil // Already closed } @@ -938,8 +1086,14 @@ func (r *runningStep) run() { r.cancel() // Close before WaitGroup done r.wg.Done() // Done. Close may now exit. }() - container, err := r.deployStage() - if err != nil { + container, contextDoneEarly, err := r.deployStage() + if contextDoneEarly { + if err != nil { + r.logger.Debugf("error due to step early closure: %s", err.Error()) + } + r.closedEarly(StageIDEnabling, true) + return + } else if err != nil { r.deployFailed(err) return } @@ -947,21 +1101,28 @@ func (r *runningStep) run() { select { case <-r.ctx.Done(): if err := container.Close(); err != nil { - r.logger.Warningf("failed to remove deployed container for step %s/%s", r.runID, r.pluginStepID) + r.logger.Warningf("failed to close deployed container for step %s/%s", r.runID, r.pluginStepID) } r.lock.Unlock() - r.cancelledEarly() + r.closedEarly(StageIDEnabling, false) return default: r.container = container } + defer func() { + err := container.Close() + if err != nil { + r.logger.Errorf("failed to close deployed container for step %s/%s", r.runID, r.pluginStepID) + } + }() r.lock.Unlock() r.logger.Debugf("Successfully deployed container with ID '%s' for step %s/%s", container.ID(), r.runID, r.pluginStepID) r.logger.Debugf("Checking to see if step %s/%s is enabled", r.runID, r.pluginStepID) - enabled, err := r.enableStage() - if err != nil { - r.startFailed(err) + enabled, contextDoneEarly := r.enableStage() + if contextDoneEarly { + r.closedEarly(StageIDStarting, true) + return } r.logger.Debugf("Step %s/%s enablement state: %t", r.runID, r.pluginStepID, enabled) if !enabled { @@ -969,16 +1130,25 @@ func (r *runningStep) run() { return } - if err := r.startStage(container); err != nil { + var forceCloseTimeoutMS int64 + if contextDoneEarly, forceCloseTimeoutMS, err = r.startStage(container); contextDoneEarly { + r.closedEarly(StageIDRunning, true) + return + } else if err != nil { r.startFailed(err) return } - if err := r.runStage(); err != nil { + if err := r.runStage(forceCloseTimeoutMS); err != nil { r.runFailed(err) } } -func (r *runningStep) deployStage() (deployer.Plugin, error) { +// deployStage deploys the step. +// Return types: +// - deployer.Plugin: The deployer's connection to the plugin. +// - bool: True if the context was done, causing early failure. +// - err: The error, if a failure occurred. +func (r *runningStep) deployStage() (deployer.Plugin, bool, error) { r.logger.Debugf("Deploying stage for step %s/%s", r.runID, r.pluginStepID) r.lock.Lock() r.state = step.RunningStepStateRunning @@ -1014,7 +1184,7 @@ func (r *runningStep) deployStage() (deployer.Plugin, error) { r.state = step.RunningStepStateRunning r.lock.Unlock() case <-r.ctx.Done(): - return nil, fmt.Errorf("step closed before deployment config could be obtained") + return nil, true, nil } } r.lock.Lock() @@ -1027,18 +1197,21 @@ func (r *runningStep) deployStage() (deployer.Plugin, error) { stepDeployer, err = r.deployerRegistry.Create(r.deploymentType, deployerConfig, r.logger.WithLabel("source", "deployer")) if err != nil { - return nil, err + return nil, false, err } } container, err := stepDeployer.Deploy(r.ctx, r.source) if err != nil { - return nil, err + return nil, false, err } - return container, nil + return container, false, nil } // enableStage returns the result of whether the stage was enabled or not. -func (r *runningStep) enableStage() (bool, error) { +// Return values: +// - bool: Whether the step was enabled. +// - bool: True if the step was disabled due to context done. +func (r *runningStep) enableStage() (bool, bool) { r.lock.Lock() previousStage := string(r.currentStage) r.currentStage = StageIDEnabling @@ -1060,7 +1233,7 @@ func (r *runningStep) enableStage() (bool, error) { select { case enabled = <-r.enabledInput: case <-r.ctx.Done(): - return false, fmt.Errorf("step closed while determining enablement status") + return false, true } if enabled { @@ -1069,10 +1242,15 @@ func (r *runningStep) enableStage() (bool, error) { r.stageChangeHandler.OnStepStageFailure(r, string(StageIDDisabled), &r.wg, fmt.Errorf("step enabled; cannot be disabled anymore")) r.lock.Unlock() } - return enabled, nil + return enabled, false } -func (r *runningStep) startStage(container deployer.Plugin) error { +// startStage gets the inputs and starts the step. +// Return values: +// - bool: True if the step was closed early due to context done. +// - int64: the amount of milliseconds to wait to force terminate a step. +// - error: Any error that occurred while trying to start the step. +func (r *runningStep) startStage(container deployer.Plugin) (bool, int64, error) { r.logger.Debugf("Starting stage for step %s/%s", r.runID, r.pluginStepID) atpClient := atp.NewClientWithLogger(container, r.logger.WithLabel("source", "atp-client")) var inputReceivedEarly bool @@ -1080,7 +1258,7 @@ func (r *runningStep) startStage(container deployer.Plugin) error { r.atpClient = atpClient r.lock.Unlock() - var runInput any + var runInput runInput var newState step.RunningStepState select { case runInput = <-r.runInput: @@ -1119,44 +1297,51 @@ func (r *runningStep) startStage(container deployer.Plugin) error { r.state = step.RunningStepStateRunning r.lock.Unlock() case <-r.ctx.Done(): - return fmt.Errorf("step closed while waiting for run configuration") + r.logger.Debugf("step closed while waiting for run configuration") + return true, 0, nil } } inputSchema, err := r.atpClient.ReadSchema() if err != nil { - return err + return false, 0, err } steps := inputSchema.Steps() stepSchema, ok := steps[r.pluginStepID] if !ok { - return fmt.Errorf("error in run step %s: schema mismatch between local and remote deployed plugin, no stepSchema named %s found in remote", r.runID, r.pluginStepID) + return false, 0, fmt.Errorf("error in run step %s: schema mismatch between local and remote deployed plugin, no stepSchema named %s found in remote", r.runID, r.pluginStepID) } // Re-verify input. This should have also been done earlier. - if _, err := stepSchema.Input().Unserialize(runInput); err != nil { - return fmt.Errorf("schema mismatch between local and remote deployed plugin in step %s/%s, unserializing input failed (%w)", r.runID, r.pluginStepID, err) + if _, err := stepSchema.Input().Unserialize(runInput.stepInputData); err != nil { + return false, 0, fmt.Errorf("schema mismatch between local and remote deployed plugin in step %s/%s, unserializing input failed (%w)", r.runID, r.pluginStepID, err) } r.wg.Add(1) // Runs the ATP client in a goroutine in order to wait for it. - // On context done, the deployer has 30 seconds before it will error out. + // On context done, the deployer has limited time before it will error out. go func() { defer r.wg.Done() result := r.atpClient.Execute( - schema.Input{RunID: r.runID, ID: r.pluginStepID, InputData: runInput}, + schema.Input{RunID: r.runID, ID: r.pluginStepID, InputData: runInput.stepInputData}, r.signalToStep, r.signalFromStep, ) + r.lock.Lock() + // The sender should be the one to close the signal send channel + channel := r.signalToStep + r.signalToStep = nil + close(channel) + r.lock.Unlock() r.executionChannel <- result if err = r.atpClient.Close(); err != nil { r.logger.Warningf("Error while closing ATP client: %s", err) } }() - return nil + return false, runInput.forceCloseTimeoutMS, nil } -func (r *runningStep) runStage() error { +func (r *runningStep) runStage(forCloseWaitMS int64) error { r.logger.Debugf("Running stage for step %s/%s", r.runID, r.pluginStepID) startedOutput := any(map[any]any{}) r.transitionStageWithOutput(StageIDRunning, step.RunningStepStateRunning, schema.PointerTo("started"), &startedOutput) @@ -1166,19 +1351,29 @@ func (r *runningStep) runStage() error { case result = <-r.executionChannel: case <-r.ctx.Done(): // In this case, it is being instructed to stop. A signal should have been sent. - // Shutdown (with sigterm) the container, then wait for the output (valid or error). - r.logger.Debugf("Got step context done before step run complete. Waiting up to 30 seconds for result.") + if r.hasCancellationHandler() { + r.logger.Debugf("Got step context done before step run complete. Sending cancellation signal. Waiting up to %d milliseconds for result.", forCloseWaitMS) + r.lock.Lock() + r.cancelStep() + r.lock.Unlock() + } else { + r.logger.Warningf("Got step context done before step run complete. Force closing step %s/%s.", r.runID, r.pluginStepID) + err := r.forceCloseInternal() + return fmt.Errorf("step forced closed after timeout without result (%s)", err) + } + // Wait for cancellation to occur. select { case result = <-r.executionChannel: // Successfully stopped before end of timeout. - case <-time.After(time.Duration(30) * time.Second): - r.logger.Warningf("Step %s/%s did not complete within the 30 second time limit. Force closing container.", - r.runID, r.pluginStepID) - if err := r.ForceClose(); err != nil { - r.logger.Warningf("Error in step %s/%s while closing plugin container (%w)", r.runID, r.pluginStepID, err) + case <-time.After(time.Duration(forCloseWaitMS) * time.Millisecond): + r.logger.Warningf("Cancelled step %s/%s did not complete within the %d millisecond time limit. Force closing container.", + r.runID, r.pluginStepID, forCloseWaitMS) + if err := r.forceCloseInternal(); err != nil { + r.logger.Warningf("Error in step %s/%s while closing plugin container (%s)", r.runID, r.pluginStepID, err.Error()) + return fmt.Errorf("step closed after timeout without result with error (%s)", err.Error()) } + return fmt.Errorf("step closed after timeout without result") } - } if result.Error != nil { @@ -1215,6 +1410,13 @@ func (r *runningStep) markStageFailures(firstStage StageID, err error) { } } +// Closable is the graceful case, so this is necessary if it crashes. +func (r *runningStep) markNotClosable(err error) { + r.lock.Lock() + defer r.lock.Unlock() + r.stageChangeHandler.OnStepStageFailure(r, string(StageIDClosed), &r.wg, err) +} + func (r *runningStep) deployFailed(err error) { r.logger.Debugf("Deploy failed stage for step %s/%s", r.runID, r.pluginStepID) r.transitionStage(StageIDDeployFailed, step.RunningStepStateRunning) @@ -1229,21 +1431,7 @@ func (r *runningStep) deployFailed(err error) { // If deployment fails, enabling, disabled, starting, running, and output cannot occur. err = fmt.Errorf("deployment failed for step %s/%s", r.runID, r.pluginStepID) r.markStageFailures(StageIDEnabling, err) -} - -func (r *runningStep) cancelledEarly() { - r.logger.Infof("Step %s/%s cancelled", r.runID, r.pluginStepID) - // Follow the convention of transitioning to running then finished. - r.transitionStage(StageIDCancelled, step.RunningStepStateRunning) - // This is called after deployment. So everything after deployment cannot occur. - err := fmt.Errorf("step %s/%s cancelled", r.runID, r.pluginStepID) - // Cancelled currently has no output. - // Set it as unresolvable since it's cancelled, and to prevent conflicts with its inputs. - r.transitionFromFailedStage(StageIDCancelled, step.RunningStepStateFinished, err) - - // Note: This function is only called if it's cancelled during the deployment phase. - // If that changes, the stage IDs marked as failed need to be changed. - r.markStageFailures(StageIDEnabling, err) + r.markNotClosable(err) } func (r *runningStep) transitionToDisabled() { @@ -1266,10 +1454,31 @@ func (r *runningStep) transitionToDisabled() { err := fmt.Errorf("step %s/%s disabled", r.runID, r.pluginStepID) r.markStageFailures(StageIDStarting, err) + r.markNotClosable(err) +} + +func (r *runningStep) closedEarly(stageToMarkUnresolvable StageID, priorStageFailed bool) { + r.logger.Infof("Step %s/%s closed", r.runID, r.pluginStepID) + // Follow the convention of transitioning to running then finished. + if priorStageFailed { + r.transitionFromFailedStage(StageIDClosed, step.RunningStepStateRunning, fmt.Errorf("step closed early")) + } else { + r.transitionStage(StageIDClosed, step.RunningStepStateRunning) + } + closedOutput := any(map[any]any{"cancelled": r.cancelled, "close_requested": r.closed.Load()}) + + r.completeStep( + StageIDClosed, + step.RunningStepStateFinished, + schema.PointerTo("result"), + &closedOutput, + ) + + err := fmt.Errorf("step %s/%s closed due to workflow termination", r.runID, r.pluginStepID) + r.markStageFailures(stageToMarkUnresolvable, err) } func (r *runningStep) startFailed(err error) { - r.logger.Debugf("Start failed stage for step %s/%s", r.runID, r.pluginStepID) r.transitionFromFailedStage(StageIDCrashed, step.RunningStepStateRunning, err) r.logger.Warningf("Plugin step %s/%s start failed. %v", r.runID, r.pluginStepID, err) @@ -1280,13 +1489,11 @@ func (r *runningStep) startFailed(err error) { }) r.completeStep(StageIDCrashed, step.RunningStepStateFinished, &outputID, &output) - r.markStageFailures(StageIDRunning, err) - + r.markNotClosable(err) } func (r *runningStep) runFailed(err error) { - r.logger.Debugf("Run failed stage for step %s/%s", r.runID, r.pluginStepID) r.transitionStage(StageIDCrashed, step.RunningStepStateRunning) r.logger.Warningf("Plugin step %s/%s run failed. %v", r.runID, r.pluginStepID, err) @@ -1296,8 +1503,8 @@ func (r *runningStep) runFailed(err error) { Output: err.Error(), }) r.completeStep(StageIDCrashed, step.RunningStepStateFinished, &outputID, &output) - r.markStageFailures(StageIDOutput, err) + r.markNotClosable(err) } // TransitionStage transitions the stage to the specified stage, and the state to the specified state. diff --git a/workflow/workflow.go b/workflow/workflow.go index a25ed9e6..0e603c94 100644 --- a/workflow/workflow.go +++ b/workflow/workflow.go @@ -93,6 +93,7 @@ func (e *executableWorkflow) Execute(ctx context.Context, serializedInput any) ( outputDataChannel: make(chan outputDataType, 1), outputDone: false, waitingOutputs: outputNodes, + context: ctx, cancel: cancel, workflowContext: e.workflowContext, recentErrors: make(chan error, 20), // Big buffer in case there are lots of subsequent errors. @@ -131,9 +132,8 @@ func (e *executableWorkflow) Execute(ctx context.Context, serializedInput any) ( if !inputAvailable { waitingForInputText = " and is waiting for input" } - e.logger.Debugf("START Stage change for step %s to %s%s...", stepID, stage, waitingForInputText) + e.logger.Debugf("Stage change for step %s to %s%s...", stepID, stage, waitingForInputText) l.onStageComplete(stepID, previousStage, previousStageOutputID, previousStageOutput, wg) - e.logger.Debugf("DONE Stage change for step %s to %s%s...", stepID, stage, waitingForInputText) }, onStepComplete: func( step step.RunningStep, @@ -169,15 +169,7 @@ func (e *executableWorkflow) Execute(ctx context.Context, serializedInput any) ( } l.lock.Unlock() // Let's make sure we are closing all steps once this function terminates so we don't leave stuff running. - defer func() { - e.logger.Debugf("Terminating all steps...") - for stepID, runningStep := range l.runningSteps { - e.logger.Debugf("Terminating step %s...", stepID) - if err := runningStep.ForceClose(); err != nil { - panic(fmt.Errorf("failed to close step %s (%w)", stepID, err)) - } - } - }() + defer l.terminateAllSteps() // We remove the input node from the DAG and call the notifySteps function once to trigger the workflow // start. @@ -205,34 +197,70 @@ func (e *executableWorkflow) Execute(ctx context.Context, serializedInput any) ( select { case outputDataEntry, ok := <-l.outputDataChannel: if !ok { - return "", nil, fmt.Errorf("output data channel unexpectedly closed") - } - e.logger.Debugf("Output complete.") - outputID = outputDataEntry.outputID - outputData = outputDataEntry.outputData - outputSchema, ok := e.outputSchema[outputID] - if !ok { - return "", nil, fmt.Errorf( - "bug: no output named '%s' found in output schema", - outputID, - ) - } - _, err := outputSchema.Unserialize(outputDataEntry.outputData) - if err != nil { - return "", nil, fmt.Errorf( - "bug: output schema cannot unserialize output data (%w)", - err, - ) + lastErrors := l.handleErrors() + return "", nil, fmt.Errorf("output data channel unexpectedly closed. %s", lastErrors) } - return outputDataEntry.outputID, outputData, nil + return e.handleOutput(l, outputDataEntry) case <-ctx.Done(): - lastErr := l.getLastError() - e.logger.Debugf("Workflow execution aborted. %s", lastErr) - if lastErr != nil { - return "", nil, lastErr + lastErrors := l.handleErrors() + if lastErrors == nil { + e.logger.Warningf("Workflow execution aborted. Waiting 6 more seconds for output. %s", lastErrors) + } else { + return "", nil, lastErrors + } + timedContext, cancelFunction := context.WithTimeout(context.Background(), 5*time.Second) + wg := &sync.WaitGroup{} + wg.Add(1) + go func() { + defer wg.Done() + l.terminateAllSteps() + }() + defer wg.Wait() + defer cancelFunction() + select { + case outputDataEntry, ok := <-l.outputDataChannel: + if !ok { + lastErrors := l.handleErrors() + return "", nil, fmt.Errorf("output data channel unexpectedly closed while waiting after execution aborted. %s", lastErrors) + } + return e.handleOutput(l, outputDataEntry) + case err := <-l.recentErrors: // The context is done, so instead just check for errors. + // Put it back in the channel + l.recentErrors <- err + lastErrors := l.handleErrors() + l.logger.Errorf("workflow failed with error %s", err.Error()) + return "", nil, lastErrors + case <-timedContext.Done(): + lastErrors := l.handleErrors() + return "", nil, fmt.Errorf("workflow execution aborted (%w) (%s)", ctx.Err(), lastErrors) } - return "", nil, fmt.Errorf("workflow execution aborted (%w)", ctx.Err()) + + } +} + +func (e *executableWorkflow) handleOutput(l *loopState, outputDataEntry outputDataType) (outputID string, outputData any, err error) { + lastErrors := l.handleErrors() + if lastErrors != nil { + e.logger.Warningf("errors occurred alongside completed output (%s)", lastErrors.Error()) } + e.logger.Debugf("Output complete.") + outputID = outputDataEntry.outputID + outputData = outputDataEntry.outputData + outputSchema, ok := e.outputSchema[outputID] + if !ok { + return "", nil, fmt.Errorf( + "bug: no output named '%s' found in output schema (%s)", + outputID, lastErrors, + ) + } + _, err = outputSchema.Unserialize(outputDataEntry.outputData) + if err != nil { + return "", nil, fmt.Errorf( + "bug: output schema cannot unserialize output data (%w) (%s)", + err, lastErrors, + ) + } + return outputDataEntry.outputID, outputData, nil } type outputDataType struct { @@ -255,35 +283,63 @@ type loopState struct { outputDone bool // waitingOutputs keeps track of all workflow output nodes to know when the workflow fails. waitingOutputs map[string]dgraph.Node[*DAGItem] + context context.Context recentErrors chan error cancel context.CancelFunc workflowContext map[string][]byte lifecycles map[string]step.Lifecycle[step.LifecycleStageWithSchema] } -// getLastError gathers the last errors. If there are several, it creates a new one that consolidates them. +func (l *loopState) terminateAllSteps() { + l.logger.Debugf("Terminating all steps...") + for stepID, runningStep := range l.runningSteps { + l.logger.Debugf("Terminating step %s...", stepID) + if err := runningStep.ForceClose(); err != nil { + panic(fmt.Errorf("failed to close step %s (%w)", stepID, err)) + } + } +} + +// getLastError gathers the last errors. If there are several, it creates a new one that consolidates +// all non-duplicate ones. // This will read from the channel. Calling again will only gather new errors since the last call. func (l *loopState) getLastError() error { - var errors []error + errors := map[string]error{} errGatherLoop: for { select { case err := <-l.recentErrors: - errors = append(errors, err) + errors[err.Error()] = err default: break errGatherLoop // No more errors } } switch len(errors) { case 0: - return nil + fallthrough case 1: - return errors[0] + for _, err := range errors { + return err + } + return nil default: - return fmt.Errorf("multiple errors: %v", errors) + errorsAsString := "" + for errStr := range errors { + errorsAsString += " " + errStr + } + return fmt.Errorf("multiple errors:%s", errorsAsString) } } +func (l *loopState) handleErrors() error { + lastErr := l.getLastError() + if lastErr != nil { + l.logger.Warningf("Workflow execution aborted with error: %s", lastErr.Error()) + return lastErr + } + return nil +} + func (l *loopState) onStageComplete( stepID string, previousStage *string, @@ -574,11 +630,16 @@ func (l *loopState) checkForDeadlocks(retries int, wg *sync.WaitGroup) { l.logger.Warningf("No running steps. Rechecking...") wg.Add(1) go func() { - time.Sleep(5 * time.Millisecond) - l.lock.Lock() - l.checkForDeadlocks(retries-1, wg) - l.lock.Unlock() - wg.Done() + defer wg.Done() + select { + case <-time.After(time.Duration(5) * time.Millisecond): + time.Sleep(5 * time.Millisecond) + l.lock.Lock() + l.checkForDeadlocks(retries-1, wg) + l.lock.Unlock() + case <-l.context.Done(): + return + } }() } } diff --git a/workflow/workflow_test.go b/workflow/workflow_test.go index 67aee20b..d704f7af 100644 --- a/workflow/workflow_test.go +++ b/workflow/workflow_test.go @@ -114,7 +114,7 @@ steps: input: # It needs to be long enough for it to ensure that long_wait is in a running state. # The other case will be tested separately. - wait_time_ms: 20 + wait_time_ms: 5 outputs: a: cancelled_step_output: !expr $.steps.long_wait.outputs @@ -154,7 +154,7 @@ steps: deployment_type: "builtin" step: wait input: - wait_time_ms: 80 + wait_time_ms: 10 # Delay needs to be delayed long enough to ensure that last_step isn't running when it's cancelled by short_wait delay: plugin: @@ -162,7 +162,7 @@ steps: deployment_type: "builtin" step: wait input: - wait_time_ms: 50 + wait_time_ms: 5 last_step: plugin: src: "n/a" @@ -228,7 +228,7 @@ steps: deployment_type: "builtin" step: wait input: - wait_time_ms: 100 + wait_time_ms: 20 step_to_cancel: plugin: src: "n/a" @@ -241,7 +241,7 @@ steps: # Delay needs to be delayed long enough to ensure that it's in a deploy state when it's cancelled by short_wait deploy: deployer_name: "test-impl" - deploy_time: 50 # 50 ms + deploy_time: 10 # ms short_wait: plugin: src: "n/a" @@ -801,7 +801,7 @@ func TestMissingInputsWrongOutput(t *testing.T) { assert.Equals(t, outputID, "") } -var fiveSecWaitWorkflowDefinition = ` +var fourSecWaitWorkflowDefinition = ` version: v0.2.0 input: root: RootObject @@ -815,23 +815,19 @@ steps: src: "n/a" deployment_type: "builtin" step: wait + closure_wait_timeout: 5000 input: - wait_time_ms: 5000 + wait_time_ms: 4000 outputs: success: first_step_output: !expr $.steps.long_wait.outputs ` func TestEarlyContextCancellation(t *testing.T) { - // For this test, a workflow runs two steps, where each step runs a wait step for 5s - // The second wait step waits for the first to succeed after which it runs - // Due to the wait for condition, the steps will execute serially - // The total execution time for this test function should be greater than 10seconds - // as each step runs for 5s and are run serially - // The test double deployer will be used for this test, as we - // need a deployer to test the plugin step provider. + // Test to ensure the workflow aborts when instructed to. + // The wait step should exit gracefully when the workflow is cancelled. logConfig := log.Config{ - Level: log.LevelInfo, + Level: log.LevelDebug, Destination: log.DestinationStdout, } logger := log.New( @@ -848,10 +844,10 @@ func TestEarlyContextCancellation(t *testing.T) { stepRegistry, builtinfunctions.GetFunctions(), )) - wf := lang.Must2(workflow.NewYAMLConverter(stepRegistry).FromYAML([]byte(fiveSecWaitWorkflowDefinition))) + wf := lang.Must2(workflow.NewYAMLConverter(stepRegistry).FromYAML([]byte(fourSecWaitWorkflowDefinition))) preparedWorkflow := lang.Must2(executor.Prepare(wf, map[string][]byte{})) - // Cancel the context after 3 ms to simulate cancellation with ctrl-c. - ctx, cancel := context.WithTimeout(context.Background(), time.Millisecond*3) + // Cancel the context after 30 ms to simulate cancellation with ctrl-c. + ctx, cancel := context.WithTimeout(context.Background(), time.Millisecond*10) startTime := time.Now() // Right before execute to not include pre-processing time. //nolint:dogsled _, _, _ = preparedWorkflow.Execute(ctx, map[string]any{}) @@ -860,7 +856,7 @@ func TestEarlyContextCancellation(t *testing.T) { duration := time.Since(startTime) t.Logf("Test execution time: %s", duration) if duration >= 1000*time.Millisecond { - t.Fatalf("Test execution time is greater than 1000 milliseconds; Is the workflow properly cancelling?") + t.Fatalf("Test execution time is greater than 100 milliseconds; Is the workflow properly cancelling?") } } @@ -1189,7 +1185,7 @@ steps: deployer_name: "test-impl" # stop_if doesn't create any dependency, so we must keep the step in the deployment # stage long enough for the cancellation to occur at the intended stage. - deploy_time: 50 # ms + deploy_time: 25 # ms input: # The actual wait time should not matter for this test because the intention is to # to cancel it before it is run. @@ -1299,7 +1295,7 @@ steps: step: wait input: wait_time_ms: 0 - enabled: !expr $.steps.initial_wait.outputs.success.message == "Plugin slept for 20 ms." + enabled: !expr $.steps.initial_wait.outputs.success.message == "Plugin slept for 5 ms." outputs: success: initial_wait_output: !expr $.steps.initial_wait.outputs.success @@ -1317,16 +1313,16 @@ func TestDelayedDisabledStepWorkflow(t *testing.T) { preparedWorkflow := assert.NoErrorR[workflow.ExecutableWorkflow](t)( getTestImplPreparedWorkflow(t, dynamicDisabledStepWorkflow), ) - // The second step expects a 20ms sleep/wait. - // Pass with a 20ms input. + // The second step expects a 5ms sleep/wait. + // Pass with a 5ms input. outputID, _, err := preparedWorkflow.Execute(context.Background(), map[string]any{ - "sleep_time": 20, + "sleep_time": 5, }) assert.NoError(t, err) assert.Equals(t, outputID, "success") - // Fail with a non-20ms input. + // Fail with a non-5ms input. outputID, outputData, err := preparedWorkflow.Execute(context.Background(), map[string]any{ - "sleep_time": 19, + "sleep_time": 4, }) assert.NoError(t, err) assert.Equals(t, outputID, "disabled") @@ -1404,6 +1400,7 @@ steps: src: "n/a" deployment_type: "builtin" step: wait + closure_wait_timeout: 0 input: wait_time_ms: 5000 outputs: @@ -1417,18 +1414,18 @@ func TestMultiDependencyWorkflowFailureWithDisabling(t *testing.T) { preparedWorkflow := assert.NoErrorR[workflow.ExecutableWorkflow](t)( getTestImplPreparedWorkflow(t, multiDependencyFailureWorkflowWithDisabling), ) - startTime := time.Now() // Right before execute to not include pre-processing time. + startTime := time.Now() // Right before execution to not include pre-processing time. _, _, err := preparedWorkflow.Execute(context.Background(), map[string]any{ "fail_purposefully": true, }) duration := time.Since(startTime) assert.Error(t, err) - t.Logf("MultiDependency workflow failed purposefully in %d ms", duration.Milliseconds()) + t.Logf("MultiDependencyFailureWithDisabling workflow failed purposefully in %d ms", duration.Milliseconds()) var typedError *workflow.ErrNoMorePossibleOutputs if !errors.As(err, &typedError) { t.Fatalf("incorrect error type returned: %T (%s)", err, err) } - assert.LessThan(t, duration.Seconds(), 4) // It will take 5 seconds if it fails to fail early. + assert.LessThan(t, duration.Milliseconds(), 500) // It will take 5 seconds if it fails to fail early. } var multiDependencyFailureWorkflowWithCancellation = ` @@ -1453,12 +1450,13 @@ steps: stop_if: !expr '$.input.fail_purposefully' deploy: deployer_name: "test-impl" - deploy_time: 50 # 50 ms of delay to make the cancellation more reliable. + deploy_time: 10 # 10 ms of delay to make the cancellation more reliable. long_wait: plugin: src: "n/a" deployment_type: "builtin" step: wait + closure_wait_timeout: 0 input: wait_time_ms: 5000 outputs: @@ -1472,13 +1470,13 @@ func TestMultiDependencyWorkflowFailureWithCancellation(t *testing.T) { preparedWorkflow := assert.NoErrorR[workflow.ExecutableWorkflow](t)( getTestImplPreparedWorkflow(t, multiDependencyFailureWorkflowWithCancellation), ) - startTime := time.Now() // Right before execute to not include pre-processing time. + startTime := time.Now() // Right before execution to not include pre-processing time. _, _, err := preparedWorkflow.Execute(context.Background(), map[string]any{ "fail_purposefully": true, }) duration := time.Since(startTime) assert.Error(t, err) - t.Logf("MultiDependency workflow failed purposefully in %d ms", duration.Milliseconds()) + t.Logf("MultiDependencyFailureWithCancellation workflow failed purposefully in %d ms", duration.Milliseconds()) var typedError *workflow.ErrNoMorePossibleOutputs if !errors.As(err, &typedError) { t.Fatalf("incorrect error type returned: %T (%s)", err, err) @@ -1510,6 +1508,7 @@ steps: src: "n/a" deployment_type: "builtin" step: wait + closure_wait_timeout: 0 input: wait_time_ms: 7000 outputs: @@ -1524,13 +1523,13 @@ func TestMultiDependencyWorkflowFailureWithErrorOut(t *testing.T) { preparedWorkflow := assert.NoErrorR[workflow.ExecutableWorkflow](t)( getTestImplPreparedWorkflow(t, multiDependencyFailureWorkflowWithErrorOut), ) - startTime := time.Now() // Right before execute to not include pre-processing time. + startTime := time.Now() // Right before execution to not include pre-processing time. _, _, err := preparedWorkflow.Execute(context.Background(), map[string]any{ "fail_purposefully": true, }) duration := time.Since(startTime) assert.Error(t, err) - t.Logf("MultiDependency workflow failed purposefully in %d ms", duration.Milliseconds()) + t.Logf("MultiDependencyFailureWithErrorOut workflow failed purposefully in %d ms", duration.Milliseconds()) var typedError *workflow.ErrNoMorePossibleOutputs if !errors.As(err, &typedError) { t.Fatalf("incorrect error type returned: %T (%s)", err, err) @@ -1567,6 +1566,7 @@ steps: src: "n/a" deployment_type: "builtin" step: wait + closure_wait_timeout: 0 input: wait_time_ms: 7000 outputs: @@ -1584,7 +1584,7 @@ func TestMultiDependencyWorkflowFailureWithDeployFailure(t *testing.T) { preparedWorkflow := assert.NoErrorR[workflow.ExecutableWorkflow](t)( getTestImplPreparedWorkflow(t, multiDependencyFailureWorkflowWithDeployFailure), ) - startTime := time.Now() // Right before execute to not include pre-processing time. + startTime := time.Now() // Right before execution to not include pre-processing time. _, _, err := preparedWorkflow.Execute(context.Background(), map[string]any{ "fail_purposefully": true, }) @@ -1644,8 +1644,9 @@ steps: src: "n/a" deployment_type: "builtin" step: wait + closure_wait_timeout: 0 input: - wait_time_ms: 100 + wait_time_ms: 10 outputs: workflow-success: failed_step_output: !expr $.steps.failed_step_A.outputs @@ -1662,15 +1663,239 @@ func TestMultiDependencyWorkflowFailureDoubleFailure(t *testing.T) { getTestImplPreparedWorkflow(t, multiDependencyFailureWorkflowWithDoubleFailure), ) - startTime := time.Now() // Right before execute to not include pre-processing time. - _, _, err := preparedWorkflow.Execute(context.Background(), map[string]any{}) + startTime := time.Now() // Right before execution to not include pre-processing time. + outputID, _, err := preparedWorkflow.Execute(context.Background(), map[string]any{}) + duration := time.Since(startTime) + assert.NoError(t, err) + assert.Equals(t, outputID, "wait-only") + t.Logf("MultiDependency DoubleFailure finished in %d ms", duration.Milliseconds()) +} + +var multiDependencyFailureWorkflowContextCancelled = ` +version: v0.2.0 +input: + root: WorkflowInput + objects: + WorkflowInput: + id: WorkflowInput + properties: {} +steps: + wait_step: + plugin: + src: "n/a" + deployment_type: "builtin" + step: wait + closure_wait_timeout: 0 + input: + wait_time_ms: 500 + cancelled_step: + plugin: + src: "n/a" + deployment_type: "builtin" + step: wait + stop_if: !expr $.steps.wait_step.outputs # The context will get cancelled before this triggers. + input: + wait_time_ms: 0 + deploy: + deployer_name: "test-impl" + deploy_succeed: !expr 'true' + deploy_time: 5 # ms +outputs: + finished: + cancelled_step_output: !expr $.steps.cancelled_step.outputs + wait-only: # The workflow needs to keep running after the cancelled step exits. + wait_output: !expr $.steps.wait_step.outputs +` + +func TestMultiDependencyWorkflowContextCanceled(t *testing.T) { + // A scenario where the step's inputs are resolvable, but the context is cancelled, resulting + // in a possible conflict with the cancelled step stage DAG node. + // To do this, create a multi-dependency setup, finish the step, then cancel the workflow + // before the workflow finishes. + preparedWorkflow := assert.NoErrorR[workflow.ExecutableWorkflow](t)( + getTestImplPreparedWorkflow(t, multiDependencyFailureWorkflowContextCancelled), + ) + + ctx, timeout := context.WithTimeout(context.Background(), time.Millisecond*10) + startTime := time.Now() // Right before execution to not include pre-processing time. + _, _, err := preparedWorkflow.Execute(ctx, map[string]any{}) + duration := time.Since(startTime) + assert.NoError(t, err) + timeout() + t.Logf("MultiDependency ContextCanceled finished in %d ms", duration.Milliseconds()) +} + +var multiDependencyDependOnClosedStepPostDeployment = ` +version: v0.2.0 +input: + root: WorkflowInput + objects: + WorkflowInput: + id: WorkflowInput + properties: {} +steps: + wait_step: + plugin: + src: "n/a" + deployment_type: "builtin" + step: wait + input: + wait_time_ms: 1 + long_wait: + plugin: + src: "n/a" + deployment_type: "builtin" + step: wait + input: + wait_time_ms: 5000 + cancelled_step: + plugin: + src: "n/a" + deployment_type: "builtin" + step: wait + stop_if: !expr $.steps.wait_step.outputs + wait_for: !expr $.steps.long_wait.outputs + input: + wait_time_ms: 0 + # The deploy section is blank, so it will target a post-deployment cancellation. +outputs: + finished: + cancelled_step_output: !expr $.steps.cancelled_step.outputs + closed: # The workflow needs to keep running after the cancelled step exits. + closed_output: !expr $.steps.cancelled_step.closed.result + wait_finished: + wait_output: !expr $.steps.long_wait.outputs +` + +func TestMultiDependencyDependOnClosedStepPostDeployment(t *testing.T) { + // This has the output depend on the closed output of a step. + preparedWorkflow := assert.NoErrorR[workflow.ExecutableWorkflow](t)( + getTestImplPreparedWorkflow(t, multiDependencyDependOnClosedStepPostDeployment), + ) + + startTime := time.Now() // Right before execution to not include pre-processing time. + outputID, outputData, err := preparedWorkflow.Execute(context.Background(), map[string]any{}) + duration := time.Since(startTime) + t.Logf("MultiDependency DependOnClosedStepPostDeployment finished in %d ms", duration.Milliseconds()) + assert.NoError(t, err) + assert.Equals(t, outputID, "closed") + assert.Equals(t, outputData.(map[any]any), map[any]any{ + "closed_output": map[any]any{ + "cancelled": true, + "close_requested": false, + }, + }) +} + +var multiDependencyDependOnClosedDeployment = ` +version: v0.2.0 +input: + root: WorkflowInput + objects: + WorkflowInput: + id: WorkflowInput + properties: {} +steps: + wait_step: + plugin: + src: "n/a" + deployment_type: "builtin" + step: wait + input: + wait_time_ms: 1 + cancelled_step: + plugin: + src: "n/a" + deployment_type: "builtin" + step: wait + stop_if: !expr $.steps.wait_step.outputs + input: + wait_time_ms: 500 + # The deploy section has a delay black, so it will target a mid-deployment cancellation. + closure_wait_timeout: 0 + deploy: + deployer_name: "test-impl" + deploy_time: 10 # ms. If this is too low, there will be a race that results in run fail instead of closure. +outputs: + finished: + cancelled_step_output: !expr $.steps.cancelled_step.outputs + closed: # The workflow needs to keep running after the cancelled step exits. + closed_output: !expr $.steps.cancelled_step.closed.result +` + +func TestMultiDependencyDependOnClosedStepDeployment(t *testing.T) { + // This has the output depend on the closed output of a step. + preparedWorkflow := assert.NoErrorR[workflow.ExecutableWorkflow](t)( + getTestImplPreparedWorkflow(t, multiDependencyDependOnClosedDeployment), + ) + + startTime := time.Now() // Right before execution to not include pre-processing time. + outputID, outputData, err := preparedWorkflow.Execute(context.Background(), map[string]any{}) duration := time.Since(startTime) + t.Logf("MultiDependency DependOnClosedStep finished in %d ms", duration.Milliseconds()) assert.NoError(t, err) - t.Logf("MultiDependency finished in %d ms", duration.Milliseconds()) + assert.Equals(t, outputID, "closed") + assert.Equals(t, outputData.(map[any]any), map[any]any{ + "closed_output": map[any]any{ + "cancelled": true, + "close_requested": false, + }, + }) } -// TODO: Create a situation where step A's outputs are resolvable, but the context is cancelled, -// resulting in a possible conflict with the cancelled step stage DAG node. +var multiDependencyDependOnContextDoneDeployment = ` +version: v0.2.0 +input: + root: WorkflowInput + objects: + WorkflowInput: + id: WorkflowInput + properties: {} +steps: + wait_step: + plugin: + src: "n/a" + deployment_type: "builtin" + step: wait + closure_wait_timeout: 0 + input: + wait_time_ms: 1000 + not_enabled_step: + plugin: + src: "n/a" + deployment_type: "builtin" + step: wait + enabled: !expr $.steps.wait_step.outputs.success.message == "Plugin slept for 100 ms." + input: + wait_time_ms: 0 +outputs: + finished: + cancelled_step_output: !expr $.steps.not_enabled_step.outputs + closed: # The workflow needs to keep running after the cancelled step exits. + closed_output: !expr $.steps.not_enabled_step.closed.result +` + +func TestMultiDependencyDependOnContextDoneDeployment(t *testing.T) { + // A scenario where you close the context but still expect an output by depending on the closed output. + preparedWorkflow := assert.NoErrorR[workflow.ExecutableWorkflow](t)( + getTestImplPreparedWorkflow(t, multiDependencyDependOnContextDoneDeployment), + ) + + ctx, timeout := context.WithTimeout(context.Background(), time.Millisecond*10) + startTime := time.Now() // Right before execution to not include pre-processing time. + outputID, outputData, err := preparedWorkflow.Execute(ctx, map[string]any{}) + duration := time.Since(startTime) + assert.NoError(t, err) + timeout() + assert.Equals(t, outputID, "closed") + assert.Equals(t, outputData.(map[any]any), map[any]any{ + "closed_output": map[any]any{ + "cancelled": false, + "close_requested": true, + }, + }) + t.Logf("MultiDependency DependOnClosedStep finished in %d ms", duration.Milliseconds()) +} func createTestExecutableWorkflow(t *testing.T, workflowStr string, workflowCtx map[string][]byte) (workflow.ExecutableWorkflow, error) { logConfig := log.Config{ From d39357b13fff714ff294f695ed7d66a0c19fd786 Mon Sep 17 00:00:00 2001 From: Jared O'Connell Date: Thu, 25 Jul 2024 16:22:41 -0400 Subject: [PATCH 13/22] Fix comments --- internal/step/plugin/provider.go | 2 +- internal/step/plugin/provider_test.go | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/internal/step/plugin/provider.go b/internal/step/plugin/provider.go index 583be81d..9bc2642a 100644 --- a/internal/step/plugin/provider.go +++ b/internal/step/plugin/provider.go @@ -1350,7 +1350,7 @@ func (r *runningStep) runStage(forCloseWaitMS int64) error { select { case result = <-r.executionChannel: case <-r.ctx.Done(): - // In this case, it is being instructed to stop. A signal should have been sent. + // In this case, it is being instructed to stop. A cancellation signal should be sent if supported. if r.hasCancellationHandler() { r.logger.Debugf("Got step context done before step run complete. Sending cancellation signal. Waiting up to %d milliseconds for result.", forCloseWaitMS) r.lock.Lock() diff --git a/internal/step/plugin/provider_test.go b/internal/step/plugin/provider_test.go index 6d458229..e946f000 100644 --- a/internal/step/plugin/provider_test.go +++ b/internal/step/plugin/provider_test.go @@ -518,5 +518,3 @@ func TestProvider_StartFail(t *testing.T) { assert.NoError(t, running.Close()) }) } - -// TODO: Add more tests here for the current functions and code paths. From eab20c476d4a4e31721073cafe39623f72e075f7 Mon Sep 17 00:00:00 2001 From: Jared O'Connell Date: Thu, 25 Jul 2024 16:31:44 -0400 Subject: [PATCH 14/22] Update plugin SDK --- go.mod | 8 ++++---- go.sum | 16 ++++++++-------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/go.mod b/go.mod index 4b37a585..ecd31b85 100644 --- a/go.mod +++ b/go.mod @@ -6,12 +6,12 @@ require ( go.arcalot.io/assert v1.8.0 go.arcalot.io/dgraph v1.5.0 go.arcalot.io/lang v1.1.0 - go.arcalot.io/log/v2 v2.1.0 + go.arcalot.io/log/v2 v2.2.0 go.flow.arcalot.io/deployer v0.6.1 go.flow.arcalot.io/dockerdeployer v0.7.2 go.flow.arcalot.io/expressions v0.4.3 go.flow.arcalot.io/kubernetesdeployer v0.9.3 - go.flow.arcalot.io/pluginsdk v0.12.2 + go.flow.arcalot.io/pluginsdk v0.12.3 go.flow.arcalot.io/podmandeployer v0.11.2 go.flow.arcalot.io/pythondeployer v0.6.1 go.flow.arcalot.io/testdeployer v0.6.1 @@ -66,8 +66,8 @@ require ( go.opentelemetry.io/proto/otlp v1.2.0 // indirect golang.org/x/net v0.26.0 // indirect golang.org/x/oauth2 v0.20.0 // indirect - golang.org/x/sys v0.21.0 // indirect - golang.org/x/term v0.21.0 // indirect + golang.org/x/sys v0.22.0 // indirect + golang.org/x/term v0.22.0 // indirect golang.org/x/text v0.16.0 // indirect golang.org/x/time v0.5.0 // indirect google.golang.org/grpc v1.64.0 // indirect diff --git a/go.sum b/go.sum index 5b464a69..aa4b96aa 100644 --- a/go.sum +++ b/go.sum @@ -129,8 +129,8 @@ go.arcalot.io/exex v0.2.0 h1:u44pjwPwcH57TF8knhaqVZP/1V/KbnRe//pKzMwDpLw= go.arcalot.io/exex v0.2.0/go.mod h1:5zlFr+7vOQNZKYCNOEDdsad+z/dlvXKs2v4kG+v+bQo= go.arcalot.io/lang v1.1.0 h1:ugglRKpd3qIMkdghAjKJxsziIgHm8QpxrzZPSXoa08I= go.arcalot.io/lang v1.1.0/go.mod h1:2BZJO4csY7NnN/Nf1+eTdIQH4A2vxtOMneaO+PJl+Co= -go.arcalot.io/log/v2 v2.1.0 h1:lNO931hJ82LgS6WcCFCxpLWXQXPFhOkz6PyAJ/augq4= -go.arcalot.io/log/v2 v2.1.0/go.mod h1:PNWOSkkPmgS2OMlWTIlB/WqOw0yaBvDYd8ENAP80H4k= +go.arcalot.io/log/v2 v2.2.0 h1:a4wVAqQ/6zFyEG6I9Dnd7eFA52KKGaBK3hj9PgZ/e0c= +go.arcalot.io/log/v2 v2.2.0/go.mod h1:h/Hlyz6wH+mjRUKdL3W2fG2oMrAm2qgPxb0rNJJDUuY= go.flow.arcalot.io/deployer v0.6.1 h1:Q65VHeRZzdrMJZqTRb26EQZQbK+C3pORETVlpw02xWQ= go.flow.arcalot.io/deployer v0.6.1/go.mod h1:Oh+71KYQEof6IS3UGhpMyjQQPRcuomUccn7fwAqrPxE= go.flow.arcalot.io/dockerdeployer v0.7.2 h1:+vUU1f0+wRYAqI41jPI1MUQFU3yknu49qsPGTaxvGug= @@ -139,8 +139,8 @@ go.flow.arcalot.io/expressions v0.4.3 h1:0BRRghutHp0sctsITHe/A1le0yYiJtKNTxm27T+ go.flow.arcalot.io/expressions v0.4.3/go.mod h1:UORX78N4ep71wOzNXdIo/UY+6SdDD0id0mvuRNEQMeM= go.flow.arcalot.io/kubernetesdeployer v0.9.3 h1:XKiqmCqXb6ZLwP5IQTAKS/gJHpq0Ub/yEjCfgAwQF2A= go.flow.arcalot.io/kubernetesdeployer v0.9.3/go.mod h1:DtB6HR7HBt/HA1vME0faIpOQ/lhfBJjL6OAGgT3Bu/Q= -go.flow.arcalot.io/pluginsdk v0.12.2 h1:ue+Ax1c3Bw0TmnuAEZ4r19D6ZA4dns9rP3NO4cvmTec= -go.flow.arcalot.io/pluginsdk v0.12.2/go.mod h1:U+7mtXo4aT3YNn9fNWZ3b5RBvXttm/iocc2/P4MGa88= +go.flow.arcalot.io/pluginsdk v0.12.3 h1:TQyzRltJ92ZiRv3JMkVKywu04xhfIq+aFAlH4oSfK0I= +go.flow.arcalot.io/pluginsdk v0.12.3/go.mod h1:5kMCVigP89J/KU5T72EAczQXPWXZkRfUFcpnIkOECV8= go.flow.arcalot.io/podmandeployer v0.11.2 h1:aqrHaNaCXYDREqDJpKhVeVIIZPiPld5SDvnUcc0Tjiw= go.flow.arcalot.io/podmandeployer v0.11.2/go.mod h1:70+9M6eVQa0EEynDZ720P3AEzXt1gZto2lvoMlyjuzo= go.flow.arcalot.io/pythondeployer v0.6.1 h1:IyaA9BVfHJ2fhC+fNfT6VicrtRGFlZOlSaAVGGPwo1E= @@ -185,10 +185,10 @@ golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5h golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws= -golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.21.0 h1:WVXCp+/EBEHOj53Rvu+7KiT/iElMrO8ACK16SMZ3jaA= -golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0= +golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI= +golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.22.0 h1:BbsgPEJULsl2fV/AT3v15Mjva5yXKQDyKf+TbDz7QJk= +golang.org/x/term v0.22.0/go.mod h1:F3qCibpT5AMpCRfhfT53vVJwhLtIVHhB9XDjfFvnMI4= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= From 6d169cbd07837a0304447963ae830e8d86857202 Mon Sep 17 00:00:00 2001 From: Jared O'Connell Date: Thu, 25 Jul 2024 17:50:03 -0400 Subject: [PATCH 15/22] Change locking for marking stages failed --- internal/step/plugin/provider.go | 8 +------- workflow/workflow.go | 2 ++ workflow/workflow_test.go | 2 +- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/internal/step/plugin/provider.go b/internal/step/plugin/provider.go index 9bc2642a..3582e928 100644 --- a/internal/step/plugin/provider.go +++ b/internal/step/plugin/provider.go @@ -1237,10 +1237,8 @@ func (r *runningStep) enableStage() (bool, bool) { } if enabled { - r.lock.Lock() // It's enabled, so the disabled stage will not occur. r.stageChangeHandler.OnStepStageFailure(r, string(StageIDDisabled), &r.wg, fmt.Errorf("step enabled; cannot be disabled anymore")) - r.lock.Unlock() } return enabled, false } @@ -1388,8 +1386,6 @@ func (r *runningStep) runStage(forCloseWaitMS int64) error { } func (r *runningStep) markStageFailures(firstStage StageID, err error) { - r.lock.Lock() - defer r.lock.Unlock() switch firstStage { case StageIDEnabling: r.stageChangeHandler.OnStepStageFailure(r, string(StageIDEnabling), &r.wg, err) @@ -1412,8 +1408,6 @@ func (r *runningStep) markStageFailures(firstStage StageID, err error) { // Closable is the graceful case, so this is necessary if it crashes. func (r *runningStep) markNotClosable(err error) { - r.lock.Lock() - defer r.lock.Unlock() r.stageChangeHandler.OnStepStageFailure(r, string(StageIDClosed), &r.wg, err) } @@ -1514,12 +1508,12 @@ func (r *runningStep) transitionStage(newStage StageID, state step.RunningStepSt func (r *runningStep) transitionFromFailedStage(newStage StageID, state step.RunningStepState, err error) { r.lock.Lock() - defer r.lock.Unlock() previousStage := string(r.currentStage) r.currentStage = newStage // Don't forget to update this, or else it will behave very oddly. // First running, then finished. You can't skip states. r.state = state + r.lock.Unlock() r.stageChangeHandler.OnStepStageFailure( r, previousStage, diff --git a/workflow/workflow.go b/workflow/workflow.go index 0e603c94..2ae76cf6 100644 --- a/workflow/workflow.go +++ b/workflow/workflow.go @@ -155,6 +155,8 @@ func (e *executableWorkflow) Execute(ctx context.Context, serializedInput any) ( } else { e.logger.Debugf("Step %q stage %q declared that it will not produce an output (%s)", stepID, stage, err.Error()) } + l.lock.Lock() + defer l.lock.Unlock() l.markOutputsUnresolvable(stepID, stage, nil) l.markStageNodeUnresolvable(stepID, stage) l.notifySteps() diff --git a/workflow/workflow_test.go b/workflow/workflow_test.go index d704f7af..193c1e19 100644 --- a/workflow/workflow_test.go +++ b/workflow/workflow_test.go @@ -1716,7 +1716,7 @@ func TestMultiDependencyWorkflowContextCanceled(t *testing.T) { getTestImplPreparedWorkflow(t, multiDependencyFailureWorkflowContextCancelled), ) - ctx, timeout := context.WithTimeout(context.Background(), time.Millisecond*10) + ctx, timeout := context.WithTimeout(context.Background(), time.Millisecond*30) startTime := time.Now() // Right before execution to not include pre-processing time. _, _, err := preparedWorkflow.Execute(ctx, map[string]any{}) duration := time.Since(startTime) From ad2cfc7a0e4ace3a980e1ef602ccd5cdc0fb3a64 Mon Sep 17 00:00:00 2001 From: Jared O'Connell Date: Thu, 25 Jul 2024 18:05:59 -0400 Subject: [PATCH 16/22] Add unresolvability to foreach steps --- internal/step/foreach/provider.go | 14 +++- workflow/workflow_test.go | 113 +++++++++++++++++++++++++++++- 2 files changed, 124 insertions(+), 3 deletions(-) diff --git a/internal/step/foreach/provider.go b/internal/step/foreach/provider.go index 4abdedf4..ace5e6e5 100644 --- a/internal/step/foreach/provider.go +++ b/internal/step/foreach/provider.go @@ -157,7 +157,7 @@ func (l *forEachProvider) LoadSchema(inputs map[string]any, workflowContext map[ return nil, err } - executor, err := l.executorFactory(l.logger) + executor, err := l.executorFactory(l.logger.WithLabel("subworkflow", workflowFileName.(string))) if err != nil { return nil, err } @@ -518,8 +518,12 @@ func (r *runningStep) run() { r.currentState = step.RunningStepStateRunning var outputID string var outputData any + var unresolvableStage StageID + var unresolvableError error if errors { r.currentStage = StageIDFailed + unresolvableStage = StageIDOutputs + unresolvableError = fmt.Errorf("foreach subworkflow failed with errors (%v)", itemErrors) outputID = "error" dataMap := make(map[int]any, len(loopData)) for i, entry := range itemOutputs { @@ -533,6 +537,8 @@ func (r *runningStep) run() { } } else { r.currentStage = StageIDOutputs + unresolvableStage = StageIDFailed + unresolvableError = fmt.Errorf("foreach succeeded, so error case is unresolvable") outputID = "success" outputData = map[string]any{ "data": itemOutputs, @@ -549,6 +555,12 @@ func (r *runningStep) run() { false, &r.wg, ) + r.stageChangeHandler.OnStepStageFailure( + r, + string(unresolvableStage), + &r.wg, + unresolvableError, + ) r.lock.Lock() r.currentState = step.RunningStepStateFinished previousStage = string(r.currentStage) diff --git a/workflow/workflow_test.go b/workflow/workflow_test.go index 193c1e19..2b1851f0 100644 --- a/workflow/workflow_test.go +++ b/workflow/workflow_test.go @@ -1496,7 +1496,7 @@ input: type: type_id: bool steps: - disabled_step: + failed_step: plugin: src: "n/a" deployment_type: "builtin" @@ -1513,7 +1513,7 @@ steps: wait_time_ms: 7000 outputs: workflow-success: - disabled_output: !expr $.steps.disabled_step.outputs.success + failed_output: !expr $.steps.failed_step.outputs.success long_wait_output: !expr $.steps.long_wait.outputs.success ` @@ -1897,6 +1897,115 @@ func TestMultiDependencyDependOnContextDoneDeployment(t *testing.T) { t.Logf("MultiDependency DependOnClosedStep finished in %d ms", duration.Milliseconds()) } +var multiDependencyForEachParent = ` +version: v0.2.0 +input: + root: RootObject + objects: + RootObject: + id: RootObject + properties: {} +steps: + subworkflow: + kind: foreach + items: + - {} + workflow: subworkflow.yaml + long_wait: + plugin: + src: "n/a" + deployment_type: "builtin" + step: wait + input: + wait_time_ms: 5000 +outputs: + success: + long_wait_output: !expr $.steps.long_wait.outputs + subworkflow_output: !expr $.steps.subworkflow.outputs.success +` + +var multiDependencyForEachSubwf = ` +version: v0.2.0 +input: + root: RootObject + objects: + RootObject: + id: RootObject + properties: {} +steps: + failed_step: + plugin: + src: "n/a" + deployment_type: "builtin" + step: hello + input: + fail: !expr true +outputs: + success: + b: !expr $.steps.failed_step.outputs.success +` + +func TestMultiDependencyForeach(t *testing.T) { + // This test runs a workflow with a wait and a subworkfow. + // This tests to ensure that the parent workflow immediately detects + // and acts on the missing dependency caused by the error output + // coming from the subworkflow. + + logConfig := log.Config{ + Level: log.LevelInfo, + Destination: log.DestinationStdout, + } + logger := log.New( + logConfig, + ) + cfg := &config.Config{ + Log: logConfig, + } + factories := workflowFactory{ + config: cfg, + } + deployerRegistry := deployerregistry.New( + deployer.Any(testimpl.NewFactory()), + ) + + pluginProvider := assert.NoErrorR[step.Provider](t)( + plugin.New(logger, deployerRegistry, map[string]interface{}{ + "builtin": map[string]any{ + "deployer_name": "test-impl", + "deploy_time": "0", + }, + }), + ) + stepRegistry, err := stepregistry.New( + pluginProvider, + lang.Must2(foreach.New(logger, factories.createYAMLParser, factories.createWorkflow)), + ) + assert.NoError(t, err) + + factories.stepRegistry = stepRegistry + executor := lang.Must2(workflow.NewExecutor( + logger, + cfg, + stepRegistry, + builtinfunctions.GetFunctions(), + )) + wf := lang.Must2(workflow.NewYAMLConverter(stepRegistry).FromYAML([]byte(multiDependencyForEachParent))) + preparedWorkflow := lang.Must2(executor.Prepare(wf, map[string][]byte{ + "subworkflow.yaml": []byte(multiDependencyForEachSubwf), + })) + startTime := time.Now() // Right before execute to not include pre-processing time. + _, _, err = preparedWorkflow.Execute(context.Background(), map[string]any{}) + duration := time.Since(startTime) + assert.Error(t, err) + + t.Logf("MultiDependency workflow failed purposefully in %d ms", duration.Milliseconds()) + var typedError *workflow.ErrNoMorePossibleOutputs + if !errors.As(err, &typedError) { + t.Fatalf("incorrect error type returned: %T (%s)", err, err) + } + assert.LessThan(t, duration.Milliseconds(), 400) +} + func createTestExecutableWorkflow(t *testing.T, workflowStr string, workflowCtx map[string][]byte) (workflow.ExecutableWorkflow, error) { logConfig := log.Config{ Level: log.LevelDebug, From 18680bd27ea35eb68f75de3076405a6ef9a3ebab Mon Sep 17 00:00:00 2001 From: Jared O'Connell Date: Fri, 26 Jul 2024 11:35:55 -0400 Subject: [PATCH 17/22] Address most linter errors --- internal/step/plugin/provider.go | 64 +++++++++++++++++++------------- workflow/workflow.go | 9 ++++- workflow/workflow_test.go | 2 +- 3 files changed, 47 insertions(+), 28 deletions(-) diff --git a/internal/step/plugin/provider.go b/internal/step/plugin/provider.go index 3582e928..9e583f5d 100644 --- a/internal/step/plugin/provider.go +++ b/internal/step/plugin/provider.go @@ -445,7 +445,7 @@ func (r *runnableStep) DisabledOutputSchema() *schema.StepOutputSchema { ) } -func ClosureTimeoutSchema() *schema.PropertySchema { +func closureTimeoutSchema() *schema.PropertySchema { return schema.NewPropertySchema( schema.NewIntSchema(schema.PointerTo(int64(0)), nil, nil), schema.NewDisplayValue( @@ -610,7 +610,7 @@ func (r *runnableStep) Lifecycle(input map[string]any) (result step.Lifecycle[st nil, nil, ), - "closure_wait_timeout": ClosureTimeoutSchema(), + "closure_wait_timeout": closureTimeoutSchema(), }, Outputs: map[string]*schema.StepOutputSchema{ "started": r.StartedSchema(), @@ -930,7 +930,7 @@ func (r *runningStep) provideStartingInput(input map[string]any) error { // Now get the other necessary data for running the step var timeout int64 if input["closure_wait_timeout"] != nil { - unserializedTimeout, err := ClosureTimeoutSchema().Unserialize(input["closure_wait_timeout"]) + unserializedTimeout, err := closureTimeoutSchema().Unserialize(input["closure_wait_timeout"]) if err != nil { return err } @@ -1086,38 +1086,50 @@ func (r *runningStep) run() { r.cancel() // Close before WaitGroup done r.wg.Done() // Done. Close may now exit. }() - container, contextDoneEarly, err := r.deployStage() + pluginConnection := r.startPlugin() + if pluginConnection == nil { + return + } + defer func() { + err := pluginConnection.Close() + if err != nil { + r.logger.Errorf("failed to close deployed container for step %s/%s", r.runID, r.pluginStepID) + } + }() + r.postDeployment(pluginConnection) +} + +// Deploys the plugin, and handles failure cases. +func (r *runningStep) startPlugin() deployer.Plugin { + pluginConnection, contextDoneEarly, err := r.deployStage() if contextDoneEarly { if err != nil { r.logger.Debugf("error due to step early closure: %s", err.Error()) } r.closedEarly(StageIDEnabling, true) - return + return nil } else if err != nil { r.deployFailed(err) - return + return nil } r.lock.Lock() select { case <-r.ctx.Done(): - if err := container.Close(); err != nil { + if err := pluginConnection.Close(); err != nil { r.logger.Warningf("failed to close deployed container for step %s/%s", r.runID, r.pluginStepID) } r.lock.Unlock() r.closedEarly(StageIDEnabling, false) - return + return nil default: - r.container = container + r.container = pluginConnection } - defer func() { - err := container.Close() - if err != nil { - r.logger.Errorf("failed to close deployed container for step %s/%s", r.runID, r.pluginStepID) - } - }() r.lock.Unlock() - r.logger.Debugf("Successfully deployed container with ID '%s' for step %s/%s", container.ID(), r.runID, r.pluginStepID) + r.logger.Debugf("Successfully deployed container with ID '%s' for step %s/%s", pluginConnection.ID(), r.runID, r.pluginStepID) + return pluginConnection +} +func (r *runningStep) postDeployment(pluginConnection deployer.Plugin) { r.logger.Debugf("Checking to see if step %s/%s is enabled", r.runID, r.pluginStepID) enabled, contextDoneEarly := r.enableStage() if contextDoneEarly { @@ -1131,7 +1143,8 @@ func (r *runningStep) run() { } var forceCloseTimeoutMS int64 - if contextDoneEarly, forceCloseTimeoutMS, err = r.startStage(container); contextDoneEarly { + var err error + if contextDoneEarly, forceCloseTimeoutMS, err = r.startStage(pluginConnection); contextDoneEarly { r.closedEarly(StageIDRunning, true) return } else if err != nil { @@ -1357,7 +1370,7 @@ func (r *runningStep) runStage(forCloseWaitMS int64) error { } else { r.logger.Warningf("Got step context done before step run complete. Force closing step %s/%s.", r.runID, r.pluginStepID) err := r.forceCloseInternal() - return fmt.Errorf("step forced closed after timeout without result (%s)", err) + return fmt.Errorf("step forced closed after timeout without result (%w)", err) } // Wait for cancellation to occur. select { @@ -1379,7 +1392,7 @@ func (r *runningStep) runStage(forCloseWaitMS int64) error { } // Execution complete, move to state running stage outputs, then to state finished stage. - r.transitionStage(StageIDOutput, step.RunningStepStateRunning) + r.transitionRunningStage(StageIDOutput) r.completeStep(r.currentStage, step.RunningStepStateFinished, &result.OutputID, &result.OutputData) return nil @@ -1413,7 +1426,7 @@ func (r *runningStep) markNotClosable(err error) { func (r *runningStep) deployFailed(err error) { r.logger.Debugf("Deploy failed stage for step %s/%s", r.runID, r.pluginStepID) - r.transitionStage(StageIDDeployFailed, step.RunningStepStateRunning) + r.transitionRunningStage(StageIDDeployFailed) r.logger.Warningf("Plugin step %s/%s deploy failed. %v", r.runID, r.pluginStepID, err) // Now it's done. @@ -1457,7 +1470,7 @@ func (r *runningStep) closedEarly(stageToMarkUnresolvable StageID, priorStageFai if priorStageFailed { r.transitionFromFailedStage(StageIDClosed, step.RunningStepStateRunning, fmt.Errorf("step closed early")) } else { - r.transitionStage(StageIDClosed, step.RunningStepStateRunning) + r.transitionRunningStage(StageIDClosed) } closedOutput := any(map[any]any{"cancelled": r.cancelled, "close_requested": r.closed.Load()}) @@ -1488,7 +1501,7 @@ func (r *runningStep) startFailed(err error) { } func (r *runningStep) runFailed(err error) { - r.transitionStage(StageIDCrashed, step.RunningStepStateRunning) + r.transitionRunningStage(StageIDCrashed) r.logger.Warningf("Plugin step %s/%s run failed. %v", r.runID, r.pluginStepID, err) // Now it's done. @@ -1501,9 +1514,10 @@ func (r *runningStep) runFailed(err error) { r.markNotClosable(err) } -// TransitionStage transitions the stage to the specified stage, and the state to the specified state. -func (r *runningStep) transitionStage(newStage StageID, state step.RunningStepState) { - r.transitionStageWithOutput(newStage, state, nil, nil) +// TransitionStage transitions the running step to the specified stage, and the state running. +// For other situations, use transitionFromFailedStage, completeStep, or transitionStageWithOutput +func (r *runningStep) transitionRunningStage(newStage StageID) { + r.transitionStageWithOutput(newStage, step.RunningStepStateRunning, nil, nil) } func (r *runningStep) transitionFromFailedStage(newStage StageID, state step.RunningStepState, err error) { diff --git a/workflow/workflow.go b/workflow/workflow.go index 4ecc8605..a47ee407 100644 --- a/workflow/workflow.go +++ b/workflow/workflow.go @@ -216,7 +216,9 @@ func (e *executableWorkflow) Execute(ctx context.Context, serializedInput any) ( case <-ctx.Done(): lastErrors := l.handleErrors() if lastErrors == nil { - e.logger.Warningf("Workflow execution aborted. Waiting 6 more seconds for output. %s", lastErrors) + e.logger.Warningf( + "Workflow execution aborted. Waiting 6 more seconds for output (%w)", + lastErrors) } else { return "", nil, lastErrors } @@ -233,7 +235,10 @@ func (e *executableWorkflow) Execute(ctx context.Context, serializedInput any) ( case outputDataEntry, ok := <-l.outputDataChannel: if !ok { lastErrors := l.handleErrors() - return "", nil, fmt.Errorf("output data channel unexpectedly closed while waiting after execution aborted. %s", lastErrors) + return "", nil, + fmt.Errorf( + "output data channel unexpectedly closed while waiting after execution aborted (%w)", + lastErrors) } return e.handleOutput(l, outputDataEntry) case err := <-l.recentErrors: // The context is done, so instead just check for errors. diff --git a/workflow/workflow_test.go b/workflow/workflow_test.go index 91cce163..97b83a0a 100644 --- a/workflow/workflow_test.go +++ b/workflow/workflow_test.go @@ -236,7 +236,7 @@ steps: deployment_type: "builtin" step: wait input: - wait_time_ms: 0 + wait_time_ms: 1000 # You can verify that this test works by commenting out this line. It should fail. stop_if: !expr $.steps.short_wait.outputs # Delay needs to be delayed long enough to ensure that it's in a deploy state when it's cancelled by short_wait From d61ba517f3ccf5edeb3218410ef5483a3186fa0b Mon Sep 17 00:00:00 2001 From: Jared O'Connell Date: Fri, 26 Jul 2024 11:46:09 -0400 Subject: [PATCH 18/22] Address linter errors --- internal/step/foreach/provider.go | 201 ++++++++++++++++-------------- 1 file changed, 106 insertions(+), 95 deletions(-) diff --git a/internal/step/foreach/provider.go b/internal/step/foreach/provider.go index ace5e6e5..0745ac61 100644 --- a/internal/step/foreach/provider.go +++ b/internal/step/foreach/provider.go @@ -464,111 +464,122 @@ func (r *runningStep) run() { waitingForInput, &r.wg, ) + r.runOnInput() +} + +func (r *runningStep) runOnInput() { select { case loopData, ok := <-r.inputData: if !ok { + r.logger.Debugf("aborted waiting for result in foreach") return } + r.processInput(loopData) + case <-r.ctx.Done(): + r.logger.Debugf("context done") + return + } +} - itemOutputs := make([]any, len(loopData)) - itemErrors := make(map[int]string, len(loopData)) - - r.logger.Debugf("Executing subworkflow for step %s...", r.runID) - wg := &sync.WaitGroup{} - wg.Add(len(loopData)) - errors := false - sem := make(chan struct{}, r.parallelism) - for i, input := range loopData { - i := i - input := input - go func() { - defer func() { - select { - case <-sem: - case <-r.ctx.Done(): // Must not deadlock if closed early. - } - wg.Done() - }() - r.logger.Debugf("Queuing item %d...", i) - select { - case sem <- struct{}{}: - case <-r.ctx.Done(): - r.logger.Debugf("Aborting item %d execution.", i) - return - } +func (r *runningStep) processInput(inputData []any) { + r.logger.Debugf("Executing subworkflow for step %s...", r.runID) + outputs, errors := r.executeSubWorkflows(inputData) - r.logger.Debugf("Executing item %d...", i) - // Ignore the output ID here because it can only be "success" - _, outputData, err := r.workflow.Execute(r.ctx, input) - r.lock.Lock() - if err != nil { - errors = true - itemErrors[i] = err.Error() - } else { - itemOutputs[i] = outputData - } - r.lock.Unlock() - r.logger.Debugf("Item %d complete.", i) - }() + r.logger.Debugf("Subworkflow %s complete.", r.runID) + r.lock.Lock() + previousStage := string(r.currentStage) + r.currentState = step.RunningStepStateRunning + var outputID string + var outputData any + var unresolvableStage StageID + var unresolvableError error + if len(errors) > 0 { + r.currentStage = StageIDFailed + unresolvableStage = StageIDOutputs + unresolvableError = fmt.Errorf("foreach subworkflow failed with errors (%v)", errors) + outputID = "error" + dataMap := make(map[int]any, len(inputData)) + for i, entry := range outputs { + if entry != nil { + dataMap[i] = entry + } } - wg.Wait() - r.logger.Debugf("Subworkflow %s complete.", r.runID) - r.lock.Lock() - previousStage := string(r.currentStage) - r.currentState = step.RunningStepStateRunning - var outputID string - var outputData any - var unresolvableStage StageID - var unresolvableError error - if errors { - r.currentStage = StageIDFailed - unresolvableStage = StageIDOutputs - unresolvableError = fmt.Errorf("foreach subworkflow failed with errors (%v)", itemErrors) - outputID = "error" - dataMap := make(map[int]any, len(loopData)) - for i, entry := range itemOutputs { - if entry != nil { - dataMap[i] = entry + outputData = map[string]any{ + "data": dataMap, + "messages": errors, + } + } else { + r.currentStage = StageIDOutputs + unresolvableStage = StageIDFailed + unresolvableError = fmt.Errorf("foreach succeeded, so error case is unresolvable") + outputID = "success" + outputData = map[string]any{ + "data": outputs, + } + } + currentStage := r.currentStage + r.lock.Unlock() + r.stageChangeHandler.OnStageChange( + r, + &previousStage, + nil, + nil, + string(currentStage), + false, + &r.wg, + ) + r.stageChangeHandler.OnStepStageFailure( + r, + string(unresolvableStage), + &r.wg, + unresolvableError, + ) + r.lock.Lock() + r.currentState = step.RunningStepStateFinished + previousStage = string(r.currentStage) + r.lock.Unlock() + r.stageChangeHandler.OnStepComplete(r, previousStage, &outputID, &outputData, &r.wg) +} + +// returns true if there is an error +func (r *runningStep) executeSubWorkflows(inputData []any) ([]any, map[int]string) { + itemOutputs := make([]any, len(inputData)) + itemErrors := make(map[int]string, len(inputData)) + wg := &sync.WaitGroup{} + wg.Add(len(inputData)) + sem := make(chan struct{}, r.parallelism) + for i, input := range inputData { + i := i + input := input + go func() { + defer func() { + select { + case <-sem: + case <-r.ctx.Done(): // Must not deadlock if closed early. } + wg.Done() + }() + r.logger.Debugf("Queuing item %d...", i) + select { + case sem <- struct{}{}: + case <-r.ctx.Done(): + r.logger.Debugf("Aborting item %d execution.", i) + return } - outputData = map[string]any{ - "data": dataMap, - "messages": itemErrors, - } - } else { - r.currentStage = StageIDOutputs - unresolvableStage = StageIDFailed - unresolvableError = fmt.Errorf("foreach succeeded, so error case is unresolvable") - outputID = "success" - outputData = map[string]any{ - "data": itemOutputs, + + r.logger.Debugf("Executing item %d...", i) + // Ignore the output ID here because it can only be "success" + _, outputData, err := r.workflow.Execute(r.ctx, input) + r.lock.Lock() + if err != nil { + itemErrors[i] = err.Error() + } else { + itemOutputs[i] = outputData } - } - currentStage := r.currentStage - r.lock.Unlock() - r.stageChangeHandler.OnStageChange( - r, - &previousStage, - nil, - nil, - string(currentStage), - false, - &r.wg, - ) - r.stageChangeHandler.OnStepStageFailure( - r, - string(unresolvableStage), - &r.wg, - unresolvableError, - ) - r.lock.Lock() - r.currentState = step.RunningStepStateFinished - previousStage = string(r.currentStage) - r.lock.Unlock() - r.stageChangeHandler.OnStepComplete(r, previousStage, &outputID, &outputData, &r.wg) - case <-r.ctx.Done(): - r.logger.Debugf("context done") - return + r.lock.Unlock() + r.logger.Debugf("Item %d complete.", i) + }() } - + wg.Wait() + return itemOutputs, itemErrors } From 7d7a04edd54e4cd7cd466afa583a9eb69a035d14 Mon Sep 17 00:00:00 2001 From: Jared O'Connell Date: Fri, 26 Jul 2024 11:57:50 -0400 Subject: [PATCH 19/22] Address linter errors --- internal/step/foreach/provider.go | 2 +- internal/step/plugin/provider.go | 2 +- workflow/executor.go | 2 ++ workflow/workflow.go | 11 ++++++----- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/internal/step/foreach/provider.go b/internal/step/foreach/provider.go index 0745ac61..8e21d871 100644 --- a/internal/step/foreach/provider.go +++ b/internal/step/foreach/provider.go @@ -541,7 +541,7 @@ func (r *runningStep) processInput(inputData []any) { r.stageChangeHandler.OnStepComplete(r, previousStage, &outputID, &outputData, &r.wg) } -// returns true if there is an error +// returns true if there is an error. func (r *runningStep) executeSubWorkflows(inputData []any) ([]any, map[int]string) { itemOutputs := make([]any, len(inputData)) itemErrors := make(map[int]string, len(inputData)) diff --git a/internal/step/plugin/provider.go b/internal/step/plugin/provider.go index 9e583f5d..07060727 100644 --- a/internal/step/plugin/provider.go +++ b/internal/step/plugin/provider.go @@ -1515,7 +1515,7 @@ func (r *runningStep) runFailed(err error) { } // TransitionStage transitions the running step to the specified stage, and the state running. -// For other situations, use transitionFromFailedStage, completeStep, or transitionStageWithOutput +// For other situations, use transitionFromFailedStage, completeStep, or transitionStageWithOutput. func (r *runningStep) transitionRunningStage(newStage StageID) { r.transitionStageWithOutput(newStage, step.RunningStepStateRunning, nil, nil) } diff --git a/workflow/executor.go b/workflow/executor.go index 5a31f055..ce446ef8 100644 --- a/workflow/executor.go +++ b/workflow/executor.go @@ -403,6 +403,8 @@ func addScopesWithReferences(allNamespaces map[string]map[string]*schema.ObjectS } // connectStepDependencies connects the steps based on their expressions. +// +//nolint:gocognit func (e *executor) connectStepDependencies( workflow *Workflow, workflowContext map[string][]byte, diff --git a/workflow/workflow.go b/workflow/workflow.go index a47ee407..9833ea73 100644 --- a/workflow/workflow.go +++ b/workflow/workflow.go @@ -210,7 +210,8 @@ func (e *executableWorkflow) Execute(ctx context.Context, serializedInput any) ( case outputDataEntry, ok := <-l.outputDataChannel: if !ok { lastErrors := l.handleErrors() - return "", nil, fmt.Errorf("output data channel unexpectedly closed. %s", lastErrors) + return "", nil, + fmt.Errorf("output data channel unexpectedly closed. %w", lastErrors) } return e.handleOutput(l, outputDataEntry) case <-ctx.Done(): @@ -249,7 +250,7 @@ func (e *executableWorkflow) Execute(ctx context.Context, serializedInput any) ( return "", nil, lastErrors case <-timedContext.Done(): lastErrors := l.handleErrors() - return "", nil, fmt.Errorf("workflow execution aborted (%w) (%s)", ctx.Err(), lastErrors) + return "", nil, fmt.Errorf("workflow execution aborted (%w) (%s)", ctx.Err(), lastErrors.Error()) } } @@ -266,15 +267,15 @@ func (e *executableWorkflow) handleOutput(l *loopState, outputDataEntry outputDa outputSchema, ok := e.outputSchema[outputID] if !ok { return "", nil, fmt.Errorf( - "bug: no output named '%s' found in output schema (%s)", + "bug: no output named '%s' found in output schema (%w)", outputID, lastErrors, ) } _, err = outputSchema.Unserialize(outputDataEntry.outputData) if err != nil { return "", nil, fmt.Errorf( - "bug: output schema cannot unserialize output data (%w) (%s)", - err, lastErrors, + "bug: output schema cannot unserialize output data (%s) (%w)", + err.Error(), lastErrors, ) } return outputDataEntry.outputID, outputData, nil From 3ba5e06e08567bdceee0dc6d767e17767eff129b Mon Sep 17 00:00:00 2001 From: Jared O'Connell Date: Mon, 29 Jul 2024 17:16:33 -0400 Subject: [PATCH 20/22] Addressed changes brought up in meeting --- internal/step/foreach/provider.go | 8 +------- internal/step/plugin/provider.go | 19 +++++++++++-------- workflow/workflow.go | 9 +++++---- workflow/workflow_test.go | 6 +++--- 4 files changed, 20 insertions(+), 22 deletions(-) diff --git a/internal/step/foreach/provider.go b/internal/step/foreach/provider.go index 8e21d871..906db40f 100644 --- a/internal/step/foreach/provider.go +++ b/internal/step/foreach/provider.go @@ -370,9 +370,9 @@ type runningStep struct { func (r *runningStep) ProvideStageInput(stage string, input map[string]any) error { r.lock.Lock() + defer r.lock.Unlock() if r.closed { r.logger.Debugf("exiting foreach ProvideStageInput due to step being closed") - r.lock.Unlock() return nil } switch stage { @@ -384,13 +384,11 @@ func (r *runningStep) ProvideStageInput(stage string, input map[string]any) erro item := v.Index(i).Interface() _, err := r.workflow.Input().Unserialize(item) if err != nil { - r.lock.Unlock() return fmt.Errorf("invalid input item %d for subworkflow (%w) for run/step %s", i, err, r.runID) } input[i] = item } if r.inputAvailable { - r.lock.Unlock() return fmt.Errorf("input for execute workflow provided twice for run/step %s", r.runID) } if r.currentState == step.RunningStepStateWaitingForInput && r.currentStage == StageIDExecute { @@ -398,16 +396,12 @@ func (r *runningStep) ProvideStageInput(stage string, input map[string]any) erro } r.inputAvailable = true r.inputData <- input // Send before unlock to ensure that it never gets closed before sending. - r.lock.Unlock() return nil case string(StageIDOutputs): - r.lock.Unlock() return nil case string(StageIDFailed): - r.lock.Unlock() return nil default: - r.lock.Unlock() return fmt.Errorf("invalid stage: %s", stage) } } diff --git a/internal/step/plugin/provider.go b/internal/step/plugin/provider.go index 07060727..0900dcf6 100644 --- a/internal/step/plugin/provider.go +++ b/internal/step/plugin/provider.go @@ -6,6 +6,7 @@ import ( "go.arcalot.io/dgraph" "go.flow.arcalot.io/pluginsdk/plugin" "reflect" + "strconv" "strings" "sync" "sync/atomic" @@ -160,7 +161,8 @@ const ( StageIDOutput StageID = "outputs" // StageIDCrashed is a stage that indicates that the plugin has quit unexpectedly. StageIDCrashed StageID = "crashed" - // StageIDClosed is a stage that indicates that the plugin has exited due to workflow termination. + // StageIDClosed is a stage that indicates that the plugin has exited due to workflow + // termination or step cancellation. StageIDClosed StageID = "closed" // StageIDStarting is a stage that indicates that the plugin execution has begun. StageIDStarting StageID = "starting" @@ -445,6 +447,8 @@ func (r *runnableStep) DisabledOutputSchema() *schema.StepOutputSchema { ) } +const DEFAULT_CLOSURE_TIMEOUT = 5000 + func closureTimeoutSchema() *schema.PropertySchema { return schema.NewPropertySchema( schema.NewIntSchema(schema.PointerTo(int64(0)), nil, nil), @@ -458,7 +462,7 @@ func closureTimeoutSchema() *schema.PropertySchema { nil, nil, nil, - schema.PointerTo("5000"), // Default 5 seconds. + schema.PointerTo(strconv.FormatInt(DEFAULT_CLOSURE_TIMEOUT, 10)), nil, ) } @@ -936,7 +940,7 @@ func (r *runningStep) provideStartingInput(input map[string]any) error { } timeout = unserializedTimeout.(int64) } else { - timeout = 5000 + timeout = DEFAULT_CLOSURE_TIMEOUT } // Make sure we transition the state before unlocking so there are no race conditions. @@ -1034,7 +1038,6 @@ func (r *runningStep) forceCloseInternal() error { } func (r *runningStep) forceClose() error { - r.closed.Store(true) r.cancel() err := r.closeComponents(false) return err @@ -1352,7 +1355,7 @@ func (r *runningStep) startStage(container deployer.Plugin) (bool, int64, error) return false, runInput.forceCloseTimeoutMS, nil } -func (r *runningStep) runStage(forCloseWaitMS int64) error { +func (r *runningStep) runStage(forceCloseTimeoutMS int64) error { r.logger.Debugf("Running stage for step %s/%s", r.runID, r.pluginStepID) startedOutput := any(map[any]any{}) r.transitionStageWithOutput(StageIDRunning, step.RunningStepStateRunning, schema.PointerTo("started"), &startedOutput) @@ -1363,7 +1366,7 @@ func (r *runningStep) runStage(forCloseWaitMS int64) error { case <-r.ctx.Done(): // In this case, it is being instructed to stop. A cancellation signal should be sent if supported. if r.hasCancellationHandler() { - r.logger.Debugf("Got step context done before step run complete. Sending cancellation signal. Waiting up to %d milliseconds for result.", forCloseWaitMS) + r.logger.Debugf("Got step context done before step run complete. Sending cancellation signal. Waiting up to %d milliseconds for result.", forceCloseTimeoutMS) r.lock.Lock() r.cancelStep() r.lock.Unlock() @@ -1376,9 +1379,9 @@ func (r *runningStep) runStage(forCloseWaitMS int64) error { select { case result = <-r.executionChannel: // Successfully stopped before end of timeout. - case <-time.After(time.Duration(forCloseWaitMS) * time.Millisecond): + case <-time.After(time.Duration(forceCloseTimeoutMS) * time.Millisecond): r.logger.Warningf("Cancelled step %s/%s did not complete within the %d millisecond time limit. Force closing container.", - r.runID, r.pluginStepID, forCloseWaitMS) + r.runID, r.pluginStepID, forceCloseTimeoutMS) if err := r.forceCloseInternal(); err != nil { r.logger.Warningf("Error in step %s/%s while closing plugin container (%s)", r.runID, r.pluginStepID, err.Error()) return fmt.Errorf("step closed after timeout without result with error (%s)", err.Error()) diff --git a/workflow/workflow.go b/workflow/workflow.go index 9833ea73..06d3a58b 100644 --- a/workflow/workflow.go +++ b/workflow/workflow.go @@ -218,7 +218,7 @@ func (e *executableWorkflow) Execute(ctx context.Context, serializedInput any) ( lastErrors := l.handleErrors() if lastErrors == nil { e.logger.Warningf( - "Workflow execution aborted. Waiting 6 more seconds for output (%w)", + "Workflow execution aborted. Waiting for output before terminating (%w)", lastErrors) } else { return "", nil, lastErrors @@ -259,9 +259,10 @@ func (e *executableWorkflow) Execute(ctx context.Context, serializedInput any) ( func (e *executableWorkflow) handleOutput(l *loopState, outputDataEntry outputDataType) (outputID string, outputData any, err error) { lastErrors := l.handleErrors() if lastErrors != nil { - e.logger.Warningf("errors occurred alongside completed output (%s)", lastErrors.Error()) + e.logger.Warningf("output completed with errors (%s)", lastErrors.Error()) + } else { + e.logger.Debugf("output complete.") } - e.logger.Debugf("Output complete.") outputID = outputDataEntry.outputID outputData = outputDataEntry.outputData outputSchema, ok := e.outputSchema[outputID] @@ -524,7 +525,7 @@ func (l *loopState) notifySteps() { //nolint:gocognit // untypedInputData stores the resolved data untypedInputData, err := l.resolveExpressions(inputData, l.data) if err != nil { - // An error here often indicates a locking issue in a step provider. This would be caused + // An error here often indicates a locking issue in a step provider. This could be caused // by the lock not being held when the output was marked resolved. panic(fmt.Errorf("cannot resolve expressions for %s (%w)", nodeID, err)) } diff --git a/workflow/workflow_test.go b/workflow/workflow_test.go index 97b83a0a..0f46eb27 100644 --- a/workflow/workflow_test.go +++ b/workflow/workflow_test.go @@ -242,7 +242,7 @@ steps: # Delay needs to be delayed long enough to ensure that it's in a deploy state when it's cancelled by short_wait deploy: deployer_name: "test-impl" - deploy_time: 10 # ms + deploy_time: 20 # ms short_wait: plugin: src: "n/a" @@ -847,7 +847,7 @@ func TestEarlyContextCancellation(t *testing.T) { )) wf := lang.Must2(workflow.NewYAMLConverter(stepRegistry).FromYAML([]byte(fourSecWaitWorkflowDefinition))) preparedWorkflow := lang.Must2(executor.Prepare(wf, map[string][]byte{})) - // Cancel the context after 30 ms to simulate cancellation with ctrl-c. + // Cancel the context after a timeout to simulate cancellation with ctrl-c. ctx, cancel := context.WithTimeout(context.Background(), time.Millisecond*10) startTime := time.Now() // Right before execute to not include pre-processing time. //nolint:dogsled @@ -1451,7 +1451,7 @@ steps: stop_if: !expr '$.input.fail_purposefully' deploy: deployer_name: "test-impl" - deploy_time: 10 # 10 ms of delay to make the cancellation more reliable. + deploy_time: 10 # (ms) delay to make the cancellation more reliable. long_wait: plugin: src: "n/a" From 711d7044ed1da3f5bbca4f6e1d1b8dea6d1c92a1 Mon Sep 17 00:00:00 2001 From: Jared O'Connell Date: Mon, 29 Jul 2024 17:23:34 -0400 Subject: [PATCH 21/22] Fix linter error --- internal/step/plugin/provider.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/step/plugin/provider.go b/internal/step/plugin/provider.go index 0900dcf6..23ed21a4 100644 --- a/internal/step/plugin/provider.go +++ b/internal/step/plugin/provider.go @@ -447,7 +447,7 @@ func (r *runnableStep) DisabledOutputSchema() *schema.StepOutputSchema { ) } -const DEFAULT_CLOSURE_TIMEOUT = 5000 +const defaultClosureTimeout = 5000 func closureTimeoutSchema() *schema.PropertySchema { return schema.NewPropertySchema( @@ -462,7 +462,7 @@ func closureTimeoutSchema() *schema.PropertySchema { nil, nil, nil, - schema.PointerTo(strconv.FormatInt(DEFAULT_CLOSURE_TIMEOUT, 10)), + schema.PointerTo(strconv.FormatInt(defaultClosureTimeout, 10)), nil, ) } @@ -940,7 +940,7 @@ func (r *runningStep) provideStartingInput(input map[string]any) error { } timeout = unserializedTimeout.(int64) } else { - timeout = DEFAULT_CLOSURE_TIMEOUT + timeout = defaultClosureTimeout } // Make sure we transition the state before unlocking so there are no race conditions. From 6de27d32bc383d542e36223703e1429c5e01dfd1 Mon Sep 17 00:00:00 2001 From: Jared O'Connell Date: Fri, 16 Aug 2024 18:49:49 -0400 Subject: [PATCH 22/22] Fix cross-test failure bug --- go.mod | 10 +++++----- go.sum | 10 ++++++++++ workflow/workflow.go | 5 +++-- workflow/workflow_test.go | 6 +++--- 4 files changed, 21 insertions(+), 10 deletions(-) diff --git a/go.mod b/go.mod index ecd31b85..9cf3f781 100644 --- a/go.mod +++ b/go.mod @@ -11,10 +11,10 @@ require ( go.flow.arcalot.io/dockerdeployer v0.7.2 go.flow.arcalot.io/expressions v0.4.3 go.flow.arcalot.io/kubernetesdeployer v0.9.3 - go.flow.arcalot.io/pluginsdk v0.12.3 + go.flow.arcalot.io/pluginsdk v0.12.5 go.flow.arcalot.io/podmandeployer v0.11.2 go.flow.arcalot.io/pythondeployer v0.6.1 - go.flow.arcalot.io/testdeployer v0.6.1 + go.flow.arcalot.io/testdeployer v0.6.2 gopkg.in/yaml.v3 v3.0.1 ) @@ -57,7 +57,7 @@ require ( github.com/rogpeppe/go-internal v1.12.0 // indirect github.com/x448/float16 v0.8.4 // indirect go.arcalot.io/exex v0.2.0 // indirect - go.flow.arcalot.io/testplugin v0.4.1 // indirect + go.flow.arcalot.io/testplugin v0.4.2 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.52.0 // indirect go.opentelemetry.io/otel v1.27.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.27.0 // indirect @@ -66,8 +66,8 @@ require ( go.opentelemetry.io/proto/otlp v1.2.0 // indirect golang.org/x/net v0.26.0 // indirect golang.org/x/oauth2 v0.20.0 // indirect - golang.org/x/sys v0.22.0 // indirect - golang.org/x/term v0.22.0 // indirect + golang.org/x/sys v0.24.0 // indirect + golang.org/x/term v0.23.0 // indirect golang.org/x/text v0.16.0 // indirect golang.org/x/time v0.5.0 // indirect google.golang.org/grpc v1.64.0 // indirect diff --git a/go.sum b/go.sum index aa4b96aa..3ffc7871 100644 --- a/go.sum +++ b/go.sum @@ -141,14 +141,20 @@ go.flow.arcalot.io/kubernetesdeployer v0.9.3 h1:XKiqmCqXb6ZLwP5IQTAKS/gJHpq0Ub/y go.flow.arcalot.io/kubernetesdeployer v0.9.3/go.mod h1:DtB6HR7HBt/HA1vME0faIpOQ/lhfBJjL6OAGgT3Bu/Q= go.flow.arcalot.io/pluginsdk v0.12.3 h1:TQyzRltJ92ZiRv3JMkVKywu04xhfIq+aFAlH4oSfK0I= go.flow.arcalot.io/pluginsdk v0.12.3/go.mod h1:5kMCVigP89J/KU5T72EAczQXPWXZkRfUFcpnIkOECV8= +go.flow.arcalot.io/pluginsdk v0.12.5 h1:9b3pKeoHCRH4yF2xIdu8L7SqNUYJFxTsk8L5+inR1xw= +go.flow.arcalot.io/pluginsdk v0.12.5/go.mod h1:5kMCVigP89J/KU5T72EAczQXPWXZkRfUFcpnIkOECV8= go.flow.arcalot.io/podmandeployer v0.11.2 h1:aqrHaNaCXYDREqDJpKhVeVIIZPiPld5SDvnUcc0Tjiw= go.flow.arcalot.io/podmandeployer v0.11.2/go.mod h1:70+9M6eVQa0EEynDZ720P3AEzXt1gZto2lvoMlyjuzo= go.flow.arcalot.io/pythondeployer v0.6.1 h1:IyaA9BVfHJ2fhC+fNfT6VicrtRGFlZOlSaAVGGPwo1E= go.flow.arcalot.io/pythondeployer v0.6.1/go.mod h1:gfEZtkDR/UURlwexuH0jARykh6hvFz1V5dC32Ub/YaM= go.flow.arcalot.io/testdeployer v0.6.1 h1:Bo0bP1U4QoMEMCHCOBVBTENpIlB8P19/5sh609r8mek= go.flow.arcalot.io/testdeployer v0.6.1/go.mod h1:FLVrEhYWH1a9RvIgtjZ88lRaodP+bOGEFqDqtRuN8B0= +go.flow.arcalot.io/testdeployer v0.6.2 h1:Bys5WJvHakXz543DsoRPJNBDS6MQUZca2STiaE6ERjM= +go.flow.arcalot.io/testdeployer v0.6.2/go.mod h1:0GooNIfYPWSMLlN6RDJjtsoekQGZRh0iVyRFDxwea1g= go.flow.arcalot.io/testplugin v0.4.1 h1:czJFi2kg54as9qVbDPW4cjrRcF3lyKyPlBew+Fmy5sE= go.flow.arcalot.io/testplugin v0.4.1/go.mod h1:yU+8dTuaqPlxNkceLN9AOAwe5lG+nsztCZYgEMmw3sc= +go.flow.arcalot.io/testplugin v0.4.2 h1:JTS8AIXntJOuwTfXwUmrXV2v2Wk7z48TYvC52fJPdkE= +go.flow.arcalot.io/testplugin v0.4.2/go.mod h1:yU+8dTuaqPlxNkceLN9AOAwe5lG+nsztCZYgEMmw3sc= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.52.0 h1:9l89oX4ba9kHbBol3Xin3leYJ+252h0zszDtBwyKe2A= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.52.0/go.mod h1:XLZfZboOJWHNKUv7eH0inh0E9VV6eWDFB/9yJyTLPp0= go.opentelemetry.io/otel v1.27.0 h1:9BZoF3yMK/O1AafMiQTVu0YDj5Ea4hPhxCs7sGva+cg= @@ -187,8 +193,12 @@ golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI= golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg= +golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.22.0 h1:BbsgPEJULsl2fV/AT3v15Mjva5yXKQDyKf+TbDz7QJk= golang.org/x/term v0.22.0/go.mod h1:F3qCibpT5AMpCRfhfT53vVJwhLtIVHhB9XDjfFvnMI4= +golang.org/x/term v0.23.0 h1:F6D4vR+EHoL9/sWAWgAR1H2DcHr4PareCbAaCo1RpuU= +golang.org/x/term v0.23.0/go.mod h1:DgV24QBUrK6jhZXl+20l6UWznPlwAHm1Q1mGHtydmSk= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= diff --git a/workflow/workflow.go b/workflow/workflow.go index 06d3a58b..9a0a0a33 100644 --- a/workflow/workflow.go +++ b/workflow/workflow.go @@ -221,6 +221,7 @@ func (e *executableWorkflow) Execute(ctx context.Context, serializedInput any) ( "Workflow execution aborted. Waiting for output before terminating (%w)", lastErrors) } else { + e.logger.Debugf("Workflow execution exited with error after context done") return "", nil, lastErrors } timedContext, cancelFunction := context.WithTimeout(context.Background(), 5*time.Second) @@ -261,7 +262,7 @@ func (e *executableWorkflow) handleOutput(l *loopState, outputDataEntry outputDa if lastErrors != nil { e.logger.Warningf("output completed with errors (%s)", lastErrors.Error()) } else { - e.logger.Debugf("output complete.") + e.logger.Debugf("output complete with output ID %s", outputDataEntry.outputID) } outputID = outputDataEntry.outputID outputData = outputDataEntry.outputData @@ -503,7 +504,7 @@ func (l *loopState) notifySteps() { //nolint:gocognit // Check to see if there are any remaining output nodes, and if there aren't, // cancel the context. delete(l.waitingOutputs, nodeID) - if len(l.waitingOutputs) == 0 { + if len(l.waitingOutputs) == 0 && !l.outputDone { l.recentErrors <- &ErrNoMorePossibleOutputs{ l.dag, } diff --git a/workflow/workflow_test.go b/workflow/workflow_test.go index 0f46eb27..8ba4053d 100644 --- a/workflow/workflow_test.go +++ b/workflow/workflow_test.go @@ -362,7 +362,7 @@ outputs: ` func TestWaitForSerial(t *testing.T) { - // For this test, a workflow runs two steps, where each step runs a wait step for 5s + // For this test, a workflow runs two steps, where each step runs a wait step for 10ms // The second wait step waits for the first to succeed after which it runs // Due to the wait for condition, the steps will execute serially // The total execution time for this test function should be greater than 10seconds @@ -525,7 +525,7 @@ steps: wait_time_ms: !expr $.input.wait_time_ms outputs: success: - b: !expr $.steps.wait_1.outputs + b: !expr $.steps.wait_1.outputs ` func TestWaitForSerial_Foreach(t *testing.T) { @@ -539,7 +539,7 @@ func TestWaitForSerial_Foreach(t *testing.T) { // run serially. logConfig := log.Config{ - Level: log.LevelInfo, + Level: log.LevelDebug, Destination: log.DestinationStdout, } logger := log.New(