From e3a907966ee130f2f56d1e5187589157482bca12 Mon Sep 17 00:00:00 2001 From: Piotr Truszkowski Date: Thu, 18 Apr 2024 10:14:44 +0200 Subject: [PATCH] feat: wait for run in resource `spacelift_run` (#535) * feat: added generic functions AsError and IsErrorType for working with error types inline in if-statements * refactor: added github.com/hashicorp/terraform-plugin-log as direct requirement to go.mod * feat: implemented wait logic for resource spacelift_run * test: added TestRunResourceWait to spacelift/resource_run_test.go * docs: updated docs for spacelift_run resource * refactor: using continue_on_state for a list of possible end states to wait spacelift/resource_run.go * refactor: change "enabled" to "disabled" in schema and waitConfiguration struct resource_run.go * fix test: should be "disabled" instead of "enabled" * let's treat unconfirmed as the target state. * use RunResourceState (global scope) instead of Run (user scope) * added two more tests for "wait_for_run" * fix logs * fix tests: same WP name for different tests * fix field description; move piece of code into separate method --------- Co-authored-by: Thomas --- docs/resources/run.md | 19 +++ go.mod | 2 +- spacelift/internal/error.go | 14 +++ spacelift/resource_run.go | 207 ++++++++++++++++++++++++++++++- spacelift/resource_run_test.go | 160 ++++++++++++++++++++++++ spacelift/resource_stack_test.go | 8 +- 6 files changed, 402 insertions(+), 8 deletions(-) diff --git a/docs/resources/run.md b/docs/resources/run.md index 26f1cbc0..471305b7 100644 --- a/docs/resources/run.md +++ b/docs/resources/run.md @@ -40,7 +40,26 @@ resource "spacelift_run" "this" { - `commit_sha` (String) The commit SHA for which to trigger a run. - `keepers` (Map of String) Arbitrary map of values that, when changed, will trigger recreation of the resource. - `proposed` (Boolean) Whether the run is a proposed run. Defaults to `false`. 
+- `timeouts` (Block, Optional) (see [below for nested schema](#nestedblock--timeouts)) +- `wait` (Block List, Max: 1) Wait for the run to finish (see [below for nested schema](#nestedblock--wait)) ### Read-Only - `id` (String) The ID of the triggered run. + + +### Nested Schema for `timeouts` + +Optional: + +- `create` (String) + + + +### Nested Schema for `wait` + +Optional: + +- `continue_on_state` (Set of String) Continue on the specified states of a finished run. If not specified, the default is `[ 'finished' ]`. You can use following states: `applying`, `canceled`, `confirmed`, `destroying`, `discarded`, `failed`, `finished`, `initializing`, `pending_review`, `performing`, `planning`, `preparing_apply`, `preparing_replan`, `preparing`, `queued`, `ready`, `replan_requested`, `skipped`, `stopped`, `unconfirmed`. +- `continue_on_timeout` (Boolean) Continue if run timed out, i.e. did not reach any defined end state in time. Default: `false` +- `disabled` (Boolean) Whether waiting for a job is disabled or not. 
Default: `false` diff --git a/go.mod b/go.mod index 34f0c96d..e1806895 100644 --- a/go.mod +++ b/go.mod @@ -6,6 +6,7 @@ require ( github.com/dgrijalva/jwt-go/v4 v4.0.0-preview1 github.com/hashicorp/go-cty v1.4.1-0.20200414143053-d3edf31b6320 github.com/hashicorp/go-retryablehttp v0.7.4 + github.com/hashicorp/terraform-plugin-log v0.9.0 github.com/hashicorp/terraform-plugin-sdk/v2 v2.29.0 github.com/kelseyhightower/envconfig v1.4.0 github.com/pkg/errors v0.9.1 @@ -36,7 +37,6 @@ require ( github.com/hashicorp/terraform-exec v0.19.0 // indirect github.com/hashicorp/terraform-json v0.17.1 // indirect github.com/hashicorp/terraform-plugin-go v0.19.0 // indirect - github.com/hashicorp/terraform-plugin-log v0.9.0 // indirect github.com/hashicorp/terraform-registry-address v0.2.2 // indirect github.com/hashicorp/terraform-svchost v0.1.1 // indirect github.com/hashicorp/yamux v0.0.0-20181012175058-2f1d1f20f75d // indirect diff --git a/spacelift/internal/error.go b/spacelift/internal/error.go index d1d8ca66..1ec8c66a 100644 --- a/spacelift/internal/error.go +++ b/spacelift/internal/error.go @@ -46,3 +46,17 @@ func parseExtensions(ext map[string]interface{}) string { return strings.Join(errorParts, ", ") } + +// AsError is an inline form of errors.As. +func AsError[TError error](err error) (TError, bool) { + var as TError + ok := errors.As(err, &as) + return as, ok +} + +// IsErrorType reports whether or not the type of any error in err's chain matches +// the Error type. 
+func IsErrorType[TError error](err error) bool { + _, ok := AsError[TError](err) + return ok +} diff --git a/spacelift/resource_run.go b/spacelift/resource_run.go index 4eb39e05..80137f5b 100644 --- a/spacelift/resource_run.go +++ b/spacelift/resource_run.go @@ -2,9 +2,16 @@ package spacelift import ( "context" + "fmt" + "slices" + "strings" + "time" + "github.com/hashicorp/terraform-plugin-log/tflog" "github.com/hashicorp/terraform-plugin-sdk/v2/diag" + "github.com/hashicorp/terraform-plugin-sdk/v2/helper/retry" "github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema" + "github.com/pkg/errors" "github.com/shurcooL/graphql" "github.com/spacelift-io/terraform-provider-spacelift/spacelift/internal" @@ -20,6 +27,11 @@ func resourceRun() *schema.Resource { CreateContext: resourceRunCreate, ReadContext: schema.NoopContext, Delete: schema.RemoveFromState, + UpdateContext: schema.NoopContext, + + Timeouts: &schema.ResourceTimeout{ + Create: schema.DefaultTimeout(30 * time.Minute), + }, Schema: map[string]*schema.Schema{ "stack_id": { @@ -55,16 +67,144 @@ func resourceRun() *schema.Resource { Type: schema.TypeString, Computed: true, }, + "wait": { + Type: schema.TypeList, + Optional: true, + Description: "Wait for the run to finish", + MaxItems: 1, + Elem: &schema.Resource{ + Schema: map[string]*schema.Schema{ + "disabled": { + Type: schema.TypeBool, + Description: "Whether waiting for a job is disabled or not. Default: `false`", + Optional: true, + Default: false, + }, + "continue_on_state": { + Type: schema.TypeSet, + Elem: &schema.Schema{ + Type: schema.TypeString, + }, + Description: "Continue on the specified states of a finished run. If not specified, the default is `[ 'finished' ]`. 
You can use following states: `applying`, `canceled`, `confirmed`, `destroying`, `discarded`, `failed`, `finished`, `initializing`, `pending_review`, `performing`, `planning`, `preparing_apply`, `preparing_replan`, `preparing`, `queued`, `ready`, `replan_requested`, `skipped`, `stopped`, `unconfirmed`.", + Optional: true, + }, + "continue_on_timeout": { + Type: schema.TypeBool, + Description: "Continue if run timed out, i.e. did not reach any defined end state in time. Default: `false`", + Optional: true, + Default: false, + }, + }, + }, + }, }, } } +type waitConfiguration struct { + disabled bool + continueOnState []string + continueOnTimeout bool +} + +func expandWaitConfiguration(input []interface{}) *waitConfiguration { + if len(input) == 0 { + return nil + } + v := input[0].(map[string]interface{}) + cfg := &waitConfiguration{ + disabled: v["disabled"].(bool), + continueOnState: []string{}, + continueOnTimeout: v["continue_on_timeout"].(bool), + } + + if v, ok := v["continue_on_state"]; ok { + for _, item := range v.(*schema.Set).List() { + str, ok := item.(string) + if !ok { + panic(fmt.Sprintf("continue_on_state contains a non-string element %+v", str)) + } + cfg.continueOnState = append(cfg.continueOnState, str) + } + } + if len(cfg.continueOnState) == 0 { + cfg.continueOnState = append(cfg.continueOnState, "finished") + } + return cfg +} + +func (wait *waitConfiguration) Wait(ctx context.Context, d *schema.ResourceData, client *internal.Client, stackID, mutationID string) diag.Diagnostics { + if wait.disabled { + return nil + } + + stateConf := &retry.StateChangeConf{ + ContinuousTargetOccurence: 1, + Delay: 10 * time.Second, + MinTimeout: 10 * time.Second, + Pending: []string{ + "running", + }, + Target: []string{ + "finished", + "unconfirmed", // Let's treat unconfirmed as the target state. + // It's not finished, but we don't want to wait for it because it requires confirmation from someone. 
+ }, + Refresh: checkStackStatusFunc(ctx, client, stackID, mutationID), + Timeout: d.Timeout(schema.TimeoutCreate), + } + + finalState, err := stateConf.WaitForStateContext(ctx) + if err != nil { + if timeoutErr, ok := internal.AsError[*retry.TimeoutError](err); ok { + tflog.Debug(ctx, "received retry.TimeoutError from WaitForStateContext", map[string]any{ + "stackID": stackID, + "runID": mutationID, + "lastState": timeoutErr.LastState, + "expectedState": timeoutErr.ExpectedState, + }) + finalState = "__timeout__" + } else if err == context.DeadlineExceeded { + tflog.Debug(ctx, "received context.DeadlineExceeded from WaitForStateContext", map[string]any{ + "stackID": stackID, + "runID": mutationID, + }) + finalState = "__timeout__" + } else { + return diag.Errorf("failed waiting for run %s on stack %s to finish. error(%T): %+v ", mutationID, stackID, err, err) + } + } + + switch finalState.(string) { + case "__timeout__": + if !wait.continueOnTimeout { + return diag.Errorf("run %s on stack %s has timed out", mutationID, stackID) + } + tflog.Info(ctx, "run timed out but continue_on_timeout=true", + map[string]any{ + "stackID": stackID, + "runID": mutationID, + }) + default: + if !slices.Contains[[]string](wait.continueOnState, finalState.(string)) { + return diag.Errorf("run %s on stack %s has ended with status %s. 
expected %v", mutationID, stackID, finalState, wait.continueOnState) + } + tflog.Debug(ctx, "run finished", map[string]any{ + "stackID": stackID, + "runID": mutationID, + "finalState": finalState, + }) + } + + return nil +} + func resourceRunCreate(ctx context.Context, d *schema.ResourceData, meta interface{}) diag.Diagnostics { var mutation struct { ID string `graphql:"runResourceCreate(stack: $stack, commitSha: $sha, proposed: $proposed)"` } - stackID := d.Get("stack_id") + stackID := d.Get("stack_id").(string) variables := map[string]interface{}{ "stack": toID(stackID), @@ -80,11 +220,72 @@ func resourceRunCreate(ctx context.Context, d *schema.ResourceData, meta interfa variables["proposed"] = graphql.NewBoolean(graphql.Boolean(proposed.(bool))) } - if err := meta.(*internal.Client).Mutate(ctx, "ResourceRunCreate", &mutation, variables); err != nil { + client := meta.(*internal.Client) + if err := client.Mutate(ctx, "ResourceRunCreate", &mutation, variables); err != nil { return diag.Errorf("could not trigger run for stack %s: %v", stackID, internal.FromSpaceliftError(err)) } - d.SetId(mutation.ID) + if waitRaw, ok := d.GetOk("wait"); ok { + wait := expandWaitConfiguration(waitRaw.([]interface{})) + if diag := wait.Wait(ctx, d, client, stackID, mutation.ID); len(diag) > 0 { + return diag + } + } + d.SetId(mutation.ID) return nil } + +func checkStackStatusFunc(ctx context.Context, client *internal.Client, stackID string, runID string) retry.StateRefreshFunc { + return func() (result any, state string, err error) { + // instead of a resource handle we return the current state as result + // Makes it easier to detect which end state has been reached. + // Otherwise we would need another GraphQL query + result, finished, err := getStackRunStateByID(ctx, client, stackID, runID) + if err != nil { + return + } + state = "running" + if finished { + state = "finished" + } + // Let's treat unconfirmed as the target state. 
+ // It's not finished, but we don't want to wait for it because it requires confirmation from someone. + if result == "unconfirmed" { + state = "unconfirmed" + } + return + } +} + +func getStackRunStateByID(ctx context.Context, client *internal.Client, stackID string, runID string) (string, bool, error) { + var query struct { + Stack struct { + RunResourceState struct { + ID graphql.String + State graphql.String + Finished graphql.Boolean + } `graphql:"runResourceState(id: $runId)"` + } `graphql:"stack(id: $stackId)"` + } + + variables := map[string]interface{}{ + "stackId": graphql.ID(stackID), + "runId": graphql.ID(runID), + } + + if err := client.Query(ctx, "StackRunRead", &query, variables); err != nil { + return "", false, errors.Wrap(err, fmt.Sprintf("could not query for run %s of stack %s", runID, stackID)) + } + + rrs := query.Stack.RunResourceState + + currentState := strings.ToLower(string(rrs.State)) + tflog.Debug(ctx, "current state of run", map[string]interface{}{ + "stackID": stackID, + "runID": runID, + "currentState": currentState, + "finished": rrs.Finished, + }) + return currentState, bool(rrs.Finished), nil +} diff --git a/spacelift/resource_run_test.go b/spacelift/resource_run_test.go index e2150ae5..aeb99baa 100644 --- a/spacelift/resource_run_test.go +++ b/spacelift/resource_run_test.go @@ -2,6 +2,7 @@ package spacelift import ( "fmt" + "regexp" "testing" "github.com/hashicorp/terraform-plugin-sdk/v2/helper/acctest" @@ -47,3 +48,162 @@ func TestRunResource(t *testing.T) { }) }) } + +func TestRunResourceWait(t *testing.T) { + + t.Run("on a new stack", func(t *testing.T) { + const resourceName = "spacelift_run.test" + + randomID := acctest.RandStringFromCharSet(5, acctest.CharSetAlphaNum) + randomIDwp := acctest.RandStringFromCharSet(5, acctest.CharSetAlphaNum) + + testSteps(t, []resource.TestStep{ + { + Config: fmt.Sprintf(` + resource "spacelift_worker_pool" "test" { + name = "Let's create a dummy worker pool to avoid running the job %s" + } 
+ + resource "spacelift_stack" "test" { + name = "Test stack %s" + repository = "demo" + branch = "feat_wait_for_run" + worker_pool_id = spacelift_worker_pool.test.id + } + + resource "spacelift_run" "test" { + stack_id = spacelift_stack.test.id + + keepers = { "bacon" = "tasty" } + + timeouts { + create = "10s" + } + + wait { + disabled = false + continue_on_timeout = true + } + }`, randomIDwp, randomID), + Check: Resource( + resourceName, + Attribute("id", IsNotEmpty()), + Attribute("stack_id", Contains(randomID)), + ), + }, + }) + }) + + t.Run("timed out run", func(t *testing.T) { + const resourceName = "spacelift_run.test" + + randomID := acctest.RandStringFromCharSet(5, acctest.CharSetAlphaNum) + randomIDwp := acctest.RandStringFromCharSet(5, acctest.CharSetAlphaNum) + + testSteps(t, []resource.TestStep{ + { + Config: fmt.Sprintf(` + resource "spacelift_worker_pool" "test" { + name = "Let's create a dummy worker pool to avoid running the job %s" + } + + resource "spacelift_stack" "test" { + name = "Test stack %s" + repository = "demo" + branch = "feat_wait_for_run" + worker_pool_id = spacelift_worker_pool.test.id + } + + resource "spacelift_run" "test" { + stack_id = spacelift_stack.test.id + + keepers = { "bacon" = "tasty" } + + timeouts { + create = "10s" + } + + wait { + disabled = false + continue_on_timeout = false + } + }`, randomIDwp, randomID), + ExpectError: regexp.MustCompile("run [0-9A-Z]* on stack test-stack-[a-z0-9]* has timed out"), + }, + }) + }) + + t.Run("continue on unconfirmed", func(t *testing.T) { + const resourceName = "spacelift_run.test" + + randomID := acctest.RandStringFromCharSet(5, acctest.CharSetAlphaNum) + + testSteps(t, []resource.TestStep{ + { + Config: fmt.Sprintf(` + resource "spacelift_stack" "test" { + name = "Test stack %s" + repository = "demo" + branch = "feat_wait_for_run" + } + + resource "spacelift_run" "test" { + stack_id = spacelift_stack.test.id + + keepers = { "bacon" = "tasty" } + + timeouts { + create = "120s" + 
} + + wait { + disabled = false + continue_on_state = ["unconfirmed"] + } + }`, randomID), + Check: Resource( + resourceName, + Attribute("id", IsNotEmpty()), + Attribute("stack_id", Contains(randomID)), + ), + }, + }) + }) + + t.Run("finished with autodeploy", func(t *testing.T) { + const resourceName = "spacelift_run.test" + + randomID := acctest.RandStringFromCharSet(5, acctest.CharSetAlphaNum) + + testSteps(t, []resource.TestStep{ + { + Config: fmt.Sprintf(` + resource "spacelift_stack" "test" { + name = "Test stack %s" + repository = "demo" + branch = "feat_wait_for_run" + autodeploy = true + } + + resource "spacelift_run" "test" { + stack_id = spacelift_stack.test.id + + keepers = { "bacon" = "tasty" } + + timeouts { + create = "180s" + } + + wait { + disabled = false + } + }`, randomID), + Check: Resource( + resourceName, + Attribute("id", IsNotEmpty()), + Attribute("stack_id", Contains(randomID)), + ), + }, + }) + }) +} diff --git a/spacelift/resource_stack_test.go b/spacelift/resource_stack_test.go index af30da40..9759aafc 100644 --- a/spacelift/resource_stack_test.go +++ b/spacelift/resource_stack_test.go @@ -146,10 +146,10 @@ func TestStackResource(t *testing.T) { worker_pool_id = spacelift_worker_pool.test.id } resource "spacelift_worker_pool" "test" { - name = "Autoretryable worker pool." + name = "Autoretryable worker pool (%s)." description = "test worker pool" } - `, description, randomID, randomID) + `, description, randomID, randomID, randomID) } testSteps(t, []resource.TestStep{ @@ -801,10 +801,10 @@ func TestStackResourceSpace(t *testing.T) { } resource "spacelift_worker_pool" "test" { - name = "Autoretryable worker pool." + name = "Autoretryable worker pool (%s)." description = "test worker pool" } - `, description, randomID, randomID) + `, description, randomID, randomID, randomID) } testSteps(t, []resource.TestStep{