Skip to content

Commit

Permalink
CSU-2024: Drift detection for other clouds (#423)
Browse files Browse the repository at this point in the history
* Add tests for update flow and credentials ID handling

* Credentials drift for GKE

* Expose common update for the three resoruce types. Use in AKS.

* Add common update in EKS

* Added comment why we don't do drift for EKS

* Tests for EKS. Added nil check in update handling

* Generate-all

* Add explicit provider name to tfplugindocs so it works even if folder name does not match

* Remove forgotten todo

* Add drift for EKS in case credentials are reset remotely without changing role.

* Increase acc test timeout a bit
  • Loading branch information
Tsonov authored Nov 15, 2024
1 parent ad777a3 commit bcc3f37
Show file tree
Hide file tree
Showing 9 changed files with 679 additions and 208 deletions.
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ generate-sdk:
.PHONY: generate-docs
generate-docs:
go install github.com/hashicorp/terraform-plugin-docs/cmd/[email protected]
tfplugindocs generate --rendered-provider-name "CAST AI" --ignore-deprecated
tfplugindocs generate --rendered-provider-name "CAST AI" --ignore-deprecated --provider-name terraform-provider-castai

.PHONY: generate-all
generate-all: generate-sdk generate-docs
Expand All @@ -63,7 +63,7 @@ test:
.PHONY: testacc
testacc:
@echo "==> Running acceptance tests"
TF_ACC=1 go test ./castai/... '-run=^TestAcc' -v -timeout 40m
TF_ACC=1 go test ./castai/... '-run=^TestAcc' -v -timeout 50m

.PHONY: validate-terraform-examples
validate-terraform-examples:
Expand Down
74 changes: 74 additions & 0 deletions castai/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,13 @@ package castai

import (
"context"
"errors"
"fmt"
"log"
"net/http"
"time"

"github.com/cenkalti/backoff/v4"
"github.com/google/uuid"
"github.com/hashicorp/terraform-plugin-sdk/v2/diag"
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/retry"
Expand Down Expand Up @@ -132,6 +135,77 @@ func fetchClusterData(ctx context.Context, client *sdk.ClientWithResponses, clus
return resp, nil
}

// resourceCastaiClusterUpdate performs the update call to Cast API for a given cluster.
// Handles backoffs and data drift for fields that are not provider-specific.
// Caller is responsible to populate data and request parameters with all data.
func resourceCastaiClusterUpdate(
ctx context.Context,
client *sdk.ClientWithResponses,
data *schema.ResourceData,
request *sdk.ExternalClusterAPIUpdateClusterJSONRequestBody,
) error {
b := backoff.WithContext(backoff.NewExponentialBackOff(), ctx)

var lastErr error
var credentialsID string
if err := backoff.RetryNotify(func() error {
response, err := client.ExternalClusterAPIUpdateClusterWithResponse(ctx, data.Id(), *request)
if err != nil {
return fmt.Errorf("error when calling update cluster API: %w", err)
}

err = sdk.StatusOk(response)

if err != nil {
// In case of malformed user request return error to user right away.
// Credentials error is omitted as permissions propagate eventually and sometimes aren't visible immediately.
if response.StatusCode() == 400 && !sdk.IsCredentialsError(response) {
return backoff.Permanent(err)
}

if response.StatusCode() == 400 && sdk.IsCredentialsError(response) {
log.Printf("[WARN] Received credentials error from backend, will retry in case the issue is caused by IAM eventual consistency.")
}
return fmt.Errorf("error in update cluster response: %w", err)
}

if response.JSON200.CredentialsId != nil {
credentialsID = *response.JSON200.CredentialsId
}
return nil
}, b, func(err error, _ time.Duration) {
// Only store non-context errors so we can surface the last "real" error to the user at the end
if !errors.Is(err, context.DeadlineExceeded) && !errors.Is(err, context.Canceled) {
lastErr = err
}
log.Printf("[WARN] Encountered error while updating cluster settings, will retry: %v", err)
}); err != nil {
// Reset CredentialsID in state in case of failed updates.
// This is because TF will save the raw credentials in state even on failed updates.
// Since the raw values are not exposed via API, TF cannot see drift and will not try to re-apply them next time, leaving the caller stuck.
// Resetting this value here will trigger our credentialsID drift detection on Read() and force re-apply to fix the drift.
// Note: cannot use empty string; if first update failed then credentials will also be empty on remote => no drift on Read.
// Src: https://developer.hashicorp.com/terraform/plugin/framework/diagnostics#returning-errors-and-warnings
if err := data.Set(FieldClusterCredentialsId, "drift-protection-failed-update"); err != nil {
log.Printf("[ERROR] Failed to reset cluster credentials ID after failed update: %v", err)
}

if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
return fmt.Errorf("updating cluster configuration failed due to context: %w; last observed error was: %v", err, lastErr)
}
return fmt.Errorf("updating cluster configuration: %w", err)
}

// In case the update succeeded, we must update the state with the *generated* credentials_id before re-reading.
// This is because on update, the credentials_id always changes => read drift detection would see that and trigger infinite drift
err := data.Set(FieldClusterCredentialsId, credentialsID)
if err != nil {
return fmt.Errorf("failed to update credentials ID after successful update: %w", err)
}

return nil
}

func createClusterToken(ctx context.Context, client *sdk.ClientWithResponses, clusterID string) (string, error) {
resp, err := client.ExternalClusterAPICreateClusterTokenWithResponse(ctx, clusterID)
if err != nil {
Expand Down
61 changes: 1 addition & 60 deletions castai/resource_aks_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,10 @@ package castai

import (
"context"
"errors"
"fmt"
"log"
"time"

"github.com/cenkalti/backoff/v4"
"github.com/hashicorp/terraform-plugin-sdk/v2/diag"
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema"
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/validation"
Expand Down Expand Up @@ -223,62 +221,5 @@ func updateAKSClusterSettings(ctx context.Context, data *schema.ResourceData, cl

req.Credentials = &credentials

// Retries are required for newly created IAM resources to initialise on Azure side.
b := backoff.WithContext(backoff.WithMaxRetries(backoff.NewConstantBackOff(10*time.Second), 30), ctx)
var lastErr error
var credentialsID string
if err = backoff.RetryNotify(func() error {
response, err := client.ExternalClusterAPIUpdateClusterWithResponse(ctx, data.Id(), req)
if err != nil {
return fmt.Errorf("error when calling update cluster API: %w", err)
}

err = sdk.StatusOk(response)

if err != nil {
// In case of malformed user request return error to user right away.
// Credentials error is omitted as permissions propagate eventually and sometimes aren't visible immediately.
if response.StatusCode() == 400 && !sdk.IsCredentialsError(response) {
return backoff.Permanent(err)
}

if response.StatusCode() == 400 && sdk.IsCredentialsError(response) {
log.Printf("[WARN] Received credentials error from backend, will retry in case the issue is caused by IAM eventual consistency.")
}
return fmt.Errorf("error in update cluster response: %w", err)
}

credentialsID = *response.JSON200.CredentialsId
return nil
}, b, func(err error, _ time.Duration) {
// Only store non-context errors so we can surface the last "real" error to the user at the end
if !errors.Is(err, context.DeadlineExceeded) && !errors.Is(err, context.Canceled) {
lastErr = err
}
log.Printf("[WARN] Encountered error while updating cluster settings, will retry: %v", err)
}); err != nil {
// Reset CredentialsID in state in case of failed updates.
// This is because TF will save the raw credentials in state even on failed updates.
// Since the raw values are not exposed via API, TF cannot see drift and will not try to re-apply them next time, leaving the caller stuck.
// Resetting this value here will trigger our credentialsID drift detection on Read() and force re-apply to fix the drift.
// Note: cannot use empty string; if first update failed then credentials will also be empty on remote => no drift on Read.
// Src: https://developer.hashicorp.com/terraform/plugin/framework/diagnostics#returning-errors-and-warnings
if err := data.Set(FieldClusterCredentialsId, "drift-protection-failed-update"); err != nil {
log.Printf("[ERROR] Failed to reset cluster credentials ID after failed update: %v", err)
}

if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
return fmt.Errorf("updating cluster configuration failed due to context: %w; last observed error was: %v", err, lastErr)
}
return fmt.Errorf("updating cluster configuration: %w", err)
}

// In case the update succeeded, we must update the state with the *generated* credentials_id before re-reading.
// This is because on update, the credentials_id always changes => read drift detection would see that and trigger infinite drift
err = data.Set(FieldClusterCredentialsId, credentialsID)
if err != nil {
return fmt.Errorf("failed to update credentials ID after successful update: %w", err)
}

return nil
return resourceCastaiClusterUpdate(ctx, client, data, &req)
}
90 changes: 83 additions & 7 deletions castai/resource_aks_cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"github.com/hashicorp/go-cty/cty"
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/acctest"
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/resource"
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema"
"github.com/hashicorp/terraform-plugin-sdk/v2/terraform"
"github.com/stretchr/testify/require"

Expand All @@ -22,7 +23,7 @@ import (
func TestAKSClusterResourceReadContext(t *testing.T) {
ctx := context.Background()

clusterId := "b6bfc074-a267-400f-b8f1-db0850c369b1"
clusterID := "b6bfc074-a267-400f-b8f1-db0850c369b1"

t.Run("read should populate data correctly", func(t *testing.T) {
r := require.New(t)
Expand Down Expand Up @@ -55,14 +56,14 @@ func TestAKSClusterResourceReadContext(t *testing.T) {
"private": true
}`)))
mockClient.EXPECT().
ExternalClusterAPIGetCluster(gomock.Any(), clusterId).
ExternalClusterAPIGetCluster(gomock.Any(), clusterID).
Return(&http.Response{StatusCode: 200, Body: body, Header: map[string][]string{"Content-Type": {"json"}}}, nil)

aksResource := resourceAKSCluster()

val := cty.ObjectVal(map[string]cty.Value{})
state := terraform.NewInstanceStateShimmedFromValue(val, 0)
state.ID = clusterId
state.ID = clusterID
// If local credentials don't match remote, drift detection would trigger.
// If local state has no credentials but remote has them, then the drift does exist so - there is separate test for that.
state.Attributes[FieldClusterCredentialsId] = "9b8d0456-177b-4a3d-b162-e68030d656aa"
Expand Down Expand Up @@ -115,14 +116,14 @@ Tainted = false

body := io.NopCloser(bytes.NewReader([]byte(fmt.Sprintf(`{"credentialsId": "%s"}`, tc.apiValue))))
mockClient.EXPECT().
ExternalClusterAPIGetCluster(gomock.Any(), clusterId).
ExternalClusterAPIGetCluster(gomock.Any(), clusterID).
Return(&http.Response{StatusCode: 200, Body: body, Header: map[string][]string{"Content-Type": {"json"}}}, nil)

aksResource := resourceAKSCluster()

val := cty.ObjectVal(map[string]cty.Value{})
state := terraform.NewInstanceStateShimmedFromValue(val, 0)
state.ID = clusterId
state.ID = clusterID
state.Attributes[FieldClusterCredentialsId] = tc.stateValue
state.Attributes[FieldAKSClusterClientID] = clientIDBeforeRead

Expand Down Expand Up @@ -171,14 +172,14 @@ Tainted = false

body := io.NopCloser(bytes.NewReader([]byte(fmt.Sprintf(`{"credentialsId": "%s"}`, tc.apiValue))))
mockClient.EXPECT().
ExternalClusterAPIGetCluster(gomock.Any(), clusterId).
ExternalClusterAPIGetCluster(gomock.Any(), clusterID).
Return(&http.Response{StatusCode: 200, Body: body, Header: map[string][]string{"Content-Type": {"json"}}}, nil)

aksResource := resourceAKSCluster()

val := cty.ObjectVal(map[string]cty.Value{})
state := terraform.NewInstanceStateShimmedFromValue(val, 0)
state.ID = clusterId
state.ID = clusterID
state.Attributes[FieldClusterCredentialsId] = tc.stateValue
state.Attributes[FieldAKSClusterClientID] = clientIDBeforeRead

Expand All @@ -196,6 +197,81 @@ Tainted = false
})
}

func TestAKSClusterResourceUpdateContext(t *testing.T) {
clusterID := "b6bfc074-a267-400f-b8f1-db0850c369b1"
ctx := context.Background()

t.Run("credentials_id special handling", func(t *testing.T) {
t.Run("on successful update, should avoid drift on the read", func(t *testing.T) {
r := require.New(t)
mockctrl := gomock.NewController(t)
mockClient := mock_sdk.NewMockClientInterface(mockctrl)
provider := &ProviderConfig{
api: &sdk.ClientWithResponses{
ClientInterface: mockClient,
},
}

credentialsIDAfterUpdate := "after-update-credentialsid"
clientID := "clientID"
updateResponse := io.NopCloser(bytes.NewReader([]byte(fmt.Sprintf(`{"credentialsId": "%s"}`, credentialsIDAfterUpdate))))
readResponse := io.NopCloser(bytes.NewReader([]byte(fmt.Sprintf(`{"credentialsId": "%s"}`, credentialsIDAfterUpdate))))
mockClient.EXPECT().
ExternalClusterAPIGetCluster(gomock.Any(), clusterID).
Return(&http.Response{StatusCode: 200, Body: readResponse, Header: map[string][]string{"Content-Type": {"json"}}}, nil)
mockClient.EXPECT().
ExternalClusterAPIUpdateCluster(gomock.Any(), clusterID, gomock.Any()).
Return(&http.Response{StatusCode: 200, Body: updateResponse, Header: map[string][]string{"Content-Type": {"json"}}}, nil)

aksResource := resourceAKSCluster()

diff := map[string]any{
FieldAKSClusterClientID: clientID,
FieldClusterCredentialsId: "before-update-credentialsid",
}
data := schema.TestResourceDataRaw(t, aksResource.Schema, diff)
data.SetId(clusterID)
diagnostics := aksResource.UpdateContext(ctx, data, provider)

r.Empty(diagnostics)

r.Equal(credentialsIDAfterUpdate, data.Get(FieldClusterCredentialsId))
r.Equal(clientID, data.Get(FieldAKSClusterClientID))
})

t.Run("on failed update, should overwrite credentialsID to force drift on next read", func(t *testing.T) {
r := require.New(t)
mockctrl := gomock.NewController(t)
mockClient := mock_sdk.NewMockClientInterface(mockctrl)
provider := &ProviderConfig{
api: &sdk.ClientWithResponses{
ClientInterface: mockClient,
},
}

mockClient.EXPECT().
ExternalClusterAPIUpdateCluster(gomock.Any(), clusterID, gomock.Any()).
Return(&http.Response{StatusCode: 400, Body: http.NoBody}, nil)

aksResource := resourceAKSCluster()

credentialsID := "credentialsID-before-updates"
diff := map[string]any{
FieldClusterCredentialsId: credentialsID,
}
data := schema.TestResourceDataRaw(t, aksResource.Schema, diff)
data.SetId(clusterID)
diagnostics := aksResource.UpdateContext(ctx, data, provider)

r.NotEmpty(diagnostics)

valueAfter := data.Get(FieldClusterCredentialsId)
r.NotEqual(credentialsID, valueAfter)
r.Contains(valueAfter, "drift")
})
})
}

func TestAccResourceAKSCluster(t *testing.T) {
rName := fmt.Sprintf("%v-aks-%v", ResourcePrefix, acctest.RandString(8))
resourceName := "castai_aks_cluster.test"
Expand Down
Loading

0 comments on commit bcc3f37

Please sign in to comment.