Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CSU-2424: AKS drift detection improvements #422

Merged
merged 6 commits into from
Nov 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 58 additions & 6 deletions castai/resource_aks_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,14 @@ func resourceCastaiAKSClusterRead(ctx context.Context, data *schema.ResourceData
return nil
}

if resp.JSON200.CredentialsId != nil && *resp.JSON200.CredentialsId != data.Get(FieldClusterCredentialsId) {
log.Printf("[WARN] Drift in credentials from state (%q) and in API (%q), resetting client ID to force re-applying credentials from configuration",
data.Get(FieldClusterCredentialsId), *resp.JSON200.CredentialsId)
if err := data.Set(FieldAKSClusterClientID, "credentials-drift-detected-force-apply"); err != nil {
return diag.FromErr(fmt.Errorf("setting client ID: %w", err))
}
}

if err := data.Set(FieldClusterCredentialsId, *resp.JSON200.CredentialsId); err != nil {
return diag.FromErr(fmt.Errorf("setting credentials: %w", err))
}
Expand Down Expand Up @@ -193,6 +201,7 @@ func updateAKSClusterSettings(ctx context.Context, data *schema.ResourceData, cl
FieldAKSClusterClientSecret,
FieldAKSClusterTenantID,
FieldAKSClusterSubscriptionID,
FieldClusterCredentialsId,
) {
log.Printf("[INFO] Nothing to update in cluster setttings.")
return nil
Expand All @@ -217,16 +226,59 @@ func updateAKSClusterSettings(ctx context.Context, data *schema.ResourceData, cl
// Retries are required for newly created IAM resources to initialise on Azure side.
b := backoff.WithContext(backoff.WithMaxRetries(backoff.NewConstantBackOff(10*time.Second), 30), ctx)
var lastErr error
if err = backoff.Retry(func() error {
var credentialsID string
if err = backoff.RetryNotify(func() error {
response, err := client.ExternalClusterAPIUpdateClusterWithResponse(ctx, data.Id(), req)
lastErr = err
return sdk.CheckOKResponse(response, err)
}, b); err != nil {
if errors.Is(err, context.DeadlineExceeded) && lastErr != nil {
return fmt.Errorf("updating cluster configuration: %w: %v", err, lastErr)
if err != nil {
return fmt.Errorf("error when calling update cluster API: %w", err)
}

err = sdk.StatusOk(response)

if err != nil {
// In case of malformed user request return error to user right away.
// Credentials error is omitted as permissions propagate eventually and sometimes aren't visible immediately.
if response.StatusCode() == 400 && !sdk.IsCredentialsError(response) {
return backoff.Permanent(err)
}

if response.StatusCode() == 400 && sdk.IsCredentialsError(response) {
log.Printf("[WARN] Received credentials error from backend, will retry in case the issue is caused by IAM eventual consistency.")
}
return fmt.Errorf("error in update cluster response: %w", err)
}

credentialsID = *response.JSON200.CredentialsId
return nil
}, b, func(err error, _ time.Duration) {
// Only store non-context errors so we can surface the last "real" error to the user at the end
if !errors.Is(err, context.DeadlineExceeded) && !errors.Is(err, context.Canceled) {
lastErr = err
}
log.Printf("[WARN] Encountered error while updating cluster settings, will retry: %v", err)
}); err != nil {
// Reset CredentialsID in state in case of failed updates.
// This is because TF will save the raw credentials in state even on failed updates.
// Since the raw values are not exposed via API, TF cannot see drift and will not try to re-apply them next time, leaving the caller stuck.
// Resetting this value here will trigger our credentialsID drift detection on Read() and force re-apply to fix the drift.
// Note: cannot use empty string; if first update failed then credentials will also be empty on remote => no drift on Read.
// Src: https://developer.hashicorp.com/terraform/plugin/framework/diagnostics#returning-errors-and-warnings
if err := data.Set(FieldClusterCredentialsId, "drift-protection-failed-update"); err != nil {
log.Printf("[ERROR] Failed to reset cluster credentials ID after failed update: %v", err)
}

if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
return fmt.Errorf("updating cluster configuration failed due to context: %w; last observed error was: %v", err, lastErr)
}
return fmt.Errorf("updating cluster configuration: %w", err)
}

// In case the update succeeded, we must update the state with the *generated* credentials_id before re-reading.
// This is because on update, the credentials_id always changes => read drift detection would see that and trigger infinite drift
err = data.Set(FieldClusterCredentialsId, credentialsID)
if err != nil {
return fmt.Errorf("failed to update credentials ID after successful update: %w", err)
}

return nil
}
210 changes: 166 additions & 44 deletions castai/resource_aks_cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,58 +20,180 @@ import (
)

func TestAKSClusterResourceReadContext(t *testing.T) {
r := require.New(t)
mockctrl := gomock.NewController(t)
mockClient := mock_sdk.NewMockClientInterface(mockctrl)

ctx := context.Background()
provider := &ProviderConfig{
api: &sdk.ClientWithResponses{
ClientInterface: mockClient,
},
}

clusterId := "b6bfc074-a267-400f-b8f1-db0850c369b1"

body := io.NopCloser(bytes.NewReader([]byte(`{
"id": "b6bfc074-a267-400f-b8f1-db0850c369b1",
"name": "aks-cluster",
"organizationId": "2836f775-aaaa-eeee-bbbb-3d3c29512692",
"credentialsId": "9b8d0456-177b-4a3d-b162-e68030d656aa",
"createdAt": "2022-01-27T19:03:31.570829Z",
"status": "ready",
"agentSnapshotReceivedAt": "2022-03-21T10:33:56.192020Z",
"agentStatus": "online",
"providerType": "aks",
"aks": {
"maxPodsPerNode": 100,
"networkPlugin": "calico",
"nodeResourceGroup": "ng",
"region": "westeurope",
"subscriptionId": "subID"
},
"clusterNameId": "aks-cluster-b6bfc074",
"private": true
}`)))
mockClient.EXPECT().
ExternalClusterAPIGetCluster(gomock.Any(), clusterId).
Return(&http.Response{StatusCode: 200, Body: body, Header: map[string][]string{"Content-Type": {"json"}}}, nil)

aksResource := resourceAKSCluster()

val := cty.ObjectVal(map[string]cty.Value{})
state := terraform.NewInstanceStateShimmedFromValue(val, 0)
state.ID = clusterId

data := aksResource.Data(state)
result := aksResource.ReadContext(ctx, data, provider)
r.Nil(result)
r.False(result.HasError())
r.Equal(`ID = b6bfc074-a267-400f-b8f1-db0850c369b1
t.Run("read should populate data correctly", func(t *testing.T) {
r := require.New(t)
mockctrl := gomock.NewController(t)
mockClient := mock_sdk.NewMockClientInterface(mockctrl)
provider := &ProviderConfig{
api: &sdk.ClientWithResponses{
ClientInterface: mockClient,
},
}

body := io.NopCloser(bytes.NewReader([]byte(`{
"id": "b6bfc074-a267-400f-b8f1-db0850c369b1",
"name": "aks-cluster",
"organizationId": "2836f775-aaaa-eeee-bbbb-3d3c29512692",
"credentialsId": "9b8d0456-177b-4a3d-b162-e68030d656aa",
"createdAt": "2022-01-27T19:03:31.570829Z",
"status": "ready",
"agentSnapshotReceivedAt": "2022-03-21T10:33:56.192020Z",
"agentStatus": "online",
"providerType": "aks",
"aks": {
"maxPodsPerNode": 100,
"networkPlugin": "calico",
"nodeResourceGroup": "ng",
"region": "westeurope",
"subscriptionId": "subID"
},
"clusterNameId": "aks-cluster-b6bfc074",
"private": true
}`)))
mockClient.EXPECT().
ExternalClusterAPIGetCluster(gomock.Any(), clusterId).
Return(&http.Response{StatusCode: 200, Body: body, Header: map[string][]string{"Content-Type": {"json"}}}, nil)

aksResource := resourceAKSCluster()

val := cty.ObjectVal(map[string]cty.Value{})
state := terraform.NewInstanceStateShimmedFromValue(val, 0)
state.ID = clusterId
// If local credentials don't match remote, drift detection would trigger.
// If local state has no credentials but remote has them, then the drift does exist so - there is separate test for that.
state.Attributes[FieldClusterCredentialsId] = "9b8d0456-177b-4a3d-b162-e68030d656aa"

data := aksResource.Data(state)
result := aksResource.ReadContext(ctx, data, provider)
r.Nil(result)
r.False(result.HasError())
r.Equal(`ID = b6bfc074-a267-400f-b8f1-db0850c369b1
credentials_id = 9b8d0456-177b-4a3d-b162-e68030d656aa
region = westeurope
Tainted = false
`, data.State().String())
})

t.Run("on credentials drift, changes client_id to trigger drift and re-apply", func(t *testing.T) {
testCase := []struct {
name string
stateValue string
apiValue string
}{
{
name: "empty credentials in remote",
stateValue: "credentials-id-local",
apiValue: "",
},
{
name: "different credentials in remote",
stateValue: "credentials-id-local",
apiValue: "credentials-id-remote",
},
{
name: "empty credentials in local but exist in remote",
stateValue: "",
apiValue: "credentials-id-remote",
},
}

for _, tc := range testCase {
t.Run(tc.name, func(t *testing.T) {
r := require.New(t)
mockctrl := gomock.NewController(t)
mockClient := mock_sdk.NewMockClientInterface(mockctrl)
provider := &ProviderConfig{
api: &sdk.ClientWithResponses{
ClientInterface: mockClient,
},
}
clientIDBeforeRead := "dummy-client-id"

body := io.NopCloser(bytes.NewReader([]byte(fmt.Sprintf(`{"credentialsId": "%s"}`, tc.apiValue))))
mockClient.EXPECT().
ExternalClusterAPIGetCluster(gomock.Any(), clusterId).
Return(&http.Response{StatusCode: 200, Body: body, Header: map[string][]string{"Content-Type": {"json"}}}, nil)

aksResource := resourceAKSCluster()

val := cty.ObjectVal(map[string]cty.Value{})
state := terraform.NewInstanceStateShimmedFromValue(val, 0)
state.ID = clusterId
state.Attributes[FieldClusterCredentialsId] = tc.stateValue
state.Attributes[FieldAKSClusterClientID] = clientIDBeforeRead

data := aksResource.Data(state)
result := aksResource.ReadContext(ctx, data, provider)
r.Nil(result)
r.False(result.HasError())

clientIDAfterRead := data.Get(FieldAKSClusterClientID)

r.NotEqual(clientIDBeforeRead, clientIDAfterRead)
r.NotEmpty(clientIDAfterRead)
})
}
})

t.Run("when credentials match, no drift should be triggered", func(t *testing.T) {
testCase := []struct {
name string
stateValue string
apiValue string
}{
{
name: "empty credentials in both",
stateValue: "",
apiValue: "",
},
{
name: "matching credentials",
stateValue: "credentials-id",
apiValue: "credentials-id",
},
}

for _, tc := range testCase {
t.Run(tc.name, func(t *testing.T) {
r := require.New(t)
mockctrl := gomock.NewController(t)
mockClient := mock_sdk.NewMockClientInterface(mockctrl)
provider := &ProviderConfig{
api: &sdk.ClientWithResponses{
ClientInterface: mockClient,
},
}
clientIDBeforeRead := "dummy-client-id"

body := io.NopCloser(bytes.NewReader([]byte(fmt.Sprintf(`{"credentialsId": "%s"}`, tc.apiValue))))
mockClient.EXPECT().
ExternalClusterAPIGetCluster(gomock.Any(), clusterId).
Return(&http.Response{StatusCode: 200, Body: body, Header: map[string][]string{"Content-Type": {"json"}}}, nil)

aksResource := resourceAKSCluster()

val := cty.ObjectVal(map[string]cty.Value{})
state := terraform.NewInstanceStateShimmedFromValue(val, 0)
state.ID = clusterId
state.Attributes[FieldClusterCredentialsId] = tc.stateValue
state.Attributes[FieldAKSClusterClientID] = clientIDBeforeRead

data := aksResource.Data(state)
result := aksResource.ReadContext(ctx, data, provider)
r.Nil(result)
r.False(result.HasError())

clientIDAfterRead := data.Get(FieldAKSClusterClientID)

r.Equal(clientIDBeforeRead, clientIDAfterRead)
r.NotEmpty(clientIDAfterRead)
})
}
})
}

func TestAccResourceAKSCluster(t *testing.T) {
Expand Down
Loading