From 8282b311b35f680018d22b88cceafb26f582125e Mon Sep 17 00:00:00 2001 From: Carter McKinnon Date: Tue, 17 Sep 2024 12:13:55 -0700 Subject: [PATCH] Assume instance exists within eventual-consistency grace period --- pkg/providers/v1/aws.go | 41 +++++++++++++++++++++---------- pkg/providers/v1/config/config.go | 9 ++++++- pkg/providers/v1/instances_v2.go | 17 ++++++++++++- 3 files changed, 52 insertions(+), 15 deletions(-) diff --git a/pkg/providers/v1/aws.go b/pkg/providers/v1/aws.go index 6b6c7084fe..55df3964a5 100644 --- a/pkg/providers/v1/aws.go +++ b/pkg/providers/v1/aws.go @@ -390,7 +390,8 @@ type Cloud struct { nodeInformer informercorev1.NodeInformer // Extract the function out to make it easier to test - nodeInformerHasSynced cache.InformerSynced + nodeInformerHasSynced cache.InformerSynced + nodeEventualConsistencyGracePeriod time.Duration eventBroadcaster record.EventBroadcaster eventRecorder record.EventRecorder @@ -600,13 +601,14 @@ func newAWSCloud2(cfg config.CloudConfig, awsServices Services, provider config. } awsCloud := &Cloud{ - ec2: ec2, - elb: elb, - elbv2: elbv2, - metadata: metadata, - kms: kms, - cfg: &cfg, - region: regionName, + ec2: ec2, + elb: elb, + elbv2: elbv2, + metadata: metadata, + kms: kms, + cfg: &cfg, + region: regionName, + nodeEventualConsistencyGracePeriod: cfg.Global.NodeEventualConsistencyGracePeriod, } awsCloud.instanceCache.cloud = awsCloud awsCloud.zoneCache.cloud = awsCloud @@ -869,9 +871,26 @@ func (c *Cloud) NodeAddressesByProviderID(ctx context.Context, providerID string return addresses, nil } -// InstanceExistsByProviderID returns true if the instance with the given provider id still exists. +// InstanceExistsByProviderID implements Instances.InstanceExistsByProviderID (v1) +// returns true if the instance with the given provider id still exists. // If false is returned with no error, the instance will be immediately deleted by the cloud controller manager. func (c *Cloud) InstanceExistsByProviderID(ctx context.Context, providerID string) (bool, error) { + exists, err := c.instanceExistsByProviderID(ctx, providerID) + if err != nil { + if IsAWSErrorInstanceNotFound(err) { + // we have to assume this means the instance was terminated a while ago + return false, nil + } + return false, err + } + return exists, nil +} + +// instanceExistsByProviderID returns true if the instance with the given provider id still exists. +// If error InvalidInstanceId.NotFound is returned, the caller should decide how to handle this. It could mean: +// a. the instance was launched recently and isn't found in the API due to eventual-consistency +// b. the instance was terminated a while ago, and no longer appears in the API with "terminated" status +func (c *Cloud) instanceExistsByProviderID(_ context.Context, providerID string) (bool, error) { instanceID, err := KubernetesInstanceID(providerID).MapToAWSInstanceID() if err != nil { return false, err @@ -887,10 +906,6 @@ func (c *Cloud) InstanceExistsByProviderID(ctx context.Context, providerID strin instances, err := c.ec2.DescribeInstances(request) if err != nil { - // if err is InstanceNotFound, return false with no error - if IsAWSErrorInstanceNotFound(err) { - return false, nil - } return false, err } if len(instances) == 0 { diff --git a/pkg/providers/v1/config/config.go b/pkg/providers/v1/config/config.go index ef6e371115..7a3764c0c4 100644 --- a/pkg/providers/v1/config/config.go +++ b/pkg/providers/v1/config/config.go @@ -2,8 +2,10 @@ package config import ( "fmt" - "github.com/aws/aws-sdk-go/aws/request" "strings" + "time" + + "github.com/aws/aws-sdk-go/aws/request" "github.com/aws/aws-sdk-go/aws/endpoints" @@ -62,6 +64,11 @@ type CloudConfig struct { // NodeIPFamilies determines which IP addresses are added to node objects and their ordering. NodeIPFamilies []string + + // NodeEventualConsistencyGracePeriod is used to account for propogation delays in the EC2 API. + // An instance may not appear in `ec2:DescribeInstances` output for a period of time after launch. + // The cloud-node-lifecycle-controller must not delete the Node prematurely in this case. + NodeEventualConsistencyGracePeriod time.Duration } // [ServiceOverride "1"] // Service = s3 diff --git a/pkg/providers/v1/instances_v2.go b/pkg/providers/v1/instances_v2.go index f7a08a2d16..881e70d5b5 100644 --- a/pkg/providers/v1/instances_v2.go +++ b/pkg/providers/v1/instances_v2.go @@ -22,7 +22,9 @@ package aws import ( "context" + "fmt" "strconv" + "time" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/types" @@ -50,7 +52,20 @@ func (c *Cloud) InstanceExists(ctx context.Context, node *v1.Node) (bool, error) return false, err } - return c.InstanceExistsByProviderID(ctx, providerID) + exists, err := c.instanceExistsByProviderID(ctx, providerID) + if err != nil { + if IsAWSErrorInstanceNotFound(err) { + if time.Since(node.CreationTimestamp.Time) < c.nodeEventualConsistencyGracePeriod { + // recently-launched EC2 instances may not appear in `ec2:DescribeInstances` + // we return an error if we're within the eventual-consistency grace period + // e.g. to cause the cloud-node-lifecycle-controller to ignore this node + return false, fmt.Errorf("node is within eventual-consistency grace period (%v): %v", c.nodeEventualConsistencyGracePeriod, err) + } + return false, nil + } + + } + return exists, nil } // InstanceShutdown returns true if the instance is shutdown according to the cloud provider.