From a8ef6f778b075e3eaf51e033a15313edaca33d89 Mon Sep 17 00:00:00 2001 From: zhihonl <61301537+zhihonl@users.noreply.github.com> Date: Wed, 4 Dec 2024 15:39:07 -0500 Subject: [PATCH] [Bug Fix] Fix Excessive IMDS related error logging (#1440) --- extension/entitystore/ec2Info.go | 51 ++++++----------------- extension/entitystore/ec2Info_test.go | 33 +++++++++++++++ extension/entitystore/extension.go | 8 +++- extension/entitystore/extension_test.go | 52 ++++++++++++++++++++++++ extension/entitystore/retryer.go | 2 +- extension/entitystore/serviceprovider.go | 2 +- internal/retryer/imdsretryer.go | 1 - 7 files changed, 105 insertions(+), 44 deletions(-) diff --git a/extension/entitystore/ec2Info.go b/extension/entitystore/ec2Info.go index cfb2eccba8..e646fca78c 100644 --- a/extension/entitystore/ec2Info.go +++ b/extension/entitystore/ec2Info.go @@ -14,6 +14,7 @@ import ( "github.com/aws/amazon-cloudwatch-agent/internal/ec2metadataprovider" "github.com/aws/amazon-cloudwatch-agent/plugins/processors/ec2tagger" + "github.com/aws/amazon-cloudwatch-agent/translator/config" ) const ( @@ -32,7 +33,8 @@ type EC2Info struct { AutoScalingGroup string // region is used while making call to describeTags Ec2 API for AutoScalingGroup - Region string + Region string + kubernetesMode string metadataProvider ec2metadataprovider.MetadataProvider logger *zap.Logger @@ -48,8 +50,11 @@ func (ei *EC2Info) initEc2Info() { if err := ei.setInstanceIDAccountID(); err != nil { return } - if err := ei.setAutoScalingGroup(); err != nil { - return + // Instance metadata tags is not usable for EKS nodes + // https://github.com/kubernetes/cloud-provider-aws/issues/762 + if ei.kubernetesMode != config.ModeEKS { + limitedRetryer := NewRetryer(true, true, defaultJitterMin, defaultJitterMax, ec2tagger.BackoffSleepArray, maxRetry, ei.done, ei.logger) + limitedRetryer.refreshLoop(ei.retrieveAsgName) } ei.logger.Debug("Finished initializing EC2Info") } @@ -99,49 +104,16 @@ func (ei *EC2Info) setInstanceIDAccountID() error { } } -func (ei *EC2Info) setAutoScalingGroup() error { - retry := 0 - for { - var waitDuration time.Duration - if retry < len(ec2tagger.BackoffSleepArray) { - waitDuration = ec2tagger.BackoffSleepArray[retry] - } else { - waitDuration = ec2tagger.BackoffSleepArray[len(ec2tagger.BackoffSleepArray)-1] - } - - wait := time.NewTimer(waitDuration) - select { - case <-ei.done: - wait.Stop() - return errors.New("shutdown signal received") - case <-wait.C: - } - - if retry > 0 { - ei.logger.Debug("Initial retrieval of tags and volumes", zap.Int("retry", retry)) - } - - if err := ei.retrieveAsgName(); err != nil { - ei.logger.Debug("Unable to fetch instance tags with imds", zap.Int("retry", retry), zap.Error(err)) - } else { - ei.logger.Debug("Retrieval of auto-scaling group tags succeeded") - return nil - } - - retry++ - } - -} - func (ei *EC2Info) retrieveAsgName() error { tags, err := ei.metadataProvider.InstanceTags(context.Background()) if err != nil { - ei.logger.Debug("Failed to get tags through metadata provider", zap.Error(err)) + ei.logger.Debug("Failed to get AutoScalingGroup from instance tags. This is likely because instance tag is not enabled for IMDS but will not affect agent functionality.") return err } else if strings.Contains(tags, ec2tagger.Ec2InstanceTagKeyASG) { asg, err := ei.metadataProvider.InstanceTagValue(context.Background(), ec2tagger.Ec2InstanceTagKeyASG) if err != nil { ei.logger.Error("Failed to get AutoScalingGroup through metadata provider", zap.Error(err)) + return err } else { ei.logger.Debug("AutoScalingGroup retrieved through IMDS") ei.mutex.Lock() @@ -156,9 +128,10 @@ func (ei *EC2Info) retrieveAsgName() error { return nil } -func newEC2Info(metadataProvider ec2metadataprovider.MetadataProvider, done chan struct{}, region string, logger *zap.Logger) *EC2Info { +func newEC2Info(metadataProvider ec2metadataprovider.MetadataProvider, kubernetesMode string, done chan struct{}, region string, logger *zap.Logger) *EC2Info { return &EC2Info{ metadataProvider: metadataProvider, + kubernetesMode: kubernetesMode, done: done, Region: region, logger: logger, diff --git a/extension/entitystore/ec2Info_test.go b/extension/entitystore/ec2Info_test.go index 6602752c5a..9cc4efd896 100644 --- a/extension/entitystore/ec2Info_test.go +++ b/extension/entitystore/ec2Info_test.go @@ -15,6 +15,7 @@ import ( "go.uber.org/zap" "github.com/aws/amazon-cloudwatch-agent/internal/ec2metadataprovider" + "github.com/aws/amazon-cloudwatch-agent/translator/config" ) var mockedInstanceIdentityDoc = &ec2metadata.EC2InstanceIdentityDocument{ @@ -236,3 +237,35 @@ func TestNotInitIfMetadataProviderIsEmpty(t *testing.T) { }) } } + +func TestNoASGRetrievalInKubernetesMode(t *testing.T) { + type args struct { + metadataProvider ec2metadataprovider.MetadataProvider + kubernetesMode string + } + tests := []struct { + name string + args args + wantErr bool + want string + }{ + { + name: "EKSNoASGFromEC2Info", + args: args{ + metadataProvider: &mockMetadataProvider{InstanceIdentityDocument: mockedInstanceIdentityDoc, Tags: map[string]string{"aws:autoscaling:groupName": tagVal3}}, + kubernetesMode: config.ModeEKS, + }, + wantErr: false, + want: "", + }, + } + for _, tt := range tests { + logger, _ := zap.NewDevelopment() + t.Run(tt.name, func(t *testing.T) { + ei := &EC2Info{metadataProvider: tt.args.metadataProvider, kubernetesMode: tt.args.kubernetesMode, logger: logger} + go ei.initEc2Info() + time.Sleep(3 * time.Second) + assert.Equal(t, tt.want, ei.GetAutoScalingGroup()) + }) + } +} diff --git a/extension/entitystore/extension.go b/extension/entitystore/extension.go index a6af693cb3..a486134507 100644 --- a/extension/entitystore/extension.go +++ b/extension/entitystore/extension.go @@ -94,9 +94,13 @@ func (e *EntityStore) Start(ctx context.Context, host component.Host) error { e.serviceprovider = newServiceProvider(e.mode, e.config.Region, &e.ec2Info, e.metadataprovider, getEC2Provider, ec2CredentialConfig, e.done, e.logger) switch e.mode { case config.ModeEC2: - e.ec2Info = *newEC2Info(e.metadataprovider, e.done, e.config.Region, e.logger) + e.ec2Info = *newEC2Info(e.metadataprovider, e.kubernetesMode, e.done, e.config.Region, e.logger) go e.ec2Info.initEc2Info() - go e.serviceprovider.startServiceProvider() + // Instance metadata tags is not usable for EKS nodes + // https://github.com/kubernetes/cloud-provider-aws/issues/762 + if e.kubernetesMode == "" { + go e.serviceprovider.startServiceProvider() + } } if e.kubernetesMode != "" { e.eksInfo = newEKSInfo(e.logger) diff --git a/extension/entitystore/extension_test.go b/extension/entitystore/extension_test.go index 5662cf1d88..02cdff56d3 100644 --- a/extension/entitystore/extension_test.go +++ b/extension/entitystore/extension_test.go @@ -626,6 +626,58 @@ func TestEntityStore_LogMessageDoesNotIncludeResourceInfo(t *testing.T) { } } +func TestEntityStore_ServiceProviderInDifferentEnv(t *testing.T) { + type args struct { + mode string + kubernetesMode string + } + tests := []struct { + name string + args args + }{ + { + name: "EC2inEKS", + args: args{ + mode: config.ModeEC2, + kubernetesMode: config.ModeEKS, + }, + }, + { + name: "EC2Only", + args: args{ + mode: config.ModeEC2, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + esConfig := &Config{ + Mode: tt.args.mode, + KubernetesMode: tt.args.kubernetesMode, + } + getMetaDataProvider = mockMetadataProviderFunc + e := EntityStore{ + logger: zap.NewNop(), + config: esConfig, + } + e.Start(context.TODO(), nil) + time.Sleep(3 * time.Second) + + name, source := e.serviceprovider.getServiceNameAndSource() + if tt.args.mode == config.ModeEC2 && tt.args.kubernetesMode != "" { + assert.Equal(t, name, ServiceNameUnknown) + assert.Equal(t, source, ServiceNameSourceUnknown) + } else if tt.args.mode == config.ModeEC2 && tt.args.kubernetesMode == "" { + assert.Equal(t, name, "TestRole") + assert.Equal(t, source, ServiceNameSourceClientIamRole) + } + + }) + } + +} + func assertIfNonEmpty(t *testing.T, message string, pattern string) { if pattern != "" { assert.NotContains(t, message, pattern) diff --git a/extension/entitystore/retryer.go b/extension/entitystore/retryer.go index cefa06d374..65829f8970 100644 --- a/extension/entitystore/retryer.go +++ b/extension/entitystore/retryer.go @@ -77,7 +77,7 @@ func (r *Retryer) refreshLoop(updateFunc func() error) int { if err != nil { retry++ - r.logger.Debug("there was an error when retrieving service attribute.", zap.Error(err)) + r.logger.Debug("there was an issue when retrieving entity attributes but will not affect agent functionality", zap.Error(err)) } else { retry = 1 } diff --git a/extension/entitystore/serviceprovider.go b/extension/entitystore/serviceprovider.go index c65a0daf62..9f36dd9005 100644 --- a/extension/entitystore/serviceprovider.go +++ b/extension/entitystore/serviceprovider.go @@ -240,7 +240,7 @@ func (s *serviceprovider) scrapeIAMRole() error { func (s *serviceprovider) scrapeImdsServiceName() error { tags, err := s.metadataProvider.InstanceTags(context.Background()) if err != nil { - s.logger.Debug("Failed to get tags through metadata provider", zap.Error(err)) + s.logger.Debug("Failed to get service name from instance tags. This is likely because instance tag is not enabled for IMDS but will not affect agent functionality.") return err } // This will check whether the tags contains SERVICE, APPLICATION, APP, in that order. diff --git a/internal/retryer/imdsretryer.go b/internal/retryer/imdsretryer.go index 29dec2976f..5a4322c479 100644 --- a/internal/retryer/imdsretryer.go +++ b/internal/retryer/imdsretryer.go @@ -43,7 +43,6 @@ func (r IMDSRetryer) ShouldRetry(req *request.Request) bool { if awsError, ok := req.Error.(awserr.Error); r.DefaultRetryer.ShouldRetry(req) || (ok && awsError != nil && awsError.Code() == "EC2MetadataError") { shouldRetry = true } - fmt.Printf("D! should retry %t for imds error : %v", shouldRetry, req.Error) return shouldRetry }