From d2fcc0347e816f31845d1592f646fa53a52da20d Mon Sep 17 00:00:00 2001 From: Zhihong Lin Date: Thu, 21 Nov 2024 16:12:09 -0500 Subject: [PATCH 1/6] [Bug Fix] Fix Excessive IMDS related error logging --- extension/entitystore/ec2Info.go | 48 +++++----------------- extension/entitystore/ec2Info_test.go | 35 ++++++++++++++++ extension/entitystore/extension.go | 8 +++- extension/entitystore/extension_test.go | 54 +++++++++++++++++++++++++ internal/retryer/imdsretryer.go | 10 ++++- 5 files changed, 114 insertions(+), 41 deletions(-) diff --git a/extension/entitystore/ec2Info.go b/extension/entitystore/ec2Info.go index cfb2eccba8..86904eda3a 100644 --- a/extension/entitystore/ec2Info.go +++ b/extension/entitystore/ec2Info.go @@ -32,7 +32,8 @@ type EC2Info struct { AutoScalingGroup string // region is used while making call to describeTags Ec2 API for AutoScalingGroup - Region string + Region string + kubernetesMode string metadataProvider ec2metadataprovider.MetadataProvider logger *zap.Logger @@ -48,8 +49,11 @@ func (ei *EC2Info) initEc2Info() { if err := ei.setInstanceIDAccountID(); err != nil { return } - if err := ei.setAutoScalingGroup(); err != nil { - return + // Instance metadata tags is not usable for EKS nodes + // https://github.com/kubernetes/cloud-provider-aws/issues/762 + if ei.kubernetesMode == "" { + limitedRetryer := NewRetryer(false, true, defaultJitterMin, defaultJitterMax, ec2tagger.BackoffSleepArray, maxRetry, ei.done, ei.logger) + limitedRetryer.refreshLoop(ei.retrieveAsgName) } ei.logger.Debug("Finished initializing EC2Info") } @@ -99,40 +103,6 @@ func (ei *EC2Info) setInstanceIDAccountID() error { } } -func (ei *EC2Info) setAutoScalingGroup() error { - retry := 0 - for { - var waitDuration time.Duration - if retry < len(ec2tagger.BackoffSleepArray) { - waitDuration = ec2tagger.BackoffSleepArray[retry] - } else { - waitDuration = ec2tagger.BackoffSleepArray[len(ec2tagger.BackoffSleepArray)-1] - } - - wait := time.NewTimer(waitDuration) - select { - case <-ei.done: - wait.Stop() - return errors.New("shutdown signal received") - case <-wait.C: - } - - if retry > 0 { - ei.logger.Debug("Initial retrieval of tags and volumes", zap.Int("retry", retry)) - } - - if err := ei.retrieveAsgName(); err != nil { - ei.logger.Debug("Unable to fetch instance tags with imds", zap.Int("retry", retry), zap.Error(err)) - } else { - ei.logger.Debug("Retrieval of auto-scaling group tags succeeded") - return nil - } - - retry++ - } - -} - func (ei *EC2Info) retrieveAsgName() error { tags, err := ei.metadataProvider.InstanceTags(context.Background()) if err != nil { @@ -142,6 +112,7 @@ func (ei *EC2Info) retrieveAsgName() error { asg, err := ei.metadataProvider.InstanceTagValue(context.Background(), ec2tagger.Ec2InstanceTagKeyASG) if err != nil { ei.logger.Error("Failed to get AutoScalingGroup through metadata provider", zap.Error(err)) + return err } else { ei.logger.Debug("AutoScalingGroup retrieved through IMDS") ei.mutex.Lock() @@ -156,9 +127,10 @@ func (ei *EC2Info) retrieveAsgName() error { return nil } -func newEC2Info(metadataProvider ec2metadataprovider.MetadataProvider, done chan struct{}, region string, logger *zap.Logger) *EC2Info { +func newEC2Info(metadataProvider ec2metadataprovider.MetadataProvider, kubernetesMode string, done chan struct{}, region string, logger *zap.Logger) *EC2Info { return &EC2Info{ metadataProvider: metadataProvider, + kubernetesMode: kubernetesMode, done: done, Region: region, logger: logger, diff --git a/extension/entitystore/ec2Info_test.go b/extension/entitystore/ec2Info_test.go index 6602752c5a..d216c39e32 100644 --- a/extension/entitystore/ec2Info_test.go +++ b/extension/entitystore/ec2Info_test.go @@ -5,6 +5,7 @@ package entitystore import ( "bytes" + "github.com/aws/amazon-cloudwatch-agent/translator/config" "log" "strings" "testing" @@ -236,3 +237,37 @@ func TestNotInitIfMetadataProviderIsEmpty(t *testing.T) { }) } } + +func TestNoASGRetrievalInKubernetesMode(t *testing.T) { + type args struct { + metadataProvider ec2metadataprovider.MetadataProvider + kubernetesMode string + } + tests := []struct { + name string + args args + wantErr bool + want EC2Info + }{ + { + name: "happy path", + args: args{ + metadataProvider: &mockMetadataProvider{InstanceIdentityDocument: mockedInstanceIdentityDoc, Tags: map[string]string{"aws:autoscaling:groupName": tagVal3}}, + kubernetesMode: config.ModeEKS, + }, + wantErr: false, + want: EC2Info{ + AutoScalingGroup: "", + }, + }, + } + for _, tt := range tests { + logger, _ := zap.NewDevelopment() + t.Run(tt.name, func(t *testing.T) { + ei := &EC2Info{metadataProvider: tt.args.metadataProvider, kubernetesMode: tt.args.kubernetesMode, logger: logger} + go ei.initEc2Info() + time.Sleep(3 * time.Second) + assert.Equal(t, tt.want.AutoScalingGroup, ei.GetAutoScalingGroup()) + }) + } +} diff --git a/extension/entitystore/extension.go b/extension/entitystore/extension.go index a6af693cb3..a486134507 100644 --- a/extension/entitystore/extension.go +++ b/extension/entitystore/extension.go @@ -94,9 +94,13 @@ func (e *EntityStore) Start(ctx context.Context, host component.Host) error { e.serviceprovider = newServiceProvider(e.mode, e.config.Region, &e.ec2Info, e.metadataprovider, getEC2Provider, ec2CredentialConfig, e.done, e.logger) switch e.mode { case config.ModeEC2: - e.ec2Info = *newEC2Info(e.metadataprovider, e.done, e.config.Region, e.logger) + e.ec2Info = *newEC2Info(e.metadataprovider, e.kubernetesMode, e.done, e.config.Region, e.logger) go e.ec2Info.initEc2Info() - go e.serviceprovider.startServiceProvider() + // Instance metadata tags is not usable for EKS nodes + // https://github.com/kubernetes/cloud-provider-aws/issues/762 + if e.kubernetesMode == "" { + go e.serviceprovider.startServiceProvider() + } } if e.kubernetesMode != "" { e.eksInfo = newEKSInfo(e.logger) diff --git a/extension/entitystore/extension_test.go b/extension/entitystore/extension_test.go index 5662cf1d88..2669b54cb4 100644 --- a/extension/entitystore/extension_test.go +++ b/extension/entitystore/extension_test.go @@ -29,6 +29,7 @@ import ( type mockServiceProvider struct { mock.Mock + started bool } // This helper function creates a test logger @@ -626,6 +627,59 @@ func TestEntityStore_LogMessageDoesNotIncludeResourceInfo(t *testing.T) { } } +func TestEntityStore_ServiceProviderInDifferentEnv(t *testing.T) { + type args struct { + metadataProvider ec2metadataprovider.MetadataProvider + mode string + kubernetesMode string + } + tests := []struct { + name string + args args + }{ + { + name: "EC2inEKS", + args: args{ + mode: config.ModeEC2, + kubernetesMode: config.ModeEKS, + }, + }, + { + name: "EC2Only", + args: args{ + mode: config.ModeEC2, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + esConfig := &Config{ + Mode: tt.args.mode, + KubernetesMode: tt.args.kubernetesMode, + } + getMetaDataProvider = mockMetadataProviderFunc + e := EntityStore{ + logger: zap.NewNop(), + config: esConfig, + } + e.Start(nil, nil) + time.Sleep(3 * time.Second) + + name, source := e.serviceprovider.getServiceNameAndSource() + if tt.args.mode == config.ModeEC2 && tt.args.kubernetesMode != "" { + assert.Equal(t, name, ServiceNameUnknown) + assert.Equal(t, source, ServiceNameSourceUnknown) + } else if tt.args.mode == config.ModeEC2 && tt.args.kubernetesMode == "" { + assert.Equal(t, name, "TestRole") + assert.Equal(t, source, ServiceNameSourceClientIamRole) + } + + }) + } + +} + func assertIfNonEmpty(t *testing.T, message string, pattern string) { if pattern != "" { assert.NotContains(t, message, pattern) diff --git a/internal/retryer/imdsretryer.go b/internal/retryer/imdsretryer.go index 29dec2976f..ab66b3d493 100644 --- a/internal/retryer/imdsretryer.go +++ b/internal/retryer/imdsretryer.go @@ -11,6 +11,7 @@ import ( "github.com/aws/aws-sdk-go/aws/awserr" "github.com/aws/aws-sdk-go/aws/client" "github.com/aws/aws-sdk-go/aws/request" + "github.com/influxdata/wlog" "github.com/aws/amazon-cloudwatch-agent/cfg/envconfig" ) @@ -43,7 +44,7 @@ func (r IMDSRetryer) ShouldRetry(req *request.Request) bool { if awsError, ok := req.Error.(awserr.Error); r.DefaultRetryer.ShouldRetry(req) || (ok && awsError != nil && awsError.Code() == "EC2MetadataError") { shouldRetry = true } - fmt.Printf("D! should retry %t for imds error : %v", shouldRetry, req.Error) + fmtDebugLog(wlog.LogLevel(), "D! should retry %t for imds error : %v", shouldRetry, req.Error) return shouldRetry } @@ -55,3 +56,10 @@ func GetDefaultRetryNumber() int { } return DefaultImdsRetries } + +// fmtDebugLog logs the content only if the log level is DEBUG +func fmtDebugLog(logLevel wlog.Level, format string, args ...interface{}) { + if logLevel == wlog.DEBUG { + fmt.Printf(format, args...) + } +} From fb911a1ba28e0d4525a6203cd6587dd93609f198 Mon Sep 17 00:00:00 2001 From: Zhihong Lin Date: Thu, 21 Nov 2024 16:14:12 -0500 Subject: [PATCH 2/6] Fix linter errors --- extension/entitystore/ec2Info_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extension/entitystore/ec2Info_test.go b/extension/entitystore/ec2Info_test.go index d216c39e32..3a0ccca43a 100644 --- a/extension/entitystore/ec2Info_test.go +++ b/extension/entitystore/ec2Info_test.go @@ -5,7 +5,6 @@ package entitystore import ( "bytes" - "github.com/aws/amazon-cloudwatch-agent/translator/config" "log" "strings" "testing" @@ -16,6 +15,7 @@ import ( "go.uber.org/zap" "github.com/aws/amazon-cloudwatch-agent/internal/ec2metadataprovider" + "github.com/aws/amazon-cloudwatch-agent/translator/config" ) var mockedInstanceIdentityDoc = &ec2metadata.EC2InstanceIdentityDocument{ From c3f343250a450c1120054203d30ff0b7e59de613 Mon Sep 17 00:00:00 2001 From: Zhihong Lin Date: Thu, 21 Nov 2024 16:37:00 -0500 Subject: [PATCH 3/6] Update test name and only retrieve ASG once --- extension/entitystore/ec2Info.go | 2 +- extension/entitystore/ec2Info_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/extension/entitystore/ec2Info.go b/extension/entitystore/ec2Info.go index 86904eda3a..417f718750 100644 --- a/extension/entitystore/ec2Info.go +++ b/extension/entitystore/ec2Info.go @@ -52,7 +52,7 @@ func (ei *EC2Info) initEc2Info() { // Instance metadata tags is not usable for EKS nodes // https://github.com/kubernetes/cloud-provider-aws/issues/762 if ei.kubernetesMode == "" { - limitedRetryer := NewRetryer(false, true, defaultJitterMin, defaultJitterMax, ec2tagger.BackoffSleepArray, maxRetry, ei.done, ei.logger) + limitedRetryer := NewRetryer(true, true, defaultJitterMin, defaultJitterMax, ec2tagger.BackoffSleepArray, maxRetry, ei.done, ei.logger) limitedRetryer.refreshLoop(ei.retrieveAsgName) } ei.logger.Debug("Finished initializing EC2Info") diff --git a/extension/entitystore/ec2Info_test.go b/extension/entitystore/ec2Info_test.go index 3a0ccca43a..1a7394d8a0 100644 --- a/extension/entitystore/ec2Info_test.go +++ b/extension/entitystore/ec2Info_test.go @@ -250,7 +250,7 @@ func TestNoASGRetrievalInKubernetesMode(t *testing.T) { want EC2Info }{ { - name: "happy path", + name: "EKSNoASGFromEC2Info", args: args{ metadataProvider: &mockMetadataProvider{InstanceIdentityDocument: mockedInstanceIdentityDoc, Tags: map[string]string{"aws:autoscaling:groupName": tagVal3}}, kubernetesMode: config.ModeEKS, From 023572828bf09786c54936b7963f29765ecc4b9e Mon Sep 17 00:00:00 2001 From: Zhihong Lin Date: Thu, 21 Nov 2024 17:01:06 -0500 Subject: [PATCH 4/6] Only disbale ec2info for EKS and clean up debug logging --- extension/entitystore/ec2Info.go | 3 ++- internal/retryer/imdsretryer.go | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/extension/entitystore/ec2Info.go b/extension/entitystore/ec2Info.go index 417f718750..1581d2afd2 100644 --- a/extension/entitystore/ec2Info.go +++ b/extension/entitystore/ec2Info.go @@ -14,6 +14,7 @@ import ( "github.com/aws/amazon-cloudwatch-agent/internal/ec2metadataprovider" "github.com/aws/amazon-cloudwatch-agent/plugins/processors/ec2tagger" + "github.com/aws/amazon-cloudwatch-agent/translator/config" ) const ( @@ -51,7 +52,7 @@ func (ei *EC2Info) initEc2Info() { } // Instance metadata tags is not usable for EKS nodes // https://github.com/kubernetes/cloud-provider-aws/issues/762 - if ei.kubernetesMode == "" { + if ei.kubernetesMode != config.ModeEKS { limitedRetryer := NewRetryer(true, true, defaultJitterMin, defaultJitterMax, ec2tagger.BackoffSleepArray, maxRetry, ei.done, ei.logger) limitedRetryer.refreshLoop(ei.retrieveAsgName) } diff --git a/internal/retryer/imdsretryer.go b/internal/retryer/imdsretryer.go index ab66b3d493..b8d81127df 100644 --- a/internal/retryer/imdsretryer.go +++ b/internal/retryer/imdsretryer.go @@ -44,7 +44,7 @@ func (r IMDSRetryer) ShouldRetry(req *request.Request) bool { if awsError, ok := req.Error.(awserr.Error); r.DefaultRetryer.ShouldRetry(req) || (ok && awsError != nil && awsError.Code() == "EC2MetadataError") { shouldRetry = true } - fmtDebugLog(wlog.LogLevel(), "D! should retry %t for imds error : %v", shouldRetry, req.Error) + fmtDebugLog("D! should retry %t for imds error : %v", shouldRetry, req.Error) return shouldRetry } @@ -58,8 +58,8 @@ func GetDefaultRetryNumber() int { } // fmtDebugLog logs the content only if the log level is DEBUG -func fmtDebugLog(logLevel wlog.Level, format string, args ...interface{}) { - if logLevel == wlog.DEBUG { +func fmtDebugLog(format string, args ...interface{}) { + if wlog.LogLevel() == wlog.DEBUG { fmt.Printf(format, args...) } } From 442a935f0c53455451c5479be4afac0c295871d7 Mon Sep 17 00:00:00 2001 From: Zhihong Lin Date: Thu, 21 Nov 2024 18:05:19 -0500 Subject: [PATCH 5/6] Remove excessive log lines and update log messages for clarity --- extension/entitystore/ec2Info.go | 2 +- extension/entitystore/retryer.go | 2 +- extension/entitystore/serviceprovider.go | 2 +- internal/retryer/imdsretryer.go | 9 --------- 4 files changed, 3 insertions(+), 12 deletions(-) diff --git a/extension/entitystore/ec2Info.go b/extension/entitystore/ec2Info.go index 1581d2afd2..e646fca78c 100644 --- a/extension/entitystore/ec2Info.go +++ b/extension/entitystore/ec2Info.go @@ -107,7 +107,7 @@ func (ei *EC2Info) setInstanceIDAccountID() error { func (ei *EC2Info) retrieveAsgName() error { tags, err := ei.metadataProvider.InstanceTags(context.Background()) if err != nil { - ei.logger.Debug("Failed to get tags through metadata provider", zap.Error(err)) + ei.logger.Debug("Failed to get AutoScalingGroup from instance tags. This is likely because instance tag is not enabled for IMDS but will not affect agent functionality.") return err } else if strings.Contains(tags, ec2tagger.Ec2InstanceTagKeyASG) { asg, err := ei.metadataProvider.InstanceTagValue(context.Background(), ec2tagger.Ec2InstanceTagKeyASG) diff --git a/extension/entitystore/retryer.go b/extension/entitystore/retryer.go index cefa06d374..65829f8970 100644 --- a/extension/entitystore/retryer.go +++ b/extension/entitystore/retryer.go @@ -77,7 +77,7 @@ func (r *Retryer) refreshLoop(updateFunc func() error) int { if err != nil { retry++ - r.logger.Debug("there was an error when retrieving service attribute.", zap.Error(err)) + r.logger.Debug("there was an issue when retrieving entity attributes but will not affect agent functionality", zap.Error(err)) } else { retry = 1 } diff --git a/extension/entitystore/serviceprovider.go b/extension/entitystore/serviceprovider.go index c65a0daf62..9f36dd9005 100644 --- a/extension/entitystore/serviceprovider.go +++ b/extension/entitystore/serviceprovider.go @@ -240,7 +240,7 @@ func (s *serviceprovider) scrapeIAMRole() error { func (s *serviceprovider) scrapeImdsServiceName() error { tags, err := s.metadataProvider.InstanceTags(context.Background()) if err != nil { - s.logger.Debug("Failed to get tags through metadata provider", zap.Error(err)) + s.logger.Debug("Failed to get service name from instance tags. This is likely because instance tag is not enabled for IMDS but will not affect agent functionality.") return err } // This will check whether the tags contains SERVICE, APPLICATION, APP, in that order. diff --git a/internal/retryer/imdsretryer.go b/internal/retryer/imdsretryer.go index b8d81127df..5a4322c479 100644 --- a/internal/retryer/imdsretryer.go +++ b/internal/retryer/imdsretryer.go @@ -11,7 +11,6 @@ import ( "github.com/aws/aws-sdk-go/aws/awserr" "github.com/aws/aws-sdk-go/aws/client" "github.com/aws/aws-sdk-go/aws/request" - "github.com/influxdata/wlog" "github.com/aws/amazon-cloudwatch-agent/cfg/envconfig" ) @@ -44,7 +43,6 @@ func (r IMDSRetryer) ShouldRetry(req *request.Request) bool { if awsError, ok := req.Error.(awserr.Error); r.DefaultRetryer.ShouldRetry(req) || (ok && awsError != nil && awsError.Code() == "EC2MetadataError") { shouldRetry = true } - fmtDebugLog("D! should retry %t for imds error : %v", shouldRetry, req.Error) return shouldRetry } @@ -56,10 +54,3 @@ func GetDefaultRetryNumber() int { } return DefaultImdsRetries } - -// fmtDebugLog logs the content only if the log level is DEBUG -func fmtDebugLog(format string, args ...interface{}) { - if wlog.LogLevel() == wlog.DEBUG { - fmt.Printf(format, args...) - } -} From 5b93fab426a4629f8e89ac9bc56e37a67f2fb3c2 Mon Sep 17 00:00:00 2001 From: Zhihong Lin Date: Wed, 4 Dec 2024 13:53:58 -0500 Subject: [PATCH 6/6] Fix linter errors --- extension/entitystore/ec2Info_test.go | 8 +++----- extension/entitystore/extension_test.go | 8 +++----- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/extension/entitystore/ec2Info_test.go b/extension/entitystore/ec2Info_test.go index 1a7394d8a0..9cc4efd896 100644 --- a/extension/entitystore/ec2Info_test.go +++ b/extension/entitystore/ec2Info_test.go @@ -247,7 +247,7 @@ func TestNoASGRetrievalInKubernetesMode(t *testing.T) { name string args args wantErr bool - want EC2Info + want string }{ { name: "EKSNoASGFromEC2Info", @@ -256,9 +256,7 @@ func TestNoASGRetrievalInKubernetesMode(t *testing.T) { kubernetesMode: config.ModeEKS, }, wantErr: false, - want: EC2Info{ - AutoScalingGroup: "", - }, + want: "", }, } for _, tt := range tests { @@ -267,7 +265,7 @@ func TestNoASGRetrievalInKubernetesMode(t *testing.T) { ei := &EC2Info{metadataProvider: tt.args.metadataProvider, kubernetesMode: tt.args.kubernetesMode, logger: logger} go ei.initEc2Info() time.Sleep(3 * time.Second) - assert.Equal(t, tt.want.AutoScalingGroup, ei.GetAutoScalingGroup()) + assert.Equal(t, tt.want, ei.GetAutoScalingGroup()) }) } } diff --git a/extension/entitystore/extension_test.go b/extension/entitystore/extension_test.go index 2669b54cb4..02cdff56d3 100644 --- a/extension/entitystore/extension_test.go +++ b/extension/entitystore/extension_test.go @@ -29,7 +29,6 @@ import ( type mockServiceProvider struct { mock.Mock - started bool } // This helper function creates a test logger @@ -629,9 +628,8 @@ func TestEntityStore_LogMessageDoesNotIncludeResourceInfo(t *testing.T) { func TestEntityStore_ServiceProviderInDifferentEnv(t *testing.T) { type args struct { - metadataProvider ec2metadataprovider.MetadataProvider - mode string - kubernetesMode string + mode string + kubernetesMode string } tests := []struct { name string @@ -663,7 +661,7 @@ func TestEntityStore_ServiceProviderInDifferentEnv(t *testing.T) { logger: zap.NewNop(), config: esConfig, } - e.Start(nil, nil) + e.Start(context.TODO(), nil) time.Sleep(3 * time.Second) name, source := e.serviceprovider.getServiceNameAndSource()