Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Bug Fix] Fix Excessive IMDS related error logging #1440

Merged
merged 11 commits into from
Dec 4, 2024
48 changes: 10 additions & 38 deletions extension/entitystore/ec2Info.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ type EC2Info struct {
AutoScalingGroup string

// region is used while making call to describeTags Ec2 API for AutoScalingGroup
Region string
Region string
kubernetesMode string

metadataProvider ec2metadataprovider.MetadataProvider
logger *zap.Logger
Expand All @@ -48,8 +49,11 @@ func (ei *EC2Info) initEc2Info() {
if err := ei.setInstanceIDAccountID(); err != nil {
return
}
if err := ei.setAutoScalingGroup(); err != nil {
return
// Instance metadata tags is not usable for EKS nodes
// https://github.com/kubernetes/cloud-provider-aws/issues/762
if ei.kubernetesMode == "" {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we want to keep the retryer on native k8s on ec2 right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in new commit

limitedRetryer := NewRetryer(true, true, defaultJitterMin, defaultJitterMax, ec2tagger.BackoffSleepArray, maxRetry, ei.done, ei.logger)
limitedRetryer.refreshLoop(ei.retrieveAsgName)
}
ei.logger.Debug("Finished initializing EC2Info")
}
Expand Down Expand Up @@ -99,40 +103,6 @@ func (ei *EC2Info) setInstanceIDAccountID() error {
}
}

func (ei *EC2Info) setAutoScalingGroup() error {
zhihonl marked this conversation as resolved.
Show resolved Hide resolved
retry := 0
for {
var waitDuration time.Duration
if retry < len(ec2tagger.BackoffSleepArray) {
waitDuration = ec2tagger.BackoffSleepArray[retry]
} else {
waitDuration = ec2tagger.BackoffSleepArray[len(ec2tagger.BackoffSleepArray)-1]
}

wait := time.NewTimer(waitDuration)
select {
case <-ei.done:
wait.Stop()
return errors.New("shutdown signal received")
case <-wait.C:
}

if retry > 0 {
ei.logger.Debug("Initial retrieval of tags and volumes", zap.Int("retry", retry))
}

if err := ei.retrieveAsgName(); err != nil {
ei.logger.Debug("Unable to fetch instance tags with imds", zap.Int("retry", retry), zap.Error(err))
} else {
ei.logger.Debug("Retrieval of auto-scaling group tags succeeded")
return nil
}

retry++
}

}

func (ei *EC2Info) retrieveAsgName() error {
tags, err := ei.metadataProvider.InstanceTags(context.Background())
if err != nil {
Expand All @@ -142,6 +112,7 @@ func (ei *EC2Info) retrieveAsgName() error {
asg, err := ei.metadataProvider.InstanceTagValue(context.Background(), ec2tagger.Ec2InstanceTagKeyASG)
if err != nil {
ei.logger.Error("Failed to get AutoScalingGroup through metadata provider", zap.Error(err))
return err
} else {
ei.logger.Debug("AutoScalingGroup retrieved through IMDS")
ei.mutex.Lock()
Expand All @@ -156,9 +127,10 @@ func (ei *EC2Info) retrieveAsgName() error {
return nil
}

func newEC2Info(metadataProvider ec2metadataprovider.MetadataProvider, done chan struct{}, region string, logger *zap.Logger) *EC2Info {
func newEC2Info(metadataProvider ec2metadataprovider.MetadataProvider, kubernetesMode string, done chan struct{}, region string, logger *zap.Logger) *EC2Info {
return &EC2Info{
metadataProvider: metadataProvider,
kubernetesMode: kubernetesMode,
done: done,
Region: region,
logger: logger,
Expand Down
35 changes: 35 additions & 0 deletions extension/entitystore/ec2Info_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
"go.uber.org/zap"

"github.com/aws/amazon-cloudwatch-agent/internal/ec2metadataprovider"
"github.com/aws/amazon-cloudwatch-agent/translator/config"
)

var mockedInstanceIdentityDoc = &ec2metadata.EC2InstanceIdentityDocument{
Expand Down Expand Up @@ -236,3 +237,37 @@ func TestNotInitIfMetadataProviderIsEmpty(t *testing.T) {
})
}
}

func TestNoASGRetrievalInKubernetesMode(t *testing.T) {
type args struct {
metadataProvider ec2metadataprovider.MetadataProvider
kubernetesMode string
}
tests := []struct {
name string
args args
wantErr bool
want EC2Info
}{
{
name: "EKSNoASGFromEC2Info",
args: args{
metadataProvider: &mockMetadataProvider{InstanceIdentityDocument: mockedInstanceIdentityDoc, Tags: map[string]string{"aws:autoscaling:groupName": tagVal3}},
kubernetesMode: config.ModeEKS,
},
wantErr: false,
want: EC2Info{
AutoScalingGroup: "",
},
},
}
for _, tt := range tests {
logger, _ := zap.NewDevelopment()
t.Run(tt.name, func(t *testing.T) {
ei := &EC2Info{metadataProvider: tt.args.metadataProvider, kubernetesMode: tt.args.kubernetesMode, logger: logger}
go ei.initEc2Info()
time.Sleep(3 * time.Second)
assert.Equal(t, tt.want.AutoScalingGroup, ei.GetAutoScalingGroup())
})
}
}
8 changes: 6 additions & 2 deletions extension/entitystore/extension.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,13 @@ func (e *EntityStore) Start(ctx context.Context, host component.Host) error {
e.serviceprovider = newServiceProvider(e.mode, e.config.Region, &e.ec2Info, e.metadataprovider, getEC2Provider, ec2CredentialConfig, e.done, e.logger)
switch e.mode {
case config.ModeEC2:
e.ec2Info = *newEC2Info(e.metadataprovider, e.done, e.config.Region, e.logger)
e.ec2Info = *newEC2Info(e.metadataprovider, e.kubernetesMode, e.done, e.config.Region, e.logger)
okankoAMZ marked this conversation as resolved.
Show resolved Hide resolved
go e.ec2Info.initEc2Info()
go e.serviceprovider.startServiceProvider()
// Instance metadata tags is not usable for EKS nodes
// https://github.com/kubernetes/cloud-provider-aws/issues/762
if e.kubernetesMode == "" {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This may seem drastic but based on the codebase serviceprovider isn't really used in Kubernetes scenario. It always gets overriden by either Instrumentation or K8sWorkload service name source.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

similar to the above -- shouldn't this be if e.kubernetesMode != modeEKS ? So that we can account for native k8s?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I disabled it for all kubernetes environment because serviceprovider isn't used for any Kubernetes related logics. GetMetricServiceNameAndSource is the only public function that is used for awsentity processor and that is only referenced in EC2 specific processor logic:

entityServiceName, entityServiceNameSource = getServiceNameSource()

go e.serviceprovider.startServiceProvider()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does e.serviceprovider.startServiceProvider() do anything beneficial on EKS? Seems strange that we would have to do this just to avoid the IMDS check

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think so. startServiceProvider scrapes IAM role or service name from instance tags so they can be used for service names. In Kubernetes environment neither of these will be used because they have lower priority than Instrumentation or K8sWorkload.

Code wise, I only see serviceprovider public function called in awsentity processor if we are strictly on EC2:

entityServiceName, entityServiceNameSource = getServiceNameSource()

}
}
if e.kubernetesMode != "" {
e.eksInfo = newEKSInfo(e.logger)
Expand Down
54 changes: 54 additions & 0 deletions extension/entitystore/extension_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (

type mockServiceProvider struct {
mock.Mock
started bool
}

// This helper function creates a test logger
Expand Down Expand Up @@ -626,6 +627,59 @@ func TestEntityStore_LogMessageDoesNotIncludeResourceInfo(t *testing.T) {
}
}

func TestEntityStore_ServiceProviderInDifferentEnv(t *testing.T) {
type args struct {
metadataProvider ec2metadataprovider.MetadataProvider
mode string
kubernetesMode string
}
tests := []struct {
name string
args args
}{
{
name: "EC2inEKS",
args: args{
mode: config.ModeEC2,
kubernetesMode: config.ModeEKS,
},
},
{
name: "EC2Only",
args: args{
mode: config.ModeEC2,
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {

esConfig := &Config{
Mode: tt.args.mode,
KubernetesMode: tt.args.kubernetesMode,
}
getMetaDataProvider = mockMetadataProviderFunc
e := EntityStore{
logger: zap.NewNop(),
config: esConfig,
}
e.Start(nil, nil)
time.Sleep(3 * time.Second)

name, source := e.serviceprovider.getServiceNameAndSource()
if tt.args.mode == config.ModeEC2 && tt.args.kubernetesMode != "" {
assert.Equal(t, name, ServiceNameUnknown)
assert.Equal(t, source, ServiceNameSourceUnknown)
} else if tt.args.mode == config.ModeEC2 && tt.args.kubernetesMode == "" {
assert.Equal(t, name, "TestRole")
assert.Equal(t, source, ServiceNameSourceClientIamRole)
}

})
}

}

func assertIfNonEmpty(t *testing.T, message string, pattern string) {
if pattern != "" {
assert.NotContains(t, message, pattern)
Expand Down
10 changes: 9 additions & 1 deletion internal/retryer/imdsretryer.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"github.com/aws/aws-sdk-go/aws/awserr"
"github.com/aws/aws-sdk-go/aws/client"
"github.com/aws/aws-sdk-go/aws/request"
"github.com/influxdata/wlog"

"github.com/aws/amazon-cloudwatch-agent/cfg/envconfig"
)
Expand Down Expand Up @@ -43,7 +44,7 @@ func (r IMDSRetryer) ShouldRetry(req *request.Request) bool {
if awsError, ok := req.Error.(awserr.Error); r.DefaultRetryer.ShouldRetry(req) || (ok && awsError != nil && awsError.Code() == "EC2MetadataError") {
shouldRetry = true
}
fmt.Printf("D! should retry %t for imds error : %v", shouldRetry, req.Error)
fmtDebugLog(wlog.LogLevel(), "D! should retry %t for imds error : %v", shouldRetry, req.Error)
return shouldRetry
}

Expand All @@ -55,3 +56,10 @@ func GetDefaultRetryNumber() int {
}
return DefaultImdsRetries
}

// fmtDebugLog logs the content only if the log level is DEBUG
lisguo marked this conversation as resolved.
Show resolved Hide resolved
func fmtDebugLog(logLevel wlog.Level, format string, args ...interface{}) {
if logLevel == wlog.DEBUG {
fmt.Printf(format, args...)
}
}
Loading