From f3b333f219e8696716c46418a3857dba655b2879 Mon Sep 17 00:00:00 2001 From: Hyunsoo Kim <884273+movence@users.noreply.github.com> Date: Tue, 3 Dec 2024 12:40:41 -0500 Subject: [PATCH 1/4] Fix GPU E2E integ test (#1448) --- .github/workflows/integration-test.yml | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 35253ed4a1..6ad082a8a3 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -842,7 +842,7 @@ jobs: uses: actions/cache@v3 with: path: go.mod - key: ${{ matrix.arrays.terraform_dir }}-${{ matrix.arrays.k8s_version }}-${{ matrix.arrays.instanceType }}-${{ github.sha }}-${{ matrix.arrays.os }}-${{ matrix.arrays.test_dir }} + key: ${{ matrix.arrays.terraform_dir }}-${{ matrix.arrays.k8sVersion }}-${{ matrix.arrays.instanceType }}-${{ github.sha }}-${{ matrix.arrays.os }}-${{ matrix.arrays.test_dir }} - name: Login ECR id: login-ecr @@ -874,7 +874,7 @@ jobs: -var="cwagent_image_tag=${{ github.sha }}" \ -var="ami_type=${{ matrix.arrays.ami }}" \ -var="instance_type=${{ matrix.arrays.instanceType }}" \ - -var="k8s_version=${{ matrix.arrays.k8s_version }}"; then + -var="k8s_version=${{ matrix.arrays.k8sVersion }}"; then terraform destroy -auto-approve else terraform destroy -auto-approve && exit 1 @@ -1254,7 +1254,7 @@ jobs: GPUEndToEndTest: name: "GPU E2E Test" - needs: [ StartLocalStack, GenerateTestMatrix, OutputEnvVariables ] + needs: [ GenerateTestMatrix, OutputEnvVariables ] runs-on: ubuntu-latest strategy: fail-fast: false @@ -1292,18 +1292,19 @@ jobs: terraform init if terraform apply --auto-approve \ -var="beta=true" \ - -var="addon_name=amazon-cloudwatch-observability" \ - -var="addon_version=v1.6.0-eksbuild.1" \ - -var="k8s_version=1.29" ; then + -var="ami_type=${{ matrix.arrays.ami }}" \ + -var="instance_type=${{ matrix.arrays.instanceType }}" \ + -var="k8s_version=${{ matrix.arrays.k8sVersion }}"; then echo "Terraform apply successful." # Capture the output echo "Getting EKS cluster name" EKS_CLUSTER_NAME=$(terraform output -raw eks_cluster_name) echo "Cluster name is ${EKS_CLUSTER_NAME}" - kubectl apply -f ./gpuBurner.yaml - kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.15.0/deployments/static/nvidia-device-plugin.yml + kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.17.0/deployments/static/nvidia-device-plugin.yml kubectl patch amazoncloudwatchagents -n amazon-cloudwatch cloudwatch-agent --type='json' -p='[{"op": "replace", "path": "/spec/image", "value": ${{ secrets.AWS_ECR_PRIVATE_REGISTRY }}/${{ env.ECR_INTEGRATION_TEST_REPO }}:${{ github.sha }}}]' + kubectl rollout status daemonset nvidia-device-plugin-daemonset -n kube-system --timeout 10s + kubectl apply -f ./gpuBurner.yaml else terraform destroy -var="beta=${{ github.event.inputs.run_in_beta }}" -auto-approve && exit 1 fi @@ -1311,9 +1312,9 @@ jobs: - name: Run Go tests with retry uses: nick-fields/retry@v2 with: - max_attempts: 10 + max_attempts: 5 timeout_minutes: 60 - retry_wait_seconds: 60 + retry_wait_seconds: 30 command: | if [ "${{ matrix.arrays.terraform_dir }}" != "" ]; then cd "${{ matrix.arrays.terraform_dir }}" @@ -1344,4 +1345,4 @@ jobs: else cd terraform/eks/addon/gpu fi - terraform destroy --auto-approve + terraform destroy -var="beta=${{ github.event.inputs.run_in_beta }}" -auto-approve From 748073fcb5573953104f9dcdd7204b2f7af2153a Mon Sep 17 00:00:00 2001 From: Jeffrey Chien Date: Tue, 3 Dec 2024 15:05:32 -0500 Subject: [PATCH 2/4] Add data race test to PR build (#1450) --- .github/workflows/PR-build.yml | 22 +++++++++++++++++++++- Makefile | 17 +++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/.github/workflows/PR-build.yml b/.github/workflows/PR-build.yml index 03a15b6c5f..8b8fa7e878 100644 --- a/.github/workflows/PR-build.yml +++ b/.github/workflows/PR-build.yml @@ -78,7 +78,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ ubuntu-latest, windows-2019, windows-latest, macos-12] + os: [ubuntu-latest, windows-2019, windows-latest, macos-12] include: - os: ubuntu-latest family: linux @@ -138,3 +138,23 @@ jobs: - name: Build if: steps.cached_binaries.outputs.cache-hit != 'true' && needs.changes.outputs.build == 'true' run: make amazon-cloudwatch-agent-${{ matrix.family }} + + test-data-race: + needs: [lint, changes] + name: Test data race + runs-on: ubuntu-latest + steps: + - name: Set up Go 1.x + if: needs.changes.outputs.build == 'true' + uses: actions/setup-go@v4 + with: + go-version: ~1.22.2 + cache: false + + - name: Check out code + if: needs.changes.outputs.build == 'true' + uses: actions/checkout@v3 + + - name: Test data race + if: needs.changes.outputs.build == 'true' + run: make test-data-race diff --git a/Makefile b/Makefile index b958a0e276..03e22d2f20 100644 --- a/Makefile +++ b/Makefile @@ -201,6 +201,23 @@ lint: install-golangci-lint simple-lint test: CGO_ENABLED=0 go test -timeout 15m -coverprofile coverage.txt -failfast ./... +# List of existing packages with data races +# TODO: Fix each +PKG_WITH_DATA_RACE := extension/entitystore +PKG_WITH_DATA_RACE += extension/server +PKG_WITH_DATA_RACE += internal/publisher +PKG_WITH_DATA_RACE += internal/retryer +PKG_WITH_DATA_RACE += internal/tls +PKG_WITH_DATA_RACE += plugins/inputs/logfile +PKG_WITH_DATA_RACE += plugins/inputs/logfile/tail +PKG_WITH_DATA_RACE += plugins/outputs/cloudwatch +PKG_WITH_DATA_RACE += plugins/outputs/cloudwatchlogs +PKG_WITH_DATA_RACE += plugins/processors/awsapplicationsignals +PKG_WITH_DATA_RACE += plugins/processors/ec2tagger +PKG_WITH_DATA_RACE_PATTERN := $(shell echo '$(PKG_WITH_DATA_RACE)' | tr ' ' '|') +test-data-race: + CGO_ENABLED=1 go test -timeout 15m -race -parallel 4 $(shell go list ./... | grep -v -E '$(PKG_WITH_DATA_RACE_PATTERN)') + clean:: rm -rf release/ build/ rm -f CWAGENT_VERSION From b8531ce98cc05b0e83f228ec16c36c469f2c888c Mon Sep 17 00:00:00 2001 From: Lisa Guo Date: Tue, 3 Dec 2024 15:49:04 -0500 Subject: [PATCH 3/4] [CleanAMI] Ensure at least 1 AMI exists for each test image architecture (#1452) --- tool/clean/clean_ami/clean_ami.go | 84 ++++++++++++++++++++++--------- 1 file changed, 59 insertions(+), 25 deletions(-) diff --git a/tool/clean/clean_ami/clean_ami.go b/tool/clean/clean_ami/clean_ami.go index edfdc9c7b0..fb254ba1d6 100644 --- a/tool/clean/clean_ami/clean_ami.go +++ b/tool/clean/clean_ami/clean_ami.go @@ -12,7 +12,6 @@ import ( "fmt" "log" "sort" - "strings" "time" "github.com/aws/aws-sdk-go-v2/aws" @@ -24,6 +23,36 @@ import ( "github.com/aws/amazon-cloudwatch-agent/tool/clean" ) +// Image Prefixes are taken from checking the Image Builder Pipelines in us-west-2 +var imagePrefixes = []string{ + "cloudwatch-agent-integration-test-aarch64-al2023", + "cloudwatch-agent-integration-test-al2", + "cloudwatch-agent-integration-test-alma-linux-8", + "cloudwatch-agent-integration-test-alma-linux-9", + "cloudwatch-agent-integration-test-arm64-al2", + "cloudwatch-agent-integration-test-debian-11-arm64", + "cloudwatch-agent-integration-test-debian-12-arm64", + "cloudwatch-agent-integration-test-nvidia-gpu-al2", + "cloudwatch-agent-integration-test-ol7", + "cloudwatch-agent-integration-test-ol8", + "cloudwatch-agent-integration-test-ol9", + "cloudwatch-agent-integration-test-rocky-linux-8", + "cloudwatch-agent-integration-test-rocky-linux-9", + "cloudwatch-agent-integration-test-sles-15", + "cloudwatch-agent-integration-test-ubuntu-23", + "cloudwatch-agent-integration-test-ubuntu-24", + "cloudwatch-agent-integration-test-ubuntu", + "cloudwatch-agent-integration-test-ubuntu-LTS-22", + "cloudwatch-agent-integration-test-win-10", + "cloudwatch-agent-integration-test-win-11", + "cloudwatch-agent-integration-test-win-2016", + "cloudwatch-agent-integration-test-win-2019", + "cloudwatch-agent-integration-test-win-2022", + "cloudwatch-agent-integration-test-x86-al2023", + "cloudwatch-agent-integration-test-mac", + "cloudwatch-agent-integration-test-nvidia-gpu", +} + func main() { err := cleanAMIs() if err != nil { @@ -137,38 +166,43 @@ func cleanAMIs() error { } ec2client := ec2.NewFromConfig(defaultConfig) - // Get list of ami - nameFilter := types.Filter{Name: aws.String("name"), Values: []string{ - "cloudwatch-agent-integration-test*", - }} - - //get instances to delete - describeImagesInput := ec2.DescribeImagesInput{Filters: []types.Filter{nameFilter}} - describeImagesOutput, err := ec2client.DescribeImages(ctx, &describeImagesInput) - if err != nil { - return err - } - - var errList []error // stores a list of AMIs per each macos version/architecture macosImageAmiMap := make(map[string][]types.Image) - for _, image := range describeImagesOutput.Images { - if image.Name != nil && strings.HasPrefix(*image.Name, "cloudwatch-agent-integration-test-mac") { - // mac image - add it to the map and do nothing else for now - macosImageAmiMap[*image.Name] = append(macosImageAmiMap[*image.Name], image) - } else { - // non mac image - clean it if it's older than 60 days - cleanNonMacAMIs(ctx, ec2client, image, expirationDate, &errList) + // Cleanup for each AMI image type + var errList []error + for _, filter := range imagePrefixes { + nameFilter := types.Filter{Name: aws.String("name"), Values: []string{ + fmt.Sprintf("%s*", filter), + }} + + //get instances to delete + describeImagesInput := ec2.DescribeImagesInput{Filters: []types.Filter{nameFilter}} + describeImagesOutput, err := ec2client.DescribeImages(ctx, &describeImagesInput) + if err != nil { + log.Printf("Image filter %s returned an error, skipping :%v", filter, err.Error()) + continue + } + + log.Printf("%s: %d images found", filter, len(describeImagesOutput.Images)) + if len(describeImagesOutput.Images) <= 1 { + log.Printf("1 or less image found for filter %s, skipping", filter) + continue + } + + for _, image := range describeImagesOutput.Images { + if image.Name != nil && filter == "cloudwatch-agent-integration-test-mac" { + // mac image - add it to the map and do nothing else for now + macosImageAmiMap[*image.Name] = append(macosImageAmiMap[*image.Name], image) + } else { + // non mac image - clean it if it's older than 60 days + cleanNonMacAMIs(ctx, ec2client, image, expirationDate, &errList) + } } } // handle the mac AMIs cleanMacAMIs(ctx, ec2client, macosImageAmiMap, expirationDate, &errList) - if len(errList) != 0 { - return fmt.Errorf("%v", errList) - } - return nil } From 9af44776b6bce5b484f10d9f06f4853edd597aa3 Mon Sep 17 00:00:00 2001 From: Jeffrey Chien Date: Tue, 3 Dec 2024 17:52:08 -0500 Subject: [PATCH 4/4] Bumping macos runner due to deprecation of macos-12 (#1455) --- .github/workflows/PR-build.yml | 4 ++-- .github/workflows/test-build-packages.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/PR-build.yml b/.github/workflows/PR-build.yml index 8b8fa7e878..130ebff790 100644 --- a/.github/workflows/PR-build.yml +++ b/.github/workflows/PR-build.yml @@ -78,14 +78,14 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-latest, windows-2019, windows-latest, macos-12] + os: [ubuntu-latest, windows-2019, windows-latest, macos-13] include: - os: ubuntu-latest family: linux cache-path: | ~/.cache/go-build ~/go/pkg/mod - - os: macos-12 + - os: macos-13 family: darwin cache-path: | ~/Library/Caches/go-build diff --git a/.github/workflows/test-build-packages.yml b/.github/workflows/test-build-packages.yml index c2e4e3c1b9..e44ddb6c7f 100644 --- a/.github/workflows/test-build-packages.yml +++ b/.github/workflows/test-build-packages.yml @@ -62,7 +62,7 @@ on: jobs: MakeMacPkg: name: 'MakeMacPkg' - runs-on: macos-12 + runs-on: macos-13 permissions: id-token: write contents: read