Skip to content

Commit

Permalink
Workflow to add eks-addon gpu test (#1214)
Browse files Browse the repository at this point in the history
  • Loading branch information
Paramadon authored Jun 19, 2024
1 parent e0ce119 commit f095643
Showing 1 changed file with 100 additions and 1 deletion.
101 changes: 100 additions & 1 deletion .github/workflows/integration-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,8 @@ jobs:
eks_deployment_matrix: ${{ steps.set-matrix.outputs.eks_deployment_matrix }}
ec2_linux_itar_matrix: ${{ steps.set-matrix.outputs.ec2_linux_itar_matrix }}
ec2_linux_china_matrix: ${{ steps.set-matrix.outputs.ec2_linux_china_matrix }}
eks_addon_matrix: ${{ steps.set-matrix.outputs.eks_addon_matrix }}


steps:
- uses: actions/checkout@v3
Expand All @@ -142,6 +144,7 @@ jobs:
run: |
go run --tags=generator generator/test_case_generator.go
echo "::set-output name=ec2_gpu_matrix::$(echo $(cat generator/resources/ec2_gpu_complete_test_matrix.json))"
echo "::set-output name=eks_addon_matrix::$(echo $(cat generator/resources/eks_addon_complete_test_matrix.json))"
echo "::set-output name=ec2_linux_matrix::$(echo $(cat generator/resources/ec2_linux_complete_test_matrix.json))"
echo "::set-output name=ec2_windows_matrix::$(echo $(cat generator/resources/ec2_windows_complete_test_matrix.json))"
echo "::set-output name=ec2_mac_matrix::$(echo $(cat generator/resources/ec2_mac_complete_test_matrix.json))"
Expand All @@ -159,6 +162,7 @@ jobs:
- name: Echo test plan matrix
run: |
echo "ec2_gpu_matrix: ${{ steps.set-matrix.outputs.ec2_gpu_matrix }}"
echo "eks_addon_matrix: ${{ steps.set-matrix.outputs.eks_addon_matrix }}"
echo "ec2_linux_matrix: ${{ steps.set-matrix.outputs.ec2_linux_matrix }}"
echo "ec2_windows_matrix: ${{ steps.set-matrix.outputs.ec2_windows_matrix }}"
echo "ec2_mac_matrix: ${{ steps.set-matrix.outputs.ec2_mac_matrix }}"
Expand Down Expand Up @@ -1354,4 +1358,99 @@ jobs:
permissions:
id-token: write
contents: read
secrets: inherit
secrets: inherit


GPUEndToEndTest:
name: "GPU E2E Test"
needs: [ BuildAndUpload, StartLocalStack, GenerateTestMatrix, OutputEnvVariables ]
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
arrays: ${{ fromJson(needs.GenerateTestMatrix.outputs.eks_addon_matrix) }}
permissions:
id-token: write
contents: read
steps:
- uses: actions/checkout@v3
with:
repository: ${{env.CWA_GITHUB_TEST_REPO_NAME}}
ref: ${{env.CWA_GITHUB_TEST_REPO_BRANCH}}

- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v2
with:
role-to-assume: ${{ env.TERRAFORM_AWS_ASSUME_ROLE }}
aws-region: us-west-2
role-duration-seconds: ${{ env.TERRAFORM_AWS_ASSUME_ROLE_DURATION }}


- name: Verify Terraform version
run: terraform --version


- name: Terraform apply and setup
run: |
if [ "${{ matrix.arrays.terraform_dir }}" != "" ]; then
cd "${{ matrix.arrays.terraform_dir }}"
else
cd terraform/eks/addon/gpu
fi
terraform init
if terraform apply --auto-approve \
-var="beta=true" \
-var="addon_name=amazon-cloudwatch-observability" \
-var="addon_version=v1.6.0-eksbuild.1" \
-var="k8s_version=1.29" ; then
echo "Terraform apply successful."
# Capture the output
echo "Getting EKS cluster name"
EKS_CLUSTER_NAME=$(terraform output -raw eks_cluster_name)
echo "Cluster name is ${EKS_CLUSTER_NAME}"
kubectl apply -f ./gpuBurner.yaml
kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.15.0/deployments/static/nvidia-device-plugin.yml
kubectl patch amazoncloudwatchagents -n amazon-cloudwatch cloudwatch-agent --type='json' -p='[{"op": "replace", "path": "/spec/image", "value": ${{ secrets.AWS_ECR_PRIVATE_REGISTRY }}/${{ env.ECR_INTEGRATION_TEST_REPO }}:${{ github.sha }}}]'
else
terraform destroy -var="beta=${{ github.event.inputs.run_in_beta }}" -auto-approve && exit 1
fi
- name: Run Go tests with retry
uses: nick-fields/retry@v2
with:
max_attempts: 10
timeout_minutes: 60
retry_wait_seconds: 60
command: |
if [ "${{ matrix.arrays.terraform_dir }}" != "" ]; then
cd "${{ matrix.arrays.terraform_dir }}"
else
cd terraform/eks/addon/gpu
fi
echo "Getting EKS cluster name"
EKS_CLUSTER_NAME=$(terraform output -raw eks_cluster_name)
echo "Cluster name is ${EKS_CLUSTER_NAME}"
if go test ${{ matrix.arrays.test_dir }} -eksClusterName ${EKS_CLUSTER_NAME} -computeType=EKS -v -eksDeploymentStrategy=DAEMON -eksGpuType=nvidia -useE2EMetrics; then
echo "Tests passed"
else
echo "Tests failed"
exit 1
fi
- name: Terraform destroy
if: always()
uses: nick-fields/retry@v2
with:
max_attempts: 3
timeout_minutes: 8
retry_wait_seconds: 5
command: |
if [ "${{ matrix.arrays.terraform_dir }}" != "" ]; then
cd "${{ matrix.arrays.terraform_dir }}"
else
cd terraform/eks/addon/gpu
fi
terraform destroy --auto-approve

0 comments on commit f095643

Please sign in to comment.