Skip to content

Commit

Permalink
Fix GPU E2E integ test (#1448)
Browse files — browse the repository at this point in the history
  • Loading branch information
movence authored Dec 3, 2024
1 parent 0d7b114 commit f3b333f
Showing 1 changed file with 12 additions and 11 deletions.
23 changes (12 additions & 11 deletions) in .github/workflows/integration-test.yml
Original file line number | Diff line number | Diff line change
Expand Up @@ -842,7 +842,7 @@ jobs:
uses: actions/cache@v3
with:
path: go.mod
key: ${{ matrix.arrays.terraform_dir }}-${{ matrix.arrays.k8s_version }}-${{ matrix.arrays.instanceType }}-${{ github.sha }}-${{ matrix.arrays.os }}-${{ matrix.arrays.test_dir }}
key: ${{ matrix.arrays.terraform_dir }}-${{ matrix.arrays.k8sVersion }}-${{ matrix.arrays.instanceType }}-${{ github.sha }}-${{ matrix.arrays.os }}-${{ matrix.arrays.test_dir }}

- name: Login ECR
id: login-ecr
Expand Down Expand Up @@ -874,7 +874,7 @@ jobs:
-var="cwagent_image_tag=${{ github.sha }}" \
-var="ami_type=${{ matrix.arrays.ami }}" \
-var="instance_type=${{ matrix.arrays.instanceType }}" \
-var="k8s_version=${{ matrix.arrays.k8s_version }}"; then
-var="k8s_version=${{ matrix.arrays.k8sVersion }}"; then
terraform destroy -auto-approve
else
terraform destroy -auto-approve && exit 1
Expand Down Expand Up @@ -1254,7 +1254,7 @@ jobs:

GPUEndToEndTest:
name: "GPU E2E Test"
needs: [ StartLocalStack, GenerateTestMatrix, OutputEnvVariables ]
needs: [ GenerateTestMatrix, OutputEnvVariables ]
runs-on: ubuntu-latest
strategy:
fail-fast: false
Expand Down Expand Up @@ -1292,28 +1292,29 @@ jobs:
terraform init
if terraform apply --auto-approve \
-var="beta=true" \
-var="addon_name=amazon-cloudwatch-observability" \
-var="addon_version=v1.6.0-eksbuild.1" \
-var="k8s_version=1.29" ; then
-var="ami_type=${{ matrix.arrays.ami }}" \
-var="instance_type=${{ matrix.arrays.instanceType }}" \
-var="k8s_version=${{ matrix.arrays.k8sVersion }}"; then
echo "Terraform apply successful."
# Capture the output
echo "Getting EKS cluster name"
EKS_CLUSTER_NAME=$(terraform output -raw eks_cluster_name)
echo "Cluster name is ${EKS_CLUSTER_NAME}"
kubectl apply -f ./gpuBurner.yaml
kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.15.0/deployments/static/nvidia-device-plugin.yml
kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.17.0/deployments/static/nvidia-device-plugin.yml
kubectl patch amazoncloudwatchagents -n amazon-cloudwatch cloudwatch-agent --type='json' -p='[{"op": "replace", "path": "/spec/image", "value": ${{ secrets.AWS_ECR_PRIVATE_REGISTRY }}/${{ env.ECR_INTEGRATION_TEST_REPO }}:${{ github.sha }}}]'
kubectl rollout status daemonset nvidia-device-plugin-daemonset -n kube-system --timeout 10s
kubectl apply -f ./gpuBurner.yaml
else
terraform destroy -var="beta=${{ github.event.inputs.run_in_beta }}" -auto-approve && exit 1
fi
- name: Run Go tests with retry
uses: nick-fields/retry@v2
with:
max_attempts: 10
max_attempts: 5
timeout_minutes: 60
retry_wait_seconds: 60
retry_wait_seconds: 30
command: |
if [ "${{ matrix.arrays.terraform_dir }}" != "" ]; then
cd "${{ matrix.arrays.terraform_dir }}"
Expand Down Expand Up @@ -1344,4 +1345,4 @@ jobs:
else
cd terraform/eks/addon/gpu
fi
terraform destroy --auto-approve
terraform destroy -var="beta=${{ github.event.inputs.run_in_beta }}" -auto-approve

0 comments on commit f3b333f

Please sign in to comment.