Add Retry Mechanism to E2E EKS Terraform Deployment (#634)

* Add Retry Mechanism to E2E EKS Terraform Deployment * Add Extra Comments * Call Test APIs First before Validation * Add clean-app-signals to retry logic * Change App Signal Download Directory and modify if statement for validation * Modify while loop and refactor code
aws-observability · Dec 13, 2023 · 6a85d5f · 6a85d5f
1 parent 4f21215
commit 6a85d5f
Showing 1 changed file with 111 additions and 54 deletions.
diff --git a/.github/workflows/appsignals-e2e-eks-test.yml b/.github/workflows/appsignals-e2e-eks-test.yml
@@ -83,45 +83,116 @@ jobs:
         with:
           terraform_wrapper: false
 
-      - name: Deploy sample app via terraform
-        working-directory: testing/terraform/eks
-        run: |
-          terraform init
-          terraform validate
-          terraform apply -auto-approve \
-            -var="test_id=${{ env.TESTING_ID }}" \
-            -var="aws_region=${{ env.AWS_DEFAULT_REGION }}" \
-            -var="kube_directory_path=${{ github.workspace }}/.kube" \
-            -var="eks_cluster_name=${{ inputs.test-cluster-name }}" \
-            -var="eks_cluster_context_name=$(kubectl config current-context)" \
-            -var="test_namespace=${{ env.SAMPLE_APP_NAMESPACE }}" \
-            -var="service_account_aws_access=service-account-${{ env.TESTING_ID }}" \
-            -var="sample_app_image=${{ env.SAMPLE_APP_FRONTEND_SERVICE_IMAGE }}" \
-            -var="sample_remote_app_image=${{ env.SAMPLE_APP_REMOTE_SERVICE_IMAGE }}"
-
       # Enable App Signals on the test cluster
       - name: Pull and unzip enablement script from S3
+        working-directory: testing/terraform/eks
         run: aws s3 cp ${{ env.ENABLEMENT_SCRIPT_S3_BUCKET }} . && unzip -j onboarding.zip
 
       - name: Change ADOT image if main-build
         if: inputs.caller-workflow-name == 'main-build'
+        # The enablement bundle is unzipped into testing/terraform/eks by the previous step,
+        # so instrumentation.yaml has to be patched from that directory as well.
+        working-directory: testing/terraform/eks
         run: "sed -i 's#image:.*#image: ${{ inputs.appsignals-adot-image-name }}#g' instrumentation.yaml"
 
-      - name: Enable App Signals
+      - name: Remove log group deletion command
+        if: always()
+        working-directory: testing/terraform/eks
         run: |
-          ./enable-app-signals.sh \
-          ${{ inputs.test-cluster-name }} \
-          ${{ env.AWS_DEFAULT_REGION }} \
-          ${{ env.SAMPLE_APP_NAMESPACE }}
-
-      # Application pods need to be restarted for the
-      # app signals instrumentation to take effect
-      - name: Restart the app pods
-        run: kubectl delete pods --all -n ${{ env.SAMPLE_APP_NAMESPACE }}
+          delete_log_group="aws logs delete-log-group --log-group-name '${{ env.LOG_GROUP_NAME }}' --region \$REGION"
+          sed -i "s#$delete_log_group##g" clean-app-signals.sh
 
-      - name: Wait for sample app pods to come up
+      - name: Deploy sample app via terraform and wait for the endpoint to come online
+        id: deploy-sample-app
+        working-directory: testing/terraform/eks
         run: |
-          kubectl wait --for=condition=Ready pod --all -n ${{ env.SAMPLE_APP_NAMESPACE }}
+          terraform init
+          terraform validate
+
+          # Deploy the sample app on the EKS cluster, enable App Signals and wait for the endpoint to come online.
+          # There may be occasional transient failures, so the whole sequence is tried up to max_retry times.
+          # deployment_failed is 0 while the current attempt is healthy and 1 as soon as either the terraform
+          # deployment or the endpoint check has failed.
+          # NOTE(review): if this script runs with `bash -e` (the GitHub Actions default shell), a failure of
+          # enable-app-signals.sh, kubectl or clean-app-signals.sh aborts the step immediately without a retry.
+          retry_counter=0
+          max_retry=2
+          while [ $retry_counter -lt $max_retry ]; do
+            echo "Attempt $retry_counter"
+            deployment_failed=0
+            # Every non-zero exit status is normalized to 1. The checks below only test for 0 and 1, so any
+            # other status would skip both the install and the cleanup branch and leave the loop through
+            # the final "else break" as if the deployment had succeeded.
+            terraform apply -auto-approve \
+              -var="test_id=${{ env.TESTING_ID }}" \
+              -var="aws_region=${{ env.AWS_DEFAULT_REGION }}" \
+              -var="kube_directory_path=${{ github.workspace }}/.kube" \
+              -var="eks_cluster_name=${{ inputs.test-cluster-name }}" \
+              -var="eks_cluster_context_name=$(kubectl config current-context)" \
+              -var="test_namespace=${{ env.SAMPLE_APP_NAMESPACE }}" \
+              -var="service_account_aws_access=service-account-${{ env.TESTING_ID }}" \
+              -var="sample_app_image=${{ env.SAMPLE_APP_FRONTEND_SERVICE_IMAGE }}" \
+              -var="sample_remote_app_image=${{ env.SAMPLE_APP_REMOTE_SERVICE_IMAGE }}" \
+            || deployment_failed=1
+
+            if [ $deployment_failed -eq 1 ]; then
+              echo "Terraform deployment was unsuccessful. Will attempt to retry deployment."
+            fi
+
+            # If deployment_failed is still 0, the terraform deployment succeeded: install App Signals and then
+            # try to connect to the endpoint. Attempts to connect are made for up to 10 minutes (60 x 10s).
+            if [ $deployment_failed -eq 0 ]; then
+              echo "Installing app signals to the sample app"
+              ./enable-app-signals.sh \
+              ${{ inputs.test-cluster-name }} \
+              ${{ env.AWS_DEFAULT_REGION }} \
+              ${{ env.SAMPLE_APP_NAMESPACE }}
+
+              # Application pods need to be restarted for the app signals instrumentation to take effect
+              kubectl delete pods --all -n ${{ env.SAMPLE_APP_NAMESPACE }}
+              kubectl wait --for=condition=Ready pod --all -n ${{ env.SAMPLE_APP_NAMESPACE }}
+
+              echo "Attempting to connect to the endpoint"
+              sample_app_endpoint=http://$(terraform output sample_app_endpoint)
+              # Strip the double quotes from the terraform output value once, before polling
+              sample_app_endpoint=$(echo "$sample_app_endpoint" | tr -d '"')
+              attempt_counter=0
+              max_attempts=60
+              # Poll with curl directly; its exit status drives the loop
+              until curl --output /dev/null --silent --head --fail "$sample_app_endpoint"; do
+                if [ ${attempt_counter} -eq ${max_attempts} ];then
+                  echo "Failed to connect to endpoint. Will attempt to redeploy sample app."
+                  deployment_failed=1
+                  break
+                fi
+
+                printf '.'
+                attempt_counter=$(($attempt_counter+1))
+                sleep 10
+              done
+            fi
+
+            # If deployment_failed is 1 then either the terraform deployment or the endpoint connection failed, so first
+            # destroy the resources created from terraform and try again.
+            if [ $deployment_failed -eq 1 ]; then
+              echo "Cleaning up App Signal"
+              ./clean-app-signals.sh \
+              ${{ inputs.test-cluster-name }} \
+              ${{ env.AWS_DEFAULT_REGION }} \
+              ${{ env.SAMPLE_APP_NAMESPACE }}
+
+              # Running clean-app-signals.sh removes the current cluster from the config. Update the cluster again for subsequent runs.
+              aws eks update-kubeconfig --name ${{ inputs.test-cluster-name }} --region ${{ env.AWS_DEFAULT_REGION }}
+
+              echo "Destroying terraform"
+              # Pass exactly the same variables as "terraform apply" above so that destroy runs against
+              # the same configuration (cluster context and image variables included).
+              terraform destroy -auto-approve \
+                -var="test_id=${{ env.TESTING_ID }}" \
+                -var="aws_region=${{ env.AWS_DEFAULT_REGION }}" \
+                -var="kube_directory_path=${{ github.workspace }}/.kube" \
+                -var="eks_cluster_name=${{ inputs.test-cluster-name }}" \
+                -var="eks_cluster_context_name=$(kubectl config current-context)" \
+                -var="test_namespace=${{ env.SAMPLE_APP_NAMESPACE }}" \
+                -var="service_account_aws_access=service-account-${{ env.TESTING_ID }}" \
+                -var="sample_app_image=${{ env.SAMPLE_APP_FRONTEND_SERVICE_IMAGE }}" \
+                -var="sample_remote_app_image=${{ env.SAMPLE_APP_REMOTE_SERVICE_IMAGE }}"
+
+              retry_counter=$(($retry_counter+1))
+            else
+              # If deployment succeeded, then exit the loop
+              break
+            fi
+
+            if [ $retry_counter -eq $max_retry ]; then
+              echo "Max retry reached, failed to deploy terraform and connect to the endpoint. Exiting code"
+              exit 1
+            fi
+          done
 
       - name: Get remote service pod name and IP
         run: |
@@ -139,30 +210,22 @@ jobs:
           jq '.items[0].status.containerStatuses[0].imageID'
 
       - name: Get the sample app endpoint
-        run: |
-          echo "APP_ENDPOINT=$(terraform output sample_app_endpoint)" >> $GITHUB_ENV
+        run: echo "APP_ENDPOINT=$(terraform output sample_app_endpoint)" >> $GITHUB_ENV
         working-directory: testing/terraform/eks
 
-      - name: Wait for app endpoint to come online
-        id: endpoint-check
+      # This step increases the speed of the validation by creating the telemetry data in advance
+      - name: Call all test APIs
+        continue-on-error: true
         run: |
-          attempt_counter=0
-          max_attempts=30
-          until $(curl --output /dev/null --silent --head --fail http://${{ env.APP_ENDPOINT }}); do
-            if [ ${attempt_counter} -eq ${max_attempts} ];then
-              echo "Max attempts reached"
-              exit 1
-            fi
-
-            printf '.'
-            attempt_counter=$(($attempt_counter+1))
-            sleep 10
-          done
+          curl -S -s -o /dev/null http://${{ env.APP_ENDPOINT }}/outgoing-http-call/
+          curl -S -s -o /dev/null http://${{ env.APP_ENDPOINT }}/aws-sdk-call/
+          curl -S -s -o /dev/null http://${{ env.APP_ENDPOINT }}/remote-service?ip=${{ env.REMOTE_SERVICE_POD_IP }}/
+          curl -S -s -o /dev/null http://${{ env.APP_ENDPOINT }}/client-call/
 
       # Validation for app signals telemetry data
       - name: Call endpoint and validate generated EMF logs
         id: log-validation
-        if: steps.endpoint-check.outcome == 'success' && !cancelled()
+        if: steps.deploy-sample-app.outcome == 'success' && !cancelled()
         run: ./gradlew testing:validator:run --args='-c eks/log-validation.yml
           --testing-id ${{ env.TESTING_ID }}
           --endpoint http://${{ env.APP_ENDPOINT }}
@@ -179,7 +242,7 @@ jobs:
 
       - name: Call endpoints and validate generated metrics
         id: metric-validation
-        if: (success() || steps.log-validation.outcome == 'failure') && !cancelled()
+        if: (steps.deploy-sample-app.outcome == 'success' || steps.log-validation.outcome == 'failure') && !cancelled()
         run: ./gradlew testing:validator:run --args='-c eks/metric-validation.yml
           --testing-id ${{ env.TESTING_ID }}
           --endpoint http://${{ env.APP_ENDPOINT }}
@@ -197,7 +260,7 @@ jobs:
 
       - name: Call endpoints and validate generated traces
         id: trace-validation
-        if: (success() || steps.log-validation.outcome == 'failure' || steps.metric-validation.outcome == 'failure') && !cancelled()
+        if: (steps.deploy-sample-app.outcome == 'success' || steps.log-validation.outcome == 'failure' || steps.metric-validation.outcome == 'failure') && !cancelled()
         run: ./gradlew testing:validator:run --args='-c eks/trace-validation.yml
           --testing-id ${{ env.TESTING_ID }}
           --endpoint http://${{ env.APP_ENDPOINT }}
@@ -231,12 +294,6 @@ jobs:
 
       # Clean up Procedures
 
-      - name: Remove log group deletion command
-        if: always()
-        run: |
-          delete_log_group="aws logs delete-log-group --log-group-name '${{ env.LOG_GROUP_NAME }}' --region \$REGION"
-          sed -i "s#$delete_log_group##g" clean-app-signals.sh
-
       - name: Clean Up App Signals
         if: always()
         continue-on-error: true
@@ -275,4 +332,4 @@ jobs:
           --name service-account-${{ env.TESTING_ID }} \
           --namespace ${{ env.SAMPLE_APP_NAMESPACE }} \
           --cluster ${{ inputs.test-cluster-name }} \
-          --region ${{ env.AWS_DEFAULT_REGION }} \
+          --region ${{ env.AWS_DEFAULT_REGION }}