Skip to content

Commit

Permalink
Add Retry Mechanism to E2E EKS Terraform Deployment (#634)
Browse files Browse the repository at this point in the history
* Add Retry Mechanism to E2E EKS Terraform Deployment

* Add Extra Comments

* Call Test APIs First before Validation

* Add clean-app-signals to retry logic

* Change App Signal Download Directory and modify if statement for validation

* Modify while loop and refactor code
  • Loading branch information
harrryr authored Dec 13, 2023
1 parent 4f21215 commit 6a85d5f
Showing 1 changed file with 111 additions and 54 deletions.
165 changes: 111 additions & 54 deletions .github/workflows/appsignals-e2e-eks-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -83,45 +83,116 @@ jobs:
with:
terraform_wrapper: false

- name: Deploy sample app via terraform
working-directory: testing/terraform/eks
run: |
terraform init
terraform validate
terraform apply -auto-approve \
-var="test_id=${{ env.TESTING_ID }}" \
-var="aws_region=${{ env.AWS_DEFAULT_REGION }}" \
-var="kube_directory_path=${{ github.workspace }}/.kube" \
-var="eks_cluster_name=${{ inputs.test-cluster-name }}" \
-var="eks_cluster_context_name=$(kubectl config current-context)" \
-var="test_namespace=${{ env.SAMPLE_APP_NAMESPACE }}" \
-var="service_account_aws_access=service-account-${{ env.TESTING_ID }}" \
-var="sample_app_image=${{ env.SAMPLE_APP_FRONTEND_SERVICE_IMAGE }}" \
-var="sample_remote_app_image=${{ env.SAMPLE_APP_REMOTE_SERVICE_IMAGE }}"
# Enable App Signals on the test cluster
- name: Pull and unzip enablement script from S3
working-directory: testing/terraform/eks
run: aws s3 cp ${{ env.ENABLEMENT_SCRIPT_S3_BUCKET }} . && unzip -j onboarding.zip

- name: Change ADOT image if main-build
if: inputs.caller-workflow-name == 'main-build'
run: "sed -i 's#image:.*#image: ${{ inputs.appsignals-adot-image-name }}#g' instrumentation.yaml"

- name: Enable App Signals
- name: Remove log group deletion command
if: always()
working-directory: testing/terraform/eks
run: |
./enable-app-signals.sh \
${{ inputs.test-cluster-name }} \
${{ env.AWS_DEFAULT_REGION }} \
${{ env.SAMPLE_APP_NAMESPACE }}
# Application pods need to be restarted for the
# app signals instrumentation to take effect
- name: Restart the app pods
run: kubectl delete pods --all -n ${{ env.SAMPLE_APP_NAMESPACE }}
delete_log_group="aws logs delete-log-group --log-group-name '${{ env.LOG_GROUP_NAME }}' --region \$REGION"
sed -i "s#$delete_log_group##g" clean-app-signals.sh
- name: Wait for sample app pods to come up
- name: Deploy sample app via terraform and wait for the endpoint to come online
id: deploy-sample-app
working-directory: testing/terraform/eks
run: |
kubectl wait --for=condition=Ready pod --all -n ${{ env.SAMPLE_APP_NAMESPACE }}
terraform init
terraform validate
# Attempt to deploy the sample app on an EKS instance and wait for its endpoint to come online.
# There may be occasional failures due to transitivity issues, so try up to 2 times.
# deployment_failed of 0 indicates that both the terraform deployment and the endpoint are running, while 1 indicates
# that it failed at some point
retry_counter=0
max_retry=2
while [ $retry_counter -lt $max_retry ]; do
echo "Attempt $retry_counter"
deployment_failed=0
terraform apply -auto-approve \
-var="test_id=${{ env.TESTING_ID }}" \
-var="aws_region=${{ env.AWS_DEFAULT_REGION }}" \
-var="kube_directory_path=${{ github.workspace }}/.kube" \
-var="eks_cluster_name=${{ inputs.test-cluster-name }}" \
-var="eks_cluster_context_name=$(kubectl config current-context)" \
-var="test_namespace=${{ env.SAMPLE_APP_NAMESPACE }}" \
-var="service_account_aws_access=service-account-${{ env.TESTING_ID }}" \
-var="sample_app_image=${{ env.SAMPLE_APP_FRONTEND_SERVICE_IMAGE }}" \
-var="sample_remote_app_image=${{ env.SAMPLE_APP_REMOTE_SERVICE_IMAGE }}" \
|| deployment_failed=$?
if [ $deployment_failed -eq 1 ]; then
echo "Terraform deployment was unsuccessful. Will attempt to retry deployment."
fi
# If the deployment_failed is still 0, then the terraform deployment succeeded and now try to connect to the endpoint
# after installing App Signals. Attempts to connect will be made for up to 10 minutes
if [ $deployment_failed -eq 0 ]; then
echo "Installing app signals to the sample app"
./enable-app-signals.sh \
${{ inputs.test-cluster-name }} \
${{ env.AWS_DEFAULT_REGION }} \
${{ env.SAMPLE_APP_NAMESPACE }}
kubectl delete pods --all -n ${{ env.SAMPLE_APP_NAMESPACE }}
kubectl wait --for=condition=Ready pod --all -n ${{ env.SAMPLE_APP_NAMESPACE }}
echo "Attempting to connect to the endpoint"
sample_app_endpoint=http://$(terraform output sample_app_endpoint)
attempt_counter=0
max_attempts=60
until $(curl --output /dev/null --silent --head --fail $(echo "$sample_app_endpoint" | tr -d '"')); do
if [ ${attempt_counter} -eq ${max_attempts} ];then
echo "Failed to connect to endpoint. Will attempt to redeploy sample app."
deployment_failed=1
break
fi
printf '.'
attempt_counter=$(($attempt_counter+1))
sleep 10
done
fi
# If the deployment_failed is 1 then either the terraform deployment or the endpoint connection failed, so first destroy the
# resources created from terraform and try again.
if [ $deployment_failed -eq 1 ]; then
echo "Cleaning up App Signal"
./clean-app-signals.sh \
${{ inputs.test-cluster-name }} \
${{ env.AWS_DEFAULT_REGION }} \
${{ env.SAMPLE_APP_NAMESPACE }}
# Running clean-app-signal.sh removes the current cluster from the config. Update the cluster again for subsequent runs.
aws eks update-kubeconfig --name ${{ inputs.test-cluster-name }} --region ${{ env.AWS_DEFAULT_REGION }}
echo "Destroying terraform"
terraform destroy -auto-approve \
-var="test_id=${{ env.TESTING_ID }}" \
-var="aws_region=${{ env.AWS_DEFAULT_REGION }}" \
-var="kube_directory_path=${{ github.workspace }}/.kube" \
-var="eks_cluster_name=${{ inputs.test-cluster-name }}" \
-var="test_namespace=${{ env.SAMPLE_APP_NAMESPACE }}" \
-var="service_account_aws_access=service-account-${{ env.TESTING_ID }}" \
-var="sample_app_image=${{ env.SAMPLE_APP_IMAGE }}"
retry_counter=$(($retry_counter+1))
else
# If deployment succeeded, then exit the loop
break
fi
if [ $retry_counter -eq $max_retry ]; then
echo "Max retry reached, failed to deploy terraform and connect to the endpoint. Exiting code"
exit 1
fi
done
- name: Get remote service pod name and IP
run: |
Expand All @@ -139,30 +210,22 @@ jobs:
jq '.items[0].status.containerStatuses[0].imageID'
- name: Get the sample app endpoint
run: |
echo "APP_ENDPOINT=$(terraform output sample_app_endpoint)" >> $GITHUB_ENV
run: echo "APP_ENDPOINT=$(terraform output sample_app_endpoint)" >> $GITHUB_ENV
working-directory: testing/terraform/eks

- name: Wait for app endpoint to come online
id: endpoint-check
# This steps increases the speed of the validation by creating the telemetry data in advance
- name: Call all test APIs
continue-on-error: true
run: |
attempt_counter=0
max_attempts=30
until $(curl --output /dev/null --silent --head --fail http://${{ env.APP_ENDPOINT }}); do
if [ ${attempt_counter} -eq ${max_attempts} ];then
echo "Max attempts reached"
exit 1
fi
printf '.'
attempt_counter=$(($attempt_counter+1))
sleep 10
done
curl -S -s -o /dev/null http://${{ env.APP_ENDPOINT }}/outgoing-http-call/
curl -S -s -o /dev/null http://${{ env.APP_ENDPOINT }}/aws-sdk-call/
curl -S -s -o /dev/null http://${{ env.APP_ENDPOINT }}/remote-service?ip=${{ env.REMOTE_SERVICE_POD_IP }}/
curl -S -s -o /dev/null http://${{ env.APP_ENDPOINT }}/client-call/
# Validation for app signals telemetry data
- name: Call endpoint and validate generated EMF logs
id: log-validation
if: steps.endpoint-check.outcome == 'success' && !cancelled()
if: steps.deploy-sample-app.outcome == 'success' && !cancelled()
run: ./gradlew testing:validator:run --args='-c eks/log-validation.yml
--testing-id ${{ env.TESTING_ID }}
--endpoint http://${{ env.APP_ENDPOINT }}
Expand All @@ -179,7 +242,7 @@ jobs:

- name: Call endpoints and validate generated metrics
id: metric-validation
if: (success() || steps.log-validation.outcome == 'failure') && !cancelled()
if: (steps.deploy-sample-app.outcome == 'success' || steps.log-validation.outcome == 'failure') && !cancelled()
run: ./gradlew testing:validator:run --args='-c eks/metric-validation.yml
--testing-id ${{ env.TESTING_ID }}
--endpoint http://${{ env.APP_ENDPOINT }}
Expand All @@ -197,7 +260,7 @@ jobs:

- name: Call endpoints and validate generated traces
id: trace-validation
if: (success() || steps.log-validation.outcome == 'failure' || steps.metric-validation.outcome == 'failure') && !cancelled()
if: (steps.deploy-sample-app.outcome == 'success' || steps.log-validation.outcome == 'failure' || steps.metric-validation.outcome == 'failure') && !cancelled()
run: ./gradlew testing:validator:run --args='-c eks/trace-validation.yml
--testing-id ${{ env.TESTING_ID }}
--endpoint http://${{ env.APP_ENDPOINT }}
Expand Down Expand Up @@ -231,12 +294,6 @@ jobs:
# Clean up Procedures

- name: Remove log group deletion command
if: always()
run: |
delete_log_group="aws logs delete-log-group --log-group-name '${{ env.LOG_GROUP_NAME }}' --region \$REGION"
sed -i "s#$delete_log_group##g" clean-app-signals.sh
- name: Clean Up App Signals
if: always()
continue-on-error: true
Expand Down Expand Up @@ -275,4 +332,4 @@ jobs:
--name service-account-${{ env.TESTING_ID }} \
--namespace ${{ env.SAMPLE_APP_NAMESPACE }} \
--cluster ${{ inputs.test-cluster-name }} \
--region ${{ env.AWS_DEFAULT_REGION }} \
--region ${{ env.AWS_DEFAULT_REGION }}

0 comments on commit 6a85d5f

Please sign in to comment.