From 09d5672ba75fc74e5f6d3ba7e613701a654a2f99 Mon Sep 17 00:00:00 2001 From: Harry Date: Fri, 16 Aug 2024 14:17:11 -0700 Subject: [PATCH] Revert "Clean terraform" (#176) Reverts aws-observability/aws-application-signals-test-framework#158 --- .../actions/execute_and_retry/action.yml | 9 -- .github/workflows/java-ec2-asg-e2e-test.yml | 57 +++++-- .../workflows/java-ec2-default-e2e-test.yml | 57 +++++-- .github/workflows/java-eks-e2e-test.yml | 134 +++++++++++------ .../java-metric-limiter-e2e-test.yml | 129 ++++++++++------ .github/workflows/python-ec2-asg-e2e-test.yml | 55 +++++-- .../workflows/python-ec2-default-e2e-test.yml | 55 +++++-- .github/workflows/python-eks-e2e-test.yml | 139 ++++++++++++------ 8 files changed, 434 insertions(+), 201 deletions(-) diff --git a/.github/workflows/actions/execute_and_retry/action.yml b/.github/workflows/actions/execute_and_retry/action.yml index cc6fbfd3d..3dfe73cd2 100644 --- a/.github/workflows/actions/execute_and_retry/action.yml +++ b/.github/workflows/actions/execute_and_retry/action.yml @@ -28,10 +28,6 @@ inputs: post-command: required: false type: string - # (Optional) Directory to run on - working-directory: - required: false - type: string runs: using: "composite" @@ -45,12 +41,7 @@ runs: CLEANUP: ${{ inputs.cleanup }} POST_COMMAND: ${{ inputs.post-command }} SLEEP_TIME: ${{ inputs.sleep_time }} - WORKING_DIRECTORY: ${{ inputs.working-directory }} run: | - if [ -n "$WORKING_DIRECTORY" ]; then - echo "Moving directory to $WORKING_DIRECTORY" - cd $WORKING_DIRECTORY - fi echo "Starting the execute_and_retry action for command $COMMAND" echo "Executing pre-command for the execute_and_retry action" eval "$PRE_COMMAND" diff --git a/.github/workflows/java-ec2-asg-e2e-test.yml b/.github/workflows/java-ec2-asg-e2e-test.yml index b89c81547..ceb7d48c2 100644 --- a/.github/workflows/java-ec2-asg-e2e-test.yml +++ b/.github/workflows/java-ec2-asg-e2e-test.yml @@ -106,25 +106,54 @@ jobs: - name: Initiate Terraform uses: ./.github/workflows/actions/execute_and_retry with: - command: "terraform init && terraform validate" + command: "cd ${{ env.TEST_RESOURCES_FOLDER }}/terraform/java/ec2/asg && terraform init && terraform validate" cleanup: "rm -rf .terraform && rm -rf .terraform.lock.hcl" max_retry: 6 sleep_time: 60 - working-directory: ./terraform/java/ec2/asg - name: Deploy sample app via terraform and wait for endpoint to come online - uses: ./.github/workflows/actions/execute_and_retry - with: - command: 'terraform apply -auto-approve - -var="aws_region=${{ env.E2E_TEST_AWS_REGION }}" - -var="test_id=${{ env.TESTING_ID }}" - -var="sample_app_jar=${{ env.SAMPLE_APP_FRONTEND_SERVICE_JAR }}" - -var="sample_remote_app_jar=${{ env.SAMPLE_APP_REMOTE_SERVICE_JAR }}" - -var="get_cw_agent_rpm_command=${{ env.GET_CW_AGENT_RPM_COMMAND }}" - -var="get_adot_jar_command=${{ env.GET_ADOT_JAR_COMMAND }}"' - cleanup: 'terraform destroy -auto-approve -var="test_id=${{ env.TESTING_ID }}' - max_retry: 2 - working-directory: ./terraform/java/ec2/asg + working-directory: terraform/java/ec2/asg + run: | + # Attempt to deploy the sample app on an EC2 instance and wait for its endpoint to come online. + # There may be occasional failures due to transitivity issues, so try up to 2 times. + # deployment_failed of 0 indicates that both the terraform deployment and the endpoint are running, while 1 indicates + # that it failed at some point + retry_counter=0 + max_retry=2 + while [ $retry_counter -lt $max_retry ]; do + echo "Attempt $retry_counter" + deployment_failed=0 + terraform apply -auto-approve \ + -var="aws_region=${{ env.E2E_TEST_AWS_REGION }}" \ + -var="test_id=${{ env.TESTING_ID }}" \ + -var="sample_app_jar=${{ env.SAMPLE_APP_FRONTEND_SERVICE_JAR }}" \ + -var="sample_remote_app_jar=${{ env.SAMPLE_APP_REMOTE_SERVICE_JAR }}" \ + -var="get_cw_agent_rpm_command=${{ env.GET_CW_AGENT_RPM_COMMAND }}" \ + -var="get_adot_jar_command=${{ env.GET_ADOT_JAR_COMMAND }}" \ + || deployment_failed=$? + + if [ $deployment_failed -eq 1 ]; then + echo "Terraform deployment was unsuccessful. Will attempt to retry deployment." + fi + + # If the success is 1 then either the terraform deployment or the endpoint connection failed, so first destroy the + # resources created from terraform and try again. + if [ $deployment_failed -eq 1 ]; then + echo "Destroying terraform" + terraform destroy -auto-approve \ + -var="test_id=${{ env.TESTING_ID }}" + + retry_counter=$(($retry_counter+1)) + else + # If deployment succeeded, then exit the loop + break + fi + + if [ $retry_counter -eq $max_retry ]; then + echo "Max retry reached, failed to deploy terraform and connect to the endpoint. Exiting code" + exit 1 + fi + done - name: Get the sample app and EC2 instance information working-directory: terraform/java/ec2/asg diff --git a/.github/workflows/java-ec2-default-e2e-test.yml b/.github/workflows/java-ec2-default-e2e-test.yml index bb994e7c7..1c0533a93 100644 --- a/.github/workflows/java-ec2-default-e2e-test.yml +++ b/.github/workflows/java-ec2-default-e2e-test.yml @@ -106,25 +106,54 @@ jobs: - name: Initiate Terraform uses: ./.github/workflows/actions/execute_and_retry with: - command: "terraform init && terraform validate" + command: "cd ${{ env.TEST_RESOURCES_FOLDER }}/terraform/java/ec2/default && terraform init && terraform validate" cleanup: "rm -rf .terraform && rm -rf .terraform.lock.hcl" max_retry: 6 sleep_time: 60 - working-directory: ./terraform/java/ec2/default - name: Deploy sample app via terraform and wait for endpoint to come online - uses: ./.github/workflows/actions/execute_and_retry - with: - command: 'terraform apply -auto-approve - -var="aws_region=${{ inputs.aws-region }}" - -var="test_id=${{ env.TESTING_ID }}" - -var="sample_app_jar=${{ env.SAMPLE_APP_FRONTEND_SERVICE_JAR }}" - -var="sample_remote_app_jar=${{ env.SAMPLE_APP_REMOTE_SERVICE_JAR }}" - -var="get_cw_agent_rpm_command=${{ env.GET_CW_AGENT_RPM_COMMAND }}" - -var="get_adot_jar_command=${{ env.GET_ADOT_JAR_COMMAND }}"' - cleanup: 'terraform destroy -auto-approve -var="test_id=${{ env.TESTING_ID }}' - max_retry: 2 - working-directory: ./terraform/java/ec2/default + working-directory: terraform/java/ec2/default + run: | + # Attempt to deploy the sample app on an EC2 instance and wait for its endpoint to come online. + # There may be occasional failures due to transitivity issues, so try up to 2 times. + # deployment_failed of 0 indicates that both the terraform deployment and the endpoint are running, while 1 indicates + # that it failed at some point + retry_counter=0 + max_retry=2 + while [ $retry_counter -lt $max_retry ]; do + echo "Attempt $retry_counter" + deployment_failed=0 + terraform apply -auto-approve \ + -var="aws_region=${{ inputs.aws-region }}" \ + -var="test_id=${{ env.TESTING_ID }}" \ + -var="sample_app_jar=${{ env.SAMPLE_APP_FRONTEND_SERVICE_JAR }}" \ + -var="sample_remote_app_jar=${{ env.SAMPLE_APP_REMOTE_SERVICE_JAR }}" \ + -var="get_cw_agent_rpm_command=${{ env.GET_CW_AGENT_RPM_COMMAND }}" \ + -var="get_adot_jar_command=${{ env.GET_ADOT_JAR_COMMAND }}" \ + || deployment_failed=$? + + if [ $deployment_failed -eq 1 ]; then + echo "Terraform deployment was unsuccessful. Will attempt to retry deployment." + fi + + # If the success is 1 then either the terraform deployment or the endpoint connection failed, so first destroy the + # resources created from terraform and try again. + if [ $deployment_failed -eq 1 ]; then + echo "Destroying terraform" + terraform destroy -auto-approve \ + -var="test_id=${{ env.TESTING_ID }}" + + retry_counter=$(($retry_counter+1)) + else + # If deployment succeeded, then exit the loop + break + fi + + if [ $retry_counter -eq $max_retry ]; then + echo "Max retry reached, failed to deploy terraform and connect to the endpoint. Exiting code" + exit 1 + fi + done - name: Get the ec2 instance ami id working-directory: terraform/java/ec2/default diff --git a/.github/workflows/java-eks-e2e-test.yml b/.github/workflows/java-eks-e2e-test.yml index 94a16f5b0..3a3ae8ace 100644 --- a/.github/workflows/java-eks-e2e-test.yml +++ b/.github/workflows/java-eks-e2e-test.yml @@ -197,61 +197,103 @@ jobs: - name: Initiate Terraform uses: ./.github/workflows/actions/execute_and_retry with: - command: "terraform init && terraform validate" + command: "cd ${{ env.TEST_RESOURCES_FOLDER }}/terraform/java/eks && terraform init && terraform validate" cleanup: "rm -rf .terraform && rm -rf .terraform.lock.hcl" max_retry: 6 sleep_time: 60 - working-directory: ./terraform/java/eks - name: Set Sample App Image run: | echo MAIN_SAMPLE_APP_IMAGE_ARN="${{ env.ACCOUNT_ID }}.dkr.ecr.${{ env.E2E_TEST_AWS_REGION }}.amazonaws.com/${{ env.JAVA_MAIN_SAMPLE_APP_IMAGE }}" >> $GITHUB_ENV echo REMOTE_SAMPLE_APP_IMAGE_ARN="${{ env.ACCOUNT_ID }}.dkr.ecr.${{ env.E2E_TEST_AWS_REGION }}.amazonaws.com/${{ env.JAVA_REMOTE_SAMPLE_APP_IMAGE }}" >> $GITHUB_ENV - - name: Deploy sample app via terraform and wait for endpoint to come online - uses: ./.github/workflows/actions/execute_and_retry - with: - command: 'terraform apply -auto-approve - -var="test_id=${{ env.TESTING_ID }}" - -var="aws_region=${{ env.E2E_TEST_AWS_REGION }}" - -var="kube_directory_path=${{ github.workspace }}/.kube" - -var="eks_cluster_name=${{ env.CLUSTER_NAME }}" - -var="eks_cluster_context_name=$(kubectl config current-context)" - -var="test_namespace=${{ env.SAMPLE_APP_NAMESPACE }}" - -var="service_account_aws_access=service-account-${{ env.TESTING_ID }}" - -var="sample_app_image=${{ env.MAIN_SAMPLE_APP_IMAGE_ARN }}" - -var="sample_remote_app_image=${{ env.REMOTE_SAMPLE_APP_IMAGE_ARN }}" - -var="rds_mysql_cluster_endpoint=${{env.RDS_MYSQL_CLUSTER_ENDPOINT}}" - -var="rds_mysql_cluster_username=${{env.RDS_MYSQL_CLUSTER_SECRETS_USERNAME}}" - -var="rds_mysql_cluster_password=${{env.RDS_MYSQL_CLUSTER_SECRETS_PASSWORD}}" - -var="account_id=${{ env.ACCOUNT_ID }}"' - cleanup: 'terraform destroy -auto-approve - -var="test_id=${{ env.TESTING_ID }}" - -var="aws_region=${{ env.E2E_TEST_AWS_REGION }}" - -var="kube_directory_path=${{ github.workspace }}/.kube" - -var="eks_cluster_name=${{ env.CLUSTER_NAME }}" - -var="test_namespace=${{ env.SAMPLE_APP_NAMESPACE }}" - -var="service_account_aws_access=service-account-${{ env.TESTING_ID }}" - -var="sample_app_image=${{ env.MAIN_SAMPLE_APP_IMAGE_ARN }}" - -var="sample_remote_app_image=${{ env.REMOTE_SAMPLE_APP_IMAGE_ARN }}"' - max_retry: 2 - working-directory: ./terraform/java/eks - - - name: Enable App Signals - uses: ./.github/workflows/actions/execute_and_retry - with: - command: './enable-app-signals.sh - ${{ env.CLUSTER_NAME }} - ${{ env.E2E_TEST_AWS_REGION }} - ${{ env.SAMPLE_APP_NAMESPACE }}' - cleanup: '${{ env.CLUSTER_NAME }} - ${{ env.E2E_TEST_AWS_REGION }} - ${{ env.SAMPLE_APP_NAMESPACE }} && - aws eks update-kubeconfig --name ${{ env.CLUSTER_NAME }} --region ${{ env.E2E_TEST_AWS_REGION }}"' - post_command: 'kubectl delete pods --all -n ${{ env.SAMPLE_APP_NAMESPACE }} && kubectl wait --for=condition=Ready --request-timeout "5m" pod --all -n ${{ env.SAMPLE_APP_NAMESPACE }}' - max_retry: 3 - sleep_time: 60 - working-directory: ./enablement-script + - name: Deploy sample app via terraform and wait for the endpoint to come online + id: deploy-sample-app + working-directory: terraform/java/eks + run: | + # Attempt to deploy the sample app on an EKS instance and wait for its endpoint to come online. + # There may be occasional failures due to transitivity issues, so try up to 2 times. + # deployment_failed of 0 indicates that both the terraform deployment and the endpoint are running, while 1 indicates + # that it failed at some point + retry_counter=0 + max_retry=2 + while [ $retry_counter -lt $max_retry ]; do + echo "Attempt $retry_counter" + deployment_failed=0 + terraform apply -auto-approve \ + -var="test_id=${{ env.TESTING_ID }}" \ + -var="aws_region=${{ env.E2E_TEST_AWS_REGION }}" \ + -var="kube_directory_path=${{ github.workspace }}/.kube" \ + -var="eks_cluster_name=${{ env.CLUSTER_NAME }}" \ + -var="eks_cluster_context_name=$(kubectl config current-context)" \ + -var="test_namespace=${{ env.SAMPLE_APP_NAMESPACE }}" \ + -var="service_account_aws_access=service-account-${{ env.TESTING_ID }}" \ + -var="sample_app_image=${{ env.MAIN_SAMPLE_APP_IMAGE_ARN }}" \ + -var="sample_remote_app_image=${{ env.REMOTE_SAMPLE_APP_IMAGE_ARN }}" \ + -var="rds_mysql_cluster_endpoint=${{env.RDS_MYSQL_CLUSTER_ENDPOINT}}" \ + -var="rds_mysql_cluster_username=${{env.RDS_MYSQL_CLUSTER_SECRETS_USERNAME}}" \ + -var='rds_mysql_cluster_password=${{env.RDS_MYSQL_CLUSTER_SECRETS_PASSWORD}}' \ + -var='account_id=${{ env.ACCOUNT_ID }}' \ + || deployment_failed=$? + + if [ $deployment_failed -ne 0 ]; then + echo "Terraform deployment was unsuccessful. Will attempt to retry deployment." + fi + + # If the deployment_failed is still 0, then the terraform deployment succeeded and now try to connect to the endpoint + # after installing App Signals. Attempts to connect will be made for up to 10 minutes + if [ $deployment_failed -eq 0 ]; then + . ${{ env.TEST_RESOURCES_FOLDER }}/.github/workflows/util/execute_and_retry.sh + execute_and_retry 3 \ + "${{ env.TEST_RESOURCES_FOLDER }}/enablement-script/enable-app-signals.sh \ + ${{ env.CLUSTER_NAME }} \ + ${{ env.E2E_TEST_AWS_REGION }} \ + ${{ env.SAMPLE_APP_NAMESPACE }}" \ + "${{ env.TEST_RESOURCES_FOLDER }}/enablement-script/clean-app-signals.sh \ + ${{ env.CLUSTER_NAME }} \ + ${{ env.E2E_TEST_AWS_REGION }} \ + ${{ env.SAMPLE_APP_NAMESPACE }} && \ + aws eks update-kubeconfig --name ${{ env.CLUSTER_NAME }} --region ${{ env.E2E_TEST_AWS_REGION }}" \ + 60 + + execute_and_retry 2 "kubectl delete pods --all -n ${{ env.SAMPLE_APP_NAMESPACE }}" "" 60 + execute_and_retry 2 "kubectl wait --for=condition=Ready --request-timeout '5m' pod --all -n ${{ env.SAMPLE_APP_NAMESPACE }}" "" 10 + fi + + # If the deployment_failed is 1 then either the terraform deployment or the endpoint connection failed, so first destroy the + # resources created from terraform and try again. + if [ $deployment_failed -eq 1 ]; then + echo "Cleaning up App Signal" + ${{ env.TEST_RESOURCES_FOLDER }}/enablement-script/clean-app-signals.sh \ + ${{ env.CLUSTER_NAME }} \ + ${{ env.E2E_TEST_AWS_REGION }} \ + ${{ env.SAMPLE_APP_NAMESPACE }} + + # Running clean-app-signal.sh removes the current cluster from the config. Update the cluster again for subsequent runs. + aws eks update-kubeconfig --name ${{ env.CLUSTER_NAME }} --region ${{ env.E2E_TEST_AWS_REGION }} + + echo "Destroying terraform" + terraform destroy -auto-approve \ + -var="test_id=${{ env.TESTING_ID }}" \ + -var="aws_region=${{ env.E2E_TEST_AWS_REGION }}" \ + -var="kube_directory_path=${{ github.workspace }}/.kube" \ + -var="eks_cluster_name=${{ env.CLUSTER_NAME }}" \ + -var="test_namespace=${{ env.SAMPLE_APP_NAMESPACE }}" \ + -var="service_account_aws_access=service-account-${{ env.TESTING_ID }}" \ + -var="sample_app_image=${{ env.MAIN_SAMPLE_APP_IMAGE_ARN }}" \ + -var="sample_remote_app_image=${{ env.REMOTE_SAMPLE_APP_IMAGE_ARN }}" + + retry_counter=$(($retry_counter+1)) + else + # If deployment succeeded, then exit the loop + break + fi + + if [ $retry_counter -ge $max_retry ]; then + echo "Max retry reached, failed to deploy terraform and connect to the endpoint. Exiting code" + exit 1 + fi + done - name: Get ECR to Patch run: | diff --git a/.github/workflows/java-metric-limiter-e2e-test.yml b/.github/workflows/java-metric-limiter-e2e-test.yml index fd832de39..847cb9c23 100644 --- a/.github/workflows/java-metric-limiter-e2e-test.yml +++ b/.github/workflows/java-metric-limiter-e2e-test.yml @@ -165,59 +165,100 @@ jobs: - name: Initiate Terraform uses: ./.github/workflows/actions/execute_and_retry with: - command: "terraform init && terraform validate" + command: "cd ${{ env.TEST_RESOURCES_FOLDER }}/terraform/java/eks && terraform init && terraform validate" cleanup: "rm -rf .terraform && rm -rf .terraform.lock.hcl" max_retry: 6 sleep_time: 60 - working-directory: ./terraform/java/eks - name: Set Sample App Image run: | echo MAIN_SAMPLE_APP_IMAGE_ARN="${{ env.ACCOUNT_ID }}.dkr.ecr.${{ env.E2E_TEST_AWS_REGION }}.amazonaws.com/${{ env.JAVA_MAIN_SAMPLE_APP_IMAGE }}" >> $GITHUB_ENV echo REMOTE_SAMPLE_APP_IMAGE_ARN="${{ env.ACCOUNT_ID }}.dkr.ecr.${{ env.E2E_TEST_AWS_REGION }}.amazonaws.com/${{ env.JAVA_REMOTE_SAMPLE_APP_IMAGE }}" >> $GITHUB_ENV - - - name: Deploy sample app via terraform and wait for endpoint to come online + + - name: Deploy sample app via terraform and wait for the endpoint to come online id: deploy-sample-app - uses: ./.github/workflows/actions/execute_and_retry - with: - command: 'terraform apply -auto-approve - -var="test_id=${{ env.TESTING_ID }}" - -var="aws_region=${{ env.E2E_TEST_AWS_REGION }}" - -var="kube_directory_path=${{ github.workspace }}/.kube" - -var="eks_cluster_name=${{ env.CLUSTER_NAME }}" - -var="eks_cluster_context_name=$(kubectl config current-context)" - -var="test_namespace=${{ env.SAMPLE_APP_NAMESPACE }}" - -var="service_account_aws_access=sa-${{ env.TESTING_ID }}" - -var="sample_app_image=${{ env.MAIN_SAMPLE_APP_IMAGE_ARN }}" - -var="sample_remote_app_image=${{ env.REMOTE_SAMPLE_APP_IMAGE_ARN }}" - -var="account_id=${{ env.ACCOUNT_ID }}"' - cleanup: 'terraform destroy -auto-approve - -var="test_id=${{ env.TESTING_ID }}" - -var="aws_region=${{ env.E2E_TEST_AWS_REGION }}" - -var="kube_directory_path=${{ github.workspace }}/.kube" - -var="eks_cluster_name=${{ env.CLUSTER_NAME }}" - -var="test_namespace=${{ env.SAMPLE_APP_NAMESPACE }}" - -var="service_account_aws_access=sa-${{ env.TESTING_ID }}" - -var="sample_app_image=${{ env.MAIN_SAMPLE_APP_IMAGE_ARN }}" - -var="sample_remote_app_image=${{ env.REMOTE_SAMPLE_APP_IMAGE_ARN }}"' - max_retry: 2 - working-directory: ./terraform/java/eks - - - name: Enable App Signals - uses: ./.github/workflows/actions/execute_and_retry - with: - command: './enable-app-signals.sh - ${{ env.CLUSTER_NAME }} - ${{ env.E2E_TEST_AWS_REGION }} - ${{ env.SAMPLE_APP_NAMESPACE }}' - cleanup: '${{ env.CLUSTER_NAME }} - ${{ env.E2E_TEST_AWS_REGION }} - ${{ env.SAMPLE_APP_NAMESPACE }} && - aws eks update-kubeconfig --name ${{ env.CLUSTER_NAME }} --region ${{ env.E2E_TEST_AWS_REGION }}"' - post-command: 'kubectl delete pods --all -n ${{ env.SAMPLE_APP_NAMESPACE }} && kubectl wait --for=condition=Ready --request-timeout "5m" pod --all -n ${{ env.SAMPLE_APP_NAMESPACE }}' - max_retry: 3 - sleep_time: 60 - working-directory: ./enablement-script + working-directory: terraform/java/eks + run: | + # Attempt to deploy the sample app on an EKS instance and wait for its endpoint to come online. + # There may be occasional failures due to transitivity issues, so try up to 2 times. + # deployment_failed of 0 indicates that both the terraform deployment and the endpoint are running, while 1 indicates + # that it failed at some point + retry_counter=0 + max_retry=2 + while [ $retry_counter -lt $max_retry ]; do + echo "Attempt $retry_counter" + deployment_failed=0 + terraform apply -auto-approve \ + -var="test_id=${{ env.TESTING_ID }}" \ + -var="aws_region=${{ env.E2E_TEST_AWS_REGION }}" \ + -var="kube_directory_path=${{ github.workspace }}/.kube" \ + -var="eks_cluster_name=${{ env.CLUSTER_NAME }}" \ + -var="eks_cluster_context_name=$(kubectl config current-context)" \ + -var="test_namespace=${{ env.SAMPLE_APP_NAMESPACE }}" \ + -var="service_account_aws_access=sa-${{ env.TESTING_ID }}" \ + -var="sample_app_image=${{ env.MAIN_SAMPLE_APP_IMAGE_ARN }}" \ + -var="sample_remote_app_image=${{ env.REMOTE_SAMPLE_APP_IMAGE_ARN }}" \ + -var='account_id=${{ env.ACCOUNT_ID }}' \ + || deployment_failed=$? + + if [ $deployment_failed -ne 0 ]; then + echo "Terraform deployment was unsuccessful. Will attempt to retry deployment." + fi + + # If the deployment_failed is still 0, then the terraform deployment succeeded and now try to connect to the endpoint + # after installing App Signals. Attempts to connect will be made for up to 10 minutes + if [ $deployment_failed -eq 0 ]; then + . ${{ env.TEST_RESOURCES_FOLDER }}/.github/workflows/util/execute_and_retry.sh + execute_and_retry 3 \ + "${{ env.TEST_RESOURCES_FOLDER }}/enablement-script/enable-app-signals.sh \ + ${{ env.CLUSTER_NAME }} \ + ${{ env.E2E_TEST_AWS_REGION }} \ + ${{ env.SAMPLE_APP_NAMESPACE }}" \ + "${{ env.TEST_RESOURCES_FOLDER }}/enablement-script/clean-app-signals.sh \ + ${{ env.CLUSTER_NAME }} \ + ${{ env.E2E_TEST_AWS_REGION }} \ + ${{ env.SAMPLE_APP_NAMESPACE }} && \ + aws eks update-kubeconfig --name ${{ env.CLUSTER_NAME }} --region ${{ env.E2E_TEST_AWS_REGION }}" \ + 60 + + execute_and_retry 2 "kubectl delete pods --all -n ${{ env.SAMPLE_APP_NAMESPACE }}" "" 60 + execute_and_retry 2 "kubectl wait --for=condition=Ready --request-timeout '5m' pod --all -n ${{ env.SAMPLE_APP_NAMESPACE }}" "" 10 + fi + + # If the deployment_failed is 1 then either the terraform deployment or the endpoint connection failed, so first destroy the + # resources created from terraform and try again. + if [ $deployment_failed -eq 1 ]; then + echo "Cleaning up App Signal" + ${{ env.TEST_RESOURCES_FOLDER }}/enablement-script/clean-app-signals.sh \ + ${{ env.CLUSTER_NAME }} \ + ${{ env.E2E_TEST_AWS_REGION }} \ + ${{ env.SAMPLE_APP_NAMESPACE }} + + # Running clean-app-signal.sh removes the current cluster from the config. Update the cluster again for subsequent runs. + aws eks update-kubeconfig --name ${{ env.CLUSTER_NAME }} --region ${{ env.E2E_TEST_AWS_REGION }} + + echo "Destroying terraform" + terraform destroy -auto-approve \ + -var="test_id=${{ env.TESTING_ID }}" \ + -var="aws_region=${{ env.E2E_TEST_AWS_REGION }}" \ + -var="kube_directory_path=${{ github.workspace }}/.kube" \ + -var="eks_cluster_name=${{ env.CLUSTER_NAME }}" \ + -var="test_namespace=${{ env.SAMPLE_APP_NAMESPACE }}" \ + -var="service_account_aws_access=sa-${{ env.TESTING_ID }}" \ + -var="sample_app_image=${{ env.MAIN_SAMPLE_APP_IMAGE_ARN }}" \ + -var="sample_remote_app_image=${{ env.REMOTE_SAMPLE_APP_IMAGE_ARN }}" + + retry_counter=$(($retry_counter+1)) + else + # If deployment succeeded, then exit the loop + break + fi + + if [ $retry_counter -ge $max_retry ]; then + echo "Max retry reached, failed to deploy terraform and connect to the endpoint. Exiting code" + exit 1 + fi + done - name: Get ECR to Patch run: | diff --git a/.github/workflows/python-ec2-asg-e2e-test.yml b/.github/workflows/python-ec2-asg-e2e-test.yml index 48d6f695a..3864d016a 100644 --- a/.github/workflows/python-ec2-asg-e2e-test.yml +++ b/.github/workflows/python-ec2-asg-e2e-test.yml @@ -110,24 +110,53 @@ jobs: - name: Initiate Terraform uses: ./.github/workflows/actions/execute_and_retry with: - command: "terraform init && terraform validate" + command: "cd ${{ env.TEST_RESOURCES_FOLDER }}/terraform/python/ec2/asg && terraform init && terraform validate" cleanup: "rm -rf .terraform && rm -rf .terraform.lock.hcl" max_retry: 6 sleep_time: 60 - working-directory: ./terraform/python/ec2/asg - name: Deploy sample app via terraform and wait for endpoint to come online - uses: ./.github/workflows/actions/execute_and_retry - with: - command: 'terraform apply -auto-approve - -var="aws_region=${{ env.E2E_TEST_AWS_REGION }}" - -var="test_id=${{ env.TESTING_ID }}" - -var="sample_app_zip=${{ env.SAMPLE_APP_ZIP }}" - -var="get_cw_agent_rpm_command=${{ env.GET_CW_AGENT_RPM_COMMAND }}" - -var="get_adot_wheel_command=${{ env.GET_ADOT_WHEEL_COMMAND }}"' - cleanup: 'terraform destroy -auto-approve -var="test_id=${{ env.TESTING_ID }}' - max_retry: 2 - working-directory: ./terraform/python/ec2/asg + working-directory: terraform/python/ec2/asg + run: | + # Attempt to deploy the sample app on an EC2 instance and wait for its endpoint to come online. + # There may be occasional failures due to transitivity issues, so try up to 2 times. + # deployment_failed of 0 indicates that both the terraform deployment and the endpoint are running, while 1 indicates + # that it failed at some point + retry_counter=0 + max_retry=2 + while [ $retry_counter -lt $max_retry ]; do + echo "Attempt $retry_counter" + deployment_failed=0 + terraform apply -auto-approve \ + -var="aws_region=${{ env.E2E_TEST_AWS_REGION }}" \ + -var="test_id=${{ env.TESTING_ID }}" \ + -var="sample_app_zip=${{ env.SAMPLE_APP_ZIP }}" \ + -var="get_cw_agent_rpm_command=${{ env.GET_CW_AGENT_RPM_COMMAND }}" \ + -var="get_adot_wheel_command=${{ env.GET_ADOT_WHEEL_COMMAND }}" \ + || deployment_failed=$? + + if [ $deployment_failed -eq 1 ]; then + echo "Terraform deployment was unsuccessful. Will attempt to retry deployment." + fi + + # If the success is 1 then either the terraform deployment or the endpoint connection failed, so first destroy the + # resources created from terraform and try again. + if [ $deployment_failed -eq 1 ]; then + echo "Destroying terraform" + terraform destroy -auto-approve \ + -var="test_id=${{ env.TESTING_ID }}" + + retry_counter=$(($retry_counter+1)) + else + # If deployment succeeded, then exit the loop + break + fi + + if [ $retry_counter -eq $max_retry ]; then + echo "Max retry reached, failed to deploy terraform and connect to the endpoint. Exiting code" + exit 1 + fi + done - name: Get the sample app and EC2 instance information working-directory: terraform/python/ec2/asg diff --git a/.github/workflows/python-ec2-default-e2e-test.yml b/.github/workflows/python-ec2-default-e2e-test.yml index ad039fd04..48232893b 100644 --- a/.github/workflows/python-ec2-default-e2e-test.yml +++ b/.github/workflows/python-ec2-default-e2e-test.yml @@ -109,24 +109,53 @@ jobs: - name: Initiate Terraform uses: ./.github/workflows/actions/execute_and_retry with: - command: "terraform init && terraform validate" + command: "cd ${{ env.TEST_RESOURCES_FOLDER }}/terraform/python/ec2/default && terraform init && terraform validate" cleanup: "rm -rf .terraform && rm -rf .terraform.lock.hcl" max_retry: 6 sleep_time: 60 - working-directory: ./terraform/python/ec2/default - name: Deploy sample app via terraform and wait for endpoint to come online - uses: ./.github/workflows/actions/execute_and_retry - with: - command: 'terraform apply -auto-approve - -var="aws_region=${{ env.E2E_TEST_AWS_REGION }}" - -var="test_id=${{ env.TESTING_ID }}" - -var="sample_app_zip=${{ env.SAMPLE_APP_ZIP }}" - -var="get_cw_agent_rpm_command=${{ env.GET_CW_AGENT_RPM_COMMAND }}" - -var="get_adot_wheel_command=${{ env.GET_ADOT_WHEEL_COMMAND }}"' - cleanup: 'terraform destroy -auto-approve -var="test_id=${{ env.TESTING_ID }}' - max_retry: 2 - working-directory: ./terraform/python/ec2/default + working-directory: terraform/python/ec2/default + run: | + # Attempt to deploy the sample app on an EC2 instance and wait for its endpoint to come online. + # There may be occasional failures due to transitivity issues, so try up to 2 times. + # deployment_failed of 0 indicates that both the terraform deployment and the endpoint are running, while 1 indicates + # that it failed at some point + retry_counter=0 + max_retry=2 + while [ $retry_counter -lt $max_retry ]; do + echo "Attempt $retry_counter" + deployment_failed=0 + terraform apply -auto-approve \ + -var="aws_region=${{ env.E2E_TEST_AWS_REGION }}" \ + -var="test_id=${{ env.TESTING_ID }}" \ + -var="sample_app_zip=${{ env.SAMPLE_APP_ZIP }}" \ + -var="get_cw_agent_rpm_command=${{ env.GET_CW_AGENT_RPM_COMMAND }}" \ + -var="get_adot_wheel_command=${{ env.GET_ADOT_WHEEL_COMMAND }}" \ + || deployment_failed=$? + + if [ $deployment_failed -eq 1 ]; then + echo "Terraform deployment was unsuccessful. Will attempt to retry deployment." + fi + + # If the success is 1 then either the terraform deployment or the endpoint connection failed, so first destroy the + # resources created from terraform and try again. + if [ $deployment_failed -eq 1 ]; then + echo "Destroying terraform" + terraform destroy -auto-approve \ + -var="test_id=${{ env.TESTING_ID }}" + + retry_counter=$(($retry_counter+1)) + else + # If deployment succeeded, then exit the loop + break + fi + + if [ $retry_counter -eq $max_retry ]; then + echo "Max retry reached, failed to deploy terraform and connect to the endpoint. Exiting code" + exit 1 + fi + done - name: Get the ec2 instance ami id run: | diff --git a/.github/workflows/python-eks-e2e-test.yml b/.github/workflows/python-eks-e2e-test.yml index a9e9c9159..9f4c90896 100644 --- a/.github/workflows/python-eks-e2e-test.yml +++ b/.github/workflows/python-eks-e2e-test.yml @@ -198,63 +198,106 @@ jobs: - name: Initiate Terraform uses: ./.github/workflows/actions/execute_and_retry with: - command: "terraform init && terraform validate" + command: "cd ${{ env.TEST_RESOURCES_FOLDER }}/terraform/python/eks && terraform init && terraform validate" cleanup: "rm -rf .terraform && rm -rf .terraform.lock.hcl" max_retry: 6 sleep_time: 60 - working-directory: ./terraform/python/eks - name: Set Sample App Image run: | echo MAIN_SAMPLE_APP_IMAGE_ARN="${{ env.ACCOUNT_ID }}.dkr.ecr.${{ env.E2E_TEST_AWS_REGION }}.amazonaws.com/${{ env.PYTHON_MAIN_SAMPLE_APP_IMAGE }}" >> $GITHUB_ENV echo REMOTE_SAMPLE_APP_IMAGE_ARN="${{ env.ACCOUNT_ID }}.dkr.ecr.${{ env.E2E_TEST_AWS_REGION }}.amazonaws.com/${{ env.PYTHON_REMOTE_SAMPLE_APP_IMAGE }}" >> $GITHUB_ENV + + - name: Deploy sample app via terraform and wait for the endpoint to come online + id: deploy-python-app + working-directory: terraform/python/eks + run: | + # Attempt to deploy the sample app on an EKS instance and wait for its endpoint to come online. + # There may be occasional failures due to transitivity issues, so try up to 2 times. + # deployment_failed of 0 indicates that both the terraform deployment and the endpoint are running, while 1 indicates + # that it failed at some point + retry_counter=0 + max_retry=2 + while [ $retry_counter -lt $max_retry ]; do + echo "Attempt $retry_counter" + deployment_failed=0 + terraform apply -auto-approve \ + -var='test_id=${{ env.TESTING_ID }}' \ + -var='aws_region=${{ env.E2E_TEST_AWS_REGION }}' \ + -var='kube_directory_path=${{ github.workspace }}/.kube' \ + -var='eks_cluster_name=${{ env.CLUSTER_NAME }}' \ + -var="eks_cluster_context_name=$(kubectl config current-context)" \ + -var='test_namespace=${{ env.SAMPLE_APP_NAMESPACE }}' \ + -var='service_account_aws_access=service-account-${{ env.TESTING_ID }}' \ + -var='python_app_image=${{ env.MAIN_SAMPLE_APP_IMAGE_ARN }}' \ + -var='python_remote_app_image=${{ env.REMOTE_SAMPLE_APP_IMAGE_ARN }}' \ + -var='rds_mysql_cluster_endpoint=${{env.RDS_MYSQL_CLUSTER_ENDPOINT}}' \ + -var='rds_mysql_cluster_username=${{env.RDS_MYSQL_CLUSTER_SECRETS_USERNAME}}' \ + -var='rds_mysql_cluster_password=${{env.RDS_MYSQL_CLUSTER_SECRETS_PASSWORD}}' \ + -var='rds_mysql_cluster_database=information_schema' \ + -var='account_id=${{ env.ACCOUNT_ID }}' \ + || deployment_failed=$? - - name: Deploy sample app via terraform and wait for endpoint to come online - uses: ./.github/workflows/actions/execute_and_retry - with: - command: 'terraform apply -auto-approve - -var="test_id=${{ env.TESTING_ID }}" - -var="aws_region=${{ env.E2E_TEST_AWS_REGION }}" - -var="kube_directory_path=${{ github.workspace }}/.kube" - -var="eks_cluster_name=${{ env.CLUSTER_NAME }}" - -var="eks_cluster_context_name=$(kubectl config current-context)" - -var="test_namespace=${{ env.SAMPLE_APP_NAMESPACE }}" - -var="service_account_aws_access=service-account-${{ env.TESTING_ID }}" - -var="python_app_image=${{ env.MAIN_SAMPLE_APP_IMAGE_ARN }}" - -var="python_remote_app_image=${{ env.REMOTE_SAMPLE_APP_IMAGE_ARN }}" - -var="rds_mysql_cluster_endpoint=${{env.RDS_MYSQL_CLUSTER_ENDPOINT}}" - -var="rds_mysql_cluster_username=${{env.RDS_MYSQL_CLUSTER_SECRETS_USERNAME}}" - -var="rds_mysql_cluster_password=${{env.RDS_MYSQL_CLUSTER_SECRETS_PASSWORD}}" - -var="rds_mysql_cluster_database=information_schema" - -var="account_id=${{ env.ACCOUNT_ID }}"' - cleanup: 'terraform destroy -auto-approve - -var="test_id=${{ env.TESTING_ID }}" - -var="aws_region=${{ env.E2E_TEST_AWS_REGION }}" - -var="kube_directory_path=${{ github.workspace }}/.kube" - -var="eks_cluster_name=${{ env.CLUSTER_NAME }}" - -var="eks_cluster_context_name=$(kubectl config current-context)" - -var="test_namespace=${{ env.SAMPLE_APP_NAMESPACE }}" - -var="service_account_aws_access=service-account-${{ env.TESTING_ID }}" - -var="python_app_image=${{ env.MAIN_SAMPLE_APP_IMAGE_ARN }}" - -var="python_remote_app_image=${{ env.REMOTE_SAMPLE_APP_IMAGE_ARN }}"' - max_retry: 2 - working-directory: ./terraform/python/eks - - - name: Enable App Signals - uses: ./.github/workflows/actions/execute_and_retry - with: - command: './enable-app-signals.sh - ${{ env.CLUSTER_NAME }} - ${{ env.E2E_TEST_AWS_REGION }} - ${{ env.SAMPLE_APP_NAMESPACE }}' - cleanup: '${{ env.CLUSTER_NAME }} - ${{ env.E2E_TEST_AWS_REGION }} - ${{ env.SAMPLE_APP_NAMESPACE }} && - aws eks update-kubeconfig --name ${{ env.CLUSTER_NAME }} --region ${{ env.E2E_TEST_AWS_REGION }}"' - post_command: 'kubectl delete pods --all -n ${{ env.SAMPLE_APP_NAMESPACE }} && kubectl wait --for=condition=Ready --request-timeout "5m" pod --all -n ${{ env.SAMPLE_APP_NAMESPACE }}' - max_retry: 3 - sleep_time: 60 - working-directory: ./enablement-script + if [ $deployment_failed -eq 1 ]; then + echo "Terraform deployment was unsuccessful. Will attempt to retry deployment." + fi + + # If the deployment_failed is still 0, then the terraform deployment succeeded and now try to connect to the endpoint + # after installing Application Signals. Attempts to connect will be made for up to 10 minutes + if [ $deployment_failed -eq 0 ]; then + echo "Installing application signals to the sample app" + . ${{ env.TEST_RESOURCES_FOLDER }}/.github/workflows/util/execute_and_retry.sh + execute_and_retry 3 \ + "${{ env.TEST_RESOURCES_FOLDER }}/enablement-script/enable-app-signals.sh \ + ${{ inputs.test-cluster-name }} \ + ${{ env.E2E_TEST_AWS_REGION }} \ + ${{ env.SAMPLE_APP_NAMESPACE }}" \ + "${{ env.TEST_RESOURCES_FOLDER }}/enablement-script/clean-app-signals.sh \ + ${{ inputs.test-cluster-name }} \ + ${{ env.E2E_TEST_AWS_REGION }} \ + ${{ env.SAMPLE_APP_NAMESPACE }} && \ + aws eks update-kubeconfig --name ${{ inputs.test-cluster-name }} --region ${{ env.E2E_TEST_AWS_REGION }}" \ + 60 + + execute_and_retry 2 "kubectl delete pods --all -n ${{ env.SAMPLE_APP_NAMESPACE }}" "" 60 + execute_and_retry 2 "kubectl wait --for=condition=Ready --request-timeout '5m' pod --all -n ${{ env.SAMPLE_APP_NAMESPACE }}" "" 10 + fi + + # If the deployment_failed is 1 then either the terraform deployment or the endpoint connection failed, so first destroy the + # resources created from terraform and try again. + if [ $deployment_failed -eq 1 ]; then + echo "Cleaning up Application Signal" + ${{ env.TEST_RESOURCES_FOLDER }}/enablement-script/clean-app-signals.sh \ + ${{ env.CLUSTER_NAME }} \ + ${{ env.E2E_TEST_AWS_REGION }} \ + ${{ env.SAMPLE_APP_NAMESPACE }} + + # Running clean-app-signal.sh removes the current cluster from the config. Update the cluster again for subsequent runs. + aws eks update-kubeconfig --name ${{ inputs.test-cluster-name }} --region ${{ env.E2E_TEST_AWS_REGION }} + + echo "Destroying terraform" + terraform destroy -auto-approve \ + -var='test_id=${{ env.TESTING_ID }}' \ + -var='aws_region=${{ env.E2E_TEST_AWS_REGION }}' \ + -var='kube_directory_path=${{ github.workspace }}/.kube' \ + -var='eks_cluster_name=${{ env.CLUSTER_NAME }}' \ + -var="eks_cluster_context_name=$(kubectl config current-context)" \ + -var='test_namespace=${{ env.SAMPLE_APP_NAMESPACE }}' \ + -var='service_account_aws_access=service-account-${{ env.TESTING_ID }}' \ + -var='python_app_image=${{ env.MAIN_SAMPLE_APP_IMAGE_ARN }}' \ + -var='python_remote_app_image=${{ env.REMOTE_SAMPLE_APP_IMAGE_ARN }}' + + retry_counter=$(($retry_counter+1)) + else + # If deployment succeeded, then exit the loop + break + fi + + if [ $retry_counter -ge $max_retry ]; then + echo "Max retry reached, failed to deploy terraform and connect to the endpoint. Exiting code" + exit 1 + fi + done - name: Get ECR to Patch run: |