forked from aws/amazon-cloudwatch-agent-operator
-
Notifications
You must be signed in to change notification settings - Fork 0
263 lines (231 loc) · 11 KB
/
apm-e2e-test.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
# This is a reusable workflow for running the E2E test for APM.
# It is meant to be called from another workflow.
# Read more about reusable workflows: https://docs.github.com/en/actions/using-workflows/reusing-workflows#overview
name: APM Enablement E2E Testing

# Triggered only through `workflow_call`, i.e. when another workflow invokes
# this one as a reusable workflow.
on:
  workflow_call:
    inputs:
      # Name of the EKS cluster the steps below run eksctl/terraform against.
      test-cluster-name:
        required: true
        type: string
      # NOTE(review): required by the interface, but not referenced by any
      # step visible in this file.
      apm-cwagent-operator-image-name:
        required: true
        type: string
# Permissions requested for the GITHUB_TOKEN of this workflow.
permissions:
  # Presumably required so the "Configure AWS Credentials" step can obtain an
  # OIDC token for its role-to-assume - confirm.
  id-token: write
  contents: read
# Workflow-level environment variables shared by every step.
env:
  AWS_DEFAULT_REGION: us-east-1
  # Quoted so the all-digit account ID is a YAML string rather than an integer.
  # Only the commented-out validation steps reference it in this file.
  TEST_ACCOUNT: '417921506511'
  SAMPLE_APP_NAMESPACE: sample-app-namespace
  # Images handed to terraform for the sample application.
  SAMPLE_APP_FRONTEND_SERVICE_IMAGE: 417921506511.dkr.ecr.us-east-1.amazonaws.com/aws-apm-test/aws-apm-java-springboot:latest
  SAMPLE_APP_REMOTE_SERVICE_IMAGE: 417921506511.dkr.ecr.us-east-1.amazonaws.com/aws-apm-test/aws-apm-java-springboot-remote-service:latest
  # CW_AGENT_OPERATOR_IMAGE is the string searched for in apm.yaml;
  # CW_AGENT_OPERATOR_PRE_IMAGE is what the sed step replaces it with.
  CW_AGENT_OPERATOR_IMAGE: 506463145083.dkr.ecr.us-west-2.amazonaws.com/cwagent-operator-release:latest
  CW_AGENT_OPERATOR_PRE_IMAGE: 506463145083.dkr.ecr.us-west-2.amazonaws.com/cwagent-operator-pre-release:latest
jobs:
  # Single job: provisions the sample app, enables APM, then cleans up.
  apm-e2e-test:
    runs-on: ubuntu-latest
    steps:
# Check out the repository; fetch-depth 0 requests the full history instead
# of a shallow clone.
- uses: actions/checkout@v3
  with:
    fetch-depth: 0
# TESTING_ID = "<run id>-<run number>"; later steps use it to suffix the
# service account, IAM role and terraform test_id of this run.
- name: Generate testing id
  run: |
    echo "TESTING_ID=${{ github.run_id }}-${{ github.run_number }}" >> "$GITHUB_ENV"
# Assume the E2E test role in the workflow's default region.
- name: Configure AWS Credentials
  uses: aws-actions/configure-aws-credentials@v2
  with:
    aws-region: ${{ env.AWS_DEFAULT_REGION }}
    role-to-assume: arn:aws:iam::417921506511:role/pulse-e2e-test-gh-role
# Local directory inside the workspace that stores the Kubernetes config
- name: Create kubeconfig directory
  run: |
    mkdir -p ${{ github.workspace }}/.kube
# Export KUBECONFIG for all following steps so kubectl reads the config file
# in the directory created above.
- name: Set KUBECONFIG environment variable
  run: |
    echo "KUBECONFIG=${{ github.workspace }}/.kube/config" >> "$GITHUB_ENV"
# Download the latest eksctl release, unpack it into the workspace and put
# that directory on PATH for the following steps.
- name: Install eksctl
  run: |
    eksctl_dir="${{ github.workspace }}/eksctl"
    archive="eksctl_Linux_amd64.tar.gz"
    mkdir "${eksctl_dir}"
    curl -sLO "https://github.com/weaveworks/eksctl/releases/latest/download/${archive}"
    tar -xzf "${archive}" -C "${eksctl_dir}" && rm "${archive}"
    echo "${eksctl_dir}" >> "$GITHUB_PATH"
# The sample app needs a K8s service account that lets its pods call AWS
# services (IAM roles for service accounts):
# https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts.html
#
# The account has to exist before the sample app is launched, because the
# sample app manifest expects it to be created upfront. Associating the
# cluster with an IAM OIDC provider is the first part of that setup.
- name: Enable OIDC for cluster
  run: |
    eksctl utils associate-iam-oidc-provider \
      --cluster ${{ inputs.test-cluster-name }} \
      --region ${{ env.AWS_DEFAULT_REGION }} \
      --approve
# Create the per-run service account and an IAM role with S3 read-only
# access; both names carry TESTING_ID.
- name: Create role for AWS access from the sample app
  id: create_service_account
  run: |
    eksctl create iamserviceaccount \
      --cluster ${{ inputs.test-cluster-name }} \
      --region ${{ env.AWS_DEFAULT_REGION }} \
      --namespace ${{ env.SAMPLE_APP_NAMESPACE }} \
      --name service-account-${{ env.TESTING_ID }} \
      --role-name eks-s3-access-${{ env.TESTING_ID }} \
      --attach-policy-arn arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess \
      --approve
# Install terraform without the action's wrapper script, so `terraform output`
# in a later step yields the plain CLI output.
- name: Set up terraform
  uses: hashicorp/setup-terraform@v2
  with:
    terraform_wrapper: false
# Provision the sample application (frontend + remote service) on the test
# cluster from the terraform configuration in test/terraform/eks.
- name: Deploy sample app via terraform
  working-directory: test/terraform/eks
  run: |
    terraform init
    terraform validate
    terraform apply -auto-approve \
      -var="eks_cluster_name=${{ inputs.test-cluster-name }}" \
      -var="kube_directory_path=${{ github.workspace }}/.kube" \
      -var="test_id=${{ env.TESTING_ID }}" \
      -var="test_namespace=${{ env.SAMPLE_APP_NAMESPACE }}" \
      -var="service_account_aws_access=service-account-${{ env.TESTING_ID }}" \
      -var="sample_app_image=${{ env.SAMPLE_APP_FRONTEND_SERVICE_IMAGE }}" \
      -var="sample_remote_app_image=${{ env.SAMPLE_APP_REMOTE_SERVICE_IMAGE }}"
# Enable APM on the test cluster
# First fetch the onboarding bundle; `unzip -j` drops the archive's directory
# structure so its files land directly in the working directory.
- name: Pull and unzip enablement script from S3
  run: |
    aws s3 cp s3://apm-private-beta/onboarding.zip . && unzip -j onboarding.zip
# Disabled alternative that sets the image with yq:
# - name: Set the CW Agent Operator image in the manifest
#   run: yq -i '.spec.template.spec.containers.[].image = "506463145083.dkr.ecr.us-west-2.amazonaws.com/cwagent-operator-release"' apm.yaml
#
# Replace every occurrence of the release operator image in apm.yaml with the
# pre-release image.
- name: Set the CW Agent Operator image in the manifest
  run: |
    sed -i 's#${{ env.CW_AGENT_OPERATOR_IMAGE }}#${{ env.CW_AGENT_OPERATOR_PRE_IMAGE }}#g' apm.yaml
# Print apm.yaml to the job log (runs right after the image substitution).
- name: Test
  run: |
    cat apm.yaml
# Rewrite the sampler value in instrumentation.yaml from
# parentbased_traceidratio to always_on.
- name: Change OTEL_TRACES_SAMPLER to always_on
  run: |
    sed -i 's#parentbased_traceidratio#always_on#g' instrumentation.yaml
# Print the final manifest, then run the enablement script with
# <cluster> <region> <namespace> as positional arguments.
- name: Enable APM
  run: |
    cat apm.yaml
    ./enable-apm.sh \
      ${{ inputs.test-cluster-name }} \
      ${{ env.AWS_DEFAULT_REGION }} \
      ${{ env.SAMPLE_APP_NAMESPACE }}
# The APM instrumentation only takes effect on pods started after enablement,
# so delete every pod in the sample namespace to force a restart.
- name: Restart the app pods
  run: |
    kubectl delete pods --all --namespace ${{ env.SAMPLE_APP_NAMESPACE }}
# Block until every pod in the sample namespace reports Ready.
# No --timeout is passed, so kubectl's default wait timeout applies.
- name: Wait for sample app pods to come up
  run: |
    kubectl wait --for=condition=Ready pod --all -n ${{ env.SAMPLE_APP_NAMESPACE }}
# Print the image ID of the first init container of the first pod in the
# sample namespace. This only logs the value; nothing is asserted.
- name: Verify pod ADOT image
  run: |
    kubectl get pods --namespace ${{ env.SAMPLE_APP_NAMESPACE }} -o json |
      jq '.items[0].status.initContainerStatuses[0].imageID'
# Print the image ID of the first container of the first pod in the
# amazon-cloudwatch namespace. This only logs the value; nothing is asserted.
- name: Verify pod CWAgent image
  run: |
    kubectl get pods --namespace amazon-cloudwatch -o json |
      jq '.items[0].status.containerStatuses[0].imageID'
# Export the terraform output `sample_app_endpoint` as APP_ENDPOINT.
# `-raw` prints the bare string; without it terraform wraps string outputs in
# double quotes, and those quotes would become part of APP_ENDPOINT.
- name: Get the sample app endpoint
  working-directory: test/terraform/eks
  run: |
    echo "APP_ENDPOINT=$(terraform output -raw sample_app_endpoint)" >> "$GITHUB_ENV"
# Poll the endpoint with HEAD requests every 10 seconds until one succeeds;
# fail the step once 12 retries have been used up.
- name: Wait for app endpoint to come online
  run: |
    tries=0
    tries_limit=12
    until curl --output /dev/null --silent --head --fail http://${{ env.APP_ENDPOINT }}; do
      if [ "${tries}" -eq "${tries_limit}" ]; then
        echo "Max attempts reached"
        exit 1
      fi
      printf '.'
      tries=$((tries + 1))
      sleep 10
    done
#
# # Validation for pulse telemetry data
# - name: Call endpoint and validate generated EMF logs
# id: log-validation
# working-directory: test/validator
# run: ./gradlew run --args='-c log-validation.yml
# --testing-id ${{ env.TESTING_ID }}
# --endpoint http://${{ env.APP_ENDPOINT }}
# --region ${{ env.AWS_DEFAULT_REGION }}
# --account-id ${{ env.TEST_ACCOUNT }}
# --metric-namespace AWS/APM
# --app-namespace ${{ env.SAMPLE_APP_NAMESPACE }}
# --cluster ${{ inputs.test-cluster-name }}
# --service-name sample-application-${{ env.TESTING_ID }}
# --rollup'
#
# - name: Call endpoints and validate generated metrics
# id: metric-validation
# if: success() || steps.log-validation.outcome == 'failure'
# working-directory: test/validator
# run: ./gradlew run --args='-c metric-validation.yml
# --endpoint http://${{ env.APP_ENDPOINT }}
# --region ${{ env.AWS_DEFAULT_REGION }}
# --account-id ${{ env.TEST_ACCOUNT }}
# --metric-namespace AWS/APM
# --app-namespace ${{ env.SAMPLE_APP_NAMESPACE }}
# --cluster ${{ inputs.test-cluster-name }}
# --service-name sample-application-${{ env.TESTING_ID }}
# --remote-service-name sample-remote-application-${{ env.TESTING_ID }}
# --rollup'
#
# - name: Call endpoints and validate generated traces
# if: success() || steps.log-validation.outcome == 'failure' || steps.metric-validation.outcome == 'failure'
# working-directory: test/validator
# run: ./gradlew run --args='-c trace-validation.yml
# --endpoint http://${{ env.APP_ENDPOINT }}
# --region ${{ env.AWS_DEFAULT_REGION }}
# --account-id ${{ env.TEST_ACCOUNT }}
# --metric-namespace AWS/APM
# --app-namespace ${{ env.SAMPLE_APP_NAMESPACE }}
# --cluster ${{ inputs.test-cluster-name }}
# --service-name sample-application-${{ env.TESTING_ID }}
# --remote-service-name sample-remote-application-${{ env.TESTING_ID }}
# --rollup'
# Clean up Procedures
# Blank out the log-group deletion command inside clean-apm.sh before that
# script runs. Presumably the /aws/apm/eks log group is kept because a later
# step prunes only its old log streams - confirm.
- name: Remove log group deletion command
  if: always()
  run: |
    # \$REGION keeps the literal text "$REGION", matching the script's own
    # variable reference rather than expanding it here.
    log_group_cmd="aws logs delete-log-group --log-group-name '/aws/apm/eks' --region \$REGION"
    sed -i "s#${log_group_cmd}##g" clean-apm.sh
# Run the cleanup script with <cluster> <region> <namespace>, the same
# positional arguments passed to enable-apm.sh. Runs even if earlier steps
# failed.
- name: Clean APM
  if: always()
  run: |
    ./clean-apm.sh \
      ${{ inputs.test-cluster-name }} \
      ${{ env.AWS_DEFAULT_REGION }} \
      ${{ env.SAMPLE_APP_NAMESPACE }}
# Find the log streams in /aws/apm/eks whose last event is more than a week
# old and delete them.
- name: Delete outdated log streams
  if: always()
  run: |
    # Cutoff = now minus 7 days, converted from seconds to milliseconds.
    cutoff_s=$(date -d "7 days ago" +%s)
    cutoff_ms=$((cutoff_s * 1000))
    # JMESPath filter; the backticks delimit the numeric literal.
    query='logStreams[?lastEventTimestamp<`'"${cutoff_ms}"'`].logStreamName'
    # Quoting the query stops the shell from globbing on its [ ] ? characters.
    old_logstreams=$(aws logs describe-log-streams \
      --log-group-name /aws/apm/eks \
      --region ${{ env.AWS_DEFAULT_REGION }} \
      --query "${query}" \
      --output text)
    # Text output separates names with tabs and can span several lines, so
    # put one name per line and read them all instead of only the first line.
    while IFS= read -r logstream; do
      [ -n "${logstream}" ] || continue
      echo "Deleting log stream ${logstream}"
      # stdin is redirected so the CLI cannot consume the loop's input.
      aws logs delete-log-stream \
        --log-group-name /aws/apm/eks \
        --log-stream-name "${logstream}" \
        --region ${{ env.AWS_DEFAULT_REGION }} < /dev/null
    done < <(tr '\t' '\n' <<< "${old_logstreams}")
# This step also deletes lingering resources from previous test runs.
# Like the other cleanup steps it runs even when an earlier step failed, so a
# failed run does not leave the namespace behind. --ignore-not-found keeps the
# step green when the namespace was never created.
- name: Delete all sample app resources
  if: always()
  run: |
    kubectl delete namespace ${{ env.SAMPLE_APP_NAMESPACE }} --ignore-not-found
# Tear down what "Deploy sample app via terraform" created. The variables
# mirror the apply step exactly: an undefined env reference would expand to an
# empty string, and a variable left unset could make the non-interactive
# destroy fail, which continue-on-error would then hide.
- name: Terraform destroy
  if: always()
  continue-on-error: true
  working-directory: test/terraform/eks
  run: |
    terraform destroy -auto-approve \
      -var="test_id=${{ env.TESTING_ID }}" \
      -var="kube_directory_path=${{ github.workspace }}/.kube" \
      -var="eks_cluster_name=${{ inputs.test-cluster-name }}" \
      -var="test_namespace=${{ env.SAMPLE_APP_NAMESPACE }}" \
      -var="service_account_aws_access=service-account-${{ env.TESTING_ID }}" \
      -var="sample_app_image=${{ env.SAMPLE_APP_FRONTEND_SERVICE_IMAGE }}" \
      -var="sample_remote_app_image=${{ env.SAMPLE_APP_REMOTE_SERVICE_IMAGE }}"
# Delete the per-run service account created by "Create role for AWS access
# from the sample app". Best effort: a failure here does not fail the job.
- name: Remove aws access service account
  if: always()
  continue-on-error: true
  run: |
    eksctl delete iamserviceaccount \
      --name service-account-${{ env.TESTING_ID }} \
      --namespace ${{ env.SAMPLE_APP_NAMESPACE }} \
      --cluster ${{ inputs.test-cluster-name }} \
      --region ${{ env.AWS_DEFAULT_REGION }}