From 21ee0139974160034940afc8de43521a20b8b1b7 Mon Sep 17 00:00:00 2001
From: Tim Cheung <152907271+timckt@users.noreply.github.com>
Date: Wed, 20 Mar 2024 17:07:45 +0000
Subject: [PATCH 1/3] update recycle node page

---
 runbooks/source/recycle-all-nodes.html.md.erb | 70 ++++++++++---------
 1 file changed, 36 insertions(+), 34 deletions(-)

diff --git a/runbooks/source/recycle-all-nodes.html.md.erb b/runbooks/source/recycle-all-nodes.html.md.erb
index f7e60259..cf47e8b6 100644
--- a/runbooks/source/recycle-all-nodes.html.md.erb
+++ b/runbooks/source/recycle-all-nodes.html.md.erb
@@ -1,8 +1,8 @@
 ---
 title: Recycling all the nodes in a cluster
 weight: 255
-last_reviewed_on: 2024-01-09
-review_in: 3 months
+last_reviewed_on: 2024-03-20
+review_in: 6 months
 ---
 
 # Recycle-all-nodes
@@ -46,56 +46,58 @@ This will eventually cause the update to stop, wait and then exit, leaving the r
 AWS EKS in some circumstances has even reverted the update and started to drain the new nodes and replace them with the old update template again.
 So it's important to monitor how the update is going and act when nodes get stuck.
 
-To resolve the issue, evict the offending pod by running the below script:
+To resolve the issue:
 
-```watch -n 300 ./delete-pods-in-namespace.sh '/aws/eks/<cluster_name>/cluster' > deleted_pods.log```
+1. Copy below script save it as `delete-pods-in-namespace.sh`.
 
-and `tail -f delete_pods.log` in another terminal. The `<cluster_name>` is the short name of the cluster e.g. `cp-2901-1531`
+    ```
+    #!/bin/bash
 
-```
-#!/bin/bash
+    delete_pods() {
+      NAMESPACE=$(echo "$1" | sed -E 's/\/api\/v1\/namespaces\/(.*)\/pods\/.*/\1/')
+      POD=$(echo "$1" | sed -E 's/.*\/pods\/(.*)\/eviction/\1/')
 
-delete_pods() {
-  NAMESPACE=$(echo "$1" | sed -E 's/\/api\/v1\/namespaces\/(.*)\/pods\/.*/\1/')
-  POD=$(echo "$1" | sed -E 's/.*\/pods\/(.*)\/eviction/\1/')
+      echo $NAMESPACE
 
-  echo $NAMESPACE
+      echo $POD
 
-  echo $POD
+      kubectl delete pod -n $NAMESPACE $POD
+    }
 
-  kubectl delete pod -n $NAMESPACE $POD
-}
+    export -f delete_pods
 
-export -f delete_pods
+    TIME_NOW_EPOCH=$(date +%s)
 
-TIME_NOW_EPOCH=$(date +%s)
+    START_TIME=$(($TIME_NOW_EPOCH - 180))
 
-START_TIME=$(($TIME_NOW_EPOCH - 180))
+    CLUSTER_LOG_GROUP=$1
 
-CLUSTER_LOG_GROUP=$1
+    QUERY_ID=$(aws logs start-query \
+    --start-time $START_TIME \
+    --end-time $TIME_NOW_EPOCH \
+    --log-group-name $CLUSTER_LOG_GROUP \
+    --query-string 'fields @timestamp, @message | filter @logStream like "kube-apiserver-audit" | filter ispresent(requestURI) | filter objectRef.subresource = "eviction" | filter responseObject.status = "Failure" | display @logStream, requestURI, responseObject.message | stats count(*) as retry by requestURI, requestObject.message' \
+    | jq -r '.queryId' )
 
-QUERY_ID=$(aws logs start-query \
-  --start-time $START_TIME \
-  --end-time $TIME_NOW_EPOCH \
-  --log-group-name $CLUSTER_LOG_GROUP \
-  --query-string 'fields @timestamp, @message | filter @logStream like "kube-apiserver-audit" | filter ispresent(requestURI) | filter objectRef.subresource = "eviction" | filter responseObject.status = "Failure" | display @logStream, requestURI, responseObject.message | stats count(*) as retry by requestURI, requestObject.message' \
-  | jq -r '.queryId' )
+    sleep 2
 
-sleep 2
+    RESULTS=$(aws logs get-query-results --query-id $QUERY_ID)
 
-RESULTS=$(aws logs get-query-results --query-id $QUERY_ID)
+    echo -n $RESULTS | jq '.results[]' | grep '/api/v1' | awk '{ print $2 }' | xargs -I {} bash -c 'delete_pods {}'
 
-echo -n $RESULTS | jq '.results[]' | grep '/api/v1' | awk '{ print $2 }' | xargs -I {} bash -c 'delete_pods {}'
+    exit 0
 
-exit 0
+    ```
 
-```
-Run the script with the command editing the cluster log group
+2. Run `chmod +x delete-pods-in-namespace.sh`.
 
-```
-watch -n 180 ./delete-pods-in-namespace.sh deleted_pods.log
-```
-and `tail -f delete_pods.log` in another terminal.
+3. Evict the offending pod by running the below script:
 
+    ```
+    watch -n 300 ./delete-pods-in-namespace.sh '/aws/eks/<cluster_name>/cluster' > deleted_pods.log
+    ```
+    The `<cluster_name>` is the short name of the cluster e.g. `cp-2901-1531`
+
+4. Run `tail -f deleted_pods.log` in another terminal.
 
 If you want to find the offending pod manually, follow these steps:

From 7ab2278e0a9221742ddce6e92188a3dd93702128 Mon Sep 17 00:00:00 2001
From: Tim Cheung <152907271+timckt@users.noreply.github.com>
Date: Wed, 20 Mar 2024 17:13:55 +0000
Subject: [PATCH 2/3] update typo

---
 runbooks/source/recycle-all-nodes.html.md.erb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runbooks/source/recycle-all-nodes.html.md.erb b/runbooks/source/recycle-all-nodes.html.md.erb
index cf47e8b6..42ea5df3 100644
--- a/runbooks/source/recycle-all-nodes.html.md.erb
+++ b/runbooks/source/recycle-all-nodes.html.md.erb
@@ -48,7 +48,7 @@ AWS EKS in some circumstances has even reverted the update and started to drain
 
 To resolve the issue:
 
-1. Copy below script save it as `delete-pods-in-namespace.sh`.
+1. Copy below script and save it as `delete-pods-in-namespace.sh`.
 
     ```
     #!/bin/bash

From a85dea7682750da917919835dad527c17ef75d46 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Wed, 20 Mar 2024 17:16:00 +0000
Subject: [PATCH 3/3] Commit changes made by code formatters

---
 runbooks/source/recycle-all-nodes.html.md.erb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/runbooks/source/recycle-all-nodes.html.md.erb b/runbooks/source/recycle-all-nodes.html.md.erb
index 42ea5df3..d0a2ae67 100644
--- a/runbooks/source/recycle-all-nodes.html.md.erb
+++ b/runbooks/source/recycle-all-nodes.html.md.erb
@@ -94,10 +94,10 @@ To resolve the issue:
 
     ```
     watch -n 300 ./delete-pods-in-namespace.sh '/aws/eks/<cluster_name>/cluster' > deleted_pods.log
-    ``` 
+    ```
    The `<cluster_name>` is the short name of the cluster e.g. `cp-2901-1531`
 
-4. Run `tail -f deleted_pods.log` in another terminal. 
+4. Run `tail -f deleted_pods.log` in another terminal.
 
 If you want to find the offending pod manually, follow these steps:
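
The core step in the `delete-pods-in-namespace.sh` script these patches document is pulling the namespace and pod name out of the failed-eviction `requestURI` returned by the CloudWatch Logs Insights query. A minimal sketch of that parsing step is below; the requestURI, namespace and pod name are invented values for illustration only, not output from a real cluster.

```
#!/bin/bash
# Hypothetical eviction requestURI of the shape the Logs Insights query surfaces;
# the namespace and pod name are made up for illustration.
REQUEST_URI="/api/v1/namespaces/example-namespace/pods/example-pod-7f6b9c/eviction"

# Same sed expressions as in delete-pods-in-namespace.sh: reduce the URI to the
# namespace and to the pod name respectively.
NAMESPACE=$(echo "$REQUEST_URI" | sed -E 's/\/api\/v1\/namespaces\/(.*)\/pods\/.*/\1/')
POD=$(echo "$REQUEST_URI" | sed -E 's/.*\/pods\/(.*)\/eviction/\1/')

echo "$NAMESPACE"   # -> example-namespace
echo "$POD"         # -> example-pod-7f6b9c

# The runbook script then removes the stuck pod with:
#   kubectl delete pod -n "$NAMESPACE" "$POD"
```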