From 22269d552166def1d8a53775b974aa5258c97afa Mon Sep 17 00:00:00 2001
From: Chi Zhang <chizhg@google.com>
Date: Thu, 17 Sep 2020 19:12:44 -0700
Subject: [PATCH] Simplify the wait_until_xx bash functions (#2440)

* Use a single `kubectl wait` command for the wait_until_pods_running function

* Use a single `kubectl wait` command for the wait_until_batch_job_complete function

* Also wait for all the job pods to be completed.
---
 scripts/library.sh | 78 +++++++++-------------------------------------
 1 file changed, 14 insertions(+), 64 deletions(-)

diff --git a/scripts/library.sh b/scripts/library.sh
index 92fccb17a1a..e7ecbc9bed5 100755
--- a/scripts/library.sh
+++ b/scripts/library.sh
@@ -69,12 +69,12 @@ function abort() {
 # Parameters: $1 - character to use for the box.
 #             $2 - banner message.
 function make_banner() {
-    local msg="$1$1$1$1 $2 $1$1$1$1"
-    local border="${msg//[-0-9A-Za-z _.,\/()\']/$1}"
-    echo -e "${border}\n${msg}\n${border}"
-    # TODO(adrcunha): Remove once logs have timestamps on Prow
-    # For details, see https://github.com/kubernetes/test-infra/issues/10100
-    echo -e "$1$1$1$1 $(TZ='America/Los_Angeles' date)\n${border}"
+  local msg="$1$1$1$1 $2 $1$1$1$1"
+  local border="${msg//[-0-9A-Za-z _.,\/()\']/$1}"
+  echo -e "${border}\n${msg}\n${border}"
+  # TODO(adrcunha): Remove once logs have timestamps on Prow
+  # For details, see https://github.com/kubernetes/test-infra/issues/10100
+  echo -e "$1$1$1$1 $(TZ='America/Los_Angeles' date)\n${border}"
 }
 
 # Simple header for logging purposes.
@@ -127,70 +127,20 @@ function wait_until_object_does_not_exist() {
 # Waits until all pods are running in the given namespace.
 # Parameters: $1 - namespace.
 function wait_until_pods_running() {
-  echo -n "Waiting until all pods in namespace $1 are up"
-  local failed_pod=""
-  for i in {1..150}; do  # timeout after 5 minutes
-    # List all pods. Ignore Terminating pods as those have either been replaced through
-    # a deployment or terminated on purpose (through chaosduck for example).
-    local pods="$(kubectl get pods --no-headers -n $1 2>/dev/null | grep -v Terminating)"
-    # All pods must be running (ignore ImagePull error to allow the pod to retry)
-    local not_running_pods=$(echo "${pods}" | grep -v Running | grep -v Completed | grep -v ErrImagePull | grep -v ImagePullBackOff)
-    if [[ -n "${pods}" ]] && [[ -z "${not_running_pods}" ]]; then
-      # All Pods are running or completed. Verify the containers on each Pod.
-      local all_ready=1
-      while read pod ; do
-        local status=(`echo -n ${pod} | cut -f2 -d' ' | tr '/' ' '`)
-        # Set this Pod as the failed_pod. If nothing is wrong with it, then after the checks, set
-        # failed_pod to the empty string.
-        failed_pod=$(echo -n "${pod}" | cut -f1 -d' ')
-        # All containers must be ready
-        [[ -z ${status[0]} ]] && all_ready=0 && break
-        [[ -z ${status[1]} ]] && all_ready=0 && break
-        [[ ${status[0]} -lt 1 ]] && all_ready=0 && break
-        [[ ${status[1]} -lt 1 ]] && all_ready=0 && break
-        [[ ${status[0]} -ne ${status[1]} ]] && all_ready=0 && break
-        # All the tests passed, this is not a failed pod.
-        failed_pod=""
-      done <<< "$(echo "${pods}" | grep -v Completed)"
-      if (( all_ready )); then
-        echo -e "\nAll pods are up:\n${pods}"
-        return 0
-      fi
-    elif [[ -n "${not_running_pods}" ]]; then
-      # At least one Pod is not running, just save the first one's name as the failed_pod.
-      failed_pod="$(echo "${not_running_pods}" | head -n 1 | cut -f1 -d' ')"
-    fi
-    echo -n "."
-    sleep 2
-  done
-  echo -e "\n\nERROR: timeout waiting for pods to come up\n${pods}"
-  if [[ -n "${failed_pod}" ]]; then
-    echo -e "\n\nFailed Pod (data in YAML format) - ${failed_pod}\n"
-    kubectl -n $1 get pods "${failed_pod}" -oyaml
-    echo -e "\n\nPod Logs\n"
-    kubectl -n $1 logs "${failed_pod}" --all-containers
+  echo "Waiting until all pods in namespace $1 are up"
+  kubectl wait pod --for=condition=Ready -n "$1" -l '!job-name' --timeout=5m || return 1
+  # Pods carrying the job-name label are skipped by the selector above, so also
+  # wait for every job to complete; the old loop accepted Completed pods as done.
+  if [[ $(kubectl get jobs --ignore-not-found=true -n "$1") ]]; then
+    kubectl wait job --for=condition=Complete --all -n "$1" --timeout=5m || return 1
   fi
-  return 1
 }
 
 # Waits until all batch jobs complete in the given namespace.
 # Parameters: $1 - namespace.
 function wait_until_batch_job_complete() {
-  echo -n "Waiting until all batch jobs in namespace $1 run to completion."
-  for i in {1..150}; do  # timeout after 5 minutes
-    local jobs=$(kubectl get jobs -n $1 --no-headers \
-                 -ocustom-columns='n:{.metadata.name},c:{.spec.completions},s:{.status.succeeded}')
-    # All jobs must be complete
-    local not_complete=$(echo "${jobs}" | awk '{if ($2!=$3) print $0}' | wc -l)
-    if [[ ${not_complete} -eq 0 ]]; then
-      echo -e "\nAll jobs are complete:\n${jobs}"
-      return 0
-    fi
-    echo -n "."
-    sleep 2
-  done
-  echo -e "\n\nERROR: timeout waiting for jobs to complete\n${jobs}"
-  return 1
+  echo "Waiting until all batch jobs in namespace $1 run to completion."
+  kubectl wait job --for=condition=Complete --all -n "$1" --timeout=5m || return 1
 }
 
 # Waits until the given service has an external address (IP/hostname).