From 656a1fda768376b69c496127544b93458013dc21 Mon Sep 17 00:00:00 2001 From: Le Zhang Date: Mon, 13 Nov 2023 10:19:21 -0500 Subject: [PATCH] Issue open-horizon#3944 - Bug: agent-install.sh breaks on some k8s environments when looking for HZN_NAMESPACE_SCOPED env variable Signed-off-by: Le Zhang --- agent-install/agent-install.sh | 6 ++-- agent-install/agent-uninstall.sh | 31 ++++++++++++------- .../k8s/auto-upgrade-cronjob-template.yml | 2 ++ 3 files changed, 25 insertions(+), 14 deletions(-) diff --git a/agent-install/agent-install.sh b/agent-install/agent-install.sh index 30ddab7eb..3fbd40f77 100755 --- a/agent-install/agent-install.sh +++ b/agent-install/agent-install.sh @@ -3537,7 +3537,7 @@ function check_cluster_agent_scope() { IFS="," read -ra namespace_array <<< "$namespaces_have_agent" namespace_to_check=${namespace_array[0]} - local namespace_scoped_env_value_in_use=$($KUBECTL get deployment agent -n ${namespace_to_check} -o jsonpath='{.spec.template.spec.containers[0].env}' | jq -r '.[] | select(.name=="HZN_NAMESPACE_SCOPED").value') + local namespace_scoped_env_value_in_use=$($KUBECTL get deployment agent -n ${namespace_to_check} -o json | jq '.spec.template.spec.containers[0].env' | jq -r '.[] | select(.name=="HZN_NAMESPACE_SCOPED").value') log_debug "Current HZN_NAMESPACE_SCOPED in agent deployment under namespace $namespace_to_check is: $namespace_scoped_env_value_in_use" log_debug "NAMESPACE_SCOPED passed to this script is: $NAMESPACE_SCOPED" # namespace scoped @@ -3574,7 +3574,7 @@ function check_agent_deployment_exist() { log_fatal 3 "Previous agent pod in not in RUNNING status, please run agent-uninstall.sh to clean up and re-run the agent-install.sh" else # check 0) agent scope in deployment - local namespace_scoped_env_value_in_use=$($KUBECTL get deployment agent -n ${AGENT_NAMESPACE} -o jsonpath='{.spec.template.spec.containers[0].env}' | jq -r '.[] | select(.name=="HZN_NAMESPACE_SCOPED").value') + local 
namespace_scoped_env_value_in_use=$($KUBECTL get deployment agent -n ${AGENT_NAMESPACE} -o json | jq '.spec.template.spec.containers[0].env' | jq -r '.[] | select(.name=="HZN_NAMESPACE_SCOPED").value') log_debug "Current HZN_NAMESPACE_SCOPED in agent deployment is $namespace_scoped_env_value_in_use" log_debug "NAMESPACE_SCOPED passed to this script is: $NAMESPACE_SCOPED" @@ -3637,7 +3637,7 @@ function check_agent_deployment_exist() { fi # check 3) HZN_ORG_ID set in deployment - local horizon_org_id_env_value_in_use=$($KUBECTL get deployment agent -n ${AGENT_NAMESPACE} -o jsonpath='{.spec.template.spec.containers[0].env}' | jq -r '.[] | select(.name=="HZN_ORG_ID").value') + local horizon_org_id_env_value_in_use=$($KUBECTL get deployment agent -n ${AGENT_NAMESPACE} -o json | jq '.spec.template.spec.containers[0].env' | jq -r '.[] | select(.name=="HZN_ORG_ID").value') log_debug "Current HZN_ORG_ID in agent deployment is: $horizon_org_id_env_value_in_use" log_debug "HZN_ORG_ID passed to this script is: $HZN_ORG_ID" diff --git a/agent-install/agent-uninstall.sh b/agent-install/agent-uninstall.sh index 0b14ec975..f109ae668 100644 --- a/agent-install/agent-uninstall.sh +++ b/agent-install/agent-uninstall.sh @@ -198,7 +198,7 @@ function validate_positive_int() { function get_agent_pod_id() { log_debug "get_agent_pod_id() begin" - if [[ $($KUBECTL get pods -n ${AGENT_NAMESPACE} -l app=agent -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}') != "True" ]]; then + if [[ $($KUBECTL get pods -n ${AGENT_NAMESPACE} -l app=agent,type!=auto-upgrade-cronjob -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}') != "True" ]]; then AGENT_POD_READY="false" else AGENT_POD_READY="true" @@ -335,23 +335,23 @@ function deleteAgentResources() { # give pods sometime to terminate by themselves sleep 10 - log_info "Checking if pods are deleted" - PODS=$($KUBECTL get pod -n $AGENT_NAMESPACE 2>/dev/null) + log_info "Checking if agent pods are deleted" + PODS=$($KUBECTL get 
pod -l app=agent -n $AGENT_NAMESPACE 2>/dev/null) if [[ -n "$PODS" ]]; then - log_info "Pods are not deleted by deleting deployment, delete pods now" + log_info "Agent pods are not deleted by deleting deployment, delete pods now" if [ "$USE_DELETE_FORCE" != true ]; then - $KUBECTL delete --all pods --namespace=$AGENT_NAMESPACE --grace-period=$DELETE_TIMEOUT + $KUBECTL delete pods -l app=agent --namespace=$AGENT_NAMESPACE --grace-period=$DELETE_TIMEOUT - PODS=$($KUBECTL get pod -n $AGENT_NAMESPACE 2>/dev/null) + PODS=$($KUBECTL get pod -l app=agent -n $AGENT_NAMESPACE 2>/dev/null) if [[ -n "$PODS" ]]; then - log_info "Pods still exist" + log_info "Agent pods still exist" PODS_STILL_EXIST="true" fi fi if [ "$USE_DELETE_FORCE" == true ] || [ "$PODS_STILL_EXIST" == true ]; then - log_info "Force deleting all the pods under $AGENT_NAMESPACE" - $KUBECTL delete --all pods --namespace=$AGENT_NAMESPACE --force=true --grace-period=0 + log_info "Force deleting agent pods under $AGENT_NAMESPACE" + $KUBECTL delete pods -l app=agent --namespace=$AGENT_NAMESPACE --force=true --grace-period=0 pkill -f anax.service fi fi @@ -383,8 +383,15 @@ function deleteAgentResources() { log_info "Deleting serviceaccount..." $KUBECTL delete serviceaccount $SERVICE_ACCOUNT_NAME -n $AGENT_NAMESPACE - log_info "Deleting namespace..." - $KUBECTL delete namespace $AGENT_NAMESPACE --force=true --grace-period=0 + log_info "Checking deployment and statefulset under namespace $AGENT_NAMESPACE" + deployment=$($KUBECTL get deployment -n $AGENT_NAMESPACE) + statefulset=$($KUBECTL get statefulset -n $AGENT_NAMESPACE) + if [[ -z "$deployment" ]] && [[ -z "$statefulset" ]]; then + log_info "No deployment and statefulset left under namespace $AGENT_NAMESPACE, deleting it..." + $KUBECTL delete namespace $AGENT_NAMESPACE --force=true --grace-period=0 + else + log_info "Deployment or statefulset exists in the namespace $AGENT_NAMESPACE, skip deleting namespace $AGENT_NAMESPACE. 
Please delete namespace manually" + fi log_info "Deleting cert file from /etc/default/cert ..." rm /etc/default/cert/agent-install.crt @@ -402,6 +409,8 @@ function uninstall_cluster() { if [[ "$AGENT_POD_READY" == "true" ]]; then removeNodeFromLocalAndManagementHub + else + log_info "agent pod under $AGENT_NAMESPACE is not ready, skip unregister process. Please remove node from management hub later if needed" fi deleteAgentResources diff --git a/agent-install/k8s/auto-upgrade-cronjob-template.yml b/agent-install/k8s/auto-upgrade-cronjob-template.yml index 7416d0e2e..d4b67f4b7 100644 --- a/agent-install/k8s/auto-upgrade-cronjob-template.yml +++ b/agent-install/k8s/auto-upgrade-cronjob-template.yml @@ -5,6 +5,7 @@ metadata: labels: app: agent openhorizon.org/component: agent + type: auto-upgrade-cronjob spec: schedule: '*/1 * * * *' concurrencyPolicy: Forbid @@ -18,6 +19,7 @@ spec: labels: app: agent openhorizon.org/component: agent + type: auto-upgrade-cronjob spec: volumes: - name: agent-pvc-storage