Skip to content
This repository has been archived by the owner on Sep 30, 2020. It is now read-only.

Commit

Permalink
Implement a simple upgradeHelper plugin which will disable existing c…
Browse files Browse the repository at this point in the history
…ontrollers when a new kubernetes release is being rolled out. (#1678)

Save mutating and validating webhooks before install-kube-system runs and restore again afterwards.

Webhook feature can be toggled using 'disableWebhooks' boolean plugin config entry.
  • Loading branch information
davidmccormick authored Jul 12, 2019
1 parent f073d3d commit 6bf0e67
Show file tree
Hide file tree
Showing 4 changed files with 327 additions and 1 deletion.
16 changes: 15 additions & 1 deletion builtin/files/cluster.yaml.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -1573,7 +1573,9 @@ kubeAwsPlugins:
# See plugins/aws-iam-authenticator/plugin.yaml for more info
awsIamAuthenticator:
enabled: false
# see plugins/cluster-autoscaler/plugin.yaml for more info

# clusterAutoscaler provides kubernetes cluster-autoscaler functionality - https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler
# Replaces original built-in functionality with a plugin and upgrades to the latest version
clusterAutoscaler:
enabled: false
replicas: 2
Expand Down Expand Up @@ -1604,3 +1606,15 @@ kubeAwsPlugins:
# selectors for autodiscovery
selector:
prometheus: monitoring

# upgradeHelper - assists when rolling out new versions of kubernetes
# It actively disables old controllers and temporarily removes mutating/validating webhooks whilst
# the upgraded controller is starting up.
# NOTE: You will normally not need this plugin - so ONLY enable if you are experiencing issues testing migrating across versions.
# It will only kill controllers that are on a different release from the version currently spinning up, e.g. :-
# it will kill v1.13.2 controllers when rolling out v1.14.0
# it will NOT kill v1.14.0 controllers when rolling out v1.14.3
upgradeHelper:
enabled: false
# disableWebhooks can be used to turn off the webhook feature if required
disableWebhooks: true
44 changes: 44 additions & 0 deletions builtin/files/plugins/upgrade-helper/assets/restore-webhooks.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/bin/bash
# Restore webhooks that were exported and then deleted by upgrade-helper.sh
#
# This file is a template: the {{ ... }} expressions below are placeholders
# (presumably rendered by kube-aws before the script is installed - confirm).

# Retry count.
# NOTE(review): not referenced anywhere in the visible part of this script.
retries=5
# Container image used by the kubectl() wrapper below.
hyperkube_image="{{ .Config.HyperkubeImage.RepoWithTag }}"
# "true" when the plugin value disableWebhooks is set, otherwise "false".
disable_webhooks="{{ if .Values.disableWebhooks }}true{{else}}false{{end}}"

#######################################
# Run kubectl inside the hyperkube container using the admin kubeconfig.
# /srv/kubernetes is mounted read-only as well as /etc/kubernetes because
# this script passes files stored there to kubectl with -f; without the
# mount those paths do not exist inside the container.
# Globals:   hyperkube_image (read)
# Arguments: passed verbatim to kubectl
# Outputs:   whatever kubectl writes to stdout/stderr
# Returns:   exit status of the docker run command
#######################################
kubectl() {
  /usr/bin/docker run -i --rm \
    -v /etc/kubernetes:/etc/kubernetes:ro \
    -v /srv/kubernetes:/srv/kubernetes:ro \
    --net=host \
    "${hyperkube_image}" \
    /hyperkube kubectl --kubeconfig=/etc/kubernetes/kubeconfig/admin.yaml "$@"
}

#######################################
# Report whether an exported webhook list file contains any items.
# Arguments: $1 - path to a YAML file
# Outputs:   nothing
# Returns:   0 if the file is non-empty and does not contain "items: []",
#            1 otherwise (missing file, zero-length file, or empty list)
#######################################
list_not_empty() {
  local file=$1

  # A missing or zero-length file has nothing to restore.
  if [[ ! -s "${file}" ]]; then
    return 1
  fi

  # -q: only the exit status matters; without it the matched line is
  # printed into the caller's output.
  if grep -qs -e 'items: \[\]' -- "${file}"; then
    return 1
  fi
  return 0
}

#######################################
# kubectl apply --force one or more manifest files in a single call.
# The paths are handed to kubectl as one comma-separated -f value. They are
# resolved by kubectl itself, so they must be readable wherever the kubectl
# wrapper actually runs it.
# Arguments: $@ - manifest file paths (at least one)
# Returns:   1 if no path was given, otherwise kubectl's exit status
#######################################
applyall() {
  local joined

  if (( $# == 0 )); then
    echo "applyall: no files given" >&2
    return 1
  fi

  # Join on commas only; spaces inside a path must survive untouched.
  # IFS is changed in a subshell so the caller's IFS is never affected.
  joined=$(IFS=,; printf '%s' "$*")
  kubectl apply --force -f "${joined}"
}

#######################################
# Re-apply one saved set of webhook configurations.
# Arguments: $1 - webhook type, used only in the log message
#            $2 - path of the saved YAML file
# Outputs:   progress messages on stdout
# Returns:   applyall's status when something was restored, 0 otherwise
#######################################
restore_webhooks() {
  local type=$1
  local file=$2

  # Quoted so a path containing whitespace reaches the helpers as one word.
  if list_not_empty "${file}"; then
    echo "Restoring all ${type} webhooks from ${file}"
    applyall "${file}"
  else
    echo "no webhooks to restore in $file"
  fi
}

# MAIN: when the webhook feature is switched on, re-apply both saved sets.
# The script always finishes with status 0.
if [[ "${disable_webhooks}" != "true" ]]; then
  exit 0
fi

echo "Restoring all validating and mutating webhooks..."
for webhook_type in validating mutating; do
  restore_webhooks "${webhook_type}" "/srv/kubernetes/${webhook_type}_webhooks.yaml"
done
exit 0
214 changes: 214 additions & 0 deletions builtin/files/plugins/upgrade-helper/assets/upgrade-helper.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
#!/bin/bash
# Smooths upgrades/roll-backs where the release of kubernetes jumps a release.
# It kills old controllers so that this one takes over all api functions, so we don't get an
# extended period of old and new running side-by-side and the incompatibilities that this can bring.
# It also removes any mutating and validating webhooks in the system so that install-kube-system can run without interference.
#
# A request to disable is a configmap, named after the target host and carrying its
# kubernetes version, that lists the core services to stop:
#   apiVersion: v1
#   kind: ConfigMap
#   metadata:
#     name: kube-aws-migration-disable-ip-10-29-26-83.us-west-2.compute.internal
#     namespace: kube-system
#   data:
#     kubernetesVersion: v1.9.3
#     disable: "kube-apiserver kube-controller-manager kube-scheduler"
#
# This file is a template: the {{ ... }} expressions below are placeholders
# (presumably rendered by kube-aws before the script is installed - confirm).

# Number of attempts made by kubectl_with_retries.
retries=5
# Container image used by the kubectl() wrapper below.
hyperkube_image="{{ .Config.HyperkubeImage.RepoWithTag }}"
# Version of this controller, taken from the hyperkube image tag.
my_kubernetes_version="{{ .Config.HyperkubeImage.Tag }}"
# Fully-qualified hostname; compared with node names so this host skips itself.
myhostname=$(hostname -f)
# "true" when the plugin value disableWebhooks is set, otherwise "false".
disable_webhooks="{{ if .Values.disableWebhooks }}true{{else}}false{{end}}"

#######################################
# Run kubectl inside the hyperkube container using the admin kubeconfig.
# /srv/kubernetes is mounted read-only as well as /etc/kubernetes because
# this script passes files stored there to kubectl with -f; without the
# mount those paths do not exist inside the container.
# Globals:   hyperkube_image (read)
# Arguments: passed verbatim to kubectl
# Outputs:   whatever kubectl writes to stdout/stderr
# Returns:   exit status of the docker run command
#######################################
kubectl() {
  /usr/bin/docker run -i --rm \
    -v /etc/kubernetes:/etc/kubernetes:ro \
    -v /srv/kubernetes:/srv/kubernetes:ro \
    --net=host \
    "${hyperkube_image}" \
    /hyperkube kubectl --kubeconfig=/etc/kubernetes/kubeconfig/admin.yaml "$@"
}

#######################################
# Run kubectl, retrying on failure.
# When one of the arguments is a lone "-" (as in "apply -f -") stdin is read
# once up front and replayed to every attempt; otherwise the first attempt
# would consume it and each retry would send an empty document.
# Globals:   retries        (read) - maximum number of attempts, default 5
#            retry_interval (read) - seconds between attempts, default 10
# Arguments: passed verbatim to kubectl
# Outputs:   stdout of the first successful attempt; output of failed
#            attempts is discarded
# Returns:   0 on success, otherwise the exit status of the last attempt
#######################################
kubectl_with_retries() {
  local -r max_tries=${retries:-5}
  local -r delay=${retry_interval:-10}
  local attempt=0
  local result_text=""
  local return_code=0
  local use_stdin=0
  local stdin_data=""
  local arg

  for arg in "$@"; do
    if [[ "${arg}" == "-" ]]; then
      use_stdin=1
    fi
  done
  if (( use_stdin )); then
    stdin_data=$(cat)
  fi

  while (( attempt < max_tries )); do
    if (( use_stdin )); then
      # $? below is the status of kubectl, the last stage of the pipeline.
      result_text=$(printf '%s\n' "${stdin_data}" | kubectl "$@")
    else
      result_text=$(kubectl "$@")
    fi
    return_code=$?
    if (( return_code == 0 )); then
      printf '%s\n' "${result_text}"
      return 0
    fi
    attempt=$((attempt + 1))
    # No point sleeping once the attempts are used up.
    if (( attempt < max_tries )); then
      sleep "${delay}"
    fi
  done
  return "${return_code}"
}

#######################################
# Write a message to stderr, keeping stdout free for function results.
# printf is used instead of echo so that a message beginning with "-n" or
# "-e" is printed literally rather than being taken as an echo option.
# Arguments: $@ - message words, joined with single spaces
# Outputs:   the message followed by a newline, on stderr
#######################################
log() {
  local IFS=' '
  printf '%s\n' "$*" >&2
}

#######################################
# List the nodes labelled kubernetes.io/role=master.
# Outputs: one "<node-name>:<kubelet-version>" line per node on stdout
#######################################
get_masters() {
  local columns="NAME:metadata.name,VERSION:status.nodeInfo.kubeletVersion"

  kubectl get nodes -l kubernetes.io/role=master --no-headers -o "custom-columns=${columns}" \
    | awk '{ print $1 ":" $2 }'
}

#######################################
# Check that a version string starts with "v<major>.<minor>.<patch>".
# Anything may follow the patch number (e.g. "v1.9.3_coreos.0").
# Uses the shell's own regex matching: no awk subprocess, no dependency on
# gawk's -e option, and no variable leaked into the global scope.
# Arguments: $1 - version string
# Outputs:   nothing
# Returns:   0 if the string has the expected prefix, 1 otherwise
#######################################
valid_version() {
  local -r pattern='^v[0-9]+\.[0-9]+\.[0-9]+'
  [[ "$1" =~ ${pattern} ]]
}

#######################################
# Decide whether two versions belong to different releases.
# Only "v<major>.<minor>" is compared, so a change in the patch level or in a
# suffix is NOT a jump (v1.14.0 vs v1.14.3, v1.14.0-rc.1 vs v1.14.3).
# Stripping just the last dotted component is not enough for that: a suffix
# such as "-rc.1" or "_coreos.0" contains a dot of its own.
# Arguments: $1, $2 - version strings
# Returns:   0 if the releases differ (a jump), 1 if they are the same
#######################################
version_jumps() {
  local -r release_re='^(v[0-9]+\.[0-9]+)\.'
  # Fallback for strings that do not look like vX.Y.Z: drop the last
  # dotted component.
  local mine=${1%.*}
  local theirs=${2%.*}

  if [[ "$1" =~ ${release_re} ]]; then
    mine=${BASH_REMATCH[1]}
  fi
  if [[ "$2" =~ ${release_re} ]]; then
    theirs=${BASH_REMATCH[1]}
  fi

  [[ "${mine}" != "${theirs}" ]]
}

#######################################
# Stop a controller by writing a special kube-aws "disable service" configmap
# named kube-aws-migration-disable-<controller> into kube-system.
# The manifest is streamed straight into kubectl; the children of
# metadata: and data: are indented so that the document is a valid ConfigMap.
# Arguments: $1 - node name of the controller to disable
#            $2 - kubernetes version recorded in the request
# Outputs:   progress message via log
# Returns:   0 if the configmap was applied, 1 otherwise
#######################################
disable_controller() {
  local controller=$1
  local version=$2

  log "Creating disable service configmap kube-system/kube-aws-migration-disable-${controller}"
  if ! kubectl_with_retries -n kube-system apply -f - <<EOT
apiVersion: v1
kind: ConfigMap
metadata:
  name: kube-aws-migration-disable-${controller}
  namespace: kube-system
data:
  kubernetesVersion: ${version}
  disable: "kube-controller-manager kube-scheduler kube-apiserver"
EOT
  then
    return 1
  fi
  return 0
}

#######################################
# Look up the kube-system pod called "<name>-<host>".
# Arguments: $1 - pod name prefix
#            $2 - host name suffix
# Outputs:   kubectl's wide, header-less listing; nothing if no such pod
#######################################
find_pod() {
  local pod_name="${1}-${2}"

  kubectl -n kube-system get pod "${pod_name}" --no-headers -o wide --ignore-not-found
}

#######################################
# Report whether a node is still registered as Ready.
# The status column can carry extra comma-separated conditions, e.g.
# "Ready,SchedulingDisabled" for a cordoned node; such a node is still up and
# counts as running. "NotReady" and a missing node count as stopped.
# Arguments: $1 - node name
# Outputs:   nothing
# Returns:   0 if the node is Ready, 1 otherwise
#######################################
node_running() {
  local node=$1
  local status

  # Second column of the header-less "get node" output is the status.
  status=$(kubectl -n kube-system get node "${node}" --no-headers --ignore-not-found | awk '{print $2}')
  case "${status}" in
    Ready | Ready,*)
      return 0
      ;;
  esac
  return 1
}

#######################################
# Block until none of the given controllers is reported as running.
# Polls node_running for each controller and sleeps 5 seconds between rounds.
# Arguments: $1 - space-separated list of controller node names
# Outputs:   progress messages via log
# Returns:   0 once every controller has stopped
#######################################
wait_stopped() {
  local controllers=$1
  local cont            # loop variable kept local so it does not leak
  local still_running

  log ""
  log "WAITING FOR ALL MATCHED CONTROLLERS TO STOP:-"
  log "${controllers}"
  log ""

  while true; do
    still_running=0
    # Unquoted on purpose: the list is split on whitespace.
    for cont in ${controllers}; do
      if node_running "${cont}"; then
        still_running=1
        # One live controller is enough to require another round.
        break
      fi
    done

    if (( still_running == 0 )); then
      break
    fi
    log "Controllers still active, waiting 5 seconds..."
    sleep 5
  done
  return 0
}

#######################################
# Export every webhook configuration of one type to a file, then delete them
# from the cluster. A non-empty file from an earlier run is kept as it is.
# The export is written to a temporary file and only moved into place when
# kubectl succeeded; a failed or partial export must not leave a non-empty
# file behind, because later runs would treat it as "already saved".
# Arguments: $1 - webhook type prefix, e.g. "validating" or "mutating"
#            $2 - destination file
# Outputs:   progress messages on stdout, failures on stderr
# Returns:   0 if already saved, 1 if the export could not be stored,
#            otherwise the status of the delete (0 when the list was empty)
#######################################
save_webhooks() {
  local type=$1
  local file=$2
  local tmp

  echo "Storing and removing all ${type} webhooks to ${file}"
  if [[ -s "${file}" ]]; then
    echo "$file already saved"
    return 0
  fi

  tmp=$(mktemp "${file}.XXXXXX") || return 1
  if ! kubectl get "${type}webhookconfigurations" -o yaml --export >"${tmp}"; then
    echo "failed to export ${type} webhooks; nothing was deleted" >&2
    rm -f -- "${tmp}"
    return 1
  fi
  if ! mv -f -- "${tmp}" "${file}"; then
    rm -f -- "${tmp}"
    return 1
  fi

  # Only delete from the cluster what has been safely written to disk.
  if list_not_empty "${file}"; then
    echo "deleting $type webhooks..."
    ensuredelete "${file}"
    return
  fi
  return 0
}

#######################################
# Report whether an exported webhook list file contains any items.
# Arguments: $1 - path to a YAML file
# Outputs:   nothing
# Returns:   0 if the file is non-empty and does not contain "items: []",
#            1 otherwise (missing file, zero-length file, or empty list)
#######################################
list_not_empty() {
  local file=$1

  # A missing or zero-length file holds nothing worth deleting.
  if [[ ! -s "${file}" ]]; then
    return 1
  fi

  # -q: only the exit status matters; without it the matched line is
  # printed into the caller's output.
  if grep -qs -e 'items: \[\]' -- "${file}"; then
    return 1
  fi
  return 0
}

#######################################
# kubectl delete the objects described by one or more manifest files,
# ignoring objects that no longer exist.
# The paths are handed to kubectl as one comma-separated -f value. They are
# resolved by kubectl itself, so they must be readable wherever the kubectl
# wrapper actually runs it.
# Arguments: $@ - manifest file paths (at least one)
# Returns:   1 if no path was given, otherwise kubectl's exit status
#######################################
ensuredelete() {
  local joined

  if (( $# == 0 )); then
    echo "ensuredelete: no files given" >&2
    return 1
  fi

  # Join on commas only; spaces inside a path must survive untouched.
  # IFS is changed in a subshell so the caller's IFS is never affected.
  joined=$(IFS=,; printf '%s' "$*")
  kubectl delete --cascade=true --ignore-not-found=true -f "${joined}"
}

# MAIN
#
# 1. check our own version string, 2. wait for the apiserver, 3. optionally
# save and remove all webhooks, 4. ask every master on a different release to
# shut its control plane down, 5. wait until those masters have stopped.
#
# Functions are called directly in the conditions below. Wrapping them in
# $(...) would run whatever they print as a command.

if ! valid_version "${my_kubernetes_version}"; then
  log "My kubernetes version ${my_kubernetes_version} is invalid - aborting!"
  exit 1
fi

while ! kubectl get ns kube-system; do
  echo "waiting for apiserver to be available..."
  sleep 3
done

# Disable all mutating and validating webhooks because they can interfere with the stack migration
if [[ "${disable_webhooks}" == "true" ]]; then
  echo "Storing and removing all validating and mutating webhooks..."
  save_webhooks validating /srv/kubernetes/validating_webhooks.yaml
  save_webhooks mutating /srv/kubernetes/mutating_webhooks.yaml
fi

log ""
log "CHECKING CONTROLLER VERSIONS..."
log ""
found=""
# get_masters prints whitespace-free "name:version" entries, so plain word
# splitting of its output is intended here.
for controller in $(get_masters); do
  controller_name=${controller%%:*}
  controller_version=${controller##*:}

  # Never disable ourselves.
  if [[ "${controller_name}" == "${myhostname}" ]]; then
    continue
  fi

  if ! valid_version "${controller_version}"; then
    log "Controller ${controller_name} has an invalid version number ${controller_version}!"
    continue
  fi

  if version_jumps "${my_kubernetes_version}" "${controller_version}"; then
    log "Detected a version jump on ${controller_name}: my version is ${my_kubernetes_version} and theirs is ${controller_version}"
    log "Disabling kube-apiserver, kube-scheduler and kube-controller-manager..."
    if [[ -z "${found}" ]]; then
      found="${controller_name}"
    else
      found="${found} ${controller_name}"
    fi
    # A failed request is reported instead of being dropped silently; the
    # controller stays on the wait list exactly as before.
    if ! disable_controller "${controller_name}" "${controller_version}"; then
      log "Failed to create the disable request for ${controller_name}"
    fi
  else
    log "No version jump on ${controller_name}: my version is ${my_kubernetes_version} and theirs is ${controller_version}"
  fi
done

if [[ -n "${found}" ]]; then
  log ""
  log "WAITING FOR FOUND CONTROLLERS TO STOP..."
  log ""
  wait_stopped "${found}"
fi
exit 0
54 changes: 54 additions & 0 deletions builtin/files/plugins/upgrade-helper/plugin.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# kube-aws plugin manifest for upgrade-helper.
# Installs upgrade-helper.sh and restore-webhooks.sh under /opt/bin on controller
# nodes and defines two oneshot units that run them around install-kube-system.service.
metadata:
name: upgrade-helper
version: 0.1.0
spec:
# Default plugin values; both scripts read disableWebhooks through .Values.disableWebhooks.
cluster:
values:
disableWebhooks: true
machine:
roles:
controller:
# The first entry is a systemd drop-in that makes install-kube-system.service require,
# and wait for, kube-aws-upgrade-helper.service. The other two are the script assets.
files:
- path: /etc/systemd/system/install-kube-system.service.d/10-upgrade-helper-dependency.conf
permissions: 0644
content: |
[Unit]
Requires=kube-aws-upgrade-helper.service
After=kube-aws-upgrade-helper.service
Before=restore-webhooks.service
[Service]
ExecStartPre=/usr/bin/bash -c "until /usr/bin/systemctl is-active kube-aws-upgrade-helper.service; do echo waiting until kube-aws-upgrade-helper.service starts; sleep 10; done"
- path: /opt/bin/upgrade-helper.sh
permissions: 0755
source:
path: assets/upgrade-helper.sh
- path: /opt/bin/restore-webhooks.sh
permissions: 0755
source:
path: assets/restore-webhooks.sh
# kube-aws-upgrade-helper.service is ordered after kubelet.service and before
# install-kube-system.service; restore-webhooks.service is ordered after install-kube-system.service.
# NOTE(review): nothing visible in this file pulls restore-webhooks.service into a start
# transaction (no Wants=/Requires= points at it) - confirm it is started elsewhere.
systemd:
units:
- name: kube-aws-upgrade-helper.service
content: |
[Unit]
Requires=kubelet.service
After=kubelet.service
Before=install-kube-system.service
[Service]
Type=oneshot
StartLimitInterval=0
RemainAfterExit=true
ExecStart=/usr/bin/bash -c '/opt/bin/upgrade-helper.sh'
- name: restore-webhooks.service
content: |
[Unit]
Requires=install-kube-system.service
After=install-kube-system.service
[Service]
Type=oneshot
StartLimitInterval=0
RemainAfterExit=true
ExecStart=/usr/bin/bash -c '/opt/bin/restore-webhooks.sh'

0 comments on commit 6bf0e67

Please sign in to comment.