From c168891c8409f37ad42a452a15f8a96aeee4c104 Mon Sep 17 00:00:00 2001 From: Paulo Gomes Date: Thu, 20 Aug 2020 07:29:16 +0100 Subject: [PATCH] Support unprivileged container --- cmd/kured/main.go | 41 ++++++++++- kured-ds-unprivileged.yaml | 140 +++++++++++++++++++++++++++++++++++++ 2 files changed, 178 insertions(+), 3 deletions(-) create mode 100644 kured-ds-unprivileged.yaml diff --git a/cmd/kured/main.go b/cmd/kured/main.go index a35d67475..741b4f9f4 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -7,6 +7,8 @@ import ( "os" "os/exec" "regexp" + "strings" + "syscall" "time" log "github.com/sirupsen/logrus" @@ -127,6 +129,16 @@ func newCommand(name string, arg ...string) *exec.Cmd { func sentinelExists() bool { // Relies on hostPID:true and privileged:true to enter host mount space sentinelCmd := newCommand("/usr/bin/nsenter", "-m/proc/1/ns/mnt", "--", "/usr/bin/test", "-f", rebootSentinel) + + if strings.EqualFold(os.Getenv("UNPRIVILEGED"), "true") { + // Relies on the sentinel existing inside the same container mount namespace. + // Note that by default the host sentinel tends to be placed in a sensitive folder (/var/run/), + // mounting that as a volume is not recommended - but probably still safer than running the container as privileged. + + // Other approaches could involve an external process to check the existance of the sentinel and safely place a file + // in a location that is safe to be mounted into this container. + sentinelCmd = newCommand("/usr/bin/test", "-f", rebootSentinel) + } if err := sentinelCmd.Run(); err != nil { switch err := err.(type) { case *exec.ExitError: @@ -265,9 +277,32 @@ func commandReboot(nodeID string) { } } - // Relies on hostPID:true and privileged:true to enter host mount space - rebootCmd := newCommand("/usr/bin/nsenter", "-m/proc/1/ns/mnt", "/bin/systemctl", "reboot") - if err := rebootCmd.Run(); err != nil { + var err error + if strings.EqualFold(os.Getenv("UNPRIVILEGED"), "true") { + ch := make(chan bool, 1) + defer close(ch) + + go func() { + // Commits filesystem caches to disk in order to avoid data loss. + syscall.Sync() + }() + + select { + case <-ch: + log.Info("sync successful") + case <-time.After(30 * time.Second): + log.Error("sync timed out") + } + + // Relies on hostPID:true and SYS_BOOT to restart machine + err = syscall.Reboot(syscall.LINUX_REBOOT_CMD_RESTART) + } else { + // Relies on hostPID:true and privileged:true to enter host mount space + rebootCmd := newCommand("/usr/bin/nsenter", "-m/proc/1/ns/mnt", "/bin/systemctl", "reboot") + err = rebootCmd.Run() + } + + if err != nil { log.Fatalf("Error invoking reboot command: %v", err) } } diff --git a/kured-ds-unprivileged.yaml b/kured-ds-unprivileged.yaml new file mode 100644 index 000000000..7362d688b --- /dev/null +++ b/kured-ds-unprivileged.yaml @@ -0,0 +1,140 @@ +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kured + namespace: kube-system +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: kured # Must match `--ds-name` + namespace: kube-system # Must match `--ds-namespace` +spec: + selector: + matchLabels: + name: kured + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + name: kured + spec: + serviceAccountName: kured + tolerations: + - key: node-role.kubernetes.io/master + effect: NoSchedule + hostPID: true # Required to execute reboot syscall + restartPolicy: Always + volumes: + - name: host-opt-kured + hostPath: + path: /opt/kured + type: DirectoryOrCreate + containers: + - name: kured + image: docker.io/weaveworks/kured + # If you find yourself here wondering why there is no + # :latest tag on Docker Hub,see the FAQ in the README + imagePullPolicy: IfNotPresent + volumeMounts: + - name: host-opt-kured + mountPath: /opt/kured + readOnly: true + securityContext: + allowPrivilegeEscalation: false + capabilities: + add: ["SYS_BOOT"] + drop: ["ALL"] + env: + - name: UNPRIVILEGED + value: "true" + # Pass in the name of the node on which this pod is scheduled + # for use with drain/uncordon operations and lock acquisition + - name: KURED_NODE_ID + valueFrom: + fieldRef: + fieldPath: spec.nodeName + command: + - /usr/bin/kured + - --reboot-sentinel=/opt/kured/reboot-required +# - --alert-filter-regexp=^RebootRequired$ +# - --blocking-pod-selector=runtime=long,cost=expensive +# - --blocking-pod-selector=name=temperamental +# - --blocking-pod-selector=... +# - --ds-name=kured +# - --ds-namespace=kube-system +# - --end-time=23:59:59 +# - --lock-annotation=weave.works/kured-node-lock +# - --period=1h +# - --prometheus-url=http://prometheus.monitoring.svc.cluster.local +# - --reboot-days=sun,mon,tue,wed,thu,fri,sat +# - --slack-hook-url=https://hooks.slack.com/... +# - --slack-username=prod +# - --slack-channel=alerting +# - --start-time=0:00 +# - --time-zone=UTC +--- +# kured-sentinel-sync places the sentinel into /opt/kured +# which is a safer place for kured to mount. This can be +# safely replaced by a similar process happening at the host. +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: kured-sentinel-sync # Must match `--ds-name` + namespace: kube-system # Must match `--ds-namespace` +spec: + selector: + matchLabels: + name: kured-sentinel-sync + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + name: kured-sentinel-sync + spec: + automountServiceAccountToken: false + tolerations: + - key: node-role.kubernetes.io/master + effect: NoSchedule + restartPolicy: Always + volumes: + - name: host-opt-kured + hostPath: + path: /opt/kured + type: DirectoryOrCreate + - name: host-var-run + hostPath: + path: /var/run + type: Directory + containers: + - name: sentinel-sync + image: bash:5.0 + command: [bash, -c] + args: + - |+ + while true + do + if [ -f $REBOOT_SENTINEL ]; then + /bin/touch /opt/kured/reboot-required + else + /bin/rm /opt/kured/reboot-required + fi + /bin/sleep 60 + done + env: + - name: REBOOT_SENTINEL + value: /var/run/reboot-required + volumeMounts: + - name: host-opt-kured + mountPath: /opt/kured + - name: host-var-run + mountPath: /var/run + readOnly: true + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: ["ALL"]