feat: add volsync
prehor committed May 12, 2024
1 parent 26e7270 commit b3cdf7e
Showing 34 changed files with 698 additions and 13 deletions.
1 change: 0 additions & 1 deletion .github/renovate/clusters.json5
@@ -5,7 +5,6 @@
"description": ["Separate PRs for main cluster"],
"matchFileNames": [
"**/kubernetes/main/**",
"**/kubernetes/storage/apps/storage/minio/secrets/**",
"**/ansible/main/**",
"**/terraform/main/**"
],
230 changes: 230 additions & 0 deletions .taskfiles/Volsync/Taskfile.yaml
@@ -0,0 +1,230 @@
---
# yaml-language-server: $schema=https://taskfile.dev/schema.json
version: "3"

# This taskfile manages certain VolSync tasks for a given application; its limitations are described below.
# 1. The Flux Kustomization, HelmRelease, PVC and ReplicationSource all share the same name (e.g. plex)
# 2. The ReplicationSource and ReplicationDestination use a Restic repository
# 3. Applications are deployed as either a Kubernetes Deployment or StatefulSet
# 4. Each application has only one PVC that is being replicated
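#
# Example invocations, a sketch assuming an application named "plex" in the "default" namespace:
#   task volsync:list app=plex ns=default
#   task volsync:snapshot app=plex ns=default
#   task volsync:restore app=plex ns=default repository=minio previous=2
#   task volsync:unlock app=plex ns=default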

x-env: &env
app: "{{.app}}"
claim: "{{.claim}}"
controller: "{{.controller}}"
job: "{{.job}}"
ns: "{{.ns}}"
pgid: "{{.pgid}}"
previous: "{{.previous}}"
puid: "{{.puid}}"
repository: "{{.repository}}"
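# The variables above are exported to every task via "env: *env" so that envsubst can
# substitute them into the manifest templates under {{.VOLSYNC_TEMPLATES_DIR}}.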

vars:
VOLSYNC_SCRIPTS_DIR: "{{.ROOT_DIR}}/.taskfiles/Volsync/scripts"
VOLSYNC_TEMPLATES_DIR: "{{.ROOT_DIR}}/.taskfiles/Volsync/templates"

tasks:

state-*:
desc: Suspend or resume VolSync
summary: |
Args:
cluster: Cluster to run command against (default: main)
state: resume or suspend (required)
cmds:
- flux --context {{.cluster}} {{.state}} kustomization volsync
- flux --context {{.cluster}} -n {{.ns}} {{.state}} helmrelease volsync
- kubectl --context {{.cluster}} -n {{.ns}} scale deployment volsync --replicas {{if eq "suspend" .state}}0{{else}}1{{end}}
env: *env
vars:
cluster: '{{.cluster | default "main"}}'
ns: '{{.ns | default "storage"}}'
state: '{{index .MATCH 0}}'
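# A sketch of the wildcard usage: "task volsync:state-suspend" or "task volsync:state-resume";
# the suffix is captured by {{index .MATCH 0}} and becomes the state variable above.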

list:
desc: List snapshots for an application
summary: |
Args:
cluster: Cluster to run command against (default: main)
ns: Namespace the PVC is in (default: default)
app: Application to list snapshots for (required)
cmds:
- envsubst < <(cat {{.VOLSYNC_TEMPLATES_DIR}}/list.tmpl.yaml) | kubectl --context {{.cluster}} apply -f -
- bash {{.VOLSYNC_SCRIPTS_DIR}}/wait-for-job.sh {{.job}} {{.ns}} {{.cluster}}
- kubectl --context {{.cluster}} -n {{.ns}} wait job/{{.job}} --for condition=complete --timeout=1m
- kubectl --context {{.cluster}} -n {{.ns}} logs job/{{.job}} --container minio
- kubectl --context {{.cluster}} -n {{.ns}} logs job/{{.job}} --container backblaze
- kubectl --context {{.cluster}} -n {{.ns}} delete job {{.job}}
env: *env
requires:
vars: ["app"]
vars:
cluster: '{{.cluster | default "main"}}'
ns: '{{.ns | default "default"}}'
job: volsync-list-{{.app}}
preconditions:
- test -f {{.VOLSYNC_SCRIPTS_DIR}}/wait-for-job.sh
- test -f {{.VOLSYNC_TEMPLATES_DIR}}/list.tmpl.yaml
silent: true

unlock:
desc: Unlock the Restic repositories for an application
summary: |
Args:
cluster: Cluster to run command against (default: main)
ns: Namespace the PVC is in (default: default)
app: Application to unlock (required)
cmds:
- envsubst < <(cat {{.VOLSYNC_TEMPLATES_DIR}}/unlock.tmpl.yaml) | kubectl --context {{.cluster}} apply -f -
- bash {{.VOLSYNC_SCRIPTS_DIR}}/wait-for-job.sh {{.job}} {{.ns}} {{.cluster}}
- kubectl --context {{.cluster}} -n {{.ns}} wait job/{{.job}} --for condition=complete --timeout=1m
- kubectl --context {{.cluster}} -n {{.ns}} logs job/{{.job}} --container minio
- kubectl --context {{.cluster}} -n {{.ns}} logs job/{{.job}} --container backblaze
- kubectl --context {{.cluster}} -n {{.ns}} delete job {{.job}}
env: *env
requires:
vars: ["app"]
vars:
cluster: '{{.cluster | default "main"}}'
ns: '{{.ns | default "default"}}'
job: volsync-unlock-{{.app}}
preconditions:
- test -f {{.VOLSYNC_SCRIPTS_DIR}}/wait-for-job.sh
- test -f {{.VOLSYNC_TEMPLATES_DIR}}/unlock.tmpl.yaml
silent: true

# To run backup jobs in parallel for all replicationsources:
# - kubectl get replicationsources --all-namespaces --no-headers | awk '{print $2, $1}' | xargs --max-procs=4 -l bash -c 'task volsync:snapshot app=$0 ns=$1'
snapshot:
desc: Snapshot a PVC for an application
summary: |
Args:
cluster: Cluster to run command against (default: main)
ns: Namespace the PVC is in (default: default)
app: Application to snapshot (required)
cmds:
- kubectl --context {{.cluster}} -n {{.ns}} patch replicationsources {{.app}} --type merge -p '{"spec":{"trigger":{"manual":"{{.now}}"}}}'
- bash {{.VOLSYNC_SCRIPTS_DIR}}/wait-for-job.sh {{.job}} {{.ns}} {{.cluster}}
- kubectl --context {{.cluster}} -n {{.ns}} wait job/{{.job}} --for condition=complete --timeout=120m
env: *env
requires:
vars: ["app"]
vars:
cluster: '{{.cluster | default "main"}}'
now: '{{now | date "150405"}}'
ns: '{{.ns | default "default"}}'
job: volsync-snapshot-{{.app}}
controller:
sh: {{.VOLSYNC_SCRIPTS_DIR}}/which-controller.sh {{.app}} {{.ns}} {{.cluster}}
preconditions:
- test -f {{.VOLSYNC_SCRIPTS_DIR}}/which-controller.sh
- test -f {{.VOLSYNC_SCRIPTS_DIR}}/wait-for-job.sh
- kubectl --context {{.cluster}} -n {{.ns}} get replicationsources {{.app}}

# To run restore jobs in parallel for every application that has a ReplicationSource:
# - kubectl get replicationsources --all-namespaces --no-headers | awk '{print $2, $1}' | xargs --max-procs=4 -l bash -c 'task volsync:restore app=$0 ns=$1'
restore:
desc: Restore a PVC for an application
summary: |
Args:
cluster: Cluster to run command against (default: main)
ns: Namespace the PVC is in (default: default)
app: Application to restore (required)
repository: Restic repository to restore the application from (default: minio)
previous: Number of snapshots prior to the most recent to restore from (default: 2)
cmds:
- { task: .suspend, vars: *env }
- { task: .wipe, vars: *env }
- { task: .restore, vars: *env }
- { task: .resume, vars: *env }
env: *env
requires:
vars: ["app"]
vars:
cluster: '{{.cluster | default "main"}}'
ns: '{{.ns | default "default"}}'
repository: '{{.repository | default "minio"}}'
previous: '{{.previous | default 2}}'
controller:
sh: "{{.VOLSYNC_SCRIPTS_DIR}}/which-controller.sh {{.app}} {{.ns}}"
claim:
sh: kubectl --context {{.cluster}} -n {{.ns}} get replicationsources/{{.app}}-{{.repository}} -o jsonpath="{.spec.sourcePVC}"
puid:
sh: kubectl --context {{.cluster}} -n {{.ns}} get replicationsources/{{.app}}-{{.repository}} -o jsonpath="{.spec.restic.moverSecurityContext.runAsUser}"
pgid:
sh: kubectl --context {{.cluster}} -n {{.ns}} get replicationsources/{{.app}}-{{.repository}} -o jsonpath="{.spec.restic.moverSecurityContext.runAsGroup}"
preconditions:
- test -f {{.VOLSYNC_SCRIPTS_DIR}}/which-controller.sh
- test -f {{.VOLSYNC_SCRIPTS_DIR}}/wait-for-job.sh
- test -f {{.VOLSYNC_TEMPLATES_DIR}}/replicationdestination.tmpl.yaml
- test -f {{.VOLSYNC_TEMPLATES_DIR}}/wipe.tmpl.yaml

cleanup:
desc: Delete volume populator PVCs in all namespaces
summary: |
Args:
cluster: Cluster to run command against (default: main)
cmds:
- for: { var: dest }
cmd: |
{{- $items := (split "/" .ITEM) }}
kubectl --context {{.cluster}} delete pvc -n {{ $items._0 }} {{ $items._1 }}
- for: { var: cache }
cmd: |
{{- $items := (split "/" .ITEM) }}
kubectl --context {{.cluster}} delete pvc -n {{ $items._0 }} {{ $items._1 }}
- for: { var: snaps }
cmd: |
{{- $items := (split "/" .ITEM) }}
kubectl --context {{.cluster}} delete volumesnapshot -n {{ $items._0 }} {{ $items._1 }}
env: *env
vars:
cluster: '{{.cluster | default "main"}}'
dest:
sh: kubectl --context {{.cluster}} get pvc --all-namespaces --no-headers | grep "dst-dest" | awk '{print $1 "/" $2}'
cache:
sh: kubectl --context {{.cluster}} get pvc --all-namespaces --no-headers | grep "dst-cache" | awk '{print $1 "/" $2}'
snaps:
sh: kubectl --context {{.cluster}} get volumesnapshot --all-namespaces --no-headers | grep "dst-dest" | awk '{print $1 "/" $2}'
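# Each pipeline above yields "namespace/name" pairs, e.g. (hypothetical) "default/volsync-plex-dst-dest";
# the for loops split each item on "/" to recover the namespace ($items._0) and resource name ($items._1).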

# Suspend the Flux ks and hr
.suspend:
internal: true
cmds:
- flux --context {{.cluster}} -n flux-system suspend kustomization {{.app}}
- flux --context {{.cluster}} -n {{.ns}} suspend helmrelease {{.app}}
- kubectl --context {{.cluster}} -n {{.ns}} scale {{.controller}} --replicas 0
- kubectl --context {{.cluster}} -n {{.ns}} wait pod --for delete --selector="app.kubernetes.io/name={{.app}}" --timeout=2m
env: *env

# Wipe the PVC of all data
.wipe:
internal: true
cmds:
- envsubst < <(cat {{.VOLSYNC_TEMPLATES_DIR}}/wipe.tmpl.yaml) | kubectl --context {{.cluster}} apply -f -
- bash {{.VOLSYNC_SCRIPTS_DIR}}/wait-for-job.sh {{.job}} {{.ns}} {{.cluster}}
- kubectl --context {{.cluster}} -n {{.ns}} wait job/{{.job}} --for condition=complete --timeout=120m
- kubectl --context {{.cluster}} -n {{.ns}} logs job/{{.job}} --container main
- kubectl --context {{.cluster}} -n {{.ns}} delete job {{.job}}
env: *env
vars:
job: volsync-wipe-{{.app}}

# Create VolSync replicationdestination CR to restore data
.restore:
internal: true
cmds:
- envsubst < <(cat {{.VOLSYNC_TEMPLATES_DIR}}/replicationdestination.tmpl.yaml) | kubectl --context {{.cluster}} apply -f -
- bash {{.VOLSYNC_SCRIPTS_DIR}}/wait-for-job.sh {{.job}} {{.ns}} {{.cluster}}
- kubectl --context {{.cluster}} -n {{.ns}} wait job/{{.job}} --for condition=complete --timeout=120m
- kubectl --context {{.cluster}} -n {{.ns}} delete replicationdestination {{.job}}
env: *env
vars:
job: volsync-restore-{{.app}}

# Resume Flux ks and hr
.resume:
internal: true
cmds:
- flux --context {{.cluster}} -n {{.ns}} resume helmrelease {{.app}}
- flux --context {{.cluster}} -n flux-system resume kustomization {{.app}}
env: *env
14 changes: 14 additions & 0 deletions .taskfiles/Volsync/scripts/wait-for-job.sh
@@ -0,0 +1,14 @@
#!/usr/bin/env bash

JOB=$1
NAMESPACE="${2:-default}"
CLUSTER="${3:-main}"

[[ -z "${JOB}" ]] && echo "Job name not specified" && exit 1
# Poll until the job's pod exists and reports a phase; checking only for "Pending"
# could loop forever if the pod is already Running or has completed.
while true; do
  STATUS="$(kubectl --context "${CLUSTER}" -n "${NAMESPACE}" get pod -l job-name="${JOB}" -o jsonpath='{.items[*].status.phase}')"
  if [[ -n "${STATUS}" ]]; then
    break
  fi
  sleep 1
done
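A minimal usage sketch, mirroring how the tasks above call this helper (the job name "volsync-list-plex" is a hypothetical example of the volsync-list-{{.app}} pattern):

  bash .taskfiles/Volsync/scripts/wait-for-job.sh volsync-list-plex default main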
22 changes: 22 additions & 0 deletions .taskfiles/Volsync/scripts/which-controller.sh
@@ -0,0 +1,22 @@
#!/usr/bin/env bash

APP=$1
NAMESPACE="${2:-default}"
CLUSTER="${3:-main}"

is_deployment() {
kubectl --context "${CLUSTER}" -n "${NAMESPACE}" get deployment "${APP}" >/dev/null 2>&1
}

is_statefulset() {
kubectl --context "${CLUSTER}" -n "${NAMESPACE}" get statefulset "${APP}" >/dev/null 2>&1
}

if is_deployment; then
echo "deployment.apps/${APP}"
elif is_statefulset; then
echo "statefulset.apps/${APP}"
else
echo "No deployment or statefulset found for ${APP}"
exit 1
fi
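A minimal usage sketch, matching how the restore flow consumes this helper's output (the application name "plex" is hypothetical):

  CONTROLLER="$(bash .taskfiles/Volsync/scripts/which-controller.sh plex default main)"
  kubectl --context main -n default scale "${CONTROLLER}" --replicas 0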
27 changes: 27 additions & 0 deletions .taskfiles/Volsync/templates/list.tmpl.yaml
@@ -0,0 +1,27 @@
---
apiVersion: batch/v1
kind: Job
metadata:
name: ${job}
namespace: ${ns}
spec:
ttlSecondsAfterFinished: 3600
template:
spec:
automountServiceAccountToken: false
restartPolicy: OnFailure
containers:
- name: minio
image: docker.io/restic/restic:0.16.4
args: ["snapshots"]
envFrom:
- secretRef:
name: ${app}-volsync-minio
resources: {}
- name: backblaze
image: docker.io/restic/restic:0.16.4
args: ["snapshots"]
envFrom:
- secretRef:
name: ${app}-volsync-backblaze
resources: {}
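A sketch of how the list task renders this template (the variables come from the env anchor in the Taskfile; "plex" is a hypothetical application name):

  app=plex ns=default job=volsync-list-plex envsubst < .taskfiles/Volsync/templates/list.tmpl.yaml | kubectl --context main apply -f -

The same envsubst pattern is used for the unlock, wipe and replicationdestination templates below.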
31 changes: 31 additions & 0 deletions .taskfiles/Volsync/templates/replicationdestination.tmpl.yaml
@@ -0,0 +1,31 @@
---
apiVersion: volsync.backube/v1alpha1
kind: ReplicationDestination
metadata:
name: ${job}
namespace: ${ns}
spec:
trigger:
manual: restore-once
restic:
repository: ${app}-volsync-${repository}
destinationPVC: ${claim}
copyMethod: Direct
storageClassName: ceph-block
# storageClassName: ceph-filesystem
# accessModes: ["ReadWriteMany"]
# IMPORTANT NOTE:
# Set to how many snapshots prior to the most recent to restore from
previous: ${previous}
# OR;
# IMPORTANT NOTE:
# On bootstrap set `restoreAsOf` to the time the old cluster was destroyed.
# This will essentially prevent VolSync from trying to restore a backup
# from an application that started with default data in the PVC.
# Do not restore snapshots made after the following RFC3339 Timestamp.
# date --rfc-3339=seconds (--utc)
# restoreAsOf: "2022-12-10T16:00:00-05:00"
moverSecurityContext:
runAsUser: ${puid}
runAsGroup: ${pgid}
fsGroup: ${pgid}
27 changes: 27 additions & 0 deletions .taskfiles/Volsync/templates/unlock.tmpl.yaml
@@ -0,0 +1,27 @@
---
apiVersion: batch/v1
kind: Job
metadata:
name: ${job}
namespace: ${ns}
spec:
ttlSecondsAfterFinished: 3600
template:
spec:
automountServiceAccountToken: false
restartPolicy: OnFailure
containers:
- name: minio
image: docker.io/restic/restic:0.16.4
args: ["unlock", "--remove-all"]
envFrom:
- secretRef:
name: ${app}-volsync-minio
resources: {}
- name: backblaze
image: docker.io/restic/restic:0.16.4
args: ["unlock", "--remove-all"]
envFrom:
- secretRef:
name: ${app}-volsync-backblaze
resources: {}
26 changes: 26 additions & 0 deletions .taskfiles/Volsync/templates/wipe.tmpl.yaml
@@ -0,0 +1,26 @@
---
apiVersion: batch/v1
kind: Job
metadata:
name: ${job}
namespace: ${ns}
spec:
ttlSecondsAfterFinished: 3600
template:
spec:
automountServiceAccountToken: false
restartPolicy: OnFailure
containers:
- name: main
image: docker.io/library/alpine:latest
command: ["/bin/sh", "-c", "cd /config; find . -delete"]
volumeMounts:
- name: config
mountPath: /config
securityContext:
privileged: true
resources: {}
volumes:
- name: config
persistentVolumeClaim:
claimName: ${claim}
1 change: 1 addition & 0 deletions Taskfile.yaml
@@ -33,6 +33,7 @@ includes:
taskfile: .taskfiles/Repository/Taskfile.yaml
talos: .taskfiles/Talos/Taskfile.yaml
sops: .taskfiles/Sops/Taskfile.yaml
volsync: .taskfiles/Volsync/Taskfile.yaml
workstation: .taskfiles/Workstation/Taskfile.yaml
user:
taskfile: .taskfiles/User
