add in ganymede build scripts and backup scripts
delgadom committed Dec 13, 2019
2 parents 955fd6e + efd3a8d commit 841cb43
Showing 7 changed files with 484 additions and 33 deletions.
85 changes: 85 additions & 0 deletions deploy/backup/backup_from_volume.sh
@@ -0,0 +1,85 @@
#!/usr/bin/env bash

project=rhg-project-1
zone=us-west1-a
namespace=rhodium-jupyter

# create a Google Compute Engine instance in the same zone & project as the
# cluster before running this script. the commands below assume the instance
# is running ubuntu, so that's the preferred image.
instance=mikes-crazy-solo-instance-please-dont-let-this-run-past-jan2020
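# a sketch of how the instance above could be created, if it doesn't already
# exist (the image family and machine type here are assumptions, not taken
# from this repo):
# gcloud compute instances create $instance --project $project --zone $zone \
#     --machine-type n1-standard-2 \
#     --image-family ubuntu-1804-lts --image-project ubuntu-os-cloud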

gcs_backup_dir=gs://compute-rhg-backups/test-manual-backups-$(date +%F)

token_file=rhg-project-1-compute-rhg-backup-manager.json
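# a sketch of how a key file like this could be generated, assuming the
# backup-manager service account already exists (the account name below is
# an illustrative guess, not taken from this repo):
# gcloud iam service-accounts keys create $token_file \
#     --iam-account compute-rhg-backup-manager@$project.iam.gserviceaccount.com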

# copy our gcloud service account token to the instance
gcloud compute scp -q $token_file $instance:~/ > /dev/null

gcloud compute ssh --zone $zone $instance -- bash -c "echo && "\
"sudo apt-get update -qq > /dev/null; "\
"sudo apt-get --yes -qq install --upgrade apt-utils kubectl google-cloud-sdk > /dev/null 2>&1; "\
"gcloud auth activate-service-account -q --key-file ~/$token_file >/dev/null 2>&1; " > /dev/null 2>&1

in_list() {
    local search="$1"
    shift
    local list=("$@")
    for file in "${list[@]}" ; do
        [[ $file == $search ]] && return 0
    done
    return 1
}
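# example usage (names are illustrative):
#   in_list "alice" alice bob carol    # returns 0 (found)
#   in_list "dave" alice bob carol     # returns 1 (not found)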

# compile a list of cluster users
claims=$(kubectl -n $namespace get PersistentVolumes | grep claim- | awk '{print $6}')

# get a list of currently running pods
running_pods=$(for pod in $(kubectl -n $namespace get pods | grep jupyter- | awk '{print $1}'); do echo ${pod/jupyter-/}; done)

cluster_users=$(
    for claim in $claims; do
        claim_user=${claim#*/};
        cluster_user=${claim_user/claim-/};
        if ! in_list $cluster_user $running_pods; then
            echo $cluster_user;
        fi
    done
);

# cluster_users=$(for user in mattgoldklang smohan moonlimb; do echo $user; done)

# progress counter
i=0

# loop over our user list
for cluster_user in $cluster_users; do

    # get the GKE persistent volume claim and associated GCE Volume ID
    claim=$(kubectl -n $namespace get PersistentVolumes | grep "$namespace/claim-$cluster_user\ " | awk '{print $1}')
    volume=$(gcloud compute disks list --filter="zone:($zone) name:($claim)" | grep $claim | awk '{print $1}');

    # attach the volume to the instance
    gcloud compute instances attach-disk -q $instance --disk $volume --zone $zone > /dev/null

    # mount the volume and copy the data to GCS
    gcloud compute ssh --zone $zone $instance -- bash -c "echo &&\
        sudo mkdir /mnt/$cluster_user && \
        sudo mount /dev/sdb /mnt/$cluster_user && \
        gsutil -m cp -r /mnt/$cluster_user $gcs_backup_dir/$cluster_user/home/jovyan; \
        sudo umount /mnt/$cluster_user && \
        sudo rm -r /mnt/$cluster_user"

    # detach the volume from the instance
    gcloud compute instances detach-disk -q $instance --disk $volume --zone $zone > /dev/null

    echo $i
    i=$((i+1));

done
# done | tqdm --total $(echo "$cluster_users" | wc -w) > /dev/null

# remove the credentials from the temporary instance
gcloud compute ssh --zone $zone $instance -- bash -c "echo && "\
"gcloud auth revoke [email protected] >/dev/null 2>&1; "\
"rm -f ~/$token_file; " > /dev/null 2>&1
Empty file.
30 changes: 30 additions & 0 deletions deploy/backup/recover_from_gcs.sh
@@ -0,0 +1,30 @@
#!/usr/bin/env bash

project=rhg-project-1
zone=us-west1-a
namespace=compute-rhg

# not true for rhg-hub or test-hub. be warned.
cluster=$namespace

gcloud container clusters get-credentials $cluster --zone $zone --project $project

gcs_backup_dir=gs://compute-rhg-backups/test-manual-backups-2019-12-07

token_file=rhg-project-1-compute-rhg-backup-manager.json

# active_users=$(for pod in $(kubectl -n $namespace get pods | grep jupyter- | awk '{print $1}'); do user=${pod/jupyter-/}; echo $user; done);
active_users=$(for user in delgadom; do echo $user; done);

i=0;
for cluster_user in $active_users; do
    kubectl cp -n $namespace $token_file jupyter-$cluster_user:/home/jovyan/;
    kubectl exec jupyter-$cluster_user --namespace $namespace -- bash -c "\
        sudo apt-get update -qq > /dev/null; \
        sudo apt-get --yes -qq install --upgrade apt-utils kubectl google-cloud-sdk > /dev/null 2>&1; \
        gcloud auth activate-service-account -q --key-file /home/jovyan/$token_file >/dev/null 2>&1; \
        gsutil -m -q cp -r $gcs_backup_dir/$cluster_user/home/jovyan/ /home/ >/dev/null; \
        gcloud auth revoke [email protected] >/dev/null 2>&1; \
        rm -f /home/jovyan/$token_file";
    echo $((i++));
done | tqdm --total $(echo $active_users | wc -w) > /dev/null
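
# note: the progress bar above relies on the tqdm command-line tool being
# installed on the machine running this script; a sketch of installing it,
# assuming pip is available:
# pip install tqdm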
135 changes: 135 additions & 0 deletions deploy/create_scripts/compute-rhg.sh
@@ -0,0 +1,135 @@
#!/usr/bin/env bash

set -e

# Make sure you're logged in to gcloud and have the correct permissions
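# A sketch of the gcloud setup this assumes (the project and zone values are
# illustrative, copied from the backup scripts, and may differ per deployment):
# gcloud auth login
# gcloud config set project rhg-project-1
# gcloud config set compute/zone us-west1-a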
EMAIL=$(gcloud config get-value account)
PROJECTID=$(gcloud config get-value project)
ZONE=$(gcloud config get-value compute/zone)
CLUSTER_NAME=compute-rhg
DEPLOYMENT_NAME=compute-rhg
URL=testing.climate-kube.com
HELM_SPEC=jupyter-config.yml
NUM_NODES=1
MAX_WORKER_NODES=200
MIN_WORKER_NODES=0
DISK_SIZE=100
NB_MACHINE_TYPE=n1-highmem-8
WORKER_MACHINE_TYPE=n1-highmem-8
# PREEMPTIBLE_FLAG=
PREEMPTIBLE_FLAG=--preemptible

# Start cluster on Google cloud
gcloud container clusters create $CLUSTER_NAME --num-nodes=$NUM_NODES \
--machine-type=n1-standard-2 --zone=$ZONE --project=$PROJECTID \
--enable-ip-alias --no-enable-legacy-authorization

# get rid of default pool that we don't want
echo deleting default pool
gcloud container node-pools delete default-pool --cluster $CLUSTER_NAME \
--zone=$ZONE --project=$PROJECTID --quiet

# core-pool
echo creating core pool...
core_machine_type="n1-standard-2"
core_labels="hub.jupyter.org/node-purpose=core"
gcloud container node-pools create core-pool --cluster=${CLUSTER_NAME} \
--machine-type=${core_machine_type} --zone=${ZONE} --num-nodes=2 \
--node-labels ${core_labels}

# jupyter-pools
echo creating jupyter pool...
jupyter_taints="hub.jupyter.org_dedicated=user:NoSchedule"
jupyter_labels="hub.jupyter.org/node-purpose=user"
gcloud container node-pools create jupyter-pool --cluster=${CLUSTER_NAME} \
--machine-type=${NB_MACHINE_TYPE} --disk-type=pd-ssd --zone=${ZONE} \
--num-nodes=0 --enable-autoscaling --min-nodes=0 --max-nodes=10 \
--node-taints ${jupyter_taints} --node-labels ${jupyter_labels}

# dask-pool
echo creating dask pool...
dask_taints="k8s.dask.org_dedicated=worker:NoSchedule"
dask_labels="k8s.dask.org/node-purpose=worker"
gcloud container node-pools create dask-pool --cluster=${CLUSTER_NAME} \
${PREEMPTIBLE_FLAG} --machine-type=${WORKER_MACHINE_TYPE} --disk-type=pd-ssd \
--zone=${ZONE} --num-nodes=0 --enable-autoscaling --min-nodes=0 \
--max-nodes=${MAX_WORKER_NODES} --node-taints ${dask_taints} \
--node-labels ${dask_labels}

# make sure you have the credentials for this cluster loaded
echo get credentials for cluster
gcloud container clusters get-credentials $CLUSTER_NAME --zone $ZONE \
--project $PROJECTID

# this will give you admin access on the cluster
kubectl create clusterrolebinding cluster-admin-binding \
--clusterrole=cluster-admin --user=$EMAIL

# ############
# ## Only strictly necessary if helm 3 (but ok to do either way)
# create namespace
echo creating namespace...
kubectl create namespace $DEPLOYMENT_NAME
# ############


# # ############
# ## Only necessary if helm 2 (will break otherwise b/c helm 3 has no tiller)
# #Give the tiller process cluster-admin status
# kubectl create serviceaccount tiller --namespace=kube-system
# kubectl create clusterrolebinding tiller --clusterrole cluster-admin \
# --serviceaccount=kube-system:tiller
#
# #strangely this allows helm to install tiller into the kubernetes cluster
# helm init --service-account tiller
#
# # this patches the security of the deployment so that no other processes in the cluster can access the other pods
# kubectl --namespace=kube-system patch deployment tiller-deploy --type=json \
# --patch='[{"op": "add", "path": "/spec/template/spec/containers/0/command", "value": ["/tiller", "--listen=localhost:44134"]}]'
# # ############

# Make sure you are in the rhg-hub repo for this:
echo add pangeo repo to cluster...
helm repo add pangeo https://pangeo-data.github.io/helm-chart/
helm repo update

# generate a secret token for the cluster
echo generating secret token...
secret_token=$(openssl rand -hex 32)
echo "SECRET_TOKEN=$secret_token"

# secret_token=5486775e2cbb0a533aa81977a4ba9cf9697ba33de12b3f819edeed2596cba820
# secret_token=782d44af360f3d7f41b86a15555f817cd67d1f6e880dff421bf23105c931ea70

## NOTE: you will need to change 600s to 600 in both the install and upgrade commands
## if working with Helm 2
echo installing helm chart...
helm install $DEPLOYMENT_NAME pangeo/pangeo --version 19.09.27-86dd66c --namespace=$DEPLOYMENT_NAME \
--timeout 600s -f $HELM_SPEC \
--set jupyterhub.proxy.https.hosts="{${URL}}" \
--set jupyterhub.proxy.secretToken="${secret_token}" \
--set jupyterhub.auth.github.clientId="${GITHUB_CLIENT_ID}" \
--set jupyterhub.auth.github.clientSecret="${GITHUB_SECRET_TOKEN}" \
--set jupyterhub.auth.github.callbackUrl="https://${URL}/hub/oauth_callback"
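
# One way to keep an eye on the hub pods while waiting (an illustrative check,
# not part of the original deployment steps):
# kubectl -n $DEPLOYMENT_NAME get pods --watch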

echo "waiting for cluster to boot"
sleep 120

echo "retrieving external IP"
EXTERNAL_IP=$(kubectl -n ${CLUSTER_NAME} get service proxy-public -o wide | awk '{print $4}' | tail -n1)

echo "IMPORTANT"
echo "To update the cluster, run the following command. Save this somewhere as you will need the secret tokens:"
echo

echo "helm upgrade ${DEPLOYMENT_NAME} pangeo/pangeo --version 19.09.27-86dd66c --timeout 600s --namespace=${DEPLOYMENT_NAME} -f $HELM_SPEC \\"
echo " --set jupyterhub.proxy.service.loadBalancerIP=${EXTERNAL_IP} \\"
echo " --set jupyterhub.proxy.https.hosts=\"{${URL}}\" \\"
echo " --set jupyterhub.proxy.secretToken=\"${secret_token}\" \\"
echo " --set jupyterhub.auth.github.clientId=\"<GITHUB_CLIENT_ID>\" \\"
echo " --set jupyterhub.auth.github.clientSecret=\"<GITHUB_SECRET_TOKEN>\" \\"
echo " --set jupyterhub.auth.github.callbackUrl=\"https://${URL}/hub/oauth_callback\""


# Complete the installation using the cluster deployment instructions
# https://paper.dropbox.com/doc/Cluster-Deployments--AgOxfFIh7eCjBgsbFjTjjMpOAg-TQN0OpVDCIR3zW5PGJSRf