From fdc44574c32cb1cefdce7a2f51f505ab03055f9b Mon Sep 17 00:00:00 2001 From: Konrad Ponichtera Date: Fri, 23 Sep 2022 00:04:32 +0200 Subject: [PATCH] Streamlined Jupyter notebooks for Terraform and experiment deployment (installing extractor, enabling/disabling autoscaling so that experiments don't end up deployed on the default node pool, removed changing working directory from within the notebook) --- jupyter/experiment_notebook.ipynb | 80 +++++++++++++++++++++++++++---- jupyter/terraform_notebook.ipynb | 10 ++-- 2 files changed, 77 insertions(+), 13 deletions(-) diff --git a/jupyter/experiment_notebook.ipynb b/jupyter/experiment_notebook.ipynb index 44eb224f..121d124d 100644 --- a/jupyter/experiment_notebook.ipynb +++ b/jupyter/experiment_notebook.ipynb @@ -74,7 +74,7 @@ "source": [ "# These commands might take a while to complete.\n", "gcloud container clusters resize $CLUSTER_NAME --node-pool $DEFAULT_POOL \\\n", - " --num-nodes 1 --region us-central1-c --quiet" + " --num-nodes 1 --region $REGION --quiet" ] }, { @@ -132,7 +132,8 @@ "metadata": {}, "outputs": [], "source": [ - "helm install -n test extractor ../charts/extractor -f ../charts/fltk-values.yaml" + "helm upgrade --install -n test extractor ../charts/extractor -f ../charts/fltk-values.yaml \\\n", + " --set provider.projectName=$PROJECT_ID" ] }, { @@ -200,9 +201,10 @@ }, "outputs": [], "source": [ - "helm uninstall experiment-orchestrator -n test\n", - "helm install experiment-orchestrator ../charts/orchestrator --namespace test -f ../charts/fltk-values.yaml \\\n", - " --set-file orchestrator.experiment=$EXPERIMENT_FILE,orchestrator.configuration=$CLUSTER_CONFIG\n" + "helm uninstall -n test experiment-orchestrator\n", + "helm install -n test experiment-orchestrator ../charts/orchestrator -f ../charts/fltk-values.yaml \\\n", + " --set-file orchestrator.experiment=$EXPERIMENT_FILE,orchestrator.configuration=$CLUSTER_CONFIG \\\n", + " --set provider.projectName=$PROJECT_ID" ] }, { @@ -216,7 +218,7 @@ 
"outputs": [], "source": [ "# To get logs from the orchestrator\n", - "kubectl logs -n test fl-learner" + "kubectl logs -n test fl-server" ] }, { @@ -236,6 +238,68 @@ "kubectl logs -n test trainjob-eb056010-7c33-4c46-9559-b197afc7cb84-worker-0" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Copy experiment results from the extractor\n", + "\n", + "Extractor holds the experiment results in the format that can be processed by TensorBoard.\n", + "In order to download it to the local machine, execute:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "EXTRACTOR_POD_NAME=$(kubectl get pods -n test -l \"app.kubernetes.io/name=fltk.extractor\" -o jsonpath=\"{.items[0].metadata.name}\")\n", + "\n", + "kubectl cp -n test $EXTRACTOR_POD_NAME:/opt/federation-lab/logging ./logging" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Cleanup" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Removing orchestrator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "helm uninstall -n test experiment-orchestrator" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Removing extractor\n", + "\n", + "IMPORTANT: Removing extractor chart will result in deleting the already collected experiment results, stored in the NFS!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "helm uninstall extractor -n test" + ] + }, { "cell_type": "markdown", "metadata": { @@ -244,7 +308,7 @@ } }, "source": [ - "# Wrapping up\n", + "## Wrapping up\n", "\n", "To scale down the cluster nodepools, run the cell below. 
This will scale the node pools down and remove all the experiments deployed (on the cluster).\n", "\n", @@ -266,7 +330,7 @@ "kubectl delete pytorchjobs.kubeflow.org --all-namespaces --all\n", "\n", "gcloud container clusters resize $CLUSTER_NAME --node-pool $DEFAULT_POOL \\\n", - " --num-nodes 0 --region $REGION --quiet\n", + " --num-nodes 0 --region $REGION --quiet\n", "\n", "gcloud container clusters resize $CLUSTER_NAME --node-pool $EXPERIMENT_POOL \\\n", " --num-nodes 0 --region $REGION --quiet" diff --git a/jupyter/terraform_notebook.ipynb b/jupyter/terraform_notebook.ipynb index e2539085..dbaffbbe 100644 --- a/jupyter/terraform_notebook.ipynb +++ b/jupyter/terraform_notebook.ipynb @@ -203,7 +203,7 @@ "##################\n", "### CHANGE ME! ###\n", "##################\n", - "BILLING_ACCOUNT=\"015594-41687F-092941\" " + "BILLING_ACCOUNT=\"015594-41687F-092941\"" ] }, { @@ -310,7 +310,7 @@ "##################\n", "### CHANGE ME! ###\n", "##################\n", - "OWNER_MAIL=\"jargsnork@gmail.com\"\n", + "OWNER_MAIL=\"mygoogleaccount@gmail.com\"\n", "\n", "gcloud iam service-accounts add-iam-policy-binding $PRIVILEGED_ACCOUNT_ID \\\n", " --member=\"user:$OWNER_MAIL\" \\\n", @@ -480,11 +480,11 @@ "outputs": [], "source": [ "gcloud container clusters update $CLUSTER_NAME --node-pool $DEFAULT_POOL \\\n", - " --disable-autoscaling --quiet\n", + " --no-enable-autoscaling --region $REGION --quiet\n", " \n", "# The high performance node will scale up automatically whenever the workloads are deployed\n", "gcloud container clusters update $CLUSTER_NAME --node-pool $EXPERIMENT_POOL \\\n", - " --enable-autoscaling --quiet\n", + " --enable-autoscaling --min-nodes=0 --max-nodes=10 --region $REGION --quiet\n", "\n", "gcloud container clusters resize $CLUSTER_NAME --node-pool $DEFAULT_POOL \\\n", " --num-nodes 1 --region $REGION --quiet\n" @@ -636,7 +636,7 @@ "outputs": [], "source": [ "# Retrieve all CRD Pytorchjob from Kubeflow.\n", - "kubectl get pytorchjobs.kubeflow.org 
--all-namespaces --all\n", + "kubectl get pytorchjobs.kubeflow.org --all-namespaces\n", "\n", "# Alternatively, we can remove all jobs, this will remove all information and logs as well.\n", "kubectl delete pytorchjobs.kubeflow.org --all-namespaces --all"