From 2b827ea1398519a98dcbf988180112fc963fefb1 Mon Sep 17 00:00:00 2001
From: Bernd Verst <berndverst@users.noreply.github.com>
Date: Fri, 28 Feb 2020 18:15:53 -0800
Subject: [PATCH] Adds MNIST E2E Example for Azure. (#759)

* Adds MNIST E2E Example for Azure.

* Remove auto-generated ToC

* Remove incompatible script to retrieve Ingress URL

* Remove orphaned ToC entry
---
 mnist/README.md         |  47 +-
 mnist/mnist_azure.ipynb | 942 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 971 insertions(+), 18 deletions(-)
 create mode 100644 mnist/mnist_azure.ipynb
diff --git a/mnist/README.md b/mnist/README.md
index d37f8c1af..00ed43ccc 100644
--- a/mnist/README.md
+++ b/mnist/README.md
@@ -1,20 +1,3 @@
-<!-- START doctoc generated TOC please keep comment here to allow auto update -->
-<!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
-**Table of Contents**  *generated with [DocToc](https://github.com/thlorenz/doctoc)*
-
-- [MNIST on Kubeflow](#mnist-on-kubeflow)
-- [MNIST on Kubeflow on GCP](#mnist-on-kubeflow-on-gcp)
-- [MNIST on Kubeflow on AWS](#mnist-on-kubeflow-on-aws)
-- [MNIST on Kubeflow on IBM Cloud](#mnist-on-kubeflow-on-ibm-cloud)
-- [MNIST on Kubeflow on Vanilla k8s](#mnist-on-kubeflow-on-vanilla-k8s)
-    - [Prerequisites](#prerequisites)
-    - [Configure docker credentials](#configure-docker-credentials)
-      - [Why do we need this?](#why-do-we-need-this)
-    - [Create a config-map in the namespace you're using with the docker config](#create-a-config-map-in-the-namespace-youre-using-with-the-docker-config)
-
-<!-- END doctoc generated TOC please keep comment here to allow auto update -->
-
-
 # MNIST on Kubeflow
 
 This example guides you through the process of taking an example model, modifying it to run better within Kubeflow, and serving the resulting trained model.
@@ -23,9 +6,9 @@ Follow the version of the guide that is specific to how you have deployed Kubefl
 
 1. [MNIST on Kubeflow on GCP](#gcp)
 1. [MNIST on Kubeflow on AWS](#aws)
+1. [MNIST on Kubeflow on Azure](#azure)
 1. [MNIST on Kubeflow on IBM Cloud](#ibm)
 1. [MNIST on Kubeflow on vanilla k8s](#vanilla)
-1. [MNIST on other platforms](#other)
 
 <a id=gcp></a>
 # MNIST on Kubeflow on GCP
@@ -79,6 +62,34 @@ Follow these instructions to run the MNIST tutorial on AWS
 
 1. Follow the notebook to train and deploy MNIST on Kubeflow
 
+<a id=azure></a>
+# MNIST on Kubeflow on Azure
+
+Follow these instructions to run the MNIST tutorial on Azure
+
+1. Follow the [Azure instructions](https://www.kubeflow.org/docs/azure/deploy/install-kubeflow/) to deploy Kubeflow on Azure
+
+1. If you do not already have a notebook server, [create a new server](https://www.kubeflow.org/docs/notebooks/setup/)
+
+1. Launch a Jupyter notebook server
+
+   * The tutorial has been tested using the Jupyter Tensorflow 1.15 image
+
+1. Launch a terminal in Jupyter and clone the kubeflow examples repo
+
+   ```
+   git clone https://github.com/kubeflow/examples.git git_kubeflow-examples
+   ```
+
+   * **Tip** When you start a terminal in Jupyter, run the command `bash` to start
+      a bash terminal which is much more friendly than the default shell
+
+   * **Tip** You can change the URL from '/tree' to '/lab' to switch to using Jupyterlab
+
+1. Open the notebook `mnist/mnist_azure.ipynb`
+
+1. Follow the notebook to train and deploy MNIST on Kubeflow
+
 <a id=ibm></a>
 # MNIST on Kubeflow on IBM Cloud
 
diff --git a/mnist/mnist_azure.ipynb b/mnist/mnist_azure.ipynb
new file mode 100644
index 000000000..4ea612870
--- /dev/null
+++ b/mnist/mnist_azure.ipynb
@@ -0,0 +1,942 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# MNIST E2E on Kubeflow on Azure\n",
+    "\n",
+    "This example guides you through:\n",
+    "  \n",
+    "  1. Taking an example TensorFlow model and modifying it to support distributed training\n",
+    "  1. Serving the resulting model using TFServing\n",
+    "  1. Deploying and using a web-app that uses the model\n",
+    "  \n",
+    "## Requirements\n",
+    "\n",
+    "  * You must be running Kubeflow 1.0 on Azure\n",
+    "  \n",
+    "## Credentials\n",
+    "\n",
+    "Before you can deploy MNIST you will need to obtain credentials that allow creating a storage account and also obtain credentials to an Azure Container Registry.\n",
+    "\n",
+    "Run the following command in Bash using the Azure CLI. You may also want to use the Cloud Shell in your browser at shell.azure.com.\n",
+    "\n",
+    "```bash\n",
+    "# Creates an Azure Active Directory Service Principal\n",
+    "\n",
+    "az ad sp create-for-rbac --name kubeflow\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "# This is the Kubernetes namespace into which you installed Kubeflow\n",
+    "os.environ['TARGET_NAMESPACE'] = '<your-kubeflow-namespace-name>'\n",
+    "\n",
+    "# The credentials you obtained from the newly created Service Principal\n",
+    "os.environ['AZ_CLIENT_ID'] = '<your-service-principal-client-id>'\n",
+    "os.environ['AZ_CLIENT_SECRET'] = '<your-service-principal-client-secret>'\n",
+    "os.environ['AZ_TENANT_ID'] = '<your-service-principal-tenant-id>'\n",
+    "\n",
+    "# Your Azure Subscription ID\n",
+    "os.environ['AZ_SUBSCRIPTION_ID'] = '<your-azure-subscription-id>'\n",
+    "\n",
+    "# If you haven't already created an Azure Container Registry (ACR), follow the instructions at\n",
+    "# https://docs.microsoft.com/azure/container-registry/container-registry-get-started-azure-cli\n",
+    "os.environ['ACR_NAME'] = '<your-azure-container-registry-name>'\n",
+    "os.environ['ACR_RESOURCE_GROUP_NAME'] = '<your-azure-container-registry-resource-group-name>'\n",
+    "\n",
+    "# The existing resource group where a storage account should be created to hold all our data\n",
+    "os.environ['STORAGE_ACCOUNT_NAME'] = '<your-globally-unique-storage-account-name>'\n",
+    "os.environ['STORAGE_RESOURCE_GROUP_NAME'] = '<your-storage-resource-group-name>'\n",
+    "os.environ['STORAGE_RESOURCE_LOCATION'] = '<your-storage-account-resource-location>'\n",
+    "\n",
+    "# Stores the Service Principal as a Kubernetes Secret for reuse\n",
+    "!kubectl create secret generic -n ${TARGET_NAMESPACE} azcreds \\\n",
+    "--from-literal=AZ_CLIENT_ID=${AZ_CLIENT_ID} \\\n",
+    "--from-literal=AZ_CLIENT_SECRET=${AZ_CLIENT_SECRET} \\\n",
+    "--from-literal=AZ_TENANT_ID=${AZ_TENANT_ID} \\\n",
+    "--from-literal=AZ_SUBSCRIPTION_ID=${AZ_SUBSCRIPTION_ID}\n",
+    "\n",
+    "\n",
+    "# Stores credentials for accessing the private Azure Container Registry\n",
+    "!kubectl create secret docker-registry -n ${TARGET_NAMESPACE} acrcreds \\\n",
+    "--docker-server=${ACR_NAME}.azurecr.io \\\n",
+    "--docker-username=${AZ_CLIENT_ID} \\\n",
+    "--docker-password=${AZ_CLIENT_SECRET}\n",
+    "\n",
+    "!kubectl patch serviceaccount default-editor -n ${TARGET_NAMESPACE} \\\n",
+    "-p \"{\\\"imagePullSecrets\\\": [{\\\"name\\\": \\\"acrcreds\\\"}]}\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Run the following command in Bash using the Azure CLI or use the Cloud Shell\n",
+    "\n",
+    "```bash\n",
+    "# Gives the service principal permission to create storage accounts in the desired resource group\n",
+    "export AZ_CLIENT_ID='<your-service-principal-client-id>'\n",
+    "export AZ_SUBSCRIPTION_ID='<your-azure-subscription-id>'\n",
+    "export STORAGE_RESOURCE_GROUP_NAME='<your-storage-resource-group-name>'\n",
+    "\n",
+    "az role assignment create --assignee $AZ_CLIENT_ID --scope /subscriptions/$AZ_SUBSCRIPTION_ID/resourceGroups/$STORAGE_RESOURCE_GROUP_NAME --role \"Storage Account Contributor\"\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Prepare model\n",
+    "\n",
+    "There is a delta between existing distributed mnist examples and what's needed to run well as a TFJob.\n",
+    "\n",
+    "Basically, we must:\n",
+    "\n",
+    "1. Add options in order to make the model configurable.\n",
+    "1. Use `tf.estimator.train_and_evaluate` to enable model exporting and serving.\n",
+    "1. Define serving signatures for model serving.\n",
+    "\n",
+    "The resulting model is [model.py](model.py)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Install Required Libraries\n",
+    "\n",
+    "Import the libraries required to train this model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import logging\n",
+    "from importlib import reload\n",
+    "\n",
+    "import notebook_setup\n",
+    "reload(notebook_setup)\n",
+    "notebook_setup.notebook_setup(platform='azure')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import k8s_util\n",
+    "# Force a reload of kubeflow; since kubeflow is a multi namespace module\n",
+    "# it looks like doing this in notebook_setup may not be sufficient\n",
+    "import kubeflow\n",
+    "reload(kubeflow)\n",
+    "from kubernetes import client as k8s_client\n",
+    "from kubernetes import config as k8s_config\n",
+    "from kubeflow.tfjob.api import tf_job_client as tf_job_client_module\n",
+    "from IPython.core.display import display, HTML\n",
+    "import yaml"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Configure The Docker Registry For Kubeflow Fairing\n",
+    "\n",
+    "* In order to build docker images from your notebook we need a docker registry where the images will be stored\n",
+    "* We will be using Azure Container Registry"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "from kubernetes import client as k8s_client\n",
+    "from kubernetes.client import rest as k8s_rest\n",
+    "from kubeflow import fairing   \n",
+    "from kubeflow.fairing import utils as fairing_utils\n",
+    "from kubeflow.fairing.builders import append\n",
+    "from kubeflow.fairing.deployers import job\n",
+    "from kubeflow.fairing.preprocessors import base as base_preprocessor\n",
+    "\n",
+    "AZURE_ACR_NAME = os.environ.get('ACR_NAME')\n",
+    "\n",
+    "# Set up Azure Container Registry (ACR) for storing the output container images.\n",
+    "# You can use any Docker container registry instead of ACR.\n",
+    "# The registry name comes from the ACR_NAME environment variable set in the first cell;\n",
+    "# the registry address is built below as <registry-name>.azurecr.io.\n",
+    "DOCKER_REGISTRY = '{}.azurecr.io'.format(AZURE_ACR_NAME)\n",
+    "\n",
+    "namespace = fairing_utils.get_current_k8s_namespace()\n",
+    "\n",
+    "logging.info(f\"Running in namespace {namespace}\")\n",
+    "logging.info(f\"Using docker registry {DOCKER_REGISTRY}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Use Kubeflow fairing to build the docker image\n",
+    "\n",
+    "* You will use kubeflow fairing's kaniko builder to build a docker image that includes all your dependencies\n",
+    "  * You use kaniko because you want to be able to run `pip` to install dependencies\n",
+    "  * Kaniko gives you the flexibility to build images from Dockerfiles"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TODO(https://github.com/kubeflow/fairing/issues/426): We should get rid of this once the default \n",
+    "# Kaniko image is updated to a newer image than 0.7.0.\n",
+    "from kubeflow.fairing import constants\n",
+    "constants.constants.KANIKO_IMAGE = \"gcr.io/kaniko-project/executor:v0.14.0\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from kubeflow.fairing.builders import cluster\n",
+    "\n",
+    "# output_map is a map of extra files to add to the notebook.\n",
+    "# It is a map from source location to the location inside the context.\n",
+    "output_map =  {\n",
+    "    \"Dockerfile.model\": \"Dockerfile\",\n",
+    "    \"model.py\": \"model.py\"\n",
+    "}\n",
+    "\n",
+    "preprocessor = base_preprocessor.BasePreProcessor(\n",
+    "    command=[\"python\"], # The base class will set this.\n",
+    "    input_files=[],\n",
+    "    path_prefix=\"/app\", # irrelevant since we aren't preprocessing any files\n",
+    "    output_map=output_map)\n",
+    "\n",
+    "preprocessor.preprocess()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "STORAGE_GROUP_NAME = os.environ.get('STORAGE_RESOURCE_GROUP_NAME')\n",
+    "STORAGE_ACCOUNT_NAME = os.environ.get('STORAGE_ACCOUNT_NAME')\n",
+    "AZURE_REGION = os.environ.get('STORAGE_RESOURCE_LOCATION')\n",
+    "\n",
+    "# Use a Tensorflow image as the base image\n",
+    "# We use a custom Dockerfile \n",
+    "cluster_builder = cluster.cluster.ClusterBuilder(registry=DOCKER_REGISTRY,\n",
+    "                                                 base_image=\"\", # base_image is set in the Dockerfile\n",
+    "                                                 preprocessor=preprocessor,\n",
+    "                                                 image_name=\"mnist\",\n",
+    "                                                 dockerfile_path=\"Dockerfile\",\n",
+    "                                                 pod_spec_mutators=[fairing.cloud.azure.add_acr_config, fairing.cloud.azure.add_azure_files],\n",
+    "                                                 context_source=cluster.azurestorage_context.StorageContextSource(region=AZURE_REGION, storage_account_name=STORAGE_ACCOUNT_NAME, resource_group_name=STORAGE_GROUP_NAME))\n",
+    "cluster_builder.build()\n",
+    "logging.info(f\"Built image {cluster_builder.image_tag}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create an Azure File Share\n",
+    "\n",
+    "Create an Azure File Share bucket to store our models and other results.\n",
+    "\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Run the following in your local Bash terminal with Azure CLI or in the Azure Cloud Shell\n",
+    "\n",
+    "```bash\n",
+    "\n",
+    "export AZ_STORAGE_ACCOUNT_NAME=\"<your-storage-account-name>\"\n",
+    "export AZ_STORAGE_RESOURCE_GROUP=\"<your-storage-account-resource-group>\"\n",
+    "export AZ_SHARE_NAME=\"mnist\"\n",
+    "STORAGE_KEY=$(az storage account keys list --resource-group $AZ_STORAGE_RESOURCE_GROUP --account-name $AZ_STORAGE_ACCOUNT_NAME --query \"[0].value\" -o tsv)\n",
+    "az storage share create --name $AZ_SHARE_NAME --account-name $AZ_STORAGE_ACCOUNT_NAME --account-key $STORAGE_KEY\n",
+    "\n",
+    "echo $STORAGE_KEY\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Insert the storage key printed by the commands above. It is set through os.environ because %env would keep the surrounding quotes as part of the value.\n",
+    "os.environ['STORAGE_KEY'] = '<your-storage-account-access-key>'\n",
+    "# The account name is the STORAGE_ACCOUNT_NAME set in the first cell; AZ_STORAGE_ACCOUNT_NAME only exists in your local shell.\n",
+    "!kubectl create secret generic azure-share-secret --namespace $TARGET_NAMESPACE --from-literal=azurestorageaccountname=$STORAGE_ACCOUNT_NAME --from-literal=azurestorageaccountkey=$STORAGE_KEY"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Distributed training\n",
+    "\n",
+    "* We will train the model by using TFJob to run a distributed training job"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import uuid\n",
+    "\n",
+    "train_name = f\"mnist-train-{uuid.uuid4().hex[:4]}\"\n",
+    "num_ps = 1\n",
+    "num_workers = 2\n",
+    "model_dir = \"/mnt/azure/mnist\"\n",
+    "export_path = \"/mnt/azure/mnist/export\"\n",
+    "train_steps = 200\n",
+    "batch_size = 100\n",
+    "learning_rate = .01\n",
+    "image = cluster_builder.image_tag\n",
+    "\n",
+    "train_spec = f\"\"\"apiVersion: kubeflow.org/v1\n",
+    "kind: TFJob\n",
+    "metadata:\n",
+    "  name: {train_name}  \n",
+    "spec:\n",
+    "  tfReplicaSpecs:\n",
+    "    Ps:\n",
+    "      replicas: {num_ps}\n",
+    "      template:\n",
+    "        metadata:\n",
+    "          annotations:\n",
+    "            sidecar.istio.io/inject: \"false\"\n",
+    "        spec:\n",
+    "          serviceAccount: default-editor\n",
+    "          containers:\n",
+    "          - name: tensorflow\n",
+    "            command:\n",
+    "            - python\n",
+    "            - /opt/model.py\n",
+    "            - --tf-model-dir={model_dir}\n",
+    "            - --tf-export-dir={export_path}\n",
+    "            - --tf-train-steps={train_steps}\n",
+    "            - --tf-batch-size={batch_size}\n",
+    "            - --tf-learning-rate={learning_rate}\n",
+    "            image: {image}\n",
+    "            workingDir: /opt\n",
+    "            volumeMounts:\n",
+    "              - name: azure\n",
+    "                mountPath: /mnt/azure\n",
+    "                readOnly: false\n",
+    "          volumes:\n",
+    "            - name: azure\n",
+    "              azureFile:\n",
+    "                secretName: azure-share-secret\n",
+    "                shareName: mnist\n",
+    "                readOnly: false\n",
+    "          restartPolicy: OnFailure\n",
+    "    Chief:\n",
+    "      replicas: 1\n",
+    "      template:\n",
+    "        metadata:\n",
+    "          annotations:\n",
+    "            sidecar.istio.io/inject: \"false\"\n",
+    "        spec:\n",
+    "          serviceAccount: default-editor\n",
+    "          containers:\n",
+    "          - name: tensorflow\n",
+    "            command:\n",
+    "            - python\n",
+    "            - /opt/model.py\n",
+    "            - --tf-model-dir={model_dir}\n",
+    "            - --tf-export-dir={export_path}\n",
+    "            - --tf-train-steps={train_steps}\n",
+    "            - --tf-batch-size={batch_size}\n",
+    "            - --tf-learning-rate={learning_rate}\n",
+    "            image: {image}\n",
+    "            workingDir: /opt\n",
+    "            volumeMounts:\n",
+    "              - name: azure\n",
+    "                mountPath: /mnt/azure\n",
+    "                readOnly: false\n",
+    "          volumes:\n",
+    "            - name: azure\n",
+    "              azureFile:\n",
+    "                secretName: azure-share-secret\n",
+    "                shareName: mnist\n",
+    "                readOnly: false\n",
+    "          restartPolicy: OnFailure\n",
+    "    Worker:\n",
+    "      replicas: {num_workers}\n",
+    "      template:\n",
+    "        metadata:\n",
+    "          annotations:\n",
+    "            sidecar.istio.io/inject: \"false\"\n",
+    "        spec:\n",
+    "          serviceAccount: default-editor\n",
+    "          containers:\n",
+    "          - name: tensorflow\n",
+    "            command:\n",
+    "            - python\n",
+    "            - /opt/model.py\n",
+    "            - --tf-model-dir={model_dir}\n",
+    "            - --tf-export-dir={export_path}\n",
+    "            - --tf-train-steps={train_steps}\n",
+    "            - --tf-batch-size={batch_size}\n",
+    "            - --tf-learning-rate={learning_rate}\n",
+    "            image: {image}\n",
+    "            workingDir: /opt\n",
+    "            volumeMounts:  # belongs to the container, matching the Ps and Chief specs\n",
+    "              - name: azure\n",
+    "                mountPath: /mnt/azure\n",
+    "                readOnly: false\n",
+    "          volumes:\n",
+    "            - name: azure\n",
+    "              azureFile:\n",
+    "                secretName: azure-share-secret\n",
+    "                shareName: mnist\n",
+    "                readOnly: false\n",
+    "          restartPolicy: OnFailure\n",
+    "\"\"\"           "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Create the training job\n",
+    "\n",
+    "* You could write the spec to a YAML file and then do `kubectl apply -f {FILE}`\n",
+    "* Since you are running in jupyter you will use the TFJob client\n",
+    "* You will run the TFJob in a namespace created by a Kubeflow profile\n",
+    "  * The namespace will be the same namespace you are running the notebook in\n",
+    "  * Creating a profile ensures the namespace is provisioned with service accounts and other resources needed for Kubeflow"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tf_job_client = tf_job_client_module.TFJobClient()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tf_job_body = yaml.safe_load(train_spec)\n",
+    "tf_job = tf_job_client.create(tf_job_body, namespace=namespace)  \n",
+    "\n",
+    "logging.info(f\"Created job {namespace}.{train_name}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Check the job\n",
+    "\n",
+    "* Above you used the Python SDK for TFJob to create the job\n",
+    "* You can also use kubectl to get the status of your job\n",
+    "* The job conditions will tell you whether the job is running, succeeded or failed"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!kubectl get tfjobs -o yaml {train_name}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Get The Logs\n",
+    "\n",
+    "* There are two ways to get the logs for the training job\n",
+    "\n",
+    "  1. Using kubectl to fetch the pod logs\n",
+    "     * These logs are ephemeral; they will be unavailable when the pod is garbage collected to free up resources\n",
+    "  1. Using Fluentd\n",
+    "     * You need to install a fluentd plugin\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Deploy TensorBoard\n",
+    "\n",
+    "* You will create a Kubernetes Deployment to run TensorBoard\n",
+    "* TensorBoard will be accessible behind the Kubeflow endpoint"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tb_name = \"mnist-tensorboard\"\n",
+    "tb_deploy = f\"\"\"apiVersion: apps/v1\n",
+    "kind: Deployment\n",
+    "metadata:\n",
+    "  labels:\n",
+    "    app: mnist-tensorboard\n",
+    "  name: {tb_name}\n",
+    "  namespace: {namespace}\n",
+    "spec:\n",
+    "  selector:\n",
+    "    matchLabels:\n",
+    "      app: mnist-tensorboard\n",
+    "  template:\n",
+    "    metadata:\n",
+    "      labels:\n",
+    "        app: mnist-tensorboard\n",
+    "        version: v1\n",
+    "    spec:\n",
+    "      serviceAccount: default-editor\n",
+    "      containers:\n",
+    "      - command:\n",
+    "        - /usr/local/bin/tensorboard\n",
+    "        - --logdir={model_dir}\n",
+    "        - --port=80\n",
+    "        image: tensorflow/tensorflow:1.15.2-py3\n",
+    "        name: tensorboard\n",
+    "        ports:\n",
+    "        - containerPort: 80\n",
+    "        volumeMounts:\n",
+    "          - name: azure\n",
+    "            mountPath: /mnt/azure\n",
+    "            readOnly: false\n",
+    "      volumes:\n",
+    "        - name: azure\n",
+    "          azureFile:\n",
+    "            secretName: azure-share-secret\n",
+    "            shareName: mnist\n",
+    "            readOnly: false\n",
+    "\"\"\"\n",
+    "tb_service = f\"\"\"apiVersion: v1\n",
+    "kind: Service\n",
+    "metadata:\n",
+    "  labels:\n",
+    "    app: mnist-tensorboard\n",
+    "  name: {tb_name}\n",
+    "  namespace: {namespace}\n",
+    "spec:\n",
+    "  ports:\n",
+    "  - name: http-tb\n",
+    "    port: 80\n",
+    "    targetPort: 80\n",
+    "  selector:\n",
+    "    app: mnist-tensorboard\n",
+    "  type: ClusterIP\n",
+    "\"\"\"\n",
+    "\n",
+    "tb_virtual_service = f\"\"\"apiVersion: networking.istio.io/v1alpha3\n",
+    "kind: VirtualService\n",
+    "metadata:\n",
+    "  name: {tb_name}\n",
+    "  namespace: {namespace}\n",
+    "spec:\n",
+    "  gateways:\n",
+    "  - kubeflow/kubeflow-gateway\n",
+    "  hosts:\n",
+    "  - '*'\n",
+    "  http:\n",
+    "  - match:\n",
+    "    - uri:\n",
+    "        prefix: /mnist/{namespace}/tensorboard/\n",
+    "    rewrite:\n",
+    "      uri: /\n",
+    "    route:\n",
+    "    - destination:\n",
+    "        host: {tb_name}.{namespace}.svc.cluster.local\n",
+    "        port:\n",
+    "          number: 80\n",
+    "    timeout: 300s\n",
+    "\"\"\"\n",
+    "\n",
+    "tb_specs = [tb_deploy, tb_service, tb_virtual_service]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "k8s_util.apply_k8s_specs(tb_specs, k8s_util.K8S_CREATE_OR_REPLACE)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Access The TensorBoard UI\n",
+    "\n",
+    "Forward the cluster-internal Istio ingress gateway to port 8080 on your local machine:\n",
+    "\n",
+    "```bash\n",
+    "kubectl port-forward svc/istio-ingressgateway -n istio-system 8080:80\n",
+    "```\n",
+    "\n",
+    "To access TensorBoard, manually visit the path:\n",
+    "[/mnist/your-kubeflow-namespace/tensorboard/](/mnist/your-kubeflow-namespace/tensorboard/)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Wait For the Training Job to finish"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* You can use the TFJob client to wait for it to finish."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tf_job = tf_job_client.wait_for_condition(train_name, expected_condition=[\"Succeeded\", \"Failed\"], namespace=namespace)\n",
+    "\n",
+    "if tf_job_client.is_job_succeeded(train_name, namespace):\n",
+    "    logging.info(f\"TFJob {namespace}.{train_name} succeeded\")\n",
+    "else:\n",
+    "    raise ValueError(f\"TFJob {namespace}.{train_name} failed\")  "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Serve the model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* Deploy the model using tensorflow serving\n",
+    "* We need to create\n",
+    "  1. A Kubernetes Deployment\n",
+    "  1. A Kubernetes service\n",
+    "  1. (Optional) Create a configmap containing the prometheus monitoring config"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "deploy_name = \"mnist-model\"\n",
+    "model_base_path = export_path\n",
+    "\n",
+    "# The web ui defaults to mnist-service so if you change it you will\n",
+    "# need to change it in the UI as well to send predictions to the model\n",
+    "model_service = \"mnist-service\"\n",
+    "\n",
+    "deploy_spec = f\"\"\"apiVersion: apps/v1\n",
+    "kind: Deployment\n",
+    "metadata:\n",
+    "  labels:\n",
+    "    app: mnist\n",
+    "  name: {deploy_name}\n",
+    "  namespace: {namespace}\n",
+    "spec:\n",
+    "  selector:\n",
+    "    matchLabels:\n",
+    "      app: mnist-model\n",
+    "  template:\n",
+    "    metadata:\n",
+    "      # TODO(jlewi): Right now we disable the istio side car because otherwise ISTIO rbac will prevent the\n",
+    "      # UI from sending RPCs to the server. We should create an appropriate ISTIO rbac authorization\n",
+    "      # policy to allow traffic from the UI to the model servier.\n",
+    "      # https://istio.io/docs/concepts/security/#target-selectors\n",
+    "      annotations:        \n",
+    "        sidecar.istio.io/inject: \"false\"\n",
+    "      labels:\n",
+    "        app: mnist-model\n",
+    "        version: v1\n",
+    "    spec:\n",
+    "      serviceAccount: default-editor\n",
+    "      containers:\n",
+    "      - args:\n",
+    "        - --port=9000\n",
+    "        - --rest_api_port=8500\n",
+    "        - --model_name=mnist\n",
+    "        - --model_base_path={model_base_path}\n",
+    "        - --monitoring_config_file=/var/config/monitoring_config.txt\n",
+    "        command:\n",
+    "        - /usr/bin/tensorflow_model_server\n",
+    "        env:\n",
+    "        - name: modelBasePath\n",
+    "          value: {model_base_path}\n",
+    "        image: tensorflow/serving:1.15.0\n",
+    "        imagePullPolicy: IfNotPresent\n",
+    "        livenessProbe:\n",
+    "          initialDelaySeconds: 30\n",
+    "          periodSeconds: 30\n",
+    "          tcpSocket:\n",
+    "            port: 9000\n",
+    "        name: mnist\n",
+    "        ports:\n",
+    "        - containerPort: 9000\n",
+    "        - containerPort: 8500\n",
+    "        resources:\n",
+    "          limits:\n",
+    "            cpu: \"1\"\n",
+    "            memory: 1Gi\n",
+    "          requests:\n",
+    "            cpu: \"1\"\n",
+    "            memory: 1Gi\n",
+    "        volumeMounts:\n",
+    "        - mountPath: /var/config/\n",
+    "          name: model-config\n",
+    "        - name: azure\n",
+    "          mountPath: /mnt/azure\n",
+    "      volumes:\n",
+    "      - configMap:\n",
+    "          name: {deploy_name}\n",
+    "        name: model-config\n",
+    "      - name: azure\n",
+    "        azureFile:\n",
+    "          secretName: azure-share-secret\n",
+    "          shareName: mnist\n",
+    "          readOnly: false\n",
+    "\"\"\"\n",
+    "\n",
+    "service_spec = f\"\"\"apiVersion: v1\n",
+    "kind: Service\n",
+    "metadata:\n",
+    "  annotations:    \n",
+    "    prometheus.io/path: /monitoring/prometheus/metrics\n",
+    "    prometheus.io/port: \"8500\"\n",
+    "    prometheus.io/scrape: \"true\"\n",
+    "  labels:\n",
+    "    app: mnist-model\n",
+    "  name: {model_service}\n",
+    "  namespace: {namespace}\n",
+    "spec:\n",
+    "  ports:\n",
+    "  - name: grpc-tf-serving\n",
+    "    port: 9000\n",
+    "    targetPort: 9000\n",
+    "  - name: http-tf-serving\n",
+    "    port: 8500\n",
+    "    targetPort: 8500\n",
+    "  selector:\n",
+    "    app: mnist-model\n",
+    "  type: ClusterIP\n",
+    "\"\"\"\n",
+    "\n",
+    "monitoring_config = f\"\"\"kind: ConfigMap\n",
+    "apiVersion: v1\n",
+    "metadata:\n",
+    "  name: {deploy_name}\n",
+    "  namespace: {namespace}\n",
+    "data:\n",
+    "  monitoring_config.txt: |-\n",
+    "    prometheus_config: {{\n",
+    "      enable: true,\n",
+    "      path: \"/monitoring/prometheus/metrics\"\n",
+    "    }}\n",
+    "\"\"\"\n",
+    "\n",
+    "model_specs = [deploy_spec, service_spec, monitoring_config]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "k8s_util.apply_k8s_specs(model_specs, k8s_util.K8S_CREATE_OR_REPLACE)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Deploy the mnist UI\n",
+    "\n",
+    "* We will now deploy the UI to visualize the mnist results\n",
+    "* Note: This is using a prebuilt and public docker image for the UI"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ui_name = \"mnist-ui\"\n",
+    "ui_deploy = f\"\"\"apiVersion: apps/v1\n",
+    "kind: Deployment\n",
+    "metadata:\n",
+    "  name: {ui_name}\n",
+    "  namespace: {namespace}\n",
+    "spec:\n",
+    "  replicas: 1\n",
+    "  selector:\n",
+    "    matchLabels:\n",
+    "      app: mnist-web-ui\n",
+    "  template:\n",
+    "    metadata:\n",
+    "      labels:\n",
+    "        app: mnist-web-ui\n",
+    "    spec:\n",
+    "      containers:\n",
+    "      - image: gcr.io/kubeflow-examples/mnist/web-ui:v20190112-v0.2-142-g3b38225\n",
+    "        name: web-ui\n",
+    "        ports:\n",
+    "        - containerPort: 5000        \n",
+    "      serviceAccount: default-editor\n",
+    "\"\"\"\n",
+    "\n",
+    "ui_service = f\"\"\"apiVersion: v1\n",
+    "kind: Service\n",
+    "metadata:\n",
+    "  annotations:\n",
+    "  name: {ui_name}\n",
+    "  namespace: {namespace}\n",
+    "spec:\n",
+    "  ports:\n",
+    "  - name: http-mnist-ui\n",
+    "    port: 80\n",
+    "    targetPort: 5000\n",
+    "  selector:\n",
+    "    app: mnist-web-ui\n",
+    "  type: ClusterIP\n",
+    "\"\"\"\n",
+    "\n",
+    "ui_virtual_service = f\"\"\"apiVersion: networking.istio.io/v1alpha3\n",
+    "kind: VirtualService\n",
+    "metadata:\n",
+    "  name: {ui_name}\n",
+    "  namespace: {namespace}\n",
+    "spec:\n",
+    "  gateways:\n",
+    "  - kubeflow/kubeflow-gateway\n",
+    "  hosts:\n",
+    "  - '*'\n",
+    "  http:\n",
+    "  - match:\n",
+    "    - uri:\n",
+    "        prefix: /mnist/{namespace}/ui/\n",
+    "    rewrite:\n",
+    "      uri: /\n",
+    "    route:\n",
+    "    - destination:\n",
+    "        host: {ui_name}.{namespace}.svc.cluster.local\n",
+    "        port:\n",
+    "          number: 80\n",
+    "    timeout: 300s\n",
+    "\"\"\"\n",
+    "\n",
+    "ui_specs = [ui_deploy, ui_service, ui_virtual_service]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "k8s_util.apply_k8s_specs(ui_specs, k8s_util.K8S_CREATE_OR_REPLACE)  "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Access the  web UI\n",
+    "\n",
+    "Recall we are forwarding the cluster-internal ingress.\n",
+    "\n",
+    "```bash\n",
+    "kubectl port-forward svc/istio-ingressgateway -n istio-system 8080:80\n",
+    "```\n",
+    "\n",
+    "To access the web UI, manually visit the path:\n",
+    "[/mnist/your-kubeflow-namespace/ui/](/mnist/your-kubeflow-namespace/ui/)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
\ No newline at end of file