From 2b827ea1398519a98dcbf988180112fc963fefb1 Mon Sep 17 00:00:00 2001 From: Bernd Verst Date: Fri, 28 Feb 2020 18:15:53 -0800 Subject: [PATCH] Adds MNIST E2E Example for Azure. (#759) * Adds MNIST E2E Example for Azure. * Remove auto-generated ToC * Remove incompatible script to retrieve Ingress URL * Remove orphaned ToC entry --- mnist/README.md | 47 +- mnist/mnist_azure.ipynb | 942 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 971 insertions(+), 18 deletions(-) create mode 100644 mnist/mnist_azure.ipynb diff --git a/mnist/README.md b/mnist/README.md index d37f8c1af..00ed43ccc 100644 --- a/mnist/README.md +++ b/mnist/README.md @@ -1,20 +1,3 @@ - - -**Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)* - -- [MNIST on Kubeflow](#mnist-on-kubeflow) -- [MNIST on Kubeflow on GCP](#mnist-on-kubeflow-on-gcp) -- [MNIST on Kubeflow on AWS](#mnist-on-kubeflow-on-aws) -- [MNIST on Kubeflow on IBM Cloud](#mnist-on-kubeflow-on-ibm-cloud) -- [MNIST on Kubeflow on Vanilla k8s](#mnist-on-kubeflow-on-vanilla-k8s) - - [Prerequisites](#prerequisites) - - [Configure docker credentials](#configure-docker-credentials) - - [Why do we need this?](#why-do-we-need-this) - - [Create a config-map in the namespace you're using with the docker config](#create-a-config-map-in-the-namespace-youre-using-with-the-docker-config) - - - - # MNIST on Kubeflow This example guides you through the process of taking an example model, modifying it to run better within Kubeflow, and serving the resulting trained model. @@ -23,9 +6,9 @@ Follow the version of the guide that is specific to how you have deployed Kubefl 1. [MNIST on Kubeflow on GCP](#gcp) 1. [MNIST on Kubeflow on AWS](#aws) +1. [MNIST on Kubeflow on Azure](#azure) 1. [MNIST on Kubeflow on IBM Cloud](#ibm) 1. [MNIST on Kubeflow on vanilla k8s](#vanilla) -1. [MNIST on other platforms](#other) # MNIST on Kubeflow on GCP @@ -79,6 +62,34 @@ Follow these instructions to run the MNIST tutorial on AWS 1. 
Follow the notebook to train and deploy MNIST on Kubeflow + +# MNIST on Kubeflow on Azure + +Follow these instructions to run the MNIST tutorial on Azure + +1. Follow the [Azure instructions](https://www.kubeflow.org/docs/azure/deploy/install-kubeflow/) to deploy Kubeflow on Azure + +1. If you do not already have a notebook server, [create a new server](https://www.kubeflow.org/docs/notebooks/setup/) + +1. Launch a Jupyter notebook server + + * The tutorial has been tested using the Jupyter Tensorflow 1.15 image + +1. Launch a terminal in Jupyter and clone the kubeflow examples repo + + ``` + git clone https://github.com/kubeflow/examples.git git_kubeflow-examples + ``` + + * **Tip** When you start a terminal in Jupyter, run the command `bash` to start + a bash terminal which is much more friendly than the default shell + + * **Tip** You can change the URL from '/tree' to '/lab' to switch to using Jupyterlab + +1. Open the notebook `mnist/mnist_azure.ipynb` + +1. Follow the notebook to train and deploy MNIST on Kubeflow + # MNIST on Kubeflow on IBM Cloud diff --git a/mnist/mnist_azure.ipynb b/mnist/mnist_azure.ipynb new file mode 100644 index 000000000..4ea612870 --- /dev/null +++ b/mnist/mnist_azure.ipynb @@ -0,0 +1,942 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# MNIST E2E on Kubeflow on Azure\n", + "\n", + "This example guides you through:\n", + " \n", + " 1. Taking an example TensorFlow model and modifying it to support distributed training\n", + " 1. Serving the resulting model using TFServing\n", + " 1. Deploying and using a web-app that uses the model\n", + " \n", + "## Requirements\n", + "\n", + " * You must be running Kubeflow 1.0 on Azure\n", + " \n", + "## Credentials\n", + "\n", + "Before you can deploy MNIST you will need to obtain credentials that allow creating a storage account and also obtain credentials to an Azure Container Registry.\n", + "\n", + "Run the following command in Bash using the Azure CLI. 
You may also want to use the Cloud Shell in your browser at shell.azure.com.\n", + "\n", + "```bash\n", + "# Creates an Azure Active Directory Service Principal\n", + "\n", + "az ad sp create-for-rbac --name kubeflow\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# This is the Kubernetes namespace into which you installed Kubeflow\n", + "os.environ['TARGET_NAMESPACE'] = ''\n", + "\n", + "# The credentials you obtained from the newly created Service Principal\n", + "os.environ['AZ_CLIENT_ID'] = ''\n", + "os.environ['AZ_CLIENT_SECRET'] = ''\n", + "os.environ['AZ_TENANT_ID'] = ''\n", + "\n", + "# Your Azure Subscription ID\n", + "os.environ['AZ_SUBSCRIPTION_ID'] = ''\n", + "\n", + "# If you haven't already created an Azure Container Registry (ACR), follow the instructions at\n", + "# https://docs.microsoft.com/azure/container-registry/container-registry-get-started-azure-cli\n", + "os.environ['ACR_NAME'] = ''\n", + "os.environ['ACR_RESOURCE_GROUP_NAME'] = ''\n", + "\n", + "# The existing resource group where a storage account should be created to hold all our data\n", + "os.environ['STORAGE_ACCOUNT_NAME'] = ''\n", + "os.environ['STORAGE_RESOURCE_GROUP_NAME'] = ''\n", + "os.environ['STORAGE_RESOURCE_LOCATION'] = ''\n", + "\n", + "# Stores the Service Principal as a Kubernetes Secret for reuse\n", + "!kubectl create secret generic -n ${TARGET_NAMESPACE} azcreds \\\n", + "--from-literal=AZ_CLIENT_ID=${AZ_CLIENT_ID} \\\n", + "--from-literal=AZ_CLIENT_SECRET=${AZ_CLIENT_SECRET} \\\n", + "--from-literal=AZ_TENANT_ID=${AZ_TENANT_ID} \\\n", + "--from-literal=AZ_SUBSCRIPTION_ID=${AZ_SUBSCRIPTION_ID}\n", + "\n", + "\n", + "# Stores credentials for accessing the private Azure Container Registry\n", + "!kubectl create secret docker-registry -n ${TARGET_NAMESPACE} acrcreds \\\n", + "--docker-server=${ACR_NAME}.azurecr.io \\\n", + "--docker-username=${AZ_CLIENT_ID} \\\n", + 
"--docker-password=${AZ_CLIENT_SECRET}\n", + "\n", + "!kubectl patch serviceaccount default-editor -n ${TARGET_NAMESPACE} \\\n", + "-p \"{\\\"imagePullSecrets\\\": [{\\\"name\\\": \\\"acrcreds\\\"}]}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run the following command in Bash using the Azure CLI or use the Cloud Shell\n", + "\n", + "```bash\n", + "# Gives the service principal permission to create storage accounts in the desired resource group\n", + "export AZ_CLIENT_ID=''\n", + "export AZ_SUBSCRIPTION_ID=''\n", + "export STORAGE_RESOURCE_GROUP_NAME=''\n", + "\n", + "az role assignment create --assignee $AZ_CLIENT_ID --scope /subscriptions/$AZ_SUBSCRIPTION_ID/resourceGroups/$STORAGE_RESOURCE_GROUP_NAME --role \"Storage Account Contributor\"\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare model\n", + "\n", + "There is a delta between existing distributed mnist examples and what's needed to run well as a TFJob.\n", + "\n", + "Basically, we must:\n", + "\n", + "1. Add options in order to make the model configurable.\n", + "1. Use `tf.estimator.train_and_evaluate` to enable model exporting and serving.\n", + "1. Define serving signatures for model serving.\n", + "\n", + "The resulting model is [model.py](model.py)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install Required Libraries\n", + "\n", + "Import the libraries required to train this model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "from importlib import reload\n", + "\n", + "import notebook_setup\n", + "reload(notebook_setup)\n", + "notebook_setup.notebook_setup(platform='azure')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import k8s_util\n", + "# Force a reload of kubeflow; since kubeflow is a multi namespace module\n", + "# it looks like doing this in notebook_setup may not be sufficient\n", + "import kubeflow\n", + "reload(kubeflow)\n", + "from kubernetes import client as k8s_client\n", + "from kubernetes import config as k8s_config\n", + "from kubeflow.tfjob.api import tf_job_client as tf_job_client_module\n", + "from IPython.core.display import display, HTML\n", + "import yaml" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Configure The Docker Registry For Kubeflow Fairing\n", + "\n", + "* In order to build docker images from your notebook we need a docker registry where the images will be stored\n", + "* We will be using Azure Container Registry" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "from kubernetes import client as k8s_client\n", + "from kubernetes.client import rest as k8s_rest\n", + "from kubeflow import fairing \n", + "from kubeflow.fairing import utils as fairing_utils\n", + "from kubeflow.fairing.builders import append\n", + "from kubeflow.fairing.deployers import job\n", + "from kubeflow.fairing.preprocessors import base as base_preprocessor\n", + "\n", + "AZURE_ACR_NAME = os.environ.get('ACR_NAME')\n", + "\n", + "# Setting up Azure Container Registry (ACR) for storing output containers\n", + "# You can use any docker container registry instead of ACR\n", + "# AWS_ACCOUNT_ID=fairing.cloud.azure.guess_account_id()\n", + "# AWS_ACCOUNT_ID 
= boto3.client('sts').get_caller_identity().get('Account')\n", + "DOCKER_REGISTRY = '{}.azurecr.io'.format(AZURE_ACR_NAME)\n", + "\n", + "namespace = fairing_utils.get_current_k8s_namespace()\n", + "\n", + "logging.info(f\"Running in namespace {namespace}\")\n", + "logging.info(f\"Using docker registry {DOCKER_REGISTRY}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use Kubeflow fairing to build the docker image\n", + "\n", + "* You will use kubeflow fairing's kaniko builder to build a docker image that includes all your dependencies\n", + " * You use kaniko because you want to be able to run `pip` to install dependencies\n", + " * Kaniko gives you the flexibility to build images from Dockerfiles" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO(https://github.com/kubeflow/fairing/issues/426): We should get rid of this once the default \n", + "# Kaniko image is updated to a newer image than 0.7.0.\n", + "from kubeflow.fairing import constants\n", + "constants.constants.KANIKO_IMAGE = \"gcr.io/kaniko-project/executor:v0.14.0\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from kubeflow.fairing.builders import cluster\n", + "\n", + "# output_map is a map of extra files to add to the notebook.\n", + "# It is a map from source location to the location inside the context.\n", + "output_map = {\n", + " \"Dockerfile.model\": \"Dockerfile\",\n", + " \"model.py\": \"model.py\"\n", + "}\n", + "\n", + "preprocessor = base_preprocessor.BasePreProcessor(\n", + " command=[\"python\"], # The base class will set this.\n", + " input_files=[],\n", + " path_prefix=\"/app\", # irrelevant since we aren't preprocessing any files\n", + " output_map=output_map)\n", + "\n", + "preprocessor.preprocess()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"STORAGE_GROUP_NAME = os.environ.get('STORAGE_RESOURCE_GROUP_NAME')\n", + "STORAGE_ACCOUNT_NAME = os.environ.get('STORAGE_ACCOUNT_NAME')\n", + "AZURE_REGION = os.environ.get('STORAGE_RESOURCE_LOCATION')\n", + "\n", + "# Use a Tensorflow image as the base image\n", + "# We use a custom Dockerfile \n", + "cluster_builder = cluster.cluster.ClusterBuilder(registry=DOCKER_REGISTRY,\n", + " base_image=\"\", # base_image is set in the Dockerfile\n", + " preprocessor=preprocessor,\n", + " image_name=\"mnist\",\n", + " dockerfile_path=\"Dockerfile\",\n", + " pod_spec_mutators=[fairing.cloud.azure.add_acr_config, fairing.cloud.azure.add_azure_files],\n", + " context_source=cluster.azurestorage_context.StorageContextSource(region=AZURE_REGION, storage_account_name=STORAGE_ACCOUNT_NAME, resource_group_name=STORAGE_GROUP_NAME))\n", + "cluster_builder.build()\n", + "logging.info(f\"Built image {cluster_builder.image_tag}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create an Azure File Share\n", + "\n", + "Create an Azure File Share bucket to store our models and other results.\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run the following in your local Bash terminal with Azure CLI or in the Azure Cloud Shell\n", + "\n", + "```bash\n", + "\n", + "export AZ_STORAGE_ACCOUNT_NAME=\"\"\n", + "export AZ_STORAGE_RESOURCE_GROUP=\"\"\n", + "export AZ_SHARE_NAME=\"mnist\"\n", + "STORAGE_KEY=$(az storage account keys list --resource-group $AZ_STORAGE_RESOURCE_GROUP --account-name $AZ_STORAGE_ACCOUNT_NAME --query \"[0].value\" -o tsv)\n", + "az storage share create --name $AZ_SHARE_NAME --account-name $AZ_STORAGE_ACCOUNT_NAME --account-key $STORAGE_KEY\n", + "\n", + "echo $STORAGE_KEY\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Insert the Storage Key in the command below to save the storage access credentials as a 
Kubernetes secret\n", + "%env STORAGE_KEY=''\n", + "\n", + "!kubectl create secret generic azure-share-secret --namespace $TARGET_NAMESPACE --from-literal=azurestorageaccountname=$AZ_STORAGE_ACCOUNT_NAME --from-literal=azurestorageaccountkey=$STORAGE_KEY" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Distributed training\n", + "\n", + "* We will train the model by using TFJob to run a distributed training job" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import uuid\n", + "\n", + "train_name = f\"mnist-train-{uuid.uuid4().hex[:4]}\"\n", + "num_ps = 1\n", + "num_workers = 2\n", + "model_dir = \"/mnt/azure/mnist\"\n", + "export_path = \"/mnt/azure/mnist/export\"\n", + "train_steps = 200\n", + "batch_size = 100\n", + "learning_rate = .01\n", + "image = cluster_builder.image_tag\n", + "\n", + "train_spec = f\"\"\"apiVersion: kubeflow.org/v1\n", + "kind: TFJob\n", + "metadata:\n", + " name: {train_name} \n", + "spec:\n", + " tfReplicaSpecs:\n", + " Ps:\n", + " replicas: {num_ps}\n", + " template:\n", + " metadata:\n", + " annotations:\n", + " sidecar.istio.io/inject: \"false\"\n", + " spec:\n", + " serviceAccount: default-editor\n", + " containers:\n", + " - name: tensorflow\n", + " command:\n", + " - python\n", + " - /opt/model.py\n", + " - --tf-model-dir={model_dir}\n", + " - --tf-export-dir={export_path}\n", + " - --tf-train-steps={train_steps}\n", + " - --tf-batch-size={batch_size}\n", + " - --tf-learning-rate={learning_rate}\n", + " image: {image}\n", + " workingDir: /opt\n", + " volumeMounts:\n", + " - name: azure\n", + " mountPath: /mnt/azure\n", + " readOnly: false\n", + " volumes:\n", + " - name: azure\n", + " azureFile:\n", + " secretName: azure-share-secret\n", + " shareName: mnist\n", + " readOnly: false\n", + " restartPolicy: OnFailure\n", + " Chief:\n", + " replicas: 1\n", + " template:\n", + " metadata:\n", + " annotations:\n", + " 
sidecar.istio.io/inject: \"false\"\n", + " spec:\n", + " serviceAccount: default-editor\n", + " containers:\n", + " - name: tensorflow\n", + " command:\n", + " - python\n", + " - /opt/model.py\n", + " - --tf-model-dir={model_dir}\n", + " - --tf-export-dir={export_path}\n", + " - --tf-train-steps={train_steps}\n", + " - --tf-batch-size={batch_size}\n", + " - --tf-learning-rate={learning_rate}\n", + " image: {image}\n", + " workingDir: /opt\n", + " volumeMounts:\n", + " - name: azure\n", + " mountPath: /mnt/azure\n", + " readOnly: false\n", + " volumes:\n", + " - name: azure\n", + " azureFile:\n", + " secretName: azure-share-secret\n", + " shareName: mnist\n", + " readOnly: false\n", + " restartPolicy: OnFailure\n", + " Worker:\n", + " replicas: 1\n", + " template:\n", + " metadata:\n", + " annotations:\n", + " sidecar.istio.io/inject: \"false\"\n", + " spec:\n", + " serviceAccount: default-editor\n", + " containers:\n", + " - name: tensorflow\n", + " command:\n", + " - python\n", + " - /opt/model.py\n", + " - --tf-model-dir={model_dir}\n", + " - --tf-export-dir={export_path}\n", + " - --tf-train-steps={train_steps}\n", + " - --tf-batch-size={batch_size}\n", + " - --tf-learning-rate={learning_rate}\n", + " image: {image}\n", + " workingDir: /opt\n", + " volumeMounts:\n", + " - name: azure\n", + " mountPath: /mnt/azure\n", + " readOnly: false\n", + " volumes:\n", + " - name: azure\n", + " azureFile:\n", + " secretName: azure-share-secret\n", + " shareName: mnist\n", + " readOnly: false\n", + " restartPolicy: OnFailure\n", + "\"\"\" " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create the training job\n", + "\n", + "* You could write the spec to a YAML file and then do `kubectl apply -f {FILE}`\n", + "* Since you are running in jupyter you will use the TFJob client\n", + "* You will run the TFJob in a namespace created by a Kubeflow profile\n", + " * The namespace will be the same namespace you are running the notebook in\n", + " * 
Creating a profile ensures the namespace is provisioned with service accounts and other resources needed for Kubeflow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tf_job_client = tf_job_client_module.TFJobClient()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tf_job_body = yaml.safe_load(train_spec)\n", + "tf_job = tf_job_client.create(tf_job_body, namespace=namespace) \n", + "\n", + "logging.info(f\"Created job {namespace}.{train_name}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Check the job\n", + "\n", + "* Above you used the python SDK for TFJob to create the job\n", + "* You can also use kubectl to get the status of your job\n", + "* The job conditions will tell you whether the job is running, succeeded or failed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!kubectl get tfjobs -o yaml {train_name}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get The Logs\n", + "\n", + "* There are two ways to get the logs for the training job\n", + "\n", + " 1. Using kubectl to fetch the pod logs\n", + " * These logs are ephemeral; they will be unavailable when the pod is garbage collected to free up resources\n", + " 1. 
Using Fluentd\n", + " * You need to install a fluentd plugin\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Deploy TensorBoard\n", + "\n", + "* You will create a Kubernetes Deployment to run TensorBoard\n", + "* TensorBoard will be accessible behind the Kubeflow endpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tb_name = \"mnist-tensorboard\"\n", + "tb_deploy = f\"\"\"apiVersion: apps/v1\n", + "kind: Deployment\n", + "metadata:\n", + " labels:\n", + " app: mnist-tensorboard\n", + " name: {tb_name}\n", + " namespace: {namespace}\n", + "spec:\n", + " selector:\n", + " matchLabels:\n", + " app: mnist-tensorboard\n", + " template:\n", + " metadata:\n", + " labels:\n", + " app: mnist-tensorboard\n", + " version: v1\n", + " spec:\n", + " serviceAccount: default-editor\n", + " containers:\n", + " - command:\n", + " - /usr/local/bin/tensorboard\n", + " - --logdir={model_dir}\n", + " - --port=80\n", + " image: tensorflow/tensorflow:1.15.2-py3\n", + " name: tensorboard\n", + " ports:\n", + " - containerPort: 80\n", + " volumeMounts:\n", + " - name: azure\n", + " mountPath: /mnt/azure\n", + " readOnly: false\n", + " volumes:\n", + " - name: azure\n", + " azureFile:\n", + " secretName: azure-share-secret\n", + " shareName: mnist\n", + " readOnly: false\n", + "\"\"\"\n", + "tb_service = f\"\"\"apiVersion: v1\n", + "kind: Service\n", + "metadata:\n", + " labels:\n", + " app: mnist-tensorboard\n", + " name: {tb_name}\n", + " namespace: {namespace}\n", + "spec:\n", + " ports:\n", + " - name: http-tb\n", + " port: 80\n", + " targetPort: 80\n", + " selector:\n", + " app: mnist-tensorboard\n", + " type: ClusterIP\n", + "\"\"\"\n", + "\n", + "tb_virtual_service = f\"\"\"apiVersion: networking.istio.io/v1alpha3\n", + "kind: VirtualService\n", + "metadata:\n", + " name: {tb_name}\n", + " namespace: {namespace}\n", + "spec:\n", + " gateways:\n", + " - kubeflow/kubeflow-gateway\n", 
+ " hosts:\n", + " - '*'\n", + " http:\n", + " - match:\n", + " - uri:\n", + " prefix: /mnist/{namespace}/tensorboard/\n", + " rewrite:\n", + " uri: /\n", + " route:\n", + " - destination:\n", + " host: {tb_name}.{namespace}.svc.cluster.local\n", + " port:\n", + " number: 80\n", + " timeout: 300s\n", + "\"\"\"\n", + "\n", + "tb_specs = [tb_deploy, tb_service, tb_virtual_service]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "k8s_util.apply_k8s_specs(tb_specs, k8s_util.K8S_CREATE_OR_REPLACE)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Access The TensorBoard UI\n", + "\n", + "Recall we are forwarding the cluster-internal ingress.\n", + "\n", + "```bash\n", + "kubectl port-forward svc/istio-ingressgateway -n istio-system 8080:80\n", + "```\n", + "\n", + "To access TensorBoard, manually visit the path:\n", + "[/mnist/your-kubeflow-namespace/tensorboard/](/mnist/your-kubeflow-namespace/tensorboard/)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Wait For the Training Job to finish" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* You can use the TFJob client to wait for it to finish." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tf_job = tf_job_client.wait_for_condition(train_name, expected_condition=[\"Succeeded\", \"Failed\"], namespace=namespace)\n", + "\n", + "if tf_job_client.is_job_succeeded(train_name, namespace):\n", + " logging.info(f\"TFJob {namespace}.{train_name} succeeded\")\n", + "else:\n", + " raise ValueError(f\"TFJob {namespace}.{train_name} failed\") " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Serve the model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* Deploy the model using tensorflow serving\n", + "* We need to create\n", + " 1. 
A Kubernetes Deployment\n", + " 1. A Kubernetes service\n", + " 1. (Optional) Create a configmap containing the prometheus monitoring config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "deploy_name = \"mnist-model\"\n", + "model_base_path = export_path\n", + "\n", + "# The web ui defaults to mnist-service so if you change it you will\n", + "# need to change it in the UI as well to send predictions to the mode\n", + "model_service = \"mnist-service\"\n", + "\n", + "deploy_spec = f\"\"\"apiVersion: apps/v1\n", + "kind: Deployment\n", + "metadata:\n", + " labels:\n", + " app: mnist\n", + " name: {deploy_name}\n", + " namespace: {namespace}\n", + "spec:\n", + " selector:\n", + " matchLabels:\n", + " app: mnist-model\n", + " template:\n", + " metadata:\n", + " # TODO(jlewi): Right now we disable the istio side car because otherwise ISTIO rbac will prevent the\n", + " # UI from sending RPCs to the server. We should create an appropriate ISTIO rbac authorization\n", + " # policy to allow traffic from the UI to the model servier.\n", + " # https://istio.io/docs/concepts/security/#target-selectors\n", + " annotations: \n", + " sidecar.istio.io/inject: \"false\"\n", + " labels:\n", + " app: mnist-model\n", + " version: v1\n", + " spec:\n", + " serviceAccount: default-editor\n", + " containers:\n", + " - args:\n", + " - --port=9000\n", + " - --rest_api_port=8500\n", + " - --model_name=mnist\n", + " - --model_base_path={model_base_path}\n", + " - --monitoring_config_file=/var/config/monitoring_config.txt\n", + " command:\n", + " - /usr/bin/tensorflow_model_server\n", + " env:\n", + " - name: modelBasePath\n", + " value: {model_base_path}\n", + " image: tensorflow/serving:1.15.0\n", + " imagePullPolicy: IfNotPresent\n", + " livenessProbe:\n", + " initialDelaySeconds: 30\n", + " periodSeconds: 30\n", + " tcpSocket:\n", + " port: 9000\n", + " name: mnist\n", + " ports:\n", + " - containerPort: 
9000\n", + " - containerPort: 8500\n", + " resources:\n", + " limits:\n", + " cpu: \"1\"\n", + " memory: 1Gi\n", + " requests:\n", + " cpu: \"1\"\n", + " memory: 1Gi\n", + " volumeMounts:\n", + " - mountPath: /var/config/\n", + " name: model-config\n", + " - name: azure\n", + " mountPath: /mnt/azure\n", + " volumes:\n", + " - configMap:\n", + " name: {deploy_name}\n", + " name: model-config\n", + " - name: azure\n", + " azureFile:\n", + " secretName: azure-share-secret\n", + " shareName: mnist\n", + " readOnly: false\n", + "\"\"\"\n", + "\n", + "service_spec = f\"\"\"apiVersion: v1\n", + "kind: Service\n", + "metadata:\n", + " annotations: \n", + " prometheus.io/path: /monitoring/prometheus/metrics\n", + " prometheus.io/port: \"8500\"\n", + " prometheus.io/scrape: \"true\"\n", + " labels:\n", + " app: mnist-model\n", + " name: {model_service}\n", + " namespace: {namespace}\n", + "spec:\n", + " ports:\n", + " - name: grpc-tf-serving\n", + " port: 9000\n", + " targetPort: 9000\n", + " - name: http-tf-serving\n", + " port: 8500\n", + " targetPort: 8500\n", + " selector:\n", + " app: mnist-model\n", + " type: ClusterIP\n", + "\"\"\"\n", + "\n", + "monitoring_config = f\"\"\"kind: ConfigMap\n", + "apiVersion: v1\n", + "metadata:\n", + " name: {deploy_name}\n", + " namespace: {namespace}\n", + "data:\n", + " monitoring_config.txt: |-\n", + " prometheus_config: {{\n", + " enable: true,\n", + " path: \"/monitoring/prometheus/metrics\"\n", + " }}\n", + "\"\"\"\n", + "\n", + "model_specs = [deploy_spec, service_spec, monitoring_config]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "k8s_util.apply_k8s_specs(model_specs, k8s_util.K8S_CREATE_OR_REPLACE)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Deploy the mnist UI\n", + "\n", + "* We will now deploy the UI to visualize the mnist results\n", + "* Note: This is using a prebuilt and public docker image for the UI" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ui_name = \"mnist-ui\"\n", + "ui_deploy = f\"\"\"apiVersion: apps/v1\n", + "kind: Deployment\n", + "metadata:\n", + " name: {ui_name}\n", + " namespace: {namespace}\n", + "spec:\n", + " replicas: 1\n", + " selector:\n", + " matchLabels:\n", + " app: mnist-web-ui\n", + " template:\n", + " metadata:\n", + " labels:\n", + " app: mnist-web-ui\n", + " spec:\n", + " containers:\n", + " - image: gcr.io/kubeflow-examples/mnist/web-ui:v20190112-v0.2-142-g3b38225\n", + " name: web-ui\n", + " ports:\n", + " - containerPort: 5000 \n", + " serviceAccount: default-editor\n", + "\"\"\"\n", + "\n", + "ui_service = f\"\"\"apiVersion: v1\n", + "kind: Service\n", + "metadata:\n", + " annotations:\n", + " name: {ui_name}\n", + " namespace: {namespace}\n", + "spec:\n", + " ports:\n", + " - name: http-mnist-ui\n", + " port: 80\n", + " targetPort: 5000\n", + " selector:\n", + " app: mnist-web-ui\n", + " type: ClusterIP\n", + "\"\"\"\n", + "\n", + "ui_virtual_service = f\"\"\"apiVersion: networking.istio.io/v1alpha3\n", + "kind: VirtualService\n", + "metadata:\n", + " name: {ui_name}\n", + " namespace: {namespace}\n", + "spec:\n", + " gateways:\n", + " - kubeflow/kubeflow-gateway\n", + " hosts:\n", + " - '*'\n", + " http:\n", + " - match:\n", + " - uri:\n", + " prefix: /mnist/{namespace}/ui/\n", + " rewrite:\n", + " uri: /\n", + " route:\n", + " - destination:\n", + " host: {ui_name}.{namespace}.svc.cluster.local\n", + " port:\n", + " number: 80\n", + " timeout: 300s\n", + "\"\"\"\n", + "\n", + "ui_specs = [ui_deploy, ui_service, ui_virtual_service]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "k8s_util.apply_k8s_specs(ui_specs, k8s_util.K8S_CREATE_OR_REPLACE) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Access the web UI\n", + "\n", + "Recall we are forwarding the cluster-internal 
ingress.\n", + "\n", + "```bash\n", + "kubectl port-forward svc/istio-ingressgateway -n istio-system 8080:80\n", + "```\n", + "\n", + "To access the web UI, manually visit the path:\n", + "[/mnist/your-kubeflow-namespace/ui/](/mnist/your-kubeflow-namespace/ui/)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file