From 4c4f1c0f882b6ad84f2c15a5743fb8bf88720e4f Mon Sep 17 00:00:00 2001 From: Jiaxin Shan Date: Thu, 20 Feb 2020 18:32:32 -0800 Subject: [PATCH] Create a notebook for mnist E2E on AWS (#740) * Add method to get ALB hostname for aws users * Revoke setup based on the platform * Add AWS notebook for mnist e2e example * Remove legacy kustomize manifests for mnist example * Address feedbacks from reviewers --- mnist/README.md | 389 +------- mnist/k8s_util.py | 26 + mnist/mnist_aws.ipynb | 1002 +++++++++++++++++++++ mnist/monitoring/S3/deployment_patch.yaml | 46 - mnist/monitoring/S3/kustomization.yaml | 81 -- mnist/monitoring/S3/params.yaml | 5 - mnist/notebook_setup.py | 24 +- mnist/serving/S3/deployment_patch.yaml | 41 - mnist/serving/S3/kustomization.yaml | 74 -- mnist/serving/S3/params.yaml | 5 - mnist/training/S3/Chief_patch.yaml | 46 - mnist/training/S3/Ps_patch.yaml | 46 - mnist/training/S3/Worker_patch.yaml | 46 - mnist/training/S3/kustomization.yaml | 86 -- mnist/training/S3/params.yaml | 17 - 15 files changed, 1076 insertions(+), 858 deletions(-) create mode 100644 mnist/mnist_aws.ipynb delete mode 100644 mnist/monitoring/S3/deployment_patch.yaml delete mode 100644 mnist/monitoring/S3/kustomization.yaml delete mode 100644 mnist/monitoring/S3/params.yaml delete mode 100644 mnist/serving/S3/deployment_patch.yaml delete mode 100644 mnist/serving/S3/kustomization.yaml delete mode 100644 mnist/serving/S3/params.yaml delete mode 100644 mnist/training/S3/Chief_patch.yaml delete mode 100644 mnist/training/S3/Ps_patch.yaml delete mode 100644 mnist/training/S3/Worker_patch.yaml delete mode 100644 mnist/training/S3/kustomization.yaml delete mode 100644 mnist/training/S3/params.yaml diff --git a/mnist/README.md b/mnist/README.md index 1ff4731be..c06ff2c98 100644 --- a/mnist/README.md +++ b/mnist/README.md @@ -4,6 +4,7 @@ - [MNIST on Kubeflow](#mnist-on-kubeflow) - [MNIST on Kubeflow on GCP](#mnist-on-kubeflow-on-gcp) +- [MNIST on Kubeflow on AWS](#mnist-on-kubeflow-on-aws) - [MNIST on Kubeflow on Vanilla k8s](#vanilla) - [MNIST on other platforms](#mnist-on-other-platforms) - [Prerequisites](#prerequisites) @@ -16,14 +17,11 @@ - [Preparing your Kubernetes Cluster](#preparing-your-kubernetes-cluster) - [Training your model](#training-your-model) - [Local storage](#local-storage) - - [Using S3](#using-s3) - [Monitoring](#monitoring) - [Tensorboard](#tensorboard) - [Local storage](#local-storage-1) - - [Using S3](#using-s3-1) - [Deploying TensorBoard](#deploying-tensorboard) - [Serving the model](#serving-the-model) - - [S3](#s3) - [Local storage](#local-storage-2) - [Web Front End](#web-front-end) - [Connecting via port forwarding](#connecting-via-port-forwarding) @@ -39,6 +37,7 @@ This example guides you through the process of taking an example model, modifyin Follow the version of the guide that is specific to how you have deployed Kubeflow 1. [MNIST on Kubeflow on GCP](#gcp) +1. [MNIST on Kubeflow on AWS](#aws) 1. [MNIST on Kubeflow on vanilla k8s](#vanilla) 1. [MNIST on other platforms](#other) @@ -68,6 +67,32 @@ Follow these instructions to run the MNIST tutorial on GCP 1. Follow the notebook to train and deploy MNIST on Kubeflow + +# MNIST on Kubeflow on AWS + +Follow these instructions to run the MNIST tutorial on AWS + +1. Follow the [AWS instructions](https://www.kubeflow.org/docs/aws/deploy/install-kubeflow/) to deploy Kubeflow on AWS + +1. Launch a Jupyter notebook + + * The tutorial has been tested using the Jupyter Tensorflow 1.15 image + +1. 
Launch a terminal in Jupyter and clone the kubeflow examples repo
+
+   ```
+   git clone https://github.com/kubeflow/examples.git git_kubeflow-examples
+   ```
+
+   * **Tip** When you start a terminal in Jupyter, run the command `bash` to start
+     a bash terminal which is much more friendly than the default shell
+
+   * **Tip** You can change the URL from '/tree' to '/lab' to switch to using JupyterLab
+
+1. Open the notebook `mnist/mnist_aws.ipynb`
+
+1. Follow the notebook to train and deploy MNIST on Kubeflow
+
 <a id=vanilla></a>
 # MNIST on Kubeflow on Vanilla k8s
 
@@ -82,7 +107,7 @@ Follow these instructions to run the MNIST tutorial on GCP
 1. Launch a terminal in Jupyter and clone the kubeflow/examples repo
 
    ```bash
-   git clone https://github.com/kubeflow/examples.git git_kubeflow-examples
+   git clone https://github.com/kubeflow/examples.git git_kubeflow-examples
    ```
 
 1. Open the notebook `mnist/mnist_vanilla_k8s.ipynb`
 
@@ -127,7 +152,6 @@ Source documentation: [Kaniko docs](https://github.com/GoogleContainerTools/kani
 The tutorial is currently not up to date for Kubeflow 1.0. Please check the issues
 
-* [kubeflow/examples#724](https://github.com/kubeflow/examples/issues/724) for AWS
 * [kubeflow/examples#725](https://github.com/kubeflow/examples/issues/725) for other platforms
 
 ## Prerequisites
@@ -192,7 +216,7 @@ kubectl config set-context $(kubectl config current-context) --namespace=kubeflo
 
 #### Local storage
 
-Let's start by runing the training job on Kubeflow and storing the model in a local storage. 
+Let's start by running the training job on Kubeflow and storing the model in local storage.
 
 First, refer to the [document](https://kubernetes.io/docs/concepts/storage/persistent-volumes/) to create a Persistent Volume (PV) and a Persistent Volume Claim (PVC); the PVC name (${PVC_NAME}) will be used by the training and serving pods in local mode in the steps below.
 
@@ -241,7 +265,7 @@ kustomize edit add configmap mnist-map-training --from-literal=modelDir=/mnt
 kustomize edit add configmap mnist-map-training --from-literal=exportDir=/mnt/export
 ```
 
-You can now submit the job 
+You can now submit the job
 
 ```
 kustomize build . |kubectl apply -f -
@@ -253,158 +277,12 @@ And you can check the job
 kubectl get tfjobs -o yaml mnist-train-local
 ```
 
-And to check the logs 
+And to check the logs
 
 ```
 kubectl logs mnist-train-local-chief-0
 ```
 
-#### Using S3
-
-To use S3 we need to configure TensorFlow to use S3 credentials and variables. These credentials will be provided as kubernetes secrets and the variables will be passed in as environment variables. Modify the below values to suit your environment.
-
-Enter the `training/S3` from the `mnist` application directory.
-
-```
-cd training/S3
-```
-
-Give the job a different name (to distinguish it from your job which didn't use S3)
-
-```
-kustomize edit add configmap mnist-map-training --from-literal=name=mnist-train-dist
-```
-
-Optionally, if you want to use your custom training image, configurate that as below.
-
-```
-kustomize edit set image training-image=$DOCKER_URL
-```
-
-Next we configure it to run distributed by setting the number of parameter servers and workers to use. The `numPs` means the number of Ps and the `numWorkers` means the number of Worker.
-
-```
-../base/definition.sh --numPs 1 --numWorkers 2
-```
-
-Set the training parameters, such as training steps, batch size and learning rate. 
- -``` -kustomize edit add configmap mnist-map-training --from-literal=trainSteps=200 -kustomize edit add configmap mnist-map-training --from-literal=batchSize=100 -kustomize edit add configmap mnist-map-training --from-literal=learningRate=0.01 -``` - -In order to write to S3 we need to supply the TensorFlow code with AWS credentials we also need to set various environment variables configuring access to S3. - - 1. Define a bunch of environment variables corresponding to your S3 settings; these will be used in subsequent steps - - ``` - export S3_ENDPOINT=s3.us-west-2.amazonaws.com #replace with your s3 endpoint in a host:port format, e.g. minio:9000 - export AWS_ENDPOINT_URL=https://${S3_ENDPOINT} #use http instead of https for default minio installs - export AWS_ACCESS_KEY_ID=xxxxx - export AWS_SECRET_ACCESS_KEY=xxxxx - export AWS_REGION=us-west-2 - export BUCKET_NAME=mybucket - export S3_USE_HTTPS=1 #set to 0 for default minio installs - export S3_VERIFY_SSL=1 #set to 0 for defaul minio installs - export S3_MODEL_PATH_URI=s3://${BUCKET_NAME}/model - export S3_MODEL_EXPORT_URI=s3://${BUCKET_NAME}/export - ``` - - 1. Create a K8s secret containing your AWS credentials - - ``` - kustomize edit add secret aws-creds --from-literal=awsAccessKeyID=${AWS_ACCESS_KEY_ID} \ - --from-literal=awsSecretAccessKey=${AWS_SECRET_ACCESS_KEY} - ``` - - 1. Pass secrets as environment variables into pod - - ``` - kustomize edit add configmap mnist-map-training --from-literal=awsAccessKeyIDName=awsAccessKeyID - kustomize edit add configmap mnist-map-training --from-literal=awsSecretAccessKeyName=awsSecretAccessKey - ``` - - 1. Next we need to set a whole bunch of S3 related environment variables so that TensorFlow knows how to talk to S3 - - ``` - kustomize edit add configmap mnist-map-training --from-literal=S3_ENDPOINT=${S3_ENDPOINT} - kustomize edit add configmap mnist-map-training --from-literal=AWS_ENDPOINT_URL=${AWS_ENDPOINT_URL} - kustomize edit add configmap mnist-map-training --from-literal=AWS_REGION=${AWS_REGION} - kustomize edit add configmap mnist-map-training --from-literal=BUCKET_NAME=${BUCKET_NAME} - kustomize edit add configmap mnist-map-training --from-literal=S3_USE_HTTPS=${S3_USE_HTTPS} - kustomize edit add configmap mnist-map-training --from-literal=S3_VERIFY_SSL=${S3_VERIFY_SSL} - kustomize edit add configmap mnist-map-training --from-literal=modelDir=${S3_MODEL_PATH_URI} - kustomize edit add configmap mnist-map-training --from-literal=exportDir=${S3_MODEL_EXPORT_URI} - ``` - - * If we look at the spec for our job we can see that the environment variables related to S3 are set. - - ``` - kustomize build . - - apiVersion: kubeflow.org/v1beta2 - kind: TFJob - metadata: - ... - spec: - tfReplicaSpecs: - Chief: - replicas: 1 - template: - spec: - containers: - - command: - .. - env: - ... - - name: S3_ENDPOINT - value: s3.us-west-2.amazonaws.com - - name: AWS_ENDPOINT_URL - value: https://s3.us-west-2.amazonaws.com - - name: AWS_REGION - value: us-west-2 - - name: BUCKET_NAME - value: mybucket - - name: S3_USE_HTTPS - value: "1" - - name: S3_VERIFY_SSL - value: "1" - - name: AWS_ACCESS_KEY_ID - valueFrom: - secretKeyRef: - key: awsAccessKeyID - name: aws-creds-somevalue - - name: AWS_SECRET_ACCESS_KEY - valueFrom: - secretKeyRef: - key: awsSecretAccessKey - name: aws-creds-somevalue - ... - ... - ... - ``` - - -You can now submit the job - -``` -kustomize build . 
|kubectl apply -f - -``` - -And you can check the job - -``` -kubectl get tfjobs -o yaml mnist-train-dist -``` - -And to check the logs - -``` -kubectl logs -f mnist-train-dist-chief-0 -``` - ## Monitoring There are various ways to monitor workflow/training job. In addition to using `kubectl` to query for the status of `pods`, some basic dashboards are also available. @@ -425,88 +303,6 @@ kustomize edit add configmap mnist-map-monitoring --from-literal=pvcMountPath=/m kustomize edit add configmap mnist-map-monitoring --from-literal=logDir=/mnt ``` -#### Using S3 - -Enter the `monitoring/S3` from the `mnist` application directory. - -``` -cd monitoring/S3 -``` - -Assuming you followed the directions above if you used S3 you can use the following value - -``` -LOGDIR=${S3_MODEL_PATH_URI} -kustomize edit add configmap mnist-map-monitoring --from-literal=logDir=${LOGDIR} -``` - -You need to point TensorBoard to AWS credentials to access S3 bucket with model. - - 1. Create a K8s secret containing your AWS credentials - - ``` - kustomize edit add secret aws-creds --from-literal=awsAccessKeyID=${AWS_ACCESS_KEY_ID} \ - --from-literal=awsSecretAccessKey=${AWS_SECRET_ACCESS_KEY} - ``` - - 1. Pass secrets as environment variables into pod - - ``` - kustomize edit add configmap mnist-map-monitoring --from-literal=awsAccessKeyIDName=awsAccessKeyID - kustomize edit add configmap mnist-map-monitoring --from-literal=awsSecretAccessKeyName=awsSecretAccessKey - ``` - - 1. Next we need to set a whole bunch of S3 related environment variables so that TensorBoard knows how to talk to S3 - - ``` - kustomize edit add configmap mnist-map-monitoring --from-literal=S3_ENDPOINT=${S3_ENDPOINT} - kustomize edit add configmap mnist-map-monitoring --from-literal=AWS_ENDPOINT_URL=${AWS_ENDPOINT_URL} - kustomize edit add configmap mnist-map-monitoring --from-literal=AWS_REGION=${AWS_REGION} - kustomize edit add configmap mnist-map-monitoring --from-literal=BUCKET_NAME=${BUCKET_NAME} - kustomize edit add configmap mnist-map-monitoring --from-literal=S3_USE_HTTPS=${S3_USE_HTTPS} - kustomize edit add configmap mnist-map-monitoring --from-literal=S3_VERIFY_SSL=${S3_VERIFY_SSL} - ``` - - * If we look at the spec for TensorBoard deployment we can see that the environment variables related to S3 are set. - - ``` - kustomize build . - ``` - - ``` - ... - spec: - containers: - - command: - .. - env: - ... - - name: S3_ENDPOINT - value: s3.us-west-2.amazonaws.com - - name: AWS_ENDPOINT_URL - value: https://s3.us-west-2.amazonaws.com - - name: AWS_REGION - value: us-west-2 - - name: BUCKET_NAME - value: mybucket - - name: S3_USE_HTTPS - value: "1" - - name: S3_VERIFY_SSL - value: "1" - - name: AWS_ACCESS_KEY_ID - valueFrom: - secretKeyRef: - key: awsAccessKeyID - name: aws-creds-somevalue - - name: AWS_SECRET_ACCESS_KEY - valueFrom: - secretKeyRef: - key: awsSecretAccessKey - name: aws-creds-somevalue - ... - ``` - - #### Deploying TensorBoard Now you can deploy TensorBoard @@ -529,125 +325,6 @@ The model code will export the model in saved model format which is suitable for To serve the model follow the instructions below. The instructins vary slightly based on where you are storing your model (e.g. GCS, S3, PVC). Depending on the storage system we provide different kustomization as a convenience for setting relevant environment variables. -### S3 - -We can also serve the model when it is stored on S3. 
This assumes that when you trained the model you set `exportDir` to a S3 -URI; if not you can always copy it to S3 using the AWS CLI. - -Assuming you followed the directions above, you should have set the following environment variables that will be used in this section: - -``` -echo ${S3_MODEL_EXPORT_URI} -echo ${AWS_REGION} -echo ${S3_ENDPOINT} -echo ${S3_USE_HTTPS} -echo ${S3_VERIFY_SSL} -``` - -Check that a model was exported to s3 - -``` -aws s3 ls ${S3_MODEL_EXPORT_URI} --recursive -``` - -The output should look something like - -``` -${S3_MODEL_EXPORT_URI}/1547100373/saved_model.pb -${S3_MODEL_EXPORT_URI}/1547100373/variables/ -${S3_MODEL_EXPORT_URI}/1547100373/variables/variables.data-00000-of-00001 -${S3_MODEL_EXPORT_URI}/1547100373/variables/variables.index -``` - -The number `1547100373` is a version number auto-generated by TensorFlow; it will vary on each run but should be monotonically increasing if you save a model to the same location as a previous location. - -Enter the `serving/S3` folder from the `mnist` application directory. -``` -cd serving/S3 -``` - -Set a different name for the tf-serving. - -``` -kustomize edit add configmap mnist-map-serving --from-literal=name=mnist-s3-serving -``` - -Create a K8s secret containing your AWS credentials - -``` -kustomize edit add secret aws-creds --from-literal=awsAccessKeyID=${AWS_ACCESS_KEY_ID} \ - --from-literal=awsSecretAccessKey=${AWS_SECRET_ACCESS_KEY} -``` - -Enable serving from S3 by configuring the following ksonnet parameters using the environment variables from above: - -``` -kustomize edit add configmap mnist-map-serving --from-literal=s3Enable=1 #This needs to be true for S3 connection to work -kustomize edit add configmap mnist-map-serving --from-literal=modelBasePath=${S3_MODEL_EXPORT_URI}/ -kustomize edit add configmap mnist-map-serving --from-literal=S3_ENDPOINT=${S3_ENDPOINT} -kustomize edit add configmap mnist-map-serving --from-literal=AWS_REGION=${AWS_REGION} -kustomize edit add configmap mnist-map-serving --from-literal=S3_USE_HTTPS=${S3_USE_HTTPS} -kustomize edit add configmap mnist-map-serving --from-literal=S3_VERIFY_SSL=${S3_VERIFY_SSL} -kustomize edit add configmap mnist-map-serving --from-literal=AWS_ACCESS_KEY_ID=awsAccessKeyID -kustomize edit add configmap mnist-map-serving --from-literal=AWS_SECRET_ACCESS_KEY=awsSecretAccessKey -``` - -If we look at the spec for TensorFlow deployment we can see that the environment variables related to S3 are set. -``` -kustomize build . -``` - -``` -... -spec: - containers: - - command: - .. - env: - ... - - name: modelBasePath - value: s3://mybucket/export/ - - name: s3Enable - value: "1" - - name: S3_ENDPOINT - value: s3.us-west-2.amazonaws.com - - name: AWS_REGION - value: us-west-2 - - name: S3_USE_HTTPS - value: "1" - - name: S3_VERIFY_SSL - value: "1" - - name: AWS_ACCESS_KEY_ID - valueFrom: - secretKeyRef: - key: awsAccessKeyID - name: aws-creds-somevalue - - name: AWS_SECRET_ACCESS_KEY - valueFrom: - secretKeyRef: - key: awsSecretAccessKey - name: aws-creds-somevalue - ... -``` - -Deploy it, and run a service to make the deployment accessible to other pods in the cluster - -``` -kustomize build . 
|kubectl apply -f -
-```
-
-You can check the deployment by running
-
-```
-kubectl describe deployments mnist-s3-serving
-```
-
-The service should make the `mnist-s3-serving` deployment accessible over port 9000
-
-```
-kubectl describe service mnist-s3-serving
-```
-
 ### Local storage
 
 The section shows how to serve the local model that was stored in PVC while training.
@@ -716,7 +393,7 @@ To connect to the web app via port-forwarding
 
 ```
 POD_NAME=$(kubectl get pods --selector=app=web-ui --template '{{range .items}}{{.metadata.name}}{{"\n"}}{{end}}')
-kubectl port-forward ${POD_NAME} 8080:5000 
+kubectl port-forward ${POD_NAME} 8080:5000
 ```
 
 You should now be able to open up the web app at your localhost. [Local Storage](http://localhost:8080) or [S3](http://localhost:8080/?addr=mnist-s3-serving).
diff --git a/mnist/k8s_util.py b/mnist/k8s_util.py
index 570fb56ee..5af2bc8af 100644
--- a/mnist/k8s_util.py
+++ b/mnist/k8s_util.py
@@ -145,3 +145,29 @@ def get_iap_endpoint():
       raise
 
   return f"https://{kf_ingress.spec.rules[0].host}"
+
+# Used by AWS deployments
+def get_ingress_endpoint():
+  """Return the URL of the Ingress endpoint"""
+  extensions = k8s_client.ExtensionsV1beta1Api()
+  kf_ingress = None
+
+  try:
+    kf_ingress = extensions.read_namespaced_ingress("istio-ingress", "istio-system")
+  except k8s_rest.ApiException as e:
+    if e.status == 403:
+      logging.warning(f"The service account doesn't have sufficient privileges "
+                      f"to get the istio-system ingress. "
+                      f"You will have to manually enter the Kubeflow endpoint. "
+                      f"To make this function work ask someone with cluster "
+                      f"privileges to create an appropriate "
+                      f"clusterrolebinding by running a command.\n"
+                      f"kubectl create --namespace=istio-system rolebinding "
+                      "--clusterrole=kubeflow-view "
+                      "--serviceaccount=${NAMESPACE}:default-editor "
+                      "${NAMESPACE}-istio-view")
+      return ""
+
+    raise
+
+  return f"http://{kf_ingress.status.load_balancer.ingress[0].hostname}"
\ No newline at end of file
diff --git a/mnist/mnist_aws.ipynb b/mnist/mnist_aws.ipynb
new file mode 100644
index 000000000..8d85bbdcc
--- /dev/null
+++ b/mnist/mnist_aws.ipynb
@@ -0,0 +1,1002 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# MNIST E2E on Kubeflow on AWS\n",
+    "\n",
+    "This example guides you through:\n",
+    " \n",
+    " 1. Taking an example TensorFlow model and modifying it to support distributed training\n",
+    " 1. Serving the resulting model using TFServing\n",
+    " 1. Deploying and using a web-app that uses the model\n",
+    " \n",
+    "## Requirements\n",
+    "\n",
+    " * You must be running Kubeflow 1.0 on EKS\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Install the AWS SDK (boto3)\n",
+    "\n",
+    "\n",
+    "Click `Kernel` -> `Restart` after you install new packages."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install boto3"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Create an AWS secret in Kubernetes and grant AWS access to your notebook\n",
+    "\n",
+    "> Note: Once IAM for Service Accounts support lands in 1.0.1, we won't have to use credentials\n",
+    "\n",
+    "1. Create an AWS secret in the current namespace.\n",
+    "\n",
+    "> Note: To get a base64 string, try `echo -n $AWS_ACCESS_KEY_ID | base64`.\n",
+    "> Make sure you have `AmazonEC2ContainerRegistryFullAccess` and `AmazonS3FullAccess` for this experiment. Pods will use credentials to talk to AWS services."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "\n",
+    "# Replace the placeholders with your own AWS credentials\n",
+    "AWS_ACCESS_KEY_ID=''\n",
+    "AWS_SECRET_ACCESS_KEY=''\n",
+    "\n",
+    "kubectl create secret generic aws-secret --from-literal=AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID} --from-literal=AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}"
+   ]
+  },
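+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a quick sanity check (a sketch, not part of the original flow), you can read the secret back with the Kubernetes Python client. It assumes the notebook runs in-cluster and that you kept the name `aws-secret`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from kubernetes import client, config\n",
+    "\n",
+    "# Load the in-cluster config of the notebook pod and read the namespace\n",
+    "# from the standard service account mount.\n",
+    "config.load_incluster_config()\n",
+    "ns = open(\"/var/run/secrets/kubernetes.io/serviceaccount/namespace\").read()\n",
+    "secret = client.CoreV1Api().read_namespaced_secret(\"aws-secret\", ns)\n",
+    "print(f\"Found secret {secret.metadata.name} with keys {list(secret.data.keys())}\")"
+   ]
+  },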
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "2. Attach `AmazonEC2ContainerRegistryFullAccess` and `AmazonS3FullAccess` to the EKS node group role to grant AWS access to the notebook."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Verify you have access to AWS services\n",
+    "\n",
+    "* The cell below checks that this notebook was spawned with credentials to access AWS S3 and ECR"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import logging\n",
+    "import os\n",
+    "import uuid\n",
+    "from importlib import reload\n",
+    "import boto3\n",
+    "\n",
+    "# Set the region for the S3 bucket and Elastic Container Registry\n",
+    "AWS_REGION='us-west-2'\n",
+    "boto3.client('s3', region_name=AWS_REGION).list_buckets()\n",
+    "boto3.client('ecr', region_name=AWS_REGION).describe_repositories()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Prepare model\n",
+    "\n",
+    "There is a delta between existing distributed mnist examples and what's needed to run well as a TFJob.\n",
+    "\n",
+    "Basically, we must:\n",
+    "\n",
+    "1. Add options in order to make the model configurable.\n",
+    "1. Use `tf.estimator.train_and_evaluate` to enable model exporting and serving.\n",
+    "1. Define serving signatures for model serving.\n",
+    "\n",
+    "The resulting model is [model.py](model.py)."
+   ]
+  },
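+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For orientation, here is a minimal sketch of those three changes using the TF 1.x Estimator API. It is hypothetical and heavily simplified; the real implementation is in [model.py](model.py).\n",
+    "\n",
+    "```python\n",
+    "import argparse\n",
+    "import tensorflow as tf\n",
+    "\n",
+    "# 1. Options to make the model configurable.\n",
+    "parser = argparse.ArgumentParser()\n",
+    "parser.add_argument(\"--tf-model-dir\", type=str, required=True)\n",
+    "parser.add_argument(\"--tf-train-steps\", type=int, default=200)\n",
+    "args = parser.parse_args()\n",
+    "\n",
+    "def input_fn():\n",
+    "    (x, y), _ = tf.keras.datasets.mnist.load_data()\n",
+    "    x = x.reshape(-1, 784).astype(\"float32\") / 255.0\n",
+    "    return tf.data.Dataset.from_tensor_slices(({\"x\": x}, y.astype(\"int32\"))).batch(100).repeat()\n",
+    "\n",
+    "# 3. A serving signature: accept flattened 28x28 images under the \"x\" key.\n",
+    "def serving_input_receiver_fn():\n",
+    "    inputs = {\"x\": tf.placeholder(tf.float32, [None, 784])}\n",
+    "    return tf.estimator.export.ServingInputReceiver(inputs, inputs)\n",
+    "\n",
+    "estimator = tf.estimator.LinearClassifier(\n",
+    "    feature_columns=[tf.feature_column.numeric_column(\"x\", shape=[784])],\n",
+    "    n_classes=10, model_dir=args.tf_model_dir)\n",
+    "\n",
+    "# 2. train_and_evaluate plus an exporter writes a servable SavedModel.\n",
+    "tf.estimator.train_and_evaluate(\n",
+    "    estimator,\n",
+    "    tf.estimator.TrainSpec(input_fn=input_fn, max_steps=args.tf_train_steps),\n",
+    "    tf.estimator.EvalSpec(input_fn=input_fn,\n",
+    "                          exporters=[tf.estimator.FinalExporter(\"mnist\", serving_input_receiver_fn)]))\n",
+    "```"
+   ]
+  },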
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Install Required Libraries\n",
+    "\n",
+    "Import the libraries required to train this model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import notebook_setup\n",
+    "reload(notebook_setup)\n",
+    "notebook_setup.notebook_setup(platform='aws')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import k8s_util\n",
+    "# Force a reload of kubeflow; since kubeflow is a multi namespace module\n",
+    "# it looks like doing this in notebook_setup may not be sufficient\n",
+    "import kubeflow\n",
+    "reload(kubeflow)\n",
+    "from kubernetes import client as k8s_client\n",
+    "from kubernetes import config as k8s_config\n",
+    "from kubeflow.tfjob.api import tf_job_client as tf_job_client_module\n",
+    "from IPython.core.display import display, HTML\n",
+    "import yaml"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Configure The Docker Registry For Kubeflow Fairing\n",
+    "\n",
+    "* In order to build docker images from your notebook we need a docker registry where the images will be stored\n",
+    "* Below you set some variables specifying an [Amazon Elastic Container Registry](https://aws.amazon.com/ecr/)\n",
+    "* Below you look up your AWS account id with the STS API"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from kubernetes import client as k8s_client\n",
+    "from kubernetes.client import rest as k8s_rest\n",
+    "from kubeflow import fairing\n",
+    "from kubeflow.fairing import utils as fairing_utils\n",
+    "from kubeflow.fairing.builders import append\n",
+    "from kubeflow.fairing.deployers import job\n",
+    "from kubeflow.fairing.preprocessors import base as base_preprocessor\n",
+    "\n",
+    "# Setting up AWS Elastic Container Registry (ECR) for storing output containers\n",
+    "# You can use any docker container registry instead of ECR\n",
+    "AWS_ACCOUNT_ID = boto3.client('sts').get_caller_identity().get('Account')\n",
+    "DOCKER_REGISTRY = '{}.dkr.ecr.{}.amazonaws.com'.format(AWS_ACCOUNT_ID, AWS_REGION)\n",
+    "\n",
+    "namespace = fairing_utils.get_current_k8s_namespace()\n",
+    "\n",
+    "logging.info(f\"Running in aws region {AWS_REGION}, account {AWS_ACCOUNT_ID}\")\n",
+    "logging.info(f\"Running in namespace {namespace}\")\n",
+    "logging.info(f\"Using docker registry {DOCKER_REGISTRY}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Use Kubeflow fairing to build the docker image\n",
+    "\n",
+    "* You will use kubeflow fairing's kaniko builder to build a docker image that includes all your dependencies\n",
+    " * You use kaniko because you want to be able to run `pip` to install dependencies\n",
+    " * Kaniko gives you the flexibility to build images from Dockerfiles"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# TODO(https://github.com/kubeflow/fairing/issues/426): We should get rid of this once the default \n",
+    "# Kaniko image is updated to a newer image than 0.7.0.\n",
+    "from kubeflow.fairing import constants\n",
+    "constants.constants.KANIKO_IMAGE = \"gcr.io/kaniko-project/executor:v0.14.0\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from kubeflow.fairing.builders import cluster\n",
+    "\n",
+    "# output_map is a map of extra files to add to the build context.\n",
+    "# It is a map from source location to the 
location inside the context.\n",
+    "output_map = {\n",
+    "    \"Dockerfile.model\": \"Dockerfile\",\n",
+    "    \"model.py\": \"model.py\"\n",
+    "}\n",
+    "\n",
+    "preprocessor = base_preprocessor.BasePreProcessor(\n",
+    "    command=[\"python\"], # The base class will set this.\n",
+    "    input_files=[],\n",
+    "    path_prefix=\"/app\", # irrelevant since we aren't preprocessing any files\n",
+    "    output_map=output_map)\n",
+    "\n",
+    "preprocessor.preprocess()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create a new ECR repository to host the model image\n",
+    "!aws ecr create-repository --repository-name mnist --region=$AWS_REGION"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Use a Tensorflow image as the base image\n",
+    "# We use a custom Dockerfile\n",
+    "cluster_builder = cluster.cluster.ClusterBuilder(registry=DOCKER_REGISTRY,\n",
+    "                                                 base_image=\"\", # base_image is set in the Dockerfile\n",
+    "                                                 preprocessor=preprocessor,\n",
+    "                                                 image_name=\"mnist\",\n",
+    "                                                 dockerfile_path=\"Dockerfile\",\n",
+    "                                                 pod_spec_mutators=[fairing.cloud.aws.add_aws_credentials_if_exists, fairing.cloud.aws.add_ecr_config],\n",
+    "                                                 context_source=cluster.s3_context.S3ContextSource(region=AWS_REGION))\n",
+    "cluster_builder.build()\n",
+    "logging.info(f\"Built image {cluster_builder.image_tag}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create an S3 Bucket\n",
+    "\n",
+    "* Create an S3 bucket to store our models and other results.\n",
+    "* Since we are running in Python we use the Python client libraries but you could also use the `aws s3` command line"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import boto3\n",
+    "from botocore.exceptions import ClientError\n",
+    "\n",
+    "bucket = f\"{AWS_ACCOUNT_ID}-mnist\"\n",
+    "\n",
+    "def create_bucket(bucket_name, region=None):\n",
+    "    \"\"\"Create an S3 bucket in a specified region\n",
+    "\n",
+    "    If a region is not specified, the bucket is created in the S3 default\n",
+    "    region (us-east-1).\n",
+    "\n",
+    "    :param bucket_name: Bucket to create\n",
+    "    :param region: String region to create bucket in, e.g., 'us-west-2'\n",
+    "    :return: True if bucket created, else False\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # Create bucket\n",
+    "    try:\n",
+    "        if region is None:\n",
+    "            s3_client = boto3.client('s3')\n",
+    "            s3_client.create_bucket(Bucket=bucket_name)\n",
+    "        else:\n",
+    "            s3_client = boto3.client('s3', region_name=region)\n",
+    "            location = {'LocationConstraint': region}\n",
+    "            s3_client.create_bucket(Bucket=bucket_name,\n",
+    "                                    CreateBucketConfiguration=location)\n",
+    "    except ClientError as e:\n",
+    "        logging.error(e)\n",
+    "        return False\n",
+    "    return True\n",
+    "\n",
+    "create_bucket(bucket, AWS_REGION)"
+   ]
+  },
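+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a quick check (a sketch; `head_bucket` raises a `ClientError` if the bucket is missing or not yours), confirm the bucket now exists:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Raises botocore.exceptions.ClientError if the bucket does not exist.\n",
+    "boto3.client('s3', region_name=AWS_REGION).head_bucket(Bucket=bucket)\n",
+    "logging.info(f\"Bucket {bucket} is available\")"
+   ]
+  },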
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Distributed training\n",
+    "\n",
+    "* We will train the model by using TFJob to run a distributed training job"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_name = f\"mnist-train-{uuid.uuid4().hex[:4]}\"\n",
+    "num_ps = 1\n",
+    "num_workers = 2\n",
+    "model_dir = f\"s3://{bucket}/mnist\"\n",
+    "export_path = f\"s3://{bucket}/mnist/export\"\n",
+    "train_steps = 200\n",
+    "batch_size = 100\n",
+    "learning_rate = .01\n",
+    "image = cluster_builder.image_tag\n",
+    "\n",
+    "train_spec = f\"\"\"apiVersion: kubeflow.org/v1\n",
+    "kind: TFJob\n",
+    "metadata:\n",
+    "  name: {train_name}\n",
+    "spec:\n",
+    "  tfReplicaSpecs:\n",
+    "    Ps:\n",
+    "      replicas: {num_ps}\n",
+    "      template:\n",
+    "        metadata:\n",
+    "          annotations:\n",
+    "            sidecar.istio.io/inject: \"false\"\n",
+    "        spec:\n",
+    "          serviceAccount: default-editor\n",
+    "          containers:\n",
+    "          - name: tensorflow\n",
+    "            command:\n",
+    "            - python\n",
+    "            - /opt/model.py\n",
+    "            - --tf-model-dir={model_dir}\n",
+    "            - --tf-export-dir={export_path}\n",
+    "            - --tf-train-steps={train_steps}\n",
+    "            - --tf-batch-size={batch_size}\n",
+    "            - --tf-learning-rate={learning_rate}\n",
+    "            image: {image}\n",
+    "            workingDir: /opt\n",
+    "            env:\n",
+    "            - name: AWS_REGION\n",
+    "              value: {AWS_REGION}\n",
+    "            - name: AWS_ACCESS_KEY_ID\n",
+    "              valueFrom:\n",
+    "                secretKeyRef:\n",
+    "                  name: aws-secret\n",
+    "                  key: AWS_ACCESS_KEY_ID\n",
+    "            - name: AWS_SECRET_ACCESS_KEY\n",
+    "              valueFrom:\n",
+    "                secretKeyRef:\n",
+    "                  name: aws-secret\n",
+    "                  key: AWS_SECRET_ACCESS_KEY\n",
+    "\n",
+    "          restartPolicy: OnFailure\n",
+    "    Chief:\n",
+    "      replicas: 1\n",
+    "      template:\n",
+    "        metadata:\n",
+    "          annotations:\n",
+    "            sidecar.istio.io/inject: \"false\"\n",
+    "        spec:\n",
+    "          serviceAccount: default-editor\n",
+    "          containers:\n",
+    "          - name: tensorflow\n",
+    "            command:\n",
+    "            - python\n",
+    "            - /opt/model.py\n",
+    "            - --tf-model-dir={model_dir}\n",
+    "            - --tf-export-dir={export_path}\n",
+    "            - --tf-train-steps={train_steps}\n",
+    "            - --tf-batch-size={batch_size}\n",
+    "            - --tf-learning-rate={learning_rate}\n",
+    "            image: {image}\n",
+    "            workingDir: /opt\n",
+    "            env:\n",
+    "            - name: AWS_REGION\n",
+    "              value: {AWS_REGION}\n",
+    "            - name: AWS_ACCESS_KEY_ID\n",
+    "              valueFrom:\n",
+    "                secretKeyRef:\n",
+    "                  name: aws-secret\n",
+    "                  key: AWS_ACCESS_KEY_ID\n",
+    "            - name: AWS_SECRET_ACCESS_KEY\n",
+    "              valueFrom:\n",
+    "                secretKeyRef:\n",
+    "                  name: aws-secret\n",
+    "                  key: AWS_SECRET_ACCESS_KEY\n",
+    "\n",
+    "          restartPolicy: OnFailure\n",
+    "    Worker:\n",
+    "      replicas: {num_workers}\n",
+    "      template:\n",
+    "        metadata:\n",
+    "          annotations:\n",
+    "            sidecar.istio.io/inject: \"false\"\n",
+    "        spec:\n",
+    "          serviceAccount: default-editor\n",
+    "          containers:\n",
+    "          - name: tensorflow\n",
+    "            command:\n",
+    "            - python\n",
+    "            - /opt/model.py\n",
+    "            - --tf-model-dir={model_dir}\n",
+    "            - --tf-export-dir={export_path}\n",
+    "            - --tf-train-steps={train_steps}\n",
+    "            - --tf-batch-size={batch_size}\n",
+    "            - --tf-learning-rate={learning_rate}\n",
+    "            image: {image}\n",
+    "            workingDir: /opt\n",
+    "            env:\n",
+    "            - name: AWS_REGION\n",
+    "              value: {AWS_REGION}\n",
+    "            - name: AWS_ACCESS_KEY_ID\n",
+    "              valueFrom:\n",
+    "                secretKeyRef:\n",
+    "                  name: aws-secret\n",
+    "                  key: AWS_ACCESS_KEY_ID\n",
+    "            - name: AWS_SECRET_ACCESS_KEY\n",
+    "              valueFrom:\n",
+    "                secretKeyRef:\n",
+    "                  name: aws-secret\n",
+    "                  key: AWS_SECRET_ACCESS_KEY\n",
+    "          restartPolicy: OnFailure\n",
+    "\"\"\""
+   ]
+  },
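+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Before submitting, you can sanity-check that the rendered spec parses as YAML and that the replica counts are what you expect (a small sketch using the `yaml` module):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import yaml\n",
+    "\n",
+    "# Parse the rendered spec and print the replica count per TFJob role.\n",
+    "spec = yaml.safe_load(train_spec)\n",
+    "for role, replica_spec in spec[\"spec\"][\"tfReplicaSpecs\"].items():\n",
+    "    print(role, replica_spec[\"replicas\"])"
+   ]
+  },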
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Create the training job\n",
+    "\n",
+    "* You could write the spec to a YAML file and then do `kubectl apply -f {FILE}`\n",
+    "* Since you are running in jupyter you will use the TFJob client\n",
+    "* You will run the TFJob in a namespace created by a Kubeflow profile\n",
+    "  * The namespace will be the same namespace you are running the notebook in\n",
+    "  * Creating a profile ensures the namespace is provisioned with service accounts and other resources needed for Kubeflow"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tf_job_client = tf_job_client_module.TFJobClient()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tf_job_body = yaml.safe_load(train_spec)\n",
+    "tf_job = tf_job_client.create(tf_job_body, namespace=namespace)\n",
+    "\n",
+    "logging.info(f\"Created job {namespace}.{train_name}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Check the job\n",
+    "\n",
+    "* Above you used the Python SDK for TFJob to create the job\n",
+    "* You can also use kubectl to get the status of your job\n",
+    "* The job conditions will tell you whether the job is running, succeeded or failed"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!kubectl get tfjobs -o yaml {train_name}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Get The Logs\n",
+    "\n",
+    "* There are two ways to get the logs for the training job\n",
+    "\n",
+    " 1. Using kubectl to fetch the pod logs\n",
+    "    * These logs are ephemeral; they will be unavailable when the pod is garbage collected to free up resources\n",
+    " 1. Using Fluentd and CloudWatch\n",
+    "    * Kubernetes data plane logs are not automatically available in AWS\n",
+    "    * You need to install the fluentd-cloudwatch plugin to ship container logs to CloudWatch\n"
+   ]
+  },
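+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The cell below is a sketch of the first option using the Kubernetes Python client; it assumes the TFJob controller names the chief pod `{train_name}-chief-0` and that the pod still exists:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Fetch the chief pod's logs while the pod is still around.\n",
+    "core_api = k8s_client.CoreV1Api()\n",
+    "print(core_api.read_namespaced_pod_log(f\"{train_name}-chief-0\", namespace))"
+   ]
+  },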
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Deploy TensorBoard\n",
+    "\n",
+    "* You will create a Kubernetes Deployment to run TensorBoard\n",
+    "* TensorBoard will be accessible behind the Kubeflow endpoint"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tb_name = \"mnist-tensorboard\"\n",
+    "tb_deploy = f\"\"\"apiVersion: apps/v1\n",
+    "kind: Deployment\n",
+    "metadata:\n",
+    "  labels:\n",
+    "    app: mnist-tensorboard\n",
+    "  name: {tb_name}\n",
+    "  namespace: {namespace}\n",
+    "spec:\n",
+    "  selector:\n",
+    "    matchLabels:\n",
+    "      app: mnist-tensorboard\n",
+    "  template:\n",
+    "    metadata:\n",
+    "      labels:\n",
+    "        app: mnist-tensorboard\n",
+    "        version: v1\n",
+    "    spec:\n",
+    "      serviceAccount: default-editor\n",
+    "      containers:\n",
+    "      - command:\n",
+    "        - /usr/local/bin/tensorboard\n",
+    "        - --logdir={model_dir}\n",
+    "        - --port=80\n",
+    "        image: tensorflow/tensorflow:1.15.2-py3\n",
+    "        name: tensorboard\n",
+    "        env:\n",
+    "        - name: AWS_REGION\n",
+    "          value: {AWS_REGION}\n",
+    "        - name: AWS_ACCESS_KEY_ID\n",
+    "          valueFrom:\n",
+    "            secretKeyRef:\n",
+    "              name: aws-secret\n",
+    "              key: AWS_ACCESS_KEY_ID\n",
+    "        - name: AWS_SECRET_ACCESS_KEY\n",
+    "          valueFrom:\n",
+    "            secretKeyRef:\n",
+    "              name: aws-secret\n",
+    "              key: AWS_SECRET_ACCESS_KEY\n",
+    "        ports:\n",
+    "        - containerPort: 80\n",
+    "\"\"\"\n",
+    "tb_service = f\"\"\"apiVersion: v1\n",
+    "kind: Service\n",
+    "metadata:\n",
+    "  labels:\n",
+    "    app: mnist-tensorboard\n",
+    "  name: {tb_name}\n",
+    "  namespace: {namespace}\n",
+    "spec:\n",
+    "  ports:\n",
+    "  - name: http-tb\n",
+    "    port: 80\n",
+    "    targetPort: 80\n",
+    "  selector:\n",
+    "    app: mnist-tensorboard\n",
+    "  type: ClusterIP\n",
+    "\"\"\"\n",
+    "\n",
+    "tb_virtual_service = f\"\"\"apiVersion: networking.istio.io/v1alpha3\n",
+    "kind: VirtualService\n",
+    "metadata:\n",
+    "  name: {tb_name}\n",
+    "  namespace: {namespace}\n",
+    "spec:\n",
+    "  gateways:\n",
+    "  - kubeflow/kubeflow-gateway\n",
+    "  hosts:\n",
+    "  - '*'\n",
+    "  http:\n",
+    "  - match:\n",
+    "    - uri:\n",
+    "        prefix: /mnist/{namespace}/tensorboard/\n",
+    "    rewrite:\n",
+    "      uri: /\n",
+    "    route:\n",
+    "    - destination:\n",
+    "        host: {tb_name}.{namespace}.svc.cluster.local\n",
+    "        port:\n",
+    "          number: 80\n",
+    "    timeout: 300s\n",
+    "\"\"\"\n",
+    "\n",
+    "tb_specs = [tb_deploy, tb_service, tb_virtual_service]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "k8s_util.apply_k8s_specs(tb_specs, k8s_util.K8S_CREATE_OR_REPLACE)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Access The TensorBoard UI\n",
+    "\n",
+    "> Note: By default, your namespace may not have access to the `istio-system` namespace, so the cell below may print nothing. You can grant access with the `kubectl create rolebinding` command shown in the \"Access the web UI\" section at the end of this notebook."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "endpoint = k8s_util.get_ingress_endpoint()\n",
+    "if endpoint:\n",
+    "    vs = yaml.safe_load(tb_virtual_service)\n",
+    "    path = vs[\"spec\"][\"http\"][0][\"match\"][0][\"uri\"][\"prefix\"]\n",
+    "    tb_endpoint = endpoint + path\n",
+    "    display(HTML(f\"TensorBoard UI is at {tb_endpoint}\"))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Wait For the Training Job to finish"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* You can use the TFJob client to wait for it to finish."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tf_job = tf_job_client.wait_for_condition(train_name, expected_condition=[\"Succeeded\", \"Failed\"], namespace=namespace)\n",
+    "\n",
+    "if tf_job_client.is_job_succeeded(train_name, namespace):\n",
+    "    logging.info(f\"TFJob {namespace}.{train_name} succeeded\")\n",
+    "else:\n",
+    "    raise ValueError(f\"TFJob {namespace}.{train_name} failed\")"
+   ]
+  },
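+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Before serving, you can confirm that a SavedModel landed under the export path (a sketch using boto3; it assumes the `mnist/export` prefix used by `export_path` above):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# List the exported SavedModel files, e.g. .../<version>/saved_model.pb\n",
+    "resp = boto3.client('s3', region_name=AWS_REGION).list_objects_v2(\n",
+    "    Bucket=bucket, Prefix=\"mnist/export\")\n",
+    "for obj in resp.get(\"Contents\", []):\n",
+    "    print(obj[\"Key\"])"
+   ]
+  },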
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Serve the model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* Deploy the model using TensorFlow Serving\n",
+    "* We need to create\n",
+    " 1. A Kubernetes Deployment\n",
+    " 1. A Kubernetes service\n",
+    " 1. (Optional) Create a configmap containing the prometheus monitoring config"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "deploy_name = \"mnist-model\"\n",
+    "model_base_path = export_path\n",
+    "\n",
+    "# The web ui defaults to mnist-service so if you change it you will\n",
+    "# need to change it in the UI as well to send predictions to the model\n",
+    "model_service = \"mnist-service\"\n",
+    "\n",
+    "deploy_spec = f\"\"\"apiVersion: apps/v1\n",
+    "kind: Deployment\n",
+    "metadata:\n",
+    "  labels:\n",
+    "    app: mnist\n",
+    "  name: {deploy_name}\n",
+    "  namespace: {namespace}\n",
+    "spec:\n",
+    "  selector:\n",
+    "    matchLabels:\n",
+    "      app: mnist-model\n",
+    "  template:\n",
+    "    metadata:\n",
+    "      # TODO(jlewi): Right now we disable the istio side car because otherwise ISTIO rbac will prevent the\n",
+    "      # UI from sending RPCs to the server. We should create an appropriate ISTIO rbac authorization\n",
+    "      # policy to allow traffic from the UI to the model server.\n",
+    "      # https://istio.io/docs/concepts/security/#target-selectors\n",
+    "      annotations:\n",
+    "        sidecar.istio.io/inject: \"false\"\n",
+    "      labels:\n",
+    "        app: mnist-model\n",
+    "        version: v1\n",
+    "    spec:\n",
+    "      serviceAccount: default-editor\n",
+    "      containers:\n",
+    "      - args:\n",
+    "        - --port=9000\n",
+    "        - --rest_api_port=8500\n",
+    "        - --model_name=mnist\n",
+    "        - --model_base_path={model_base_path}\n",
+    "        - --monitoring_config_file=/var/config/monitoring_config.txt\n",
+    "        command:\n",
+    "        - /usr/bin/tensorflow_model_server\n",
+    "        env:\n",
+    "        - name: modelBasePath\n",
+    "          value: {model_base_path}\n",
+    "        - name: AWS_REGION\n",
+    "          value: {AWS_REGION}\n",
+    "        - name: AWS_ACCESS_KEY_ID\n",
+    "          valueFrom:\n",
+    "            secretKeyRef:\n",
+    "              name: aws-secret\n",
+    "              key: AWS_ACCESS_KEY_ID\n",
+    "        - name: AWS_SECRET_ACCESS_KEY\n",
+    "          valueFrom:\n",
+    "            secretKeyRef:\n",
+    "              name: aws-secret\n",
+    "              key: AWS_SECRET_ACCESS_KEY\n",
+    "        image: tensorflow/serving:1.15.0\n",
+    "        imagePullPolicy: IfNotPresent\n",
+    "        livenessProbe:\n",
+    "          initialDelaySeconds: 30\n",
+    "          periodSeconds: 30\n",
+    "          tcpSocket:\n",
+    "            port: 9000\n",
+    "        name: mnist\n",
+    "        ports:\n",
+    "        - containerPort: 9000\n",
+    "        - containerPort: 8500\n",
+    "        resources:\n",
+    "          limits:\n",
+    "            cpu: \"1\"\n",
+    "            memory: 1Gi\n",
+    "          requests:\n",
+    "            cpu: \"1\"\n",
+    "            memory: 1Gi\n",
+    "        volumeMounts:\n",
+    "        - mountPath: /var/config/\n",
+    "          name: model-config\n",
+    "      volumes:\n",
+    "      - configMap:\n",
+    "          name: {deploy_name}\n",
+    "        name: model-config\n",
+    "\"\"\"\n",
+    "\n",
+    "service_spec = f\"\"\"apiVersion: v1\n",
+    "kind: Service\n",
+    "metadata:\n",
+    "  annotations:\n",
+    "    prometheus.io/path: /monitoring/prometheus/metrics\n",
+    "    prometheus.io/port: \"8500\"\n",
+    "    prometheus.io/scrape: \"true\"\n",
+    "  labels:\n",
+    "    app: mnist-model\n",
+    "  name: {model_service}\n",
+    "  namespace: {namespace}\n",
+    "spec:\n",
+    "  ports:\n",
+    "  - name: grpc-tf-serving\n",
+    "    port: 9000\n",
+    "    targetPort: 9000\n",
+    "  - name: http-tf-serving\n",
+    "    port: 8500\n",
+    "    targetPort: 8500\n",
+    "  selector:\n",
+    "    app: mnist-model\n",
+    "  type: ClusterIP\n",
+    "\"\"\"\n",
+    "\n",
+    "monitoring_config = f\"\"\"kind: ConfigMap\n",
+    "apiVersion: v1\n",
+    "metadata:\n",
+    "  name: {deploy_name}\n",
+    "  namespace: {namespace}\n",
+    "data:\n",
+    "  monitoring_config.txt: |-\n",
+    "    prometheus_config: {{\n",
+    "      enable: true,\n",
+    "      path: \"/monitoring/prometheus/metrics\"\n",
+    "    }}\n",
+    "\"\"\"\n",
+    "\n",
+    "model_specs = [deploy_spec, service_spec, monitoring_config]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "k8s_util.apply_k8s_specs(model_specs, k8s_util.K8S_CREATE_OR_REPLACE)"
+   ]
+  },
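+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Once the deployment is ready you can probe TF Serving's REST status endpoint from inside the cluster (a sketch; it assumes the in-cluster DNS name of the `mnist-service` Service defined above and that the `requests` package is installed):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "\n",
+    "# TF Serving's model status API; AVAILABLE means the SavedModel was loaded.\n",
+    "url = f\"http://{model_service}.{namespace}.svc.cluster.local:8500/v1/models/mnist\"\n",
+    "print(requests.get(url).json())"
+   ]
+  },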
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Deploy the mnist UI\n",
+    "\n",
+    "* We will now deploy the UI to visualize the MNIST results\n",
+    "* Note: This is using a prebuilt and public docker image for the UI"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ui_name = \"mnist-ui\"\n",
+    "ui_deploy = f\"\"\"apiVersion: apps/v1\n",
+    "kind: Deployment\n",
+    "metadata:\n",
+    "  name: {ui_name}\n",
+    "  namespace: {namespace}\n",
+    "spec:\n",
+    "  replicas: 1\n",
+    "  selector:\n",
+    "    matchLabels:\n",
+    "      app: mnist-web-ui\n",
+    "  template:\n",
+    "    metadata:\n",
+    "      labels:\n",
+    "        app: mnist-web-ui\n",
+    "    spec:\n",
+    "      containers:\n",
+    "      - image: gcr.io/kubeflow-examples/mnist/web-ui:v20190112-v0.2-142-g3b38225\n",
+    "        name: web-ui\n",
+    "        ports:\n",
+    "        - containerPort: 5000\n",
+    "      serviceAccount: default-editor\n",
+    "\"\"\"\n",
+    "\n",
+    "ui_service = f\"\"\"apiVersion: v1\n",
+    "kind: Service\n",
+    "metadata:\n",
+    "  name: {ui_name}\n",
+    "  namespace: {namespace}\n",
+    "spec:\n",
+    "  ports:\n",
+    "  - name: http-mnist-ui\n",
+    "    port: 80\n",
+    "    targetPort: 5000\n",
+    "  selector:\n",
+    "    app: mnist-web-ui\n",
+    "  type: ClusterIP\n",
+    "\"\"\"\n",
+    "\n",
+    "ui_virtual_service = f\"\"\"apiVersion: networking.istio.io/v1alpha3\n",
+    "kind: VirtualService\n",
+    "metadata:\n",
+    "  name: {ui_name}\n",
+    "  namespace: {namespace}\n",
+    "spec:\n",
+    "  gateways:\n",
+    "  - kubeflow/kubeflow-gateway\n",
+    "  hosts:\n",
+    "  - '*'\n",
+    "  http:\n",
+    "  - match:\n",
+    "    - uri:\n",
+    "        prefix: /mnist/{namespace}/ui/\n",
+    "    rewrite:\n",
+    "      uri: /\n",
+    "    route:\n",
+    "    - destination:\n",
+    "        host: {ui_name}.{namespace}.svc.cluster.local\n",
+    "        port:\n",
+    "          number: 80\n",
+    "    timeout: 300s\n",
+    "\"\"\"\n",
+    "\n",
+    "ui_specs = [ui_deploy, ui_service, ui_virtual_service]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "k8s_util.apply_k8s_specs(ui_specs, k8s_util.K8S_CREATE_OR_REPLACE)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Access the web UI\n",
+    "\n",
+    "* A reverse proxy route is automatically added to the Kubeflow endpoint\n",
+    "* The endpoint will be\n",
+    "\n",
+    "  ```\n",
+    "  http://${KUBEFLOW_ENDPOINT}/mnist/${NAMESPACE}/ui/\n",
+    "  ```\n",
+    "* You can get the KUBEFLOW_ENDPOINT\n",
+    "\n",
+    "  ```\n",
+    "  KUBEFLOW_ENDPOINT=`kubectl -n istio-system get ingress istio-ingress -o jsonpath=\"{.status.loadBalancer.ingress[0].hostname}\"`\n",
+    "  ```\n",
+    "  \n",
+    "  * You must run this command with sufficient RBAC permissions to get the ingress.\n",
+    "  \n",
+    "* If you have sufficient privileges, you can run the cell below to get the endpoint. If you don't, you can\n",
+    "  grant the appropriate permissions by running the command\n",
+    "  \n",
+    "  ```\n",
+    "  kubectl create --namespace=istio-system rolebinding --clusterrole=kubeflow-view --serviceaccount=${NAMESPACE}:default-editor ${NAMESPACE}-istio-view\n",
+    "  ```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "endpoint = k8s_util.get_ingress_endpoint()\n",
+    "if endpoint:\n",
+    "    vs = yaml.safe_load(ui_virtual_service)\n",
+    "    path = vs[\"spec\"][\"http\"][0][\"match\"][0][\"uri\"][\"prefix\"]\n",
+    "    ui_endpoint = endpoint + path\n",
+    "    display(HTML(f\"mnist UI is at {ui_endpoint}\"))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/mnist/monitoring/S3/deployment_patch.yaml b/mnist/monitoring/S3/deployment_patch.yaml
deleted file mode 100644
index 7e5407d41..000000000
--- a/mnist/monitoring/S3/deployment_patch.yaml
+++ /dev/null
@@ -1,46 +0,0 @@
-- op: add
-  path: 
/spec/template/spec/containers/0/env/- - value: - name: S3_ENDPOINT - value: $(S3_ENDPOINT) -- op: add - path: /spec/template/spec/containers/0/env/- - value: - name: AWS_ENDPOINT_URL - value: $(AWS_ENDPOINT_URL) -- op: add - path: /spec/template/spec/containers/0/env/- - value: - name: AWS_REGION - value: $(AWS_REGION) -- op: add - path: /spec/template/spec/containers/0/env/- - value: - name: BUCKET_NAME - value: $(BUCKET_NAME) -- op: add - path: /spec/template/spec/containers/0/env/- - value: - name: S3_USE_HTTPS - value: $(S3_USE_HTTPS) -- op: add - path: /spec/template/spec/containers/0/env/- - value: - name: S3_VERIFY_SSL - value: $(S3_VERIFY_SSL) -- op: add - path: /spec/template/spec/containers/0/env/- - value: - name: AWS_ACCESS_KEY_ID - valueFrom: - secretKeyRef: - key: $(awsAccessKeyIDName) - name: $(awsSecretName) -- op: add - path: /spec/template/spec/containers/0/env/- - value: - name: AWS_SECRET_ACCESS_KEY - valueFrom: - secretKeyRef: - key: $(awsSecretAccessKeyName) - name: $(awsSecretName) diff --git a/mnist/monitoring/S3/kustomization.yaml b/mnist/monitoring/S3/kustomization.yaml deleted file mode 100644 index b6e5865af..000000000 --- a/mnist/monitoring/S3/kustomization.yaml +++ /dev/null @@ -1,81 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -bases: -- ../base - -configurations: -- params.yaml - -vars: -- fieldref: - fieldPath: data.S3_ENDPOINT - name: S3_ENDPOINT - objref: - apiVersion: v1 - kind: ConfigMap - name: mnist-map-monitoring -- fieldref: - fieldPath: data.AWS_ENDPOINT_URL - name: AWS_ENDPOINT_URL - objref: - apiVersion: v1 - kind: ConfigMap - name: mnist-map-monitoring -- fieldref: - fieldPath: data.AWS_REGION - name: AWS_REGION - objref: - apiVersion: v1 - kind: ConfigMap - name: mnist-map-monitoring -- fieldref: - fieldPath: data.BUCKET_NAME - name: BUCKET_NAME - objref: - apiVersion: v1 - kind: ConfigMap - name: mnist-map-monitoring -- fieldref: - fieldPath: data.S3_USE_HTTPS - name: S3_USE_HTTPS - objref: - apiVersion: v1 - kind: ConfigMap - name: mnist-map-monitoring -- fieldref: - fieldPath: data.S3_VERIFY_SSL - name: S3_VERIFY_SSL - objref: - apiVersion: v1 - kind: ConfigMap - name: mnist-map-monitoring -- fieldref: - fieldPath: metadata.name - name: awsSecretName - objref: - apiVersion: v1 - kind: Secret - name: aws-creds -- fieldref: - fieldPath: data.awsAccessKeyIDName - name: awsAccessKeyIDName - objref: - apiVersion: v1 - kind: ConfigMap - name: mnist-map-monitoring -- fieldref: - fieldPath: data.awsSecretAccessKeyName - name: awsSecretAccessKeyName - objref: - apiVersion: v1 - kind: ConfigMap - name: mnist-map-monitoring - -patchesJson6902: -- path: deployment_patch.yaml - target: - group: apps - kind: Deployment - name: tensorboard-tb - version: v1beta1 diff --git a/mnist/monitoring/S3/params.yaml b/mnist/monitoring/S3/params.yaml deleted file mode 100644 index 771a702b5..000000000 --- a/mnist/monitoring/S3/params.yaml +++ /dev/null @@ -1,5 +0,0 @@ -varReference: -- path: spec/template/spec/containers/env/valueFrom/secretKeyRef/name - kind: Deployment -- path: spec/template/spec/containers/env/valueFrom/secretKeyRef/key - kind: Deployment diff --git a/mnist/notebook_setup.py b/mnist/notebook_setup.py index acd8950e2..3fa395ffa 100644 --- a/mnist/notebook_setup.py +++ b/mnist/notebook_setup.py @@ -14,7 +14,8 @@ TF_OPERATOR_COMMIT = "9238906" -def notebook_setup(): +# add env default to google, we can override it. 
+def notebook_setup(platform ='gcp'): # Install the SDK logging.basicConfig(format='%(message)s') logging.getLogger().setLevel(logging.INFO) @@ -32,14 +33,6 @@ def notebook_setup(): logging.info(f"Checkout kubeflow/tf-operator @{TF_OPERATOR_COMMIT}") subprocess.check_call(["git", "checkout", TF_OPERATOR_COMMIT], cwd=clone_dir) - logging.info("Configure docker credentials") - subprocess.check_call(["gcloud", "auth", "configure-docker", "--quiet"]) - if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"): - logging.info("Activating service account") - subprocess.check_call(["gcloud", "auth", "activate-service-account", - "--key-file=" + - os.getenv("GOOGLE_APPLICATION_CREDENTIALS"), - "--quiet"]) # Installing the python packages locally doesn't appear to have them automatically # added the path so we need to manually add the directory local_py_path = os.path.join(home, ".local/lib/python3.6/site-packages") @@ -51,7 +44,20 @@ def notebook_setup(): # Insert at front because we want to override any installed packages sys.path.insert(0, p) + if platform == 'gcp': + setup_gcp() + # Force a reload of kubeflow; since kubeflow is a multi namespace module # if we've loaded up some new kubeflow subpackages we need to force a reload to see them. import kubeflow reload(kubeflow) + +def setup_gcp(): + logging.info("Configure docker credentials") + subprocess.check_call(["gcloud", "auth", "configure-docker", "--quiet"]) + if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"): + logging.info("Activating service account") + subprocess.check_call(["gcloud", "auth", "activate-service-account", + "--key-file=" + + os.getenv("GOOGLE_APPLICATION_CREDENTIALS"), + "--quiet"]) \ No newline at end of file diff --git a/mnist/serving/S3/deployment_patch.yaml b/mnist/serving/S3/deployment_patch.yaml deleted file mode 100644 index c133b32c4..000000000 --- a/mnist/serving/S3/deployment_patch.yaml +++ /dev/null @@ -1,41 +0,0 @@ -- op: add - path: /spec/template/spec/containers/0/env/- - value: - name: s3Enable - value: $(s3Enable) -- op: add - path: /spec/template/spec/containers/0/env/- - value: - name: S3_ENDPOINT - value: $(S3_ENDPOINT) -- op: add - path: /spec/template/spec/containers/0/env/- - value: - name: AWS_REGION - value: $(AWS_REGION) -- op: add - path: /spec/template/spec/containers/0/env/- - value: - name: S3_USE_HTTPS - value: $(S3_USE_HTTPS) -- op: add - path: /spec/template/spec/containers/0/env/- - value: - name: S3_VERIFY_SSL - value: $(S3_VERIFY_SSL) -- op: add - path: /spec/template/spec/containers/0/env/- - value: - name: AWS_ACCESS_KEY_ID - valueFrom: - secretKeyRef: - key: $(AWS_ACCESS_KEY_ID) - name: $(awsSecretName) -- op: add - path: /spec/template/spec/containers/0/env/- - value: - name: AWS_SECRET_ACCESS_KEY - valueFrom: - secretKeyRef: - key: $(AWS_SECRET_ACCESS_KEY) - name: $(awsSecretName) diff --git a/mnist/serving/S3/kustomization.yaml b/mnist/serving/S3/kustomization.yaml deleted file mode 100644 index 711a7464f..000000000 --- a/mnist/serving/S3/kustomization.yaml +++ /dev/null @@ -1,74 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -bases: -- ../base - -configurations: -- params.yaml - -vars: -- fieldref: - fieldPath: data.s3Enable - name: s3Enable - objref: - apiVersion: v1 - kind: ConfigMap - name: mnist-map-serving -- fieldref: - fieldPath: data.S3_ENDPOINT - name: S3_ENDPOINT - objref: - apiVersion: v1 - kind: ConfigMap - name: mnist-map-serving -- fieldref: - fieldPath: data.AWS_REGION - name: AWS_REGION - objref: - apiVersion: v1 - kind: ConfigMap - name: 
mnist-map-serving -- fieldref: - fieldPath: data.S3_USE_HTTPS - name: S3_USE_HTTPS - objref: - apiVersion: v1 - kind: ConfigMap - name: mnist-map-serving -- fieldref: - fieldPath: data.S3_VERIFY_SSL - name: S3_VERIFY_SSL - objref: - apiVersion: v1 - kind: ConfigMap - name: mnist-map-serving -- fieldref: - fieldPath: metadata.name - name: awsSecretName - objref: - apiVersion: v1 - kind: Secret - name: aws-creds -- fieldref: - fieldPath: data.AWS_ACCESS_KEY_ID - name: AWS_ACCESS_KEY_ID - objref: - apiVersion: v1 - kind: ConfigMap - name: mnist-map-serving -- fieldref: - fieldPath: data.AWS_SECRET_ACCESS_KEY - name: AWS_SECRET_ACCESS_KEY - objref: - apiVersion: v1 - kind: ConfigMap - name: mnist-map-serving - -patchesJson6902: -- path: deployment_patch.yaml - target: - group: extensions - kind: Deployment - name: $(svcName) - version: v1beta1 \ No newline at end of file diff --git a/mnist/serving/S3/params.yaml b/mnist/serving/S3/params.yaml deleted file mode 100644 index 771a702b5..000000000 --- a/mnist/serving/S3/params.yaml +++ /dev/null @@ -1,5 +0,0 @@ -varReference: -- path: spec/template/spec/containers/env/valueFrom/secretKeyRef/name - kind: Deployment -- path: spec/template/spec/containers/env/valueFrom/secretKeyRef/key - kind: Deployment diff --git a/mnist/training/S3/Chief_patch.yaml b/mnist/training/S3/Chief_patch.yaml deleted file mode 100644 index e9acba697..000000000 --- a/mnist/training/S3/Chief_patch.yaml +++ /dev/null @@ -1,46 +0,0 @@ -- op: add - path: /spec/tfReplicaSpecs/Chief/template/spec/containers/0/env/- - value: - name: S3_ENDPOINT - value: $(S3_ENDPOINT) -- op: add - path: /spec/tfReplicaSpecs/Chief/template/spec/containers/0/env/- - value: - name: AWS_ENDPOINT_URL - value: $(AWS_ENDPOINT_URL) -- op: add - path: /spec/tfReplicaSpecs/Chief/template/spec/containers/0/env/- - value: - name: AWS_REGION - value: $(AWS_REGION) -- op: add - path: /spec/tfReplicaSpecs/Chief/template/spec/containers/0/env/- - value: - name: BUCKET_NAME - value: $(BUCKET_NAME) -- op: add - path: /spec/tfReplicaSpecs/Chief/template/spec/containers/0/env/- - value: - name: S3_USE_HTTPS - value: $(S3_USE_HTTPS) -- op: add - path: /spec/tfReplicaSpecs/Chief/template/spec/containers/0/env/- - value: - name: S3_VERIFY_SSL - value: $(S3_VERIFY_SSL) -- op: add - path: /spec/tfReplicaSpecs/Chief/template/spec/containers/0/env/- - value: - name: AWS_ACCESS_KEY_ID - valueFrom: - secretKeyRef: - key: $(awsAccessKeyIDName) - name: $(awsSecretName) -- op: add - path: /spec/tfReplicaSpecs/Chief/template/spec/containers/0/env/- - value: - name: AWS_SECRET_ACCESS_KEY - valueFrom: - secretKeyRef: - key: $(awsSecretAccessKeyName) - name: $(awsSecretName) diff --git a/mnist/training/S3/Ps_patch.yaml b/mnist/training/S3/Ps_patch.yaml deleted file mode 100644 index 17733dff9..000000000 --- a/mnist/training/S3/Ps_patch.yaml +++ /dev/null @@ -1,46 +0,0 @@ -- op: add - path: /spec/tfReplicaSpecs/Ps/template/spec/containers/0/env/- - value: - name: S3_ENDPOINT - value: $(S3_ENDPOINT) -- op: add - path: /spec/tfReplicaSpecs/Ps/template/spec/containers/0/env/- - value: - name: AWS_ENDPOINT_URL - value: $(AWS_ENDPOINT_URL) -- op: add - path: /spec/tfReplicaSpecs/Ps/template/spec/containers/0/env/- - value: - name: AWS_REGION - value: $(AWS_REGION) -- op: add - path: /spec/tfReplicaSpecs/Ps/template/spec/containers/0/env/- - value: - name: BUCKET_NAME - value: $(BUCKET_NAME) -- op: add - path: /spec/tfReplicaSpecs/Ps/template/spec/containers/0/env/- - value: - name: S3_USE_HTTPS - value: $(S3_USE_HTTPS) -- op: add - path: 
/spec/tfReplicaSpecs/Ps/template/spec/containers/0/env/- - value: - name: S3_VERIFY_SSL - value: $(S3_VERIFY_SSL) -- op: add - path: /spec/tfReplicaSpecs/Ps/template/spec/containers/0/env/- - value: - name: AWS_ACCESS_KEY_ID - valueFrom: - secretKeyRef: - key: $(awsAccessKeyIDName) - name: $(awsSecretName) -- op: add - path: /spec/tfReplicaSpecs/Ps/template/spec/containers/0/env/- - value: - name: AWS_SECRET_ACCESS_KEY - valueFrom: - secretKeyRef: - key: $(awsSecretAccessKeyName) - name: $(awsSecretName) diff --git a/mnist/training/S3/Worker_patch.yaml b/mnist/training/S3/Worker_patch.yaml deleted file mode 100644 index 6abd192aa..000000000 --- a/mnist/training/S3/Worker_patch.yaml +++ /dev/null @@ -1,46 +0,0 @@ -- op: add - path: /spec/tfReplicaSpecs/Worker/template/spec/containers/0/env/- - value: - name: S3_ENDPOINT - value: $(S3_ENDPOINT) -- op: add - path: /spec/tfReplicaSpecs/Worker/template/spec/containers/0/env/- - value: - name: AWS_ENDPOINT_URL - value: $(AWS_ENDPOINT_URL) -- op: add - path: /spec/tfReplicaSpecs/Worker/template/spec/containers/0/env/- - value: - name: AWS_REGION - value: $(AWS_REGION) -- op: add - path: /spec/tfReplicaSpecs/Worker/template/spec/containers/0/env/- - value: - name: BUCKET_NAME - value: $(BUCKET_NAME) -- op: add - path: /spec/tfReplicaSpecs/Worker/template/spec/containers/0/env/- - value: - name: S3_USE_HTTPS - value: $(S3_USE_HTTPS) -- op: add - path: /spec/tfReplicaSpecs/Worker/template/spec/containers/0/env/- - value: - name: S3_VERIFY_SSL - value: $(S3_VERIFY_SSL) -- op: add - path: /spec/tfReplicaSpecs/Worker/template/spec/containers/0/env/- - value: - name: AWS_ACCESS_KEY_ID - valueFrom: - secretKeyRef: - key: $(awsAccessKeyIDName) - name: $(awsSecretName) -- op: add - path: /spec/tfReplicaSpecs/Worker/template/spec/containers/0/env/- - value: - name: AWS_SECRET_ACCESS_KEY - valueFrom: - secretKeyRef: - key: $(awsSecretAccessKeyName) - name: $(awsSecretName) diff --git a/mnist/training/S3/kustomization.yaml b/mnist/training/S3/kustomization.yaml deleted file mode 100644 index a57ad171d..000000000 --- a/mnist/training/S3/kustomization.yaml +++ /dev/null @@ -1,86 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -bases: -- ../base - -configurations: -- params.yaml - -images: -- name: training-image - newName: gcr.io/kubeflow-examples/mnist/model - newTag: build-1202842504546750464 - -vars: -- fieldref: - fieldPath: data.S3_ENDPOINT - name: S3_ENDPOINT - objref: - apiVersion: v1 - kind: ConfigMap - name: mnist-map-training -- fieldref: - fieldPath: data.AWS_ENDPOINT_URL - name: AWS_ENDPOINT_URL - objref: - apiVersion: v1 - kind: ConfigMap - name: mnist-map-training -- fieldref: - fieldPath: data.AWS_REGION - name: AWS_REGION - objref: - apiVersion: v1 - kind: ConfigMap - name: mnist-map-training -- fieldref: - fieldPath: data.BUCKET_NAME - name: BUCKET_NAME - objref: - apiVersion: v1 - kind: ConfigMap - name: mnist-map-training -- fieldref: - fieldPath: data.S3_USE_HTTPS - name: S3_USE_HTTPS - objref: - apiVersion: v1 - kind: ConfigMap - name: mnist-map-training -- fieldref: - fieldPath: data.S3_VERIFY_SSL - name: S3_VERIFY_SSL - objref: - apiVersion: v1 - kind: ConfigMap - name: mnist-map-training -- fieldref: - fieldPath: metadata.name - name: awsSecretName - objref: - apiVersion: v1 - kind: Secret - name: aws-creds -- fieldref: - fieldPath: data.awsAccessKeyIDName - name: awsAccessKeyIDName - objref: - apiVersion: v1 - kind: ConfigMap - name: mnist-map-training -- fieldref: - fieldPath: data.awsSecretAccessKeyName - 
name: awsSecretAccessKeyName - objref: - apiVersion: v1 - kind: ConfigMap - name: mnist-map-training - -patchesJson6902: -- path: Chief_patch.yaml - target: - group: kubeflow.org - kind: TFJob - name: $(trainingName) - version: v1beta2 diff --git a/mnist/training/S3/params.yaml b/mnist/training/S3/params.yaml deleted file mode 100644 index d0756151d..000000000 --- a/mnist/training/S3/params.yaml +++ /dev/null @@ -1,17 +0,0 @@ -varReference: -- path: metadata/name - kind: TFJob -- path: metadata/name - kind: TFJob -- path: spec/tfReplicaSpecs/Chief/template/spec/containers/env/valueFrom/secretKeyRef/name - kind: TFJob -- path: spec/tfReplicaSpecs/Chief/template/spec/containers/env/valueFrom/secretKeyRef/key - kind: TFJob -- path: spec/tfReplicaSpecs/Ps/template/spec/containers/env/valueFrom/secretKeyRef/name - kind: TFJob -- path: spec/tfReplicaSpecs/Ps/template/spec/containers/env/valueFrom/secretKeyRef/key - kind: TFJob -- path: spec/tfReplicaSpecs/Worker/template/spec/containers/env/valueFrom/secretKeyRef/name - kind: TFJob -- path: spec/tfReplicaSpecs/Worker/template/spec/containers/env/valueFrom/secretKeyRef/key - kind: TFJob