diff --git a/.gitignore b/.gitignore index e6ad9798..53db6445 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 +25,6 @@ jupyter_execute/ # exclusions !source/examples/rapids-1brc-single-node/lookup.csv + +package.json +package-lock.json diff --git a/source/cloud/azure/aks.md b/source/cloud/azure/aks.md index 8917fc3c..1ecc5009 100644 --- a/source/cloud/azure/aks.md +++ b/source/cloud/azure/aks.md @@ -22,8 +22,8 @@ $ az login Now we can launch a GPU enabled AKS cluster. First launch an AKS cluster. -```bash -az aks create -g -n rapids \ +```console + az aks create -g -n rapids \ --enable-managed-identity \ --node-count 1 \ --enable-addons monitoring \ @@ -91,8 +91,8 @@ $ az extension add --name aks-preview ````` -```bash -az aks nodepool add \ +```console +$ az aks nodepool add \ --resource-group \ --cluster-name rapids \ --name gpunp \ diff --git a/source/examples/rapids-sagemaker-hpo/HPODatasets.py b/source/examples/rapids-sagemaker-hpo/HPODatasets.py index 35f347d3..3b0a139d 100644 --- a/source/examples/rapids-sagemaker-hpo/HPODatasets.py +++ b/source/examples/rapids-sagemaker-hpo/HPODatasets.py @@ -1,4 +1,4 @@ -""" Airline Dataset target label and feature column names """ +"""Airline Dataset target label and feature column names""" airline_label_column = "ArrDel15" airline_feature_columns = [ diff --git a/source/examples/xgboost-azure-mnmg-daskcloudprovider/notebook.ipynb b/source/examples/xgboost-azure-mnmg-daskcloudprovider/notebook.ipynb index 01586d14..73cf685e 100644 --- a/source/examples/xgboost-azure-mnmg-daskcloudprovider/notebook.ipynb +++ b/source/examples/xgboost-azure-mnmg-daskcloudprovider/notebook.ipynb @@ -178,10 +178,10 @@ "metadata": {}, "outputs": [], "source": [ - "location = \"West US 2\"\n", - "resource_group = \"rapidsai-deployment\"\n", - "vnet = \"rapidsai-deployment-vnet\"\n", - "security_group = \"rapidsaiclouddeploymenttest-nsg\"\n", + "location = \"FILL-THIS-IN\"\n", + "resource_group = \"FILL-THIS-IN\"\n", + "vnet = \"FILL-THIS-IN\"\n", + "security_group = \"FILL-THIS-IN\"\n", "vm_size = \"Standard_NC12s_v3\" # or choose a different GPU enabled VM type\n", "\n", "docker_image = \"{{rapids_container}}\"\n", diff --git a/source/examples/xgboost-gpu-hpo-mnmg-parallel-k8s/notebook.ipynb b/source/examples/xgboost-gpu-hpo-mnmg-parallel-k8s/notebook.ipynb index 37200062..524ed498 100644 --- a/source/examples/xgboost-gpu-hpo-mnmg-parallel-k8s/notebook.ipynb +++ b/source/examples/xgboost-gpu-hpo-mnmg-parallel-k8s/notebook.ipynb @@ -1,741 +1,741 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "c1db247a-15ab-41b1-a124-152484a29f29", - "metadata": { - "tags": [ - "library/xgboost", - "library/optuna", - "library/dask", - "library/dask-kubernetes", - "library/scikit-learn", - "workflow/hpo", - "platforms/kubeflow", - "dataset/nyc-taxi", - "data-storage/gcs", - "data-format/csv", - "platforms/kubernetes" - ] - }, - "source": [ - "# Scaling up Hyperparameter Optimization with Multi-GPU Workload on Kubernetes" - ] - }, - { - "cell_type": "markdown", - "id": "f7f02171-ed7b-48b4-9d55-32bb1149a3cf", - "metadata": {}, - "source": [ - "Choosing an optimal set of hyperparameters is a daunting task, especially for algorithms like XGBoost that have many hyperparameters to tune. In this notebook, we will speed up hyperparameter optimization by running multiple training jobs in parallel on a Kubernetes cluster. We handle larger data sets by splitting the data into multiple GPU devices." 
- ] - }, - { - "cell_type": "markdown", - "id": "a718e21f-5543-4f44-8a68-6ad8e78cb433", - "metadata": {}, - "source": [ - "## Prerequisites\n", - "Please follow instructions in [Dask Operator: Installation](../../tools/kubernetes/dask-operator) to install the Dask operator on top of a GPU-enabled Kubernetes cluster. (For the purpose of this example, you may ignore other sections of the linked document.\n", - "\n", - "### Optional: Kubeflow\n", - "Kubeflow gives you a nice notebook environment to run this notebook within the k8s cluster. Install Kubeflow by following instructions in [Installing Kubeflow](https://www.kubeflow.org/docs/started/installing-kubeflow/). You may choose any method; we tested this example after installing Kubeflow from manifests." - ] - }, - { - "cell_type": "markdown", - "id": "7b7f7bb3-5d53-4b8f-8472-bb974c8a597d", - "metadata": {}, - "source": [ - "## Install extra Python modules\n", - "We'll need a few extra Python modules." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "27b79db5-bbcd-422c-80a7-af873eb47711", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting dask_kubernetes\n", - " Downloading dask_kubernetes-2024.5.0-py3-none-any.whl.metadata (4.2 kB)\n", - "Collecting optuna\n", - " Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)\n", - "Requirement already satisfied: dask>=2022.08.1 in /opt/conda/lib/python3.11/site-packages (from dask_kubernetes) (2024.1.1)\n", - "Requirement already satisfied: distributed>=2022.08.1 in /opt/conda/lib/python3.11/site-packages (from dask_kubernetes) (2024.1.1)\n", - "Collecting kopf>=1.35.3 (from dask_kubernetes)\n", - " Downloading kopf-1.37.2-py3-none-any.whl.metadata (9.7 kB)\n", - "Collecting kr8s==0.14.* (from dask_kubernetes)\n", - " Downloading kr8s-0.14.4-py3-none-any.whl.metadata (6.7 kB)\n", - "Collecting kubernetes-asyncio>=12.0.1 (from dask_kubernetes)\n", - " Downloading kubernetes_asyncio-29.0.0-py3-none-any.whl.metadata (1.3 kB)\n", - "Collecting kubernetes>=12.0.1 (from dask_kubernetes)\n", - " Downloading kubernetes-29.0.0-py2.py3-none-any.whl.metadata (1.5 kB)\n", - "Collecting pykube-ng>=22.9.0 (from dask_kubernetes)\n", - " Downloading pykube_ng-23.6.0-py3-none-any.whl.metadata (8.0 kB)\n", - "Requirement already satisfied: rich>=12.5.1 in /opt/conda/lib/python3.11/site-packages (from dask_kubernetes) (13.7.1)\n", - "Requirement already satisfied: anyio>=3.7.0 in /opt/conda/lib/python3.11/site-packages (from kr8s==0.14.*->dask_kubernetes) (4.3.0)\n", - "Collecting asyncache>=0.3.1 (from kr8s==0.14.*->dask_kubernetes)\n", - " Downloading asyncache-0.3.1-py3-none-any.whl.metadata (2.0 kB)\n", - "Collecting cryptography>=35 (from kr8s==0.14.*->dask_kubernetes)\n", - " Downloading cryptography-42.0.7-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (5.3 kB)\n", - "Requirement already satisfied: exceptiongroup>=1.2.0 in /opt/conda/lib/python3.11/site-packages (from kr8s==0.14.*->dask_kubernetes) (1.2.0)\n", - "Collecting httpx-ws>=0.5.1 (from kr8s==0.14.*->dask_kubernetes)\n", - " Downloading httpx_ws-0.6.0-py3-none-any.whl.metadata (7.8 kB)\n", - "Requirement already satisfied: httpx>=0.24.1 in /opt/conda/lib/python3.11/site-packages (from kr8s==0.14.*->dask_kubernetes) (0.27.0)\n", - "Collecting python-box>=7.0.1 (from kr8s==0.14.*->dask_kubernetes)\n", - " Downloading python_box-7.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.8 kB)\n", - "Collecting python-jsonpath>=0.7.1 (from 
kr8s==0.14.*->dask_kubernetes)\n", - " Downloading python_jsonpath-1.1.1-py3-none-any.whl.metadata (5.3 kB)\n", - "Requirement already satisfied: pyyaml>=6.0 in /opt/conda/lib/python3.11/site-packages (from kr8s==0.14.*->dask_kubernetes) (6.0.1)\n", - "Collecting alembic>=1.5.0 (from optuna)\n", - " Downloading alembic-1.13.1-py3-none-any.whl.metadata (7.4 kB)\n", - "Collecting colorlog (from optuna)\n", - " Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)\n", - "Requirement already satisfied: numpy in /opt/conda/lib/python3.11/site-packages (from optuna) (1.26.4)\n", - "Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.11/site-packages (from optuna) (24.0)\n", - "Collecting sqlalchemy>=1.3.0 (from optuna)\n", - " Downloading SQLAlchemy-2.0.30-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)\n", - "Requirement already satisfied: tqdm in /opt/conda/lib/python3.11/site-packages (from optuna) (4.66.2)\n", - "Collecting Mako (from alembic>=1.5.0->optuna)\n", - " Downloading Mako-1.3.3-py3-none-any.whl.metadata (2.9 kB)\n", - "Requirement already satisfied: typing-extensions>=4 in /opt/conda/lib/python3.11/site-packages (from alembic>=1.5.0->optuna) (4.11.0)\n", - "Requirement already satisfied: click>=8.1 in /opt/conda/lib/python3.11/site-packages (from dask>=2022.08.1->dask_kubernetes) (8.1.7)\n", - "Requirement already satisfied: cloudpickle>=1.5.0 in /opt/conda/lib/python3.11/site-packages (from dask>=2022.08.1->dask_kubernetes) (3.0.0)\n", - "Requirement already satisfied: fsspec>=2021.09.0 in /opt/conda/lib/python3.11/site-packages (from dask>=2022.08.1->dask_kubernetes) (2024.3.1)\n", - "Requirement already satisfied: partd>=1.2.0 in /opt/conda/lib/python3.11/site-packages (from dask>=2022.08.1->dask_kubernetes) (1.4.1)\n", - "Requirement already satisfied: toolz>=0.10.0 in /opt/conda/lib/python3.11/site-packages (from dask>=2022.08.1->dask_kubernetes) (0.12.1)\n", - "Requirement already satisfied: importlib-metadata>=4.13.0 in /opt/conda/lib/python3.11/site-packages (from dask>=2022.08.1->dask_kubernetes) (7.1.0)\n", - "Requirement already satisfied: jinja2>=2.10.3 in /opt/conda/lib/python3.11/site-packages (from distributed>=2022.08.1->dask_kubernetes) (3.1.3)\n", - "Requirement already satisfied: locket>=1.0.0 in /opt/conda/lib/python3.11/site-packages (from distributed>=2022.08.1->dask_kubernetes) (1.0.0)\n", - "Requirement already satisfied: msgpack>=1.0.0 in /opt/conda/lib/python3.11/site-packages (from distributed>=2022.08.1->dask_kubernetes) (1.0.7)\n", - "Requirement already satisfied: psutil>=5.7.2 in /opt/conda/lib/python3.11/site-packages (from distributed>=2022.08.1->dask_kubernetes) (5.9.8)\n", - "Requirement already satisfied: sortedcontainers>=2.0.5 in /opt/conda/lib/python3.11/site-packages (from distributed>=2022.08.1->dask_kubernetes) (2.4.0)\n", - "Requirement already satisfied: tblib>=1.6.0 in /opt/conda/lib/python3.11/site-packages (from distributed>=2022.08.1->dask_kubernetes) (3.0.0)\n", - "Requirement already satisfied: tornado>=6.0.4 in /opt/conda/lib/python3.11/site-packages (from distributed>=2022.08.1->dask_kubernetes) (6.4)\n", - "Requirement already satisfied: urllib3>=1.24.3 in /opt/conda/lib/python3.11/site-packages (from distributed>=2022.08.1->dask_kubernetes) (1.26.18)\n", - "Requirement already satisfied: zict>=3.0.0 in /opt/conda/lib/python3.11/site-packages (from distributed>=2022.08.1->dask_kubernetes) (3.0.0)\n", - "Requirement already satisfied: python-json-logger in 
/opt/conda/lib/python3.11/site-packages (from kopf>=1.35.3->dask_kubernetes) (2.0.7)\n", - "Collecting iso8601 (from kopf>=1.35.3->dask_kubernetes)\n", - " Downloading iso8601-2.1.0-py3-none-any.whl.metadata (3.7 kB)\n", - "Requirement already satisfied: aiohttp in /opt/conda/lib/python3.11/site-packages (from kopf>=1.35.3->dask_kubernetes) (3.9.5)\n", - "Requirement already satisfied: certifi>=14.05.14 in /opt/conda/lib/python3.11/site-packages (from kubernetes>=12.0.1->dask_kubernetes) (2024.2.2)\n", - "Requirement already satisfied: six>=1.9.0 in /opt/conda/lib/python3.11/site-packages (from kubernetes>=12.0.1->dask_kubernetes) (1.16.0)\n", - "Requirement already satisfied: python-dateutil>=2.5.3 in /opt/conda/lib/python3.11/site-packages (from kubernetes>=12.0.1->dask_kubernetes) (2.9.0)\n", - "Collecting google-auth>=1.0.1 (from kubernetes>=12.0.1->dask_kubernetes)\n", - " Downloading google_auth-2.29.0-py2.py3-none-any.whl.metadata (4.7 kB)\n", - "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /opt/conda/lib/python3.11/site-packages (from kubernetes>=12.0.1->dask_kubernetes) (1.8.0)\n", - "Requirement already satisfied: requests in /opt/conda/lib/python3.11/site-packages (from kubernetes>=12.0.1->dask_kubernetes) (2.31.0)\n", - "Collecting requests-oauthlib (from kubernetes>=12.0.1->dask_kubernetes)\n", - " Downloading requests_oauthlib-2.0.0-py2.py3-none-any.whl.metadata (11 kB)\n", - "Collecting oauthlib>=3.2.2 (from kubernetes>=12.0.1->dask_kubernetes)\n", - " Downloading oauthlib-3.2.2-py3-none-any.whl.metadata (7.5 kB)\n", - "Requirement already satisfied: setuptools>=21.0.0 in /opt/conda/lib/python3.11/site-packages (from kubernetes-asyncio>=12.0.1->dask_kubernetes) (69.5.1)\n", - "Requirement already satisfied: markdown-it-py>=2.2.0 in /opt/conda/lib/python3.11/site-packages (from rich>=12.5.1->dask_kubernetes) (3.0.0)\n", - "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /opt/conda/lib/python3.11/site-packages (from rich>=12.5.1->dask_kubernetes) (2.17.2)\n", - "Collecting greenlet!=0.4.17 (from sqlalchemy>=1.3.0->optuna)\n", - " Downloading greenlet-3.0.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (3.8 kB)\n", - "Requirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.11/site-packages (from aiohttp->kopf>=1.35.3->dask_kubernetes) (1.3.1)\n", - "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.11/site-packages (from aiohttp->kopf>=1.35.3->dask_kubernetes) (23.2.0)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.11/site-packages (from aiohttp->kopf>=1.35.3->dask_kubernetes) (1.4.1)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.11/site-packages (from aiohttp->kopf>=1.35.3->dask_kubernetes) (6.0.5)\n", - "Requirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.11/site-packages (from aiohttp->kopf>=1.35.3->dask_kubernetes) (1.9.4)\n", - "Requirement already satisfied: idna>=2.8 in /opt/conda/lib/python3.11/site-packages (from anyio>=3.7.0->kr8s==0.14.*->dask_kubernetes) (3.7)\n", - "Requirement already satisfied: sniffio>=1.1 in /opt/conda/lib/python3.11/site-packages (from anyio>=3.7.0->kr8s==0.14.*->dask_kubernetes) (1.3.1)\n", - "Requirement already satisfied: cachetools<6.0.0,>=5.2.0 in /opt/conda/lib/python3.11/site-packages (from asyncache>=0.3.1->kr8s==0.14.*->dask_kubernetes) (5.3.3)\n", - "Requirement already satisfied: cffi>=1.12 in 
/opt/conda/lib/python3.11/site-packages (from cryptography>=35->kr8s==0.14.*->dask_kubernetes) (1.16.0)\n", - "Collecting pyasn1-modules>=0.2.1 (from google-auth>=1.0.1->kubernetes>=12.0.1->dask_kubernetes)\n", - " Downloading pyasn1_modules-0.4.0-py3-none-any.whl.metadata (3.4 kB)\n", - "Collecting rsa<5,>=3.1.4 (from google-auth>=1.0.1->kubernetes>=12.0.1->dask_kubernetes)\n", - " Downloading rsa-4.9-py3-none-any.whl.metadata (4.2 kB)\n", - "Requirement already satisfied: httpcore==1.* in /opt/conda/lib/python3.11/site-packages (from httpx>=0.24.1->kr8s==0.14.*->dask_kubernetes) (1.0.5)\n", - "Requirement already satisfied: h11<0.15,>=0.13 in /opt/conda/lib/python3.11/site-packages (from httpcore==1.*->httpx>=0.24.1->kr8s==0.14.*->dask_kubernetes) (0.14.0)\n", - "Collecting wsproto (from httpx-ws>=0.5.1->kr8s==0.14.*->dask_kubernetes)\n", - " Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)\n", - "Requirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.11/site-packages (from importlib-metadata>=4.13.0->dask>=2022.08.1->dask_kubernetes) (3.17.0)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.11/site-packages (from jinja2>=2.10.3->distributed>=2022.08.1->dask_kubernetes) (2.1.5)\n", - "Requirement already satisfied: mdurl~=0.1 in /opt/conda/lib/python3.11/site-packages (from markdown-it-py>=2.2.0->rich>=12.5.1->dask_kubernetes) (0.1.2)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.11/site-packages (from requests->kubernetes>=12.0.1->dask_kubernetes) (3.3.2)\n", - "Requirement already satisfied: pycparser in /opt/conda/lib/python3.11/site-packages (from cffi>=1.12->cryptography>=35->kr8s==0.14.*->dask_kubernetes) (2.22)\n", - "Collecting pyasn1<0.7.0,>=0.4.6 (from pyasn1-modules>=0.2.1->google-auth>=1.0.1->kubernetes>=12.0.1->dask_kubernetes)\n", - " Downloading pyasn1-0.6.0-py2.py3-none-any.whl.metadata (8.3 kB)\n", - "Downloading dask_kubernetes-2024.5.0-py3-none-any.whl (157 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m157.2/157.2 kB\u001b[0m \u001b[31m2.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m0:01\u001b[0m\n", - "\u001b[?25hDownloading kr8s-0.14.4-py3-none-any.whl (60 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.7/60.7 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading optuna-3.6.1-py3-none-any.whl (380 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m380.1/380.1 kB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", - "\u001b[?25hDownloading alembic-1.13.1-py3-none-any.whl (233 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m233.4/233.4 kB\u001b[0m \u001b[31m23.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading kopf-1.37.2-py3-none-any.whl (207 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m207.8/207.8 kB\u001b[0m \u001b[31m15.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading kubernetes-29.0.0-py2.py3-none-any.whl (1.6 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m27.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", - "\u001b[?25hDownloading kubernetes_asyncio-29.0.0-py3-none-any.whl (2.0 MB)\n", - 
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m83.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading pykube_ng-23.6.0-py3-none-any.whl (26 kB)\n", - "Downloading SQLAlchemy-2.0.30-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.2/3.2 MB\u001b[0m \u001b[31m122.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading colorlog-6.8.2-py3-none-any.whl (11 kB)\n", - "Downloading asyncache-0.3.1-py3-none-any.whl (3.7 kB)\n", - "Downloading cryptography-42.0.7-cp39-abi3-manylinux_2_28_x86_64.whl (3.8 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.8/3.8 MB\u001b[0m \u001b[31m125.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading google_auth-2.29.0-py2.py3-none-any.whl (189 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m189.2/189.2 kB\u001b[0m \u001b[31m29.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading greenlet-3.0.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (620 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m620.0/620.0 kB\u001b[0m \u001b[31m61.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading httpx_ws-0.6.0-py3-none-any.whl (13 kB)\n", - "Downloading oauthlib-3.2.2-py3-none-any.whl (151 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m151.7/151.7 kB\u001b[0m \u001b[31m24.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading python_box-7.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.3 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.3/4.3 MB\u001b[0m \u001b[31m131.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading python_jsonpath-1.1.1-py3-none-any.whl (51 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m51.5/51.5 kB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading iso8601-2.1.0-py3-none-any.whl (7.5 kB)\n", - "Downloading Mako-1.3.3-py3-none-any.whl (78 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.8/78.8 kB\u001b[0m \u001b[31m12.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading requests_oauthlib-2.0.0-py2.py3-none-any.whl (24 kB)\n", - "Downloading pyasn1_modules-0.4.0-py3-none-any.whl (181 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m181.2/181.2 kB\u001b[0m \u001b[31m27.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading rsa-4.9-py3-none-any.whl (34 kB)\n", - "Downloading wsproto-1.2.0-py3-none-any.whl (24 kB)\n", - "Downloading pyasn1-0.6.0-py2.py3-none-any.whl (85 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m85.3/85.3 kB\u001b[0m \u001b[31m12.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hInstalling collected packages: wsproto, python-jsonpath, python-box, pyasn1, oauthlib, Mako, iso8601, greenlet, colorlog, asyncache, sqlalchemy, rsa, requests-oauthlib, pykube-ng, pyasn1-modules, cryptography, kubernetes-asyncio, kopf, httpx-ws, google-auth, alembic, optuna, kubernetes, kr8s, dask_kubernetes\n", - 
"Successfully installed Mako-1.3.3 alembic-1.13.1 asyncache-0.3.1 colorlog-6.8.2 cryptography-42.0.7 dask_kubernetes-2024.5.0 google-auth-2.29.0 greenlet-3.0.3 httpx-ws-0.6.0 iso8601-2.1.0 kopf-1.37.2 kr8s-0.14.4 kubernetes-29.0.0 kubernetes-asyncio-29.0.0 oauthlib-3.2.2 optuna-3.6.1 pyasn1-0.6.0 pyasn1-modules-0.4.0 pykube-ng-23.6.0 python-box-7.1.1 python-jsonpath-1.1.1 requests-oauthlib-2.0.0 rsa-4.9 sqlalchemy-2.0.30 wsproto-1.2.0\n" - ] - } - ], - "source": [ - "!pip install dask_kubernetes optuna" - ] - }, - { - "cell_type": "markdown", - "id": "acc8f524-dc9b-41d7-8faa-3aea23ee1983", - "metadata": {}, - "source": [ - "## Import Python modules" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "0c8a1ffb-0b03-4d4a-9ab1-0561bf5533d9", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import threading\n", - "import warnings\n", - "\n", - "import cupy as cp\n", - "import cuspatial\n", - "import dask_cudf\n", - "import optuna\n", - "from cuml.dask.common import utils as dask_utils\n", - "from dask.distributed import Client, wait\n", - "from dask_kubernetes.operator import KubeCluster\n", - "from dask_ml.metrics import mean_squared_error\n", - "from dask_ml.model_selection import KFold\n", - "from xgboost import dask as dxgb" - ] - }, - { - "cell_type": "markdown", - "id": "b2d61e0b-229b-40c0-889d-b8242e574fc8", - "metadata": {}, - "source": [ - "## Set up multiple Dask clusters\n", - "\n", - "To run multi-GPU training jobs in parallel, we will create multiple Dask clusters each controlling its share of GPUs. It's best to think of each Dask cluster as a portion of the compute resource of the Kubernetes cluster.\n", - "\n", - "Fill in the following variables:" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "d1c22c3c-51b2-4526-b1fa-ac012f616e13", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "n_clusters=2\n", - "n_worker_per_dask_cluster=2\n", - "n_node_per_dask_cluster=3\n" - ] - } - ], - "source": [ - "# Number of nodes in the Kubernetes cluster.\n", - "# Each node is assumed to have a single NVIDIA GPU attached\n", - "n_nodes = 7\n", - "\n", - "# Number of worker nodes to be assigned to each Dask cluster\n", - "n_worker_per_dask_cluster = 2\n", - "\n", - "# Number of nodes to be assigned to each Dask cluster\n", - "# 1 is added since the Dask cluster's scheduler process needs to be mapped to its own node\n", - "n_node_per_dask_cluster = n_worker_per_dask_cluster + 1\n", - "\n", - "# Number of Dask clusters to be created\n", - "# Subtract 1 to account for the notebook pod (it requires its own node)\n", - "n_clusters = (n_nodes - 1) // n_node_per_dask_cluster\n", - "\n", - "print(f\"{n_clusters=}\")\n", - "if n_clusters == 0:\n", - " raise ValueError(\n", - " \"No cluster can be created. 
Reduce `n_worker_per_dask_cluster` or create more compute nodes\"\n", - " )\n", - "print(f\"{n_worker_per_dask_cluster=}\")\n", - "print(f\"{n_node_per_dask_cluster=}\")\n", - "\n", - "n_node_active = n_clusters * n_node_per_dask_cluster + 1\n", - "if n_node_active != n_nodes:\n", - " n_idle = n_nodes - n_node_active\n", - " warnings.warn(f\"{n_idle} node(s) will not be used\", stacklevel=2)" - ] - }, - { - "cell_type": "markdown", - "id": "c0eee823-162f-47e9-be4c-41447b2d7ee9", - "metadata": {}, - "source": [ - "Once we've determined the number of Dask clusters and their size, we are now ready to launch them:" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "8d0b632a-b73d-4351-bb5d-8a1f4ab1a2c4", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Choose the same RAPIDS image you used for launching the notebook session\n", - "rapids_image = \"{{ rapids_notebook_container }}\"" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "62aa9e52-c5b6-487c-8f02-88ea84980cfc", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e24e5095ae78458e804d5f1212372f9a", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Output()" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Launching cluster 0...\n" - ] - }, - { - "data": { - "text/html": [ - "
\n"
-                        ],
-                        "text/plain": []
-                    },
-                    "metadata": {},
-                    "output_type": "display_data"
-                },
-                {
-                    "data": {
-                        "application/vnd.jupyter.widget-view+json": {
-                            "model_id": "240e689def1549c1b5dfd87284192e96",
-                            "version_major": 2,
-                            "version_minor": 0
-                        },
-                        "text/plain": [
-                            "Output()"
-                        ]
-                    },
-                    "metadata": {},
-                    "output_type": "display_data"
-                },
-                {
-                    "name": "stdout",
-                    "output_type": "stream",
-                    "text": [
-                        "Launching cluster 1...\n"
-                    ]
-                },
-                {
-                    "data": {
-                        "text/html": [
-                            "
\n"
-                        ],
-                        "text/plain": []
-                    },
-                    "metadata": {},
-                    "output_type": "display_data"
-                }
-            ],
-            "source": [
-                "clusters = []\n",
-                "for i in range(n_clusters):\n",
-                "    print(f\"Launching cluster {i}...\")\n",
-                "    clusters.append(\n",
-                "        KubeCluster(\n",
-                "            name=f\"rapids-dask{i}\",\n",
-                "            image=rapids_image,\n",
-                "            worker_command=\"dask-cuda-worker\",\n",
-                "            n_workers=2,\n",
-                "            resources={\"limits\": {\"nvidia.com/gpu\": \"1\"}},\n",
-                "            env={\"EXTRA_PIP_PACKAGES\": \"optuna\"},\n",
-                "        )\n",
-                "    )"
-            ]
-        },
-        {
-            "cell_type": "markdown",
-            "id": "f37fa67f-fa90-432c-bed3-8f2a8a095795",
-            "metadata": {},
-            "source": [
-                "## Set up Hyperparameter Optimization Task with NYC Taxi data\n",
-                "\n",
-                "Anaconda has graciously made some of the NYC Taxi dataset available in a public Google Cloud Storage bucket. We'll use our Cluster of GPUs to process it and train a model that predicts the fare amount. We'll use our Dask clusters to process it and train a model that predicts the fare amount."
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 34,
-            "id": "c84929a5-f13b-4a61-9ed6-aa8060129e17",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [],
-            "source": [
-                "col_dtype = {\n",
-                "    \"VendorID\": \"int32\",\n",
-                "    \"tpep_pickup_datetime\": \"datetime64[ms]\",\n",
-                "    \"tpep_dropoff_datetime\": \"datetime64[ms]\",\n",
-                "    \"passenger_count\": \"int32\",\n",
-                "    \"trip_distance\": \"float32\",\n",
-                "    \"pickup_longitude\": \"float32\",\n",
-                "    \"pickup_latitude\": \"float32\",\n",
-                "    \"RatecodeID\": \"int32\",\n",
-                "    \"store_and_fwd_flag\": \"int32\",\n",
-                "    \"dropoff_longitude\": \"float32\",\n",
-                "    \"dropoff_latitude\": \"float32\",\n",
-                "    \"payment_type\": \"int32\",\n",
-                "    \"fare_amount\": \"float32\",\n",
-                "    \"extra\": \"float32\",\n",
-                "    \"mta_tax\": \"float32\",\n",
-                "    \"tip_amount\": \"float32\",\n",
-                "    \"total_amount\": \"float32\",\n",
-                "    \"tolls_amount\": \"float32\",\n",
-                "    \"improvement_surcharge\": \"float32\",\n",
-                "}\n",
-                "\n",
-                "\n",
-                "must_haves = {\n",
-                "    \"pickup_datetime\": \"datetime64[ms]\",\n",
-                "    \"dropoff_datetime\": \"datetime64[ms]\",\n",
-                "    \"passenger_count\": \"int32\",\n",
-                "    \"trip_distance\": \"float32\",\n",
-                "    \"pickup_longitude\": \"float32\",\n",
-                "    \"pickup_latitude\": \"float32\",\n",
-                "    \"rate_code\": \"int32\",\n",
-                "    \"dropoff_longitude\": \"float32\",\n",
-                "    \"dropoff_latitude\": \"float32\",\n",
-                "    \"fare_amount\": \"float32\",\n",
-                "}\n",
-                "\n",
-                "\n",
-                "def compute_haversine_distance(df):\n",
-                "    pickup = cuspatial.GeoSeries.from_points_xy(\n",
-                "        df[[\"pickup_longitude\", \"pickup_latitude\"]].interleave_columns()\n",
-                "    )\n",
-                "    dropoff = cuspatial.GeoSeries.from_points_xy(\n",
-                "        df[[\"dropoff_longitude\", \"dropoff_latitude\"]].interleave_columns()\n",
-                "    )\n",
-                "    df[\"haversine_distance\"] = cuspatial.haversine_distance(pickup, dropoff)\n",
-                "    df[\"haversine_distance\"] = df[\"haversine_distance\"].astype(\"float32\")\n",
-                "    return df\n",
-                "\n",
-                "\n",
-                "def clean(ddf, must_haves):\n",
-                "    # replace the extraneous spaces in column names and lower the font type\n",
-                "    tmp = {col: col.strip().lower() for col in list(ddf.columns)}\n",
-                "    ddf = ddf.rename(columns=tmp)\n",
-                "\n",
-                "    ddf = ddf.rename(\n",
-                "        columns={\n",
-                "            \"tpep_pickup_datetime\": \"pickup_datetime\",\n",
-                "            \"tpep_dropoff_datetime\": \"dropoff_datetime\",\n",
-                "            \"ratecodeid\": \"rate_code\",\n",
-                "        }\n",
-                "    )\n",
-                "\n",
-                "    ddf[\"pickup_datetime\"] = ddf[\"pickup_datetime\"].astype(\"datetime64[ms]\")\n",
-                "    ddf[\"dropoff_datetime\"] = ddf[\"dropoff_datetime\"].astype(\"datetime64[ms]\")\n",
-                "\n",
-                "    for col in ddf.columns:\n",
-                "        if col not in must_haves:\n",
-                "            ddf = ddf.drop(columns=col)\n",
-                "            continue\n",
-                "        if ddf[col].dtype == \"object\":\n",
-                "            # Fixing error: could not convert arg to str\n",
-                "            ddf = ddf.drop(columns=col)\n",
-                "        else:\n",
-                "            # downcast from 64bit to 32bit types\n",
-                "            # Tesla T4 are faster on 32bit ops\n",
-                "            if \"int\" in str(ddf[col].dtype):\n",
-                "                ddf[col] = ddf[col].astype(\"int32\")\n",
-                "            if \"float\" in str(ddf[col].dtype):\n",
-                "                ddf[col] = ddf[col].astype(\"float32\")\n",
-                "            ddf[col] = ddf[col].fillna(-1)\n",
-                "\n",
-                "    return ddf\n",
-                "\n",
-                "\n",
-                "def prepare_data(client):\n",
-                "    taxi_df = dask_cudf.read_csv(\n",
-                "        \"https://storage.googleapis.com/anaconda-public-data/nyc-taxi/csv/2016/yellow_tripdata_2016-02.csv\",\n",
-                "        dtype=col_dtype,\n",
-                "    )\n",
-                "    taxi_df = taxi_df.map_partitions(clean, must_haves, meta=must_haves)\n",
-                "\n",
-                "    ## add features\n",
-                "    taxi_df[\"hour\"] = taxi_df[\"pickup_datetime\"].dt.hour.astype(\"int32\")\n",
-                "    taxi_df[\"year\"] = taxi_df[\"pickup_datetime\"].dt.year.astype(\"int32\")\n",
-                "    taxi_df[\"month\"] = taxi_df[\"pickup_datetime\"].dt.month.astype(\"int32\")\n",
-                "    taxi_df[\"day\"] = taxi_df[\"pickup_datetime\"].dt.day.astype(\"int32\")\n",
-                "    taxi_df[\"day_of_week\"] = taxi_df[\"pickup_datetime\"].dt.weekday.astype(\"int32\")\n",
-                "    taxi_df[\"is_weekend\"] = (taxi_df[\"day_of_week\"] >= 5).astype(\"int32\")\n",
-                "\n",
-                "    # calculate the time difference between dropoff and pickup.\n",
-                "    taxi_df[\"diff\"] = taxi_df[\"dropoff_datetime\"].astype(\"int32\") - taxi_df[\n",
-                "        \"pickup_datetime\"\n",
-                "    ].astype(\"int32\")\n",
-                "    taxi_df[\"diff\"] = (taxi_df[\"diff\"] / 1000).astype(\"int32\")\n",
-                "\n",
-                "    taxi_df[\"pickup_latitude_r\"] = taxi_df[\"pickup_latitude\"] // 0.01 * 0.01\n",
-                "    taxi_df[\"pickup_longitude_r\"] = taxi_df[\"pickup_longitude\"] // 0.01 * 0.01\n",
-                "    taxi_df[\"dropoff_latitude_r\"] = taxi_df[\"dropoff_latitude\"] // 0.01 * 0.01\n",
-                "    taxi_df[\"dropoff_longitude_r\"] = taxi_df[\"dropoff_longitude\"] // 0.01 * 0.01\n",
-                "\n",
-                "    taxi_df = taxi_df.drop(\"pickup_datetime\", axis=1)\n",
-                "    taxi_df = taxi_df.drop(\"dropoff_datetime\", axis=1)\n",
-                "\n",
-                "    taxi_df = taxi_df.map_partitions(compute_haversine_distance)\n",
-                "\n",
-                "    X = (\n",
-                "        taxi_df.drop([\"fare_amount\"], axis=1)\n",
-                "        .astype(\"float32\")\n",
-                "        .to_dask_array(lengths=True)\n",
-                "    )\n",
-                "    y = taxi_df[\"fare_amount\"].astype(\"float32\").to_dask_array(lengths=True)\n",
-                "\n",
-                "    X._meta = cp.asarray(X._meta)\n",
-                "    y._meta = cp.asarray(y._meta)\n",
-                "\n",
-                "    X, y = dask_utils.persist_across_workers(client, [X, y])\n",
-                "    return X, y\n",
-                "\n",
-                "\n",
-                "def train_model(params):\n",
-                "    cluster = get_cluster(threading.get_ident())\n",
-                "\n",
-                "    default_params = {\n",
-                "        \"objective\": \"reg:squarederror\",\n",
-                "        \"eval_metric\": \"rmse\",\n",
-                "        \"verbosity\": 0,\n",
-                "        \"tree_method\": \"hist\",\n",
-                "        \"device\": \"cuda\",\n",
-                "    }\n",
-                "    params = dict(default_params, **params)\n",
-                "\n",
-                "    with Client(cluster) as client:\n",
-                "        X, y = prepare_data(client)\n",
-                "        wait([X, y])\n",
-                "\n",
-                "        scores = []\n",
-                "        kfold = KFold(n_splits=5, shuffle=False)\n",
-                "        for train_index, test_index in kfold.split(X, y):\n",
-                "            dtrain = dxgb.DaskQuantileDMatrix(client, X[train_index, :], y[train_index])\n",
-                "            dtest = dxgb.DaskQuantileDMatrix(client, X[test_index, :], y[test_index])\n",
-                "            model = dxgb.train(\n",
-                "                client,\n",
-                "                params,\n",
-                "                dtrain,\n",
-                "                num_boost_round=10,\n",
-                "                verbose_eval=False,\n",
-                "            )\n",
-                "            y_test_pred = dxgb.predict(client, model, dtest).to_backend(\"cupy\")\n",
-                "            rmse_score = mean_squared_error(y[test_index], y_test_pred, squared=False)\n",
-                "            scores.append(rmse_score)\n",
-                "        return sum(scores) / len(scores)\n",
-                "\n",
-                "\n",
-                "def objective(trial):\n",
-                "    params = {\n",
-                "        \"n_estimators\": trial.suggest_int(\"n_estimators\", 2, 4),\n",
-                "        \"learning_rate\": trial.suggest_float(\"learning_rate\", 0.5, 0.7),\n",
-                "        \"colsample_bytree\": trial.suggest_float(\"colsample_bytree\", 0.5, 1),\n",
-                "        \"colsample_bynode\": trial.suggest_float(\"colsample_bynode\", 0.5, 1),\n",
-                "        \"colsample_bylevel\": trial.suggest_float(\"colsample_bylevel\", 0.5, 1),\n",
-                "        \"reg_lambda\": trial.suggest_float(\"reg_lambda\", 0, 1),\n",
-                "        \"max_depth\": trial.suggest_int(\"max_depth\", 1, 6),\n",
-                "        \"max_leaves\": trial.suggest_int(\"max_leaves\", 0, 2),\n",
-                "        \"max_cat_to_onehot\": trial.suggest_int(\"max_cat_to_onehot\", 1, 10),\n",
-                "    }\n",
-                "    return train_model(params)"
-            ]
-        },
-        {
-            "cell_type": "markdown",
-            "id": "0c401aa1-2aeb-43d9-955b-4dfd7b495fe9",
-            "metadata": {},
-            "source": [
-                "To kick off multiple training jobs in parallel, we will launch multiple threads, so that each thread controls a Dask cluster.\n",
-                "One important utility function is `get_cluster`, which returns the Dask cluster that's mapped to a given thread."
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 35,
-            "id": "97cdeb8a-330e-4d96-92d4-d48c93828d9d",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [],
-            "source": [
-                "# Map each thread's integer ID to a sequential number (0, 1, 2 ...)\n",
-                "thread_id_map: dict[int, KubeCluster] = {}\n",
-                "thread_id_map_lock = threading.Lock()\n",
-                "\n",
-                "\n",
-                "def get_cluster(thread_id: int) -> KubeCluster:\n",
-                "    with thread_id_map_lock:\n",
-                "        try:\n",
-                "            return clusters[thread_id_map[thread_id]]\n",
-                "        except KeyError:\n",
-                "            seq_id = len(thread_id_map)\n",
-                "            thread_id_map[thread_id] = seq_id\n",
-                "            return clusters[seq_id]"
-            ]
-        },
-        {
-            "cell_type": "markdown",
-            "id": "2e7c923b-f4ea-4f38-b3a5-92dfcd47dfff",
-            "metadata": {},
-            "source": [
-                "Now we are ready to start hyperparameter optimization."
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 36,
-            "id": "c557d769-0be6-4319-b7f5-8ad52b824961",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [
-                {
-                    "name": "stderr",
-                    "output_type": "stream",
-                    "text": [
-                        "[I 2024-05-09 07:53:00,718] A new study created in memory with name: no-name-da830427-bce3-4e42-98e6-c98c0c3da0d7\n"
-                    ]
-                }
-            ],
-            "source": [
-                "n_trials = (\n",
-                "    10  # set to a low number so that the demo finishes quickly. Feel free to adjust\n",
-                ")\n",
-                "study = optuna.create_study(direction=\"minimize\")"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": 37,
-            "id": "94ece2d0-b3f7-44c8-9b4e-a2f60fd623b9",
-            "metadata": {
-                "tags": []
-            },
-            "outputs": [
-                {
-                    "name": "stderr",
-                    "output_type": "stream",
-                    "text": [
-                        "[I 2024-05-09 07:54:10,229] Trial 1 finished with value: 59.449462890625 and parameters: {'n_estimators': 4, 'learning_rate': 0.6399993857892183, 'colsample_bytree': 0.7020623988319513, 'colsample_bynode': 0.777468318546648, 'colsample_bylevel': 0.7890749134903386, 'reg_lambda': 0.4464953694744921, 'max_depth': 3, 'max_leaves': 0, 'max_cat_to_onehot': 9}. Best is trial 1 with value: 59.449462890625.\n",
-                        "[I 2024-05-09 07:54:19,507] Trial 0 finished with value: 57.77985763549805 and parameters: {'n_estimators': 4, 'learning_rate': 0.674087333032356, 'colsample_bytree': 0.557642421113256, 'colsample_bynode': 0.9719449711676733, 'colsample_bylevel': 0.6984302171973646, 'reg_lambda': 0.7201514298169174, 'max_depth': 4, 'max_leaves': 1, 'max_cat_to_onehot': 4}. Best is trial 0 with value: 57.77985763549805.\n",
-                        "[I 2024-05-09 07:54:59,524] Trial 2 finished with value: 57.77985763549805 and parameters: {'n_estimators': 2, 'learning_rate': 0.6894880267544121, 'colsample_bytree': 0.8171662437182604, 'colsample_bynode': 0.549527686217645, 'colsample_bylevel': 0.890212178266078, 'reg_lambda': 0.5847298606135033, 'max_depth': 2, 'max_leaves': 1, 'max_cat_to_onehot': 5}. Best is trial 0 with value: 57.77985763549805.\n",
-                        "[I 2024-05-09 07:55:22,013] Trial 3 finished with value: 55.01234817504883 and parameters: {'n_estimators': 4, 'learning_rate': 0.6597614733926671, 'colsample_bytree': 0.8437061126308156, 'colsample_bynode': 0.621479934699203, 'colsample_bylevel': 0.8330951489228277, 'reg_lambda': 0.7830102753448884, 'max_depth': 2, 'max_leaves': 2, 'max_cat_to_onehot': 2}. Best is trial 3 with value: 55.01234817504883.\n",
-                        "[I 2024-05-09 07:56:00,678] Trial 4 finished with value: 57.77985763549805 and parameters: {'n_estimators': 4, 'learning_rate': 0.5994587326401378, 'colsample_bytree': 0.9799078215504886, 'colsample_bynode': 0.9766955839079614, 'colsample_bylevel': 0.5088864363378924, 'reg_lambda': 0.18103184809548734, 'max_depth': 3, 'max_leaves': 1, 'max_cat_to_onehot': 4}. Best is trial 3 with value: 55.01234817504883.\n",
-                        "[I 2024-05-09 07:56:11,773] Trial 5 finished with value: 54.936126708984375 and parameters: {'n_estimators': 2, 'learning_rate': 0.5208827661289628, 'colsample_bytree': 0.866258912492528, 'colsample_bynode': 0.6368815844513638, 'colsample_bylevel': 0.9539603435186208, 'reg_lambda': 0.21390618865079458, 'max_depth': 4, 'max_leaves': 2, 'max_cat_to_onehot': 4}. Best is trial 5 with value: 54.936126708984375.\n",
-                        "[I 2024-05-09 07:56:48,737] Trial 6 finished with value: 57.77985763549805 and parameters: {'n_estimators': 2, 'learning_rate': 0.6137888371528442, 'colsample_bytree': 0.9621063205689744, 'colsample_bynode': 0.5306812468481084, 'colsample_bylevel': 0.8527827651989199, 'reg_lambda': 0.3315799968401767, 'max_depth': 6, 'max_leaves': 1, 'max_cat_to_onehot': 9}. Best is trial 5 with value: 54.936126708984375.\n",
-                        "[I 2024-05-09 07:56:59,261] Trial 7 finished with value: 55.204200744628906 and parameters: {'n_estimators': 3, 'learning_rate': 0.6831416027240611, 'colsample_bytree': 0.5311840770388268, 'colsample_bynode': 0.9572535535110238, 'colsample_bylevel': 0.6846894032354778, 'reg_lambda': 0.6091211134408249, 'max_depth': 3, 'max_leaves': 2, 'max_cat_to_onehot': 5}. Best is trial 5 with value: 54.936126708984375.\n",
-                        "[I 2024-05-09 07:57:37,674] Trial 8 finished with value: 54.93584442138672 and parameters: {'n_estimators': 4, 'learning_rate': 0.620742285616388, 'colsample_bytree': 0.7969398985157778, 'colsample_bynode': 0.9049707375663323, 'colsample_bylevel': 0.7209693969245297, 'reg_lambda': 0.6158847054585023, 'max_depth': 1, 'max_leaves': 0, 'max_cat_to_onehot': 10}. Best is trial 8 with value: 54.93584442138672.\n",
-                        "[I 2024-05-09 07:57:50,310] Trial 9 finished with value: 57.76123809814453 and parameters: {'n_estimators': 3, 'learning_rate': 0.5475197727057007, 'colsample_bytree': 0.5381502848057452, 'colsample_bynode': 0.8514705732161596, 'colsample_bylevel': 0.9139277684007088, 'reg_lambda': 0.5117732009332318, 'max_depth': 4, 'max_leaves': 0, 'max_cat_to_onehot': 5}. Best is trial 8 with value: 54.93584442138672.\n"
-                    ]
-                }
-            ],
-            "source": [
-                "# With n_jobs parameter, Optuna will launch [n_clusters] threads internally\n",
-                "# Each thread will deploy a training job to a Dask cluster\n",
-                "study.optimize(objective, n_trials=n_trials, n_jobs=n_clusters)"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": null,
-            "id": "ac5b3cba-87ba-4470-a166-b6a0815f85e4",
-            "metadata": {},
-            "outputs": [],
-            "source": []
-        }
-    ],
-    "metadata": {
-        "kernelspec": {
-            "display_name": "Python 3 (ipykernel)",
-            "language": "python",
-            "name": "python3"
-        },
-        "language_info": {
-            "codemirror_mode": {
-                "name": "ipython",
-                "version": 3
-            },
-            "file_extension": ".py",
-            "mimetype": "text/x-python",
-            "name": "python",
-            "nbconvert_exporter": "python",
-            "pygments_lexer": "ipython3",
-            "version": "3.11.9"
-        }
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "c1db247a-15ab-41b1-a124-152484a29f29",
+   "metadata": {
+    "tags": [
+     "library/xgboost",
+     "library/optuna",
+     "library/dask",
+     "library/dask-kubernetes",
+     "library/scikit-learn",
+     "workflow/hpo",
+     "platforms/kubeflow",
+     "dataset/nyc-taxi",
+     "data-storage/gcs",
+     "data-format/csv",
+     "platforms/kubernetes"
+    ]
+   },
+   "source": [
+    "# Scaling up Hyperparameter Optimization with Multi-GPU Workload on Kubernetes"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f7f02171-ed7b-48b4-9d55-32bb1149a3cf",
+   "metadata": {},
+   "source": [
+    "Choosing an optimal set of hyperparameters is a daunting task, especially for algorithms like XGBoost that have many hyperparameters to tune. In this notebook, we will speed up hyperparameter optimization by running multiple training jobs in parallel on a Kubernetes cluster. We handle larger data sets by splitting the data into multiple GPU devices."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a718e21f-5543-4f44-8a68-6ad8e78cb433",
+   "metadata": {},
+   "source": [
+    "## Prerequisites\n",
+    "Please follow instructions in [Dask Operator: Installation](../../tools/kubernetes/dask-operator) to install the Dask operator on top of a GPU-enabled Kubernetes cluster. (For the purpose of this example, you may ignore other sections of the linked document.\n",
+    "\n",
+    "### Optional: Kubeflow\n",
+    "Kubeflow gives you a nice notebook environment to run this notebook within the k8s cluster. Install Kubeflow by following instructions in [Installing Kubeflow](https://www.kubeflow.org/docs/started/installing-kubeflow/). You may choose any method; we tested this example after installing Kubeflow from manifests."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7b7f7bb3-5d53-4b8f-8472-bb974c8a597d",
+   "metadata": {},
+   "source": [
+    "## Install extra Python modules\n",
+    "We'll need a few extra Python modules."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "27b79db5-bbcd-422c-80a7-af873eb47711",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collecting dask_kubernetes\n",
+      "  Downloading dask_kubernetes-2024.5.0-py3-none-any.whl.metadata (4.2 kB)\n",
+      "Collecting optuna\n",
+      "  Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)\n",
+      "Requirement already satisfied: dask>=2022.08.1 in /opt/conda/lib/python3.11/site-packages (from dask_kubernetes) (2024.1.1)\n",
+      "Requirement already satisfied: distributed>=2022.08.1 in /opt/conda/lib/python3.11/site-packages (from dask_kubernetes) (2024.1.1)\n",
+      "Collecting kopf>=1.35.3 (from dask_kubernetes)\n",
+      "  Downloading kopf-1.37.2-py3-none-any.whl.metadata (9.7 kB)\n",
+      "Collecting kr8s==0.14.* (from dask_kubernetes)\n",
+      "  Downloading kr8s-0.14.4-py3-none-any.whl.metadata (6.7 kB)\n",
+      "Collecting kubernetes-asyncio>=12.0.1 (from dask_kubernetes)\n",
+      "  Downloading kubernetes_asyncio-29.0.0-py3-none-any.whl.metadata (1.3 kB)\n",
+      "Collecting kubernetes>=12.0.1 (from dask_kubernetes)\n",
+      "  Downloading kubernetes-29.0.0-py2.py3-none-any.whl.metadata (1.5 kB)\n",
+      "Collecting pykube-ng>=22.9.0 (from dask_kubernetes)\n",
+      "  Downloading pykube_ng-23.6.0-py3-none-any.whl.metadata (8.0 kB)\n",
+      "Requirement already satisfied: rich>=12.5.1 in /opt/conda/lib/python3.11/site-packages (from dask_kubernetes) (13.7.1)\n",
+      "Requirement already satisfied: anyio>=3.7.0 in /opt/conda/lib/python3.11/site-packages (from kr8s==0.14.*->dask_kubernetes) (4.3.0)\n",
+      "Collecting asyncache>=0.3.1 (from kr8s==0.14.*->dask_kubernetes)\n",
+      "  Downloading asyncache-0.3.1-py3-none-any.whl.metadata (2.0 kB)\n",
+      "Collecting cryptography>=35 (from kr8s==0.14.*->dask_kubernetes)\n",
+      "  Downloading cryptography-42.0.7-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (5.3 kB)\n",
+      "Requirement already satisfied: exceptiongroup>=1.2.0 in /opt/conda/lib/python3.11/site-packages (from kr8s==0.14.*->dask_kubernetes) (1.2.0)\n",
+      "Collecting httpx-ws>=0.5.1 (from kr8s==0.14.*->dask_kubernetes)\n",
+      "  Downloading httpx_ws-0.6.0-py3-none-any.whl.metadata (7.8 kB)\n",
+      "Requirement already satisfied: httpx>=0.24.1 in /opt/conda/lib/python3.11/site-packages (from kr8s==0.14.*->dask_kubernetes) (0.27.0)\n",
+      "Collecting python-box>=7.0.1 (from kr8s==0.14.*->dask_kubernetes)\n",
+      "  Downloading python_box-7.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.8 kB)\n",
+      "Collecting python-jsonpath>=0.7.1 (from kr8s==0.14.*->dask_kubernetes)\n",
+      "  Downloading python_jsonpath-1.1.1-py3-none-any.whl.metadata (5.3 kB)\n",
+      "Requirement already satisfied: pyyaml>=6.0 in /opt/conda/lib/python3.11/site-packages (from kr8s==0.14.*->dask_kubernetes) (6.0.1)\n",
+      "Collecting alembic>=1.5.0 (from optuna)\n",
+      "  Downloading alembic-1.13.1-py3-none-any.whl.metadata (7.4 kB)\n",
+      "Collecting colorlog (from optuna)\n",
+      "  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)\n",
+      "Requirement already satisfied: numpy in /opt/conda/lib/python3.11/site-packages (from optuna) (1.26.4)\n",
+      "Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.11/site-packages (from optuna) (24.0)\n",
+      "Collecting sqlalchemy>=1.3.0 (from optuna)\n",
+      "  Downloading SQLAlchemy-2.0.30-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)\n",
+      "Requirement already satisfied: tqdm in /opt/conda/lib/python3.11/site-packages (from optuna) (4.66.2)\n",
+      "Collecting Mako (from alembic>=1.5.0->optuna)\n",
+      "  Downloading Mako-1.3.3-py3-none-any.whl.metadata (2.9 kB)\n",
+      "Requirement already satisfied: typing-extensions>=4 in /opt/conda/lib/python3.11/site-packages (from alembic>=1.5.0->optuna) (4.11.0)\n",
+      "Requirement already satisfied: click>=8.1 in /opt/conda/lib/python3.11/site-packages (from dask>=2022.08.1->dask_kubernetes) (8.1.7)\n",
+      "Requirement already satisfied: cloudpickle>=1.5.0 in /opt/conda/lib/python3.11/site-packages (from dask>=2022.08.1->dask_kubernetes) (3.0.0)\n",
+      "Requirement already satisfied: fsspec>=2021.09.0 in /opt/conda/lib/python3.11/site-packages (from dask>=2022.08.1->dask_kubernetes) (2024.3.1)\n",
+      "Requirement already satisfied: partd>=1.2.0 in /opt/conda/lib/python3.11/site-packages (from dask>=2022.08.1->dask_kubernetes) (1.4.1)\n",
+      "Requirement already satisfied: toolz>=0.10.0 in /opt/conda/lib/python3.11/site-packages (from dask>=2022.08.1->dask_kubernetes) (0.12.1)\n",
+      "Requirement already satisfied: importlib-metadata>=4.13.0 in /opt/conda/lib/python3.11/site-packages (from dask>=2022.08.1->dask_kubernetes) (7.1.0)\n",
+      "Requirement already satisfied: jinja2>=2.10.3 in /opt/conda/lib/python3.11/site-packages (from distributed>=2022.08.1->dask_kubernetes) (3.1.3)\n",
+      "Requirement already satisfied: locket>=1.0.0 in /opt/conda/lib/python3.11/site-packages (from distributed>=2022.08.1->dask_kubernetes) (1.0.0)\n",
+      "Requirement already satisfied: msgpack>=1.0.0 in /opt/conda/lib/python3.11/site-packages (from distributed>=2022.08.1->dask_kubernetes) (1.0.7)\n",
+      "Requirement already satisfied: psutil>=5.7.2 in /opt/conda/lib/python3.11/site-packages (from distributed>=2022.08.1->dask_kubernetes) (5.9.8)\n",
+      "Requirement already satisfied: sortedcontainers>=2.0.5 in /opt/conda/lib/python3.11/site-packages (from distributed>=2022.08.1->dask_kubernetes) (2.4.0)\n",
+      "Requirement already satisfied: tblib>=1.6.0 in /opt/conda/lib/python3.11/site-packages (from distributed>=2022.08.1->dask_kubernetes) (3.0.0)\n",
+      "Requirement already satisfied: tornado>=6.0.4 in /opt/conda/lib/python3.11/site-packages (from distributed>=2022.08.1->dask_kubernetes) (6.4)\n",
+      "Requirement already satisfied: urllib3>=1.24.3 in /opt/conda/lib/python3.11/site-packages (from distributed>=2022.08.1->dask_kubernetes) (1.26.18)\n",
+      "Requirement already satisfied: zict>=3.0.0 in /opt/conda/lib/python3.11/site-packages (from distributed>=2022.08.1->dask_kubernetes) (3.0.0)\n",
+      "Requirement already satisfied: python-json-logger in /opt/conda/lib/python3.11/site-packages (from kopf>=1.35.3->dask_kubernetes) (2.0.7)\n",
+      "Collecting iso8601 (from kopf>=1.35.3->dask_kubernetes)\n",
+      "  Downloading iso8601-2.1.0-py3-none-any.whl.metadata (3.7 kB)\n",
+      "Requirement already satisfied: aiohttp in /opt/conda/lib/python3.11/site-packages (from kopf>=1.35.3->dask_kubernetes) (3.9.5)\n",
+      "Requirement already satisfied: certifi>=14.05.14 in /opt/conda/lib/python3.11/site-packages (from kubernetes>=12.0.1->dask_kubernetes) (2024.2.2)\n",
+      "Requirement already satisfied: six>=1.9.0 in /opt/conda/lib/python3.11/site-packages (from kubernetes>=12.0.1->dask_kubernetes) (1.16.0)\n",
+      "Requirement already satisfied: python-dateutil>=2.5.3 in /opt/conda/lib/python3.11/site-packages (from kubernetes>=12.0.1->dask_kubernetes) (2.9.0)\n",
+      "Collecting google-auth>=1.0.1 (from kubernetes>=12.0.1->dask_kubernetes)\n",
+      "  Downloading google_auth-2.29.0-py2.py3-none-any.whl.metadata (4.7 kB)\n",
+      "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /opt/conda/lib/python3.11/site-packages (from kubernetes>=12.0.1->dask_kubernetes) (1.8.0)\n",
+      "Requirement already satisfied: requests in /opt/conda/lib/python3.11/site-packages (from kubernetes>=12.0.1->dask_kubernetes) (2.31.0)\n",
+      "Collecting requests-oauthlib (from kubernetes>=12.0.1->dask_kubernetes)\n",
+      "  Downloading requests_oauthlib-2.0.0-py2.py3-none-any.whl.metadata (11 kB)\n",
+      "Collecting oauthlib>=3.2.2 (from kubernetes>=12.0.1->dask_kubernetes)\n",
+      "  Downloading oauthlib-3.2.2-py3-none-any.whl.metadata (7.5 kB)\n",
+      "Requirement already satisfied: setuptools>=21.0.0 in /opt/conda/lib/python3.11/site-packages (from kubernetes-asyncio>=12.0.1->dask_kubernetes) (69.5.1)\n",
+      "Requirement already satisfied: markdown-it-py>=2.2.0 in /opt/conda/lib/python3.11/site-packages (from rich>=12.5.1->dask_kubernetes) (3.0.0)\n",
+      "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /opt/conda/lib/python3.11/site-packages (from rich>=12.5.1->dask_kubernetes) (2.17.2)\n",
+      "Collecting greenlet!=0.4.17 (from sqlalchemy>=1.3.0->optuna)\n",
+      "  Downloading greenlet-3.0.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (3.8 kB)\n",
+      "Requirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.11/site-packages (from aiohttp->kopf>=1.35.3->dask_kubernetes) (1.3.1)\n",
+      "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.11/site-packages (from aiohttp->kopf>=1.35.3->dask_kubernetes) (23.2.0)\n",
+      "Requirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.11/site-packages (from aiohttp->kopf>=1.35.3->dask_kubernetes) (1.4.1)\n",
+      "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.11/site-packages (from aiohttp->kopf>=1.35.3->dask_kubernetes) (6.0.5)\n",
+      "Requirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.11/site-packages (from aiohttp->kopf>=1.35.3->dask_kubernetes) (1.9.4)\n",
+      "Requirement already satisfied: idna>=2.8 in /opt/conda/lib/python3.11/site-packages (from anyio>=3.7.0->kr8s==0.14.*->dask_kubernetes) (3.7)\n",
+      "Requirement already satisfied: sniffio>=1.1 in /opt/conda/lib/python3.11/site-packages (from anyio>=3.7.0->kr8s==0.14.*->dask_kubernetes) (1.3.1)\n",
+      "Requirement already satisfied: cachetools<6.0.0,>=5.2.0 in /opt/conda/lib/python3.11/site-packages (from asyncache>=0.3.1->kr8s==0.14.*->dask_kubernetes) (5.3.3)\n",
+      "Requirement already satisfied: cffi>=1.12 in /opt/conda/lib/python3.11/site-packages (from cryptography>=35->kr8s==0.14.*->dask_kubernetes) (1.16.0)\n",
+      "Collecting pyasn1-modules>=0.2.1 (from google-auth>=1.0.1->kubernetes>=12.0.1->dask_kubernetes)\n",
+      "  Downloading pyasn1_modules-0.4.0-py3-none-any.whl.metadata (3.4 kB)\n",
+      "Collecting rsa<5,>=3.1.4 (from google-auth>=1.0.1->kubernetes>=12.0.1->dask_kubernetes)\n",
+      "  Downloading rsa-4.9-py3-none-any.whl.metadata (4.2 kB)\n",
+      "Requirement already satisfied: httpcore==1.* in /opt/conda/lib/python3.11/site-packages (from httpx>=0.24.1->kr8s==0.14.*->dask_kubernetes) (1.0.5)\n",
+      "Requirement already satisfied: h11<0.15,>=0.13 in /opt/conda/lib/python3.11/site-packages (from httpcore==1.*->httpx>=0.24.1->kr8s==0.14.*->dask_kubernetes) (0.14.0)\n",
+      "Collecting wsproto (from httpx-ws>=0.5.1->kr8s==0.14.*->dask_kubernetes)\n",
+      "  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)\n",
+      "Requirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.11/site-packages (from importlib-metadata>=4.13.0->dask>=2022.08.1->dask_kubernetes) (3.17.0)\n",
+      "Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.11/site-packages (from jinja2>=2.10.3->distributed>=2022.08.1->dask_kubernetes) (2.1.5)\n",
+      "Requirement already satisfied: mdurl~=0.1 in /opt/conda/lib/python3.11/site-packages (from markdown-it-py>=2.2.0->rich>=12.5.1->dask_kubernetes) (0.1.2)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.11/site-packages (from requests->kubernetes>=12.0.1->dask_kubernetes) (3.3.2)\n",
+      "Requirement already satisfied: pycparser in /opt/conda/lib/python3.11/site-packages (from cffi>=1.12->cryptography>=35->kr8s==0.14.*->dask_kubernetes) (2.22)\n",
+      "Collecting pyasn1<0.7.0,>=0.4.6 (from pyasn1-modules>=0.2.1->google-auth>=1.0.1->kubernetes>=12.0.1->dask_kubernetes)\n",
+      "  Downloading pyasn1-0.6.0-py2.py3-none-any.whl.metadata (8.3 kB)\n",
+      "Downloading dask_kubernetes-2024.5.0-py3-none-any.whl (157 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m157.2/157.2 kB\u001b[0m \u001b[31m2.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m0:01\u001b[0m\n",
+      "\u001b[?25hDownloading kr8s-0.14.4-py3-none-any.whl (60 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.7/60.7 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading optuna-3.6.1-py3-none-any.whl (380 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m380.1/380.1 kB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hDownloading alembic-1.13.1-py3-none-any.whl (233 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m233.4/233.4 kB\u001b[0m \u001b[31m23.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading kopf-1.37.2-py3-none-any.whl (207 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m207.8/207.8 kB\u001b[0m \u001b[31m15.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading kubernetes-29.0.0-py2.py3-none-any.whl (1.6 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m27.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hDownloading kubernetes_asyncio-29.0.0-py3-none-any.whl (2.0 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m83.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading pykube_ng-23.6.0-py3-none-any.whl (26 kB)\n",
+      "Downloading SQLAlchemy-2.0.30-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.2/3.2 MB\u001b[0m \u001b[31m122.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading colorlog-6.8.2-py3-none-any.whl (11 kB)\n",
+      "Downloading asyncache-0.3.1-py3-none-any.whl (3.7 kB)\n",
+      "Downloading cryptography-42.0.7-cp39-abi3-manylinux_2_28_x86_64.whl (3.8 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.8/3.8 MB\u001b[0m \u001b[31m125.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading google_auth-2.29.0-py2.py3-none-any.whl (189 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m189.2/189.2 kB\u001b[0m \u001b[31m29.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading greenlet-3.0.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (620 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m620.0/620.0 kB\u001b[0m \u001b[31m61.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading httpx_ws-0.6.0-py3-none-any.whl (13 kB)\n",
+      "Downloading oauthlib-3.2.2-py3-none-any.whl (151 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m151.7/151.7 kB\u001b[0m \u001b[31m24.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading python_box-7.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.3 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.3/4.3 MB\u001b[0m \u001b[31m131.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading python_jsonpath-1.1.1-py3-none-any.whl (51 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m51.5/51.5 kB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading iso8601-2.1.0-py3-none-any.whl (7.5 kB)\n",
+      "Downloading Mako-1.3.3-py3-none-any.whl (78 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.8/78.8 kB\u001b[0m \u001b[31m12.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading requests_oauthlib-2.0.0-py2.py3-none-any.whl (24 kB)\n",
+      "Downloading pyasn1_modules-0.4.0-py3-none-any.whl (181 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m181.2/181.2 kB\u001b[0m \u001b[31m27.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading rsa-4.9-py3-none-any.whl (34 kB)\n",
+      "Downloading wsproto-1.2.0-py3-none-any.whl (24 kB)\n",
+      "Downloading pyasn1-0.6.0-py2.py3-none-any.whl (85 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m85.3/85.3 kB\u001b[0m \u001b[31m12.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hInstalling collected packages: wsproto, python-jsonpath, python-box, pyasn1, oauthlib, Mako, iso8601, greenlet, colorlog, asyncache, sqlalchemy, rsa, requests-oauthlib, pykube-ng, pyasn1-modules, cryptography, kubernetes-asyncio, kopf, httpx-ws, google-auth, alembic, optuna, kubernetes, kr8s, dask_kubernetes\n",
+      "Successfully installed Mako-1.3.3 alembic-1.13.1 asyncache-0.3.1 colorlog-6.8.2 cryptography-42.0.7 dask_kubernetes-2024.5.0 google-auth-2.29.0 greenlet-3.0.3 httpx-ws-0.6.0 iso8601-2.1.0 kopf-1.37.2 kr8s-0.14.4 kubernetes-29.0.0 kubernetes-asyncio-29.0.0 oauthlib-3.2.2 optuna-3.6.1 pyasn1-0.6.0 pyasn1-modules-0.4.0 pykube-ng-23.6.0 python-box-7.1.1 python-jsonpath-1.1.1 requests-oauthlib-2.0.0 rsa-4.9 sqlalchemy-2.0.30 wsproto-1.2.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install dask_kubernetes optuna"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "acc8f524-dc9b-41d7-8faa-3aea23ee1983",
+   "metadata": {},
+   "source": [
+    "## Import Python modules"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "0c8a1ffb-0b03-4d4a-9ab1-0561bf5533d9",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import threading\n",
+    "import warnings\n",
+    "\n",
+    "import cupy as cp\n",
+    "import cuspatial\n",
+    "import dask_cudf\n",
+    "import optuna\n",
+    "from cuml.dask.common import utils as dask_utils\n",
+    "from dask.distributed import Client, wait\n",
+    "from dask_kubernetes.operator import KubeCluster\n",
+    "from dask_ml.metrics import mean_squared_error\n",
+    "from dask_ml.model_selection import KFold\n",
+    "from xgboost import dask as dxgb"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b2d61e0b-229b-40c0-889d-b8242e574fc8",
+   "metadata": {},
+   "source": [
+    "## Set up multiple Dask clusters\n",
+    "\n",
+    "To run multi-GPU training jobs in parallel, we will create multiple Dask clusters each controlling its share of GPUs. It's best to think of each Dask cluster as a portion of the compute resource of the Kubernetes cluster.\n",
+    "\n",
+    "Fill in the following variables:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "d1c22c3c-51b2-4526-b1fa-ac012f616e13",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "n_clusters=2\n",
+      "n_worker_per_dask_cluster=2\n",
+      "n_node_per_dask_cluster=3\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Number of nodes in the Kubernetes cluster.\n",
+    "# Each node is assumed to have a single NVIDIA GPU attached\n",
+    "n_nodes = 7\n",
+    "\n",
+    "# Number of worker nodes to be assigned to each Dask cluster\n",
+    "n_worker_per_dask_cluster = 2\n",
+    "\n",
+    "# Number of nodes to be assigned to each Dask cluster\n",
+    "# 1 is added since the Dask cluster's scheduler process needs to be mapped to its own node\n",
+    "n_node_per_dask_cluster = n_worker_per_dask_cluster + 1\n",
+    "\n",
+    "# Number of Dask clusters to be created\n",
+    "# Subtract 1 to account for the notebook pod (it requires its own node)\n",
+    "n_clusters = (n_nodes - 1) // n_node_per_dask_cluster\n",
+    "\n",
+    "print(f\"{n_clusters=}\")\n",
+    "if n_clusters == 0:\n",
+    "    raise ValueError(\n",
+    "        \"No cluster can be created. Reduce `n_worker_per_dask_cluster` or create more compute nodes\"\n",
+    "    )\n",
+    "print(f\"{n_worker_per_dask_cluster=}\")\n",
+    "print(f\"{n_node_per_dask_cluster=}\")\n",
+    "\n",
+    "n_node_active = n_clusters * n_node_per_dask_cluster + 1\n",
+    "if n_node_active != n_nodes:\n",
+    "    n_idle = n_nodes - n_node_active\n",
+    "    warnings.warn(f\"{n_idle} node(s) will not be used\", stacklevel=2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c0eee823-162f-47e9-be4c-41447b2d7ee9",
+   "metadata": {},
+   "source": [
+    "Once we've determined the number of Dask clusters and their size, we are now ready to launch them:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "8d0b632a-b73d-4351-bb5d-8a1f4ab1a2c4",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Choose the same RAPIDS image you used for launching the notebook session\n",
+    "rapids_image = \"{{ rapids_notebook_container }}\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "62aa9e52-c5b6-487c-8f02-88ea84980cfc",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e24e5095ae78458e804d5f1212372f9a",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Output()"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     },
-    "nbformat": 4,
-    "nbformat_minor": 5
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Launching cluster 0...\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "240e689def1549c1b5dfd87284192e96",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Output()"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Launching cluster 1...\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "clusters = []\n",
+    "for i in range(n_clusters):\n",
+    "    print(f\"Launching cluster {i}...\")\n",
+    "    clusters.append(\n",
+    "        KubeCluster(\n",
+    "            name=f\"rapids-dask{i}\",\n",
+    "            image=rapids_image,\n",
+    "            worker_command=\"dask-cuda-worker\",\n",
+    "            n_workers=2,\n",
+    "            resources={\"limits\": {\"nvidia.com/gpu\": \"1\"}},\n",
+    "            env={\"EXTRA_PIP_PACKAGES\": \"optuna\"},\n",
+    "        )\n",
+    "    )"
+   ]
+  },
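+  {
+   "cell_type": "markdown",
+   "id": "3f2a9c1e-7b4d-4e5a-9c8b-2d1e6f7a8b9c",
+   "metadata": {},
+   "source": [
+    "Optionally, confirm that the clusters are reachable before starting the optimization by printing each cluster's dashboard address (`dashboard_link` is part of the standard Dask cluster interface):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5c8d7e6f-1a2b-4c3d-8e9f-0a1b2c3d4e5f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional sanity check: list the dashboard address of each Dask cluster\n",
+    "for i, cluster in enumerate(clusters):\n",
+    "    print(f\"Cluster {i}: {cluster.dashboard_link}\")"
+   ]
+  },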
+  {
+   "cell_type": "markdown",
+   "id": "f37fa67f-fa90-432c-bed3-8f2a8a095795",
+   "metadata": {},
+   "source": [
+    "## Set up Hyperparameter Optimization Task with NYC Taxi data\n",
+    "\n",
+    "Anaconda has graciously made some of the NYC Taxi dataset available in a public Google Cloud Storage bucket. We'll use our Cluster of GPUs to process it and train a model that predicts the fare amount. We'll use our Dask clusters to process it and train a model that predicts the fare amount."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "c84929a5-f13b-4a61-9ed6-aa8060129e17",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "col_dtype = {\n",
+    "    \"VendorID\": \"int32\",\n",
+    "    \"tpep_pickup_datetime\": \"datetime64[ms]\",\n",
+    "    \"tpep_dropoff_datetime\": \"datetime64[ms]\",\n",
+    "    \"passenger_count\": \"int32\",\n",
+    "    \"trip_distance\": \"float32\",\n",
+    "    \"pickup_longitude\": \"float32\",\n",
+    "    \"pickup_latitude\": \"float32\",\n",
+    "    \"RatecodeID\": \"int32\",\n",
+    "    \"store_and_fwd_flag\": \"int32\",\n",
+    "    \"dropoff_longitude\": \"float32\",\n",
+    "    \"dropoff_latitude\": \"float32\",\n",
+    "    \"payment_type\": \"int32\",\n",
+    "    \"fare_amount\": \"float32\",\n",
+    "    \"extra\": \"float32\",\n",
+    "    \"mta_tax\": \"float32\",\n",
+    "    \"tip_amount\": \"float32\",\n",
+    "    \"total_amount\": \"float32\",\n",
+    "    \"tolls_amount\": \"float32\",\n",
+    "    \"improvement_surcharge\": \"float32\",\n",
+    "}\n",
+    "\n",
+    "\n",
+    "must_haves = {\n",
+    "    \"pickup_datetime\": \"datetime64[ms]\",\n",
+    "    \"dropoff_datetime\": \"datetime64[ms]\",\n",
+    "    \"passenger_count\": \"int32\",\n",
+    "    \"trip_distance\": \"float32\",\n",
+    "    \"pickup_longitude\": \"float32\",\n",
+    "    \"pickup_latitude\": \"float32\",\n",
+    "    \"rate_code\": \"int32\",\n",
+    "    \"dropoff_longitude\": \"float32\",\n",
+    "    \"dropoff_latitude\": \"float32\",\n",
+    "    \"fare_amount\": \"float32\",\n",
+    "}\n",
+    "\n",
+    "\n",
+    "def compute_haversine_distance(df):\n",
+    "    pickup = cuspatial.GeoSeries.from_points_xy(\n",
+    "        df[[\"pickup_longitude\", \"pickup_latitude\"]].interleave_columns()\n",
+    "    )\n",
+    "    dropoff = cuspatial.GeoSeries.from_points_xy(\n",
+    "        df[[\"dropoff_longitude\", \"dropoff_latitude\"]].interleave_columns()\n",
+    "    )\n",
+    "    df[\"haversine_distance\"] = cuspatial.haversine_distance(pickup, dropoff)\n",
+    "    df[\"haversine_distance\"] = df[\"haversine_distance\"].astype(\"float32\")\n",
+    "    return df\n",
+    "\n",
+    "\n",
+    "def clean(ddf, must_haves):\n",
+    "    # replace the extraneous spaces in column names and lower the font type\n",
+    "    tmp = {col: col.strip().lower() for col in list(ddf.columns)}\n",
+    "    ddf = ddf.rename(columns=tmp)\n",
+    "\n",
+    "    ddf = ddf.rename(\n",
+    "        columns={\n",
+    "            \"tpep_pickup_datetime\": \"pickup_datetime\",\n",
+    "            \"tpep_dropoff_datetime\": \"dropoff_datetime\",\n",
+    "            \"ratecodeid\": \"rate_code\",\n",
+    "        }\n",
+    "    )\n",
+    "\n",
+    "    ddf[\"pickup_datetime\"] = ddf[\"pickup_datetime\"].astype(\"datetime64[ms]\")\n",
+    "    ddf[\"dropoff_datetime\"] = ddf[\"dropoff_datetime\"].astype(\"datetime64[ms]\")\n",
+    "\n",
+    "    for col in ddf.columns:\n",
+    "        if col not in must_haves:\n",
+    "            ddf = ddf.drop(columns=col)\n",
+    "            continue\n",
+    "        if ddf[col].dtype == \"object\":\n",
+    "            # Fixing error: could not convert arg to str\n",
+    "            ddf = ddf.drop(columns=col)\n",
+    "        else:\n",
+    "            # downcast from 64bit to 32bit types\n",
+    "            # Tesla T4 are faster on 32bit ops\n",
+    "            if \"int\" in str(ddf[col].dtype):\n",
+    "                ddf[col] = ddf[col].astype(\"int32\")\n",
+    "            if \"float\" in str(ddf[col].dtype):\n",
+    "                ddf[col] = ddf[col].astype(\"float32\")\n",
+    "            ddf[col] = ddf[col].fillna(-1)\n",
+    "\n",
+    "    return ddf\n",
+    "\n",
+    "\n",
+    "def prepare_data(client):\n",
+    "    taxi_df = dask_cudf.read_csv(\n",
+    "        \"https://storage.googleapis.com/anaconda-public-data/nyc-taxi/csv/2016/yellow_tripdata_2016-02.csv\",\n",
+    "        dtype=col_dtype,\n",
+    "    )\n",
+    "    taxi_df = taxi_df.map_partitions(clean, must_haves, meta=must_haves)\n",
+    "\n",
+    "    ## add features\n",
+    "    taxi_df[\"hour\"] = taxi_df[\"pickup_datetime\"].dt.hour.astype(\"int32\")\n",
+    "    taxi_df[\"year\"] = taxi_df[\"pickup_datetime\"].dt.year.astype(\"int32\")\n",
+    "    taxi_df[\"month\"] = taxi_df[\"pickup_datetime\"].dt.month.astype(\"int32\")\n",
+    "    taxi_df[\"day\"] = taxi_df[\"pickup_datetime\"].dt.day.astype(\"int32\")\n",
+    "    taxi_df[\"day_of_week\"] = taxi_df[\"pickup_datetime\"].dt.weekday.astype(\"int32\")\n",
+    "    taxi_df[\"is_weekend\"] = (taxi_df[\"day_of_week\"] >= 5).astype(\"int32\")\n",
+    "\n",
+    "    # calculate the time difference between dropoff and pickup.\n",
+    "    taxi_df[\"diff\"] = taxi_df[\"dropoff_datetime\"].astype(\"int32\") - taxi_df[\n",
+    "        \"pickup_datetime\"\n",
+    "    ].astype(\"int32\")\n",
+    "    taxi_df[\"diff\"] = (taxi_df[\"diff\"] / 1000).astype(\"int32\")\n",
+    "\n",
+    "    taxi_df[\"pickup_latitude_r\"] = taxi_df[\"pickup_latitude\"] // 0.01 * 0.01\n",
+    "    taxi_df[\"pickup_longitude_r\"] = taxi_df[\"pickup_longitude\"] // 0.01 * 0.01\n",
+    "    taxi_df[\"dropoff_latitude_r\"] = taxi_df[\"dropoff_latitude\"] // 0.01 * 0.01\n",
+    "    taxi_df[\"dropoff_longitude_r\"] = taxi_df[\"dropoff_longitude\"] // 0.01 * 0.01\n",
+    "\n",
+    "    taxi_df = taxi_df.drop(\"pickup_datetime\", axis=1)\n",
+    "    taxi_df = taxi_df.drop(\"dropoff_datetime\", axis=1)\n",
+    "\n",
+    "    taxi_df = taxi_df.map_partitions(compute_haversine_distance)\n",
+    "\n",
+    "    X = (\n",
+    "        taxi_df.drop([\"fare_amount\"], axis=1)\n",
+    "        .astype(\"float32\")\n",
+    "        .to_dask_array(lengths=True)\n",
+    "    )\n",
+    "    y = taxi_df[\"fare_amount\"].astype(\"float32\").to_dask_array(lengths=True)\n",
+    "\n",
+    "    X._meta = cp.asarray(X._meta)\n",
+    "    y._meta = cp.asarray(y._meta)\n",
+    "\n",
+    "    X, y = dask_utils.persist_across_workers(client, [X, y])\n",
+    "    return X, y\n",
+    "\n",
+    "\n",
+    "def train_model(params):\n",
+    "    cluster = get_cluster(threading.get_ident())\n",
+    "\n",
+    "    default_params = {\n",
+    "        \"objective\": \"reg:squarederror\",\n",
+    "        \"eval_metric\": \"rmse\",\n",
+    "        \"verbosity\": 0,\n",
+    "        \"tree_method\": \"hist\",\n",
+    "        \"device\": \"cuda\",\n",
+    "    }\n",
+    "    params = dict(default_params, **params)\n",
+    "\n",
+    "    with Client(cluster) as client:\n",
+    "        X, y = prepare_data(client)\n",
+    "        wait([X, y])\n",
+    "\n",
+    "        scores = []\n",
+    "        kfold = KFold(n_splits=5, shuffle=False)\n",
+    "        for train_index, test_index in kfold.split(X, y):\n",
+    "            dtrain = dxgb.DaskQuantileDMatrix(client, X[train_index, :], y[train_index])\n",
+    "            dtest = dxgb.DaskQuantileDMatrix(client, X[test_index, :], y[test_index])\n",
+    "            model = dxgb.train(\n",
+    "                client,\n",
+    "                params,\n",
+    "                dtrain,\n",
+    "                num_boost_round=10,\n",
+    "                verbose_eval=False,\n",
+    "            )\n",
+    "            y_test_pred = dxgb.predict(client, model, dtest).to_backend(\"cupy\")\n",
+    "            rmse_score = mean_squared_error(y[test_index], y_test_pred, squared=False)\n",
+    "            scores.append(rmse_score)\n",
+    "        return sum(scores) / len(scores)\n",
+    "\n",
+    "\n",
+    "def objective(trial):\n",
+    "    params = {\n",
+    "        \"n_estimators\": trial.suggest_int(\"n_estimators\", 2, 4),\n",
+    "        \"learning_rate\": trial.suggest_float(\"learning_rate\", 0.5, 0.7),\n",
+    "        \"colsample_bytree\": trial.suggest_float(\"colsample_bytree\", 0.5, 1),\n",
+    "        \"colsample_bynode\": trial.suggest_float(\"colsample_bynode\", 0.5, 1),\n",
+    "        \"colsample_bylevel\": trial.suggest_float(\"colsample_bylevel\", 0.5, 1),\n",
+    "        \"reg_lambda\": trial.suggest_float(\"reg_lambda\", 0, 1),\n",
+    "        \"max_depth\": trial.suggest_int(\"max_depth\", 1, 6),\n",
+    "        \"max_leaves\": trial.suggest_int(\"max_leaves\", 0, 2),\n",
+    "        \"max_cat_to_onehot\": trial.suggest_int(\"max_cat_to_onehot\", 1, 10),\n",
+    "    }\n",
+    "    return train_model(params)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0c401aa1-2aeb-43d9-955b-4dfd7b495fe9",
+   "metadata": {},
+   "source": [
+    "To kick off multiple training jobs in parallel, we will launch multiple threads, so that each thread controls a Dask cluster.\n",
+    "One important utility function is `get_cluster`, which returns the Dask cluster that's mapped to a given thread."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "97cdeb8a-330e-4d96-92d4-d48c93828d9d",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Map each thread's integer ID to a sequential number (0, 1, 2 ...)\n",
+    "thread_id_map: dict[int, KubeCluster] = {}\n",
+    "thread_id_map_lock = threading.Lock()\n",
+    "\n",
+    "\n",
+    "def get_cluster(thread_id: int) -> KubeCluster:\n",
+    "    with thread_id_map_lock:\n",
+    "        try:\n",
+    "            return clusters[thread_id_map[thread_id]]\n",
+    "        except KeyError:\n",
+    "            seq_id = len(thread_id_map)\n",
+    "            thread_id_map[thread_id] = seq_id\n",
+    "            return clusters[seq_id]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2e7c923b-f4ea-4f38-b3a5-92dfcd47dfff",
+   "metadata": {},
+   "source": [
+    "Now we are ready to start hyperparameter optimization."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "c557d769-0be6-4319-b7f5-8ad52b824961",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[I 2024-05-09 07:53:00,718] A new study created in memory with name: no-name-da830427-bce3-4e42-98e6-c98c0c3da0d7\n"
+     ]
+    }
+   ],
+   "source": [
+    "n_trials = (\n",
+    "    10  # set to a low number so that the demo finishes quickly. Feel free to adjust\n",
+    ")\n",
+    "study = optuna.create_study(direction=\"minimize\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "94ece2d0-b3f7-44c8-9b4e-a2f60fd623b9",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[I 2024-05-09 07:54:10,229] Trial 1 finished with value: 59.449462890625 and parameters: {'n_estimators': 4, 'learning_rate': 0.6399993857892183, 'colsample_bytree': 0.7020623988319513, 'colsample_bynode': 0.777468318546648, 'colsample_bylevel': 0.7890749134903386, 'reg_lambda': 0.4464953694744921, 'max_depth': 3, 'max_leaves': 0, 'max_cat_to_onehot': 9}. Best is trial 1 with value: 59.449462890625.\n",
+      "[I 2024-05-09 07:54:19,507] Trial 0 finished with value: 57.77985763549805 and parameters: {'n_estimators': 4, 'learning_rate': 0.674087333032356, 'colsample_bytree': 0.557642421113256, 'colsample_bynode': 0.9719449711676733, 'colsample_bylevel': 0.6984302171973646, 'reg_lambda': 0.7201514298169174, 'max_depth': 4, 'max_leaves': 1, 'max_cat_to_onehot': 4}. Best is trial 0 with value: 57.77985763549805.\n",
+      "[I 2024-05-09 07:54:59,524] Trial 2 finished with value: 57.77985763549805 and parameters: {'n_estimators': 2, 'learning_rate': 0.6894880267544121, 'colsample_bytree': 0.8171662437182604, 'colsample_bynode': 0.549527686217645, 'colsample_bylevel': 0.890212178266078, 'reg_lambda': 0.5847298606135033, 'max_depth': 2, 'max_leaves': 1, 'max_cat_to_onehot': 5}. Best is trial 0 with value: 57.77985763549805.\n",
+      "[I 2024-05-09 07:55:22,013] Trial 3 finished with value: 55.01234817504883 and parameters: {'n_estimators': 4, 'learning_rate': 0.6597614733926671, 'colsample_bytree': 0.8437061126308156, 'colsample_bynode': 0.621479934699203, 'colsample_bylevel': 0.8330951489228277, 'reg_lambda': 0.7830102753448884, 'max_depth': 2, 'max_leaves': 2, 'max_cat_to_onehot': 2}. Best is trial 3 with value: 55.01234817504883.\n",
+      "[I 2024-05-09 07:56:00,678] Trial 4 finished with value: 57.77985763549805 and parameters: {'n_estimators': 4, 'learning_rate': 0.5994587326401378, 'colsample_bytree': 0.9799078215504886, 'colsample_bynode': 0.9766955839079614, 'colsample_bylevel': 0.5088864363378924, 'reg_lambda': 0.18103184809548734, 'max_depth': 3, 'max_leaves': 1, 'max_cat_to_onehot': 4}. Best is trial 3 with value: 55.01234817504883.\n",
+      "[I 2024-05-09 07:56:11,773] Trial 5 finished with value: 54.936126708984375 and parameters: {'n_estimators': 2, 'learning_rate': 0.5208827661289628, 'colsample_bytree': 0.866258912492528, 'colsample_bynode': 0.6368815844513638, 'colsample_bylevel': 0.9539603435186208, 'reg_lambda': 0.21390618865079458, 'max_depth': 4, 'max_leaves': 2, 'max_cat_to_onehot': 4}. Best is trial 5 with value: 54.936126708984375.\n",
+      "[I 2024-05-09 07:56:48,737] Trial 6 finished with value: 57.77985763549805 and parameters: {'n_estimators': 2, 'learning_rate': 0.6137888371528442, 'colsample_bytree': 0.9621063205689744, 'colsample_bynode': 0.5306812468481084, 'colsample_bylevel': 0.8527827651989199, 'reg_lambda': 0.3315799968401767, 'max_depth': 6, 'max_leaves': 1, 'max_cat_to_onehot': 9}. Best is trial 5 with value: 54.936126708984375.\n",
+      "[I 2024-05-09 07:56:59,261] Trial 7 finished with value: 55.204200744628906 and parameters: {'n_estimators': 3, 'learning_rate': 0.6831416027240611, 'colsample_bytree': 0.5311840770388268, 'colsample_bynode': 0.9572535535110238, 'colsample_bylevel': 0.6846894032354778, 'reg_lambda': 0.6091211134408249, 'max_depth': 3, 'max_leaves': 2, 'max_cat_to_onehot': 5}. Best is trial 5 with value: 54.936126708984375.\n",
+      "[I 2024-05-09 07:57:37,674] Trial 8 finished with value: 54.93584442138672 and parameters: {'n_estimators': 4, 'learning_rate': 0.620742285616388, 'colsample_bytree': 0.7969398985157778, 'colsample_bynode': 0.9049707375663323, 'colsample_bylevel': 0.7209693969245297, 'reg_lambda': 0.6158847054585023, 'max_depth': 1, 'max_leaves': 0, 'max_cat_to_onehot': 10}. Best is trial 8 with value: 54.93584442138672.\n",
+      "[I 2024-05-09 07:57:50,310] Trial 9 finished with value: 57.76123809814453 and parameters: {'n_estimators': 3, 'learning_rate': 0.5475197727057007, 'colsample_bytree': 0.5381502848057452, 'colsample_bynode': 0.8514705732161596, 'colsample_bylevel': 0.9139277684007088, 'reg_lambda': 0.5117732009332318, 'max_depth': 4, 'max_leaves': 0, 'max_cat_to_onehot': 5}. Best is trial 8 with value: 54.93584442138672.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# With n_jobs parameter, Optuna will launch [n_clusters] threads internally\n",
+    "# Each thread will deploy a training job to a Dask cluster\n",
+    "study.optimize(objective, n_trials=n_trials, n_jobs=n_clusters)"
+   ]
+  },
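+  {
+   "cell_type": "markdown",
+   "id": "9e8f7a6b-5c4d-4e3f-8a2b-1c0d9e8f7a6b",
+   "metadata": {},
+   "source": [
+    "Once the study finishes, the best hyperparameter combination and its score are available on the study object. When you are done, close the Dask clusters to return their GPU nodes to Kubernetes:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7a6b5c4d-3e2f-4a1b-9c8d-7e6f5a4b3c2d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Inspect the best trial found by Optuna\n",
+    "print(f\"Best cross-validated RMSE: {study.best_value}\")\n",
+    "print(f\"Best hyperparameters: {study.best_params}\")\n",
+    "\n",
+    "# Shut down the Dask clusters to release the GPU nodes\n",
+    "for cluster in clusters:\n",
+    "    cluster.close()"
+   ]
+  },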
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ac5b3cba-87ba-4470-a166-b6a0815f85e4",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
 }
diff --git a/source/guides/azure/infiniband.md b/source/guides/azure/infiniband.md
index aaaff5a4..f2feff4e 100644
--- a/source/guides/azure/infiniband.md
+++ b/source/guides/azure/infiniband.md
@@ -257,10 +257,9 @@ Then start a new shell.
 
 Create a conda environment (see [UCX-Py](https://ucx-py.readthedocs.io/en/latest/install.html) docs)
 
-```shell
+````shell
 mamba create -n ucxpy {{ rapids_conda_channels }} {{ rapids_conda_packages }} ipython ucx-proc=*=gpu ucx ucx-py dask distributed numpy cupy pytest pynvml -y
 mamba activate ucxpy
-```
 
 Clone UCX-Py repo locally