From ce9b01fc14e1b4c4d916422e48c37c3fea9b80ca Mon Sep 17 00:00:00 2001 From: Jessica Wang Date: Tue, 4 Oct 2022 15:01:21 -0700 Subject: [PATCH 1/3] chore: bump to spark 3.3.2 --- build.sbt | 2 +- .../azure/synapse/ml/codegen/PyCodegen.scala | 8 +++--- .../azure/synapse/ml/codegen/RTestGen.scala | 2 +- .../ml/nbtest/DatabricksGPUTests.scala | 6 +---- .../ml/nbtest/DatabricksUtilities.scala | 16 +++++++----- .../synapse/ml/nbtest/SynapseUtilities.scala | 2 +- .../src/main/python/horovod_installation.sh | 15 +++++------ .../synapse/ml/dl/DeepTextClassifier.py | 6 ++--- .../synapse/ml/dl/DeepVisionClassifier.py | 6 ++--- .../python/synapse/ml/dl/LitDeepTextModel.py | 6 ++--- .../Quickstart - Isolation Forests.ipynb | 10 ++++---- ...kstart - Fine-tune a Text Classifier.ipynb | 25 +++++++++++++------ ...tart - Fine-tune a Vision Classifier.ipynb | 9 +++++++ .../Hyperparameter Tuning/HyperOpt.ipynb | 2 +- environment.yml | 12 ++++----- pipeline.yaml | 4 +-- start | 4 +-- tools/docker/demo/Dockerfile | 4 +-- tools/docker/minimal/Dockerfile | 4 +-- tools/dotnet/dotnetSetup.sh | 8 +++--- tools/tests/run_r_tests.R | 2 +- 21 files changed, 84 insertions(+), 69 deletions(-) diff --git a/build.sbt b/build.sbt index fac44f06cd..af3232b6a6 100644 --- a/build.sbt +++ b/build.sbt @@ -8,7 +8,7 @@ import scala.xml.transform.{RewriteRule, RuleTransformer} import scala.xml.{Node => XmlNode, NodeSeq => XmlNodeSeq, _} val condaEnvName = "synapseml" -val sparkVersion = "3.2.3" +val sparkVersion = "3.3.2" name := "synapseml" ThisBuild / organization := "com.microsoft.azure" ThisBuild / scalaVersion := "2.12.15" diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/codegen/PyCodegen.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/codegen/PyCodegen.scala index f6fd86e438..be19cc81a2 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/codegen/PyCodegen.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/codegen/PyCodegen.scala @@ -68,11 +68,11 @@ 
object PyCodegen { // There's `Already borrowed` error found in transformers 4.16.2 when using tokenizers s"""extras_require={"extras": [ | "cmake", - | "horovod==0.25.0", + | "horovod==0.27.0", | "pytorch_lightning>=1.5.0,<1.5.10", - | "torch==1.11.0", - | "torchvision>=0.12.0", - | "transformers==4.15.0", + | "torch==1.13.1", + | "torchvision>=0.14.1", + | "transformers==4.25.1", | "petastorm>=0.12.0", | "huggingface-hub>=0.8.1", |]}, diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/codegen/RTestGen.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/codegen/RTestGen.scala index 8e3fd1e85f..0b129e180f 100644 --- a/core/src/test/scala/com/microsoft/azure/synapse/ml/codegen/RTestGen.scala +++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/codegen/RTestGen.scala @@ -101,7 +101,7 @@ object RTestGen { | "spark.sql.shuffle.partitions=10", | "spark.sql.crossJoin.enabled=true") | - |sc <- spark_connect(master = "local", version = "3.2.4", config = conf) + |sc <- spark_connect(master = "local", version = "3.3.2", config = conf) | |""".stripMargin, StandardOpenOption.CREATE) diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksGPUTests.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksGPUTests.scala index be308c7af7..d99ac4a672 100644 --- a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksGPUTests.scala +++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksGPUTests.scala @@ -11,11 +11,7 @@ import java.io.File import scala.collection.mutable.ListBuffer class DatabricksGPUTests extends DatabricksTestHelper { - val horovodInstallationScript: File = FileUtilities.join( - BuildInfo.baseDirectory.getParent, "deep-learning", - "src", "main", "python", "horovod_installation.sh").getCanonicalFile - uploadFileToDBFS(horovodInstallationScript, "/FileStore/horovod-fix-commit/horovod_installation.sh") - val clusterId: String = createClusterInPool(GPUClusterName, 
AdbGpuRuntime, 2, GpuPoolId, GPUInitScripts) + val clusterId: String = createClusterInPool(GPUClusterName, AdbGpuRuntime, 2, GpuPoolId, "[]") val jobIdsToCancel: ListBuffer[Int] = databricksTestHelper( clusterId, GPULibraries, GPUNotebooks) diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksUtilities.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksUtilities.scala index 43f5203324..678c3fa4ee 100644 --- a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksUtilities.scala +++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/DatabricksUtilities.scala @@ -29,10 +29,11 @@ object DatabricksUtilities { // ADB Info val Region = "eastus" - val PoolName = "synapseml-build-10.4" - val GpuPoolName = "synapseml-build-10.4-gpu" - val AdbRuntime = "10.4.x-scala2.12" - val AdbGpuRuntime = "10.4.x-gpu-ml-scala2.12" + val PoolName = "synapseml-build-12.2" + val GpuPoolName = "synapseml-build-12.2-gpu" + val AdbRuntime = "12.2.x-scala2.12" + // https://learn.microsoft.com/en-us/azure/databricks/release-notes/runtime/ + val AdbGpuRuntime = "12.2.x-gpu-ml-scala2.12" val NumWorkers = 5 val AutoTerminationMinutes = 15 @@ -75,8 +76,11 @@ object DatabricksUtilities { // TODO: install synapse.ml.dl wheel package here val GPULibraries: String = List( Map("maven" -> Map("coordinates" -> PackageMavenCoordinate, "repo" -> PackageRepository)), - Map("pypi" -> Map("package" -> "transformers==4.15.0")), - Map("pypi" -> Map("package" -> "petastorm==0.12.0")) + Map("pypi" -> Map("package" -> "pytorch-lightning==1.5.0")), + Map("pypi" -> Map("package" -> "torchvision==0.14.1")), + Map("pypi" -> Map("package" -> "transformers==4.25.1")), + Map("pypi" -> Map("package" -> "petastorm==0.12.1")), + Map("pypi" -> Map("package" -> "protobuf==3.19.4")) ).toJson.compactPrint val GPUInitScripts: String = List( diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala 
b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala index 317218c08d..478e829d79 100644 --- a/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala +++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/nbtest/SynapseUtilities.scala @@ -255,7 +255,7 @@ object SynapseUtilities { | "nodeSizeFamily": "MemoryOptimized", | "provisioningState": "Succeeded", | "sessionLevelPackagesEnabled": "true", - | "sparkVersion": "3.2" + | "sparkVersion": "3.3" | } |} |""".stripMargin diff --git a/deep-learning/src/main/python/horovod_installation.sh b/deep-learning/src/main/python/horovod_installation.sh index b983be0dad..8bd5f19c02 100644 --- a/deep-learning/src/main/python/horovod_installation.sh +++ b/deep-learning/src/main/python/horovod_installation.sh @@ -7,10 +7,10 @@ set -eu # Install prerequisite libraries that horovod depends on pip install pytorch-lightning==1.5.0 -pip install torchvision==0.12.0 -pip install transformers==4.15.0 +pip install torchvision==0.14.1 +pip install transformers==4.25.1 pip install petastorm>=0.12.0 -pip install protobuf==3.20.3 +pip install protobuf==3.19.1 # Remove Outdated Signing Key: sudo apt-key del 7fa2af80 @@ -35,11 +35,8 @@ libcusparse-dev-11-0=11.1.1.245-1 git clone --recursive https://github.com/horovod/horovod.git cd horovod -# # fix version 0.25.0 -# git fetch origin refs/tags/v0.25.0:tags/v0.25.0 -# git checkout tags/v0.25.0 -b v0.25.0-branch -# fix to this commit number until they release a new version -git checkout ab97fd15bbba3258adcdd12983f36a1cdeacbc94 +# git fetch origin refs/tags/v0.27.0:tags/v0.27.0 +git checkout bfaca90d5cf66780a97d8799d4e1573855b64560 git checkout -b tmp-branch rm -rf build/ dist/ HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_CUDA_HOME=/usr/local/cuda-11/ HOROVOD_WITH_PYTORCH=1 HOROVOD_WITHOUT_MXNET=1 \ @@ -47,4 +44,4 @@ HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_CUDA_HOME=/usr/local/cuda-11/ HOROVOD_WITH_PY readlink -f dist/horovod-*.whl -pip install --no-cache-dir 
dist/horovod-0.25.0-cp38-cp38-linux_x86_64.whl --force-reinstall --no-deps +pip install --no-cache-dir dist/horovod-0.27.0-cp38-cp38-linux_x86_64.whl --force-reinstall --no-deps diff --git a/deep-learning/src/main/python/synapse/ml/dl/DeepTextClassifier.py b/deep-learning/src/main/python/synapse/ml/dl/DeepTextClassifier.py index 0702fc828b..b6f3bc81dc 100644 --- a/deep-learning/src/main/python/synapse/ml/dl/DeepTextClassifier.py +++ b/deep-learning/src/main/python/synapse/ml/dl/DeepTextClassifier.py @@ -11,12 +11,12 @@ if _TRANSFORMERS_AVAILABLE: import transformers - _TRANSFORMERS_EQUAL_4_15_0 = transformers.__version__ == "4.15.0" - if _TRANSFORMERS_EQUAL_4_15_0: + _TRANSFORMERS_EQUAL_4_25_1 = transformers.__version__ == "4.25.1" + if _TRANSFORMERS_EQUAL_4_25_1: from transformers import AutoTokenizer else: raise RuntimeError( - "transformers should be == 4.15.0, found: {}".format( + "transformers should be == 4.25.1, found: {}".format( transformers.__version__ ) ) diff --git a/deep-learning/src/main/python/synapse/ml/dl/DeepVisionClassifier.py b/deep-learning/src/main/python/synapse/ml/dl/DeepVisionClassifier.py index 2968fbd7a8..59dee2b5c9 100644 --- a/deep-learning/src/main/python/synapse/ml/dl/DeepVisionClassifier.py +++ b/deep-learning/src/main/python/synapse/ml/dl/DeepVisionClassifier.py @@ -19,10 +19,10 @@ if _HOROVOD_AVAILABLE: import horovod - _HOROVOD_EQUAL_0_25_0 = horovod.__version__ == "0.25.0" - if not _HOROVOD_EQUAL_0_25_0: + _HOROVOD_EQUAL_0_27_0 = horovod.__version__ == "0.27.0" + if not _HOROVOD_EQUAL_0_27_0: raise RuntimeError( - "horovod should be of version 0.25.0, found: {}".format(horovod.__version__) + "horovod should be of version 0.27.0, found: {}".format(horovod.__version__) ) else: raise ModuleNotFoundError("module not found: horovod") diff --git a/deep-learning/src/main/python/synapse/ml/dl/LitDeepTextModel.py b/deep-learning/src/main/python/synapse/ml/dl/LitDeepTextModel.py index 2283281c0b..b17b9f5f18 100644 --- 
a/deep-learning/src/main/python/synapse/ml/dl/LitDeepTextModel.py +++ b/deep-learning/src/main/python/synapse/ml/dl/LitDeepTextModel.py @@ -13,12 +13,12 @@ if _TRANSFORMERS_AVAILABLE: import transformers - _TRANSFORMERS_EQUAL_4_15_0 = transformers.__version__ == "4.15.0" - if _TRANSFORMERS_EQUAL_4_15_0: + _TRANSFORMERS_EQUAL_4_25_1 = transformers.__version__ == "4.25.1" + if _TRANSFORMERS_EQUAL_4_25_1: from transformers import AutoModelForSequenceClassification else: raise RuntimeError( - "transformers should be == 4.15.0, found: {}".format( + "transformers should be == 4.25.1, found: {}".format( transformers.__version__ ) ) diff --git a/docs/Explore Algorithms/Anomaly Detection/Quickstart - Isolation Forests.ipynb b/docs/Explore Algorithms/Anomaly Detection/Quickstart - Isolation Forests.ipynb index 3de3075f50..50e8cf6414 100644 --- a/docs/Explore Algorithms/Anomaly Detection/Quickstart - Isolation Forests.ipynb +++ b/docs/Explore Algorithms/Anomaly Detection/Quickstart - Isolation Forests.ipynb @@ -30,16 +30,16 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], - "source": [ - "%pip install sqlparse raiwidgets interpret-community mlflow==2.6.0" - ], "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } - } + }, + "outputs": [], + "source": [ + "%pip install sqlparse raiwidgets interpret-community mlflow==2.5.0" + ] }, { "cell_type": "markdown", diff --git a/docs/Explore Algorithms/Deep Learning/Quickstart - Fine-tune a Text Classifier.ipynb b/docs/Explore Algorithms/Deep Learning/Quickstart - Fine-tune a Text Classifier.ipynb index f730d14b58..8098b70162 100644 --- a/docs/Explore Algorithms/Deep Learning/Quickstart - Fine-tune a Text Classifier.ipynb +++ b/docs/Explore Algorithms/Deep Learning/Quickstart - Fine-tune a Text Classifier.ipynb @@ -16,24 +16,33 @@ }, { "cell_type": "markdown", - "source": [ - "### Environment Setup on databricks" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "### Environment Setup on 
databricks" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "# install cloudpickle 2.0.0 to add synapse module for usage of horovod\n", "%pip install cloudpickle==2.0.0 --force-reinstall --no-deps" - ], - "metadata": { - "collapsed": false - } + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install protobuf==3.20.1 --force-reinstall" + ] }, { "cell_type": "code", diff --git a/docs/Explore Algorithms/Deep Learning/Quickstart - Fine-tune a Vision Classifier.ipynb b/docs/Explore Algorithms/Deep Learning/Quickstart - Fine-tune a Vision Classifier.ipynb index a6e0930399..129ce05e5c 100644 --- a/docs/Explore Algorithms/Deep Learning/Quickstart - Fine-tune a Vision Classifier.ipynb +++ b/docs/Explore Algorithms/Deep Learning/Quickstart - Fine-tune a Vision Classifier.ipynb @@ -25,6 +25,15 @@ "%pip install cloudpickle==2.0.0 --force-reinstall --no-deps" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install protobuf==3.20.1 --force-reinstall" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/docs/Explore Algorithms/Hyperparameter Tuning/HyperOpt.ipynb b/docs/Explore Algorithms/Hyperparameter Tuning/HyperOpt.ipynb index 808f3c1488..d97c718ce7 100644 --- a/docs/Explore Algorithms/Hyperparameter Tuning/HyperOpt.ipynb +++ b/docs/Explore Algorithms/Hyperparameter Tuning/HyperOpt.ipynb @@ -31,7 +31,7 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install hyperopt mlflow" + "%pip install hyperopt mlflow==2.5.0" ] }, { diff --git a/environment.yml b/environment.yml index 9dac854ab7..729fdeae30 100644 --- a/environment.yml +++ b/environment.yml @@ -11,8 +11,7 @@ dependencies: - r-devtools=2.4.2 - pip: - pyarrow>=0.15.0 - - numpy>=1.19.3 - - pyspark==3.2.3 + - pyspark==3.3.2 - pandas==1.2.5 - wheel - sphinx==4.2.0 @@ -32,15 +31,16 @@ 
dependencies: - twine - jupyter - mlflow - - torch==1.11.0 - - torchvision==0.12.0 - - horovod==0.25.0 + - numpy + - torch==1.13.1 + - torchvision==0.14.1 + - horovod==0.27.0 - petastorm>=0.11.0 - pytorch_lightning==1.5.0 - onnxmltools==1.7.0 - matplotlib - Pillow - - transformers==4.15.0 + - transformers==4.25.1 - huggingface-hub>=0.8.1 - langchain==0.0.151 - openai==0.27.5 diff --git a/pipeline.yaml b/pipeline.yaml index b7a1bc4532..7bff95ea25 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -511,8 +511,8 @@ jobs: fi sbt publishM2 - SPARK_VERSION=3.2.4 - HADOOP_VERSION=3.2 + SPARK_VERSION=3.3.2 + HADOOP_VERSION=3 wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz (timeout 20m sbt "project $(PACKAGE)" coverage testR) || (echo "retrying" && timeout 20m sbt "project $(PACKAGE)" coverage testR) || (echo "retrying" && timeout 20m sbt "project $(PACKAGE)" coverage testR) - task: PublishTestResults@2 diff --git a/start b/start index 923654659e..8551543db4 100644 --- a/start +++ b/start @@ -1,8 +1,8 @@ #!/bin/bash export OPENMPI_VERSION="3.1.2" -export SPARK_VERSION="3.2.3" -export HADOOP_VERSION="2.7" +export SPARK_VERSION="3.3.2" +export HADOOP_VERSION="3.3" export SYNAPSEML_VERSION="0.11.2" # Binder compatibility version echo "Beginning Spark Session..." 
diff --git a/tools/docker/demo/Dockerfile b/tools/docker/demo/Dockerfile index 1400052515..16b4ebd0d5 100644 --- a/tools/docker/demo/Dockerfile +++ b/tools/docker/demo/Dockerfile @@ -3,8 +3,8 @@ FROM mcr.microsoft.com/oss/mirror/docker.io/library/ubuntu:20.04 ARG SYNAPSEML_VERSION=0.11.2 ARG DEBIAN_FRONTEND=noninteractive -ENV SPARK_VERSION=3.2.3 -ENV HADOOP_VERSION=2.7 +ENV SPARK_VERSION=3.3.2 +ENV HADOOP_VERSION=3 ENV SYNAPSEML_VERSION=${SYNAPSEML_VERSION} ENV JAVA_HOME /usr/lib/jvm/java-1.11.0-openjdk-amd64 diff --git a/tools/docker/minimal/Dockerfile b/tools/docker/minimal/Dockerfile index 44e298a762..e64269652d 100644 --- a/tools/docker/minimal/Dockerfile +++ b/tools/docker/minimal/Dockerfile @@ -3,8 +3,8 @@ FROM mcr.microsoft.com/oss/mirror/docker.io/library/ubuntu:20.04 ARG SYNAPSEML_VERSION=0.11.2 ARG DEBIAN_FRONTEND=noninteractive -ENV SPARK_VERSION=3.2.3 -ENV HADOOP_VERSION=2.7 +ENV SPARK_VERSION=3.3.2 +ENV HADOOP_VERSION=3 ENV SYNAPSEML_VERSION=${SYNAPSEML_VERSION} ENV JAVA_HOME /usr/lib/jvm/java-1.11.0-openjdk-amd64 diff --git a/tools/dotnet/dotnetSetup.sh b/tools/dotnet/dotnetSetup.sh index 1244caf479..1c7d732fec 100644 --- a/tools/dotnet/dotnetSetup.sh +++ b/tools/dotnet/dotnetSetup.sh @@ -20,11 +20,11 @@ echo "##vso[task.setvariable variable=DOTNET_WORKER_DIR]$DOTNET_WORKER_DIR" # Install Sleet dotnet tool install -g sleet -# Install Apache Spark-3.2 -curl https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz -o spark-3.2.0-bin-hadoop3.2.tgz +# Install Apache Spark-3.3 +curl https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz -o spark-3.3.2-bin-hadoop3.tgz mkdir ~/bin -tar -xzvf spark-3.2.0-bin-hadoop3.2.tgz -C ~/bin -export SPARK_HOME=~/bin/spark-3.2.0-bin-hadoop3.2/ +tar -xzvf spark-3.3.2-bin-hadoop3.tgz -C ~/bin +export SPARK_HOME=~/bin/spark-3.3.2-bin-hadoop3/ export PATH=$SPARK_HOME/bin:$PATH echo "##vso[task.setvariable variable=SPARK_HOME]$SPARK_HOME" echo "##vso[task.setvariable 
variable=PATH]$SPARK_HOME/bin:$PATH" diff --git a/tools/tests/run_r_tests.R b/tools/tests/run_r_tests.R index 0d66844fef..9c8f99a40f 100644 --- a/tools/tests/run_r_tests.R +++ b/tools/tests/run_r_tests.R @@ -3,7 +3,7 @@ if (!require("sparklyr")) { library("sparklyr") } -spark_install_tar(paste(getwd(), "/../../../../../../spark-3.2.4-bin-hadoop3.2.tgz", sep = "")) +spark_install_tar(paste(getwd(), "/../../../../../../spark-3.3.2-bin-hadoop3.tgz", sep = "")) options("testthat.output_file" = "../../../../r-test-results.xml") devtools::test(reporter = JunitReporter$new()) From 12cdec8e2d4480989bdf42174d1b666890996335 Mon Sep 17 00:00:00 2001 From: JessicaXYWang <108437381+JessicaXYWang@users.noreply.github.com> Date: Tue, 26 Sep 2023 15:40:28 -0700 Subject: [PATCH 2/3] update explanation dashboard notebook --- .../Explanation Dashboard.ipynb | 122 +++++++++--------- 1 file changed, 61 insertions(+), 61 deletions(-) diff --git a/docs/Explore Algorithms/Responsible AI/Explanation Dashboard.ipynb b/docs/Explore Algorithms/Responsible AI/Explanation Dashboard.ipynb index f543dc78f5..4b7211fb09 100644 --- a/docs/Explore Algorithms/Responsible AI/Explanation Dashboard.ipynb +++ b/docs/Explore Algorithms/Responsible AI/Explanation Dashboard.ipynb @@ -2,31 +2,34 @@ "cells": [ { "cell_type": "markdown", + "metadata": { + "collapsed": false + }, "source": [ "## Interpretability - Explanation Dashboard\n", "\n", "In this example, similar to the \"Interpretability - Tabular SHAP explainer\" notebook, we use Kernel SHAP to explain a tabular classification model built from the Adults Census dataset and then visualize the explanation in the ExplanationDashboard from https://github.com/microsoft/responsible-ai-widgets.\n", "\n", "First we import the packages and define some UDFs we will need later." 
- ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": null, - "outputs": [], - "source": [ - "%pip install raiwidgets itsdangerous==2.0.1 interpret-community" - ], "metadata": { "collapsed": false - } + }, + "outputs": [], + "source": [ + "%pip install raiwidgets itsdangerous==2.0.1 interpret-community numpy==1.21.6" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "from IPython.terminal.interactiveshell import TerminalInteractiveShell\n", @@ -40,23 +43,23 @@ "\n", "vec_access = udf(lambda v, i: float(v[i]), FloatType())\n", "vec2array = udf(lambda vec: vec.toArray().tolist(), ArrayType(FloatType()))" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "Now let's read the data and train a simple binary classification model." - ], "metadata": { "collapsed": false - } + }, + "source": [ + "Now let's read the data and train a simple binary classification model." + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "df = spark.read.parquet(\n", @@ -102,46 +105,46 @@ "lr = LogisticRegression(featuresCol=\"features\", labelCol=\"label\", weightCol=\"fnlwgt\")\n", "pipeline = Pipeline(stages=[strIndexer, onehotEnc, vectAssem, lr])\n", "model = pipeline.fit(training)" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "After the model is trained, we randomly select some observations to be explained." - ], "metadata": { "collapsed": false - } + }, + "source": [ + "After the model is trained, we randomly select some observations to be explained." 
+ ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "explain_instances = (\n", " model.transform(training).orderBy(rand()).limit(5).repartition(200).cache()\n", ")\n", "display(explain_instances)" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "We create a TabularSHAP explainer, set the input columns to all the features the model takes, specify the model and the target output column we are trying to explain. In this case, we are trying to explain the \"probability\" output which is a vector of length 2, and we are only looking at class 1 probability. Specify targetClasses to `[0, 1]` if you want to explain class 0 and 1 probability at the same time. Finally we sample 100 rows from the training data for background data, which is used for integrating out features in Kernel SHAP." - ], "metadata": { "collapsed": false - } + }, + "source": [ + "We create a TabularSHAP explainer, set the input columns to all the features the model takes, specify the model and the target output column we are trying to explain. In this case, we are trying to explain the \"probability\" output which is a vector of length 2, and we are only looking at class 1 probability. Specify targetClasses to `[0, 1]` if you want to explain class 0 and 1 probability at the same time. Finally we sample 100 rows from the training data for background data, which is used for integrating out features in Kernel SHAP." 
+ ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "shap = TabularSHAP(\n", @@ -155,24 +158,24 @@ ")\n", "\n", "shap_df = shap.transform(explain_instances)" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", + "metadata": { + "collapsed": false + }, "source": [ "Once we have the resulting dataframe, we extract the class 1 probability of the model output, the SHAP values for the target class, the original features and the true label. Then we convert it to a pandas dataframe for visualization.\n", "For each observation, the first element in the SHAP values vector is the base value (the mean output of the background dataset), and each of the following element is the SHAP values for each feature." - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "shaps = (\n", @@ -187,23 +190,23 @@ "shaps_local.sort_values(\"probability\", ascending=False, inplace=True, ignore_index=True)\n", "pd.set_option(\"display.max_colwidth\", None)\n", "shaps_local" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "We can visualize the explanation in the [interpret-community format](https://github.com/interpretml/interpret-community) in the ExplanationDashboard from https://github.com/microsoft/responsible-ai-widgets/" - ], "metadata": { "collapsed": false - } + }, + "source": [ + "We can visualize the explanation in the [interpret-community format](https://github.com/interpretml/interpret-community) in the ExplanationDashboard from https://github.com/microsoft/responsible-ai-widgets/" + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "import numpy as np\n", @@ -216,14 +219,14 @@ "local_importance_values = shaps_local[[\"shapValues\"]]\n", "eval_data = 
shaps_local[features]\n", "true_y = np.array(shaps_local[[\"label\"]])" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "list_local_importance_values = local_importance_values.values.tolist()\n", @@ -236,19 +239,16 @@ " # remove the bias from local importance values\n", " del converted_list[0]\n", " converted_importance_values.append(converted_list)" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "markdown", - "source": [ - "When running Synapse Analytics, please follow instructions here [Package management - Azure Synapse Analytics | Microsoft Docs](https://docs.microsoft.com/en-us/azure/synapse-analytics/spark/apache-spark-azure-portal-add-libraries) to install [\"raiwidgets\"](https://pypi.org/project/raiwidgets/) and [\"interpret-community\"](https://pypi.org/project/interpret-community/) packages." - ], "metadata": { "collapsed": false - } + }, + "source": [ + "When running Synapse Analytics, please follow instructions here [Package management - Azure Synapse Analytics | Microsoft Docs](https://docs.microsoft.com/en-us/azure/synapse-analytics/spark/apache-spark-azure-portal-add-libraries) to install [\"raiwidgets\"](https://pypi.org/project/raiwidgets/) and [\"interpret-community\"](https://pypi.org/project/interpret-community/) packages." 
+ ] }, { "cell_type": "code", From afb0054fa6fd7d21141b111bc35b1027a093379b Mon Sep 17 00:00:00 2001 From: JessicaXYWang <108437381+JessicaXYWang@users.noreply.github.com> Date: Tue, 26 Sep 2023 17:33:53 -0700 Subject: [PATCH 3/3] update explanation dashboard --- .../Responsible AI/Explanation Dashboard.ipynb | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/docs/Explore Algorithms/Responsible AI/Explanation Dashboard.ipynb b/docs/Explore Algorithms/Responsible AI/Explanation Dashboard.ipynb index 4b7211fb09..e06d6e07f1 100644 --- a/docs/Explore Algorithms/Responsible AI/Explanation Dashboard.ipynb +++ b/docs/Explore Algorithms/Responsible AI/Explanation Dashboard.ipynb @@ -13,6 +13,15 @@ "First we import the packages and define some UDFs we will need later." ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install numpy==1.21.6" + ] + }, { "cell_type": "code", "execution_count": null, @@ -21,7 +30,7 @@ }, "outputs": [], "source": [ - "%pip install raiwidgets itsdangerous==2.0.1 interpret-community numpy==1.21.6" + "%pip install raiwidgets itsdangerous==2.0.1 interpret-community" ] }, {