chore: update to spark 3.3.1 #1930

Closed
wants to merge 1 commit
19 changes: 9 additions & 10 deletions build.sbt
@@ -7,10 +7,10 @@ import scala.xml.transform.{RewriteRule, RuleTransformer}
import scala.xml.{Node => XmlNode, NodeSeq => XmlNodeSeq, _}

val condaEnvName = "synapseml"
-val sparkVersion = "3.4.1"
+val sparkVersion = "3.3.2"
name := "synapseml"
ThisBuild / organization := "com.microsoft.azure"
-ThisBuild / scalaVersion := "2.12.17"
+ThisBuild / scalaVersion := "2.12.15"

val scalaMajorVersion = 2.12

@@ -20,24 +20,23 @@ val excludes = Seq(
)

val coreDependencies = Seq(
-// Excluding protobuf-java, as spark-core is bringing the older version transitively.
-"org.apache.spark" %% "spark-core" % sparkVersion % "compile" exclude("com.google.protobuf", "protobuf-java"),
+"org.apache.spark" %% "spark-core" % sparkVersion % "compile",
"org.apache.spark" %% "spark-mllib" % sparkVersion % "compile",
-"org.apache.spark" %% "spark-avro" % sparkVersion % "compile",
+"org.apache.spark" %% "spark-avro" % sparkVersion % "provided",
"org.apache.spark" %% "spark-tags" % sparkVersion % "test",
"com.globalmentor" % "hadoop-bare-naked-local-fs" % "0.1.0" % "test",
"org.scalatest" %% "scalatest" % "3.2.14" % "test")
val extraDependencies = Seq(
"commons-lang" % "commons-lang" % "2.6",
"org.scalactic" %% "scalactic" % "3.2.14",
"io.spray" %% "spray-json" % "1.3.5",
"com.jcraft" % "jsch" % "0.1.54",
"org.apache.httpcomponents.client5" % "httpclient5" % "5.1.3",
"org.apache.httpcomponents" % "httpmime" % "4.5.13",
-"com.linkedin.isolation-forest" %% "isolation-forest_3.4.1" % "3.0.3",
-// Although breeze 2.1.0 is already provided by Spark, this is needed for Azure Synapse Spark 3.4 pools.
-// Otherwise a NoSuchMethodError will be thrown by interpretability code.
-"org.scalanlp" %% "breeze" % "2.1.0"
+"com.linkedin.isolation-forest" %% "isolation-forest_3.3.3" % "3.0.3",
+// Although breeze 1.2 is already provided by Spark, this is needed for Azure Synapse Spark 3.2 pools.
+// Otherwise a NoSuchMethodError will be thrown by interpretability code. This problem only happens
+// to Azure Synapse Spark 3.2 pools.
+"org.scalanlp" %% "breeze" % "1.2"
).map(d => d excludeAll (excludes: _*))
val dependencies = coreDependencies ++ extraDependencies

@@ -70,7 +70,7 @@ object PyCodegen {
// There's `Already borrowed` error found in transformers 4.16.2 when using tokenizers
s"""extras_require={"extras": [
| "cmake",
-| "horovod==0.28.1",
+| "horovod==0.27.0",
| "pytorch_lightning>=1.5.0,<1.5.10",
| "torch==1.13.1",
| "torchvision>=0.14.1",
@@ -101,7 +101,7 @@ object RTestGen {
| "spark.sql.shuffle.partitions=10",
| "spark.sql.crossJoin.enabled=true")
|
-|sc <- spark_connect(master = "local", version = "3.4.1", config = conf)
+|sc <- spark_connect(master = "local", version = "3.3.2", config = conf)
|
|""".stripMargin, StandardOpenOption.CREATE)
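For comparison, the same local test configuration can be expressed with PySpark; this is only a sketch (the app name is invented, not part of this repo) mirroring the sparklyr settings generated above:

    # Sketch: PySpark equivalent of the generated sparklyr test connection.
    from pyspark.sql import SparkSession

    spark = (
        SparkSession.builder
        .master("local[*]")
        .appName("synapseml-rtest-local")  # hypothetical app name
        .config("spark.sql.shuffle.partitions", "10")
        .config("spark.sql.crossJoin.enabled", "true")
        .getOrCreate()
    )
    assert spark.version.startswith("3.3"), spark.version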

@@ -30,11 +30,11 @@ object DatabricksUtilities {

// ADB Info
val Region = "eastus"
-val PoolName = "synapseml-build-13.3"
-val GpuPoolName = "synapseml-build-13.3-gpu"
-val AdbRuntime = "13.3.x-scala2.12"
-// https://docs.databricks.com/en/release-notes/runtime/13.3lts-ml.html
-val AdbGpuRuntime = "13.3.x-gpu-ml-scala2.12"
+val PoolName = "synapseml-build-12.2"
+val GpuPoolName = "synapseml-build-12.2-gpu"
+val AdbRuntime = "12.2.x-scala2.12"
+// https://learn.microsoft.com/en-us/azure/databricks/release-notes/runtime/
+val AdbGpuRuntime = "12.2.x-gpu-ml-scala2.12"
val NumWorkers = 5
val AutoTerminationMinutes = 15

@@ -82,9 +82,9 @@ object DatabricksUtilities {
Map("maven" -> Map("coordinates" -> PackageMavenCoordinate, "repo" -> PackageRepository)),
Map("pypi" -> Map("package" -> "pytorch-lightning==1.5.0")),
Map("pypi" -> Map("package" -> "torchvision==0.14.1")),
-Map("pypi" -> Map("package" -> "transformers==4.32.1")),
-Map("pypi" -> Map("package" -> "petastorm==0.12.0")),
-Map("pypi" -> Map("package" -> "protobuf==3.20.3"))
+Map("pypi" -> Map("package" -> "transformers==4.25.1")),
+Map("pypi" -> Map("package" -> "petastorm==0.12.1")),
+Map("pypi" -> Map("package" -> "protobuf==3.19.4"))
).toJson.compactPrint

val GPUInitScripts: String = List(
@@ -83,7 +83,7 @@ object SynapseExtensionUtilities {
|"{
| 'Default${store}ArtifactId': '$storeId',
| 'ExecutableFile': '$path',
-| 'SparkVersion':'3.4',
+| 'SparkVersion':'3.3',
| 'SparkSettings': {
| 'spark.jars.packages' : '$SparkMavenPackageList',
| 'spark.jars.repositories' : '$SparkMavenRepositoryList',
@@ -254,7 +254,7 @@ object SynapseUtilities {
| "nodeSizeFamily": "MemoryOptimized",
| "provisioningState": "Succeeded",
| "sessionLevelPackagesEnabled": "true",
-| "sparkVersion": "3.4"
+| "sparkVersion": "3.3"
| }
|}
|""".stripMargin
10 changes: 5 additions & 5 deletions deep-learning/src/main/python/horovod_installation.sh
@@ -8,9 +8,9 @@ set -eu
# Install prerequisite libraries that horovod depends on
pip install pytorch-lightning==1.5.0
pip install torchvision==0.14.1
-pip install transformers==4.32.1
+pip install transformers==4.25.1
pip install petastorm>=0.12.0
-pip install protobuf==3.20.3
+pip install protobuf==3.19.1

# Remove Outdated Signing Key:
sudo apt-key del 7fa2af80
@@ -35,13 +35,13 @@ libcusparse-dev-11-0=11.1.1.245-1

git clone --recursive https://github.com/horovod/horovod.git
cd horovod
-# git fetch origin refs/tags/v0.28.1:tags/v0.28.1
-git checkout 1d217b59949986d025f6db93c49943fb6b6cc78f
+# git fetch origin refs/tags/v0.27.0:tags/v0.27.0
+git checkout bfaca90d5cf66780a97d8799d4e1573855b64560
git checkout -b tmp-branch
rm -rf build/ dist/
HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_CUDA_HOME=/usr/local/cuda-11/ HOROVOD_WITH_PYTORCH=1 HOROVOD_WITHOUT_MXNET=1 \
/databricks/python3/bin/python setup.py bdist_wheel

readlink -f dist/horovod-*.whl

-pip install --no-cache-dir dist/horovod-0.28.1-cp38-cp38-linux_x86_64.whl --force-reinstall --no-deps
+pip install --no-cache-dir dist/horovod-0.27.0-cp38-cp38-linux_x86_64.whl --force-reinstall --no-deps
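After this script finishes, a quick sanity check can confirm the interpreter picked up the intended pins; this is only a sketch that restates the versions pinned above:

    # Sketch: verify the environment matches the pins used in this script.
    import horovod
    import torchvision
    import transformers

    expected = {"horovod": "0.27.0", "transformers": "4.25.1", "torchvision": "0.14.1"}
    found = {
        "horovod": horovod.__version__,
        "transformers": transformers.__version__,
        "torchvision": torchvision.__version__,
    }
    mismatched = {name: v for name, v in found.items() if not v.startswith(expected[name])}
    if mismatched:
        raise RuntimeError(f"unexpected versions: {mismatched}; expected {expected}")
    print("horovod build environment looks consistent")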
@@ -11,12 +11,12 @@
if _TRANSFORMERS_AVAILABLE:
import transformers

-_TRANSFORMERS_EQUAL_4_32_1 = transformers.__version__ == "4.32.1"
-if _TRANSFORMERS_EQUAL_4_32_1:
+_TRANSFORMERS_EQUAL_4_25_1 = transformers.__version__ == "4.25.1"
+if _TRANSFORMERS_EQUAL_4_25_1:
from transformers import AutoTokenizer
else:
raise RuntimeError(
-"transformers should be == 4.32.1, found: {}".format(
+"transformers should be == 4.25.1, found: {}".format(
transformers.__version__
)
)
@@ -19,10 +19,10 @@
if _HOROVOD_AVAILABLE:
import horovod

-_HOROVOD_EQUAL_0_28_1 = horovod.__version__ == "0.28.1"
-if not _HOROVOD_EQUAL_0_28_1:
+_HOROVOD_EQUAL_0_27_0 = horovod.__version__ == "0.27.0"
+if not _HOROVOD_EQUAL_0_27_0:
raise RuntimeError(
-"horovod should be of version 0.28.1, found: {}".format(horovod.__version__)
+"horovod should be of version 0.27.0, found: {}".format(horovod.__version__)
)
else:
raise ModuleNotFoundError("module not found: horovod")
@@ -13,12 +13,12 @@
if _TRANSFORMERS_AVAILABLE:
import transformers

-_TRANSFORMERS_EQUAL_4_32_1 = transformers.__version__ == "4.32.1"
-if _TRANSFORMERS_EQUAL_4_32_1:
+_TRANSFORMERS_EQUAL_4_25_1 = transformers.__version__ == "4.25.1"
+if _TRANSFORMERS_EQUAL_4_25_1:
from transformers import AutoModelForSequenceClassification
else:
raise RuntimeError(
-"transformers should be == 4.32.1, found: {}".format(
+"transformers should be == 4.25.1, found: {}".format(
transformers.__version__
)
)
@@ -16,24 +16,33 @@
},
{
"cell_type": "markdown",
-"source": [
-"### Environment Setup on databricks"
-],
"metadata": {
"collapsed": false
-}
+},
+"source": [
+"### Environment Setup on databricks"
+]
},
{
"cell_type": "code",
"execution_count": null,
+"metadata": {
+"collapsed": false
+},
"outputs": [],
"source": [
"# install cloudpickle 2.0.0 to add synapse module for usage of horovod\n",
"%pip install cloudpickle==2.0.0 --force-reinstall --no-deps"
-],
-"metadata": {
-"collapsed": false
-}
+]
},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"%pip install protobuf==3.20.1 --force-reinstall"
+]
+},
{
"cell_type": "code",
@@ -25,6 +25,15 @@
"%pip install cloudpickle==2.0.0 --force-reinstall --no-deps"
]
},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"%pip install protobuf==3.20.1 --force-reinstall"
+]
+},
{
"cell_type": "code",
"execution_count": null,
@@ -31,7 +31,7 @@
"metadata": {},
"outputs": [],
"source": [
-"%pip install hyperopt mlflow"
+"%pip install hyperopt mlflow==2.5.0"
]
},
{
6 changes: 3 additions & 3 deletions environment.yml
@@ -11,7 +11,7 @@ dependencies:
- r-devtools=2.4.2
- pip:
- pyarrow>=0.15.0
-- pyspark==3.4.1
+- pyspark==3.3.2
- pandas==1.2.5
- wheel
- sphinx==4.2.0
@@ -34,13 +34,13 @@ dependencies:
- numpy
- torch==1.13.1
- torchvision==0.14.1
-- horovod==0.28.1
+- horovod==0.27.0
- petastorm>=0.11.0
- pytorch_lightning==1.5.0
- onnxmltools==1.7.0
- matplotlib
- Pillow
-- transformers==4.32.1
+- transformers==4.25.1
- huggingface-hub>=0.8.1
- langchain==0.0.151
- openai==0.27.5
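The same Spark version is pinned here as in build.sbt, pipeline.yaml, start, and the Dockerfiles, so a quick local consistency check is cheap to run; a sketch only, not part of the repo:

    # Sketch: confirm the locally installed pyspark matches the 3.3.2 pin used across the repo.
    import pyspark

    assert pyspark.__version__ == "3.3.2", (
        f"expected pyspark 3.3.2, found {pyspark.__version__}"
    )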
2 changes: 1 addition & 1 deletion pipeline.yaml
@@ -527,7 +527,7 @@ jobs:
fi
sbt publishM2

-SPARK_VERSION=3.4.1
+SPARK_VERSION=3.3.2
HADOOP_VERSION=3
wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz
(timeout 20m sbt "project $(PACKAGE)" coverage testR) || (echo "retrying" && timeout 20m sbt "project $(PACKAGE)" coverage testR) || (echo "retrying" && timeout 20m sbt "project $(PACKAGE)" coverage testR)
3 changes: 1 addition & 2 deletions start
@@ -1,8 +1,7 @@
#!/bin/bash

export OPENMPI_VERSION="3.1.2"
-
-export SPARK_VERSION="3.4.1"
+export SPARK_VERSION="3.3.2"
export HADOOP_VERSION="3.3"
export SYNAPSEML_VERSION="0.11.4" # Binder compatibility version

2 changes: 1 addition & 1 deletion tools/docker/demo/Dockerfile
@@ -3,7 +3,7 @@ FROM mcr.microsoft.com/oss/mirror/docker.io/library/ubuntu:20.04
ARG SYNAPSEML_VERSION=0.11.4
ARG DEBIAN_FRONTEND=noninteractive

-ENV SPARK_VERSION=3.4.1
+ENV SPARK_VERSION=3.3.2
ENV HADOOP_VERSION=3
ENV SYNAPSEML_VERSION=${SYNAPSEML_VERSION}
ENV JAVA_HOME /usr/lib/jvm/java-1.11.0-openjdk-amd64
2 changes: 1 addition & 1 deletion tools/docker/minimal/Dockerfile
@@ -3,7 +3,7 @@ FROM mcr.microsoft.com/oss/mirror/docker.io/library/ubuntu:20.04
ARG SYNAPSEML_VERSION=0.11.4
ARG DEBIAN_FRONTEND=noninteractive

-ENV SPARK_VERSION=3.4.1
+ENV SPARK_VERSION=3.3.2
ENV HADOOP_VERSION=3
ENV SYNAPSEML_VERSION=${SYNAPSEML_VERSION}
ENV JAVA_HOME /usr/lib/jvm/java-1.11.0-openjdk-amd64
8 changes: 4 additions & 4 deletions tools/dotnet/dotnetSetup.sh
@@ -20,11 +20,11 @@ echo "##vso[task.setvariable variable=DOTNET_WORKER_DIR]$DOTNET_WORKER_DIR"
# Install Sleet
dotnet tool install -g sleet

-# Install Apache Spark-3.4.1
-curl https://archive.apache.org/dist/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz -o spark-3.4.1-bin-hadoop3.tgz
+# Install Apache Spark-3.3
+curl https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz -o spark-3.3.2-bin-hadoop3.tgz
mkdir ~/bin
-tar -xzvf spark-3.4.1-bin-hadoop3.tgz -C ~/bin
-export SPARK_HOME=~/bin/spark-3.4.1-bin-hadoop3/
+tar -xzvf spark-3.3.2-bin-hadoop3.tgz -C ~/bin
+export SPARK_HOME=~/bin/spark-3.3.2-bin-hadoop3/
export PATH=$SPARK_HOME/bin:$PATH
echo "##vso[task.setvariable variable=SPARK_HOME]$SPARK_HOME"
echo "##vso[task.setvariable variable=PATH]$SPARK_HOME/bin:$PATH"
2 changes: 1 addition & 1 deletion tools/tests/run_r_tests.R
@@ -3,7 +3,7 @@ if (!require("sparklyr")) {
library("sparklyr")
}

-spark_install_tar(paste(getwd(), "/../../../../../../spark-3.4.1-bin-hadoop3.tgz", sep = ""))
+spark_install_tar(paste(getwd(), "/../../../../../../spark-3.3.2-bin-hadoop3.tgz", sep = ""))

options("testthat.output_file" = "../../../../r-test-results.xml")
devtools::test(reporter = JunitReporter$new())