draft: Spark33rebase #2075

Closed
wants to merge 4 commits

build.sbt (1 addition, 1 deletion)
@@ -8,7 +8,7 @@ import scala.xml.transform.{RewriteRule, RuleTransformer}
 import scala.xml.{Node => XmlNode, NodeSeq => XmlNodeSeq, _}

 val condaEnvName = "synapseml"
-val sparkVersion = "3.2.3"
+val sparkVersion = "3.3.2"
 name := "synapseml"
 ThisBuild / organization := "com.microsoft.azure"
 ThisBuild / scalaVersion := "2.12.15"

@@ -68,11 +68,11 @@ object PyCodegen {
 // There's `Already borrowed` error found in transformers 4.16.2 when using tokenizers
 s"""extras_require={"extras": [
 | "cmake",
-| "horovod==0.25.0",
+| "horovod==0.27.0",
 | "pytorch_lightning>=1.5.0,<1.5.10",
-| "torch==1.11.0",
-| "torchvision>=0.12.0",
-| "transformers==4.15.0",
+| "torch==1.13.1",
+| "torchvision>=0.14.1",
+| "transformers==4.32.1",
 | "petastorm>=0.12.0",
 | "huggingface-hub>=0.8.1",
 |]},

@@ -101,7 +101,7 @@ object RTestGen {
 | "spark.sql.shuffle.partitions=10",
 | "spark.sql.crossJoin.enabled=true")
 |
-|sc <- spark_connect(master = "local", version = "3.2.4", config = conf)
+|sc <- spark_connect(master = "local", version = "3.3.2", config = conf)
 |
 |""".stripMargin, StandardOpenOption.CREATE)

@@ -11,11 +11,7 @@ import java.io.File
 import scala.collection.mutable.ListBuffer

 class DatabricksGPUTests extends DatabricksTestHelper {
-val horovodInstallationScript: File = FileUtilities.join(
-BuildInfo.baseDirectory.getParent, "deep-learning",
-"src", "main", "python", "horovod_installation.sh").getCanonicalFile
-uploadFileToDBFS(horovodInstallationScript, "/FileStore/horovod-fix-commit/horovod_installation.sh")
-val clusterId: String = createClusterInPool(GPUClusterName, AdbGpuRuntime, 2, GpuPoolId, GPUInitScripts)
+val clusterId: String = createClusterInPool(GPUClusterName, AdbGpuRuntime, 2, GpuPoolId, "[]")
 val jobIdsToCancel: ListBuffer[Int] = databricksTestHelper(
 clusterId, GPULibraries, GPUNotebooks)

@@ -29,10 +29,11 @@ object DatabricksUtilities {

 // ADB Info
 val Region = "eastus"
-val PoolName = "synapseml-build-10.4"
-val GpuPoolName = "synapseml-build-10.4-gpu"
-val AdbRuntime = "10.4.x-scala2.12"
-val AdbGpuRuntime = "10.4.x-gpu-ml-scala2.12"
+val PoolName = "synapseml-build-12.2"
+val GpuPoolName = "synapseml-build-12.2-gpu"
+val AdbRuntime = "12.2.x-scala2.12"
+// https://learn.microsoft.com/en-us/azure/databricks/release-notes/runtime/
+val AdbGpuRuntime = "12.2.x-gpu-ml-scala2.12"
 val NumWorkers = 5
 val AutoTerminationMinutes = 15

@@ -75,8 +76,11 @@ object DatabricksUtilities {
 // TODO: install synapse.ml.dl wheel package here
 val GPULibraries: String = List(
 Map("maven" -> Map("coordinates" -> PackageMavenCoordinate, "repo" -> PackageRepository)),
-Map("pypi" -> Map("package" -> "transformers==4.15.0")),
-Map("pypi" -> Map("package" -> "petastorm==0.12.0"))
+Map("pypi" -> Map("package" -> "pytorch-lightning==1.5.0")),
+Map("pypi" -> Map("package" -> "torchvision==0.14.1")),
+Map("pypi" -> Map("package" -> "transformers==4.25.1")),
+Map("pypi" -> Map("package" -> "petastorm==0.12.1")),
+Map("pypi" -> Map("package" -> "protobuf==3.19.4"))
 ).toJson.compactPrint

 val GPUInitScripts: String = List(

@@ -255,7 +255,7 @@ object SynapseUtilities {
 | "nodeSizeFamily": "MemoryOptimized",
 | "provisioningState": "Succeeded",
 | "sessionLevelPackagesEnabled": "true",
-| "sparkVersion": "3.2"
+| "sparkVersion": "3.3"
 | }
 |}
 |""".stripMargin

deep-learning/src/main/python/horovod_installation.sh (6 additions, 9 deletions)
@@ -7,10 +7,10 @@ set -eu

 # Install prerequisite libraries that horovod depends on
 pip install pytorch-lightning==1.5.0
-pip install torchvision==0.12.0
-pip install transformers==4.15.0
+pip install torchvision==0.14.1
+pip install transformers==4.25.1
 pip install petastorm>=0.12.0
-pip install protobuf==3.20.3
+pip install protobuf==3.19.1

 # Remove Outdated Signing Key:
 sudo apt-key del 7fa2af80

@@ -35,16 +35,13 @@ libcusparse-dev-11-0=11.1.1.245-1

 git clone --recursive https://github.com/horovod/horovod.git
 cd horovod
-# # fix version 0.25.0
-# git fetch origin refs/tags/v0.25.0:tags/v0.25.0
-# git checkout tags/v0.25.0 -b v0.25.0-branch
-# fix to this commit number until they release a new version
-git checkout ab97fd15bbba3258adcdd12983f36a1cdeacbc94
+# git fetch origin refs/tags/v0.27.0:tags/v0.27.0
+git checkout bfaca90d5cf66780a97d8799d4e1573855b64560
 git checkout -b tmp-branch
 rm -rf build/ dist/
 HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_CUDA_HOME=/usr/local/cuda-11/ HOROVOD_WITH_PYTORCH=1 HOROVOD_WITHOUT_MXNET=1 \
 /databricks/python3/bin/python setup.py bdist_wheel

 readlink -f dist/horovod-*.whl

-pip install --no-cache-dir dist/horovod-0.25.0-cp38-cp38-linux_x86_64.whl --force-reinstall --no-deps
+pip install --no-cache-dir dist/horovod-0.27.0-cp38-cp38-linux_x86_64.whl --force-reinstall --no-deps

@@ -11,12 +11,12 @@
 if _TRANSFORMERS_AVAILABLE:
     import transformers

-    _TRANSFORMERS_EQUAL_4_15_0 = transformers.__version__ == "4.15.0"
-    if _TRANSFORMERS_EQUAL_4_15_0:
+    _TRANSFORMERS_EQUAL_4_25_1 = transformers.__version__ == "4.25.1"
+    if _TRANSFORMERS_EQUAL_4_25_1:
         from transformers import AutoTokenizer
     else:
         raise RuntimeError(
-            "transformers should be == 4.15.0, found: {}".format(
+            "transformers should be == 4.25.1, found: {}".format(
                 transformers.__version__
             )
         )

@@ -19,10 +19,10 @@
 if _HOROVOD_AVAILABLE:
     import horovod

-    _HOROVOD_EQUAL_0_25_0 = horovod.__version__ == "0.25.0"
-    if not _HOROVOD_EQUAL_0_25_0:
+    _HOROVOD_EQUAL_0_27_0 = horovod.__version__ == "0.27.0"
+    if not _HOROVOD_EQUAL_0_27_0:
         raise RuntimeError(
-            "horovod should be of version 0.25.0, found: {}".format(horovod.__version__)
+            "horovod should be of version 0.27.0, found: {}".format(horovod.__version__)
         )
 else:
     raise ModuleNotFoundError("module not found: horovod")

@@ -13,12 +13,12 @@
 if _TRANSFORMERS_AVAILABLE:
     import transformers

-    _TRANSFORMERS_EQUAL_4_15_0 = transformers.__version__ == "4.15.0"
-    if _TRANSFORMERS_EQUAL_4_15_0:
+    _TRANSFORMERS_EQUAL_4_25_1 = transformers.__version__ == "4.25.1"
+    if _TRANSFORMERS_EQUAL_4_25_1:
         from transformers import AutoModelForSequenceClassification
     else:
         raise RuntimeError(
-            "transformers should be == 4.15.0, found: {}".format(
+            "transformers should be == 4.25.1, found: {}".format(
                 transformers.__version__
             )
         )

@@ -30,16 +30,16 @@
 {
 "cell_type": "code",
 "execution_count": null,
-"outputs": [],
-"source": [
-"%pip install sqlparse raiwidgets interpret-community mlflow==2.6.0"
-],
 "metadata": {
 "collapsed": false,
 "pycharm": {
 "name": "#%%\n"
 }
-}
+},
+"outputs": [],
+"source": [
+"%pip install sqlparse raiwidgets interpret-community mlflow==2.5.0"
+]
 },
 {
 "cell_type": "markdown",

@@ -16,24 +16,33 @@
 },
 {
 "cell_type": "markdown",
-"source": [
-"### Environment Setup on databricks"
-],
 "metadata": {
 "collapsed": false
-}
+},
+"source": [
+"### Environment Setup on databricks"
+]
 },
 {
 "cell_type": "code",
 "execution_count": null,
+"metadata": {
+"collapsed": false
+},
 "outputs": [],
 "source": [
 "# install cloudpickle 2.0.0 to add synapse module for usage of horovod\n",
 "%pip install cloudpickle==2.0.0 --force-reinstall --no-deps"
-],
-"metadata": {
-"collapsed": false
-}
+]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"%pip install protobuf==3.20.1 --force-reinstall"
+]
+},
 {
 "cell_type": "code",

@@ -25,6 +25,15 @@
 "%pip install cloudpickle==2.0.0 --force-reinstall --no-deps"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"%pip install protobuf==3.20.1 --force-reinstall"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,

@@ -31,7 +31,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"%pip install hyperopt mlflow"
+"%pip install hyperopt mlflow==2.5.0"
 ]
 },
 {