From 8f7f01b4da62aaff8855de39d80a334146721be3 Mon Sep 17 00:00:00 2001 From: Eli Barzilay Date: Thu, 10 Aug 2017 20:45:51 -0400 Subject: [PATCH] Patch the Spark jars to make WASB access work Ideally, we would just add `hadoop-azure` to the SBT dependencies, but that brings in some hadoop classes that are incompatible with the ones that are included in Spark (since it uses an older hadoop version). Instead, get the jar file directly into `lib/spark/jars`, and do the same for `azure-storage`, which is a dependency. --- src/project/build.scala | 16 ++++++++++------ tools/config.sh | 9 +++++++++ tools/runme/install.sh | 2 +- tools/runme/utils.sh | 6 ++++++ 4 files changed, 26 insertions(+), 7 deletions(-) diff --git a/src/project/build.scala b/src/project/build.scala index d5ee8364fd..e9b75ac345 100644 --- a/src/project/build.scala +++ b/src/project/build.scala @@ -26,13 +26,17 @@ object Extras { def sparkVer = env("SPARK_VERSION", null) def commonLibs = Seq( - "org.apache.spark" %% "spark-core" % sparkVer % "provided", - "org.apache.spark" %% "spark-mllib" % sparkVer % "provided", - "org.scalatest" %% "scalatest" % "3.0.0" % "provided", + "org.apache.spark" %% "spark-core" % sparkVer % "provided", + "org.apache.spark" %% "spark-mllib" % sparkVer % "provided", + "org.scalatest" %% "scalatest" % "3.0.0" % "provided", // should include these things in the distributed jar - "io.spray" %% "spray-json" % "1.3.2", - "com.microsoft.cntk" % "cntk" % "2.1", - "org.openpnp" % "opencv" % "3.2.0-1" + "io.spray" %% "spray-json" % "1.3.2", + "com.microsoft.cntk" % "cntk" % "2.1", + "org.openpnp" % "opencv" % "3.2.0-1" + // needed for wasb access, but it collides with the version that comes with Spark, + // so it gets installed manually for now (see "tools/config.sh") + + // "org.apache.hadoop" % "hadoop-azure" % "2.7.3" ) def overrideLibs = Set( // spark wants 2.2.6, but we don't use its tests anyway diff --git a/tools/config.sh b/tools/config.sh index ed677cd0da..7d93f8302d 100644 
--- a/tools/config.sh +++ b/tools/config.sh @@ -174,6 +174,15 @@ Spark.setup() { echo " " echo "" } > "conf/hive-site.xml" + cd "jars" + # Patch the Spark jars: add hadoop-azure and azure-storage to make WASB access + # work. Ideally, we would just add `hadoop-azure` to the SBT dependencies, + # but that collides with the hadoop version that comes with Spark (see comment + # in "src/project/build.scala"). When/if spark is updated for a newer hadoop, + # then go back to the sbt route. + local mvn="https://repo1.maven.org/maven2" + _curl -O "$mvn/com/microsoft/azure/azure-storage/2.0.0/azure-storage-2.0.0.jar" + _curl -O "$mvn/org/apache/hadoop/hadoop-azure/2.7.3/hadoop-azure-2.7.3.jar" } Spark.init() { local f; for f in "python/lib/"*.zip; do diff --git a/tools/runme/install.sh b/tools/runme/install.sh index 007cab3b1f..461fe4d34a 100644 --- a/tools/runme/install.sh +++ b/tools/runme/install.sh @@ -121,7 +121,7 @@ _retrieve_file() { # url file sha256 && "$(< "$cache.sha256")" = "$sha256" ]]; then _ ln -sf "$cache" "$target"; return fi - _ curl --output "$target" $CURL_FLAGS "$url" + _curl --output "$target" "$url" local sha256sum="$(__ sha256sum "$target")"; sha256sum="${sha256sum%% *}" if [[ "x$sha256sum" = "x" ]]; then failwith "could not get sha256 checksum"; fi if [[ "$sha256sum" != "$sha256" ]]; then diff --git a/tools/runme/utils.sh b/tools/runme/utils.sh index 100025b577..2ee890d0f7 100644 --- a/tools/runme/utils.sh +++ b/tools/runme/utils.sh @@ -348,6 +348,12 @@ azblob() { az storage blob "$verb" --account-name "$MAIN_STORAGE" "$@" } +# ---< _curl arg... >----------------------------------------------------------- +# Convenience for running curl as "_ curl $CURL_FLAGS arg...". +_curl() { + _ curl $CURL_FLAGS "$@" +} + # ------------------------------------------------------------------------------ # Internal functions follow