Skip to content

Commit

Permalink
Patch the Spark jars to make WASB access work
Browse files Browse the repository at this point in the history
Ideally, we would just add `hadoop-azure` to the SBT dependencies, but
that brings in some hadoop classes that are incompatible with the ones
that are included in Spark (since it uses an older hadoop version).
Instead, get the jar file directly into `lib/spark/jars`, and do the
same for `azure-storage` which is a dependency.
  • Loading branch information
elibarzilay committed Aug 17, 2017
1 parent 2426bf0 commit 8f7f01b
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 7 deletions.
16 changes: 10 additions & 6 deletions src/project/build.scala
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,17 @@ object Extras {
def sparkVer = env("SPARK_VERSION", null)

def commonLibs = Seq(
"org.apache.spark" %% "spark-core" % sparkVer % "provided",
"org.apache.spark" %% "spark-mllib" % sparkVer % "provided",
"org.scalatest" %% "scalatest" % "3.0.0" % "provided",
"org.apache.spark" %% "spark-core" % sparkVer % "provided",
"org.apache.spark" %% "spark-mllib" % sparkVer % "provided",
"org.scalatest" %% "scalatest" % "3.0.0" % "provided",
// should include these things in the distributed jar
"io.spray" %% "spray-json" % "1.3.2",
"com.microsoft.cntk" % "cntk" % "2.1",
"org.openpnp" % "opencv" % "3.2.0-1"
"io.spray" %% "spray-json" % "1.3.2",
"com.microsoft.cntk" % "cntk" % "2.1",
"org.openpnp" % "opencv" % "3.2.0-1"
// needed for wasb access, but it collides with the version that comes with Spark,
// so it gets installed manually for now (see "tools/config.sh")

// "org.apache.hadoop" % "hadoop-azure" % "2.7.3"
)
def overrideLibs = Set(
// spark wants 2.2.6, but we don't use its tests anyway
Expand Down
9 changes: 9 additions & 0 deletions tools/config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,15 @@ Spark.setup() {
echo " </property>"
echo "</configuration>"
} > "conf/hive-site.xml"
cd "jars"
# Patch the Spark jars: add hadoop-azure and azure-storage to make WASB access
# work. Ideally, we would just add `hadoop-azure` to the SBT dependencies,
# but that collides with the hadoop version that comes with Spark (see comment
# in "src/project/build.scala"). When/if spark is updated for a newer hadoop,
# then go back to the sbt route.
local mvn="http://central.maven.org/maven2"
_curl -O "$mvn/com/microsoft/azure/azure-storage/2.0.0/azure-storage-2.0.0.jar"
_curl -O "$mvn/org/apache/hadoop/hadoop-azure/2.7.3/hadoop-azure-2.7.3.jar"
}
Spark.init() {
local f; for f in "python/lib/"*.zip; do
Expand Down
2 changes: 1 addition & 1 deletion tools/runme/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ _retrieve_file() { # url file sha256
&& "$(< "$cache.sha256")" = "$sha256" ]]; then
_ ln -sf "$cache" "$target"; return
fi
_ curl --output "$target" $CURL_FLAGS "$url"
_curl --output "$target" "$url"
local sha256sum="$(__ sha256sum "$target")"; sha256sum="${sha256sum%% *}"
if [[ "x$sha256sum" = "x" ]]; then failwith "could not get sha256 checksum"; fi
if [[ "$sha256sum" != "$sha256" ]]; then
Expand Down
6 changes: 6 additions & 0 deletions tools/runme/utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,12 @@ azblob() {
az storage blob "$verb" --account-name "$MAIN_STORAGE" "$@"
}

# ---< _curl arg... >-----------------------------------------------------------
# Convenience for running curl as "_ curl $CURL_FLAGS arg...".
_curl() {
# NOTE: $CURL_FLAGS is intentionally left unquoted so that a string holding
# several flags (e.g. "--silent --show-error") word-splits into separate
# arguments; "$@" forwards the caller's arguments verbatim.  `_` is the
# project's command-runner helper (defined elsewhere in this file's toolkit).
_ curl $CURL_FLAGS "$@"
}

# ------------------------------------------------------------------------------
# Internal functions follow

Expand Down

0 comments on commit 8f7f01b

Please sign in to comment.