Skip to content

Commit

Permalink
Patch the Spark jars to make WASB access work
Browse files Browse the repository at this point in the history
Ideally, we would just add `hadoop-azure` to the SBT dependencies, but
that brings in some hadoop classes that are incompatible with the ones
that are included in Spark (since it uses an older hadoop version).
Instead, get the jar file directly into `lib/spark/jars`, and do the
same for `azure-storage` which is a dependency.
  • Loading branch information
elibarzilay committed Aug 17, 2017
1 parent 2426bf0 commit 8f7f01b
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 7 deletions.
16 changes: 10 additions & 6 deletions src/project/build.scala
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,17 @@ object Extras {
def sparkVer = env("SPARK_VERSION", null)

def commonLibs = Seq(
"org.apache.spark" %% "spark-core" % sparkVer % "provided",
"org.apache.spark" %% "spark-mllib" % sparkVer % "provided",
"org.scalatest" %% "scalatest" % "3.0.0" % "provided",
"org.apache.spark" %% "spark-core" % sparkVer % "provided",
"org.apache.spark" %% "spark-mllib" % sparkVer % "provided",
"org.scalatest" %% "scalatest" % "3.0.0" % "provided",
// should include these things in the distributed jar
"io.spray" %% "spray-json" % "1.3.2",
"com.microsoft.cntk" % "cntk" % "2.1",
"org.openpnp" % "opencv" % "3.2.0-1"
"io.spray" %% "spray-json" % "1.3.2",
"com.microsoft.cntk" % "cntk" % "2.1",
"org.openpnp" % "opencv" % "3.2.0-1"
// needed for wasb access, but it collides with the version that comes with Spark,
// so it gets installed manually for now (see "tools/config.sh")

// "org.apache.hadoop" % "hadoop-azure" % "2.7.3"
)
def overrideLibs = Set(
// spark wants 2.2.6, but we don't use its tests anyway
Expand Down
9 changes: 9 additions & 0 deletions tools/config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,15 @@ Spark.setup() {
echo " </property>"
echo "</configuration>"
} > "conf/hive-site.xml"
cd "jars"
# Patch the Spark jars: add hadoop-azure and azure-storage to make WASB access
# work. Ideally, we would just add `hadoop-azure` to the SBT dependencies,
# but that collides with the hadoop version that comes with Spark (see comment
# in "src/project/build.scala"). When/if spark is updated for a newer hadoop,
# then go back to the sbt route.
local mvn="http://central.maven.org/maven2"
_curl -O "$mvn/com/microsoft/azure/azure-storage/2.0.0/azure-storage-2.0.0.jar"
_curl -O "$mvn/org/apache/hadoop/hadoop-azure/2.7.3/hadoop-azure-2.7.3.jar"
}
Spark.init() {
local f; for f in "python/lib/"*.zip; do
Expand Down
2 changes: 1 addition & 1 deletion tools/runme/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ _retrieve_file() { # url file sha256
&& "$(< "$cache.sha256")" = "$sha256" ]]; then
_ ln -sf "$cache" "$target"; return
fi
_ curl --output "$target" $CURL_FLAGS "$url"
_curl --output "$target" "$url"
local sha256sum="$(__ sha256sum "$target")"; sha256sum="${sha256sum%% *}"
if [[ "x$sha256sum" = "x" ]]; then failwith "could not get sha256 checksum"; fi
if [[ "$sha256sum" != "$sha256" ]]; then
Expand Down
6 changes: 6 additions & 0 deletions tools/runme/utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,12 @@ azblob() {
az storage blob "$verb" --account-name "$MAIN_STORAGE" "$@"
}

# ---< _curl arg... >-----------------------------------------------------------
# Convenience for running curl as "_ curl $CURL_FLAGS arg...".
_curl() {
# NOTE: $CURL_FLAGS is intentionally left unquoted so that a string holding
# several flags (e.g. "--silent --show-error") word-splits into separate
# arguments; "$@" forwards the caller's arguments verbatim.  `_` is the
# project's command-runner helper (defined elsewhere in this file's toolkit).
_ curl $CURL_FLAGS "$@"
}

# ------------------------------------------------------------------------------
# Internal functions follow

Expand Down

0 comments on commit 8f7f01b

Please sign in to comment.