From fe04a23146f0c6b862f7a7deaf59ac838b0d23c3 Mon Sep 17 00:00:00 2001
From: Jeff Klukas
Date: Wed, 27 Jun 2018 16:25:28 -0400
Subject: [PATCH 1/4] Bug 1466936 - Distribute via spark-packages.org

This PR changes deployment for both the Python and Scala packages. In
addition to deploying to Mozilla's S3 Maven repo, we deploy to
spark-packages.org so that both Scala and Python bindings are available
by invoking Spark with the --packages option set.

We also move all CI to CircleCI 2.0, update some dependency versions, etc.
---
 .circleci/config.yml                       | 40 ++++++++++++
 .travis.yml                                | 19 ------
 README.md                                  | 13 +++-
 VERSION                                    |  2 +-
 build.sbt                                  | 16 +++--
 circle.yml                                 | 15 -----
 project/build.properties                   |  1 +
 project/plugins.sbt                        |  6 +-
 python/README.md                           | 29 ++++++---
 python/VERSION                             |  1 +
 python/pyspark_hyperloglog/__init__.py     |  3 +
 python/{src => pyspark_hyperloglog}/hll.py |  0
 python/setup.py                            | 61 +------------------
 python/src/__init__.py                     |  0
 python/tox.ini                             | 12 ++--
 .../sql/hyperloglog/test/HyperLogLog.scala |  2 +-
 16 files changed, 102 insertions(+), 118 deletions(-)
 create mode 100644 .circleci/config.yml
 delete mode 100644 .travis.yml
 delete mode 100644 circle.yml
 create mode 120000 python/VERSION
 create mode 100644 python/pyspark_hyperloglog/__init__.py
 rename python/{src => pyspark_hyperloglog}/hll.py (100%)
 delete mode 100644 python/src/__init__.py

diff --git a/.circleci/config.yml b/.circleci/config.yml
new file mode 100644
index 0000000..d4e1aa0
--- /dev/null
+++ b/.circleci/config.yml
@@ -0,0 +1,40 @@
+####################
+# CircleCI configuration reference:
+# https://circleci.com/docs/2.0/configuration-reference
+####################
+# CircleCI built-in environment variables:
+# https://circleci.com/docs/2.0/env-vars/#built-in-environment-variables
+####################
+
+version: 2
+jobs:
+  build:
+    docker:
+      - image: mozilla/sbt:8u171_0.13.13
+    steps:
+      - checkout
+      - run:
+          name: Test
+          command: |
+            sbt coverage scalastyle test:scalastyle test coverageReport
+      - run:
+          name: Submit coverage data
+          command: |
+            bash <(curl -s https://codecov.io/bash)
+      - run:
+          name: Early return if this build is from a forked PR
+          command: |
+            if [ -n "$CIRCLE_PR_NUMBER" ] || [ "$CIRCLE_BRANCH" != "master" ]; then
+              echo "Stopping here; we only publish for pushes to the master branch"
+              circleci step halt
+            fi
+      - run:
+          name: Publish
+          command: |
+            sbt publish spPublish
+
+workflows:
+  version: 2
+  build:
+    jobs:
+      - build
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index 82aa5dc..0000000
--- a/.travis.yml
+++ /dev/null
@@ -1,19 +0,0 @@
-language: scala
-sudo: false
-cache:
-  directories:
-    - $HOME/.ivy2
-matrix:
-  include:
-    - jdk: openjdk7
-      scala: 2.10.5
-      env: TEST_SPARK_VERSION="1.6.0"
-    - jdk: openjdk7
-      scala: 2.11.7
-      env: TEST_SPARK_VERSION="1.6.0"
-script:
-  - sbt -Dspark.testVersion=$TEST_SPARK_VERSION ++$TRAVIS_SCALA_VERSION coverage test
-  - sbt ++$TRAVIS_SCALA_VERSION scalastyle
-  - sbt ++$TRAVIS_SCALA_VERSION "test:scalastyle"
-after_success:
-  - bash <(curl -s https://codecov.io/bash)
diff --git a/README.md b/README.md
index f1f4ff2..f76d9b4 100644
--- a/README.md
+++ b/README.md
@@ -3,10 +3,17 @@ Algebird's HyperLogLog support for Apache Spark. This package can be used in con
 with [presto-hyperloglog](https://github.com/vitillo/presto-hyperloglog) to share HyperLogLog
 sets between Spark and Presto.
 
-[![Build Status](https://travis-ci.org/vitillo/spark-hyperloglog.svg?branch=master)](https://travis-ci.org/vitillo/spark-hyperloglog)
-[![codecov.io](https://codecov.io/github/vitillo/spark-hyperloglog/coverage.svg?branch=master)](https://codecov.io/github/vitillo/spark-hyperloglog?branch=master)
+[![codecov.io](https://codecov.io/github/mozilla/spark-hyperloglog/coverage.svg?branch=master)](https://codecov.io/github/mozilla/spark-hyperloglog?branch=master)
 [![CircleCi](https://circleci.com/gh/mozilla/spark-hyperloglog.svg?style=shield&circle-token=5506f56072f0198ece2995a8539c174cc648c9e4)](https://circleci.com/gh/mozilla/spark-hyperloglog)
 
+### Installing
+
+This project is published as
+[mozilla/spark-hyperloglog](https://spark-packages.org/package/mozilla/spark-hyperloglog)
+on spark-packages.org, so it is available via:
+
+    spark-shell --packages mozilla:spark-hyperloglog:2.2.0
+
 ### Example usage
 
 ```scala
@@ -38,4 +45,4 @@ yields:
 ### Deployment
 
 Any commits to master should also trigger a circleci build that will do the sbt publishing for you
-to our local maven repo in s3.
+to our local maven repo in s3 and to spark-packages.org.
diff --git a/VERSION b/VERSION
index 176867e..ccbccc3 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.1.1-SNAPSHOT
+2.2.0
diff --git a/build.sbt b/build.sbt
index c2c275b..37f308d 100644
--- a/build.sbt
+++ b/build.sbt
@@ -1,21 +1,29 @@
 name := "spark-hyperloglog"
 
-version := scala.io.Source.fromFile("VERSION").mkString
+version := scala.io.Source.fromFile("VERSION").mkString.stripLineEnd
 
 scalaVersion := "2.11.8"
 
 organization := "com.mozilla.telemetry"
 
+// As required by https://github.com/databricks/sbt-spark-package#spark-package-developers
+spName := "mozilla/spark-hyperloglog"
+spShortDescription := "Algebird's HyperLogLog support for Apache Spark"
+spDescription := "Algebird's HyperLogLog support for Apache Spark"
 sparkVersion := "2.0.2"
-
-sparkComponents ++= Seq("core", "sql")
+sparkComponents ++= Seq("sql")
 
 libraryDependencies ++= Seq(
   "org.scalatest" %% "scalatest" % "2.2.6" % "test",
   "com.twitter" %% "algebird-core" % "0.12.0"
 )
 
-credentials += Credentials(Path.userHome / ".ivy2" / ".sbtcredentials")
+// Appropriate environment variables for publishing are provided in the CircleCI environment.
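+// sbt-spark-package's spPublish task authenticates to spark-packages.org with a GitHub
+// username and personal access token; the empty-string fallbacks keep local,
+// non-publishing builds from failing when these variables are unset.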
+credentials += Credentials(
+  "Spark Packages Realm",
+  "spark-packages.org",
+  sys.env.getOrElse("GITHUB_USERNAME", ""),
+  sys.env.getOrElse("GITHUB_PERSONAL_ACCESS_TOKEN", ""))
 
 publishMavenStyle := true
diff --git a/circle.yml b/circle.yml
deleted file mode 100644
index 26457ce..0000000
--- a/circle.yml
+++ /dev/null
@@ -1,15 +0,0 @@
-machine:
-  pre:
-    # Install sbt 0.13.16
-    - sudo apt-get install openjdk-8-jdk
-    - wget -q https://dl.bintray.com/sbt/debian/sbt-0.13.16.deb
-    - sudo dpkg -i sbt-0.13.16.deb
-  cache_directories:
-    - "~/.ivy2"
-    - "~/.sbt"
-
-deployment:
-  latest:
-    branch: master
-    commands:
-      - sbt publish
diff --git a/project/build.properties b/project/build.properties
index 27e88aa..398fe81 100644
--- a/project/build.properties
+++ b/project/build.properties
@@ -1 +1,2 @@
+# sbt-spark-package does not yet support sbt 1.x
 sbt.version=0.13.13
diff --git a/project/plugins.sbt b/project/plugins.sbt
index b37d512..3d9e826 100644
--- a/project/plugins.sbt
+++ b/project/plugins.sbt
@@ -2,10 +2,10 @@ resolvers += "bintray-spark-packages" at "https://dl.bintray.com/spark-packages/
 
 addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.6")
 
-addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.1.0")
+addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.1")
 
-addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.8.0")
+addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0")
 
-addSbtPlugin("com.frugalmechanic" % "fm-sbt-s3-resolver" % "0.12.0")
+addSbtPlugin("com.frugalmechanic" % "fm-sbt-s3-resolver" % "0.14.0")
 
 addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6")
diff --git a/python/README.md b/python/README.md
index 842f8c9..1ab878b 100644
--- a/python/README.md
+++ b/python/README.md
@@ -4,11 +4,12 @@ Python bindings for the spark-hyperloglog package.
 
 ## Usage
 
-Include the bindings in your project.
+The Python bindings are included in the distribution on spark-packages.org,
+so they should be automatically available if the spark-hyperloglog library
+is loaded on the cluster or specified via `--packages`
+(but see the section below about EMR for caveats on that platform):
 
-```bash
-pip install pyspark_hyperloglog
-```
+    pyspark --packages mozilla:spark-hyperloglog:2.2.0
 
 The package will register itself with the current pyspark installation location
 in the current site-packages. This allows for tests against spark in standalone mode.
@@ -41,8 +42,8 @@ User Defined Functions.
 
 ## Building
 
 In the top-level directory, build the `spark-hyperloglog` package.
-
- ```bash
+
+```bash
 sbt assembly
 ```
@@ -55,8 +56,20 @@ pip install dist/*.tar.gz
 ```
 
 ## Tests
 
-Tests are run using tox.
+Tests are run using tox and assume you've already run `sbt assembly` as discussed in the previous section:
 
 ```bash
-tox
+PYSPARK_SUBMIT_ARGS="--jars ../target/scala-2.11/spark-hyperloglog-assembly-*.jar pyspark-shell" tox
+```
+
+## Using the package on Amazon EMR
+
+EMR does not correctly build the Python environment to include Python code from
+Spark packages, but you can work around this in your PySpark session via:
+
+```python
+import sys
+
+pyfiles = str(sc.getConf().get(u'spark.submit.pyFiles')).split(',')
+sys.path.extend(pyfiles)
 ```
diff --git a/python/VERSION b/python/VERSION
new file mode 120000
index 0000000..6ff19de
--- /dev/null
+++ b/python/VERSION
@@ -0,0 +1 @@
+../VERSION
\ No newline at end of file
diff --git a/python/pyspark_hyperloglog/__init__.py b/python/pyspark_hyperloglog/__init__.py
new file mode 100644
index 0000000..5a365cf
--- /dev/null
+++ b/python/pyspark_hyperloglog/__init__.py
@@ -0,0 +1,3 @@
+from . import hll
+
+__all__ = ['hll']
diff --git a/python/src/hll.py b/python/pyspark_hyperloglog/hll.py
similarity index 100%
rename from python/src/hll.py
rename to python/pyspark_hyperloglog/hll.py
diff --git a/python/setup.py b/python/setup.py
index 81bc506..a73ed67 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -1,54 +1,8 @@
-""""
-This build script is modeled after the pyspark package in the apache/spark
-repository.
-
-https://github.com/apache/spark/blob/master/python/setup.py
-"""
-
 from setuptools import setup
-import os
-import glob
-import sys
-import shutil
-
-# read the version file in the package or in the root project directory
-version_file = "VERSION" if os.path.isfile("VERSION") else "../VERSION"
-with open(version_file, 'r') as f:
+
+with open('VERSION', 'r') as f:
     VERSION = f.read().strip()
 
-JARS_TARGET = 'deps/jars'
-JAR_FILE = "*-assembly-{}.jar".format(VERSION)
-
-
-is_packaging = (
-    os.path.isfile("../build.sbt") and
-    not os.path.isfile(os.path.join(JARS_TARGET, JAR_FILE))
-)
-
-if is_packaging:
-    SPARK_HLL_HOME = os.path.abspath("../")
-    JAR_PATH = glob.glob(os.path.join(
-        SPARK_HLL_HOME, "target/scala-*", JAR_FILE))
-
-    if len(JAR_PATH) != 1:
-        print("Could not find assembled jar")
-        sys.exit(-1)
-
-    JAR_PATH = JAR_PATH[0]
-
-    try:
-        os.makedirs(JARS_TARGET)
-    except:
-        print("Temporary path to jars already exists {}".format(JARS_TARGET))
-        sys.exit(-1)
-
-    os.symlink(JAR_PATH, os.path.join(JARS_TARGET, os.path.basename(JAR_PATH)))
-    os.symlink("../VERSION", "VERSION")
-else:
-    if not os.path.exists(JARS_TARGET):
-        print("The jar folder must exist")
-
 setup(
     name='pyspark-hyperloglog',
     version=VERSION.split('-')[0],
@@ -59,7 +13,6 @@
     url='https://github.com/mozilla/spark-hyperloglog',
     packages=[
         'pyspark_hyperloglog',
-        'pyspark.jars'
     ],
     install_requires=['pyspark'],
     extras_require={
@@ -68,16 +21,4 @@
             'tox'
         ]
     },
-    include_package_data=True,
-    package_dir={
-        'pyspark_hyperloglog': 'src',
-        'pyspark.jars': 'deps/jars'
-    },
-    package_data={
-        'pyspark.jars': ['*.jar']
-    },
 )
-
-if is_packaging:
-    shutil.rmtree('deps')
-    os.remove("VERSION")
diff --git a/python/src/__init__.py b/python/src/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/python/tox.ini b/python/tox.ini
index 612baa0..d92183e 100644
--- a/python/tox.ini
+++ b/python/tox.ini
@@ -1,7 +1,11 @@
-
 [tox]
-envlist = py2.7
+envlist = py27
 
 [testenv]
-extras = dev
-commands = pytest {posargs}
\ No newline at end of file
+deps =
+    pytest
+    pyspark
+passenv =
+    PYSPARK_SUBMIT_ARGS
+commands =
+    pytest {posargs}
diff --git a/src/test/scala/com/mozilla/spark/sql/hyperloglog/test/HyperLogLog.scala b/src/test/scala/com/mozilla/spark/sql/hyperloglog/test/HyperLogLog.scala
index 98065ec..e75a12a 100644
--- a/src/test/scala/com/mozilla/spark/sql/hyperloglog/test/HyperLogLog.scala
+++ b/src/test/scala/com/mozilla/spark/sql/hyperloglog/test/HyperLogLog.scala
@@ -84,7 +84,7 @@ class HyperLogLogTest extends FlatSpec with Matchers with BeforeAndAfterAll {
     rows(0)(0) should be (2)
   }
 
-  override def afterAll = {
+  override def afterAll: Unit = {
     spark.stop
   }
 }

From f16066b82d9d61e4535de2c1b2618e3b4b3c0607 Mon Sep 17 00:00:00 2001
From: Jeff Klukas
Date: Thu, 28 Jun 2018 11:42:13 -0400
Subject: [PATCH 2/4] Avoid pyspark UserWarning

---
 python/pyspark_hyperloglog/hll.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/python/pyspark_hyperloglog/hll.py b/python/pyspark_hyperloglog/hll.py
index 62f22ac..344012d 100644
--- a/python/pyspark_hyperloglog/hll.py
+++ b/python/pyspark_hyperloglog/hll.py
@@ -1,11 +1,10 @@
-from pyspark.sql import SparkSession
+from pyspark.sql import SparkSession, Row
 from pyspark.sql.functions import expr
 
-
 def register():
     spark = SparkSession.builder.getOrCreate()
     # NOTE: at least one dataframe should be created before registration
-    spark.createDataFrame([{'a': 1}]).count()
+    spark.createDataFrame([Row(a=1)]).count()
     sc = spark.sparkContext
     sc._jvm.com.mozilla.spark.sql.hyperloglog.functions.package.registerUdf()

From f221cf9f8a8f4fb343edb1de94021d29c1620ce2 Mon Sep 17 00:00:00 2001
From: Jeff Klukas
Date: Fri, 29 Jun 2018 11:43:16 -0400
Subject: [PATCH 3/4] Move CI publish to a separate job

---
 .circleci/config.yml | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index d4e1aa0..0b76194 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -8,7 +8,7 @@
 
 version: 2
 jobs:
-  build:
+  test:
    docker:
       - image: mozilla/sbt:8u171_0.13.13
     steps:
       - checkout
       - run:
@@ -21,13 +21,12 @@
           name: Submit coverage data
           command: |
             bash <(curl -s https://codecov.io/bash)
-      - run:
-          name: Early return if this build is from a forked PR
-          command: |
-            if [ -n "$CIRCLE_PR_NUMBER" ] || [ "$CIRCLE_BRANCH" != "master" ]; then
-              echo "Stopping here; we only publish for pushes to the master branch"
-              circleci step halt
-            fi
+
+  publish:
+    docker:
+      - image: mozilla/sbt:8u171_0.13.13
+    steps:
+      - checkout
       - run:
           name: Publish
           command: |
@@ -35,6 +34,13 @@
 
 workflows:
   version: 2
-  build:
+  test-publish:
     jobs:
-      - build
+      - test
+      # Only publish if tests pass and this is a commit to master
+      - publish:
+          requires:
+            - test
+          filters:
+            branches:
+              only: master

From ec3f75fba5854b28a805b488b25fdc1858de57ec Mon Sep 17 00:00:00 2001
From: Jeff Klukas
Date: Fri, 29 Jun 2018 12:38:11 -0400
Subject: [PATCH 4/4] Comment on publish job in circle config

---
 .circleci/config.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 0b76194..0a6e8dc 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -22,6 +22,7 @@
           command: |
             bash <(curl -s https://codecov.io/bash)
 
+  # The publish job only gets scheduled for commits to master; see workflows section below
   publish:
     docker:
       - image: mozilla/sbt:8u171_0.13.13
@@ -37,7 +38,6 @@ workflows:
   test-publish:
     jobs:
       - test
-      # Only publish if tests pass and this is a commit to master
       - publish:
           requires:
             - test
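
For reference, here is a minimal end-to-end sketch of a PySpark session once this series is published. It assumes the Scala package registers UDFs named `hll_create`, `hll_merge`, and `hll_cardinality`, and that 12 is a reasonable precision argument; check the top-level README for the authoritative names and signatures:

```python
# Hypothetical session; launch with:
#   pyspark --packages mozilla:spark-hyperloglog:2.2.0
from pyspark.sql import Row, SparkSession

from pyspark_hyperloglog import hll

spark = SparkSession.builder.getOrCreate()

# register() is defined in hll.py above: it creates a throwaway DataFrame,
# then calls into the JVM to register the HyperLogLog UDFs with this session.
hll.register()

spark.createDataFrame(
    [Row(uid='alice'), Row(uid='bob'), Row(uid='alice')]
).createOrReplaceTempView('clients')

# hll_create builds one sketch per row (assumed 12 bits of precision),
# hll_merge combines sketches as an aggregate, and hll_cardinality
# estimates the distinct count (here, 2).
spark.sql("""
    SELECT hll_cardinality(hll_merge(hll)) AS distinct_uids
    FROM (SELECT hll_create(uid, 12) AS hll FROM clients)
""").show()
```

On EMR, remember to extend `sys.path` as shown in the python/README changes above before importing `pyspark_hyperloglog`.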