diff --git a/.circleci/config.yml b/.circleci/config.yml
new file mode 100644
index 0000000..0a6e8dc
--- /dev/null
+++ b/.circleci/config.yml
@@ -0,0 +1,46 @@
+####################
+# CircleCI configuration reference:
+#   https://circleci.com/docs/2.0/configuration-reference
+####################
+# CircleCI built-in environment variables:
+#   https://circleci.com/docs/2.0/env-vars/#built-in-environment-variables
+####################
+
+version: 2
+jobs:
+  test:
+    docker:
+      - image: mozilla/sbt:8u171_0.13.13
+    steps:
+      - checkout
+      - run:
+          name: Test
+          command: |
+            sbt coverage scalastyle test:scalastyle test coverageReport
+      - run:
+          name: Submit coverage data
+          command: |
+            bash <(curl -s https://codecov.io/bash)
+
+  # The publish job only gets scheduled for commits to master; see the workflows section below.
+  publish:
+    docker:
+      - image: mozilla/sbt:8u171_0.13.13
+    steps:
+      - checkout
+      - run:
+          name: Publish
+          command: |
+            sbt publish spPublish
+
+workflows:
+  version: 2
+  test-publish:
+    jobs:
+      - test
+      - publish:
+          requires:
+            - test
+          filters:
+            branches:
+              only: master
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index 82aa5dc..0000000
--- a/.travis.yml
+++ /dev/null
@@ -1,19 +0,0 @@
-language: scala
-sudo: false
-cache:
-  directories:
-    - $HOME/.ivy2
-matrix:
-  include:
-    - jdk: openjdk7
-      scala: 2.10.5
-      env: TEST_SPARK_VERSION="1.6.0"
-    - jdk: openjdk7
-      scala: 2.11.7
-      env: TEST_SPARK_VERSION="1.6.0"
-script:
-  - sbt -Dspark.testVersion=$TEST_SPARK_VERSION ++$TRAVIS_SCALA_VERSION coverage test
-  - sbt ++$TRAVIS_SCALA_VERSION scalastyle
-  - sbt ++$TRAVIS_SCALA_VERSION "test:scalastyle"
-after_success:
-  - bash <(curl -s https://codecov.io/bash)
diff --git a/README.md b/README.md
index f1f4ff2..f76d9b4 100644
--- a/README.md
+++ b/README.md
@@ -3,10 +3,17 @@
 Algebird's HyperLogLog support for Apache Spark. This package can be used in conjunction
 with [presto-hyperloglog](https://github.com/vitillo/presto-hyperloglog) to share
 HyperLogLog sets between Spark and Presto.
 
-[![Build Status](https://travis-ci.org/vitillo/spark-hyperloglog.svg?branch=master)](https://travis-ci.org/vitillo/spark-hyperloglog)
-[![codecov.io](https://codecov.io/github/vitillo/spark-hyperloglog/coverage.svg?branch=master)](https://codecov.io/github/vitillo/spark-hyperloglog?branch=master)
+[![codecov.io](https://codecov.io/github/mozilla/spark-hyperloglog/coverage.svg?branch=master)](https://codecov.io/github/mozilla/spark-hyperloglog?branch=master)
+[![CircleCi](https://circleci.com/gh/mozilla/spark-hyperloglog.svg?style=shield&circle-token=5506f56072f0198ece2995a8539c174cc648c9e4)](https://circleci.com/gh/mozilla/spark-hyperloglog)
 
+### Installing
+
+This project is published as
+[mozilla/spark-hyperloglog](https://spark-packages.org/package/mozilla/spark-hyperloglog)
+on spark-packages.org, so it is available via:
+
+    spark-shell --packages mozilla:spark-hyperloglog:2.2.0
+
 ### Example usage
 
 ```scala
@@ -38,4 +45,4 @@ yields:
 ### Deployment
 
 Any commits to master should also trigger a circleci build that will do the sbt publishing for you
-to our local maven repo in s3.
+to our local maven repo in s3 and to spark-packages.org.
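> Annotation: for anyone sanity-checking the new `--packages` install path end to end, the JVM entry point that the python bindings later in this patch invoke through py4j can also be called directly from a `spark-shell` session. A minimal sketch, assuming the no-argument `registerUdf` hook in `com.mozilla.spark.sql.hyperloglog.functions` (visible in `hll.py` below) and assuming it registers SQL functions named `hll_create`, `hll_merge`, and `hll_cardinality`; those names are illustrative, not confirmed by this patch:

```scala
// Launched via: spark-shell --packages mozilla:spark-hyperloglog:2.2.0
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.expr

val spark = SparkSession.builder.getOrCreate()
import spark.implicits._

// Same entry point the python bindings call through sc._jvm (see hll.py below).
com.mozilla.spark.sql.hyperloglog.functions.registerUdf()

// Assumed function names: build per-row sketches, merge them, then count distincts.
Seq("a", "b", "c", "c").toDF("id")
  .selectExpr("hll_create(id, 12) as hll") // 12 = assumed precision parameter
  .groupBy()
  .agg(expr("hll_cardinality(hll_merge(hll)) as count"))
  .show()
```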
diff --git a/VERSION b/VERSION
index 176867e..ccbccc3 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.1.1-SNAPSHOT
+2.2.0
diff --git a/build.sbt b/build.sbt
index c2c275b..37f308d 100644
--- a/build.sbt
+++ b/build.sbt
@@ -1,21 +1,29 @@
 name := "spark-hyperloglog"
 
-version := scala.io.Source.fromFile("VERSION").mkString
+version := scala.io.Source.fromFile("VERSION").mkString.stripLineEnd
 
 scalaVersion := "2.11.8"
 
 organization := "com.mozilla.telemetry"
 
+// As required by https://github.com/databricks/sbt-spark-package#spark-package-developers
+spName := "mozilla/spark-hyperloglog"
+spShortDescription := "Algebird's HyperLogLog support for Apache Spark"
+spDescription := "Algebird's HyperLogLog support for Apache Spark"
 sparkVersion := "2.0.2"
-
-sparkComponents ++= Seq("core", "sql")
+sparkComponents ++= Seq("sql")
 
 libraryDependencies ++= Seq(
   "org.scalatest" %% "scalatest" % "2.2.6" % "test",
   "com.twitter" %% "algebird-core" % "0.12.0"
 )
 
-credentials += Credentials(Path.userHome / ".ivy2" / ".sbtcredentials")
+// Appropriate environment variables for publishing are provided in the CircleCI environment.
+credentials += Credentials(
+  "Spark Packages Realm",
+  "spark-packages.org",
+  sys.env.getOrElse("GITHUB_USERNAME", ""),
+  sys.env.getOrElse("GITHUB_PERSONAL_ACCESS_TOKEN", ""))
 
 publishMavenStyle := true
 
diff --git a/circle.yml b/circle.yml
deleted file mode 100644
index 26457ce..0000000
--- a/circle.yml
+++ /dev/null
@@ -1,15 +0,0 @@
-machine:
-  pre:
-    # Install sbt 0.13.16
-    - sudo apt-get install openjdk-8-jdk
-    - wget -q https://dl.bintray.com/sbt/debian/sbt-0.13.16.deb
-    - sudo dpkg -i sbt-0.13.16.deb
-  cache_directories:
-    - "~/.ivy2"
-    - "~/.sbt"
-
-deployment:
-  latest:
-    branch: master
-    commands:
-      - sbt publish
diff --git a/project/build.properties b/project/build.properties
index 27e88aa..398fe81 100644
--- a/project/build.properties
+++ b/project/build.properties
@@ -1 +1,2 @@
+# sbt-spark-package does not yet support sbt 1.x
 sbt.version=0.13.13
diff --git a/project/plugins.sbt b/project/plugins.sbt
index b37d512..3d9e826 100644
--- a/project/plugins.sbt
+++ b/project/plugins.sbt
@@ -2,10 +2,10 @@ resolvers += "bintray-spark-packages" at "https://dl.bintray.com/spark-packages/
 
 addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.6")
 
-addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.1.0")
+addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.1")
 
-addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.8.0")
+addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0")
 
-addSbtPlugin("com.frugalmechanic" % "fm-sbt-s3-resolver" % "0.12.0")
+addSbtPlugin("com.frugalmechanic" % "fm-sbt-s3-resolver" % "0.14.0")
 
 addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6")
diff --git a/python/README.md b/python/README.md
index 842f8c9..1ab878b 100644
--- a/python/README.md
+++ b/python/README.md
@@ -4,11 +4,12 @@ Python bindings for the spark-hyperloglog package.
 ## Usage
 
-Include the bindings in your project.
+The Python bindings are included in the distribution on spark-packages.org,
+so they should be automatically available if the spark-hyperloglog library
+is loaded on the cluster or specified via `--packages`
+(but see the section below about EMR for caveats on that platform):
 
-```bash
-pip install pyspark_hyperloglog
-```
+    pyspark --packages mozilla:spark-hyperloglog:2.2.0
 
 The package will register itself with the current pyspark installation
 location in the current site-packages. This allows for tests against spark in
 standalone mode.
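> Annotation: the `.stripLineEnd` added to `version` in the `build.sbt` hunk above matters because `VERSION` files conventionally end with a newline and `mkString` preserves it, so the version would otherwise be read as `"2.2.0\n"` and leak into artifact names. A quick illustration, assuming a `VERSION` file containing `2.2.0` plus a trailing newline:

```scala
// Read the VERSION file the same way build.sbt does, then strip the newline.
val source = scala.io.Source.fromFile("VERSION")
val raw = try source.mkString finally source.close()

raw              // "2.2.0\n" -- trailing newline kept by mkString
raw.stripLineEnd // "2.2.0"   -- safe to embed in artifact names
```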
@@ -41,8 +42,8 @@ User Defined Functions.
 
 ## Building
 
 In the top-level directory, build the `spark-hyperloglog` package.
-
- ```bash
+
+```bash
 sbt assembly
 ```
@@ -55,8 +56,20 @@ pip install dist/*.tar.gz
 
 ## Tests
 
-Tests are run using tox.
+Tests are run using tox and assume you've already run `sbt assembly` as discussed in the previous section:
 
 ```bash
-tox
+PYSPARK_SUBMIT_ARGS="--jars ../target/scala-2.11/spark-hyperloglog-assembly-*.jar pyspark-shell" tox
+```
+
+## Using the package on Amazon EMR
+
+EMR does not correctly build the Python environment to include Python code from
+Spark packages, but you can work around this in your PySpark session via:
+
+```python
+import sys
+
+pyfiles = str(sc.getConf().get(u'spark.submit.pyFiles')).split(',')
+sys.path.extend(pyfiles)
 ```
diff --git a/python/VERSION b/python/VERSION
new file mode 120000
index 0000000..6ff19de
--- /dev/null
+++ b/python/VERSION
@@ -0,0 +1 @@
+../VERSION
\ No newline at end of file
diff --git a/python/pyspark_hyperloglog/__init__.py b/python/pyspark_hyperloglog/__init__.py
new file mode 100644
index 0000000..5a365cf
--- /dev/null
+++ b/python/pyspark_hyperloglog/__init__.py
@@ -0,0 +1,3 @@
+from . import hll
+
+__all__ = ['hll']
diff --git a/python/src/hll.py b/python/pyspark_hyperloglog/hll.py
similarity index 85%
rename from python/src/hll.py
rename to python/pyspark_hyperloglog/hll.py
index 62f22ac..344012d 100644
--- a/python/src/hll.py
+++ b/python/pyspark_hyperloglog/hll.py
@@ -1,11 +1,10 @@
-from pyspark.sql import SparkSession
+from pyspark.sql import SparkSession, Row
 from pyspark.sql.functions import expr
 
-
 def register():
     spark = SparkSession.builder.getOrCreate()
     # NOTE: at least one dataframe should be created before registration
-    spark.createDataFrame([{'a': 1}]).count()
+    spark.createDataFrame([Row(a=1)]).count()
     sc = spark.sparkContext
     sc._jvm.com.mozilla.spark.sql.hyperloglog.functions.package.registerUdf()
diff --git a/python/setup.py b/python/setup.py
index 81bc506..a73ed67 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -1,54 +1,8 @@
-""""
-This build script is modeled after the pyspark package in the apache/spark
-repository.
-
-https://github.com/apache/spark/blob/master/python/setup.py
-"""
-
 from setuptools import setup
-import os
-import glob
-import sys
-import shutil
-
-# read the version file in the package or in the root project directory
-version_file = "VERSION" if os.path.isfile("VERSION") else "../VERSION"
-with open(version_file, 'r') as f:
+
+with open('VERSION', 'r') as f:
     VERSION = f.read().strip()
 
-JARS_TARGET = 'deps/jars'
-JAR_FILE = "*-assembly-{}.jar".format(VERSION)
-
-
-is_packaging = (
-    os.path.isfile("../build.sbt") and
-    not os.path.isfile(os.path.join(JARS_TARGET, JAR_FILE))
-)
-
-if is_packaging:
-    SPARK_HLL_HOME = os.path.abspath("../")
-    JAR_PATH = glob.glob(os.path.join(
-        SPARK_HLL_HOME, "target/scala-*", JAR_FILE))
-
-    if len(JAR_PATH) != 1:
-        print("Could not find assembled jar")
-        sys.exit(-1)
-
-    JAR_PATH = JAR_PATH[0]
-
-    try:
-        os.makedirs(JARS_TARGET)
-    except:
-        print("Temporary path to jars already exists {}".format(JARS_TARGET))
-        sys.exit(-1)
-
-    os.symlink(JAR_PATH, os.path.join(JARS_TARGET, os.path.basename(JAR_PATH)))
-    os.symlink("../VERSION", "VERSION")
-else:
-    if not os.path.exists(JARS_TARGET):
-        print("The jar folder must exist")
-
 setup(
     name='pyspark-hyperloglog',
     version=VERSION.split('-')[0],
@@ -59,7 +13,6 @@
     url='https://github.com/mozilla/spark-hyperloglog',
     packages=[
         'pyspark_hyperloglog',
-        'pyspark.jars'
     ],
     install_requires=['pyspark'],
     extras_require={
@@ -68,16 +21,4 @@
         'tox'
       ]
     },
-    include_package_data=True,
-    package_dir={
-        'pyspark_hyperloglog': 'src',
-        'pyspark.jars': 'deps/jars'
-    },
-    package_data={
-        'pyspark.jars': ['*.jar']
-    },
 )
-
-if is_packaging:
-    shutil.rmtree('deps')
-    os.remove("VERSION")
diff --git a/python/src/__init__.py b/python/src/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/python/tox.ini b/python/tox.ini
index 612baa0..d92183e 100644
--- a/python/tox.ini
+++ b/python/tox.ini
@@ -1,7 +1,11 @@
-
 [tox]
-envlist = py2.7
+envlist = py27
 
 [testenv]
-extras = dev
-commands = pytest {posargs}
\ No newline at end of file
+deps =
+    pytest
+    pyspark
+passenv =
+    PYSPARK_SUBMIT_ARGS
+commands =
+    pytest {posargs}
diff --git a/src/test/scala/com/mozilla/spark/sql/hyperloglog/test/HyperLogLog.scala b/src/test/scala/com/mozilla/spark/sql/hyperloglog/test/HyperLogLog.scala
index 98065ec..e75a12a 100644
--- a/src/test/scala/com/mozilla/spark/sql/hyperloglog/test/HyperLogLog.scala
+++ b/src/test/scala/com/mozilla/spark/sql/hyperloglog/test/HyperLogLog.scala
@@ -84,7 +84,7 @@ class HyperLogLogTest extends FlatSpec with Matchers with BeforeAndAfterAll {
     rows(0)(0) should be (2)
   }
 
-  override def afterAll = {
+  override def afterAll: Unit = {
     spark.stop
   }
 }
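> Annotation: the explicit `Unit` result type on `afterAll` is presumably needed to satisfy the stricter checks that come with the scalastyle 1.0.0 bump earlier in this patch (scalastyle ships a public-methods-have-explicit-type rule; whether this repo's config enables it is an assumption), and it is good practice regardless. The pattern in miniature:

```scala
import org.scalatest.{BeforeAndAfterAll, FlatSpec, Matchers}

class StyleExampleTest extends FlatSpec with Matchers with BeforeAndAfterAll {
  // Before: result type left to inference, which stricter style rules may flag.
  //   override def afterAll = { spark.stop }

  // After: explicit result type, matching the change above.
  override def afterAll: Unit = {
    // Release suite-wide resources here, e.g. stopping the shared SparkSession.
  }
}
```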