This repository has been archived by the owner on Feb 4, 2021. It is now read-only.

Merge pull request #6 from jklukas/spark-packages
Bug 1466936 - Distribute via spark-packages.org
jklukas authored Jun 29, 2018
2 parents ea3f427 + ec3f75f commit 6c18f92
Showing 16 changed files with 110 additions and 121 deletions.
46 changes: 46 additions & 0 deletions .circleci/config.yml
@@ -0,0 +1,46 @@
####################
# CircleCI configuration reference:
# https://circleci.com/docs/2.0/configuration-reference
####################
# CircleCI built-in environment variables:
# https://circleci.com/docs/2.0/env-vars/#built-in-environment-variables
####################

version: 2
jobs:
  test:
    docker:
      - image: mozilla/sbt:8u171_0.13.13
    steps:
      - checkout
      - run:
          name: Test
          command: |
            sbt coverage scalastyle test:scalastyle test coverageReport
      - run:
          name: Submit coverage data
          command: |
            bash <(curl -s https://codecov.io/bash)
  # The publish job only gets scheduled for commits to master; see workflows section below
  publish:
    docker:
      - image: mozilla/sbt:8u171_0.13.13
    steps:
      - checkout
      - run:
          name: Publish
          command: |
            sbt publish spPublish
workflows:
  version: 2
  test-publish:
    jobs:
      - test
      - publish:
          requires:
            - test
          filters:
            branches:
              only: master
19 changes: 0 additions & 19 deletions .travis.yml

This file was deleted.

13 changes: 10 additions & 3 deletions README.md
@@ -3,10 +3,17 @@ Algebird's HyperLogLog support for Apache Spark. This package can be used in con
with [presto-hyperloglog](https://github.com/vitillo/presto-hyperloglog) to share
HyperLogLog sets between Spark and Presto.

[![Build Status](https://travis-ci.org/vitillo/spark-hyperloglog.svg?branch=master)](https://travis-ci.org/vitillo/spark-hyperloglog)
[![codecov.io](https://codecov.io/github/vitillo/spark-hyperloglog/coverage.svg?branch=master)](https://codecov.io/github/vitillo/spark-hyperloglog?branch=master)
[![codecov.io](https://codecov.io/github/mozilla/spark-hyperloglog/coverage.svg?branch=master)](https://codecov.io/github/mozilla/spark-hyperloglog?branch=master)
[![CircleCi](https://circleci.com/gh/mozilla/spark-hyperloglog.svg?style=shield&circle-token=5506f56072f0198ece2995a8539c174cc648c9e4)](https://circleci.com/gh/mozilla/spark-hyperloglog)

### Installing

This project is published as
[mozilla/spark-hyperloglog](https://spark-packages.org/package/mozilla/spark-hyperloglog)
on spark-packages.org, so it is available via:

spark-shell --packages mozilla:spark-hyperloglog:2.2.0


### Example usage
```scala
@@ -38,4 +45,4 @@ yields:

### Deployment
Any commits to master should also trigger a circleci build that will do the sbt publishing for you
to our local maven repo in s3.
to our local maven repo in s3 and to spark-packages.org.
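For orientation, here is a minimal sketch of using the published package from PySpark. The `hll_create`/`hll_merge`/`hll_cardinality` UDF names and the 12-bit precision argument are assumptions based on this repository's description and Python bindings, not something this diff confirms.

```python
# Hedged sketch, not part of this commit. Start the shell with the package, e.g.:
#   pyspark --packages mozilla:spark-hyperloglog:2.2.0
from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import expr

spark = SparkSession.builder.getOrCreate()

# Register the Scala UDFs via the JVM gateway, mirroring python/pyspark_hyperloglog/hll.py below.
spark.createDataFrame([Row(a=1)]).count()  # at least one DataFrame must exist before registration
spark.sparkContext._jvm.com.mozilla.spark.sql.hyperloglog.functions.package.registerUdf()

# Approximate distinct clients per country; the UDF names and precision (12) are assumed.
df = spark.createDataFrame([Row(country="DE", client_id=str(i % 5)) for i in range(100)])
df.groupBy("country") \
  .agg(expr("hll_cardinality(hll_merge(hll_create(client_id, 12))) AS approx_clients")) \
  .show()
```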
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
2.1.1-SNAPSHOT
2.2.0
16 changes: 12 additions & 4 deletions build.sbt
@@ -1,21 +1,29 @@
name := "spark-hyperloglog"

version := scala.io.Source.fromFile("VERSION").mkString
version := scala.io.Source.fromFile("VERSION").mkString.stripLineEnd

scalaVersion := "2.11.8"

organization := "com.mozilla.telemetry"

// As required by https://github.com/databricks/sbt-spark-package#spark-package-developers
spName := "mozilla/spark-hyperloglog"
spShortDescription := "Algebird's HyperLogLog support for Apache Spark"
spDescription := "Algebird's HyperLogLog support for Apache Spark"
sparkVersion := "2.0.2"

sparkComponents ++= Seq("core", "sql")
sparkComponents ++= Seq("sql")

libraryDependencies ++= Seq(
  "org.scalatest" %% "scalatest" % "2.2.6" % "test",
  "com.twitter" %% "algebird-core" % "0.12.0"
)

credentials += Credentials(Path.userHome / ".ivy2" / ".sbtcredentials")
// Appropriate environment variables for publishing are provided in the CircleCI environment.
credentials += Credentials(
  "Spark Packages Realm",
  "spark-packages.org",
  sys.env.getOrElse("GITHUB_USERNAME", ""),
  sys.env.getOrElse("GITHUB_PERSONAL_ACCESS_TOKEN", ""))

publishMavenStyle := true

15 changes: 0 additions & 15 deletions circle.yml

This file was deleted.

1 change: 1 addition & 0 deletions project/build.properties
@@ -1 +1,2 @@
# sbt-spark-package does not yet support sbt 1.x
sbt.version=0.13.13
6 changes: 3 additions & 3 deletions project/plugins.sbt
@@ -2,10 +2,10 @@ resolvers += "bintray-spark-packages" at "https://dl.bintray.com/spark-packages/

addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.6")

addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.1.0")
addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.1")

addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.8.0")
addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0")

addSbtPlugin("com.frugalmechanic" % "fm-sbt-s3-resolver" % "0.12.0")
addSbtPlugin("com.frugalmechanic" % "fm-sbt-s3-resolver" % "0.14.0")

addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6")
29 changes: 21 additions & 8 deletions python/README.md
@@ -4,11 +4,12 @@ Python bindings for the spark-hyperloglog package.

## Usage

Include the bindings in your project.
The python bindings are included in the distribution on spark-packages.org,
so they should be automatically available if the spark-hyperloglog library
is loaded on the cluster or specified via `--packages`
(but see the section below about EMR for caveats on that platform):

```bash
pip install pyspark_hyperloglog
```
pyspark --packages mozilla:spark-hyperloglog:2.2.0

The package will register itself with the pyspark installation in the current
site-packages. This allows for tests against Spark in standalone mode.
@@ -41,8 +42,8 @@ User Defined Functions.
## Building

In the top-level directory, build the `spark-hyperloglog` package.
```bash

```bash
sbt assembly
```

@@ -55,8 +56,20 @@ pip install dist/*.tar.gz

## Tests

Tests are run using tox.
Tests are run using tox and assume you've already run `sbt assembly` as discussed in the previous section:

```bash
tox
PYSPARK_SUBMIT_ARGS="--jars ../target/scala-2.11/spark-hyperloglog-assembly-*.jar pyspark-shell" tox
```

## Using the package on Amazon EMR

EMR does not correctly build the Python environment to include Python code from
Spark packages, but you can work around this in your PySpark session via:

```python
import sys

pyfiles = str(sc.getConf().get(u'spark.submit.pyFiles')).split(',')
sys.path.extend(pyfiles)
```
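Once that workaround has run, a hedged continuation of the same PySpark session might look like this (the `hll_*` UDF names and arguments are assumed, as above):

```python
# Hedged continuation of the workaround above: with spark.submit.pyFiles on sys.path,
# the bundled bindings should import normally.
from pyspark_hyperloglog import hll

hll.register()  # registers the HyperLogLog UDFs on the active SparkSession

# Exact UDF names/arguments are assumptions, not guaranteed by this diff.
spark.sql("SELECT hll_cardinality(hll_merge(hll_create('some-id', 12))) AS n").show()
```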
1 change: 1 addition & 0 deletions python/VERSION
3 changes: 3 additions & 0 deletions python/pyspark_hyperloglog/__init__.py
@@ -0,0 +1,3 @@
from . import hll

__all__ = ['hll']
5 changes: 2 additions & 3 deletions python/src/hll.py → python/pyspark_hyperloglog/hll.py
@@ -1,11 +1,10 @@
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import expr


def register():
    spark = SparkSession.builder.getOrCreate()
    # NOTE: at least one dataframe should be created before registration
    spark.createDataFrame([{'a': 1}]).count()
    spark.createDataFrame([Row(a=1)]).count()
    sc = spark.sparkContext
    sc._jvm.com.mozilla.spark.sql.hyperloglog.functions.package.registerUdf()

61 changes: 1 addition & 60 deletions python/setup.py
@@ -1,54 +1,8 @@
""""
This build script is modeled after the pyspark package in the apache/spark
repository.
https://github.com/apache/spark/blob/master/python/setup.py
"""

from setuptools import setup
import os
import glob
import sys
import shutil


# read the version file in the package or in the root project directory
version_file = "VERSION" if os.path.isfile("VERSION") else "../VERSION"
with open(version_file, 'r') as f:
with open('VERSION', 'r') as f:
    VERSION = f.read().strip()

JARS_TARGET = 'deps/jars'
JAR_FILE = "*-assembly-{}.jar".format(VERSION)


is_packaging = (
    os.path.isfile("../build.sbt") and
    not os.path.isfile(os.path.join(JARS_TARGET, JAR_FILE))
)

if is_packaging:
    SPARK_HLL_HOME = os.path.abspath("../")
    JAR_PATH = glob.glob(os.path.join(
        SPARK_HLL_HOME, "target/scala-*", JAR_FILE))

    if len(JAR_PATH) != 1:
        print("Could not find assembled jar")
        sys.exit(-1)

    JAR_PATH = JAR_PATH[0]

    try:
        os.makedirs(JARS_TARGET)
    except:
        print("Temporary path to jars already exists {}".format(JARS_TARGET))
        sys.exit(-1)

    os.symlink(JAR_PATH, os.path.join(JARS_TARGET, os.path.basename(JAR_PATH)))
    os.symlink("../VERSION", "VERSION")
else:
    if not os.path.exists(JARS_TARGET):
        print("The jar folder must exist")

setup(
    name='pyspark-hyperloglog',
    version=VERSION.split('-')[0],
@@ -59,7 +13,6 @@
    url='https://github.com/mozilla/spark-hyperloglog',
    packages=[
        'pyspark_hyperloglog',
        'pyspark.jars'
    ],
    install_requires=['pyspark'],
    extras_require={
@@ -68,16 +21,4 @@
            'tox'
        ]
    },
    include_package_data=True,
    package_dir={
        'pyspark_hyperloglog': 'src',
        'pyspark.jars': 'deps/jars'
    },
    package_data={
        'pyspark.jars': ['*.jar']
    },
)

if is_packaging:
    shutil.rmtree('deps')
    os.remove("VERSION")
Empty file removed python/src/__init__.py
Empty file.
12 changes: 8 additions & 4 deletions python/tox.ini
@@ -1,7 +1,11 @@

[tox]
envlist = py2.7
envlist = py27

[testenv]
extras = dev
commands = pytest {posargs}
deps =
    pytest
    pyspark
passenv =
    PYSPARK_SUBMIT_ARGS
commands =
    pytest {posargs}
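For orientation, a test run by this tox configuration might look roughly like the following hypothetical sketch; the fixture, dataset, and UDF names are illustrative assumptions, not taken from the repository's actual test suite.

```python
# Hypothetical test sketch, not from this repository. Run via `tox` with
# PYSPARK_SUBMIT_ARGS pointing at the assembly jar, as shown in python/README.md.
import pytest
from pyspark.sql import Row, SparkSession

from pyspark_hyperloglog import hll


@pytest.fixture(scope="session")
def spark():
    session = SparkSession.builder.master("local[1]").getOrCreate()
    hll.register()  # registers the hll_* UDFs on this session
    yield session
    session.stop()


def test_small_distinct_count_is_exact(spark):
    df = spark.createDataFrame([Row(uid=str(i)) for i in range(3)])
    rows = df.selectExpr("hll_cardinality(hll_merge(hll_create(uid, 12))) AS n").collect()
    assert rows[0]["n"] == 3
```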
@@ -84,7 +84,7 @@ class HyperLogLogTest extends FlatSpec with Matchers with BeforeAndAfterAll {
    rows(0)(0) should be (2)
  }

  override def afterAll = {
  override def afterAll: Unit = {
    spark.stop
  }
}
