diff --git a/docs/sphinx/conf.py b/docs/sphinx/conf.py
index 8a67970cc..6a6e8b3e6 100644
--- a/docs/sphinx/conf.py
+++ b/docs/sphinx/conf.py
@@ -17,6 +17,18 @@
 import shlex
 import sphinx_rtd_theme
 
+# extract version from __init__.py
+pysparkling_init_filename = os.path.join(
+    os.path.dirname(__file__),
+    '..',
+    '..',
+    'pysparkling',
+    '__init__.py',
+)
+with open(pysparkling_init_filename, 'r') as f:
+    version_line = [l for l in f if l.startswith('__version__')][0]
+    PYSPARKLING_VERSION = version_line.split('=')[1].strip()[1:-1]
+
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
@@ -56,7 +68,7 @@
 
 # General information about the project.
 project = u'pysparkling'
-copyright = u'2015, Sven Kreiss'
+copyright = u'2015-2016, a project started by Sven Kreiss'
 author = u'Sven Kreiss'
 
 # The version info for the project you're documenting, acts as replacement for
@@ -64,9 +76,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '0.3'
+version = PYSPARKLING_VERSION
 # The full version, including alpha/beta/rc tags.
-release = '0.3.8'
+release = PYSPARKLING_VERSION
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
@@ -90,7 +102,7 @@
 #default_role = None
 
 # If true, '()' will be appended to :func: etc. cross-reference text.
-#add_function_parentheses = True
+add_function_parentheses = True
 
 # If true, the current module name will be prepended to all description
 # unit titles (such as .. function::).
@@ -138,12 +150,12 @@
 
 # The name of an image file (relative to this directory) to place at the top
 # of the sidebar.
-#html_logo = None
+# html_logo = 'images/logo-w600.png'
 
 # The name of an image file (within the static path) to use as favicon of the
 # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
 # pixels large.
-#html_favicon = None
+html_favicon = 'images/favicon.ico'
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
@@ -180,10 +192,10 @@
 #html_split_index = False
 
 # If true, links to the reST sources are added to the pages.
-#html_show_sourcelink = True
+html_show_sourcelink = False
 
 # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
-#html_show_sphinx = True
+html_show_sphinx = False
 
 # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
 #html_show_copyright = True
diff --git a/docs/sphinx/images/favicon.ico b/docs/sphinx/images/favicon.ico
new file mode 100644
index 000000000..01365747d
Binary files /dev/null and b/docs/sphinx/images/favicon.ico differ
diff --git a/docs/sphinx/images/logo-w600.png b/docs/sphinx/images/logo-w600.png
new file mode 100644
index 000000000..9ef54aa94
Binary files /dev/null and b/docs/sphinx/images/logo-w600.png differ
diff --git a/docs/sphinx/index.rst b/docs/sphinx/index.rst
index 3a747bff5..772f15034 100644
--- a/docs/sphinx/index.rst
+++ b/docs/sphinx/index.rst
@@ -10,15 +10,109 @@
 Welcome to pysparkling's documentation!
 =======================================
 
-Version 0.2 docs: http://pysparkling.trivial.io/v0.2/
-
-Please read the README at https://github.com/svenkreiss/pysparkling and
-checkout the examples in this notebook:
-https://github.com/svenkreiss/pysparkling/blob/master/docs/demo.ipynb
-
-``pysparkling`` is native Python implementation of Spark's RDD interface. The primary objective to remove the dependency on the JVM and Hadoop. The focus is on having a lightweight and fast implementation for small datasets at the expense of some data resilience features and some parallel processing features. It is a drop-in replacement for PySpark's SparkContext and RDD.
-
-Use case: you have a pipeline that processes 100k input documents and converts them to normalized features. They are used to train a local scikit-learn classifier. The preprocessing is perfect for a full Spark task. Now, you want to use this trained classifier in an API endpoint. Assume you need the same pre-processing pipeline for a single document per API call. This does not have to be done in parallel, but there should be only a small overhead in initialization and preferably no dependency on the JVM. This is what pysparkling is for.
+**Pysparkling** provides a faster, more responsive way to develop programs
+for PySpark. It enables code intended for Spark applications to execute
+entirely in Python, without incurring the overhead of initializing and
+passing data through the JVM and Hadoop. The focus is on having a lightweight
+and fast implementation for small datasets at the expense of some data
+resilience features and some parallel processing features.
+
+**How does it work?** To switch execution of a script from PySpark to pysparkling,
+have the code initialize a pysparkling Context instead of a SparkContext, and
+use the pysparkling Context to set up your RDDs. The beauty is you don't have
+to change a single line of code after the Context initialization, because
+pysparkling's API is (almost) exactly the same as PySpark's. Since it's so easy
+to switch between PySpark and pysparkling, you can choose the right tool for your
+use case.
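+
+For a quick sense of what that swap looks like, here is a minimal sketch
+(the tiny in-memory dataset and the ``sum()`` call at the end are only for
+illustration):
+
+.. code-block:: python
+
+    # PySpark version: needs a JVM and a Spark installation.
+    # from pyspark import SparkContext
+    # sc = SparkContext('local[1]', 'demo')
+
+    # pysparkling version: a plain Python object, no JVM start-up.
+    from pysparkling import Context
+    sc = Context()
+
+    # Everything after the Context initialization stays the same.
+    rdd = sc.parallelize([1, 2, 3, 4])
+    print(rdd.map(lambda x: x * x).sum())  # prints 30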
+
+**When would I use it?** Say you are writing a Spark application because you
+need robust computation on huge datasets, but you also want the same application
+to provide fast answers on a small dataset. You're finding Spark is not responsive
+enough for your needs, but you don't want to rewrite an entire separate application
+for the *small-answers-fast* problem. You'd rather reuse your Spark code but somehow
+get it to run fast. Pysparkling bypasses the machinery that causes Spark's long
+startup times and less responsive feel.
+
+Here are a few areas where pysparkling excels:
+
+- Small to medium-scale exploratory data analysis
+- Application prototyping
+- Low-latency web deployments
+- Unit tests
+
+*Example:* you have a pipeline that processes 100k input documents
+and converts them to normalized features. They are used to train a local
+scikit-learn classifier. The preprocessing is perfect for a full Spark
+task. Now, you want to use this trained classifier in an API
+endpoint. Assume you need the same pre-processing pipeline for a single
+document per API call. This does not have to be done in parallel, but there
+should be only a small overhead in initialization and preferably no
+dependency on the JVM. This is what ``pysparkling`` is for.
+
+.. image:: https://badge.fury.io/py/pysparkling.svg
+    :target: https://pypi.python.org/pypi/pysparkling/
+.. image:: https://img.shields.io/pypi/dm/pysparkling.svg
+    :target: https://pypi.python.org/pypi/pysparkling/
+.. image:: https://badges.gitter.im/Join%20Chat.svg
+    :alt: Join the chat at https://gitter.im/svenkreiss/pysparkling
+    :target: https://gitter.im/svenkreiss/pysparkling?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge
+
+
+Install
+=======
+
+.. code-block:: bash
+
+    pip install pysparkling[s3,hdfs,http]
+
+
+Features
+========
+
+* Supports multiple URI schemes: ``s3://``, ``hdfs://``, ``http://`` and ``file://``.
+  Specify multiple files separated by commas.
+  Resolves ``*`` and ``?`` wildcards.
+* Handles ``.gz``, ``.zip``, ``.lzma``, ``.xz``, ``.bz2``, ``.tar``,
+  ``.tar.gz`` and ``.tar.bz2`` compressed files.
+  Supports reading of ``.7z`` files.
+* Parallelization via ``multiprocessing.Pool``,
+  ``concurrent.futures.ThreadPoolExecutor`` or any other Pool-like
+  objects that have a ``map(func, iterable)`` method
+  (see the sketch below).
+
+* Plain pysparkling does not have any dependencies (use ``pip install pysparkling``).
+  Some file access methods have optional dependencies:
+  ``boto`` for AWS S3, ``requests`` for HTTP, ``hdfs`` for HDFS.
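+
+As a rough sketch of the parallelization hook (assuming ``Context``'s ``pool=``
+keyword argument; a thread pool is used here so the mapped functions do not
+need to be serialized), this is one way it can look:
+
+.. code-block:: python
+
+    import math
+    from concurrent import futures
+
+    from pysparkling import Context
+
+    # Any object with a map(func, iterable) method can serve as the pool.
+    sc = Context(pool=futures.ThreadPoolExecutor(4))
+
+    # The partitions are then processed through the pool's map().
+    rdd = sc.parallelize(range(100), 10)
+    print(rdd.map(math.sqrt).sum())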
+
+
+Examples
+========
+
+Some demos are in the notebooks
+`docs/demo.ipynb <https://github.com/svenkreiss/pysparkling/blob/master/docs/demo.ipynb>`_
+and
+`docs/iris.ipynb <https://github.com/svenkreiss/pysparkling/blob/master/docs/iris.ipynb>`_.
+
+**Word Count**
+
+.. code-block:: python
+
+    from pysparkling import Context
+
+    counts = Context().textFile(
+        'README.rst'
+    ).map(
+        lambda line: ''.join(ch if ch.isalnum() else ' ' for ch in line)
+    ).flatMap(
+        lambda line: line.split(' ')
+    ).map(
+        lambda word: (word, 1)
+    ).reduceByKey(
+        lambda a, b: a + b
+    )
+    print(counts.collect())
+
+which prints a long list of pairs of words and their counts.
 
 Contents:
 
diff --git a/pysparkling/rdd.py b/pysparkling/rdd.py
index 4e26e472b..5421a4a25 100644
--- a/pysparkling/rdd.py
+++ b/pysparkling/rdd.py
@@ -616,7 +616,9 @@ def fullOuterJoin(self, other, numPartitions=None):
         .. note::
             Creating the new RDD is currently implemented as a local
             operation.
+
         Example:
+
         >>> from pysparkling import Context
         >>> sc = Context()
         >>> rdd1 = sc.parallelize([('a', 0), ('b', 1)])
@@ -625,6 +627,7 @@ def fullOuterJoin(self, other, numPartitions=None):
         ...     rdd1.fullOuterJoin(rdd2).collect()
         ... )  # doctest: +IGNORE_UNICODE
         [('a', (0, None)), ('b', (1, 2)), ('c', (None, 3))]
+
         """
         grouped = self.cogroup(other, numPartitions)