diff --git a/.buildinfo b/.buildinfo
index 77ce9891..29220e7c 100644
--- a/.buildinfo
+++ b/.buildinfo
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 8eb14ef0987ca17d9dfbb8af61480095
+config: 5d67110e4b7f3308f48f9b0005f1974e
 tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/.doctrees/environment.pickle b/.doctrees/environment.pickle
index 0ea6ce80..76a594ae 100644
Binary files a/.doctrees/environment.pickle and b/.doctrees/environment.pickle differ
diff --git a/.doctrees/index.doctree b/.doctrees/index.doctree
index 67805054..05761b6c 100644
Binary files a/.doctrees/index.doctree and b/.doctrees/index.doctree differ
diff --git a/.doctrees/spark_usage.doctree b/.doctrees/spark_usage.doctree
index 9d890dc8..04e52361 100644
Binary files a/.doctrees/spark_usage.doctree and b/.doctrees/spark_usage.doctree differ
diff --git a/_sources/spark_usage.rst.txt b/_sources/spark_usage.rst.txt
index e064f0fb..7bd942cd 100644
--- a/_sources/spark_usage.rst.txt
+++ b/_sources/spark_usage.rst.txt
@@ -3,15 +3,15 @@ Spark Usage
 
 .. important::
 
-    With version ``v0.12.0`` the original ``SparkCompare`` was replaced with a
-    Pandas on Spark implementation The original ``SparkCompare``
-    implementation differs from all the other native implementations. To align the API better,
-    and keep behaviour consistent we are deprecating the original ``SparkCompare``
+    With version ``v0.12.0`` the original ``SparkCompare`` was replaced with a
+    Pandas on Spark implementation. The original ``SparkCompare``
+    implementation differs from all the other native implementations. To align the API better
+    and keep behaviour consistent, we are deprecating the original ``SparkCompare``
     into a new module ``LegacySparkCompare``
 
     Subsequently in ``v0.13.0`` a PySpark DataFrame class has been introduced (``SparkSQLCompare``)
-    which accepts ``pyspark.sql.DataFrame`` and should provide better performance. With this version
-    the Pandas on Spark implementation has been renamed to ``SparkPandasCompare`` and all the spark
+    which accepts ``pyspark.sql.DataFrame`` and should provide better performance. With this version
+    the Pandas on Spark implementation has been renamed to ``SparkPandasCompare`` and all the Spark
     logic is now under the ``spark`` submodule.
 
 If you wish to use the old SparkCompare moving forward you can import it like so:
@@ -19,29 +19,24 @@ Spark Usage
 
 .. code-block:: python
 
     from datacompy.spark.legacy import LegacySparkCompare
-
-For both ``SparkSQLCompare`` and ``SparkPandasCompare``
-
-- ``on_index`` is not supported.
-- Joining is done using ``<=>`` which is the equality test that is safe for null values.
-- ``SparkPandasCompare`` compares ``pyspark.pandas.DataFrame``'s
-- ``SparkSQLCompare`` compares ``pyspark.sql.DataFrame``'s
-
-Supported Version
------------------
 
 .. important::
 
-    Spark will not offically support Pandas 2 until Spark 4: https://issues.apache.org/jira/browse/SPARK-44101
+    Starting with ``v0.14.1``, ``SparkPandasCompare`` is slated for deprecation. ``SparkSQLCompare``
+    is the preferred implementation and is much more performant. Note that if you continue to use
+    ``SparkPandasCompare``, ``numpy`` 2+ is not supported due to dependency issues.
+
+For ``SparkSQLCompare``:
 
-Until then we will not be supporting Pandas 2 for the Pandas on Spark API implementaion.
-For Fugue, the Native Pandas (`Compare`), and `SparkSQLCompare` implementations, Pandas 2 is supported.
+- ``on_index`` is not supported.
+- Joining is done using ``<=>``, the null-safe equality test.
+- ``SparkSQLCompare`` compares ``pyspark.sql.DataFrame`` objects.
 
-SparkPandasCompare and SparkSQLCompare Object Setup
----------------------------------------------------
+SparkSQLCompare
+---------------
 
 There is currently only one supported method for joining your dataframes - by
 join column(s).
@@ -52,7 +47,7 @@ join column(s).
 
     from io import StringIO
    import pandas as pd
    import pyspark.pandas as ps
-    from datacompy import SparkPandasCompare, SparkSQLCompare
+    from datacompy import SparkSQLCompare
     from pyspark.sql import SparkSession
 
     spark = SparkSession.builder.getOrCreate()
@@ -73,25 +68,6 @@ join column(s).
     10000001238,1.05,Loose Seal Bluth,111
     """
 
-    # SparkPandasCompare
-    df1 = ps.from_pandas(pd.read_csv(StringIO(data1)))
-    df2 = ps.from_pandas(pd.read_csv(StringIO(data2)))
-
-    compare = SparkPandasCompare(
-        df1,
-        df2,
-        join_columns='acct_id',  # You can also specify a list of columns
-        abs_tol=0,  # Optional, defaults to 0
-        rel_tol=0,  # Optional, defaults to 0
-        df1_name='Original',  # Optional, defaults to 'df1'
-        df2_name='New'  # Optional, defaults to 'df2'
-    )
-    compare.matches(ignore_extra_columns=False)
-    # False
-    # This method prints out a human-readable report summarizing and sampling differences
-    print(compare.report())
-
-    # SparkSQLCompare
     df1 = spark.createDataFrame(pd.read_csv(StringIO(data1)))
     df2 = spark.createDataFrame(pd.read_csv(StringIO(data2)))
diff --git a/_static/documentation_options.js b/_static/documentation_options.js
index 9242cb85..5c726b82 100644
--- a/_static/documentation_options.js
+++ b/_static/documentation_options.js
@@ -1,5 +1,5 @@
 const DOCUMENTATION_OPTIONS = {
-    VERSION: '0.14.0',
+    VERSION: '0.14.1',
     LANGUAGE: 'en',
     COLLAPSE_INDEX: false,
     BUILDER: 'html',
diff --git a/api/datacompy.html b/api/datacompy.html
index dfe06df6..0a34a8da 100644
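
The hunks above stop where the docs build ``df1`` and ``df2``; the comparison step itself falls outside the changed lines. For reference, here is a minimal sketch of the full ``SparkSQLCompare`` flow the updated page documents. It assumes the constructor takes the active ``SparkSession`` as its first argument (per the ``v0.13`` API) and otherwise mirrors the keyword arguments shown in the removed ``SparkPandasCompare`` example — a sketch, not the canonical docs.

.. code-block:: python

    import pandas as pd
    from datacompy import SparkSQLCompare
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()

    # Two small frames keyed on acct_id that differ in one value
    df1 = spark.createDataFrame(pd.DataFrame({"acct_id": [1, 2], "dollar_amt": [123.45, 0.45]}))
    df2 = spark.createDataFrame(pd.DataFrame({"acct_id": [1, 2], "dollar_amt": [123.40, 0.45]}))

    compare = SparkSQLCompare(
        spark,                   # active SparkSession (assumed first argument)
        df1,
        df2,
        join_columns="acct_id",  # a single column name or a list of columns
        abs_tol=0,               # optional, defaults to 0
        rel_tol=0,               # optional, defaults to 0
        df1_name="Original",     # optional, defaults to 'df1'
        df2_name="New",          # optional, defaults to 'df2'
    )
    compare.matches(ignore_extra_columns=False)  # False: dollar_amt differs for acct_id 1
    print(compare.report())  # human-readable summary with samples of the differences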
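
The bullet list in the new text notes that joins use ``<=>``, Spark's null-safe equality. Below is a short, standalone PySpark illustration of why that matters; ``eqNullSafe`` is the DataFrame-API spelling of ``<=>``, and the column name ``k`` is purely for the example.

.. code-block:: python

    from pyspark.sql import Row, SparkSession

    spark = SparkSession.builder.getOrCreate()

    left = spark.createDataFrame([Row(k=1), Row(k=None)])
    right = spark.createDataFrame([Row(k=1), Row(k=None)])

    # Plain equality: NULL = NULL evaluates to NULL, so null keys never match
    left.join(right, left["k"] == right["k"]).count()            # 1

    # Null-safe equality: NULL <=> NULL is true, so null keys do match
    left.join(right, left["k"].eqNullSafe(right["k"])).count()   # 2

Because datacompy joins with ``<=>``, rows whose join keys are null on both sides line up with each other instead of silently dropping out of the comparison.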