Switch parquet default engine (dask#9140)
The deprecation warning has been there long enough; we can now remove it
and fully switch the default engine to `pyarrow`.

Also removes the deprecation warning for `compression="default"`, which
has likewise been around long enough.
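
In practice, `engine="auto"` now resolves to `pyarrow` whenever it is importable, on both the read and write paths. A minimal sketch of the behavior after this change (assuming `pyarrow` is installed; the paths here are illustrative):

    >>> import dask.dataframe as dd
    >>> ddf = dd.read_parquet("mydata.parquet")    # engine="auto" -> pyarrow
    >>> ddf.to_parquet("out.parquet")              # same default when writing
    >>> dd.read_parquet("mydata.parquet", engine="fastparquet")  # opt out explicitly

Users who want fastparquet while pyarrow is also installed must now pass `engine="fastparquet"` themselves; the FutureWarning that used to announce this switch is removed below.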
jcrist authored Jun 1, 2022
1 parent 4769b56 commit cfca592
Showing 3 changed files with 25 additions and 83 deletions.
59 changes: 15 additions & 44 deletions dask/dataframe/io/parquet/core.py
@@ -1,6 +1,5 @@
 from __future__ import annotations

-import importlib
 import math
 import warnings

@@ -239,12 +238,9 @@ def read_parquet(
         ``"precache_options"`` key. Also, a custom file-open function can be
         used (instead of ``AbstractFileSystem.open``), by specifying the
         desired function under the ``"open_file_func"`` key.
-    engine : {'auto', 'fastparquet', 'pyarrow'}, default 'auto'
-        Parquet library to use. Options include: 'auto', 'fastparquet', and
-        'pyarrow'. Defaults to 'auto', which uses ``fastparquet`` if it is
-        installed, and falls back to ``pyarrow`` otherwise. Note that in the
-        future this default ordering for 'auto' will switch, with ``pyarrow``
-        being used if it is installed, and falling back to ``fastparquet``.
+    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
+        Parquet library to use. Defaults to 'auto', which uses ``pyarrow`` if
+        it is installed, and falls back to ``fastparquet`` otherwise.
     calculate_divisions : bool, default False
         Whether to use min/max statistics from the footer metadata (or global
         ``_metadata`` file) to calculate divisions for the output DataFrame
@@ -443,7 +439,7 @@ def read_parquet(
         columns = list(columns)

     if isinstance(engine, str):
-        engine = get_engine(engine, bool(kwargs))
+        engine = get_engine(engine)

     if hasattr(path, "name"):
         path = stringify_path(path)
@@ -626,12 +622,9 @@ def to_parquet(
     path : string or pathlib.Path
         Destination directory for data. Prepend with protocol like ``s3://``
         or ``hdfs://`` for remote data.
-    engine : {'auto', 'fastparquet', 'pyarrow'}, default 'auto'
-        Parquet library to use. Options include: 'auto', 'fastparquet', and
-        'pyarrow'. Defaults to 'auto', which uses ``fastparquet`` if it is
-        installed, and falls back to ``pyarrow`` otherwise. Note that in the
-        future this default ordering for 'auto' will switch, with ``pyarrow``
-        being used if it is installed, and falling back to ``fastparquet``.
+    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
+        Parquet library to use. Defaults to 'auto', which uses ``pyarrow`` if
+        it is installed, and falls back to ``fastparquet`` otherwise.
     compression : string or dict, default 'snappy'
         Either a string like ``"snappy"`` or a dictionary mapping column names
         to compressors like ``{"name": "gzip", "values": "snappy"}``. Defaults
@@ -731,14 +724,6 @@ def to_parquet(
     """
     compute_kwargs = compute_kwargs or {}

-    if compression == "default":
-        warnings.warn(
-            "compression='default' is deprecated and will be removed in a "
-            "future version, the default for all engines is 'snappy' now.",
-            FutureWarning,
-        )
-        compression = "snappy"
-
     partition_on = partition_on or []
     if isinstance(partition_on, str):
         partition_on = [partition_on]
@@ -754,7 +739,7 @@ def to_parquet(
         raise ValueError("parquet doesn't support non-string column names")

     if isinstance(engine, str):
-        engine = get_engine(engine, bool(kwargs))
+        engine = get_engine(engine)

     if hasattr(path, "name"):
         path = stringify_path(path)
@@ -1097,19 +1082,14 @@ def create_metadata_file(
 _ENGINES: dict[str, Engine] = {}


-# TODO: remove _warn_engine_default_changing once the default has changed to
-# pyarrow.
-def get_engine(engine, _warn_engine_default_changing=False):
+def get_engine(engine):
     """Get the parquet engine backend implementation.

     Parameters
     ----------
-    engine : str, default 'auto'
-        Parquet library to use. Options include: 'auto', 'fastparquet', and
-        'pyarrow'. Defaults to 'auto', which uses ``fastparquet`` if it is
-        installed, and falls back to ``pyarrow`` otherwise. Note that in the
-        future this default ordering for 'auto' will switch, with ``pyarrow``
-        being used if it is installed, and falling back to ``fastparquet``.
+    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
+        Parquet library to use. Defaults to 'auto', which uses ``pyarrow`` if
+        it is installed, and falls back to ``fastparquet`` otherwise.

     Returns
     -------
@@ -1120,23 +1100,14 @@ def get_engine(engine, _warn_engine_default_changing=False):

     if engine == "auto":
         try:
-            engine = get_engine("fastparquet")
-            if _warn_engine_default_changing and importlib.util.find_spec("pyarrow"):
-                warnings.warn(
-                    "engine='auto' will switch to using pyarrow by default in "
-                    "a future version. To continue using fastparquet even if "
-                    "pyarrow is installed in the future please explicitly "
-                    "specify engine='fastparquet'.",
-                    FutureWarning,
-                )
-            return engine
+            return get_engine("pyarrow")
         except RuntimeError:
             pass

         try:
-            return get_engine("pyarrow")
+            return get_engine("fastparquet")
         except RuntimeError:
-            raise RuntimeError("Please install either fastparquet or pyarrow") from None
+            raise RuntimeError("Please install either pyarrow or fastparquet") from None

     elif engine == "fastparquet":
         import_required("fastparquet", "`fastparquet` not installed")
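For reference, the new selection order in `get_engine` can be observed directly. A hedged sketch (the `ArrowDatasetEngine` class path is my assumption about the pyarrow backend in this dask version, not something this diff states; `FastParquetEngine` is confirmed by the tests below):

    >>> from dask.dataframe.io.parquet.core import get_engine
    >>> get_engine("auto")         # pyarrow is now tried first
    <class 'dask.dataframe.io.parquet.arrow.ArrowDatasetEngine'>
    >>> get_engine("fastparquet")  # explicit selection is unchanged
    <class 'dask.dataframe.io.parquet.fastparquet.FastParquetEngine'>

With neither library installed, `get_engine("auto")` now raises `RuntimeError("Please install either pyarrow or fastparquet")` instead of the old fastparquet-first message.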
27 changes: 0 additions & 27 deletions dask/dataframe/io/tests/test_parquet.py
@@ -156,16 +156,6 @@ def test_get_engine_fastparquet():
     assert get_engine("fastparquet") == FastParquetEngine


-@PYARROW_MARK
-@FASTPARQUET_MARK
-def test_get_engine_auto_warning_if_both_installed():
-    from dask.dataframe.io.parquet.fastparquet import FastParquetEngine
-
-    with pytest.warns(FutureWarning, match="engine='auto' will switch"):
-        engine = get_engine("auto", True)
-    assert engine == FastParquetEngine
-
-
 @write_read_engines()
 @pytest.mark.parametrize("has_metadata", [False, True])
 def test_local(tmpdir, write_engine, read_engine, has_metadata):
@@ -1824,23 +1814,6 @@ def check_compression(engine, filename, compression):
     )


-def test_explicit_compression_default_deprecated(tmpdir, engine):
-    """TODO: remove this test when `compression="default"` is fully removed"""
-    fn = str(tmpdir)
-
-    df = pd.DataFrame({"x": ["a", "b", "c"] * 10, "y": [1, 2, 3] * 10})
-    df.index.name = "index"
-    ddf = dd.from_pandas(df, npartitions=3)
-
-    with pytest.warns(FutureWarning, match="compression='default'"):
-        ddf.to_parquet(
-            fn, compression="default", engine=engine, write_metadata_file=True
-        )
-    out = dd.read_parquet(fn, engine=engine, calculate_divisions=True)
-    assert_eq(out, ddf)
-    check_compression(engine, fn, "default")
-
-
 @pytest.mark.parametrize("compression,", [None, "gzip", "snappy"])
 def test_writing_parquet_with_compression(tmpdir, compression, engine):
     fn = str(tmpdir)
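With the `compression="default"` shim gone, callers should pass a real codec name or simply rely on the `"snappy"` default. A short sketch of valid usage after this change (assuming snappy support is available, as it typically is with pyarrow):

    >>> import pandas as pd
    >>> import dask.dataframe as dd
    >>> ddf = dd.from_pandas(pd.DataFrame({"x": [1, 2, 3]}), npartitions=1)
    >>> ddf.to_parquet("out.parquet")                      # compression defaults to "snappy"
    >>> ddf.to_parquet("out.parquet", compression="gzip")  # or name a codec explicitly
    >>> ddf.to_parquet("out.parquet", compression=None)    # or disable compression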
22 changes: 10 additions & 12 deletions docs/source/dataframe-parquet.rst
@@ -65,16 +65,15 @@ Engine
 ~~~~~~

 :func:`read_parquet` supports two backend engines - ``pyarrow`` and
-``fastparquet``. For historical reasons this defaults to ``fastparquet`` if it
-is installed, and falls back to ``pyarrow`` otherwise. We recommend using
-``pyarrow`` when possible. This can be explicitly set by passing
-``engine="pyarrow"``.
+``fastparquet``. The ``pyarrow`` engine is used by default, falling back to
+``fastparquet`` if ``pyarrow`` isn't installed. If desired, you may explicitly
+specify the engine using the ``engine`` keyword argument:

 .. code-block:: python

     >>> df = dd.read_parquet(
     ...     "s3://bucket-name/my/parquet/",
-    ...     engine="pyarrow"  # explicitly specify the pyarrow engine
+    ...     engine="fastparquet"  # explicitly specify the fastparquet engine
     ... )

 Metadata
@@ -165,7 +164,7 @@ calculating divisions should be avoided for large datasets without a
 global ``_metadata`` file. This is especially true for remote storage.

 For more information about divisions, see :ref:`dataframe.design`.
-
+
 Writing
 -------
@@ -219,16 +218,15 @@ Engine
 ~~~~~~

 :func:`to_parquet` supports two backend engines - ``pyarrow`` and
-``fastparquet``. For historical reasons this defaults to ``fastparquet`` if it
-is installed, and falls back to ``pyarrow`` otherwise. We recommend using
-``pyarrow`` when possible. This can be explicitly set by passing
-``engine="pyarrow"``.
+``fastparquet``. The ``pyarrow`` engine is used by default, falling back to
+``fastparquet`` if ``pyarrow`` isn't installed. If desired, you may explicitly
+specify the engine using the ``engine`` keyword argument:

 .. code-block:: python

     >>> df.to_parquet(
-    ...     "s3://bucket-name/my/parquet/",
-    ...     engine="pyarrow"  # explicitly specify the pyarrow engine
+    ...     "s3://bucket-name/my/parquet/",
+    ...     engine="fastparquet"  # explicitly specify the fastparquet engine
     ... )

 Metadata