Switch parquet default engine (dask#9140)
The deprecation warning has been there long enough; we can now remove it
and fully switch the default engine to `pyarrow`.

Also removes the deprecation warning for `compression="default"`, which
has likewise been around long enough.
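
In practice, `engine="auto"` now resolves to `pyarrow` whenever it is importable, on both the read and write paths. A minimal sketch of the behavior after this change (assuming `pyarrow` is installed; the paths here are illustrative):

    >>> import dask.dataframe as dd
    >>> ddf = dd.read_parquet("mydata.parquet")    # engine="auto" -> pyarrow
    >>> ddf.to_parquet("out.parquet")              # same default when writing
    >>> dd.read_parquet("mydata.parquet", engine="fastparquet")  # opt out explicitly

Users who want fastparquet while pyarrow is also installed must now pass `engine="fastparquet"` themselves; the FutureWarning that used to announce this switch is removed below.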
jcrist authored Jun 1, 2022
1 parent 4769b56 commit cfca592
Showing 3 changed files with 25 additions and 83 deletions.
59 changes: 15 additions & 44 deletions dask/dataframe/io/parquet/core.py
@@ -1,6 +1,5 @@
 from __future__ import annotations

-import importlib
 import math
 import warnings

@@ -239,12 +238,9 @@ def read_parquet(
         ``"precache_options"`` key. Also, a custom file-open function can be
         used (instead of ``AbstractFileSystem.open``), by specifying the
         desired function under the ``"open_file_func"`` key.
-    engine : {'auto', 'fastparquet', 'pyarrow'}, default 'auto'
-        Parquet library to use. Options include: 'auto', 'fastparquet', and
-        'pyarrow'. Defaults to 'auto', which uses ``fastparquet`` if it is
-        installed, and falls back to ``pyarrow`` otherwise. Note that in the
-        future this default ordering for 'auto' will switch, with ``pyarrow``
-        being used if it is installed, and falling back to ``fastparquet``.
+    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
+        Parquet library to use. Defaults to 'auto', which uses ``pyarrow`` if
+        it is installed, and falls back to ``fastparquet`` otherwise.
     calculate_divisions : bool, default False
         Whether to use min/max statistics from the footer metadata (or global
         ``_metadata`` file) to calculate divisions for the output DataFrame
@@ -443,7 +439,7 @@ def read_parquet(
         columns = list(columns)

     if isinstance(engine, str):
-        engine = get_engine(engine, bool(kwargs))
+        engine = get_engine(engine)

     if hasattr(path, "name"):
         path = stringify_path(path)
@@ -626,12 +622,9 @@ def to_parquet(
     path : string or pathlib.Path
         Destination directory for data. Prepend with protocol like ``s3://``
         or ``hdfs://`` for remote data.
-    engine : {'auto', 'fastparquet', 'pyarrow'}, default 'auto'
-        Parquet library to use. Options include: 'auto', 'fastparquet', and
-        'pyarrow'. Defaults to 'auto', which uses ``fastparquet`` if it is
-        installed, and falls back to ``pyarrow`` otherwise. Note that in the
-        future this default ordering for 'auto' will switch, with ``pyarrow``
-        being used if it is installed, and falling back to ``fastparquet``.
+    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
+        Parquet library to use. Defaults to 'auto', which uses ``pyarrow`` if
+        it is installed, and falls back to ``fastparquet`` otherwise.
     compression : string or dict, default 'snappy'
         Either a string like ``"snappy"`` or a dictionary mapping column names
         to compressors like ``{"name": "gzip", "values": "snappy"}``. Defaults
@@ -731,14 +724,6 @@ def to_parquet(
     """
     compute_kwargs = compute_kwargs or {}

-    if compression == "default":
-        warnings.warn(
-            "compression='default' is deprecated and will be removed in a "
-            "future version, the default for all engines is 'snappy' now.",
-            FutureWarning,
-        )
-        compression = "snappy"
-
     partition_on = partition_on or []
     if isinstance(partition_on, str):
         partition_on = [partition_on]
@@ -754,7 +739,7 @@ def to_parquet(
         raise ValueError("parquet doesn't support non-string column names")

     if isinstance(engine, str):
-        engine = get_engine(engine, bool(kwargs))
+        engine = get_engine(engine)

     if hasattr(path, "name"):
         path = stringify_path(path)
@@ -1097,19 +1082,14 @@ def create_metadata_file(
 _ENGINES: dict[str, Engine] = {}


-# TODO: remove _warn_engine_default_changing once the default has changed to
-# pyarrow.
-def get_engine(engine, _warn_engine_default_changing=False):
+def get_engine(engine):
     """Get the parquet engine backend implementation.

     Parameters
     ----------
-    engine : str, default 'auto'
-        Parquet library to use. Options include: 'auto', 'fastparquet', and
-        'pyarrow'. Defaults to 'auto', which uses ``fastparquet`` if it is
-        installed, and falls back to ``pyarrow`` otherwise. Note that in the
-        future this default ordering for 'auto' will switch, with ``pyarrow``
-        being used if it is installed, and falling back to ``fastparquet``.
+    engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
+        Parquet library to use. Defaults to 'auto', which uses ``pyarrow`` if
+        it is installed, and falls back to ``fastparquet`` otherwise.

     Returns
     -------
@@ -1120,23 +1100,14 @@ def get_engine(engine, _warn_engine_default_changing=False):

     if engine == "auto":
         try:
-            engine = get_engine("fastparquet")
-            if _warn_engine_default_changing and importlib.util.find_spec("pyarrow"):
-                warnings.warn(
-                    "engine='auto' will switch to using pyarrow by default in "
-                    "a future version. To continue using fastparquet even if "
-                    "pyarrow is installed in the future please explicitly "
-                    "specify engine='fastparquet'.",
-                    FutureWarning,
-                )
-            return engine
+            return get_engine("pyarrow")
         except RuntimeError:
             pass

         try:
-            return get_engine("pyarrow")
+            return get_engine("fastparquet")
         except RuntimeError:
-            raise RuntimeError("Please install either fastparquet or pyarrow") from None
+            raise RuntimeError("Please install either pyarrow or fastparquet") from None

     elif engine == "fastparquet":
         import_required("fastparquet", "`fastparquet` not installed")
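For reference, the new selection order in `get_engine` can be observed directly. A hedged sketch (the `ArrowDatasetEngine` class path is my assumption about the pyarrow backend in this dask version, not something this diff states; `FastParquetEngine` is confirmed by the tests below):

    >>> from dask.dataframe.io.parquet.core import get_engine
    >>> get_engine("auto")         # pyarrow is now tried first
    <class 'dask.dataframe.io.parquet.arrow.ArrowDatasetEngine'>
    >>> get_engine("fastparquet")  # explicit selection is unchanged
    <class 'dask.dataframe.io.parquet.fastparquet.FastParquetEngine'>

With neither library installed, `get_engine("auto")` now raises `RuntimeError("Please install either pyarrow or fastparquet")` instead of the old fastparquet-first message.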
27 changes: 0 additions & 27 deletions dask/dataframe/io/tests/test_parquet.py
@@ -156,16 +156,6 @@ def test_get_engine_fastparquet():
     assert get_engine("fastparquet") == FastParquetEngine


-@PYARROW_MARK
-@FASTPARQUET_MARK
-def test_get_engine_auto_warning_if_both_installed():
-    from dask.dataframe.io.parquet.fastparquet import FastParquetEngine
-
-    with pytest.warns(FutureWarning, match="engine='auto' will switch"):
-        engine = get_engine("auto", True)
-    assert engine == FastParquetEngine
-
-
 @write_read_engines()
 @pytest.mark.parametrize("has_metadata", [False, True])
 def test_local(tmpdir, write_engine, read_engine, has_metadata):
@@ -1824,23 +1814,6 @@ def check_compression(engine, filename, compression):
     )


-def test_explicit_compression_default_deprecated(tmpdir, engine):
-    """TODO: remove this test when `compression="default"` is fully removed"""
-    fn = str(tmpdir)
-
-    df = pd.DataFrame({"x": ["a", "b", "c"] * 10, "y": [1, 2, 3] * 10})
-    df.index.name = "index"
-    ddf = dd.from_pandas(df, npartitions=3)
-
-    with pytest.warns(FutureWarning, match="compression='default'"):
-        ddf.to_parquet(
-            fn, compression="default", engine=engine, write_metadata_file=True
-        )
-    out = dd.read_parquet(fn, engine=engine, calculate_divisions=True)
-    assert_eq(out, ddf)
-    check_compression(engine, fn, "default")
-
-
 @pytest.mark.parametrize("compression,", [None, "gzip", "snappy"])
 def test_writing_parquet_with_compression(tmpdir, compression, engine):
     fn = str(tmpdir)
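With the `compression="default"` shim gone, callers should pass a real codec name or simply rely on the `"snappy"` default. A short sketch of valid usage after this change (assuming snappy support is available, as it typically is with pyarrow):

    >>> import pandas as pd
    >>> import dask.dataframe as dd
    >>> ddf = dd.from_pandas(pd.DataFrame({"x": [1, 2, 3]}), npartitions=1)
    >>> ddf.to_parquet("out.parquet")                      # compression defaults to "snappy"
    >>> ddf.to_parquet("out.parquet", compression="gzip")  # or name a codec explicitly
    >>> ddf.to_parquet("out.parquet", compression=None)    # or disable compression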
22 changes: 10 additions & 12 deletions docs/source/dataframe-parquet.rst
@@ -65,16 +65,15 @@ Engine
 ~~~~~~

 :func:`read_parquet` supports two backend engines - ``pyarrow`` and
-``fastparquet``. For historical reasons this defaults to ``fastparquet`` if it
-is installed, and falls back to ``pyarrow`` otherwise. We recommend using
-``pyarrow`` when possible. This can be explicitly set by passing
-``engine="pyarrow"``.
+``fastparquet``. The ``pyarrow`` engine is used by default, falling back to
+``fastparquet`` if ``pyarrow`` isn't installed. If desired, you may explicitly
+specify the engine using the ``engine`` keyword argument:

 .. code-block:: python

     >>> df = dd.read_parquet(
     ...     "s3://bucket-name/my/parquet/",
-    ...     engine="pyarrow"  # explicitly specify the pyarrow engine
+    ...     engine="fastparquet"  # explicitly specify the fastparquet engine
     ... )

 Metadata
@@ -165,7 +164,7 @@ calculating divisions should be avoided for large datasets without a
 global ``_metadata`` file. This is especially true for remote storage.

 For more information about divisions, see :ref:`dataframe.design`.
-
+
 Writing
 -------
@@ -219,16 +218,15 @@ Engine
 ~~~~~~

 :func:`to_parquet` supports two backend engines - ``pyarrow`` and
-``fastparquet``. For historical reasons this defaults to ``fastparquet`` if it
-is installed, and falls back to ``pyarrow`` otherwise. We recommend using
-``pyarrow`` when possible. This can be explicitly set by passing
-``engine="pyarrow"``.
+``fastparquet``. The ``pyarrow`` engine is used by default, falling back to
+``fastparquet`` if ``pyarrow`` isn't installed. If desired, you may explicitly
+specify the engine using the ``engine`` keyword argument:

 .. code-block:: python

     >>> df.to_parquet(
-    ...     "s3://bucket-name/my/parquet/",
-    ...     engine="pyarrow"  # explicitly specify the pyarrow engine
+    ...     "s3://bucket-name/my/parquet/",
+    ...     engine="fastparquet"  # explicitly specify the fastparquet engine
     ... )

 Metadata