Tweak ORC engine= parameter (dask#10746)

mrocklin · Jan 4, 2024 · 6dc8655
1 parent f2d448e
commit 6dc8655
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 8 deletions.
diff --git a/dask/dataframe/io/orc/core.py b/dask/dataframe/io/orc/core.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import copy
+from typing import TYPE_CHECKING, Literal
 
 from fsspec.core import get_fs_token_paths
 from fsspec.utils import stringify_path
@@ -14,6 +15,9 @@
 from dask.highlevelgraph import HighLevelGraph
 from dask.utils import apply
 
+if TYPE_CHECKING:
+    from dask.dataframe.io.orc.arrow import ArrowORCEngine
+
 
 class ORCFunctionWrapper(DataFrameIOFunction):
     """
@@ -55,14 +59,15 @@ def __call__(self, parts):
         return _df
 
 
-def _get_engine(engine, write=False):
-    # Get engine
+def _get_engine(
+    engine: Literal["pyarrow"] | ORCEngine,
+) -> type[ArrowORCEngine] | ORCEngine:
     if engine == "pyarrow":
         from dask.dataframe.io.orc.arrow import ArrowORCEngine
 
         return ArrowORCEngine
     elif not isinstance(engine, ORCEngine):
-        raise TypeError("engine must be 'pyarrow', or an ORCEngine object")
+        raise TypeError("engine must be 'pyarrow' or an ORCEngine object")
     return engine
 
 
@@ -84,7 +89,7 @@ def read_orc(
         Location of file(s), which can be a full URL with protocol
         specifier, and may include glob character if a single string.
     engine: 'pyarrow' or ORCEngine
-        Backend ORC engine to use for IO. Default is "pyarrow".
+        Backend ORC engine to use for I/O. Default is "pyarrow".
     columns: None or list(str)
         Columns to load. If None, loads all.
     index: str
@@ -165,9 +170,8 @@ def to_orc(
     path : string or pathlib.Path
         Destination directory for data.  Prepend with protocol like ``s3://``
         or ``hdfs://`` for remote data.
-    engine : 'pyarrow' or ORCEngine
-        Parquet library to use. If only one library is installed, it will use
-        that one; if both, it will use 'fastparquet'.
+    engine: 'pyarrow' or ORCEngine
+        Backend ORC engine to use for I/O. Default is "pyarrow".
     write_index : boolean, default True
         Whether or not to write the index. Defaults to True.
     storage_options : dict, default None
@@ -189,7 +193,7 @@ def to_orc(
     """
 
     # Get engine
-    engine = _get_engine(engine, write=True)
+    engine = _get_engine(engine)
 
     if hasattr(path, "name"):
         path = stringify_path(path)

diff --git a/dask/dataframe/io/parquet/core.py b/dask/dataframe/io/parquet/core.py
@@ -257,6 +257,8 @@ def read_parquet(
     engine : {'auto', 'pyarrow'}
         Parquet library to use. Defaults to 'auto', which uses ``pyarrow`` if
         it is installed, and falls back to the deprecated ``fastparquet`` otherwise.
+        Note that ``fastparquet`` does not support all functionality offered by
+        ``pyarrow``.
         This is also used by third-party packages (e.g. CuDF) to inject bespoke engines.
     use_nullable_dtypes : {False, True}
         Whether to use extension dtypes for the resulting ``DataFrame``.
@@ -718,6 +720,8 @@ def to_parquet(
     engine : {'auto', 'pyarrow'}
         Parquet library to use. Defaults to 'auto', which uses ``pyarrow`` if
         it is installed, and falls back to the deprecated ``fastparquet`` otherwise.
+        Note that ``fastparquet`` does not support all functionality offered by
+        ``pyarrow``.
         This is also used by third-party packages (e.g. CuDF) to inject bespoke engines.
     compression : string or dict, default 'snappy'
         Either a string like ``"snappy"`` or a dictionary mapping column names