ENH: allow using open_arrow with PyCapsule protocol (without pyarrow dependency) #349

Merged
5 changes: 5 additions & 0 deletions CHANGES.md
@@ -7,6 +7,11 @@
- `read_arrow` and `open_arrow` now provide
[GeoArrow-compliant extension metadata](https://geoarrow.org/extension-types.html),
including the CRS, when using GDAL 3.8 or higher (#366).
- The `open_arrow` function can now be used without a `pyarrow` dependency. In
that case, specify `use_pyarrow=False` and the returned reader will be a
generic object implementing the [Arrow PyCapsule Protocol](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html) (i.e. having an `__arrow_c_stream__`
method). This object can then be consumed by your Arrow implementation of choice
that supports this protocol (#349).
- Warn when reading from a multilayer file without specifying a layer (#362).


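For illustration, a minimal consumer-side sketch of the workflow this changelog entry describes (not part of the diff; it uses the `return_pyarrow=False` keyword as implemented in this PR, a hypothetical `data.gpkg` path, and pyarrow >= 14 as one possible protocol-aware consumer):

    import pyarrow as pa

    from pyogrio.raw import open_arrow

    # "data.gpkg" is a placeholder path; the stream must be consumed before
    # the context manager closes the underlying OGR dataset.
    with open_arrow("data.gpkg", return_pyarrow=False) as (meta, reader):
        # `reader` only exposes __arrow_c_stream__; any PyCapsule-aware
        # Arrow implementation can consume it.
        table = pa.table(reader)
    print(meta["crs"], table.num_rows)
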
65 changes: 56 additions & 9 deletions pyogrio/_io.pyx
@@ -18,6 +18,8 @@ from libc.string cimport strlen
from libc.math cimport isnan

cimport cython
from cpython.pycapsule cimport PyCapsule_New, PyCapsule_GetPointer

import numpy as np
cimport numpy as np

@@ -1238,6 +1240,35 @@ def ogr_read(
field_data
)


cdef void pycapsule_array_stream_deleter(object stream_capsule) noexcept:
cdef ArrowArrayStream* stream = <ArrowArrayStream*>PyCapsule_GetPointer(
stream_capsule, 'arrow_array_stream'
)
# Do not invoke the deleter on a used/moved capsule
if stream.release != NULL:
stream.release(stream)

free(stream)


cdef object alloc_c_stream(ArrowArrayStream** c_stream):
c_stream[0] = <ArrowArrayStream*> malloc(sizeof(ArrowArrayStream))
# Ensure the capsule destructor doesn't call a random release pointer
c_stream[0].release = NULL
return PyCapsule_New(c_stream[0], 'arrow_array_stream', &pycapsule_array_stream_deleter)


class _ArrowStream:
def __init__(self, capsule):
self._capsule = capsule

def __arrow_c_stream__(self, requested_schema=None):
if requested_schema is not None:
raise NotImplementedError("requested_schema is not supported")
return self._capsule
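
For orientation, a minimal round-trip sketch of how such a wrapper travels through the PyCapsule protocol (not part of the diff; assumes pyarrow >= 15 for `RecordBatchReader.from_stream`, per the review discussion further down):

    import pyarrow as pa

    class CapsuleWrapper:
        # Same shape as _ArrowStream above: hold a stream capsule and
        # hand it out via the protocol method.
        def __init__(self, capsule):
            self._capsule = capsule

        def __arrow_c_stream__(self, requested_schema=None):
            if requested_schema is not None:
                raise NotImplementedError("requested_schema is not supported")
            return self._capsule

    # Produce a stream capsule from an in-memory table, wrap it, and let a
    # protocol-aware consumer import it again.
    src = pa.table({"a": [1, 2, 3]})
    reader = pa.RecordBatchReader.from_batches(src.schema, src.to_batches())
    wrapped = CapsuleWrapper(reader.__arrow_c_stream__())
    assert pa.RecordBatchReader.from_stream(wrapped).read_all().equals(src)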


@contextlib.contextmanager
def ogr_open_arrow(
str path,
@@ -1256,7 +1287,9 @@
str sql=None,
str sql_dialect=None,
int return_fids=False,
int batch_size=0):
int batch_size=0,
return_pyarrow=True,
):

cdef int err = 0
cdef const char *path_c = NULL
Expand All @@ -1267,7 +1300,7 @@ def ogr_open_arrow(
cdef char **fields_c = NULL
cdef const char *field_c = NULL
cdef char **options = NULL
cdef ArrowArrayStream stream
cdef ArrowArrayStream* stream
cdef ArrowSchema schema

IF CTE_GDAL_VERSION < (3, 6, 0):
@@ -1390,19 +1423,28 @@
# make sure layer is read from beginning
OGR_L_ResetReading(ogr_layer)

if not OGR_L_GetArrowStream(ogr_layer, &stream, options):
raise RuntimeError("Failed to open ArrowArrayStream from Layer")
# allocate the stream struct and wrap in capsule to ensure clean-up on error
if not return_pyarrow:
capsule = alloc_c_stream(&stream)
else:
stream = <ArrowArrayStream*> malloc(sizeof(ArrowArrayStream))

stream_ptr = <uintptr_t> &stream
if not OGR_L_GetArrowStream(ogr_layer, stream, options):
if return_pyarrow:
free(stream)
raise RuntimeError("Failed to open ArrowArrayStream from Layer")

if skip_features:
# only supported for GDAL >= 3.8.0; have to do this after getting
# the Arrow stream
OGR_L_SetNextByIndex(ogr_layer, skip_features)

# stream has to be consumed before the Dataset is closed
import pyarrow as pa
reader = pa.RecordBatchStreamReader._import_from_c(stream_ptr)
if return_pyarrow:
import pyarrow as pa
stream_ptr = <uintptr_t> stream
reader = pa.RecordBatchStreamReader._import_from_c(stream_ptr)
Member: maybe simplify a bit and remove line 1439?

Suggested change:

reader = pa.RecordBatchStreamReader._import_from_c(stream_ptr)
reader = pa.RecordBatchStreamReader._import_from_c(<uintptr_t> stream)

else:
reader = _ArrowStream(capsule)

meta = {
'crs': crs,
@@ -1413,10 +1455,11 @@
'fid_column': fid_column,
}

# stream has to be consumed before the Dataset is closed
yield meta, reader

finally:
if reader is not None:
if return_pyarrow and reader is not None:
# Mark reader as closed to prevent reading batches
reader.close()

@@ -1436,6 +1479,10 @@
GDALClose(ogr_dataset)
ogr_dataset = NULL

if return_pyarrow:
free(stream)


def ogr_read_bounds(
str path,
object layer=None,
1 change: 1 addition & 0 deletions pyogrio/_ogr.pxd
@@ -196,6 +196,7 @@ cdef extern from "arrow_bridge.h":

struct ArrowArrayStream:
int (*get_schema)(ArrowArrayStream* stream, ArrowSchema* out)
void (*release)(ArrowArrayStream*) noexcept nogil


cdef extern from "ogr_api.h":
38 changes: 34 additions & 4 deletions pyogrio/raw.py
@@ -361,17 +361,36 @@ def open_arrow(
sql_dialect=None,
return_fids=False,
batch_size=65_536,
return_pyarrow=True,
Contributor: Thoughts on return_pyarrow vs return_capsule? I'm not sure which is clearer.

Member Author: It doesn't return a capsule exactly, but a small wrapper object with the __arrow_c_stream__ method that will return the capsule.

Side question: would returning the capsule directly be more useful for you?

Member Author: (Although it would be nice to have a keyword name that is "positive" about what you want, instead of indicating that you don't want pyarrow... I just don't have good ideas for that at the moment.)

Contributor:

> (although it would be nice to have a keyword name that is "positive" about what you want ...)

I think this is more along the lines of what I was trying to ask.

I think having a wrapper object is preferable to raw capsules.

Member Author: We could maybe also consider removing the option entirely and always returning the capsule wrapper object. This API is still quite new and more of an advanced API, so we might be OK with still changing it.

Taking the example from the docstring:

with open_arrow(path) as source:
    meta, reader = source
    for table in reader:
        ....

If we always returned the wrapper object as implemented in the PR right now, people using this right now and expecting a pyarrow RecordBatchReader (or an iterable of pyarrow Tables) would need to change their code to:

with open_arrow(path) as source:
    meta, reader = source
    reader = pa.RecordBatchReader.from_stream(reader)
    for table in reader:
        ....

One problem with this, however, is that it requires pyarrow >= 15.0, so that's not ideal.

We could also add the basic methods of a RecordBatchReader to the wrapper object (__enter__, __iter__, read_next_batch, read_all, schema, etc.), just calling the equivalent method of the underlying pyarrow.RecordBatchReader, which we can instantiate lazily only when one of those methods is used; that still allows you to convert the wrapper with __arrow_c_stream__ without requiring pyarrow.

That keeps compatibility for people using the Python APIs of a RecordBatchReader (like in the code snippet above, to iterate through the stream). However, that unfortunately still doesn't work when passing this reader object to some other functionality of pyarrow that can accept a reader but passes it through to C++ code (e.g. writing a dataset with a RecordBatchReader as input).

Member Author: We could also just check if pyarrow is available: if so, return a RecordBatchReader (which will also have the __arrow_c_stream__ method for recent pyarrow), and if not, fall back on returning the wrapper object.

That keeps backwards compatibility perfectly, while allowing use without pyarrow (at the expense of a slightly confusing API, with different behaviour depending only on whether another package is installed).

Although that might also be an extra indirection in case you don't need the pyarrow RecordBatchReader, because what will be returned from __arrow_c_stream__ on the returned reader will be an ArrowArrayStream backed by Arrow wrapping the GDAL stream.

Contributor: I don't think the extra indirection would be a problem.

I tend to think that having multiple possible return types is much more confusing than it's worth for most users. I'm not really a fan of having it depend only on whether pyarrow is installed.

If we're going to have one function, having return_pyarrow=True is, I think, the best option. It's probably too much duplication here, but otherwise I would have considered an open_arrow_pycapsule_interface() or similar function.

Reviewer:

> We could also just check if pyarrow is available, if so return a RecordBatchReader (which will also have the __arrow_c_stream__ method for recent pyarrow), and if not fall back on returning the wrapper object.

Silently returning a different type depending on installed Python packages is generally error-prone for users. I would recommend avoiding this.

Member Author: Good points, agreed that it's better to not do any "smart" return value depending on what is installed.

I updated the PR (and added some docs) for the current state of having a return_pyarrow keyword with a default of True.

I am personally also still fine with changing the default to False, so that you have to ask explicitly for a pyarrow object (avoiding the one line to do it yourself), which also makes the keyword about what you want, not what you want to avoid.

Member: The keyword mentioned in the changelog is use_pyarrow (which I think I prefer) but the implementation is return_pyarrow.

**kwargs,
):
"""
Open OGR data source as a stream of pyarrow record batches.
Open OGR data source as a stream of Arrow record batches.

See docstring of `read` for parameters.

The RecordBatchStreamReader is reading from a stream provided by OGR and must not be
The RecordBatchReader is reading from a stream provided by OGR and must not be
accessed after the OGR dataset has been closed, i.e. after the context manager has
been closed.

By default this function returns a `pyarrow.RecordBatchReader`. Optionally,
you can use this function without a `pyarrow` dependency by specifying
``return_pyarrow=False``. In that case, the returned reader will be a
generic object implementing the `Arrow PyCapsule Protocol`_ (i.e. having
an `__arrow_c_stream__` method). This object can then be consumed by
your Arrow implementation of choice that supports this protocol.

.. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html

Other Parameters
----------------
batch_size : int (default: 65_536)
Maximum number of features to retrieve in a batch.
return_pyarrow : bool (default: True)
If False, return a generic ArrowStream object instead of a pyarrow
RecordBatchReader. This object needs to be passed to another library
supporting the Arrow PyCapsule Protocol to consume the stream of data.

Examples
--------

@@ -384,12 +403,22 @@
>>> for table in reader:
>>> geometries = shapely.from_wkb(table[meta["geometry_name"]])

Or without directly returning a pyarrow object:

>>> with open_arrow(path) as source:
>>> meta, stream = source
>>> reader = pa.RecordBatchReader.from_stream(stream)
>>> for table in reader:
>>> geometries = shapely.from_wkb(table[meta["geometry_name"]])

Returns
-------
(dict, pyarrow.RecordBatchStreamReader)
(dict, pyarrow.RecordBatchReader or ArrowStream)

Returns a tuple of meta information about the data source in a dict,
and a pyarrow RecordBatchStreamReader with data.
and a data stream object (a pyarrow RecordBatchReader if
`return_pyarrow` is set to True, otherwise a generic ArrowStream
object).

Meta is: {
"crs": "<crs>",
@@ -425,6 +454,7 @@
return_fids=return_fids,
dataset_kwargs=dataset_kwargs,
batch_size=batch_size,
return_pyarrow=return_pyarrow,
)
finally:
if buffer is not None:
16 changes: 15 additions & 1 deletion pyogrio/tests/test_arrow.py
@@ -4,8 +4,9 @@
import os

import pytest

import numpy as np

import pyogrio
from pyogrio import __gdal_version__, read_dataframe
from pyogrio.raw import open_arrow, read_arrow, write
from pyogrio.tests.conftest import requires_arrow_api
@@ -207,6 +208,19 @@ def test_read_arrow_geoarrow_metadata(naturalearth_lowres):
assert parsed_meta["crs"]["id"]["code"] == 4326


def test_open_arrow_capsule_protocol(naturalearth_lowres):
pyarrow = pytest.importorskip("pyarrow", minversion="14")

with open_arrow(naturalearth_lowres, return_pyarrow=False) as (meta, reader):
assert isinstance(meta, dict)
assert isinstance(reader, pyogrio._io._ArrowStream)

result = pyarrow.table(reader)

_, expected = read_arrow(naturalearth_lowres)
assert result.equals(expected)
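
A hypothetical companion test (not part of this diff) exercising the default `return_pyarrow=True` path in the same way:

    def test_open_arrow_default_pyarrow(naturalearth_lowres):
        # Hypothetical sketch: the default path returns a pyarrow reader;
        # the stream must be consumed before the context manager exits.
        pyarrow = pytest.importorskip("pyarrow")

        with open_arrow(naturalearth_lowres) as (meta, reader):
            assert isinstance(reader, pyarrow.RecordBatchReader)
            result = reader.read_all()

        _, expected = read_arrow(naturalearth_lowres)
        assert result.equals(expected)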


@contextlib.contextmanager
def use_arrow_context():
original = os.environ.get("PYOGRIO_USE_ARROW", None)