From d8f469fb61af1824a9fe175b6c4bcdc1a0ceda3b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 19 Dec 2024 14:05:04 -0800 Subject: [PATCH] Fix pylibcudf to_arrow with multiple nested data types (#17504) Fixes the following case ```python In [25]: import pyarrow as pa, pylibcudf as plc In [26]: pa_array = pa.array([[{"a": 1}]]) In [27]: pa_array.type Out[27]: ListType(list>) In [28]: plc_table = plc.Table([plc.interop.from_arrow(pa_array)]) In [29]: plc.interop.to_arrow(plc_table) RuntimeError: CUDF failure at: cpp/src/interop/to_arrow_schema.cpp:146: Number of field names and number of children doesn't match ``` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/17504 --- python/pylibcudf/pylibcudf/interop.pyx | 13 +++++++++-- .../pylibcudf/pylibcudf/tests/test_interop.py | 22 +++++++++++++++++++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/python/pylibcudf/pylibcudf/interop.pyx b/python/pylibcudf/pylibcudf/interop.pyx index bd5397ac328..7a102cf0c88 100644 --- a/python/pylibcudf/pylibcudf/interop.pyx +++ b/python/pylibcudf/pylibcudf/interop.pyx @@ -273,10 +273,19 @@ cdef void _release_array(object array_capsule) noexcept: free(array) +def _maybe_create_nested_column_metadata(Column col): + return ColumnMetadata( + children_meta=[ + _maybe_create_nested_column_metadata(child) for child in col.children() + ] + ) + + def _table_to_schema(Table tbl, metadata): if metadata is None: - metadata = [ColumnMetadata() for _ in range(len(tbl.columns()))] - metadata = [ColumnMetadata(m) if isinstance(m, str) else m for m in metadata] + metadata = [_maybe_create_nested_column_metadata(col) for col in tbl.columns()] + else: + metadata = [ColumnMetadata(m) if isinstance(m, str) else m for m in metadata] cdef vector[column_metadata] c_metadata c_metadata.reserve(len(metadata)) diff --git a/python/pylibcudf/pylibcudf/tests/test_interop.py b/python/pylibcudf/pylibcudf/tests/test_interop.py index af80b6e5978..ca42eacdfdb 100644 --- a/python/pylibcudf/pylibcudf/tests/test_interop.py +++ b/python/pylibcudf/pylibcudf/tests/test_interop.py @@ -40,6 +40,28 @@ def test_struct_dtype_roundtrip(): assert arrow_type == struct_type +def test_table_with_nested_dtype_to_arrow(): + pa_array = pa.array([[{"": 1}]]) + plc_table = plc.Table([plc.interop.from_arrow(pa_array)]) + result = plc.interop.to_arrow(plc_table) + expected_schema = pa.schema( + [ + pa.field( + "", + pa.list_( + pa.field( + "", + pa.struct([pa.field("", pa.int64(), nullable=False)]), + nullable=False, + ) + ), + nullable=False, + ) + ] + ) + assert result.schema == expected_schema + + def test_decimal128_roundtrip(): decimal_type = pa.decimal128(10, 2) plc_type = plc.interop.from_arrow(decimal_type)