Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Support compression= in DataFrame.to_json #17634

Merged
merged 1 commit into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 19 additions & 11 deletions python/cudf/cudf/io/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,22 @@ def _get_cudf_schema_element_from_dtype(
return lib_type, child_types


def _to_plc_compression(
    compression: Literal["infer", "gzip", "bz2", "zip", "xz", None],
) -> plc.io.types.CompressionType:
    """Translate a pandas-style ``compression`` argument to pylibcudf.

    Parameters
    ----------
    compression : {"infer", "gzip", "bz2", "zip", "xz", None}
        Compression spec as accepted by the cudf JSON reader/writer.

    Returns
    -------
    plc.io.types.CompressionType
        ``NONE`` for ``None``; an explicit codec for the names pylibcudf
        knows; ``AUTO`` otherwise (covers "infer", "xz", and any other
        string, matching the original if/elif chain's fallback).
    """
    if compression is None:
        return plc.io.types.CompressionType.NONE
    explicit = {
        "gzip": plc.io.types.CompressionType.GZIP,
        "bz2": plc.io.types.CompressionType.BZIP2,
        "zip": plc.io.types.CompressionType.ZIP,
    }
    return explicit.get(compression, plc.io.types.CompressionType.AUTO)


@ioutils.doc_read_json()
def read_json(
path_or_buf,
Expand Down Expand Up @@ -115,17 +131,7 @@ def read_json(
if isinstance(source, str) and not os.path.isfile(source):
filepaths_or_buffers[idx] = source.encode()

if compression is not None:
if compression == "gzip":
c_compression = plc.io.types.CompressionType.GZIP
elif compression == "bz2":
c_compression = plc.io.types.CompressionType.BZIP2
elif compression == "zip":
c_compression = plc.io.types.CompressionType.ZIP
else:
c_compression = plc.io.types.CompressionType.AUTO
else:
c_compression = plc.io.types.CompressionType.NONE
c_compression = _to_plc_compression(compression)

if on_bad_lines.lower() == "error":
c_on_bad_lines = plc.io.types.JSONRecoveryMode.FAIL
Expand Down Expand Up @@ -291,6 +297,7 @@ def _plc_write_json(
include_nulls: bool = True,
lines: bool = False,
rows_per_chunk: int = 1024 * 64, # 64K rows
compression: Literal["infer", "gzip", "bz2", "zip", "xz", None] = None,
) -> None:
try:
tbl_w_meta = plc.io.TableWithMetadata(
Expand All @@ -307,6 +314,7 @@ def _plc_write_json(
.na_rep(na_rep)
.include_nulls(include_nulls)
.lines(lines)
.compression(_to_plc_compression(compression))
.build()
)
if rows_per_chunk != np.iinfo(np.int32).max:
Expand Down
9 changes: 9 additions & 0 deletions python/cudf/cudf/tests/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -1453,3 +1453,12 @@ def test_chunked_json_reader():
with cudf.option_context("io.json.low_memory", True):
gdf = cudf.read_json(buf, lines=True)
assert_eq(df, gdf)


@pytest.mark.parametrize("compression", ["gzip", None])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's a compression_params you could use

Suggested change
@pytest.mark.parametrize("compression", ["gzip", None])
@pytest.mark.parametrize("compression", compression_params)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I tried this originally, but it appears gzip is the only supported compression for writing json

def test_roundtrip_compression(compression, tmp_path):
    # Write a small frame with the given compression via the cudf engine,
    # then read it back with the same setting and check it round-trips.
    expected = cudf.DataFrame({"a": 1, "b": "2"})
    fle = BytesIO()
    expected.to_json(fle, engine="cudf", compression=compression)
    # NOTE(review): the buffer is not rewound with fle.seek(0) before
    # reading — presumably cudf.read_json reads the BytesIO from the
    # start regardless of position; confirm.
    result = cudf.read_json(fle, engine="cudf", compression=compression)
    assert_eq(result, expected)
    # NOTE(review): the tmp_path fixture is requested but never used —
    # consider dropping it from the signature.
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/io/json.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ cdef class JsonWriterOptions:
cpdef void set_rows_per_chunk(self, size_type val)
cpdef void set_true_value(self, str val)
cpdef void set_false_value(self, str val)
cpdef void set_compression(self, compression_type comptype)

cdef class JsonWriterOptionsBuilder:
cdef json_writer_options_builder c_obj
Expand All @@ -71,6 +72,7 @@ cdef class JsonWriterOptionsBuilder:
cpdef JsonWriterOptionsBuilder na_rep(self, str val)
cpdef JsonWriterOptionsBuilder include_nulls(self, bool val)
cpdef JsonWriterOptionsBuilder lines(self, bool val)
cpdef JsonWriterOptionsBuilder compression(self, compression_type comptype)
cpdef JsonWriterOptions build(self)

cpdef void write_json(JsonWriterOptions options)
Expand Down
2 changes: 2 additions & 0 deletions python/pylibcudf/pylibcudf/io/json.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,14 @@ class JsonWriterOptions:
def set_rows_per_chunk(self, val: int) -> None: ...
def set_true_value(self, val: str) -> None: ...
def set_false_value(self, val: str) -> None: ...
def set_compression(self, comptype: CompressionType) -> None: ...

class JsonWriterOptionsBuilder:
def metadata(self, tbl_w_meta: TableWithMetadata) -> Self: ...
def na_rep(self, val: str) -> Self: ...
def include_nulls(self, val: bool) -> Self: ...
def lines(self, val: bool) -> Self: ...
def compression(self, comptype: CompressionType) -> Self: ...
def build(self) -> JsonWriterOptions: ...

def write_json(options: JsonWriterOptions) -> None: ...
Expand Down
30 changes: 30 additions & 0 deletions python/pylibcudf/pylibcudf/io/json.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -587,6 +587,20 @@ cdef class JsonWriterOptions:
"""
self.c_obj.set_false_value(val.encode())

cpdef void set_compression(self, compression_type comptype):
    """
    Sets the compression type to be used when writing.

    Parameters
    ----------
    comptype : CompressionType
        Compression type for sink

    Returns
    -------
    None
    """
    self.c_obj.set_compression(comptype)

cdef class JsonWriterOptionsBuilder:
cpdef JsonWriterOptionsBuilder metadata(self, TableWithMetadata tbl_w_meta):
Expand Down Expand Up @@ -653,6 +667,22 @@ cdef class JsonWriterOptionsBuilder:
self.c_obj.lines(val)
return self

cpdef JsonWriterOptionsBuilder compression(self, compression_type comptype):
    """
    Sets the compression type of the output sink.

    Parameters
    ----------
    comptype : CompressionType
        Compression type used

    Returns
    -------
    Self
        This builder, for call chaining.
    """
    self.c_obj.compression(comptype)
    return self

cpdef JsonWriterOptions build(self):
"""Create a JsonWriterOptions object"""
cdef JsonWriterOptions json_options = JsonWriterOptions.__new__(
Expand Down
8 changes: 8 additions & 0 deletions python/pylibcudf/pylibcudf/libcudf/io/json.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,8 @@ cdef extern from "cudf/io/json.hpp" \
size_type get_rows_per_chunk() except +libcudf_exception_handler
string get_true_value() except +libcudf_exception_handler
string get_false_value() except +libcudf_exception_handler
cudf_io_types.compression_type get_compression()\
except +libcudf_exception_handler

# setter
void set_table(
Expand All @@ -181,6 +183,9 @@ cdef extern from "cudf/io/json.hpp" \
void set_rows_per_chunk(size_type val) except +libcudf_exception_handler
void set_true_value(string val) except +libcudf_exception_handler
void set_false_value(string val) except +libcudf_exception_handler
void set_compression(
cudf_io_types.compression_type comptype
) except +libcudf_exception_handler

@staticmethod
json_writer_options_builder builder(
Expand Down Expand Up @@ -218,6 +223,9 @@ cdef extern from "cudf/io/json.hpp" \
json_writer_options_builder& false_value(
string val
) except +libcudf_exception_handler
json_writer_options_builder& compression(
cudf_io_types.compression_type comptype
) except +libcudf_exception_handler

json_writer_options build() except +libcudf_exception_handler

Expand Down
Loading