Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add write_parquet to pylibcudf #17263

Merged
merged 30 commits into from
Nov 22, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
ff0d51f
Add writer, supporting objects, and tests; compilation passes
mroeschke Nov 7, 2024
bb2c258
Add fix test, add python method for construction
mroeschke Nov 7, 2024
625e254
Merge remote-tracking branch 'upstream/branch-24.12' into plc/io/parq…
mroeschke Nov 7, 2024
4b8402d
Merge remote-tracking branch 'upstream/branch-24.12' into plc/io/parq…
mroeschke Nov 14, 2024
097decb
Use HostBuffer, rename to c_obj
mroeschke Nov 15, 2024
3133ee1
fix up tests
mroeschke Nov 15, 2024
35984c9
Merge remote-tracking branch 'upstream/branch-24.12' into plc/io/parq…
mroeschke Nov 15, 2024
14c4501
keep table and sink references alive
mroeschke Nov 15, 2024
46cbb46
Return memoryview
mroeschke Nov 15, 2024
46db84e
Adjust test too
mroeschke Nov 15, 2024
efe24d4
Add back contiguous split changes
mroeschke Nov 15, 2024
0d0b5ba
Merge remote-tracking branch 'upstream/branch-24.12' into plc/io/parq…
mroeschke Nov 15, 2024
a0fdcfa
Allow construction of HostBuffer from nullptr
wence- Nov 19, 2024
4d802f2
Parquet writing does not support gzip compression
wence- Nov 19, 2024
91e847e
Use valid values for row_group/max_page_size_bytes
wence- Nov 19, 2024
f2a905e
Skip zero-sized table and non-None partition info
wence- Nov 19, 2024
010c1da
Add type stub information
wence- Nov 19, 2024
da6b730
Merge branch 'branch-24.12' into plc/io/parquet_writer_only
wence- Nov 19, 2024
81c9839
Address reviews
mroeschke Nov 20, 2024
c057bf7
merge conflict
Matt711 Nov 20, 2024
85a5505
Update python/pylibcudf/pylibcudf/io/parquet.pxd
mroeschke Nov 21, 2024
2f65032
Update python/pylibcudf/pylibcudf/io/parquet.pyi
mroeschke Nov 21, 2024
44a7bba
Update python/pylibcudf/pylibcudf/io/parquet.pyx
mroeschke Nov 21, 2024
1564fae
Update python/pylibcudf/pylibcudf/io/parquet.pyx
mroeschke Nov 21, 2024
91ed038
Update python/pylibcudf/pylibcudf/io/parquet.pyx
mroeschke Nov 21, 2024
2faa151
Update python/pylibcudf/pylibcudf/io/parquet.pyx
mroeschke Nov 21, 2024
ed096f3
Update python/pylibcudf/pylibcudf/io/parquet.pyx
mroeschke Nov 21, 2024
66f555f
Update python/pylibcudf/pylibcudf/io/parquet.pyx
mroeschke Nov 21, 2024
0df14b1
address docstring review, reduce test parameterization
mroeschke Nov 21, 2024
741c95c
Merge remote-tracking branch 'upstream/branch-24.12' into plc/io/parq…
mroeschke Nov 21, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 73 additions & 2 deletions python/pylibcudf/pylibcudf/io/parquet.pxd
Original file line number Diff line number Diff line change
@@ -1,14 +1,26 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libc.stdint cimport int64_t
from libc.stdint cimport int64_t, uint8_t
from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from libcpp.vector cimport vector
from pylibcudf.expressions cimport Expression
from pylibcudf.io.types cimport SourceInfo, TableWithMetadata
from pylibcudf.io.types cimport (
compression_type,
dictionary_policy,
statistics_freq,
SinkInfo,
SourceInfo,
TableInputMetadata,
TableWithMetadata,
)
from pylibcudf.libcudf.io.parquet cimport (
chunked_parquet_reader as cpp_chunked_parquet_reader,
parquet_writer_options,
parquet_writer_options_builder,
)
from pylibcudf.libcudf.types cimport size_type
from pylibcudf.table cimport Table
from pylibcudf.types cimport DataType


Expand All @@ -33,3 +45,62 @@ cpdef read_parquet(
# ReaderColumnSchema reader_column_schema = *,
# DataType timestamp_type = *
)

# Extension type that stores a libcudf ``parquet_writer_options`` value
# (cimported from pylibcudf.libcudf.io.parquet) and declares the setters that
# can be applied to it. This file only declares signatures; the bodies live in
# the matching .pyx and must keep exactly these signatures.
cdef class ParquetWriterOptions:
# The wrapped C++ options object.
cdef parquet_writer_options options

# Static factory taking the sink and the table and returning a
# ParquetWriterOptionsBuilder. Declared ``cdef`` (not ``cpdef``), so this
# declaration is reachable from Cython code only.
@staticmethod
cdef ParquetWriterOptionsBuilder builder(SinkInfo sink, Table table)

# ``partitions`` is an untyped Python list at this level.
# NOTE(review): presumably a list of partition-info objects -- confirm
# the expected element type against the .pyx implementation.
cpdef void set_partitions(self, list partitions)

# ``file_paths`` is an untyped Python list at this level.
# NOTE(review): presumably a list of str paths -- confirm in the .pyx.
cpdef void set_column_chunks_file_paths(self, list file_paths)

# NOTE(review): the size setters below all take a C ``int``. Confirm this is
# wide enough for the parameter types of the underlying libcudf setters
# (byte counts in particular may need a wider unsigned type).
cpdef void set_row_group_size_bytes(self, int size_bytes)

cpdef void set_row_group_size_rows(self, int size_rows)

cpdef void set_max_page_size_bytes(self, int size_bytes)

cpdef void set_max_page_size_rows(self, int size_rows)

# NOTE(review): the argument is named ``size_rows`` even though the method
# name carries no ``_rows`` suffix; confirm the unit libcudf expects here and,
# if it is bytes, rename to ``size_bytes`` in both the .pxd and the .pyx.
cpdef void set_max_dictionary_size(self, int size_rows)
bdice marked this conversation as resolved.
Show resolved Hide resolved

# Extension type that stores a libcudf ``parquet_writer_options_builder``
# (cimported from pylibcudf.libcudf.io.parquet). Every option method is
# declared to return a ParquetWriterOptionsBuilder, and ``build`` is declared
# to return a ParquetWriterOptions.
# NOTE(review): the builder-typed return presumably exists so calls can be
# chained -- confirm that the .pyx returns ``self``.
cdef class ParquetWriterOptionsBuilder:
# The wrapped C++ builder object.
cdef parquet_writer_options_builder builder

# Takes a TableInputMetadata (cimported from pylibcudf.io.types).
cpdef ParquetWriterOptionsBuilder metadata(self, TableInputMetadata metadata)

# ``metadata`` is an untyped Python list at this level.
# NOTE(review): presumably one mapping of key/value strings per output
# sink -- confirm the expected element type in the .pyx.
cpdef ParquetWriterOptionsBuilder key_value_metadata(self, list metadata)

# Takes a ``compression_type`` enum value (from pylibcudf.io.types).
cpdef ParquetWriterOptionsBuilder compression(self, compression_type compression)

# Takes a ``statistics_freq`` enum value (from pylibcudf.io.types).
cpdef ParquetWriterOptionsBuilder stats_level(self, statistics_freq sf)

# The remaining option methods take a C++ ``bool`` (libcpp.bool) flag, except
# ``dictionary_policy``, which takes a ``dictionary_policy`` enum value.
cpdef ParquetWriterOptionsBuilder int96_timestamps(self, bool enabled)

cpdef ParquetWriterOptionsBuilder write_v2_headers(self, bool enabled)

cpdef ParquetWriterOptionsBuilder dictionary_policy(self, dictionary_policy val)

cpdef ParquetWriterOptionsBuilder utc_timestamps(self, bool enabled)

cpdef ParquetWriterOptionsBuilder write_arrow_schema(self, bool enabled)

# Produces the ParquetWriterOptions object from this builder.
cpdef ParquetWriterOptions build(self)


# Extension type that owns a ``vector[uint8_t]`` through a ``unique_ptr`` and
# carries the fields needed to describe it as a one-dimensional buffer.
# NOTE(review): the __getbuffer__/__releasebuffer__ implementations are not
# visible in this declaration file -- confirm in the .pyx how ``length``,
# ``shape`` and ``strides`` are populated.
cdef class BufferArrayFromVector:
# Length field for the buffer (presumably the number of bytes in in_vec --
# confirm in the .pyx).
cdef Py_ssize_t length
# The owned byte vector; held by unique_ptr, so this object is its sole owner.
cdef unique_ptr[vector[uint8_t]] in_vec

# these two things declare part of the buffer interface
# (single-element arrays, i.e. room for exactly one dimension)
cdef Py_ssize_t shape[1]
cdef Py_ssize_t strides[1]

# Static factory that receives the byte vector's unique_ptr and returns a
# BufferArrayFromVector. Declared ``cdef``, so it is reachable from Cython
# code only.
@staticmethod
cdef BufferArrayFromVector from_unique_ptr(
unique_ptr[vector[uint8_t]] in_vec
)

# Module-level entry point: takes a ParquetWriterOptions and is declared to
# return a BufferArrayFromVector.
# NOTE(review): what the returned buffer holds is not visible from this
# declaration -- confirm in the .pyx before documenting it for users.
cpdef BufferArrayFromVector write_parquet(ParquetWriterOptions options)
Loading
Loading