From 3ac409dc26437deb77d30f64ec148121394878e4 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 19 Aug 2024 21:18:11 -0700 Subject: [PATCH] Fix C++ and Cython io types (#16610) The C++ I/O types were previously not specifying a base type despite the fact that the Cython code was relying on the base being an int32. This has apparently never bitten us before, but in theory this could go very wrong since it leaves the underlying type up to the compiler and if the C++ binary used something other than an int32 that would result in an ABI incompatibility with the Python build that would produce spurious results. While fixing this, I also noticed that the Cython contained a number of erroneous (likely outdated) declarations. Since Cython extern declarations are simply an indicate to Cython of how to resolve a function call _if_ it appears in compiled Cython code, these were not causing any build failures because these were all unused APIs, so I removed them from the Cython with no further changes needed. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/16610 --- cpp/include/cudf/io/types.hpp | 12 ++--- .../pylibcudf/pylibcudf/libcudf/io/types.pxd | 50 ++++++++----------- 2 files changed, 27 insertions(+), 35 deletions(-) diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index 3df737413fa..a34881942ce 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -54,7 +54,7 @@ namespace io { /** * @brief Compression algorithms */ -enum class compression_type { +enum class compression_type : int32_t { NONE, ///< No compression AUTO, ///< Automatically detect or select compression format SNAPPY, ///< Snappy format, using byte-oriented LZ77 @@ -72,7 +72,7 @@ enum class compression_type { /** * @brief Data source or destination types */ -enum class io_type { +enum class io_type : int32_t { FILEPATH, ///< Input/output is a file path HOST_BUFFER, ///< Input/output is a buffer in host memory DEVICE_BUFFER, ///< Input/output is a buffer in device memory @@ -83,7 +83,7 @@ enum class io_type { /** * @brief Behavior when handling quotations in field data */ -enum class quote_style { +enum class quote_style : int32_t { MINIMAL, ///< Quote only fields which contain special characters ALL, ///< Quote all fields NONNUMERIC, ///< Quote all non-numeric fields @@ -93,7 +93,7 @@ enum class quote_style { /** * @brief Column statistics granularity type for parquet/orc writers */ -enum statistics_freq { +enum statistics_freq : int32_t { STATISTICS_NONE = 0, ///< No column statistics STATISTICS_ROWGROUP = 1, ///< Per-Rowgroup column statistics STATISTICS_PAGE = 2, ///< Per-page column statistics @@ -103,7 +103,7 @@ enum statistics_freq { /** * @brief Valid encodings for use with `column_in_metadata::set_encoding()` */ -enum class column_encoding { +enum class column_encoding : int32_t { // Common encodings: USE_DEFAULT = -1, ///< No encoding has been requested, use default encoding DICTIONARY, ///< Use dictionary encoding @@ -222,7 +222,7 @@ class writer_compression_statistics { /** * @brief Control use of dictionary encoding for parquet writer */ -enum dictionary_policy { +enum dictionary_policy : int32_t { NEVER = 0, ///< Never use dictionary encoding ADAPTIVE = 1, ///< Use dictionary when it will not impact compression ALWAYS = 2 ///< Use dictionary regardless of impact on compression diff --git a/python/pylibcudf/pylibcudf/libcudf/io/types.pxd b/python/pylibcudf/pylibcudf/libcudf/io/types.pxd index a3d99807876..5f3be2f0727 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/types.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/types.pxd @@ -6,12 +6,10 @@ cimport pylibcudf.libcudf.table.table_view as cudf_table_view from libc.stdint cimport int32_t, uint8_t from libcpp cimport bool from libcpp.map cimport map -from libcpp.memory cimport shared_ptr, unique_ptr -from libcpp.pair cimport pair +from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.unordered_map cimport unordered_map from libcpp.vector cimport vector -from pyarrow.includes.libarrow cimport CRandomAccessFile from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport size_type @@ -42,32 +40,32 @@ cdef extern from "cudf/io/types.hpp" \ cpdef enum class io_type(int32_t): FILEPATH HOST_BUFFER + DEVICE_BUFFER VOID USER_IMPLEMENTED cpdef enum class statistics_freq(int32_t): - STATISTICS_NONE = 0, - STATISTICS_ROWGROUP = 1, - STATISTICS_PAGE = 2, - STATISTICS_COLUMN = 3, + STATISTICS_NONE, + STATISTICS_ROWGROUP, + STATISTICS_PAGE, + STATISTICS_COLUMN, cpdef enum class dictionary_policy(int32_t): - NEVER = 0, - ADAPTIVE = 1, - ALWAYS = 2, - - cdef extern from "cudf/io/types.hpp" namespace "cudf::io" nogil: - cpdef enum class column_encoding(int32_t): - USE_DEFAULT = -1 - DICTIONARY = 0 - PLAIN = 1 - DELTA_BINARY_PACKED = 2 - DELTA_LENGTH_BYTE_ARRAY =3 - DELTA_BYTE_ARRAY = 4 - BYTE_STREAM_SPLIT = 5 - DIRECT = 6 - DIRECT_V2 = 7 - DICTIONARY_V2 = 8 + NEVER, + ADAPTIVE, + ALWAYS, + + cpdef enum class column_encoding(int32_t): + USE_DEFAULT + DICTIONARY + PLAIN + DELTA_BINARY_PACKED + DELTA_LENGTH_BYTE_ARRAY + DELTA_BYTE_ARRAY + BYTE_STREAM_SPLIT + DIRECT + DIRECT_V2 + DICTIONARY_V2 cdef cppclass column_name_info: string name @@ -76,7 +74,6 @@ cdef extern from "cudf/io/types.hpp" \ cdef cppclass table_metadata: table_metadata() except + - vector[string] column_names map[string, string] user_data vector[unordered_map[string, string]] per_file_user_data vector[column_name_info] schema_info @@ -120,10 +117,7 @@ cdef extern from "cudf/io/types.hpp" \ host_buffer(const char* data, size_t size) cdef cppclass source_info: - io_type type const vector[string]& filepaths() except + - const vector[host_buffer]& buffers() except + - vector[shared_ptr[CRandomAccessFile]] files source_info() except + source_info(const vector[string] &filepaths) except + @@ -132,9 +126,7 @@ cdef extern from "cudf/io/types.hpp" \ source_info(const vector[cudf_io_datasource.datasource*] &datasources) except + cdef cppclass sink_info: - io_type type const vector[string]& filepaths() - const vector[vector[char] *]& buffers() const vector[cudf_io_data_sink.data_sink *]& user_sinks() sink_info() except +