From 3ac409dc26437deb77d30f64ec148121394878e4 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 19 Aug 2024 21:18:11 -0700
Subject: [PATCH] Fix C++ and Cython io types (#16610)

The C++ I/O types were previously not specifying a base type despite the fact that the Cython code was relying on the base being an int32. This has apparently never bitten us before, but in theory this could go very wrong since it leaves the underlying type up to the compiler and if the C++ binary used something other than an int32 that would result in an ABI incompatibility with the Python build that would produce spurious results.

While fixing this, I also noticed that the Cython contained a number of erroneous (likely outdated) declarations. Since Cython extern declarations are simply an indicate to Cython of how to resolve a function call _if_ it appears in compiled Cython code, these were not causing any build failures because these were all unused APIs, so I removed them from the Cython with no further changes needed.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16610
---
 cpp/include/cudf/io/types.hpp                 | 12 ++---
 .../pylibcudf/pylibcudf/libcudf/io/types.pxd  | 50 ++++++++-----------
 2 files changed, 27 insertions(+), 35 deletions(-)

diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp
index 3df737413fa..a34881942ce 100644
--- a/cpp/include/cudf/io/types.hpp
+++ b/cpp/include/cudf/io/types.hpp
@@ -54,7 +54,7 @@ namespace io {
 /**
  * @brief Compression algorithms
  */
-enum class compression_type {
+enum class compression_type : int32_t {
   NONE,    ///< No compression
   AUTO,    ///< Automatically detect or select compression format
   SNAPPY,  ///< Snappy format, using byte-oriented LZ77
@@ -72,7 +72,7 @@ enum class compression_type {
 /**
  * @brief Data source or destination types
  */
-enum class io_type {
+enum class io_type : int32_t {
   FILEPATH,          ///< Input/output is a file path
   HOST_BUFFER,       ///< Input/output is a buffer in host memory
   DEVICE_BUFFER,     ///< Input/output is a buffer in device memory
@@ -83,7 +83,7 @@ enum class io_type {
 /**
  * @brief Behavior when handling quotations in field data
  */
-enum class quote_style {
+enum class quote_style : int32_t {
   MINIMAL,     ///< Quote only fields which contain special characters
   ALL,         ///< Quote all fields
   NONNUMERIC,  ///< Quote all non-numeric fields
@@ -93,7 +93,7 @@ enum class quote_style {
 /**
  * @brief Column statistics granularity type for parquet/orc writers
  */
-enum statistics_freq {
+enum statistics_freq : int32_t {
   STATISTICS_NONE     = 0,  ///< No column statistics
   STATISTICS_ROWGROUP = 1,  ///< Per-Rowgroup column statistics
   STATISTICS_PAGE     = 2,  ///< Per-page column statistics
@@ -103,7 +103,7 @@ enum statistics_freq {
 /**
  * @brief Valid encodings for use with `column_in_metadata::set_encoding()`
  */
-enum class column_encoding {
+enum class column_encoding : int32_t {
   // Common encodings:
   USE_DEFAULT = -1,  ///< No encoding has been requested, use default encoding
   DICTIONARY,        ///< Use dictionary encoding
@@ -222,7 +222,7 @@ class writer_compression_statistics {
 /**
  * @brief Control use of dictionary encoding for parquet writer
  */
-enum dictionary_policy {
+enum dictionary_policy : int32_t {
   NEVER    = 0,  ///< Never use dictionary encoding
   ADAPTIVE = 1,  ///< Use dictionary when it will not impact compression
   ALWAYS   = 2   ///< Use dictionary regardless of impact on compression
diff --git a/python/pylibcudf/pylibcudf/libcudf/io/types.pxd b/python/pylibcudf/pylibcudf/libcudf/io/types.pxd
index a3d99807876..5f3be2f0727 100644
--- a/python/pylibcudf/pylibcudf/libcudf/io/types.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/io/types.pxd
@@ -6,12 +6,10 @@ cimport pylibcudf.libcudf.table.table_view as cudf_table_view
 from libc.stdint cimport int32_t, uint8_t
 from libcpp cimport bool
 from libcpp.map cimport map
-from libcpp.memory cimport shared_ptr, unique_ptr
-from libcpp.pair cimport pair
+from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
-from pyarrow.includes.libarrow cimport CRandomAccessFile
 from pylibcudf.libcudf.table.table cimport table
 from pylibcudf.libcudf.types cimport size_type
 
@@ -42,32 +40,32 @@ cdef extern from "cudf/io/types.hpp" \
     cpdef enum class io_type(int32_t):
         FILEPATH
         HOST_BUFFER
+        DEVICE_BUFFER
         VOID
         USER_IMPLEMENTED
 
     cpdef enum class statistics_freq(int32_t):
-        STATISTICS_NONE = 0,
-        STATISTICS_ROWGROUP = 1,
-        STATISTICS_PAGE = 2,
-        STATISTICS_COLUMN = 3,
+        STATISTICS_NONE,
+        STATISTICS_ROWGROUP,
+        STATISTICS_PAGE,
+        STATISTICS_COLUMN,
 
     cpdef enum class dictionary_policy(int32_t):
-        NEVER = 0,
-        ADAPTIVE = 1,
-        ALWAYS = 2,
-
-    cdef extern from "cudf/io/types.hpp" namespace "cudf::io" nogil:
-        cpdef enum class column_encoding(int32_t):
-            USE_DEFAULT = -1
-            DICTIONARY = 0
-            PLAIN = 1
-            DELTA_BINARY_PACKED = 2
-            DELTA_LENGTH_BYTE_ARRAY =3
-            DELTA_BYTE_ARRAY = 4
-            BYTE_STREAM_SPLIT = 5
-            DIRECT = 6
-            DIRECT_V2 = 7
-            DICTIONARY_V2 = 8
+        NEVER,
+        ADAPTIVE,
+        ALWAYS,
+
+    cpdef enum class column_encoding(int32_t):
+        USE_DEFAULT
+        DICTIONARY
+        PLAIN
+        DELTA_BINARY_PACKED
+        DELTA_LENGTH_BYTE_ARRAY
+        DELTA_BYTE_ARRAY
+        BYTE_STREAM_SPLIT
+        DIRECT
+        DIRECT_V2
+        DICTIONARY_V2
 
     cdef cppclass column_name_info:
         string name
@@ -76,7 +74,6 @@ cdef extern from "cudf/io/types.hpp" \
     cdef cppclass table_metadata:
         table_metadata() except +
 
-        vector[string] column_names
         map[string, string] user_data
         vector[unordered_map[string, string]] per_file_user_data
         vector[column_name_info] schema_info
@@ -120,10 +117,7 @@ cdef extern from "cudf/io/types.hpp" \
         host_buffer(const char* data, size_t size)
 
     cdef cppclass source_info:
-        io_type type
         const vector[string]& filepaths() except +
-        const vector[host_buffer]& buffers() except +
-        vector[shared_ptr[CRandomAccessFile]] files
 
         source_info() except +
         source_info(const vector[string] &filepaths) except +
@@ -132,9 +126,7 @@ cdef extern from "cudf/io/types.hpp" \
         source_info(const vector[cudf_io_datasource.datasource*] &datasources) except +
 
     cdef cppclass sink_info:
-        io_type type
         const vector[string]& filepaths()
-        const vector[vector[char] *]& buffers()
         const vector[cudf_io_data_sink.data_sink *]& user_sinks()
 
         sink_info() except +