From a95fbc88f94df24c3418766fbbea5b6633ff2328 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Wed, 18 Dec 2024 16:30:46 -0500
Subject: [PATCH 1/2] Add JSON reader options structs to pylibcudf (#17614)

Part of #17565

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/17614
---
 python/cudf/cudf/io/json.py                   |  46 +-
 python/cudf_polars/cudf_polars/dsl/ir.py      |  10 +-
 python/pylibcudf/pylibcudf/io/json.pxd        |  60 ++-
 python/pylibcudf/pylibcudf/io/json.pyi        |  54 +-
 python/pylibcudf/pylibcudf/io/json.pyx        | 495 +++++++++++++-----
 .../pylibcudf/pylibcudf/tests/io/test_json.py |  57 +-
 6 files changed, 498 insertions(+), 224 deletions(-)

diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py
index e0c9e535e6f..39a85465deb 100644
--- a/python/cudf/cudf/io/json.py
+++ b/python/cudf/cudf/io/json.py
@@ -161,13 +161,15 @@ def read_json(
     if cudf.get_option("io.json.low_memory") and lines:
         res_cols, res_col_names, res_child_names = (
             plc.io.json.chunked_read_json(
-                plc.io.SourceInfo(filepaths_or_buffers),
-                processed_dtypes,
-                c_compression,
-                keep_quotes=keep_quotes,
-                mixed_types_as_string=mixed_types_as_string,
-                prune_columns=prune_columns,
-                recovery_mode=c_on_bad_lines,
+                plc.io.json._setup_json_reader_options(
+                    plc.io.SourceInfo(filepaths_or_buffers),
+                    processed_dtypes,
+                    c_compression,
+                    keep_quotes=keep_quotes,
+                    mixed_types_as_string=mixed_types_as_string,
+                    prune_columns=prune_columns,
+                    recovery_mode=c_on_bad_lines,
+                )
             )
         )
         df = cudf.DataFrame._from_data(
@@ -181,19 +183,23 @@ def read_json(
         return df
     else:
         table_w_meta = plc.io.json.read_json(
-            plc.io.SourceInfo(filepaths_or_buffers),
-            processed_dtypes,
-            c_compression,
-            lines,
-            byte_range_offset=byte_range[0]
-            if byte_range is not None
-            else 0,
-            byte_range_size=byte_range[1] if byte_range is not None else 0,
-            keep_quotes=keep_quotes,
-            mixed_types_as_string=mixed_types_as_string,
-            prune_columns=prune_columns,
-            recovery_mode=c_on_bad_lines,
-            extra_parameters=kwargs,
+            plc.io.json._setup_json_reader_options(
+                plc.io.SourceInfo(filepaths_or_buffers),
+                processed_dtypes,
+                c_compression,
+                lines,
+                byte_range_offset=byte_range[0]
+                if byte_range is not None
+                else 0,
+                byte_range_size=byte_range[1]
+                if byte_range is not None
+                else 0,
+                keep_quotes=keep_quotes,
+                mixed_types_as_string=mixed_types_as_string,
+                prune_columns=prune_columns,
+                recovery_mode=c_on_bad_lines,
+                extra_parameters=kwargs,
+            )
         )

         df = cudf.DataFrame._from_data(
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index b5af3bb80bf..1c1d4860eec 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -604,10 +604,12 @@ def slice_skip(tbl: plc.Table):
                 (name, typ, []) for name, typ in schema.items()
             ]
             plc_tbl_w_meta = plc.io.json.read_json(
-                plc.io.SourceInfo(paths),
-                lines=True,
-                dtypes=json_schema,
-                prune_columns=True,
+                plc.io.json._setup_json_reader_options(
+                    plc.io.SourceInfo(paths),
+                    lines=True,
+                    dtypes=json_schema,
+                    prune_columns=True,
+                )
             )
             # TODO: I don't think cudf-polars supports nested types in general right now
             # (but when it does, we should pass child column names from nested columns in)
diff --git a/python/pylibcudf/pylibcudf/io/json.pxd b/python/pylibcudf/pylibcudf/io/json.pxd
index 4894ca3bd6e..7e446298ba9 100644
--- a/python/pylibcudf/pylibcudf/io/json.pxd
+++ b/python/pylibcudf/pylibcudf/io/json.pxd
@@ -8,6 +8,8 @@ from pylibcudf.io.types cimport (
 )
 from pylibcudf.libcudf.io.json cimport (
     json_recovery_mode_t,
+    json_reader_options,
+    json_reader_options_builder,
     json_writer_options,
     json_writer_options_builder,
 )
@@ -15,19 +17,43 @@ from pylibcudf.libcudf.types cimport size_type
 from pylibcudf.table cimport Table
 
 
-cpdef TableWithMetadata read_json(
-    SourceInfo source_info,
-    list dtypes = *,
-    compression_type compression = *,
-    bool lines = *,
-    size_t byte_range_offset = *,
-    size_t byte_range_size = *,
-    bool keep_quotes = *,
-    bool mixed_types_as_string = *,
-    bool prune_columns = *,
-    json_recovery_mode_t recovery_mode = *,
-    dict extra_parameters = *,
-)
+cdef class JsonReaderOptions:
+    cdef json_reader_options c_obj
+    cdef SourceInfo source
+    cpdef void set_dtypes(self, list types)
+    cpdef void enable_keep_quotes(self, bool keep_quotes)
+    cpdef void enable_mixed_types_as_string(self, bool mixed_types_as_string)
+    cpdef void enable_prune_columns(self, bool prune_columns)
+    cpdef void set_byte_range_offset(self, size_t offset)
+    cpdef void set_byte_range_size(self, size_t size)
+    cpdef void enable_lines(self, bool val)
+    # These hidden options are subject to change without a deprecation cycle.
+    # These are used to test libcudf JSON reader features, not used in cuDF.
+    cpdef void set_delimiter(self, str val)
+    cpdef void enable_dayfirst(self, bool val)
+    cpdef void enable_experimental(self, bool val)
+    cpdef void enable_normalize_single_quotes(self, bool val)
+    cpdef void enable_normalize_whitespace(self, bool val)
+    cpdef void set_strict_validation(self, bool val)
+    cpdef void allow_unquoted_control_chars(self, bool val)
+    cpdef void allow_numeric_leading_zeros(self, bool val)
+    cpdef void allow_nonnumeric_numbers(self, bool val)
+    cpdef void set_na_values(self, list vals)
+
+cdef class JsonReaderOptionsBuilder:
+    cdef json_reader_options_builder c_obj
+    cdef SourceInfo source
+    cpdef JsonReaderOptionsBuilder compression(self, compression_type compression)
+    cpdef JsonReaderOptionsBuilder lines(self, bool val)
+    cpdef JsonReaderOptionsBuilder keep_quotes(self, bool val)
+    cpdef JsonReaderOptionsBuilder byte_range_offset(self, size_t byte_range_offset)
+    cpdef JsonReaderOptionsBuilder byte_range_size(self, size_t byte_range_size)
+    cpdef JsonReaderOptionsBuilder recovery_mode(
+        self, json_recovery_mode_t recovery_mode
+    )
+    cpdef build(self)
+
+cpdef TableWithMetadata read_json(JsonReaderOptions options)
 
 cdef class JsonWriterOptions:
     cdef json_writer_options c_obj
@@ -50,12 +76,6 @@ cdef class JsonWriterOptionsBuilder:
 cpdef void write_json(JsonWriterOptions options)
 
 cpdef tuple chunked_read_json(
-    SourceInfo source_info,
-    list dtypes = *,
-    compression_type compression = *,
-    bool keep_quotes = *,
-    bool mixed_types_as_string = *,
-    bool prune_columns = *,
-    json_recovery_mode_t recovery_mode = *,
+    JsonReaderOptions options,
    int chunk_size= *,
 )
diff --git a/python/pylibcudf/pylibcudf/io/json.pyi b/python/pylibcudf/pylibcudf/io/json.pyi
index e0489742cd0..b84b437a3a2 100644
--- a/python/pylibcudf/pylibcudf/io/json.pyi
+++ b/python/pylibcudf/pylibcudf/io/json.pyi
@@ -19,18 +19,40 @@ ChildNameToTypeMap: TypeAlias = Mapping[str, ChildNameToTypeMap]
 
 NameAndType: TypeAlias = tuple[str, DataType, list[NameAndType]]
 
-def read_json(
-    source_info: SourceInfo,
-    dtypes: list[NameAndType] | None = None,
-    compression: CompressionType = CompressionType.AUTO,
-    lines: bool = False,
-    byte_range_offset: int = 0,
-    byte_range_size: int = 0,
-    keep_quotes: bool = False,
-    mixed_types_as_string: bool = False,
-    prune_columns: bool = False,
-    recovery_mode: JSONRecoveryMode = JSONRecoveryMode.FAIL,
-) -> TableWithMetadata: ...
+class JsonReaderOptions:
+    def set_dtypes(
+        self, types: list[DataType] | list[NameAndType]
+    ) -> None: ...
+    def enable_keep_quotes(self, keep_quotes: bool) -> None: ...
+    def enable_mixed_types_as_string(
+        self, mixed_types_as_string: bool
+    ) -> None: ...
+    def enable_prune_columns(self, prune_columns: bool) -> None: ...
+    def set_byte_range_offset(self, offset: int) -> None: ...
+    def set_byte_range_size(self, size: int) -> None: ...
+    def enable_lines(self, val: bool) -> None: ...
+    def set_delimiter(self, val: str) -> None: ...
+    def enable_dayfirst(self, val: bool) -> None: ...
+    def enable_experimental(self, val: bool) -> None: ...
+    def enable_normalize_single_quotes(self, val: bool) -> None: ...
+    def enable_normalize_whitespace(self, val: bool) -> None: ...
+    def set_strict_validation(self, val: bool) -> None: ...
+    def allow_unquoted_control_chars(self, val: bool) -> None: ...
+    def allow_numeric_leading_zeros(self, val: bool) -> None: ...
+    def allow_nonnumeric_numbers(self, val: bool) -> None: ...
+    def set_na_values(self, vals: list[str]) -> None: ...
+    @staticmethod
+    def builder(source: SourceInfo) -> JsonReaderOptionsBuilder: ...
+
+class JsonReaderOptionsBuilder:
+    def compression(self, compression: CompressionType) -> Self: ...
+    def lines(self, lines: bool) -> Self: ...
+    def byte_range_offset(self, byte_range_offset: int) -> Self: ...
+    def byte_range_size(self, byte_range_size: int) -> Self: ...
+    def recovery_mode(self, recovery_mode: JSONRecoveryMode) -> Self: ...
+    def build(self) -> JsonReaderOptions: ...
+
+def read_json(options: JsonReaderOptions) -> TableWithMetadata: ...
 
 class JsonWriterOptions:
     @staticmethod
@@ -48,12 +70,6 @@ class JsonWriterOptionsBuilder:
 def write_json(options: JsonWriterOptions) -> None: ...
 
 def chunked_read_json(
-    source_info: SourceInfo,
-    dtypes: list[NameAndType] | None = None,
-    compression: CompressionType = CompressionType.AUTO,
-    keep_quotes: bool = False,
-    mixed_types_as_string: bool = False,
-    prune_columns: bool = False,
-    recovery_mode: JSONRecoveryMode = JSONRecoveryMode.FAIL,
+    options: JsonReaderOptions,
     chunk_size: int = 100_000_000,
 ) -> tuple[list[Column], list[str], ChildNameToTypeMap]: ...
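
A minimal usage sketch of the interfaces declared above (illustrative, not
part of the patch; the file name data.jsonl is hypothetical):

    import pylibcudf as plc

    # Build reader options through the new builder API.
    options = (
        plc.io.json.JsonReaderOptions.builder(plc.io.SourceInfo(["data.jsonl"]))
        .lines(True)
        .build()
    )
    # Setters that are not exposed on the builder apply to the built options.
    options.enable_keep_quotes(True)

    tbl_w_meta = plc.io.json.read_json(options)
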
diff --git a/python/pylibcudf/pylibcudf/io/json.pyx b/python/pylibcudf/pylibcudf/io/json.pyx
index 16078b31566..1d8a559afad 100644
--- a/python/pylibcudf/pylibcudf/io/json.pyx
+++ b/python/pylibcudf/pylibcudf/io/json.pyx
@@ -25,6 +25,8 @@ __all__ = [
     "chunked_read_json",
     "read_json",
     "write_json",
+    "JsonReaderOptions",
+    "JsonReaderOptionsBuilder",
     "JsonWriterOptions",
     "JsonWriterOptionsBuilder"
 ]
@@ -51,23 +53,21 @@ cdef map[string, schema_element] _generate_schema_map(list dtypes):
     return schema_map
 
 
-cdef json_reader_options _setup_json_reader_options(
+cpdef JsonReaderOptions _setup_json_reader_options(
     SourceInfo source_info,
     list dtypes,
-    compression_type compression,
-    bool lines,
-    size_t byte_range_offset,
-    size_t byte_range_size,
-    bool keep_quotes,
-    bool mixed_types_as_string,
-    bool prune_columns,
-    json_recovery_mode_t recovery_mode,
-    dict extra_parameters=None):
-
-    cdef vector[string] na_vec
-    cdef vector[data_type] types_vec
-    cdef json_reader_options opts = (
-        json_reader_options.builder(source_info.c_obj)
+    compression_type compression = compression_type.AUTO,
+    bool lines = False,
+    size_t byte_range_offset = 0,
+    size_t byte_range_size = 0,
+    bool keep_quotes = False,
+    bool mixed_types_as_string = False,
+    bool prune_columns = False,
+    json_recovery_mode_t recovery_mode = json_recovery_mode_t.FAIL,
+    dict extra_parameters=None,
+):
+    options = (
+        JsonReaderOptions.builder(source_info)
         .compression(compression)
         .lines(lines)
         .byte_range_offset(byte_range_offset)
@@ -77,88 +77,359 @@ cdef json_reader_options _setup_json_reader_options(
     )
 
     if dtypes is not None:
-        if isinstance(dtypes[0], tuple):
-            opts.set_dtypes(move(_generate_schema_map(dtypes)))
-        else:
-            for dtype in dtypes:
-                types_vec.push_back((<DataType>dtype).c_obj)
-            opts.set_dtypes(types_vec)
+        options.set_dtypes(dtypes)
 
-    opts.enable_keep_quotes(keep_quotes)
-    opts.enable_mixed_types_as_string(mixed_types_as_string)
-    opts.enable_prune_columns(prune_columns)
+    options.enable_keep_quotes(keep_quotes)
+    options.enable_mixed_types_as_string(mixed_types_as_string)
+    options.enable_prune_columns(prune_columns)
 
     # These hidden options are subject to change without a deprecation cycle.
     # These are used to test libcudf JSON reader features, not used in cuDF.
     if extra_parameters is not None:
         for key, value in extra_parameters.items():
             if key == 'delimiter':
-                opts.set_delimiter(ord(value))
+                options.set_delimiter(value)
             elif key == 'dayfirst':
-                opts.enable_dayfirst(value)
+                options.enable_dayfirst(value)
             elif key == 'experimental':
-                opts.enable_experimental(value)
+                options.enable_experimental(value)
             elif key == 'normalize_single_quotes':
-                opts.enable_normalize_single_quotes(value)
+                options.enable_normalize_single_quotes(value)
             elif key == 'normalize_whitespace':
-                opts.enable_normalize_whitespace(value)
+                options.enable_normalize_whitespace(value)
             elif key == 'strict_validation':
-                opts.set_strict_validation(value)
+                options.set_strict_validation(value)
             elif key == 'allow_unquoted_control_chars':
-                opts.allow_unquoted_control_chars(value)
+                options.allow_unquoted_control_chars(value)
             elif key == 'allow_numeric_leading_zeros':
-                opts.allow_numeric_leading_zeros(value)
+                options.allow_numeric_leading_zeros(value)
             elif key == 'allow_nonnumeric_numbers':
-                opts.allow_nonnumeric_numbers(value)
+                options.allow_nonnumeric_numbers(value)
             elif key == 'na_values':
-                for na_val in value:
-                    if isinstance(na_val, str):
-                        na_vec.push_back(na_val.encode())
-                opts.set_na_values(na_vec)
+                options.set_na_values(value)
             else:
                 raise ValueError(
                     "cudf engine doesn't support the "
                     f"'{key}' keyword argument for read_json"
                 )
-    return opts
+    return options
+
+
+cdef class JsonReaderOptions:
+    """
+    The settings to use for ``read_json``
+
+    For details, see :cpp:class:`cudf::io::json_reader_options`
+    """
+    @staticmethod
+    def builder(SourceInfo source):
+        """
+        Create a JsonReaderOptionsBuilder object
+
+        For details, see :cpp:func:`cudf::io::json_reader_options::builder`
+
+        Parameters
+        ----------
+        source : SourceInfo
+            The source to read the JSON file from.
+
+        Returns
+        -------
+        JsonReaderOptionsBuilder
+            Builder to build JsonReaderOptions
+        """
+        cdef JsonReaderOptionsBuilder json_builder = (
+            JsonReaderOptionsBuilder.__new__(JsonReaderOptionsBuilder)
+        )
+        json_builder.c_obj = json_reader_options.builder(source.c_obj)
+        json_builder.source = source
+        return json_builder
+
+    cpdef void set_dtypes(self, list types):
+        """
+        Set data types for columns to be read.
+
+        Parameters
+        ----------
+        types : list
+            List of dtypes or a list of tuples of
+            column names, dtypes, and list of tuples
+            (to support nested column hierarchy)
+
+        Returns
+        -------
+        None
+        """
+        cdef vector[data_type] types_vec
+        if isinstance(types[0], tuple):
+            self.c_obj.set_dtypes(_generate_schema_map(types))
+        else:
+            types_vec.reserve(len(types))
+            for dtype in types:
+                types_vec.push_back((<DataType>dtype).c_obj)
+            self.c_obj.set_dtypes(types_vec)
+
+    cpdef void enable_keep_quotes(self, bool keep_quotes):
+        """
+        Set whether the reader should keep quotes of string values.
+
+        Parameters
+        ----------
+        keep_quotes : bool
+            Boolean value to indicate whether the reader should
+            keep quotes of string values
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.enable_keep_quotes(keep_quotes)
+
+    cpdef void enable_mixed_types_as_string(self, bool mixed_types_as_string):
+        """
+        Set whether to parse mixed types as a string column.
+        Also enables forcing to read a struct as a string column using schema.
+
+        Parameters
+        ----------
+        mixed_types_as_string : bool
+            Boolean value to enable/disable parsing mixed types
+            as a string column
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.enable_mixed_types_as_string(mixed_types_as_string)
+
+    cpdef void enable_prune_columns(self, bool prune_columns):
+        """
+        Set whether to prune columns on read, selected
+        based on the ``set_dtypes`` option.
+
+        Parameters
+        ----------
+        prune_columns : bool
+            When set as true, if the reader options include
+            ``set_dtypes``, then the reader will only return those
+            columns which are mentioned in ``set_dtypes``. If false,
+            then all columns are returned, independent of the
+            ``set_dtypes`` setting.
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.enable_prune_columns(prune_columns)
+
+    cpdef void set_byte_range_offset(self, size_t offset):
+        """
+        Set number of bytes to skip from source start.
+
+        Parameters
+        ----------
+        offset : size_t
+            Number of bytes of offset
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_byte_range_offset(offset)
+
+    cpdef void set_byte_range_size(self, size_t size):
+        """
+        Set number of bytes to read.
+
+        Parameters
+        ----------
+        size : size_t
+            Number of bytes to read
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_byte_range_size(size)
+
+    cpdef void enable_lines(self, bool val):
+        """
+        Set whether to read the file as a json object per line.
+
+        Parameters
+        ----------
+        val : bool
+            Boolean value to enable/disable the option
+            to read each line as a json object
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.enable_lines(val)
+
+    # These hidden options are subject to change without a deprecation cycle.
+    # These are used to test libcudf JSON reader features, not used in cuDF.
+
+    cpdef void set_delimiter(self, str val):
+        self.c_obj.set_delimiter(val.encode())
+
+    cpdef void enable_dayfirst(self, bool val):
+        self.c_obj.enable_dayfirst(val)
+
+    cpdef void enable_experimental(self, bool val):
+        self.c_obj.enable_experimental(val)
+
+    cpdef void enable_normalize_single_quotes(self, bool val):
+        self.c_obj.enable_normalize_single_quotes(val)
+
+    cpdef void enable_normalize_whitespace(self, bool val):
+        self.c_obj.enable_normalize_whitespace(val)
+
+    cpdef void set_strict_validation(self, bool val):
+        self.c_obj.set_strict_validation(val)
+
+    cpdef void allow_unquoted_control_chars(self, bool val):
+        self.c_obj.allow_unquoted_control_chars(val)
+
+    cpdef void allow_numeric_leading_zeros(self, bool val):
+        self.c_obj.allow_numeric_leading_zeros(val)
+
+    cpdef void allow_nonnumeric_numbers(self, bool val):
+        self.c_obj.allow_nonnumeric_numbers(val)
+
+    cpdef void set_na_values(self, list vals):
+        cdef vector[string] vec
+        for val in vals:
+            if isinstance(val, str):
+                vec.push_back(val.encode())
+        self.c_obj.set_na_values(vec)
+
+
+cdef class JsonReaderOptionsBuilder:
+    cpdef JsonReaderOptionsBuilder compression(self, compression_type compression):
+        """
+        Sets compression type.
+
+        Parameters
+        ----------
+        compression : CompressionType
+            The compression type to use
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.compression(compression)
+        return self
+
+    cpdef JsonReaderOptionsBuilder lines(self, bool val):
+        """
+        Set whether to read the file as a json object per line.
+ + Parameters + ---------- + val : bool + Boolean value to enable/disable the option + to read each line as a json object + + Returns + ------- + Self + """ + self.c_obj.lines(val) + return self + + cpdef JsonReaderOptionsBuilder keep_quotes(self, bool val): + """ + Set whether the reader should keep quotes of string values. + + Parameters + ---------- + val : bool + Boolean value to indicate whether the + reader should keep quotes of string values + + Returns + ------- + Self + """ + self.c_obj.keep_quotes(val) + return self + + cpdef JsonReaderOptionsBuilder byte_range_offset(self, size_t byte_range_offset): + """ + Set number of bytes to skip from source start. + + Parameters + ---------- + byte_range_offset : size_t + Number of bytes of offset + + Returns + ------- + Self + """ + self.c_obj.byte_range_offset(byte_range_offset) + return self + + cpdef JsonReaderOptionsBuilder byte_range_size(self, size_t byte_range_size): + """ + Set number of bytes to read. + + Parameters + ---------- + byte_range_size : size_t + Number of bytes to read + + Returns + ------- + Self + """ + self.c_obj.byte_range_size(byte_range_size) + return self + + cpdef JsonReaderOptionsBuilder recovery_mode( + self, + json_recovery_mode_t recovery_mode + ): + """ + Specifies the JSON reader's behavior on invalid JSON lines. + + Parameters + ---------- + recovery_mode : json_recovery_mode_t + An enum value to indicate the JSON reader's + behavior on invalid JSON lines. + + Returns + ------- + Self + """ + self.c_obj.recovery_mode(recovery_mode) + return self + + cpdef build(self): + """Create a JsonReaderOptions object""" + cdef JsonReaderOptions json_options = JsonReaderOptions.__new__( + JsonReaderOptions + ) + json_options.c_obj = move(self.c_obj.build()) + json_options.source = self.source + return json_options cpdef tuple chunked_read_json( - SourceInfo source_info, - list dtypes = None, - compression_type compression = compression_type.AUTO, - bool keep_quotes = False, - bool mixed_types_as_string = False, - bool prune_columns = False, - json_recovery_mode_t recovery_mode = json_recovery_mode_t.FAIL, + JsonReaderOptions options, int chunk_size=100_000_000, ): - """Reads an JSON file into a :py:class:`~.types.TableWithMetadata`. + """ + Reads chunks of a JSON file into a :py:class:`~.types.TableWithMetadata`. Parameters ---------- - source_info : SourceInfo - The SourceInfo object to read the JSON file from. - dtypes : list, default None - Set data types for the columns in the JSON file. - - Each element of the list has the format - (column_name, column_dtype, list of child dtypes), where - the list of child dtypes is an empty list if the child is not - a nested type (list or struct dtype), and is of format - (column_child_name, column_child_type, list of grandchild dtypes). - compression: CompressionType, default CompressionType.AUTO - The compression format of the JSON source. - keep_quotes : bool, default False - Whether the reader should keep quotes of string values. - mixed_types_as_string : bool, default False - If True, mixed type columns are returned as string columns. - If `False` parsing mixed type columns will thrown an error. - prune_columns : bool, default False - Whether to only read columns specified in dtypes. - recover_mode : JSONRecoveryMode, default JSONRecoveryMode.FAIL - Whether to raise an error or set corresponding values to null - when encountering an invalid JSON line. + options : JsonReaderOptions + Settings for controlling reading behavior chunk_size : int, default 100_000_000 bytes. 
The number of bytes to be read in chunks. The chunk_size should be set to at least row_size. @@ -171,20 +442,6 @@ cpdef tuple chunked_read_json( cdef size_type c_range_size = ( chunk_size if chunk_size is not None else 0 ) - cdef json_reader_options opts = _setup_json_reader_options( - source_info=source_info, - dtypes=dtypes, - compression=compression, - lines=True, - byte_range_offset=0, - byte_range_size=0, - keep_quotes=keep_quotes, - mixed_types_as_string=mixed_types_as_string, - prune_columns=prune_columns, - recovery_mode=recovery_mode, - ) - - # Read JSON cdef table_with_metadata c_result final_columns = [] @@ -192,12 +449,13 @@ cpdef tuple chunked_read_json( child_names = None i = 0 while True: - opts.set_byte_range_offset(c_range_size * i) - opts.set_byte_range_size(c_range_size) + options.enable_lines(True) + options.set_byte_range_offset(c_range_size * i) + options.set_byte_range_size(c_range_size) try: with nogil: - c_result = move(cpp_read_json(opts)) + c_result = move(cpp_read_json(options.c_obj)) except (ValueError, OverflowError): break if meta_names is None: @@ -225,75 +483,30 @@ cpdef tuple chunked_read_json( cpdef TableWithMetadata read_json( - SourceInfo source_info, - list dtypes = None, - compression_type compression = compression_type.AUTO, - bool lines = False, - size_t byte_range_offset = 0, - size_t byte_range_size = 0, - bool keep_quotes = False, - bool mixed_types_as_string = False, - bool prune_columns = False, - json_recovery_mode_t recovery_mode = json_recovery_mode_t.FAIL, - dict extra_parameters = None, + JsonReaderOptions options ): - """Reads an JSON file into a :py:class:`~.types.TableWithMetadata`. + """ + Read from JSON format. + + The source to read from and options are encapsulated + by the `options` object. + + For details, see :cpp:func:`read_json`. Parameters ---------- - source_info : SourceInfo - The SourceInfo object to read the JSON file from. - dtypes : list, default None - Set data types for the columns in the JSON file. - - Each element of the list has the format - (column_name, column_dtype, list of child dtypes), where - the list of child dtypes is an empty list if the child is not - a nested type (list or struct dtype), and is of format - (column_child_name, column_child_type, list of grandchild dtypes). - compression: CompressionType, default CompressionType.AUTO - The compression format of the JSON source. - byte_range_offset : size_t, default 0 - Number of bytes to skip from source start. - byte_range_size : size_t, default 0 - Number of bytes to read. By default, will read all bytes. - keep_quotes : bool, default False - Whether the reader should keep quotes of string values. - mixed_types_as_string : bool, default False - If True, mixed type columns are returned as string columns. - If `False` parsing mixed type columns will thrown an error. - prune_columns : bool, default False - Whether to only read columns specified in dtypes. - recover_mode : JSONRecoveryMode, default JSONRecoveryMode.FAIL - Whether to raise an error or set corresponding values to null - when encountering an invalid JSON line. - extra_parameters : dict, default None - Additional hidden parameters to pass to the JSON reader. + options: JsonReaderOptions + Settings for controlling reading behavior Returns ------- TableWithMetadata The Table and its corresponding metadata (column names) that were read in. 
""" - cdef json_reader_options opts = _setup_json_reader_options( - source_info=source_info, - dtypes=dtypes, - compression=compression, - lines=lines, - byte_range_offset=byte_range_offset, - byte_range_size=byte_range_size, - keep_quotes=keep_quotes, - mixed_types_as_string=mixed_types_as_string, - prune_columns=prune_columns, - recovery_mode=recovery_mode, - extra_parameters=extra_parameters, - ) - - # Read JSON cdef table_with_metadata c_result with nogil: - c_result = move(cpp_read_json(opts)) + c_result = move(cpp_read_json(options.c_obj)) return TableWithMetadata.from_libcudf(c_result) diff --git a/python/pylibcudf/pylibcudf/tests/io/test_json.py b/python/pylibcudf/pylibcudf/tests/io/test_json.py index 9b0c5a29fe8..747bbfa1370 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_json.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_json.py @@ -167,9 +167,12 @@ def test_read_json_basic( source.seek(0) res = plc.io.json.read_json( - plc.io.SourceInfo([source]), - compression=compression_type, - lines=lines, + ( + plc.io.json.JsonReaderOptions.builder(plc.io.SourceInfo([source])) + .compression(compression_type) + .lines(lines) + .build() + ) ) # Adjustments to correct for the fact orient=records is lossy @@ -243,9 +246,14 @@ def get_child_types(typ): new_schema = pa.schema(new_fields) - res = plc.io.json.read_json( - plc.io.SourceInfo([source]), dtypes=dtypes, lines=True + options = ( + plc.io.json.JsonReaderOptions.builder(plc.io.SourceInfo([source])) + .lines(True) + .build() ) + options.set_dtypes(dtypes) + + res = plc.io.json.read_json(options) new_table = pa_table.cast(new_schema) # orient=records is lossy @@ -269,10 +277,15 @@ def test_read_json_lines_byte_range(source_or_sink, chunk_size): for chunk_start in range(0, len(json_str.encode("utf-8")), chunk_size): tbls_w_meta.append( plc.io.json.read_json( - plc.io.SourceInfo([source]), - lines=True, - byte_range_offset=chunk_start, - byte_range_size=chunk_start + chunk_size, + ( + plc.io.json.JsonReaderOptions.builder( + plc.io.SourceInfo([source]) + ) + .lines(True) + .byte_range_offset(chunk_start) + .byte_range_size(chunk_start + chunk_size) + .build() + ) ) ) @@ -302,7 +315,12 @@ def test_read_json_lines_keep_quotes(keep_quotes, source_or_sink): write_source_str(source, json_bytes) tbl_w_meta = plc.io.json.read_json( - plc.io.SourceInfo([source]), lines=True, keep_quotes=keep_quotes + ( + plc.io.json.JsonReaderOptions.builder(plc.io.SourceInfo([source])) + .lines(True) + .keep_quotes(keep_quotes) + .build() + ) ) template = "{0}" @@ -330,20 +348,19 @@ def test_read_json_lines_recovery_mode(recovery_mode, source_or_sink): json_str = '{"a":1,"b":10}\n{"a":2,"b":11}\nabc\n{"a":3,"b":12}\n' write_source_str(source, json_str) + options = ( + plc.io.json.JsonReaderOptions.builder(plc.io.SourceInfo([source])) + .lines(True) + .recovery_mode(recovery_mode) + .build() + ) + if recovery_mode == plc.io.types.JSONRecoveryMode.FAIL: with pytest.raises(RuntimeError): - plc.io.json.read_json( - plc.io.SourceInfo([source]), - lines=True, - recovery_mode=recovery_mode, - ) + plc.io.json.read_json(options) else: # Recover case (bad values replaced with nulls) - tbl_w_meta = plc.io.json.read_json( - plc.io.SourceInfo([source]), - lines=True, - recovery_mode=recovery_mode, - ) + tbl_w_meta = plc.io.json.read_json(options) exp = pa.Table.from_arrays( [[1, 2, None, 3], [10, 11, None, 12]], names=["a", "b"] ) From 88df0ad548d664039b2572bac398040e5d70d421 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 18 Dec 2024 17:48:00 -0800 Subject: 
[PATCH 2/2] Replace the outdated cuco window concept with buckets (#17602)

Recently, cuco renamed the "window" concept to "bucket," as the latter more
accurately represents a contiguous memory space containing one or more hash
table slots. This PR implements the necessary changes to replace "window"
with "bucket" in all relevant use cases.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/17602
---
 cpp/src/groupby/hash/compute_groupby.cu      |  2 +-
 .../groupby/hash/compute_mapping_indices.cuh |  6 +++---
 cpp/src/groupby/hash/helpers.cuh             | 16 +++++++--------
 cpp/src/io/orc/dict_enc.cu                   |  6 +++---
 cpp/src/io/orc/orc_gpu.hpp                   | 14 ++++++-------
 cpp/src/io/parquet/chunk_dict.cu             | 20 +++++++++----------
 cpp/src/io/parquet/parquet_gpu.cuh           | 18 ++++++++---------
 cpp/src/io/parquet/writer_impl.cu            |  4 ++--
 8 files changed, 43 insertions(+), 43 deletions(-)

diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu
index e1dbf2a3d9e..9648d942513 100644
--- a/cpp/src/groupby/hash/compute_groupby.cu
+++ b/cpp/src/groupby/hash/compute_groupby.cu
@@ -61,7 +61,7 @@ std::unique_ptr compute_groupby(table_view const& keys,
     d_row_equal,
     probing_scheme_t{d_row_hash},
     cuco::thread_scope_device,
-    cuco::storage<GROUPBY_WINDOW_SIZE>{},
+    cuco::storage<GROUPBY_BUCKET_SIZE>{},
     cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream},
     stream.value()};
 
diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cuh b/cpp/src/groupby/hash/compute_mapping_indices.cuh
index d353830780f..f86a93109be 100644
--- a/cpp/src/groupby/hash/compute_mapping_indices.cuh
+++ b/cpp/src/groupby/hash/compute_mapping_indices.cuh
@@ -106,15 +106,15 @@ CUDF_KERNEL void mapping_indices_kernel(cudf::size_type num_input_rows,
   __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS];
 
   // Shared set initialization
-  __shared__ cuco::window windows[window_extent.value()];
+  __shared__ cuco::bucket buckets[bucket_extent.value()];
 
   auto raw_set = cuco::static_set_ref{
     cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
     global_set.key_eq(),
     probing_scheme_t{global_set.hash_function()},
     cuco::thread_scope_block,
-    cuco::aow_storage_ref{
-      window_extent, windows}};
+    cuco::bucket_storage_ref{
+      bucket_extent, buckets}};
   auto shared_set = raw_set.rebind_operators(cuco::insert_and_find);
 
   auto const block = cooperative_groups::this_thread_block();
diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh
index f950e03e0fb..92925e11bac 100644
--- a/cpp/src/groupby/hash/helpers.cuh
+++ b/cpp/src/groupby/hash/helpers.cuh
@@ -27,7 +27,7 @@ namespace cudf::groupby::detail::hash {
 CUDF_HOST_DEVICE auto constexpr GROUPBY_CG_SIZE = 1;
 
 /// Number of slots per thread
-CUDF_HOST_DEVICE auto constexpr GROUPBY_WINDOW_SIZE = 1;
+CUDF_HOST_DEVICE auto constexpr GROUPBY_BUCKET_SIZE = 1;
 
 /// Thread block size
 CUDF_HOST_DEVICE auto constexpr GROUPBY_BLOCK_SIZE = 128;
@@ -48,9 +48,9 @@ using shmem_extent_t =
   cuco::extent(static_cast(GROUPBY_SHM_MAX_ELEMENTS) * 1.43)>;
 
-/// Number of windows needed by each shared memory hash set
-CUDF_HOST_DEVICE auto constexpr window_extent =
-  cuco::make_window_extent(shmem_extent_t{});
+/// Number of buckets needed by each shared memory hash set
+CUDF_HOST_DEVICE auto constexpr bucket_extent =
+  cuco::make_bucket_extent(shmem_extent_t{});
 
 using row_hash_t = cudf::experimental::row::hash::device_row_hasher,
-  cuco::storage<GROUPBY_WINDOW_SIZE>>;
+  cuco::storage<GROUPBY_BUCKET_SIZE>>;
using nullable_global_set_t = cuco::static_set, @@ -83,7 +83,7 @@ using nullable_global_set_t = cuco::static_set, - cuco::storage>; + cuco::storage>; template using hash_set_ref_t = cuco::static_set_ref< @@ -91,7 +91,7 @@ using hash_set_ref_t = cuco::static_set_ref< cuda::thread_scope_device, row_comparator_t, probing_scheme_t, - cuco::aow_storage_ref>, + cuco::bucket_storage_ref>, Op>; template @@ -100,6 +100,6 @@ using nullable_hash_set_ref_t = cuco::static_set_ref< cuda::thread_scope_device, nullable_row_comparator_t, probing_scheme_t, - cuco::aow_storage_ref>, + cuco::bucket_storage_ref>, Op>; } // namespace cudf::groupby::detail::hash diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index 0cb5c382631..7facc6497ed 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -180,9 +180,9 @@ CUDF_KERNEL void __launch_bounds__(block_size) for (size_type i = 0; i < dict.map_slots.size(); i += block_size) { if (t + i < dict.map_slots.size()) { - auto window = dict.map_slots.begin() + t + i; - // Collect all slots from each window. - for (auto& slot : *window) { + auto bucket = dict.map_slots.begin() + t + i; + // Collect all slots from each bucket. + for (auto& slot : *bucket) { auto const key = slot.first; if (key != KEY_SENTINEL) { auto loc = counter.fetch_add(1, memory_order_relaxed); diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp index 0949fafe9a4..654ee1e012c 100644 --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -47,16 +47,16 @@ using slot_type = cuco::pair; auto constexpr map_cg_size = 1; ///< A CUDA Cooperative Group of 1 thread (set for best performance) to handle each subset. ///< Note: Adjust insert and find loops to use `cg::tile` if increasing this. -auto constexpr window_size = +auto constexpr bucket_size = 1; ///< Number of concurrent slots (set for best performance) handled by each thread. auto constexpr occupancy_factor = 1.43f; ///< cuCollections suggests using a hash map of size ///< N * (1/0.7) = 1.43 to target a 70% occupancy factor. 
-using storage_type = cuco::aow_storage, - cudf::detail::cuco_allocator>; +using storage_type = cuco::bucket_storage, + cudf::detail::cuco_allocator>; using storage_ref_type = typename storage_type::ref_type; -using window_type = typename storage_type::window_type; +using bucket_type = typename storage_type::bucket_type; using slot_type = cuco::pair; auto constexpr KEY_SENTINEL = size_type{-1}; @@ -193,7 +193,7 @@ struct StripeStream { */ struct stripe_dictionary { // input - device_span map_slots; // hash map (windows) storage + device_span map_slots; // hash map (buckets) storage uint32_t column_idx = 0; // column index size_type start_row = 0; // first row in the stripe size_type start_rowgroup = 0; // first rowgroup in the stripe diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index b85ebf2fa1a..b5f9b894c46 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -210,7 +210,7 @@ struct map_find_fn { template CUDF_KERNEL void __launch_bounds__(block_size) - populate_chunk_hash_maps_kernel(device_span const map_storage, + populate_chunk_hash_maps_kernel(device_span const map_storage, cudf::detail::device_2dspan frags) { auto const col_idx = blockIdx.y; @@ -239,7 +239,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) template CUDF_KERNEL void __launch_bounds__(block_size) - collect_map_entries_kernel(device_span const map_storage, + collect_map_entries_kernel(device_span const map_storage, device_span chunks) { auto& chunk = chunks[blockIdx.x]; @@ -251,11 +251,11 @@ CUDF_KERNEL void __launch_bounds__(block_size) if (t == 0) { new (&counter) cuda::atomic{0}; } __syncthreads(); - // Iterate over all windows in the map. + // Iterate over all buckets in the map. for (; t < chunk.dict_map_size; t += block_size) { - auto window = map_storage.data() + chunk.dict_map_offset + t; - // Collect all slots from each window. - for (auto& slot : *window) { + auto bucket = map_storage.data() + chunk.dict_map_offset + t; + // Collect all slots from each bucket. 
+ for (auto& slot : *bucket) { auto const key = slot.first; if (key != KEY_SENTINEL) { auto const loc = counter.fetch_add(1, memory_order_relaxed); @@ -272,7 +272,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) template CUDF_KERNEL void __launch_bounds__(block_size) - get_dictionary_indices_kernel(device_span const map_storage, + get_dictionary_indices_kernel(device_span const map_storage, cudf::detail::device_2dspan frags) { auto const col_idx = blockIdx.y; @@ -302,7 +302,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) s_ck_start_val_idx); } -void populate_chunk_hash_maps(device_span const map_storage, +void populate_chunk_hash_maps(device_span const map_storage, cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { @@ -311,7 +311,7 @@ void populate_chunk_hash_maps(device_span const map_storage, <<>>(map_storage, frags); } -void collect_map_entries(device_span const map_storage, +void collect_map_entries(device_span const map_storage, device_span chunks, rmm::cuda_stream_view stream) { @@ -320,7 +320,7 @@ void collect_map_entries(device_span const map_storage, <<>>(map_storage, chunks); } -void get_dictionary_indices(device_span const map_storage, +void get_dictionary_indices(device_span const map_storage, cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh index 7c09764da2d..800875f7448 100644 --- a/cpp/src/io/parquet/parquet_gpu.cuh +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -34,7 +34,7 @@ using slot_type = cuco::pair; auto constexpr map_cg_size = 1; ///< A CUDA Cooperative Group of 1 thread (set for best performance) to handle each subset. ///< Note: Adjust insert and find loops to use `cg::tile` if increasing this. -auto constexpr window_size = +auto constexpr bucket_size = 1; ///< Number of concurrent slots (set for best performance) handled by each thread. auto constexpr occupancy_factor = 1.43f; ///< cuCollections suggests using a hash map of size ///< N * (1/0.7) = 1.43 to target a 70% occupancy factor. 
@@ -43,12 +43,12 @@ auto constexpr KEY_SENTINEL = key_type{-1}; auto constexpr VALUE_SENTINEL = mapped_type{-1}; auto constexpr SCOPE = cuda::thread_scope_block; -using storage_type = cuco::aow_storage, - cudf::detail::cuco_allocator>; +using storage_type = cuco::bucket_storage, + cudf::detail::cuco_allocator>; using storage_ref_type = typename storage_type::ref_type; -using window_type = typename storage_type::window_type; +using bucket_type = typename storage_type::bucket_type; /** * @brief Return the byte length of parquet dtypes that are physically represented by INT32 @@ -100,7 +100,7 @@ inline size_type __device__ row_to_value_idx(size_type idx, * @param frags Column fragments * @param stream CUDA stream to use */ -void populate_chunk_hash_maps(device_span const map_storage, +void populate_chunk_hash_maps(device_span const map_storage, cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); @@ -111,7 +111,7 @@ void populate_chunk_hash_maps(device_span const map_storage, * @param chunks Flat span of chunks to compact hash maps for * @param stream CUDA stream to use */ -void collect_map_entries(device_span const map_storage, +void collect_map_entries(device_span const map_storage, device_span chunks, rmm::cuda_stream_view stream); @@ -128,7 +128,7 @@ void collect_map_entries(device_span const map_storage, * @param frags Column fragments * @param stream CUDA stream to use */ -void get_dictionary_indices(device_span const map_storage, +void get_dictionary_indices(device_span const map_storage, cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 188e6a8c0d8..6db92462498 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1302,7 +1302,7 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, } else { chunk.use_dictionary = true; chunk.dict_map_size = - static_cast(cuco::make_window_extent( + static_cast(cuco::make_bucket_extent( static_cast(occupancy_factor * chunk.num_values))); chunk.dict_map_offset = total_map_storage_size; total_map_storage_size += chunk.dict_map_size; @@ -1317,7 +1317,7 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, total_map_storage_size, cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}}; // Create a span of non-const map_storage as map_storage_ref takes in a non-const pointer. - device_span const map_storage_data{map_storage.data(), total_map_storage_size}; + device_span const map_storage_data{map_storage.data(), total_map_storage_size}; // Synchronize chunks.host_to_device_async(stream);
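
A minimal sketch of the renamed cuco storage types used throughout this patch
(illustrative, not part of the patch; slot_type, the extent parameter, and the
header paths are assumptions):

    #include <cstddef>
    #include <cuco/bucket_storage.cuh>
    #include <cuco/pair.cuh>

    // One slot per bucket, matching the bucket_size constants defined above.
    using slot_type = cuco::pair<int, int>;
    auto constexpr bucket_size = 1;

    // Formerly cuco::aow_storage with member type window_type: a "bucket" is
    // the same contiguous group of slots under the new name.
    using storage_type = cuco::bucket_storage<slot_type,
                                              bucket_size,
                                              cuco::extent<std::size_t>>;
    using bucket_type = typename storage_type::bucket_type;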