From 1802a8b9b9c3ea07766fef3a723749fbf65f767d Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 3 Sep 2024 23:44:00 +0000 Subject: [PATCH 1/3] Change mixed type as string to have higher priority over schema --- cpp/src/io/json/json_column.cu | 6 +++--- cpp/tests/io/json/json_test.cpp | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 8d6890045be..54454da785e 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -987,15 +987,15 @@ std::pair, std::vector> device_json_co data_type target_type{}; - if (schema.has_value()) { + if (json_col.forced_as_string_column) { + target_type = data_type{type_id::STRING}; + } else if (schema.has_value()) { #ifdef NJP_DEBUG_PRINT std::cout << "-> explicit type: " << (schema.has_value() ? std::to_string(static_cast(schema->type.id())) : "n/a"); #endif target_type = schema.value().type; - } else if (json_col.forced_as_string_column) { - target_type = data_type{type_id::STRING}; } // Infer column type, if we don't have an explicit type for it else { diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index c26e5ca3edb..9b73af9bbfd 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -2776,4 +2776,32 @@ TEST_F(JsonReaderTest, JSONMixedTypeChildren) } } +TEST_F(JsonReaderTest, MixedTypesWithSchema) +{ + std::string data = "{\"data\": {\"A\": 0, \"B\": 1}}\n{\"data\": [1,0]}\n"; + + std::map data_types; + data_types.insert( + std::pair{"data", cudf::io::schema_element{cudf::data_type{cudf::type_id::LIST}}}); + + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) + .dtypes(data_types) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + .normalize_single_quotes(true) + .normalize_whitespace(true) + .mixed_types_as_string(true) 
+ .strict_validation(true) + .keep_quotes(true) + .lines(true); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 1); + EXPECT_EQ(result.tbl->num_rows(), 2); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRING); + // expected output without whitespace + cudf::test::strings_column_wrapper expected({R"({"A":0,"B":1})", "[1,0]"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result.tbl->get_column(0)); +} + CUDF_TEST_PROGRAM_MAIN() From 55c84d3fe2a1e259b2eaa8af8b1d631fad37cb42 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Thu, 5 Sep 2024 02:05:03 -0500 Subject: [PATCH 2/3] fix merge issues --- cpp/tests/io/json/json_test.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 9b73af9bbfd..1127f9ad641 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -2788,10 +2788,7 @@ TEST_F(JsonReaderTest, MixedTypesWithSchema) cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) .dtypes(data_types) .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) - .normalize_single_quotes(true) - .normalize_whitespace(true) .mixed_types_as_string(true) - .strict_validation(true) .keep_quotes(true) .lines(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); @@ -2800,7 +2797,7 @@ TEST_F(JsonReaderTest, MixedTypesWithSchema) EXPECT_EQ(result.tbl->num_rows(), 2); EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRING); // expected output without whitespace - cudf::test::strings_column_wrapper expected({R"({"A":0,"B":1})", "[1,0]"}); + cudf::test::strings_column_wrapper expected({R"({"A": 0, "B": 1})", "[1,0]"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result.tbl->get_column(0)); } From cdc417e7bb457e5fa4e4af1c566c655b92903e0a Mon Sep 17 00:00:00 2001 
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Thu, 5 Sep 2024 02:09:03 -0500 Subject: [PATCH 3/3] Update doc mixed type as string in json.hpp --- cpp/include/cudf/io/json.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index fde1857cb7f..2534140e326 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -376,6 +376,7 @@ class json_reader_options { /** * @brief Set whether to parse mixed types as a string column. * Also enables forcing to read a struct as string column using schema. + * If enabled, mixed types are parsed as a string column regardless of schema. * * @param val Boolean value to enable/disable parsing mixed types as a string column */