From 851e3208f09243bf28692bdfb1c1e72638302c23 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 24 Jul 2024 16:31:33 -0700 Subject: [PATCH 1/5] Run thread parallel kernel depending on number of paths, not input size Signed-off-by: Nghia Truong --- src/main/cpp/src/get_json_object.cu | 32 +++++++++++------------------ 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/src/main/cpp/src/get_json_object.cu b/src/main/cpp/src/get_json_object.cu index 29de3be3bb..5fa899c8c2 100644 --- a/src/main/cpp/src/get_json_object.cu +++ b/src/main/cpp/src/get_json_object.cu @@ -908,8 +908,8 @@ void launch_kernel(bool exec_thread_parallel, // the performance is really bad. This essentially tells NVCC to prefer using lots // of registers over spilling. if (exec_thread_parallel) { - constexpr int block_size = 256; - constexpr int min_block_per_sm = 1; + constexpr int block_size = 128; + constexpr int min_block_per_sm = 8; auto const num_blocks = cudf::util::div_rounding_up_safe(static_cast(input.size()) * path_data.size(), static_cast(block_size)); @@ -1027,21 +1027,14 @@ std::vector> get_json_object( auto const [d_json_paths, h_json_paths, d_inst_names, h_inst_names] = construct_path_commands(json_paths, stream); - auto const [max_row_size, sum_row_size] = - thrust::transform_reduce(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input.size()), - cuda::proclaim_return_type>( - [in_offsets] __device__(auto const idx) { - auto const size = in_offsets[idx + 1] - in_offsets[idx]; - return thrust::pair{size, size}; - }), - thrust::pair{0, 0}, - cuda::proclaim_return_type>( - [] __device__(auto const& lhs, auto const& rhs) { - return thrust::pair{ - std::max(lhs.first, rhs.first), lhs.second + rhs.second}; - })); + auto const max_row_size = thrust::transform_reduce( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + cuda::proclaim_return_type( + [in_offsets] __device__(auto const idx) { return in_offsets[idx + 1] - in_offsets[idx]; }), + int64_t{0}, + thrust::maximum{}); // We will use scratch buffers to store the output strings without knowing their sizes. // Since we do not know their sizes, we need to allocate the buffer a bit larger than the input @@ -1082,9 +1075,8 @@ std::vector> get_json_object( rmm::exec_policy(stream), d_has_out_of_bound.begin(), d_has_out_of_bound.end(), 0); // Threshold to decide on using thread parallel or warp parallel algorithms. - constexpr int64_t AVG_CHAR_BYTES_THRESHOLD = 256; - auto const exec_thread_parallel = - (sum_row_size / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD; + constexpr int64_t PATH_SIZE_THRESHOLD = 2; + auto const exec_thread_parallel = json_paths.size() > PATH_SIZE_THRESHOLD; launch_kernel(exec_thread_parallel, *d_input_ptr, d_path_data, stream); // Do not use parallel check since we do not have many elements. From 09d7105815e74168f0b281365aa168395edb97a4 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 24 Jul 2024 22:46:51 -0700 Subject: [PATCH 2/5] Change path size threshold Signed-off-by: Nghia Truong --- src/main/cpp/src/get_json_object.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/cpp/src/get_json_object.cu b/src/main/cpp/src/get_json_object.cu index 5fa899c8c2..22dde0c8a0 100644 --- a/src/main/cpp/src/get_json_object.cu +++ b/src/main/cpp/src/get_json_object.cu @@ -1075,8 +1075,8 @@ std::vector> get_json_object( rmm::exec_policy(stream), d_has_out_of_bound.begin(), d_has_out_of_bound.end(), 0); // Threshold to decide on using thread parallel or warp parallel algorithms. - constexpr int64_t PATH_SIZE_THRESHOLD = 2; - auto const exec_thread_parallel = json_paths.size() > PATH_SIZE_THRESHOLD; + constexpr int64_t PATH_SIZE_THRESHOLD = 8; + auto const exec_thread_parallel = json_paths.size() >= PATH_SIZE_THRESHOLD; launch_kernel(exec_thread_parallel, *d_input_ptr, d_path_data, stream); // Do not use parallel check since we do not have many elements. From 2432f338a596ef51bbed8d9c10e54648a80c64bc Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 24 Jul 2024 22:47:01 -0700 Subject: [PATCH 3/5] Add test data Signed-off-by: Nghia Truong --- src/main/cpp/benchmarks/get_json_object.cu | 1216 +++++++++++++++++++- 1 file changed, 1211 insertions(+), 5 deletions(-) diff --git a/src/main/cpp/benchmarks/get_json_object.cu b/src/main/cpp/benchmarks/get_json_object.cu index 51f9299dba..cbcd266d64 100644 --- a/src/main/cpp/benchmarks/get_json_object.cu +++ b/src/main/cpp/benchmarks/get_json_object.cu @@ -17,8 +17,11 @@ #include #include +#include #include +#include +#include #include #include #include @@ -26,6 +29,56 @@ #include #include +#include +#include +#include + +// #define printf(...) void(0) +// #define fflush(...) void(0) + +class Timer { + protected: + using Clock = std::chrono::high_resolution_clock; + + public: + Timer() = default; + virtual ~Timer() = default; + + void tick() + { + assert(!m_TimerTicked); + m_StartTime = Clock::now(); + m_TimerTicked = true; + } + + double tock() + { + assert(m_TimerTicked); + m_EndTime = Clock::now(); + m_TimerTicked = false; + m_ElapsedTime = std::chrono::duration(m_EndTime - m_StartTime).count(); + + return m_ElapsedTime; + } + + std::string getRunTime() + { + if (m_TimerTicked) { tock(); } + m_StrBuilder.str(""); + m_StrBuilder << std::to_string(m_ElapsedTime); + m_StrBuilder << "ms"; + return m_StrBuilder.str(); + } + + private: + Clock::time_point m_StartTime; + Clock::time_point m_EndTime; + std::stringstream m_StrBuilder; + + double m_ElapsedTime{0.0}; + bool m_TimerTicked{false}; +}; + // #define DEBUG_PRINT #ifdef DEBUG_PRINT @@ -86,7 +139,7 @@ std::vector to_host_strings(CV const& c) } // namespace #endif // #ifdef DEBUG_PRINT -constexpr auto list_depth = 2; +constexpr auto list_depth = 1; constexpr auto min_width = 10; constexpr auto max_width = 10; @@ -100,7 +153,8 @@ auto generate_input(std::size_t size_bytes, cudf::size_type max_depth) .list_depth(list_depth) .list_type(cudf::type_id::STRING) .struct_depth(max_depth > list_depth ? max_depth - list_depth : 1) - .struct_types(std::vector{cudf::type_id::LIST}); + .struct_types(std::vector{ + cudf::type_id::LIST, cudf::type_id::STRING, cudf::type_id::INT32}); auto const input_table = create_random_table( std::vector{cudf::type_id::INT32, cudf::type_id::STRING, cudf::type_id::STRUCT}, @@ -133,7 +187,7 @@ auto generate_input(std::size_t size_bytes, cudf::size_type max_depth) return std::move(json_strings); } -void BM_get_json_object(nvbench::state& state) +void BM_get_json_object_single(nvbench::state& state) { auto const size_bytes = static_cast(state.get_int64("size_bytes")); auto const max_depth = static_cast(state.get_int64("max_depth")); @@ -163,7 +217,1159 @@ void BM_get_json_object(nvbench::state& state) state.add_global_memory_reads(size_bytes); } -NVBENCH_BENCH(BM_get_json_object) - .set_name("get_json_object") +#if 0 +void BM_get_json_object_multiple(nvbench::state& state) +{ + auto const size_bytes = static_cast(state.get_int64("size_bytes")); + auto const num_paths = state.get_int64("num_paths"); + auto const json_strings = generate_input(size_bytes, 2 /*max_depth*/); + + using path_instruction_type = spark_rapids_jni::path_instruction_type; + using instruction_array = std::vector>; + std::vector instructions_arrays; + + instructions_arrays.emplace_back(); + instructions_arrays.back().emplace_back(path_instruction_type::NAMED, "int32", -1); + + if (num_paths > 1) { + instructions_arrays.emplace_back(); + instructions_arrays.back().emplace_back(path_instruction_type::NAMED, "string", -1); + } + + if (num_paths > 2) { + instructions_arrays.emplace_back(); + instructions_arrays.back().emplace_back(path_instruction_type::NAMED, "struct", -1); + instructions_arrays.back().emplace_back(path_instruction_type::NAMED, "0", -1); + } + + if (num_paths > 3) { + instructions_arrays.emplace_back(); + instructions_arrays.back().emplace_back(path_instruction_type::NAMED, "struct", -1); + instructions_arrays.back().emplace_back(path_instruction_type::NAMED, "2", -1); + } + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + // Can also verify at https://jsonpath.com/. +#if 1 + [[maybe_unused]] auto const output = spark_rapids_jni::get_json_object( + cudf::strings_column_view{json_strings->view()}, instructions_arrays); +#else + for (int64_t i = 0; i < num_paths; ++i) { + auto const& instructions = instructions_arrays[i]; + [[maybe_unused]] auto const output = spark_rapids_jni::get_json_object( + cudf::strings_column_view{json_strings->view()}, instructions); +#ifdef DEBUG_PRINT + { + auto const strs = to_host_strings(output->view()); + std::cout << "First output row: \n" << strs.front() << std::endl << std::endl << std::endl; + } +#endif // #ifdef DEBUG_PRINT + } +#endif + }); + state.add_global_memory_reads(size_bytes); +} +#endif + +#if 1 +using spark_rapids_jni::path_instruction_type; +std::vector>> generate_paths0() +{ + return std::vector>>{ + std::vector>{ + {static_cast(2), "EIFGPHGLOPELFBN", -1}} + // + }; +} + +std::vector>> generate_paths1() +{ + return std::vector>>{ + std::vector>{ + {static_cast(1), "", 0}, + {static_cast(2), "GHPKNICLNDAGCNDBMFGEK", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "KIKNFPPAPGDO", -1}, + {static_cast(2), "KLFALIBALPPK", -1}, + {static_cast(2), "HGABIFNPHAHHGP", -1}} + // + }; +} + +std::vector>> generate_paths2() +{ + auto tmp = std::vector>>{ + std::vector>{ + {static_cast(2), "JEBEDJPKEFHPHGLLGPM", -1}}, + + std::vector>{ + {static_cast(2), "FLMEPG", -1}, + {static_cast(2), "CGEGPD", -1}}, + + std::vector>{ + {static_cast(2), "JACICCCIMMHJHKPDED", -1}, + {static_cast(2), "ACHCPIHLFCPHMBPNKJNOLNO", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(0), "", -1}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "JACICCCIMMHJHKPDED", -1}, + {static_cast(2), "OGGC", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "MDGA", -1}}, + + std::vector>{ + {static_cast(2), "AGHF", -1}, + {static_cast(2), "DPKEAPDACLPHGPEMH", -1}}, + + std::vector>{ + {static_cast(2), "AGHF", -1}, + {static_cast(2), "ONNILHPABGIKKFJOEK", -1}}, + + std::vector>{ + {static_cast(2), "AGHF", -1}, + {static_cast(2), "FFFPOENCNBBNOOMOJGDBNIPD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "POFNDBFHDEJ", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(0), "", -1}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "PIGOFCPIPPBNNB", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "CCBJKBHGPBJCKFPCBHGLOAFE", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "LMPCGHBIJGCIPDPNELPBCOP", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "PKBGI", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "ILPIJKBLDB", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "GHBBEOAC", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "EKGPKGCJPMI", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "BDEGLFGMCPKOCNDGJMFPANNBPK", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "LILJMMPPO", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "EAGCHCMLMOLGJK", -1}, + {static_cast(2), "BEACAHEBBO", -1}, + {static_cast(2), "BNLFCI", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "EAGCHCMLMOLGJK", -1}, + {static_cast(2), "BEACAHEBBO", -1}, + {static_cast(2), "GPIHMJ", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "EAGCHCMLMOLGJK", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GJFKCFJELPJEDBAD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "EAGCHCMLMOLGJK", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "DLJPDEPFEKDCKBI", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(0), "", -1}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "PMJPCGCHAALKBPKHDM", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "OCFGAF", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "GMJICFMBNPLBEOLMGDN", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "CBMI", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "NPAGLLFCHAI", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "LFKAJEPMJPLGLICEEMAHFEJGPLGIAKPIOPPP", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "HGNHKIOEGKIJJJPEC", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "JAGGKPKOICKOBABAJPNHF", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "PLEJAKDBBGLCDLGDIBHPPBHB", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "MMNHNPKGLLBJMAOGOCBEOIOKIM", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "JLKDBLFFFPPCNANBKMELJKFOPKPNC", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "OCJGMOAJJKBKNCHOJKBJG", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "PMOAGIJAFOGGLINIOEBFGHBN", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "JPDILOFKPCNBKDB", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "CPBFNDGC", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "KPOPPCFLFCNAPIJEDJDGGFBOPLDCMLLGOMO", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "LBDGCNJNOGMJPNHMLLBMA", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "EIHBDLNJDOAHPMCNGGLLEF", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "GIPPDMMAFOBAALMHMGJBM", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "FKBODHACMMGHL", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "KMEJHDA", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "FKBODHACMMGHL", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "CJKIKCGA", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "HFFDKEDMFBAKEHHM", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "KGJLLAPHJNKCEOIAMCAABCJP", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 1}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "KLJNBPLECGCA", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "NBJNFKKKCHEGCABDGKG", -1}, + {static_cast(2), "BEACAHEBBO", -1}, + {static_cast(2), "BNLFCI", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "NBJNFKKKCHEGCABDGKG", -1}, + {static_cast(2), "BEACAHEBBO", -1}, + {static_cast(2), "GPIHMJ", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "NBJNFKKKCHEGCABDGKG", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GJFKCFJELPJEDBAD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "NBJNFKKKCHEGCABDGKG", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "AOHKGCPAOGANLKEJDLMIGDD", -1}, + {static_cast(2), "BEACAHEBBO", -1}, + {static_cast(2), "BNLFCI", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "AOHKGCPAOGANLKEJDLMIGDD", -1}, + {static_cast(2), "BEACAHEBBO", -1}, + {static_cast(2), "GPIHMJ", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "AOHKGCPAOGANLKEJDLMIGDD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "IKHLECMHMONKLKIBD", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "PNJPGEHPDLMPBDMFPLKABFFGG", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "IGAJPHHGOENI", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "LDPMFNAGLJGDMFOLAKH", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "KMEJHDA", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "LDPMFNAGLJGDMFOLAKH", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "CJKIKCGA", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "BFAJJIOLJBEOMFKLE", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "DOONHL", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + std::vector>{ + {static_cast(2), "OCIKAF", -1}}, + + std::vector>{ + {static_cast(2), "IBMBCGNOCGCPCEN", -1}, + {static_cast(0), "", -1}, + {static_cast(2), "GLNLBEA", -1}} + // + }; + + // tmp.resize(std::min(8UL, tmp.size())); + return tmp; + // return std::vector>>( + // tmp.begin() + 7, tmp.end()); +} + +std::vector>> generate_paths3() +{ + return std::vector>>{ + std::vector>{ + {static_cast(2), "KPIGLEDEOCFELKLJLAFE", -1}} + // + }; +} + +std::vector>> generate_paths4() +{ + return std::vector>>{ + std::vector>{ + {static_cast(2), "NHKDIEPJNND", -1}, + {static_cast(2), "DPBFKLKAKDHLMDLIONCCLJ", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GPGACKDIBMPAKJMDMJ", -1}}, + + std::vector>{ + {static_cast(2), "NHKDIEPJNND", -1}, + {static_cast(2), "DPBFKLKAKDHLMDLIONCCLJ", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "NOIIFOJOPJP", -1}}, + + std::vector>{ + {static_cast(2), "NHKDIEPJNND", -1}, + {static_cast(2), "DPBFKLKAKDHLMDLIONCCLJ", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "CEJOOHNF", -1}}, + + std::vector>{ + {static_cast(2), "NHKDIEPJNND", -1}, + {static_cast(2), "DPBFKLKAKDHLMDLIONCCLJ", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "HODJK", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "HHKEKMIIGI", -1}}, + + std::vector>{ + {static_cast(2), "NHKDIEPJNND", -1}, + {static_cast(2), "DPBFKLKAKDHLMDLIONCCLJ", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "KDGJICMEANMA", -1}, + {static_cast(0), "", -1}, + {static_cast(2), "ILEADAN", -1}}, + + std::vector>{ + {static_cast(2), "NHKDIEPJNND", -1}, + {static_cast(2), "DPBFKLKAKDHLMDLIONCCLJ", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "OKPLFLHHEBDJELFA", -1}}, + + std::vector>{ + {static_cast(2), "NHKDIEPJNND", -1}, + {static_cast(2), "CHNFGBB", -1}, + {static_cast(2), "KIKNFPPAPGDO", -1}, + {static_cast(2), "KLFALIBALPPK", -1}, + {static_cast(2), "HGABIFNPHAHHGP", -1}}, + + std::vector>{ + {static_cast(2), "NHKDIEPJNND", -1}, + {static_cast(2), "IHIIKIHHMPFL", -1}, + {static_cast(2), "KCCCHAM", -1}, + {static_cast(2), "KCCCHAM", -1}}, + + std::vector>{ + {static_cast(2), "KFPJHMGFEELFG", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "AFHKGOFNFID", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "DPBFKLKAKDHLMDLIONCCLJ", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "CEJOOHNF", -1}}, + + std::vector>{ + {static_cast(2), "KFPJHMGFEELFG", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "AFHKGOFNFID", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "DPBFKLKAKDHLMDLIONCCLJ", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "HODJK", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "HHKEKMIIGI", -1}}, + + std::vector>{ + {static_cast(2), "KFPJHMGFEELFG", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "AFHKGOFNFID", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "DPBFKLKAKDHLMDLIONCCLJ", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "OKPLFLHHEBDJELFA", -1}}, + + std::vector>{ + {static_cast(2), "JJKPNPFMNICGLC", -1}, + {static_cast(2), "GGLF", -1}, + {static_cast(2), "JKKJDAKAB", -1}}, + + std::vector>{ + {static_cast(2), "KPIGLEDEOCFELKLJLAFE", -1}}, + + std::vector>{ + {static_cast(2), "PACKGGMDGCLEHD", -1}, + {static_cast(2), "IAFMNJMMNJPDAAHND", -1}}, + + std::vector>{ + {static_cast(2), "PACKGGMDGCLEHD", -1}, + {static_cast(2), "MNIMBEMMOJFHILDMDBML", -1}} + + // + }; +} + +#endif + +#if 1 + +std::vector>> +generate_paths_test_1() +{ + /* + * '$.NHKDIEPJNND.DPBFKLKAKDHLMDLIONCCLJ[0].OKPLFLHHEBDJELFA' + * '$.NHKDIEPJNND.CHNFGBB.KIKNFPPAPGDO.KLFALIBALPPK.HGABIFNPHAHHGP' + * '$.NHKDIEPJNND.IHIIKIHHMPFL.KCCCHAM.KCCCHAM' + */ + return std::vector>>{ + + std::vector>{ + {static_cast(2), "NHKDIEPJNND", -1}, + {static_cast(2), "DPBFKLKAKDHLMDLIONCCLJ", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "OKPLFLHHEBDJELFA", -1}}, + + std::vector>{ + {static_cast(2), "NHKDIEPJNND", -1}, + {static_cast(2), "CHNFGBB", -1}, + {static_cast(2), "KIKNFPPAPGDO", -1}, + {static_cast(2), "KLFALIBALPPK", -1}, + {static_cast(2), "HGABIFNPHAHHGP", -1}}, + + std::vector>{ + {static_cast(2), "NHKDIEPJNND", -1}, + {static_cast(2), "IHIIKIHHMPFL", -1}, + {static_cast(2), "KCCCHAM", -1}, + {static_cast(2), "KCCCHAM", -1}} + // + }; +} + +std::vector>> +generate_paths_test_2() +{ + /* + * '$.AENBHHGIABBBDDGOEI.EAGCHCMLMOLGJK.BEACAHEBBO.BNLFCI' + * '$.AENBHHGIABBBDDGOEI.EAGCHCMLMOLGJK.BEACAHEBBO.GPIHMJ' + * '$.AENBHHGIABBBDDGOEI.EAGCHCMLMOLGJK.CGEGPD[0].GJFKCFJELPJEDBAD' + * '$.AENBHHGIABBBDDGOEI.EAGCHCMLMOLGJK.CGEGPD[0].GMFDD' + */ + return std::vector>>{ + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "EAGCHCMLMOLGJK", -1}, + {static_cast(2), "BEACAHEBBO", -1}, + {static_cast(2), "BNLFCI", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "EAGCHCMLMOLGJK", -1}, + {static_cast(2), "BEACAHEBBO", -1}, + {static_cast(2), "GPIHMJ", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "EAGCHCMLMOLGJK", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GJFKCFJELPJEDBAD", -1}}, + + std::vector>{ + {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, + {static_cast(2), "EAGCHCMLMOLGJK", -1}, + {static_cast(2), "CGEGPD", -1}, + {static_cast(1), "", 0}, + {static_cast(2), "GMFDD", -1}}, + + // + }; +} + +#endif + +void test(rmm::cuda_stream_view stream, int method, bool warm_up = false) +{ + std::vector>>> + paths(5); +#if 1 + paths[0] = generate_paths0(); + paths[1] = generate_paths1(); + paths[3] = generate_paths3(); +#endif + paths[2] = generate_paths2(); + paths[4] = generate_paths4(); + + printf("Method: %d\n", method); + fflush(stdout); + + { + int idx{0}; + std::size_t count{0}; + for (auto const& path : paths) { + if (warm_up) { + fprintf(stdout, "Path %d, size: %d\n", idx++, (int)path.size()); + } else { + printf("Path %d, size: %d\n", idx++, (int)path.size()); + } + count += path.size(); + } + printf("Total path: %d\n", (int)count); + fflush(stdout); + } + + auto const read_opts = + cudf::io::parquet_reader_options::builder( + cudf::io::source_info{"/home/nghiat/Devel/data/WM_MOCKED_3/data.parquet"}) + .build(); + + auto const limit = 256 * 1024 * 1024UL; + auto reader = cudf::io::chunked_parquet_reader(limit, 4 * limit, read_opts); + + Timer timer; + double test_time = 0; + int num_chunks{0}; + + do { + auto chunk = reader.read_chunk(); + num_chunks++; + stream.synchronize(); + + timer.tick(); + if (method == 0) { + /* Test with 80 paths: + * Test time: 7189.04ms, num chunks: 1 + * Test time: 410163ms, num chunks: 51 + * + * Test with 15 paths: + * Test time: 117.39ms, num chunks: 1 + * Test time: 5112.7ms, num chunks: 51 + */ + for (int i = 0; i < 5; ++i) { + for (auto const& path : paths[i]) { + if (path.size() > 0) { + [[maybe_unused]] auto const output = spark_rapids_jni::get_json_object( + cudf::strings_column_view{chunk.tbl->get_column(i).view()}, path); + } + } + } + } else if (method == 1) { + /* Test with 80 paths: + * Test time: 6896.2ms, num chunks: 1 + * Test time: 385526ms, num chunks: 51 + * + * Test with 15 paths: + * Test time: 144.366ms, num chunks: 1 + * Test time: 4889.46ms, num chunks: 51 + */ + // for (int i = 0; i < 5; ++i) { + // if (paths[i].size() > 0) { + // [[maybe_unused]] auto const output = spark_rapids_jni::get_json_object( + // cudf::strings_column_view{chunk.tbl->get_column(i).view()}, paths[i]); + // } + // } + } else { + for (int i = 0; i < 5; ++i) { + if (paths[i].size() > 0) { + /* + * thread par, max path size: + * 2: 40s + * 4: 17s + * 6: 15s + * 8: 11.8s + * 10: 9.3s + * 16: 7.6s + * 32: 5.9s + * + * warp par, max path size: + * 2: 19s + * 4: 13s + * 6: 13s + * 8: 12s + * 10: 11.5s + * 16: 10.6s + * 32: 10.2s + */ + + constexpr int path_size = 4; + + std::size_t offset = 0; + std::vector>> segment; + segment.reserve(path_size); + while (offset < paths[i].size()) { + auto const last = std::min(offset + path_size, paths[i].size()); + segment.resize(0); + segment.insert(segment.end(), paths[i].begin() + offset, paths[i].begin() + last); + offset += path_size; + + [[maybe_unused]] auto const output = spark_rapids_jni::get_json_object_multiple_paths( + cudf::strings_column_view{chunk.tbl->get_column(i).view()}, segment); + } + } + } + } + stream.synchronize(); + test_time += timer.tock(); + + if (num_chunks > 10) { break; } + + if (warm_up) { + break; // just test one chunk + } + } while (reader.has_next()); + + std::cout << "Test time: " << test_time << "ms, num chunks: " << num_chunks << std::endl; +} + +void BM_get_json_object_multiple(nvbench::state& state) +{ + auto const stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); +#if 0 + test(stream, 0, true); // warm up + test(stream, 0, false); +#else + // test(stream, 0, true); // warm up + // test(stream, 1, true); // warm up + test(stream, 2, true); // warm up + + // test(stream, 0, false); // warm up + // test(stream, 1, false); // warm up + test(stream, 2, false); // warm up +#endif + // 7/8: 32774 ms + // 7/10: 26340 ms +} + +#if 0 +void print_nodes(std::vector const& nodes) +{ + std::cout << "\n\n\nHost data\n"; + int count{0}; + for (auto& node : nodes) { + std::cout << "node: " << count++ << std::endl; + std::cout << " " + << "level: " << node.level << std::endl; + std::cout << " " + << "type: " << static_cast(node.data.type) << std::endl; + std::cout << " " + << "name: " << node.data.name << std::endl; + std::cout << " " + << "index: " << node.data.index << std::endl; + std::cout << " " + << "parent: " << node.parent << std::endl; + std::cout << " output indices: "; + for (auto i : node.output_indices) { + std::cout << i << ", "; + } + std::cout << std::endl; + std::cout << " children: "; + for (auto i : node.children) { + std::cout << i << ", "; + } + std::cout << std::endl; + std::cout << " " + << "first child: " << node.first_child << std::endl; + std::cout << " " + << "num children: " << node.num_children << std::endl; + std::cout << " " + << "next sibling: " << node.next_sibling << std::endl; + } + fflush(stdout); +} + + +void BM_test_path_1(nvbench::state& state) +{ + auto const paths = generate_paths_test_1(); + auto output = spark_rapids_jni::generate_path_nodes(paths); + print_nodes(output); +} + +void BM_test_path_2(nvbench::state& state) +{ + auto const paths = generate_paths_test_2(); + auto output = spark_rapids_jni::generate_path_nodes(paths); + print_nodes(output); +} +#endif + +auto debug_data1() +{ + auto input = cudf::test::strings_column_wrapper{"{'a':1, 'b':2, 'c':[1, 2, 3]}"}; + + auto paths = std::vector>>{ + std::vector>{ + {static_cast(2), "a", -1}}, + std::vector>{ + {static_cast(2), "b", -1}}, + std::vector>{ + {static_cast(2), "c", -1}, + {static_cast(0), "", -1}}, + std::vector>{ + {static_cast(2), "c", -1}, + {static_cast(1), "", 1}} + // + }; + + return std::pair{std::move(input), std::move(paths)}; +} + +auto debug_data2() +{ + auto input = cudf::test::strings_column_wrapper{"{\"k1\":{\"k2\":\"v2\"}}"}; + + auto paths = std::vector>>{ + std::vector>{ + {static_cast(2), "k1", -1}, + {static_cast(2), "k2", -1}} + // + }; + + return std::pair{std::move(input), std::move(paths)}; +} + +auto debug_data3() +{ + auto input = cudf::test::strings_column_wrapper{ + "{\"k1\":{\"k2\":{\"k3\":{\"k4\":{\"k5\":{\"k6\":{\"k7\":{\"k8\":\"v8\"}}}}}}}}"}; + + auto paths = std::vector>>{ + std::vector>{ + {static_cast(2), "k1", -1}, + {static_cast(2), "k2", -1}, + {static_cast(2), "k3", -1}, + {static_cast(2), "k4", -1}, + {static_cast(2), "k5", -1}, + {static_cast(2), "k6", -1}, + {static_cast(2), "k7", -1}, + {static_cast(2), "k8", -1}, + } + // + }; + + return std::pair{std::move(input), std::move(paths)}; +} + +auto debug_data4() +{ + auto input = cudf::test::strings_column_wrapper{ + "{\"brand\":\"ssssss\",\"duratRon\":15,\"eqTosuresurl\":\"\",\"RsZxarthrl\":false," + "\"xonRtorsurl\":\"\",\"xonRtorsurlstOTe\":0,\"TRctures\":[{\"RxaGe\":\"VttTs:\\/\\/" + "feed-RxaGe.baRdu.cox\\/0\\/TRc\\/" + "-196588744s840172444s-773690137.zTG\"}],\"Toster\":\"VttTs:\\/\\/feed-RxaGe.baRdu.cox\\/0\\/" + "TRc\\/" + "-196588744s840172444s-773690137.zTG\",\"reserUed\":{\"bRtLate\":391.79,\"xooUZRke\":26876," + "\"nahrlIeneratRonNOTe\":0,\"useJublRc\":6,\"URdeoRd\":821284086},\"tRtle\":" + "\"ssssssssssmMsssssssssssssssssss\",\"url\":\"s{storehrl}\",\"usersTortraRt\":\"VttTs:\\/\\/" + "feed-RxaGe.baRdu.cox\\/0\\/TRc\\/" + "-6971178959s-664926866s-6096674871.zTG\",\"URdeosurl\":\"http:\\/\\/nadURdeo2.baRdu.cox\\/" + "5fa3893aed7fc0f8231dab7be23efc75s820s6240.xT3\",\"URdeoRd\":821284086}"}; + + auto paths = std::vector>>{ + std::vector>{ + {static_cast(2), "URdeosurl", -1}, + } + // + }; + + return std::pair{std::move(input), std::move(paths)}; +} + +auto debug_data5() +{ + auto input = cudf::test::strings_column_wrapper{ + "[ [[[ {'k': 'v1'} ], {'k': 'v2'}]], [[{'k': 'v3'}], {'k': 'v4'}], {'k': 'v5'} ]"}; + + auto paths = std::vector>>{ + std::vector>{ + {static_cast(0), "", -1}, + {static_cast(0), "", -1}, + {static_cast(2), "k", -1}} + // + }; + + return std::pair{std::move(input), std::move(paths)}; +} + +auto debug_data6() +{ + auto input = cudf::test::strings_column_wrapper{"[ {'k': 'v1'}, {'k': 'v2'} ]"}; + + auto paths = std::vector>>{ + std::vector>{ + {static_cast(0), "", -1}, + {static_cast(2), "k", -1}} + // + }; + + return std::pair{std::move(input), std::move(paths)}; +} + +auto debug_data7() +{ + auto input = cudf::test::strings_column_wrapper{"[1, [21, 22], 3]"}; + + auto paths = std::vector>>{ + std::vector>{ + {static_cast(0), "", -1}, + } + // + }; + + return std::pair{std::move(input), std::move(paths)}; +} + +auto debug_data8() +{ + auto input = cudf::test::strings_column_wrapper{ + "[ {'k': [0, 1]}, {'k': {'a': 'b'}}, {'k': [10, 11, 12]} ]"}; + + auto paths = std::vector>>{ + std::vector>{ + {static_cast(0), "", -1}, + {static_cast(2), "k", -1}}, + std::vector>{ + {static_cast(0), "", -1}, + {static_cast(2), "k", -1}, + {static_cast(0), "", -1}, + } + // + }; + + return std::pair{std::move(input), std::move(paths)}; +} + +auto debug_data9() +{ + auto input = cudf::test::strings_column_wrapper{ + R"({"KAEMAPGKBFGNHPJ":"G2s!8-b@n2p","GKOGAKJDMEMFLJJD":[{"FMCMNJH":"FsZ!","OOPCOILM":".LX4HI1#6ZXQu"},{"FMCMNJH":"p,mt","OOPCOILM":"6zfMlis\"/pH,n-:v\\.;N[","MLDKOIIPGBFAKKLJC":[],"DHLLEABGIMMFBILMCJI":{},"IIEEGFP":9133996313993,"EIFGPHGLOPELFBN":[{"OAJHOPFELNNIJPFBNBG":{"KJKGAGFJNGBIGBM":{"ECHAGOLMEHCN":{"LJNHE":"\"/eP4`=WHg~kUOm2AJoXHk"}},"BJHMGKLP":"h1uU#","ONLPCJKJDIAKK":false,"GACCEMNH":"d+Z%N\\jZ9']wGOl%V9gj@"},"DIMPFKELHHIEHNM":[{"JBMLPPOFI":5,"GAHIANAFHIBONL":"rvtH}C0uFmK\\#A*D)x}I7c@5M'Z_fm6>PviE=(J]T+KlsNuO-\"lxD~.N/9EQ<^l%Qb-eFRBv","PPMNGNHGGONNLFAJKK":{"ECHAGOLMEHCN":{"LJNHE":"kH#D'(VRt)*kH<{ITQ^|By~:6o2","PPMNGNHGGONNLFAJKK":{"ECHAGOLMEHCN":{"LJNHE":""}},"ONBMEJGFIFMPO":"h1uU#"}],"HDDDGJFELLA":true,"KEEBNLDHJAPJNIGHHLPPCCEDJ":[],"BHCNDBIIICDJDB":{"MGAGILEALPHFOPBO":{}}},{"OAJHOPFELNNIJPFBNBG":{"KJKGAGFJNGBIGBM":{"ECHAGOLMEHCN":{"LJNHE":"s[2nryS^V|=D]h5JgNHE+h"}},"BJHMGKLP":"Az[M`Q","ONLPCJKJDIAKK":true,"GACCEMNH":"`GkuDVB)},b9kD+(MUEJU5eSw5f","PPMNGNHGGONNLFAJKK":{"ECHAGOLMEHCN":{"LJNHE":"!mJue a[n}#S:Hn"}},"ONBMEJGFIFMPO":"h1uU#"}],"HDDDGJFELLA":true,"KEEBNLDHJAPJNIGHHLPPCCEDJ":[],"BHCNDBIIICDJDB":{"MGAGILEALPHFOPBO":{}}},{"OAJHOPFELNNIJPFBNBG":{"KJKGAGFJNGBIGBM":{"ECHAGOLMEHCN":{"LJNHE":":e:)wr^Mp;hk7YiPb"}},"BJHMGKLP":"Az[M`","ONLPCJKJDIAKK":true,"GACCEMNH":"|<*(OwRo([egvXq"},"DIMPFKELHHIEHNM":[{"JBMLPPOFI":1,"GAHIANAFHIBONL":"ire`w7YQ=+6v<>ML\\o%0O@12jW1qQzy?5E`ye{!TVMl`$i=cR)k0TT_KK(bf3wGEHn&K]3g3IhZO7\\tQ2a","PPMNGNHGGONNLFAJKK":{"ECHAGOLMEHCN":{"LJNHE":"yv7*i\"b>gLJ;l3l?CmCUS@i]ce:65U2o;buv%zo_/Ox>!t)+;kjVj56g'84"}},"ONBMEJGFIFMPO":"h1uU#C"}],"HDDDGJFELLA":true,"KEEBNLDHJAPJNIGHHLPPCCEDJ":[]},{"OAJHOPFELNNIJPFBNBG":{"KJKGAGFJNGBIGBM":{"ECHAGOLMEHCN":{"LJNHE":"tU'S+mwf}H,Hp)VmgTXV['"}},"BJHMGKLP":"p,mtx?CX","ONLPCJKJDIAKK":false,"GACCEMNH":"}+H1}a1o!H1ji&%9ZC+"},"DIMPFKELHHIEHNM":[{"JBMLPPOFI":1,"GAHIANAFHIBONL":"}\"UN}:D{?7}%{kXHzGI$x\\e|,D0kvN","HGFOBILDJONCIHF":"h1uU#C[.Qc","PPMNGNHGGONNLFAJKK":{"ECHAGOLMEHCN":{"LJNHE":"iu,Eu%,vEPu)Joovs4SDwP"}},"ONBMEJGFIFMPO":"Az[M"}],"HDDDGJFELLA":true,"KEEBNLDHJAPJNIGHHLPPCCEDJ":[],"BHCNDBIIICDJDB":{"MGAGILEALPHFOPBO":{}}},{"OAJHOPFELNNIJPFBNBG":{"KJKGAGFJNGBIGBM":{"ECHAGOLMEHCN":{"LJNHE":">bE[dUO=4eXQk 6vL,rMlN"}},"BJHMGKLP":"Az[M`Q","ONLPCJKJDIAKK":true,"GACCEMNH":"Iw92[v,?T`o2G"},"DIMPFKELHHIEHNM":[{"JBMLPPOFI":1,"GAHIANAFHIBONL":"EmJ@.R?h","PPMNGNHGGONNLFAJKK":{"ECHAGOLMEHCN":{"LJNHE":""}},"ONBMEJGFIFMPO":"7c5vD"}],"HDDDGJFELLA":true,"KEEBNLDHJAPJNIGHHLPPCCEDJ":[],"BHCNDBIIICDJDB":{"MGAGILEALPHFOPBO":{}}},{"OAJHOPFELNNIJPFBNBG":{"KJKGAGFJNGBIGBM":{"ECHAGOLMEHCN":{"LJNHE":"%,b~~]9sV1v#89H;-EU'JY{g"}},"BJHMGKLP":"FsZ!/*!","ONLPCJKJDIAKK":false,"GACCEMNH":"Z4N/&MHYFJIR/1rqLn"},"DIMPFKELHHIEHNM":[{"JBMLPPOFI":8,"GAHIANAFHIBONL":"v]W\\\\$JTlKpij#:v+ta`1zUJJMf","HGFOBILDJONCIHF":" aMZ({x5#1N=9(yM\"C","PPMNGNHGGONNLFAJKK":{"ECHAGOLMEHCN":{"LJNHE":"/Vo46pg='[eV+"}},"ONBMEJGFIFMPO":"Az[M`Q"}],"HDDDGJFELLA":true,"KEEBNLDHJAPJNIGHHLPPCCEDJ":[],"BHCNDBIIICDJDB":{"MGAGILEALPHFOPBO":{}}},{"OAJHOPFELNNIJPFBNBG":{"KJKGAGFJNGBIGBM":{"ECHAGOLMEHCN":{"LJNHE":"$&ys4cUgN*OwnXH\\u"}},"BJHMGKLP":"FsZ","ONLPCJKJDIAKK":true,"GACCEMNH":"sp.R`h{>oKPb@H:HvtN&"},"DIMPFKELHHIEHNM":[{"JBMLPPOFI":3,"GAHIANAFHIBONL":"rs}8P(6 Co>|5h\"CI)u!\"wuhxfR/Q_\\rVr+v$5nibOOecKZ`INV0kqc&f%9(msUq%+g","HGFOBILDJONCIHF":"7c5vDh-|yd","PPMNGNHGGONNLFAJKK":{"ECHAGOLMEHCN":{"LJNHE":"H)Lm%i=LYm}l@g2]g$5v<8')(7o"}},"ONBMEJGFIFMPO":"h1uU#"}],"HDDDGJFELLA":true,"KEEBNLDHJAPJNIGHHLPPCCEDJ":[],"BHCNDBIIICDJDB":{"MGAGILEALPHFOPBO":{}}},{"OAJHOPFELNNIJPFBNBG":{"KJKGAGFJNGBIGBM":{"ECHAGOLMEHCN":{"LJNHE":"uq=kN4JVxf)kOnw'{3"}},"BJHMGKLP":"FsZ!/*!","ONLPCJKJDIAKK":true,"GACCEMNH":"[X>{IRK/)Se49+QS"},"DIMPFKELHHIEHNM":[{"JBMLPPOFI":5,"GAHIANAFHIBONL":"","PPMNGNHGGONNLFAJKK":{},"ONBMEJGFIFMPO":"p,mtx?CX"}],"HDDDGJFELLA":true,"KEEBNLDHJAPJNIGHHLPPCCEDJ":[],"BHCNDBIIICDJDB":{"MGAGILEALPHFOPBO":{}}}],"LDIHMDHNPANNGNMAI":{"CMBLBPGBPBLFOD":"Az[","LJCMAKCMDLOBC":"aK"},"KDOJAIFEFELP":{},"LHPGDLOGEGNGKCJMHEDKFJCBMH":[{"GEJGHKEHLAPFJ":"Az[M`","MHBGOLHCHBBIOM":{"CLELPFHJMKGHFNGKMBHAHAH":"BMq%ea #9u+F.AO%","KCCCHAM":"{&]1J_iH^}Eq>oE,#@R;T\"N1uwgXdH;M","AOICHPCGCHAMC":"*^r:\">l1+7XRSYU&g\"AU","PEJGENFFO":5074070918344,"CPFLBGNLMPLDFEGDHHG":[],"EIPCCEBCIMIFAEDOL":6202435079206,"AOHEGDPMAEPEAL":["b\\B53^![]\\A:n!","ji?Y()=t_+w-7R"],"KPLEHGGBK":"Az[M`Q.'mn`","AFDJNAEOHKK":8213889546936,"HMLBEEHBBJBIOJDABJ":"FsZ!/*!){O5","GKDACMEJIFMIEKMANHNNCPFGDCDGGGCG":{},"CIECJLNPP":"\\`Tx+HeoX`OU","FJPIMCHIJACHJE":[],"KHBKEAFCB":"7c5vDh-|","HABNLKACAJHCIOPFOPBBK":true,"IOJLOKAIK":1,"JIPBOH":"F","EGNIJOJDDCPKK":"x!ajLb(","GNBDNIMEEFCGKADKOAIE":"p,mtx?CX","HGEFKFCGJJNBBJIGKOO":false,"LGMLEEGAPIKBLFBL":[{"OAJHOPFELNNIJPFBNBG":{"KJKGAGFJNGBIGBM":{"ECHAGOLMEHCN":{"LJNHE":"y&\"MvM*"}},"BJHMGKLP":"h1uU#","ONLPCJKJDIAKK":true,"GACCEMNH":"&4k6jDc{%nzw`HO*\"&K_{8q","PPMNGNHGGONNLFAJKK":{"ECHAGOLMEHCN":{"LJNHE":"$]AP*.>#{S`c#VcSy"}},"ONBMEJGFIFMPO":"p,mtx?C"}],"HDDDGJFELLA":true,"KEEBNLDHJAPJNIGHHLPPCCEDJ":[],"BHCNDBIIICDJDB":{"MGAGILEALPHFOPBO":{}}},{"OAJHOPFELNNIJPFBNBG":{"KJKGAGFJNGBIGBM":{"ECHAGOLMEHCN":{"LJNHE":"5y:oMq%ea #9u+F.A","PPMNGNHGGONNLFAJKK":{"ECHAGOLMEHCN":{"LJNHE":"x\"7:h+D7K{TC9`","HGFOBILDJONCIHF":"/#_v9kRtI'L_\\dtQl","PPMNGNHGGONNLFAJKK":{"ECHAGOLMEHCN":{"LJNHE":"mZH3+r@7uv07uY<,4S9{z`cyzYj8zfv5{XW~(%*f@\\r?Fug"}},"ONBMEJGFIFMPO":"Az[M`Q"}],"HDDDGJFELLA":true,"KEEBNLDHJAPJNIGHHLPPCCEDJ":[],"BHCNDBIIICDJDB":{"MGAGILEALPHFOPBO":{}}}],"OGFDLCDDIPBFH":["TlBC40,WgNae"],"KOOJKBKFOKBH":[]})"}; + + auto paths = std::vector>>{ + std::vector>{ + {static_cast(2), "EIFGPHGLOPELFBN", -1}} + // + }; + + return std::pair{std::move(input), std::move(paths)}; +} + +void BM_unit_tests(nvbench::state& state) +{ + std::vector input; + std::vector>>> + paths; + + ////////////////////////////////////////////////////////////////////////////////////////// + { + auto [new_input, new_paths] = debug_data1(); + input.emplace_back(std::move(new_input)); + paths.emplace_back(std::move(new_paths)); + } + { + auto [new_input, new_paths] = debug_data2(); + input.emplace_back(std::move(new_input)); + paths.emplace_back(std::move(new_paths)); + } + { + auto [new_input, new_paths] = debug_data3(); + input.emplace_back(std::move(new_input)); + paths.emplace_back(std::move(new_paths)); + } + { + auto [new_input, new_paths] = debug_data4(); + input.emplace_back(std::move(new_input)); + paths.emplace_back(std::move(new_paths)); + } + { + auto [new_input, new_paths] = debug_data5(); + input.emplace_back(std::move(new_input)); + paths.emplace_back(std::move(new_paths)); + } + { + auto [new_input, new_paths] = debug_data6(); + input.emplace_back(std::move(new_input)); + paths.emplace_back(std::move(new_paths)); + } + + { + auto [new_input, new_paths] = debug_data7(); + input.emplace_back(std::move(new_input)); + paths.emplace_back(std::move(new_paths)); + } + + { + auto [new_input, new_paths] = debug_data8(); + input.emplace_back(std::move(new_input)); + paths.emplace_back(std::move(new_paths)); + } + +#if 0 + ////////////////////////////////////////////////////////////////////////////////////////// + for (std::size_t i = 0; i < input.size(); ++i) { + auto const& curr_input = input[i]; + auto const& curr_paths = paths[i]; + + freopen("/dev/null", "w", stdout); + std::vector> output_old_method; + for (auto const& path : curr_paths) { + output_old_method.emplace_back( + spark_rapids_jni::get_json_object(cudf::strings_column_view{curr_input}, path)); + } + auto const output_new_method = spark_rapids_jni::get_json_object_multiple_paths( + cudf::strings_column_view{curr_input}, curr_paths); + freopen("/dev/tty", "w", stdout); + + CUDF_EXPECTS(output_old_method.size() == output_new_method.size(), ""); + for (std::size_t j = 0; j < curr_paths.size(); ++j) { + auto const comp = cudf::test::detail::expect_columns_equal(output_old_method[j]->view(), + output_new_method[j]->view()); + if (!comp) { + printf("Failure at test data %d\n", (int)i + 1); + exit(0); + } + } + } + printf("All test passed.\n"); +#endif +} + +void BM_debug(nvbench::state& state) +{ + auto const stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + + auto const old = state.get_int64("method") == 0; + // debug(old); +} + +NVBENCH_BENCH(BM_get_json_object_single) + .set_name("get_json_object_single") .add_int64_axis("size_bytes", {1'000'000, 10'000'000, 100'000'000, 1'000'000'000}) .add_int64_axis("max_depth", {2, 4, 6, 8}); + +NVBENCH_BENCH(BM_get_json_object_multiple).set_name("get_json_object_multiple"); + +// NVBENCH_BENCH(BM_test_path_1).set_name("test_path1"); +// NVBENCH_BENCH(BM_test_path_2).set_name("test_path2"); + +NVBENCH_BENCH(BM_debug).set_name("debug").add_int64_axis("method", {0, 1}); +NVBENCH_BENCH(BM_unit_tests).set_name("tests"); From 571fc3648632d8c89728e791fb72ffcf26d54dfe Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 24 Jul 2024 22:47:08 -0700 Subject: [PATCH 4/5] Revert "Add test data" This reverts commit 2432f338a596ef51bbed8d9c10e54648a80c64bc. --- src/main/cpp/benchmarks/get_json_object.cu | 1216 +------------------- 1 file changed, 5 insertions(+), 1211 deletions(-) diff --git a/src/main/cpp/benchmarks/get_json_object.cu b/src/main/cpp/benchmarks/get_json_object.cu index cbcd266d64..51f9299dba 100644 --- a/src/main/cpp/benchmarks/get_json_object.cu +++ b/src/main/cpp/benchmarks/get_json_object.cu @@ -17,11 +17,8 @@ #include #include -#include #include -#include -#include #include #include #include @@ -29,56 +26,6 @@ #include #include -#include -#include -#include - -// #define printf(...) void(0) -// #define fflush(...) void(0) - -class Timer { - protected: - using Clock = std::chrono::high_resolution_clock; - - public: - Timer() = default; - virtual ~Timer() = default; - - void tick() - { - assert(!m_TimerTicked); - m_StartTime = Clock::now(); - m_TimerTicked = true; - } - - double tock() - { - assert(m_TimerTicked); - m_EndTime = Clock::now(); - m_TimerTicked = false; - m_ElapsedTime = std::chrono::duration(m_EndTime - m_StartTime).count(); - - return m_ElapsedTime; - } - - std::string getRunTime() - { - if (m_TimerTicked) { tock(); } - m_StrBuilder.str(""); - m_StrBuilder << std::to_string(m_ElapsedTime); - m_StrBuilder << "ms"; - return m_StrBuilder.str(); - } - - private: - Clock::time_point m_StartTime; - Clock::time_point m_EndTime; - std::stringstream m_StrBuilder; - - double m_ElapsedTime{0.0}; - bool m_TimerTicked{false}; -}; - // #define DEBUG_PRINT #ifdef DEBUG_PRINT @@ -139,7 +86,7 @@ std::vector to_host_strings(CV const& c) } // namespace #endif // #ifdef DEBUG_PRINT -constexpr auto list_depth = 1; +constexpr auto list_depth = 2; constexpr auto min_width = 10; constexpr auto max_width = 10; @@ -153,8 +100,7 @@ auto generate_input(std::size_t size_bytes, cudf::size_type max_depth) .list_depth(list_depth) .list_type(cudf::type_id::STRING) .struct_depth(max_depth > list_depth ? max_depth - list_depth : 1) - .struct_types(std::vector{ - cudf::type_id::LIST, cudf::type_id::STRING, cudf::type_id::INT32}); + .struct_types(std::vector{cudf::type_id::LIST}); auto const input_table = create_random_table( std::vector{cudf::type_id::INT32, cudf::type_id::STRING, cudf::type_id::STRUCT}, @@ -187,7 +133,7 @@ auto generate_input(std::size_t size_bytes, cudf::size_type max_depth) return std::move(json_strings); } -void BM_get_json_object_single(nvbench::state& state) +void BM_get_json_object(nvbench::state& state) { auto const size_bytes = static_cast(state.get_int64("size_bytes")); auto const max_depth = static_cast(state.get_int64("max_depth")); @@ -217,1159 +163,7 @@ void BM_get_json_object_single(nvbench::state& state) state.add_global_memory_reads(size_bytes); } -#if 0 -void BM_get_json_object_multiple(nvbench::state& state) -{ - auto const size_bytes = static_cast(state.get_int64("size_bytes")); - auto const num_paths = state.get_int64("num_paths"); - auto const json_strings = generate_input(size_bytes, 2 /*max_depth*/); - - using path_instruction_type = spark_rapids_jni::path_instruction_type; - using instruction_array = std::vector>; - std::vector instructions_arrays; - - instructions_arrays.emplace_back(); - instructions_arrays.back().emplace_back(path_instruction_type::NAMED, "int32", -1); - - if (num_paths > 1) { - instructions_arrays.emplace_back(); - instructions_arrays.back().emplace_back(path_instruction_type::NAMED, "string", -1); - } - - if (num_paths > 2) { - instructions_arrays.emplace_back(); - instructions_arrays.back().emplace_back(path_instruction_type::NAMED, "struct", -1); - instructions_arrays.back().emplace_back(path_instruction_type::NAMED, "0", -1); - } - - if (num_paths > 3) { - instructions_arrays.emplace_back(); - instructions_arrays.back().emplace_back(path_instruction_type::NAMED, "struct", -1); - instructions_arrays.back().emplace_back(path_instruction_type::NAMED, "2", -1); - } - - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - // Can also verify at https://jsonpath.com/. -#if 1 - [[maybe_unused]] auto const output = spark_rapids_jni::get_json_object( - cudf::strings_column_view{json_strings->view()}, instructions_arrays); -#else - for (int64_t i = 0; i < num_paths; ++i) { - auto const& instructions = instructions_arrays[i]; - [[maybe_unused]] auto const output = spark_rapids_jni::get_json_object( - cudf::strings_column_view{json_strings->view()}, instructions); -#ifdef DEBUG_PRINT - { - auto const strs = to_host_strings(output->view()); - std::cout << "First output row: \n" << strs.front() << std::endl << std::endl << std::endl; - } -#endif // #ifdef DEBUG_PRINT - } -#endif - }); - state.add_global_memory_reads(size_bytes); -} -#endif - -#if 1 -using spark_rapids_jni::path_instruction_type; -std::vector>> generate_paths0() -{ - return std::vector>>{ - std::vector>{ - {static_cast(2), "EIFGPHGLOPELFBN", -1}} - // - }; -} - -std::vector>> generate_paths1() -{ - return std::vector>>{ - std::vector>{ - {static_cast(1), "", 0}, - {static_cast(2), "GHPKNICLNDAGCNDBMFGEK", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "KIKNFPPAPGDO", -1}, - {static_cast(2), "KLFALIBALPPK", -1}, - {static_cast(2), "HGABIFNPHAHHGP", -1}} - // - }; -} - -std::vector>> generate_paths2() -{ - auto tmp = std::vector>>{ - std::vector>{ - {static_cast(2), "JEBEDJPKEFHPHGLLGPM", -1}}, - - std::vector>{ - {static_cast(2), "FLMEPG", -1}, - {static_cast(2), "CGEGPD", -1}}, - - std::vector>{ - {static_cast(2), "JACICCCIMMHJHKPDED", -1}, - {static_cast(2), "ACHCPIHLFCPHMBPNKJNOLNO", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(0), "", -1}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "JACICCCIMMHJHKPDED", -1}, - {static_cast(2), "OGGC", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "MDGA", -1}}, - - std::vector>{ - {static_cast(2), "AGHF", -1}, - {static_cast(2), "DPKEAPDACLPHGPEMH", -1}}, - - std::vector>{ - {static_cast(2), "AGHF", -1}, - {static_cast(2), "ONNILHPABGIKKFJOEK", -1}}, - - std::vector>{ - {static_cast(2), "AGHF", -1}, - {static_cast(2), "FFFPOENCNBBNOOMOJGDBNIPD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "POFNDBFHDEJ", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(0), "", -1}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "PIGOFCPIPPBNNB", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "CCBJKBHGPBJCKFPCBHGLOAFE", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "LMPCGHBIJGCIPDPNELPBCOP", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "PKBGI", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "ILPIJKBLDB", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "GHBBEOAC", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "EKGPKGCJPMI", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "BDEGLFGMCPKOCNDGJMFPANNBPK", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "LILJMMPPO", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "EAGCHCMLMOLGJK", -1}, - {static_cast(2), "BEACAHEBBO", -1}, - {static_cast(2), "BNLFCI", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "EAGCHCMLMOLGJK", -1}, - {static_cast(2), "BEACAHEBBO", -1}, - {static_cast(2), "GPIHMJ", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "EAGCHCMLMOLGJK", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GJFKCFJELPJEDBAD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "EAGCHCMLMOLGJK", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "DLJPDEPFEKDCKBI", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(0), "", -1}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "PMJPCGCHAALKBPKHDM", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "OCFGAF", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "GMJICFMBNPLBEOLMGDN", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "CBMI", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "NPAGLLFCHAI", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "LFKAJEPMJPLGLICEEMAHFEJGPLGIAKPIOPPP", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "HGNHKIOEGKIJJJPEC", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "JAGGKPKOICKOBABAJPNHF", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "PLEJAKDBBGLCDLGDIBHPPBHB", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "MMNHNPKGLLBJMAOGOCBEOIOKIM", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "JLKDBLFFFPPCNANBKMELJKFOPKPNC", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "OCJGMOAJJKBKNCHOJKBJG", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "PMOAGIJAFOGGLINIOEBFGHBN", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "JPDILOFKPCNBKDB", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "CPBFNDGC", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "KPOPPCFLFCNAPIJEDJDGGFBOPLDCMLLGOMO", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "LBDGCNJNOGMJPNHMLLBMA", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "EIHBDLNJDOAHPMCNGGLLEF", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "GIPPDMMAFOBAALMHMGJBM", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "FKBODHACMMGHL", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "KMEJHDA", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "FKBODHACMMGHL", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "CJKIKCGA", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "HFFDKEDMFBAKEHHM", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "KGJLLAPHJNKCEOIAMCAABCJP", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 1}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "KLJNBPLECGCA", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "NBJNFKKKCHEGCABDGKG", -1}, - {static_cast(2), "BEACAHEBBO", -1}, - {static_cast(2), "BNLFCI", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "NBJNFKKKCHEGCABDGKG", -1}, - {static_cast(2), "BEACAHEBBO", -1}, - {static_cast(2), "GPIHMJ", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "NBJNFKKKCHEGCABDGKG", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GJFKCFJELPJEDBAD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "NBJNFKKKCHEGCABDGKG", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "AOHKGCPAOGANLKEJDLMIGDD", -1}, - {static_cast(2), "BEACAHEBBO", -1}, - {static_cast(2), "BNLFCI", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "AOHKGCPAOGANLKEJDLMIGDD", -1}, - {static_cast(2), "BEACAHEBBO", -1}, - {static_cast(2), "GPIHMJ", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "AOHKGCPAOGANLKEJDLMIGDD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "IKHLECMHMONKLKIBD", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "PNJPGEHPDLMPBDMFPLKABFFGG", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "IGAJPHHGOENI", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "LDPMFNAGLJGDMFOLAKH", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "KMEJHDA", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "LDPMFNAGLJGDMFOLAKH", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "CJKIKCGA", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "BFAJJIOLJBEOMFKLE", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "DOONHL", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - std::vector>{ - {static_cast(2), "OCIKAF", -1}}, - - std::vector>{ - {static_cast(2), "IBMBCGNOCGCPCEN", -1}, - {static_cast(0), "", -1}, - {static_cast(2), "GLNLBEA", -1}} - // - }; - - // tmp.resize(std::min(8UL, tmp.size())); - return tmp; - // return std::vector>>( - // tmp.begin() + 7, tmp.end()); -} - -std::vector>> generate_paths3() -{ - return std::vector>>{ - std::vector>{ - {static_cast(2), "KPIGLEDEOCFELKLJLAFE", -1}} - // - }; -} - -std::vector>> generate_paths4() -{ - return std::vector>>{ - std::vector>{ - {static_cast(2), "NHKDIEPJNND", -1}, - {static_cast(2), "DPBFKLKAKDHLMDLIONCCLJ", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GPGACKDIBMPAKJMDMJ", -1}}, - - std::vector>{ - {static_cast(2), "NHKDIEPJNND", -1}, - {static_cast(2), "DPBFKLKAKDHLMDLIONCCLJ", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "NOIIFOJOPJP", -1}}, - - std::vector>{ - {static_cast(2), "NHKDIEPJNND", -1}, - {static_cast(2), "DPBFKLKAKDHLMDLIONCCLJ", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "CEJOOHNF", -1}}, - - std::vector>{ - {static_cast(2), "NHKDIEPJNND", -1}, - {static_cast(2), "DPBFKLKAKDHLMDLIONCCLJ", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "HODJK", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "HHKEKMIIGI", -1}}, - - std::vector>{ - {static_cast(2), "NHKDIEPJNND", -1}, - {static_cast(2), "DPBFKLKAKDHLMDLIONCCLJ", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "KDGJICMEANMA", -1}, - {static_cast(0), "", -1}, - {static_cast(2), "ILEADAN", -1}}, - - std::vector>{ - {static_cast(2), "NHKDIEPJNND", -1}, - {static_cast(2), "DPBFKLKAKDHLMDLIONCCLJ", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "OKPLFLHHEBDJELFA", -1}}, - - std::vector>{ - {static_cast(2), "NHKDIEPJNND", -1}, - {static_cast(2), "CHNFGBB", -1}, - {static_cast(2), "KIKNFPPAPGDO", -1}, - {static_cast(2), "KLFALIBALPPK", -1}, - {static_cast(2), "HGABIFNPHAHHGP", -1}}, - - std::vector>{ - {static_cast(2), "NHKDIEPJNND", -1}, - {static_cast(2), "IHIIKIHHMPFL", -1}, - {static_cast(2), "KCCCHAM", -1}, - {static_cast(2), "KCCCHAM", -1}}, - - std::vector>{ - {static_cast(2), "KFPJHMGFEELFG", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "AFHKGOFNFID", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "DPBFKLKAKDHLMDLIONCCLJ", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "CEJOOHNF", -1}}, - - std::vector>{ - {static_cast(2), "KFPJHMGFEELFG", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "AFHKGOFNFID", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "DPBFKLKAKDHLMDLIONCCLJ", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "HODJK", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "HHKEKMIIGI", -1}}, - - std::vector>{ - {static_cast(2), "KFPJHMGFEELFG", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "AFHKGOFNFID", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "DPBFKLKAKDHLMDLIONCCLJ", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "OKPLFLHHEBDJELFA", -1}}, - - std::vector>{ - {static_cast(2), "JJKPNPFMNICGLC", -1}, - {static_cast(2), "GGLF", -1}, - {static_cast(2), "JKKJDAKAB", -1}}, - - std::vector>{ - {static_cast(2), "KPIGLEDEOCFELKLJLAFE", -1}}, - - std::vector>{ - {static_cast(2), "PACKGGMDGCLEHD", -1}, - {static_cast(2), "IAFMNJMMNJPDAAHND", -1}}, - - std::vector>{ - {static_cast(2), "PACKGGMDGCLEHD", -1}, - {static_cast(2), "MNIMBEMMOJFHILDMDBML", -1}} - - // - }; -} - -#endif - -#if 1 - -std::vector>> -generate_paths_test_1() -{ - /* - * '$.NHKDIEPJNND.DPBFKLKAKDHLMDLIONCCLJ[0].OKPLFLHHEBDJELFA' - * '$.NHKDIEPJNND.CHNFGBB.KIKNFPPAPGDO.KLFALIBALPPK.HGABIFNPHAHHGP' - * '$.NHKDIEPJNND.IHIIKIHHMPFL.KCCCHAM.KCCCHAM' - */ - return std::vector>>{ - - std::vector>{ - {static_cast(2), "NHKDIEPJNND", -1}, - {static_cast(2), "DPBFKLKAKDHLMDLIONCCLJ", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "OKPLFLHHEBDJELFA", -1}}, - - std::vector>{ - {static_cast(2), "NHKDIEPJNND", -1}, - {static_cast(2), "CHNFGBB", -1}, - {static_cast(2), "KIKNFPPAPGDO", -1}, - {static_cast(2), "KLFALIBALPPK", -1}, - {static_cast(2), "HGABIFNPHAHHGP", -1}}, - - std::vector>{ - {static_cast(2), "NHKDIEPJNND", -1}, - {static_cast(2), "IHIIKIHHMPFL", -1}, - {static_cast(2), "KCCCHAM", -1}, - {static_cast(2), "KCCCHAM", -1}} - // - }; -} - -std::vector>> -generate_paths_test_2() -{ - /* - * '$.AENBHHGIABBBDDGOEI.EAGCHCMLMOLGJK.BEACAHEBBO.BNLFCI' - * '$.AENBHHGIABBBDDGOEI.EAGCHCMLMOLGJK.BEACAHEBBO.GPIHMJ' - * '$.AENBHHGIABBBDDGOEI.EAGCHCMLMOLGJK.CGEGPD[0].GJFKCFJELPJEDBAD' - * '$.AENBHHGIABBBDDGOEI.EAGCHCMLMOLGJK.CGEGPD[0].GMFDD' - */ - return std::vector>>{ - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "EAGCHCMLMOLGJK", -1}, - {static_cast(2), "BEACAHEBBO", -1}, - {static_cast(2), "BNLFCI", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "EAGCHCMLMOLGJK", -1}, - {static_cast(2), "BEACAHEBBO", -1}, - {static_cast(2), "GPIHMJ", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "EAGCHCMLMOLGJK", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GJFKCFJELPJEDBAD", -1}}, - - std::vector>{ - {static_cast(2), "AENBHHGIABBBDDGOEI", -1}, - {static_cast(2), "EAGCHCMLMOLGJK", -1}, - {static_cast(2), "CGEGPD", -1}, - {static_cast(1), "", 0}, - {static_cast(2), "GMFDD", -1}}, - - // - }; -} - -#endif - -void test(rmm::cuda_stream_view stream, int method, bool warm_up = false) -{ - std::vector>>> - paths(5); -#if 1 - paths[0] = generate_paths0(); - paths[1] = generate_paths1(); - paths[3] = generate_paths3(); -#endif - paths[2] = generate_paths2(); - paths[4] = generate_paths4(); - - printf("Method: %d\n", method); - fflush(stdout); - - { - int idx{0}; - std::size_t count{0}; - for (auto const& path : paths) { - if (warm_up) { - fprintf(stdout, "Path %d, size: %d\n", idx++, (int)path.size()); - } else { - printf("Path %d, size: %d\n", idx++, (int)path.size()); - } - count += path.size(); - } - printf("Total path: %d\n", (int)count); - fflush(stdout); - } - - auto const read_opts = - cudf::io::parquet_reader_options::builder( - cudf::io::source_info{"/home/nghiat/Devel/data/WM_MOCKED_3/data.parquet"}) - .build(); - - auto const limit = 256 * 1024 * 1024UL; - auto reader = cudf::io::chunked_parquet_reader(limit, 4 * limit, read_opts); - - Timer timer; - double test_time = 0; - int num_chunks{0}; - - do { - auto chunk = reader.read_chunk(); - num_chunks++; - stream.synchronize(); - - timer.tick(); - if (method == 0) { - /* Test with 80 paths: - * Test time: 7189.04ms, num chunks: 1 - * Test time: 410163ms, num chunks: 51 - * - * Test with 15 paths: - * Test time: 117.39ms, num chunks: 1 - * Test time: 5112.7ms, num chunks: 51 - */ - for (int i = 0; i < 5; ++i) { - for (auto const& path : paths[i]) { - if (path.size() > 0) { - [[maybe_unused]] auto const output = spark_rapids_jni::get_json_object( - cudf::strings_column_view{chunk.tbl->get_column(i).view()}, path); - } - } - } - } else if (method == 1) { - /* Test with 80 paths: - * Test time: 6896.2ms, num chunks: 1 - * Test time: 385526ms, num chunks: 51 - * - * Test with 15 paths: - * Test time: 144.366ms, num chunks: 1 - * Test time: 4889.46ms, num chunks: 51 - */ - // for (int i = 0; i < 5; ++i) { - // if (paths[i].size() > 0) { - // [[maybe_unused]] auto const output = spark_rapids_jni::get_json_object( - // cudf::strings_column_view{chunk.tbl->get_column(i).view()}, paths[i]); - // } - // } - } else { - for (int i = 0; i < 5; ++i) { - if (paths[i].size() > 0) { - /* - * thread par, max path size: - * 2: 40s - * 4: 17s - * 6: 15s - * 8: 11.8s - * 10: 9.3s - * 16: 7.6s - * 32: 5.9s - * - * warp par, max path size: - * 2: 19s - * 4: 13s - * 6: 13s - * 8: 12s - * 10: 11.5s - * 16: 10.6s - * 32: 10.2s - */ - - constexpr int path_size = 4; - - std::size_t offset = 0; - std::vector>> segment; - segment.reserve(path_size); - while (offset < paths[i].size()) { - auto const last = std::min(offset + path_size, paths[i].size()); - segment.resize(0); - segment.insert(segment.end(), paths[i].begin() + offset, paths[i].begin() + last); - offset += path_size; - - [[maybe_unused]] auto const output = spark_rapids_jni::get_json_object_multiple_paths( - cudf::strings_column_view{chunk.tbl->get_column(i).view()}, segment); - } - } - } - } - stream.synchronize(); - test_time += timer.tock(); - - if (num_chunks > 10) { break; } - - if (warm_up) { - break; // just test one chunk - } - } while (reader.has_next()); - - std::cout << "Test time: " << test_time << "ms, num chunks: " << num_chunks << std::endl; -} - -void BM_get_json_object_multiple(nvbench::state& state) -{ - auto const stream = cudf::get_default_stream(); - state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); -#if 0 - test(stream, 0, true); // warm up - test(stream, 0, false); -#else - // test(stream, 0, true); // warm up - // test(stream, 1, true); // warm up - test(stream, 2, true); // warm up - - // test(stream, 0, false); // warm up - // test(stream, 1, false); // warm up - test(stream, 2, false); // warm up -#endif - // 7/8: 32774 ms - // 7/10: 26340 ms -} - -#if 0 -void print_nodes(std::vector const& nodes) -{ - std::cout << "\n\n\nHost data\n"; - int count{0}; - for (auto& node : nodes) { - std::cout << "node: " << count++ << std::endl; - std::cout << " " - << "level: " << node.level << std::endl; - std::cout << " " - << "type: " << static_cast(node.data.type) << std::endl; - std::cout << " " - << "name: " << node.data.name << std::endl; - std::cout << " " - << "index: " << node.data.index << std::endl; - std::cout << " " - << "parent: " << node.parent << std::endl; - std::cout << " output indices: "; - for (auto i : node.output_indices) { - std::cout << i << ", "; - } - std::cout << std::endl; - std::cout << " children: "; - for (auto i : node.children) { - std::cout << i << ", "; - } - std::cout << std::endl; - std::cout << " " - << "first child: " << node.first_child << std::endl; - std::cout << " " - << "num children: " << node.num_children << std::endl; - std::cout << " " - << "next sibling: " << node.next_sibling << std::endl; - } - fflush(stdout); -} - - -void BM_test_path_1(nvbench::state& state) -{ - auto const paths = generate_paths_test_1(); - auto output = spark_rapids_jni::generate_path_nodes(paths); - print_nodes(output); -} - -void BM_test_path_2(nvbench::state& state) -{ - auto const paths = generate_paths_test_2(); - auto output = spark_rapids_jni::generate_path_nodes(paths); - print_nodes(output); -} -#endif - -auto debug_data1() -{ - auto input = cudf::test::strings_column_wrapper{"{'a':1, 'b':2, 'c':[1, 2, 3]}"}; - - auto paths = std::vector>>{ - std::vector>{ - {static_cast(2), "a", -1}}, - std::vector>{ - {static_cast(2), "b", -1}}, - std::vector>{ - {static_cast(2), "c", -1}, - {static_cast(0), "", -1}}, - std::vector>{ - {static_cast(2), "c", -1}, - {static_cast(1), "", 1}} - // - }; - - return std::pair{std::move(input), std::move(paths)}; -} - -auto debug_data2() -{ - auto input = cudf::test::strings_column_wrapper{"{\"k1\":{\"k2\":\"v2\"}}"}; - - auto paths = std::vector>>{ - std::vector>{ - {static_cast(2), "k1", -1}, - {static_cast(2), "k2", -1}} - // - }; - - return std::pair{std::move(input), std::move(paths)}; -} - -auto debug_data3() -{ - auto input = cudf::test::strings_column_wrapper{ - "{\"k1\":{\"k2\":{\"k3\":{\"k4\":{\"k5\":{\"k6\":{\"k7\":{\"k8\":\"v8\"}}}}}}}}"}; - - auto paths = std::vector>>{ - std::vector>{ - {static_cast(2), "k1", -1}, - {static_cast(2), "k2", -1}, - {static_cast(2), "k3", -1}, - {static_cast(2), "k4", -1}, - {static_cast(2), "k5", -1}, - {static_cast(2), "k6", -1}, - {static_cast(2), "k7", -1}, - {static_cast(2), "k8", -1}, - } - // - }; - - return std::pair{std::move(input), std::move(paths)}; -} - -auto debug_data4() -{ - auto input = cudf::test::strings_column_wrapper{ - "{\"brand\":\"ssssss\",\"duratRon\":15,\"eqTosuresurl\":\"\",\"RsZxarthrl\":false," - "\"xonRtorsurl\":\"\",\"xonRtorsurlstOTe\":0,\"TRctures\":[{\"RxaGe\":\"VttTs:\\/\\/" - "feed-RxaGe.baRdu.cox\\/0\\/TRc\\/" - "-196588744s840172444s-773690137.zTG\"}],\"Toster\":\"VttTs:\\/\\/feed-RxaGe.baRdu.cox\\/0\\/" - "TRc\\/" - "-196588744s840172444s-773690137.zTG\",\"reserUed\":{\"bRtLate\":391.79,\"xooUZRke\":26876," - "\"nahrlIeneratRonNOTe\":0,\"useJublRc\":6,\"URdeoRd\":821284086},\"tRtle\":" - "\"ssssssssssmMsssssssssssssssssss\",\"url\":\"s{storehrl}\",\"usersTortraRt\":\"VttTs:\\/\\/" - "feed-RxaGe.baRdu.cox\\/0\\/TRc\\/" - "-6971178959s-664926866s-6096674871.zTG\",\"URdeosurl\":\"http:\\/\\/nadURdeo2.baRdu.cox\\/" - "5fa3893aed7fc0f8231dab7be23efc75s820s6240.xT3\",\"URdeoRd\":821284086}"}; - - auto paths = std::vector>>{ - std::vector>{ - {static_cast(2), "URdeosurl", -1}, - } - // - }; - - return std::pair{std::move(input), std::move(paths)}; -} - -auto debug_data5() -{ - auto input = cudf::test::strings_column_wrapper{ - "[ [[[ {'k': 'v1'} ], {'k': 'v2'}]], [[{'k': 'v3'}], {'k': 'v4'}], {'k': 'v5'} ]"}; - - auto paths = std::vector>>{ - std::vector>{ - {static_cast(0), "", -1}, - {static_cast(0), "", -1}, - {static_cast(2), "k", -1}} - // - }; - - return std::pair{std::move(input), std::move(paths)}; -} - -auto debug_data6() -{ - auto input = cudf::test::strings_column_wrapper{"[ {'k': 'v1'}, {'k': 'v2'} ]"}; - - auto paths = std::vector>>{ - std::vector>{ - {static_cast(0), "", -1}, - {static_cast(2), "k", -1}} - // - }; - - return std::pair{std::move(input), std::move(paths)}; -} - -auto debug_data7() -{ - auto input = cudf::test::strings_column_wrapper{"[1, [21, 22], 3]"}; - - auto paths = std::vector>>{ - std::vector>{ - {static_cast(0), "", -1}, - } - // - }; - - return std::pair{std::move(input), std::move(paths)}; -} - -auto debug_data8() -{ - auto input = cudf::test::strings_column_wrapper{ - "[ {'k': [0, 1]}, {'k': {'a': 'b'}}, {'k': [10, 11, 12]} ]"}; - - auto paths = std::vector>>{ - std::vector>{ - {static_cast(0), "", -1}, - {static_cast(2), "k", -1}}, - std::vector>{ - {static_cast(0), "", -1}, - {static_cast(2), "k", -1}, - {static_cast(0), "", -1}, - } - // - }; - - return std::pair{std::move(input), std::move(paths)}; -} - -auto debug_data9() -{ - auto input = cudf::test::strings_column_wrapper{ - R"({"KAEMAPGKBFGNHPJ":"G2s!8-b@n2p","GKOGAKJDMEMFLJJD":[{"FMCMNJH":"FsZ!","OOPCOILM":".LX4HI1#6ZXQu"},{"FMCMNJH":"p,mt","OOPCOILM":"6zfMlis\"/pH,n-:v\\.;N[","MLDKOIIPGBFAKKLJC":[],"DHLLEABGIMMFBILMCJI":{},"IIEEGFP":9133996313993,"EIFGPHGLOPELFBN":[{"OAJHOPFELNNIJPFBNBG":{"KJKGAGFJNGBIGBM":{"ECHAGOLMEHCN":{"LJNHE":"\"/eP4`=WHg~kUOm2AJoXHk"}},"BJHMGKLP":"h1uU#","ONLPCJKJDIAKK":false,"GACCEMNH":"d+Z%N\\jZ9']wGOl%V9gj@"},"DIMPFKELHHIEHNM":[{"JBMLPPOFI":5,"GAHIANAFHIBONL":"rvtH}C0uFmK\\#A*D)x}I7c@5M'Z_fm6>PviE=(J]T+KlsNuO-\"lxD~.N/9EQ<^l%Qb-eFRBv","PPMNGNHGGONNLFAJKK":{"ECHAGOLMEHCN":{"LJNHE":"kH#D'(VRt)*kH<{ITQ^|By~:6o2","PPMNGNHGGONNLFAJKK":{"ECHAGOLMEHCN":{"LJNHE":""}},"ONBMEJGFIFMPO":"h1uU#"}],"HDDDGJFELLA":true,"KEEBNLDHJAPJNIGHHLPPCCEDJ":[],"BHCNDBIIICDJDB":{"MGAGILEALPHFOPBO":{}}},{"OAJHOPFELNNIJPFBNBG":{"KJKGAGFJNGBIGBM":{"ECHAGOLMEHCN":{"LJNHE":"s[2nryS^V|=D]h5JgNHE+h"}},"BJHMGKLP":"Az[M`Q","ONLPCJKJDIAKK":true,"GACCEMNH":"`GkuDVB)},b9kD+(MUEJU5eSw5f","PPMNGNHGGONNLFAJKK":{"ECHAGOLMEHCN":{"LJNHE":"!mJue a[n}#S:Hn"}},"ONBMEJGFIFMPO":"h1uU#"}],"HDDDGJFELLA":true,"KEEBNLDHJAPJNIGHHLPPCCEDJ":[],"BHCNDBIIICDJDB":{"MGAGILEALPHFOPBO":{}}},{"OAJHOPFELNNIJPFBNBG":{"KJKGAGFJNGBIGBM":{"ECHAGOLMEHCN":{"LJNHE":":e:)wr^Mp;hk7YiPb"}},"BJHMGKLP":"Az[M`","ONLPCJKJDIAKK":true,"GACCEMNH":"|<*(OwRo([egvXq"},"DIMPFKELHHIEHNM":[{"JBMLPPOFI":1,"GAHIANAFHIBONL":"ire`w7YQ=+6v<>ML\\o%0O@12jW1qQzy?5E`ye{!TVMl`$i=cR)k0TT_KK(bf3wGEHn&K]3g3IhZO7\\tQ2a","PPMNGNHGGONNLFAJKK":{"ECHAGOLMEHCN":{"LJNHE":"yv7*i\"b>gLJ;l3l?CmCUS@i]ce:65U2o;buv%zo_/Ox>!t)+;kjVj56g'84"}},"ONBMEJGFIFMPO":"h1uU#C"}],"HDDDGJFELLA":true,"KEEBNLDHJAPJNIGHHLPPCCEDJ":[]},{"OAJHOPFELNNIJPFBNBG":{"KJKGAGFJNGBIGBM":{"ECHAGOLMEHCN":{"LJNHE":"tU'S+mwf}H,Hp)VmgTXV['"}},"BJHMGKLP":"p,mtx?CX","ONLPCJKJDIAKK":false,"GACCEMNH":"}+H1}a1o!H1ji&%9ZC+"},"DIMPFKELHHIEHNM":[{"JBMLPPOFI":1,"GAHIANAFHIBONL":"}\"UN}:D{?7}%{kXHzGI$x\\e|,D0kvN","HGFOBILDJONCIHF":"h1uU#C[.Qc","PPMNGNHGGONNLFAJKK":{"ECHAGOLMEHCN":{"LJNHE":"iu,Eu%,vEPu)Joovs4SDwP"}},"ONBMEJGFIFMPO":"Az[M"}],"HDDDGJFELLA":true,"KEEBNLDHJAPJNIGHHLPPCCEDJ":[],"BHCNDBIIICDJDB":{"MGAGILEALPHFOPBO":{}}},{"OAJHOPFELNNIJPFBNBG":{"KJKGAGFJNGBIGBM":{"ECHAGOLMEHCN":{"LJNHE":">bE[dUO=4eXQk 6vL,rMlN"}},"BJHMGKLP":"Az[M`Q","ONLPCJKJDIAKK":true,"GACCEMNH":"Iw92[v,?T`o2G"},"DIMPFKELHHIEHNM":[{"JBMLPPOFI":1,"GAHIANAFHIBONL":"EmJ@.R?h","PPMNGNHGGONNLFAJKK":{"ECHAGOLMEHCN":{"LJNHE":""}},"ONBMEJGFIFMPO":"7c5vD"}],"HDDDGJFELLA":true,"KEEBNLDHJAPJNIGHHLPPCCEDJ":[],"BHCNDBIIICDJDB":{"MGAGILEALPHFOPBO":{}}},{"OAJHOPFELNNIJPFBNBG":{"KJKGAGFJNGBIGBM":{"ECHAGOLMEHCN":{"LJNHE":"%,b~~]9sV1v#89H;-EU'JY{g"}},"BJHMGKLP":"FsZ!/*!","ONLPCJKJDIAKK":false,"GACCEMNH":"Z4N/&MHYFJIR/1rqLn"},"DIMPFKELHHIEHNM":[{"JBMLPPOFI":8,"GAHIANAFHIBONL":"v]W\\\\$JTlKpij#:v+ta`1zUJJMf","HGFOBILDJONCIHF":" aMZ({x5#1N=9(yM\"C","PPMNGNHGGONNLFAJKK":{"ECHAGOLMEHCN":{"LJNHE":"/Vo46pg='[eV+"}},"ONBMEJGFIFMPO":"Az[M`Q"}],"HDDDGJFELLA":true,"KEEBNLDHJAPJNIGHHLPPCCEDJ":[],"BHCNDBIIICDJDB":{"MGAGILEALPHFOPBO":{}}},{"OAJHOPFELNNIJPFBNBG":{"KJKGAGFJNGBIGBM":{"ECHAGOLMEHCN":{"LJNHE":"$&ys4cUgN*OwnXH\\u"}},"BJHMGKLP":"FsZ","ONLPCJKJDIAKK":true,"GACCEMNH":"sp.R`h{>oKPb@H:HvtN&"},"DIMPFKELHHIEHNM":[{"JBMLPPOFI":3,"GAHIANAFHIBONL":"rs}8P(6 Co>|5h\"CI)u!\"wuhxfR/Q_\\rVr+v$5nibOOecKZ`INV0kqc&f%9(msUq%+g","HGFOBILDJONCIHF":"7c5vDh-|yd","PPMNGNHGGONNLFAJKK":{"ECHAGOLMEHCN":{"LJNHE":"H)Lm%i=LYm}l@g2]g$5v<8')(7o"}},"ONBMEJGFIFMPO":"h1uU#"}],"HDDDGJFELLA":true,"KEEBNLDHJAPJNIGHHLPPCCEDJ":[],"BHCNDBIIICDJDB":{"MGAGILEALPHFOPBO":{}}},{"OAJHOPFELNNIJPFBNBG":{"KJKGAGFJNGBIGBM":{"ECHAGOLMEHCN":{"LJNHE":"uq=kN4JVxf)kOnw'{3"}},"BJHMGKLP":"FsZ!/*!","ONLPCJKJDIAKK":true,"GACCEMNH":"[X>{IRK/)Se49+QS"},"DIMPFKELHHIEHNM":[{"JBMLPPOFI":5,"GAHIANAFHIBONL":"","PPMNGNHGGONNLFAJKK":{},"ONBMEJGFIFMPO":"p,mtx?CX"}],"HDDDGJFELLA":true,"KEEBNLDHJAPJNIGHHLPPCCEDJ":[],"BHCNDBIIICDJDB":{"MGAGILEALPHFOPBO":{}}}],"LDIHMDHNPANNGNMAI":{"CMBLBPGBPBLFOD":"Az[","LJCMAKCMDLOBC":"aK"},"KDOJAIFEFELP":{},"LHPGDLOGEGNGKCJMHEDKFJCBMH":[{"GEJGHKEHLAPFJ":"Az[M`","MHBGOLHCHBBIOM":{"CLELPFHJMKGHFNGKMBHAHAH":"BMq%ea #9u+F.AO%","KCCCHAM":"{&]1J_iH^}Eq>oE,#@R;T\"N1uwgXdH;M","AOICHPCGCHAMC":"*^r:\">l1+7XRSYU&g\"AU","PEJGENFFO":5074070918344,"CPFLBGNLMPLDFEGDHHG":[],"EIPCCEBCIMIFAEDOL":6202435079206,"AOHEGDPMAEPEAL":["b\\B53^![]\\A:n!","ji?Y()=t_+w-7R"],"KPLEHGGBK":"Az[M`Q.'mn`","AFDJNAEOHKK":8213889546936,"HMLBEEHBBJBIOJDABJ":"FsZ!/*!){O5","GKDACMEJIFMIEKMANHNNCPFGDCDGGGCG":{},"CIECJLNPP":"\\`Tx+HeoX`OU","FJPIMCHIJACHJE":[],"KHBKEAFCB":"7c5vDh-|","HABNLKACAJHCIOPFOPBBK":true,"IOJLOKAIK":1,"JIPBOH":"F","EGNIJOJDDCPKK":"x!ajLb(","GNBDNIMEEFCGKADKOAIE":"p,mtx?CX","HGEFKFCGJJNBBJIGKOO":false,"LGMLEEGAPIKBLFBL":[{"OAJHOPFELNNIJPFBNBG":{"KJKGAGFJNGBIGBM":{"ECHAGOLMEHCN":{"LJNHE":"y&\"MvM*"}},"BJHMGKLP":"h1uU#","ONLPCJKJDIAKK":true,"GACCEMNH":"&4k6jDc{%nzw`HO*\"&K_{8q","PPMNGNHGGONNLFAJKK":{"ECHAGOLMEHCN":{"LJNHE":"$]AP*.>#{S`c#VcSy"}},"ONBMEJGFIFMPO":"p,mtx?C"}],"HDDDGJFELLA":true,"KEEBNLDHJAPJNIGHHLPPCCEDJ":[],"BHCNDBIIICDJDB":{"MGAGILEALPHFOPBO":{}}},{"OAJHOPFELNNIJPFBNBG":{"KJKGAGFJNGBIGBM":{"ECHAGOLMEHCN":{"LJNHE":"5y:oMq%ea #9u+F.A","PPMNGNHGGONNLFAJKK":{"ECHAGOLMEHCN":{"LJNHE":"x\"7:h+D7K{TC9`","HGFOBILDJONCIHF":"/#_v9kRtI'L_\\dtQl","PPMNGNHGGONNLFAJKK":{"ECHAGOLMEHCN":{"LJNHE":"mZH3+r@7uv07uY<,4S9{z`cyzYj8zfv5{XW~(%*f@\\r?Fug"}},"ONBMEJGFIFMPO":"Az[M`Q"}],"HDDDGJFELLA":true,"KEEBNLDHJAPJNIGHHLPPCCEDJ":[],"BHCNDBIIICDJDB":{"MGAGILEALPHFOPBO":{}}}],"OGFDLCDDIPBFH":["TlBC40,WgNae"],"KOOJKBKFOKBH":[]})"}; - - auto paths = std::vector>>{ - std::vector>{ - {static_cast(2), "EIFGPHGLOPELFBN", -1}} - // - }; - - return std::pair{std::move(input), std::move(paths)}; -} - -void BM_unit_tests(nvbench::state& state) -{ - std::vector input; - std::vector>>> - paths; - - ////////////////////////////////////////////////////////////////////////////////////////// - { - auto [new_input, new_paths] = debug_data1(); - input.emplace_back(std::move(new_input)); - paths.emplace_back(std::move(new_paths)); - } - { - auto [new_input, new_paths] = debug_data2(); - input.emplace_back(std::move(new_input)); - paths.emplace_back(std::move(new_paths)); - } - { - auto [new_input, new_paths] = debug_data3(); - input.emplace_back(std::move(new_input)); - paths.emplace_back(std::move(new_paths)); - } - { - auto [new_input, new_paths] = debug_data4(); - input.emplace_back(std::move(new_input)); - paths.emplace_back(std::move(new_paths)); - } - { - auto [new_input, new_paths] = debug_data5(); - input.emplace_back(std::move(new_input)); - paths.emplace_back(std::move(new_paths)); - } - { - auto [new_input, new_paths] = debug_data6(); - input.emplace_back(std::move(new_input)); - paths.emplace_back(std::move(new_paths)); - } - - { - auto [new_input, new_paths] = debug_data7(); - input.emplace_back(std::move(new_input)); - paths.emplace_back(std::move(new_paths)); - } - - { - auto [new_input, new_paths] = debug_data8(); - input.emplace_back(std::move(new_input)); - paths.emplace_back(std::move(new_paths)); - } - -#if 0 - ////////////////////////////////////////////////////////////////////////////////////////// - for (std::size_t i = 0; i < input.size(); ++i) { - auto const& curr_input = input[i]; - auto const& curr_paths = paths[i]; - - freopen("/dev/null", "w", stdout); - std::vector> output_old_method; - for (auto const& path : curr_paths) { - output_old_method.emplace_back( - spark_rapids_jni::get_json_object(cudf::strings_column_view{curr_input}, path)); - } - auto const output_new_method = spark_rapids_jni::get_json_object_multiple_paths( - cudf::strings_column_view{curr_input}, curr_paths); - freopen("/dev/tty", "w", stdout); - - CUDF_EXPECTS(output_old_method.size() == output_new_method.size(), ""); - for (std::size_t j = 0; j < curr_paths.size(); ++j) { - auto const comp = cudf::test::detail::expect_columns_equal(output_old_method[j]->view(), - output_new_method[j]->view()); - if (!comp) { - printf("Failure at test data %d\n", (int)i + 1); - exit(0); - } - } - } - printf("All test passed.\n"); -#endif -} - -void BM_debug(nvbench::state& state) -{ - auto const stream = cudf::get_default_stream(); - state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); - - auto const old = state.get_int64("method") == 0; - // debug(old); -} - -NVBENCH_BENCH(BM_get_json_object_single) - .set_name("get_json_object_single") +NVBENCH_BENCH(BM_get_json_object) + .set_name("get_json_object") .add_int64_axis("size_bytes", {1'000'000, 10'000'000, 100'000'000, 1'000'000'000}) .add_int64_axis("max_depth", {2, 4, 6, 8}); - -NVBENCH_BENCH(BM_get_json_object_multiple).set_name("get_json_object_multiple"); - -// NVBENCH_BENCH(BM_test_path_1).set_name("test_path1"); -// NVBENCH_BENCH(BM_test_path_2).set_name("test_path2"); - -NVBENCH_BENCH(BM_debug).set_name("debug").add_int64_axis("method", {0, 1}); -NVBENCH_BENCH(BM_unit_tests).set_name("tests"); From a580453c0af93371d71f1365d5f80ef5b9e9bbc0 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 24 Jul 2024 22:55:13 -0700 Subject: [PATCH 5/5] Fix signed type Signed-off-by: Nghia Truong --- src/main/cpp/src/get_json_object.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/cpp/src/get_json_object.cu b/src/main/cpp/src/get_json_object.cu index 22dde0c8a0..c5d6595045 100644 --- a/src/main/cpp/src/get_json_object.cu +++ b/src/main/cpp/src/get_json_object.cu @@ -1075,8 +1075,8 @@ std::vector> get_json_object( rmm::exec_policy(stream), d_has_out_of_bound.begin(), d_has_out_of_bound.end(), 0); // Threshold to decide on using thread parallel or warp parallel algorithms. - constexpr int64_t PATH_SIZE_THRESHOLD = 8; - auto const exec_thread_parallel = json_paths.size() >= PATH_SIZE_THRESHOLD; + constexpr std::size_t PATH_SIZE_THRESHOLD = 8; + auto const exec_thread_parallel = json_paths.size() >= PATH_SIZE_THRESHOLD; launch_kernel(exec_thread_parallel, *d_input_ptr, d_path_data, stream); // Do not use parallel check since we do not have many elements.