From e84d0dd758a2423418b0827fa110cfee9bf84d80 Mon Sep 17 00:00:00 2001 From: Keita Iwabuchi Date: Thu, 21 Dec 2023 19:22:01 -0800 Subject: [PATCH 1/3] Add tests for parquet reader - Add test for parquet2json convertor - Add additional parallel parquet reading test --- .../io/arrow_parquet_stream_reader_json.cpp | 3 +- .../detail/arrow_parquet_json_converter.hpp | 72 ++++++------- test/CMakeLists.txt | 9 ++ .../parquet_files_different_sizes/0.parquet | Bin 0 -> 1461 bytes .../parquet_files_different_sizes/1.parquet | Bin 0 -> 1684 bytes .../parquet_files_different_sizes/2.parquet | Bin 0 -> 1461 bytes .../parquet_files_different_sizes/3.parquet | Bin 0 -> 1461 bytes .../parquet_files_different_sizes/4.parquet | Bin 0 -> 1658 bytes .../parquet_files_different_sizes/5.parquet | Bin 0 -> 1650 bytes .../parquet_files_different_sizes/6.parquet | Bin 0 -> 1650 bytes .../parquet_files_different_sizes/7.parquet | Bin 0 -> 1461 bytes test/data/parquet_files_json/data.parquet | Bin 0 -> 5165 bytes .../data.parquet | Bin 2583 -> 0 bytes test/test_arrow_parquet_stream_reader.cpp | 61 ++++++++++- .../test_arrow_parquet_stream_reader_json.cpp | 95 ++++++++++++++++++ 15 files changed, 197 insertions(+), 43 deletions(-) create mode 100644 test/data/parquet_files_different_sizes/0.parquet create mode 100644 test/data/parquet_files_different_sizes/1.parquet create mode 100644 test/data/parquet_files_different_sizes/2.parquet create mode 100644 test/data/parquet_files_different_sizes/3.parquet create mode 100644 test/data/parquet_files_different_sizes/4.parquet create mode 100644 test/data/parquet_files_different_sizes/5.parquet create mode 100644 test/data/parquet_files_different_sizes/6.parquet create mode 100644 test/data/parquet_files_different_sizes/7.parquet create mode 100644 test/data/parquet_files_json/data.parquet delete mode 100644 test/data/parquet_files_no_fixed_len_binary/data.parquet create mode 100644 test/test_arrow_parquet_stream_reader_json.cpp diff --git a/examples/io/arrow_parquet_stream_reader_json.cpp b/examples/io/arrow_parquet_stream_reader_json.cpp index 00cb526d..5b17d1dc 100644 --- a/examples/io/arrow_parquet_stream_reader_json.cpp +++ b/examples/io/arrow_parquet_stream_reader_json.cpp @@ -28,7 +28,8 @@ int main(int argc, char** argv) { << "Arrow Parquet file parser example (reads data as JSON objects)" << std::endl; - std::string dir_name = "../test/data/parquet_files_no_fixed_len_binary/"; + // assuming the build directory is inside the YGM root directory + std::string dir_name = "../test/data/parquet_files_json/"; if (argc == 2) { dir_name = argv[1]; } diff --git a/include/ygm/io/detail/arrow_parquet_json_converter.hpp b/include/ygm/io/detail/arrow_parquet_json_converter.hpp index c42148d0..97df3ebe 100644 --- a/include/ygm/io/detail/arrow_parquet_json_converter.hpp +++ b/include/ygm/io/detail/arrow_parquet_json_converter.hpp @@ -24,44 +24,39 @@ namespace ygm::io::detail { inline boost::json::value read_parquet_element_as_json_value( - const ygm::io::parquet_data_type& type_holder, + const ygm::io::parquet_data_type& type_holder, arrow_parquet_parser::parquet_stream_reader& stream) { boost::json::value out_value; out_value.emplace_null(); - switch (type_holder.type) { - case parquet::Type::BOOLEAN: - stream >> out_value.emplace_bool(); - break; - case parquet::Type::INT32: - stream >> out_value.emplace_int64(); - break; - case parquet::Type::INT64: - stream >> out_value.emplace_int64(); - break; - case parquet::Type::DOUBLE: - stream >> out_value.emplace_double(); - break; - case parquet::Type::FLOAT: - stream >> out_value.emplace_double(); - break; - case parquet::Type::BYTE_ARRAY: { - std::string buf; - stream >> buf; - out_value.emplace_string() = buf; - break; - } - case parquet::Type::FIXED_LEN_BYTE_ARRAY: { - throw std::runtime_error("FIXED_LEN_BYTE_ARRAY is not supported"); - break; - } - case parquet::Type::INT96: - throw std::runtime_error("INT96 is not supported"); - break; - case parquet::Type::UNDEFINED: - [[fallthrough]]; - default: - throw std::runtime_error("Undefined data type"); - break; + // Note: there is no uint types in Parquet + if (type_holder.type == parquet::Type::BOOLEAN) { + stream >> out_value.emplace_bool(); + } else if (type_holder.type == parquet::Type::INT32) { + int32_t buf; + stream >> buf; // need to read to an int32 variable + // Note: there is no int32 type in boost::json + out_value.emplace_int64() = int64_t(buf); + } else if (type_holder.type == parquet::Type::INT64) { + stream >> out_value.emplace_int64(); + } else if (type_holder.type == parquet::Type::FLOAT) { + float buf; + stream >> buf; // need to read to a float variable + // Note: there is no float type in boost::json + out_value.emplace_double() = double(buf); + } else if (type_holder.type == parquet::Type::DOUBLE) { + stream >> out_value.emplace_double(); + } else if (type_holder.type == parquet::Type::BYTE_ARRAY) { + std::string buf; + stream >> buf; + out_value.emplace_string() = buf; + } else if (type_holder.type == parquet::Type::FIXED_LEN_BYTE_ARRAY) { + throw std::runtime_error("FIXED_LEN_BYTE_ARRAY is not supported"); + + } else if (type_holder.type == parquet::Type::INT96) { + throw std::runtime_error("INT96 is not supported"); + + } else { + throw std::runtime_error("Undefined data type"); } return out_value; @@ -75,15 +70,14 @@ inline boost::json::object read_parquet_as_json_helper( const key_container& include_columns = key_container()) { boost::json::object object; for (size_t i = 0; i < schema.size(); ++i) { - const auto& data_type = std::get<0>(schema[i]); - const auto& colum_name = std::get<1>(schema[i]); + const auto& data_type = std::get<0>(schema[i]); + const auto& colum_name = std::get<1>(schema[i]); if (!read_all && std::find(std::begin(include_columns), std::end(include_columns), colum_name) == std::end(include_columns)) { continue; } - object[colum_name] = - read_parquet_element_as_json_value(data_type, reader); + object[colum_name] = read_parquet_element_as_json_value(data_type, reader); } reader.SkipColumns(schema.size()); reader.EndRow(); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index b9f7831f..8719e1ca 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -76,6 +76,15 @@ if (Arrow_FOUND AND Parquet_FOUND) add_ygm_test(test_arrow_parquet_stream_reader) target_link_libraries(MPI_test_arrow_parquet_stream_reader PUBLIC Arrow::arrow_shared Parquet::parquet_shared) + + if (Boost_FOUND) + add_ygm_test(test_arrow_parquet_stream_reader_json) + target_include_directories(MPI_test_arrow_parquet_stream_reader_json + PUBLIC + ${Boost_INCLUDE_DIRS}) + target_link_libraries(MPI_test_arrow_parquet_stream_reader_json PUBLIC + Arrow::arrow_shared Parquet::parquet_shared) + endif() endif() # diff --git a/test/data/parquet_files_different_sizes/0.parquet b/test/data/parquet_files_different_sizes/0.parquet new file mode 100644 index 0000000000000000000000000000000000000000..98418dbee5c6fb06828583da7ee7375380f99f58 GIT binary patch literal 1461 zcma)6ZEM>|5MI(Y9HpO*+Sp)<5j1d+Wl2uBOR%=;~-l`WX zYREr2?l<17NH-r-=o8J>&4pg z8wX9)>@?pUp`%}Z>(sfB)p*%H9-`J^N|rRR8V9KH`u#z505ws&)9(B~7(fQx=DW8D zwU1keu@yR2+WP0k1LuVfy}u=qp9f*eee8aF;O@W&7MEC|OS$7N7g_-YN0|FQ?9$9i zGT8PiJBqg^WhVdl9sD)kZ3Rv@u=^+rkNVE-zOSZro0Tyu$K|0%BBy4M2XVO#MD~i3 zj4=ynW;s@7LHYwnLrC{Rm!183BcJ@iJ?@a%XTd?s8NeQmC+9OB3qP{5vq9B~JsvxO z9U`nvb=`zoS(H@Ve9um-q-50M(n^x3U~p@iq+S#tPv6$<_ySXugiUUX!KPZ8pn`3QY2$ySnqGfhC^R7J6rsXuTAXiyM zt`bg{cHsDmSGY!8WU^p{_ zcUXKbj8aGEb3>?Wo5r?gKG%7l<=UZ^+7hd0#-@IMX9)jMcPs;U`KQrWhh)q+Vb9Tr zG2q{sUldjwIPwM6Mm=hX6|NGDJS)&vj#R}n(@>Xy(VKq(zLV55_2J4E)YuM4g{Nui z_~OR(7SFho67uB?`uD^OY6o&!_vkt2omePBFQH%jE+;d!1-W;*y7pLq;O-fj_zP#^ z7jTa=xJO`*M@#2?-mCF>-x501ae7hoPz3dNrF+o z!U;{EgqN``463`%Zf5yR8Wc*{W4+sG<#OQ!T1fTb-971B#P`gD37e`z%)t1ZO{{Nh1^w6OYyC3`|A|`N*h+rnG&>=2d{RBa X?>o>*v?*bl{tty+Y_Se6aro@juDD(sfZrSkupot$EKKL;oI453oBnDtcG zP=n$h>3i044Re02PwmKY1NL?0yLC!NA0Nx z=1Xdt#N;2ohsA>Lnv$DK+HIB_%e0N_ZEPXyI!rv0j*VPPpBf7Vk>e$6$5fk`dPtNx zW~dr!s6zNXYq}6_xg)ao>au36bR`s_#!@LN+t~z?!-Q<&O@zLymm_beVd?Sv%y34;4Sg{r;K! zdeK6~KeJzw!H-%P<8$~U<6DgDSQz0p#v|MsQA`lF{4oKK*ZwmMZnB0!S*TycI3^C5 zDGAK0paEDyJdhi=1EmKUrw}H*6dgMncBHyedppxkq)>Bv)|n+}X43Ulh!rQ(k?3~E znq%0a6^}%xG1lZO5)&EnJUDG(OF!z5b@v=F952^1y(PX7-{^tZ>fRY0DJDEUg*S+Q zP-_N;%tuOXk$b4Db4CN1w@Iw#N`clQ`G($5p5H39_vG!gL3{g7IqEm`4$*`yLvDD$ ze>=PoNp8@{$EMWeOhur)qSLfgN17Nd>JNuXe+(Sl-~#lX1lCY)Otf0x(;S`hNm9Ra zdO5Og9%>@VNri66(coYO*8a&463hm)8`bv3VxA2k!0U z+#6FKCq(raT zGtGi?w8VKYsmG@iY~5Ty&HvR{8!Fv9Rc=O#zO!A literal 0 HcmV?d00001 diff --git a/test/data/parquet_files_different_sizes/2.parquet b/test/data/parquet_files_different_sizes/2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..98418dbee5c6fb06828583da7ee7375380f99f58 GIT binary patch literal 1461 zcma)6ZEM>|5MI(Y9HpO*+Sp)<5j1d+Wl2uBOR%=;~-l`WX zYREr2?l<17NH-r-=o8J>&4pg z8wX9)>@?pUp`%}Z>(sfB)p*%H9-`J^N|rRR8V9KH`u#z505ws&)9(B~7(fQx=DW8D zwU1keu@yR2+WP0k1LuVfy}u=qp9f*eee8aF;O@W&7MEC|OS$7N7g_-YN0|FQ?9$9i zGT8PiJBqg^WhVdl9sD)kZ3Rv@u=^+rkNVE-zOSZro0Tyu$K|0%BBy4M2XVO#MD~i3 zj4=ynW;s@7LHYwnLrC{Rm!183BcJ@iJ?@a%XTd?s8NeQmC+9OB3qP{5vq9B~JsvxO z9U`nvb=`zoS(H@Ve9um-q-50M(n^x3U~p@iq+S#tPv6$<_ySXugiUUX!KPZ8pn`3QY2$ySnqGfhC^R7J6rsXuTAXiyM zt`bg{cHsDmSGY!8WU^p{_ zcUXKbj8aGEb3>?Wo5r?gKG%7l<=UZ^+7hd0#-@IMX9)jMcPs;U`KQrWhh)q+Vb9Tr zG2q{sUldjwIPwM6Mm=hX6|NGDJS)&vj#R}n(@>Xy(VKq(zLV55_2J4E)YuM4g{Nui z_~OR(7SFho67uB?`uD^OY6o&!_vkt2omePBFQH%jE+;d!1-W;*y7pLq;O-fj_zP#^ z7jTa=xJO`*M@#2?-mCF>-x501ae7hoPz3dNrF+o z!U;{EgqN``463`%Zf5yR8Wc*{W4+sG<#OQ!T1fTb-971B#P`gD37e`z%)t1ZO{{Nh1^w6OYyC3`|A|`N*h+rnG&>=2d{RBa X?>o>*v?*bl{tty+Y_Se6aro@j|5MI(Y9HpO*+Sp)<5j1d+Wl2uBOR%=;~-l`WX zYREr2?l<17NH-r-=o8J>&4pg z8wX9)>@?pUp`%}Z>(sfB)p*%H9-`J^N|rRR8V9KH`u#z505ws&)9(B~7(fQx=DW8D zwU1keu@yR2+WP0k1LuVfy}u=qp9f*eee8aF;O@W&7MEC|OS$7N7g_-YN0|FQ?9$9i zGT8PiJBqg^WhVdl9sD)kZ3Rv@u=^+rkNVE-zOSZro0Tyu$K|0%BBy4M2XVO#MD~i3 zj4=ynW;s@7LHYwnLrC{Rm!183BcJ@iJ?@a%XTd?s8NeQmC+9OB3qP{5vq9B~JsvxO z9U`nvb=`zoS(H@Ve9um-q-50M(n^x3U~p@iq+S#tPv6$<_ySXugiUUX!KPZ8pn`3QY2$ySnqGfhC^R7J6rsXuTAXiyM zt`bg{cHsDmSGY!8WU^p{_ zcUXKbj8aGEb3>?Wo5r?gKG%7l<=UZ^+7hd0#-@IMX9)jMcPs;U`KQrWhh)q+Vb9Tr zG2q{sUldjwIPwM6Mm=hX6|NGDJS)&vj#R}n(@>Xy(VKq(zLV55_2J4E)YuM4g{Nui z_~OR(7SFho67uB?`uD^OY6o&!_vkt2omePBFQH%jE+;d!1-W;*y7pLq;O-fj_zP#^ z7jTa=xJO`*M@#2?-mCF>-x501ae7hoPz3dNrF+o z!U;{EgqN``463`%Zf5yR8Wc*{W4+sG<#OQ!T1fTb-971B#P`gD37e`z%)t1ZO{{Nh1^w6OYyC3`|A|`N*h+rnG&>=2d{RBa X?>o>*v?*bl{tty+Y_Se6aro@ji#k30A#gMUNs#YA!F&_#FgD+E*`6?fx zX~RIjqUbNQ=!bOKWw!Zob(n(=6B>a7jxhIqSf!bfWUyq39mQbs#B{L9J^l!b1>ZFxcb2rae9^?cNZ9OSH(DI7SvFM3?WGiIJ38R$LlM5*3(i?j)%f zh1km-AF)R)^9bH$>J5-T;xI-vKS$&7x%z6+Ld`$wU*o~K5<=(#et7(VP#1|V>LcVL z$t6Bf+`Mh^UZGEll2 z|7LEUDw!FMymL=9opP8M5G&6jR}P0$Gq8QxD_l7crlxv@V{%))OV1?aS;yn46<&Z3 z70dP9lH`~2YdMvr;k`Xj3d}RoI2+?S-%D+k%Qb$Hd!ns#=DN!HIMxi6)Rb_(Njy>S zZZ-ZBc?W&a-oMw5$AUG$nviF!LJa)3lS`5tG;)Qb^jJp|39lUVJR?wMwnXr3qK&7( z!RnWw_cZk;sxUM8acl+_;Zv0H;Ow32-8|85l@Q<0pnfY|kUQw7iyA%Eyn%%x)Dr5& zZEEs!`?bz4>&uy%C~WQt&VhY1yYO)BOjTiZbv(5PC+jn~QCM8(nH4DD@0GZIaDJ(w z4oh7d3X3xo1+Yu=*djc3pp+Kx&rJ{ZA<1{6vRj6S^c?(~ie*k7{*8Bhf literal 0 HcmV?d00001 diff --git a/test/data/parquet_files_different_sizes/5.parquet b/test/data/parquet_files_different_sizes/5.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9abca47b93e9a35445c25ba2c6b33a51f651d140 GIT binary patch literal 1650 zcmcIl%Wm676r~tfaDgt2AQU7(3xPn5E@YRa#WYIM#gMUNs#YA+F&`=lgD+E*Ns8q| zG;R1t`Ufrg0bO=k^s9Pj^oV3+(?tgm=gzs8=bpK9hx&k2p4np#m{;cv%}_5XYQIhW z^XvB%42EJ}GL0_a2TYx*x10NG&nK%vee)RtMpDTc<{4qyV2d#$cWHS3u(^&$&<_P(Y`7OFwF+O78g znjPiD*tN3e?=K$NAa=d`8)qICQJk@FJ3l?LcYuM#rdI5GY`4S4 zR^$N?7hwpiEVt4emK?F?379-FJz_rn42vb-4I%edw7V>~ffu?ryVz3JOc$XJ8V zbOdq!x<~4;qhbSZiNGaV=1e^+pQl7u7}%+mR#;Y2S!p^iG1f)Vgj3MJ3!ldt%C@dyJt!D`D7xtN!+wnNK z2qfF9MyUm{%53f{@n~vCZm0yMuSDY1)~|6);p+F1&v}MKc$KIhSdlh!W#Z2#hB^ff&b$J>XIU`O z#hERri5)qFPt(-l`A0vveWu;1AbyZT{nol5cQnYBHF~aj%NI*fOQ;vWttl+rcP6*0 zFK-)?xV;U#5EBE87SDw#MaC-$c|D;iv7^4SE?-zt)rWE6=Ua7>~x~r~N z{ROj+`}Yj@AB;=Vo!COBGHFt2-M<8c2lOD41XC{&Tm)Fa`z7w%)ga912dddgtn`<{ g%a5ESIU<*^v$|xR<@IZ|+wh_O?F)+f0RJ}s0us>DQUCw| literal 0 HcmV?d00001 diff --git a/test/data/parquet_files_different_sizes/6.parquet b/test/data/parquet_files_different_sizes/6.parquet new file mode 100644 index 0000000000000000000000000000000000000000..12a8d4aa6e0c3079cfc4ddc4840ed0ebd24c0cb6 GIT binary patch literal 1650 zcmcIl%Wm676df_H;GkI;K`2Oo76O47UC1sei%Aq9i=kr4RINCcZ9WtP24AKqlN8H` zXxi`xwde=;tv3AVNsbi%?S>U9GZKMW-KX6V6;a=#Hy;qmLgLmBHhL~*?99%FV*pjgVpXl(y z^i7-AVN1mt-Vi}bw8$AdMmmeBF5fd_BQA)ns4(JqmJ_nM6DQs*B&cw7$RG0DBcRLF z8_w3DKzykB0*^*#>ib0tRsSgeOorfgF~;ZcA>$W}+gNgO4`Ua1U5ZK4mOm%p@!Efe z!A;gMC=2z=80W+RGbMp}6{LeD<+>`~>e>)8&LB*BD+hiy?kjDr^=WRND5)6^y)#cT zonjCh5Gzk-t{e^~W?=iWm%DNxPE7TR#ALR5mz;L7XB~|uR(K8=&X?=C)s^4KACD8c zJGi&|N=|r&nqU*Z-s&W_Dr8z~k$bGIb7s0K_$1a0mBj3leB)@W-rZ`g@8s?GKzr|A zI~s{rpJ>9Kt%?!w-;OV2k{dJ%xubM=N0TV89CSP*P^Na5`qQyCng9o{Ux40|#2c&P z)NG9+Gq5P1qKx{dm#%m7M7vc$d@qIit#m=|pqDIa^i=bPFO;B`P%mLqlb_iib$(f2 z!PF#i^Gt9L{HN);M`~xP3a_i9iQPY5-@%Q-le12(Kmoi{kov*>C5AdEbZsCm?oblJ zF3cl~^4Nh=SS^2UdT@@ed^arG`RNpUHy2RzZyH-;ZE$a>os4o!46nR2lNp(mXEBlcz(5d9e${P`vIZP@Ne@kT71pG literal 0 HcmV?d00001 diff --git a/test/data/parquet_files_different_sizes/7.parquet b/test/data/parquet_files_different_sizes/7.parquet new file mode 100644 index 0000000000000000000000000000000000000000..98418dbee5c6fb06828583da7ee7375380f99f58 GIT binary patch literal 1461 zcma)6ZEM>|5MI(Y9HpO*+Sp)<5j1d+Wl2uBOR%=;~-l`WX zYREr2?l<17NH-r-=o8J>&4pg z8wX9)>@?pUp`%}Z>(sfB)p*%H9-`J^N|rRR8V9KH`u#z505ws&)9(B~7(fQx=DW8D zwU1keu@yR2+WP0k1LuVfy}u=qp9f*eee8aF;O@W&7MEC|OS$7N7g_-YN0|FQ?9$9i zGT8PiJBqg^WhVdl9sD)kZ3Rv@u=^+rkNVE-zOSZro0Tyu$K|0%BBy4M2XVO#MD~i3 zj4=ynW;s@7LHYwnLrC{Rm!183BcJ@iJ?@a%XTd?s8NeQmC+9OB3qP{5vq9B~JsvxO z9U`nvb=`zoS(H@Ve9um-q-50M(n^x3U~p@iq+S#tPv6$<_ySXugiUUX!KPZ8pn`3QY2$ySnqGfhC^R7J6rsXuTAXiyM zt`bg{cHsDmSGY!8WU^p{_ zcUXKbj8aGEb3>?Wo5r?gKG%7l<=UZ^+7hd0#-@IMX9)jMcPs;U`KQrWhh)q+Vb9Tr zG2q{sUldjwIPwM6Mm=hX6|NGDJS)&vj#R}n(@>Xy(VKq(zLV55_2J4E)YuM4g{Nui z_~OR(7SFho67uB?`uD^OY6o&!_vkt2omePBFQH%jE+;d!1-W;*y7pLq;O-fj_zP#^ z7jTa=xJO`*M@#2?-mCF>-x501ae7hoPz3dNrF+o z!U;{EgqN``463`%Zf5yR8Wc*{W4+sG<#OQ!T1fTb-971B#P`gD37e`z%)t1ZO{{Nh1^w6OYyC3`|A|`N*h+rnG&>=2d{RBa X?>o>*v?*bl{tty+Y_Se6aro@jD0Uip=&^@l7* zpTGC_KF|BgKH%#aCc>nd^*tuakeh_Cgj^536(aT(cAgAjUT4^KuvvjmG|9$X4I6Vz zNSxdbkb-vk78*xG;VZWY_iZT2L=rg?T}iNEU1h@YX#ATH{Bj3e>_y@x)&V$S!US$B z@E|z2!Q4cU#0D7(u_1&-pI~s6wSg>g|GJW7NMbEY5^O~7b`1*SApVH^^vYYT-z@=_ z@Iw|!ku z`PQbpoL`5ye_ZAMa1|DGG0Y{w183snn~<;GC1mvhA-{UCFpCIES-5}oE>d*yHjn!6 zKS?q%m+xbvD-At-|9hGHVj1|paGwhnh|=FbOQFcxgQcstKimYQaDDjBB3ip^K;!n$ z+5@D;wZD*p#NR(RlFYJ$KB{&{vZ1-8Ll6D+I``3aAbl~;CD8+?bBnp{me0*?EPP4A zyDZ~fQRb=NBDcee!iFUY)IP9Z{4B{_bHFak)1h`G_50G40{&en{z-`Y=L+}vO41V{ z2vFkV9ATparK6b=bAvFEwJ`1-(3UWu{R{mGL%Vxyy~KbyYw+U1 zv_|%l06!j0BoiOJ%3bOQ?mgVg-X^$L-M!q)ZZz)OS6%zuODWj+pI?hdxmf%!uf0x4 zVx5ilBukYB?3b}8Dcw@F$&u1EMy54Ly}9~}C#h4A!6?<2thSa~Urkw(se!;683w!! zhEjhBuW35&_F%X{hZ+tuPk#w7cDy4RTyw(uqa5p+p&t2VHmsxJw1??ZBTHAhs*PY| z_NFsMS3c2{AtE!ip`=Pf3Cy3QGz-jitBt;oAC>OBk$UO@<42LwRrL3vaAb2R=rKZB z@L`yVJrIzfr|W>knN|u<^2Ic+bS{xAwtRV9>9f+%GkP)4OgxupCiiEWiN7b+)@EQT zTfCUcmM^5T#m`Db_sGRmGx1z1D&NSWgFl%^@vL1Nqe%dNyBcRwu+(4eW^diPU%hI2<(m#!J#&@`A6soWA{Fnznc+w#7bd6dtDvVn~?zGjc@%(lgd<*w0rZE!Bjm>6u|4iO( z3}vgPzpoc$ZCdS1U|TuqwkuY(Bb%yG(Z}tInd`{HW3(Bm!ep>l^psj2*267Sei9J!6 zoHnxodCMihUOI1X)N@K1G4Z-8h~o96iy_=kULOtnM0PS0`JC*?n6dfY%tQ+pVT}y)TS@MOV7izDG7~%Sjx}p4f|lmvsHk4hnmiK>&Bxu7rB96N$;Q7>n{t9 zbW5mr)bh@(PmUjs+t5>B-Ld-U&ad6|2xQuz$|hS2PrBzb9umX^fh_IRQKUnC|Zi?%&Q{4_kK)_xmDl>b!`z zZE8f%gZl^f@7+W7;bSV?Mg8ok0WUln%09jz^f??#Z?y1e0&Rhxpbt;uLss{~Gm96? z`Ao~47?4zO?2G))BlHLD&Nc;6aL+5(^)Um>&5AUZ^SD034kUP)+Ae zmA2iEGoN`rFl(WQYgdHTr2b<(x19^8%kys&cC{zU_yqwZDEO TK7hY~7k|lbeoDvz{G;&y##Mcz literal 0 HcmV?d00001 diff --git a/test/data/parquet_files_no_fixed_len_binary/data.parquet b/test/data/parquet_files_no_fixed_len_binary/data.parquet deleted file mode 100644 index 9dbfab650397e1eebabb81e37896c3e41b140671..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2583 zcmd5;TW{h<6doW8teQwsg%Lf5mdE;Z$ zIul{g#|RxGwB-OKO!LSO9X_ld&u{Bv^T^11uP-a}6Vz>WpLCfrz4&fgv< zm`$&oAF0|@ndr8ok1d+$Yk%~WKm9hG{w0*SV`^RH;&lPVf>ULvnGzF(Qac`uhn|s3)4_j3m)T`^*%Tm? z^1R;?^1B?7q1YyFw_k{PcWh#Rg+3$e(?Jnq+=5Nkml$VpVSq~*4{&ind4<9q`J6D1 zZsb%S-eFNIR9?k+jzJGjY6MQI1hfacFz<=Nd9T~RxY>Yfh5f2&%=+a{w#zp!wVKO37>h&}VzZXIbFn@*-C+lAMI0EV}mkl_p!&NiKc6FD}lz{2G~67(#wSh0C#8 zw_uPQ2`=hk&(j6l-GcDWIK>Xz6Wc?zd_=IDsp4XK2KZ^q+N>vXp1roA@Xe*vEwGZ9 zo4qq8z_^#G*-E8t%Y2sNuns3x6zm?KI|p0{7T}YcYn|Ki=XA;JjMRU?%B%6RFP2z2 zQzfwjb_=^_)rN|7PU`{q!*lIIT~a@Mzgt_;_z6a%qg8!`XN_VFaL4*hoP9&Fnlye= z(}td1z&-t}g%3$TYJ?op{-phDAHzJ5_HCWePNq$JJsKDniL87VKhpHjz*L{R)~`~1 z7$RbfQc7I?m5L9K$mmht38i49g<%lO7%OgYS4N4r<=U%wSLs0ru~%s>dvh{pbD|Gd x+@S>tSiD}Jqgc|GCg099=b2*L&*(=a>3t-U?&l+Z0RJD|Kcs}<_aFQu{2L 0) { + int64_t buf; + stream_reader >> buf; + local_sum += buf; + } + stream_reader.SkipColumns(field_count); + stream_reader.EndRow(); + local_count++; + }); + + world.barrier(); + const auto sum = world.all_reduce_sum(local_sum); + ASSERT_RELEASE(sum == 11111111111); + const auto row_count = world.all_reduce_sum(local_count); + ASSERT_RELEASE(row_count == 11); + } + return 0; } diff --git a/test/test_arrow_parquet_stream_reader_json.cpp b/test/test_arrow_parquet_stream_reader_json.cpp new file mode 100644 index 00000000..dc63a180 --- /dev/null +++ b/test/test_arrow_parquet_stream_reader_json.cpp @@ -0,0 +1,95 @@ +// Copyright 2019-2022 Lawrence Livermore National Security, LLC and other YGM +// Project Developers. See the top-level COPYRIGHT file for details. +// +// SPDX-License-Identifier: MIT + +#undef NDEBUG + +#include +#include +#include +#include +#include + +int main(int argc, char** argv) { + ygm::comm world(&argc, &argv); + + const std::string dir_name = "data/parquet_files_json/"; + + ygm::io::arrow_parquet_parser parquetp(world, {dir_name}); + + static size_t cnt1 = 0; + static size_t cnt2 = 0; + static size_t cnt3 = 0; + + // Test if ygm::io::detail::read_parquet_as_json can read all supported data + // types correctly + const auto& schema = parquetp.schema(); + parquetp.for_all([&schema, &world](auto& stream_reader, const auto&) { + const auto obj = + ygm::io::detail::read_parquet_as_json(stream_reader, schema); + + world.async( + 0, + [](auto, const auto& obj) { + ASSERT_RELEASE(obj.contains("id")); + ASSERT_RELEASE(obj.contains("bool")); + ASSERT_RELEASE(obj.contains("int32")); + ASSERT_RELEASE(obj.contains("int64")); + ASSERT_RELEASE(obj.contains("float")); + ASSERT_RELEASE(obj.contains("double")); + ASSERT_RELEASE(obj.contains("byte_array")); + + ASSERT_RELEASE(obj.at("id").is_int64()); + ASSERT_RELEASE(obj.at("bool").is_bool()); + ASSERT_RELEASE(obj.at("int32").is_int64()); + ASSERT_RELEASE(obj.at("int64").is_int64()); + ASSERT_RELEASE(obj.at("float").is_double()); + ASSERT_RELEASE(obj.at("double").is_double()); + ASSERT_RELEASE(obj.at("byte_array").is_string()); + + const auto id = obj.at("id").as_int64(); + if (id == 0) { + ASSERT_RELEASE(obj.at("bool").as_bool() == true); + ASSERT_RELEASE(obj.at("int32").as_int64() == -1); + ASSERT_RELEASE(obj.at("int64").as_int64() == -(1ULL << 32) - 1); + ASSERT_RELEASE(obj.at("float").as_double() == 1.5); + ASSERT_RELEASE(obj.at("double").as_double() == 10.5); + ASSERT_RELEASE(obj.at("byte_array").as_string() == "aa"); + ++cnt1; + } else if (id == 1) { + ASSERT_RELEASE(obj.at("bool").as_bool() == false); + ASSERT_RELEASE(obj.at("int32").as_int64() == -2); + ASSERT_RELEASE(obj.at("int64").as_int64() == -(1ULL << 32) - 2); + ASSERT_RELEASE(obj.at("float").as_double() == 2.5); + ASSERT_RELEASE(obj.at("double").as_double() == 20.5); + ASSERT_RELEASE(obj.at("byte_array").as_string() == "bb"); + ++cnt2; + } else if (id == 2) { + ASSERT_RELEASE(obj.at("bool").as_bool() == true); + ASSERT_RELEASE(obj.at("int32").as_int64() == -3); + ASSERT_RELEASE(obj.at("int64").as_int64() == -(1ULL << 32) - 3); + ASSERT_RELEASE(obj.at("float").as_double() == 3.5); + ASSERT_RELEASE(obj.at("double").as_double() == 30.5); + ASSERT_RELEASE(obj.at("byte_array").as_string() == "cc"); + ++cnt3; + } else { + ASSERT_RELEASE(false); + } + }, + obj); + }); + world.barrier(); + + if (world.rank0()) { + ASSERT_RELEASE(cnt1 == 1); + ASSERT_RELEASE(cnt2 == 1); + ASSERT_RELEASE(cnt3 == 1); + } else { + ASSERT_RELEASE(cnt1 == 0); + ASSERT_RELEASE(cnt2 == 0); + ASSERT_RELEASE(cnt3 == 0); + } + + return 0; +} From 41403679b3aa06b361603381dbf0012e84e6fb2c Mon Sep 17 00:00:00 2001 From: Keita Iwabuchi Date: Fri, 22 Dec 2023 12:16:59 -0800 Subject: [PATCH 2/3] Fix wrong indent --- test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8719e1ca..5ebbc353 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -77,7 +77,7 @@ if (Arrow_FOUND AND Parquet_FOUND) target_link_libraries(MPI_test_arrow_parquet_stream_reader PUBLIC Arrow::arrow_shared Parquet::parquet_shared) - if (Boost_FOUND) + if (Boost_FOUND) add_ygm_test(test_arrow_parquet_stream_reader_json) target_include_directories(MPI_test_arrow_parquet_stream_reader_json PUBLIC From 4686d36713818639a2d96bf47b32c5ccdabe2f3d Mon Sep 17 00:00:00 2001 From: Keita Iwabuchi Date: Fri, 22 Dec 2023 12:19:35 -0800 Subject: [PATCH 3/3] Brush up on parque reader test --- test/test_arrow_parquet_stream_reader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_arrow_parquet_stream_reader.cpp b/test/test_arrow_parquet_stream_reader.cpp index 30018db6..fd782945 100644 --- a/test/test_arrow_parquet_stream_reader.cpp +++ b/test/test_arrow_parquet_stream_reader.cpp @@ -88,7 +88,7 @@ int main(int argc, char** argv) { "data/parquet_files_different_sizes/"; // This test case tests the following cases (assuming there are 4 - // processes): + // processes, and Arrow >= v14): // 1. 0 item files at the top and end. // 2. read a large file by multiple processes. // 3. a small file is read by a single process.