From ecf262a455d7936ff07a3f30f2513a4032bca112 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 23 Mar 2023 20:28:39 +0300 Subject: [PATCH 01/10] Unit-test of supporting nulls in ORC files --- examples/orc_null_support.q | 273 ++++++++++++++++++++++++++++++++++++ 1 file changed, 273 insertions(+) create mode 100644 examples/orc_null_support.q diff --git a/examples/orc_null_support.q b/examples/orc_null_support.q new file mode 100644 index 0000000..ff7000b --- /dev/null +++ b/examples/orc_null_support.q @@ -0,0 +1,273 @@ +// orc_null_support.q +// Examples of creating a schema supporting null mapping and using it to read/write +// Apache ORC file with exposing null bitmap as a separate structure to kdb + +-1"\n+----------|| orc_null_support.q ||----------+\n"; + +// import the arrowkdb library +\l q/arrowkdb.q + +// Filesystem functions for Linux/MacOS/Windows +ls:{[filename] $[.z.o like "w*";system "dir /b ",filename;system "ls ",filename]}; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +///////////////////////// +// CONSTRUCTED SCHEMAS // +///////////////////////// + +//-------------------// +// Create the schema // +//-------------------// + +// Support null mapping in Apache ORC +int_opts:(`bool`int8`int16`int32`int64)!(1b;0x02;3h;4i;5); +float_opts:(`float32`float64`decimal)!(9.87e;6.54;3.21f); +cont_opts:(`utf8`binary)!("start";"x"$"alert"); +time_opts:(`date32`timestamp)!(2012.11.10;2011.01.01D00:00:00.000000000); + +compound_options:(``NULL_MAPPING)!((::);int_opts,float_opts,cont_opts,time_opts); + +// Create the datatype identifiers +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +i8_dt:.arrowkdb.dt.int8[]; +i16_dt:.arrowkdb.dt.int16[]; +i32_dt:.arrowkdb.dt.int32[]; +i64_dt:.arrowkdb.dt.int64[]; +f64_dt:.arrowkdb.dt.float64[]; + +str_dt:.arrowkdb.dt.utf8[]; +bin_dt:.arrowkdb.dt.binary[]; +dec_dt:.arrowkdb.dt.decimal128[38i;2i]; + +bool_dt:.arrowkdb.dt.boolean[]; +f32_dt:.arrowkdb.dt.float32[]; +d32_dt:.arrowkdb.dt.date32[]; + +// Create the field identifiers +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +i8_fd:.arrowkdb.fd.field[`int8;i8_dt]; +i16_fd:.arrowkdb.fd.field[`int16;i16_dt]; +i32_fd:.arrowkdb.fd.field[`int32;i32_dt]; +i64_fd:.arrowkdb.fd.field[`int64;i64_dt]; +f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; + +str_fd:.arrowkdb.fd.field[`string;str_dt]; +bin_fd:.arrowkdb.fd.field[`binary;bin_dt]; +dec_fd:.arrowkdb.fd.field[`decimal;dec_dt]; + +bool_fd:.arrowkdb.fd.field[`bool;bool_dt]; +f32_fd:.arrowkdb.fd.field[`float32;f32_dt]; +d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; + +numeric_schema:.arrowkdb.sc.schema[(ts_fd, i16_fd, i32_fd, i64_fd, f64_fd)]; +contiguous_schema:.arrowkdb.sc.schema[(str_fd, bin_fd, dec_fd)]; + +// Create a field containing the list datatype +list_dt:.arrowkdb.dt.list[i8_fd]; +list_fd:.arrowkdb.fd.field[`list_field;list_dt]; + +// Create a field containing the struct datatype +struct_dt:.arrowkdb.dt.struct[(bool_fd,f32_fd,d32_fd)]; +struct_fd:.arrowkdb.fd.field[`struct_field;struct_dt]; + +// Create fields containing the map datatype +map_dt:.arrowkdb.dt.map[i64_dt;f64_dt] +map_fd:.arrowkdb.fd.field[`map;map_dt]; + +// Create the schema containing the list and struct fields +compound_schema:.arrowkdb.sc.schema[(list_fd,struct_fd,map_fd)]; + +// Print the schema +-1"\nNumeric schema:"; +.arrowkdb.sc.printSchema[numeric_schema]; + +-1"\nContiguous schema:"; +.arrowkdb.sc.printSchema[contiguous_schema]; + +-1"\nCompound schema:"; +.arrowkdb.sc.printSchema[compound_schema]; + +// Number of items in each array +N:5 + +// Create data for each column in the table +ts_data:asc N?0p; + +i16_data:N?100h; +i16_data[0]:3h; +i32_data:N?100i; +i32_data[1]:4i; +i64_data:N?100; +i64_data[2]:5; +f64_data:N?100f; +f64_data[3]:6.54f; + +str_data:N?("start";"stop";"alert";"acknowledge";""); +str_data[0]:"start" +bin_data:N?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); +bin_data[1]:"x"$"alert" +dec_data:{"F"$.Q.f[2]x} each N?(10f) +dec_data[2]:3.21f + +N:3 +bool_data:N?(0b;1b); +bool_data[0]:1b; +f32_data:N?100e; +f32_data[1]:9.87e; +d32_data:N?(2012.11.10;2010.07.18;2011.07.16;2014.07.15;2016.07.11); +d32_data[2]:2012.11.10; + +// Combine the data for numeric columns +numeric_data:(ts_data;i16_data;i32_data;i64_data;f64_data); +// Combine the data for contiguous columns +contiguous_data:(str_data;bin_data;dec_data); + +// Combine the array data for the list and struct columns +list_array:(enlist 0x00;(0x0102);(0x030405)); +struct_array:(bool_data;f32_data;d32_data); +map_array:((enlist 1)!(enlist 1.23);(2 2)!(4.56 7.89);(3 3 3)!(9.87 6.54 3.21)) +compound_data:(list_array;struct_array;map_array); + +// Pretty print the Arrow table populated from the numeric data +compound_options[`DECIMAL128_AS_DOUBLE]:1 + +-1"\nNumeric table:"; +.arrowkdb.tb.prettyPrintTable[numeric_schema;numeric_data;compound_options]; + +// Show the string data as an arrow table +-1"\nContiguous table:"; +.arrowkdb.tb.prettyPrintTable[contiguous_schema;contiguous_data;compound_options] + +// Show the list data as an arrow table +-1"\nCompound table:"; +.arrowkdb.tb.prettyPrintTable[compound_schema;compound_data;compound_options] + +//-------------------------// +// Example-1. Arrow IPC file // +//-------------------------// + +// Write the schema and array data to a arrow file +arrow_numeric:"numeric_bitmap.arrow"; +arrow_contiguous:"contiguous_bitmap.arrow"; +arrow_compound:"compound_bitmap.arrow"; + +.arrowkdb.ipc.writeArrow[arrow_numeric;numeric_schema;numeric_data;compound_options]; +.arrowkdb.ipc.writeArrow[arrow_contiguous;contiguous_schema;contiguous_data;compound_options]; +.arrowkdb.ipc.writeArrow[arrow_compound;compound_schema;compound_data;compound_options]; + +show ls arrow_numeric +show ls arrow_contiguous +show ls arrow_compound + +// Read the schema back and compare +compound_options[`WITH_NULL_BITMAP]:1; + +arrow_numeric_schema:.arrowkdb.ipc.readArrowSchema[arrow_numeric]; +arrow_contiguous_schema:.arrowkdb.ipc.readArrowSchema[arrow_contiguous]; +arrow_compound_schema:.arrowkdb.ipc.readArrowSchema[arrow_compound]; + +show .arrowkdb.sc.equalSchemas[numeric_schema;arrow_numeric_schema] +show .arrowkdb.sc.equalSchemas[contiguous_schema;arrow_contiguous_schema] +show .arrowkdb.sc.equalSchemas[compound_schema;arrow_compound_schema] + +show numeric_schema~arrow_numeric_schema +show contiguous_schema~arrow_contiguous_schema +show compound_schema~arrow_compound_schema + +// Read the array data back and compare +arrow_numeric_data:.arrowkdb.ipc.readArrowData[arrow_numeric;compound_options]; +arrow_contiguous_data:.arrowkdb.ipc.readArrowData[arrow_contiguous;compound_options]; +arrow_compound_data:.arrowkdb.ipc.readArrowData[arrow_compound;compound_options]; + +show numeric_data~first arrow_numeric_data +show contiguous_data~first arrow_contiguous_data +show compound_data~first arrow_compound_data + +// Compare null bitmaps of arrow data +compound_numeric_nulls:(00000b;10000b;01000b;00100b;00010b); +compound_contiguous_nulls:(10000b;01000b;00100b); +compound_list_nulls:(enlist 0b;01b;000b); +compound_struct_nulls:(100b;010b;001b); +compound_map_nulls:((enlist 0b)!(enlist 0b);00b!00b;000b!010b) + +arrow_numeric_nulls:last arrow_numeric_data; +arrow_contiguous_nulls:last arrow_contiguous_data; +arrow_list_nulls:last[arrow_compound_data][0] +arrow_struct_nulls:last[arrow_compound_data][1] +arrow_map_nulls:last[arrow_compound_data][2] + +show compound_numeric_nulls~compound_numeric_nulls & arrow_numeric_nulls +show compound_contiguous_nulls~compound_contiguous_nulls & arrow_contiguous_nulls +show compound_list_nulls~arrow_list_nulls +show compound_struct_nulls~compound_struct_nulls & arrow_struct_nulls +show compound_map_nulls~arrow_map_nulls + +rm arrow_numeric; +rm arrow_contiguous; +rm arrow_compound; + +//---------------------------// +// Example-2. Apache ORC file// +//---------------------------// + +// Write the schema and array data to a ORC file +compound_options[`ORC_CHUNK_SIZE]:1024 + +orc_numeric:"numeric_bitmap.orc"; +orc_contiguous:"contiguous_bitmap.orc"; +orc_compound:"compound_bitmap.orc"; + +.arrowkdb.orc.writeOrc[orc_numeric;numeric_schema;numeric_data;compound_options] +.arrowkdb.orc.writeOrc[orc_contiguous;contiguous_schema;contiguous_data;compound_options] +.arrowkdb.orc.writeOrc[orc_compound;compound_schema;compound_data;compound_options] + +show ls orc_numeric +show ls orc_contiguous +show ls orc_compound + +// Read the schema back and compare +orc_numeric_schema:.arrowkdb.orc.readOrcSchema[orc_numeric]; +orc_contiguous_schema:.arrowkdb.orc.readOrcSchema[orc_contiguous]; +orc_compound_schema:.arrowkdb.orc.readOrcSchema[orc_compound]; + +show .arrowkdb.sc.equalSchemas[numeric_schema;orc_numeric_schema] +show .arrowkdb.sc.equalSchemas[contiguous_schema;orc_contiguous_schema] +show .arrowkdb.sc.equalSchemas[compound_schema;orc_compound_schema] + +show numeric_schema~orc_numeric_schema +show contiguous_schema~orc_contiguous_schema +show compound_schema~orc_compound_schema + +// Read the array data back and compare +orc_numeric_data:.arrowkdb.orc.readOrcData[orc_numeric;compound_options]; +orc_contiguous_data:.arrowkdb.orc.readOrcData[orc_contiguous;compound_options]; +orc_compound_data:.arrowkdb.orc.readOrcData[orc_compound;compound_options]; + +show numeric_data~first orc_numeric_data +show contiguous_data~first orc_contiguous_data +show compound_data~first orc_compound_data + +// Compare null bitmaps of arrow data +orc_numeric_nulls:last orc_numeric_data; +orc_contiguous_nulls:last orc_contiguous_data; +orc_list_nulls:last[orc_compound_data][0] +orc_struct_nulls:last[orc_compound_data][1] +orc_map_nulls:last[orc_compound_data][2] + +show compound_numeric_nulls~compound_numeric_nulls & orc_numeric_nulls +show compound_contiguous_nulls~compound_contiguous_nulls & orc_contiguous_nulls +show compound_list_nulls~orc_list_nulls +show compound_struct_nulls~compound_struct_nulls & orc_struct_nulls +show compound_map_nulls~orc_map_nulls + +rm orc_numeric; +rm orc_contiguous; +rm orc_compound; + + +-1 "\n+----------------------------------------+\n"; + +// Process off +exit 0; From c96e0af9f2cdf64aacf3183273ee7b09354b6b8f Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 24 Mar 2023 11:28:44 +0300 Subject: [PATCH 02/10] Supporting nulls in ORC files --- src/TableData.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/TableData.cpp b/src/TableData.cpp index 0e34b97..7302038 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -861,6 +861,20 @@ K readORCData(K orc_file, K options) kK(data)[i] = kx::arrowkdb::ReadChunkedArray(chunked_array, type_overrides); } + int64_t with_null_bitmap = 0; + read_options.GetIntOption( kx::arrowkdb::Options::WITH_NULL_BITMAP, with_null_bitmap ); + if( with_null_bitmap ){ + K bitmap = ktn( 0, col_num ); + for( auto i = 0; i < col_num; ++i ){ + auto chunked_array = table->column( i ); + kK( bitmap )[i] = kx::arrowkdb::ReadChunkedArrayNullBitmap( chunked_array, type_overrides ); + } + K array = data; + data = ktn( 0, 2 ); + kK( data )[0] = array; + kK( data )[1] = bitmap; + } + return data; #endif From d82b98f62f26e7ccda511107ed602492cf87c37e Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 24 Mar 2023 17:07:10 +0300 Subject: [PATCH 03/10] Unit-tests of null support for ORCs --- examples/orc_null_support.q | 30 +++--- tests/orc_dataloader/orc_compound_nulls.t | 104 ++++++++++++++++++++ tests/orc_dataloader/orc_contiguous_nulls.t | 79 +++++++++++++++ tests/orc_dataloader/orc_numeric_nulls.t | 84 ++++++++++++++++ 4 files changed, 282 insertions(+), 15 deletions(-) create mode 100644 tests/orc_dataloader/orc_compound_nulls.t create mode 100644 tests/orc_dataloader/orc_contiguous_nulls.t create mode 100644 tests/orc_dataloader/orc_numeric_nulls.t diff --git a/examples/orc_null_support.q b/examples/orc_null_support.q index ff7000b..fe03d38 100644 --- a/examples/orc_null_support.q +++ b/examples/orc_null_support.q @@ -186,11 +186,11 @@ show contiguous_data~first arrow_contiguous_data show compound_data~first arrow_compound_data // Compare null bitmaps of arrow data -compound_numeric_nulls:(00000b;10000b;01000b;00100b;00010b); -compound_contiguous_nulls:(10000b;01000b;00100b); -compound_list_nulls:(enlist 0b;01b;000b); -compound_struct_nulls:(100b;010b;001b); -compound_map_nulls:((enlist 0b)!(enlist 0b);00b!00b;000b!010b) +numeric_nulls:(00000b;10000b;01000b;00100b;00010b); +contiguous_nulls:(10000b;01000b;00100b); +list_nulls:(enlist 0b;01b;000b); +struct_nulls:(100b;010b;001b); +map_nulls:((enlist 0b)!(enlist 0b);00b!00b;000b!010b) arrow_numeric_nulls:last arrow_numeric_data; arrow_contiguous_nulls:last arrow_contiguous_data; @@ -198,11 +198,11 @@ arrow_list_nulls:last[arrow_compound_data][0] arrow_struct_nulls:last[arrow_compound_data][1] arrow_map_nulls:last[arrow_compound_data][2] -show compound_numeric_nulls~compound_numeric_nulls & arrow_numeric_nulls -show compound_contiguous_nulls~compound_contiguous_nulls & arrow_contiguous_nulls -show compound_list_nulls~arrow_list_nulls -show compound_struct_nulls~compound_struct_nulls & arrow_struct_nulls -show compound_map_nulls~arrow_map_nulls +show numeric_nulls~numeric_nulls & arrow_numeric_nulls +show contiguous_nulls~contiguous_nulls & arrow_contiguous_nulls +show list_nulls~arrow_list_nulls +show struct_nulls~struct_nulls & arrow_struct_nulls +show map_nulls~arrow_map_nulls rm arrow_numeric; rm arrow_contiguous; @@ -256,11 +256,11 @@ orc_list_nulls:last[orc_compound_data][0] orc_struct_nulls:last[orc_compound_data][1] orc_map_nulls:last[orc_compound_data][2] -show compound_numeric_nulls~compound_numeric_nulls & orc_numeric_nulls -show compound_contiguous_nulls~compound_contiguous_nulls & orc_contiguous_nulls -show compound_list_nulls~orc_list_nulls -show compound_struct_nulls~compound_struct_nulls & orc_struct_nulls -show compound_map_nulls~orc_map_nulls +show numeric_nulls~numeric_nulls & orc_numeric_nulls +show contiguous_nulls~contiguous_nulls & orc_contiguous_nulls +show list_nulls~orc_list_nulls +show struct_nulls~struct_nulls & orc_struct_nulls +show map_nulls~orc_map_nulls rm orc_numeric; rm orc_contiguous; diff --git a/tests/orc_dataloader/orc_compound_nulls.t b/tests/orc_dataloader/orc_compound_nulls.t new file mode 100644 index 0000000..451483d --- /dev/null +++ b/tests/orc_dataloader/orc_compound_nulls.t @@ -0,0 +1,104 @@ +// orc_compound_nulls.q + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l q/arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping in Apache ORC ||----------+\n"; +comp_opts:(`bool`int8`int64`float32`float64`date32)!(1b;0x02;5;9.87e;6.54;2012.11.10); + +compound_options:(``NULL_MAPPING)!((::);comp_opts); + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +i8_dt:.arrowkdb.dt.int8[]; + +bool_dt:.arrowkdb.dt.boolean[]; +f32_dt:.arrowkdb.dt.float32[]; +d32_dt:.arrowkdb.dt.date32[]; + +i64_dt:.arrowkdb.dt.int64[]; +f64_dt:.arrowkdb.dt.float64[]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +i8_fd:.arrowkdb.fd.field[`int8;i8_dt]; + +bool_fd:.arrowkdb.fd.field[`bool;bool_dt]; +f32_fd:.arrowkdb.fd.field[`float32;f32_dt]; +d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; + +i64_fd:.arrowkdb.fd.field[`int64;i64_dt]; +f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; + +-1"\n+----------|| Create a field containing the list datatype ||----------+\n"; +list_dt:.arrowkdb.dt.list[i8_fd]; +list_fd:.arrowkdb.fd.field[`list_field;list_dt]; + +-1"\n+----------|| Create a field containing the struct datatype ||----------+\n"; +struct_dt:.arrowkdb.dt.struct[(bool_fd,f32_fd,d32_fd)]; +struct_fd:.arrowkdb.fd.field[`struct_field;struct_dt]; + +-1"\n+----------|| Create fields containing the map datatype ||----------+\n"; +map_dt:.arrowkdb.dt.map[i64_dt;f64_dt] +map_fd:.arrowkdb.fd.field[`map;map_dt]; + +-1"\n+----------|| Create the schema containing the list and struct fields ||----------+\n"; +compound_schema:.arrowkdb.sc.schema[(list_fd,struct_fd,map_fd)]; + +-1"\n+----------|| Number of items in each array ||----------+\n"; +N:3 + +bool_data:N?(0b;1b); +bool_data[0]:1b; +f32_data:N?100e; +f32_data[1]:9.87e; +d32_data:N?(2012.11.10;2010.07.18;2011.07.16;2014.07.15;2016.07.11); +d32_data[2]:2012.11.10; + +-1"\n+----------|| Combine the array data for the list and struct columns ||----------+\n"; +list_array:(enlist 0x00;(0x0102);(0x030405)); +struct_array:(bool_data;f32_data;d32_data); +map_array:((enlist 1)!(enlist 1.23);(2 2)!(4.56 7.89);(3 3 3)!(9.87 6.54 3.21)) +compound_data:(list_array;struct_array;map_array); + +-1"\n+----------|| Write the schema and array data to a ORC file ||----------+\n"; +compound_options[`ORC_CHUNK_SIZE]:1024 + +orc_compound:"compound_bitmap.orc"; +.arrowkdb.orc.writeOrc[orc_compound;compound_schema;compound_data;compound_options] + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +compound_options[`WITH_NULL_BITMAP]:1; + +orc_compound_schema:.arrowkdb.orc.readOrcSchema[orc_compound]; +.arrowkdb.sc.equalSchemas[compound_schema;orc_compound_schema] +compound_schema~orc_compound_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +orc_compound_data:.arrowkdb.orc.readOrcData[orc_compound;compound_options]; +compound_data~first orc_compound_data + +-1"\n+----------|| Compare null bitmaps of arrow data ||----------+\n"; +list_nulls:(enlist 0b;01b;000b); +struct_nulls:(100b;010b;001b); +map_nulls:((enlist 0b)!(enlist 0b);00b!00b;000b!010b) + +orc_list_nulls:last[orc_compound_data][0] +orc_struct_nulls:last[orc_compound_data][1] +orc_map_nulls:last[orc_compound_data][2] + +list_nulls~orc_list_nulls +struct_nulls~struct_nulls & orc_struct_nulls +map_nulls~orc_map_nulls + +rm orc_compound; + + +-1 "\n+----------|| Test utils ||----------+\n"; + +.arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; diff --git a/tests/orc_dataloader/orc_contiguous_nulls.t b/tests/orc_dataloader/orc_contiguous_nulls.t new file mode 100644 index 0000000..bcc489f --- /dev/null +++ b/tests/orc_dataloader/orc_contiguous_nulls.t @@ -0,0 +1,79 @@ +// orc_contiguous_nulls.q + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l q/arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping in Apache ORC ||----------+\n"; +cont_opts:(`utf8`binary`decimal)!("start";"x"$"alert";3.21f); + +contiguous_options:(``NULL_MAPPING)!((::);cont_opts); + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +str_dt:.arrowkdb.dt.utf8[]; +bin_dt:.arrowkdb.dt.binary[]; +dec_dt:.arrowkdb.dt.decimal128[38i;2i]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +str_fd:.arrowkdb.fd.field[`string;str_dt]; +bin_fd:.arrowkdb.fd.field[`binary;bin_dt]; +dec_fd:.arrowkdb.fd.field[`decimal;dec_dt]; + +contiguous_schema:.arrowkdb.sc.schema[(str_fd, bin_fd, dec_fd)]; + +-1"\n+----------|| Number of items in each array ||----------+\n"; +N:5 + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +str_data:N?("start";"stop";"alert";"acknowledge";""); +str_data[0]:"start" +bin_data:N?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); +bin_data[1]:"x"$"alert" +dec_data:{"F"$.Q.f[2]x} each N?(10f) +dec_data[2]:3.21f + +-1"\n+----------|| Combine the data for contiguous columns ||----------+\n"; +contiguous_options[`DECIMAL128_AS_DOUBLE]:1 + +contiguous_data:(str_data;bin_data;dec_data); + +-1"\n+----------|| Write the schema and array data to a ORC file ||----------+\n"; +contiguous_options[`ORC_CHUNK_SIZE]:1024 + +orc_contiguous:"contiguous_bitmap.orc"; +.arrowkdb.orc.writeOrc[orc_contiguous;contiguous_schema;contiguous_data;contiguous_options] + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +contiguous_options[`WITH_NULL_BITMAP]:1; + +orc_contiguous_schema:.arrowkdb.orc.readOrcSchema[orc_contiguous]; +.arrowkdb.sc.equalSchemas[contiguous_schema;orc_contiguous_schema] +contiguous_schema~orc_contiguous_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +orc_contiguous_data:.arrowkdb.orc.readOrcData[orc_contiguous;contiguous_options]; +contiguous_data~first orc_contiguous_data + +-1"\n+----------|| Compare null bitmaps of arrow data ||----------+\n"; +contiguous_nulls:(10000b;01000b;00100b); +orc_contiguous_nulls:last orc_contiguous_data; +contiguous_nulls~contiguous_nulls & orc_contiguous_nulls + +rm orc_contiguous; + + +-1 "\n+----------|| Test utils ||----------+\n"; + +.arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; diff --git a/tests/orc_dataloader/orc_numeric_nulls.t b/tests/orc_dataloader/orc_numeric_nulls.t new file mode 100644 index 0000000..bb5a4b5 --- /dev/null +++ b/tests/orc_dataloader/orc_numeric_nulls.t @@ -0,0 +1,84 @@ +// orc_numeric_nulls.q + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l q/arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping in Apache ORC ||----------+\n"; +num_opts:(`int8`int16`int32`int64`float64)!(0x02;3h;4i;5;6.54); + +numeric_options:(``NULL_MAPPING)!((::);num_opts); + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +i8_dt:.arrowkdb.dt.int8[]; +i16_dt:.arrowkdb.dt.int16[]; +i32_dt:.arrowkdb.dt.int32[]; +i64_dt:.arrowkdb.dt.int64[]; +f64_dt:.arrowkdb.dt.float64[]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +i8_fd:.arrowkdb.fd.field[`int8;i8_dt]; +i16_fd:.arrowkdb.fd.field[`int16;i16_dt]; +i32_fd:.arrowkdb.fd.field[`int32;i32_dt]; +i64_fd:.arrowkdb.fd.field[`int64;i64_dt]; +f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; + +-1"\n+----------|| Create the schema for the list of fields ||----------+\n"; +numeric_schema:.arrowkdb.sc.schema[(ts_fd, i16_fd, i32_fd, i64_fd, f64_fd)]; + +-1"\n+----------|| Number of items in each array ||----------+\n"; +N:5 + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +i16_data:N?100h; +i16_data[0]:3h; +i32_data:N?100i; +i32_data[1]:4i; +i64_data:N?100; +i64_data[2]:5; +f64_data:N?100f; +f64_data[3]:6.54f; + +-1"\n+----------|| Combine the data for numeric columns ||----------+\n"; +numeric_data:(ts_data;i16_data;i32_data;i64_data;f64_data); + +-1"\n+----------|| Write the schema and array data to a ORC file ||----------+\n"; +numeric_options[`ORC_CHUNK_SIZE]:1024 + +orc_numeric:"numeric_bitmap.orc"; +.arrowkdb.orc.writeOrc[orc_numeric;numeric_schema;numeric_data;numeric_options] + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +numeric_options[`WITH_NULL_BITMAP]:1; + +orc_numeric_schema:.arrowkdb.orc.readOrcSchema[orc_numeric]; +.arrowkdb.sc.equalSchemas[numeric_schema;orc_numeric_schema] +numeric_schema~orc_numeric_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +orc_numeric_data:.arrowkdb.orc.readOrcData[orc_numeric;numeric_options]; +numeric_data~first orc_numeric_data + +-1"\n+----------|| Compare null bitmaps of arrow data ||----------+\n"; +numeric_nulls:(00000b;10000b;01000b;00100b;00010b); +orc_numeric_nulls:last orc_numeric_data; +numeric_nulls~numeric_nulls & orc_numeric_nulls + +rm orc_numeric; + + +-1 "\n+----------|| Test utils ||----------+\n"; + +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; From 4762c181203341657841b4c4d9af5d90d72b533e Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 28 Mar 2023 07:33:48 +0000 Subject: [PATCH 04/10] Update for WITH_NULL_BITMAP --- q/arrowkdb.q | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/q/arrowkdb.q b/q/arrowkdb.q index cfff1c2..d2f92f3 100644 --- a/q/arrowkdb.q +++ b/q/arrowkdb.q @@ -106,6 +106,19 @@ tb.prettyPrintTable_:`arrowkdb 2:(`prettyPrintTable;3); tb.prettyPrintTable:{[x;y;z] -1 tb.prettyPrintTable_[x;y;z];}; tb.prettyPrintTableFromTable:{[table;options] tb.prettyPrintTable[sc.inferSchema[table];value flip table;options]}; +// ORC files +orc.writeOrc:`arrowkdb 2:(`writeORC;4); +orc.writeOrcFromTable:{[filename;table;options] orc.writeOrc[filename;sc.inferSchema[table];value flip table;options]}; +orc.readOrcSchema:`arrowkdb 2:(`readORCSchema;1); +orc.readOrcData:`arrowkdb 2:(`readORCData;2); +orc.readOrcToTable:{[filename;options] + fields:fd.fieldName each sc.schemaFields[orc.readOrcSchema[filename]]; + data:orc.readOrcData[filename;options]; + $[1~options`WITH_NULL_BITMAP; + (flip fields!first data;flip fields!last data); + flip fields!data + ] + }; // parquet files pq.writeParquet:`arrowkdb 2:(`writeParquet;4); @@ -132,14 +145,6 @@ pq.readParquetRowGroupsToTable:{[filename;row_groups;columns;options] ] }; -// ORC files -orc.writeOrc:`arrowkdb 2:(`writeORC;4); -orc.writeOrcFromTable:{[filename;table;options] orc.writeOrc[filename;sc.inferSchema[table];value flip table;options]}; -orc.readOrcSchema:`arrowkdb 2:(`readORCSchema;1); -orc.readOrcData:`arrowkdb 2:(`readORCData;2); -orc.readOrcToTable:{[filename;options] flip (fd.fieldName each sc.schemaFields[orc.readOrcSchema[filename]])!(orc.readOrcData[filename;options])}; -// orc.readColumn (Functionality is different since dealing with stripes) - // arrow files ipc.writeArrow:`arrowkdb 2:(`writeArrow;4); ipc.writeArrowFromTable:{[filename;table;options] ipc.writeArrow[filename;sc.inferSchema[table];value flip table;options]}; From 5f7303fd55806ab09ad3b428ed4db51ed67d0fb1 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 29 Mar 2023 13:56:15 +0000 Subject: [PATCH 05/10] Cleaning up ORC example of null support out of arrow writing --- examples/orc_null_support.q | 76 +++++-------------------------------- 1 file changed, 10 insertions(+), 66 deletions(-) diff --git a/examples/orc_null_support.q b/examples/orc_null_support.q index fe03d38..53aee10 100644 --- a/examples/orc_null_support.q +++ b/examples/orc_null_support.q @@ -144,81 +144,19 @@ compound_options[`DECIMAL128_AS_DOUBLE]:1 -1"\nCompound table:"; .arrowkdb.tb.prettyPrintTable[compound_schema;compound_data;compound_options] -//-------------------------// -// Example-1. Arrow IPC file // -//-------------------------// - -// Write the schema and array data to a arrow file -arrow_numeric:"numeric_bitmap.arrow"; -arrow_contiguous:"contiguous_bitmap.arrow"; -arrow_compound:"compound_bitmap.arrow"; - -.arrowkdb.ipc.writeArrow[arrow_numeric;numeric_schema;numeric_data;compound_options]; -.arrowkdb.ipc.writeArrow[arrow_contiguous;contiguous_schema;contiguous_data;compound_options]; -.arrowkdb.ipc.writeArrow[arrow_compound;compound_schema;compound_data;compound_options]; - -show ls arrow_numeric -show ls arrow_contiguous -show ls arrow_compound - -// Read the schema back and compare -compound_options[`WITH_NULL_BITMAP]:1; - -arrow_numeric_schema:.arrowkdb.ipc.readArrowSchema[arrow_numeric]; -arrow_contiguous_schema:.arrowkdb.ipc.readArrowSchema[arrow_contiguous]; -arrow_compound_schema:.arrowkdb.ipc.readArrowSchema[arrow_compound]; - -show .arrowkdb.sc.equalSchemas[numeric_schema;arrow_numeric_schema] -show .arrowkdb.sc.equalSchemas[contiguous_schema;arrow_contiguous_schema] -show .arrowkdb.sc.equalSchemas[compound_schema;arrow_compound_schema] - -show numeric_schema~arrow_numeric_schema -show contiguous_schema~arrow_contiguous_schema -show compound_schema~arrow_compound_schema - -// Read the array data back and compare -arrow_numeric_data:.arrowkdb.ipc.readArrowData[arrow_numeric;compound_options]; -arrow_contiguous_data:.arrowkdb.ipc.readArrowData[arrow_contiguous;compound_options]; -arrow_compound_data:.arrowkdb.ipc.readArrowData[arrow_compound;compound_options]; - -show numeric_data~first arrow_numeric_data -show contiguous_data~first arrow_contiguous_data -show compound_data~first arrow_compound_data - -// Compare null bitmaps of arrow data -numeric_nulls:(00000b;10000b;01000b;00100b;00010b); -contiguous_nulls:(10000b;01000b;00100b); -list_nulls:(enlist 0b;01b;000b); -struct_nulls:(100b;010b;001b); -map_nulls:((enlist 0b)!(enlist 0b);00b!00b;000b!010b) - -arrow_numeric_nulls:last arrow_numeric_data; -arrow_contiguous_nulls:last arrow_contiguous_data; -arrow_list_nulls:last[arrow_compound_data][0] -arrow_struct_nulls:last[arrow_compound_data][1] -arrow_map_nulls:last[arrow_compound_data][2] - -show numeric_nulls~numeric_nulls & arrow_numeric_nulls -show contiguous_nulls~contiguous_nulls & arrow_contiguous_nulls -show list_nulls~arrow_list_nulls -show struct_nulls~struct_nulls & arrow_struct_nulls -show map_nulls~arrow_map_nulls - -rm arrow_numeric; -rm arrow_contiguous; -rm arrow_compound; - //---------------------------// -// Example-2. Apache ORC file// +// Example-1. Apache ORC file// //---------------------------// // Write the schema and array data to a ORC file -compound_options[`ORC_CHUNK_SIZE]:1024 +compound_options[`WITH_NULL_BITMAP]:1; orc_numeric:"numeric_bitmap.orc"; orc_contiguous:"contiguous_bitmap.orc"; orc_compound:"compound_bitmap.orc"; +compound_options[`ORC_CHUNK_SIZE]:1024 + .arrowkdb.orc.writeOrc[orc_numeric;numeric_schema;numeric_data;compound_options] .arrowkdb.orc.writeOrc[orc_contiguous;contiguous_schema;contiguous_data;compound_options] .arrowkdb.orc.writeOrc[orc_compound;compound_schema;compound_data;compound_options] @@ -250,6 +188,12 @@ show contiguous_data~first orc_contiguous_data show compound_data~first orc_compound_data // Compare null bitmaps of arrow data +numeric_nulls:(00000b;10000b;01000b;00100b;00010b); +contiguous_nulls:(10000b;01000b;00100b); +list_nulls:(enlist 0b;01b;000b); +struct_nulls:(100b;010b;001b); +map_nulls:((enlist 0b)!(enlist 0b);00b!00b;000b!010b) + orc_numeric_nulls:last orc_numeric_data; orc_contiguous_nulls:last orc_contiguous_data; orc_list_nulls:last[orc_compound_data][0] From 819d5783ee06a04ab8dcb661595e5b77ace32913 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 29 Mar 2023 13:56:53 +0000 Subject: [PATCH 06/10] Use binary format for Arrow so version --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5679b99..4077f38 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 3.1.3) project(arrowkdb CXX) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -DKXVER=3") -if(ARROW_SO_VERSION LESS "10.0") +if(ARROW_SO_VERSION LESS "1000") set(CMAKE_CXX_STANDARD 14) else() set(CMAKE_CXX_STANDARD 17) From e1e40914c915eb04bde8b96c63ea528517813e02 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 29 Mar 2023 15:18:53 +0000 Subject: [PATCH 07/10] Unit-test typo for Travis moan --- tests/orc_dataloader/orc_compound_nulls.t | 2 +- tests/orc_dataloader/orc_contiguous_nulls.t | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/orc_dataloader/orc_compound_nulls.t b/tests/orc_dataloader/orc_compound_nulls.t index 451483d..4053aa2 100644 --- a/tests/orc_dataloader/orc_compound_nulls.t +++ b/tests/orc_dataloader/orc_compound_nulls.t @@ -97,7 +97,7 @@ rm orc_compound; -1 "\n+----------|| Test utils ||----------+\n"; -.arrowkdb.util.buildInfo[] +show .arrowkdb.util.buildInfo[] (type .arrowkdb.util.buildInfo[])~99h diff --git a/tests/orc_dataloader/orc_contiguous_nulls.t b/tests/orc_dataloader/orc_contiguous_nulls.t index bcc489f..42e28d4 100644 --- a/tests/orc_dataloader/orc_contiguous_nulls.t +++ b/tests/orc_dataloader/orc_contiguous_nulls.t @@ -72,7 +72,7 @@ rm orc_contiguous; -1 "\n+----------|| Test utils ||----------+\n"; -.arrowkdb.util.buildInfo[] +show .arrowkdb.util.buildInfo[] (type .arrowkdb.util.buildInfo[])~99h From e296ac0e9b4397363e4ef7eb107147668b6648d2 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 29 Mar 2023 15:26:13 +0000 Subject: [PATCH 08/10] Supply prefix path for Arrow --- .gitignore | 1 + CMakeLists.txt | 15 +++++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 701d371..391b645 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ arrowkdb.code-workspace .vscode/ build/ +tests/*.q test.q unit.q *.user diff --git a/CMakeLists.txt b/CMakeLists.txt index 4077f38..7472f08 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,12 +5,8 @@ endif() cmake_minimum_required(VERSION 3.1.3) project(arrowkdb CXX) +set(CMAKE_PREFIX_PATH "${CMAKE_PREFIX_PATH};${ARROW_INSTALL}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -DKXVER=3") -if(ARROW_SO_VERSION LESS "1000") - set(CMAKE_CXX_STANDARD 14) -else() - set(CMAKE_CXX_STANDARD 17) -endif() set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) @@ -29,6 +25,13 @@ include_directories ( ${CMAKE_BINARY_DIR} # For 'k.h', downloaded below ) +find_package(Arrow REQUIRED) +if(ARROW_SO_VERSION LESS "1000") + set(CMAKE_CXX_STANDARD 14) +else() + set(CMAKE_CXX_STANDARD 17) +endif() + find_library(ARROW_LIBRARY NAMES arrow HINTS "${ARROW_INSTALL}/lib/" @@ -65,7 +68,7 @@ else() set(OSFLAG l) endif() -target_link_libraries(${MY_LIBRARY_NAME} ${ARROW_LIBRARY} ${PARQUET_LIBRARY} ${LINK_LIBS}) +target_link_libraries(${MY_LIBRARY_NAME} Arrow::arrow_shared ${PARQUET_LIBRARY} ${LINK_LIBS}) set_target_properties(${MY_LIBRARY_NAME} PROPERTIES PREFIX "") # Check if 32-bit/64-bit machine From eba33d28f358fa044381a9998354acb94ce90f95 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 29 Mar 2023 16:16:05 +0000 Subject: [PATCH 09/10] Fixing shared library complain of Travis --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7472f08..8320421 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,7 +68,7 @@ else() set(OSFLAG l) endif() -target_link_libraries(${MY_LIBRARY_NAME} Arrow::arrow_shared ${PARQUET_LIBRARY} ${LINK_LIBS}) +target_link_libraries(${MY_LIBRARY_NAME} ${ARROW_LIBRARY} ${PARQUET_LIBRARY} ${LINK_LIBS}) set_target_properties(${MY_LIBRARY_NAME} PROPERTIES PREFIX "") # Check if 32-bit/64-bit machine From 04a77d253a24923ee0e0560bff8d62af60631c8e Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 29 Mar 2023 16:48:35 +0000 Subject: [PATCH 10/10] Add extra warning that package requires Arrow 9.0 [ci skip] --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index dc5e95f..8bbb573 100644 --- a/README.md +++ b/README.md @@ -44,10 +44,11 @@ Conversely, Arrow is an in-memory format meant for direct and efficient use for ### Requirements - kdb+ ≥ 3.5 64-bit (Linux/MacOS/Windows) -- Apache Arrow = 9.0.0 (or ≥ 6.0.0 if building `arrowkdb` from source) +- Apache Arrow ≥ 9.0.0 (or ≥ 6.0.0 if building `arrowkdb` from source) - C++14 or later - CMake ≥ 3.1.3 +> :warning: If using the packaged version of `arrowkdb` you should install version 9.0.0 of Apache Arrow ### Third-party library installation