diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..701d371
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+arrowkdb.code-workspace
+.vscode/
+build/
+test.q
+unit.q
+*.user
diff --git a/.travis.yml b/.travis.yml
index 7deb2e9..0606daf 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -82,16 +82,16 @@ before_install:
script:
  - if [[ $TESTS == "True" && "x$OD" != "x" && "x$QLIC_KC" != "x" ]]; then
      curl -o test.q -L https://github.com/KxSystems/hdf5/raw/master/test.q;
-      q test.q tests/ -q;
+      q test.q tests -q && q test.q tests/null_mapping -q && q test.q tests/null_bitmap -q;
    fi
  - if [[ $TRAVIS_OS_NAME == "windows" && $BUILD == "True" ]]; then
      7z a -tzip -r $FILE_NAME ./cmake/$FILE_ROOT/*;
    elif [[ $BUILD == "True" && ( $TRAVIS_OS_NAME == "linux" || $TRAVIS_OS_NAME == "osx" ) ]]; then
      tar -zcvf $FILE_NAME -C cmake/$FILE_ROOT .;
    elif [[ $TRAVIS_OS_NAME == "windows" ]]; then
-      7z a -tzip $FILE_NAME README.md install.bat LICENSE q examples proto;
+      7z a -tzip $FILE_NAME README.md install.bat LICENSE q docs examples proto;
    elif [[ $TRAVIS_OS_NAME == "linux" || $TRAVIS_OS_NAME == "osx" ]]; then
-      tar -zcvf $FILE_NAME README.md install.sh LICENSE q examples proto;
+      tar -zcvf $FILE_NAME README.md install.sh LICENSE q docs examples proto;
    fi

deploy:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 29f623d..33bfada 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,6 +7,9 @@ project(arrowkdb CXX)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -DKXVER=3")
set(CMAKE_CXX_STANDARD 14)
+if(APPLE)
+  set(CMAKE_CXX_STANDARD 17)
+endif()
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
diff --git a/docs/null-bitmap.md b/docs/null-bitmap.md
new file mode 100755
index 0000000..55d7cb8
--- /dev/null
+++ b/docs/null-bitmap.md
@@ -0,0 +1,66 @@
+# Arrowkdb null bitmap
+
+## Problem
+
+Previously arrowkdb ignored the null bitmap when reading or writing an arrow array, for the following reasons:
+
+- Using kdb null values results in some strange corner cases.
+
+- Mapping to kdb nulls would hurt performance.
+
+Users have requested that arrowkdb provide a null bitmap when reading an arrow array so that they can use it in their applications.
+
+When reading an arrow array using arrowkdb, the user can now choose to return the null bitmap as well as the data values. The shape of the null bitmap structure is exactly the same as the data structure. It is then left to the user to interpret the two structures as appropriate for their application.
+
+Note: there is currently no support for the null bitmap with the writer functions.
+
+
+## Implementation
+
+The null bitmap feature is supported when reading:
+
+* Parquet files
+* Arrow IPC files
+* Arrow IPC streams
+
+To demonstrate it we first use the null mapping support to create a Parquet file containing nulls (although you can read null bitmaps from files generated by other writers such as PyArrow):
+
+```q
+q)options:(``NULL_MAPPING)!(::;`bool`uint8`int8`uint16`int16`uint32`int32`uint64`int64`float16`float32`float64`date32`date64`month_interval`day_time_interval`timestamp`time32`time64`duration`utf8`large_utf8`binary`large_binary`fixed_size_binary!(0b;0x00;0x00;0Nh;0Nh;0Ni;0Ni;0N;0N;0Nh;0Ne;0n;0Nd;0Np;0Nm;0Nn;0Np;0Nt;0Nn;0Nn;"";"";`byte$"";`byte$"";`byte$""))
+q)table:([]col1:0N 1 2; col2:1.1 0n 2.2; col3:("aa"; "bb"; ""))
+q).arrowkdb.pq.writeParquetFromTable["file.parquet";table;options]
+```
+
+
+Each reader function in arrowkdb takes an options dictionary.
A new `WITH_NULL_BITMAP` option has been added. When this option is set the reader functions return a two-item mixed list (the data values and the null bitmap) rather than just the data values:
+
+```q
+q)read_results:.arrowkdb.pq.readParquetToTable["file.parquet";(enlist `WITH_NULL_BITMAP)!enlist 1]
+q)read_results
++`col1`col2`col3!(0 1 2;1.1 0 2.2;("aa";"bb";""))
++`col1`col2`col3!(100b;010b;001b)
+```
+
+The null bitmap is a separate structure in kdb:
+
+```q
+q)first read_results
+col1 col2 col3
+--------------
+0    1.1  "aa"
+1    0    "bb"
+2    2.2  ""
+q)last read_results
+col1 col2 col3
+--------------
+1    0    0
+0    1    0
+0    0    1
+```
+
+
+## Limitations
+
+- The use of a null bitmap with the writer functions is not supported.
+
+- Since the null bitmap structure and data structure must have the same shape, arrow arrays which use nested datatypes (list, map, struct, union, dictionaries) where the parent array contains null values cannot be represented.  For example, an array with a struct datatype in arrow can have either null child field values or a null parent struct value.  The null bitmap structure will only reflect the null bitmap of the child field datatypes.
diff --git a/docs/null-mapping.md b/docs/null-mapping.md
new file mode 100755
index 0000000..0cbf4b1
--- /dev/null
+++ b/docs/null-mapping.md
@@ -0,0 +1,109 @@
+# Arrowkdb null mapping
+
+## Problem
+
+Previously arrowkdb ignored the null bitmap when reading or writing an arrow array. Users have requested that arrowkdb map arrow nulls into kdb.
+
+Unlike arrow, not all kdb types have a null value and those that do overload one value in the range (the 0N* values typically map to INT_MIN or NaN).
+
+For example:
+
+- Each item in an arrow boolean array can be 0b, 1b or NULL. kdb has no boolean null.
+
+- kdb doesn't have a byte null.
+
+- Unlike arrow, kdb can't distinguish between:
+  - a null string and an empty string.
+  - the " " character and null.
+
+
+## Implementation
+
+When reading and writing an arrow array using arrowkdb the user can now choose whether to map arrow nulls. Each reader and writer function in arrowkdb takes an options dictionary. A new `NULL_MAPPING` option containing a dictionary of datatypes to null values has been added, which allows the user to specify whether an arrow datatype should be null mapped and what value to use for null in kdb.
+
+> :warning: **An identity function (::) may be required in the options dictionary values**
+>
+> The options dictionary values list can be `7h`, `11h` or a mixed list of `-7|-11|4|99|101h`. Therefore if the only option set is `NULL_MAPPING`, an additional empty key with a corresponding identity function (::) value must be included in the options to make the values a mixed list.
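+
+As a minimal sketch of this pattern (assuming only an int64 null mapping is wanted), the empty key and its identity function value make the options values a mixed list:
+
+```q
+q)options:(``NULL_MAPPING)!((::);(enlist `int64)!(enlist 0N))
+q)type value options    / 0h - a mixed list, as required
+0h
+```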
+
+The following Arrow datatypes are supported, along with possible null mapping values:
+
+```q
+q)options:(``NULL_MAPPING)!(::;`bool`uint8`int8`uint16`int16`uint32`int32`uint64`int64`float16`float32`float64`date32`date64`month_interval`day_time_interval`timestamp`time32`time64`duration`utf8`large_utf8`binary`large_binary`fixed_size_binary!(0b;0x00;0x00;0Nh;0Nh;0Ni;0Ni;0N;0N;0Nh;0Ne;0Nf;0Nd;0Np;0Nm;0Nn;0Np;0Nt;0Nn;0Nn;"";"";`byte$"";`byte$"";`byte$""))
+q)options`NULL_MAPPING
+bool             | 0b
+uint8            | 0x00
+int8             | 0x00
+uint16           | 0Nh
+int16            | 0Nh
+uint32           | 0Ni
+int32            | 0Ni
+uint64           | 0N
+int64            | 0N
+float16          | 0Nh
+float32          | 0Ne
+float64          | 0n
+date32           | 0Nd
+date64           | 0Np
+month_interval   | 0Nm
+day_time_interval| 0Nn
+timestamp        | 0Np
+time32           | 0Nt
+time64           | 0Nn
+duration         | 0Nn
+utf8             | ""
+large_utf8       | ""
+binary           | `byte$()
+large_binary     | `byte$()
+fixed_size_binary| `byte$()
+```
+
+The type of each value in this dictionary must be the atomic type of the corresponding list representation for that datatype. Where a datatype isn't present in this dictionary, arrowkdb will ignore the null bitmap (as per the previous behaviour).
+
+## Example
+
+Using this null mapping we can pretty print an Arrow table where the kdb nulls have been mapped to arrow nulls:
+
+```q
+q)options:(``NULL_MAPPING)!(::;`bool`uint8`int8`uint16`int16`uint32`int32`uint64`int64`float16`float32`float64`date32`date64`month_interval`day_time_interval`timestamp`time32`time64`duration`utf8`large_utf8`binary`large_binary`fixed_size_binary!(0b;0x00;0x00;0Nh;0Nh;0Ni;0Ni;0N;0N;0Nh;0Ne;0n;0Nd;0Np;0Nm;0Nn;0Np;0Nt;0Nn;0Nn;"";"";`byte$"";`byte$"";`byte$""))
+q)table:([]col1:0N 1 2; col2:1.1 0n 2.2; col3:("aa"; "bb"; ""))
+q).arrowkdb.tb.prettyPrintTableFromTable[table;options]
+col1: int64
+col2: double
+col3: string
+----
+col1:
+  [
+    [
+      null,
+      1,
+      2
+    ]
+  ]
+col2:
+  [
+    [
+      1.1,
+      null,
+      2.2
+    ]
+  ]
+col3:
+  [
+    [
+      "aa",
+      "bb",
+      null
+    ]
+  ]
+
+q)
+```
+
+
+## Limitations
+
+- There is no null mapping for arrow arrays which use nested datatypes (list, map, struct, union, dictionaries) where the parent array contains null values.  For example, an array with a struct datatype in arrow can have either null child field values or a null parent struct value.  Arrowkdb will only map nulls for the child fields using the above mapping.
+
+- There is a loss of performance when choosing to map nulls, but this should not be significant.
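+
+For instance, a hedged sketch of the struct limitation (the `price` field name is illustrative): the null mapping applies to the float64 child values, while a null for the parent struct itself cannot be expressed:
+
+```q
+q)options:(``NULL_MAPPING)!((::);(enlist `float64)!(enlist 0n))
+q)f1:.arrowkdb.fd.field[`price;.arrowkdb.dt.float64[]]
+q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.struct[enlist f1];enlist 1.1 0n 2.2;options]
+```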
diff --git a/docs/reference.md b/docs/reference.md
old mode 100644
new mode 100755
index 0af5cb1..98efb53
--- a/docs/reference.md
+++ b/docs/reference.md
@@ -1602,8 +1602,8 @@ returns the schema identifier
q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]]
q)f2:.arrowkdb.fd.field[`float_field;.arrowkdb.dt.float64[]]
q).arrowkdb.sc.printSchema[.arrowkdb.sc.schema[(f1,f2)]]
-int_field: int64 not null
-float_field: double not null
+int_field: int64
+float_field: double
```

### `sc.inferSchema`
@@ -1648,11 +1648,9 @@ returns list of field identifiers used by the schema
q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]]
q)f2:.arrowkdb.fd.field[`float_field;.arrowkdb.dt.float64[]]
q)schema:.arrowkdb.sc.schema[(f1,f2)]
-q).arrowkdb.fd.printField each .arrowkdb.sc.schemaFields[schema]
-int_field: int64 not null
-float_field: double not null
-::
-::
+q).arrowkdb.fd.printField each .arrowkdb.sc.schemaFields[schema];
+int_field: int64
+float_field: double
```

## Schema management
@@ -1680,9 +1678,9 @@ q)f2:.arrowkdb.fd.field[`float_field;.arrowkdb.dt.float64[]]
q)f3:.arrowkdb.fd.field[`str_field;.arrowkdb.dt.utf8[]]
q)schema:.arrowkdb.sc.schema[(f1,f2,f3)]
q).arrowkdb.sc.printSchema[schema]
-int_field: int64 not null
-float_field: double not null
-str_field: string not null
+int_field: int64
+float_field: double
+str_field: string
```

### `sc.listSchemas`
@@ -1778,7 +1776,7 @@ Where:

- `datatype_id` is the datatype identifier of the array
- `list` is the kdb+ list data to be displayed
-- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`.
+- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`.

the function
@@ -1788,6 +1786,7 @@ the function
Supported options:

- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0.
+- `NULL_MAPPING` - Sub-dictionary of null mapping datatypes and values. See [here](null-mapping.md) for more details.

> :warning: **For debugging use only**
>
@@ -1814,13 +1813,17 @@ q).arrowkdb.ar.prettyPrintArray[int_datatype;(1 2 3j);::]
Where:

- `list` is the kdb+ list data to be displayed
-- `options` is reserved for future use - specify generic null (`::`)
+- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`.

the function

1. prints array contents to stdout
1. returns generic null

+Supported options:
+
+- `NULL_MAPPING` - Sub-dictionary of null mapping datatypes and values. See [here](null-mapping.md) for more details.
+
The kdb+ list type is mapped to an Arrow datatype as described [here](#inferreddatatypes).

> :warning: **For debugging use only**
@@ -1850,7 +1853,7 @@ Where:

- `schema_id` is the schema identifier of the table
- `array_data` is a mixed list of array data
-- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list; values list can be `7h`, `11h` or mixed list of -7|-11|4h
+- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults.
Dictionary key must be a `11h` list; values list can be `7h`, `11h` or mixed list of -7|-11|4|99|101h

the function
@@ -1862,6 +1865,7 @@ The mixed list of Arrow array data should be ordered in schema field number and
Supported options:

- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0.
+- `NULL_MAPPING` - Sub-dictionary of null mapping datatypes and values. See [here](null-mapping.md) for more details.

> :warning: **For debugging use only**
>
@@ -1914,7 +1918,7 @@ str_field:
Where:

- `table` is a kdb+ table
-- `options` is reserved for future use - specify generic null (`::`)
+- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list; values list can be `7h`, `11h` or mixed list of -7|-11|4|99|101h

the function
@@ -1923,6 +1927,10 @@ the function
Each column in the table is mapped to a field in the schema. The column name is used as the field name and the column’s kdb+ type is mapped to an Arrow datatype as described [here](#inferreddatatypes).

+Supported options:
+
+- `NULL_MAPPING` - Sub-dictionary of null mapping datatypes and values. See [here](null-mapping.md) for more details.
+
> :warning: **Inferred schemas only support a subset of the Arrow datatypes and are considerably less flexible than creating them with the datatype/field/schema constructors**
>
> Each column in the table is mapped to a field in the schema. The column name is used as the field name and the column’s kdb+ type is mapped to an Arrow datatype as described [here](#inferred-datatypes).
@@ -1978,7 +1986,7 @@ Where:

- `parquet_file` is a string containing the Parquet file name
- `schema_id` is the schema identifier to use for the table
- `array_data` is a mixed list of array data
-- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`.
+- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`.

returns generic null on success
@@ -1989,6 +1997,7 @@ Supported options:

- `PARQUET_CHUNK_SIZE` - Controls the approximate size of encoded data pages within a column chunk. Long, default 1MB.
- `PARQUET_VERSION` - Select the Parquet format version: `V1.0`, `V2.0`, `V2.4`, `V2.6` or `V2.LATEST`. Later versions are more fully featured but may be incompatible with older Parquet implementations. Default `V1.0`
- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0.
+- `NULL_MAPPING` - Sub-dictionary of null mapping datatypes and values. See [here](null-mapping.md) for more details.

> :warning: **The Parquet format is compressed and designed for maximum space efficiency which may cause a performance overhead compared to Arrow. Parquet is also less fully featured than Arrow which can result in schema limitations**
>
@@ -2018,7 +2027,7 @@ Where:

- `parquet_file` is a string containing the Parquet file name
- `table` is a kdb+ table
-- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`.
+- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`.

returns generic null on success
@@ -2026,6 +2035,7 @@ Supported options:

- `PARQUET_CHUNK_SIZE` - Controls the approximate size of encoded data pages within a column chunk. Long, default 1MB.
- `PARQUET_VERSION` - Select the Parquet format version: `V1.0`, `V2.0`, `V2.4`, `V2.6` or `V2.LATEST`. Later versions are more fully featured but may be incompatible with older Parquet implementations. Default `V1.0`
+- `NULL_MAPPING` - Sub-dictionary of null mapping datatypes and values. See [here](null-mapping.md) for more details.

> :warning: **Inferred schemas only support a subset of the Arrow datatypes and are considerably less flexible than creating them with the datatype/field/schema constructors**
>
@@ -2073,7 +2083,7 @@ q).arrowkdb.sc.equalSchemas[schema;.arrowkdb.pq.readParquetSchema["file.parquet"
Where:

- `parquet_file` is a string containing the Parquet file name
-- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`.
+- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`.

returns the array data
@@ -2082,6 +2092,8 @@ Supported options:

- `PARQUET_MULTITHREADED_READ` - Flag indicating whether the Parquet reader should run in multithreaded mode. This can improve performance by processing multiple columns in parallel. Long, default 0.
- `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0.
- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0.
+- `NULL_MAPPING` - Sub-dictionary of null mapping datatypes and values. See [here](null-mapping.md) for more details.
+- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md) for more details. Long, default 0.

```q
q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]]
@@ -2107,13 +2119,14 @@ Where:

- `parquet_file` is a string containing the Parquet file name
- `column_index` is the index of the column to read, relative to the schema field order
-- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`.
+- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`.

returns the array’s data

Supported options:

- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0.
+- `NULL_MAPPING` - Sub-dictionary of null mapping datatypes and values. See [here](null-mapping.md) for more details.
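+
+For example, a hedged sketch of reading a single column with int64 nulls mapped (assuming "file.parquet" was written with nulls, as in the null-mapping document):
+
+```q
+q)options:(``NULL_MAPPING)!((::);(enlist `int64)!(enlist 0N))
+q)col0:.arrowkdb.pq.readParquetColumn["file.parquet";0i;options]
+```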
```q
q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]]
@@ -2138,7 +2151,7 @@ q)col1~array_data[1]
Where:

- `parquet_file` is a string containing the Parquet file name
-- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`.
+- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`.

returns the kdb+ table
@@ -2149,6 +2162,8 @@ Supported options:

- `PARQUET_MULTITHREADED_READ` - Flag indicating whether the Parquet reader should run in multithreaded mode. This can improve performance by processing multiple columns in parallel. Long, default 0.
- `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0.
- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0.
+- `NULL_MAPPING` - Sub-dictionary of null mapping datatypes and values. See [here](null-mapping.md) for more details.
+- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md) for more details. Long, default 0.

```q
q)table:([] int_field:(1 2 3); float_field:(4 5 6f); str_field:("aa";"bb";"cc"))
@@ -2190,7 +2205,7 @@ Where:

- `parquet_file` is a string containing the Parquet file name
- `row_groups` is an integer list (6h) of row group indices to read, or generic null (`::`) to read all row groups
- `columns` is an integer list (6h) of column indices to read, or generic null (`::`) to read all columns
-- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`.
+- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`.

returns the array data
@@ -2199,6 +2214,8 @@ Supported options:

- `PARQUET_MULTITHREADED_READ` - Flag indicating whether the Parquet reader should run in multithreaded mode. This can improve performance by processing multiple columns in parallel. Long, default 0.
- `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0.
- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0.
+- `NULL_MAPPING` - Sub-dictionary of null mapping datatypes and values. See [here](null-mapping.md) for more details.
+- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md) for more details. Long, default 0.
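+
+As a hedged sketch, reading only the first row group and returning the null bitmap alongside the data values:
+
+```q
+q)options:(enlist `WITH_NULL_BITMAP)!enlist 1
+q)read_results:.arrowkdb.pq.readParquetRowGroups["file.parquet";enlist 0i;::;options]
+q)count read_results    / (data values;null bitmap)
+2
+```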
```q
q)table:([]a:10000000#0;b:10000000#1)
@@ -2224,7 +2241,7 @@ Where:

- `parquet_file` is a string containing the Parquet file name
- `row_groups` is an integer list (6h) of row group indices to read, or generic null (`::`) to read all row groups
- `columns` is an integer list (6h) of column indices to read, or generic null (`::`) to read all columns
-- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`.
+- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`.

returns the kdb+ table
@@ -2233,6 +2250,8 @@ Supported options:

- `PARQUET_MULTITHREADED_READ` - Flag indicating whether the Parquet reader should run in multithreaded mode. This can improve performance by processing multiple columns in parallel. Long, default 0.
- `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0.
- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0.
+- `NULL_MAPPING` - Sub-dictionary of null mapping datatypes and values. See [here](null-mapping.md) for more details.
+- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md) for more details. Long, default 0.

```q
q)table:([]a:10000000#0;b:10000000#1)
@@ -2260,7 +2279,7 @@ Where:

- `arrow_file` is a string containing the Arrow file name
- `schema_id` is the schema identifier to use for the table
- `array_data` is a mixed list of array data
-- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`.
+- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`.

returns generic null on success
@@ -2269,6 +2288,8 @@ The mixed list of Arrow array data should be ordered in schema field number and
Supported options:

- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0.
+- `NULL_MAPPING` - Sub-dictionary of null mapping datatypes and values. See [here](null-mapping.md) for more details.
+- `ARROW_CHUNK_ROWS` - The number of rows to include in each arrow array. If the total number of rows in the kdb data is greater than this value, the kdb+ lists are chunked into the arrow IPC writer.

```q
q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]]
@@ -2282,6 +2303,11 @@ q)read_data~array_data
1b
```

+> :warning: **When writing a large table Arrow may raise `'Capacity error: Cannot write arrays larger than 2^31 - 1 in length`**
+>
+> The preferred [way](https://arrow.apache.org/docs/python/ipc.html) to serialize such a table is to divide it into chunks by specifying the `ARROW_CHUNK_ROWS` option.
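+
+A sketch of chunked writing with the schema-based writer, mirroring the table-based example further below (the 2147483652-row byte column is illustrative):
+
+```q
+q)schema:.arrowkdb.sc.schema[enlist .arrowkdb.fd.field[`col;.arrowkdb.dt.int8[]]]
+q)array_data:enlist 2147483652#0x00
+q)options:(``ARROW_CHUNK_ROWS)!((::);214748365)
+q).arrowkdb.ipc.writeArrow["table.arrow";schema;array_data;options]
+```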
+
+
### `ipc.writeArrowFromTable`

*Convert a kdb+ table to an Arrow table and write to an Arrow file, inferring the schema from the kdb+ table structure*

@@ -2294,10 +2320,15 @@ Where:

- `arrow_file` is a string containing the Arrow file name
- `table` is a kdb+ table
-- `options` is reserved for future use - specify generic null (`::`)
+- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`.

returns generic null on success

+Supported options:
+
+- `NULL_MAPPING` - Sub-dictionary of null mapping datatypes and values. See [here](null-mapping.md) for more details.
+- `ARROW_CHUNK_ROWS` - The number of rows to include in each arrow array. If the total number of rows in the kdb data is greater than this value, the kdb+ lists are chunked into the arrow IPC writer.
+
> :warning: **Inferred schemas only support a subset of the Arrow datatypes and are considerably less flexible than creating them with the datatype/field/schema constructors**
>
> Each column in the table is mapped to a field in the schema. The column name is used as the field name and the column’s kdb+ type is mapped to an Arrow datatype as described [here](#inferred-datatypes).
@@ -2310,6 +2341,15 @@ q)read_table~table
1b
```

+> :warning: **When writing a large table Arrow may raise `'Capacity error: Cannot write arrays larger than 2^31 - 1 in length`**
+>
+> The preferred [way](https://arrow.apache.org/docs/python/ipc.html) to serialize such a table is to divide it into chunks by specifying the `ARROW_CHUNK_ROWS` option.
+
+```q
+table:([]col:2147483652#0x00)
+options:(``ARROW_CHUNK_ROWS)!((::);214748365)
+.arrowkdb.ipc.writeArrowFromTable["table.arrow";table;options]
+```

### `ipc.readArrowSchema`

*Read the schema from an Arrow file*

@@ -2344,7 +2384,7 @@ q).arrowkdb.sc.equalSchemas[schema;.arrowkdb.ipc.readArrowSchema["file.arrow"]]
Where:

- `arrow_file` is a string containing the Arrow file name
-- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`.
+- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`.

returns the array data
@@ -2352,6 +2392,8 @@ Supported options:

- `USE_MMAP` - Flag indicating whether the Arrow file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0.
- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0.
+- `NULL_MAPPING` - Sub-dictionary of null mapping datatypes and values. See [here](null-mapping.md) for more details.
+- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md) for more details. Long, default 0.

```q
q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]]
@@ -2376,7 +2418,7 @@ q)read_data~array_data
Where:

- `arrow_file` is a string containing the Arrow file name
-- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`.
+- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list.
Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`.

returns the kdb+ table
@@ -2386,6 +2428,8 @@ Supported options:

- `USE_MMAP` - Flag indicating whether the Arrow file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0.
- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0.
+- `NULL_MAPPING` - Sub-dictionary of null mapping datatypes and values. See [here](null-mapping.md) for more details.
+- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md) for more details. Long, default 0.

```q
q)table:([] int_field:(1 2 3); float_field:(4 5 6f); str_field:("aa";"bb";"cc"))
@@ -2409,7 +2453,7 @@ Where:

- `schema_id` is the schema identifier to use for the table
- `array_data` is a mixed list of array data
-- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`.
+- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`.

returns a byte list containing the serialized stream data
@@ -2418,6 +2462,8 @@ The mixed list of Arrow array data should be ordered in schema field number and
Supported options:

- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0.
+- `NULL_MAPPING` - Sub-dictionary of null mapping datatypes and values. See [here](null-mapping.md) for more details.
+- `ARROW_CHUNK_ROWS` - The number of rows to include in each arrow array. If the total number of rows in the kdb data is greater than this value, the kdb+ lists are chunked into the arrow IPC writer.

```q
q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]]
@@ -2431,6 +2477,10 @@ q)read_data~array_data
1b
```

+> :warning: **When writing a large table Arrow may raise `'Capacity error: Cannot write arrays larger than 2^31 - 1 in length`**
+>
+> The preferred [way](https://arrow.apache.org/docs/python/ipc.html) to serialize such a table is to divide it into chunks by specifying the `ARROW_CHUNK_ROWS` option.
+
### `ipc.serializeArrowFromTable`

*Convert a kdb+ table to an Arrow table and serialize to an Arrow stream, inferring the schema from the kdb+ table structure*

@@ -2442,10 +2492,15 @@ Where:

- `table` is a kdb+ table
-- `options` is reserved for future use - specify generic null (`::`)
+- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`.

returns a byte list containing the serialized stream data

+Supported options:
+
+- `NULL_MAPPING` - Sub-dictionary of null mapping datatypes and values. See [here](null-mapping.md) for more details.
+- `ARROW_CHUNK_ROWS` - The number of rows to include in each arrow array. If the total number of rows in the kdb data is greater than this value, the kdb+ lists are chunked into the arrow IPC writer.
+
> :warning: **Inferred schemas only support a subset of the Arrow datatypes and are considerably less flexible than creating them with the datatype/field/schema constructors**
>
> Each column in the table is mapped to a field in the schema. The column name is used as the field name and the column’s kdb+ type is mapped to an Arrow datatype as described [here](#inferred-datatypes).
@@ -2458,6 +2513,16 @@ q)new_table~table
1b
```

+> :warning: **When writing a large table Arrow may raise `'Capacity error: Cannot write arrays larger than 2^31 - 1 in length`**
+>
+> The preferred [way](https://arrow.apache.org/docs/python/ipc.html) to serialize such a table is to divide it into chunks by specifying the `ARROW_CHUNK_ROWS` option.
+
+```q
+table:([]col:2147483652#0x00)
+options:(``ARROW_CHUNK_ROWS)!((::);214748365)
+serialized:.arrowkdb.ipc.serializeArrowFromTable[table;options]
+```
+
### `ipc.parseArrowSchema`

*Parse the schema from an Arrow stream*

@@ -2492,13 +2557,15 @@ q).arrowkdb.sc.equalSchemas[schema;.arrowkdb.ipc.parseArrowSchema[serialized]]
Where:

- `serialized` is a byte list containing the serialized stream data
-- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`.
+- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`.

returns the array data

Supported options:

- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0.
+- `NULL_MAPPING` - Sub-dictionary of null mapping datatypes and values. See [here](null-mapping.md) for more details.
+- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md) for more details. Long, default 0.

```q
q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]]
@@ -2523,7 +2590,7 @@ q)read_data~array_data
Where:

- `serialized` is a byte list containing the serialized stream data
-- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`.
+- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`.

returns the kdb+ table
@@ -2532,6 +2599,8 @@ Each schema field name is used as the column name and the Arrow array data is us
Supported options:

- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0.
+- `NULL_MAPPING` - Sub-dictionary of null mapping datatypes and values. See [here](null-mapping.md) for more details.
+- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md) for more details. Long, default 0.
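+
+For instance, a hedged sketch returning the table together with its null bitmap (assuming `serialized` was produced with nulls mapped, as in the null-bitmap document):
+
+```q
+q)read_results:.arrowkdb.ipc.parseArrowToTable[serialized;(enlist `WITH_NULL_BITMAP)!enlist 1]
+q)first read_results    / the data table
+q)last read_results     / the null bitmap table
+```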
```q
q)table:([] int_field:(1 2 3); float_field:(4 5 6f); str_field:("aa";"bb";"cc"))
diff --git a/examples/batching_tables.q b/examples/batching_tables.q
new file mode 100644
index 0000000..1078c3d
--- /dev/null
+++ b/examples/batching_tables.q
@@ -0,0 +1,47 @@
+// batching_tables.q
+// Example of batching a large table into chunks when writing to parquet and arrow
+
+-1"\n+----------|| batching_tables.q ||----------+\n";
+
+// import the arrowkdb library
+\l q/arrowkdb.q
+
+// Filesystem functions for Linux/MacOS/Windows
+ls:{[filename] $[.z.o like "w*";system "dir /b ",filename;system "ls ",filename]};
+rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]};
+
+//-------------------//
+// Create the table  //
+//-------------------//
+
+// Support batching of large tables
+
+// Create data for a large column in the table
+batching_table:([]col:2147483652#0x00)
+.arrowkdb.ts.writeReadArray[.arrowkdb.dt.int8[];batching_table`col;::]
+
+// Write the batching table data to a parquet file
+batching_options:(``PARQUET_VERSION)!((::);`V2.0)
+
+parquet_batching:"batching_table.parquet";
+.arrowkdb.pq.writeParquetFromTable[parquet_batching;batching_table;batching_options]
+show ls parquet_batching
+rm parquet_batching
+
+// Write the batching array data to an arrow file
+batching_options[`ARROW_CHUNK_ROWS]:214748365
+
+arrow_batching:"batching_table.arrow";
+.arrowkdb.ipc.writeArrowFromTable[arrow_batching;batching_table;batching_options]
+show ls arrow_batching
+rm arrow_batching;
+
+// Serialize the batching array data to an arrow stream
+serialized_batching:.arrowkdb.ipc.serializeArrowFromTable[batching_table;batching_options];
+show serialized_batching
+
+
+-1 "\n+----------------------------------------+\n";
+
+// Process off
+exit 0;
diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q
new file mode 100644
index 0000000..9dbb7b4
--- /dev/null
+++ b/examples/null_bitmap.q
@@ -0,0 +1,362 @@
+// null_bitmap.q
+// Example of exposing the null bitmap as a separate structure to kdb
+
+-1"\n+----------|| null_bitmap.q ||----------+\n";
+
+// import the arrowkdb library
+\l q/arrowkdb.q
+
+// Filesystem functions for Linux/MacOS/Windows
+ls:{[filename] $[.z.o like "w*";system "dir /b ",filename;system "ls ",filename]};
+rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]};
+
+/////////////////////////
+// CONSTRUCTED SCHEMAS //
+/////////////////////////
+
+//-------------------//
+// Create the schema //
+//-------------------//
+
+// Support null mapping
+bitmap_opts:(`bool`int32`float64`utf8`date32)!(0b;1i;2.34;"start";2006.07.21);
+nested_struct_opts:(`uint16`float32`binary`time64)!(9h;8.76e;"x"$"acknowledge";00:00:00.123456789);
+nested_dict_opts:(enlist `int64)!(enlist 5);
+
+nested_options:(``NULL_MAPPING)!((::);bitmap_opts,nested_struct_opts,nested_dict_opts);
+
+// Create the datatype identifiers
+ts_dt:.arrowkdb.dt.timestamp[`nano];
+
+bool_dt:.arrowkdb.dt.boolean[];
+i32_dt:.arrowkdb.dt.int32[];
+f64_dt:.arrowkdb.dt.float64[];
+str_dt:.arrowkdb.dt.utf8[];
+d32_dt:.arrowkdb.dt.date32[];
+
+ui16_dt:.arrowkdb.dt.uint16[];
+
+f32_dt:.arrowkdb.dt.float32[];
+bin_dt:.arrowkdb.dt.binary[];
+t64_dt:.arrowkdb.dt.time64[`nano];
+
+i64_dt:.arrowkdb.dt.int64[];
+
+// Create the field identifiers
+ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt];
+
+bool_fd:.arrowkdb.fd.field[`bool;bool_dt];
+i32_fd:.arrowkdb.fd.field[`int32;i32_dt];
+f64_fd:.arrowkdb.fd.field[`float64;f64_dt];
+str_fd:.arrowkdb.fd.field[`string;str_dt];
+d32_fd:.arrowkdb.fd.field[`date32;d32_dt];
+
+ui16_fd:.arrowkdb.fd.field[`uint16;ui16_dt];
+
+f32_fd:.arrowkdb.fd.field[`float32;f32_dt];
+bin_fd:.arrowkdb.fd.field[`binary;bin_dt];
+t64_fd:.arrowkdb.fd.field[`time64;t64_dt];
+
+i64_fd:.arrowkdb.fd.field[`int64;i64_dt];
+
+// Create a field containing the list datatype
+list_dt:.arrowkdb.dt.list[ui16_dt];
+list_fd:.arrowkdb.fd.field[`list_field;list_dt];
+
+// Create a field containing the struct datatype
+struct_dt:.arrowkdb.dt.struct[(f32_fd,bin_fd,t64_fd)];
+struct_fd:.arrowkdb.fd.field[`struct_field;struct_dt];
+
+// Create fields containing dictionary datatypes
+dict_dt:.arrowkdb.dt.dictionary[str_dt;i64_dt]
+dict_fd:.arrowkdb.fd.field[`dictionary;dict_dt]
+map_dt:.arrowkdb.dt.map[i64_dt;f64_dt]
+map_fd:.arrowkdb.fd.field[`map;map_dt];
+
+// Create fields containing union datatypes
+sparse_dt:.arrowkdb.dt.sparse_union[(i64_fd,f64_fd)]
+sparse_fd:.arrowkdb.fd.field[`sparse_union;sparse_dt]
+dense_dt:.arrowkdb.dt.dense_union[(i64_fd,f64_fd)]
+dense_fd:.arrowkdb.fd.field[`dense_union;dense_dt]
+
+// Create the schema for the primitive fields
+bitmap_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,i32_fd,f64_fd,str_fd,d32_fd)];
+
+// Create the schema containing the list and struct fields
+struct_schema:.arrowkdb.sc.schema[(list_fd,struct_fd)];
+
+// Create the schema containing the dictionary and map fields
+dict_schema:.arrowkdb.sc.schema[(dict_fd, map_fd)];
+
+// Create the schema containing the sparse and dense union fields
+union_schema:.arrowkdb.sc.schema[(sparse_fd, dense_fd)]
+
+// Print the schemas
+-1"\nBitmap schema:";
+.arrowkdb.sc.printSchema[bitmap_schema];
+
+-1"\nStruct schema:";
+.arrowkdb.sc.printSchema[struct_schema];
+
+-1"\nDict schema:";
+.arrowkdb.sc.printSchema[dict_schema];
+
+-1"\nUnion schema:";
+.arrowkdb.sc.printSchema[union_schema];
+
+// Number of items in each array
+N:10
+
+// Create data for each column in the table
+ts_data:asc N?0p;
+
+bool_data:N?(0b;1b);
+bool_data[0]:0b;
+i32_data:N?100i;
+i32_data[1]:1i;
+f64_data:N?100f;
+f64_data[2]:2.34f;
+str_data:N?("start";"stop";"alert";"acknowledge";"");
+str_data[3]:"start"
+d32_data:N?(2006.07.21;2005.07.18;2004.07.16;2003.07.15;2002.07.11);
+d32_data[4]:2006.07.21;
+
+// Create the data for each of the struct child fields
+f32_data:5?100e;
+f32_data[0]:8.76e;
+bin_data:5?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$"");
+bin_data[1]:"x"$"acknowledge"
+t64_data:5?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000);
+t64_data[2]:00:00:00.123456789;
+
+// Create the data for the union child fields
+i64_data:N?100;
+i64_data[0]:1;
+
+// Combine the data for the primitive columns
+bitmap_data:(ts_data;bool_data;i32_data;f64_data;str_data;d32_data);
+
+// Combine the array data for the list and struct columns
+list_array:(enlist 9h;(8h;7h);(6h;5h;4h);(1h;2h;3h;4h);(5h;6h;7h;8h;9h));
+struct_array:(f32_data;bin_data;t64_data);
+struct_data:(list_array;struct_array);
+
+// Combine the array data for the dictionary and map columns
+dict_data:(("aa";"bb";"cc");(2 0 1))
+map_data:((enlist 1)!(enlist 1f);(2 2)!(2 2.34f);(3 3 3)!(3 3 3f))
+dict_data:(dict_data;map_data);
+
+// Combine the array data for the union columns
+sparse_data:dense_data:(0 1 0h;5 2 3;4 2.34 6f)
+union_data:(sparse_data;dense_data)
+
+// Pretty print the Arrow table populated from the bitmap data
+-1"\nBitmap table:";
+.arrowkdb.tb.prettyPrintTable[bitmap_schema;bitmap_data;nested_options];
+
+// Show the array data as an arrow table
+-1"\nStruct table:"; +.arrowkdb.tb.prettyPrintTable[struct_schema;struct_data;nested_options] + +// Show the array data as an arrow table +-1"\nDict table:"; +.arrowkdb.tb.prettyPrintTable[dict_schema;dict_data;nested_options] + +// Show the array data as an arrow table +-1"\nUnion table:"; +.arrowkdb.tb.prettyPrintTable[union_schema;union_data;nested_options] + +//-------------------------// +// Example-1. Parquet file // +//-------------------------// + +// Write the schema and array data to a parquet file +nested_options[`PARQUET_VERSION]:`V2.0; + +parquet_null_bitmap:"null_bitmap.parquet"; +parquet_nested_struct:"nested_struct.parquet"; +parquet_nested_dict:"nested_dict.parquet"; +parquet_nested_union:"nested_union.parquet"; + +.arrowkdb.pq.writeParquet[parquet_null_bitmap;bitmap_schema;bitmap_data;nested_options]; +.arrowkdb.pq.writeParquet[parquet_nested_struct;struct_schema;struct_data;nested_options]; +.arrowkdb.pq.writeParquet[parquet_nested_dict;dict_schema;dict_data;nested_options]; + +show ls parquet_null_bitmap +show ls parquet_nested_struct +show ls parquet_nested_dict + +// Read the schema back and compare +nested_options[`WITH_NULL_BITMAP]:1; + +parquet_bitmap_schema:.arrowkdb.pq.readParquetSchema[parquet_null_bitmap]; +parquet_struct_schema:.arrowkdb.pq.readParquetSchema[parquet_nested_struct]; + +show .arrowkdb.sc.equalSchemas[bitmap_schema;parquet_bitmap_schema] +show .arrowkdb.sc.equalSchemas[struct_schema;parquet_struct_schema] + +show bitmap_schema~parquet_bitmap_schema +show struct_schema~parquet_struct_schema + +// Read the array data back and compare +parquet_bitmap_data:.arrowkdb.pq.readParquetData[parquet_null_bitmap;nested_options]; +parquet_struct_data:.arrowkdb.pq.readParquetData[parquet_nested_struct;nested_options]; +parquet_dict_data:.arrowkdb.pq.readParquetData[parquet_nested_dict;nested_options]; + +show bitmap_data~first parquet_bitmap_data +show struct_data~first parquet_struct_data +show first[dict_data[0]]~asc first parquet_dict_data[0] +show last[dict_data]~last parquet_dict_data[0] + +// Compare null bitmaps of parquet data +nulls_data:1b,(N-1)?1b; +bitmap_nulls:{x rotate nulls_data} each neg til {x-1} count bitmap_data; +nested_list_nulls:(enlist 1b;00b;000b;0000b;00001b); +nested_struct_nulls:(10000b;01000b;00100b); +nested_dict_nulls:(000b;000b); +nested_map_nulls:((enlist 0b)!(enlist 0b);00b!01b;000b!000b); +nested_union_nulls:((0 1 0h);100b;010b); + +parquet_bitmap_nulls:last parquet_bitmap_data; +parquet_list_nulls:first parquet_struct_data[1] +parquet_struct_nulls:last parquet_struct_data[1] +parquet_dict_nulls:parquet_dict_data[1] + +show bitmap_nulls~bitmap_nulls & sublist[{1-x} count parquet_bitmap_nulls;parquet_bitmap_nulls] +show nested_list_nulls~parquet_list_nulls +show nested_struct_nulls~parquet_struct_nulls +show nested_dict_nulls[0]~parquet_dict_nulls[0] +show nested_map_nulls~last[parquet_dict_nulls] + +rm parquet_null_bitmap; +rm parquet_nested_struct; +rm parquet_nested_dict; + +//---------------------------// +// Example-2. 
Arrow IPC file // +//---------------------------// + +// Write the schema and array data to an arrow file +arrow_null_bitmap:"null_bitmap.arrow"; +arrow_struct_bitmap:"nested_struct.arrow"; +arrow_dict_bitmap:"nested_dict.arrow"; +arrow_union_bitmap:"nested_union.arrow"; + +.arrowkdb.ipc.writeArrow[arrow_null_bitmap;bitmap_schema;bitmap_data;nested_options]; +.arrowkdb.ipc.writeArrow[arrow_struct_bitmap;struct_schema;struct_data;nested_options]; +.arrowkdb.ipc.writeArrow[arrow_dict_bitmap;dict_schema;dict_data;nested_options]; +.arrowkdb.ipc.writeArrow[arrow_union_bitmap;union_schema;union_data;nested_options]; + +show ls arrow_null_bitmap +show ls arrow_struct_bitmap +show ls arrow_dict_bitmap +show ls arrow_union_bitmap + +// Read the schema back and compare +arrow_bitmap_schema:.arrowkdb.ipc.readArrowSchema[arrow_null_bitmap]; +arrow_struct_schema:.arrowkdb.ipc.readArrowSchema[arrow_struct_bitmap]; +arrow_dict_schema:.arrowkdb.ipc.readArrowSchema[arrow_dict_bitmap]; +arrow_union_schema:.arrowkdb.ipc.readArrowSchema[arrow_union_bitmap]; + +show .arrowkdb.sc.equalSchemas[bitmap_schema;arrow_bitmap_schema] +show .arrowkdb.sc.equalSchemas[struct_schema;arrow_struct_schema] +show .arrowkdb.sc.equalSchemas[dict_schema;arrow_dict_schema] +show .arrowkdb.sc.equalSchemas[union_schema;arrow_union_schema] + +show bitmap_schema~arrow_bitmap_schema +show struct_schema~arrow_struct_schema +show dict_schema~arrow_dict_schema +show union_schema~arrow_union_schema + +// Read the array data back and compare +arrow_bitmap_data:.arrowkdb.ipc.readArrowData[arrow_null_bitmap;nested_options]; +arrow_struct_data:.arrowkdb.ipc.readArrowData[arrow_struct_bitmap;nested_options]; +arrow_dict_data:.arrowkdb.ipc.readArrowData[arrow_dict_bitmap;nested_options]; +arrow_union_data:.arrowkdb.ipc.readArrowData[arrow_union_bitmap;nested_options]; + +show bitmap_data~first arrow_bitmap_data +show struct_data~first arrow_struct_data +show dict_data~first arrow_dict_data +show union_data~first arrow_union_data + +// Compare null bitmaps of arrow data +arrow_bitmap_nulls:last arrow_bitmap_data; +arrow_list_nulls:first arrow_struct_data[1] +arrow_struct_nulls:last arrow_struct_data[1] +arrow_dict_nulls:arrow_dict_data[1] +arrow_union_nulls:arrow_union_data[1] + +show bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bitmap_nulls] +show nested_list_nulls~arrow_list_nulls +show nested_struct_nulls~arrow_struct_nulls +show nested_dict_nulls~first[arrow_dict_nulls] +show nested_map_nulls~last[arrow_dict_nulls] +show nested_union_nulls~arrow_union_nulls[0] +show nested_union_nulls~arrow_union_nulls[1] + +rm arrow_null_bitmap; +rm arrow_struct_bitmap; +rm arrow_dict_bitmap; +rm arrow_union_bitmap; + +//-----------------------------// +// Example-3. 
Arrow IPC stream //
+
+// Serialize the schema and array data to an arrow stream
+serialized_null_bitmap:.arrowkdb.ipc.serializeArrow[bitmap_schema;bitmap_data;nested_options];
+serialized_nested_struct:.arrowkdb.ipc.serializeArrow[struct_schema;struct_data;nested_options];
+serialized_nested_dict:.arrowkdb.ipc.serializeArrow[dict_schema;dict_data;nested_options];
+serialized_nested_union:.arrowkdb.ipc.serializeArrow[union_schema;union_data;nested_options];
+
+show serialized_null_bitmap
+show serialized_nested_struct
+show serialized_nested_dict
+show serialized_nested_union
+
+// Parse the schema back and compare
+stream_bitmap_schema:.arrowkdb.ipc.parseArrowSchema[serialized_null_bitmap];
+stream_struct_schema:.arrowkdb.ipc.parseArrowSchema[serialized_nested_struct];
+stream_dict_schema:.arrowkdb.ipc.parseArrowSchema[serialized_nested_dict];
+stream_union_schema:.arrowkdb.ipc.parseArrowSchema[serialized_nested_union];
+
+show .arrowkdb.sc.equalSchemas[bitmap_schema;stream_bitmap_schema]
+show .arrowkdb.sc.equalSchemas[struct_schema;stream_struct_schema]
+show .arrowkdb.sc.equalSchemas[dict_schema;stream_dict_schema]
+show .arrowkdb.sc.equalSchemas[union_schema;stream_union_schema]
+
+show bitmap_schema~stream_bitmap_schema
+show struct_schema~stream_struct_schema
+show dict_schema~stream_dict_schema
+show union_schema~stream_union_schema
+
+// Parse the array data back and compare
+stream_bitmap_data:.arrowkdb.ipc.parseArrowData[serialized_null_bitmap;nested_options];
+stream_struct_data:.arrowkdb.ipc.parseArrowData[serialized_nested_struct;nested_options];
+stream_dict_data:.arrowkdb.ipc.parseArrowData[serialized_nested_dict;nested_options];
+stream_union_data:.arrowkdb.ipc.parseArrowData[serialized_nested_union;nested_options];
+
+show bitmap_data~first stream_bitmap_data
+show struct_data~first stream_struct_data
+show dict_data~first stream_dict_data
+show union_data~first stream_union_data
+
+// Compare null bitmaps of stream data
+stream_bitmap_nulls:last stream_bitmap_data;
+stream_list_nulls:first stream_struct_data[1]
+stream_struct_nulls:last stream_struct_data[1]
+stream_dict_nulls:stream_dict_data[1]
+stream_union_nulls:stream_union_data[1]
+
+show bitmap_nulls~bitmap_nulls & sublist[{1-x} count stream_bitmap_nulls;stream_bitmap_nulls]
+show nested_list_nulls~stream_list_nulls
+show nested_struct_nulls~stream_struct_nulls
+show nested_dict_nulls~first[stream_dict_nulls]
+show nested_map_nulls~last[stream_dict_nulls]
+show nested_union_nulls~stream_union_nulls[0]
+show nested_union_nulls~stream_union_nulls[1]
+
+-1 "\n+----------------------------------------+\n";
+
+// Process off
+//exit 0;
diff --git a/examples/null_mapping.q b/examples/null_mapping.q
new file mode 100644
index 0000000..be083f6
--- /dev/null
+++ b/examples/null_mapping.q
@@ -0,0 +1,431 @@
+// null_mapping.q
+// Examples of creating a schema supporting null mapping and using it to read/write parquet and arrow tables
+
+-1"\n+----------|| null_mapping.q ||----------+\n";
+
+// import the arrowkdb library
+\l q/arrowkdb.q
+
+// Filesystem functions for Linux/MacOS/Windows
+ls:{[filename] $[.z.o like "w*";system "dir /b ",filename;system "ls ",filename]};
+rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]};
+
+/////////////////////////
+// CONSTRUCTED SCHEMAS //
+/////////////////////////
+
+//-------------------//
+// Create the schema //
+//-------------------//
+
+// Support null mapping in parquet and arrow
+short_opts:(`bool`uint8`int8`uint16`int16)!(0b;0x01;0x02;3h;4h); +long_opts:(`uint32`int32`uint64`int64)!(5i;6i;7;8); +float_opts:(`float32`float64`decimal)!(1.23e;4.56;7.89); +str_opts:(`utf8`binary`fixed_size_binary)!("start";"x"$"alert";0Ng); +time_opts:(`date32`timestamp`time64)!(2006.07.21;2011.01.01D00:00:00.000000000;12:00:00.000000000); + +// Support null mapping only in arrow +extra_opts:(`float16`large_utf8`large_binary`duration)!(9h;"stop";"x"$"acknowledge";12:00:00.000000000); +other_opts:(`date64`time32`month_interval`day_time_interval)!(2015.01.01D00:00:00.000000000;09:01:02.042;2006.07m;12:00:00.000000000); + +options:(``NULL_MAPPING)!((::);short_opts,long_opts,float_opts,str_opts,time_opts,extra_opts,other_opts); + +// Create the datatype identifiers +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +bool_dt:.arrowkdb.dt.boolean[]; +ui8_dt:.arrowkdb.dt.uint8[]; +i8_dt:.arrowkdb.dt.int8[]; +ui16_dt:.arrowkdb.dt.uint16[]; +i16_dt:.arrowkdb.dt.int16[]; + +ui32_dt:.arrowkdb.dt.uint32[]; +i32_dt:.arrowkdb.dt.int32[]; +ui64_dt:.arrowkdb.dt.uint64[]; +i64_dt:.arrowkdb.dt.int64[]; + +f32_dt:.arrowkdb.dt.float32[]; +f64_dt:.arrowkdb.dt.float64[]; +dec_dt:.arrowkdb.dt.decimal128[38i;2i]; + +str_dt:.arrowkdb.dt.utf8[]; +bin_dt:.arrowkdb.dt.binary[]; +fbin_dt:.arrowkdb.dt.fixed_size_binary[16i]; + +d32_dt:.arrowkdb.dt.date32[]; +tstamp_dt:.arrowkdb.dt.timestamp[`nano]; +t64_dt:.arrowkdb.dt.time64[`nano]; + +f16_dt:.arrowkdb.dt.float16[]; +lstr_dt:.arrowkdb.dt.large_utf8[]; +lbin_dt:.arrowkdb.dt.large_binary[]; +dur_dt:.arrowkdb.dt.duration[`milli]; + +d64_dt:.arrowkdb.dt.date64[]; +t32_dt:.arrowkdb.dt.time32[`milli]; +mint_dt:.arrowkdb.dt.month_interval[]; +dtint_dt:.arrowkdb.dt.day_time_interval[]; + +// Create the field identifiers +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +bool_fd:.arrowkdb.fd.field[`bool;bool_dt]; +ui8_fd:.arrowkdb.fd.field[`uint8;ui8_dt]; +i8_fd:.arrowkdb.fd.field[`int8;i8_dt]; +ui16_fd:.arrowkdb.fd.field[`uint16;ui16_dt]; +i16_fd:.arrowkdb.fd.field[`int16;i16_dt]; + +ui32_fd:.arrowkdb.fd.field[`uint32;ui32_dt]; +i32_fd:.arrowkdb.fd.field[`int32;i32_dt]; +ui64_fd:.arrowkdb.fd.field[`uint64;ui64_dt]; +i64_fd:.arrowkdb.fd.field[`int64;i64_dt]; + +f32_fd:.arrowkdb.fd.field[`float32;f32_dt]; +f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; +dec_fd:.arrowkdb.fd.field[`decimal;dec_dt]; + +str_fd:.arrowkdb.fd.field[`string;str_dt]; +bin_fd:.arrowkdb.fd.field[`binary;bin_dt]; +fbin_fd:.arrowkdb.fd.field[`fixed_binary;fbin_dt]; + +d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; +tstamp_fd:.arrowkdb.fd.field[`timestamp;tstamp_dt]; +t64_fd:.arrowkdb.fd.field[`time64;t64_dt]; + +f16_fd:.arrowkdb.fd.field[`float16;f16_dt]; +lstr_fd:.arrowkdb.fd.field[`large_string;lstr_dt]; +lbin_fd:.arrowkdb.fd.field[`large_binary;lbin_dt]; +dur_fd:.arrowkdb.fd.field[`duration;dur_dt]; + +d64_fd:.arrowkdb.fd.field[`date64;d64_dt]; +t32_fd:.arrowkdb.fd.field[`time32;t32_dt]; +mint_fd:.arrowkdb.fd.field[`month_interval;mint_dt]; +dtint_fd:.arrowkdb.fd.field[`day_time_interval;dtint_dt]; + +// Create the schemas for the list of fields +short_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,ui8_fd,i8_fd,ui16_fd,i16_fd)]; +long_schema:.arrowkdb.sc.schema[(ts_fd,ui32_fd,i32_fd,ui64_fd,i64_fd)]; +float_schema:.arrowkdb.sc.schema[(ts_fd,f32_fd,f64_fd,dec_fd)]; +str_schema:.arrowkdb.sc.schema[(ts_fd,str_fd,bin_fd,fbin_fd)]; +time_schema:.arrowkdb.sc.schema[(ts_fd,d32_fd,tstamp_fd,t64_fd)]; + +extra_schema:.arrowkdb.sc.schema[(ts_fd,f16_fd,lstr_fd,lbin_fd,dur_fd)]; 
+other_schema:.arrowkdb.sc.schema[(ts_fd,d64_fd,t32_fd,mint_fd,dtint_fd)]; + +// Print the schemas +.arrowkdb.sc.printSchema[short_schema]; +.arrowkdb.sc.printSchema[long_schema]; +.arrowkdb.sc.printSchema[float_schema] +.arrowkdb.sc.printSchema[str_schema]; +.arrowkdb.sc.printSchema[time_schema]; + +.arrowkdb.sc.printSchema[extra_schema]; +.arrowkdb.sc.printSchema[other_schema]; + +//-----------------------// +// Create the array data // +//-----------------------// + +// Number of items in each array +N:10 + +// Create data for each column in the table +ts_data:asc N?0p; + +bool_data:N?(0b;1b); +bool_data[0]:0b; +ui8_data:N?0x64; +ui8_data[1]:0x01; +i8_data:N?0x64; +i8_data[2]:0x02; +ui16_data:N?100h; +ui16_data[3]:3h; +i16_data:N?100h; +i16_data[4]:4h; + +ui32_data:N?100i; +ui32_data[0]:5i; +i32_data:N?100i; +i32_data[1]:6i; +ui64_data:N?100; +ui64_data[2]:7; +i64_data:N?100; +i64_data[3]:8; + +f32_data:N?100e; +f32_data[0]:1.23e; +f64_data:N?100f; +f64_data[1]:4.56f; +dec_data:{"F"$.Q.f[2]x} each N?(10f) +dec_data[2]:7.89f + +str_data:N?("start";"stop";"alert";"acknowledge";""); +str_data[0]:"start" +bin_data:N?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); +bin_data[2]:"x"$"alert" +fbin_data:N?0Ng; +fbin_data[4]:0Ng; + +d32_data:N?(2006.07.21;2008.07.18;2012.07.16;2014.07.15;2016.07.11); +d32_data[0]:2006.07.21; +tstamp_data:N?(2015.01.01D00:00:00.000000000;2014.01.01D00:00:00.000000000;2013.01.01D00:00:00.000000000;2012.01.01D00:00:00.000000000;2011.01.01D00:00:00.000000000); +tstamp_data[2]:2011.01.01D00:00:00.000000000; +t64_data:N?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); +t64_data[3]:12:00:00.000000000; + +f16_data:N?100h; +f16_data[0]:9h; +lstr_data:N?("start";"stop";"alert";"acknowledge";""); +lstr_data[1]:"stop" +lbin_data:N?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); +lbin_data[3]:"x"$"acknowledge" +dur_data:N?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); +dur_data[4]:12:00:00.000000000; + +d64_data:N?(2015.01.01D00:00:00.000000000;2017.01.01D00:00:00.000000000;2018.01.01D00:00:00.000000000;2019.01.01D00:00:00.000000000;2020.01.01D00:00:00.000000000); +d64_data[1]:2015.01.01D00:00:00.000000000; +t32_data:N?(09:01:02.042;08:01:02.042;07:01:02.042;06:01:02.042;05:01:02.042); +t32_data[1]:09:01:02.042; +mint_data:N?(2006.07m;2006.06m;2006.05m;2006.04m;2006.03m); +mint_data[2]:2006.07m; +dtint_data:N?(12:00:00.000000000;11:00:00.000000000;10:00:00.000000000;09:00:00.000000000;08:00:00.000000000); +dtint_data[3]:12:00:00.000000000; + +// Combine the data for all columns +short_data:(ts_data;bool_data;ui8_data;i8_data;ui16_data;i16_data); +long_data:(ts_data;ui32_data;i32_data;ui64_data;i64_data); +float_data:(ts_data;f32_data;f64_data;dec_data); +str_data:(ts_data;str_data;bin_data;fbin_data); +time_data:(ts_data;d32_data;tstamp_data;t64_data); + +extra_data:(ts_data;f16_data;lstr_data;lbin_data;dur_data); +other_data:(ts_data;d64_data;t32_data;mint_data;dtint_data); + +// Pretty print the Arrow table populated from the array data +options[`DECIMAL128_AS_DOUBLE]:1 + +.arrowkdb.tb.prettyPrintTable[short_schema;short_data;options]; +.arrowkdb.tb.prettyPrintTable[long_schema;long_data;options]; +.arrowkdb.tb.prettyPrintTable[float_schema;float_data;options]; +.arrowkdb.tb.prettyPrintTable[str_schema;str_data;options]; +.arrowkdb.tb.prettyPrintTable[time_schema;time_data;options]; +.arrowkdb.tb.prettyPrintTable[extra_schema;extra_data;options]; 
+.arrowkdb.tb.prettyPrintTable[other_schema;other_data;options]; + +//-------------------------// +// Example-1. Parquet file // +//-------------------------// + +// Write the schema and array data to a parquet file +options[`PARQUET_VERSION]:`V2.0 + +parquet_short:"null_mapping_short.parquet"; +parquet_long:"null_mapping_long.parquet"; +parquet_float:"null_mapping_float.parquet"; +parquet_str:"null_mapping_str.parquet"; +parquet_time:"null_mapping_time.parquet"; + +.arrowkdb.pq.writeParquet[parquet_short;short_schema;short_data;options]; +.arrowkdb.pq.writeParquet[parquet_long;long_schema;long_data;options]; +.arrowkdb.pq.writeParquet[parquet_float;float_schema;float_data;options]; +.arrowkdb.pq.writeParquet[parquet_str;str_schema;str_data;options]; +.arrowkdb.pq.writeParquet[parquet_time;time_schema;time_data;options]; + +show ls parquet_short +show ls parquet_long +show ls parquet_float +show ls parquet_str +show ls parquet_time + +// Read the schema back and compare +parquet_short_schema:.arrowkdb.pq.readParquetSchema[parquet_short]; +parquet_long_schema:.arrowkdb.pq.readParquetSchema[parquet_long]; +parquet_float_schema:.arrowkdb.pq.readParquetSchema[parquet_float]; +parquet_str_schema:.arrowkdb.pq.readParquetSchema[parquet_str]; +parquet_time_schema:.arrowkdb.pq.readParquetSchema[parquet_time]; + +show .arrowkdb.sc.equalSchemas[short_schema;parquet_short_schema] +show .arrowkdb.sc.equalSchemas[long_schema;parquet_long_schema] +show .arrowkdb.sc.equalSchemas[float_schema;parquet_float_schema] +show .arrowkdb.sc.equalSchemas[str_schema;parquet_str_schema] +show .arrowkdb.sc.equalSchemas[time_schema;parquet_time_schema] + +show short_schema~parquet_short_schema +show long_schema~parquet_long_schema +show float_schema~parquet_float_schema +show str_schema~parquet_str_schema +show time_schema~parquet_time_schema + +// Read the array data back and compare +parquet_short_data:.arrowkdb.pq.readParquetData[parquet_short;options]; +parquet_long_data:.arrowkdb.pq.readParquetData[parquet_long;options]; +parquet_float_data:.arrowkdb.pq.readParquetData[parquet_float;options]; +parquet_str_data:.arrowkdb.pq.readParquetData[parquet_str;options]; +parquet_time_data:.arrowkdb.pq.readParquetData[parquet_time;options]; + +parquet_str_data[3]:{0x0 sv x} each parquet_str_data[3] // Convert to GUIDs + +show short_data~parquet_short_data +show long_data~parquet_long_data +show float_data~parquet_float_data +show str_data~parquet_str_data +show time_data~parquet_time_data + +rm parquet_short; +rm parquet_long; +rm parquet_float; +rm parquet_str; +rm parquet_time; + +//---------------------------// +// Example-2. 
Arrow IPC file // +//---------------------------// + +// Write the schema and array data to an arrow file +arrow_short:"null_mapping_short.arrow"; +arrow_long:"null_mapping_long.arrow"; +arrow_float:"null_mapping_float.arrow"; +arrow_str:"null_mapping_str.arrow"; +arrow_time:"null_mapping_time.arrow"; +arrow_extra:"null_mapping_extra.arrow"; +arrow_other:"null_mapping_other.arrow"; + +.arrowkdb.ipc.writeArrow[arrow_short;short_schema;short_data;options]; +.arrowkdb.ipc.writeArrow[arrow_long;long_schema;long_data;options]; +.arrowkdb.ipc.writeArrow[arrow_float;float_schema;float_data;options]; +.arrowkdb.ipc.writeArrow[arrow_str;str_schema;str_data;options]; +.arrowkdb.ipc.writeArrow[arrow_time;time_schema;time_data;options]; +.arrowkdb.ipc.writeArrow[arrow_extra;extra_schema;extra_data;options]; +.arrowkdb.ipc.writeArrow[arrow_other;other_schema;other_data;options]; + +show ls arrow_short +show ls arrow_long +show ls arrow_float +show ls arrow_str +show ls arrow_time +show ls arrow_extra +show ls arrow_other + +// Read the schema back and compare +arrow_short_schema:.arrowkdb.ipc.readArrowSchema[arrow_short]; +arrow_long_schema:.arrowkdb.ipc.readArrowSchema[arrow_long]; +arrow_float_schema:.arrowkdb.ipc.readArrowSchema[arrow_float]; +arrow_str_schema:.arrowkdb.ipc.readArrowSchema[arrow_str]; +arrow_time_schema:.arrowkdb.ipc.readArrowSchema[arrow_time]; +arrow_extra_schema:.arrowkdb.ipc.readArrowSchema[arrow_extra]; +arrow_other_schema:.arrowkdb.ipc.readArrowSchema[arrow_other]; + +show .arrowkdb.sc.equalSchemas[short_schema;arrow_short_schema] +show .arrowkdb.sc.equalSchemas[long_schema;arrow_long_schema] +show .arrowkdb.sc.equalSchemas[float_schema;arrow_float_schema] +show .arrowkdb.sc.equalSchemas[str_schema;arrow_str_schema] +show .arrowkdb.sc.equalSchemas[time_schema;arrow_time_schema] +show .arrowkdb.sc.equalSchemas[extra_schema;arrow_extra_schema] +show .arrowkdb.sc.equalSchemas[other_schema;arrow_other_schema] + +show short_schema~arrow_short_schema +show long_schema~arrow_long_schema +show float_schema~arrow_float_schema +show str_schema~arrow_str_schema +show time_schema~arrow_time_schema +show extra_schema~arrow_extra_schema +show other_schema~arrow_other_schema + +// Read the array data back and compare +arrow_short_data:.arrowkdb.ipc.readArrowData[arrow_short;options]; +arrow_long_data:.arrowkdb.ipc.readArrowData[arrow_long;options]; +arrow_float_data:.arrowkdb.ipc.readArrowData[arrow_float;options]; +arrow_str_data:.arrowkdb.ipc.readArrowData[arrow_str;options]; +arrow_time_data:.arrowkdb.ipc.readArrowData[arrow_time;options]; +arrow_extra_data:.arrowkdb.ipc.readArrowData[arrow_extra;options]; +arrow_other_data:.arrowkdb.ipc.readArrowData[arrow_other;options]; + +arrow_str_data[3]:{0x0 sv x} each arrow_str_data[3] // Convert to GUIDs + +show short_data~arrow_short_data +show long_data~arrow_long_data +show float_data~arrow_float_data +show str_data~arrow_str_data +show time_data~arrow_time_data +show extra_data~arrow_extra_data +show other_data~arrow_other_data + +rm arrow_short; +rm arrow_long; +rm arrow_float; +rm arrow_str; +rm arrow_time; +rm arrow_extra; +rm arrow_other; + +//-----------------------------// +// Example-3. 
 Arrow IPC stream //
+//-----------------------------//
+
+// Serialize the schema and array data to an arrow stream
+serialized_short:.arrowkdb.ipc.serializeArrow[short_schema;short_data;options];
+serialized_long:.arrowkdb.ipc.serializeArrow[long_schema;long_data;options];
+serialized_float:.arrowkdb.ipc.serializeArrow[float_schema;float_data;options];
+serialized_str:.arrowkdb.ipc.serializeArrow[str_schema;str_data;options];
+serialized_time:.arrowkdb.ipc.serializeArrow[time_schema;time_data;options];
+serialized_extra:.arrowkdb.ipc.serializeArrow[extra_schema;extra_data;options];
+serialized_other:.arrowkdb.ipc.serializeArrow[other_schema;other_data;options];
+
+show serialized_short
+show serialized_long
+show serialized_float
+show serialized_str
+show serialized_time
+show serialized_extra
+show serialized_other
+
+// Parse the schema back and compare
+stream_short_schema:.arrowkdb.ipc.parseArrowSchema[serialized_short];
+stream_long_schema:.arrowkdb.ipc.parseArrowSchema[serialized_long];
+stream_float_schema:.arrowkdb.ipc.parseArrowSchema[serialized_float];
+stream_str_schema:.arrowkdb.ipc.parseArrowSchema[serialized_str];
+stream_time_schema:.arrowkdb.ipc.parseArrowSchema[serialized_time];
+stream_extra_schema:.arrowkdb.ipc.parseArrowSchema[serialized_extra];
+stream_other_schema:.arrowkdb.ipc.parseArrowSchema[serialized_other];
+
+show .arrowkdb.sc.equalSchemas[short_schema;stream_short_schema]
+show .arrowkdb.sc.equalSchemas[long_schema;stream_long_schema]
+show .arrowkdb.sc.equalSchemas[float_schema;stream_float_schema]
+show .arrowkdb.sc.equalSchemas[str_schema;stream_str_schema]
+show .arrowkdb.sc.equalSchemas[time_schema;stream_time_schema]
+show .arrowkdb.sc.equalSchemas[extra_schema;stream_extra_schema]
+show .arrowkdb.sc.equalSchemas[other_schema;stream_other_schema]
+
+show short_schema~stream_short_schema
+show long_schema~stream_long_schema
+show float_schema~stream_float_schema
+show str_schema~stream_str_schema
+show time_schema~stream_time_schema
+show extra_schema~stream_extra_schema
+show other_schema~stream_other_schema
+
+// Parse the array data back and compare
+stream_short_data:.arrowkdb.ipc.parseArrowData[serialized_short;options];
+stream_long_data:.arrowkdb.ipc.parseArrowData[serialized_long;options];
+stream_float_data:.arrowkdb.ipc.parseArrowData[serialized_float;options];
+stream_str_data:.arrowkdb.ipc.parseArrowData[serialized_str;options];
+stream_time_data:.arrowkdb.ipc.parseArrowData[serialized_time;options];
+stream_extra_data:.arrowkdb.ipc.parseArrowData[serialized_extra;options];
+stream_other_data:.arrowkdb.ipc.parseArrowData[serialized_other;options];
+
+stream_str_data[3]:{0x0 sv x} each stream_str_data[3] // Convert to GUIDs
+
+show short_data~stream_short_data
+show long_data~stream_long_data
+show float_data~stream_float_data
+show str_data~stream_str_data
+show time_data~stream_time_data
+show extra_data~stream_extra_data
+show other_data~stream_other_data
+
+
+-1 "\n+----------------------------------------+\n";
+
+// Process off
+exit 0;
diff --git a/q/arrowkdb.q b/q/arrowkdb.q
index 8fd01f4..b3d56cd 100644
--- a/q/arrowkdb.q
+++ b/q/arrowkdb.q
@@ -112,11 +112,25 @@ pq.writeParquet:`arrowkdb 2:(`writeParquet;4);
 pq.writeParquetFromTable:{[filename;table;options] pq.writeParquet[filename;sc.inferSchema[table];value flip table;options]};
 pq.readParquetSchema:`arrowkdb 2:(`readParquetSchema;1);
 pq.readParquetData:`arrowkdb 2:(`readParquetData;2);
-pq.readParquetToTable:{[filename;options] flip (fd.fieldName each
sc.schemaFields[pq.readParquetSchema[filename]])!(pq.readParquetData[filename;options])}; +pq.readParquetToTable:{[filename;options] + fields:fd.fieldName each sc.schemaFields[pq.readParquetSchema[filename]]; + data:pq.readParquetData[filename;options]; + $[1~options`WITH_NULL_BITMAP; + (flip fields!first data;flip fields!last data); + flip fields!data + ] + }; pq.readParquetColumn:`arrowkdb 2:(`readParquetColumn;3); pq.readParquetNumRowGroups:`arrowkdb 2:(`readParquetNumRowGroups;1); pq.readParquetRowGroups:`arrowkdb 2:(`readParquetRowGroups;4); -pq.readParquetRowGroupsToTable:{[filename;row_groups;columns;options] flip (fd.fieldName each sc.schemaFields[pq.readParquetSchema[filename]](columns))!(pq.readParquetRowGroups[filename;row_groups;columns;options])}; +pq.readParquetRowGroupsToTable:{[filename;row_groups;columns;options] + fields:fd.fieldName each sc.schemaFields[pq.readParquetSchema[filename]](columns); + data:pq.readParquetRowGroups[filename;row_groups;columns;options]; + $[1~options`WITH_NULL_BITMAP; + (flip fields!first data;flip fields!last data); + flip fields!data + ] + }; // arrow files @@ -124,7 +138,14 @@ ipc.writeArrow:`arrowkdb 2:(`writeArrow;4); ipc.writeArrowFromTable:{[filename;table;options] ipc.writeArrow[filename;sc.inferSchema[table];value flip table;options]}; ipc.readArrowSchema:`arrowkdb 2:(`readArrowSchema;1); ipc.readArrowData:`arrowkdb 2:(`readArrowData;2); -ipc.readArrowToTable:{[filename;options] flip (fd.fieldName each sc.schemaFields[ipc.readArrowSchema[filename]])!(ipc.readArrowData[filename;options])}; +ipc.readArrowToTable:{[filename;options] + fields:fd.fieldName each sc.schemaFields[ipc.readArrowSchema[filename]]; + data:ipc.readArrowData[filename;options]; + $[1~options`WITH_NULL_BITMAP; + (flip fields!first data;flip fields!last data); + flip fields!data + ] + }; // arrow streams @@ -132,7 +153,14 @@ ipc.serializeArrow:`arrowkdb 2:(`serializeArrow;3); ipc.serializeArrowFromTable:{[table;options] ipc.serializeArrow[sc.inferSchema[table];value flip table;options]}; ipc.parseArrowSchema:`arrowkdb 2:(`parseArrowSchema;1); ipc.parseArrowData:`arrowkdb 2:(`parseArrowData;2); -ipc.parseArrowToTable:{[serialized;options] flip (fd.fieldName each sc.schemaFields[ipc.parseArrowSchema[serialized]])!(ipc.parseArrowData[serialized;options])}; +ipc.parseArrowToTable:{[serialized;options] + fields:fd.fieldName each sc.schemaFields[ipc.parseArrowSchema[serialized]]; + data:ipc.parseArrowData[serialized;options]; + $[1~options`WITH_NULL_BITMAP; + (flip fields!first data;flip fields!last data); + flip fields!data + ] + }; // utils diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 0456986..353d709 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -15,31 +16,35 @@ #include "HelperFunctions.h" #include "TypeCheck.h" +using namespace std; +using namespace kx::arrowkdb; -namespace kx { -namespace arrowkdb { +namespace { + +typedef K(*ReadArrayCommon)(std::shared_ptr array_data, TypeMappingOverride& type_overrides); +typedef void(*AppendArrayCommon)(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides); // An arrow list array is a nested set of child lists. This is represented in // kdb as a mixed list for the parent list array containing a set of sub-lists, // one for each of the list value sets. 
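// For example (illustrative): an arrow list<int64> array holding
// [[1, 2], [3], []] is read into kdb as the mixed list
// (1 2;enlist 3;`long$()), one sub-list per list value set.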
template -void AppendList(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +void AppendList(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides, ReadArrayCommon read_array) { for (auto i = 0; i < array_data->length(); ++i) { // Slice the parent array to get the list value set at the specified index - auto value_slice = std::static_pointer_cast(array_data)->value_slice(i); + auto value_slice = static_pointer_cast(array_data)->value_slice(i); // Recursively populate the kdb parent mixed list from that slice - kK(k_array)[index++] = ReadArray(value_slice, type_overrides); + kK(k_array)[index++] = read_array(value_slice, type_overrides); } } // An arrow map array is a nested set of key/item paired child arrays. This is // represented in kdb as a mixed list for the parent map array, with a // dictionary for each map value set. -void AppendMap(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +void AppendMap(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides, ReadArrayCommon read_array) { - auto map_array = std::static_pointer_cast(array_data); + auto map_array = static_pointer_cast(array_data); auto keys = map_array->keys(); auto items = map_array->items(); for (auto i = 0; i < array_data->length(); ++i) { @@ -49,7 +54,7 @@ void AppendMap(std::shared_ptr array_data, K k_array, size_t& inde auto items_slice = items->Slice(map_array->value_offset(i), map_array->value_length(i)); // Recursively populate the kdb parent mixed list with a dictionary // populated from those slices - kK(k_array)[index++] = xD(ReadArray(keys_slice, type_overrides), ReadArray(items_slice, type_overrides)); + kK(k_array)[index++] = xD(read_array(keys_slice, type_overrides), read_array(items_slice, type_overrides)); } } @@ -58,16 +63,16 @@ void AppendMap(std::shared_ptr array_data, K k_array, size_t& inde // value is obtaining by slicing across all the child arrays at a given index. // This is represented in kdb as a mixed list for the parent struct array, // containing child lists for each field in the struct. -void AppendStruct(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +void AppendStruct(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides, AppendArrayCommon append_array) { - auto struct_array = std::static_pointer_cast(array_data); + auto struct_array = static_pointer_cast(array_data); auto num_fields = struct_array->type()->num_fields(); for (auto i = 0; i < num_fields; ++i) { auto field_array = struct_array->field(i); // Only advance the index into the kdb mixed list at the end once all child // lists have been populated from the same initial index auto temp_index = index; - AppendArray(field_array, kK(k_array)[i], temp_index, type_overrides); + append_array(field_array, kK(k_array)[i], temp_index, type_overrides); } index += array_data->length(); } @@ -75,9 +80,9 @@ void AppendStruct(std::shared_ptr array_data, K k_array, size_t& i // An arrow union array is similar to a struct array except that it has an // additional type id array which identifies the live field in each union value // set. 
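// For example (illustrative): a union array over fields (int64, utf8) with
// type ids 0 1 0 becomes a kdb mixed list whose first item is the type id
// list 0 1 0h, followed by one child list for each field.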
-void AppendUnion(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +void AppendUnion(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides, AppendArrayCommon append_array) { - auto union_array = std::static_pointer_cast(array_data); + auto union_array = static_pointer_cast(array_data); // The type_id array is represented as a KH list at the start of the parent mixed list. K type_ids = kK(k_array)[0]; @@ -91,339 +96,748 @@ void AppendUnion(std::shared_ptr array_data, K k_array, size_t& in // Only advance the index into the kdb mixed list at the end once all child // lists have been populated from the same initial index auto temp_index = index; - AppendArray(field_array, kK(k_array)[i + 1], temp_index, type_overrides); + append_array(field_array, kK(k_array)[i + 1], temp_index, type_overrides); } index += array_data->length(); } // An arrow dictionary array is represented in kdb as a mixed list for the // parent dictionary array containing the values and indicies sub-lists. -void AppendDictionary(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +void AppendDictionary(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides, ReadArrayCommon read_array) { - auto dictionary_array = std::static_pointer_cast(array_data); + auto dictionary_array = static_pointer_cast(array_data); // Append the dictionary and indicies arrays. Have to use a join since the // two child arrays could be a different length to each other and the parent // dictionary array which makes it difficult to preallocate the kdb lists of // the correct length. - K values = ReadArray(dictionary_array->dictionary(), type_overrides); + K values = read_array(dictionary_array->dictionary(), type_overrides); jv(&kK(k_array)[0], values); - K indices = ReadArray(dictionary_array->indices(), type_overrides); + K indices = read_array(dictionary_array->indices(), type_overrides); jv(&kK(k_array)[1], indices); } -void AppendArray(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides); + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + auto null_array = static_pointer_cast(array_data); + for (auto i = 0; i < null_array->length(); ++i) + kK(k_array)[index++] = knk(0); +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - switch (array_data->type_id()) { - case arrow::Type::NA: - { - auto null_array = std::static_pointer_cast(array_data); - for (auto i = 0; i < null_array->length(); ++i) - kK(k_array)[index++] = knk(0); - break; + auto bool_array = static_pointer_cast(array_data); + // BooleanArray doesn't have a bulk reader since arrow BooleanType is only 1 bit + for (auto i = 0; i < bool_array->length(); ++i){ + kG(k_array)[index++] = // preventing branch prediction failures + ( ( type_overrides.null_mapping.have_boolean && bool_array->IsNull( i ) ) * type_overrides.null_mapping.boolean_null ) + + ( !( type_overrides.null_mapping.have_boolean && bool_array->IsNull( i ) ) * bool_array->Value( i ) ); } - case arrow::Type::BOOL: - { - auto bool_array = std::static_pointer_cast(array_data); - // BooleanArray doesn't have a bulk reader since arrow BooleanType is only 1 bit - for (auto i = 0; i < bool_array->length(); 
++i) - kG(k_array)[index++] = bool_array->Value(i); - break; +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + auto uint8_array = static_pointer_cast(array_data); + if( type_overrides.null_mapping.have_uint8 && uint8_array->null_count() ){ + for( auto i = 0ll; i < uint8_array->length(); ++i ){ + kG( k_array )[i] = ( uint8_array->IsNull( i ) * type_overrides.null_mapping.uint8_null ) + + ( !uint8_array->IsNull( i ) * uint8_array->Value( i ) ); + } } - case arrow::Type::UINT8: - { - auto uint8_array = std::static_pointer_cast(array_data); + else { memcpy(kG(k_array), uint8_array->raw_values(), uint8_array->length() * sizeof(arrow::UInt8Array::value_type)); - break; } - case arrow::Type::INT8: - { - auto int8_array = std::static_pointer_cast(array_data); +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + auto int8_array = static_pointer_cast(array_data); + if( type_overrides.null_mapping.have_int8 && int8_array->null_count() ){ + for( auto i = 0ll; i < int8_array->length(); ++i ){ + kG( k_array )[i] = ( int8_array->IsNull( i ) * type_overrides.null_mapping.int8_null ) + + ( !int8_array->IsNull( i ) * int8_array->Value( i ) ); + } + } + else { memcpy(kG(k_array), int8_array->raw_values(), int8_array->length() * sizeof(arrow::Int8Array::value_type)); - break; } - case arrow::Type::UINT16: - { - auto uint16_array = std::static_pointer_cast(array_data); +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + auto uint16_array = static_pointer_cast(array_data); + if( type_overrides.null_mapping.have_uint16 && uint16_array->null_count() ){ + for( auto i = 0ll; i < uint16_array->length(); ++i ){ + kH( k_array )[i] = ( uint16_array->IsNull( i ) * type_overrides.null_mapping.uint16_null ) + + ( !uint16_array->IsNull( i ) * uint16_array->Value( i ) ); + } + } + else { memcpy(kH(k_array), uint16_array->raw_values(), uint16_array->length() * sizeof(arrow::UInt16Array::value_type)); - break; } - case arrow::Type::INT16: - { - auto int16_array = std::static_pointer_cast(array_data); +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + auto int16_array = static_pointer_cast(array_data); + if( type_overrides.null_mapping.have_int16 && int16_array->null_count() ){ + for( auto i = 0ll; i < int16_array->length(); ++i ){ + kH( k_array )[i] = ( int16_array->IsNull( i ) * type_overrides.null_mapping.int16_null ) + + ( !int16_array->IsNull( i ) * int16_array->Value( i ) ); + } + } + else { memcpy(kH(k_array), int16_array->raw_values(), int16_array->length() * sizeof(arrow::Int16Array::value_type)); - break; } - case arrow::Type::UINT32: - { - auto uint32_array = std::static_pointer_cast(array_data); +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + auto uint32_array = static_pointer_cast(array_data); + if( type_overrides.null_mapping.have_uint32 && uint32_array->null_count() ){ + for( auto i = 0ll; i < uint32_array->length(); ++i ){ + kI( k_array )[i] = ( uint32_array->IsNull( i ) * type_overrides.null_mapping.uint32_null ) + + ( !uint32_array->IsNull( i ) * uint32_array->Value( i ) ); + } + } + else { memcpy(kI(k_array), uint32_array->raw_values(), uint32_array->length() * sizeof(arrow::UInt32Array::value_type)); - break; } - case arrow::Type::INT32: 
- { - auto int32_array = std::static_pointer_cast(array_data); +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + auto int32_array = static_pointer_cast(array_data); + if( type_overrides.null_mapping.have_int32 && int32_array->null_count() ){ + for( auto i = 0ll; i < int32_array->length(); ++i ){ + kI( k_array )[i] = ( int32_array->IsNull( i ) * type_overrides.null_mapping.int32_null ) + + (!int32_array->IsNull( i ) * int32_array->Value( i ) ); + } + } + else { memcpy(kI(k_array), int32_array->raw_values(), int32_array->length() * sizeof(arrow::Int32Array::value_type)); - break; } - case arrow::Type::UINT64: - { - auto uint64_array = std::static_pointer_cast(array_data); +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + auto uint64_array = static_pointer_cast(array_data); + if( type_overrides.null_mapping.have_uint64 && uint64_array->null_count() ){ + for( auto i = 0ll; i < uint64_array->length(); ++i ){ + kJ( k_array )[i] = ( uint64_array->IsNull( i ) * type_overrides.null_mapping.uint64_null ) + + ( !uint64_array->IsNull( i ) * uint64_array->Value( i ) ); + } + } + else { memcpy(kJ(k_array), uint64_array->raw_values(), uint64_array->length() * sizeof(arrow::UInt64Array::value_type)); - break; } - case arrow::Type::INT64: - { - auto int64_array = std::static_pointer_cast(array_data); +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + auto int64_array = static_pointer_cast(array_data); + if( type_overrides.null_mapping.have_int64 && int64_array->null_count() ){ + for( auto i = 0ll; i < int64_array->length(); ++i ){ + kJ( k_array )[i] = ( int64_array->IsNull( i ) * type_overrides.null_mapping.int64_null ) + + (!int64_array->IsNull( i ) * int64_array->Value( i ) ); + } + } + else { memcpy(kJ(k_array), int64_array->raw_values(), int64_array->length() * sizeof(arrow::Int64Array::value_type)); - break; } - case arrow::Type::HALF_FLOAT: - { - auto hfl_array = std::static_pointer_cast(array_data); +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + auto hfl_array = static_pointer_cast(array_data); + if( type_overrides.null_mapping.have_float16 && hfl_array->null_count() ){ + for( auto i = 0ll; i < hfl_array->length(); ++i ){ + kH( k_array )[i] = ( hfl_array->IsNull( i ) * type_overrides.null_mapping.float16_null ) + + ( !hfl_array->IsNull( i ) * hfl_array->Value( i ) ); + } + } + else { memcpy(kH(k_array), hfl_array->raw_values(), hfl_array->length() * sizeof(arrow::HalfFloatArray::value_type)); - break; } - case arrow::Type::FLOAT: - { - auto fl_array = std::static_pointer_cast(array_data); +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + auto fl_array = static_pointer_cast(array_data); + if( type_overrides.null_mapping.have_float32 && fl_array->null_count() ){ + for( auto i = 0ll; i < fl_array->length(); ++i ){ + kE( k_array )[i] = ( fl_array->IsNull( i ) * type_overrides.null_mapping.float32_null ) + + ( !fl_array->IsNull( i ) * fl_array->Value( i ) ); + } + } + else { memcpy(kE(k_array), fl_array->raw_values(), fl_array->length() * sizeof(arrow::FloatArray::value_type)); - break; } - case arrow::Type::DOUBLE: - { - auto dbl_array = std::static_pointer_cast(array_data); +} + +template<> +void AppendArray(shared_ptr 
array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + auto dbl_array = static_pointer_cast(array_data); + if( type_overrides.null_mapping.have_float64 && dbl_array->null_count() ){ + for( auto i = 0ll; i < dbl_array->length(); ++i ){ + kF( k_array )[i] = ( dbl_array->IsNull( i ) * type_overrides.null_mapping.float64_null ) + + ( !dbl_array->IsNull( i ) * dbl_array->Value( i ) ); + } + } + else { memcpy(kF(k_array), dbl_array->raw_values(), dbl_array->length() * sizeof(arrow::DoubleArray::value_type)); - break; } - case arrow::Type::STRING: - { - auto str_array = std::static_pointer_cast(array_data); - for (auto i = 0; i < str_array->length(); ++i) { +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + auto str_array = static_pointer_cast(array_data); + for (auto i = 0; i < str_array->length(); ++i) { + K k_str = nullptr; + if( type_overrides.null_mapping.have_string && str_array->IsNull( i ) ){ + k_str = ktn( KC, type_overrides.null_mapping.string_null.length() ); + memcpy( kG(k_str), type_overrides.null_mapping.string_null.data(), type_overrides.null_mapping.string_null.length() ); + } + else{ auto str_data = str_array->GetString(i); - K k_str = ktn(KC, str_data.length()); - memcpy(kG(k_str), str_data.data(), str_data.length()); - kK(k_array)[index++] = k_str; + k_str = ktn(KC, str_data.length()); + memcpy(kG( k_str ), str_data.data(), str_data.length()); } - break; + kK( k_array )[index++] = k_str; } - case arrow::Type::LARGE_STRING: - { - auto str_array = std::static_pointer_cast(array_data); - for (auto i = 0; i < str_array->length(); ++i) { +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + auto str_array = static_pointer_cast(array_data); + for (auto i = 0; i < str_array->length(); ++i) { + K k_str = nullptr; + if( type_overrides.null_mapping.have_large_string && str_array->IsNull( i ) ){ + k_str = ktn( KC, type_overrides.null_mapping.large_string_null.length() ); + memcpy( kG( k_str ), type_overrides.null_mapping.large_string_null.data(), type_overrides.null_mapping.large_string_null.length() ); + } + else{ auto str_data = str_array->GetString(i); - K k_str = ktn(KC, str_data.length()); + k_str = ktn(KC, str_data.length()); memcpy(kG(k_str), str_data.data(), str_data.length()); - kK(k_array)[index++] = k_str; } - break; + kK( k_array )[index++] = k_str; } - case arrow::Type::BINARY: - { - auto bin_array = std::static_pointer_cast(array_data); - for (auto i = 0; i < bin_array->length(); ++i) { +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + auto bin_array = static_pointer_cast(array_data); + for (auto i = 0; i < bin_array->length(); ++i) { + K k_bin = nullptr; + if( type_overrides.null_mapping.have_binary && bin_array->IsNull( i ) ){ + k_bin = ktn( KG, type_overrides.null_mapping.binary_null.length() ); + memcpy( kG( k_bin ), type_overrides.null_mapping.binary_null.data(), type_overrides.null_mapping.binary_null.length() ); + } + else{ auto bin_data = bin_array->GetString(i); - K k_bin = ktn(KG, bin_data.length()); + k_bin = ktn(KG, bin_data.length()); memcpy(kG(k_bin), bin_data.data(), bin_data.length()); - kK(k_array)[index++] = k_bin; } - break; + kK(k_array)[index++] = k_bin; } - case arrow::Type::LARGE_BINARY: - { - auto bin_array = std::static_pointer_cast(array_data); - for (auto i = 0; i < bin_array->length(); ++i) { - auto 
bin_data = bin_array->GetString(i); - K k_bin = ktn(KG, bin_data.length()); - memcpy(kG(k_bin), bin_data.data(), bin_data.length()); - kK(k_array)[index++] = k_bin; +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + auto bin_array = static_pointer_cast(array_data); + for (auto i = 0; i < bin_array->length(); ++i) { + K k_bin = nullptr; + if( type_overrides.null_mapping.have_large_binary && bin_array->IsNull( i ) ){ + k_bin = ktn( KG, type_overrides.null_mapping.large_binary_null.length() ); + memcpy( kG( k_bin ), type_overrides.null_mapping.large_binary_null.data(), type_overrides.null_mapping.large_binary_null.length() ); + } + else{ + auto bin_data = bin_array->GetString(i); + k_bin = ktn(KG, bin_data.length()); + memcpy(kG(k_bin), bin_data.data(), bin_data.length()); } - break; + kK(k_array)[index++] = k_bin; } - case arrow::Type::FIXED_SIZE_BINARY: - { - auto fixed_bin_array = std::static_pointer_cast(array_data); - for (auto i = 0; i < fixed_bin_array->length(); ++i) { +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + auto fixed_bin_array = static_pointer_cast(array_data); + for (auto i = 0; i < fixed_bin_array->length(); ++i) { + K k_bin = nullptr; + if( type_overrides.null_mapping.have_fixed_binary && fixed_bin_array->IsNull( i ) ){ + k_bin = ktn( KG, type_overrides.null_mapping.fixed_binary_null.length() ); + memcpy( kG( k_bin ), type_overrides.null_mapping.fixed_binary_null.data(), type_overrides.null_mapping.fixed_binary_null.length() ); + } + else{ auto bin_data = fixed_bin_array->GetString(i); - K k_bin = ktn(KG, bin_data.length()); + k_bin = ktn(KG, bin_data.length()); memcpy(kG(k_bin), bin_data.data(), bin_data.length()); - kK(k_array)[index++] = k_bin; - } - break; - } - case arrow::Type::DATE32: - { - TemporalConversion tc(array_data->type()); - auto d32_array = std::static_pointer_cast(array_data); - for (auto i = 0; i < d32_array->length(); ++i) - kI(k_array)[index++] = tc.ArrowToKdb(d32_array->Value(i)); - break; - } - case arrow::Type::DATE64: - { - TemporalConversion tc(array_data->type()); - auto d64_array = std::static_pointer_cast(array_data); - for (auto i = 0; i < d64_array->length(); ++i) - kJ(k_array)[index++] = tc.ArrowToKdb(d64_array->Value(i)); - break; - } - case arrow::Type::TIMESTAMP: - { - TemporalConversion tc(array_data->type()); - auto ts_array = std::static_pointer_cast(array_data); - auto timestamp_type = std::static_pointer_cast(ts_array->type()); - for (auto i = 0; i < ts_array->length(); ++i) - kJ(k_array)[index++] = tc.ArrowToKdb(ts_array->Value(i)); - break; - } - case arrow::Type::TIME32: - { - TemporalConversion tc(array_data->type()); - auto t32_array = std::static_pointer_cast(array_data); - auto time32_type = std::static_pointer_cast(t32_array->type()); - for (auto i = 0; i < t32_array->length(); ++i) - kI(k_array)[index++] = tc.ArrowToKdb(t32_array->Value(i)); - break; - } - case arrow::Type::TIME64: - { - TemporalConversion tc(array_data->type()); - auto t64_array = std::static_pointer_cast(array_data); - auto time64_type = std::static_pointer_cast(t64_array->type()); - for (auto i = 0; i < t64_array->length(); ++i) - kJ(k_array)[index++] = tc.ArrowToKdb(t64_array->Value(i)); - break; - } - case arrow::Type::DECIMAL: - { - auto dec_array = std::static_pointer_cast(array_data); - auto dec_type = std::static_pointer_cast(dec_array->type()); - for (auto i = 0; i < dec_array->length(); ++i) { - 
auto decimal = arrow::Decimal128(dec_array->Value(i)); - if (type_overrides.decimal128_as_double) { - // Convert the decimal to a double - auto dec_as_double = decimal.ToDouble(dec_type->scale()); - kF(k_array)[index++] = dec_as_double; - } else { - // Each decimal is a list of 16 bytes - K k_dec = ktn(KG, 16); - decimal.ToBytes(kG(k_dec)); - kK(k_array)[index++] = k_dec; - } } - break; - } - case arrow::Type::DURATION: - { - TemporalConversion tc(array_data->type()); - auto dur_array = std::static_pointer_cast(array_data); - auto duration_type = std::static_pointer_cast(dur_array->type()); - for (auto i = 0; i < dur_array->length(); ++i) - kJ(k_array)[index++] = tc.ArrowToKdb(dur_array->Value(i)); - break; - } - case arrow::Type::INTERVAL_MONTHS: - { - auto month_array = std::static_pointer_cast(array_data); - memcpy(kI(k_array), month_array->raw_values(), month_array->length() * sizeof(arrow::MonthIntervalArray::value_type)); - break; - } - case arrow::Type::INTERVAL_DAY_TIME: - { - auto dt_array = std::static_pointer_cast(array_data); - for (auto i = 0; i < dt_array->length(); ++i) - kJ(k_array)[index++] = DayTimeInterval_KTimespan(dt_array->Value(i)); - break; - } - case arrow::Type::LIST: - AppendList(array_data, k_array, index, type_overrides); - break; - case arrow::Type::LARGE_LIST: - AppendList(array_data, k_array, index, type_overrides); - break; - case arrow::Type::FIXED_SIZE_LIST: - AppendList(array_data, k_array, index, type_overrides); - break; - case arrow::Type::MAP: - AppendMap(array_data, k_array, index, type_overrides); - break; - case arrow::Type::STRUCT: - AppendStruct(array_data, k_array, index, type_overrides); - break; - case arrow::Type::SPARSE_UNION: - case arrow::Type::DENSE_UNION: - AppendUnion(array_data, k_array, index, type_overrides); - break; - case arrow::Type::DICTIONARY: - AppendDictionary(array_data, k_array, index, type_overrides); - break; - default: - TYPE_CHECK_UNSUPPORTED(array_data->type()->ToString()); + kK(k_array)[index++] = k_bin; } } -K InitKdbForArray(std::shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - switch (datatype->id()) { - case arrow::Type::STRUCT: - { - // Arrow struct becomes a mixed list of lists so create necessary lists - auto num_fields = datatype->num_fields(); - K result = knk(num_fields); - for (auto i = 0; i < num_fields; ++i) { - auto field = datatype->field(i); - kK(result)[i] = InitKdbForArray(field->type(), length, type_overrides); + TemporalConversion tc(array_data->type()); + auto d32_array = static_pointer_cast(array_data); + for (auto i = 0; i < d32_array->length(); ++i){ + kI( k_array )[index++] = + ( ( type_overrides.null_mapping.have_date32 && d32_array->IsNull( i ) ) * type_overrides.null_mapping.date32_null ) + + ( !( type_overrides.null_mapping.have_date32 && d32_array->IsNull( i ) ) * tc.ArrowToKdb( d32_array->Value( i ) ) ); + } +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + TemporalConversion tc(array_data->type()); + auto d64_array = static_pointer_cast(array_data); + for (auto i = 0; i < d64_array->length(); ++i){ + kJ( k_array )[index++] = + ( ( type_overrides.null_mapping.have_date64 && d64_array->IsNull( i ) ) * type_overrides.null_mapping.date64_null ) + + ( !( type_overrides.null_mapping.have_date64 && d64_array->IsNull( i ) ) * tc.ArrowToKdb( d64_array->Value( i ) ) ); + } +} + 
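+// The temporal specializations below use the same branchless select as the
+// scalar handlers above: with IsNull(i) evaluating to 0 or 1,
+//   (is_null * null_override) + (!is_null * value)
+// yields the configured null-mapping value for null elements and the raw
+// arrow value otherwise, avoiding a per-element branch.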
+template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + TemporalConversion tc(array_data->type()); + auto ts_array = static_pointer_cast(array_data); + auto timestamp_type = static_pointer_cast(ts_array->type()); + for (auto i = 0; i < ts_array->length(); ++i){ + kJ( k_array )[index++] = + ( ( type_overrides.null_mapping.have_timestamp && ts_array->IsNull( i ) ) * type_overrides.null_mapping.timestamp_null ) + + ( !( type_overrides.null_mapping.have_timestamp && ts_array->IsNull( i ) ) * tc.ArrowToKdb( ts_array->Value( i ) ) ); + } +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + TemporalConversion tc(array_data->type()); + auto t32_array = static_pointer_cast(array_data); + auto time32_type = static_pointer_cast(t32_array->type()); + for (auto i = 0; i < t32_array->length(); ++i){ + kI( k_array )[index++] = + ( ( type_overrides.null_mapping.have_time32 && t32_array->IsNull( i ) ) * type_overrides.null_mapping.time32_null ) + + ( !( type_overrides.null_mapping.have_time32 && t32_array->IsNull( i ) ) * tc.ArrowToKdb( t32_array->Value( i ) ) ); + } +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + TemporalConversion tc(array_data->type()); + auto t64_array = static_pointer_cast(array_data); + auto time64_type = static_pointer_cast(t64_array->type()); + for (auto i = 0; i < t64_array->length(); ++i){ + kJ( k_array )[index++] = + ( ( type_overrides.null_mapping.have_time64 && t64_array->IsNull( i ) ) * type_overrides.null_mapping.time64_null ) + + ( !( type_overrides.null_mapping.have_time64 && t64_array->IsNull( i ) ) * tc.ArrowToKdb( t64_array->Value( i ) ) ); + } +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + auto dec_array = static_pointer_cast(array_data); + auto dec_type = static_pointer_cast(dec_array->type()); + for (auto i = 0; i < dec_array->length(); ++i) { + auto decimal = arrow::Decimal128(dec_array->Value(i)); + if (type_overrides.decimal128_as_double) { + // Convert the decimal to a double + auto dec_as_double = + ( ( type_overrides.null_mapping.have_decimal && dec_array->IsNull( i ) ) * type_overrides.null_mapping.decimal_null ) + + ( !( type_overrides.null_mapping.have_decimal && dec_array->IsNull( i ) ) * decimal.ToDouble( dec_type->scale() ) ); + + kF(k_array)[index++] = dec_as_double; + } else { + // Each decimal is a list of 16 bytes + K k_dec = ktn(KG, 16); + decimal.ToBytes(kG(k_dec)); + kK(k_array)[index++] = k_dec; } - return result; - } - case arrow::Type::SPARSE_UNION: - case arrow::Type::DENSE_UNION: - { - // Arrow union becomes a mixed list of type_id list plus the child lists - auto num_fields = datatype->num_fields(); - K result = knk(num_fields + 1); - kK(result)[0] = ktn(KH, length); // type_id list - for (auto i = 0; i < num_fields; ++i) { - auto field = datatype->field(i); - kK(result)[i + 1] = InitKdbForArray(field->type(), length, type_overrides); + } +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + TemporalConversion tc(array_data->type()); + auto dur_array = static_pointer_cast(array_data); + auto duration_type = static_pointer_cast(dur_array->type()); + for (auto i = 0; i < dur_array->length(); ++i){ + kJ( k_array )[index++] = + ( ( type_overrides.null_mapping.have_duration && 
dur_array->IsNull( i ) ) * type_overrides.null_mapping.duration_null ) + + ( !( type_overrides.null_mapping.have_duration && dur_array->IsNull( i ) ) * tc.ArrowToKdb( dur_array->Value( i ) ) ); + } +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + auto month_array = static_pointer_cast(array_data); + if( type_overrides.null_mapping.have_month_interval && month_array->null_count() ){ + for( auto i = 0ll; i < month_array->length(); ++i ){ + kI( k_array )[i] = ( month_array->IsNull( i ) * type_overrides.null_mapping.month_interval_null ) + + ( !month_array->IsNull( i ) * month_array->Value( i ) ); } - return result; } - case arrow::Type::DICTIONARY: - { - // Arrow dictionary becomes a two item mixed list - auto dictionary_type = std::static_pointer_cast(datatype); - K result = ktn(0, 2); + else { + memcpy(kI(k_array), month_array->raw_values(), month_array->length() * sizeof(arrow::MonthIntervalArray::value_type)); + } +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + auto dt_array = static_pointer_cast(array_data); + for (auto i = 0; i < dt_array->length(); ++i){ + kJ( k_array )[index++] = + ( ( type_overrides.null_mapping.have_day_time_interval && dt_array->IsNull( i ) ) * type_overrides.null_mapping.day_time_interval_null ) + + ( !( type_overrides.null_mapping.have_day_time_interval && dt_array->IsNull( i ) ) * DayTimeInterval_KTimespan( dt_array->Value( i ) ) ); + } +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + AppendList(array_data, k_array, index, type_overrides, kx::arrowkdb::ReadArray); +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + AppendList(array_data, k_array, index, type_overrides, kx::arrowkdb::ReadArray); +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + AppendList(array_data, k_array, index, type_overrides, kx::arrowkdb::ReadArray); +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + AppendMap(array_data, k_array, index, type_overrides, kx::arrowkdb::ReadArray); +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + AppendStruct(array_data, k_array, index, type_overrides, kx::arrowkdb::AppendArray); +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + AppendUnion(array_data, k_array, index, type_overrides, kx::arrowkdb::AppendArray); +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + AppendArray(array_data, k_array, index, type_overrides); +} + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + AppendDictionary(array_data, k_array, index, type_overrides, kx::arrowkdb::ReadArray); +} + +using ArrayHandler = void (*) (shared_ptr, K, size_t&, TypeMappingOverride&); + +template +auto make_array_handler() +{ + return make_pair( TypeId, &AppendArray ); +} + +unordered_map ArrayHandlers { + make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , 
make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() +}; + +using NullBitmapHandler = void ( * )(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides); + +extern unordered_map NullBitmapHandlers; + +template +void AppendArrayNullBitmap(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides); + +template<> +void AppendArrayNullBitmap(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + AppendList(array_data, k_array, index, type_overrides, kx::arrowkdb::ReadArrayNullBitmap); +} + +template<> +void AppendArrayNullBitmap(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + AppendList(array_data, k_array, index, type_overrides, kx::arrowkdb::ReadArrayNullBitmap); +} + +template<> +void AppendArrayNullBitmap(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + AppendList(array_data, k_array, index, type_overrides, kx::arrowkdb::ReadArrayNullBitmap); +} + +template<> +void AppendArrayNullBitmap(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + AppendMap(array_data, k_array, index, type_overrides, kx::arrowkdb::ReadArrayNullBitmap); +} + +template<> +void AppendArrayNullBitmap(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + AppendStruct(array_data, k_array, index, type_overrides, kx::arrowkdb::AppendArrayNullBitmap); +} + +template<> +void AppendArrayNullBitmap(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + AppendUnion(array_data, k_array, index, type_overrides, kx::arrowkdb::AppendArrayNullBitmap); +} + +template<> +void AppendArrayNullBitmap(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + AppendArrayNullBitmap(array_data, k_array, index, type_overrides); +} + +template<> +void AppendArrayNullBitmap(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + AppendDictionary(array_data, k_array, index, type_overrides, kx::arrowkdb::ReadArrayNullBitmap); +} + +template +auto make_append_array_null_bitmap_handler() +{ + return make_pair( TypeId, &AppendArrayNullBitmap ); +} + +unordered_map NullBitmapHandlers{ + make_append_array_null_bitmap_handler() + , make_append_array_null_bitmap_handler() + , make_append_array_null_bitmap_handler() + , make_append_array_null_bitmap_handler() + , make_append_array_null_bitmap_handler() + , make_append_array_null_bitmap_handler() + , make_append_array_null_bitmap_handler() + , make_append_array_null_bitmap_handler() +}; + +typedef K(*InitKdbForArrayHandler)(shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides, GetKdbTypeCommon get_kdb_type); + +extern unordered_map InitKdbForArrayHandlers; - // Do not preallocate 
the child lists since AppendDictionary has to join to the - // indicies and values lists - kK(result)[0] = InitKdbForArray(dictionary_type->value_type(), 0, type_overrides); - kK(result)[1] = InitKdbForArray(dictionary_type->index_type(), 0, type_overrides); +template +K InitKdbForArray(shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides, GetKdbTypeCommon get_kdb_type); - return result; +template<> +K InitKdbForArray(shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides, GetKdbTypeCommon get_kdb_type) +{ + // Arrow struct becomes a mixed list of lists so create necessary lists + auto num_fields = datatype->num_fields(); + K result = knk(num_fields); + for (auto i = 0; i < num_fields; ++i) { + auto field = datatype->field(i); + kK(result)[i] = InitKdbForArray(field->type(), length, type_overrides, get_kdb_type); } - default: - return ktn(GetKdbType(datatype, type_overrides), length); + return result; +} + +template<> +K InitKdbForArray(shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides, GetKdbTypeCommon get_kdb_type) +{ + // Arrow union becomes a mixed list of type_id list plus the child lists + auto num_fields = datatype->num_fields(); + K result = knk(num_fields + 1); + kK(result)[0] = ktn(KH, length); // type_id list + for (auto i = 0; i < num_fields; ++i) { + auto field = datatype->field(i); + kK(result)[i + 1] = InitKdbForArray(field->type(), length, type_overrides, get_kdb_type); } + return result; +} + +template<> +K InitKdbForArray(shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides, GetKdbTypeCommon get_kdb_type) +{ + return InitKdbForArray(datatype, length, type_overrides, get_kdb_type); +} + +template<> +K InitKdbForArray(shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides, GetKdbTypeCommon get_kdb_type) +{ + // Arrow dictionary becomes a two item mixed list + auto dictionary_type = static_pointer_cast(datatype); + K result = ktn(0, 2); + + // Do not preallocate the child lists since AppendDictionary has to join to the + // indicies and values lists + kK(result)[0] = InitKdbForArray(dictionary_type->value_type(), 0, type_overrides, get_kdb_type); + kK(result)[1] = InitKdbForArray(dictionary_type->index_type(), 0, type_overrides, get_kdb_type); + + return result; } -K ReadArray(std::shared_ptr array, TypeMappingOverride& type_overrides) +template +auto make_init_kdb_for_array_handler() { - K k_array = InitKdbForArray(array->type(), array->length(), type_overrides); + return make_pair(TypeId, &InitKdbForArray); +} + +unordered_map InitKdbForArrayHandlers{ + make_init_kdb_for_array_handler() + , make_init_kdb_for_array_handler() + , make_init_kdb_for_array_handler() + , make_init_kdb_for_array_handler() +}; + + +} // namespace + +namespace kx { +namespace arrowkdb { + +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + auto lookup = ArrayHandlers.find(array_data->type_id()); + if (lookup == ArrayHandlers.end()) { + TYPE_CHECK_UNSUPPORTED(array_data->type()->ToString()); + } else { + lookup->second(array_data, k_array, index, type_overrides); + } +} + +void AppendArrayNullBitmap(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + auto lookup = NullBitmapHandlers.find(array_data->type_id()); + if (lookup == NullBitmapHandlers.end()) { + for (int i = 0ll; i < array_data->length(); ++i) + kG(k_array)[index++] = array_data->IsNull(i); + } else { + lookup->second(array_data, k_array, index, 
type_overrides); + } +} + +KdbType GetKdbTypeNullBitmap(std::shared_ptr datatype, TypeMappingOverride& type_overrides) +{ + if (NullBitmapHandlers.find(datatype->id()) == NullBitmapHandlers.end()) + return KB; + else + return 0; +} + +K InitKdbForArray(shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides, GetKdbTypeCommon get_kdb_type) +{ + auto lookup = InitKdbForArrayHandlers.find(datatype->id()); + if (lookup != InitKdbForArrayHandlers.end()) { + return lookup->second(datatype, length, type_overrides, get_kdb_type); + } else { + return ktn(get_kdb_type(datatype, type_overrides), length); + } +} + +K ReadArray(shared_ptr array, TypeMappingOverride& type_overrides) +{ + K k_array = InitKdbForArray(array->type(), array->length(), type_overrides, GetKdbType); size_t index = 0; AppendArray(array, k_array, index, type_overrides); return k_array; } -K ReadChunkedArray(std::shared_ptr chunked_array, TypeMappingOverride& type_overrides) +K ReadArrayNullBitmap(shared_ptr array, TypeMappingOverride& type_overrides) +{ + K k_array = InitKdbForArray(array->type(), array->length(), type_overrides, GetKdbTypeNullBitmap); + size_t index = 0; + AppendArrayNullBitmap(array, k_array, index, type_overrides); + return k_array; +} + +K ReadChunkedArray(shared_ptr chunked_array, TypeMappingOverride& type_overrides) { - K k_array = InitKdbForArray(chunked_array->type(), chunked_array->length(), type_overrides); + K k_array = InitKdbForArray(chunked_array->type(), chunked_array->length(), type_overrides, GetKdbType); size_t index = 0; for (auto j = 0; j < chunked_array->num_chunks(); ++j) AppendArray(chunked_array->chunk(j), k_array, index, type_overrides); return k_array; } +K ReadChunkedArrayNullBitmap(shared_ptr chunked_array, TypeMappingOverride& type_overrides) +{ + K k_array = InitKdbForArray(chunked_array->type(), chunked_array->length(), type_overrides, GetKdbTypeNullBitmap); + size_t index = 0; + for (auto j = 0; j < chunked_array->num_chunks(); ++j) + AppendArrayNullBitmap(chunked_array->chunk(j), k_array, index, type_overrides); + return k_array; +} + } // namespace arrowkdb } // namspace kx @@ -435,19 +849,19 @@ K writeReadArray(K datatype_id, K array, K options) if (datatype_id->t != -KI) return krr((S)"datatype_id not -6h"); - auto datatype = kx::arrowkdb::GetDatatypeStore()->Find(datatype_id->i); + auto datatype = GetDatatypeStore()->Find(datatype_id->i); if (!datatype) return krr((S)"datatype not found"); // Parse the options - auto read_options = kx::arrowkdb::KdbOptions(options, kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); + auto read_options = KdbOptions(options, Options::string_options, Options::int_options); // Type mapping overrides - kx::arrowkdb::TypeMappingOverride type_overrides{ read_options }; + TypeMappingOverride type_overrides{ read_options }; - auto arrow_array = kx::arrowkdb::MakeArray(datatype, array, type_overrides); + auto arrow_array = MakeArray(datatype, array, type_overrides); - return kx::arrowkdb::ReadArray(arrow_array, type_overrides); + return ReadArray(arrow_array, type_overrides); KDB_EXCEPTION_CATCH; -} \ No newline at end of file +} diff --git a/src/ArrayReader.h b/src/ArrayReader.h index 3298190..8a0c4f2 100644 --- a/src/ArrayReader.h +++ b/src/ArrayReader.h @@ -21,18 +21,22 @@ namespace arrowkdb { * list needs to have been created with the correct length by the calling * function. 
 * @param index The index into the kdb list at which the appending should
 * begin. Index will be updated to account for the new offset by adding the
 * length of the array array.
+ * @param type_overrides Overrides for type mappings configured by KdbOptions
 */
void AppendArray(std::shared_ptr<arrow::Array> array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides);
+void AppendArrayNullBitmap(std::shared_ptr<arrow::Array> array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides);

/**
 * @brief Copies and converts an arrow array to a kdb list
 *
 * @param array The arrow array to be converted
+ * @param type_overrides Overrides for type mappings configured by KdbOptions
 * @return A kdb list represented the arrow array
 */
K ReadArray(std::shared_ptr<arrow::Array> array, TypeMappingOverride& type_overrides);
+K ReadArrayNullBitmap(std::shared_ptr<arrow::Array> array, TypeMappingOverride& type_overrides);

/**
 * @brief An arrow chunked array is a set of sub-arrays which are logically but not
@@ -41,10 +45,20 @@ K ReadArray(std::shared_ptr<arrow::Array> array, TypeMappingOverride& type_overr
 * into the list.
 *
 * @param chunked_array The chunked array to be converted
+ * @param type_overrides Overrides for type mappings configured by KdbOptions
 * @return A kdb list representing the chunked array
 */
K ReadChunkedArray(std::shared_ptr<arrow::ChunkedArray> chunked_array, TypeMappingOverride& type_overrides);

+/**
+ * @brief Extracts the null bitmap of an arrow array into a boolean kdb list
+ *
+ * @param chunked_array The chunked array to be converted
+ * @param type_overrides Overrides for type mappings configured by KdbOptions
+ * @return A kdb list representing the null bitmap
+ */
+K ReadChunkedArrayNullBitmap( std::shared_ptr<arrow::ChunkedArray> chunked_array, TypeMappingOverride& type_overrides );
+
/**
 * @brief Creates a kdb list of the correct type and specified length according
 * to the arrow datatype.
For the arrow struct/union datatypes this includes @@ -52,9 +66,10 @@ K ReadChunkedArray(std::shared_ptr chunked_array, TypeMappi * * @param datatype The arrow datatype to be stored in the kdb list * @param length The required length of the kdb list + * @param type_overrides Overrides for type mappings configured by KdbOptions * @return Newly created kdb list */ -K InitKdbForArray(std::shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides); +K InitKdbForArray(std::shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides, GetKdbTypeCommon get_kdb_type); } // namespace arrowkdb } // namespace kx diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index 602c764..27d9b41 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -13,144 +14,337 @@ #include "HelperFunctions.h" #include "TypeCheck.h" +using namespace std; +using namespace kx::arrowkdb; -namespace kx { -namespace arrowkdb { +namespace +{ + +shared_ptr GetBuilder(shared_ptr datatype); + +template +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool); + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return make_shared(datatype, pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, 
arrow::MemoryPool* pool) +{ + return make_shared(datatype, pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return make_shared(datatype, pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return make_shared(datatype, pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return make_shared(datatype, pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return make_shared(datatype, pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + // The parent list datatype details the child datatype so construct the child + // builder and use it to initialise the parent list builder + auto list_type = static_pointer_cast(datatype); + auto value_builder = GetBuilder(list_type->value_type()); + + // Construct the correct listbuilder + if (datatype->id() == arrow::Type::LIST) + return make_shared(pool, value_builder); + else if (datatype->id() == arrow::Type::LARGE_LIST) + return make_shared(pool, value_builder); + else + return make_shared(pool, value_builder, datatype); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return GetBuilder( datatype, pool ); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return GetBuilder( datatype, pool ); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + // The parent map datatype details the key/item child datatypes so construct + // builders for both and use these to initialise the parent map builder + auto map_type = static_pointer_cast(datatype); + auto key_builder = GetBuilder(map_type->key_type()); + auto item_builder = GetBuilder(map_type->item_type()); + return make_shared(pool, key_builder, item_builder); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + auto struct_type = static_pointer_cast(datatype); + + // Iterate through all the fields in the struct constructing and adding each + // field's builder into a vector + auto fields = struct_type->fields(); + vector> field_builders; + for (auto field : fields) + field_builders.push_back(GetBuilder(field->type())); + + // Construct the parent struct builder from this vector of all the child + // builders + return make_shared(datatype, pool, field_builders); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + auto union_type = static_pointer_cast(datatype); + + // Iterate through all the fields in the union constructing and adding each + // field's builder into a vector + auto fields = union_type->fields(); + vector> field_builders; + for (auto field : fields) + field_builders.push_back(GetBuilder(field->type())); + + // Construct the parent union builder from this vector of all the child + // builders + if (datatype->id() == arrow::Type::SPARSE_UNION) + return make_shared(pool, field_builders, datatype); + else + return make_shared(pool, field_builders, datatype); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return GetBuilder( datatype, pool ); +} + +using BuilderHandler = shared_ptr ( * ) ( shared_ptr, 
arrow::MemoryPool* ); + +template +auto make_builder_handler() +{ + return make_pair( TypeId, &GetBuilder ); +} + +unordered_map BuilderHandlers { + make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() +}; // Constructs and returns the correct arrow array builder for the specified // datatype. // // This handles all datatypes except Dictionary which is handled separately. -std::shared_ptr GetBuilder(std::shared_ptr datatype) +shared_ptr GetBuilder(shared_ptr datatype) { + auto type_id = datatype->id(); arrow::MemoryPool* pool = arrow::default_memory_pool(); - switch (datatype->id()) { - case arrow::Type::NA: - return std::make_shared(pool); - case arrow::Type::BOOL: - return std::make_shared(pool); - case arrow::Type::UINT8: - return std::make_shared(pool); - case arrow::Type::INT8: - return std::make_shared(pool); - case arrow::Type::UINT16: - return std::make_shared(pool); - case arrow::Type::INT16: - return std::make_shared(pool); - case arrow::Type::UINT32: - return std::make_shared(pool); - case arrow::Type::INT32: - return std::make_shared(pool); - case arrow::Type::UINT64: - return std::make_shared(pool); - case arrow::Type::INT64: - return std::make_shared(pool); - case arrow::Type::HALF_FLOAT: - return std::make_shared(pool); - case arrow::Type::FLOAT: - return std::make_shared(pool); - case arrow::Type::DOUBLE: - return std::make_shared(pool); - case arrow::Type::STRING: - return std::make_shared(pool); - case arrow::Type::LARGE_STRING: - return std::make_shared(pool); - case arrow::Type::BINARY: - return std::make_shared(pool); - case arrow::Type::LARGE_BINARY: - return std::make_shared(pool); - case arrow::Type::FIXED_SIZE_BINARY: - return std::make_shared(datatype, pool); - case arrow::Type::DATE32: - return std::make_shared(pool); - case arrow::Type::DATE64: - return std::make_shared(pool); - case arrow::Type::TIMESTAMP: - return std::make_shared(datatype, pool); - case arrow::Type::TIME32: - return std::make_shared(datatype, pool); - case arrow::Type::TIME64: - return std::make_shared(datatype, pool); - case arrow::Type::DECIMAL: - return std::make_shared(datatype, pool); - case arrow::Type::DURATION: - return std::make_shared(datatype, pool); - case arrow::Type::INTERVAL_MONTHS: - return std::make_shared(pool); - case arrow::Type::INTERVAL_DAY_TIME: - return std::make_shared(pool); - case arrow::Type::LIST: - case arrow::Type::LARGE_LIST: - case arrow::Type::FIXED_SIZE_LIST: - { - // The parent list datatype details the child datatype so construct the child - // builder and use it to initialise the parent list builder - auto list_type = std::static_pointer_cast(datatype); - auto value_builder = 
GetBuilder(list_type->value_type()); - - // Construct the correct listbuilder - if (datatype->id() == arrow::Type::LIST) - return std::make_shared(pool, value_builder); - else if (datatype->id() == arrow::Type::LARGE_LIST) - return std::make_shared(pool, value_builder); - else - return std::make_shared(pool, value_builder, datatype); - } - case arrow::Type::MAP: + if( BuilderHandlers.find( type_id ) == BuilderHandlers.end() ) { - // The parent map datatype details the key/item child datatypes so construct - // builders for both and use these to initialise the parent map builder - auto map_type = std::static_pointer_cast(datatype); - auto key_builder = GetBuilder(map_type->key_type()); - auto item_builder = GetBuilder(map_type->item_type()); - return std::make_shared(pool, key_builder, item_builder); - } - case arrow::Type::STRUCT: - { - auto struct_type = std::static_pointer_cast(datatype); - - // Iterate through all the fields in the struct constructing and adding each - // field's builder into a vector - auto fields = struct_type->fields(); - std::vector> field_builders; - for (auto field : fields) - field_builders.push_back(GetBuilder(field->type())); - - // Construct the parent struct builder from this vector of all the child - // builders - return std::make_shared(datatype, pool, field_builders); + TYPE_CHECK_UNSUPPORTED(datatype->ToString()); } - case arrow::Type::SPARSE_UNION: - case arrow::Type::DENSE_UNION: + else { - auto union_type = std::static_pointer_cast(datatype); - - // Iterate through all the fields in the union constructing and adding each - // field's builder into a vector - auto fields = union_type->fields(); - std::vector> field_builders; - for (auto field : fields) - field_builders.push_back(GetBuilder(field->type())); - - // Construct the parent union builder from this vector of all the child - // builders - if (datatype->id() == arrow::Type::SPARSE_UNION) - return std::make_shared(pool, field_builders, datatype); - else - return std::make_shared(pool, field_builders, datatype); - } - default: - TYPE_CHECK_UNSUPPORTED(datatype->ToString()); + return BuilderHandlers[type_id]( datatype, pool ); } } +} // namespace + +namespace +{ + // Populate a list/large_list/fixed_size_list builder // // An arrow list array is a nested set of child lists. This is represented in // kdb as a mixed list for the parent list array containing a set of sub-lists, // one for each of the list value sets. 
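Before the `PopulateListBuilder` template below, here is a minimal standalone sketch of the pattern it implements: call `Append()` on the parent list builder to delimit each sub-list, then bulk-append that sub-list's values to the shared child value builder. This is illustrative only, not arrowkdb code; the row-wise `std::vector` stands in for the kdb mixed list, and it assumes Arrow's `Result`-returning `Finish()` overload:

```cpp
#include <arrow/api.h>
#include <memory>
#include <vector>

// Sketch: build a list<int64> array row by row. Each Append() starts a new
// sub-list; that row's values are then appended to the child builder.
arrow::Result<std::shared_ptr<arrow::Array>> BuildListArray(
    const std::vector<std::vector<int64_t>>& rows)  // stand-in for the kdb mixed list
{
  arrow::MemoryPool* pool = arrow::default_memory_pool();
  auto value_builder = std::make_shared<arrow::Int64Builder>(pool);
  arrow::ListBuilder list_builder(pool, value_builder);

  for (const auto& row : rows) {
    ARROW_RETURN_NOT_OK(list_builder.Append());  // delimit the next sub-list
    ARROW_RETURN_NOT_OK(value_builder->AppendValues(row.data(), row.size()));
  }
  return list_builder.Finish();
}
```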
template -void PopulateListBuilder(std::shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +void PopulateListBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { // Get the value builder from the parent list builder auto list_builder = static_cast(builder); @@ -162,12 +356,12 @@ void PopulateListBuilder(std::shared_ptr datatype, K k_array, a continue; // Delimit the start/end of each child list set - list_builder->Append(); + PARQUET_THROW_NOT_OK( list_builder->Append() ); if (datatype->id() == arrow::Type::FIXED_SIZE_LIST) { // Check each sub-list is the same length as the fixed size K list_data = kK(k_array)[i]; - auto fixed_list_type = std::static_pointer_cast(datatype); + auto fixed_list_type = static_pointer_cast(datatype); TYPE_CHECK_LENGTH(fixed_list_type->list_size() != list_data->n, datatype->ToString(), fixed_list_type->list_size(), list_data->n); } @@ -182,12 +376,12 @@ void PopulateListBuilder(std::shared_ptr datatype, K k_array, a // additional type id array which identifies the live field in each union value // set. template -void PopulateUnionBuilder(std::shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +void PopulateUnionBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { // Check that the mixed list length is at least one greater (the additional // first sub-list contains the union type_ids) than the number of union // fields - auto union_type = std::static_pointer_cast(datatype); + auto union_type = static_pointer_cast(datatype); const auto min_length = union_type->num_fields() + 1; TYPE_CHECK_LENGTH(min_length > k_array->n, datatype->ToString(), min_length, k_array->n); @@ -199,7 +393,7 @@ void PopulateUnionBuilder(std::shared_ptr datatype, K k_array, // Get all the child builders from the parent union builder auto union_builder = static_cast(builder); - std::vector> child_builders; + vector> child_builders; for (auto i = 0; i < union_builder->num_children(); ++i) child_builders.push_back(union_builder->child_builder(i)); @@ -207,7 +401,7 @@ void PopulateUnionBuilder(std::shared_ptr datatype, K k_array, // for this union value for (auto index = 0; index < kK(k_array)[0]->n; ++index) { int8_t live_type_id = kH(type_ids)[index]; - union_builder->Append(live_type_id); + PARQUET_THROW_NOT_OK( union_builder->Append(live_type_id) ); } // Populate each of the child builders from its kdb list, starting from 1 to @@ -226,364 +420,776 @@ void PopulateUnionBuilder(std::shared_ptr datatype, K k_array, throw TypeCheck("Mismatched union list lengths"); } -// Populates data values from a kdb list into the specified array builder. 
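The reworked `PopulateBuilder` that follows replaces this single switch-based function with one template specialization per arrow type id. Every scalar specialization repeats the same shape: take the `[offset, offset+length)` slice reported by `GetChunk`, and, when a null mapping is configured for that type, build a `std::vector<bool>` validity bitmap in which `false` marks a kdb value equal to the configured null. A hedged sketch of that shared pattern for `int32` (the helper name and parameters are illustrative, not arrowkdb's API):

```cpp
#include <arrow/api.h>
#include <vector>

// Sketch of the chunk-plus-validity pattern used by the specializations
// below: values equal to the configured null become arrow nulls (false in
// the validity vector); otherwise the slice is appended unchanged.
arrow::Status AppendInt32Chunk(arrow::Int32Builder& builder,
                               const int32_t* values,    // kdb list data
                               int64_t offset, int64_t length,
                               bool have_null, int32_t null_value)
{
  if (!have_null)
    return builder.AppendValues(values + offset, length);

  std::vector<bool> validity(length);
  for (int64_t i = 0; i < length; ++i)
    validity[i] = values[offset + i] != null_value;  // false => arrow null
  return builder.AppendValues(values + offset, length, validity);
}
```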
-void PopulateBuilder(std::shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +template +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides); + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { - // Special cases for: - // symbol - string or large_string - // guid - fixed_size_binary(16) - // char - uint8 - bool is_symbol = k_array->t == KS && (datatype->id() == arrow::Type::STRING || datatype->id() == arrow::Type::LARGE_STRING); - bool is_guid = k_array->t == UU && datatype->id() == arrow::Type::FIXED_SIZE_BINARY && static_cast(builder)->byte_width() == sizeof(U); - bool is_char = k_array->t == KC && (datatype->id() == arrow::Type::UINT8 || datatype->id() == arrow::Type::INT8); + auto null_builder = static_cast(builder); + PARQUET_THROW_NOT_OK(null_builder->AppendNulls(k_array->n)); +} - // Type check the kdb structure - if (!is_symbol && !is_guid && !is_char) - TYPE_CHECK_ARRAY(kx::arrowkdb::GetKdbType(datatype, type_overrides) != k_array->t, datatype->ToString(), kx::arrowkdb::GetKdbType(datatype, type_overrides), k_array->t); +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; + auto bool_builder = static_cast(builder); + if( type_overrides.null_mapping.have_boolean ){ + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.boolean_null != static_cast( kG( k_array )[i+offset] ); + } + PARQUET_THROW_NOT_OK( bool_builder->AppendValues( ( uint8_t* )&kG( k_array )[offset], length, null_bitmap ) ); + } + else { + PARQUET_THROW_NOT_OK( bool_builder->AppendValues( ( uint8_t* )&kG( k_array )[offset], length ) ); + } +} - switch (datatype->id()) { - case arrow::Type::NA: - { - auto null_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(null_builder->AppendNulls(k_array->n)); - break; +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; + auto uint8_builder = static_cast(builder); + if( type_overrides.null_mapping.have_uint8 ){ + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.uint8_null != static_cast( kG( k_array )[i+offset] ); + } + PARQUET_THROW_NOT_OK( uint8_builder->AppendValues( ( uint8_t* )&kG( k_array )[offset], length, null_bitmap ) ); } - case arrow::Type::BOOL: - { - auto bool_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(bool_builder->AppendValues((uint8_t*)kG(k_array), k_array->n)); - break; + else { + PARQUET_THROW_NOT_OK( uint8_builder->AppendValues( ( uint8_t* )&kG( k_array )[offset], length ) ); } - case arrow::Type::UINT8: - { - auto uint8_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(uint8_builder->AppendValues((uint8_t*)kG(k_array), k_array->n)); - break; +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; + auto 
int8_builder = static_cast(builder); + if( type_overrides.null_mapping.have_int8 ){ + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.int8_null != kG( k_array )[i+offset]; + } + PARQUET_THROW_NOT_OK( int8_builder->AppendValues( ( int8_t* )&kG( k_array )[offset], length, null_bitmap ) ); } - case arrow::Type::INT8: - { - auto int8_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(int8_builder->AppendValues((int8_t*)kG(k_array), k_array->n)); - break; + else { + PARQUET_THROW_NOT_OK( int8_builder->AppendValues( ( int8_t* )&kG( k_array )[offset], length ) ); } - case arrow::Type::UINT16: - { - auto uint16_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(uint16_builder->AppendValues((uint16_t*)kH(k_array), k_array->n)); - arrow::Status s; - break; +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; + auto uint16_builder = static_cast(builder); + if( type_overrides.null_mapping.have_uint16 ){ + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.uint16_null != static_cast( kH( k_array )[i+offset] ); + } + PARQUET_THROW_NOT_OK( uint16_builder->AppendValues( ( uint16_t* )&kH( k_array )[offset], length, null_bitmap ) ); } - case arrow::Type::INT16: - { - auto int16_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(int16_builder->AppendValues((int16_t*)kH(k_array), k_array->n)); - break; + else { + PARQUET_THROW_NOT_OK( uint16_builder->AppendValues( ( uint16_t* )&kH( k_array )[offset], length ) ); } - case arrow::Type::UINT32: - { - auto uint32_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(uint32_builder->AppendValues((uint32_t*)kI(k_array), k_array->n)); - break; +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; + auto int16_builder = static_cast(builder); + if( type_overrides.null_mapping.have_int16 ){ + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.int16_null != kH( k_array )[i+offset]; + } + PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )&kH( k_array )[offset], length, null_bitmap ) ); } - case arrow::Type::INT32: - { - auto int32_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(int32_builder->AppendValues((int32_t*)kI(k_array), k_array->n)); - break; + else { + PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )&kH( k_array )[offset], length) ); } - case arrow::Type::UINT64: - { - auto uint64_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(uint64_builder->AppendValues((uint64_t*)kJ(k_array), k_array->n)); - break; +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; + auto uint32_builder = static_cast(builder); + if( type_overrides.null_mapping.have_uint32 ){ + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.uint32_null != 
static_cast( kI( k_array )[i+offset] ); + } + PARQUET_THROW_NOT_OK( uint32_builder->AppendValues( ( uint32_t* )&kI( k_array )[offset], length, null_bitmap ) ); } - case arrow::Type::INT64: - { - auto int64_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(int64_builder->AppendValues((int64_t*)kJ(k_array), k_array->n)); - break; + else{ + PARQUET_THROW_NOT_OK( uint32_builder->AppendValues( ( uint32_t* )&kI( k_array )[offset], length ) ); } - case arrow::Type::HALF_FLOAT: - { - arrow::HalfFloatType hft; - auto hfl_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(hfl_builder->AppendValues((uint16_t*)kH(k_array), k_array->n)); - break; +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; + auto int32_builder = static_cast(builder); + if( type_overrides.null_mapping.have_int32 ){ + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.int32_null != kI( k_array )[i+offset]; + } + PARQUET_THROW_NOT_OK( int32_builder->AppendValues( ( int32_t* )&kI( k_array )[offset], length, null_bitmap ) ); } - case arrow::Type::FLOAT: - { - auto fl_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(fl_builder->AppendValues(kE(k_array), k_array->n)); - break; + else{ + PARQUET_THROW_NOT_OK( int32_builder->AppendValues( ( int32_t* )&kI( k_array )[offset], length ) ); } - case arrow::Type::DOUBLE: - { - auto dbl_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(dbl_builder->AppendValues(kF(k_array), k_array->n)); - break; +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; + auto uint64_builder = static_cast(builder); + if( type_overrides.null_mapping.have_uint64 ){ + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.uint64_null != static_cast( kJ( k_array )[i+offset] ); + } + PARQUET_THROW_NOT_OK( uint64_builder->AppendValues( ( uint64_t* )&kJ( k_array )[offset], length, null_bitmap ) ); } - case arrow::Type::STRING: - { - auto str_builder = static_cast(builder); - if (is_symbol) { - // Populate from symbol list - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(str_builder->Append(kS(k_array)[i])); - } else { - // Populate from mixed list of char lists - for (auto i = 0; i < k_array->n; ++i) { - K str_data = kK(k_array)[i]; - TYPE_CHECK_ITEM(str_data->t != KC, datatype->ToString(), KC, str_data->t); - PARQUET_THROW_NOT_OK(str_builder->Append(kG(str_data), str_data->n)); + else{ + PARQUET_THROW_NOT_OK( uint64_builder->AppendValues( ( uint64_t* )&kJ( k_array )[offset], length ) ); + } +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; + auto int64_builder = static_cast(builder); + if( type_overrides.null_mapping.have_int64 ){ + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.int64_null != kJ( k_array )[i+offset]; + } + PARQUET_THROW_NOT_OK( 
int64_builder->AppendValues( ( int64_t* )&kJ( k_array )[offset], length, null_bitmap ) ); + } + else{ + PARQUET_THROW_NOT_OK( int64_builder->AppendValues( ( int64_t* )&kJ( k_array )[offset], length ) ); + } +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; + auto hfl_builder = static_cast(builder); + if( type_overrides.null_mapping.have_float16 ){ + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.float16_null != static_cast( kH( k_array )[i+offset] ); + } + PARQUET_THROW_NOT_OK( hfl_builder->AppendValues( ( uint16_t* )&kH( k_array )[offset], length, null_bitmap ) ); + } + else { + PARQUET_THROW_NOT_OK( hfl_builder->AppendValues( ( uint16_t* )&kH( k_array )[offset], length ) ); + } +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; + auto fl_builder = static_cast(builder); + if( type_overrides.null_mapping.have_float32 ){ + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = !is_equal( type_overrides.null_mapping.float32_null, kE( k_array )[i+offset] ); + } + PARQUET_THROW_NOT_OK( fl_builder->AppendValues( &kE( k_array )[offset], length, null_bitmap ) ); + } + else { + PARQUET_THROW_NOT_OK( fl_builder->AppendValues( &kE( k_array )[offset], length ) ); + } +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; + auto dbl_builder = static_cast(builder); + if( type_overrides.null_mapping.have_float64 ){ + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = !is_equal( type_overrides.null_mapping.float64_null, kF( k_array )[i+offset] ); + } + PARQUET_THROW_NOT_OK( dbl_builder->AppendValues( &kF( k_array )[offset], length, null_bitmap ) ); + } + else { + PARQUET_THROW_NOT_OK( dbl_builder->AppendValues( &kF( k_array )[offset], length ) ); + } +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; + auto str_builder = static_cast(builder); + bool is_symbol = k_array->t == KS && (datatype->id() == arrow::Type::STRING || datatype->id() == arrow::Type::LARGE_STRING); + if( is_symbol ){ + // Populate from symbol list + for( auto i = 0ll; i < length; ++i ){ + if( type_overrides.null_mapping.have_string + && type_overrides.null_mapping.string_null == kS( k_array )[i+offset] ){ + PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK( str_builder->Append( kS( k_array )[i+offset] ) ); + } + } + } else { + // Populate from mixed list of char lists + for( auto i = 0ll; i < length; ++i ){ + K str_data = kK( k_array )[i+offset]; + TYPE_CHECK_ITEM( str_data->t != KC, datatype->ToString(), KC, str_data->t ); + if( type_overrides.null_mapping.have_string + && type_overrides.null_mapping.string_null.length() == 
static_cast( str_data->n ) + && !type_overrides.null_mapping.string_null.compare( 0, str_data->n, ( char* )kG( str_data ), str_data->n ) ){ + PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK( str_builder->Append( kG( str_data ), str_data->n ) ); } } - break; } - case arrow::Type::LARGE_STRING: - { - auto str_builder = static_cast(builder); - if (is_symbol) { - // Populate from symbol list - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(str_builder->Append(kS(k_array)[i])); - } else { - // Populate from mixed list of char lists - for (auto i = 0; i < k_array->n; ++i) { - K str_data = kK(k_array)[i]; - TYPE_CHECK_ITEM(str_data->t != KC, datatype->ToString(), KC, str_data->t); - PARQUET_THROW_NOT_OK(str_builder->Append(kG(str_data), str_data->n)); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; + auto str_builder = static_cast(builder); + bool is_symbol = k_array->t == KS && (datatype->id() == arrow::Type::STRING || datatype->id() == arrow::Type::LARGE_STRING); + if( is_symbol ){ + // Populate from symbol list + for( auto i = 0ll; i < length; ++i ){ + if( type_overrides.null_mapping.have_large_string + && type_overrides.null_mapping.large_string_null == kS( k_array )[i+offset] ){ + PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK( str_builder->Append( kS( k_array )[i+offset] ) ); + } + } + } else { + // Populate from mixed list of char lists + for( auto i = 0ll; i < length; ++i ){ + K str_data = kK( k_array )[i+offset]; + TYPE_CHECK_ITEM( str_data->t != KC, datatype->ToString(), KC, str_data->t ); + if( type_overrides.null_mapping.have_large_string + && type_overrides.null_mapping.large_string_null.length() == static_cast( str_data->n ) + && !type_overrides.null_mapping.large_string_null.compare( 0, str_data->n, ( char* )kG( str_data ), str_data->n ) ){ + PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK( str_builder->Append( kG( str_data ), str_data->n ) ); } } - break; } - case arrow::Type::BINARY: - { - auto bin_builder = static_cast(builder); - for (auto i = 0; i < k_array->n; ++i) { - K bin_data = kK(k_array)[i]; - TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; + auto bin_builder = static_cast(builder); + for( auto i = 0; i < length; ++i ){ + K bin_data = kK( k_array )[i+offset]; + TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); + if( type_overrides.null_mapping.have_binary + && type_overrides.null_mapping.binary_null.length() == static_cast( bin_data->n ) + && !type_overrides.null_mapping.binary_null.compare( 0, bin_data->n, kG( bin_data ), bin_data->n ) ){ + PARQUET_THROW_NOT_OK( bin_builder->AppendNull() ); + } + else{ PARQUET_THROW_NOT_OK(bin_builder->Append(kG(bin_data), bin_data->n)); } - break; } - case arrow::Type::LARGE_BINARY: - { - auto bin_builder = static_cast(builder); - for (auto i = 0; i < k_array->n; ++i) { - K bin_data = kK(k_array)[i]; - TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); +} + +template<> 
+void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; + auto bin_builder = static_cast(builder); + for( auto i = 0; i < length; ++i ){ + K bin_data = kK( k_array )[i+offset]; + TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); + if( type_overrides.null_mapping.have_large_binary + && type_overrides.null_mapping.large_binary_null.length() == static_cast( bin_data->n ) + && !type_overrides.null_mapping.large_binary_null.compare( 0, bin_data->n, kG( bin_data ), bin_data->n ) ){ + PARQUET_THROW_NOT_OK( bin_builder->AppendNull() ); + } + else{ PARQUET_THROW_NOT_OK(bin_builder->Append(kG(bin_data), bin_data->n)); } - break; } - case arrow::Type::FIXED_SIZE_BINARY: - { - auto fixed_bin_builder = static_cast(builder); - if (is_guid) { - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(fixed_bin_builder->Append((char*)&kU(k_array)[i])); - } else { - for (auto i = 0; i < k_array->n; ++i) { - K bin_data = kK(k_array)[i]; - TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); - TYPE_CHECK_LENGTH(fixed_bin_builder->byte_width() != bin_data->n, builder->type()->ToString(), fixed_bin_builder->byte_width(), bin_data->n); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; + bool is_guid = k_array->t == UU && datatype->id() == arrow::Type::FIXED_SIZE_BINARY && static_cast(builder)->byte_width() == sizeof(U); + auto fixed_bin_builder = static_cast(builder); + if (is_guid) { + for( auto i = 0; i < length; ++i ){ + if( type_overrides.null_mapping.have_fixed_binary + && type_overrides.null_mapping.fixed_binary_null.length() == sizeof( U ) + && !type_overrides.null_mapping.fixed_binary_null.compare( 0, sizeof( U ), &kU( k_array )[i+offset].g[0], sizeof( U ) ) ){ + PARQUET_THROW_NOT_OK( fixed_bin_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK( fixed_bin_builder->Append( ( char* )&kU( k_array )[i+offset] ) ); + } + } + } else { + for( auto i = 0; i < length; ++i ){ + K bin_data = kK(k_array)[i+offset]; + TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); + TYPE_CHECK_LENGTH(fixed_bin_builder->byte_width() != bin_data->n, builder->type()->ToString(), fixed_bin_builder->byte_width(), bin_data->n); + if( type_overrides.null_mapping.have_fixed_binary + && type_overrides.null_mapping.fixed_binary_null.length() == static_cast( bin_data->n ) + && !type_overrides.null_mapping.fixed_binary_null.compare( 0, bin_data->n, kG( bin_data ), bin_data->n ) ){ + PARQUET_THROW_NOT_OK( fixed_bin_builder->AppendNull() ); + } + else{ PARQUET_THROW_NOT_OK(fixed_bin_builder->Append(kG(bin_data))); } } - break; } - case arrow::Type::DATE32: - { - TemporalConversion tc(datatype); - auto d32_builder = static_cast(builder); - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(d32_builder->Append(tc.KdbToArrow(kI(k_array)[i]))); - break; +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; + TemporalConversion tc(datatype); + auto d32_builder = 
static_cast(builder); + for( auto i = 0; i < length; ++i ){ + if( type_overrides.null_mapping.have_date32 + && type_overrides.null_mapping.date32_null == kI( k_array )[i+offset] ){ + PARQUET_THROW_NOT_OK( d32_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK( d32_builder->Append( tc.KdbToArrow( kI( k_array )[i+offset] ) ) ); + } } - case arrow::Type::DATE64: - { - TemporalConversion tc(datatype); - auto d64_builder = static_cast(builder); - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(d64_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); - break; +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; + TemporalConversion tc(datatype); + auto d64_builder = static_cast(builder); + for( auto i = 0; i < length; ++i ){ + if( type_overrides.null_mapping.have_date64 + && type_overrides.null_mapping.date64_null == kJ( k_array )[i+offset] ){ + PARQUET_THROW_NOT_OK( d64_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK( d64_builder->Append( tc.KdbToArrow( kJ( k_array )[i+offset] ) ) ); + } } - case arrow::Type::TIMESTAMP: - { - TemporalConversion tc(datatype); - auto ts_builder = static_cast(builder); - auto timestamp_type = std::static_pointer_cast(datatype); - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(ts_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); - break; - } - case arrow::Type::TIME32: - { - TemporalConversion tc(datatype); - auto t32_builder = static_cast(builder); - auto time32_type = std::static_pointer_cast(datatype); - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(t32_builder->Append(tc.KdbToArrow(kI(k_array)[i]))); - break; - } - case arrow::Type::TIME64: - { - TemporalConversion tc(datatype); - auto t64_builder = static_cast(builder); - auto time64_type = std::static_pointer_cast(datatype); - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(t64_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); - break; - } - case arrow::Type::DECIMAL: - { - auto dec_builder = static_cast(builder); - auto dec_type = std::static_pointer_cast(datatype); - for (auto i = 0; i < k_array->n; ++i) { - if (type_overrides.decimal128_as_double) { +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; + TemporalConversion tc(datatype); + auto ts_builder = static_cast(builder); + auto timestamp_type = static_pointer_cast(datatype); + for( auto i = 0; i < length; ++i ){ + if( type_overrides.null_mapping.have_timestamp + && type_overrides.null_mapping.timestamp_null == kJ( k_array )[i+offset] ){ + PARQUET_THROW_NOT_OK( ts_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK( ts_builder->Append( tc.KdbToArrow( kJ( k_array )[i+offset] ) ) ); + } + } +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; + TemporalConversion tc(datatype); + auto t32_builder = static_cast(builder); + auto time32_type = static_pointer_cast(datatype); + for( auto i = 0; i < length; ++i ){ + if( type_overrides.null_mapping.have_time32 + && 
type_overrides.null_mapping.time32_null == kI( k_array )[i+offset] ){ + PARQUET_THROW_NOT_OK( t32_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK( t32_builder->Append( tc.KdbToArrow( kI( k_array )[i+offset] ) ) ); + } + } +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; + TemporalConversion tc(datatype); + auto t64_builder = static_cast(builder); + auto time64_type = static_pointer_cast(datatype); + for( auto i = 0; i < length; ++i ){ + if( type_overrides.null_mapping.have_time64 + && type_overrides.null_mapping.time64_null == kJ( k_array )[i+offset] ){ + PARQUET_THROW_NOT_OK( t64_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK( t64_builder->Append( tc.KdbToArrow( kJ( k_array )[i+offset] ) ) ); + } + } +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; + auto dec_builder = static_cast(builder); + auto dec_type = static_pointer_cast(datatype); + for (auto i = 0; i < length; ++i) { + if (type_overrides.decimal128_as_double) { + if( type_overrides.null_mapping.have_decimal + && is_equal( type_overrides.null_mapping.decimal_null, kF( k_array )[i+offset] ) ){ + PARQUET_THROW_NOT_OK( dec_builder->AppendNull() ); + } + else{ // Construct the decimal from a double arrow::Decimal128 dec128; - PARQUET_ASSIGN_OR_THROW(dec128, arrow::Decimal128::FromReal(kF(k_array)[i], dec_type->precision(), dec_type->scale())); - PARQUET_THROW_NOT_OK(dec_builder->Append(dec128)); - } else { - // Each decimal is a list of 16 bytes - K k_dec = kK(k_array)[i]; - TYPE_CHECK_LENGTH(k_dec->n != 16, datatype->ToString(), 16, k_dec->n); - TYPE_CHECK_ITEM(k_dec->t != KG, datatype->ToString(), KG, k_dec->t); - - arrow::Decimal128 dec128((const uint8_t*)kG(k_dec)); + PARQUET_ASSIGN_OR_THROW(dec128, arrow::Decimal128::FromReal(kF(k_array)[i+offset], dec_type->precision(), dec_type->scale())); PARQUET_THROW_NOT_OK(dec_builder->Append(dec128)); } + } else { + // Each decimal is a list of 16 bytes + K k_dec = kK(k_array)[i+offset]; + TYPE_CHECK_LENGTH(k_dec->n != 16, datatype->ToString(), 16, k_dec->n); + TYPE_CHECK_ITEM(k_dec->t != KG, datatype->ToString(), KG, k_dec->t); + + arrow::Decimal128 dec128((const uint8_t*)kG(k_dec)); + PARQUET_THROW_NOT_OK(dec_builder->Append(dec128)); } - break; - } - case arrow::Type::DURATION: - { - TemporalConversion tc(datatype); - auto dur_builder = static_cast(builder); - auto duration_type = std::static_pointer_cast(datatype); - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(dur_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); - break; - } - case arrow::Type::INTERVAL_MONTHS: - { - auto month_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(month_builder->AppendValues((int32_t*)kI(k_array), k_array->n)); - break; } - case arrow::Type::INTERVAL_DAY_TIME: - { - auto dt_builder = static_cast(builder); - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(dt_builder->Append(KTimespan_DayTimeInterval(kJ(k_array)[i]))); - break; - } - case arrow::Type::LIST: - PopulateListBuilder(datatype, k_array, builder, type_overrides); - break; - case arrow::Type::LARGE_LIST: - PopulateListBuilder(datatype, k_array, builder, type_overrides); - 
break; - case arrow::Type::FIXED_SIZE_LIST: - PopulateListBuilder(datatype, k_array, builder, type_overrides); - break; - case arrow::Type::MAP: - { - // An arrow map array is a nested set of key/item paired child arrays. This - // is represented in kdb as a mixed list for the parent map array, with a - // dictionary for each map value set. - // - // Get the key and item builders from the parent map builder - auto map_builder = static_cast(builder); - auto key_builder = map_builder->key_builder(); - auto item_builder = map_builder->item_builder(); +} - for (auto i = 0; i < k_array->n; ++i) { - // Ignore any mixed list items set to :: - if (kK(k_array)[i]->t == 101) - continue; +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; + TemporalConversion tc(datatype); + auto dur_builder = static_cast(builder); + auto duration_type = static_pointer_cast(datatype); + for( auto i = 0; i < length; ++i ){ + if( type_overrides.null_mapping.have_duration + && type_overrides.null_mapping.duration_null == kJ( k_array )[i+offset] ){ + PARQUET_THROW_NOT_OK( dur_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK( dur_builder->Append( tc.KdbToArrow( kJ( k_array )[i+offset] ) ) ); + } + } +} - // Delimit the start/end of each child map set - map_builder->Append(); +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; + auto month_builder = static_cast(builder); + if( type_overrides.null_mapping.have_month_interval ){ + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.month_interval_null != kI( k_array )[i+offset]; + } + PARQUET_THROW_NOT_OK( month_builder->AppendValues( ( int32_t* )&kI( k_array )[offset], length, null_bitmap ) ); + } + else{ + PARQUET_THROW_NOT_OK( month_builder->AppendValues( ( int32_t* )&kI( k_array )[offset], length ) ); + } +} - // Populate the child builders for this map set from the dictionary key/value lists - auto k_dict = kK(k_array)[i]; - TYPE_CHECK_ITEM(99 != k_dict->t, datatype->ToString(), 99, k_dict->t); - PopulateBuilder(key_builder->type(), kK(k_dict)[0], key_builder, type_overrides); - PopulateBuilder(item_builder->type(), kK(k_dict)[1], item_builder, type_overrides); +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; + auto dt_builder = static_cast(builder); + for (auto i = 0; i < length; ++i){ + if( type_overrides.null_mapping.have_day_time_interval + && type_overrides.null_mapping.day_time_interval_null == kJ( k_array )[i+offset] ){ + PARQUET_THROW_NOT_OK( dt_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK( dt_builder->Append( KTimespan_DayTimeInterval( kJ( k_array )[i+offset] ) ) ); } - break; } +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + PopulateListBuilder(datatype, k_array, builder, type_overrides); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, 
arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + PopulateListBuilder(datatype, k_array, builder, type_overrides); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + PopulateListBuilder(datatype, k_array, builder, type_overrides); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + // An arrow map array is a nested set of key/item paired child arrays. This + // is represented in kdb as a mixed list for the parent map array, with a + // dictionary for each map value set. + // + // Get the key and item builders from the parent map builder + auto map_builder = static_cast(builder); + auto key_builder = map_builder->key_builder(); + auto item_builder = map_builder->item_builder(); + + for (auto i = 0; i < k_array->n; ++i) { + // Ignore any mixed list items set to :: + if (kK(k_array)[i]->t == 101) + continue; + + // Delimit the start/end of each child map set + PARQUET_THROW_NOT_OK( map_builder->Append() ); + + // Populate the child builders for this map set from the dictionary key/value lists + auto k_dict = kK(k_array)[i]; + TYPE_CHECK_ITEM(99 != k_dict->t, datatype->ToString(), 99, k_dict->t); - case arrow::Type::STRUCT: + auto items_null_mapping = type_overrides.null_mapping; + type_overrides.null_mapping = Options::NullMapping {0}; + PopulateBuilder(key_builder->type(), kK(k_dict)[0], key_builder, type_overrides); + type_overrides.null_mapping = items_null_mapping; + PopulateBuilder(item_builder->type(), kK(k_dict)[1], item_builder, type_overrides); + } +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + // An arrow struct array is a logical grouping of child arrays with each + // child array corresponding to one of the fields in the struct. A single + // struct value is obtaining by slicing across all the child arrays at a + // given index. This is represented in kdb as a mixed list for the parent + // struct array, containing child lists for each field in the struct. + // + // Check that the mixed list length is at least equal to the number of struct fields + auto struct_type = static_pointer_cast(datatype); + TYPE_CHECK_LENGTH(struct_type->num_fields() > k_array->n, datatype->ToString(), struct_type->num_fields(), k_array->n); + + // Get all the field builders from the parent struct builder + auto struct_builder = static_cast(builder); + vector field_builders; + for (auto i = 0; i < struct_builder->num_fields(); ++i) + field_builders.push_back(struct_builder->field_builder(i)); + + // Delimit each struct value in the parent builder + for (auto index = 0; index < kK(k_array)[0]->n; ++index) + PARQUET_THROW_NOT_OK( struct_builder->Append() ); + + // Populate each of the field builders from its kdb list. Only count up to + // the number of struct fields. Additional trailing data in the kdb mixed + // list is ignored (to allow for ::) + for (auto i = 0; i < struct_type->num_fields(); ++i) + PopulateBuilder(field_builders[i]->type(), kK(k_array)[i], field_builders[i], type_overrides); + + // Check that all the populated field builders have the same length. 
+ for (auto it : field_builders) + if (it->length() != struct_builder->length()) + throw TypeCheck("Mismatched struct list lengths"); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + PopulateUnionBuilder(datatype, k_array, builder, type_overrides); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + PopulateUnionBuilder(datatype, k_array, builder, type_overrides); +} + +using PopulateHandler = void ( * ) ( shared_ptr, K, arrow::ArrayBuilder*, TypeMappingOverride& ); + +template +auto make_populate_handler() +{ + return make_pair( TypeId, &PopulateBuilder ); +} + +unordered_map PopulateHandlers { + make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() +}; + +} // namespace + +namespace kx { +namespace arrowkdb { + +// Populates data values from a kdb list into the specified array builder. +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + // Special cases for: + // symbol - string or large_string + // guid - fixed_size_binary(16) + // char - uint8 + bool is_symbol = k_array->t == KS && (datatype->id() == arrow::Type::STRING || datatype->id() == arrow::Type::LARGE_STRING); + bool is_guid = k_array->t == UU && datatype->id() == arrow::Type::FIXED_SIZE_BINARY && static_cast(builder)->byte_width() == sizeof(U); + bool is_char = k_array->t == KC && (datatype->id() == arrow::Type::UINT8 || datatype->id() == arrow::Type::INT8); + + // Type check the kdb structure + if (!is_symbol && !is_guid && !is_char) + TYPE_CHECK_ARRAY(GetKdbType(datatype, type_overrides) != k_array->t, datatype->ToString(), GetKdbType(datatype, type_overrides), k_array->t); + + auto type_id = datatype->id(); + if( PopulateHandlers.find( type_id ) == PopulateHandlers.end() ) { - // An arrow struct array is a logical grouping of child arrays with each - // child array corresponding to one of the fields in the struct. A single - // struct value is obtaining by slicing across all the child arrays at a - // given index. This is represented in kdb as a mixed list for the parent - // struct array, containing child lists for each field in the struct. 
- // - // Check that the mixed list length is at least equal to the number of struct fields - auto struct_type = std::static_pointer_cast(datatype); - TYPE_CHECK_LENGTH(struct_type->num_fields() > k_array->n, datatype->ToString(), struct_type->num_fields(), k_array->n); - - // Get all the field builders from the parent struct builder - auto struct_builder = static_cast(builder); - std::vector field_builders; - for (auto i = 0; i < struct_builder->num_fields(); ++i) - field_builders.push_back(struct_builder->field_builder(i)); - - // Delimit each struct value in the parent builder - for (auto index = 0; index < kK(k_array)[0]->n; ++index) - struct_builder->Append(); - - // Populate each of the field builders from its kdb list. Only count up to - // the number of struct fields. Additional trailing data in the kdb mixed - // list is ignored (to allow for ::) - for (auto i = 0; i < struct_type->num_fields(); ++i) - PopulateBuilder(field_builders[i]->type(), kK(k_array)[i], field_builders[i], type_overrides); - - // Check that all the populated field builders have the same length. - for (auto it : field_builders) - if (it->length() != struct_builder->length()) - throw TypeCheck("Mismatched struct list lengths"); - - break; - } - case arrow::Type::SPARSE_UNION: - PopulateUnionBuilder(datatype, k_array, builder, type_overrides); - break; - case arrow::Type::DENSE_UNION: - PopulateUnionBuilder(datatype, k_array, builder, type_overrides); - break; - default: TYPE_CHECK_UNSUPPORTED(datatype->ToString()); } + else + { + PopulateHandlers[type_id]( datatype, k_array, builder, type_overrides ); + } } // Construct a dictionary array from its values and indicies arrays. // // This is represented in kdb as a mixed list for the parent dictionary array // containing the values and indicies sub-lists. 
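`MakeDictionary`, whose definition follows, assembles a dictionary array from the two recursively-built sub-arrays rather than going through a builder. For reference, a minimal sketch of that same assembly using `arrow::DictionaryArray::FromArrays` (the function and parameter names here are illustrative):

```cpp
#include <arrow/api.h>
#include <memory>

// Sketch: a dictionary array is just an indices array paired with a values
// array, combined under a dictionary datatype.
arrow::Result<std::shared_ptr<arrow::Array>> BuildDictionary(
    std::shared_ptr<arrow::Array> values,   // e.g. the distinct strings
    std::shared_ptr<arrow::Array> indices)  // e.g. int64 positions into values
{
  auto type = arrow::dictionary(indices->type(), values->type());
  return arrow::DictionaryArray::FromArrays(type, indices, values);
}
```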
-std::shared_ptr MakeDictionary(std::shared_ptr datatype, K k_array, TypeMappingOverride& type_overrides) +shared_ptr MakeDictionary(shared_ptr datatype, K k_array, TypeMappingOverride& type_overrides) { K values = kK(k_array)[0]; K indicies = kK(k_array)[1]; - auto dictionary_type = std::static_pointer_cast(datatype); + auto dictionary_type = static_pointer_cast(datatype); // Recursively construct the values and indicies arrays auto values_array = MakeArray(dictionary_type->value_type(), values, type_overrides); auto indicies_array = MakeArray(dictionary_type->index_type(), indicies, type_overrides); - std::shared_ptr result; + shared_ptr result; PARQUET_ASSIGN_OR_THROW(result, arrow::DictionaryArray::FromArrays(datatype, indicies_array, values_array)); return result; } -std::shared_ptr MakeArray(std::shared_ptr datatype, K k_array, TypeMappingOverride& type_overrides) +shared_ptr MakeArray(shared_ptr datatype, K k_array, TypeMappingOverride& type_overrides) { // DictionaryBuilder works in quite an unusual and non-standard way so just // construct the dictionary array directly @@ -596,11 +1202,30 @@ std::shared_ptr MakeArray(std::shared_ptr datatyp PopulateBuilder(datatype, k_array, builder.get(), type_overrides); // Finalise the builder into the arrow array - std::shared_ptr array; + shared_ptr array; PARQUET_THROW_NOT_OK(builder->Finish(&array)); return array; } +shared_ptr MakeChunkedArray( + shared_ptr datatype + , K k_array + , TypeMappingOverride& type_overrides ) +{ + type_overrides.chunk_offset = 0; + vector> chunks; + int64_t num_chunks = type_overrides.NumChunks( k_array->n ); + for( int64_t i = 0; i < num_chunks; ++i ){ + auto array = MakeArray( datatype, k_array, type_overrides ); + chunks.push_back( array ); + type_overrides.chunk_offset += type_overrides.chunk_length; + } + + auto chunked_array = make_shared( move( chunks ) ); + + return chunked_array; +} + } // namespace arrowkdb } // namespace kx @@ -612,20 +1237,20 @@ K prettyPrintArray(K datatype_id, K array, K options) if (datatype_id->t != -KI) return krr((S)"datatype_id not -6h"); - auto datatype = kx::arrowkdb::GetDatatypeStore()->Find(datatype_id->i); + auto datatype = GetDatatypeStore()->Find(datatype_id->i); if (!datatype) return krr((S)"datatype not found"); // Parse the options - auto read_options = kx::arrowkdb::KdbOptions(options, kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); + auto read_options = KdbOptions(options, Options::string_options, Options::int_options); // Type mapping overrides - kx::arrowkdb::TypeMappingOverride type_overrides{ read_options }; + TypeMappingOverride type_overrides{ read_options }; - auto arrow_array = kx::arrowkdb::MakeArray(datatype, array, type_overrides); + auto arrow_array = MakeArray(datatype, array, type_overrides); auto options = arrow::PrettyPrintOptions(); - std::string result; - arrow::PrettyPrint(*arrow_array, options, &result); + string result; + PARQUET_THROW_NOT_OK( arrow::PrettyPrint(*arrow_array, options, &result) ); return kp((S)result.c_str()); diff --git a/src/ArrayWriter.h b/src/ArrayWriter.h index 53a9b1b..e73aede 100644 --- a/src/ArrayWriter.h +++ b/src/ArrayWriter.h @@ -29,6 +29,15 @@ void PopulateBuilder(std::shared_ptr datatype, K k_array, arrow */ std::shared_ptr MakeArray(std::shared_ptr datatype, K k_array, TypeMappingOverride& type_overrides); +/** + * @brief Copies and converts a kdb list to an arrow chunked array + * + * @param datatype The datatype to use when creating the arrow array + * @param k_array The kdb list from 
which to source the data + * @return The arrow array +*/ +std::shared_ptr MakeChunkedArray( std::shared_ptr datatype, K k_array, TypeMappingOverride& type_overrides ); + } // namespace arrowkdb } // namespace kx diff --git a/src/FieldStore.cpp b/src/FieldStore.cpp index 53cf6b0..04af9f5 100644 --- a/src/FieldStore.cpp +++ b/src/FieldStore.cpp @@ -108,11 +108,5 @@ K field(K field_name, K datatype_id) if (!datatype) return krr((S)"datatype not found"); - // Converting between kdb nulls are arrow nulls would incur a massive - // performance hit (up to 10x worse with trival datatypes that could otherwise - // be memcpy'ed). Also, not all kdb types have a null value, e.g. KB, KG, KS, - // 0 of KC, 0 of KG, etc. So don't allow fields to be created as nullable - // (other than NA type which is all nulls). - bool nullable = datatype->id() == arrow::Type::NA; - return ki(kx::arrowkdb::GetFieldStore()->Add(arrow::field(kx::arrowkdb::GetKdbString(field_name), datatype, nullable))); -} \ No newline at end of file + return ki(kx::arrowkdb::GetFieldStore()->Add(arrow::field(kx::arrowkdb::GetKdbString(field_name), datatype, true))); +} diff --git a/src/HelperFunctions.cpp b/src/HelperFunctions.cpp index 5ade109..868cf07 100644 --- a/src/HelperFunctions.cpp +++ b/src/HelperFunctions.cpp @@ -148,6 +148,7 @@ const std::string GetKdbString(K str) TypeMappingOverride::TypeMappingOverride(const KdbOptions& options) { options.GetIntOption(Options::DECIMAL128_AS_DOUBLE, decimal128_as_double); + options.GetNullMappingOptions( null_mapping ); } KdbType GetKdbType(std::shared_ptr datatype, TypeMappingOverride& type_overrides) diff --git a/src/HelperFunctions.h b/src/HelperFunctions.h index d6faaef..dd06a86 100644 --- a/src/HelperFunctions.h +++ b/src/HelperFunctions.h @@ -1,6 +1,8 @@ #ifndef __HELPER_FUNCTIONS_H__ #define __HELPER_FUNCTIONS_H__ +#include +#include #include #include @@ -71,6 +73,23 @@ bool IsKdbString(K str); const std::string GetKdbString(K str); +//////////////////// +// FLOATS COMPARE // +//////////////////// + +//! Compares floating point numbers, because of unreliable direct compare +//! @param lhs - left-hand side value +//! @param rhs - right-hand side value +//! @return true if values are nearby +template +inline bool is_equal( T lhs, T rhs ) +{ + static const T epsilon = 2 * std::numeric_limits::epsilon(); + + return (std::isnan(lhs) && std::isnan(rhs)) || (std::fabs(lhs -= rhs) <= epsilon); +} + + ////////////////// // TYPE MAPPING // ////////////////// @@ -80,8 +99,22 @@ typedef signed char KdbType; struct TypeMappingOverride { int64_t decimal128_as_double = 0; + Options::NullMapping null_mapping; + int64_t chunk_offset = 0; + int64_t chunk_length = 0; + TypeMappingOverride(void) {}; TypeMappingOverride(const KdbOptions& options); + + int64_t NumChunks( long long array_length ) { return !chunk_length ? 1 + : array_length / chunk_length + ( array_length % chunk_length ? 1 : 0 ); + } + std::pair GetChunk( long long array_length ){ + int64_t offset = chunk_length ? chunk_offset : 0; + int64_t length = std::min( array_length - offset, chunk_length ? 
chunk_length : array_length ); + + return std::make_pair( offset, length ); + } }; /** @@ -138,6 +171,13 @@ KdbType GetKdbType(std::shared_ptr<arrow::DataType> datatype, TypeMappingOverrid */ std::shared_ptr<arrow::DataType> GetArrowType(K k_array); + +/////////////////////// +// FUNCTION HANDLERS // +/////////////////////// + +typedef KdbType(*GetKdbTypeCommon)(std::shared_ptr<arrow::DataType> datatype, TypeMappingOverride& type_overrides); + } // namespace arrowkdb } // namespace kx diff --git a/src/KdbOptions.cpp b/src/KdbOptions.cpp new file mode 100644 index 0000000..ce6f325 --- /dev/null +++ b/src/KdbOptions.cpp @@ -0,0 +1,276 @@ +#include + +#include "KdbOptions.h" + +namespace{ + +template<arrow::Type::type TypeId> +auto make_handler() +{ + return std::make_pair( TypeId, &kx::arrowkdb::KdbOptions::HandleNullMapping<TypeId> ); +} + +} // namespace + +namespace kx { + +namespace arrowkdb { + +const KdbOptions::NullMappingHandlers KdbOptions::null_mapping_handlers = { + make_handler<arrow::Type::BOOL>() + , make_handler<arrow::Type::UINT8>() + , make_handler<arrow::Type::INT8>() + , make_handler<arrow::Type::UINT16>() + , make_handler<arrow::Type::INT16>() + , make_handler<arrow::Type::UINT32>() + , make_handler<arrow::Type::INT32>() + , make_handler<arrow::Type::UINT64>() + , make_handler<arrow::Type::INT64>() + , make_handler<arrow::Type::HALF_FLOAT>() + , make_handler<arrow::Type::FLOAT>() + , make_handler<arrow::Type::DOUBLE>() + , make_handler<arrow::Type::STRING>() + , make_handler<arrow::Type::LARGE_STRING>() + , make_handler<arrow::Type::BINARY>() + , make_handler<arrow::Type::LARGE_BINARY>() + , make_handler<arrow::Type::FIXED_SIZE_BINARY>() + , make_handler<arrow::Type::DATE32>() + , make_handler<arrow::Type::DATE64>() + , make_handler<arrow::Type::TIMESTAMP>() + , make_handler<arrow::Type::TIME32>() + , make_handler<arrow::Type::TIME64>() + , make_handler<arrow::Type::DECIMAL>() + , make_handler<arrow::Type::DURATION>() + , make_handler<arrow::Type::INTERVAL_MONTHS>() + , make_handler<arrow::Type::INTERVAL_DAY_TIME>() +}; + +KdbOptions::KdbOptions( + K options + , const std::set<std::string>& supported_string_options_ + , const std::set<std::string>& supported_int_options_ + , const std::set<std::string>& supported_dict_options_ ) + : null_mapping_options {0} + , supported_string_options(supported_string_options_) + , supported_int_options(supported_int_options_) + , supported_dict_options( supported_dict_options_ ) + , null_mapping_types { + { arrow::Type::BOOL, arrowkdb::Options::NM_BOOLEAN } + , { arrow::Type::UINT8, arrowkdb::Options::NM_UINT_8 } + , { arrow::Type::INT8, arrowkdb::Options::NM_INT_8 } + , { arrow::Type::UINT16, arrowkdb::Options::NM_UINT_16 } + , { arrow::Type::INT16, arrowkdb::Options::NM_INT_16 } + , { arrow::Type::UINT32, arrowkdb::Options::NM_UINT_32 } + , { arrow::Type::INT32, arrowkdb::Options::NM_INT_32 } + , { arrow::Type::UINT64, arrowkdb::Options::NM_UINT_64 } + , { arrow::Type::INT64, arrowkdb::Options::NM_INT_64 } + , { arrow::Type::HALF_FLOAT, arrowkdb::Options::NM_FLOAT_16 } + , { arrow::Type::FLOAT, arrowkdb::Options::NM_FLOAT_32 } + , { arrow::Type::DOUBLE, arrowkdb::Options::NM_FLOAT_64 } + , { arrow::Type::STRING, arrowkdb::Options::NM_STRING } + , { arrow::Type::LARGE_STRING, arrowkdb::Options::NM_LARGE_STRING } + , { arrow::Type::BINARY, arrowkdb::Options::NM_BINARY } + , { arrow::Type::LARGE_BINARY, arrowkdb::Options::NM_LARGE_BINARY } + , { arrow::Type::FIXED_SIZE_BINARY, arrowkdb::Options::NM_FIXED_BINARY } + , { arrow::Type::DATE32, arrowkdb::Options::NM_DATE_32 } + , { arrow::Type::DATE64, arrowkdb::Options::NM_DATE_64 } + , { arrow::Type::TIMESTAMP, arrowkdb::Options::NM_TIMESTAMP } + , { arrow::Type::TIME32, arrowkdb::Options::NM_TIME_32 } + , { arrow::Type::TIME64, arrowkdb::Options::NM_TIME_64 } + , { arrow::Type::DECIMAL, arrowkdb::Options::NM_DECIMAL } + , { arrow::Type::DURATION, arrowkdb::Options::NM_DURATION } + , { arrow::Type::INTERVAL_MONTHS, arrowkdb::Options::NM_MONTH_INTERVAL } + , { arrow::Type::INTERVAL_DAY_TIME, arrowkdb::Options::NM_DAY_TIME_INTERVAL } } +{ + std::transform( + null_mapping_types.begin() + , null_mapping_types.end() + , std::inserter( supported_null_mapping_options, end( 
supported_null_mapping_options ) + , []( const auto& value ){ + return value.second; + } ); + if (options != NULL && options->t != 101) { + if (options->t != 99) + throw InvalidOption("options not -99h"); + K keys = kK(options)[0]; + if (keys->t != KS) + throw InvalidOption("options keys not 11h"); + K values = kK(options)[1]; + switch (values->t) { + case KJ: + PopulateIntOptions(keys, values); + break; + case KS: + PopulateStringOptions(keys, values); + break; + case XD: + PopulateDictOptions(keys, values); + break; + case 0: + PopulateMixedOptions(keys, values); + break; + default: + throw InvalidOption("options values not 7|11|99|0h"); + } + } +} + +const std::string KdbOptions::ToUpper(std::string str) const +{ + std::string upper; + for (auto i : str) + upper.push_back((unsigned char)std::toupper(i)); + return upper; +} + +const std::string KdbOptions::ToLower( std::string str ) const +{ + std::transform( str.begin(), str.end(), str.begin(), ::tolower ); + + return str; +} + +void KdbOptions::PopulateIntOptions(K keys, K values) +{ + for (auto i = 0ll; i < values->n; ++i) { + const std::string key = ToUpper(kS(keys)[i]); + if (supported_int_options.find(key) == supported_int_options.end()) + throw InvalidOption(("Unsupported int option '" + key + "'").c_str()); + int_options[key] = kJ(values)[i]; + } +} + +void KdbOptions::PopulateStringOptions(K keys, K values) +{ + for (auto i = 0ll; i < values->n; ++i) { + const std::string key = ToUpper(kS(keys)[i]); + if (supported_string_options.find(key) == supported_string_options.end()) + throw InvalidOption(("Unsupported string option '" + key + "'").c_str()); + string_options[key] = ToUpper(kS(values)[i]); + } +} + +void KdbOptions::PopulateNullMappingOptions( long long index, K dict ) +{ + K keys = kK( kK( dict )[index] )[0]; + K values = kK( kK( dict )[index] )[1]; + if( KS != keys->t ){ + throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING keys (expected=11h), type=" + std::to_string( keys->t ) + "h" ); + } + if( 0 != values->t ){ + throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING values (expected=0h), type=" + std::to_string( values->t ) + "h" ); + } + for( auto i = 0ll; i < values->n; ++i ){ + const std::string key = ToLower( kS( keys )[i] ); + if( supported_null_mapping_options.find( key ) == supported_null_mapping_options.end() ){ + throw InvalidOption( "Unsupported NULL_MAPPING option '" + key + "'" ); + } + K value = kK( values )[i]; + auto option = GetNullMappingType( key ); + auto it = null_mapping_handlers.find( option ); + if( it != null_mapping_handlers.end() ){ + ( this->*it->second )( key, value ); + } + else if( 101 == value->t ){ + // Ignore a generic null, which may be used here to keep the options a mixed list + } + else{ + throw InvalidOption( "Unhandled NULL_MAPPING option '" + key + "', type=" + std::to_string( value->t ) + "h" ); + } + } +} + +void KdbOptions::PopulateDictOptions( K keys, K values ) +{ + for( auto i = 0ll; i < values->n; ++i ) { + const std::string key = ToUpper( kS( keys )[i] ); + if( supported_dict_options.find( key ) == supported_dict_options.end() ){ + throw InvalidOption(("Unsupported dict option '" + key + "'").c_str()); + } + if( Options::NULL_MAPPING == key ) + { + PopulateNullMappingOptions( i, values ); + } + } +} + +void KdbOptions::PopulateMixedOptions(K keys, K values) +{ + for (auto i = 0ll; i < values->n; ++i) { + const std::string key = ToUpper(kS(keys)[i]); + K value = kK(values)[i]; + switch (value->t) { + case -KJ: + if (supported_int_options.find(key) == 
supported_int_options.end()) + throw InvalidOption(("Unsupported int option '" + key + "'").c_str()); + int_options[key] = value->j; + break; + case -KS: + if (supported_string_options.find(key) == supported_string_options.end()) + throw InvalidOption(("Unsupported string option '" + key + "'").c_str()); + string_options[key] = ToUpper(value->s); + break; + case KC: + { + if (supported_string_options.find(key) == supported_string_options.end()) + throw InvalidOption(("Unsupported string option '" + key + "'").c_str()); + string_options[key] = ToUpper(std::string((char*)kG(value), value->n)); + break; + } + case XD: + { + if( supported_dict_options.find( key ) == supported_dict_options.end() ){ + throw InvalidOption(("Unsupported dict option '" + key + "'").c_str()); + } + if( Options::NULL_MAPPING == key ) + { + PopulateNullMappingOptions( i, values ); + } + break; + } + case 101: + // Ignore :: + break; + default: + throw InvalidOption(("option '" + key + "' value not -7|-11|10h").c_str()); + } + } +} + +arrow::Type::type KdbOptions::GetNullMappingType( const std::string& option ) +{ + auto it = std::find_if( null_mapping_types.begin(), null_mapping_types.end(), [&option]( const auto& value ){ + return option == value.second; + } ); + if( it != null_mapping_types.end() ){ + return it->first; + } + + return arrow::Type::NA; +} + +bool KdbOptions::GetStringOption(const std::string key, std::string& result) const +{ + const auto it = string_options.find(key); + if (it == string_options.end()) + return false; + else { + result = it->second; + return true; + } +} + +bool KdbOptions::GetIntOption(const std::string key, int64_t& result) const +{ + const auto it = int_options.find(key); + if (it == int_options.end()) + return false; + else { + result = it->second; + return true; + } +} + +} // namespace arrowkdb + +} // kx diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 1af50a8..80254b6 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -3,12 +3,14 @@ #include #include +#include #include #include #include +#include #include "k.h" - +#include namespace kx { namespace arrowkdb { @@ -17,101 +19,168 @@ namespace arrowkdb { namespace Options { // Int options + const std::string ARROW_CHUNK_ROWS = "ARROW_CHUNK_ROWS"; const std::string PARQUET_CHUNK_SIZE = "PARQUET_CHUNK_SIZE"; const std::string PARQUET_MULTITHREADED_READ = "PARQUET_MULTITHREADED_READ"; const std::string USE_MMAP = "USE_MMAP"; const std::string DECIMAL128_AS_DOUBLE = "DECIMAL128_AS_DOUBLE"; + const std::string WITH_NULL_BITMAP = "WITH_NULL_BITMAP"; // String options const std::string PARQUET_VERSION = "PARQUET_VERSION"; + // Dict options + const std::string NULL_MAPPING = "NULL_MAPPING"; + + // Null mapping options + const std::string NM_BOOLEAN = "bool"; + const std::string NM_UINT_8 = "uint8"; + const std::string NM_INT_8 = "int8"; + const std::string NM_UINT_16 = "uint16"; + const std::string NM_INT_16 = "int16"; + const std::string NM_UINT_32 = "uint32"; + const std::string NM_INT_32 = "int32"; + const std::string NM_UINT_64 = "uint64"; + const std::string NM_INT_64 = "int64"; + const std::string NM_FLOAT_16 = "float16"; + const std::string NM_FLOAT_32 = "float32"; + const std::string NM_FLOAT_64 = "float64"; + const std::string NM_STRING = "utf8"; + const std::string NM_LARGE_STRING = "large_utf8"; + const std::string NM_BINARY = "binary"; + const std::string NM_LARGE_BINARY = "large_binary"; + const std::string NM_FIXED_BINARY = "fixed_size_binary"; + const std::string NM_DATE_32 = "date32"; + const std::string 
NM_DATE_64 = "date64"; + const std::string NM_TIMESTAMP = "timestamp"; + const std::string NM_TIME_32 = "time32"; + const std::string NM_TIME_64 = "time64"; + const std::string NM_DECIMAL = "decimal"; + const std::string NM_DURATION = "duration"; + const std::string NM_MONTH_INTERVAL = "month_interval"; + const std::string NM_DAY_TIME_INTERVAL = "day_time_interval"; + const static std::set int_options = { + ARROW_CHUNK_ROWS, PARQUET_CHUNK_SIZE, PARQUET_MULTITHREADED_READ, USE_MMAP, DECIMAL128_AS_DOUBLE, + WITH_NULL_BITMAP }; const static std::set string_options = { PARQUET_VERSION, }; -} + const static std::set dict_options = { + NULL_MAPPING, + }; + + struct NullMapping + { + bool have_boolean; + bool have_uint8; + bool have_int8; + bool have_uint16; + bool have_int16; + bool have_uint32; + bool have_int32; + bool have_uint64; + bool have_int64; + bool have_float16; + bool have_float32; + bool have_float64; + bool have_string; + bool have_large_string; + bool have_binary; + bool have_large_binary; + bool have_fixed_binary; + bool have_date32; + bool have_date64; + bool have_timestamp; + bool have_time32; + bool have_time64; + bool have_decimal; + bool have_duration; + bool have_month_interval; + bool have_day_time_interval; + + using Binary = std::basic_string; + + bool boolean_null; + + uint8_t uint8_null; + int8_t int8_null; + + uint16_t uint16_null; + int16_t int16_null; + uint32_t uint32_null; + int32_t int32_null; + + uint64_t uint64_null; + int64_t int64_null; + + uint16_t float16_null; + float float32_null; + double float64_null; + + std::string string_null; + std::string large_string_null; + Binary binary_null; + Binary large_binary_null; + Binary fixed_binary_null; + + int32_t date32_null; + int64_t date64_null; + int64_t timestamp_null; + int32_t time32_null; + int64_t time64_null; + double decimal_null; + int64_t duration_null; + int32_t month_interval_null; + int64_t day_time_interval_null; + }; + +} // namespace Options // Helper class for reading dictionary of options // // Dictionary key: KS // Dictionary value: KS or // KJ or -// 0 of -KS|-KJ|KC +// XD or +// 0 of -KS|-KJ|XD|KC class KdbOptions { private: + Options::NullMapping null_mapping_options; std::map string_options; std::map int_options; const std::set& supported_string_options; const std::set& supported_int_options; + const std::set& supported_dict_options; + std::set supported_null_mapping_options; + using NullMappingHandler = void ( KdbOptions::* )( const std::string&, K ); + using NullMappingHandlers = std::unordered_map; + const std::unordered_map null_mapping_types; + + static const NullMappingHandlers null_mapping_handlers; private: - const std::string ToUpper(std::string str) const - { - std::string upper; - for (auto i : str) - upper.push_back((unsigned char)std::toupper(i)); - return upper; - } + const std::string ToUpper(std::string str) const; - void PopulateIntOptions(K keys, K values) - { - for (auto i = 0ll; i < values->n; ++i) { - const std::string key = ToUpper(kS(keys)[i]); - if (supported_int_options.find(key) == supported_int_options.end()) - throw InvalidOption(("Unsupported int option '" + key + "'").c_str()); - int_options[key] = kJ(values)[i]; - } - } + const std::string ToLower( std::string str ) const; - void PopulateStringOptions(K keys, K values) - { - for (auto i = 0ll; i < values->n; ++i) { - const std::string key = ToUpper(kS(keys)[i]); - if (supported_string_options.find(key) == supported_string_options.end()) - throw InvalidOption(("Unsupported string option '" + key + 
"'").c_str()); - string_options[key] = ToUpper(kS(values)[i]); - } - } + void PopulateIntOptions(K keys, K values); - void PopulateMixedOptions(K keys, K values) - { - for (auto i = 0ll; i < values->n; ++i) { - const std::string key = ToUpper(kS(keys)[i]); - K value = kK(values)[i]; - switch (value->t) { - case -KJ: - if (supported_int_options.find(key) == supported_int_options.end()) - throw InvalidOption(("Unsupported int option '" + key + "'").c_str()); - int_options[key] = value->j; - break; - case -KS: - if (supported_string_options.find(key) == supported_string_options.end()) - throw InvalidOption(("Unsupported string option '" + key + "'").c_str()); - string_options[key] = ToUpper(value->s); - break; - case KC: - { - if (supported_string_options.find(key) == supported_string_options.end()) - throw InvalidOption(("Unsupported string option '" + key + "'").c_str()); - string_options[key] = ToUpper(std::string((char*)kG(value), value->n)); - break; - } - case 101: - // Ignore :: - break; - default: - throw InvalidOption(("option '" + key + "' value not -7|-11|10h").c_str()); - } - } - } + void PopulateStringOptions(K keys, K values); + + void PopulateNullMappingOptions( long long index, K dict ); + + void PopulateDictOptions( K keys, K values ); + + void PopulateMixedOptions(K keys, K values); public: class InvalidOption : public std::invalid_argument @@ -121,55 +190,355 @@ class KdbOptions {}; }; - KdbOptions(K options, const std::set supported_string_options_, const std::set supported_int_options_) : - supported_string_options(supported_string_options_), supported_int_options(supported_int_options_) - { - if (options != NULL && options->t != 101) { - if (options->t != 99) - throw InvalidOption("options not -99h"); - K keys = kK(options)[0]; - if (keys->t != KS) - throw InvalidOption("options keys not 11h"); - K values = kK(options)[1]; - switch (values->t) { - case KJ: - PopulateIntOptions(keys, values); - break; - case KS: - PopulateStringOptions(keys, values); - break; - case 0: - PopulateMixedOptions(keys, values); - break; - default: - throw InvalidOption("options values not 7|11|0h"); - } - } - } - - bool GetStringOption(const std::string key, std::string& result) const - { - const auto it = string_options.find(key); - if (it == string_options.end()) - return false; - else { - result = it->second; - return true; - } - } + KdbOptions( + K options + , const std::set& supported_string_options_ + , const std::set& supported_int_options_ + , const std::set& supported_dict_options_ = Options::dict_options ); - bool GetIntOption(const std::string key, int64_t& result) const - { - const auto it = int_options.find(key); - if (it == int_options.end()) - return false; - else { - result = it->second; - return true; - } + template + inline void HandleNullMapping( const std::string& key, K value ); + + arrow::Type::type GetNullMappingType( const std::string& option ); + + void GetNullMappingOptions( Options::NullMapping& null_mapping ) const{ + null_mapping = null_mapping_options; } + + bool GetStringOption(const std::string key, std::string& result) const; + + bool GetIntOption(const std::string key, int64_t& result) const; }; +inline void null_mapping_error( const std::string& key, K value ) +{ + std::string message = std::string( "Unsupported KDB data type for NULL_MAPPING option '") + .append( key ) + .append( "', type=" ) + .append( std::to_string( value->t ) ) + .append( "h" ); + + throw KdbOptions::InvalidOption( message ); +} + +template<> +inline void KdbOptions::HandleNullMapping( 
const std::string& key, K value ) +{ + if( value->t == -KB || value->t == -KG ){ + null_mapping_options.boolean_null = value->g; + null_mapping_options.have_boolean = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) +{ + if( -KG == value->t ){ + null_mapping_options.uint8_null = static_cast( value->g ); + null_mapping_options.have_uint8 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) +{ + if( -KG == value->t ){ + null_mapping_options.int8_null = value->g; + null_mapping_options.have_int8 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) +{ + if( -KH == value->t ){ + null_mapping_options.uint16_null = static_cast( value->h ); + null_mapping_options.have_uint16 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) +{ + if( -KH == value->t ){ + null_mapping_options.int16_null = value->h; + null_mapping_options.have_int16 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) +{ + if( -KI == value->t ){ + null_mapping_options.uint32_null = static_cast( value->i ); + null_mapping_options.have_uint32 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) +{ + if( -KI == value->t ){ + null_mapping_options.int32_null = value->i; + null_mapping_options.have_int32 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) +{ + if( -KJ == value->t ){ + null_mapping_options.uint64_null = static_cast( value->j ); + null_mapping_options.have_uint64 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) +{ + if( -KJ == value->t ){ + null_mapping_options.int64_null = value->j; + null_mapping_options.have_int64 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) +{ + if( -KH == value->t ){ + null_mapping_options.float16_null = static_cast( value->h ); + null_mapping_options.have_float16 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) +{ + if( -KE == value->t ){ + null_mapping_options.float32_null = value->e; + null_mapping_options.have_float32 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) +{ + if( -KF == value->t ){ + null_mapping_options.float64_null = value->f; + null_mapping_options.have_float64 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) +{ + if( KC == value->t ){ + null_mapping_options.string_null.assign( ( char* )kC( value ), value->n ); + null_mapping_options.have_string = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void 
KdbOptions::HandleNullMapping( const std::string& key, K value ) +{ + if( KC == value->t ){ + null_mapping_options.large_string_null.assign( ( char* )kC( value ), value->n ); + null_mapping_options.have_large_string = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) +{ + if( value->t == KG || value->t == KC ){ + null_mapping_options.binary_null.assign( kG( value ), value->n ); + null_mapping_options.have_binary = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) +{ + if( value->t == KG || value->t == KC ){ + null_mapping_options.large_binary_null.assign( kG( value ), value->n ); + null_mapping_options.have_large_binary = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) +{ + switch( value->t ){ + case -UU: + null_mapping_options.fixed_binary_null.assign( &kU( value )->g[0], sizeof( U ) ); + null_mapping_options.have_fixed_binary = true; + break; + case KG: + case KC: + null_mapping_options.fixed_binary_null.assign( kG( value ), value->n ); + null_mapping_options.have_fixed_binary = true; + break; + default: + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) +{ + if( value->t == -KD ){ + null_mapping_options.date32_null = value->i; + null_mapping_options.have_date32 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) +{ + if( value->t == -KP ){ + null_mapping_options.date64_null = value->j; + null_mapping_options.have_date64 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) +{ + if( value->t == -KP ){ + null_mapping_options.timestamp_null = value->j; + null_mapping_options.have_timestamp = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) +{ + if( value->t == -KT ){ + null_mapping_options.time32_null = value->i; + null_mapping_options.have_time32 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) +{ + if( value->t == -KN ){ + null_mapping_options.time64_null = value->j; + null_mapping_options.have_time64 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) +{ + if( -KF == value->t ){ + null_mapping_options.decimal_null = value->f; + null_mapping_options.have_decimal = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) +{ + if( value->t == -KN ){ + null_mapping_options.duration_null = value->j; + null_mapping_options.have_duration = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) +{ + if( value->t == -KM ){ + null_mapping_options.month_interval_null = value->i; + null_mapping_options.have_month_interval = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void 
KdbOptions::HandleNullMapping( const std::string& key, K value ) +{ + if( value->t == -KN ){ + null_mapping_options.day_time_interval_null = value->j; + null_mapping_options.have_day_time_interval = true; + } + else{ + null_mapping_error( key, value ); + } +} + } // namespace arrowkdb } // namespace kx diff --git a/src/SchemaStore.cpp b/src/SchemaStore.cpp index b95a1b9..03635f3 100644 --- a/src/SchemaStore.cpp +++ b/src/SchemaStore.cpp @@ -143,19 +143,13 @@ K inferSchema(K table) // Determine the arrow datatype for each data set K k_array_data = kK(dict)[1]; - assert(k_array_data->n == field_names.size()); + assert(static_cast( k_array_data->n ) == field_names.size()); arrow::FieldVector fields; for (auto i = 0ul; i < field_names.size(); ++i) { auto datatype = kx::arrowkdb::GetArrowType(kK(k_array_data)[i]); // Construct each arrow field - // Converting between kdb nulls are arrow nulls would incur a massive - // performance hit (up to 10x worse with trival datatypes that could otherwise - // be memcpy'ed). Also, not all kdb types have a null value, e.g. KB, KG, KS, - // 0 of KC, 0 of KG, etc. So don't allow fields to be created as nullable - // (other than NA type which is all nulls). - bool nullable = datatype->id() == arrow::Type::NA; - fields.push_back(arrow::field(field_names[i], datatype, nullable)); + fields.push_back(arrow::field(field_names[i], datatype, true)); } // Create the schema with these fields diff --git a/src/TableData.cpp b/src/TableData.cpp index 0deeb88..b9ba2e9 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -63,6 +63,31 @@ std::vector> MakeArrays(std::shared_ptr> MakeChunkedArrays( + std::shared_ptr schema + , K array_data + , kx::arrowkdb::TypeMappingOverride& type_overrides ) +{ + if( array_data->t != 0 ) + throw kx::arrowkdb::TypeCheck( "array_data not mixed list" ); + if( array_data->n < schema->num_fields() ) + throw kx::arrowkdb::TypeCheck( "array_data length less than number of schema fields" ); + std::vector> chunked_arrays; + if( array_data->t == 0 && array_data->n == 0 ){ + // Empty table + } + else{ + // Only count up to the number of schema fields. 
Additional trailing data + // in the kdb mixed list is ignored (to allow for ::) + for( auto i = 0; i < schema->num_fields(); ++i ){ + auto k_array = kK( array_data )[i]; + chunked_arrays.push_back( kx::arrowkdb::MakeChunkedArray( schema->field(i)->type(), k_array, type_overrides ) ); + } + } + + return chunked_arrays; +} + // Create a an arrow table from the arrow schema and mixed list of kdb array objects std::shared_ptr MakeTable(std::shared_ptr schema, K array_data, kx::arrowkdb::TypeMappingOverride& type_overrides) { @@ -294,6 +319,20 @@ K readParquetData(K parquet_file, K options) kK(data)[i] = kx::arrowkdb::ReadChunkedArray(chunked_array, type_overrides); } + int64_t with_null_bitmap = 0; + read_options.GetIntOption( kx::arrowkdb::Options::WITH_NULL_BITMAP, with_null_bitmap ); + if( with_null_bitmap ){ + K bitmap = ktn( 0, col_num ); + for( auto i = 0; i < col_num; ++i ){ + auto chunked_array = table->column( i ); + kK( bitmap )[i] = kx::arrowkdb::ReadChunkedArrayNullBitmap( chunked_array, type_overrides ); + } + K array = data; + data = ktn( 0, 2 ); + kK( data )[0] = array; + kK( data )[1] = bitmap; + } + return data; KDB_EXCEPTION_CATCH; @@ -407,6 +446,20 @@ K readParquetRowGroups(K parquet_file, K row_groups, K columns, K options) kK(data)[i] = kx::arrowkdb::ReadChunkedArray(chunked_array, type_overrides); } + int64_t with_null_bitmap = 0; + read_options.GetIntOption(kx::arrowkdb::Options::WITH_NULL_BITMAP, with_null_bitmap); + if (with_null_bitmap) { + K bitmap = ktn(0, col_num); + for (auto i = 0; i < col_num; ++i) { + auto chunked_array = table->column(i); + kK(bitmap)[i] = kx::arrowkdb::ReadChunkedArrayNullBitmap(chunked_array, type_overrides); + } + K array = data; + data = ktn(0, 2); + kK(data)[0] = array; + kK(data)[1] = bitmap; + } + return data; KDB_EXCEPTION_CATCH; @@ -439,19 +492,44 @@ K writeArrow(K arrow_file, K schema_id, K array_data, K options) std::shared_ptr writer; PARQUET_ASSIGN_OR_THROW(writer, arrow::ipc::MakeFileWriter(outfile.get(), schema)); - auto arrays = MakeArrays(schema, array_data, type_overrides); + // Chunk size + read_options.GetIntOption( kx::arrowkdb::Options::ARROW_CHUNK_ROWS, type_overrides.chunk_length ); + + auto check_length = []( const auto& arrays ) -> int64_t { + // Check all arrays are same length + int64_t len = -1; + for (auto i : arrays) { + if (len == -1) + len = i->length(); + else if (len != i->length()) + return -1l; + } - // Check all arrays are same length - int64_t len = -1; - for (auto i : arrays) { - if (len == -1) - len = i->length(); - else if (len != i->length()) + return len; + }; + + if( !type_overrides.chunk_length ){ // arrow not chunked + auto arrays = MakeArrays(schema, array_data, type_overrides); + + auto len = check_length( arrays ); + if( len < 0 ){ return krr((S)"unequal length arrays"); + } + + auto batch = arrow::RecordBatch::Make(schema, len, arrays); + PARQUET_THROW_NOT_OK(writer->WriteRecordBatch(*batch)); } + else{ + auto chunked_arrays = MakeChunkedArrays( schema, array_data, type_overrides ); - auto batch = arrow::RecordBatch::Make(schema, len, arrays); - PARQUET_THROW_NOT_OK(writer->WriteRecordBatch(*batch)); + auto len = check_length( chunked_arrays ); + if( len < 0 ){ + return krr((S)"unequal length arrays"); + } + + auto table = arrow::Table::Make( schema, chunked_arrays ); + PARQUET_THROW_NOT_OK( writer->WriteTable( *table ) ); + } PARQUET_THROW_NOT_OK(writer->Close()); @@ -547,6 +625,23 @@ K readArrowData(K arrow_file, K options) kK(data)[i] = kx::arrowkdb::ReadChunkedArray(chunked_array, 
type_overrides); } + int64_t with_null_bitmap = 0; + read_options.GetIntOption( kx::arrowkdb::Options::WITH_NULL_BITMAP, with_null_bitmap ); + if( with_null_bitmap ){ + K bitmap = ktn( 0, col_num ); + for( auto i = 0; i < col_num; ++i ){ + arrow::ArrayVector column_arrays; + for( auto batch: all_batches ) + column_arrays.push_back( batch->column( i ) ); + auto chunked_array = std::make_shared( column_arrays ); + kK( bitmap )[i] = kx::arrowkdb::ReadChunkedArrayNullBitmap( chunked_array, type_overrides ); + } + K array = data; + data = ktn( 0, 2 ); + kK( data )[0] = array; + kK( data )[1] = bitmap; + } + return data; KDB_EXCEPTION_CATCH; @@ -576,19 +671,44 @@ K serializeArrow(K schema_id, K array_data, K options) sink.reset(new arrow::io::BufferOutputStream(buffer)); PARQUET_ASSIGN_OR_THROW(writer, arrow::ipc::MakeStreamWriter(sink.get(), schema)); - auto arrays = MakeArrays(schema, array_data, type_overrides); + // Chunk size + read_options.GetIntOption( kx::arrowkdb::Options::ARROW_CHUNK_ROWS, type_overrides.chunk_length ); + + auto check_length = []( const auto& arrays ) -> int64_t { + // Check all arrays are same length + int64_t len = -1; + for (auto i : arrays) { + if (len == -1) + len = i->length(); + else if (len != i->length()) + return -1l; + } + + return len; + }; - // Check all arrays are same length - int64_t len = -1; - for (auto i : arrays) { - if (len == -1) - len = i->length(); - else if (len != i->length()) + if( !type_overrides.chunk_length ){ // arrow not chunked + auto arrays = MakeArrays(schema, array_data, type_overrides); + + auto len = check_length( arrays ); + if( len < 0 ){ return krr((S)"unequal length arrays"); + } + + auto batch = arrow::RecordBatch::Make(schema, len, arrays); + PARQUET_THROW_NOT_OK(writer->WriteRecordBatch(*batch)); } + else{ + auto chunked_arrays = MakeChunkedArrays( schema, array_data, type_overrides ); + + auto len = check_length( chunked_arrays ); + if( len < 0 ){ + return krr((S)"unequal length arrays"); + } - auto batch = arrow::RecordBatch::Make(schema, len, arrays); - PARQUET_THROW_NOT_OK(writer->WriteRecordBatch(*batch)); + auto table = arrow::Table::Make( schema, chunked_arrays ); + PARQUET_THROW_NOT_OK( writer->WriteTable( *table ) ); + } PARQUET_THROW_NOT_OK(writer->Close()); std::shared_ptr final_buffer; @@ -664,6 +784,23 @@ K parseArrowData(K char_array, K options) kK(data)[i] = kx::arrowkdb::ReadChunkedArray(chunked_array, type_overrides); } + int64_t with_null_bitmap = 0; + read_options.GetIntOption( kx::arrowkdb::Options::WITH_NULL_BITMAP, with_null_bitmap ); + if( with_null_bitmap ){ + K bitmap = ktn( 0, col_num ); + for( auto i = 0; i < col_num; ++i ){ + arrow::ArrayVector column_arrays; + for( auto batch: all_batches ) + column_arrays.push_back( batch->column( i ) ); + auto chunked_array = std::make_shared( column_arrays ); + kK( bitmap )[i] = kx::arrowkdb::ReadChunkedArrayNullBitmap( chunked_array, type_overrides ); + } + K array = data; + data = ktn( 0, 2 ); + kK( data )[0] = array; + kK( data )[1] = bitmap; + } + return data; KDB_EXCEPTION_CATCH; diff --git a/tests/.gitignore b/tests/.gitignore new file mode 100644 index 0000000..5415741 --- /dev/null +++ b/tests/.gitignore @@ -0,0 +1,12 @@ +basic.q +crucial_null_bitmap.q +glossary_null_bitmap.q +nested_null_bitmap.q +union_null_bitmap.q +null_mapping_short.q +null_mapping_long.q +null_mapping_float.q +null_mapping_str.q +null_mapping_time.q +null_mapping_extra.q +null_mapping_other.q diff --git a/tests/test.t b/tests/basic.t similarity index 100% rename from tests/test.t 
rename to tests/basic.t diff --git a/tests/null_bitmap/crucial_null_bitmap.t b/tests/null_bitmap/crucial_null_bitmap.t new file mode 100644 index 0000000..8443408 --- /dev/null +++ b/tests/null_bitmap/crucial_null_bitmap.t @@ -0,0 +1,114 @@ +// crucial_null_bitmap.t + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l q/arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping ||----------+\n"; +crucial_opts:(`bool`int32`float64`utf8`date32)!(0b;1i;2.34;"start";2006.07.21); + +crucial_options:(``NULL_MAPPING)!((::);crucial_opts); + +N:5 + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +bool_dt:.arrowkdb.dt.boolean[]; +i32_dt:.arrowkdb.dt.int32[]; +f64_dt:.arrowkdb.dt.float64[]; +str_dt:.arrowkdb.dt.utf8[]; +d32_dt:.arrowkdb.dt.date32[]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +bool_fd:.arrowkdb.fd.field[`bool;bool_dt]; +i32_fd:.arrowkdb.fd.field[`int32;i32_dt]; +f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; +str_fd:.arrowkdb.fd.field[`string;str_dt]; +d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; + +-1"\n+----------|| Create the schemas for the list of fields ||----------+\n"; +crucial_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,i32_fd,f64_fd,str_fd,d32_fd)]; + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +bool_data:N?(0b;1b); +bool_data[0]:0b; +i32_data:N?100i; +i32_data[1]:1i; +f64_data:N?100f; +f64_data[2]:2.34f; +str_data:N?("start";"stop";"alert";"acknowledge";""); +str_data[3]:"start" +d32_data:N?(2006.07.21;2005.07.18;2004.07.16;2003.07.15;2002.07.11); +d32_data[4]:2006.07.21; + +-1"\n+----------|| Combine the data for all columns ||----------+\n"; +crucial_data:(ts_data;bool_data;i32_data;f64_data;str_data;d32_data); + +-1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n"; +crucial_options[`PARQUET_VERSION]:`V2.0; + +parquet_crucial_bitmap:"null_bitmap.parquet"; +.arrowkdb.pq.writeParquet[parquet_crucial_bitmap;crucial_schema;crucial_data;crucial_options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +parquet_crucial_schema:.arrowkdb.pq.readParquetSchema[parquet_crucial_bitmap]; +.arrowkdb.sc.equalSchemas[crucial_schema;parquet_crucial_schema] +crucial_schema~parquet_crucial_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +crucial_options[`WITH_NULL_BITMAP]:1; +parquet_crucial_data:.arrowkdb.pq.readParquetData[parquet_crucial_bitmap;crucial_options]; +crucial_data~first parquet_crucial_data + +nulls_data:1b,(N-1)?1b; +crucial_nulls:{x rotate nulls_data} each neg til {x-1} count crucial_data; +parquet_crucial_nulls:last parquet_crucial_data; +crucial_nulls~crucial_nulls & sublist[{1-x} count parquet_crucial_nulls;parquet_crucial_nulls] +rm parquet_crucial_bitmap; + +-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; +arrow_crucial_bitmap:"null_bitmap.arrow"; +.arrowkdb.ipc.writeArrow[arrow_crucial_bitmap;crucial_schema;crucial_data;crucial_options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +arrow_crucial_schema:.arrowkdb.ipc.readArrowSchema[arrow_crucial_bitmap]; +.arrowkdb.sc.equalSchemas[crucial_schema;arrow_crucial_schema] +crucial_schema~arrow_crucial_schema + 
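Since the data values and the null bitmap returned under `WITH_NULL_BITMAP` share the same shape, a caller can overlay kdb nulls column by column with q's vector conditional. A minimal sketch with hypothetical values (`vals` and `nulls` are illustrative names, not part of this test):

```q
q)vals:0 1 1 2 1i      / an int32 column as returned by the reader
q)nulls:10010b         / its null bitmap: 1 flags an arrow null
q)?[nulls;0Ni;vals]    / overlay kdb nulls where the bitmap is set
0N 1 1 0N 1i
```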
+-1"\n+----------|| Read the array data back and compare ||----------+\n"; +arrow_crucial_data:.arrowkdb.ipc.readArrowData[arrow_crucial_bitmap;crucial_options]; +crucial_data~first arrow_crucial_data +arrow_crucial_nulls:last arrow_crucial_data; +crucial_nulls~crucial_nulls & sublist[{1-x} count arrow_crucial_nulls;arrow_crucial_nulls] +rm arrow_crucial_bitmap; + +-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; +serialized_bitmap:.arrowkdb.ipc.serializeArrow[crucial_schema;crucial_data;crucial_options]; + +-1"\n+----------|| Parse the schema back abd compare ||----------+\n"; +stream_crucial_schema:.arrowkdb.ipc.parseArrowSchema[serialized_bitmap]; +.arrowkdb.sc.equalSchemas[crucial_schema;stream_crucial_schema] +crucial_schema~stream_crucial_schema + +-1"\n+----------|| Parse the array data back and compare ||----------+\n"; +stream_crucial_data:.arrowkdb.ipc.parseArrowData[serialized_bitmap;crucial_options]; +crucial_data~first stream_crucial_data + +stream_crucial_nulls:last stream_crucial_data; +crucial_nulls~crucial_nulls & sublist[{1-x} count stream_crucial_nulls;stream_crucial_nulls] + + +-1 "\n+----------|| Test utils ||----------+\n"; + +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; diff --git a/tests/null_bitmap/formation_null_bitmap.t b/tests/null_bitmap/formation_null_bitmap.t new file mode 100644 index 0000000..a3f7706 --- /dev/null +++ b/tests/null_bitmap/formation_null_bitmap.t @@ -0,0 +1,144 @@ +// nested_null_bitmap.t + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l q/arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping ||----------+\n"; +nested_opts:(`uint16`float32`binary`time64)!(9h;8.76e;"x"$"acknowledge";00:00:00.123456789); + +nested_options:(``NULL_MAPPING)!((::);nested_opts); + +N:5 + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +ui16_dt:.arrowkdb.dt.uint16[]; + +f32_dt:.arrowkdb.dt.float32[]; +bin_dt:.arrowkdb.dt.binary[]; +t64_dt:.arrowkdb.dt.time64[`nano]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +ui16_fd:.arrowkdb.fd.field[`uint16;ui16_dt]; + +f32_fd:.arrowkdb.fd.field[`float32;f32_dt]; +bin_fd:.arrowkdb.fd.field[`binary;bin_dt]; +t64_fd:.arrowkdb.fd.field[`time64;t64_dt]; + +-1"\n+----------|| Create a list datatype, using the uint16 datatype as its child ||----------+\n"; +list_dt:.arrowkdb.dt.list[ui16_dt]; + +-1"\n+----------|| Create a field containing the list datatype ||----------+\n"; +list_fd:.arrowkdb.fd.field[`list_field;list_dt]; + +-1"\n+----------|| Create a struct datatype using the float32, binary and time64 fields as its children ||----------+\n"; +struct_dt:.arrowkdb.dt.struct[(f32_fd,bin_dt,t64_dt)]; + +-1"\n+----------|| Create a field containing the struct datatype ||----------+\n"; +struct_fd:.arrowkdb.fd.field[`struct_field;struct_dt]; + +-1"\n+----------|| Create the schema containing the list and struct fields ||----------+\n"; +nested_schema:.arrowkdb.sc.schema[(list_fd,struct_fd)]; + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +-1"\n+----------|| Create the data for each of the struct child fields ||----------+\n"; 
+f32_data:N?100e; +f32_data[0]:8.76e; +bin_data:N?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); +bin_data[1]:"x"$"acknowledge" +t64_data:N?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); +t64_data[2]:00:00:00.123456789; + +-1"\n+----------|| Create the data for the list array ||----------+\n"; +list_data:(enlist 9h;(8h;7h);(6h;5h;4h);(1h;2h;3h;4h);(5h;6h;7h;8h;9h)); + +-1"\n+----------|| Create the data for the struct array from its child arrays ||----------+\n"; +struct_data:(f32_data;bin_data;t64_data); + +-1"\n+----------|| Combine the array data for the list and struct columns ||----------+\n"; +nested_data:(list_data;struct_data); + +-1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n"; +nested_options[`PARQUET_VERSION]:`V2.0; + +parquet_nested_bitmap:"nested_bitmap.parquet"; +.arrowkdb.pq.writeParquet[parquet_nested_bitmap;nested_schema;nested_data;nested_options]; + +-1"\n+----------|| Read the array back and compare ||----------+\n"; +nested_options[`WITH_NULL_BITMAP]:1; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +parquet_nested_schema:.arrowkdb.pq.readParquetSchema[parquet_nested_bitmap]; +.arrowkdb.sc.equalSchemas[nested_schema;parquet_nested_schema] +nested_schema~parquet_nested_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +parquet_nested_data:.arrowkdb.pq.readParquetData[parquet_nested_bitmap;nested_options]; +nested_data~first parquet_nested_data + +-1"\n+----------|| Compare nested null bitmaps ||----------+\n"; +nested_list_nulls:(enlist 1b;00b;000b;0000b;00001b) +nested_struct_nulls:(10000b;01000b;00100b) + +parquet_list_nulls:first parquet_nested_data[1] +parquet_struct_nulls:last parquet_nested_data[1] +nested_list_nulls~parquet_list_nulls +nested_struct_nulls~parquet_struct_nulls + +rm parquet_nested_bitmap; + +-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; +arrow_nested_bitmap:"nested_bitmap.arrow"; +.arrowkdb.ipc.writeArrow[arrow_nested_bitmap;nested_schema;nested_data;nested_options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +arrow_nested_schema:.arrowkdb.ipc.readArrowSchema[arrow_nested_bitmap]; +.arrowkdb.sc.equalSchemas[nested_schema;arrow_nested_schema] +nested_schema~arrow_nested_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +arrow_nested_data:.arrowkdb.ipc.readArrowData[arrow_nested_bitmap;nested_options]; +nested_data~first arrow_nested_data + +-1"\n+----------|| Compare nested null bitmaps ||----------+\n"; +arrow_list_nulls:first arrow_nested_data[1] +arrow_struct_nulls:last arrow_nested_data[1] +nested_list_nulls~arrow_list_nulls +nested_struct_nulls~arrow_struct_nulls + +rm arrow_nested_bitmap; + +-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; +serialized_nested_bitmap:.arrowkdb.ipc.serializeArrow[nested_schema;nested_data;nested_options]; + +-1"\n+----------|| Parse the schema back and compare ||----------+\n"; +stream_nested_schema:.arrowkdb.ipc.parseArrowSchema[serialized_nested_bitmap]; +.arrowkdb.sc.equalSchemas[nested_schema;stream_nested_schema] +nested_schema~stream_nested_schema + +-1"\n+----------|| Parse the array data back and compare ||----------+\n"; +stream_nested_data:.arrowkdb.ipc.parseArrowData[serialized_nested_bitmap;nested_options]; +nested_data~first stream_nested_data + +-1"\n+----------|| Compare nested null bitmaps 
||----------+\n"; +stream_list_nulls:first stream_nested_data[1] +stream_struct_nulls:last stream_nested_data[1] +nested_list_nulls~stream_list_nulls +nested_struct_nulls~stream_struct_nulls + + +-1 "\n+----------|| Test utils ||----------+\n"; + +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; diff --git a/tests/null_bitmap/glossary_null_bitmap.t b/tests/null_bitmap/glossary_null_bitmap.t new file mode 100644 index 0000000..8c0eab5 --- /dev/null +++ b/tests/null_bitmap/glossary_null_bitmap.t @@ -0,0 +1,102 @@ +// glossary_null_bitmap.q + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l q/arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping ||----------+\n"; +glossary_opts:(`int64`float64)!(5;2.34); + +glossary_options:(``NULL_MAPPING)!((::);glossary_opts); + +N:5 + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +str_dt:.arrowkdb.dt.utf8[]; +i64_dt:.arrowkdb.dt.int64[]; +f64_dt:.arrowkdb.dt.float64[]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +str_fd:.arrowkdb.fd.field[`string;str_dt]; +i64_fd:.arrowkdb.fd.field[`int64;i64_dt]; +f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; + +-1"\n+----------|| Create a field containing glossary datatypes ||----------+\n"; +dict_dt:.arrowkdb.dt.dictionary[str_dt;i64_dt] +dict_fd:.arrowkdb.fd.field[`dictionary;dict_dt] +map_dt:.arrowkdb.dt.map[i64_dt;f64_dt] +map_fd:.arrowkdb.fd.field[`map;map_dt]; + +-1"\n+----------|| Create the schema containing the large list, dictionary and sparce union fields ||----------+\n"; +glossary_schema:.arrowkdb.sc.schema[(dict_fd, map_fd)]; + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +str_data:N?("start";"stop";"alert";"acknowledge";""); +str_data[0]:"start" +i64_data:N?100i; +i64_data[0]:1i; +f64_data:N?100f; +f64_data[1]:2.34f; + +dict_data:(("aa";"bb";"cc");(2 0 1)) +map_data:((enlist 1)!(enlist 1f);(2 2)!(2 2.34f);(3 3 3)!(3 3 3f)) + +-1"\n+----------|| Combine the array data for the glossary columns ||----------+\n"; +glossary_data:(dict_data;map_data); + +-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; +arrow_glossary_bitmap:"nested_map.arrow"; +.arrowkdb.ipc.writeArrow[arrow_glossary_bitmap;glossary_schema;glossary_data;glossary_options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +glossary_options[`WITH_NULL_BITMAP]:1; + +arrow_glossary_schema:.arrowkdb.ipc.readArrowSchema[arrow_glossary_bitmap]; +.arrowkdb.sc.equalSchemas[glossary_schema;arrow_glossary_schema] +glossary_schema~arrow_glossary_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +arrow_glossary_data:.arrowkdb.ipc.readArrowData[arrow_glossary_bitmap;glossary_options]; +glossary_data~first arrow_glossary_data + +-1"\n+----------|| Compare null bitmaps of arrow data ||----------+\n"; +nested_dict_nulls:(000b;000b); +nested_map_nulls:((enlist 0b)!(enlist 0b);00b!01b;000b!000b) +arrow_glossary_nulls:arrow_glossary_data[1] +nested_dict_nulls~first arrow_glossary_nulls +nested_map_nulls~last arrow_glossary_nulls + +rm arrow_glossary_bitmap; + +-1"\n+----------|| Serialize the schema and array data to an arrow 
stream ||----------+\n"; +serialized_glossary:.arrowkdb.ipc.serializeArrow[glossary_schema;glossary_data;glossary_options]; + +-1"\n+----------|| Parse the schema back abd compare ||----------+\n"; +stream_glossary_schema:.arrowkdb.ipc.parseArrowSchema[serialized_glossary]; +.arrowkdb.sc.equalSchemas[glossary_schema;stream_glossary_schema] +glossary_schema~stream_glossary_schema + +-1"\n+----------|| Parse the array data back and compare ||----------+\n"; +stream_glossary_data:.arrowkdb.ipc.parseArrowData[serialized_glossary;glossary_options]; +glossary_data~first stream_glossary_data + +-1"\n+----------|| Compare null bitmaps of stream data ||----------+\n"; +stream_glossary_nulls:stream_glossary_data[1] +nested_dict_nulls~first arrow_glossary_nulls +nested_map_nulls~last arrow_glossary_nulls + +-1 "\n+----------|| Test utils ||----------+\n"; + +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; diff --git a/tests/null_bitmap/union_null_bitmap.t b/tests/null_bitmap/union_null_bitmap.t new file mode 100644 index 0000000..2812dcf --- /dev/null +++ b/tests/null_bitmap/union_null_bitmap.t @@ -0,0 +1,98 @@ +// union_null_bitmap.q + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l q/arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping ||----------+\n"; +nested_union_opts:(`float64`int64)!(2.34;5); +union_options:(``NULL_MAPPING)!((::);nested_union_opts); +N:5 + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +f64_dt:.arrowkdb.dt.float64[]; +i64_dt:.arrowkdb.dt.int64[]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; +i64_fd:.arrowkdb.fd.field[`int64;i64_dt]; + +-1"\n+----------|| Create fields containing union datatypes ||----------+\n"; +sparse_dt:.arrowkdb.dt.sparse_union[(i64_fd,f64_fd)] +sparse_fd:.arrowkdb.fd.field[`sparse_union;sparse_dt] +dense_dt:.arrowkdb.dt.dense_union[(i64_fd,f64_fd)] +dense_fd:.arrowkdb.fd.field[`dense_union;dense_dt] + +-1"\n+----------|| Create the schema containing the sparce and dense union fields ||----------+\n"; +union_schema:.arrowkdb.sc.schema[(sparse_fd, dense_fd)] + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +f64_data:N?100f; +f64_data[0]:2.34f; +i64_data:N?100h; +i64_data[1]:5h; + +-1"\n+----------|| Create the data the union child fields ||----------+\n"; +i64_data:N?100; +i64_data[0]:1; + +-1"\n+----------|| Combine the array data for the list and struct columns ||----------+\n"; +sparse_data:dense_data:(0 1 0h;5 2 3;4 2.34 6f) +union_data:(sparse_data;dense_data) + +-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; +union_options[`WITH_NULL_BITMAP]:1; +arrow_union_bitmap:"nested_union.arrow"; +.arrowkdb.ipc.writeArrow[arrow_union_bitmap;union_schema;union_data;union_options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +arrow_union_schema:.arrowkdb.ipc.readArrowSchema[arrow_union_bitmap]; +.arrowkdb.sc.equalSchemas[union_schema;arrow_union_schema] +union_schema~arrow_union_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; 
+arrow_union_data:.arrowkdb.ipc.readArrowData[arrow_union_bitmap;union_options]; +union_data~first arrow_union_data + +-1"\n+----------|| Compare null bitmaps of arrow data ||----------+\n"; +nested_union_nulls:((0 1 0h);100b;010b); + +arrow_union_nulls:arrow_union_data[1] +nested_union_nulls~arrow_union_nulls[0] +nested_union_nulls~arrow_union_nulls[1] + +rm arrow_union_bitmap; + +-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; +serialized_nested_union:.arrowkdb.ipc.serializeArrow[union_schema;union_data;union_options]; + +-1"\n+----------|| Parse the schema back and compare ||----------+\n"; +stream_union_schema:.arrowkdb.ipc.parseArrowSchema[serialized_nested_union]; +.arrowkdb.sc.equalSchemas[union_schema;stream_union_schema] +union_schema~stream_union_schema + +-1"\n+----------|| Parse the array data back and compare ||----------+\n"; +stream_union_data:.arrowkdb.ipc.parseArrowData[serialized_nested_union;union_options]; +union_data~first stream_union_data + +-1"\n+----------|| Compare null bitmaps of stream data ||----------+\n"; +stream_union_nulls:stream_union_data[1] +nested_union_nulls~stream_union_nulls[0] +nested_union_nulls~stream_union_nulls[1] + + +-1 "\n+----------|| Test utils ||----------+\n"; + +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; diff --git a/tests/null_mapping/null_mapping_extra.t b/tests/null_mapping/null_mapping_extra.t new file mode 100644 index 0000000..87695d2 --- /dev/null +++ b/tests/null_mapping/null_mapping_extra.t @@ -0,0 +1,84 @@ +// null_mapping_extra.q + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l q/arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping only in arrow ||----------+\n"; +extra_opts:(`float16`large_utf8`large_binary`duration)!(9h;"stop";"x"$"acknowledge";12:00:00.000000000); + +options:(``NULL_MAPPING)!((::);extra_opts); + +N:5 + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +f16_dt:.arrowkdb.dt.float16[]; +lstr_dt:.arrowkdb.dt.large_utf8[]; +lbin_dt:.arrowkdb.dt.large_binary[]; +dur_dt:.arrowkdb.dt.duration[`milli]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +f16_fd:.arrowkdb.fd.field[`float16;f16_dt]; +lstr_fd:.arrowkdb.fd.field[`large_string;lstr_dt]; +lbin_fd:.arrowkdb.fd.field[`large_binary;lbin_dt]; +dur_fd:.arrowkdb.fd.field[`duration;dur_dt]; + +-1"\n+----------|| Create the schemas for the list of fields ||----------+\n"; +extra_schema:.arrowkdb.sc.schema[(ts_fd,f16_fd,lstr_fd,lbin_fd,dur_fd)]; + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +f16_data:N?100h; +f16_data[0]:9h; +lstr_data:N?("start";"stop";"alert";"acknowledge";""); +lstr_data[1]:"stop" +lbin_data:N?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); +lbin_data[3]:"x"$"acknowledge" +dur_data:N?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); +dur_data[4]:12:00:00.000000000; + +-1"\n+----------|| Combine the data for all columns ||----------+\n"; +extra_data:(ts_data;f16_data;lstr_data;lbin_data;dur_data); + +-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; 
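One detail worth noting about the options built above: PopulateNullMappingOptions rejects an inner NULL_MAPPING dictionary whose values are not a mixed list (0h), which these tests satisfy by mapping datatypes whose null atoms have different kdb types. A minimal sketch (hypothetical dictionary, not part of this test):

```q
q)opts:(``NULL_MAPPING)!((::);`float16`duration!(9h;12:00:00.000000000))
q)type value opts`NULL_MAPPING   / the inner values must form a mixed list
0h
```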
+arrow_extra:"null_mapping_extra.arrow"; +.arrowkdb.ipc.writeArrow[arrow_extra;extra_schema;extra_data;options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +arrow_extra_schema:.arrowkdb.ipc.readArrowSchema[arrow_extra]; +.arrowkdb.sc.equalSchemas[extra_schema;arrow_extra_schema] +extra_schema~arrow_extra_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +arrow_extra_data:.arrowkdb.ipc.readArrowData[arrow_extra;options]; +extra_data~arrow_extra_data +rm arrow_extra; + +-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; +serialized_extra:.arrowkdb.ipc.serializeArrow[extra_schema;extra_data;options]; + +-1"\n+----------|| Parse the schema back abd compare ||----------+\n"; +stream_extra_schema:.arrowkdb.ipc.parseArrowSchema[serialized_extra]; +.arrowkdb.sc.equalSchemas[extra_schema;stream_extra_schema] +extra_schema~stream_extra_schema + +-1"\n+----------|| Parse the array data back and compare ||----------+\n"; +stream_extra_data:.arrowkdb.ipc.parseArrowData[serialized_extra;options]; + +extra_data~stream_extra_data + + +-1 "\n+----------|| Test utils ||----------+\n"; + +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; diff --git a/tests/null_mapping/null_mapping_float.t b/tests/null_mapping/null_mapping_float.t new file mode 100644 index 0000000..e61b9ed --- /dev/null +++ b/tests/null_mapping/null_mapping_float.t @@ -0,0 +1,96 @@ +// null_mapping_float.q + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l q/arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping ||----------+\n"; +float_opts:(`float32`float64`decimal)!(1.23e;4.56;7.89); + +options:(``NULL_MAPPING)!((::);float_opts); + +N:5 + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +f32_dt:.arrowkdb.dt.float32[]; +f64_dt:.arrowkdb.dt.float64[]; +dec_dt:.arrowkdb.dt.decimal128[38i;2i]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +f32_fd:.arrowkdb.fd.field[`float32;f32_dt]; +f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; +dec_fd:.arrowkdb.fd.field[`decimal;dec_dt]; + +-1"\n+----------|| Create the schemas for the list of fields ||----------+\n"; +float_schema:.arrowkdb.sc.schema[(ts_fd,f32_fd,f64_fd,dec_fd)]; + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +f32_data:N?100e; +f32_data[0]:1.23e; +f64_data:N?100f; +f64_data[1]:4.56f; +dec_data:{"F"$.Q.f[2]x} each N?(10f) +dec_data[2]:7.89f + +-1"\n+----------|| Combine the data for all columns ||----------+\n"; +float_data:(ts_data;f32_data;f64_data;dec_data); + +-1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n"; +options[`DECIMAL128_AS_DOUBLE]:1 +options[`PARQUET_VERSION]:`V2.0 + +parquet_float:"null_mapping_float.parquet"; +.arrowkdb.pq.writeParquet[parquet_float;float_schema;float_data;options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +parquet_float_schema:.arrowkdb.pq.readParquetSchema[parquet_float]; +.arrowkdb.sc.equalSchemas[float_schema;parquet_float_schema] +float_schema~parquet_float_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; 
+parquet_float_data:.arrowkdb.pq.readParquetData[parquet_float;options];
+float_data~parquet_float_data
+rm parquet_float;
+
+-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n";
+arrow_float:"null_mapping_float.arrow";
+.arrowkdb.ipc.writeArrow[arrow_float;float_schema;float_data;options];
+
+-1"\n+----------|| Read the schema back and compare ||----------+\n";
+arrow_float_schema:.arrowkdb.ipc.readArrowSchema[arrow_float];
+.arrowkdb.sc.equalSchemas[float_schema;arrow_float_schema]
+float_schema~arrow_float_schema
+
+-1"\n+----------|| Read the array data back and compare ||----------+\n";
+arrow_float_data:.arrowkdb.ipc.readArrowData[arrow_float;options];
+float_data~arrow_float_data
+rm arrow_float;
+
+-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n";
+serialized_float:.arrowkdb.ipc.serializeArrow[float_schema;float_data;options];
+
+-1"\n+----------|| Parse the schema back and compare ||----------+\n";
+stream_float_schema:.arrowkdb.ipc.parseArrowSchema[serialized_float];
+.arrowkdb.sc.equalSchemas[float_schema;stream_float_schema]
+float_schema~stream_float_schema
+
+-1"\n+----------|| Parse the array data back and compare ||----------+\n";
+stream_float_data:.arrowkdb.ipc.parseArrowData[serialized_float;options];
+float_data~stream_float_data
+
+
+-1 "\n+----------|| Test utils ||----------+\n";
+
+show .arrowkdb.util.buildInfo[]
+(type .arrowkdb.util.buildInfo[])~99h
+
+
+-1 "\n+----------|| Finished testing ||----------+\n";
diff --git a/tests/null_mapping/null_mapping_long.t b/tests/null_mapping/null_mapping_long.t
new file mode 100644
index 0000000..6c64a4b
--- /dev/null
+++ b/tests/null_mapping/null_mapping_long.t
@@ -0,0 +1,99 @@
+// null_mapping_long.t
+
+-1"\n+----------|| Import the arrowkdb library ||----------+\n";
+\l q/arrowkdb.q
+
+-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n";
+rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]};
+
+-1"\n+----------|| Support null mapping ||----------+\n";
+long_opts:(`uint32`int32`uint64`int64)!(5i;6i;7;8);
+
+options:(``NULL_MAPPING)!((::);long_opts);
+
+N:5
+
+-1"\n+----------|| Create the datatype identifiers ||----------+\n";
+ts_dt:.arrowkdb.dt.timestamp[`nano];
+
+ui32_dt:.arrowkdb.dt.uint32[];
+i32_dt:.arrowkdb.dt.int32[];
+ui64_dt:.arrowkdb.dt.uint64[];
+i64_dt:.arrowkdb.dt.int64[];
+
+-1"\n+----------|| Create the field identifiers ||----------+\n";
+ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt];
+
+ui32_fd:.arrowkdb.fd.field[`uint32;ui32_dt];
+i32_fd:.arrowkdb.fd.field[`int32;i32_dt];
+ui64_fd:.arrowkdb.fd.field[`uint64;ui64_dt];
+i64_fd:.arrowkdb.fd.field[`int64;i64_dt];
+
+-1"\n+----------|| Create the schemas for the list of fields ||----------+\n";
+long_schema:.arrowkdb.sc.schema[(ts_fd,ui32_fd,i32_fd,ui64_fd,i64_fd)];
+
+-1"\n+----------|| Create data for each column in the table ||----------+\n";
+ts_data:asc N?0p;
+
+ui32_data:N?100i;
+ui32_data[0]:5i;
+i32_data:N?100i;
+i32_data[1]:6i;
+ui64_data:N?100;
+ui64_data[2]:7;
+i64_data:N?100;
+i64_data[3]:8;
+
+-1"\n+----------|| Combine the data for all columns ||----------+\n";
+long_data:(ts_data;ui32_data;i32_data;ui64_data;i64_data);
+
+-1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n";
+options[`PARQUET_VERSION]:`V2.0
+
+parquet_long:"null_mapping_long.parquet";
+.arrowkdb.pq.writeParquet[parquet_long;long_schema;long_data;options];
+
+-1"\n+----------|| Read the schema back and compare ||----------+\n";
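+// Two checks per schema: equalSchemas compares the underlying arrow schemas,
+// while ~ compares the kdb schema identifiers (both should return 1b).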
+parquet_long_schema:.arrowkdb.pq.readParquetSchema[parquet_long];
+.arrowkdb.sc.equalSchemas[long_schema;parquet_long_schema]
+long_schema~parquet_long_schema
+
+-1"\n+----------|| Read the array data back and compare ||----------+\n";
+parquet_long_data:.arrowkdb.pq.readParquetData[parquet_long;options];
+long_data~parquet_long_data
+rm parquet_long;
+
+-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n";
+arrow_long:"null_mapping_long.arrow";
+.arrowkdb.ipc.writeArrow[arrow_long;long_schema;long_data;options];
+
+-1"\n+----------|| Read the schema back and compare ||----------+\n";
+arrow_long_schema:.arrowkdb.ipc.readArrowSchema[arrow_long];
+.arrowkdb.sc.equalSchemas[long_schema;arrow_long_schema]
+long_schema~arrow_long_schema
+
+-1"\n+----------|| Read the array data back and compare ||----------+\n";
+arrow_long_data:.arrowkdb.ipc.readArrowData[arrow_long;options];
+long_data~arrow_long_data
+rm arrow_long;
+
+-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n";
+serialized_long:.arrowkdb.ipc.serializeArrow[long_schema;long_data;options];
+
+-1"\n+----------|| Parse the schema back and compare ||----------+\n";
+stream_long_schema:.arrowkdb.ipc.parseArrowSchema[serialized_long];
+.arrowkdb.sc.equalSchemas[long_schema;stream_long_schema]
+long_schema~stream_long_schema
+
+-1"\n+----------|| Parse the array data back and compare ||----------+\n";
+stream_long_data:.arrowkdb.ipc.parseArrowData[serialized_long;options];
+long_data~stream_long_data
+
+
+-1 "\n+----------|| Test utils ||----------+\n";
+
+show .arrowkdb.util.buildInfo[]
+(type .arrowkdb.util.buildInfo[])~99h
+
+
+-1 "\n+----------|| Finished testing ||----------+\n";
diff --git a/tests/null_mapping/null_mapping_other.t b/tests/null_mapping/null_mapping_other.t
new file mode 100644
index 0000000..6228a28
--- /dev/null
+++ b/tests/null_mapping/null_mapping_other.t
@@ -0,0 +1,83 @@
+// null_mapping_other.t
+
+-1"\n+----------|| Import the arrowkdb library ||----------+\n";
+\l q/arrowkdb.q
+
+-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n";
+rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]};
+
+-1"\n+----------|| Support null mapping only in arrow ||----------+\n";
+other_opts:(`date64`time32`month_interval`day_time_interval)!(2015.01.01D00:00:00.000000000;09:01:02.042;2006.07m;12:00:00.000000000);
+
+options:(``NULL_MAPPING)!((::);other_opts);
+
+N:5
+
+-1"\n+----------|| Create the datatype identifiers ||----------+\n";
+ts_dt:.arrowkdb.dt.timestamp[`nano];
+
+d64_dt:.arrowkdb.dt.date64[];
+t32_dt:.arrowkdb.dt.time32[`milli];
+mint_dt:.arrowkdb.dt.month_interval[];
+dtint_dt:.arrowkdb.dt.day_time_interval[];
+
+-1"\n+----------|| Create the field identifiers ||----------+\n";
+ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt];
+
+d64_fd:.arrowkdb.fd.field[`date64;d64_dt];
+t32_fd:.arrowkdb.fd.field[`time32;t32_dt];
+mint_fd:.arrowkdb.fd.field[`month_interval;mint_dt];
+dtint_fd:.arrowkdb.fd.field[`day_time_interval;dtint_dt];
+
+-1"\n+----------|| Create the schemas for the list of fields ||----------+\n";
+other_schema:.arrowkdb.sc.schema[(ts_fd,d64_fd,t32_fd,mint_fd,dtint_fd)];
+
+-1"\n+----------|| Create data for each column in the table ||----------+\n";
+ts_data:asc N?0p;
+
+d64_data:N?(2015.01.01D00:00:00.000000000;2017.01.01D00:00:00.000000000;2018.01.01D00:00:00.000000000;2019.01.01D00:00:00.000000000;2020.01.01D00:00:00.000000000);
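+// Each column gets its NULL_MAPPING sentinel written into one element, so the
+// round-trip should preserve those values (the writer treats them as nulls).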
+d64_data[1]:2015.01.01D00:00:00.000000000;
+t32_data:N?(09:01:02.042;08:01:02.042;07:01:02.042;06:01:02.042;05:01:02.042);
+t32_data[1]:09:01:02.042;
+mint_data:N?(2006.07m;2006.06m;2006.05m;2006.04m;2006.03m);
+mint_data[2]:2006.07m;
+dtint_data:N?(12:00:00.000000000;11:00:00.000000000;10:00:00.000000000;09:00:00.000000000;08:00:00.000000000);
+dtint_data[3]:12:00:00.000000000;
+
+-1"\n+----------|| Combine the data for all columns ||----------+\n";
+other_data:(ts_data;d64_data;t32_data;mint_data;dtint_data);
+
+-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n";
+arrow_other:"null_mapping_other.arrow";
+.arrowkdb.ipc.writeArrow[arrow_other;other_schema;other_data;options];
+
+-1"\n+----------|| Read the schema back and compare ||----------+\n";
+arrow_other_schema:.arrowkdb.ipc.readArrowSchema[arrow_other];
+.arrowkdb.sc.equalSchemas[other_schema;arrow_other_schema]
+other_schema~arrow_other_schema
+
+-1"\n+----------|| Read the array data back and compare ||----------+\n";
+arrow_other_data:.arrowkdb.ipc.readArrowData[arrow_other;options];
+other_data~arrow_other_data
+rm arrow_other;
+
+-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n";
+serialized_other:.arrowkdb.ipc.serializeArrow[other_schema;other_data;options];
+
+-1"\n+----------|| Parse the schema back and compare ||----------+\n";
+stream_other_schema:.arrowkdb.ipc.parseArrowSchema[serialized_other];
+.arrowkdb.sc.equalSchemas[other_schema;stream_other_schema]
+other_schema~stream_other_schema
+
+-1"\n+----------|| Parse the array data back and compare ||----------+\n";
+stream_other_data:.arrowkdb.ipc.parseArrowData[serialized_other;options];
+other_data~stream_other_data
+
+
+-1 "\n+----------|| Test utils ||----------+\n";
+
+show .arrowkdb.util.buildInfo[]
+(type .arrowkdb.util.buildInfo[])~99h
+
+
+-1 "\n+----------|| Finished testing ||----------+\n";
diff --git a/tests/null_mapping/null_mapping_short.t b/tests/null_mapping/null_mapping_short.t
new file mode 100644
index 0000000..1c4dfec
--- /dev/null
+++ b/tests/null_mapping/null_mapping_short.t
@@ -0,0 +1,103 @@
+// null_mapping_short.t
+
+-1"\n+----------|| Import the arrowkdb library ||----------+\n";
+\l q/arrowkdb.q
+
+-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n";
+rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]};
+
+-1"\n+----------|| Support null mapping ||----------+\n";
+short_opts:(`bool`uint8`int8`uint16`int16)!(0b;0x01;0x02;3h;4h);
+
+options:(``NULL_MAPPING)!((::);short_opts);
+
+N:5
+
+-1"\n+----------|| Create the datatype identifiers ||----------+\n";
+ts_dt:.arrowkdb.dt.timestamp[`nano];
+
+bool_dt:.arrowkdb.dt.boolean[];
+ui8_dt:.arrowkdb.dt.uint8[];
+i8_dt:.arrowkdb.dt.int8[];
+ui16_dt:.arrowkdb.dt.uint16[];
+i16_dt:.arrowkdb.dt.int16[];
+
+-1"\n+----------|| Create the field identifiers ||----------+\n";
+ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt];
+
+bool_fd:.arrowkdb.fd.field[`bool;bool_dt];
+ui8_fd:.arrowkdb.fd.field[`uint8;ui8_dt];
+i8_fd:.arrowkdb.fd.field[`int8;i8_dt];
+ui16_fd:.arrowkdb.fd.field[`uint16;ui16_dt];
+i16_fd:.arrowkdb.fd.field[`int16;i16_dt];
+
+-1"\n+----------|| Create the schemas for the list of fields ||----------+\n";
+short_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,ui8_fd,i8_fd,ui16_fd,i16_fd)];
+
+-1"\n+----------|| Create data for each column in the table ||----------+\n";
+ts_data:asc N?0p;
+
+bool_data:N?(0b;1b);
+bool_data[0]:0b;
+ui8_data:N?0x64;
+ui8_data[1]:0x01;
+i8_data:N?0x64;
+i8_data[2]:0x02;
+ui16_data:N?100h;
+ui16_data[3]:3h;
+i16_data:N?100h;
+i16_data[4]:4h;
+
+-1"\n+----------|| Combine the data for all columns ||----------+\n";
+short_data:(ts_data;bool_data;ui8_data;i8_data;ui16_data;i16_data);
+
+-1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n";
+options[`PARQUET_VERSION]:`V2.0
+
+parquet_short:"null_mapping_short.parquet";
+.arrowkdb.pq.writeParquet[parquet_short;short_schema;short_data;options];
+
+-1"\n+----------|| Read the schema back and compare ||----------+\n";
+parquet_short_schema:.arrowkdb.pq.readParquetSchema[parquet_short];
+.arrowkdb.sc.equalSchemas[short_schema;parquet_short_schema]
+short_schema~parquet_short_schema
+
+-1"\n+----------|| Read the array data back and compare ||----------+\n";
+parquet_short_data:.arrowkdb.pq.readParquetData[parquet_short;options];
+short_data~parquet_short_data
+rm parquet_short;
+
+-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n";
+arrow_short:"null_mapping_short.arrow";
+.arrowkdb.ipc.writeArrow[arrow_short;short_schema;short_data;options];
+
+-1"\n+----------|| Read the schema back and compare ||----------+\n";
+arrow_short_schema:.arrowkdb.ipc.readArrowSchema[arrow_short];
+.arrowkdb.sc.equalSchemas[short_schema;arrow_short_schema]
+short_schema~arrow_short_schema
+
+-1"\n+----------|| Read the array data back and compare ||----------+\n";
+arrow_short_data:.arrowkdb.ipc.readArrowData[arrow_short;options];
+short_data~arrow_short_data
+rm arrow_short;
+
+-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n";
+serialized_short:.arrowkdb.ipc.serializeArrow[short_schema;short_data;options];
+
+-1"\n+----------|| Parse the schema back and compare ||----------+\n";
+stream_short_schema:.arrowkdb.ipc.parseArrowSchema[serialized_short];
+.arrowkdb.sc.equalSchemas[short_schema;stream_short_schema]
+short_schema~stream_short_schema
+
+-1"\n+----------|| Parse the array data back and compare ||----------+\n";
+stream_short_data:.arrowkdb.ipc.parseArrowData[serialized_short;options];
+short_data~stream_short_data
+
+
+-1 "\n+----------|| Test utils ||----------+\n";
+
+show .arrowkdb.util.buildInfo[]
+(type .arrowkdb.util.buildInfo[])~99h
+
+
+-1 "\n+----------|| Finished testing ||----------+\n";
diff --git a/tests/null_mapping/null_mapping_str.t b/tests/null_mapping/null_mapping_str.t
new file mode 100644
index 0000000..349c38f
--- /dev/null
+++ b/tests/null_mapping/null_mapping_str.t
@@ -0,0 +1,98 @@
+// null_mapping_str.t
+
+-1"\n+----------|| Import the arrowkdb library ||----------+\n";
+\l q/arrowkdb.q
+
+-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n";
+rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]};
+
+-1"\n+----------|| Support null mapping ||----------+\n";
+str_opts:(`utf8`binary`fixed_size_binary)!("start";"x"$"alert";0Ng);
+
+options:(``NULL_MAPPING)!((::);str_opts);
+
+N:5
+
+-1"\n+----------|| Create the datatype identifiers ||----------+\n";
+ts_dt:.arrowkdb.dt.timestamp[`nano];
+
+str_dt:.arrowkdb.dt.utf8[];
+bin_dt:.arrowkdb.dt.binary[];
+fbin_dt:.arrowkdb.dt.fixed_size_binary[16i];
+
+-1"\n+----------|| Create the field identifiers ||----------+\n";
+ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt];
+
+str_fd:.arrowkdb.fd.field[`string;str_dt];
+bin_fd:.arrowkdb.fd.field[`binary;bin_dt];
+fbin_fd:.arrowkdb.fd.field[`fixed_binary;fbin_dt];
+
+-1"\n+----------|| Create the schemas for the list of fields ||----------+\n";
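+// The fixed_size_binary[16] column is populated with GUIDs below; its
+// configured null sentinel in str_opts is 0Ng.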
+str_schema:.arrowkdb.sc.schema[(ts_fd,str_fd,bin_fd,fbin_fd)];
+
+-1"\n+----------|| Create data for each column in the table ||----------+\n";
+ts_data:asc N?0p;
+
+str_data:N?("start";"stop";"alert";"acknowledge";"");
+str_data[0]:"start";
+bin_data:N?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$"");
+bin_data[2]:"x"$"alert";
+fbin_data:N?0Ng;
+fbin_data[4]:0Ng;
+
+-1"\n+----------|| Combine the data for all columns ||----------+\n";
+str_data:(ts_data;str_data;bin_data;fbin_data);
+
+-1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n";
+options[`PARQUET_VERSION]:`V2.0
+
+parquet_str:"null_mapping_str.parquet";
+.arrowkdb.pq.writeParquet[parquet_str;str_schema;str_data;options];
+
+-1"\n+----------|| Read the schema back and compare ||----------+\n";
+parquet_str_schema:.arrowkdb.pq.readParquetSchema[parquet_str];
+.arrowkdb.sc.equalSchemas[str_schema;parquet_str_schema]
+str_schema~parquet_str_schema
+
+-1"\n+----------|| Read the array data back and compare ||----------+\n";
+parquet_str_data:.arrowkdb.pq.readParquetData[parquet_str;options];
+parquet_str_data[3]:{0x0 sv x} each parquet_str_data[3] // Convert to GUIDs
+str_data~parquet_str_data
+rm parquet_str;
+
+-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n";
+arrow_str:"null_mapping_str.arrow";
+.arrowkdb.ipc.writeArrow[arrow_str;str_schema;str_data;options];
+
+-1"\n+----------|| Read the schema back and compare ||----------+\n";
+arrow_str_schema:.arrowkdb.ipc.readArrowSchema[arrow_str];
+.arrowkdb.sc.equalSchemas[str_schema;arrow_str_schema]
+str_schema~arrow_str_schema
+
+-1"\n+----------|| Read the array data back and compare ||----------+\n";
+arrow_str_data:.arrowkdb.ipc.readArrowData[arrow_str;options];
+arrow_str_data[3]:{0x0 sv x} each arrow_str_data[3] // Convert to GUIDs
+str_data~arrow_str_data
+rm arrow_str;
+
+-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n";
+serialized_str:.arrowkdb.ipc.serializeArrow[str_schema;str_data;options];
+
+-1"\n+----------|| Parse the schema back and compare ||----------+\n";
+stream_str_schema:.arrowkdb.ipc.parseArrowSchema[serialized_str];
+.arrowkdb.sc.equalSchemas[str_schema;stream_str_schema]
+str_schema~stream_str_schema
+
+-1"\n+----------|| Parse the array data back and compare ||----------+\n";
+stream_str_data:.arrowkdb.ipc.parseArrowData[serialized_str;options];
+stream_str_data[3]:{0x0 sv x} each stream_str_data[3] // Convert to GUIDs
+str_data~stream_str_data
+
+
+-1 "\n+----------|| Test utils ||----------+\n";
+
+show .arrowkdb.util.buildInfo[]
+(type .arrowkdb.util.buildInfo[])~99h
+
+
+-1 "\n+----------|| Finished testing ||----------+\n";
diff --git a/tests/null_mapping/null_mapping_time.t b/tests/null_mapping/null_mapping_time.t
new file mode 100644
index 0000000..c006ed4
--- /dev/null
+++ b/tests/null_mapping/null_mapping_time.t
@@ -0,0 +1,95 @@
+// null_mapping_time.t
+
+-1"\n+----------|| Import the arrowkdb library ||----------+\n";
+\l q/arrowkdb.q
+
+-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n";
+rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]};
+
+-1"\n+----------|| Support null mapping ||----------+\n";
+time_opts:(`date32`timestamp`time64)!(2006.07.21;2011.01.01D00:00:00.000000000;12:00:00.000000000);
+
+options:(``NULL_MAPPING)!((::);time_opts);
+
+N:5
+
+-1"\n+----------|| Create the datatype identifiers ||----------+\n";
+ts_dt:.arrowkdb.dt.timestamp[`nano];
+
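+// Assumption: NULL_MAPPING is keyed by datatype, so the timestamp sentinel
+// (2011.01.01D00:00:00.000000000 in time_opts) applies to both the tstamp
+// and timestamp columns, which share timestamp[`nano].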
+d32_dt:.arrowkdb.dt.date32[];
+tstamp_dt:.arrowkdb.dt.timestamp[`nano];
+t64_dt:.arrowkdb.dt.time64[`nano];
+
+-1"\n+----------|| Create the field identifiers ||----------+\n";
+ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt];
+
+d32_fd:.arrowkdb.fd.field[`date32;d32_dt];
+tstamp_fd:.arrowkdb.fd.field[`timestamp;tstamp_dt];
+t64_fd:.arrowkdb.fd.field[`time64;t64_dt];
+
+-1"\n+----------|| Create the schemas for the list of fields ||----------+\n";
+time_schema:.arrowkdb.sc.schema[(ts_fd,d32_fd,tstamp_fd,t64_fd)];
+
+-1"\n+----------|| Create data for each column in the table ||----------+\n";
+ts_data:asc N?0p;
+
+d32_data:N?(2006.07.21;2008.07.18;2012.07.16;2014.07.15;2016.07.11);
+d32_data[0]:2006.07.21;
+tstamp_data:N?(2015.01.01D00:00:00.000000000;2014.01.01D00:00:00.000000000;2013.01.01D00:00:00.000000000;2012.01.01D00:00:00.000000000;2011.01.01D00:00:00.000000000);
+tstamp_data[2]:2011.01.01D00:00:00.000000000;
+t64_data:N?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000);
+t64_data[3]:12:00:00.000000000;
+
+-1"\n+----------|| Combine the data for all columns ||----------+\n";
+time_data:(ts_data;d32_data;tstamp_data;t64_data);
+
+-1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n";
+options[`PARQUET_VERSION]:`V2.0
+
+parquet_time:"null_mapping_time.parquet";
+.arrowkdb.pq.writeParquet[parquet_time;time_schema;time_data;options];
+
+-1"\n+----------|| Read the schema back and compare ||----------+\n";
+parquet_time_schema:.arrowkdb.pq.readParquetSchema[parquet_time];
+.arrowkdb.sc.equalSchemas[time_schema;parquet_time_schema]
+time_schema~parquet_time_schema
+
+-1"\n+----------|| Read the array data back and compare ||----------+\n";
+parquet_time_data:.arrowkdb.pq.readParquetData[parquet_time;options];
+time_data~parquet_time_data
+rm parquet_time;
+
+-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n";
+arrow_time:"null_mapping_time.arrow";
+.arrowkdb.ipc.writeArrow[arrow_time;time_schema;time_data;options];
+
+-1"\n+----------|| Read the schema back and compare ||----------+\n";
+arrow_time_schema:.arrowkdb.ipc.readArrowSchema[arrow_time];
+.arrowkdb.sc.equalSchemas[time_schema;arrow_time_schema]
+time_schema~arrow_time_schema
+
+-1"\n+----------|| Read the array data back and compare ||----------+\n";
+arrow_time_data:.arrowkdb.ipc.readArrowData[arrow_time;options];
+time_data~arrow_time_data
+rm arrow_time;
+
+-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n";
+serialized_time:.arrowkdb.ipc.serializeArrow[time_schema;time_data;options];
+
+-1"\n+----------|| Parse the schema back and compare ||----------+\n";
+stream_time_schema:.arrowkdb.ipc.parseArrowSchema[serialized_time];
+.arrowkdb.sc.equalSchemas[time_schema;stream_time_schema]
+time_schema~stream_time_schema
+
+-1"\n+----------|| Parse the array data back and compare ||----------+\n";
+stream_time_data:.arrowkdb.ipc.parseArrowData[serialized_time;options];
+time_data~stream_time_data
+
+
+-1 "\n+----------|| Test utils ||----------+\n";
+
+show .arrowkdb.util.buildInfo[]
+(type .arrowkdb.util.buildInfo[])~99h
+
+
+-1 "\n+----------|| Finished testing ||----------+\n";