diff --git a/.travis.yml b/.travis.yml index da33d13..3237bc0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,7 +1,5 @@ jobs: include: - - dist: xenial - os: linux - dist: bionic os: linux - dist: focal @@ -102,6 +100,6 @@ deploy: file: "$FILE_NAME" on: tags: true - condition: $TRAVIS_OS_NAME = windows || $TRAVIS_OS_NAME = osx || ($TRAVIS_DIST = xenial && $TRAVIS_OS_NAME = linux) + condition: $TRAVIS_OS_NAME = windows || $TRAVIS_OS_NAME = osx || ($TRAVIS_DIST = bionic && $TRAVIS_OS_NAME = linux) skip_cleanup: 'true' diff --git a/docs/arrow-types.md b/docs/arrow-types.md new file mode 100644 index 0000000..2e10258 --- /dev/null +++ b/docs/arrow-types.md @@ -0,0 +1,168 @@ +--- +title: Type mapping between Arrow and kdb+ +description: The data layout of an Arrow table is defined by its schema. +author: Neal McDonnell +date: February 2021 +--- +# Type mapping between Arrow and kdb+ + + +The data layout of an Arrow table is defined by its schema. The schema is composed from a list of fields, one for each column in the table. The field describes the name of the column and its datatype. This page examines each of these and details how they are mapped in kdb+. + +:fontawesome-brands-github: +[KxSystems/arrowkdb](https://github.com/KxSystems/arrowkdb) + + +## Arrow datatypes + +Currently Arrow supports over 35 datatypes including concrete, parameterized and nested datatypes. + +Similar to the [C++ Arrow library](https://wesm.github.io/arrow-site-test/cpp/index.html) and [PyArrow](https://pypi.org/project/pyarrow/), `arrowkdb` exposes the Arrow datatype constructors to q. When one of these constructors is called it will return an integer datatype identifier which can then be passed to other functions, e.g. when creating a field. + +### Concrete + +Concrete datatypes have a single fixed representation. 
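+
+For example, a concrete datatype constructor takes no arguments and returns the datatype identifier, which can then be inspected or printed. A minimal sketch using the constructor and inspection functions documented in the [function reference](reference.md):
+
+```q
+q).arrowkdb.dt.printDatatype[.arrowkdb.dt.int64[]]
+int64
+q).arrowkdb.dt.datatypeName[.arrowkdb.dt.int64[]]
+`int64
+```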
+ +arrow datatype | description | kdb+ representation +------------------ | ------------------------------------------------------- | ---------------------------------------------------- +na | NULL type having no physical storage | mixed list of empty lists +boolean | boolean as 1 bit, LSB bit-packed ordering | `1h` +uint8 | unsigned 8-bit little-endian integer | `4h` +int8 | signed 8-bit little-endian integer | `4h` +uint16 | unsigned 16-bit little-endian integer | `5h` +int16 | signed 16-bit little-endian integer | `5h` +uint32 | unsigned 32-bit little-endian integer | `6h` +int32 | signed 32-bit little-endian integer | `6h` +uint64 | unsigned 64-bit little-endian integer | `7h` +int64 | signed 64-bit little-endian integer | `7h` +float16 | 2-byte floating point value (populated from `uint16_t`) | `5h` +float32 | 4-byte floating point value | `8h` +float64 | 8-byte floating point value | `9h` +utf8 | UTF8 variable-length string | mixed list of `10h` +large_utf8 | large UTF8 variable-length string | mixed list of `10h` +binary | variable-length bytes (no guarantee of UTF8-ness) | mixed list of `4h` +large_binary | large variable-length bytes (no guarantee of UTF8-ness) | mixed list of `4h` +date32 | `int32_t` days since the Unix epoch | `14h` (with automatic epoch offsetting) +date64 | `int64_t` milliseconds since the Unix epoch | `12h` (with automatic epoch offsetting and ms scaling) +month_interval | interval described as a number of months | `13h` +day_time_interval | interval described as number of days and milliseconds | `16h` (with automatic ns scaling) + + + +### Parameterized + +Parameterized datatypes represent multiple logical interpretations of the underlying physical data, where each parameterized interpretation is a distinct datatype in its own right. + +arrow datatype | description | kdb+ representation +------------------------------ | ------------------------------------------------------------ | ---------------------------------------------------------- +fixed_size_binary (byte_width) | fixed-size binary: each value occupies the same number of bytes | mixed list of `4h` +timestamp (time_unit) | exact timestamp encoded with `int64_t` (as number of seconds, milliseconds, microseconds or nanoseconds since Unix epoch) | `12h` (with automatic epoch offsetting and TimeUnit scaling) +time32 (time_unit) | time as signed 32-bit integer, representing either seconds or milliseconds since midnight | `19h` (with automatic TimeUnit scaling) +time64 (time_unit) | time as signed 64-bit integer, representing either microseconds or nanoseconds since midnight | `16h` (with automatic TimeUnit scaling) +duration (time_unit) | measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds | `16h` (with automatic TimeUnit scaling) +decimal128 (precision, scale) | precision- and scale-based signed 128-bit integer in twos complement | mixed list of `4h` (each of length 16) + + + +### Nested + +Nested datatypes define higher-level groupings of either the child datatypes or its constituent fields. (A field specifies its datatype and the field’s name.) 
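+
+For example, a list datatype is constructed from the identifier of its child datatype, and that child datatype can be queried back from the list datatype. A minimal sketch using constructors from the [function reference](reference.md):
+
+```q
+q)list_datatype:.arrowkdb.dt.list[.arrowkdb.dt.int64[]]
+q).arrowkdb.dt.printDatatype[list_datatype]
+list
+q).arrowkdb.dt.printDatatype[.arrowkdb.dt.getListDatatype[list_datatype]]
+int64
+```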
+ +arrow datatype | description | kdb+ representation +------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ +list (datatype_id) | list datatype specified in terms of its child datatype | mixed list for the parent list array containing a set of sublists (of type determined by the child datatype), one for each of the list value sets +large_list (datatype_id) | large list datatype specified in terms of its child datatype | mixed list for the parent list array containing a set of sublists (of type determined by the child datatype), one for each of the list value sets +fixed_size_list (datatype_id, list_size) | fixed size list datatype specified in terms of its child datatype and the fixed size of each of the child lists | same as variable-length lists, except each of the sublists must be of length equal to the `list_size` +map (key_datatype_id, item_datatype_id) | map datatype specified in terms of its key and item child datatypes | mixed list for the parent map array, with a dictionary for each map value set +struct (field_ids) | struct datatype specified in terms of a list of its constituent child field identifiers | mixed list for the parent struct array, containing child lists for each field in the struct +dictionary (value_datatype_id, index_datatype_id) | a dictionary datatype specified in terms of its value and index datatypes, similar to pandas categorical | two-item mixed list: values and indexes lists +sparse_union (field_ids) | union datatype specified in terms of a list of its constituent child field identifiers | similar to a struct array except the mixed list has an additional `type_id` array (5h) at the start which identifies the live field in each union value set +dense_union (field_ids) | union datatype specified in terms of a list of its constituent child field identifiers | similar to a struct array except the mixed list has an additional `type_id` array (5h) at the start which identifies the live field in each union value set + + + +### Inferred + +You can have `arrowkbd` infer a suitable Arrow datatype from the type of a kdb+ list. +Similarly, Arrow schemas can be inferred from a kdb+ table. + +This approach is easier to use but supports only a subset of the Arrow datatypes and is considerably less flexible. + +!!! tip "Infer Arrow datatypes if you are less familiar with Arrow or do not wish to use the more complex or nested Arrow datatypes." + +kdb+ list type | inferred Arrow datatype | notes +------------------|-------------------------|--------------------------------------------- +1h | boolean | +2h | fixed_size_binary (16) | writing path only, reads as mixed list of `4h` +4h | int8 | +5h | int16 | +6h | int32 | +7h | int64 | +8h | float32 | +9h | float64 | +10h | int8 | writing path only, reads as `4h` +11h | utf8 | writing path only, reads as mixed list of `10h` +12h | timestamp (nano) | +13h | month_interval | +14h | date32 | +15h | NA | cast in q with `` `timestamp$`` +16h | time64 (nano) | +17h | NA | cast in q with `` `time$`` +18h | NA | cast in q with `` `time$`` +19h | time32 (milli) | +mixed list of 4h | binary | +mixed list of 10h | utf8 | + +??? warning "The inference works only for trivial kdb+ lists containing simple datatypes" + + Only mixed lists of char arrays or byte arrays are supported, mapped to Arrow UTF8 and binary datatypes respectively. Other mixed list structures (e.g. 
those used by the nested arrow datatypes) cannot be interpreted – if required, create manually using the datatype constructors + + + +### Parquet datatype limitations + +The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype: + +Arrow datatype | status as of apache-arrow-2.0.0 +------------------|-------------------------------------------------------- +float16 | unsupported +month_interval | unsupported +day_time_interval | unsupported +duration | unsupported +large_utf8 | unsupported +large_binary | unsupported +sparse_union | unsupported +dense_union | unsupported +date64 | mapped to date32 (days) +fixed_size_list | mapped to list +dictionary | categorical representation stored +uint32 | Parquet v2.0 only, otherwise mapped to int64 +timestamp(nano) | Parquet v2.0 only, otherwise mapped to timestamp (milli) + + +## Arrow fields + +An Arrow field describes a column in the table and is composed of a datatype and a string field name. + +Similar to the C++ Arrow library and PyArrow, `arrowkdb` exposes the Arrow field constructor to q. The field constructor takes the field name and its datatype identifier and returns an integer field identifier which can then be passed to other functions, e.g. when creating a schema. + + +## Arrow schemas + +An Arrow schema is built up from a list of fields and is used when working with table data. The datatype of each field in the schema determines the array data layout for that column in the table. + +Similar to the C++ Arrow library and PyArrow, `arrowkdb` exposes the Arrow schema constructor to q. The schema constructor takes a list of field identifiers and will return an integer schema identifier which can then be passed to other functions, e.g. when writing Arrow or Parquet files. + +If you are less familiar with Arrow or do not wish to use the more complex or nested Arrow datatypes, `arrowkdb` can infer the schema from a kdb+ table. Each column in the table is mapped to a field in the schema. The column’s name is used as the field name and the field’s datatype is [inferred from the column’s kdb+ type](#inferred). + + +## Arrow tables + +An Arrow table is composed from a schema and a mixed list of Arrow array data kdb+ objects: + +- The array data for each column in the table is then populated using a builder object specific to the field’s datatype +- Similarly, datatype-specific reader objects are used to interpret and inspect the array data for each column in the table + +The mixed list of Arrow array data kdb+ objects should be ordered in schema field number. Each kdb+ object representing one of the arrays must be structured according to the field's datatype. This required array data structure is detailed above for each of the datatypes. + +Alternatively, separate APIs are provided where the Arrow table is created from a kdb+ table using an inferred schema. 
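+
+The sketch below contrasts the two approaches. The field names and data are purely illustrative; the functions used are documented in the [function reference](reference.md) and demonstrated further on the [examples](examples.md) page:
+
+```q
+// Constructed: build the schema explicitly, then supply a mixed list of
+// array data ordered by schema field
+q)int_fd:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]]
+q)str_fd:.arrowkdb.fd.field[`utf8_field;.arrowkdb.dt.utf8[]]
+q)schema:.arrowkdb.sc.schema[(int_fd,str_fd)]
+q).arrowkdb.tb.prettyPrintTable[schema;((1 2 3);("aa";"bb";"cc"));::]
+
+// Inferred: derive the schema and array data from a kdb+ table
+q)table:([]int_field:1 2 3;utf8_field:("aa";"bb";"cc"))
+q).arrowkdb.tb.prettyPrintTableFromTable[table;::]
+```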
diff --git a/docs/examples.md b/docs/examples.md new file mode 100644 index 0000000..6d93c7c --- /dev/null +++ b/docs/examples.md @@ -0,0 +1,547 @@ +--- +title: Example usage of interface | Arrow/Parquet interface +description: Examples of how to read and write Parquet files, Arrow files and Arrow streams from a kdb+ session +author: Neal McDonnell +date: February 2021 +--- +# Example usage of interface + +_Examples of how to read and write Parquet files, Arrow files and Arrow streams from a kdb+ session_ + + +The repository has examples with more functionality. + +:fontawesome-brands-github: +[KxSystems/arrowkdb/examples](https://github.com/KxSystems/arrowkdb/tree/master/examples) + + +## Inferred schemas + +The data layout of an Arrow table is defined by its schema. The schema is composed from a list of fields, one for each column in the table. The field describes the name of the column and its datatype. + +If you are less familiar with Arrow or do not wish to use the more complex or nested Arrow datatypes, `arrowkdb` can infer the schema from a kdb+ table. Each column in the table is mapped to a field in the schema. The column’s name is used as the field name and the column’s kdb+ type is [mapped to an Arrow datatype](arrow-types.md#inferred-datatypes). + + +### Create a table + +Create a kdb+ table contain temporal, floating, integer, boolean and string columns. + +```q +// Create table with dummy data +q)N:5 +q)table:([]tstamp:asc N?0p;temperature:N?100f;fill_level:N?100;pump_status:N?0b;comment:N?("start";"stop";"alert";"acknowledge";"")) +q)table +tstamp temperature fill_level pump_status comment +------------------------------------------------------------------------ +2001.11.14D02:41:59.687131048 40.31667 63 0 "" +2002.07.31D14:36:38.714581136 32.06061 75 1 "alert" +2003.01.09D08:10:33.261897408 57.81857 60 1 "start" +2003.03.03D18:09:25.390797712 57.62816 24 0 "" +2003.10.25D23:44:20.338068016 77.2916 37 1 "start" + +// Pretty print the Arrow table populated from a kdb+ table +// The schema is inferred from the kdb+ table structure +q).arrowkdb.tb.prettyPrintTableFromTable[table;::] +tstamp: timestamp[ns] not null +temperature: double not null +fill_level: int64 not null +pump_status: bool not null +comment: string not null +---- +tstamp: + [ + [ + 2001-11-14 02:41:59.687131048, + 2002-07-31 14:36:38.714581136, + 2003-01-09 08:10:33.261897408, + 2003-03-03 18:09:25.390797712, + 2003-10-25 23:44:20.338068016 + ] + ] +temperature: + [ + [ + 40.3167, + 32.0606, + 57.8186, + 57.6282, + 77.2916 + ] + ] +fill_level: + [ + [ + 63, + 75, + 60, + 24, + 37 + ] + ] +pump_status: + [ + [ + false, + true, + true, + false, + true + ] + ] +comment: + [ + [ + "", + "alert", + "start", + "", + "start" + ] + ] +``` + + +### Parquet files + +Write the kdb+ table to a Parquet file then read it back + +```q +// Use Parquet v2.0 +// This is required otherwise the timestamp(ns) datatype will be converted to +// timestamp(us) resulting in a loss of precision +q)parquet_write_options:(enlist `PARQUET_VERSION)!(enlist `V2.0) + +// Write the table to a parquet file +q).arrowkdb.pq.writeParquetFromTable["inferred_schema.parquet";table;parquet_write_options] +q)show system "ls inferred_schema.parquet" +"inferred_schema.parquet" + +// Read the parquet file into another table +q)new_table:.arrowkdb.pq.readParquetToTable["inferred_schema.parquet";::] + +// Compare the kdb+ tables +q)show table~new_table +1b +``` + + +### Arrow IPC files + +Write the kdb+ table to an Arrow file then read it back + +```q +// 
Write the table to an arrow file +q).arrowkdb.ipc.writeArrowFromTable["inferred_schema.arrow";table;::] +q)show system "ls inferred_schema.arrow" +"inferred_schema.arrow" + +// Read the arrow file into another table +q)new_table:.arrowkdb.ipc.readArrowToTable["inferred_schema.arrow";::] + +// Compare the kdb+ tables +q)show table~new_table +1b +``` + + +### Arrow IPC streams + +Write the kdb+ table to an Arrow stream then read it back + +```q +// Serialize the table to an arrow stream +q)serialized:.arrowkdb.ipc.serializeArrowFromTable[table;::] +q)show serialized +0xffffffff500100001000000000000a000c000600050008000a000000000104000c000000080.. + +// Parse the arrow stream into another table +q)new_table:.arrowkdb.ipc.parseArrowToTable[serialized;::] + +// Compare the kdb+ tables +q)show table~new_table +1b +``` + + +## Constructed schemas + +Although inferred schemas are easy to use, they support only a subset of the Arrow datatypes and are considerably less flexible. +The inference works only for kdb+ tables where the columns contain simple datatypes. +Only mixed lists of char arrays or byte arrays are supported, mapped to Arrow UTF8 and binary datatypes respectively. Other mixed-list structures (e.g. those used by the nested arrow datatypes) cannot be interpreted. + +More complex schemas should be manually constructed, in three steps: + +1. Create a datatype identifier for each column in the table by calling the appropriate datatype constructor +2. Create a field identifier for each column in table by calling the field constructor, specifying the field’s name and its datatype identifier +3. Create a schema identifier for the table by calling the schema constructor with the list of field identifiers + + +### Create the schema + +For comparison we begin by creating explicitly the schema inferred above + +```q +// Create the datatype identifiers +q)ts_dt:.arrowkdb.dt.timestamp[`nano] +q)f64_dt:.arrowkdb.dt.float64[] +q)i64_dt:.arrowkdb.dt.int64[] +q)bool_dt:.arrowkdb.dt.boolean[] +q)str_dt:.arrowkdb.dt.utf8[] + +// Create the field identifiers +q)tstamp_fd:.arrowkdb.fd.field[`tstamp;ts_dt] +q)temp_fd:.arrowkdb.fd.field[`temperature;f64_dt] +q)fill_fd:.arrowkdb.fd.field[`fill_level;i64_dt] +q)pump_fd:.arrowkdb.fd.field[`pump_status;bool_dt] +q)comment_fd:.arrowkdb.fd.field[`comment;str_dt] + +// Create the schema for the list of fields +q)schema:.arrowkdb.sc.schema[(tstamp_fd,temp_fd,fill_fd,pump_fd,comment_fd)] + +// Print the schema +q).arrowkdb.sc.printSchema[schema] +tstamp: timestamp[ns] not null +temperature: double not null +fill_level: int64 not null +pump_status: bool not null +comment: string not null +``` + + +### Create the array data + +Create a mixed list of array data for each column in the table + +```q +// Create data for each column in the table +q)tstamp_data:asc N?0p +q)temp_data:N?100f +q)fill_data:N?100 +q)pump_data:N?0b +q)comment_data:N?("start";"stop";"alert";"acknowledge";"") + +// Combine the data for all columns +q)array_data:(tstamp_data;temp_data;fill_data;pump_data;comment_data) + +// Pretty print the Arrow table populated from the array data +q).arrowkdb.tb.prettyPrintTable[schema;array_data;::] +tstamp: timestamp[ns] not null +temperature: double not null +fill_level: int64 not null +pump_status: bool not null +comment: string not null +---- +tstamp: + [ + [ + 2001-07-22 09:51:37.461634128, + 2001-10-03 11:56:09.607143848, + 2002-04-21 09:32:16.187244944, + 2002-05-14 18:23:48.381811824, + 2003-05-24 03:45:53.202889856 + ] + ] +temperature: + [ + [ + 39.1543, 
+ 8.12355, + 93.675, + 27.8212, + 23.9234 + ] + ] +fill_level: + [ + [ + 23, + 12, + 66, + 36, + 37 + ] + ] +pump_status: + [ + [ + false, + true, + true, + true, + false + ] + ] +comment: + [ + [ + "alert", + "start", + "alert", + "", + "" + ] + ] +``` + + +### Parquet files + +Write the schema and array data to a Parquet file then read them back + +```q +// Use Parquet v2.0 +// This is required otherwise the timestamp(ns) datatype will be converted to +// timestamp(us) resulting in a loss of precision +q)parquet_write_options:(enlist `PARQUET_VERSION)!(enlist `V2.0) + +// Write the schema and array data to a parquet file +q).arrowkdb.pq.writeParquet["constructed_schema.parquet";schema;array_data;parquet_write_options] +q)show system "ls constructed_schema.parquet" +"constructed_schema.parquet" + +// Read the schema back from the parquet file +q)new_schema:.arrowkdb.pq.readParquetSchema["constructed_schema.parquet"] + +// Compare the schemas +q)show .arrowkdb.sc.equalSchemas[schema;new_schema] +1b +q)show schema~new_schema +1b + +// Read the array data back from the parquet file +q)new_array_data:.arrowkdb.pq.readParquetData["constructed_schema.parquet";::] + +// Compare the array data +q)show array_data~new_array_data +1b +``` + + +### Arrow IPC files + +Write the schema and array data to an Arrow file then read them back + +```q +// Write the schema and array data to an arrow file +q).arrowkdb.ipc.writeArrow["constructed_schema.arrow";schema;array_data;::] +q)show system "ls constructed_schema.arrow" +"constructed_schema.arrow" + +// Read the schema back from the arrow file +q)new_schema:.arrowkdb.ipc.readArrowSchema["constructed_schema.arrow"] + +// Compare the schemas +q)show .arrowkdb.sc.equalSchemas[schema;new_schema] +1b +q)show schema~new_schema +1b + +// Read the array data back from the arrow file +q)new_array_data:.arrowkdb.ipc.readArrowData["constructed_schema.arrow";::] + +// Compare the array data +q)show array_data~new_array_data +1b +``` + +### Arrow IPC streams + +Write the schema and array data to an Arrow stream then read them back + +```q +// Serialize the schema and array data to an arrow stream +q)serialized:.arrowkdb.ipc.serializeArrow[schema;array_data;::] +q)show serialized +0xffffffff500100001000000000000a000c000600050008000a000000000104000c000000080.. + +// Parse the schema back for the arrow stream +q)new_schema:.arrowkdb.ipc.parseArrowSchema[serialized] + +// Compare the schemas +q)show .arrowkdb.sc.equalSchemas[schema;new_schema] +1b +q)show schema~new_schema +1b + +// Read the array data back from the arrow file +q)new_array_data:.arrowkdb.ipc.parseArrowData[serialized;::] + +// Compare the array data +q)show array_data~new_array_data +1b +``` + + +## Constructed schemas with nested datatypes + +Nested datatypes are constructed in two ways: + +1. List, Map and Dictionary datatypes are specified in terms of their child datatypes +2. 
Struct and Union datatypes are specified in terms of their child fields + +Continuing with the constructed schemas example, we update the schema as follows: + +- The `temperature` and `fill_level` fields will be combined under a struct datatype +- The `utf8` `comment` field will be replaced with a `list` field so that each array item can store multiple comments + + +### Create the schema + +Create the new schema, reusing the datatype and field identifiers from the previous example + +```q +// Taken from previous example: +q)ts_dt:.arrowkdb.dt.timestamp[`nano] +q)f64_dt:.arrowkdb.dt.float64[] +q)i64_dt:.arrowkdb.dt.int64[] +q)bool_dt:.arrowkdb.dt.boolean[] +q)str_dt:.arrowkdb.dt.utf8[] +q)tstamp_fd:.arrowkdb.fd.field[`tstamp;ts_dt] +q)temp_fd:.arrowkdb.fd.field[`temperature;f64_dt] +q)fill_fd:.arrowkdb.fd.field[`fill_level;i64_dt] +q)pump_fd:.arrowkdb.fd.field[`pump_status;bool_dt] + +// Create a struct datatype which bundles the temperature and fill level fields +q)struct_dt:.arrowkdb.dt.struct[(temp_fd,fill_fd)] + +// Create a list datatype which repeats the utf8 datatype +q)list_dt:.arrowkdb.dt.list[str_dt] + +// Create the struct and list field identifiers +q)sensors_fd:.arrowkdb.fd.field[`sensors_data;struct_dt] +q)multi_comments_fd:.arrowkdb.fd.field[`multi_comments;list_dt] + +// Create the nested schema +q)nested_schema:.arrowkdb.sc.schema[(tstamp_fd,sensors_fd,pump_fd,multi_comments_fd)] + +// Print the schema +q).arrowkdb.sc.printSchema[nested_schema] +tstamp: timestamp[ns] not null +sensors_data: struct not null +pump_status: bool not null +multi_comments: list not null +``` + + +### Create the array data + +Create a mixed list of array data, reusing the data from the previous example + +```q +// Taken from previous example: +q)tstamp_data:asc N?0p +q)temp_data:N?100f +q)fill_data:N?100 +q)pump_data:N?0b + +// The sensors struct array data is composed from its child arrays +q)sensors_data:(temp_data;fill_data); + +// Generate the multi-comments array data as lists of strings +q)getCommentsSet:{[] + n:(1?5)[0]+1; + enlist (n?("start";"stop";"alert";"acknowledge"; "")) + } +q)multi_comments_data:getCommentsSet[] +q)x:N +q)while[x-:1;multi_comments_data:multi_comments_data,getCommentsSet[]] + +// Combine the arrays data for all columns, including the struct and list data +q)nested_array_data:(tstamp_data;sensors_data;pump_data;multi_comments_data) + +// Pretty print the Arrow table populated from the array data +q).arrowkdb.tb.prettyPrintTable[nested_schema;nested_array_data;::] +tstamp: timestamp[ns] not null + -- field metadata -- + PARQUET:field_id: '1' +sensors_data: struct not null + child 0, temperature: double not null + -- field metadata -- + PARQUET:field_id: '2' + child 1, fill_level: int64 not null + -- field metadata -- + PARQUET:field_id: '3' +pump_status: bool not null + -- field metadata -- + PARQUET:field_id: '4' +multi_comments: list not null + child 0, item: string +---- +tstamp: + [ + [ + 2000-08-25 19:14:03.975714596, + 2002-10-02 20:42:32.814873312, + 2002-11-29 09:25:44.198182224, + 2003-04-09 17:45:03.539744768, + 2003-06-20 20:19:57.851794208 + ] + ] +sensors_data: + [ + -- is_valid: all not null + -- child 0 type: double + [ + 75.201, + 10.8682, + 95.9896, + 3.66834, + 64.3098 + ] + -- child 1 type: int64 + [ + 52, + 66, + 24, + 60, + 69 + ] + ] +pump_status: + [ + [ + false, + true, + true, + true, + true + ] + ] +multi_comments: + [ + [ + [ + "alert", + "alert", + "start", + "stop" + ], + [ + "start" + ], + [ + "acknowledge" + ], + [ + "stop", + "alert", + 
"acknowledge", + "acknowledge", + "" + ], + [ + "stop", + "alert", + "stop", + "", + "" + ] + ] + ] +``` + +It is left as an exercise to write the schema and array data to Parquet or Arrow files. + +??? tip "Remember to use Parquet v2.0" + + Otherwise the `timestamp(ns)` datatype will be converted to `timestamp(us)` resulting in a loss of precision. \ No newline at end of file diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..2b087d3 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,78 @@ +--- +title: Using Apache Arrow/Parquet data with kdb+ +description: Apache Arrow is a software-development platform for building high-performance applications that process and transport large datasets +author: Neal McDonnell +date: February 2021 +--- +![Arrow](../img/apache_arrow.png) +# Using Apache Arrow/Parquet data with kdb+ + +:fontawesome-brands-github: +[KxSystems/arrowkdb](https://github.com/KxSystems/arrowkdb) + + + +[Apache Arrow](https://arrow.apache.org/) is a software-development platform for building high-performance applications that process and transport large datasets. It is designed to both improve the performance of analytical algorithms and the efficiency of moving data from one system (or programming language to another). + +A critical component of Apache Arrow is its **in-memory columnar format**, a standardized, language-agnostic specification for representing structured, table-like datasets in memory. This data format has a rich datatype system (included nested data types) designed to support the needs of analytic database systems, dataframe libraries, and more. + + + +## Arrow and Parquet + +[Apache Parquet](https://parquet.apache.org/) is a storage format designed for maximum space efficiency, using advanced compression and encoding techniques. It is ideal for minimizing disk usage while storing gigabytes of data, or perhaps more. The efficiency comes at the cost of relatively expensive reading into memory, as Parquet data cannot be directly operated on but must be decoded in large chunks. + +Conversely, Apache Arrow is an in-memory format meant for direct and efficient use for computational purposes. Arrow data is not compressed but laid out in natural format for the CPU, so that data can be accessed at arbitrary places at full speed. + +Arrow and Parquet complement each other, with Arrow being used as the in-memory data structure for deserialized Parquet data. + + + +## Arrow/kdb+ integration + +This interface lets you convert data between Arrow tables and kdb+ +to analyze data in whichever format you are more familiar with. + +Currently Arrow supports over 35 datatypes including concrete, parameterized and nested datatypes. Each [Arrow datatype is mapped to a kdb+ type](arrow-types.md) and `arrowkdb` can seamlessly convert between both representations. + +The data layout of an Arrow table is defined by its schema. The schema is composed from a list of fields, one for each column in the table. The field describes the name of the column and its datatype. Schemas can be setup in two ways: + + +Inferred schemas + +: If you are less familiar with Arrow or do not wish to use the more complex or nested Arrow datatypes, `arrowkdb` can infer the schema from a kdb+ table. Each column in the table is mapped to a field in the schema. The column’s name is used as the field name and the field’s datatype is [inferred from the column’s kdb+ type](arrow-types/#inferred-datatypes). 
+
+Constructed schemas
+
+: Although inferred schemas are easy to use, they support only a subset of the Arrow datatypes and are considerably less flexible. The inference works only for kdb+ tables where the columns contain simple datatypes. Where more complex schemas are required, they should be constructed manually. This is done using the datatype/field/schema constructor functions which `arrowkdb` exposes, similar to the C++ Arrow library and PyArrow.
+
+
+## Arrow tables
+
+Users can read and write Arrow tables created from kdb+ data using:
+
+- Parquet file format
+- Arrow IPC record batch file format
+- Arrow IPC record batch stream format
+
+Separate APIs are provided where the Arrow table is either created from a kdb+ table using an inferred schema or from an Arrow schema and the table’s list of array data.
+
+:fontawesome-regular-hand-point-right:
+[API reference](reference.md)
+<br>
+:fontawesome-regular-hand-point-right: +[Example implementations](examples.md) +
+:fontawesome-brands-github: +[Install guide](https://github.com/KxSystems/arrowkdb#installation) + + +## Project + +The `arrowkdb` interface is published under an Apache 2.0 license. + +:fontawesome-brands-github: +[Raise an issue](https://github.com/KxSystems/arrowkdb/issues) +
+:fontawesome-brands-github: +[Contribute](https://github.com/KxSystems/arrowkdb/blob/master/CONTRIBUTING.md) diff --git a/docs/reference.md b/docs/reference.md new file mode 100644 index 0000000..7366a71 --- /dev/null +++ b/docs/reference.md @@ -0,0 +1,2555 @@ +--- +title: 'Function reference | Arrow/Parquet interface' +description: 'These functions are exposed within the .arrowkdb namespace, allowing users to convert data between the Arrow/Parquet and kdb+' +author: Neal McDonnell +date: February 2021 +--- +# Function reference + +These functions are exposed within the `.arrowkdb` namespace, allowing users to convert data between the Arrow/Parquet and kdb+. + +:fontawesome-brands-github: +[KxSystems/arrowkdb](https://github.com/KxSystems/arrowkdb) + + + +
+.arrowkdb **Arrow/Parquet interface** +[Datatype constructors](#datatype-constructors) + [dt.na](#dtna) Create a NULL datatype + [dt.boolean](#dtboolean) Create a boolean datatype + [dt.int8](#dtint8) Create an int8 datatype + [dt.int16](#dtint16) Create an int16 datatype + [dt.int32](#dtint32) Create an int32 datatype + [dt.int64](#dtint64) Create an int64 datatype + [dt.uint8](#dtuint8) Create an uint8 datatype + [dt.uint16](#dtuint16) Create an uint16 datatype + [dt.uint32](#dtuint32) Create an uint32 datatype + [dt.uint64](#dtuint64) Create an uint64 datatype + [dt.float16](#dtfloat16) Create a float16 (represented as uint16_t) datatype + [dt.float32](#dtfloat32) Create a float32 datatype + [dt.float64](#dtfloat64) Create a float64 datatype + [dt.time32](#dttime32) Create a 32-bit time (units since midnight with specified + granularity) datatype + [dt.time64](#dttime64) Create a 64-bit time (units since midnight with specified + granularity) datatype + [dt.timestamp](#dttimestamp) Create a 64-bit timestamp (units since UNIX epoch with + specified granularity) datatype + [dt.date32](#dtdate32) Create a 32-bit date (days since UNIX epoch) datatype + [dt.date64](#dtdate64) Create a 64-bit date (milliseconds since UNIX epoch) + datatype + [dt.month_interval](#dtmonth_interval) Create a 32-bit interval (described as a number of months, + similar to YEAR_MONTH in SQL) datatype + [dt.day_time_interval](#dtday_time_interval) Create a 64-bit interval (described as a number of days + and milliseconds, similar to DAY_TIME in SQL) datatype + [dt.duration](#dtduration) Create a 64-bit duration (measured in units of specified + granularity) datatype + [dt.binary](#dtbinary) Create a variable length bytes datatype + [dt.utf8](#dtutf8) Create a UTF8 variable length string datatype + [dt.large_binary](#dtlarge_binary) Create a large (64-bit offsets) variable length bytes + datatype + [dt.large_utf8](#dtlarge_utf8) Create a large (64-bit offsets) UTF8 variable length + string datatype + [dt.fixed_size_binary](#dtfixed_size_binary) Create a fixed width bytes datatype + [dt.decimal128](#dtdecimal128) Create a 128-bit integer (with precision and scale in + twos complement) datatype + [dt.list](#dtlist) Create a list datatype, specified in terms of its child + datatype + [dt.large_list](#dtlarge_list) Create a large (64-bit offsets) list datatype, specified + in terms of its child datatype + [dt.fixed_size_list](#dt_fixed_size_list) Create a fixed size list datatype, specified in terms of + its child datatype + [dt.map](#dtmap) Create a map datatype, specified in terms of its key and + item child datatypes + [dt.struct](#dtstruct) Create a struct datatype, specified in terms of the field + identifiers of its children + [dt.sparse_union](#dtsparse_union) Create a sparse union datatype, specified in terms of the + field identifiers of its children + [dt.dense_union](#dtdense_union) Create a dense union datatype, specified in terms of the + field identifiers of its children + [dt.dictionary](#dtdictionary) Create a dictionary datatype specified in terms of its + value and index datatypes, similar to pandas categorical + [dt.inferDatatype](#dtinferDatatype) Infer and construct a datatype from a kdb+ list + +[Datatype inspection](#datatype-inspection) + [dt.datatypeName](#dtdatatypename) Return the base name of a datatype, ignoring any + parameters or child datatypes/fields + [dt.getTimeUnit](#dtgettimeunit) Return the TimeUnit of a time32/time64/timestamp/duration + datatype + 
[dt.getByteWidth](#dtgetbytewidth) Return the byte_width of a fixed_size_binary datatype + [dt.getListSize](#dtgetlistsize) Returns the list_size of a fixed_size_list datatype + [dt.getPrecisionScale](#dtgetprecisionscale) Return the precision and scale of a decimal128 datatype + [dt.getListDatatype](#dtgetlistdatatype) Return the child datatype identifier of a + list/large_list/fixed_size_list datatype + [dt.getMapDatatypes](#dtgetmapdatatypes) Return the key and item child datatype identifiers of a + map datatype + [dt.getDictionaryDatatypes](#dtgetdictionarydatatypes) Return the value and index child datatype identifiers of a + dictionary datatype + [dt.getChildFields](#dtgetchildfields) Return the list of child field identifiers of a + struct/spare_union/dense_union datatype + +[Datatype management](#datatype-management) + [dt.printDatatype](#dtprintdatatype) Display user readable information for a datatype, + including parameters and nested child datatypes + [dt.listDatatypes](#dtlistdatatypes) Return the list of identifiers for all datatypes held in + the DatatypeStore + [dt.removeDatatype](#dtremovedatatype) Remove a datatype from the DatatypeStore + [dt.equalDatatypes](#dtequaldatatypes) Check if two datatypes are logically equal, including + parameters and nested child datatypes + +[Field Constructor](#field-constructor) + [fd.field](#fdfield) Create a field instance from its name and datatype + +[Field Inspection](#field-inspection) + [fd.fieldName](#fdfieldname) Return the name of a field + [fd.fieldDatatype](#fdfielddatatype) Return the datatype of a field + +[Field management](#field-management) + [fd.printField](#fdprintfield) Display user readable information for a field, including + name and datatype + [fd.listFields](#fdlistfields) Return the list of identifiers for all fields held in the + FieldStore + [fd.removeField](#fdremovefield) Remove a field from the FieldStore + [fd.equalFields](#fdequalfields) Check if two fields are logically equal, including names + and datatypes + +[Schema constructors](#schema-constructors) + [sc.schema](#scschema) Create a schema instance from a list of field identifiers + [sc.inferSchema](#scinferschema) Infer and construct a schema based on a kdb+ table + +[Schema inspection](#schema-inspection) + [sc.schemaFields](#scschemafields) Return the list of field identifiers used by a schema + +[Schema management](#schema-management) + [sc.printSchema](#scprintschema) Display user readable information for a schema, including + its fields and their order + [sc.listSchemas](#sclistschemas) Return the list of identifiers for all schemas held in the + SchemaStore + [sc.removeSchema](#scremoveschema) Remove a schema from the SchemaStore + [sc.equalSchemas](#scequalschemas) Check if two schemas are logically equal, including their + fields and the fields' order + +[Array data](#array-data) + [ar.prettyPrintArray](#arprettyprintarray) Convert a kdb+ list to an Arrow array and pretty print the + array + [ar.prettyPrintArrayFromList](#arprettyprintarrayfromlist) Convert a kdb+ list to an Arrow array and pretty print the + array, inferring the datatype from the kdb+ list type + + +[Table data](#table-data) + [tb.prettyPrintTable](#tbprettyprinttable) Convert a kdb+ mixed list of array data to an Arrow table + and pretty print the table + [tb.prettyPrintTableFromTable](#tbprettyprinttablefromtable) Convert a kdb+ table to an Arrow table and pretty print + the table, inferring the schema from the kdb+ table + structure + +[Parquet files](#parquet-files) + 
[pq.writeParquet](#pqwriteparquet) Convert a kdb+ mixed list of array data to an Arrow table + and write to a Parquet file + [pq.writeParquetFromTable](#pqwriteparquetfromtable) Convert a kdb+ table to an Arrow table and write to a + Parquet file, inferring the schema from the kdb+ table + structure + [pq.readParquetSchema](#pqreadparquetschema) Read the schema from a Parquet file + [pq.readParquetData](#pqreadparquetdata) Read an Arrow table from a Parquet file and convert to a + kdb+ mixed list of array data + [pq.readParquetColumn](#pqreadparquetcolumn) Read a single column from a Parquet file and convert to a + kdb+ list + [pq.readParquetToTable](#pqreadparquettotable) Read an Arrow table from a Parquet file and convert to a + kdb+ table + +[Arrow IPC files](#arrow-ipc-files) + [ipc.writeArrow](#ipcwritearrow) Convert a kdb+ mixed list of array data to an Arrow table + and write to an Arrow file + [ipc.writeArrowFromTable](#ipcwritearrowfromtable) Convert a kdb+ table to an Arrow table and write to an + Arrow file, inferring the schema from the kdb+ table + structure + [ipc.readArrowSchema](#ipcreadarrowschema) Read the schema from an Arrow file + [ipc.readArrowData](#ipcreadarrowdata) Read an Arrow table from an Arrow file and convert to a + kdb+ mixed list of array data + [ipc.readArrowToTable](#ipcreadarrowtotable) Read an Arrow table from an Arrow file and convert to a + kdb+ table + +[Arrow IPC streams](#arrow-ipc-streams) + [ipc.serializeArrow](#ipcserializearrow) Convert a kdb+ mixed list of array data to an Arrow table + and serialize to an Arrow stream + [ipc.serializeArrowFromTable](#ipcserializearrowfromtable) Convert a kdb+ table to an Arrow table and serialize to an + Arrow stream, inferring the schema from the kdb+ table + structure + [ipc.parseArrowSchema](#ipcparsearrowschema) Parse the schema from an Arrow stream + [ipc.parseArrowData](#ipcparsearrowdata) Parse an Arrow table from an Arrow stream and convert to a + kdb+ mixed list of array data + [ipc.parseArrowToTable](#ipcparsearrowtotable) Parse an Arrow table from an Arrow file and convert to a + kdb+ table + +[Utilities](#utilities) + [util.buildInfo](#utilbuildinfo) Return build information regarding the in use Arrow + library + +
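+
+As a quick orientation before the individual function entries, the sketch below (adapted from the [examples](examples.md) page; the file name and column names are illustrative) combines a few of these functions for an inferred-schema Parquet round trip:
+
+```q
+q)table:([]col1:1 2 3;col2:1.1 2.2 3.3)
+q)parquet_write_options:(enlist `PARQUET_VERSION)!(enlist `V2.0)
+q).arrowkdb.pq.writeParquetFromTable["example.parquet";table;parquet_write_options]
+q)table~.arrowkdb.pq.readParquetToTable["example.parquet";::]
+1b
+```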
+ +## Datatype constructors + +### **`dt.na`** + +*Create a NULL datatype* + +```syntax +.arrowkdb.dt.na[] +``` + +Returns the datatype identifier + +```q +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.na[]] +null +q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.na[];(();();());::] +3 nulls +``` + +### **`dt.boolean`** + +*Create a boolean datatype* + +```syntax +.arrowkdb.dt.boolean[] +``` + +Returns the datatype identifier + +```q +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.boolean[]] +bool +q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.boolean[];(010b);::] +[ + false, + true, + false +] +``` + +### **`dt.int8`** + +*Create an int8 datatype* + +```syntax +.arrowkdb.dt.int8[] +``` + +??? note "kdb+ type 10h can be written to an `int8` array" + + The is supported on the writing path only. Reading from an int8 array returns a 4h list + +Returns the datatype identifier + +```q +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.int8[]] +int8 +q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.int8[];(0x102030);::] +[ + 16, + 32, + 48 +] +``` + +### **`dt.int16`** + +*Create an int16 datatype* + +```syntax +.arrowkdb.dt.int16[] +``` + +Returns the datatype identifier + +```q +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.int16[]] +int16 +q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.int16[];(11 22 33h);::] +[ + 11, + 22, + 33 +] +``` + +### **`dt.int32`** + +*Create an int32 datatype* + +```syntax +.arrowkdb.dt.int32[] +``` + +Returns the datatype identifier + +```q +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.int32[]] +int32 +q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.int32[];(11 22 33i);::] +[ + 11, + 22, + 33 +] +``` + +### **`dt.int64`** + +*Create an int64 datatype* + +```syntax +.arrowkdb.dt.int64[] +``` + +Returns the datatype identifier + +```q +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.int64[]] +int64 +q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.int64[];(11 22 33j);::] +[ + 11, + 22, + 33 +] +``` + +### **`dt.uint8`** + +*Create an uint8 datatype* + +```syntax +.arrowkdb.dt.uint8[] +``` + +Returns the datatype identifier + +```q +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.uint8[]] +uint8 +q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.uint8[];(0x102030);::] +[ + 16, + 32, + 48 +] +``` + +### **`dt.uint16`** + +*Create an uint16 datatype* + +```syntax +.arrowkdb.dt.uint16[] +``` + +Returns the datatype identifier + +```q +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.uint16[]] +uint16 +q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.uint16[];(11 22 33h);::] +[ + 11, + 22, + 33 +] +``` + +### **`dt.uint32`** + +*Create an uint32 datatype* + +```syntax +.arrowkdb.dt.uint32[] +``` + +Returns the datatype identifier + +??? 
warning "`uint32` datatype is supported by Parquet v2.0 only, being changed to `int64` otherwise" + + The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) + +```q +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.uint32[]] +uint32 +q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.uint32[];(11 22 33i);::] +[ + 11, + 22, + 33 +] +``` + +### **`dt.uint64`** + +*Create an uint64 datatype* + +```syntax +.arrowkdb.dt.uint64[] +``` + +Returns the datatype identifier + +```q +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.uint64[]] +uint64 +q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.uint64[];(11 22 33j);::] +[ + 11, + 22, + 33 +] +``` + +### **`dt.float16`** + +*Create a float16 (represented as uint16_t) datatype* + +```syntax +.arrowkdb.dt.float16[] +``` + +Returns the datatype identifier + +??? warning "`float16` datatype is not supported by Parquet" + + The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) + +```q +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.float16[]] +halffloat +q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.float16[];(11 22 33h);::] +[ + 11, + 22, + 33 +] +``` + +### **`dt.float32`** + +*Create a float32 datatype* + +```syntax +.arrowkdb.dt.float32[] +``` + +Returns the datatype identifier + +```q +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.float32[]] +float +q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.float32[];(1.1 2.2 3.3e);::] +[ + 1.1, + 2.2, + 3.3 +] +``` + +### **`dt.float64`** + +*Create a float64 datatype* + +```syntax +.arrowkdb.dt.float64[] +``` + +Returns the datatype identifier + +```q +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.float64[]] +double +q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.float64[];(1.1 2.2 3.3f);::] +[ + 1.1, + 2.2, + 3.3 +] +``` + +### **`dt.time32`** + +*Create a 32-bit time (units since midnight with specified granularity) datatype* + +```syntax +.arrowkdb.dt.time32[time_unit] +``` + +Where `time_unit` is the time unit string: SECOND or MILLI + +returns the datatype identifier + +```q +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.time32[`MILLI]] +time32[ms] +q).arrowkdb.dt.getTimeUnit[.arrowkdb.dt.time32[`MILLI]] +`MILLI +q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.time32[`MILLI];(01:00:00.100 02:00:00.200 03:00:00.300);::] +[ + 01:00:00.100, + 02:00:00.200, + 03:00:00.300 +] +``` + +### **`dt.time64`** + +*Create a 64-bit time (units since midnight with specified granularity) datatype* + +```syntax +.arrowkdb.dt.time64[time_unit] +``` + +Where `time_unit` is the time unit string: MICRO or NANO + +returns the datatype identifier + +```q +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.time64[`NANO]] +time64[ns] +q).arrowkdb.dt.getTimeUnit[.arrowkdb.dt.time64[`NANO]] +`NANO +q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.time64[`NANO];(0D01:00:00.100000001 0D02:00:00.200000002 0D03:00:00.300000003);::] +[ + 01:00:00.100000001, + 02:00:00.200000002, + 03:00:00.300000003 +] +``` + +### **`dt.timestamp`** + +*Create a 64-bit timestamp (units since UNIX epoch with specified granularity) datatype* + +```syntax +.arrowkdb.dt.timestamp[time_unit] +``` + +Where `time_unit` is the time unit string: SECOND, MILLI, MICRO or NANO + +returns the datatype identifier + +??? 
warning "`timestamp(nano)` datatype is supported by Parquet v2.0 only, being mapped to `timestamp(milli)` otherwise" + + The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) + +```q +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.timestamp[`NANO]] +timestamp[ns] +q).arrowkdb.dt.getTimeUnit[.arrowkdb.dt.timestamp[`NANO]] +`NANO +q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.timestamp[`NANO];(2001.01.01D00:00:00.100000001 2002.02.02D00:00:00.200000002 2003.03.03D00:00:00.300000003);::] +[ + 2001-01-01 00:00:00.100000001, + 2002-02-02 00:00:00.200000002, + 2003-03-03 00:00:00.300000003 +] +``` + +### **`dt.date32`** + +*Create a 32-bit date (days since UNIX epoch) datatype* + +```syntax +.arrowkdb.dt.date32[] +``` + +Returns the datatype identifier + +```q +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.date32[]] +date32[day] +q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.date32[];(2001.01.01 2002.02.02 2003.03.03);::] +[ + 2001-01-01, + 2002-02-02, + 2003-03-03 +] +``` + +### **`dt.date64`** + +*Create a 64-bit date (milliseconds since UNIX epoch) datatype* + +```syntax +.arrowkdb.dt.date64[] +``` + +Returns the datatype identifier + +??? warning "`date64` datatype is changed to `date32(days)` by Parquet" + + The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) + +```q +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.date64[]] +date64[ms] +q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.date64[];(2001.01.01D00:00:00.000000000 2002.02.02D00:00:00.000000000 2003.03.03D00:00:00.000000000);::] +[ + 2001-01-01, + 2002-02-02, + 2003-03-03 +] +``` + +### **`dt.month_interval`** + +*Create a 32-bit interval (described as a number of months, similar to YEAR_MONTH in SQL) datatype* + +```syntax +.arrowkdb.dt.month_interval[] +``` + +Returns the datatype identifier + +??? warning "`month_interval` datatype is not supported by Parquet" + + The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) + +```q +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.month_interval[]] +month_interval +q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.month_interval[];(2001.01m,2002.02m,2003.03m);::] +[ + 12, + 25, + 38 +] +``` + +### **`dt.day_time_interval`** + +*Create a 64-bit interval (described as a number of days and milliseconds, similar to DAY_TIME in SQL) datatype* + +```syntax +.arrowkdb.dt.day_time_interval[] +``` + +Returns the datatype identifier + +??? 
warning "`day_time_interval` datatype is not supported by Parquet" + + The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) + +```q +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.day_time_interval[]] +day_time_interval +q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.day_time_interval[];(0D01:00:00.100000000 0D02:00:00.200000000 0D03:00:00.300000000);::] +[ + 0d3600100ms, + 0d7200200ms, + 0d10800300ms +] +``` + +### **`dt.duration`** + +*Create a 64-bit duration (measured in units of specified granularity) datatype* + +```syntax +.arrowkdb.dt.duration[time_unit] +``` + +Where `time_unit` is the time unit string: SECOND, MILLI, MICRO or NANO + +returns the datatype identifier + +??? warning "`duration` datatype is not supported by Parquet" + + The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) + +```q +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.duration[`NANO]] +duration[ns] +q).arrowkdb.dt.getTimeUnit[.arrowkdb.dt.duration[`NANO]] +`NANO +q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.duration[`NANO];(0D01:00:00.100000000 0D02:00:00.200000000 0D03:00:00.300000000);::] +[ + 3600100000000, + 7200200000000, + 10800300000000 +] +``` + +### **`dt.binary`** + +*Create a variable length bytes datatype* + +```syntax +.arrowkdb.dt.binary[] +``` + +Returns the datatype identifier + +```q +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.binary[]] +binary +q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.binary[];(enlist 0x11;0x2222;0x333333);::] +[ + 11, + 2222, + 333333 +] +``` + +### **`dt.utf8`** + +*Create a UTF8 variable length string datatype* + +```syntax +.arrowkdb.dt.utf8[] +``` + +Returns the datatype identifier + +??? note "kdb+ type 11h can be written to an `utf8` array" + + The is supported on the writing path only. Reading from an utf8 array returns a mixed list of 10h + +```q +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.utf8[]] +string +q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.utf8[];(enlist "a";"bb";"ccc");::] +[ + "a", + "bb", + "ccc" +] +``` + +### **`dt.large_binary`** + +*Create a large (64-bit offsets) variable length bytes datatype* + +```syntax +.arrowkdb.dt.large_binary[] +``` + +Returns the datatype identifier + +??? warning "`large_binary` datatype is not supported by Parquet" + + The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) + +```q +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.large_binary[]] +large_binary +q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.large_binary[];(enlist 0x11;0x2222;0x333333);::] +[ + 11, + 2222, + 333333 +] +``` + +### **`dt.large_utf8`** + +*Create a large (64-bit offsets) UTF8 variable length string datatype* + +```syntax +.arrowkdb.dt.large_utf8[] +``` + +Returns the datatype identifier + +??? 
warning "`large_utf8` datatype is not supported by Parquet" + + The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) + +```q +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.large_utf8[]] +large_string +q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.large_utf8[];(enlist "a";"bb";"ccc");::] +[ + "a", + "bb", + "ccc" +] +``` + +### **`dt.fixed_size_binary`** + +*Create a fixed width bytes datatype* + +```syntax +.arrowkdb.dt.fixed_size_binary[byte_width] +``` + +Where `byte_width` is the int32 fixed size byte width (each value in the array occupies the same number of bytes). + +returns the datatype identifier + +??? note "kdb+ type 2h can be written to a `fixed_size_binary(16)` array" + + The is supported on the writing path only. Reading from a fixed_size_binary array returns a mixed list of 4h + +```q +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.fixed_size_binary[2i]] +fixed_size_binary[2] +q).arrowkdb.dt.getByteWidth[.arrowkdb.dt.fixed_size_binary[2i]] +2i +q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.fixed_size_binary[2i];(0x1111;0x2222;0x3333);::] +[ + 1111, + 2222, + 3333 +] +``` + +### **`dt.decimal128`** + +*Create a 128-bit integer (with precision and scale in twos complement) datatype* + +```syntax +.arrowkdb.dt.decimal128[precision;scale] +``` + +Where: + +- `precision` is the int32 precision width +- `scale` is the int32 scaling factor + +returns the datatype identifier + +```q +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.decimal128[38i;2i]] +decimal(38, 2) +q).arrowkdb.dt.getPrecisionScale[.arrowkdb.dt.decimal128[38i;2i]] +38 +2 +q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.decimal128[38i;2i];(0x00000000000000000000000000000000; 0x01000000000000000000000000000000; 0x00000000000000000000000000000080);::] +[ + 0.00, + 0.01, + -1701411834604692317316873037158841057.28 +] +q) // With little endian twos complement the decimal128 values are 0, minimum positive, maximum negative +``` + +### **`dt.list`** + +*Create a list datatype, specified in terms of its child datatype* + +```syntax +.arrowkdb.dt.list[child_datatype_id] +``` + +Where `child_datatype_id` is the identifier of the list’s child datatype + +returns the datatype identifier + +```q +q)list_datatype:.arrowkdb.dt.list[.arrowkdb.dt.int64[]] +q).arrowkdb.dt.printDatatype[list_datatype] +list +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.getListDatatype[list_datatype]] +int64 +q).arrowkdb.ar.prettyPrintArray[list_datatype;((enlist 1);(2 2);(3 3 3));::] +[ + [ + 1 + ], + [ + 2, + 2 + ], + [ + 3, + 3, + 3 + ] +] +``` + +### **`dt.large_list`** + +*Create a large (64-bit offsets) list datatype, specified in terms of its child datatype* + +```syntax +.arrowkdb.dt.large_list[child_datatype_id] +``` + +Where `child_datatype_id` is the identifier of the list’s child datatype + +returns the datatype identifier + +```q +q)list_datatype:.arrowkdb.dt.large_list[.arrowkdb.dt.int64[]] +q).arrowkdb.dt.printDatatype[list_datatype] +large_list +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.getListDatatype[list_datatype]] +int64 +q).arrowkdb.ar.prettyPrintArray[list_datatype;((enlist 1);(2 2);(3 3 3));::] +[ + [ + 1 + ], + [ + 2, + 2 + ], + [ + 3, + 3, + 3 + ] +] +``` + +### **`dt.fixed_size_list`** + +*Create a fixed size list datatype, specified in terms of its child datatype* + +```syntax +.arrowkdb.dt.fixed_size_list[child_datatype_id;list_size] +``` + +Where: + +- 
`child_datatype_id` is the identifier of the list’s child datatype +- `list_size` is the int32 fixed size of each of the child lists + +returns the datatype identifier + +??? warning "`fixed_size_list` datatype is changed to `list` by Parquet" + + The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) + +```q +q)list_datatype:.arrowkdb.dt.fixed_size_list[.arrowkdb.dt.int64[];2i] +q).arrowkdb.dt.printDatatype[list_datatype] +fixed_size_list[2] +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.getListDatatype[list_datatype]] +int64 +q).arrowkdb.dt.getListSize[list_datatype] +2i +q).arrowkdb.ar.prettyPrintArray[list_datatype;((1 1);(2 2);(3 3));::] +[ + [ + 1, + 1 + ], + [ + 2, + 2 + ], + [ + 3, + 3 + ] +] +``` + +### **`dt.map`** + +*Create a map datatype, specified in terms of its key and item child datatypes* + +```syntax +.arrowkdb.dt.map[key_datatype_id;item_datatype_id] +``` + +Where: + +- `key_datatype_id` is the identifier of the map key child datatype +- `item_datatype_id` is the identifier of the map item child datatype + +returns the datatype identifier + +```q +q)map_datatype:.arrowkdb.dt.map[.arrowkdb.dt.int64[];.arrowkdb.dt.float64[]] +q).arrowkdb.dt.printDatatype[map_datatype] +map +q).arrowkdb.dt.printDatatype each .arrowkdb.dt.getMapDatatypes[map_datatype] +int64 +double +:: +:: +q).arrowkdb.ar.prettyPrintArray[map_datatype;((enlist 1)!(enlist 1f);(2 2)!(2 2f);(3 3 3)!(3 3 3f));::] +[ + keys: + [ + 1 + ] + values: + [ + 1 + ], + keys: + [ + 2, + 2 + ] + values: + [ + 2, + 2 + ], + keys: + [ + 3, + 3, + 3 + ] + values: + [ + 3, + 3, + 3 + ] +] +``` + +### **`dt.struct`** + +*Create a struct datatype, specified in terms of the field identifiers of its children* + +```syntax +.arrowkdb.dt.struct[field_ids] +``` + +Where `field_ids` is the list of field identifiers of the struct’s children + +returns the datatype identifier + +```q +q)field_one:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] +q)field_two:.arrowkdb.fd.field[`utf8_field;.arrowkdb.dt.utf8[]] +q)struct_datatype:.arrowkdb.dt.struct[field_one,field_two] +q).arrowkdb.dt.printDatatype[struct_datatype] +struct +q).arrowkdb.fd.fieldName each .arrowkdb.dt.getChildFields[struct_datatype] +`int_field`utf8_field +q).arrowkdb.dt.printDatatype each .arrowkdb.fd.fieldDatatype each .arrowkdb.dt.getChildFields[struct_datatype] +int64 +string +:: +:: +q).arrowkdb.ar.prettyPrintArray[struct_datatype;((1 2 3);("aa";"bb";"cc"));::] +-- is_valid: all not null +-- child 0 type: int64 + [ + 1, + 2, + 3 + ] +-- child 1 type: string + [ + "aa", + "bb", + "cc" + ] +q) // By slicing across the lists the logical struct values are: (1,"aa"); (2,"bb"); (3,"cc") +``` + +### **`dt.sparse_union`** + +*Create a sparse union datatype, specified in terms of the field identifiers of its children* + +```syntax +.arrowkdb.dt.sparse_union[field_ids] +``` + +Where `field_ids` is the list of field identifiers of the union’s children + +returns the datatype identifier + +An arrow union array is similar to a struct array except that it has an additional type_id array which identifies the live field in each union value set. + +??? 
warning "`sparse_union` datatype is not supported by Parquet" + + The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) + +```q +q)field_one:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] +q)field_two:.arrowkdb.fd.field[`utf8_field;.arrowkdb.dt.utf8[]] +q)union_datatype:.arrowkdb.dt.sparse_union[field_one,field_two] +q).arrowkdb.dt.printDatatype[union_datatype] +sparse_union +q).arrowkdb.fd.fieldName each .arrowkdb.dt.getChildFields[union_datatype] +`int_field`utf8_field +q).arrowkdb.dt.printDatatype each .arrowkdb.fd.fieldDatatype each .arrowkdb.dt.getChildFields[union_datatype] +int64 +string +:: +:: +q).arrowkdb.ar.prettyPrintArray[union_datatype;((1 0 1h);(1 2 3);("aa";"bb";"cc"));::] +-- is_valid: all not null +-- type_ids: [ + 1, + 0, + 1 + ] +-- child 0 type: int64 + [ + 1, + 2, + 3 + ] +-- child 1 type: string + [ + "aa", + "bb", + "cc" + ] +q) // Looking up the type_id array the logical union values are: "aa", 2, "cc" +``` + +### **`dt.dense_union`** + +*Create a dense union datatype, specified in terms of the field identifiers of its children* + +```syntax +.arrowkdb.dt.dense_union[field_ids] +``` + +Where `field_ids` is the list of field identifiers of the union’s children + +returns the datatype identifier + +An arrow union array is similar to a struct array except that it has an additional type_id array which identifies the live field in each union value set. + +??? warning "`dense_union` datatype is not supported by Parquet" + + The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) + +```q +q)field_one:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] +q)field_two:.arrowkdb.fd.field[`utf8_field;.arrowkdb.dt.utf8[]] +q)union_datatype:.arrowkdb.dt.dense_union[field_one,field_two] +q).arrowkdb.dt.printDatatype[union_datatype] +dense_union +q).arrowkdb.fd.fieldName each .arrowkdb.dt.getChildFields[union_datatype] +`int_field`utf8_field +q).arrowkdb.dt.printDatatype each .arrowkdb.fd.fieldDatatype each .arrowkdb.dt.getChildFields[union_datatype] +int64 +string +:: +:: +q).arrowkdb.ar.prettyPrintArray[union_datatype;((1 0 1h);(1 2 3);("aa";"bb";"cc"));::] +-- is_valid: all not null +-- type_ids: [ + 1, + 0, + 1 + ] +-- value_offsets: [ + 0, + 0, + 0 + ] +-- child 0 type: int64 + [ + 1, + 2, + 3 + ] +-- child 1 type: string + [ + "aa", + "bb", + "cc" + ] +q) // Looking up the type_id array the logical union values are: "aa", 2, "cc" +``` + +### `dt.dictionary` + +*Create a dictionary datatype specified in terms of its value and index datatypes, similar to pandas categorical* + +```syntax +.arrowkdb.dt.dictionary[value_datatype_id;index_datatype_id] +``` + +Where: + +- `value_datatype_id` is the identifier of the dictionary value datatype, must be a scalar type +- `index_datatype_id` is the identifier of the dictionary index datatype, must be a signed int type + +returns the datatype identifier + +??? 
warning "Only the categorical interpretation of a `dictionary` datatype array is saved by Parquet" + + The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) + +```q +q)dict_datatype:.arrowkdb.dt.dictionary[.arrowkdb.dt.utf8[];.arrowkdb.dt.int64[]] +q).arrowkdb.dt.printDatatype[dict_datatype] +dictionary +q).arrowkdb.dt.printDatatype each .arrowkdb.dt.getDictionaryDatatypes[dict_datatype] +string +int64 +:: +:: +q).arrowkdb.ar.prettyPrintArray[dict_datatype;(("aa";"bb";"cc");(2 0 1 0 0));::] + +-- dictionary: + [ + "aa", + "bb", + "cc" + ] +-- indices: + [ + 2, + 0, + 1, + 0, + 0 + ] +q) // The categorical interpretation of the dictionary (looking up the values set at each index) would be: "cc", "aa", "bb", "aa", "aa" +``` + +### `dt.inferDatatype` + +*Infer and construct a datatype from a kdb+ list* + +```syntax +.arrowkdb.dt.inferDatatype[list] +``` + +Where `list` is a kdb+ list + +returns the datatype identifier + +The kdb+ list type is mapped to an Arrow datatype as described [here](#inferreddatatypes). + +```q +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.inferDatatype[(1 2 3j)]] +int64 +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.inferDatatype[("aa";"bb";"cc")]] +string +``` + +## Datatype inspection + +### `dt.datatypeName` + +*Return the base name of a datatype, ignoring any parameters or child datatypes/fields* + +```syntax +.arrowkdb.dt.datatypeName[datatype_id] +``` + +Where `datatype_id` is the identifier of the datatype + +returns a symbol containing the base name of the datatype + +```q +q).arrowkdb.dt.datatypeName[.arrowkdb.dt.int64[]] +`int64 +q).arrowkdb.dt.datatypeName[.arrowkdb.dt.fixed_size_binary[4i]] +`fixed_size_binary +``` + +### `dt.getTimeUnit` + +*Return the TimeUnit of a time32/time64/timestamp/duration datatype* + +```syntax +.arrowkdb.dt.getTimeUnit[datatype_id] +``` + +Where `datatype_id` is the identifier of the datatype + +returns a symbol containing the time unit string: SECOND/MILLI/MICRO/NANO + +```q +q).arrowkdb.dt.getTimeUnit[.arrowkdb.dt.timestamp[`NANO]] +`NANO +``` + +### `dt.getByteWidth` + +*Return the byte_width of a fixed_size_binary datatype* + +```syntax +.arrowkdb.dt.getByteWidth[datatype_id] +``` + +Where `datatype_id` is the identifier of the datatype + +returns the int32 byte width + +```q +q).arrowkdb.dt.getByteWidth[.arrowkdb.dt.fixed_size_binary[4i]] +4i +``` + +### `dt.getListSize` + +*Returns the list_size of a fixed_size_list datatype* + +```syntax +.arrowkdb.dt.getListSize[datatype_id] +``` + +Where `datatype_id` is the identifier of the datatype + +returns the int32 list size + +```q +q).arrowkdb.dt.getListSize[.arrowkdb.dt.fixed_size_list[.arrowkdb.dt.int64[];4i]] +4i +``` + +### `dt.getPrecisionScale` + +*Return the precision and scale of a decimal128 datatype* + +```syntax +.arrowkdb.dt.getPrecisionScale[datatype_id] +``` + +Where `datatype_id` is the identifier of the datatype + +returns the int32 precision and scale + +```q +q).arrowkdb.dt.getPrecisionScale[.arrowkdb.dt.decimal128[38i;2i]] +38 +2 +``` + +### `dt.getListDatatype` + +*Return the child datatype identifier of a list/large_list/fixed_size_list datatype* + +```syntax +.arrowkdb.dt.getListDatatype[datatype_id] +``` + +Where `datatype_id` is the identifier of the datatype + +returns the list’s child datatype identifier + +```q 
+q)list_datatype:.arrowkdb.dt.list[.arrowkdb.dt.int64[]] +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.getListDatatype[list_datatype]] +int64 +``` + +### `dt.getMapDatatypes` + +*Return the key and item child datatype identifiers of a map datatype* + +```syntax +.arrowkdb.dt.getMapDatatypes[datatype_id] +``` + +Where `datatype_id` is the identifier of the datatype + +returns the map’s key and item child datatype identifiers + +```q +q)map_datatype:.arrowkdb.dt.map[.arrowkdb.dt.int64[];.arrowkdb.dt.float64[]] +q).arrowkdb.dt.printDatatype each .arrowkdb.dt.getMapDatatypes[map_datatype] +int64 +double +:: +:: +``` + +### `dt.getDictionaryDatatypes` + +*Return the value and index child datatype identifiers of a dictionary datatype* + +```syntax +.arrowkdb.dt.getDictionaryDatatypes[datatype_id] +``` + +Where `datatype_id` is the identifier of the datatype + +returns the dictionary’s value and index child datatype identifiers + +```q +q)dict_datatype:.arrowkdb.dt.dictionary[.arrowkdb.dt.utf8[];.arrowkdb.dt.int64[]] +q).arrowkdb.dt.printDatatype each .arrowkdb.dt.getDictionaryDatatypes[dict_datatype] +string +int64 +:: +:: +``` + +### `dt.getChildFields` + +*Return the list of child field identifiers of a struct/spare_union/dense_union datatype* + +```syntax +.arrowkdb.dt.getChildFields[datatype_id] +``` + +Where `datatype_id` is the identifier of the datatype + +returns the list of child field identifiers + +```q +q)field_one:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] +q)field_two:.arrowkdb.fd.field[`utf8_field;.arrowkdb.dt.utf8[]] +q)struct_datatype:.arrowkdb.dt.struct[field_one,field_two] +q).arrowkdb.fd.printField each .arrowkdb.dt.getChildFields[struct_datatype] +int_field: int64 not null +utf8_field: string not null +:: +:: +``` + +## Datatype management + +### `dt.printDatatype` + +*Display user-readable information for a datatype, including parameters and nested child datatypes* + +```syntax +.arrowkdb.dt.printDatatype[datatype_id] +``` + +Where `datatype_id` is the identifier of the datatype, + +1. prints datatype information to stdout +1. returns generic null + +??? warning "For debugging use only" + + The information is generated by the `arrow::DataType::ToString()` functionality and displayed on stdout to preserve formatting and indentation. 
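
For nested datatypes the printed description also includes the child datatypes. A small illustrative call is shown below; the exact text is whatever `arrow::DataType::ToString()` produces for the installed Arrow version, so treat the output as indicative only:

```q
q)// Sketch: print a list datatype whose child datatype is utf8
q).arrowkdb.dt.printDatatype[.arrowkdb.dt.list[.arrowkdb.dt.utf8[]]]
list<item: string>
```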
+ +```q +q).arrowkdb.dt.printDatatype[.arrowkdb.dt.fixed_size_list[.arrowkdb.dt.int64[];4i]] +fixed_size_list[4] +``` + +### `dt.listDatatypes` + +*Return the list of identifiers for all datatypes held in the DatatypeStore* + +```syntax +.arrowkdb.dt.listDatatypes[] +``` + +Returns list of datatype identifiers + +```q +q).arrowkdb.dt.int64[] +1i +q).arrowkdb.dt.float64[] +2i +q).arrowkdb.dt.printDatatype each .arrowkdb.dt.listDatatypes[] +int64 +double +:: +:: +``` + +### `dt.removeDatatype` + +*Remove a datatype from the DatatypeStore* + +```syntax +.arrowkdb.dt.removeDatatype[datatype_id] +``` + +Where `datatype_id` is the identifier of the datatype + +returns generic null on success + +```q +q).arrowkdb.dt.int64[] +1i +q).arrowkdb.dt.float64[] +2i +q).arrowkdb.dt.listDatatypes[] +1 2i +q).arrowkdb.dt.removeDatatype[1i] +q).arrowkdb.dt.listDatatypes[] +,2i +``` + +### `dt.equalDatatypes` + +*Check if two datatypes are logically equal, including parameters and nested child datatypes* + +```syntax +.arrowkdb.dt.equalDatatypes[first_datatype_id;second_datatype_id] +``` + +Where: + +- `first_datatype_id` is the identifier of the first datatype +- `second_datatype_id` is the identifier of the second datatype + +returns boolean result + +Internally the DatatypeStore uses the `equalDatatypes` functionality to prevent a new datatype identifier being created when an equal datatype is already present in the DatatypeStore, returning the existing datatype identifier instead. + +```q +q).arrowkdb.dt.equalDatatypes[.arrowkdb.dt.int64[];.arrowkdb.dt.int64[]] +1b +q).arrowkdb.dt.equalDatatypes[.arrowkdb.dt.int64[];.arrowkdb.dt.float64[]] +0b +q).arrowkdb.dt.equalDatatypes[.arrowkdb.dt.fixed_size_binary[4i];.arrowkdb.dt.fixed_size_binary[4i]] +1b +q).arrowkdb.dt.equalDatatypes[.arrowkdb.dt.fixed_size_binary[2i];.arrowkdb.dt.fixed_size_binary[4i]] +0b +q).arrowkdb.dt.equalDatatypes[.arrowkdb.dt.list[.arrowkdb.dt.int64[]];.arrowkdb.dt.list[.arrowkdb.dt.int64[]]] +1b +q).arrowkdb.dt.equalDatatypes[.arrowkdb.dt.list[.arrowkdb.dt.int64[]];.arrowkdb.dt.list[.arrowkdb.dt.float64[]]] +0b +``` + + +## Field constructor + +### `fd.field` + +*Create a field instance from its name and datatype* + +```syntax +.arrowkdb.fd.field[field_name;datatype_id] +``` + +Where: + +- `field_name` is a symbol containing the field’s name +- `datatype_id` is the identifier of the field’s datatype + +returns the field identifier + +```q +q).arrowkdb.fd.printField[.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]]] +int_field: int64 not null +``` + +## Field inspection + +### `fd.fieldName` + +_Name of a field_ + +```syntax +.arrowkdb.fd.fieldName[field_id] +``` + +Where `field_id` is the field identifier + +returns a symbol containing the field’s name + +```q +q)field:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] +q).arrowkdb.fd.fieldName[field] +`int_field +``` + +### `fd.fieldDatatype` + +_Datatype of a field_ + +```syntax +.arrowkdb.fd.fieldDatatype[field_id] +``` + +Where `field_id` is the field identifier + +returns the datatype identifier + +```q +q)field:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] +q).arrowkdb.dt.printDatatype[.arrowkdb.fd.fieldDatatype[field]] +int64 +``` + + +## Field management + +### `fd.printField` + +*Display user readable information for a field, including name and datatype* + +```syntax +.arrowkdb.fd.printField[field_id] +``` + +Where `field_id` is the identifier of the field, + +1. prints field information to stdout +1. returns generic null + +??? 
warning "For debugging use only" + + The information is generated by the `arrow::Field::ToString()` functionality and displayed on stdout to preserve formatting and indentation. + +```q +q).arrowkdb.fd.printField[.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]]] +int_field: int64 not null +``` + +### `fd.listFields` + +_List of identifiers for all fields held in the FieldStore_ + +```syntax +.arrowkdb.fd.listFields[] +``` + +Returns list of field identifiers + +```q +q).arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] +1i +q).arrowkdb.fd.field[`float_field;.arrowkdb.dt.float64[]] +2i +q).arrowkdb.fd.printField each .arrowkdb.fd.listFields[] +int_field: int64 not null +float_field: double not null +:: +:: +``` + +### `fd.removeField` + +*Remove a field from the FieldStore* + +```syntax +.arrowkdb.fd.removeField[field_id] +``` + +Where `field_id` is the identifier of the field + +returns generic null on success + +```q +q).arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] +1i +q).arrowkdb.fd.field[`float_field;.arrowkdb.dt.float64[]] +2i +q).arrowkdb.fd.listFields[] +1 2i +q).arrowkdb.fd.removeField[1i] +q).arrowkdb.fd.listFields[] +,2i +``` + +### `fd.equalFields` + +*Check if two fields are logically equal, including names and datatypes* + +```syntax +.arrowkdb.fd.equalDatatypes[first_field_id;second_field_id] +``` + +Where: + +- `first_field_id` is the identifier of the first field +- `second_field_id` is the identifier of the second field + +returns boolean result + +Internally the FieldStore uses the `equalFields` functionality to prevent a new field identifier being created when an equal field is already present in the FieldStore, returning the existing field identifier instead. + +```q +q)int_dt:.arrowkdb.dt.int64[] +q)float_dt:.arrowkdb.dt.float64[] +q).arrowkdb.fd.equalFields[.arrowkdb.fd.field[`f1;int_dt];.arrowkdb.fd.field[`f1;int_dt]] +1b +q).arrowkdb.fd.equalFields[.arrowkdb.fd.field[`f1;int_dt];.arrowkdb.fd.field[`f2;int_dt]] +0b +q).arrowkdb.fd.equalFields[.arrowkdb.fd.field[`f1;int_dt];.arrowkdb.fd.field[`f1;float_dt]] +0b +``` + +## Schema constructors + +### `sc.schema` + +*Create a schema instance from a list of field identifiers* + +```syntax +.arrowkdb.sc.schema[field_ids] +``` + +Where `fields_ids` is a list of field identifiers + +returns the schema identifier + +```q +q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] +q)f2:.arrowkdb.fd.field[`float_field;.arrowkdb.dt.float64[]] +q).arrowkdb.sc.printSchema[.arrowkdb.sc.schema[(f1,f2)]] +int_field: int64 not null +float_field: double not null +``` + +### `sc.inferSchema` + +*Infer and construct a schema based on a kdb+ table* + +```syntax +.arrowkdb.sc.inferSchema[table] +``` + +Where `table` is a kdb+ table or dictionary + +returns the schema identifier + +??? warning "Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors" + + Each column in the table is mapped to a field in the schema. The column name is used as the field name and the column’s kdb+ type is mapped to an Arrow datatype as as described [here](#inferred-datatypes). 
+ +```q +q)schema_from_table:.arrowkdb.sc.inferSchema[([] int_field:(1 2 3); float_field:(4 5 6f); str_field:("aa";"bb";"cc"))] +q).arrowkdb.sc.printSchema[schema_from_table] +int_field: int64 +float_field: double +str_field: string +``` + +## Schema inspection + +### `sc.schemaFields` + +*Return the list of field identifiers used by a schema* + +```syntax +.arrowkdb.sc.schemaFields[schema_id] +``` + +Where `schema_id` is the schema identifier + +returns list of field identifiers used by the schema + +```q +q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] +q)f2:.arrowkdb.fd.field[`float_field;.arrowkdb.dt.float64[]] +q)schema:.arrowkdb.sc.schema[(f1,f2)] +q).arrowkdb.fd.printField each .arrowkdb.sc.schemaFields[schema] +int_field: int64 not null +float_field: double not null +:: +:: +``` + +## Schema management + +### `sc.printSchema` + +*Display user readable information for a schema, including its fields and their order* + +```syntax +.arrowkdb.sc.printSchema[schema_id] +``` + +Where `schema_id` is the identifier of the schema, + +1. prints schema information to stdout +1. returns generic null + +??? warning "For debugging use only" + + The information is generated by the `arrow::Schema::ToString()` functionality and displayed on stdout to preserve formatting and indentation. + +```q +q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] +q)f2:.arrowkdb.fd.field[`float_field;.arrowkdb.dt.float64[]] +q)f3:.arrowkdb.fd.field[`str_field;.arrowkdb.dt.utf8[]] +q)schema:.arrowkdb.sc.schema[(f1,f2,f3)] +q).arrowkdb.sc.printSchema[schema] +int_field: int64 not null +float_field: double not null +str_field: string not null +``` + +### `sc.listSchemas` + +*Return the list of identifiers for all schemas held in the SchemaStore* + +```syntax +.arrowkdb.sc.listSchemas[] +``` + +Returns list of schema identifiers + +```q +q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] +q)f2:.arrowkdb.fd.field[`float_field;.arrowkdb.dt.float64[]] +q).arrowkdb.sc.schema[(f1,f2)] +1i +q).arrowkdb.sc.schema[(f2,f1)] +2i +q).arrowkdb.sc.listSchemas[] +1 2i +``` + +### `sc.removeSchema` + +*Remove a schema from the SchemaStore* + +```syntax +.arrowkdb.sc.removeSchema[schema_id] +``` + +Where `schema_id` is the identifier of the schema + +returns generic null on success + +```q +q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] +q)f2:.arrowkdb.fd.field[`float_field;.arrowkdb.dt.float64[]] +q).arrowkdb.sc.schema[(f1,f2)] +1i +q).arrowkdb.sc.schema[(f2,f1)] +2i +q).arrowkdb.sc.listSchemas[] +1 2i +q).arrowkdb.sc.removeSchema[1i] +q).arrowkdb.sc.listSchemas[] +,2i +``` + +### `sc.equalSchemas` + +*Check if two schemas are logically equal, including their fields and the fields' order* + +```syntax +.arrowkdb.sc.equalSchemas[first_schema_id;second_schema_id] +``` + +Where: + +- `first_schema_id` is the identifier of the first schema +- `second_schema_id` is the identifier of the second schema + +returns boolean result + +Internally the SchemaStore uses the `equalSchemas` functionality to prevent a new schema identifier being created when an equal schema is already present in the SchemaStore, returning the existing schema identifier instead. 
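
Because of this deduplication, constructing the same schema twice returns the existing identifier rather than registering a new one. A small sketch:

```q
q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]]
q)// Both calls resolve to the same entry in the SchemaStore
q).arrowkdb.sc.schema[enlist f1]~.arrowkdb.sc.schema[enlist f1]
1b
```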
+ +```q +q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] +q)f2:.arrowkdb.fd.field[`float_field;.arrowkdb.dt.float64[]] +q).arrowkdb.sc.schema[(f1,f2)] +1i +q).arrowkdb.sc.schema[(f2,f1)] +2i +q).arrowkdb.sc.equalSchemas[.arrowkdb.sc.schema[(f1,f2)];.arrowkdb.sc.schema[(f1,f2)]] +1b +q).arrowkdb.sc.equalSchemas[.arrowkdb.sc.schema[(f1,f2)];.arrowkdb.sc.schema[(f1,f1)]] +0b +q).arrowkdb.sc.equalSchemas[.arrowkdb.sc.schema[(f1,f2)];.arrowkdb.sc.schema[(f2,f1)]] +0b +``` + +## Array data + +### `ar.prettyPrintArray` + +*Convert a kdb+ list to an Arrow array and pretty print the array* + +```syntax +.arrowkdb.ar.prettyPrintArray[datatype_id;list;options] +``` + +Where: + +- `datatype_id` is the datatype identifier of the array +- `list` is the kdb+ list data to be displayed +- `options` is a kdb+ dictionary of options or generic null (::) to use defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or mixed list of -7|-11|4h. + +the function + +1. prints array contents to stdout +1. returns generic null + +Supported options: + +- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. + +??? warning "For debugging use only" + + The information is generated by the `arrow::PrettyPrint()` functionality and displayed on stdout to preserve formatting and indentation. + +```q +q)int_datatype:.arrowkdb.dt.int64[] +q).arrowkdb.ar.prettyPrintArray[int_datatype;(1 2 3j);::] +[ + 1, + 2, + 3 +] +``` + +### `ar.prettyPrintArrayFromList` + +*Convert a kdb+ list to an Arrow array and pretty print the array, inferring the datatype from the kdb+ list type* + +```syntax +.arrowkdb.ar.prettyPrintArrayFromList[list;options] +``` + +Where: + +- `list` is the kdb+ list data to be displayed +- `options` is reserved for future use - specify generic null (::) + +the function + +1. prints array contents to stdout +1. returns generic null + +The kdb+ list type is mapped to an Arrow datatype as described [here](#inferreddatatypes). + +??? warning "For debugging use only" + + The information is generated by the `arrow::PrettyPrint()` functionality and displayed on stdout to preserve formatting and indentation. + +```q +q).arrowkdb.ar.prettyPrintArrayFromList[(1 2 3j);::] +[ + 1, + 2, + 3 +] +``` + +## Table data + +### `tb.prettyPrintTable` + +*Convert a kdb+ mixed list of array data to an Arrow table and pretty print the table* + +``` +.arrowkdb.tb.prettyPrintTable[schema_id;array_data;options] +``` + +Where: + +- `schema_id` is the schema identifier of the table +- `array_data` is a mixed list of array data +- `options` is a kdb+ dictionary of options or generic null (::) to use defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or mixed list of -7|-11|4h. + +the function + +1. prints table contents to stdout +1. returns generic null + +The mixed list of Arrow array data should be ordered in schema field number and each list item representing one of the arrays must be structured according to the field’s datatype. + +Supported options: + +- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. + +??? warning "For debugging use only" + + The information is generated by the `arrow::Table::ToString()` functionality and displayed on stdout to preserve formatting and indentation. 
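
The `options` dictionary follows the same layout for every function that accepts one: a symbol list of option names mapped to their values. As a minimal sketch (any option omitted keeps its default; the decimal values are illustrative), a dictionary enabling the decimal128-as-double mapping could be built and passed as follows:

```q
q)options:(enlist `DECIMAL128_AS_DOUBLE)!enlist 1
q)options
DECIMAL128_AS_DOUBLE| 1
q)// With the override enabled, the decimal128 column data is supplied as a float list (9h)
q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.decimal128[38i;2i];1.23 4.56 7.89;options]
```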
+ +```q +q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] +q)f2:.arrowkdb.fd.field[`float_field;.arrowkdb.dt.float64[]] +q)f3:.arrowkdb.fd.field[`str_field;.arrowkdb.dt.utf8[]] +q)schema:.arrowkdb.sc.schema[(f1,f2,f3)] +q).arrowkdb.tb.prettyPrintTable[schema;((1 2 3j);(4 5 6f);("aa";"bb";"cc"));::] +int_field: int64 not null +float_field: double not null +str_field: string not null +---- +int_field: + [ + [ + 1, + 2, + 3 + ] + ] +float_field: + [ + [ + 4, + 5, + 6 + ] + ] +str_field: + [ + [ + "aa", + "bb", + "cc" + ] + ] +``` + +### `tb.prettyPrintTableFromTable` + +*Convert a kdb+ table to an Arrow table and pretty print the table, inferring the schema from the kdb+ table structure* + +```syntax +.arrowkdb.tb.prettyPrintTableFromTable[table;options] +``` + +Where: + +- `table` is a kdb+ table +- `options` is reserved for future use - specify generic null (::) + +the function + +1. prints table contents to stdout +1. returns generic null + +Each column in the table is mapped to a field in the schema. The column name is used as the field name and the column’s kdb+ type is mapped to an Arrow datatype as as described [here](#inferreddatatypes). + +??? warning "Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors" + + Each column in the table is mapped to a field in the schema. The column name is used as the field name and the column’s kdb+ type is mapped to an Arrow datatype as as described [here](#inferred-datatypes). + +??? warning "For debugging use only" + + The information is generated by the `arrow::Table::ToString()` functionality and displayed on stdout to preserve formatting and indentation. + +```q +q).arrowkdb.tb.prettyPrintTableFromTable[([] int_field:(1 2 3); float_field:(4 5 6f); str_field:("aa";"bb";"cc"));::] +int_field: int64 +float_field: double +str_field: string +---- +int_field: + [ + [ + 1, + 2, + 3 + ] + ] +float_field: + [ + [ + 4, + 5, + 6 + ] + ] +str_field: + [ + [ + "aa", + "bb", + "cc" + ] + ] +``` + +## Parquet files + +### `pq.writeParquet` + +*Convert a kdb+ mixed list of array data to an Arrow table and write to a Parquet file* + +```syntax +.arrowkdb.pq.writeParquet[parquet_file;schema_id;array_data;options] +``` + +Where: + +- `parquet_file` is a string containing the Parquet file name +- `schema_id` is the schema identifier to use for the table +- `array_data` is a mixed list of array data +- `options` is a kdb+ dictionary of options or generic null (::) to use defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or mixed list of -7|-11|4h. + +returns generic null on success + +The mixed list of Arrow array data should be ordered in schema field number and each list item representing one of the arrays must be structured according to the field’s datatype. + +Supported options: + +- `PARQUET_CHUNK_SIZE` - Controls the approximate size of encoded data pages within a column chunk. Long, default 1MB. +- `PARQUET_VERSION` - Select the Parquet format version, either `V1.0` or `V2.0`. `V2.0` is more fully featured but may be incompatible with older Parquet implementations. String, default `V1.0` +- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. + +??? warning "The Parquet format is compressed and designed for for maximum space efficiency which may cause a performance overhead compared to Arrow. 
Parquet is also less fully featured than Arrow which can result in schema limitations" + + The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) + +```q +q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] +q)f2:.arrowkdb.fd.field[`float_field;.arrowkdb.dt.float64[]] +q)f3:.arrowkdb.fd.field[`str_field;.arrowkdb.dt.utf8[]] +q)schema:.arrowkdb.sc.schema[(f1,f2,f3)] +q)array_data:((1 2 3j);(4 5 6f);("aa";"bb";"cc")) +q).arrowkdb.pq.writeParquet["file.parquet";schema;array_data;::] +q)read_data:.arrowkdb.pq.readParquetData["file.parquet";::] +q)array_data~read_data +1b +``` + +### `pq.writeParquetFromTable` + +*Convert a kdb+ table to an Arrow table and write to a Parquet file, inferring the schema from the kdb+ table structure* + +```syntax +.arrowkdb.pq.writeParquetFromTable[parquet_file;table;options] +``` + +Where: + +- `parquet_file` is a string containing the Parquet file name +- `table` is a kdb+ table +- `options` is a kdb+ dictionary of options or generic null (::) to use defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or mixed list of -7|-11|4h. + +returns generic null on success + +Supported options: + +- `PARQUET_CHUNK_SIZE` - Controls the approximate size of encoded data pages within a column chunk. Long, default 1MB. +- `PARQUET_VERSION` - Select the Parquet format version, either `V1.0` or `V2.0`. `V2.0` is more fully featured but may be incompatible with older Parquet implementations. String, default `V1.0` + +??? warning "Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors" + + Each column in the table is mapped to a field in the schema. The column name is used as the field name and the column’s kdb+ type is mapped to an Arrow datatype as as described [here](#inferred-datatypes). + +```q +q)table:([] int_field:(1 2 3); float_field:(4 5 6f); str_field:("aa";"bb";"cc")) +q).arrowkdb.pq.writeParquetFromTable["file.parquet";table;::] +q)read_table:.arrowkdb.pq.readParquetToTable["file.parquet";::] +q)read_table~table +1b +``` + +### `pq.readParquetSchema` + +*Read the schema from a Parquet file* + +```syntax +.arrowkdb.pq.readParquetSchema[parquet_file] +``` + +Where `parquet_file` is a string containing the Parquet file name + +returns the schema identifier + +```q +q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] +q)f2:.arrowkdb.fd.field[`float_field;.arrowkdb.dt.float64[]] +q)f3:.arrowkdb.fd.field[`str_field;.arrowkdb.dt.utf8[]] +q)schema:.arrowkdb.sc.schema[(f1,f2,f3)] +q)array_data:((1 2 3j);(4 5 6f);("aa";"bb";"cc")) +q).arrowkdb.pq.writeParquet["file.parquet";schema;array_data;::] +q).arrowkdb.sc.equalSchemas[schema;.arrowkdb.pq.readParquetSchema["file.parquet"]] +1b +``` + +### `pq.readParquetData` + +*Read an Arrow table from a Parquet file and convert to a kdb+ mixed list of array data* + +```syntax +.arrowkdb.pq.readParquetData[parquet_file;options] +``` + +Where: + +- `parquet_file` is a string containing the Parquet file name +- `options` is a kdb+ dictionary of options or generic null (::) to use defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or mixed list of -7|-11|4h. 
+ +returns the array data + +Supported options: + +- `PARQUET_MULTITHREADED_READ` - Flag indicating whether the Parquet reader should run in multithreaded mode. This can improve performance by processing multiple columns in parallel. Long, default 0. +- `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. +- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. + +```q +q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] +q)f2:.arrowkdb.fd.field[`float_field;.arrowkdb.dt.float64[]] +q)f3:.arrowkdb.fd.field[`str_field;.arrowkdb.dt.utf8[]] +q)schema:.arrowkdb.sc.schema[(f1,f2,f3)] +q)array_data:((1 2 3j);(4 5 6f);("aa";"bb";"cc")) +q).arrowkdb.pq.writeParquet["file.parquet";schema;array_data;::] +q)read_data:.arrowkdb.pq.readParquetData["file.parquet";::] +q)array_data~read_data +1b +``` + +### `pq.readParquetColumn` + +*Read a single column from a Parquet file and convert to a kdb+ list* + +```syntax +.arrowkdb.pq.readParquetColumn[parquet_file;column_index;options] +``` + +Where: + +- `parquet_file` is a string containing the Parquet file name +- `column_index` is the index of the column to read, relative to the schema field order +- `options` is a kdb+ dictionary of options or generic null (::) to use defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or mixed list of -7|-11|4h. + +returns the array’s data + +Supported options: + +- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. + +```q +q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] +q)f2:.arrowkdb.fd.field[`float_field;.arrowkdb.dt.float64[]] +q)f3:.arrowkdb.fd.field[`str_field;.arrowkdb.dt.utf8[]] +q)schema:.arrowkdb.sc.schema[(f1,f2,f3)] +q)array_data:((1 2 3j);(4 5 6f);("aa";"bb";"cc")) +q).arrowkdb.pq.writeParquet["file.parquet";schema;array_data;::] +q)col1:.arrowkdb.pq.readParquetColumn["file.parquet";1i;::] +q)col1~array_data[1] +1b +``` + +### `pq.readParquetToTable` + +*Read an Arrow table from a Parquet file and convert to a kdb+ table* + +```syntax +.arrowkdb.pq.readParquetToTable[parquet_file;options] +``` + +Where: + +- `parquet_file` is a string containing the Parquet file name +- `options` is a kdb+ dictionary of options or generic null (::) to use defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or mixed list of -7|-11|4h. + +returns the kdb+ table + +Each schema field name is used as the column name and the Arrow array data is used as the column data. + +Supported options: + +- `PARQUET_MULTITHREADED_READ` - Flag indicating whether the Parquet reader should run in multithreaded mode. This can improve performance by processing multiple columns in parallel. Long, default 0. +- `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. +- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. 
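
As an illustration of passing these options (a sketch which assumes `file.parquet` was written as in the examples above), a reader options dictionary might be constructed as:

```q
q)// Enable multithreaded reading and memory mapping; omitted options keep their defaults
q)read_options:`PARQUET_MULTITHREADED_READ`USE_MMAP!1 1
q)read_table:.arrowkdb.pq.readParquetToTable["file.parquet";read_options]
```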
+ +```q +q)table:([] int_field:(1 2 3); float_field:(4 5 6f); str_field:("aa";"bb";"cc")) +q).arrowkdb.pq.writeParquetFromTable["file.parquet";table;::] +q)read_table:.arrowkdb.pq.readParquetToTable["file.parquet";::] +q)read_table~table +1b +``` + +## Arrow IPC files + +### `ipc.writeArrow` + +*Convert a kdb+ mixed list of array data to an Arrow table and write to an Arrow file* + +```syntax +.arrowkdb.ipc.writeArrow[arrow_file;schema_id;array_data;options] +``` + +Where: + +- `arrow_file` is a string containing the Arrow file name +- `schema_id` is the schema identifier to use for the table +- `array_data` is a mixed list of array data +- `options` is a kdb+ dictionary of options or generic null (::) to use defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or mixed list of -7|-11|4h. + +returns generic null on success + +The mixed list of Arrow array data should be ordered in schema field number and each list item representing one of the arrays must be structured according to the field’s datatype. + +Supported options: + +- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. + +```q +q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] +q)f2:.arrowkdb.fd.field[`float_field;.arrowkdb.dt.float64[]] +q)f3:.arrowkdb.fd.field[`str_field;.arrowkdb.dt.utf8[]] +q)schema:.arrowkdb.sc.schema[(f1,f2,f3)] +q)array_data:((1 2 3j);(4 5 6f);("aa";"bb";"cc")) +q).arrowkdb.ipc.writeArrow["file.arrow";schema;array_data;::] +q)read_data:.arrowkdb.ipc.readArrowData["file.arrow";::] +q)read_data~array_data +1b +``` + +### `ipc.writeArrowFromTable` + +*Convert a kdb+ table to an Arrow table and write to an Arrow file, inferring the schema from the kdb+ table structure* + +```syntax +.arrowkdb.ipc.writeArrowFromTable[arrow_file;table;options] +``` + +Where: + +- `arrow_file` is a string containing the Arrow file name +- `table` is a kdb+ table +- `options` is reserved for future use - specify generic null (::) + +returns generic null on success + +??? warning "Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors" + + Each column in the table is mapped to a field in the schema. The column name is used as the field name and the column’s kdb+ type is mapped to an Arrow datatype as as described [here](#inferred-datatypes). 
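
Before writing, the schema that will be inferred from the kdb+ table can be checked with `sc.inferSchema` and `sc.printSchema`, mirroring the earlier inferred-schema example:

```q
q)table:([] int_field:(1 2 3); float_field:(4 5 6f); str_field:("aa";"bb";"cc"))
q).arrowkdb.sc.printSchema[.arrowkdb.sc.inferSchema[table]]
int_field: int64
float_field: double
str_field: string
```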
+ +```q +q)table:([] int_field:(1 2 3); float_field:(4 5 6f); str_field:("aa";"bb";"cc")) +q).arrowkdb.ipc.writeArrowFromTable["file.arrow";table;::] +q)read_table:.arrowkdb.ipc.readArrowToTable["file.arrow";::] +q)read_table~table +1b +``` + +### `ipc.readArrowSchema` + +*Read the schema from an Arrow file* + +```syntax +.arrowkdb.ipc.readArrowSchema[arrow_file] +``` + +Where `arrow_file` is a string containing the Arrow file name + +returns the schema identifier + +```q +q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] +q)f2:.arrowkdb.fd.field[`float_field;.arrowkdb.dt.float64[]] +q)f3:.arrowkdb.fd.field[`str_field;.arrowkdb.dt.utf8[]] +q)schema:.arrowkdb.sc.schema[(f1,f2,f3)] +q)array_data:((1 2 3j);(4 5 6f);("aa";"bb";"cc")) +q).arrowkdb.ipc.writeArrow["file.arrow";schema;array_data;::] +q).arrowkdb.sc.equalSchemas[schema;.arrowkdb.ipc.readArrowSchema["file.arrow"]] +1b +``` + +### `ipc.readArrowData` + +*Read an Arrow table from an Arrow file and convert to a kdb+ mixed list of array data* + +```syntax +.arrowkdb.ipc.readArrowData[arrow_file;options] +``` + +Where: + +- `arrow_file` is a string containing the Arrow file name +- `options` is a kdb+ dictionary of options or generic null (::) to use defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or mixed list of -7|-11|4h. + +returns the array data + +Supported options: + +- `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. +- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. + +```q +q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] +q)f2:.arrowkdb.fd.field[`float_field;.arrowkdb.dt.float64[]] +q)f3:.arrowkdb.fd.field[`str_field;.arrowkdb.dt.utf8[]] +q)schema:.arrowkdb.sc.schema[(f1,f2,f3)] +q)array_data:((1 2 3j);(4 5 6f);("aa";"bb";"cc")) +q).arrowkdb.ipc.writeArrow["file.arrow";schema;array_data;::] +q)read_data:.arrowkdb.ipc.readArrowData["file.arrow";::] +q)read_data~array_data +1b +``` + +### `ipc.readArrowToTable` + +*Read an Arrow table from an Arrow file and convert to a kdb+ table* + +```syntax +.arrowkdb.ipc.readArrowToTable[arrow_file;options] +``` + +Where: + +- `arrow_file` is a string containing the Arrow file name +- `options` is a kdb+ dictionary of options or generic null (::) to use defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or mixed list of -7|-11|4h. + +returns the kdb+ table + +Each schema field name is used as the column name and the Arrow array data is used as the column data. + +Supported options: + +- `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. +- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. 
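
For example, to memory-map the Arrow file while reading (a minimal sketch; `file.arrow` is assumed to exist from the earlier write examples):

```q
q)read_table:.arrowkdb.ipc.readArrowToTable["file.arrow";enlist[`USE_MMAP]!enlist 1]
```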
+ +```q +q)table:([] int_field:(1 2 3); float_field:(4 5 6f); str_field:("aa";"bb";"cc")) +q).arrowkdb.ipc.writeArrowFromTable["file.arrow";table;::] +q)read_table:.arrowkdb.ipc.readArrowToTable["file.arrow";::] +q)read_table~table +1b +``` + +## Arrow IPC streams + +### `ipc.serializeArrow` + +*Convert a kdb+ mixed list of array data to an Arrow table and serialize to an Arrow stream* + +```syntax +.arrowkdb.ipc.serializeArrow[schema_id;array_data;options] +``` + +Where: + +- `schema_id` is the schema identifier to use for the table +- `array_data` is a mixed list of array data +- `options` is a kdb+ dictionary of options or generic null (::) to use defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or mixed list of -7|-11|4h. + +returns a byte list containing the serialized stream data + +The mixed list of Arrow array data should be ordered in schema field number and each list item representing one of the arrays must be structured according to the field’s datatype. + +Supported options: + +- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. + +```q +q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] +q)f2:.arrowkdb.fd.field[`float_field;.arrowkdb.dt.float64[]] +q)f3:.arrowkdb.fd.field[`str_field;.arrowkdb.dt.utf8[]] +q)schema:.arrowkdb.sc.schema[(f1,f2,f3)] +q)array_data:((1 2 3j);(4 5 6f);("aa";"bb";"cc")) +q)serialized:.arrowkdb.ipc.serializeArrow[schema;array_data;::] +q)read_data:.arrowkdb.ipc.parseArrowData[serialized;::] +q)read_data~array_data +1b +``` + +### `ipc.serializeArrowFromTable` + +*Convert a kdb+ table to an Arrow table and serialize to an Arrow stream, inferring the schema from the kdb+ table structure* + +```syntax +.arrowkdb.ipc.serializeArrowFromTable[table;options] +``` + +Where: + +- `table` is a kdb+ table +- `options` is reserved for future use - specify generic null (::) + +returns a byte list containing the serialized stream data + +??? warning "Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors" + + Each column in the table is mapped to a field in the schema. The column name is used as the field name and the column’s kdb+ type is mapped to an Arrow datatype as as described [here](#inferred-datatypes). 
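
A typical use of the serialized stream is to ship it to another kdb+ process over IPC and rebuild the table there. The sketch below assumes a second process with arrowkdb loaded is listening on port 5000; the handle, port and table contents are illustrative only:

```q
q)table:([] int_field:(1 2 3); float_field:(4 5 6f))
q)serialized:.arrowkdb.ipc.serializeArrowFromTable[table;::]
q)h:hopen 5000                              / assumed listener with arrowkdb loaded
q)remote_table:h(".arrowkdb.ipc.parseArrowToTable";serialized;::)
q)remote_table~table
1b
```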
+ +```q +q)table:([] int_field:(1 2 3); float_field:(4 5 6f); str_field:("aa";"bb";"cc")) +q)serialized:.arrowkdb.ipc.serializeArrowFromTable[table;::] +q)new_table:.arrowkdb.ipc.parseArrowToTable[serialized;::] +q)new_table~table +1b +``` + +### `ipc.parseArrowSchema` + +*Parse the schema from an Arrow stream* + +```syntax +.arrowkdb.ipc.parseArrowSchema[serialized] +``` + +Where `serialized` is a byte list containing the serialized stream data + +returns the schema identifier + +```q +q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] +q)f2:.arrowkdb.fd.field[`float_field;.arrowkdb.dt.float64[]] +q)f3:.arrowkdb.fd.field[`str_field;.arrowkdb.dt.utf8[]] +q)schema:.arrowkdb.sc.schema[(f1,f2,f3)] +q)array_data:((1 2 3j);(4 5 6f);("aa";"bb";"cc")) +q)serialized:.arrowkdb.ipc.serializeArrow[schema;array_data;::] +q).arrowkdb.sc.equalSchemas[schema;.arrowkdb.ipc.parseArrowSchema[serialized]] +1b +``` + +### `ipc.parseArrowData` + +*Parse an Arrow table from an Arrow stream and convert to a kdb+ mixed list of array data* + +```syntax +.arrowkdb.ipc.parseArrowData[serialized;options] +``` + +Where: + +- `serialized` is a byte list containing the serialized stream data +- `options` is a kdb+ dictionary of options or generic null (::) to use defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or mixed list of -7|-11|4h. + +returns the array data + +Supported options: + +- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. + +```q +q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] +q)f2:.arrowkdb.fd.field[`float_field;.arrowkdb.dt.float64[]] +q)f3:.arrowkdb.fd.field[`str_field;.arrowkdb.dt.utf8[]] +q)schema:.arrowkdb.sc.schema[(f1,f2,f3)] +q)array_data:((1 2 3j);(4 5 6f);("aa";"bb";"cc")) +q)serialized:.arrowkdb.ipc.serializeArrow[schema;array_data;::] +q)read_data:.arrowkdb.ipc.parseArrowData[serialized;::] +q)read_data~array_data +1b +``` + +### `ipc.parseArrowToTable` + +*Parse an Arrow table from an Arrow file and convert to a kdb+ table* + +```syntax +.arrowkdb.ipc.parseArrowToTable[serialized;options] +``` + +Where: + +- `serialized` is a byte list containing the serialized stream data +- `options` is a kdb+ dictionary of options or generic null (::) to use defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or mixed list of -7|-11|4h. + +returns the kdb+ table + +Each schema field name is used as the column name and the Arrow array data is used as the column data. + +Supported options: + +- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. + +```q +q)table:([] int_field:(1 2 3); float_field:(4 5 6f); str_field:("aa";"bb";"cc")) +q)serialized:.arrowkdb.ipc.serializeArrowFromTable[table;::] +q)new_table:.arrowkdb.ipc.parseArrowToTable[serialized;::] +q)new_table~table +1b +``` + +## Utilities + +### `util.buildInfo` + +*Return build information regarding the in use Arrow library* + +```syntax +.arrowkdb.util.buildInfo[] +``` + +Returns a dictionary detailing various Arrow build info including: Arrow version, shared object version, git description and compiler used. 
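
Since the result is a plain kdb+ dictionary, individual entries can be indexed directly, for example to log the Arrow version in use. The values shown here mirror the example output that follows and will differ per build:

```q
q)info:.arrowkdb.util.buildInfo[]
q)info`version_string
`3.0.0-SNAPSHOT
q)info`version
3000000i
```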
+ +```q +q).arrowkdb.util.buildInfo[] +version | 3000000i +version_string | `3.0.0-SNAPSHOT +full_so_version | `300.0.0 +compiler_id | `MSVC +compiler_version| `19.26.28806.0 +compiler_flags | `/DWIN32 /D_WINDOWS /GR /EHsc /D_SILENCE_TR1_NAMESPACE_DEP.. +git_id | `c8c2110cd7d01d2f4420079c450997ef5fa89029 +git_description | `apache-arrow-2.0.0-194-gc8c2110cd +package_kind | ` +``` + diff --git a/examples/README.md b/examples/README.md index 6b7e3c7..d0d6605 100644 --- a/examples/README.md +++ b/examples/README.md @@ -30,7 +30,7 @@ tstamp temperature fill_level pump_status comment // Pretty print the Arrow table populated from a kdb+ table // The schema is inferred from the kdb+ table structure -q).arrowkdb.tb.prettyPrintTableFromTable[table] +q).arrowkdb.tb.prettyPrintTableFromTable[table;::] tstamp: timestamp[ns] not null temperature: double not null fill_level: int64 not null @@ -117,7 +117,7 @@ Write the kdb+ table to an Arrow file then read it back ```q // Write the table to an arrow file -q).arrowkdb.ipc.writeArrowFromTable["inferred_schema.arrow";table] +q).arrowkdb.ipc.writeArrowFromTable["inferred_schema.arrow";table;::] q)show system "ls inferred_schema.arrow" "inferred_schema.arrow" @@ -135,12 +135,12 @@ Write the kdb+ table to an Arrow stream then read it back ```q // Serialize the table to an arrow stream -q)serialized:.arrowkdb.ipc.serializeArrowFromTable[table] +q)serialized:.arrowkdb.ipc.serializeArrowFromTable[table;::] q)show serialized 0xffffffff500100001000000000000a000c000600050008000a000000000104000c000000080.. // Parse the arrow stream into another table -q)new_table:.arrowkdb.ipc.parseArrowToTable[serialized] +q)new_table:.arrowkdb.ipc.parseArrowToTable[serialized;::] // Compare the kdb+ tables q)show table~new_table @@ -204,7 +204,7 @@ q)comment_data:N?("start";"stop";"alert";"acknowledge";"") q)array_data:(tstamp_data;temp_data;fill_data;pump_data;comment_data) // Pretty print the Arrow table populated from the array data -q).arrowkdb.tb.prettyPrintTable[schema;array_data] +q).arrowkdb.tb.prettyPrintTable[schema;array_data;::] tstamp: timestamp[ns] not null temperature: double not null fill_level: int64 not null @@ -300,7 +300,7 @@ Write the schema and array data to an Arrow file then read them back ```q // Write the schema and array data to an arrow file -q).arrowkdb.ipc.writeArrow["constructed_schema.arrow";schema;array_data] +q).arrowkdb.ipc.writeArrow["constructed_schema.arrow";schema;array_data;::] q)show system "ls constructed_schema.arrow" "constructed_schema.arrow" @@ -327,7 +327,7 @@ Write the schema and array data to an Arrow stream then read them back ```q // Serialize the schema and array data to an arrow stream -q)serialized:.arrowkdb.ipc.serializeArrow[schema;array_data] +q)serialized:.arrowkdb.ipc.serializeArrow[schema;array_data;::] q)show serialized 0xffffffff500100001000000000000a000c000600050008000a000000000104000c000000080.. 
@@ -341,7 +341,7 @@ q)show schema~new_schema 1b // Read the array data back from the arrow file -q)new_array_data:.arrowkdb.ipc.parseArrowData[serialized] +q)new_array_data:.arrowkdb.ipc.parseArrowData[serialized;::] // Compare the array data q)show array_data~new_array_data @@ -424,7 +424,7 @@ q)while[x-:1;multi_comments_data:multi_comments_data,getCommentsSet[]] q)nested_array_data:(tstamp_data;sensors_data;pump_data;multi_comments_data) // Pretty print the Arrow table populated from the array data -q).arrowkdb.tb.prettyPrintTable[nested_schema;nested_array_data] +q).arrowkdb.tb.prettyPrintTable[nested_schema;nested_array_data;::] tstamp: timestamp[ns] not null -- field metadata -- PARQUET:field_id: '1' diff --git a/examples/concrete_datatypes.q b/examples/concrete_datatypes.q index 3770724..50d8de2 100644 --- a/examples/concrete_datatypes.q +++ b/examples/concrete_datatypes.q @@ -59,7 +59,7 @@ array_data:(col1;col2;col3;col4;col5); // Show the array data as an arrow table -1"\nTable:"; -.arrowkdb.tb.prettyPrintTable[schema;array_data] +.arrowkdb.tb.prettyPrintTable[schema;array_data;::] //-------------------------// @@ -88,7 +88,7 @@ rm filename; // Write the schema and array data to an arrow file filename:"concrete_datatypes.arrow"; -.arrowkdb.ipc.writeArrow[filename;schema;array_data]; +.arrowkdb.ipc.writeArrow[filename;schema;array_data;::]; show ls filename // Read the schema back and compare @@ -107,7 +107,7 @@ rm filename; //-----------------------------// // Serialize the schema and array data to an arrow stream -serialized:.arrowkdb.ipc.serializeArrow[schema;array_data]; +serialized:.arrowkdb.ipc.serializeArrow[schema;array_data;::]; show serialized // Parse the schema back abd compare @@ -116,7 +116,7 @@ show .arrowkdb.sc.equalSchemas[schema;new_schema] show schema~new_schema // Parse the array data back and compare -new_array_data:.arrowkdb.ipc.parseArrowData[serialized]; +new_array_data:.arrowkdb.ipc.parseArrowData[serialized;::]; show array_data~new_array_data diff --git a/examples/inferred_schema.q b/examples/inferred_schema.q index 82c2228..a84e23e 100644 --- a/examples/inferred_schema.q +++ b/examples/inferred_schema.q @@ -54,7 +54,7 @@ show table; // Write the table to an arrow file filename:"inferred_schema.arrow"; -.arrowkdb.ipc.writeArrowFromTable[filename;table]; +.arrowkdb.ipc.writeArrowFromTable[filename;table;::]; show ls filename // Read the arrow file into another table @@ -78,11 +78,11 @@ show table; .arrowkdb.sc.printSchema[.arrowkdb.sc.inferSchema[table]]; // Serialize the table to an arrow stream -serialized:.arrowkdb.ipc.serializeArrowFromTable[table]; +serialized:.arrowkdb.ipc.serializeArrowFromTable[table;::]; show serialized // Parse the arrow stream into another table -new_table:.arrowkdb.ipc.parseArrowToTable[serialized]; +new_table:.arrowkdb.ipc.parseArrowToTable[serialized;::]; // Compare the kdb+ tables show table~new_table diff --git a/examples/nested_datatypes.q b/examples/nested_datatypes.q index 16b02b1..88180fd 100644 --- a/examples/nested_datatypes.q +++ b/examples/nested_datatypes.q @@ -78,7 +78,7 @@ array_data:(list_data;struct_data); // Show the array data as an arrow table -1"\nTable:"; -.arrowkdb.tb.prettyPrintTable[schema;array_data] +.arrowkdb.tb.prettyPrintTable[schema;array_data;::] //-------------------------// @@ -106,7 +106,7 @@ rm filename; // Write the schema and array data to an arrow file filename:"nested_datatypes.arrow"; -.arrowkdb.ipc.writeArrow[filename;schema;array_data]; 
+.arrowkdb.ipc.writeArrow[filename;schema;array_data;::]; show ls filename // Read the schema back and compare @@ -124,7 +124,7 @@ rm filename; //-----------------------------// // Serialize the schema and array data to an arrow stream -serialized:.arrowkdb.ipc.serializeArrow[schema;array_data]; +serialized:.arrowkdb.ipc.serializeArrow[schema;array_data;::]; show serialized // Parse the schema back abd compare @@ -133,7 +133,7 @@ show .arrowkdb.sc.equalSchemas[schema;new_schema] show schema~new_schema // Parse the array data back and compare -new_array_data:.arrowkdb.ipc.parseArrowData[serialized]; +new_array_data:.arrowkdb.ipc.parseArrowData[serialized;::]; show array_data~new_array_data diff --git a/examples/parameterized_datatypes.q b/examples/parameterized_datatypes.q index 7888f07..a8e0188 100644 --- a/examples/parameterized_datatypes.q +++ b/examples/parameterized_datatypes.q @@ -49,7 +49,7 @@ array_data:(col1;col2;col3); // Show the array data as an arrow table -1"\nTable:"; -.arrowkdb.tb.prettyPrintTable[schema;array_data] +.arrowkdb.tb.prettyPrintTable[schema;array_data;::] //-------------------------// @@ -78,7 +78,7 @@ rm filename; // Write the schema and array data to an arrow file filename:"parameterized_datatypes.arrow"; -.arrowkdb.ipc.writeArrow[filename;schema;array_data]; +.arrowkdb.ipc.writeArrow[filename;schema;array_data;::]; show ls filename // Read the schema back and compare @@ -97,7 +97,7 @@ rm filename; //-----------------------------// // Serialize the schema and array data to an arrow stream -serialized:.arrowkdb.ipc.serializeArrow[schema;array_data]; +serialized:.arrowkdb.ipc.serializeArrow[schema;array_data;::]; show serialized // Parse the schema back abd compare @@ -106,7 +106,7 @@ show .arrowkdb.sc.equalSchemas[schema;new_schema] show schema~new_schema // Parse the array data back and compare -new_array_data:.arrowkdb.ipc.parseArrowData[serialized]; +new_array_data:.arrowkdb.ipc.parseArrowData[serialized;::]; show array_data~new_array_data diff --git a/examples/readme.q b/examples/readme.q index 4b24ea3..883104a 100644 --- a/examples/readme.q +++ b/examples/readme.q @@ -20,7 +20,7 @@ show table // Pretty print the Arrow table populated from a kdb+ table // The schema is inferred from the kdb+ table structure -.arrowkdb.tb.prettyPrintTableFromTable[table] +.arrowkdb.tb.prettyPrintTableFromTable[table;::] //---------------// // Parquet files // @@ -48,7 +48,7 @@ rm filename; // Write the table to an arrow file filename:"inferred_schema.arrow"; -.arrowkdb.ipc.writeArrowFromTable[filename;table]; +.arrowkdb.ipc.writeArrowFromTable[filename;table;::]; show ls filename // Read the arrow file into another table @@ -63,11 +63,11 @@ rm filename; //-------------------// // Serialize the table to an arrow stream -serialized:.arrowkdb.ipc.serializeArrowFromTable[table]; +serialized:.arrowkdb.ipc.serializeArrowFromTable[table;::]; show serialized // Parse the arrow stream into another table -new_table:.arrowkdb.ipc.parseArrowToTable[serialized]; +new_table:.arrowkdb.ipc.parseArrowToTable[serialized;::]; // Compare the kdb+ tables show table~new_table @@ -116,7 +116,7 @@ comment_data:N?("start";"stop";"alert";"acknowledge";""); array_data:(tstamp_data;temp_data;fill_data;pump_data;comment_data); // Pretty print the Arrow table populated from the array data -.arrowkdb.tb.prettyPrintTable[schema;array_data] +.arrowkdb.tb.prettyPrintTable[schema;array_data;::] //---------------// // Parquet files // @@ -151,7 +151,7 @@ rm filename; // Write the schema and array data to an 
arrow file filename:"constructed_schema.arrow"; -.arrowkdb.ipc.writeArrow[filename;schema;array_data]; +.arrowkdb.ipc.writeArrow[filename;schema;array_data;::]; show ls filename // Read the schema back from the arrow file @@ -173,7 +173,7 @@ rm filename //-------------------// // Serialize the schema and array data to an arrow stream -serialized:.arrowkdb.ipc.serializeArrow[schema;array_data]; +serialized:.arrowkdb.ipc.serializeArrow[schema;array_data;::]; show serialized // Parse the schema back for the arrow stream @@ -184,7 +184,7 @@ show .arrowkdb.sc.equalSchemas[schema;new_schema] show schema~new_schema // Read the array data back from the arrow file -new_array_data:.arrowkdb.ipc.parseArrowData[serialized]; +new_array_data:.arrowkdb.ipc.parseArrowData[serialized;::]; // Compare the array data show array_data~new_array_data @@ -252,4 +252,4 @@ while[x-:1;multi_comments_data:multi_comments_data,getCommentsSet[]]; nested_array_data:(tstamp_data;sensors_data;pump_data;multi_comments_data); // Pretty print the Arrow table populated from the array data -.arrowkdb.tb.prettyPrintTable[nested_schema;nested_array_data] +.arrowkdb.tb.prettyPrintTable[nested_schema;nested_array_data;::] diff --git a/q/arrowkdb.q b/q/arrowkdb.q index 7445564..7dc5027 100644 --- a/q/arrowkdb.q +++ b/q/arrowkdb.q @@ -96,15 +96,15 @@ sc.equalSchemas:`arrowkdb 2:(`equalSchemas;2); // array data -ar.prettyPrintArray_:`arrowkdb 2:(`prettyPrintArray;2); -ar.prettyPrintArray:{[x;y] -1 ar.prettyPrintArray_[x;y];}; -ar.prettyPrintArrayFromList:{[list] ar.prettyPrintArray[dt.inferDatatype[list];list]}; +ar.prettyPrintArray_:`arrowkdb 2:(`prettyPrintArray;3); +ar.prettyPrintArray:{[x;y;z] -1 ar.prettyPrintArray_[x;y;z];}; +ar.prettyPrintArrayFromList:{[list;options] ar.prettyPrintArray[dt.inferDatatype[list];list;options]}; // table data -tb.prettyPrintTable_:`arrowkdb 2:(`prettyPrintTable;2); -tb.prettyPrintTable:{[x;y] -1 tb.prettyPrintTable_[x;y];}; -tb.prettyPrintTableFromTable:{[table] tb.prettyPrintTable[sc.inferSchema[table];value flip table]}; +tb.prettyPrintTable_:`arrowkdb 2:(`prettyPrintTable;3); +tb.prettyPrintTable:{[x;y;z] -1 tb.prettyPrintTable_[x;y;z];}; +tb.prettyPrintTableFromTable:{[table;options] tb.prettyPrintTable[sc.inferSchema[table];value flip table;options]}; // parquet files @@ -113,24 +113,29 @@ pq.writeParquetFromTable:{[filename;table;options] pq.writeParquet[filename;sc.i pq.readParquetSchema:`arrowkdb 2:(`readParquetSchema;1); pq.readParquetData:`arrowkdb 2:(`readParquetData;2); pq.readParquetToTable:{[filename;options] flip (fd.fieldName each sc.schemaFields[pq.readParquetSchema[filename]])!(pq.readParquetData[filename;options])}; -pq.readParquetColumn:`arrowkdb 2:(`readParquetColumn;2); +pq.readParquetColumn:`arrowkdb 2:(`readParquetColumn;3); // arrow files -ipc.writeArrow:`arrowkdb 2:(`writeArrow;3); -ipc.writeArrowFromTable:{[filename;table] ipc.writeArrow[filename;sc.inferSchema[table];value flip table]}; +ipc.writeArrow:`arrowkdb 2:(`writeArrow;4); +ipc.writeArrowFromTable:{[filename;table;options] ipc.writeArrow[filename;sc.inferSchema[table];value flip table;options]}; ipc.readArrowSchema:`arrowkdb 2:(`readArrowSchema;1); ipc.readArrowData:`arrowkdb 2:(`readArrowData;2); ipc.readArrowToTable:{[filename;options] flip (fd.fieldName each sc.schemaFields[ipc.readArrowSchema[filename]])!(ipc.readArrowData[filename;options])}; // arrow streams -ipc.serializeArrow:`arrowkdb 2:(`serializeArrow;2); -ipc.serializeArrowFromTable:{[table] ipc.serializeArrow[sc.inferSchema[table];value flip 
table]}; +ipc.serializeArrow:`arrowkdb 2:(`serializeArrow;3); +ipc.serializeArrowFromTable:{[table;options] ipc.serializeArrow[sc.inferSchema[table];value flip table;options]}; ipc.parseArrowSchema:`arrowkdb 2:(`parseArrowSchema;1); -ipc.parseArrowData:`arrowkdb 2:(`parseArrowData;1); -ipc.parseArrowToTable:{[serialized] flip (fd.fieldName each sc.schemaFields[ipc.parseArrowSchema[serialized]])!(ipc.parseArrowData[serialized])}; +ipc.parseArrowData:`arrowkdb 2:(`parseArrowData;2); +ipc.parseArrowToTable:{[serialized;options] flip (fd.fieldName each sc.schemaFields[ipc.parseArrowSchema[serialized]])!(ipc.parseArrowData[serialized;options])}; -// utils: +// utils util.buildInfo:`arrowkdb 2:(`buildInfo;1); + + +// testing +ts.writeReadArray:`arrowkdb 2:(`writeReadArray;3); +ts.writeReadTable:`arrowkdb 2:(`writeReadTable;3); \ No newline at end of file diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index bc35fd6..0456986 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -23,21 +23,21 @@ namespace arrowkdb { // kdb as a mixed list for the parent list array containing a set of sub-lists, // one for each of the list value sets. template -void AppendList(std::shared_ptr array_data, K k_array, size_t& index) +void AppendList(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { for (auto i = 0; i < array_data->length(); ++i) { // Slice the parent array to get the list value set at the specified index auto value_slice = std::static_pointer_cast(array_data)->value_slice(i); // Recursively populate the kdb parent mixed list from that slice - kK(k_array)[index++] = ReadArray(value_slice); + kK(k_array)[index++] = ReadArray(value_slice, type_overrides); } } // An arrow map array is a nested set of key/item paired child arrays. This is // represented in kdb as a mixed list for the parent map array, with a // dictionary for each map value set. -void AppendMap(std::shared_ptr array_data, K k_array, size_t& index) +void AppendMap(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto map_array = std::static_pointer_cast(array_data); auto keys = map_array->keys(); @@ -49,7 +49,7 @@ void AppendMap(std::shared_ptr array_data, K k_array, size_t& inde auto items_slice = items->Slice(map_array->value_offset(i), map_array->value_length(i)); // Recursively populate the kdb parent mixed list with a dictionary // populated from those slices - kK(k_array)[index++] = xD(ReadArray(keys_slice), ReadArray(items_slice)); + kK(k_array)[index++] = xD(ReadArray(keys_slice, type_overrides), ReadArray(items_slice, type_overrides)); } } @@ -58,7 +58,7 @@ void AppendMap(std::shared_ptr array_data, K k_array, size_t& inde // value is obtaining by slicing across all the child arrays at a given index. // This is represented in kdb as a mixed list for the parent struct array, // containing child lists for each field in the struct. 
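The reader helpers above now carry the TypeMappingOverride parsed from the q options dictionary, so from q the override is simply a different value for the trailing options argument. A minimal q sketch, assuming a hypothetical decimals.arrow file whose schema contains a decimal128 column (the option name comes from this change; the file name does not):

// Default mapping: decimal128 values come back as a mixed list of 16-byte lists
dec_as_bytes:.arrowkdb.ipc.readArrowData["decimals.arrow";::]
// With the override the same column comes back as a float list (9h)
dec_as_float:.arrowkdb.ipc.readArrowData["decimals.arrow";(enlist `DECIMAL128_AS_DOUBLE)!enlist 1]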
-void AppendStruct(std::shared_ptr array_data, K k_array, size_t& index) +void AppendStruct(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto struct_array = std::static_pointer_cast(array_data); auto num_fields = struct_array->type()->num_fields(); @@ -67,7 +67,7 @@ void AppendStruct(std::shared_ptr array_data, K k_array, size_t& i // Only advance the index into the kdb mixed list at the end once all child // lists have been populated from the same initial index auto temp_index = index; - AppendArray(field_array, kK(k_array)[i], temp_index); + AppendArray(field_array, kK(k_array)[i], temp_index, type_overrides); } index += array_data->length(); } @@ -75,7 +75,7 @@ void AppendStruct(std::shared_ptr array_data, K k_array, size_t& i // An arrow union array is similar to a struct array except that it has an // additional type id array which identifies the live field in each union value // set. -void AppendUnion(std::shared_ptr array_data, K k_array, size_t& index) +void AppendUnion(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto union_array = std::static_pointer_cast(array_data); @@ -91,14 +91,14 @@ void AppendUnion(std::shared_ptr array_data, K k_array, size_t& in // Only advance the index into the kdb mixed list at the end once all child // lists have been populated from the same initial index auto temp_index = index; - AppendArray(field_array, kK(k_array)[i + 1], temp_index); + AppendArray(field_array, kK(k_array)[i + 1], temp_index, type_overrides); } index += array_data->length(); } // An arrow dictionary array is represented in kdb as a mixed list for the // parent dictionary array containing the values and indicies sub-lists. -void AppendDictionary(std::shared_ptr array_data, K k_array, size_t& index) +void AppendDictionary(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto dictionary_array = std::static_pointer_cast(array_data); @@ -106,13 +106,13 @@ void AppendDictionary(std::shared_ptr array_data, K k_array, size_ // two child arrays could be a different length to each other and the parent // dictionary array which makes it difficult to preallocate the kdb lists of // the correct length. 
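Viewed from q, the layout described above is a two-item mixed list of (values;indices); the two sub-lists generally differ in length from each other and from the parent array, which is why they cannot be preallocated. For example, a dictionary of utf8 values indexed by int64 might arrive as (illustrative data only):

// Values list (mixed list of 10h) followed by the int64 indices into it
dict_data:(("alert";"ok");0 1 1 0j)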
- K values = ReadArray(dictionary_array->dictionary()); + K values = ReadArray(dictionary_array->dictionary(), type_overrides); jv(&kK(k_array)[0], values); - K indices = ReadArray(dictionary_array->indices()); + K indices = ReadArray(dictionary_array->indices(), type_overrides); jv(&kK(k_array)[1], indices); } -void AppendArray(std::shared_ptr array_data, K k_array, size_t& index) +void AppendArray(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { switch (array_data->type_id()) { case arrow::Type::NA: @@ -297,12 +297,19 @@ void AppendArray(std::shared_ptr array_data, K k_array, size_t& in case arrow::Type::DECIMAL: { auto dec_array = std::static_pointer_cast(array_data); + auto dec_type = std::static_pointer_cast(dec_array->type()); for (auto i = 0; i < dec_array->length(); ++i) { - // Each decimal is a list of 16 bytes auto decimal = arrow::Decimal128(dec_array->Value(i)); - K k_dec = ktn(KG, 16); - decimal.ToBytes(kG(k_dec)); - kK(k_array)[index++] = k_dec; + if (type_overrides.decimal128_as_double) { + // Convert the decimal to a double + auto dec_as_double = decimal.ToDouble(dec_type->scale()); + kF(k_array)[index++] = dec_as_double; + } else { + // Each decimal is a list of 16 bytes + K k_dec = ktn(KG, 16); + decimal.ToBytes(kG(k_dec)); + kK(k_array)[index++] = k_dec; + } } break; } @@ -329,33 +336,33 @@ void AppendArray(std::shared_ptr array_data, K k_array, size_t& in break; } case arrow::Type::LIST: - AppendList(array_data, k_array, index); + AppendList(array_data, k_array, index, type_overrides); break; case arrow::Type::LARGE_LIST: - AppendList(array_data, k_array, index); + AppendList(array_data, k_array, index, type_overrides); break; case arrow::Type::FIXED_SIZE_LIST: - AppendList(array_data, k_array, index); + AppendList(array_data, k_array, index, type_overrides); break; case arrow::Type::MAP: - AppendMap(array_data, k_array, index); + AppendMap(array_data, k_array, index, type_overrides); break; case arrow::Type::STRUCT: - AppendStruct(array_data, k_array, index); + AppendStruct(array_data, k_array, index, type_overrides); break; case arrow::Type::SPARSE_UNION: case arrow::Type::DENSE_UNION: - AppendUnion(array_data, k_array, index); + AppendUnion(array_data, k_array, index, type_overrides); break; case arrow::Type::DICTIONARY: - AppendDictionary(array_data, k_array, index); + AppendDictionary(array_data, k_array, index, type_overrides); break; default: TYPE_CHECK_UNSUPPORTED(array_data->type()->ToString()); } } -K InitKdbForArray(std::shared_ptr datatype, size_t length) +K InitKdbForArray(std::shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides) { switch (datatype->id()) { case arrow::Type::STRUCT: @@ -365,7 +372,7 @@ K InitKdbForArray(std::shared_ptr datatype, size_t length) K result = knk(num_fields); for (auto i = 0; i < num_fields; ++i) { auto field = datatype->field(i); - kK(result)[i] = InitKdbForArray(field->type(), length); + kK(result)[i] = InitKdbForArray(field->type(), length, type_overrides); } return result; } @@ -378,7 +385,7 @@ K InitKdbForArray(std::shared_ptr datatype, size_t length) kK(result)[0] = ktn(KH, length); // type_id list for (auto i = 0; i < num_fields; ++i) { auto field = datatype->field(i); - kK(result)[i + 1] = InitKdbForArray(field->type(), length); + kK(result)[i + 1] = InitKdbForArray(field->type(), length, type_overrides); } return result; } @@ -390,30 +397,30 @@ K InitKdbForArray(std::shared_ptr datatype, size_t length) // Do not preallocate the child lists since 
AppendDictionary has to join to the // indicies and values lists - kK(result)[0] = InitKdbForArray(dictionary_type->value_type(), 0); - kK(result)[1] = InitKdbForArray(dictionary_type->index_type(), 0); + kK(result)[0] = InitKdbForArray(dictionary_type->value_type(), 0, type_overrides); + kK(result)[1] = InitKdbForArray(dictionary_type->index_type(), 0, type_overrides); return result; } default: - return ktn(GetKdbType(datatype), length); + return ktn(GetKdbType(datatype, type_overrides), length); } } -K ReadArray(std::shared_ptr array) +K ReadArray(std::shared_ptr array, TypeMappingOverride& type_overrides) { - K k_array = InitKdbForArray(array->type(), array->length()); + K k_array = InitKdbForArray(array->type(), array->length(), type_overrides); size_t index = 0; - AppendArray(array, k_array, index); + AppendArray(array, k_array, index, type_overrides); return k_array; } -K ReadChunkedArray(std::shared_ptr chunked_array) +K ReadChunkedArray(std::shared_ptr chunked_array, TypeMappingOverride& type_overrides) { - K k_array = InitKdbForArray(chunked_array->type(), chunked_array->length()); + K k_array = InitKdbForArray(chunked_array->type(), chunked_array->length(), type_overrides); size_t index = 0; for (auto j = 0; j < chunked_array->num_chunks(); ++j) - AppendArray(chunked_array->chunk(j), k_array, index); + AppendArray(chunked_array->chunk(j), k_array, index, type_overrides); return k_array; } @@ -421,7 +428,7 @@ K ReadChunkedArray(std::shared_ptr chunked_array) } // namspace kx -K writeReadArray(K datatype_id, K array) +K writeReadArray(K datatype_id, K array, K options) { KDB_EXCEPTION_TRY; @@ -432,9 +439,15 @@ K writeReadArray(K datatype_id, K array) if (!datatype) return krr((S)"datatype not found"); - auto arrow_array = kx::arrowkdb::MakeArray(datatype, array); + // Parse the options + auto read_options = kx::arrowkdb::KdbOptions(options, kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); - return kx::arrowkdb::ReadArray(arrow_array); + // Type mapping overrides + kx::arrowkdb::TypeMappingOverride type_overrides{ read_options }; + + auto arrow_array = kx::arrowkdb::MakeArray(datatype, array, type_overrides); + + return kx::arrowkdb::ReadArray(arrow_array, type_overrides); KDB_EXCEPTION_CATCH; } \ No newline at end of file diff --git a/src/ArrayReader.h b/src/ArrayReader.h index ecf2825..3298190 100644 --- a/src/ArrayReader.h +++ b/src/ArrayReader.h @@ -5,6 +5,7 @@ #include #include "ArrowKdb.h" +#include "HelperFunctions.h" namespace kx { @@ -23,7 +24,7 @@ namespace arrowkdb { * begin. Index will be updated to account for the new offset by adding the * length of the array array. 
*/ -void AppendArray(std::shared_ptr array_data, K k_array, size_t& index); +void AppendArray(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides); /** * @brief Copies and converts an arrow array to a kdb list @@ -31,7 +32,7 @@ void AppendArray(std::shared_ptr array_data, K k_array, size_t& in * @param array The arrow array to be converted * @return A kdb list represented the arrow array */ -K ReadArray(std::shared_ptr array); +K ReadArray(std::shared_ptr array, TypeMappingOverride& type_overrides); /** * @brief An arrow chunked array is a set of sub-arrays which are logically but not @@ -42,7 +43,7 @@ K ReadArray(std::shared_ptr array); * @param chunked_array The chunked array to be converted * @return A kdb list representing the chunked array */ -K ReadChunkedArray(std::shared_ptr chunked_array); +K ReadChunkedArray(std::shared_ptr chunked_array, TypeMappingOverride& type_overrides); /** * @brief Creates a kdb list of the correct type and specified length according @@ -53,7 +54,7 @@ K ReadChunkedArray(std::shared_ptr chunked_array); * @param length The required length of the kdb list * @return Newly created kdb list */ -K InitKdbForArray(std::shared_ptr datatype, size_t length); +K InitKdbForArray(std::shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides); } // namespace arrowkdb } // namespace kx @@ -68,13 +69,19 @@ extern "C" * Developer use only - Only useful for manual testing, do not expose in * release version of arrowkdb.q since it has no practical use * + * Supported options: + * + * DECIMAL128_AS_DOUBLE (long) - Flag indicating whether to override the + * default type mapping for the arrow decimal128 datatype and instead + * represent it as a double (9h). Default 0. + * * @param datatype_id The arrow datatype identifier to use for the intemediate * arrow array * @param array The kdb list to be written to the intermediate arrow * array * @return The kdb list created from the intermediate arrow array */ - EXP K writeReadArray(K datatype_id, K array); + EXP K writeReadArray(K datatype_id, K array, K options); } #endif // __ARRAY_READER_H__ diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index 7d0fb8c..602c764 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -150,7 +150,7 @@ std::shared_ptr GetBuilder(std::shared_ptr // kdb as a mixed list for the parent list array containing a set of sub-lists, // one for each of the list value sets. template -void PopulateListBuilder(std::shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder) +void PopulateListBuilder(std::shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { // Get the value builder from the parent list builder auto list_builder = static_cast(builder); @@ -172,7 +172,7 @@ void PopulateListBuilder(std::shared_ptr datatype, K k_array, a } // Populate the child builder for this list set - PopulateBuilder(value_builder->type(), kK(k_array)[i], value_builder); + PopulateBuilder(value_builder->type(), kK(k_array)[i], value_builder, type_overrides); } } @@ -182,7 +182,7 @@ void PopulateListBuilder(std::shared_ptr datatype, K k_array, a // additional type id array which identifies the live field in each union value // set. 
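From q, these writer-side builders are reached through MakeArray, and the pretty-print wrappers that exercise it now take the options dictionary (or generic null) as an extra argument, per the updated q/arrowkdb.q above. A small usage sketch; the dt.int64 constructor call is an assumption, while the wrapper arities are from this change:

int_dt:.arrowkdb.dt.int64[]
.arrowkdb.ar.prettyPrintArray[int_dt;1 2 3j;::]
.arrowkdb.ar.prettyPrintArrayFromList[1 2 3j;(enlist `DECIMAL128_AS_DOUBLE)!enlist 1]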
template -void PopulateUnionBuilder(std::shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder) +void PopulateUnionBuilder(std::shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { // Check that the mixed list length is at least one greater (the additional // first sub-list contains the union type_ids) than the number of union @@ -217,7 +217,7 @@ void PopulateUnionBuilder(std::shared_ptr datatype, K k_array, for (auto i = 1; i < min_length; ++i) { // type_id is zero indexed so used i-1 to reference the field builders auto builder_num = i - 1; - PopulateBuilder(child_builders[builder_num]->type(), kK(k_array)[i], child_builders[builder_num].get()); + PopulateBuilder(child_builders[builder_num]->type(), kK(k_array)[i], child_builders[builder_num].get(), type_overrides); } // Check that all the populated child builders have the same length @@ -227,7 +227,7 @@ void PopulateUnionBuilder(std::shared_ptr datatype, K k_array, } // Populates data values from a kdb list into the specified array builder. -void PopulateBuilder(std::shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder) +void PopulateBuilder(std::shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { // Special cases for: // symbol - string or large_string @@ -239,7 +239,7 @@ void PopulateBuilder(std::shared_ptr datatype, K k_array, arrow // Type check the kdb structure if (!is_symbol && !is_guid && !is_char) - TYPE_CHECK_ARRAY(kx::arrowkdb::GetKdbType(datatype) != k_array->t, datatype->ToString(), kx::arrowkdb::GetKdbType(datatype), k_array->t); + TYPE_CHECK_ARRAY(kx::arrowkdb::GetKdbType(datatype, type_overrides) != k_array->t, datatype->ToString(), kx::arrowkdb::GetKdbType(datatype, type_overrides), k_array->t); switch (datatype->id()) { case arrow::Type::NA: @@ -438,14 +438,22 @@ void PopulateBuilder(std::shared_ptr datatype, K k_array, arrow case arrow::Type::DECIMAL: { auto dec_builder = static_cast(builder); + auto dec_type = std::static_pointer_cast(datatype); for (auto i = 0; i < k_array->n; ++i) { - // Each decimal is a list of 16 bytes - K k_dec = kK(k_array)[i]; - TYPE_CHECK_LENGTH(k_dec->n != 16, datatype->ToString(), 16, k_dec->n); - TYPE_CHECK_ITEM(k_dec->t != KG, datatype->ToString(), KG, k_dec->t); - - arrow::BasicDecimal128 dec128((const uint8_t*)kG(k_dec)); - PARQUET_THROW_NOT_OK(dec_builder->Append(dec128)); + if (type_overrides.decimal128_as_double) { + // Construct the decimal from a double + arrow::Decimal128 dec128; + PARQUET_ASSIGN_OR_THROW(dec128, arrow::Decimal128::FromReal(kF(k_array)[i], dec_type->precision(), dec_type->scale())); + PARQUET_THROW_NOT_OK(dec_builder->Append(dec128)); + } else { + // Each decimal is a list of 16 bytes + K k_dec = kK(k_array)[i]; + TYPE_CHECK_LENGTH(k_dec->n != 16, datatype->ToString(), 16, k_dec->n); + TYPE_CHECK_ITEM(k_dec->t != KG, datatype->ToString(), KG, k_dec->t); + + arrow::Decimal128 dec128((const uint8_t*)kG(k_dec)); + PARQUET_THROW_NOT_OK(dec_builder->Append(dec128)); + } } break; } @@ -472,13 +480,13 @@ void PopulateBuilder(std::shared_ptr datatype, K k_array, arrow break; } case arrow::Type::LIST: - PopulateListBuilder(datatype, k_array, builder); + PopulateListBuilder(datatype, k_array, builder, type_overrides); break; case arrow::Type::LARGE_LIST: - PopulateListBuilder(datatype, k_array, builder); + PopulateListBuilder(datatype, k_array, builder, type_overrides); break; case arrow::Type::FIXED_SIZE_LIST: - PopulateListBuilder(datatype, k_array, builder); + 
PopulateListBuilder(datatype, k_array, builder, type_overrides); break; case arrow::Type::MAP: { @@ -502,8 +510,8 @@ void PopulateBuilder(std::shared_ptr datatype, K k_array, arrow // Populate the child builders for this map set from the dictionary key/value lists auto k_dict = kK(k_array)[i]; TYPE_CHECK_ITEM(99 != k_dict->t, datatype->ToString(), 99, k_dict->t); - PopulateBuilder(key_builder->type(), kK(k_dict)[0], key_builder); - PopulateBuilder(item_builder->type(), kK(k_dict)[1], item_builder); + PopulateBuilder(key_builder->type(), kK(k_dict)[0], key_builder, type_overrides); + PopulateBuilder(item_builder->type(), kK(k_dict)[1], item_builder, type_overrides); } break; } @@ -534,7 +542,7 @@ void PopulateBuilder(std::shared_ptr datatype, K k_array, arrow // the number of struct fields. Additional trailing data in the kdb mixed // list is ignored (to allow for ::) for (auto i = 0; i < struct_type->num_fields(); ++i) - PopulateBuilder(field_builders[i]->type(), kK(k_array)[i], field_builders[i]); + PopulateBuilder(field_builders[i]->type(), kK(k_array)[i], field_builders[i], type_overrides); // Check that all the populated field builders have the same length. for (auto it : field_builders) @@ -544,10 +552,10 @@ void PopulateBuilder(std::shared_ptr datatype, K k_array, arrow break; } case arrow::Type::SPARSE_UNION: - PopulateUnionBuilder(datatype, k_array, builder); + PopulateUnionBuilder(datatype, k_array, builder, type_overrides); break; case arrow::Type::DENSE_UNION: - PopulateUnionBuilder(datatype, k_array, builder); + PopulateUnionBuilder(datatype, k_array, builder, type_overrides); break; default: TYPE_CHECK_UNSUPPORTED(datatype->ToString()); @@ -558,7 +566,7 @@ void PopulateBuilder(std::shared_ptr datatype, K k_array, arrow // // This is represented in kdb as a mixed list for the parent dictionary array // containing the values and indicies sub-lists. 
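With DECIMAL128_AS_DOUBLE set, the decimal handling above lets a decimal128 field be written straight from a 9h list, converted via FromReal using the field's precision and scale. A hedged write-side sketch: the fd.field, sc.schema and dt.decimal128 constructor calls and the file name are assumptions about the wider arrowkdb API, while the four-argument writeArrow is from this change:

price_fd:.arrowkdb.fd.field[`price;.arrowkdb.dt.decimal128[38i;2i]]
schema:.arrowkdb.sc.schema[enlist price_fd]
opts:(enlist `DECIMAL128_AS_DOUBLE)!enlist 1
// The single column of doubles is converted to decimal128(38,2) on write
.arrowkdb.ipc.writeArrow["prices.arrow";schema;enlist 1.23 4.56;opts]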
-std::shared_ptr MakeDictionary(std::shared_ptr datatype, K k_array) +std::shared_ptr MakeDictionary(std::shared_ptr datatype, K k_array, TypeMappingOverride& type_overrides) { K values = kK(k_array)[0]; K indicies = kK(k_array)[1]; @@ -566,8 +574,8 @@ std::shared_ptr MakeDictionary(std::shared_ptr da auto dictionary_type = std::static_pointer_cast(datatype); // Recursively construct the values and indicies arrays - auto values_array = MakeArray(dictionary_type->value_type(), values); - auto indicies_array = MakeArray(dictionary_type->index_type(), indicies); + auto values_array = MakeArray(dictionary_type->value_type(), values, type_overrides); + auto indicies_array = MakeArray(dictionary_type->index_type(), indicies, type_overrides); std::shared_ptr result; PARQUET_ASSIGN_OR_THROW(result, arrow::DictionaryArray::FromArrays(datatype, indicies_array, values_array)); @@ -575,17 +583,17 @@ std::shared_ptr MakeDictionary(std::shared_ptr da return result; } -std::shared_ptr MakeArray(std::shared_ptr datatype, K k_array) +std::shared_ptr MakeArray(std::shared_ptr datatype, K k_array, TypeMappingOverride& type_overrides) { // DictionaryBuilder works in quite an unusual and non-standard way so just // construct the dictionary array directly if (datatype->id() == arrow::Type::DICTIONARY) - return MakeDictionary(datatype, k_array); + return MakeDictionary(datatype, k_array, type_overrides); // Construct a array builder for this datatype and populate it from the kdb // list auto builder = GetBuilder(datatype); - PopulateBuilder(datatype, k_array, builder.get()); + PopulateBuilder(datatype, k_array, builder.get(), type_overrides); // Finalise the builder into the arrow array std::shared_ptr array; @@ -597,7 +605,7 @@ std::shared_ptr MakeArray(std::shared_ptr datatyp } // namespace kx -K prettyPrintArray(K datatype_id, K array) +K prettyPrintArray(K datatype_id, K array, K options) { KDB_EXCEPTION_TRY; @@ -608,7 +616,13 @@ K prettyPrintArray(K datatype_id, K array) if (!datatype) return krr((S)"datatype not found"); - auto arrow_array = kx::arrowkdb::MakeArray(datatype, array); + // Parse the options + auto read_options = kx::arrowkdb::KdbOptions(options, kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); + + // Type mapping overrides + kx::arrowkdb::TypeMappingOverride type_overrides{ read_options }; + + auto arrow_array = kx::arrowkdb::MakeArray(datatype, array, type_overrides); auto options = arrow::PrettyPrintOptions(); std::string result; arrow::PrettyPrint(*arrow_array, options, &result); diff --git a/src/ArrayWriter.h b/src/ArrayWriter.h index 886dedd..53a9b1b 100644 --- a/src/ArrayWriter.h +++ b/src/ArrayWriter.h @@ -5,6 +5,7 @@ #include #include "ArrowKdb.h" +#include "HelperFunctions.h" namespace kx { @@ -17,7 +18,7 @@ namespace arrowkdb { * @param k_array Kdb list data to be populated * @param builder Arrow array builder for this datatype */ -void PopulateBuilder(std::shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder); +void PopulateBuilder(std::shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides); /** * @brief Copies and converts a kdb list to an arrow array @@ -26,7 +27,7 @@ void PopulateBuilder(std::shared_ptr datatype, K k_array, arrow * @param k_array The kdb list from which to source the data * @return The arrow array */ -std::shared_ptr MakeArray(std::shared_ptr datatype, K k_array); +std::shared_ptr MakeArray(std::shared_ptr datatype, K k_array, TypeMappingOverride& type_overrides); } // namespace 
arrowkdb } // namespace kx @@ -38,13 +39,22 @@ extern "C" * @brief Debugging function which converts a kdb list to an arrow array, * pretty prints the array into a buffer which is then returned back to kdb. * + * Supported options: + * + * DECIMAL128_AS_DOUBLE (long) - Flag indicating whether to override the + * default type mapping for the arrow decimal128 datatype and instead + * represent it as a double (9h). Default 0. + * * @param datatype_id The arrow datatype identifier to use for the intemediate * arrow array * @param array The kdb list to be written to the intermediate arrow * array + * @options Dictionary of options or generic null (::) to use + * defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or + * mixed list of -7|-11|4h. * @return kdb char list containing the pretty printed buffer */ - EXP K prettyPrintArray(K datatype_id, K array); + EXP K prettyPrintArray(K datatype_id, K array, K options); } #endif // __ARRAY_WRITER_H__ diff --git a/src/ArrowKdb.cpp b/src/ArrowKdb.cpp index 01edc0b..75c2d4b 100644 --- a/src/ArrowKdb.cpp +++ b/src/ArrowKdb.cpp @@ -32,7 +32,7 @@ int main(int argc, char* argv[]) K result; K schema = readParquetSchema(file); for (auto i = 0; i < 10; ++i) { - result = writeReadTable(schema, data); + result = writeReadTable(schema, data, NULL); } std::cout << "Read " << kK(result)[0]->n << std::endl; diff --git a/src/HelperFunctions.cpp b/src/HelperFunctions.cpp index 9045f85..5ade109 100644 --- a/src/HelperFunctions.cpp +++ b/src/HelperFunctions.cpp @@ -145,7 +145,12 @@ const std::string GetKdbString(K str) return str->t == -KS ? str->s : std::string((S)kG(str), str->n); } -KdbType GetKdbType(std::shared_ptr datatype) +TypeMappingOverride::TypeMappingOverride(const KdbOptions& options) +{ + options.GetIntOption(Options::DECIMAL128_AS_DOUBLE, decimal128_as_double); +} + +KdbType GetKdbType(std::shared_ptr datatype, TypeMappingOverride& type_overrides) { switch (datatype->id()) { case arrow::Type::NA: @@ -188,7 +193,10 @@ KdbType GetKdbType(std::shared_ptr datatype) case arrow::Type::TIME64: return KN; case arrow::Type::DECIMAL: - return 0; // mixed list of KG lists of length 16 + if (type_overrides.decimal128_as_double) + return KF; // map decimal128 to double + else + return 0; // mixed list of KG lists of length 16 case arrow::Type::DURATION: return KN; case arrow::Type::INTERVAL_MONTHS: diff --git a/src/HelperFunctions.h b/src/HelperFunctions.h index d7d946d..d6faaef 100644 --- a/src/HelperFunctions.h +++ b/src/HelperFunctions.h @@ -7,6 +7,7 @@ #include #include "TypeCheck.h" +#include "KdbOptions.h" #include @@ -76,6 +77,13 @@ const std::string GetKdbString(K str); typedef signed char KdbType; + struct TypeMappingOverride +{ + int64_t decimal128_as_double = 0; + TypeMappingOverride(void) {}; + TypeMappingOverride(const KdbOptions& options); +}; + /** * @brief Maps an arrow datatype to a kdb list type. Used to: * 1. 
Create kdb list of the correct type when reading from an arrow array to @@ -85,7 +93,7 @@ typedef signed char KdbType; * @param datatype Required arrow datatype * @return KdbType (k0->t) */ -KdbType GetKdbType(std::shared_ptr datatype); +KdbType GetKdbType(std::shared_ptr datatype, TypeMappingOverride& type_overrides); /** * @brief Maps a kdb list to a suitable arrow datatype as follows: diff --git a/src/KdbOptions.h b/src/KdbOptions.h index b0ea93b..1af50a8 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -13,24 +13,47 @@ namespace kx { namespace arrowkdb { -// Supported options for arrowkdb -const static std::set supported_int_options = { "PARQUET_CHUNK_SIZE", "PARQUET_MULTITHREADED_READ", "USE_MMAP" }; -const static std::set supported_string_options = { "PARQUET_VERSION" }; +// Supported options +namespace Options +{ + // Int options + const std::string PARQUET_CHUNK_SIZE = "PARQUET_CHUNK_SIZE"; + const std::string PARQUET_MULTITHREADED_READ = "PARQUET_MULTITHREADED_READ"; + const std::string USE_MMAP = "USE_MMAP"; + const std::string DECIMAL128_AS_DOUBLE = "DECIMAL128_AS_DOUBLE"; + + // String options + const std::string PARQUET_VERSION = "PARQUET_VERSION"; + + const static std::set int_options = { + PARQUET_CHUNK_SIZE, + PARQUET_MULTITHREADED_READ, + USE_MMAP, + DECIMAL128_AS_DOUBLE, + }; + const static std::set string_options = { + PARQUET_VERSION, + }; +} -// Helper class for reading function argument containing dictionary of options + +// Helper class for reading dictionary of options // // Dictionary key: KS // Dictionary value: KS or // KJ or -// 0 of -KS|-KJ +// 0 of -KS|-KJ|KC class KdbOptions { private: std::map string_options; - std::map int_options; + std::map int_options; + + const std::set& supported_string_options; + const std::set& supported_int_options; private: - std::string ToUpper(std::string str) + const std::string ToUpper(std::string str) const { std::string upper; for (auto i : str) @@ -40,8 +63,8 @@ class KdbOptions void PopulateIntOptions(K keys, K values) { - for (auto i = 0; i < values->n; ++i) { - std::string key = ToUpper(kS(keys)[i]); + for (auto i = 0ll; i < values->n; ++i) { + const std::string key = ToUpper(kS(keys)[i]); if (supported_int_options.find(key) == supported_int_options.end()) throw InvalidOption(("Unsupported int option '" + key + "'").c_str()); int_options[key] = kJ(values)[i]; @@ -50,8 +73,8 @@ class KdbOptions void PopulateStringOptions(K keys, K values) { - for (auto i = 0; i < values->n; ++i) { - std::string key = ToUpper(kS(keys)[i]); + for (auto i = 0ll; i < values->n; ++i) { + const std::string key = ToUpper(kS(keys)[i]); if (supported_string_options.find(key) == supported_string_options.end()) throw InvalidOption(("Unsupported string option '" + key + "'").c_str()); string_options[key] = ToUpper(kS(values)[i]); @@ -60,8 +83,8 @@ class KdbOptions void PopulateMixedOptions(K keys, K values) { - for (auto i = 0; i < values->n; ++i) { - std::string key = ToUpper(kS(keys)[i]); + for (auto i = 0ll; i < values->n; ++i) { + const std::string key = ToUpper(kS(keys)[i]); K value = kK(values)[i]; switch (value->t) { case -KJ: @@ -74,11 +97,18 @@ class KdbOptions throw InvalidOption(("Unsupported string option '" + key + "'").c_str()); string_options[key] = ToUpper(value->s); break; + case KC: + { + if (supported_string_options.find(key) == supported_string_options.end()) + throw InvalidOption(("Unsupported string option '" + key + "'").c_str()); + string_options[key] = ToUpper(std::string((char*)kG(value), value->n)); + break; + } case 
101: // Ignore :: break; default: - throw InvalidOption(("option '" + key + "' value not -7|-11h").c_str()); + throw InvalidOption(("option '" + key + "' value not -7|-11|10h").c_str()); } } } @@ -87,11 +117,12 @@ class KdbOptions class InvalidOption : public std::invalid_argument { public: - InvalidOption(std::string message) : std::invalid_argument(message.c_str()) + InvalidOption(const std::string message) : std::invalid_argument(message.c_str()) {}; }; - KdbOptions(K options) + KdbOptions(K options, const std::set supported_string_options_, const std::set supported_int_options_) : + supported_string_options(supported_string_options_), supported_int_options(supported_int_options_) { if (options != NULL && options->t != 101) { if (options->t != 99) @@ -116,9 +147,9 @@ class KdbOptions } } - bool GetStringOption(std::string key, std::string& result) + bool GetStringOption(const std::string key, std::string& result) const { - auto it = string_options.find(ToUpper(key)); + const auto it = string_options.find(key); if (it == string_options.end()) return false; else { @@ -127,9 +158,9 @@ class KdbOptions } } - bool GetIntOption(std::string key, int64_t& result) + bool GetIntOption(const std::string key, int64_t& result) const { - auto it = int_options.find(ToUpper(key)); + const auto it = int_options.find(key); if (it == int_options.end()) return false; else { diff --git a/src/TableData.cpp b/src/TableData.cpp index fc8a8d7..cf04159 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -41,7 +41,7 @@ bool SchemaContainsNullable(const std::shared_ptr schema) } // Create a vector of arrow arrays from the arrow schema and mixed list of kdb array objects -std::vector> MakeArrays(std::shared_ptr schema, K array_data) +std::vector> MakeArrays(std::shared_ptr schema, K array_data, kx::arrowkdb::TypeMappingOverride& type_overrides) { if (array_data->t != 0) throw kx::arrowkdb::TypeCheck("array_data not mixed list"); @@ -56,7 +56,7 @@ std::vector> MakeArrays(std::shared_ptrnum_fields(); ++i) { auto k_array = kK(array_data)[i]; - arrays.push_back(kx::arrowkdb::MakeArray(schema->field(i)->type(), k_array)); + arrays.push_back(kx::arrowkdb::MakeArray(schema->field(i)->type(), k_array, type_overrides)); } } @@ -64,12 +64,12 @@ std::vector> MakeArrays(std::shared_ptr MakeTable(std::shared_ptr schema, K array_data) +std::shared_ptr MakeTable(std::shared_ptr schema, K array_data, kx::arrowkdb::TypeMappingOverride& type_overrides) { - return arrow::Table::Make(schema, MakeArrays(schema, array_data)); + return arrow::Table::Make(schema, MakeArrays(schema, array_data, type_overrides)); } -K prettyPrintTable(K schema_id, K array_data) +K prettyPrintTable(K schema_id, K array_data, K options) { KDB_EXCEPTION_TRY; @@ -80,14 +80,20 @@ K prettyPrintTable(K schema_id, K array_data) if (!schema) return krr((S)"unknown schema"); - auto table = MakeTable(schema, array_data); + // Parse the options + auto read_options = kx::arrowkdb::KdbOptions(options, kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); + + // Type mapping overrides + kx::arrowkdb::TypeMappingOverride type_overrides{ read_options }; + + auto table = MakeTable(schema, array_data, type_overrides); return kp((S)table->ToString().c_str()); KDB_EXCEPTION_CATCH; } -K writeReadTable(K schema_id, K array_data) +K writeReadTable(K schema_id, K array_data, K options) { KDB_EXCEPTION_TRY; @@ -98,12 +104,18 @@ K writeReadTable(K schema_id, K array_data) if (!schema) return krr((S)"unknown schema"); - auto table = MakeTable(schema, 
array_data); + // Parse the options + auto read_options = kx::arrowkdb::KdbOptions(options, kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); + + // Type mapping overrides + kx::arrowkdb::TypeMappingOverride type_overrides{ read_options }; + + auto table = MakeTable(schema, array_data, type_overrides); const auto col_num = table->num_columns(); K data = ktn(0, col_num); for (auto i = 0; i < col_num; ++i) - kK(data)[i] = kx::arrowkdb::ReadChunkedArray(table->column(i)); + kK(data)[i] = kx::arrowkdb::ReadChunkedArray(table->column(i), type_overrides); return data; @@ -128,13 +140,12 @@ K writeParquet(K parquet_file, K schema_id, K array_data, K options) outfile, arrow::io::FileOutputStream::Open(kx::arrowkdb::GetKdbString(parquet_file))); - // Create the arrow table - auto table = MakeTable(schema, array_data); - // Parse the options - auto write_options = kx::arrowkdb::KdbOptions(options); + auto write_options = kx::arrowkdb::KdbOptions(options, kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); + + // Chunk size int64_t parquet_chunk_size = 1024 * 1024; // default to 1MB - write_options.GetIntOption("parquet_chunk_size", parquet_chunk_size); + write_options.GetIntOption(kx::arrowkdb::Options::PARQUET_CHUNK_SIZE, parquet_chunk_size); // Set writer properties parquet::WriterProperties::Builder parquet_props_builder; @@ -142,7 +153,7 @@ K writeParquet(K parquet_file, K schema_id, K array_data, K options) // Parquet version std::string parquet_version; - write_options.GetStringOption("parquet_version", parquet_version); + write_options.GetStringOption(kx::arrowkdb::Options::PARQUET_VERSION, parquet_version); if (parquet_version == "V2.0") { parquet_props_builder.version(parquet::ParquetVersion::PARQUET_2_0); parquet_props_builder.data_page_version(parquet::ParquetDataPageVersion::V2); @@ -152,9 +163,15 @@ K writeParquet(K parquet_file, K schema_id, K array_data, K options) arrow_props_builder.allow_truncated_timestamps(); } + // Type mapping overrides + kx::arrowkdb::TypeMappingOverride type_overrides{ write_options }; + auto parquet_props = parquet_props_builder.build(); auto arrow_props = arrow_props_builder.build(); + // Create the arrow table + auto table = MakeTable(schema, array_data, type_overrides); + PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, arrow::default_memory_pool(), outfile, parquet_chunk_size, parquet_props, arrow_props)); return (K)0; @@ -203,11 +220,19 @@ K readParquetData(K parquet_file, K options) if (!kx::arrowkdb::IsKdbString(parquet_file)) return krr((S)"parquet_file not 11h or 0 of 10h"); - auto read_options = kx::arrowkdb::KdbOptions(options); + // Parse the options + auto read_options = kx::arrowkdb::KdbOptions(options, kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); + + // Use multi threading int64_t parquet_multithreaded_read = 0; - read_options.GetIntOption("parquet_multithreaded_read", parquet_multithreaded_read); + read_options.GetIntOption(kx::arrowkdb::Options::PARQUET_MULTITHREADED_READ, parquet_multithreaded_read); + + // Use memmap int64_t use_mmap = 0; - read_options.GetIntOption("use_mmap", use_mmap); + read_options.GetIntOption(kx::arrowkdb::Options::USE_MMAP, use_mmap); + + // Type mapping overrides + kx::arrowkdb::TypeMappingOverride type_overrides{ read_options }; std::shared_ptr infile; if (use_mmap) { @@ -236,7 +261,7 @@ K readParquetData(K parquet_file, K options) K data = ktn(0, col_num); for (auto i = 0; i < col_num; ++i) { auto chunked_array = 
table->column(i); - kK(data)[i] = kx::arrowkdb::ReadChunkedArray(chunked_array); + kK(data)[i] = kx::arrowkdb::ReadChunkedArray(chunked_array, type_overrides); } return data; @@ -244,7 +269,7 @@ K readParquetData(K parquet_file, K options) KDB_EXCEPTION_CATCH; } -K readParquetColumn(K parquet_file, K column_index) +K readParquetColumn(K parquet_file, K column_index, K options) { KDB_EXCEPTION_TRY; @@ -253,6 +278,12 @@ K readParquetColumn(K parquet_file, K column_index) if (column_index->t != -KI) return krr((S)"column not -6h"); + // Parse the options + auto read_options = kx::arrowkdb::KdbOptions(options, kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); + + // Type mapping overrides + kx::arrowkdb::TypeMappingOverride type_overrides{ read_options }; + std::shared_ptr infile; PARQUET_ASSIGN_OR_THROW( infile, @@ -265,12 +296,12 @@ K readParquetColumn(K parquet_file, K column_index) std::shared_ptr<::arrow::ChunkedArray> chunked_array; PARQUET_THROW_NOT_OK(reader->ReadColumn(column_index->i, &chunked_array)); - return kx::arrowkdb::ReadChunkedArray(chunked_array); + return kx::arrowkdb::ReadChunkedArray(chunked_array, type_overrides); KDB_EXCEPTION_CATCH; } -K writeArrow(K arrow_file, K schema_id, K array_data) +K writeArrow(K arrow_file, K schema_id, K array_data, K options) { KDB_EXCEPTION_TRY; @@ -283,6 +314,12 @@ K writeArrow(K arrow_file, K schema_id, K array_data) if (!schema) return krr((S)"unknown schema"); + // Parse the options + auto read_options = kx::arrowkdb::KdbOptions(options, kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); + + // Type mapping overrides + kx::arrowkdb::TypeMappingOverride type_overrides{ read_options }; + std::shared_ptr outfile; PARQUET_ASSIGN_OR_THROW( outfile, @@ -291,7 +328,7 @@ K writeArrow(K arrow_file, K schema_id, K array_data) std::shared_ptr writer; PARQUET_ASSIGN_OR_THROW(writer, arrow::ipc::MakeFileWriter(outfile.get(), schema)); - auto arrays = MakeArrays(schema, array_data); + auto arrays = MakeArrays(schema, array_data, type_overrides); // Check all arrays are same length int64_t len = -1; @@ -350,9 +387,15 @@ K readArrowData(K arrow_file, K options) if (!kx::arrowkdb::IsKdbString(arrow_file)) return krr((S)"arrow_file not 11h or 0 of 10h"); - auto read_options = kx::arrowkdb::KdbOptions(options); + // Parse the options + auto read_options = kx::arrowkdb::KdbOptions(options, kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); + + // Use memmap int64_t use_mmap = 0; - read_options.GetIntOption("use_mmap", use_mmap); + read_options.GetIntOption(kx::arrowkdb::Options::USE_MMAP, use_mmap); + + // Type mapping overrides + kx::arrowkdb::TypeMappingOverride type_overrides{ read_options }; std::shared_ptr infile; if (use_mmap) { @@ -390,7 +433,7 @@ K readArrowData(K arrow_file, K options) column_arrays.push_back(batch->column(i)); auto chunked_array = std::make_shared(column_arrays); // Convert the chunked array to kdb object - kK(data)[i] = kx::arrowkdb::ReadChunkedArray(chunked_array); + kK(data)[i] = kx::arrowkdb::ReadChunkedArray(chunked_array, type_overrides); } return data; @@ -398,7 +441,7 @@ K readArrowData(K arrow_file, K options) KDB_EXCEPTION_CATCH; } -K serializeArrow(K schema_id, K array_data) +K serializeArrow(K schema_id, K array_data, K options) { KDB_EXCEPTION_TRY; @@ -409,6 +452,12 @@ K serializeArrow(K schema_id, K array_data) if (!schema) return krr((S)"unknown schema"); + // Parse the options + auto read_options = kx::arrowkdb::KdbOptions(options, 
kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); + + // Type mapping overrides + kx::arrowkdb::TypeMappingOverride type_overrides{ read_options }; + std::shared_ptr buffer; std::unique_ptr sink; std::shared_ptr writer; @@ -416,7 +465,7 @@ K serializeArrow(K schema_id, K array_data) sink.reset(new arrow::io::BufferOutputStream(buffer)); PARQUET_ASSIGN_OR_THROW(writer, arrow::ipc::MakeStreamWriter(sink.get(), schema)); - auto arrays = MakeArrays(schema, array_data); + auto arrays = MakeArrays(schema, array_data, type_overrides); // Check all arrays are same length int64_t len = -1; @@ -468,13 +517,19 @@ K parseArrowSchema(K char_array) KDB_EXCEPTION_CATCH; } -K parseArrowData(K char_array) +K parseArrowData(K char_array, K options) { KDB_EXCEPTION_TRY; if (char_array->t != KG && char_array->t != KC) return krr((S)"char_array not 4|10h"); + // Parse the options + auto read_options = kx::arrowkdb::KdbOptions(options, kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); + + // Type mapping overrides + kx::arrowkdb::TypeMappingOverride type_overrides{ read_options }; + auto buf_reader = std::make_shared(kG(char_array), char_array->n); std::shared_ptr reader; PARQUET_ASSIGN_OR_THROW(reader, arrow::ipc::RecordBatchStreamReader::Open(buf_reader)); @@ -495,7 +550,7 @@ K parseArrowData(K char_array) column_arrays.push_back(batch->column(i)); auto chunked_array = std::make_shared(column_arrays); // Convert the chunked array to kdb object - kK(data)[i] = kx::arrowkdb::ReadChunkedArray(chunked_array); + kK(data)[i] = kx::arrowkdb::ReadChunkedArray(chunked_array, type_overrides); } return data; diff --git a/src/TableData.h b/src/TableData.h index 459386d..06d19d7 100644 --- a/src/TableData.h +++ b/src/TableData.h @@ -19,13 +19,22 @@ extern "C" * according to the field's datatype. This required array data structure is * detailed for each of the datatype constructor functions. * + * Supported options: + * + * DECIMAL128_AS_DOUBLE (long) - Flag indicating whether to override the + * default type mapping for the arrow decimal128 datatype and instead + * represent it as a double (9h). Default 0. + * * @param schema_id The schema identifier to use for the intermediate arrow * table * @param array_data Mixed list of arrow array data to be written to the * intermediate arrow table + * @options Dictionary of options or generic null (::) to use + * defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or + * mixed list of -7|-11|4h. * @return kdb char list containing the pretty printed buffer */ - EXP K prettyPrintTable(K schema_id, K array_data); + EXP K prettyPrintTable(K schema_id, K array_data, K options); /** * @brief Debugging function which converts a kdb mixed list of arrow array @@ -35,15 +44,27 @@ extern "C" * number. Each kdb object representing one of the arrays must be structured * according to the field's datatype. This required array data structure is * detailed for each of the datatype constructor functions. + * + * Developer use only - Only useful for manual testing, do not expose in + * release version of arrowkdb.q since it has no practical use + * + * Supported options: + * + * DECIMAL128_AS_DOUBLE (long) - Flag indicating whether to override the + * default type mapping for the arrow decimal128 datatype and instead + * represent it as a double (9h). Default 0. 
* * @param schema_id The schema identifier to use for the intermediate arrow * table * @param array_data Mixed list of arrow array data to be written to the * intermediate arrow table + * @options Dictionary of options or generic null (::) to use + * defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or + * mixed list of -7|-11|4h. * @return The kdb mixed list created from the intermediate arrow * table */ - EXP K writeReadTable(K schema_id, K array_data); + EXP K writeReadTable(K schema_id, K array_data, K options); /** * @brief Creates a parquet file with the specified arrow schema and populates @@ -62,15 +83,25 @@ extern "C" * an error. * * Supported options: - * parquet_chunk_size - * Controls the approximate size of encoded data pages within a column - * chunk (long, default: 1MB) + * + * PARQUET_CHUNK_SIZE (long) - Controls the approximate size of encoded data + * pages within a column chunk. Default 1MB + * + * PARQUET_VERSION (string) - Selects the Parquet format version, either + * `V1.0` or `V2.0`. `V2.0` is more fully featured but may be incompatible + * with older Parquet implementations. Default `V1.0` + * + * DECIMAL128_AS_DOUBLE (long) - Flag indicating whether to override the + * default type mapping for the arrow decimal128 datatype and instead + * represent it as a double (9h). Default 0. * * @param parquet_file String name of the parquet file to write * @param schema_id The schema identifier * @param array_data Mixed list of arrow array data to be written to the * file - * @param options Dictionary of symbol options to long values + * @options Dictionary of options or generic null (::) to use + * defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or + * mixed list of -7|-11|4h. * @return NULL on success, error otherwise */ EXP K writeParquet(K parquet_file, K schema_id, K array_data, K options); @@ -87,29 +118,44 @@ extern "C" * @brief Reads the arrow array data from the specified parquet file * * Supported options: - * parquet_multithreaded_read - * Flag indicating whether the parquet reader should run in multithreaded - * mode. This can improve performance by processing multiple columns in - * parallel (long, default: 0) - * use_mmap - * Flag indicating whether the parquet file should be memory mapped in. - * This can improve performance on systems which support mmap (long, - * default: 0) + * + * PARQUET_MULTITHREADED_READ (long) - Flag indicating whether the parquet + * reader should run in multithreaded mode. This can improve performance by + * processing multiple columns in parallel. Default 0 + * + * USE_MMAP (long) - Flag indicating whether the parquet file should be memory + * mapped in. This can improve performance on systems which support mmap. + * Default 0 + * + * DECIMAL128_AS_DOUBLE (long) - Flag indicating whether to override the + * default type mapping for the arrow decimal128 datatype and instead + * represent it as a double (9h). Default 0. * * @param parquet_file String name of the parquet file to read - * @param options Dictionary of symbol options to long values + * @options Dictionary of options or generic null (::) to use + * defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or + * mixed list of -7|-11|4h. 
* @return Mixed list of arrow array objects */ EXP K readParquetData(K parquet_file, K options); /** * @brief Reads a single column from a parquet file - * + * + * Supported options: + * + * DECIMAL128_AS_DOUBLE (long) - Flag indicating whether to override the + * default type mapping for the arrow decimal128 datatype and instead + * represent it as a double (9h). Default 0. + * * @param parquet_file String name of the parquet file to read * @param column_index The index of the column to be read + * @options Dictionary of options or generic null (::) to use + * defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or + * mixed list of -7|-11|4h. * @return Arrow array object */ - EXP K readParquetColumn(K parquet_file, K column_index); + EXP K readParquetColumn(K parquet_file, K column_index, K options); /** * @brief Creates an arrow IPC record batch file with the specified arrow * schema and populates it from a mixed list of arrow array objects. * * The mixed list of arrow array data should be ordered in schema field * number. Each kdb object representing one of the arrays must be structured * according to the field's datatype. This required array data structure is * detailed for each of the datatype constructor functions. * + * Supported options: + * + * DECIMAL128_AS_DOUBLE (long) - Flag indicating whether to override the + * default type mapping for the arrow decimal128 datatype and instead + * represent it as a double (9h). Default 0. + * * @param arrow_file String name of the arrow file to write * @param schema_id The schema identifier * @param array_data Mixed list of arrow array data to be written to the * file + * @options Dictionary of options or generic null (::) to use + * defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or + * mixed list of -7|-11|4h. * @return NULL on success, error otherwise */ - EXP K writeArrow(K arrow_file, K schema_id, K array_data); + EXP K writeArrow(K arrow_file, K schema_id, K array_data, K options); /** * @brief Reads the arrow schema from the specified arrow IPC record batch * file * @@ -142,12 +197,19 @@ extern "C" * batch file * * Supported options: - * use_mmap - * Flag indicating whether the arrow file should be memory mapped in. This - * can improve performance on systems which support mmap (long, default: 0) + * + * USE_MMAP (long) - Flag indicating whether the arrow file should be memory + * mapped in. This can improve performance on systems which support mmap. + * Default 0 + * + * DECIMAL128_AS_DOUBLE (long) - Flag indicating whether to override the + * default type mapping for the arrow decimal128 datatype and instead + * represent it as a double (9h). Default 0. * * @param arrow_file String name of the arrow file to read - * @param options Dictionary of symbol options to long values + * @options Dictionary of options or generic null (::) to use + * defaults. Dictionary key must be a 11h list. 
Values list can be 7h, 11h or + * mixed list of -7|-11|4h. * @return KG list containing the serialized stream data */ - EXP K serializeArrow(K schema_id, K array_data); + EXP K serializeArrow(K schema_id, K array_data, K options); /** * @brief Parses the arrow schema from the specified arrow IPC record batch @@ -180,10 +251,19 @@ extern "C" * @brief Parses the arrow array data from the specified arrow IPC record * batch stream * + * Supported options: + * + * DECIMAL128_AS_DOUBLE (long) - Flag indicating whether to override the + * default type mapping for the arrow decimal128 datatype and instead + * represent it as a double (9h). Default 0. + * * @param char_array KG list containing the serialized stream data + * @options Dictionary of options or generic null (::) to use + * defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or + * mixed list of -7|-11|4h. * @return Mixed list of arrow array objects */ - EXP K parseArrowData(K char_array); + EXP K parseArrowData(K char_array, K options); } diff --git a/tests/test.t b/tests/test.t index 5454f58..f42ae75 100644 --- a/tests/test.t +++ b/tests/test.t @@ -4,7 +4,7 @@ \l q/arrowkdb.q -// Move to protobuf namespace +// Move to arrowkdb namespace \d .arrowkdb @@ -179,16 +179,16 @@ rm filename; -1 "<--- Read/write arrow file --->"; filename:"ints.arrow" -ipc.writeArrow[filename;schema;array_data] +ipc.writeArrow[filename;schema;array_data;::] ipc.readArrowSchema[filename]~schema ipc.readArrowData[filename;::]~array_data rm filename; -1 "<--- Read/write arrow stream --->"; -serialized:ipc.serializeArrow[schema;array_data] +serialized:ipc.serializeArrow[schema;array_data;::] ipc.parseArrowSchema[serialized]~schema -ipc.parseArrowData[serialized]~array_data +ipc.parseArrowData[serialized;::]~array_data sc.removeSchema[schema] @@ -211,16 +211,16 @@ rm filename; -1 "<--- Read/write arrow file --->"; filename:"floats_bool_na_dec.arrow" -ipc.writeArrow[filename;schema;array_data] +ipc.writeArrow[filename;schema;array_data;::] ipc.readArrowSchema[filename]~schema ipc.readArrowData[filename;::]~array_data rm filename; -1 "<--- Read/write arrow stream --->"; -serialized:ipc.serializeArrow[schema;array_data] +serialized:ipc.serializeArrow[schema;array_data;::] ipc.parseArrowSchema[serialized]~schema -ipc.parseArrowData[serialized]~array_data +ipc.parseArrowData[serialized;::]~array_data sc.removeSchema[schema] @@ -243,16 +243,16 @@ rm filename; -1 "<--- Read/write arrow file --->"; filename:"utf8_binary.arrow" -ipc.writeArrow[filename;schema;array_data] +ipc.writeArrow[filename;schema;array_data;::] ipc.readArrowSchema[filename]~schema ipc.readArrowData[filename;::]~array_data rm filename; -1 "<--- Read/write arrow stream --->"; -serialized:ipc.serializeArrow[schema;array_data] +serialized:ipc.serializeArrow[schema;array_data;::] ipc.parseArrowSchema[serialized]~schema -ipc.parseArrowData[serialized]~array_data +ipc.parseArrowData[serialized;::]~array_data sc.removeSchema[schema] @@ -279,16 +279,16 @@ rm filename; -1 "<--- Read/write arrow file --->"; filename:"temporal.arrow" -ipc.writeArrow[filename;schema;array_data] +ipc.writeArrow[filename;schema;array_data;::] ipc.readArrowSchema[filename]~schema ipc.readArrowData[filename;::]~array_data rm filename; -1 "<--- Read/write arrow stream --->"; -serialized:ipc.serializeArrow[schema;array_data] +serialized:ipc.serializeArrow[schema;array_data;::] ipc.parseArrowSchema[serialized]~schema -ipc.parseArrowData[serialized]~array_data +ipc.parseArrowData[serialized;::]~array_data 
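The assertions above keep the defaults by passing generic null; an explicit dictionary is accepted in the same position. For this temporal schema the decimal override has nothing to act on, so a call such as the following (assuming the .arrowkdb namespace set at the top of test.t) compares equal in the same way:

opts:(enlist `DECIMAL128_AS_DOUBLE)!enlist 1
ipc.parseArrowData[serialized;opts]~array_data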
sc.removeSchema[schema] @@ -311,16 +311,16 @@ rm filename; -1 "<--- Read/write arrow file --->"; filename:"lists.arrow" -ipc.writeArrow[filename;schema;array_data] +ipc.writeArrow[filename;schema;array_data;::] ipc.readArrowSchema[filename]~schema ipc.readArrowData[filename;::]~array_data rm filename; -1 "<--- Read/write arrow stream --->"; -serialized:ipc.serializeArrow[schema;array_data] +serialized:ipc.serializeArrow[schema;array_data;::] ipc.parseArrowSchema[serialized]~schema -ipc.parseArrowData[serialized]~array_data +ipc.parseArrowData[serialized;::]~array_data sc.removeSchema[schema] @@ -343,16 +343,16 @@ rm filename; -1 "<--- Read/write arrow file --->"; filename:"map_struct.arrow" -ipc.writeArrow[filename;schema;array_data] +ipc.writeArrow[filename;schema;array_data;::] ipc.readArrowSchema[filename]~schema ipc.readArrowData[filename;::]~array_data rm filename; -1 "<--- Read/write arrow stream --->"; -serialized:ipc.serializeArrow[schema;array_data] +serialized:ipc.serializeArrow[schema;array_data;::] ipc.parseArrowSchema[serialized]~schema -ipc.parseArrowData[serialized]~array_data +ipc.parseArrowData[serialized;::]~array_data sc.removeSchema[schema] @@ -378,16 +378,16 @@ array_data:(float16_data;large_utf8_data;large_binary_data;month_interval_data;d -1 "<--- Read/write arrow file --->"; filename:"simple_arrow_only.arrow" -ipc.writeArrow[filename;schema;array_data] +ipc.writeArrow[filename;schema;array_data;::] ipc.readArrowSchema[filename]~schema ipc.readArrowData[filename;::]~array_data rm filename; -1 "<--- Read/write arrow stream --->"; -serialized:ipc.serializeArrow[schema;array_data] +serialized:ipc.serializeArrow[schema;array_data;::] ipc.parseArrowSchema[serialized]~schema -ipc.parseArrowData[serialized]~array_data +ipc.parseArrowData[serialized;::]~array_data sc.removeSchema[schema] @@ -410,16 +410,16 @@ array_data:(fixed_size_list_data;sparse_union_data;dense_union_data;dictionary_d -1 "<--- Read/write arrow file --->"; filename:"nested_arrow_only.arrow" -ipc.writeArrow[filename;schema;array_data] +ipc.writeArrow[filename;schema;array_data;::] ipc.readArrowSchema[filename]~schema ipc.readArrowData[filename;::]~array_data rm filename; -1 "<--- Read/write arrow stream --->"; -serialized:ipc.serializeArrow[schema;array_data] +serialized:ipc.serializeArrow[schema;array_data;::] ipc.parseArrowSchema[serialized]~schema -ipc.parseArrowData[serialized]~array_data +ipc.parseArrowData[serialized;::]~array_data sc.removeSchema[schema] @@ -460,22 +460,22 @@ filename:"inferred.parquet" pq.writeParquetFromTable[filename;table;parquet_write_options] pq.readParquetSchema[filename]~schema pq.readParquetToTable[filename;::]~table -pq.readParquetColumn[filename;6i]~float64_data +pq.readParquetColumn[filename;6i;::]~float64_data rm filename; -1 "<--- Read/write arrow file --->"; filename:"inferred.arrow" -ipc.writeArrowFromTable[filename;table] +ipc.writeArrowFromTable[filename;table;::] ipc.readArrowSchema[filename]~schema ipc.readArrowToTable[filename;::]~table rm filename; -1 "<--- Read/write arrow stream --->"; -serialized:ipc.serializeArrowFromTable[table] +serialized:ipc.serializeArrowFromTable[table;::] ipc.parseArrowSchema[serialized]~schema -ipc.parseArrowToTable[serialized]~table +ipc.parseArrowToTable[serialized;::]~table sc.removeSchema[schema] diff --git a/travis_setup.sh b/travis_setup.sh index 41ab7db..a94d9f6 100644 --- a/travis_setup.sh +++ b/travis_setup.sh @@ -3,13 +3,9 @@ if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo apt update sudo apt install -y -V ca-certificates 
lsb-release wget - if [ $(lsb_release --codename --short) = "stretch" ]; then - sudo tee /etc/apt/sources.list.d/backports.list <