From b093cb4c160a74f652bbb0726c160aa48a7d1b60 Mon Sep 17 00:00:00 2001 From: nmcdonnell-kx Date: Tue, 21 May 2024 14:29:43 +0100 Subject: [PATCH 1/2] String arrays in libarrow have a maximum size of 2GB. Therefore when writing a very long string array to parquet it needs to be chunked into the parquet file writer. --- docs/reference.md | 2 ++ src/TableData.cpp | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/reference.md b/docs/reference.md index df43315..086c8ce 100755 --- a/docs/reference.md +++ b/docs/reference.md @@ -2005,6 +2005,7 @@ Supported options: - `COMPRESSION` - Selects the compression type for Arrow to use when writing Parquet files. The libarrow build being used must include the corresponding libraries. Values supported: `UNCOMPRESSED` (default), `SNAPPY`, `GZIP`, `BROTLI`, `ZSTD`, `LZ4_RAW`, `LZ4`, `LZ4_HADOOP`, `LZO`, `BZ2`. - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. - `NULL_MAPPING` - Sub-dictionary of null mapping datatypes and values. See [here](null-mapping.md) for more details. +- `ARROW_CHUNK_ROWS` - The number of rows to include in each arrow array. If the kdb data contains more rows than this, the kdb lists are chunked internally when passed to the parquet file writer. This is different to row groups (set using `PARQUET_CHUNK_SIZE`) which control how the parquet file is structured. Long, default 0 (not enabled). > :warning: **The Parquet format is compressed and designed for for maximum space efficiency which may cause a performance overhead compared to Arrow. Parquet is also less fully featured than Arrow which can result in schema limitations** > @@ -2044,6 +2045,7 @@ Supported options: - `PARQUET_VERSION` - Select the Parquet format version: `V1.0`, `V2.0`, `V2.4`, `V2.6` or `V2.LATEST`. 
Later versions are more fully featured but may be incompatible with older Parquet implementations. Default `V1.0` - `COMPRESSION` - Selects the compression type for Arrow to use when writing Parquet files. The libarrow build being used must include the corresponding libraries. Values supported: `UNCOMPRESSED` (default), `SNAPPY`, `GZIP`, `BROTLI`, `ZSTD`, `LZ4_RAW`, `LZ4`, `LZ4_HADOOP`, `LZO`, `BZ2`. - `NULL_MAPPING` - Sub-dictionary of null mapping datatypes and values. See [here](null-mapping.md) for more details. +- `ARROW_CHUNK_ROWS` - The number of rows to include in each arrow array. If the kdb data contains more rows than this, the kdb lists are chunked internally when passed to the parquet file writer. This is different to row groups (set using `PARQUET_CHUNK_SIZE`) which control how the parquet file is structured. Long, default 0 (not enabled). > :warning: **Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors** > diff --git a/src/TableData.cpp b/src/TableData.cpp index 4b16b12..9c00302 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -96,7 +96,7 @@ std::vector> MakeChunkedArrays( // Create a an arrow table from the arrow schema and mixed list of kdb array objects std::shared_ptr MakeTable(std::shared_ptr schema, K array_data, kx::arrowkdb::TypeMappingOverride& type_overrides) { - return arrow::Table::Make(schema, MakeArrays(schema, array_data, type_overrides)); + return arrow::Table::Make(schema, MakeChunkedArrays(schema, array_data, type_overrides)); } K prettyPrintTable(K schema_id, K array_data, K options) @@ -222,6 +222,9 @@ K writeParquet(K parquet_file, K schema_id, K array_data, K options) // Type mapping overrides kx::arrowkdb::TypeMappingOverride type_overrides{ write_options }; + // Chunk size + write_options.GetIntOption( kx::arrowkdb::Options::ARROW_CHUNK_ROWS, type_overrides.chunk_length ); + auto parquet_props = 
parquet_props_builder.compression(getCompressionType(write_options))->build(); auto arrow_props = arrow_props_builder.build(); From 0fea2f902cba370af5c8e7bb12050f0808c55c17 Mon Sep 17 00:00:00 2001 From: nmcdonnell-kx Date: Tue, 21 May 2024 15:47:04 +0100 Subject: [PATCH 2/2] Bump the xcode version --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 357f415..a62423e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,7 +4,7 @@ jobs: os: linux - dist: focal os: linux - - osx_image: xcode12.5 + - osx_image: xcode14 os: osx - os: windows language: c