From 407496b904282f142973571bcd0808b0d6b38df3 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 5 Jan 2023 17:10:43 +0000 Subject: [PATCH 001/276] Array reader decomposition --- src/ArrayReader.cpp | 565 +++++++++++++++++++++++++------------------- 1 file changed, 316 insertions(+), 249 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 0456986..255c664 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -15,29 +16,27 @@ #include "HelperFunctions.h" #include "TypeCheck.h" - -namespace kx { -namespace arrowkdb { +namespace { // An arrow list array is a nested set of child lists. This is represented in // kdb as a mixed list for the parent list array containing a set of sub-lists, // one for each of the list value sets. template -void AppendList(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +void AppendList(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) { for (auto i = 0; i < array_data->length(); ++i) { // Slice the parent array to get the list value set at the specified index auto value_slice = std::static_pointer_cast(array_data)->value_slice(i); // Recursively populate the kdb parent mixed list from that slice - kK(k_array)[index++] = ReadArray(value_slice, type_overrides); + kK(k_array)[index++] = kx::arrowkdb::ReadArray(value_slice, type_overrides); } } // An arrow map array is a nested set of key/item paired child arrays. This is // represented in kdb as a mixed list for the parent map array, with a // dictionary for each map value set. 
-void AppendMap(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +void AppendMap(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) { auto map_array = std::static_pointer_cast(array_data); auto keys = map_array->keys(); @@ -49,7 +48,7 @@ void AppendMap(std::shared_ptr array_data, K k_array, size_t& inde auto items_slice = items->Slice(map_array->value_offset(i), map_array->value_length(i)); // Recursively populate the kdb parent mixed list with a dictionary // populated from those slices - kK(k_array)[index++] = xD(ReadArray(keys_slice, type_overrides), ReadArray(items_slice, type_overrides)); + kK(k_array)[index++] = xD(kx::arrowkdb::ReadArray(keys_slice, type_overrides), kx::arrowkdb::ReadArray(items_slice, type_overrides)); } } @@ -58,7 +57,7 @@ void AppendMap(std::shared_ptr array_data, K k_array, size_t& inde // value is obtaining by slicing across all the child arrays at a given index. // This is represented in kdb as a mixed list for the parent struct array, // containing child lists for each field in the struct. -void AppendStruct(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +void AppendStruct(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) { auto struct_array = std::static_pointer_cast(array_data); auto num_fields = struct_array->type()->num_fields(); @@ -75,7 +74,7 @@ void AppendStruct(std::shared_ptr array_data, K k_array, size_t& i // An arrow union array is similar to a struct array except that it has an // additional type id array which identifies the live field in each union value // set. 
-void AppendUnion(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +void AppendUnion(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) { auto union_array = std::static_pointer_cast(array_data); @@ -98,7 +97,7 @@ void AppendUnion(std::shared_ptr array_data, K k_array, size_t& in // An arrow dictionary array is represented in kdb as a mixed list for the // parent dictionary array containing the values and indicies sub-lists. -void AppendDictionary(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +void AppendDictionary(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) { auto dictionary_array = std::static_pointer_cast(array_data); @@ -106,259 +105,327 @@ void AppendDictionary(std::shared_ptr array_data, K k_array, size_ // two child arrays could be a different length to each other and the parent // dictionary array which makes it difficult to preallocate the kdb lists of // the correct length. 
- K values = ReadArray(dictionary_array->dictionary(), type_overrides); + K values = kx::arrowkdb::ReadArray(dictionary_array->dictionary(), type_overrides); jv(&kK(k_array)[0], values); - K indices = ReadArray(dictionary_array->indices(), type_overrides); + K indices = kx::arrowkdb::ReadArray(dictionary_array->indices(), type_overrides); jv(&kK(k_array)[1], indices); } -void AppendArray(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +void AppendArray_NA(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) { - switch (array_data->type_id()) { - case arrow::Type::NA: - { - auto null_array = std::static_pointer_cast(array_data); - for (auto i = 0; i < null_array->length(); ++i) - kK(k_array)[index++] = knk(0); - break; - } - case arrow::Type::BOOL: - { - auto bool_array = std::static_pointer_cast(array_data); - // BooleanArray doesn't have a bulk reader since arrow BooleanType is only 1 bit - for (auto i = 0; i < bool_array->length(); ++i) - kG(k_array)[index++] = bool_array->Value(i); - break; - } - case arrow::Type::UINT8: - { - auto uint8_array = std::static_pointer_cast(array_data); - memcpy(kG(k_array), uint8_array->raw_values(), uint8_array->length() * sizeof(arrow::UInt8Array::value_type)); - break; - } - case arrow::Type::INT8: - { - auto int8_array = std::static_pointer_cast(array_data); - memcpy(kG(k_array), int8_array->raw_values(), int8_array->length() * sizeof(arrow::Int8Array::value_type)); - break; - } - case arrow::Type::UINT16: - { - auto uint16_array = std::static_pointer_cast(array_data); - memcpy(kH(k_array), uint16_array->raw_values(), uint16_array->length() * sizeof(arrow::UInt16Array::value_type)); - break; - } - case arrow::Type::INT16: - { - auto int16_array = std::static_pointer_cast(array_data); - memcpy(kH(k_array), int16_array->raw_values(), int16_array->length() * sizeof(arrow::Int16Array::value_type)); - break; - } - case arrow::Type::UINT32: 
- { - auto uint32_array = std::static_pointer_cast(array_data); - memcpy(kI(k_array), uint32_array->raw_values(), uint32_array->length() * sizeof(arrow::UInt32Array::value_type)); - break; - } - case arrow::Type::INT32: - { - auto int32_array = std::static_pointer_cast(array_data); - memcpy(kI(k_array), int32_array->raw_values(), int32_array->length() * sizeof(arrow::Int32Array::value_type)); - break; - } - case arrow::Type::UINT64: - { - auto uint64_array = std::static_pointer_cast(array_data); - memcpy(kJ(k_array), uint64_array->raw_values(), uint64_array->length() * sizeof(arrow::UInt64Array::value_type)); - break; - } - case arrow::Type::INT64: - { - auto int64_array = std::static_pointer_cast(array_data); - memcpy(kJ(k_array), int64_array->raw_values(), int64_array->length() * sizeof(arrow::Int64Array::value_type)); - break; - } - case arrow::Type::HALF_FLOAT: - { - auto hfl_array = std::static_pointer_cast(array_data); - memcpy(kH(k_array), hfl_array->raw_values(), hfl_array->length() * sizeof(arrow::HalfFloatArray::value_type)); - break; - } - case arrow::Type::FLOAT: - { - auto fl_array = std::static_pointer_cast(array_data); - memcpy(kE(k_array), fl_array->raw_values(), fl_array->length() * sizeof(arrow::FloatArray::value_type)); - break; - } - case arrow::Type::DOUBLE: - { - auto dbl_array = std::static_pointer_cast(array_data); - memcpy(kF(k_array), dbl_array->raw_values(), dbl_array->length() * sizeof(arrow::DoubleArray::value_type)); - break; - } - case arrow::Type::STRING: - { - auto str_array = std::static_pointer_cast(array_data); - for (auto i = 0; i < str_array->length(); ++i) { - auto str_data = str_array->GetString(i); - K k_str = ktn(KC, str_data.length()); - memcpy(kG(k_str), str_data.data(), str_data.length()); - kK(k_array)[index++] = k_str; - } - break; - } - case arrow::Type::LARGE_STRING: - { - auto str_array = std::static_pointer_cast(array_data); - for (auto i = 0; i < str_array->length(); ++i) { - auto str_data = 
str_array->GetString(i); - K k_str = ktn(KC, str_data.length()); - memcpy(kG(k_str), str_data.data(), str_data.length()); - kK(k_array)[index++] = k_str; - } - break; - } - case arrow::Type::BINARY: - { - auto bin_array = std::static_pointer_cast(array_data); - for (auto i = 0; i < bin_array->length(); ++i) { - auto bin_data = bin_array->GetString(i); - K k_bin = ktn(KG, bin_data.length()); - memcpy(kG(k_bin), bin_data.data(), bin_data.length()); - kK(k_array)[index++] = k_bin; - } - break; - } - case arrow::Type::LARGE_BINARY: - { - auto bin_array = std::static_pointer_cast(array_data); - for (auto i = 0; i < bin_array->length(); ++i) { - auto bin_data = bin_array->GetString(i); - K k_bin = ktn(KG, bin_data.length()); - memcpy(kG(k_bin), bin_data.data(), bin_data.length()); - kK(k_array)[index++] = k_bin; - } - break; - } - case arrow::Type::FIXED_SIZE_BINARY: - { - auto fixed_bin_array = std::static_pointer_cast(array_data); - for (auto i = 0; i < fixed_bin_array->length(); ++i) { - auto bin_data = fixed_bin_array->GetString(i); - K k_bin = ktn(KG, bin_data.length()); - memcpy(kG(k_bin), bin_data.data(), bin_data.length()); - kK(k_array)[index++] = k_bin; - } - break; - } - case arrow::Type::DATE32: - { - TemporalConversion tc(array_data->type()); - auto d32_array = std::static_pointer_cast(array_data); - for (auto i = 0; i < d32_array->length(); ++i) - kI(k_array)[index++] = tc.ArrowToKdb(d32_array->Value(i)); - break; + auto null_array = std::static_pointer_cast(array_data); + for (auto i = 0; i < null_array->length(); ++i) + kK(k_array)[index++] = knk(0); +} + +void AppendArray_BOOL(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto bool_array = std::static_pointer_cast(array_data); + // BooleanArray doesn't have a bulk reader since arrow BooleanType is only 1 bit + for (auto i = 0; i < bool_array->length(); ++i) + kG(k_array)[index++] = bool_array->Value(i); +} + +void 
AppendArray_UINT8(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto uint8_array = std::static_pointer_cast(array_data); + memcpy(kG(k_array), uint8_array->raw_values(), uint8_array->length() * sizeof(arrow::UInt8Array::value_type)); +} + +void AppendArray_INT8(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto int8_array = std::static_pointer_cast(array_data); + memcpy(kG(k_array), int8_array->raw_values(), int8_array->length() * sizeof(arrow::Int8Array::value_type)); +} + +void AppendArray_UINT16(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto uint16_array = std::static_pointer_cast(array_data); + memcpy(kH(k_array), uint16_array->raw_values(), uint16_array->length() * sizeof(arrow::UInt16Array::value_type)); +} + +void AppendArray_INT16(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto int16_array = std::static_pointer_cast(array_data); + memcpy(kH(k_array), int16_array->raw_values(), int16_array->length() * sizeof(arrow::Int16Array::value_type)); +} + +void AppendArray_UINT32(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto uint32_array = std::static_pointer_cast(array_data); + memcpy(kI(k_array), uint32_array->raw_values(), uint32_array->length() * sizeof(arrow::UInt32Array::value_type)); +} + +void AppendArray_INT32(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto int32_array = std::static_pointer_cast(array_data); + memcpy(kI(k_array), int32_array->raw_values(), int32_array->length() * sizeof(arrow::Int32Array::value_type)); +} + +void AppendArray_UINT64(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto 
uint64_array = std::static_pointer_cast(array_data); + memcpy(kJ(k_array), uint64_array->raw_values(), uint64_array->length() * sizeof(arrow::UInt64Array::value_type)); +} + +void AppendArray_INT64(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto int64_array = std::static_pointer_cast(array_data); + memcpy(kJ(k_array), int64_array->raw_values(), int64_array->length() * sizeof(arrow::Int64Array::value_type)); +} + +void AppendArray_HALF_FLOAT(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto hfl_array = std::static_pointer_cast(array_data); + memcpy(kH(k_array), hfl_array->raw_values(), hfl_array->length() * sizeof(arrow::HalfFloatArray::value_type)); +} + +void AppendArray_FLOAT(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto fl_array = std::static_pointer_cast(array_data); + memcpy(kE(k_array), fl_array->raw_values(), fl_array->length() * sizeof(arrow::FloatArray::value_type)); +} + +void AppendArray_DOUBLE(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto dbl_array = std::static_pointer_cast(array_data); + memcpy(kF(k_array), dbl_array->raw_values(), dbl_array->length() * sizeof(arrow::DoubleArray::value_type)); +} + +void AppendArray_STRING(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto str_array = std::static_pointer_cast(array_data); + for (auto i = 0; i < str_array->length(); ++i) { + auto str_data = str_array->GetString(i); + K k_str = ktn(KC, str_data.length()); + memcpy(kG(k_str), str_data.data(), str_data.length()); + kK(k_array)[index++] = k_str; } - case arrow::Type::DATE64: - { - TemporalConversion tc(array_data->type()); - auto d64_array = std::static_pointer_cast(array_data); - for (auto i = 0; i < d64_array->length(); ++i) - 
kJ(k_array)[index++] = tc.ArrowToKdb(d64_array->Value(i)); - break; +} + +void AppendArray_LARGE_STRING(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto str_array = std::static_pointer_cast(array_data); + for (auto i = 0; i < str_array->length(); ++i) { + auto str_data = str_array->GetString(i); + K k_str = ktn(KC, str_data.length()); + memcpy(kG(k_str), str_data.data(), str_data.length()); + kK(k_array)[index++] = k_str; } - case arrow::Type::TIMESTAMP: - { - TemporalConversion tc(array_data->type()); - auto ts_array = std::static_pointer_cast(array_data); - auto timestamp_type = std::static_pointer_cast(ts_array->type()); - for (auto i = 0; i < ts_array->length(); ++i) - kJ(k_array)[index++] = tc.ArrowToKdb(ts_array->Value(i)); - break; +} + +void AppendArray_BINARY(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto bin_array = std::static_pointer_cast(array_data); + for (auto i = 0; i < bin_array->length(); ++i) { + auto bin_data = bin_array->GetString(i); + K k_bin = ktn(KG, bin_data.length()); + memcpy(kG(k_bin), bin_data.data(), bin_data.length()); + kK(k_array)[index++] = k_bin; } - case arrow::Type::TIME32: - { - TemporalConversion tc(array_data->type()); - auto t32_array = std::static_pointer_cast(array_data); - auto time32_type = std::static_pointer_cast(t32_array->type()); - for (auto i = 0; i < t32_array->length(); ++i) - kI(k_array)[index++] = tc.ArrowToKdb(t32_array->Value(i)); - break; +} + +void AppendArray_LARGE_BINARY(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto bin_array = std::static_pointer_cast(array_data); + for (auto i = 0; i < bin_array->length(); ++i) { + auto bin_data = bin_array->GetString(i); + K k_bin = ktn(KG, bin_data.length()); + memcpy(kG(k_bin), bin_data.data(), bin_data.length()); + kK(k_array)[index++] = k_bin; } - case 
arrow::Type::TIME64: - { - TemporalConversion tc(array_data->type()); - auto t64_array = std::static_pointer_cast(array_data); - auto time64_type = std::static_pointer_cast(t64_array->type()); - for (auto i = 0; i < t64_array->length(); ++i) - kJ(k_array)[index++] = tc.ArrowToKdb(t64_array->Value(i)); - break; +} + +void AppendArray_FIXED_SIZE_BINARY(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto fixed_bin_array = std::static_pointer_cast(array_data); + for (auto i = 0; i < fixed_bin_array->length(); ++i) { + auto bin_data = fixed_bin_array->GetString(i); + K k_bin = ktn(KG, bin_data.length()); + memcpy(kG(k_bin), bin_data.data(), bin_data.length()); + kK(k_array)[index++] = k_bin; } - case arrow::Type::DECIMAL: - { - auto dec_array = std::static_pointer_cast(array_data); - auto dec_type = std::static_pointer_cast(dec_array->type()); - for (auto i = 0; i < dec_array->length(); ++i) { - auto decimal = arrow::Decimal128(dec_array->Value(i)); - if (type_overrides.decimal128_as_double) { - // Convert the decimal to a double - auto dec_as_double = decimal.ToDouble(dec_type->scale()); - kF(k_array)[index++] = dec_as_double; - } else { - // Each decimal is a list of 16 bytes - K k_dec = ktn(KG, 16); - decimal.ToBytes(kG(k_dec)); - kK(k_array)[index++] = k_dec; - } +} + +void AppendArray_DATE32(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + kx::arrowkdb::TemporalConversion tc(array_data->type()); + auto d32_array = std::static_pointer_cast(array_data); + for (auto i = 0; i < d32_array->length(); ++i) + kI(k_array)[index++] = tc.ArrowToKdb(d32_array->Value(i)); +} + +void AppendArray_DATE64(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + kx::arrowkdb::TemporalConversion tc(array_data->type()); + auto d64_array = std::static_pointer_cast(array_data); + for (auto i = 0; i < 
d64_array->length(); ++i) + kJ(k_array)[index++] = tc.ArrowToKdb(d64_array->Value(i)); +} + +void AppendArray_TIMESTAMP(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + kx::arrowkdb::TemporalConversion tc(array_data->type()); + auto ts_array = std::static_pointer_cast(array_data); + auto timestamp_type = std::static_pointer_cast(ts_array->type()); + for (auto i = 0; i < ts_array->length(); ++i) + kJ(k_array)[index++] = tc.ArrowToKdb(ts_array->Value(i)); +} + +void AppendArray_TIME32(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + kx::arrowkdb::TemporalConversion tc(array_data->type()); + auto t32_array = std::static_pointer_cast(array_data); + auto time32_type = std::static_pointer_cast(t32_array->type()); + for (auto i = 0; i < t32_array->length(); ++i) + kI(k_array)[index++] = tc.ArrowToKdb(t32_array->Value(i)); +} + +void AppendArray_TIME64(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + kx::arrowkdb::TemporalConversion tc(array_data->type()); + auto t64_array = std::static_pointer_cast(array_data); + auto time64_type = std::static_pointer_cast(t64_array->type()); + for (auto i = 0; i < t64_array->length(); ++i) + kJ(k_array)[index++] = tc.ArrowToKdb(t64_array->Value(i)); +} + +void AppendArray_DECIMAL(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto dec_array = std::static_pointer_cast(array_data); + auto dec_type = std::static_pointer_cast(dec_array->type()); + for (auto i = 0; i < dec_array->length(); ++i) { + auto decimal = arrow::Decimal128(dec_array->Value(i)); + if (type_overrides.decimal128_as_double) { + // Convert the decimal to a double + auto dec_as_double = decimal.ToDouble(dec_type->scale()); + kF(k_array)[index++] = dec_as_double; + } else { + // Each decimal is a list of 16 bytes + K k_dec = ktn(KG, 
16); + decimal.ToBytes(kG(k_dec)); + kK(k_array)[index++] = k_dec; } - break; - } - case arrow::Type::DURATION: - { - TemporalConversion tc(array_data->type()); - auto dur_array = std::static_pointer_cast(array_data); - auto duration_type = std::static_pointer_cast(dur_array->type()); - for (auto i = 0; i < dur_array->length(); ++i) - kJ(k_array)[index++] = tc.ArrowToKdb(dur_array->Value(i)); - break; } - case arrow::Type::INTERVAL_MONTHS: +} + +void AppendArray_DURATION(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + kx::arrowkdb::TemporalConversion tc(array_data->type()); + auto dur_array = std::static_pointer_cast(array_data); + auto duration_type = std::static_pointer_cast(dur_array->type()); + for (auto i = 0; i < dur_array->length(); ++i) + kJ(k_array)[index++] = tc.ArrowToKdb(dur_array->Value(i)); +} + +void AppendArray_INTERVAL_MONTHS(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto month_array = std::static_pointer_cast(array_data); + memcpy(kI(k_array), month_array->raw_values(), month_array->length() * sizeof(arrow::MonthIntervalArray::value_type)); +} + +void AppendArray_INTERVAL_DAY_TIME(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto dt_array = std::static_pointer_cast(array_data); + for (auto i = 0; i < dt_array->length(); ++i) + kJ(k_array)[index++] = kx::arrowkdb::DayTimeInterval_KTimespan(dt_array->Value(i)); +} + +void AppendArray_LIST(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + AppendList(array_data, k_array, index, type_overrides); +} + +void AppendArray_LARGE_LIST(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + AppendList(array_data, k_array, index, type_overrides); +} + +void AppendArray_FIXED_SIZE_LIST(std::shared_ptr 
array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + AppendList(array_data, k_array, index, type_overrides); +} + +void AppendArray_MAP(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + AppendMap(array_data, k_array, index, type_overrides); +} + +void AppendArray_STRUCT(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + AppendStruct(array_data, k_array, index, type_overrides); +} + +void AppendArray_SPARSE_UNION(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + AppendUnion(array_data, k_array, index, type_overrides); +} + +void AppendArray_DENSE_UNION(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + AppendArray_SPARSE_UNION(array_data, k_array, index, type_overrides); +} + +void AppendArray_DICTIONARY(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + AppendDictionary(array_data, k_array, index, type_overrides); +} + +using ArrayHandler = void (*) (std::shared_ptr, K, size_t&, kx::arrowkdb::TypeMappingOverride&); + +std::unordered_map ArrayHandlers { + std::make_pair( arrow::Type::NA, &AppendArray_NA ) + , std::make_pair( arrow::Type::BOOL, &AppendArray_BOOL ) + , std::make_pair( arrow::Type::UINT8, &AppendArray_UINT8 ) + , std::make_pair( arrow::Type::INT8, &AppendArray_INT8 ) + , std::make_pair( arrow::Type::UINT16, &AppendArray_UINT16 ) + , std::make_pair( arrow::Type::INT16, &AppendArray_INT16 ) + , std::make_pair( arrow::Type::UINT32, &AppendArray_UINT32 ) + , std::make_pair( arrow::Type::INT32, &AppendArray_INT32 ) + , std::make_pair( arrow::Type::UINT64, &AppendArray_UINT64 ) + , std::make_pair( arrow::Type::INT64, &AppendArray_INT64 ) + , std::make_pair( arrow::Type::HALF_FLOAT, &AppendArray_HALF_FLOAT ) + , std::make_pair( 
arrow::Type::FLOAT, &AppendArray_FLOAT ) + , std::make_pair( arrow::Type::DOUBLE, &AppendArray_DOUBLE ) + , std::make_pair( arrow::Type::STRING, &AppendArray_STRING ) + , std::make_pair( arrow::Type::LARGE_STRING, &AppendArray_LARGE_STRING ) + , std::make_pair( arrow::Type::BINARY, &AppendArray_BINARY ) + , std::make_pair( arrow::Type::LARGE_BINARY, &AppendArray_LARGE_BINARY ) + , std::make_pair( arrow::Type::FIXED_SIZE_BINARY, &AppendArray_FIXED_SIZE_BINARY ) + , std::make_pair( arrow::Type::DATE32, &AppendArray_DATE32 ) + , std::make_pair( arrow::Type::DATE64, &AppendArray_DATE64 ) + , std::make_pair( arrow::Type::TIMESTAMP, &AppendArray_TIMESTAMP ) + , std::make_pair( arrow::Type::TIME32, &AppendArray_TIME32 ) + , std::make_pair( arrow::Type::TIME64, &AppendArray_TIME64 ) + , std::make_pair( arrow::Type::DECIMAL, &AppendArray_DECIMAL ) + , std::make_pair( arrow::Type::DURATION, &AppendArray_DURATION ) + , std::make_pair( arrow::Type::INTERVAL_MONTHS, &AppendArray_INTERVAL_MONTHS ) + , std::make_pair( arrow::Type::INTERVAL_DAY_TIME, &AppendArray_INTERVAL_DAY_TIME ) + , std::make_pair( arrow::Type::LIST, &AppendArray_LIST ) + , std::make_pair( arrow::Type::LARGE_LIST, &AppendArray_LARGE_LIST ) + , std::make_pair( arrow::Type::FIXED_SIZE_LIST, &AppendArray_FIXED_SIZE_LIST ) + , std::make_pair( arrow::Type::MAP, &AppendArray_MAP ) + , std::make_pair( arrow::Type::STRUCT, &AppendArray_STRUCT ) + , std::make_pair( arrow::Type::SPARSE_UNION, &AppendArray_SPARSE_UNION ) + , std::make_pair( arrow::Type::DENSE_UNION, &AppendArray_DENSE_UNION ) + , std::make_pair( arrow::Type::DICTIONARY, &AppendArray_DICTIONARY ) +}; + +} // namespace + +namespace kx { +namespace arrowkdb { + +void AppendArray(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + auto type_id = array_data->type_id(); + if( ArrayHandlers.find( type_id ) == ArrayHandlers.end() ) { - auto month_array = std::static_pointer_cast(array_data); - memcpy(kI(k_array), 
month_array->raw_values(), month_array->length() * sizeof(arrow::MonthIntervalArray::value_type)); - break; + TYPE_CHECK_UNSUPPORTED(array_data->type()->ToString()); } - case arrow::Type::INTERVAL_DAY_TIME: + else { - auto dt_array = std::static_pointer_cast(array_data); - for (auto i = 0; i < dt_array->length(); ++i) - kJ(k_array)[index++] = DayTimeInterval_KTimespan(dt_array->Value(i)); - break; - } - case arrow::Type::LIST: - AppendList(array_data, k_array, index, type_overrides); - break; - case arrow::Type::LARGE_LIST: - AppendList(array_data, k_array, index, type_overrides); - break; - case arrow::Type::FIXED_SIZE_LIST: - AppendList(array_data, k_array, index, type_overrides); - break; - case arrow::Type::MAP: - AppendMap(array_data, k_array, index, type_overrides); - break; - case arrow::Type::STRUCT: - AppendStruct(array_data, k_array, index, type_overrides); - break; - case arrow::Type::SPARSE_UNION: - case arrow::Type::DENSE_UNION: - AppendUnion(array_data, k_array, index, type_overrides); - break; - case arrow::Type::DICTIONARY: - AppendDictionary(array_data, k_array, index, type_overrides); - break; - default: - TYPE_CHECK_UNSUPPORTED(array_data->type()->ToString()); + ArrayHandlers[type_id]( array_data, k_array, index, type_overrides ); } } @@ -450,4 +517,4 @@ K writeReadArray(K datatype_id, K array, K options) return kx::arrowkdb::ReadArray(arrow_array, type_overrides); KDB_EXCEPTION_CATCH; -} \ No newline at end of file +} From 8792ab973235da7b43c22a460b5444c0fafe00ba Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 5 Jan 2023 17:25:22 +0000 Subject: [PATCH 002/276] Using implicit namespaces --- src/ArrayReader.cpp | 275 ++++++++++++++++++++++---------------------- 1 file changed, 139 insertions(+), 136 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 255c664..217af1e 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -16,29 +16,32 @@ #include "HelperFunctions.h" #include "TypeCheck.h" +using namespace 
std; +using namespace kx::arrowkdb; + namespace { // An arrow list array is a nested set of child lists. This is represented in // kdb as a mixed list for the parent list array containing a set of sub-lists, // one for each of the list value sets. template -void AppendList(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendList(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { for (auto i = 0; i < array_data->length(); ++i) { // Slice the parent array to get the list value set at the specified index - auto value_slice = std::static_pointer_cast(array_data)->value_slice(i); + auto value_slice = static_pointer_cast(array_data)->value_slice(i); // Recursively populate the kdb parent mixed list from that slice - kK(k_array)[index++] = kx::arrowkdb::ReadArray(value_slice, type_overrides); + kK(k_array)[index++] = ReadArray(value_slice, type_overrides); } } // An arrow map array is a nested set of key/item paired child arrays. This is // represented in kdb as a mixed list for the parent map array, with a // dictionary for each map value set. 
-void AppendMap(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendMap(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto map_array = std::static_pointer_cast(array_data); + auto map_array = static_pointer_cast(array_data); auto keys = map_array->keys(); auto items = map_array->items(); for (auto i = 0; i < array_data->length(); ++i) { @@ -48,7 +51,7 @@ void AppendMap(std::shared_ptr array_data, K k_array, size_t& inde auto items_slice = items->Slice(map_array->value_offset(i), map_array->value_length(i)); // Recursively populate the kdb parent mixed list with a dictionary // populated from those slices - kK(k_array)[index++] = xD(kx::arrowkdb::ReadArray(keys_slice, type_overrides), kx::arrowkdb::ReadArray(items_slice, type_overrides)); + kK(k_array)[index++] = xD(ReadArray(keys_slice, type_overrides), ReadArray(items_slice, type_overrides)); } } @@ -57,9 +60,9 @@ void AppendMap(std::shared_ptr array_data, K k_array, size_t& inde // value is obtaining by slicing across all the child arrays at a given index. // This is represented in kdb as a mixed list for the parent struct array, // containing child lists for each field in the struct. -void AppendStruct(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendStruct(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto struct_array = std::static_pointer_cast(array_data); + auto struct_array = static_pointer_cast(array_data); auto num_fields = struct_array->type()->num_fields(); for (auto i = 0; i < num_fields; ++i) { auto field_array = struct_array->field(i); @@ -74,9 +77,9 @@ void AppendStruct(std::shared_ptr array_data, K k_array, size_t& i // An arrow union array is similar to a struct array except that it has an // additional type id array which identifies the live field in each union value // set. 
-void AppendUnion(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendUnion(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto union_array = std::static_pointer_cast(array_data); + auto union_array = static_pointer_cast(array_data); // The type_id array is represented as a KH list at the start of the parent mixed list. K type_ids = kK(k_array)[0]; @@ -97,104 +100,104 @@ void AppendUnion(std::shared_ptr array_data, K k_array, size_t& in // An arrow dictionary array is represented in kdb as a mixed list for the // parent dictionary array containing the values and indicies sub-lists. -void AppendDictionary(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendDictionary(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto dictionary_array = std::static_pointer_cast(array_data); + auto dictionary_array = static_pointer_cast(array_data); // Append the dictionary and indicies arrays. Have to use a join since the // two child arrays could be a different length to each other and the parent // dictionary array which makes it difficult to preallocate the kdb lists of // the correct length. 
- K values = kx::arrowkdb::ReadArray(dictionary_array->dictionary(), type_overrides); + K values = ReadArray(dictionary_array->dictionary(), type_overrides); jv(&kK(k_array)[0], values); - K indices = kx::arrowkdb::ReadArray(dictionary_array->indices(), type_overrides); + K indices = ReadArray(dictionary_array->indices(), type_overrides); jv(&kK(k_array)[1], indices); } -void AppendArray_NA(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_NA(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto null_array = std::static_pointer_cast(array_data); + auto null_array = static_pointer_cast(array_data); for (auto i = 0; i < null_array->length(); ++i) kK(k_array)[index++] = knk(0); } -void AppendArray_BOOL(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_BOOL(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto bool_array = std::static_pointer_cast(array_data); + auto bool_array = static_pointer_cast(array_data); // BooleanArray doesn't have a bulk reader since arrow BooleanType is only 1 bit for (auto i = 0; i < bool_array->length(); ++i) kG(k_array)[index++] = bool_array->Value(i); } -void AppendArray_UINT8(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_UINT8(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto uint8_array = std::static_pointer_cast(array_data); + auto uint8_array = static_pointer_cast(array_data); memcpy(kG(k_array), uint8_array->raw_values(), uint8_array->length() * sizeof(arrow::UInt8Array::value_type)); } -void AppendArray_INT8(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_INT8(shared_ptr array_data, K k_array, size_t& index, 
TypeMappingOverride& type_overrides) { - auto int8_array = std::static_pointer_cast(array_data); + auto int8_array = static_pointer_cast(array_data); memcpy(kG(k_array), int8_array->raw_values(), int8_array->length() * sizeof(arrow::Int8Array::value_type)); } -void AppendArray_UINT16(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_UINT16(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto uint16_array = std::static_pointer_cast(array_data); + auto uint16_array = static_pointer_cast(array_data); memcpy(kH(k_array), uint16_array->raw_values(), uint16_array->length() * sizeof(arrow::UInt16Array::value_type)); } -void AppendArray_INT16(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_INT16(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto int16_array = std::static_pointer_cast(array_data); + auto int16_array = static_pointer_cast(array_data); memcpy(kH(k_array), int16_array->raw_values(), int16_array->length() * sizeof(arrow::Int16Array::value_type)); } -void AppendArray_UINT32(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_UINT32(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto uint32_array = std::static_pointer_cast(array_data); + auto uint32_array = static_pointer_cast(array_data); memcpy(kI(k_array), uint32_array->raw_values(), uint32_array->length() * sizeof(arrow::UInt32Array::value_type)); } -void AppendArray_INT32(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_INT32(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto int32_array = std::static_pointer_cast(array_data); + auto int32_array = 
static_pointer_cast(array_data); memcpy(kI(k_array), int32_array->raw_values(), int32_array->length() * sizeof(arrow::Int32Array::value_type)); } -void AppendArray_UINT64(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_UINT64(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto uint64_array = std::static_pointer_cast(array_data); + auto uint64_array = static_pointer_cast(array_data); memcpy(kJ(k_array), uint64_array->raw_values(), uint64_array->length() * sizeof(arrow::UInt64Array::value_type)); } -void AppendArray_INT64(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_INT64(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto int64_array = std::static_pointer_cast(array_data); + auto int64_array = static_pointer_cast(array_data); memcpy(kJ(k_array), int64_array->raw_values(), int64_array->length() * sizeof(arrow::Int64Array::value_type)); } -void AppendArray_HALF_FLOAT(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_HALF_FLOAT(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto hfl_array = std::static_pointer_cast(array_data); + auto hfl_array = static_pointer_cast(array_data); memcpy(kH(k_array), hfl_array->raw_values(), hfl_array->length() * sizeof(arrow::HalfFloatArray::value_type)); } -void AppendArray_FLOAT(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_FLOAT(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto fl_array = std::static_pointer_cast(array_data); + auto fl_array = static_pointer_cast(array_data); memcpy(kE(k_array), fl_array->raw_values(), fl_array->length() * 
sizeof(arrow::FloatArray::value_type)); } -void AppendArray_DOUBLE(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_DOUBLE(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto dbl_array = std::static_pointer_cast(array_data); + auto dbl_array = static_pointer_cast(array_data); memcpy(kF(k_array), dbl_array->raw_values(), dbl_array->length() * sizeof(arrow::DoubleArray::value_type)); } -void AppendArray_STRING(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_STRING(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto str_array = std::static_pointer_cast(array_data); + auto str_array = static_pointer_cast(array_data); for (auto i = 0; i < str_array->length(); ++i) { auto str_data = str_array->GetString(i); K k_str = ktn(KC, str_data.length()); @@ -203,9 +206,9 @@ void AppendArray_STRING(std::shared_ptr array_data, K k_array, siz } } -void AppendArray_LARGE_STRING(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_LARGE_STRING(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto str_array = std::static_pointer_cast(array_data); + auto str_array = static_pointer_cast(array_data); for (auto i = 0; i < str_array->length(); ++i) { auto str_data = str_array->GetString(i); K k_str = ktn(KC, str_data.length()); @@ -214,9 +217,9 @@ void AppendArray_LARGE_STRING(std::shared_ptr array_data, K k_arra } } -void AppendArray_BINARY(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_BINARY(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto bin_array = std::static_pointer_cast(array_data); + auto bin_array = 
static_pointer_cast(array_data); for (auto i = 0; i < bin_array->length(); ++i) { auto bin_data = bin_array->GetString(i); K k_bin = ktn(KG, bin_data.length()); @@ -225,9 +228,9 @@ void AppendArray_BINARY(std::shared_ptr array_data, K k_array, siz } } -void AppendArray_LARGE_BINARY(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_LARGE_BINARY(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto bin_array = std::static_pointer_cast(array_data); + auto bin_array = static_pointer_cast(array_data); for (auto i = 0; i < bin_array->length(); ++i) { auto bin_data = bin_array->GetString(i); K k_bin = ktn(KG, bin_data.length()); @@ -236,9 +239,9 @@ void AppendArray_LARGE_BINARY(std::shared_ptr array_data, K k_arra } } -void AppendArray_FIXED_SIZE_BINARY(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_FIXED_SIZE_BINARY(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto fixed_bin_array = std::static_pointer_cast(array_data); + auto fixed_bin_array = static_pointer_cast(array_data); for (auto i = 0; i < fixed_bin_array->length(); ++i) { auto bin_data = fixed_bin_array->GetString(i); K k_bin = ktn(KG, bin_data.length()); @@ -247,53 +250,53 @@ void AppendArray_FIXED_SIZE_BINARY(std::shared_ptr array_data, K k } } -void AppendArray_DATE32(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_DATE32(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - kx::arrowkdb::TemporalConversion tc(array_data->type()); - auto d32_array = std::static_pointer_cast(array_data); + TemporalConversion tc(array_data->type()); + auto d32_array = static_pointer_cast(array_data); for (auto i = 0; i < d32_array->length(); ++i) kI(k_array)[index++] = 
tc.ArrowToKdb(d32_array->Value(i)); } -void AppendArray_DATE64(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_DATE64(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - kx::arrowkdb::TemporalConversion tc(array_data->type()); - auto d64_array = std::static_pointer_cast(array_data); + TemporalConversion tc(array_data->type()); + auto d64_array = static_pointer_cast(array_data); for (auto i = 0; i < d64_array->length(); ++i) kJ(k_array)[index++] = tc.ArrowToKdb(d64_array->Value(i)); } -void AppendArray_TIMESTAMP(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_TIMESTAMP(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - kx::arrowkdb::TemporalConversion tc(array_data->type()); - auto ts_array = std::static_pointer_cast(array_data); - auto timestamp_type = std::static_pointer_cast(ts_array->type()); + TemporalConversion tc(array_data->type()); + auto ts_array = static_pointer_cast(array_data); + auto timestamp_type = static_pointer_cast(ts_array->type()); for (auto i = 0; i < ts_array->length(); ++i) kJ(k_array)[index++] = tc.ArrowToKdb(ts_array->Value(i)); } -void AppendArray_TIME32(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_TIME32(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - kx::arrowkdb::TemporalConversion tc(array_data->type()); - auto t32_array = std::static_pointer_cast(array_data); - auto time32_type = std::static_pointer_cast(t32_array->type()); + TemporalConversion tc(array_data->type()); + auto t32_array = static_pointer_cast(array_data); + auto time32_type = static_pointer_cast(t32_array->type()); for (auto i = 0; i < t32_array->length(); ++i) kI(k_array)[index++] = tc.ArrowToKdb(t32_array->Value(i)); } -void 
AppendArray_TIME64(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_TIME64(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - kx::arrowkdb::TemporalConversion tc(array_data->type()); - auto t64_array = std::static_pointer_cast(array_data); - auto time64_type = std::static_pointer_cast(t64_array->type()); + TemporalConversion tc(array_data->type()); + auto t64_array = static_pointer_cast(array_data); + auto time64_type = static_pointer_cast(t64_array->type()); for (auto i = 0; i < t64_array->length(); ++i) kJ(k_array)[index++] = tc.ArrowToKdb(t64_array->Value(i)); } -void AppendArray_DECIMAL(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_DECIMAL(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto dec_array = std::static_pointer_cast(array_data); - auto dec_type = std::static_pointer_cast(dec_array->type()); + auto dec_array = static_pointer_cast(array_data); + auto dec_type = static_pointer_cast(dec_array->type()); for (auto i = 0; i < dec_array->length(); ++i) { auto decimal = arrow::Decimal128(dec_array->Value(i)); if (type_overrides.decimal128_as_double) { @@ -309,106 +312,106 @@ void AppendArray_DECIMAL(std::shared_ptr array_data, K k_array, si } } -void AppendArray_DURATION(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_DURATION(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - kx::arrowkdb::TemporalConversion tc(array_data->type()); - auto dur_array = std::static_pointer_cast(array_data); - auto duration_type = std::static_pointer_cast(dur_array->type()); + TemporalConversion tc(array_data->type()); + auto dur_array = static_pointer_cast(array_data); + auto duration_type = static_pointer_cast(dur_array->type()); for 
(auto i = 0; i < dur_array->length(); ++i) kJ(k_array)[index++] = tc.ArrowToKdb(dur_array->Value(i)); } -void AppendArray_INTERVAL_MONTHS(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_INTERVAL_MONTHS(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto month_array = std::static_pointer_cast(array_data); + auto month_array = static_pointer_cast(array_data); memcpy(kI(k_array), month_array->raw_values(), month_array->length() * sizeof(arrow::MonthIntervalArray::value_type)); } -void AppendArray_INTERVAL_DAY_TIME(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_INTERVAL_DAY_TIME(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto dt_array = std::static_pointer_cast(array_data); + auto dt_array = static_pointer_cast(array_data); for (auto i = 0; i < dt_array->length(); ++i) - kJ(k_array)[index++] = kx::arrowkdb::DayTimeInterval_KTimespan(dt_array->Value(i)); + kJ(k_array)[index++] = DayTimeInterval_KTimespan(dt_array->Value(i)); } -void AppendArray_LIST(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_LIST(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { AppendList(array_data, k_array, index, type_overrides); } -void AppendArray_LARGE_LIST(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_LARGE_LIST(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { AppendList(array_data, k_array, index, type_overrides); } -void AppendArray_FIXED_SIZE_LIST(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_FIXED_SIZE_LIST(shared_ptr array_data, K k_array, size_t& 
index, TypeMappingOverride& type_overrides) { AppendList(array_data, k_array, index, type_overrides); } -void AppendArray_MAP(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_MAP(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { AppendMap(array_data, k_array, index, type_overrides); } -void AppendArray_STRUCT(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_STRUCT(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { AppendStruct(array_data, k_array, index, type_overrides); } -void AppendArray_SPARSE_UNION(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_SPARSE_UNION(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { AppendUnion(array_data, k_array, index, type_overrides); } -void AppendArray_DENSE_UNION(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_DENSE_UNION(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { AppendArray_SPARSE_UNION(array_data, k_array, index, type_overrides); } -void AppendArray_DICTIONARY(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_DICTIONARY(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { AppendDictionary(array_data, k_array, index, type_overrides); } -using ArrayHandler = void (*) (std::shared_ptr, K, size_t&, kx::arrowkdb::TypeMappingOverride&); - -std::unordered_map ArrayHandlers { - std::make_pair( arrow::Type::NA, &AppendArray_NA ) - , std::make_pair( arrow::Type::BOOL, &AppendArray_BOOL ) - , std::make_pair( arrow::Type::UINT8, &AppendArray_UINT8 ) - , std::make_pair( 
arrow::Type::INT8, &AppendArray_INT8 ) - , std::make_pair( arrow::Type::UINT16, &AppendArray_UINT16 ) - , std::make_pair( arrow::Type::INT16, &AppendArray_INT16 ) - , std::make_pair( arrow::Type::UINT32, &AppendArray_UINT32 ) - , std::make_pair( arrow::Type::INT32, &AppendArray_INT32 ) - , std::make_pair( arrow::Type::UINT64, &AppendArray_UINT64 ) - , std::make_pair( arrow::Type::INT64, &AppendArray_INT64 ) - , std::make_pair( arrow::Type::HALF_FLOAT, &AppendArray_HALF_FLOAT ) - , std::make_pair( arrow::Type::FLOAT, &AppendArray_FLOAT ) - , std::make_pair( arrow::Type::DOUBLE, &AppendArray_DOUBLE ) - , std::make_pair( arrow::Type::STRING, &AppendArray_STRING ) - , std::make_pair( arrow::Type::LARGE_STRING, &AppendArray_LARGE_STRING ) - , std::make_pair( arrow::Type::BINARY, &AppendArray_BINARY ) - , std::make_pair( arrow::Type::LARGE_BINARY, &AppendArray_LARGE_BINARY ) - , std::make_pair( arrow::Type::FIXED_SIZE_BINARY, &AppendArray_FIXED_SIZE_BINARY ) - , std::make_pair( arrow::Type::DATE32, &AppendArray_DATE32 ) - , std::make_pair( arrow::Type::DATE64, &AppendArray_DATE64 ) - , std::make_pair( arrow::Type::TIMESTAMP, &AppendArray_TIMESTAMP ) - , std::make_pair( arrow::Type::TIME32, &AppendArray_TIME32 ) - , std::make_pair( arrow::Type::TIME64, &AppendArray_TIME64 ) - , std::make_pair( arrow::Type::DECIMAL, &AppendArray_DECIMAL ) - , std::make_pair( arrow::Type::DURATION, &AppendArray_DURATION ) - , std::make_pair( arrow::Type::INTERVAL_MONTHS, &AppendArray_INTERVAL_MONTHS ) - , std::make_pair( arrow::Type::INTERVAL_DAY_TIME, &AppendArray_INTERVAL_DAY_TIME ) - , std::make_pair( arrow::Type::LIST, &AppendArray_LIST ) - , std::make_pair( arrow::Type::LARGE_LIST, &AppendArray_LARGE_LIST ) - , std::make_pair( arrow::Type::FIXED_SIZE_LIST, &AppendArray_FIXED_SIZE_LIST ) - , std::make_pair( arrow::Type::MAP, &AppendArray_MAP ) - , std::make_pair( arrow::Type::STRUCT, &AppendArray_STRUCT ) - , std::make_pair( arrow::Type::SPARSE_UNION, &AppendArray_SPARSE_UNION ) - , 
std::make_pair( arrow::Type::DENSE_UNION, &AppendArray_DENSE_UNION ) - , std::make_pair( arrow::Type::DICTIONARY, &AppendArray_DICTIONARY ) +using ArrayHandler = void (*) (shared_ptr, K, size_t&, TypeMappingOverride&); + +unordered_map ArrayHandlers { + make_pair( arrow::Type::NA, &AppendArray_NA ) + , make_pair( arrow::Type::BOOL, &AppendArray_BOOL ) + , make_pair( arrow::Type::UINT8, &AppendArray_UINT8 ) + , make_pair( arrow::Type::INT8, &AppendArray_INT8 ) + , make_pair( arrow::Type::UINT16, &AppendArray_UINT16 ) + , make_pair( arrow::Type::INT16, &AppendArray_INT16 ) + , make_pair( arrow::Type::UINT32, &AppendArray_UINT32 ) + , make_pair( arrow::Type::INT32, &AppendArray_INT32 ) + , make_pair( arrow::Type::UINT64, &AppendArray_UINT64 ) + , make_pair( arrow::Type::INT64, &AppendArray_INT64 ) + , make_pair( arrow::Type::HALF_FLOAT, &AppendArray_HALF_FLOAT ) + , make_pair( arrow::Type::FLOAT, &AppendArray_FLOAT ) + , make_pair( arrow::Type::DOUBLE, &AppendArray_DOUBLE ) + , make_pair( arrow::Type::STRING, &AppendArray_STRING ) + , make_pair( arrow::Type::LARGE_STRING, &AppendArray_LARGE_STRING ) + , make_pair( arrow::Type::BINARY, &AppendArray_BINARY ) + , make_pair( arrow::Type::LARGE_BINARY, &AppendArray_LARGE_BINARY ) + , make_pair( arrow::Type::FIXED_SIZE_BINARY, &AppendArray_FIXED_SIZE_BINARY ) + , make_pair( arrow::Type::DATE32, &AppendArray_DATE32 ) + , make_pair( arrow::Type::DATE64, &AppendArray_DATE64 ) + , make_pair( arrow::Type::TIMESTAMP, &AppendArray_TIMESTAMP ) + , make_pair( arrow::Type::TIME32, &AppendArray_TIME32 ) + , make_pair( arrow::Type::TIME64, &AppendArray_TIME64 ) + , make_pair( arrow::Type::DECIMAL, &AppendArray_DECIMAL ) + , make_pair( arrow::Type::DURATION, &AppendArray_DURATION ) + , make_pair( arrow::Type::INTERVAL_MONTHS, &AppendArray_INTERVAL_MONTHS ) + , make_pair( arrow::Type::INTERVAL_DAY_TIME, &AppendArray_INTERVAL_DAY_TIME ) + , make_pair( arrow::Type::LIST, &AppendArray_LIST ) + , make_pair( arrow::Type::LARGE_LIST, 
&AppendArray_LARGE_LIST ) + , make_pair( arrow::Type::FIXED_SIZE_LIST, &AppendArray_FIXED_SIZE_LIST ) + , make_pair( arrow::Type::MAP, &AppendArray_MAP ) + , make_pair( arrow::Type::STRUCT, &AppendArray_STRUCT ) + , make_pair( arrow::Type::SPARSE_UNION, &AppendArray_SPARSE_UNION ) + , make_pair( arrow::Type::DENSE_UNION, &AppendArray_DENSE_UNION ) + , make_pair( arrow::Type::DICTIONARY, &AppendArray_DICTIONARY ) }; } // namespace @@ -416,7 +419,7 @@ std::unordered_map ArrayHandlers { namespace kx { namespace arrowkdb { -void AppendArray(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto type_id = array_data->type_id(); if( ArrayHandlers.find( type_id ) == ArrayHandlers.end() ) @@ -429,7 +432,7 @@ void AppendArray(std::shared_ptr array_data, K k_array, size_t& in } } -K InitKdbForArray(std::shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides) +K InitKdbForArray(shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides) { switch (datatype->id()) { case arrow::Type::STRUCT: @@ -459,7 +462,7 @@ K InitKdbForArray(std::shared_ptr datatype, size_t length, Type case arrow::Type::DICTIONARY: { // Arrow dictionary becomes a two item mixed list - auto dictionary_type = std::static_pointer_cast(datatype); + auto dictionary_type = static_pointer_cast(datatype); K result = ktn(0, 2); // Do not preallocate the child lists since AppendDictionary has to join to the @@ -474,7 +477,7 @@ K InitKdbForArray(std::shared_ptr datatype, size_t length, Type } } -K ReadArray(std::shared_ptr array, TypeMappingOverride& type_overrides) +K ReadArray(shared_ptr array, TypeMappingOverride& type_overrides) { K k_array = InitKdbForArray(array->type(), array->length(), type_overrides); size_t index = 0; @@ -482,7 +485,7 @@ K ReadArray(std::shared_ptr array, TypeMappingOverride& type_overr return k_array; } -K 
ReadChunkedArray(std::shared_ptr chunked_array, TypeMappingOverride& type_overrides) +K ReadChunkedArray(shared_ptr chunked_array, TypeMappingOverride& type_overrides) { K k_array = InitKdbForArray(chunked_array->type(), chunked_array->length(), type_overrides); size_t index = 0; @@ -502,19 +505,19 @@ K writeReadArray(K datatype_id, K array, K options) if (datatype_id->t != -KI) return krr((S)"datatype_id not -6h"); - auto datatype = kx::arrowkdb::GetDatatypeStore()->Find(datatype_id->i); + auto datatype = GetDatatypeStore()->Find(datatype_id->i); if (!datatype) return krr((S)"datatype not found"); // Parse the options - auto read_options = kx::arrowkdb::KdbOptions(options, kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); + auto read_options = KdbOptions(options, Options::string_options, Options::int_options); // Type mapping overrides - kx::arrowkdb::TypeMappingOverride type_overrides{ read_options }; + TypeMappingOverride type_overrides{ read_options }; - auto arrow_array = kx::arrowkdb::MakeArray(datatype, array, type_overrides); + auto arrow_array = MakeArray(datatype, array, type_overrides); - return kx::arrowkdb::ReadArray(arrow_array, type_overrides); + return ReadArray(arrow_array, type_overrides); KDB_EXCEPTION_CATCH; } From 5677fdce6901cbcd7950b08e22e17de45525c72c Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 6 Jan 2023 13:11:27 +0000 Subject: [PATCH 003/276] Generalizing arrow reader handlers --- src/ArrayReader.cpp | 186 +++++++++++++++++++++++++++----------------- 1 file changed, 115 insertions(+), 71 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 217af1e..ced60fd 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -114,14 +114,19 @@ void AppendDictionary(shared_ptr array_data, K k_array, size_t& in jv(&kK(k_array)[1], indices); } -void AppendArray_NA(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template +void AppendArray(shared_ptr 
array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides); + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto null_array = static_pointer_cast(array_data); for (auto i = 0; i < null_array->length(); ++i) kK(k_array)[index++] = knk(0); } -void AppendArray_BOOL(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto bool_array = static_pointer_cast(array_data); // BooleanArray doesn't have a bulk reader since arrow BooleanType is only 1 bit @@ -129,73 +134,85 @@ void AppendArray_BOOL(shared_ptr array_data, K k_array, size_t& in kG(k_array)[index++] = bool_array->Value(i); } -void AppendArray_UINT8(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint8_array = static_pointer_cast(array_data); memcpy(kG(k_array), uint8_array->raw_values(), uint8_array->length() * sizeof(arrow::UInt8Array::value_type)); } -void AppendArray_INT8(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto int8_array = static_pointer_cast(array_data); memcpy(kG(k_array), int8_array->raw_values(), int8_array->length() * sizeof(arrow::Int8Array::value_type)); } -void AppendArray_UINT16(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint16_array = static_pointer_cast(array_data); memcpy(kH(k_array), uint16_array->raw_values(), uint16_array->length() * sizeof(arrow::UInt16Array::value_type)); 
} -void AppendArray_INT16(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto int16_array = static_pointer_cast(array_data); memcpy(kH(k_array), int16_array->raw_values(), int16_array->length() * sizeof(arrow::Int16Array::value_type)); } -void AppendArray_UINT32(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint32_array = static_pointer_cast(array_data); memcpy(kI(k_array), uint32_array->raw_values(), uint32_array->length() * sizeof(arrow::UInt32Array::value_type)); } -void AppendArray_INT32(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto int32_array = static_pointer_cast(array_data); memcpy(kI(k_array), int32_array->raw_values(), int32_array->length() * sizeof(arrow::Int32Array::value_type)); } -void AppendArray_UINT64(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint64_array = static_pointer_cast(array_data); memcpy(kJ(k_array), uint64_array->raw_values(), uint64_array->length() * sizeof(arrow::UInt64Array::value_type)); } -void AppendArray_INT64(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto int64_array = static_pointer_cast(array_data); memcpy(kJ(k_array), int64_array->raw_values(), int64_array->length() * sizeof(arrow::Int64Array::value_type)); } -void 
AppendArray_HALF_FLOAT(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto hfl_array = static_pointer_cast(array_data); memcpy(kH(k_array), hfl_array->raw_values(), hfl_array->length() * sizeof(arrow::HalfFloatArray::value_type)); } -void AppendArray_FLOAT(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto fl_array = static_pointer_cast(array_data); memcpy(kE(k_array), fl_array->raw_values(), fl_array->length() * sizeof(arrow::FloatArray::value_type)); } -void AppendArray_DOUBLE(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto dbl_array = static_pointer_cast(array_data); memcpy(kF(k_array), dbl_array->raw_values(), dbl_array->length() * sizeof(arrow::DoubleArray::value_type)); } -void AppendArray_STRING(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto str_array = static_pointer_cast(array_data); for (auto i = 0; i < str_array->length(); ++i) { @@ -206,7 +223,8 @@ void AppendArray_STRING(shared_ptr array_data, K k_array, size_t& } } -void AppendArray_LARGE_STRING(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto str_array = static_pointer_cast(array_data); for (auto i = 0; i < str_array->length(); ++i) { @@ -217,7 +235,8 @@ void AppendArray_LARGE_STRING(shared_ptr array_data, K k_array, 
si } } -void AppendArray_BINARY(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto bin_array = static_pointer_cast(array_data); for (auto i = 0; i < bin_array->length(); ++i) { @@ -228,7 +247,8 @@ void AppendArray_BINARY(shared_ptr array_data, K k_array, size_t& } } -void AppendArray_LARGE_BINARY(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto bin_array = static_pointer_cast(array_data); for (auto i = 0; i < bin_array->length(); ++i) { @@ -239,7 +259,8 @@ void AppendArray_LARGE_BINARY(shared_ptr array_data, K k_array, si } } -void AppendArray_FIXED_SIZE_BINARY(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto fixed_bin_array = static_pointer_cast(array_data); for (auto i = 0; i < fixed_bin_array->length(); ++i) { @@ -250,7 +271,8 @@ void AppendArray_FIXED_SIZE_BINARY(shared_ptr array_data, K k_arra } } -void AppendArray_DATE32(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { TemporalConversion tc(array_data->type()); auto d32_array = static_pointer_cast(array_data); @@ -258,7 +280,8 @@ void AppendArray_DATE32(shared_ptr array_data, K k_array, size_t& kI(k_array)[index++] = tc.ArrowToKdb(d32_array->Value(i)); } -void AppendArray_DATE64(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { 
TemporalConversion tc(array_data->type()); auto d64_array = static_pointer_cast(array_data); @@ -266,7 +289,8 @@ void AppendArray_DATE64(shared_ptr array_data, K k_array, size_t& kJ(k_array)[index++] = tc.ArrowToKdb(d64_array->Value(i)); } -void AppendArray_TIMESTAMP(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { TemporalConversion tc(array_data->type()); auto ts_array = static_pointer_cast(array_data); @@ -275,7 +299,8 @@ void AppendArray_TIMESTAMP(shared_ptr array_data, K k_array, size_ kJ(k_array)[index++] = tc.ArrowToKdb(ts_array->Value(i)); } -void AppendArray_TIME32(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { TemporalConversion tc(array_data->type()); auto t32_array = static_pointer_cast(array_data); @@ -284,7 +309,8 @@ void AppendArray_TIME32(shared_ptr array_data, K k_array, size_t& kI(k_array)[index++] = tc.ArrowToKdb(t32_array->Value(i)); } -void AppendArray_TIME64(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { TemporalConversion tc(array_data->type()); auto t64_array = static_pointer_cast(array_data); @@ -293,7 +319,8 @@ void AppendArray_TIME64(shared_ptr array_data, K k_array, size_t& kJ(k_array)[index++] = tc.ArrowToKdb(t64_array->Value(i)); } -void AppendArray_DECIMAL(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto dec_array = static_pointer_cast(array_data); auto dec_type = static_pointer_cast(dec_array->type()); @@ -312,7 
+339,8 @@ void AppendArray_DECIMAL(shared_ptr array_data, K k_array, size_t& } } -void AppendArray_DURATION(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { TemporalConversion tc(array_data->type()); auto dur_array = static_pointer_cast(array_data); @@ -321,97 +349,113 @@ void AppendArray_DURATION(shared_ptr array_data, K k_array, size_t kJ(k_array)[index++] = tc.ArrowToKdb(dur_array->Value(i)); } -void AppendArray_INTERVAL_MONTHS(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto month_array = static_pointer_cast(array_data); memcpy(kI(k_array), month_array->raw_values(), month_array->length() * sizeof(arrow::MonthIntervalArray::value_type)); } -void AppendArray_INTERVAL_DAY_TIME(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto dt_array = static_pointer_cast(array_data); for (auto i = 0; i < dt_array->length(); ++i) kJ(k_array)[index++] = DayTimeInterval_KTimespan(dt_array->Value(i)); } -void AppendArray_LIST(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { AppendList(array_data, k_array, index, type_overrides); } -void AppendArray_LARGE_LIST(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { AppendList(array_data, k_array, index, type_overrides); } -void AppendArray_FIXED_SIZE_LIST(shared_ptr 
array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { AppendList(array_data, k_array, index, type_overrides); } -void AppendArray_MAP(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { AppendMap(array_data, k_array, index, type_overrides); } -void AppendArray_STRUCT(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { AppendStruct(array_data, k_array, index, type_overrides); } -void AppendArray_SPARSE_UNION(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { AppendUnion(array_data, k_array, index, type_overrides); } -void AppendArray_DENSE_UNION(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - AppendArray_SPARSE_UNION(array_data, k_array, index, type_overrides); + AppendArray(array_data, k_array, index, type_overrides); } -void AppendArray_DICTIONARY(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { AppendDictionary(array_data, k_array, index, type_overrides); } using ArrayHandler = void (*) (shared_ptr, K, size_t&, TypeMappingOverride&); +template +auto make_array_handler() +{ + return make_pair( TypeId, &AppendArray ); +} + unordered_map ArrayHandlers { - make_pair( 
arrow::Type::NA, &AppendArray_NA ) - , make_pair( arrow::Type::BOOL, &AppendArray_BOOL ) - , make_pair( arrow::Type::UINT8, &AppendArray_UINT8 ) - , make_pair( arrow::Type::INT8, &AppendArray_INT8 ) - , make_pair( arrow::Type::UINT16, &AppendArray_UINT16 ) - , make_pair( arrow::Type::INT16, &AppendArray_INT16 ) - , make_pair( arrow::Type::UINT32, &AppendArray_UINT32 ) - , make_pair( arrow::Type::INT32, &AppendArray_INT32 ) - , make_pair( arrow::Type::UINT64, &AppendArray_UINT64 ) - , make_pair( arrow::Type::INT64, &AppendArray_INT64 ) - , make_pair( arrow::Type::HALF_FLOAT, &AppendArray_HALF_FLOAT ) - , make_pair( arrow::Type::FLOAT, &AppendArray_FLOAT ) - , make_pair( arrow::Type::DOUBLE, &AppendArray_DOUBLE ) - , make_pair( arrow::Type::STRING, &AppendArray_STRING ) - , make_pair( arrow::Type::LARGE_STRING, &AppendArray_LARGE_STRING ) - , make_pair( arrow::Type::BINARY, &AppendArray_BINARY ) - , make_pair( arrow::Type::LARGE_BINARY, &AppendArray_LARGE_BINARY ) - , make_pair( arrow::Type::FIXED_SIZE_BINARY, &AppendArray_FIXED_SIZE_BINARY ) - , make_pair( arrow::Type::DATE32, &AppendArray_DATE32 ) - , make_pair( arrow::Type::DATE64, &AppendArray_DATE64 ) - , make_pair( arrow::Type::TIMESTAMP, &AppendArray_TIMESTAMP ) - , make_pair( arrow::Type::TIME32, &AppendArray_TIME32 ) - , make_pair( arrow::Type::TIME64, &AppendArray_TIME64 ) - , make_pair( arrow::Type::DECIMAL, &AppendArray_DECIMAL ) - , make_pair( arrow::Type::DURATION, &AppendArray_DURATION ) - , make_pair( arrow::Type::INTERVAL_MONTHS, &AppendArray_INTERVAL_MONTHS ) - , make_pair( arrow::Type::INTERVAL_DAY_TIME, &AppendArray_INTERVAL_DAY_TIME ) - , make_pair( arrow::Type::LIST, &AppendArray_LIST ) - , make_pair( arrow::Type::LARGE_LIST, &AppendArray_LARGE_LIST ) - , make_pair( arrow::Type::FIXED_SIZE_LIST, &AppendArray_FIXED_SIZE_LIST ) - , make_pair( arrow::Type::MAP, &AppendArray_MAP ) - , make_pair( arrow::Type::STRUCT, &AppendArray_STRUCT ) - , make_pair( arrow::Type::SPARSE_UNION, 
&AppendArray_SPARSE_UNION ) - , make_pair( arrow::Type::DENSE_UNION, &AppendArray_DENSE_UNION ) - , make_pair( arrow::Type::DICTIONARY, &AppendArray_DICTIONARY ) + make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() }; } // namespace From 5caed5d59dc0f503317e3dcc7b623ada0b1bd52b Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 6 Jan 2023 13:13:47 +0000 Subject: [PATCH 004/276] Ignoring service files --- .gitignore | 2 ++ tests/.gitignore | 1 + 2 files changed, 3 insertions(+) create mode 100644 .gitignore create mode 100644 tests/.gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..18f4d15 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +arrowkdb.code-workspace +build/ diff --git a/tests/.gitignore b/tests/.gitignore new file mode 100644 index 0000000..492b6a4 --- /dev/null +++ b/tests/.gitignore @@ -0,0 +1 @@ +test.q From b17c24d972f3495fd50a9f8c9dc7190789353ed9 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 6 Jan 2023 14:35:29 +0000 Subject: [PATCH 005/276] Array builder decomposition --- src/ArrayWriter.cpp | 424 ++++++++++++++++++++++++++++++++------------ 1 file changed, 309 insertions(+), 115 
deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index 602c764..77ca309 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -13,9 +14,306 @@ #include "HelperFunctions.h" #include "TypeCheck.h" +using namespace std; +using namespace kx::arrowkdb; -namespace kx { -namespace arrowkdb { +namespace +{ + +std::shared_ptr GetBuilder(std::shared_ptr datatype); + +template +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool); + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> 
+shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(datatype, pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(datatype, pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(datatype, pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(datatype, pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(datatype, pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(datatype, pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + // The parent list datatype details 
the child datatype so construct the child + // builder and use it to initialise the parent list builder + auto list_type = std::static_pointer_cast(datatype); + auto value_builder = GetBuilder(list_type->value_type()); + + // Construct the correct listbuilder + if (datatype->id() == arrow::Type::LIST) + return std::make_shared(pool, value_builder); + else if (datatype->id() == arrow::Type::LARGE_LIST) + return std::make_shared(pool, value_builder); + else + return std::make_shared(pool, value_builder, datatype); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return GetBuilder( datatype, pool ); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return GetBuilder( datatype, pool ); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + // The parent map datatype details the key/item child datatypes so construct + // builders for both and use these to initialise the parent map builder + auto map_type = std::static_pointer_cast(datatype); + auto key_builder = GetBuilder(map_type->key_type()); + auto item_builder = GetBuilder(map_type->item_type()); + return std::make_shared(pool, key_builder, item_builder); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + auto struct_type = std::static_pointer_cast(datatype); + + // Iterate through all the fields in the struct constructing and adding each + // field's builder into a vector + auto fields = struct_type->fields(); + std::vector> field_builders; + for (auto field : fields) + field_builders.push_back(GetBuilder(field->type())); + + // Construct the parent struct builder from this vector of all the child + // builders + return std::make_shared(datatype, pool, field_builders); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + auto union_type = std::static_pointer_cast(datatype); + + // Iterate through all the fields in the 
union constructing and adding each + // field's builder into a vector + auto fields = union_type->fields(); + std::vector> field_builders; + for (auto field : fields) + field_builders.push_back(GetBuilder(field->type())); + + // Construct the parent union builder from this vector of all the child + // builders + if (datatype->id() == arrow::Type::SPARSE_UNION) + return std::make_shared(pool, field_builders, datatype); + else + return std::make_shared(pool, field_builders, datatype); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return GetBuilder( datatype, pool ); +} + +using BuilderHandler = shared_ptr ( * ) ( shared_ptr, arrow::MemoryPool* ); + +template +auto make_builder_handler() +{ + return make_pair( TypeId, &GetBuilder ); +} + +unordered_map BuilderHandlers { + make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() +}; // Constructs and returns the correct arrow array builder for the specified // datatype. @@ -23,126 +321,22 @@ namespace arrowkdb { // This handles all datatypes except Dictionary which is handled separately. 
std::shared_ptr GetBuilder(std::shared_ptr datatype) { + auto type_id = datatype->id(); arrow::MemoryPool* pool = arrow::default_memory_pool(); - switch (datatype->id()) { - case arrow::Type::NA: - return std::make_shared(pool); - case arrow::Type::BOOL: - return std::make_shared(pool); - case arrow::Type::UINT8: - return std::make_shared(pool); - case arrow::Type::INT8: - return std::make_shared(pool); - case arrow::Type::UINT16: - return std::make_shared(pool); - case arrow::Type::INT16: - return std::make_shared(pool); - case arrow::Type::UINT32: - return std::make_shared(pool); - case arrow::Type::INT32: - return std::make_shared(pool); - case arrow::Type::UINT64: - return std::make_shared(pool); - case arrow::Type::INT64: - return std::make_shared(pool); - case arrow::Type::HALF_FLOAT: - return std::make_shared(pool); - case arrow::Type::FLOAT: - return std::make_shared(pool); - case arrow::Type::DOUBLE: - return std::make_shared(pool); - case arrow::Type::STRING: - return std::make_shared(pool); - case arrow::Type::LARGE_STRING: - return std::make_shared(pool); - case arrow::Type::BINARY: - return std::make_shared(pool); - case arrow::Type::LARGE_BINARY: - return std::make_shared(pool); - case arrow::Type::FIXED_SIZE_BINARY: - return std::make_shared(datatype, pool); - case arrow::Type::DATE32: - return std::make_shared(pool); - case arrow::Type::DATE64: - return std::make_shared(pool); - case arrow::Type::TIMESTAMP: - return std::make_shared(datatype, pool); - case arrow::Type::TIME32: - return std::make_shared(datatype, pool); - case arrow::Type::TIME64: - return std::make_shared(datatype, pool); - case arrow::Type::DECIMAL: - return std::make_shared(datatype, pool); - case arrow::Type::DURATION: - return std::make_shared(datatype, pool); - case arrow::Type::INTERVAL_MONTHS: - return std::make_shared(pool); - case arrow::Type::INTERVAL_DAY_TIME: - return std::make_shared(pool); - case arrow::Type::LIST: - case arrow::Type::LARGE_LIST: - case 
arrow::Type::FIXED_SIZE_LIST: + if( BuilderHandlers.find( type_id ) == BuilderHandlers.end() ) { - // The parent list datatype details the child datatype so construct the child - // builder and use it to initialise the parent list builder - auto list_type = std::static_pointer_cast(datatype); - auto value_builder = GetBuilder(list_type->value_type()); - - // Construct the correct listbuilder - if (datatype->id() == arrow::Type::LIST) - return std::make_shared(pool, value_builder); - else if (datatype->id() == arrow::Type::LARGE_LIST) - return std::make_shared(pool, value_builder); - else - return std::make_shared(pool, value_builder, datatype); + TYPE_CHECK_UNSUPPORTED(datatype->ToString()); } - case arrow::Type::MAP: + else { - // The parent map datatype details the key/item child datatypes so construct - // builders for both and use these to initialise the parent map builder - auto map_type = std::static_pointer_cast(datatype); - auto key_builder = GetBuilder(map_type->key_type()); - auto item_builder = GetBuilder(map_type->item_type()); - return std::make_shared(pool, key_builder, item_builder); + return BuilderHandlers[type_id]( datatype, pool ); } - case arrow::Type::STRUCT: - { - auto struct_type = std::static_pointer_cast(datatype); +} - // Iterate through all the fields in the struct constructing and adding each - // field's builder into a vector - auto fields = struct_type->fields(); - std::vector> field_builders; - for (auto field : fields) - field_builders.push_back(GetBuilder(field->type())); +} // namespace - // Construct the parent struct builder from this vector of all the child - // builders - return std::make_shared(datatype, pool, field_builders); - } - case arrow::Type::SPARSE_UNION: - case arrow::Type::DENSE_UNION: - { - auto union_type = std::static_pointer_cast(datatype); - - // Iterate through all the fields in the union constructing and adding each - // field's builder into a vector - auto fields = union_type->fields(); - std::vector> 
field_builders; - for (auto field : fields) - field_builders.push_back(GetBuilder(field->type())); - - // Construct the parent union builder from this vector of all the child - // builders - if (datatype->id() == arrow::Type::SPARSE_UNION) - return std::make_shared(pool, field_builders, datatype); - else - return std::make_shared(pool, field_builders, datatype); - } - default: - TYPE_CHECK_UNSUPPORTED(datatype->ToString()); - } -} +namespace kx { +namespace arrowkdb { // Populate a list/large_list/fixed_size_list builder // From 07d737a65bc157ab8ee220f2864cf9adf88455ef Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 6 Jan 2023 15:35:42 +0000 Subject: [PATCH 006/276] Populate builder decomposition --- src/ArrayWriter.cpp | 845 +++++++++++++++++++++++++------------------- 1 file changed, 474 insertions(+), 371 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index 77ca309..ad05e33 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -20,7 +20,7 @@ using namespace kx::arrowkdb; namespace { -std::shared_ptr GetBuilder(std::shared_ptr datatype); +shared_ptr GetBuilder(shared_ptr datatype); template shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool); @@ -28,163 +28,163 @@ shared_ptr GetBuilder(shared_ptr datatype, template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return 
make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(datatype, pool); + return make_shared(datatype, pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - 
return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(datatype, pool); + return make_shared(datatype, pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(datatype, pool); + return make_shared(datatype, pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(datatype, pool); + return make_shared(datatype, pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(datatype, pool); + return make_shared(datatype, pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(datatype, pool); + return make_shared(datatype, pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> @@ -192,16 +192,16 @@ shared_ptr GetBuilder(shared_ptr(datatype); + auto list_type = static_pointer_cast(datatype); auto value_builder = GetBuilder(list_type->value_type()); // Construct the correct listbuilder if (datatype->id() == arrow::Type::LIST) - return std::make_shared(pool, value_builder); + return make_shared(pool, value_builder); else if (datatype->id() == arrow::Type::LARGE_LIST) - return std::make_shared(pool, value_builder); + return make_shared(pool, value_builder); else - return std::make_shared(pool, value_builder, datatype); + return make_shared(pool, value_builder, datatype); } template<> @@ -221,47 +221,47 @@ shared_ptr 
GetBuilder(shared_ptr(datatype); + auto map_type = static_pointer_cast(datatype); auto key_builder = GetBuilder(map_type->key_type()); auto item_builder = GetBuilder(map_type->item_type()); - return std::make_shared(pool, key_builder, item_builder); + return make_shared(pool, key_builder, item_builder); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - auto struct_type = std::static_pointer_cast(datatype); + auto struct_type = static_pointer_cast(datatype); // Iterate through all the fields in the struct constructing and adding each // field's builder into a vector auto fields = struct_type->fields(); - std::vector> field_builders; + vector> field_builders; for (auto field : fields) field_builders.push_back(GetBuilder(field->type())); // Construct the parent struct builder from this vector of all the child // builders - return std::make_shared(datatype, pool, field_builders); + return make_shared(datatype, pool, field_builders); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - auto union_type = std::static_pointer_cast(datatype); + auto union_type = static_pointer_cast(datatype); // Iterate through all the fields in the union constructing and adding each // field's builder into a vector auto fields = union_type->fields(); - std::vector> field_builders; + vector> field_builders; for (auto field : fields) field_builders.push_back(GetBuilder(field->type())); // Construct the parent union builder from this vector of all the child // builders if (datatype->id() == arrow::Type::SPARSE_UNION) - return std::make_shared(pool, field_builders, datatype); + return make_shared(pool, field_builders, datatype); else - return std::make_shared(pool, field_builders, datatype); + return make_shared(pool, field_builders, datatype); } template<> @@ -319,7 +319,7 @@ unordered_map BuilderHandlers { // datatype. // // This handles all datatypes except Dictionary which is handled separately. 
-std::shared_ptr GetBuilder(std::shared_ptr datatype) +shared_ptr GetBuilder(shared_ptr datatype) { auto type_id = datatype->id(); arrow::MemoryPool* pool = arrow::default_memory_pool(); @@ -335,8 +335,8 @@ std::shared_ptr GetBuilder(std::shared_ptr } // namespace -namespace kx { -namespace arrowkdb { +namespace +{ // Populate a list/large_list/fixed_size_list builder // @@ -344,7 +344,7 @@ namespace arrowkdb { // kdb as a mixed list for the parent list array containing a set of sub-lists, // one for each of the list value sets. template -void PopulateListBuilder(std::shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +void PopulateListBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { // Get the value builder from the parent list builder auto list_builder = static_cast(builder); @@ -361,7 +361,7 @@ void PopulateListBuilder(std::shared_ptr datatype, K k_array, a if (datatype->id() == arrow::Type::FIXED_SIZE_LIST) { // Check each sub-list is the same length as the fixed size K list_data = kK(k_array)[i]; - auto fixed_list_type = std::static_pointer_cast(datatype); + auto fixed_list_type = static_pointer_cast(datatype); TYPE_CHECK_LENGTH(fixed_list_type->list_size() != list_data->n, datatype->ToString(), fixed_list_type->list_size(), list_data->n); } @@ -376,12 +376,12 @@ void PopulateListBuilder(std::shared_ptr datatype, K k_array, a // additional type id array which identifies the live field in each union value // set. 
template -void PopulateUnionBuilder(std::shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +void PopulateUnionBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { // Check that the mixed list length is at least one greater (the additional // first sub-list contains the union type_ids) than the number of union // fields - auto union_type = std::static_pointer_cast(datatype); + auto union_type = static_pointer_cast(datatype); const auto min_length = union_type->num_fields() + 1; TYPE_CHECK_LENGTH(min_length > k_array->n, datatype->ToString(), min_length, k_array->n); @@ -393,7 +393,7 @@ void PopulateUnionBuilder(std::shared_ptr datatype, K k_array, // Get all the child builders from the parent union builder auto union_builder = static_cast(builder); - std::vector> child_builders; + vector> child_builders; for (auto i = 0; i < union_builder->num_children(); ++i) child_builders.push_back(union_builder->child_builder(i)); @@ -420,364 +420,467 @@ void PopulateUnionBuilder(std::shared_ptr datatype, K k_array, throw TypeCheck("Mismatched union list lengths"); } -// Populates data values from a kdb list into the specified array builder. 
-void PopulateBuilder(std::shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +template +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides); + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { - // Special cases for: - // symbol - string or large_string - // guid - fixed_size_binary(16) - // char - uint8 - bool is_symbol = k_array->t == KS && (datatype->id() == arrow::Type::STRING || datatype->id() == arrow::Type::LARGE_STRING); - bool is_guid = k_array->t == UU && datatype->id() == arrow::Type::FIXED_SIZE_BINARY && static_cast(builder)->byte_width() == sizeof(U); - bool is_char = k_array->t == KC && (datatype->id() == arrow::Type::UINT8 || datatype->id() == arrow::Type::INT8); + auto null_builder = static_cast(builder); + PARQUET_THROW_NOT_OK(null_builder->AppendNulls(k_array->n)); +} - // Type check the kdb structure - if (!is_symbol && !is_guid && !is_char) - TYPE_CHECK_ARRAY(kx::arrowkdb::GetKdbType(datatype, type_overrides) != k_array->t, datatype->ToString(), kx::arrowkdb::GetKdbType(datatype, type_overrides), k_array->t); +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto bool_builder = static_cast(builder); + PARQUET_THROW_NOT_OK(bool_builder->AppendValues((uint8_t*)kG(k_array), k_array->n)); +} - switch (datatype->id()) { - case arrow::Type::NA: - { - auto null_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(null_builder->AppendNulls(k_array->n)); - break; - } - case arrow::Type::BOOL: - { - auto bool_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(bool_builder->AppendValues((uint8_t*)kG(k_array), k_array->n)); - break; - } - case arrow::Type::UINT8: - { - auto uint8_builder = static_cast(builder); - 
PARQUET_THROW_NOT_OK(uint8_builder->AppendValues((uint8_t*)kG(k_array), k_array->n)); - break; - } - case arrow::Type::INT8: - { - auto int8_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(int8_builder->AppendValues((int8_t*)kG(k_array), k_array->n)); - break; - } - case arrow::Type::UINT16: - { - auto uint16_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(uint16_builder->AppendValues((uint16_t*)kH(k_array), k_array->n)); - arrow::Status s; - break; - } - case arrow::Type::INT16: - { - auto int16_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(int16_builder->AppendValues((int16_t*)kH(k_array), k_array->n)); - break; - } - case arrow::Type::UINT32: - { - auto uint32_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(uint32_builder->AppendValues((uint32_t*)kI(k_array), k_array->n)); - break; - } - case arrow::Type::INT32: - { - auto int32_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(int32_builder->AppendValues((int32_t*)kI(k_array), k_array->n)); - break; - } - case arrow::Type::UINT64: - { - auto uint64_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(uint64_builder->AppendValues((uint64_t*)kJ(k_array), k_array->n)); - break; - } - case arrow::Type::INT64: - { - auto int64_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(int64_builder->AppendValues((int64_t*)kJ(k_array), k_array->n)); - break; - } - case arrow::Type::HALF_FLOAT: - { - arrow::HalfFloatType hft; - auto hfl_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(hfl_builder->AppendValues((uint16_t*)kH(k_array), k_array->n)); - break; - } - case arrow::Type::FLOAT: - { - auto fl_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(fl_builder->AppendValues(kE(k_array), k_array->n)); - break; - } - case arrow::Type::DOUBLE: - { - auto dbl_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(dbl_builder->AppendValues(kF(k_array), k_array->n)); - break; - } - case arrow::Type::STRING: - { - auto str_builder = static_cast(builder); - if (is_symbol) { - // 
Populate from symbol list - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(str_builder->Append(kS(k_array)[i])); - } else { - // Populate from mixed list of char lists - for (auto i = 0; i < k_array->n; ++i) { - K str_data = kK(k_array)[i]; - TYPE_CHECK_ITEM(str_data->t != KC, datatype->ToString(), KC, str_data->t); - PARQUET_THROW_NOT_OK(str_builder->Append(kG(str_data), str_data->n)); - } - } - break; - } - case arrow::Type::LARGE_STRING: - { - auto str_builder = static_cast(builder); - if (is_symbol) { - // Populate from symbol list - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(str_builder->Append(kS(k_array)[i])); - } else { - // Populate from mixed list of char lists - for (auto i = 0; i < k_array->n; ++i) { - K str_data = kK(k_array)[i]; - TYPE_CHECK_ITEM(str_data->t != KC, datatype->ToString(), KC, str_data->t); - PARQUET_THROW_NOT_OK(str_builder->Append(kG(str_data), str_data->n)); - } - } - break; - } - case arrow::Type::BINARY: - { - auto bin_builder = static_cast(builder); +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto uint8_builder = static_cast(builder); + PARQUET_THROW_NOT_OK(uint8_builder->AppendValues((uint8_t*)kG(k_array), k_array->n)); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto int8_builder = static_cast(builder); + PARQUET_THROW_NOT_OK(int8_builder->AppendValues((int8_t*)kG(k_array), k_array->n)); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto uint16_builder = static_cast(builder); + PARQUET_THROW_NOT_OK(uint16_builder->AppendValues((uint16_t*)kH(k_array), k_array->n)); + arrow::Status s; +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& 
type_overrides) +{ + auto int16_builder = static_cast(builder); + PARQUET_THROW_NOT_OK(int16_builder->AppendValues((int16_t*)kH(k_array), k_array->n)); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto uint32_builder = static_cast(builder); + PARQUET_THROW_NOT_OK(uint32_builder->AppendValues((uint32_t*)kI(k_array), k_array->n)); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto int32_builder = static_cast(builder); + PARQUET_THROW_NOT_OK(int32_builder->AppendValues((int32_t*)kI(k_array), k_array->n)); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto uint64_builder = static_cast(builder); + PARQUET_THROW_NOT_OK(uint64_builder->AppendValues((uint64_t*)kJ(k_array), k_array->n)); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto int64_builder = static_cast(builder); + PARQUET_THROW_NOT_OK(int64_builder->AppendValues((int64_t*)kJ(k_array), k_array->n)); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + arrow::HalfFloatType hft; + auto hfl_builder = static_cast(builder); + PARQUET_THROW_NOT_OK(hfl_builder->AppendValues((uint16_t*)kH(k_array), k_array->n)); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto fl_builder = static_cast(builder); + PARQUET_THROW_NOT_OK(fl_builder->AppendValues(kE(k_array), k_array->n)); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto dbl_builder = static_cast(builder); + 
PARQUET_THROW_NOT_OK(dbl_builder->AppendValues(kF(k_array), k_array->n)); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + bool is_symbol = k_array->t == KS && (datatype->id() == arrow::Type::STRING || datatype->id() == arrow::Type::LARGE_STRING); + auto str_builder = static_cast(builder); + if (is_symbol) { + // Populate from symbol list + for (auto i = 0; i < k_array->n; ++i) + PARQUET_THROW_NOT_OK(str_builder->Append(kS(k_array)[i])); + } else { + // Populate from mixed list of char lists for (auto i = 0; i < k_array->n; ++i) { - K bin_data = kK(k_array)[i]; - TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); - PARQUET_THROW_NOT_OK(bin_builder->Append(kG(bin_data), bin_data->n)); + K str_data = kK(k_array)[i]; + TYPE_CHECK_ITEM(str_data->t != KC, datatype->ToString(), KC, str_data->t); + PARQUET_THROW_NOT_OK(str_builder->Append(kG(str_data), str_data->n)); } - break; } - case arrow::Type::LARGE_BINARY: - { - auto bin_builder = static_cast(builder); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + bool is_symbol = k_array->t == KS && (datatype->id() == arrow::Type::STRING || datatype->id() == arrow::Type::LARGE_STRING); + auto str_builder = static_cast(builder); + if (is_symbol) { + // Populate from symbol list + for (auto i = 0; i < k_array->n; ++i) + PARQUET_THROW_NOT_OK(str_builder->Append(kS(k_array)[i])); + } else { + // Populate from mixed list of char lists for (auto i = 0; i < k_array->n; ++i) { - K bin_data = kK(k_array)[i]; - TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); - PARQUET_THROW_NOT_OK(bin_builder->Append(kG(bin_data), bin_data->n)); + K str_data = kK(k_array)[i]; + TYPE_CHECK_ITEM(str_data->t != KC, datatype->ToString(), KC, str_data->t); + PARQUET_THROW_NOT_OK(str_builder->Append(kG(str_data), 
str_data->n)); } - break; } - case arrow::Type::FIXED_SIZE_BINARY: - { - auto fixed_bin_builder = static_cast(builder); - if (is_guid) { - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(fixed_bin_builder->Append((char*)&kU(k_array)[i])); - } else { - for (auto i = 0; i < k_array->n; ++i) { - K bin_data = kK(k_array)[i]; - TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); - TYPE_CHECK_LENGTH(fixed_bin_builder->byte_width() != bin_data->n, builder->type()->ToString(), fixed_bin_builder->byte_width(), bin_data->n); - PARQUET_THROW_NOT_OK(fixed_bin_builder->Append(kG(bin_data))); - } - } - break; - } - case arrow::Type::DATE32: - { - TemporalConversion tc(datatype); - auto d32_builder = static_cast(builder); - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(d32_builder->Append(tc.KdbToArrow(kI(k_array)[i]))); - break; - } - case arrow::Type::DATE64: - { - TemporalConversion tc(datatype); - auto d64_builder = static_cast(builder); - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(d64_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); - break; - } - case arrow::Type::TIMESTAMP: - { - TemporalConversion tc(datatype); - auto ts_builder = static_cast(builder); - auto timestamp_type = std::static_pointer_cast(datatype); - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(ts_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); - break; +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto bin_builder = static_cast(builder); + for (auto i = 0; i < k_array->n; ++i) { + K bin_data = kK(k_array)[i]; + TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); + PARQUET_THROW_NOT_OK(bin_builder->Append(kG(bin_data), bin_data->n)); } - case arrow::Type::TIME32: - { - TemporalConversion tc(datatype); - auto t32_builder = static_cast(builder); - auto time32_type = std::static_pointer_cast(datatype); - for 
(auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(t32_builder->Append(tc.KdbToArrow(kI(k_array)[i]))); - break; +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto bin_builder = static_cast(builder); + for (auto i = 0; i < k_array->n; ++i) { + K bin_data = kK(k_array)[i]; + TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); + PARQUET_THROW_NOT_OK(bin_builder->Append(kG(bin_data), bin_data->n)); } - case arrow::Type::TIME64: - { - TemporalConversion tc(datatype); - auto t64_builder = static_cast(builder); - auto time64_type = std::static_pointer_cast(datatype); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + bool is_guid = k_array->t == UU && datatype->id() == arrow::Type::FIXED_SIZE_BINARY && static_cast(builder)->byte_width() == sizeof(U); + auto fixed_bin_builder = static_cast(builder); + if (is_guid) { for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(t64_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); - break; - } - case arrow::Type::DECIMAL: - { - auto dec_builder = static_cast(builder); - auto dec_type = std::static_pointer_cast(datatype); + PARQUET_THROW_NOT_OK(fixed_bin_builder->Append((char*)&kU(k_array)[i])); + } else { for (auto i = 0; i < k_array->n; ++i) { - if (type_overrides.decimal128_as_double) { - // Construct the decimal from a double - arrow::Decimal128 dec128; - PARQUET_ASSIGN_OR_THROW(dec128, arrow::Decimal128::FromReal(kF(k_array)[i], dec_type->precision(), dec_type->scale())); - PARQUET_THROW_NOT_OK(dec_builder->Append(dec128)); - } else { - // Each decimal is a list of 16 bytes - K k_dec = kK(k_array)[i]; - TYPE_CHECK_LENGTH(k_dec->n != 16, datatype->ToString(), 16, k_dec->n); - TYPE_CHECK_ITEM(k_dec->t != KG, datatype->ToString(), KG, k_dec->t); - - arrow::Decimal128 dec128((const uint8_t*)kG(k_dec)); - 
PARQUET_THROW_NOT_OK(dec_builder->Append(dec128)); - } + K bin_data = kK(k_array)[i]; + TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); + TYPE_CHECK_LENGTH(fixed_bin_builder->byte_width() != bin_data->n, builder->type()->ToString(), fixed_bin_builder->byte_width(), bin_data->n); + PARQUET_THROW_NOT_OK(fixed_bin_builder->Append(kG(bin_data))); } - break; } - case arrow::Type::DURATION: - { - TemporalConversion tc(datatype); - auto dur_builder = static_cast(builder); - auto duration_type = std::static_pointer_cast(datatype); - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(dur_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); - break; - } - case arrow::Type::INTERVAL_MONTHS: - { - auto month_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(month_builder->AppendValues((int32_t*)kI(k_array), k_array->n)); - break; - } - case arrow::Type::INTERVAL_DAY_TIME: - { - auto dt_builder = static_cast(builder); - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(dt_builder->Append(KTimespan_DayTimeInterval(kJ(k_array)[i]))); - break; - } - case arrow::Type::LIST: - PopulateListBuilder(datatype, k_array, builder, type_overrides); - break; - case arrow::Type::LARGE_LIST: - PopulateListBuilder(datatype, k_array, builder, type_overrides); - break; - case arrow::Type::FIXED_SIZE_LIST: - PopulateListBuilder(datatype, k_array, builder, type_overrides); - break; - case arrow::Type::MAP: - { - // An arrow map array is a nested set of key/item paired child arrays. This - // is represented in kdb as a mixed list for the parent map array, with a - // dictionary for each map value set. 
- // - // Get the key and item builders from the parent map builder - auto map_builder = static_cast(builder); - auto key_builder = map_builder->key_builder(); - auto item_builder = map_builder->item_builder(); +} - for (auto i = 0; i < k_array->n; ++i) { - // Ignore any mixed list items set to :: - if (kK(k_array)[i]->t == 101) - continue; - - // Delimit the start/end of each child map set - map_builder->Append(); - - // Populate the child builders for this map set from the dictionary key/value lists - auto k_dict = kK(k_array)[i]; - TYPE_CHECK_ITEM(99 != k_dict->t, datatype->ToString(), 99, k_dict->t); - PopulateBuilder(key_builder->type(), kK(k_dict)[0], key_builder, type_overrides); - PopulateBuilder(item_builder->type(), kK(k_dict)[1], item_builder, type_overrides); +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + TemporalConversion tc(datatype); + auto d32_builder = static_cast(builder); + for (auto i = 0; i < k_array->n; ++i) + PARQUET_THROW_NOT_OK(d32_builder->Append(tc.KdbToArrow(kI(k_array)[i]))); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + TemporalConversion tc(datatype); + auto d64_builder = static_cast(builder); + for (auto i = 0; i < k_array->n; ++i) + PARQUET_THROW_NOT_OK(d64_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + TemporalConversion tc(datatype); + auto ts_builder = static_cast(builder); + auto timestamp_type = static_pointer_cast(datatype); + for (auto i = 0; i < k_array->n; ++i) + PARQUET_THROW_NOT_OK(ts_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + TemporalConversion 
tc(datatype); + auto t32_builder = static_cast(builder); + auto time32_type = static_pointer_cast(datatype); + for (auto i = 0; i < k_array->n; ++i) + PARQUET_THROW_NOT_OK(t32_builder->Append(tc.KdbToArrow(kI(k_array)[i]))); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + TemporalConversion tc(datatype); + auto t64_builder = static_cast(builder); + auto time64_type = static_pointer_cast(datatype); + for (auto i = 0; i < k_array->n; ++i) + PARQUET_THROW_NOT_OK(t64_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto dec_builder = static_cast(builder); + auto dec_type = static_pointer_cast(datatype); + for (auto i = 0; i < k_array->n; ++i) { + if (type_overrides.decimal128_as_double) { + // Construct the decimal from a double + arrow::Decimal128 dec128; + PARQUET_ASSIGN_OR_THROW(dec128, arrow::Decimal128::FromReal(kF(k_array)[i], dec_type->precision(), dec_type->scale())); + PARQUET_THROW_NOT_OK(dec_builder->Append(dec128)); + } else { + // Each decimal is a list of 16 bytes + K k_dec = kK(k_array)[i]; + TYPE_CHECK_LENGTH(k_dec->n != 16, datatype->ToString(), 16, k_dec->n); + TYPE_CHECK_ITEM(k_dec->t != KG, datatype->ToString(), KG, k_dec->t); + + arrow::Decimal128 dec128((const uint8_t*)kG(k_dec)); + PARQUET_THROW_NOT_OK(dec_builder->Append(dec128)); } - break; } +} - case arrow::Type::STRUCT: - { - // An arrow struct array is a logical grouping of child arrays with each - // child array corresponding to one of the fields in the struct. A single - // struct value is obtaining by slicing across all the child arrays at a - // given index. This is represented in kdb as a mixed list for the parent - // struct array, containing child lists for each field in the struct. 
- // - // Check that the mixed list length is at least equal to the number of struct fields - auto struct_type = std::static_pointer_cast(datatype); - TYPE_CHECK_LENGTH(struct_type->num_fields() > k_array->n, datatype->ToString(), struct_type->num_fields(), k_array->n); - - // Get all the field builders from the parent struct builder - auto struct_builder = static_cast(builder); - std::vector field_builders; - for (auto i = 0; i < struct_builder->num_fields(); ++i) - field_builders.push_back(struct_builder->field_builder(i)); - - // Delimit each struct value in the parent builder - for (auto index = 0; index < kK(k_array)[0]->n; ++index) - struct_builder->Append(); - - // Populate each of the field builders from its kdb list. Only count up to - // the number of struct fields. Additional trailing data in the kdb mixed - // list is ignored (to allow for ::) - for (auto i = 0; i < struct_type->num_fields(); ++i) - PopulateBuilder(field_builders[i]->type(), kK(k_array)[i], field_builders[i], type_overrides); - - // Check that all the populated field builders have the same length. 
- for (auto it : field_builders) - if (it->length() != struct_builder->length()) - throw TypeCheck("Mismatched struct list lengths"); - - break; +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + TemporalConversion tc(datatype); + auto dur_builder = static_cast(builder); + auto duration_type = static_pointer_cast(datatype); + for (auto i = 0; i < k_array->n; ++i) + PARQUET_THROW_NOT_OK(dur_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto month_builder = static_cast(builder); + PARQUET_THROW_NOT_OK(month_builder->AppendValues((int32_t*)kI(k_array), k_array->n)); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto dt_builder = static_cast(builder); + for (auto i = 0; i < k_array->n; ++i) + PARQUET_THROW_NOT_OK(dt_builder->Append(KTimespan_DayTimeInterval(kJ(k_array)[i]))); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + PopulateListBuilder(datatype, k_array, builder, type_overrides); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + PopulateListBuilder(datatype, k_array, builder, type_overrides); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + PopulateListBuilder(datatype, k_array, builder, type_overrides); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + // An arrow map array is a nested set of key/item paired child arrays. 
This + // is represented in kdb as a mixed list for the parent map array, with a + // dictionary for each map value set. + // + // Get the key and item builders from the parent map builder + auto map_builder = static_cast(builder); + auto key_builder = map_builder->key_builder(); + auto item_builder = map_builder->item_builder(); + + for (auto i = 0; i < k_array->n; ++i) { + // Ignore any mixed list items set to :: + if (kK(k_array)[i]->t == 101) + continue; + + // Delimit the start/end of each child map set + map_builder->Append(); + + // Populate the child builders for this map set from the dictionary key/value lists + auto k_dict = kK(k_array)[i]; + TYPE_CHECK_ITEM(99 != k_dict->t, datatype->ToString(), 99, k_dict->t); + PopulateBuilder(key_builder->type(), kK(k_dict)[0], key_builder, type_overrides); + PopulateBuilder(item_builder->type(), kK(k_dict)[1], item_builder, type_overrides); } - case arrow::Type::SPARSE_UNION: - PopulateUnionBuilder(datatype, k_array, builder, type_overrides); - break; - case arrow::Type::DENSE_UNION: - PopulateUnionBuilder(datatype, k_array, builder, type_overrides); - break; - default: +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + // An arrow struct array is a logical grouping of child arrays with each + // child array corresponding to one of the fields in the struct. A single + // struct value is obtaining by slicing across all the child arrays at a + // given index. This is represented in kdb as a mixed list for the parent + // struct array, containing child lists for each field in the struct. 
+ // + // Check that the mixed list length is at least equal to the number of struct fields + auto struct_type = static_pointer_cast(datatype); + TYPE_CHECK_LENGTH(struct_type->num_fields() > k_array->n, datatype->ToString(), struct_type->num_fields(), k_array->n); + + // Get all the field builders from the parent struct builder + auto struct_builder = static_cast(builder); + vector field_builders; + for (auto i = 0; i < struct_builder->num_fields(); ++i) + field_builders.push_back(struct_builder->field_builder(i)); + + // Delimit each struct value in the parent builder + for (auto index = 0; index < kK(k_array)[0]->n; ++index) + struct_builder->Append(); + + // Populate each of the field builders from its kdb list. Only count up to + // the number of struct fields. Additional trailing data in the kdb mixed + // list is ignored (to allow for ::) + for (auto i = 0; i < struct_type->num_fields(); ++i) + PopulateBuilder(field_builders[i]->type(), kK(k_array)[i], field_builders[i], type_overrides); + + // Check that all the populated field builders have the same length. 
+ for (auto it : field_builders) + if (it->length() != struct_builder->length()) + throw TypeCheck("Mismatched struct list lengths"); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + PopulateUnionBuilder(datatype, k_array, builder, type_overrides); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + PopulateUnionBuilder(datatype, k_array, builder, type_overrides); +} + +using PopulateHandler = void ( * ) ( shared_ptr, K, arrow::ArrayBuilder*, TypeMappingOverride& ); + +template +auto make_populate_handler() +{ + return make_pair( TypeId, &PopulateBuilder ); +} + +unordered_map PopulateHandlers { + make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() +}; + +} // namespace + +namespace kx { +namespace arrowkdb { + +// Populates data values from a kdb list into the specified array builder. 
+void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + // Special cases for: + // symbol - string or large_string + // guid - fixed_size_binary(16) + // char - uint8 + bool is_symbol = k_array->t == KS && (datatype->id() == arrow::Type::STRING || datatype->id() == arrow::Type::LARGE_STRING); + bool is_guid = k_array->t == UU && datatype->id() == arrow::Type::FIXED_SIZE_BINARY && static_cast(builder)->byte_width() == sizeof(U); + bool is_char = k_array->t == KC && (datatype->id() == arrow::Type::UINT8 || datatype->id() == arrow::Type::INT8); + + // Type check the kdb structure + if (!is_symbol && !is_guid && !is_char) + TYPE_CHECK_ARRAY(GetKdbType(datatype, type_overrides) != k_array->t, datatype->ToString(), GetKdbType(datatype, type_overrides), k_array->t); + + auto type_id = datatype->id(); + if( PopulateHandlers.find( type_id ) == PopulateHandlers.end() ) + { TYPE_CHECK_UNSUPPORTED(datatype->ToString()); } + else + { + PopulateHandlers[type_id]( datatype, k_array, builder, type_overrides ); + } } // Construct a dictionary array from its values and indicies arrays. // // This is represented in kdb as a mixed list for the parent dictionary array // containing the values and indicies sub-lists. 
-std::shared_ptr MakeDictionary(std::shared_ptr datatype, K k_array, TypeMappingOverride& type_overrides) +shared_ptr MakeDictionary(shared_ptr datatype, K k_array, TypeMappingOverride& type_overrides) { K values = kK(k_array)[0]; K indicies = kK(k_array)[1]; - auto dictionary_type = std::static_pointer_cast(datatype); + auto dictionary_type = static_pointer_cast(datatype); // Recursively construct the values and indicies arrays auto values_array = MakeArray(dictionary_type->value_type(), values, type_overrides); auto indicies_array = MakeArray(dictionary_type->index_type(), indicies, type_overrides); - std::shared_ptr result; + shared_ptr result; PARQUET_ASSIGN_OR_THROW(result, arrow::DictionaryArray::FromArrays(datatype, indicies_array, values_array)); return result; } -std::shared_ptr MakeArray(std::shared_ptr datatype, K k_array, TypeMappingOverride& type_overrides) +shared_ptr MakeArray(shared_ptr datatype, K k_array, TypeMappingOverride& type_overrides) { // DictionaryBuilder works in quite an unusual and non-standard way so just // construct the dictionary array directly @@ -790,7 +893,7 @@ std::shared_ptr MakeArray(std::shared_ptr datatyp PopulateBuilder(datatype, k_array, builder.get(), type_overrides); // Finalise the builder into the arrow array - std::shared_ptr array; + shared_ptr array; PARQUET_THROW_NOT_OK(builder->Finish(&array)); return array; } @@ -806,19 +909,19 @@ K prettyPrintArray(K datatype_id, K array, K options) if (datatype_id->t != -KI) return krr((S)"datatype_id not -6h"); - auto datatype = kx::arrowkdb::GetDatatypeStore()->Find(datatype_id->i); + auto datatype = GetDatatypeStore()->Find(datatype_id->i); if (!datatype) return krr((S)"datatype not found"); // Parse the options - auto read_options = kx::arrowkdb::KdbOptions(options, kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); + auto read_options = KdbOptions(options, Options::string_options, Options::int_options); // Type mapping overrides - 
kx::arrowkdb::TypeMappingOverride type_overrides{ read_options }; + TypeMappingOverride type_overrides{ read_options }; - auto arrow_array = kx::arrowkdb::MakeArray(datatype, array, type_overrides); + auto arrow_array = MakeArray(datatype, array, type_overrides); auto options = arrow::PrettyPrintOptions(); - std::string result; + string result; arrow::PrettyPrint(*arrow_array, options, &result); return kp((S)result.c_str()); From 352d4aee4c0be5720b92f6785ff93f46dedcaba2 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 11 Jan 2023 13:30:47 +0000 Subject: [PATCH 007/276] Bump C++17 standard version to pass Mac checks on Travis --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 29f623d..33bfada 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,9 @@ project(arrowkdb CXX) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -DKXVER=3") set(CMAKE_CXX_STANDARD 14) +IF(APPLE) + set(CMAKE_CXX_STANDARD 17) +endif() set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) From 2af073e0464a640c4d8680395cf3d90c384460db Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 10 Jan 2023 12:49:42 +0300 Subject: [PATCH 008/276] Dict options populating for null mapping --- src/KdbOptions.h | 85 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 82 insertions(+), 3 deletions(-) diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 1af50a8..dd79a62 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -25,6 +25,13 @@ namespace Options // String options const std::string PARQUET_VERSION = "PARQUET_VERSION"; + // Dict options + const std::string NULL_MAPPING = "NULL_MAPPING"; + + // Null mapping options + const std::string NM_INT_16 = "int16"; + const std::string NM_INT_32 = "int32"; + const static std::set int_options = { PARQUET_CHUNK_SIZE, PARQUET_MULTITHREADED_READ, @@ -34,6 +41,13 @@ namespace Options const static std::set string_options = { PARQUET_VERSION, }; + const static 
std::set dict_options = { + NULL_MAPPING, + }; + const static std::set null_mapping_options = { + NM_INT_16, + NM_INT_32, + }; } @@ -42,15 +56,19 @@ namespace Options // Dictionary key: KS // Dictionary value: KS or // KJ or -// 0 of -KS|-KJ|KC +// XD or +// 0 of -KS|-KJ|XD|KC class KdbOptions { private: + std::map null_mapping_options; std::map string_options; std::map int_options; const std::set& supported_string_options; const std::set& supported_int_options; + const std::set& supported_dict_options; + const std::set& supported_null_mapping_options; private: const std::string ToUpper(std::string str) const @@ -81,6 +99,26 @@ class KdbOptions } } + void PopulateDictOptions( K keys, K values ) + { + for( auto i = 0ll; i < values->n; ++i ) { + const std::string key = ToUpper( kS( keys )[i] ); + if( supported_dict_options.find( key ) == supported_dict_options.end() ){ + throw InvalidOption(("Unsupported dict option '" + key + "'").c_str()); + } + + K dict = kK( values )[0]; + K options = kK( values )[1]; + for( auto j = 0ll; j < options->n; ++j ) { + const std::string option = ToUpper( kS( dict )[j] ); + if( supported_null_mapping_options.find( key ) == supported_null_mapping_options.end() ){ + throw InvalidOption(("Unsupported '" + key + "' option '" + option + "'").c_str()); + } + null_mapping_options[option] = ToUpper( kS( options )[j] ); + } + } + } + void PopulateMixedOptions(K keys, K values) { for (auto i = 0ll; i < values->n; ++i) { @@ -104,6 +142,22 @@ class KdbOptions string_options[key] = ToUpper(std::string((char*)kG(value), value->n)); break; } + case XD: + { + if( supported_dict_options.find( key ) == supported_dict_options.end() ){ + throw InvalidOption(("Unsupported dict option '" + key + "'").c_str()); + } + K dict = kK( values )[0]; + K options = kK( values )[1]; + for( auto j = 0ll; j < options->n; ++j ) { + const std::string option = ToUpper( kS( dict )[j] ); + if( supported_null_mapping_options.find( key ) == supported_null_mapping_options.end() 
){ + throw InvalidOption(("Unsupported '" + key + "' option '" + option + "'").c_str()); + } + null_mapping_options[option] = ToUpper( kS( options )[j] ); + } + break; + } case 101: // Ignore :: break; @@ -121,8 +175,16 @@ class KdbOptions {}; }; - KdbOptions(K options, const std::set supported_string_options_, const std::set supported_int_options_) : - supported_string_options(supported_string_options_), supported_int_options(supported_int_options_) + KdbOptions( + K options + , const std::set supported_string_options_ + , const std::set supported_int_options_ + , const std::set& supported_dict_options_ = std::set {} + , const std::set& supported_null_mapping_options_ = std::set {} ) + : supported_string_options(supported_string_options_) + , supported_int_options(supported_int_options_) + , supported_dict_options( supported_dict_options_ ) + , supported_null_mapping_options( supported_null_mapping_options_ ) { if (options != NULL && options->t != 101) { if (options->t != 99) @@ -138,6 +200,9 @@ class KdbOptions case KS: PopulateStringOptions(keys, values); break; + case XD: + PopulateDictOptions(keys, values); + break; case 0: PopulateMixedOptions(keys, values); break; @@ -147,6 +212,20 @@ class KdbOptions } } + bool GetNullMappingOption( const std::string key, std::string& result ) const + { + const auto it = null_mapping_options.find( key ); + if( it == null_mapping_options.end() ) + { + return false; + } + else + { + result = it->second; + return true; + } + } + bool GetStringOption(const std::string key, std::string& result) const { const auto it = string_options.find(key); From 349e17d020f17228adc16e49ae2773aba6a8d131 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 12 Jan 2023 12:34:30 +0300 Subject: [PATCH 009/276] Supporting of nested dictionaries --- src/KdbOptions.h | 51 ++++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/src/KdbOptions.h b/src/KdbOptions.h index dd79a62..1a92fed 
100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -29,8 +29,8 @@ namespace Options const std::string NULL_MAPPING = "NULL_MAPPING"; // Null mapping options - const std::string NM_INT_16 = "int16"; - const std::string NM_INT_32 = "int32"; + const std::string NM_INT_16 = "INT16"; + const std::string NM_INT_32 = "INT32"; const static std::set int_options = { PARQUET_CHUNK_SIZE, @@ -99,6 +99,30 @@ class KdbOptions } } + void PopulateNullMappingOptions( long long index, K dict ) + { + K keys = kK( kK( dict )[index] )[0]; + K values = kK( kK( dict )[index] )[1]; + for( auto i = 0ll; i < values->n; ++i ){ + const std::string key = ToUpper( kS( keys )[i] ); + if( supported_null_mapping_options.find( key ) == supported_null_mapping_options.end() ){ + throw InvalidOption(("Unsupported NULL_MAPPING option '" + key + "'").c_str()); + } + switch( kK( values )[i]->t ) + { + case -KH: + null_mapping_options[key]; // = kK( values )[j]->h; to_string? variant?? + break; + case -KI: + null_mapping_options[key]; // = kK( values )[j]->i; to_string? variant?? 
+ break; + case 0: + null_mapping_options[key] = ToUpper( kS( values )[i] ); + break; + }; + } + } + void PopulateDictOptions( K keys, K values ) { for( auto i = 0ll; i < values->n; ++i ) { @@ -106,15 +130,9 @@ class KdbOptions if( supported_dict_options.find( key ) == supported_dict_options.end() ){ throw InvalidOption(("Unsupported dict option '" + key + "'").c_str()); } - - K dict = kK( values )[0]; - K options = kK( values )[1]; - for( auto j = 0ll; j < options->n; ++j ) { - const std::string option = ToUpper( kS( dict )[j] ); - if( supported_null_mapping_options.find( key ) == supported_null_mapping_options.end() ){ - throw InvalidOption(("Unsupported '" + key + "' option '" + option + "'").c_str()); - } - null_mapping_options[option] = ToUpper( kS( options )[j] ); + if( Options::NULL_MAPPING == key ) + { + PopulateNullMappingOptions( i, values ); } } } @@ -147,14 +165,9 @@ class KdbOptions if( supported_dict_options.find( key ) == supported_dict_options.end() ){ throw InvalidOption(("Unsupported dict option '" + key + "'").c_str()); } - K dict = kK( values )[0]; - K options = kK( values )[1]; - for( auto j = 0ll; j < options->n; ++j ) { - const std::string option = ToUpper( kS( dict )[j] ); - if( supported_null_mapping_options.find( key ) == supported_null_mapping_options.end() ){ - throw InvalidOption(("Unsupported '" + key + "' option '" + option + "'").c_str()); - } - null_mapping_options[option] = ToUpper( kS( options )[j] ); + if( Options::NULL_MAPPING == key ) + { + PopulateNullMappingOptions( i, values ); } break; } From c44659d11ea85e09110d70ca1d8184afaf5b7c10 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 12 Jan 2023 20:40:00 +0300 Subject: [PATCH 010/276] Arrow builder null mapping overriding --- src/ArrayWriter.cpp | 11 ++++++- src/HelperFunctions.cpp | 1 + src/HelperFunctions.h | 1 + src/KdbOptions.h | 64 +++++++++++++++++++++++++++++++---------- 4 files changed, 61 insertions(+), 16 deletions(-) diff --git a/src/ArrayWriter.cpp 
b/src/ArrayWriter.cpp index ad05e33..83ca5ce 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -463,7 +463,16 @@ template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto int16_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(int16_builder->AppendValues((int16_t*)kH(k_array), k_array->n)); + if( type_overrides.null_mapping.have_int16 ){ + auto null_bitmap = std::unique_ptr( new uint8_t[k_array->n] ); + for( auto i = 0; i < k_array->n; ++i ){ + null_bitmap[i] = !( kH( k_array )[i] ^ type_overrides.null_mapping.int16_null ); + PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )kH( k_array ), k_array->n, null_bitmap.get() ) ); + } + } + else { + PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )kH( k_array), k_array->n ) ); + } } template<> diff --git a/src/HelperFunctions.cpp b/src/HelperFunctions.cpp index 5ade109..868cf07 100644 --- a/src/HelperFunctions.cpp +++ b/src/HelperFunctions.cpp @@ -148,6 +148,7 @@ const std::string GetKdbString(K str) TypeMappingOverride::TypeMappingOverride(const KdbOptions& options) { options.GetIntOption(Options::DECIMAL128_AS_DOUBLE, decimal128_as_double); + options.GetNullMappingOptions( null_mapping ); } KdbType GetKdbType(std::shared_ptr datatype, TypeMappingOverride& type_overrides) diff --git a/src/HelperFunctions.h b/src/HelperFunctions.h index d6faaef..201707a 100644 --- a/src/HelperFunctions.h +++ b/src/HelperFunctions.h @@ -80,6 +80,7 @@ typedef signed char KdbType; struct TypeMappingOverride { int64_t decimal128_as_double = 0; + Options::NullMapping null_mapping; TypeMappingOverride(void) {}; TypeMappingOverride(const KdbOptions& options); }; diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 1a92fed..5ba9b66 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -31,6 +31,7 @@ namespace Options // Null mapping options const std::string NM_INT_16 = "INT16"; const std::string NM_INT_32 = "INT32"; + 
const std::string NM_SYMBOL = "SYMBOL"; const static std::set int_options = { PARQUET_CHUNK_SIZE, @@ -47,6 +48,17 @@ namespace Options const static std::set null_mapping_options = { NM_INT_16, NM_INT_32, + NM_SYMBOL + }; + + struct NullMapping + { + bool have_int16; + int16_t int16_null; + bool have_int32; + int32_t int32_null; + bool have_symbol; + std::string symbol_null; }; } @@ -61,7 +73,7 @@ namespace Options class KdbOptions { private: - std::map null_mapping_options; + Options::NullMapping null_mapping_options; std::map string_options; std::map int_options; @@ -111,13 +123,16 @@ class KdbOptions switch( kK( values )[i]->t ) { case -KH: - null_mapping_options[key]; // = kK( values )[j]->h; to_string? variant?? + null_mapping_options.have_int16 = true; + null_mapping_options.int16_null = kK( values )[i]->h; break; case -KI: - null_mapping_options[key]; // = kK( values )[j]->i; to_string? variant?? + null_mapping_options.int32_null = true; + null_mapping_options.int32_null = kK( values )[i]->i; break; case 0: - null_mapping_options[key] = ToUpper( kS( values )[i] ); + null_mapping_options.have_symbol = true; + null_mapping_options.symbol_null = ToUpper( kS( values )[i] ); break; }; } @@ -225,18 +240,12 @@ class KdbOptions } } - bool GetNullMappingOption( const std::string key, std::string& result ) const + template + auto GetNullMappingOption( bool& result ); + + void GetNullMappingOptions( Options::NullMapping& null_mapping ) const { - const auto it = null_mapping_options.find( key ); - if( it == null_mapping_options.end() ) - { - return false; - } - else - { - result = it->second; - return true; - } + null_mapping = null_mapping_options; } bool GetStringOption(const std::string key, std::string& result) const @@ -262,6 +271,31 @@ class KdbOptions } }; +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ) +{ + result = null_mapping_options.have_int16; + + return null_mapping_options.int16_null; +} + +template<> +inline auto 
KdbOptions::GetNullMappingOption( bool& result ) +{ + result = null_mapping_options.have_int32; + + return null_mapping_options.int32_null; +} + +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ) +{ + result = null_mapping_options.have_symbol; + + return null_mapping_options.symbol_null; +} + + } // namespace arrowkdb } // namespace kx From e17ac6ca658f6ca7f3f97c68cee63e4201150c04 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 13 Jan 2023 17:27:37 +0300 Subject: [PATCH 011/276] Arrow string builder with mapping of nulls --- src/ArrayWriter.cpp | 53 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index 83ca5ce..e26c2ae 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -467,8 +467,8 @@ void PopulateBuilder(shared_ptr datatype, K auto null_bitmap = std::unique_ptr( new uint8_t[k_array->n] ); for( auto i = 0; i < k_array->n; ++i ){ null_bitmap[i] = !( kH( k_array )[i] ^ type_overrides.null_mapping.int16_null ); - PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )kH( k_array ), k_array->n, null_bitmap.get() ) ); } + PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )kH( k_array ), k_array->n, null_bitmap.get() ) ); } else { PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )kH( k_array), k_array->n ) ); @@ -486,7 +486,16 @@ template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto int32_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(int32_builder->AppendValues((int32_t*)kI(k_array), k_array->n)); + if( type_overrides.null_mapping.have_int32 ){ + auto null_bitmap = std::unique_ptr( new uint8_t[k_array->n] ); + for( auto i = 0; i < k_array->n; ++i ){ + null_bitmap[i] = !( kH( k_array )[i] ^ type_overrides.null_mapping.int32_null ); + } + PARQUET_THROW_NOT_OK( int32_builder->AppendValues( ( int32_t* 
)kH( k_array ), k_array->n, null_bitmap.get() ) ); + } + else{ + PARQUET_THROW_NOT_OK(int32_builder->AppendValues((int32_t*)kI(k_array), k_array->n)); + } } template<> @@ -528,12 +537,23 @@ void PopulateBuilder(shared_ptr datatype, template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { - bool is_symbol = k_array->t == KS && (datatype->id() == arrow::Type::STRING || datatype->id() == arrow::Type::LARGE_STRING); auto str_builder = static_cast(builder); - if (is_symbol) { - // Populate from symbol list - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(str_builder->Append(kS(k_array)[i])); + if( is_symbol ){ + if( type_overrides.null_mapping.have_symbol ){ + for( auto i = 0; i < k_array->n; ++i ){ + if( type_overrides.null_mapping.symbol_null == kS( k_array )[i] ){ + PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK( str_builder->Append( kS( k_array )[i] ) ); + } + } + } + else{ + // Populate from symbol list + for (auto i = 0; i < k_array->n; ++i) + PARQUET_THROW_NOT_OK(str_builder->Append(kS(k_array)[i])); + } } else { // Populate from mixed list of char lists for (auto i = 0; i < k_array->n; ++i) { @@ -547,12 +567,23 @@ void PopulateBuilder(shared_ptr datatype, template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { - bool is_symbol = k_array->t == KS && (datatype->id() == arrow::Type::STRING || datatype->id() == arrow::Type::LARGE_STRING); auto str_builder = static_cast(builder); if (is_symbol) { - // Populate from symbol list - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(str_builder->Append(kS(k_array)[i])); + if( type_overrides.null_mapping.have_symbol ){ + for( auto i = 0; i < k_array->n; ++i ){ + if( type_overrides.null_mapping.symbol_null == kS( k_array )[i] ){ + PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); + } + else{ + 
PARQUET_THROW_NOT_OK( str_builder->Append( kS( k_array )[i] ) ); + } + } + } + else{ + // Populate from symbol list + for (auto i = 0; i < k_array->n; ++i) + PARQUET_THROW_NOT_OK(str_builder->Append(kS(k_array)[i])); + } } else { // Populate from mixed list of char lists for (auto i = 0; i < k_array->n; ++i) { From fcfb40c1c4d5b2f614b7cb1edff390d01c707be9 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 13 Jan 2023 19:18:57 +0300 Subject: [PATCH 012/276] More granular mappings for each arrow datatype --- src/ArrayWriter.cpp | 8 ++-- src/KdbOptions.h | 100 ++++++++++++++++++++++++++++++++++---------- 2 files changed, 83 insertions(+), 25 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index e26c2ae..d7977b0 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -539,9 +539,9 @@ void PopulateBuilder(shared_ptr datatype, { auto str_builder = static_cast(builder); if( is_symbol ){ - if( type_overrides.null_mapping.have_symbol ){ + if( type_overrides.null_mapping.have_string ){ for( auto i = 0; i < k_array->n; ++i ){ - if( type_overrides.null_mapping.symbol_null == kS( k_array )[i] ){ + if( type_overrides.null_mapping.string_null == kS( k_array )[i] ){ PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); } else{ @@ -569,9 +569,9 @@ void PopulateBuilder(shared_ptr data { auto str_builder = static_cast(builder); if (is_symbol) { - if( type_overrides.null_mapping.have_symbol ){ + if( type_overrides.null_mapping.have_large_string ){ for( auto i = 0; i < k_array->n; ++i ){ - if( type_overrides.null_mapping.symbol_null == kS( k_array )[i] ){ + if( type_overrides.null_mapping.large_string_null == kS( k_array )[i] ){ PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); } else{ diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 5ba9b66..1a32ecf 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -13,6 +13,33 @@ namespace kx { namespace arrowkdb { +template +constexpr auto toUType( E enumerator ) noexcept +{ + return static_cast>( 
enumerator ); +} + +template< typename E > +struct ETraits +{ + using Names = std::map< E, std::string >; + + static std::string name( E enumerator ) + { + auto it = names.find( enumerator ); + if( it != names.end() ) + { + return it->second; + } + + return "unknown"; + } + + static std::string name( int index ) { return name( static_cast( index ) ); } + + static const Names names; +}; + // Supported options namespace Options { @@ -31,7 +58,8 @@ namespace Options // Null mapping options const std::string NM_INT_16 = "INT16"; const std::string NM_INT_32 = "INT32"; - const std::string NM_SYMBOL = "SYMBOL"; + const std::string NM_STRING = "STRING"; + const std::string NM_LARGE_STRING = "LARGE_STRING"; const static std::set int_options = { PARQUET_CHUNK_SIZE, @@ -48,20 +76,37 @@ namespace Options const static std::set null_mapping_options = { NM_INT_16, NM_INT_32, - NM_SYMBOL + NM_STRING, + NM_LARGE_STRING }; struct NullMapping { + enum class Type: int{ + INT_16 + , INT_32 + , STRING + , LARGE_STRING + }; + bool have_int16; int16_t int16_null; bool have_int32; int32_t int32_null; - bool have_symbol; - std::string symbol_null; + bool have_string; + std::string string_null; + bool have_large_string; + std::string large_string_null; }; } +template<> +inline const ETraits< Options::NullMapping::Type >::Names ETraits< Options::NullMapping::Type >::names { + { Options::NullMapping::Type::INT_16, Options::NM_INT_16 } + , { Options::NullMapping::Type::INT_32, Options::NM_INT_32 } + , { Options::NullMapping::Type::STRING, Options::NM_STRING } + , { Options::NullMapping::Type::LARGE_STRING, Options::NM_LARGE_STRING } +}; // Helper class for reading dictionary of options // @@ -113,6 +158,8 @@ class KdbOptions void PopulateNullMappingOptions( long long index, K dict ) { + using NM = Options::NullMapping::Type; + K keys = kK( kK( dict )[index] )[0]; K values = kK( kK( dict )[index] )[1]; for( auto i = 0ll; i < values->n; ++i ){ @@ -120,21 +167,25 @@ class KdbOptions if( 
supported_null_mapping_options.find( key ) == supported_null_mapping_options.end() ){ throw InvalidOption(("Unsupported NULL_MAPPING option '" + key + "'").c_str()); } - switch( kK( values )[i]->t ) - { - case -KH: + if( ETraits::name( NM::INT_16 ) == key && -KH == kK( values )[i]->t ){ null_mapping_options.have_int16 = true; null_mapping_options.int16_null = kK( values )[i]->h; - break; - case -KI: + } + else if( ETraits::name( NM::INT_32 ) == key && -KI == kK( values )[i]->t ){ null_mapping_options.int32_null = true; null_mapping_options.int32_null = kK( values )[i]->i; - break; - case 0: - null_mapping_options.have_symbol = true; - null_mapping_options.symbol_null = ToUpper( kS( values )[i] ); - break; - }; + } + else if( ETraits::name( NM::STRING ) == key && 0 == kK( values )[i]->t ){ + null_mapping_options.have_string = true; + null_mapping_options.string_null = ToUpper( kS( values )[i] ); + } + else if( ETraits::name( NM::LARGE_STRING ) == key && 0 == kK( values )[i]->t ){ + null_mapping_options.have_large_string = true; + null_mapping_options.large_string_null = ToUpper( kS( values )[i] ); + } + else{ + throw InvalidOption(("Unsupported KDB data type for NULL_MAPPING option '" + key + "'").c_str()); + } } } @@ -240,7 +291,7 @@ class KdbOptions } } - template + template auto GetNullMappingOption( bool& result ); void GetNullMappingOptions( Options::NullMapping& null_mapping ) const @@ -272,7 +323,7 @@ class KdbOptions }; template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ) +inline auto KdbOptions::GetNullMappingOption( bool& result ) { result = null_mapping_options.have_int16; @@ -280,7 +331,7 @@ inline auto KdbOptions::GetNullMappingOption( bool& result ) } template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ) +inline auto KdbOptions::GetNullMappingOption( bool& result ) { result = null_mapping_options.have_int32; @@ -288,13 +339,20 @@ inline auto KdbOptions::GetNullMappingOption( bool& result ) } template<> -inline 
auto KdbOptions::GetNullMappingOption( bool& result ) +inline auto KdbOptions::GetNullMappingOption( bool& result ) { - result = null_mapping_options.have_symbol; + result = null_mapping_options.have_string; - return null_mapping_options.symbol_null; + return null_mapping_options.string_null; } +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ) +{ + result = null_mapping_options.have_large_string; + + return null_mapping_options.large_string_null; +} } // namespace arrowkdb } // namespace kx From 2eeecc6acc762e7697ff08e9ea4056f07be1747c Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 16 Jan 2023 11:35:57 +0300 Subject: [PATCH 013/276] Default initialization of options --- src/KdbOptions.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 1a32ecf..3470adc 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -258,9 +258,10 @@ class KdbOptions K options , const std::set supported_string_options_ , const std::set supported_int_options_ - , const std::set& supported_dict_options_ = std::set {} - , const std::set& supported_null_mapping_options_ = std::set {} ) - : supported_string_options(supported_string_options_) + , const std::set& supported_dict_options_ = Options::dict_options + , const std::set& supported_null_mapping_options_ = Options::null_mapping_options ) + : null_mapping_options {0} + , supported_string_options(supported_string_options_) , supported_int_options(supported_int_options_) , supported_dict_options( supported_dict_options_ ) , supported_null_mapping_options( supported_null_mapping_options_ ) From 84e26ee278361b6dd9717976ce833803c8b8fded Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 16 Jan 2023 14:32:15 +0300 Subject: [PATCH 014/276] Integer writer debugging --- src/ArrayWriter.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index d7977b0..2883f1c 100644 --- 
a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -489,9 +489,9 @@ void PopulateBuilder(shared_ptr datatype, K if( type_overrides.null_mapping.have_int32 ){ auto null_bitmap = std::unique_ptr( new uint8_t[k_array->n] ); for( auto i = 0; i < k_array->n; ++i ){ - null_bitmap[i] = !( kH( k_array )[i] ^ type_overrides.null_mapping.int32_null ); + null_bitmap[i] = !( kI( k_array )[i] ^ type_overrides.null_mapping.int32_null ); } - PARQUET_THROW_NOT_OK( int32_builder->AppendValues( ( int32_t* )kH( k_array ), k_array->n, null_bitmap.get() ) ); + PARQUET_THROW_NOT_OK( int32_builder->AppendValues( ( int32_t* )kI( k_array ), k_array->n, null_bitmap.get() ) ); } else{ PARQUET_THROW_NOT_OK(int32_builder->AppendValues((int32_t*)kI(k_array), k_array->n)); From 65dfb5b0e04ea2e9ce69804cabecda446b1d2d7f Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 16 Jan 2023 17:46:25 +0300 Subject: [PATCH 015/276] String writer debugging --- src/ArrayWriter.cpp | 92 +++++++++++++++++++++++++-------------------- src/KdbOptions.h | 19 +++++----- 2 files changed, 61 insertions(+), 50 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index 2883f1c..b5d1bf1 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -464,11 +464,14 @@ void PopulateBuilder(shared_ptr datatype, K { auto int16_builder = static_cast(builder); if( type_overrides.null_mapping.have_int16 ){ - auto null_bitmap = std::unique_ptr( new uint8_t[k_array->n] ); for( auto i = 0; i < k_array->n; ++i ){ - null_bitmap[i] = !( kH( k_array )[i] ^ type_overrides.null_mapping.int16_null ); + if( type_overrides.null_mapping.int16_null == kH( k_array )[i]){ + PARQUET_THROW_NOT_OK( int16_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )&kH( k_array )[i], 1 ) ); + } } - PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )kH( k_array ), k_array->n, null_bitmap.get() ) ); } else { PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )kH( 
k_array), k_array->n ) ); @@ -486,12 +489,17 @@ template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto int32_builder = static_cast(builder); + type_overrides.null_mapping.have_int32 = true; + type_overrides.null_mapping.int32_null = -2147483648; if( type_overrides.null_mapping.have_int32 ){ - auto null_bitmap = std::unique_ptr( new uint8_t[k_array->n] ); for( auto i = 0; i < k_array->n; ++i ){ - null_bitmap[i] = !( kI( k_array )[i] ^ type_overrides.null_mapping.int32_null ); + if( type_overrides.null_mapping.int32_null == kI( k_array )[i] ){ + PARQUET_THROW_NOT_OK( int32_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK( int32_builder->AppendValues( ( int32_t* )&kI( k_array )[i], 1 ) ); + } } - PARQUET_THROW_NOT_OK( int32_builder->AppendValues( ( int32_t* )kI( k_array ), k_array->n, null_bitmap.get() ) ); } else{ PARQUET_THROW_NOT_OK(int32_builder->AppendValues((int32_t*)kI(k_array), k_array->n)); @@ -539,27 +547,28 @@ void PopulateBuilder(shared_ptr datatype, { auto str_builder = static_cast(builder); if( is_symbol ){ - if( type_overrides.null_mapping.have_string ){ - for( auto i = 0; i < k_array->n; ++i ){ - if( type_overrides.null_mapping.string_null == kS( k_array )[i] ){ - PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); - } - else{ - PARQUET_THROW_NOT_OK( str_builder->Append( kS( k_array )[i] ) ); - } + // Populate from symbol list + for( auto i = 0ll; i < k_array->n; ++i ){ + if( type_overrides.null_mapping.have_string + && type_overrides.null_mapping.string_null == kS( k_array )[i] ){ + PARQUET_THROW_NOT_OK( str_builder->AppendEmptyValue() ); + } + else{ + PARQUET_THROW_NOT_OK( str_builder->Append( kS( k_array )[i] ) ); } - } - else{ - // Populate from symbol list - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(str_builder->Append(kS(k_array)[i])); } } else { // Populate from mixed list of char lists - for (auto i = 0; i < k_array->n; ++i) { - K 
str_data = kK(k_array)[i]; - TYPE_CHECK_ITEM(str_data->t != KC, datatype->ToString(), KC, str_data->t); - PARQUET_THROW_NOT_OK(str_builder->Append(kG(str_data), str_data->n)); + for( auto i = 0ll; i < k_array->n; ++i ){ + K str_data = kK( k_array )[i]; + TYPE_CHECK_ITEM( str_data->t != KC, datatype->ToString(), KC, str_data->t ); + if( type_overrides.null_mapping.have_string + && type_overrides.null_mapping.string_null == std::string( ( char* )kG( str_data ), str_data->n ) ){ + PARQUET_THROW_NOT_OK( str_builder->AppendEmptyValue() ); + } + else{ + PARQUET_THROW_NOT_OK( str_builder->Append( kG( str_data ), str_data->n ) ); + } } } } @@ -568,28 +577,29 @@ template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto str_builder = static_cast(builder); - if (is_symbol) { - if( type_overrides.null_mapping.have_large_string ){ - for( auto i = 0; i < k_array->n; ++i ){ - if( type_overrides.null_mapping.large_string_null == kS( k_array )[i] ){ - PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); - } - else{ - PARQUET_THROW_NOT_OK( str_builder->Append( kS( k_array )[i] ) ); - } + if( is_symbol ){ + // Populate from symbol list + for( auto i = 0ll; i < k_array->n; ++i ){ + if( type_overrides.null_mapping.have_large_string + && type_overrides.null_mapping.large_string_null == kS( k_array )[i] ){ + PARQUET_THROW_NOT_OK( str_builder->AppendEmptyValue() ); + } + else{ + PARQUET_THROW_NOT_OK( str_builder->Append( kS( k_array )[i] ) ); } - } - else{ - // Populate from symbol list - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(str_builder->Append(kS(k_array)[i])); } } else { // Populate from mixed list of char lists - for (auto i = 0; i < k_array->n; ++i) { - K str_data = kK(k_array)[i]; - TYPE_CHECK_ITEM(str_data->t != KC, datatype->ToString(), KC, str_data->t); - PARQUET_THROW_NOT_OK(str_builder->Append(kG(str_data), str_data->n)); + for( auto i = 0ll; i < k_array->n; ++i ){ + K str_data 
= kK( k_array )[i]; + TYPE_CHECK_ITEM( str_data->t != KC, datatype->ToString(), KC, str_data->t ); + if( type_overrides.null_mapping.have_string + && type_overrides.null_mapping.string_null == std::string( ( char* )kG( str_data ), str_data->n ) ){ + PARQUET_THROW_NOT_OK( str_builder->AppendEmptyValue() ); + } + else{ + PARQUET_THROW_NOT_OK( str_builder->Append( kG( str_data ), str_data->n ) ); + } } } } diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 3470adc..4858b8e 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -167,21 +167,22 @@ class KdbOptions if( supported_null_mapping_options.find( key ) == supported_null_mapping_options.end() ){ throw InvalidOption(("Unsupported NULL_MAPPING option '" + key + "'").c_str()); } - if( ETraits::name( NM::INT_16 ) == key && -KH == kK( values )[i]->t ){ + K value = kK( values )[i]; + if( ETraits::name( NM::INT_16 ) == key && -KH == value->t ){ + null_mapping_options.int16_null = value->h; null_mapping_options.have_int16 = true; - null_mapping_options.int16_null = kK( values )[i]->h; } - else if( ETraits::name( NM::INT_32 ) == key && -KI == kK( values )[i]->t ){ - null_mapping_options.int32_null = true; - null_mapping_options.int32_null = kK( values )[i]->i; + else if( ETraits::name( NM::INT_32 ) == key && -KI == value->t ){ + null_mapping_options.int32_null = value->i; + null_mapping_options.have_int32 = true; } - else if( ETraits::name( NM::STRING ) == key && 0 == kK( values )[i]->t ){ + else if( ETraits::name( NM::STRING ) == key && KC == value->t ){ + null_mapping_options.string_null.assign( (char*)kC( value ), value->n ); null_mapping_options.have_string = true; - null_mapping_options.string_null = ToUpper( kS( values )[i] ); } - else if( ETraits::name( NM::LARGE_STRING ) == key && 0 == kK( values )[i]->t ){ + else if( ETraits::name( NM::LARGE_STRING ) == key && KC == value->t ){ + null_mapping_options.large_string_null.assign( (char*)kC( value ), value->n ); null_mapping_options.have_large_string = true; - 
null_mapping_options.large_string_null = ToUpper( kS( values )[i] ); } else{ throw InvalidOption(("Unsupported KDB data type for NULL_MAPPING option '" + key + "'").c_str()); From 6868703b7f16d91aeafbe3031abd12ea7368f0fe Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 17 Jan 2023 15:53:03 +0300 Subject: [PATCH 016/276] Enable null supporting fields --- src/ArrayWriter.cpp | 8 ++++---- src/FieldStore.cpp | 10 ++-------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index b5d1bf1..88d8e86 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -551,7 +551,7 @@ void PopulateBuilder(shared_ptr datatype, for( auto i = 0ll; i < k_array->n; ++i ){ if( type_overrides.null_mapping.have_string && type_overrides.null_mapping.string_null == kS( k_array )[i] ){ - PARQUET_THROW_NOT_OK( str_builder->AppendEmptyValue() ); + PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); } else{ PARQUET_THROW_NOT_OK( str_builder->Append( kS( k_array )[i] ) ); @@ -564,7 +564,7 @@ void PopulateBuilder(shared_ptr datatype, TYPE_CHECK_ITEM( str_data->t != KC, datatype->ToString(), KC, str_data->t ); if( type_overrides.null_mapping.have_string && type_overrides.null_mapping.string_null == std::string( ( char* )kG( str_data ), str_data->n ) ){ - PARQUET_THROW_NOT_OK( str_builder->AppendEmptyValue() ); + PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); } else{ PARQUET_THROW_NOT_OK( str_builder->Append( kG( str_data ), str_data->n ) ); @@ -582,7 +582,7 @@ void PopulateBuilder(shared_ptr data for( auto i = 0ll; i < k_array->n; ++i ){ if( type_overrides.null_mapping.have_large_string && type_overrides.null_mapping.large_string_null == kS( k_array )[i] ){ - PARQUET_THROW_NOT_OK( str_builder->AppendEmptyValue() ); + PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); } else{ PARQUET_THROW_NOT_OK( str_builder->Append( kS( k_array )[i] ) ); @@ -595,7 +595,7 @@ void PopulateBuilder(shared_ptr data TYPE_CHECK_ITEM( str_data->t != 
KC, datatype->ToString(), KC, str_data->t ); if( type_overrides.null_mapping.have_string && type_overrides.null_mapping.string_null == std::string( ( char* )kG( str_data ), str_data->n ) ){ - PARQUET_THROW_NOT_OK( str_builder->AppendEmptyValue() ); + PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); } else{ PARQUET_THROW_NOT_OK( str_builder->Append( kG( str_data ), str_data->n ) ); diff --git a/src/FieldStore.cpp b/src/FieldStore.cpp index 53cf6b0..04af9f5 100644 --- a/src/FieldStore.cpp +++ b/src/FieldStore.cpp @@ -108,11 +108,5 @@ K field(K field_name, K datatype_id) if (!datatype) return krr((S)"datatype not found"); - // Converting between kdb nulls are arrow nulls would incur a massive - // performance hit (up to 10x worse with trival datatypes that could otherwise - // be memcpy'ed). Also, not all kdb types have a null value, e.g. KB, KG, KS, - // 0 of KC, 0 of KG, etc. So don't allow fields to be created as nullable - // (other than NA type which is all nulls). - bool nullable = datatype->id() == arrow::Type::NA; - return ki(kx::arrowkdb::GetFieldStore()->Add(arrow::field(kx::arrowkdb::GetKdbString(field_name), datatype, nullable))); -} \ No newline at end of file + return ki(kx::arrowkdb::GetFieldStore()->Add(arrow::field(kx::arrowkdb::GetKdbString(field_name), datatype, true))); +} From 1eca0a5b3635694d87d81c8b29be7a765630bcfb Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 17 Jan 2023 15:53:28 +0300 Subject: [PATCH 017/276] Null mapping example --- examples/null_mapping.q | 52 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 examples/null_mapping.q diff --git a/examples/null_mapping.q b/examples/null_mapping.q new file mode 100644 index 0000000..2b1b31c --- /dev/null +++ b/examples/null_mapping.q @@ -0,0 +1,52 @@ +///////////////////////// +// CONSTRUCTED SCHEMAS // +///////////////////////// + +//-------------------// +// Create the schema // +//-------------------// + +// Create the datatype 
identifiers +ts_dt:.arrowkdb.dt.timestamp[`nano]; +f64_dt:.arrowkdb.dt.float64[]; +i32_dt:.arrowkdb.dt.int32[]; +bool_dt:.arrowkdb.dt.boolean[]; +str_dt:.arrowkdb.dt.utf8[]; + +// Create the field identifiers +tstamp_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; +temp_fd:.arrowkdb.fd.field[`temperature;f64_dt]; +fill_fd:.arrowkdb.fd.field[`fill_level;i32_dt]; +pump_fd:.arrowkdb.fd.field[`pump_status;bool_dt]; +comment_fd:.arrowkdb.fd.field[`comment;str_dt]; + +// Create the schema for the list of fields +schema:.arrowkdb.sc.schema[(tstamp_fd,temp_fd,fill_fd,pump_fd,comment_fd)]; + +// Print the schema +.arrowkdb.sc.printSchema[schema] + +//-----------------------// +// Create the array data // +//-----------------------// + +// Create data for each column in the table +tstamp_data:asc N?0p; +temp_data:N?100f; +fill_data:N?100i; +fill_data[0]:0Ni +pump_data:N?0b; +comment_data:N?("start";"stop";"alert";"acknowledge";""); + +// Combine the data for all columns +array_data:(tstamp_data;temp_data;fill_data;pump_data;comment_data); + +// Support null mapping +null_opts:(`int16`int32`string)!(0Nh;0Ni;"start") +options:(``NULL_MAPPING)!((::);null_opts) + +// Pretty print the Arrow table populated from the array data +.arrowkdb.tb.prettyPrintTable[schema;array_data;options] + +options[`PARQUET_VERSION]:`V2.0 +.arrowkdb.pq.writeParquet["null_mapping.parquet";schema;array_data;options] From aff2695c226081aaa874601878d141ef62acc68a Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 18 Jan 2023 11:29:54 +0300 Subject: [PATCH 018/276] Batch operations for integers --- src/ArrayWriter.cpp | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index 88d8e86..584c77f 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -464,17 +464,14 @@ void PopulateBuilder(shared_ptr datatype, K { auto int16_builder = static_cast(builder); if( type_overrides.null_mapping.have_int16 ){ - for( 
auto i = 0; i < k_array->n; ++i ){ - if( type_overrides.null_mapping.int16_null == kH( k_array )[i]){ - PARQUET_THROW_NOT_OK( int16_builder->AppendNull() ); - } - else{ - PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )&kH( k_array )[i], 1 ) ); - } + std::vector null_bitmap( k_array->n ); + for( auto i = 0ll; i < k_array->n; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.int16_null ^ kH( k_array )[i]; } + PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )kH( k_array ), k_array->n, null_bitmap ) ); } else { - PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )kH( k_array), k_array->n ) ); + PARQUET_THROW_NOT_OK(int16_builder->AppendValues((int16_t*)kH(k_array), k_array->n)); } } @@ -489,17 +486,12 @@ template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto int32_builder = static_cast(builder); - type_overrides.null_mapping.have_int32 = true; - type_overrides.null_mapping.int32_null = -2147483648; if( type_overrides.null_mapping.have_int32 ){ - for( auto i = 0; i < k_array->n; ++i ){ - if( type_overrides.null_mapping.int32_null == kI( k_array )[i] ){ - PARQUET_THROW_NOT_OK( int32_builder->AppendNull() ); - } - else{ - PARQUET_THROW_NOT_OK( int32_builder->AppendValues( ( int32_t* )&kI( k_array )[i], 1 ) ); - } + std::vector null_bitmap( k_array->n ); + for( auto i = 0ll; i < k_array->n; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.int32_null ^ kI( k_array )[i]; } + PARQUET_THROW_NOT_OK( int32_builder->AppendValues( ( int32_t* )kI( k_array ), k_array->n, null_bitmap ) ); } else{ PARQUET_THROW_NOT_OK(int32_builder->AppendValues((int32_t*)kI(k_array), k_array->n)); @@ -546,6 +538,7 @@ template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto str_builder = static_cast(builder); + bool is_symbol = k_array->t == KS && (datatype->id() == arrow::Type::STRING || 
datatype->id() == arrow::Type::LARGE_STRING); if( is_symbol ){ // Populate from symbol list for( auto i = 0ll; i < k_array->n; ++i ){ @@ -563,7 +556,8 @@ void PopulateBuilder(shared_ptr datatype, K str_data = kK( k_array )[i]; TYPE_CHECK_ITEM( str_data->t != KC, datatype->ToString(), KC, str_data->t ); if( type_overrides.null_mapping.have_string - && type_overrides.null_mapping.string_null == std::string( ( char* )kG( str_data ), str_data->n ) ){ + && type_overrides.null_mapping.string_null.length() == str_data->n + && !type_overrides.null_mapping.string_null.compare( 0, str_data->n, ( char* )kG( str_data ), str_data->n ) ){ PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); } else{ @@ -577,6 +571,7 @@ template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto str_builder = static_cast(builder); + bool is_symbol = k_array->t == KS && (datatype->id() == arrow::Type::STRING || datatype->id() == arrow::Type::LARGE_STRING); if( is_symbol ){ // Populate from symbol list for( auto i = 0ll; i < k_array->n; ++i ){ @@ -593,8 +588,9 @@ void PopulateBuilder(shared_ptr data for( auto i = 0ll; i < k_array->n; ++i ){ K str_data = kK( k_array )[i]; TYPE_CHECK_ITEM( str_data->t != KC, datatype->ToString(), KC, str_data->t ); - if( type_overrides.null_mapping.have_string - && type_overrides.null_mapping.string_null == std::string( ( char* )kG( str_data ), str_data->n ) ){ + if( type_overrides.null_mapping.have_large_string + && type_overrides.null_mapping.large_string_null.length() == str_data->n + && !type_overrides.null_mapping.large_string_null.compare( 0, str_data->n, ( char* )kG( str_data ), str_data->n ) ){ PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); } else{ From 665b2c70bced4d32fcb768c79646c0ea3f36357e Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 23 Jan 2023 17:47:37 +0300 Subject: [PATCH 019/276] Pull-request #6 changes, patch 1 https://github.com/KxSystems/arrowkdb/pull/6 
--- src/ArrayWriter.cpp | 4 ++-- src/KdbOptions.h | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index 584c77f..41fa68c 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -466,7 +466,7 @@ void PopulateBuilder(shared_ptr datatype, K if( type_overrides.null_mapping.have_int16 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.int16_null ^ kH( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.int16_null != kH( k_array )[i]; } PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )kH( k_array ), k_array->n, null_bitmap ) ); } @@ -489,7 +489,7 @@ void PopulateBuilder(shared_ptr datatype, K if( type_overrides.null_mapping.have_int32 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.int32_null ^ kI( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.int32_null != kI( k_array )[i]; } PARQUET_THROW_NOT_OK( int32_builder->AppendValues( ( int32_t* )kI( k_array ), k_array->n, null_bitmap ) ); } diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 4858b8e..95573d8 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -162,6 +162,12 @@ class KdbOptions K keys = kK( kK( dict )[index] )[0]; K values = kK( kK( dict )[index] )[1]; + if( KS != keys->t ){ + throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING keys" ); + } + if( 0 != values->t ){ + throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING values" ); + } for( auto i = 0ll; i < values->n; ++i ){ const std::string key = ToUpper( kS( keys )[i] ); if( supported_null_mapping_options.find( key ) == supported_null_mapping_options.end() ){ @@ -184,6 +190,9 @@ class KdbOptions null_mapping_options.large_string_null.assign( (char*)kC( value ), value->n ); null_mapping_options.have_large_string = true; } + else if( 101 == value->t ){ + // 
Ignore generic null, which may be used here to ensure mixed list of options + } else{ throw InvalidOption(("Unsupported KDB data type for NULL_MAPPING option '" + key + "'").c_str()); } From d7e95425e6adf91886d6f2a5a67abc499b057b8b Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 24 Jan 2023 10:32:03 +0000 Subject: [PATCH 020/276] Pull-request #6 changes, patch 2 https://github.com/KxSystems/arrowkdb/pull/6 --- src/KdbOptions.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 95573d8..cf46f3e 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -163,10 +163,10 @@ class KdbOptions K keys = kK( kK( dict )[index] )[0]; K values = kK( kK( dict )[index] )[1]; if( KS != keys->t ){ - throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING keys" ); + throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING keys (expected=11h), type=" + std::to_string( keys->t ) ); } if( 0 != values->t ){ - throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING values" ); + throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING values (extected=0), type=" + std::to_string( keys->t ) ); } for( auto i = 0ll; i < values->n; ++i ){ const std::string key = ToUpper( kS( keys )[i] ); @@ -194,7 +194,7 @@ class KdbOptions // Ignore generic null, which may be used here to ensure mixed list of options } else{ - throw InvalidOption(("Unsupported KDB data type for NULL_MAPPING option '" + key + "'").c_str()); + throw InvalidOption(("Unsupported KDB data type for NULL_MAPPING option '" + key + "', type=" + std::to_string( value->t ) ).c_str()); } } } From df81c46f6bbc8e724f512274e77d02b6798d3b71 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 24 Jan 2023 13:47:23 +0000 Subject: [PATCH 021/276] Pull-request #6 changes, patch 3 --- src/KdbOptions.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/KdbOptions.h b/src/KdbOptions.h index cf46f3e..c4bf1ae 
100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -163,10 +163,10 @@ class KdbOptions K keys = kK( kK( dict )[index] )[0]; K values = kK( kK( dict )[index] )[1]; if( KS != keys->t ){ - throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING keys (expected=11h), type=" + std::to_string( keys->t ) ); + throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING keys (expected=11h), type=" + std::to_string( keys->t ) + "h" ); } if( 0 != values->t ){ - throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING values (extected=0), type=" + std::to_string( keys->t ) ); + throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING values (extected=0h), type=" + std::to_string( keys->t ) + "h" ); } for( auto i = 0ll; i < values->n; ++i ){ const std::string key = ToUpper( kS( keys )[i] ); @@ -194,7 +194,7 @@ class KdbOptions // Ignore generic null, which may be used here to ensure mixed list of options } else{ - throw InvalidOption(("Unsupported KDB data type for NULL_MAPPING option '" + key + "', type=" + std::to_string( value->t ) ).c_str()); + throw InvalidOption(("Unsupported KDB data type for NULL_MAPPING option '" + key + "', type=" + std::to_string( value->t ) + "h" ).c_str()); } } } From b4fa280fcdbe08256f3fc33df5b9ab87acc93781 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 24 Jan 2023 14:30:41 +0000 Subject: [PATCH 022/276] Pull-request #6 changes, patch 4 --- src/KdbOptions.h | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/KdbOptions.h b/src/KdbOptions.h index c4bf1ae..509ddd5 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -6,6 +6,7 @@ #include #include #include +#include #include "k.h" @@ -56,10 +57,10 @@ namespace Options const std::string NULL_MAPPING = "NULL_MAPPING"; // Null mapping options - const std::string NM_INT_16 = "INT16"; - const std::string NM_INT_32 = "INT32"; - const std::string NM_STRING = "STRING"; - const std::string NM_LARGE_STRING = 
"LARGE_STRING"; + const std::string NM_INT_16 = "int16"; + const std::string NM_INT_32 = "int32"; + const std::string NM_STRING = "string"; + const std::string NM_LARGE_STRING = "large_string"; const static std::set int_options = { PARQUET_CHUNK_SIZE, @@ -136,6 +137,13 @@ class KdbOptions return upper; } + const std::string ToLower( std::string str ) const + { + std::transform( str.begin(), str.end(), str.begin(), ::tolower ); + + return str; + } + void PopulateIntOptions(K keys, K values) { for (auto i = 0ll; i < values->n; ++i) { @@ -169,7 +177,7 @@ class KdbOptions throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING values (extected=0h), type=" + std::to_string( keys->t ) + "h" ); } for( auto i = 0ll; i < values->n; ++i ){ - const std::string key = ToUpper( kS( keys )[i] ); + const std::string key = ToLower( kS( keys )[i] ); if( supported_null_mapping_options.find( key ) == supported_null_mapping_options.end() ){ throw InvalidOption(("Unsupported NULL_MAPPING option '" + key + "'").c_str()); } From f503c72460aec82240edb7fa63dc10f19dfbd86c Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 23 Jan 2023 19:06:33 +0300 Subject: [PATCH 023/276] Null mapping example for all supported fields --- examples/null_mapping.q | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/examples/null_mapping.q b/examples/null_mapping.q index 2b1b31c..b825751 100644 --- a/examples/null_mapping.q +++ b/examples/null_mapping.q @@ -6,11 +6,36 @@ // Create the schema // //-------------------// +// Support null mapping +short_opts:(`bool`uint8`int8`uint16`int16)!(0b;0x01;0x02;3h;4h) +long_opts:(`uint32`int32`uint64`int64)!(5i;6i;7;8) +float_opts:(`float16`float32`float64)!(9h;1.23e;4.56) +string_opts:(`string`large_string`binary`large_binary`fixed_binary)!("start";"stop";"x"$"alert";"x"$"acknowledge";0Ng) +date_opts:(`date32`date64`timestamp)!("i"$2006.07.21;"j"$2015.01.01D00:00:00.000000000;"j"$12:00:00.000000000) 
+time_opts:(`time32`time64`decimal`duration)!("i"$09:01:02.042;"j"$2015.01.01D00:00:00.000000000;"f"$7.89;"j"$12:00:00.000000000) +interval_opts:(`month_interval`day_time_interval)!("i"$2006.07m;"j"$12:00:00.000000000) + +options:(``NULL_MAPPING)!((::);short_opts,long_opts,float_opts,string_opts,date_opts,time_opts,interval_opts) + // Create the datatype identifiers -ts_dt:.arrowkdb.dt.timestamp[`nano]; -f64_dt:.arrowkdb.dt.float64[]; -i32_dt:.arrowkdb.dt.int32[]; bool_dt:.arrowkdb.dt.boolean[]; +ui8_dt:.arrowkdb.dt.uint8[]; +i8_dt:.arrowkdb.dt.int8[]; +ui16_dt:.arrowkdb.dt.uint16[]; +i16_dt:.arrowkdb.dt.int16[]; + +ui32_dt:.arrowkdb.dt.uint32[]; +i32_dt:.arrowkdb.dt.int32[]; +ui64_dt:.arrowkdb.dt.uint64[]; +i64_dt:.arrowkdb.dt.int64[]; + +f64_dt:.arrowkdb.dt.float16[]; +f64_dt:.arrowkdb.dt.float64[]; +f64_dt:.arrowkdb.dt.float64[]; + + + +ts_dt:.arrowkdb.dt.timestamp[`nano]; str_dt:.arrowkdb.dt.utf8[]; // Create the field identifiers @@ -41,10 +66,6 @@ comment_data:N?("start";"stop";"alert";"acknowledge";""); // Combine the data for all columns array_data:(tstamp_data;temp_data;fill_data;pump_data;comment_data); -// Support null mapping -null_opts:(`int16`int32`string)!(0Nh;0Ni;"start") -options:(``NULL_MAPPING)!((::);null_opts) - // Pretty print the Arrow table populated from the array data .arrowkdb.tb.prettyPrintTable[schema;array_data;options] From e3e20e72e51e0b13c7ac604362ce6539885f511d Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 24 Jan 2023 21:56:23 +0300 Subject: [PATCH 024/276] Writing parquet files for null mapping --- examples/null_mapping.q | 192 +++++++++++++++++++++++++++++++++------- 1 file changed, 160 insertions(+), 32 deletions(-) diff --git a/examples/null_mapping.q b/examples/null_mapping.q index b825751..1d8bb93 100644 --- a/examples/null_mapping.q +++ b/examples/null_mapping.q @@ -7,15 +7,18 @@ //-------------------// // Support null mapping -short_opts:(`bool`uint8`int8`uint16`int16)!(0b;0x01;0x02;3h;4h) 
-long_opts:(`uint32`int32`uint64`int64)!(5i;6i;7;8) -float_opts:(`float16`float32`float64)!(9h;1.23e;4.56) -string_opts:(`string`large_string`binary`large_binary`fixed_binary)!("start";"stop";"x"$"alert";"x"$"acknowledge";0Ng) -date_opts:(`date32`date64`timestamp)!("i"$2006.07.21;"j"$2015.01.01D00:00:00.000000000;"j"$12:00:00.000000000) -time_opts:(`time32`time64`decimal`duration)!("i"$09:01:02.042;"j"$2015.01.01D00:00:00.000000000;"f"$7.89;"j"$12:00:00.000000000) -interval_opts:(`month_interval`day_time_interval)!("i"$2006.07m;"j"$12:00:00.000000000) +short_opts:(`bool`uint8`int8`uint16`int16)!(0b;0x01;0x02;3h;4h); +long_opts:(`uint32`int32`uint64`int64)!(5i;6i;7;8); +float_opts:(`float16`float32`float64)!(9h;1.23e;4.56); +//string_opts:(`string`large_string`binary`large_binary`fixed_binary)!("start";"stop";"x"$"alert";"x"$"acknowledge";0Ng); +string_opts:(`string`large_string`binary`large_binary)!("start";"stop";"alert";"acknowledge"); +date_opts:(`date32`date64`timestamp)!("i"$2006.07.21;"j"$2015.01.01D00:00:00.000000000;"j"$12:00:00.000000000); +//time_opts:(`time32`time64`decimal)!("i"$09:01:02.042;"j"$2015.01.01D00:00:00.000000000;"f"$7.89); +dur_opts:(`duration`month_interval`day_time_interval)!("j"$12:00:00.000000000;"i"$2006.07m;"j"$12:00:00.000000000); -options:(``NULL_MAPPING)!((::);short_opts,long_opts,float_opts,string_opts,date_opts,time_opts,interval_opts) +options:(``NULL_MAPPING)!((::);short_opts,long_opts,float_opts,string_opts,date_opts,time_opts,interval_opts); + +ts_dt:.arrowkdb.dt.timestamp[`nano]; // Create the datatype identifiers bool_dt:.arrowkdb.dt.boolean[]; @@ -29,45 +32,170 @@ i32_dt:.arrowkdb.dt.int32[]; ui64_dt:.arrowkdb.dt.uint64[]; i64_dt:.arrowkdb.dt.int64[]; -f64_dt:.arrowkdb.dt.float16[]; -f64_dt:.arrowkdb.dt.float64[]; +f16_dt:.arrowkdb.dt.float16[]; +f32_dt:.arrowkdb.dt.float32[]; f64_dt:.arrowkdb.dt.float64[]; - - -ts_dt:.arrowkdb.dt.timestamp[`nano]; str_dt:.arrowkdb.dt.utf8[]; +lstr_dt:.arrowkdb.dt.large_utf8[]; 
+bin_dt:.arrowkdb.dt.binary[]; +lbin_dt:.arrowkdb.dt.large_binary[]; +fbin_dt:.arrowkdb.dt.fixed_size_binary[2i]; -// Create the field identifiers -tstamp_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; -temp_fd:.arrowkdb.fd.field[`temperature;f64_dt]; -fill_fd:.arrowkdb.fd.field[`fill_level;i32_dt]; -pump_fd:.arrowkdb.fd.field[`pump_status;bool_dt]; -comment_fd:.arrowkdb.fd.field[`comment;str_dt]; +d32_dt:.arrowkdb.dt.date32[]; +d64_dt:.arrowkdb.dt.date64[]; +tstamp_dt:.arrowkdb.dt.timestamp[`nano]; -// Create the schema for the list of fields -schema:.arrowkdb.sc.schema[(tstamp_fd,temp_fd,fill_fd,pump_fd,comment_fd)]; +t32_dt:.arrowkdb.dt.time32[`milli]; +t64_dt:.arrowkdb.dt.time64[`nano]; +dec_dt:.arrowkdb.dt.decimal128[38i;2i]; +dur_dt:.arrowkdb.dt.duration[`milli]; -// Print the schema -.arrowkdb.sc.printSchema[schema] +mint_dt:.arrowkdb.dt.month_interval[]; +dtint_dt:.arrowkdb.dt.day_time_interval[]; + +// Create the field identifiers +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +bool_fd:.arrowkdb.fd.field[`bool;bool_dt]; +ui8_fd:.arrowkdb.fd.field[`uint8;ui8_dt]; +i8_fd:.arrowkdb.fd.field[`int8;i8_dt]; +ui16_fd:.arrowkdb.fd.field[`uint16;ui16_dt]; +i16_fd:.arrowkdb.fd.field[`int16;i16_dt]; + +ui32_fd:.arrowkdb.fd.field[`uint32;ui32_dt]; +i32_fd:.arrowkdb.fd.field[`int32;i32_dt]; +ui64_fd:.arrowkdb.fd.field[`uint64;ui64_dt]; +i64_fd:.arrowkdb.fd.field[`int64;i64_dt]; + +f16_fd:.arrowkdb.fd.field[`float16;f16_dt]; +f32_fd:.arrowkdb.fd.field[`float32;f32_dt]; +f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; + +str_fd:.arrowkdb.fd.field[`string;str_dt]; +lstr_fd:.arrowkdb.fd.field[`long_string;lstr_dt]; +bin_fd:.arrowkdb.fd.field[`binary;bin_dt]; +lbin_fd:.arrowkdb.fd.field[`long_binary;lbin_dt]; +fbin_fd:.arrowkdb.fd.field[`fixed_binary;fbin_dt]; + +d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; +d64_fd:.arrowkdb.fd.field[`date64;d64_dt]; +tstamp_fd:.arrowkdb.fd.field[`timestamp;tstamp_dt]; + +t32_fd:.arrowkdb.fd.field[`time32;t32_dt]; +t64_fd:.arrowkdb.fd.field[`time64;t64_dt]; 
+dec_fd:.arrowkdb.fd.field[`decimal;dec_dt]; + +dur_fd:.arrowkdb.fd.field[`duration;dur_dt]; +mint_fd:.arrowkdb.fd.field[`month_interval;mint_dt]; +dtint_fd:.arrowkdb.fd.field[`day_time_interval;dtint_dt]; + +// Create the schemas for the list of fields +short_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,ui8_fd,i8_fd,ui16_fd,i16_fd)]; +long_schema:.arrowkdb.sc.schema[(ts_fd,ui32_fd,i32_fd,ui64_fd,i64_fd)]; +float_schema:.arrowkdb.sc.schema[(ts_fd,f16_fd,f32_fd,f64_fd)]; +str_schema:.arrowkdb.sc.schema[(ts_fd,str_fd,lstr_fd,bin_fd,lbin_fd,fbin_fd)]; +date_schema:.arrowkdb.sc.schema[(ts_fd,d32_fd,d64_fd,tstamp_fd)]; +time_schema:.arrowkdb.sc.schema[(ts_fd,t32_fd,t64_fd,dec_fd)]; +dur_schema:.arrowkdb.sc.schema[ts_fd,dur_fd,mint_fd,dtint_fd] + +// Print the schemas +.arrowkdb.sc.printSchema[short_schema] +.arrowkdb.sc.printSchema[long_schema] +.arrowkdb.sc.printSchema[float_schema] +.arrowkdb.sc.printSchema[string_schema] +.arrowkdb.sc.printSchema[date_schema] +.arrowkdb.sc.printSchema[time_schema] +.arrowkdb.sc.printSchema[dur_schema] //-----------------------// // Create the array data // //-----------------------// // Create data for each column in the table -tstamp_data:asc N?0p; -temp_data:N?100f; -fill_data:N?100i; -fill_data[0]:0Ni -pump_data:N?0b; -comment_data:N?("start";"stop";"alert";"acknowledge";""); +ts_data:asc N?0p; + +bool_data:N?(0b;1b); +bool_data[0]:0b; +ui8_data:N?0x64; +ui8_data[1]:0x01; +i8_data:N?0x64; +i8_data[2]:0x02; +ui16_data:N?100h; +ui16_data[3]:3h; +i16_data:N?100h; +i16_data[4]:4h; + +ui32_data:N?100i; +ui32_data[0]:5i; +i32_data:N?100i; +i32_data[1]:6i; +ui64_data:N?100; +ui64_data[2]:7; +i64_data:N?100; +i64_data[3]:8; + +f16_data:N?100h; +f16_data[0]:9h; +f32_data:N?100e; +f32_data[1]:1.23e; +f64_data:N?100f; +f64_data[2]:4.56f; + +str_data:N?("start";"stop";"alert";"acknowledge";""); +str_data[0]:"start" +lstr_data:N?("start";"stop";"alert";"acknowledge";""); +lstr_data[1]:"stop" 
+bin_data:N?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); +bin_data[2]:"x"$"alert" +lbin_data:N?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); +lbin_data[3]:"x"$"acknowledge" +fbin_data:N?("x"$"0123456789"); +fbin_data[4]:"x"$"5" + +d32_data:N?("i"$2006.07.21;"i"$2008.07.18;"i"$2012.07.16;"i"$2014.07.15;"i"$2016.07.11); +d32_data[1]:"i"$2006.07.21; +d64_data:N?("j"$2015.01.01D00:00:00.000000000;"j"$2017.01.01D00:00:00.000000000;"j"$2018.01.01D00:00:00.000000000;"j"$2019.01.01D00:00:00.000000000;"j"$2020.01.01D00:00:00.000000000); +d64_data[2]:"j"$2015.01.01D00:00:00.000000000; +tstamp_data:N?("j"$12:00:00.000000000;"j"$13:00:00.000000000;"j"$14:00:00.000000000;"j"$15:00:00.000000000;"j"$16:00:00.000000000); +tstamp_data[3]:"j"$12:00:00.000000000; + +t32_data:N?("i"$09:01:02.042;"i"$08:01:02.042;"i"$07:01:02.042;"i"$06:01:02.042;"i"$05:01:02.042); +t32_data[0]:"i"$09:01:02.042; +t64_data:N?("j"$2015.01.01D00:00:00.000000000;"j"$2016.01.01D00:00:00.000000000;"j"$2017.01.01D00:00:00.000000000;"j"$2018.01.01D00:00:00.000000000;"j"$2019.01.01D00:00:00.000000000); +t64_data[1]:"j"$2015.01.01D00:00:00.000000000; +dec_data:N?(10f); +dec_data[2]:7.89f + +dur_data:N?("j"$12:00:00.000000000;"j"$13:00:00.000000000;"j"$14:00:00.000000000;"j"$15:00:00.000000000;"j"$16:00:00.000000000); +dur_data[0]:"j"$12:00:00.000000000 +mint_data:N?("i"$2006.07m;"i"$2006.06m;"i"$2006.05m;"i"$2006.04m;"i"$2006.03m); +mint_data[1]:"i"$2006.07m; +dtint_data:N?("j"$12:00:00.000000000;"j"$11:00:00.000000000;"j"$10:00:00.000000000;"j"$09:00:00.000000000;"j"$08:00:00.000000000); +dtint_data[2]:"j"$12:00:00.000000000; // Combine the data for all columns -array_data:(tstamp_data;temp_data;fill_data;pump_data;comment_data); +short_data:(ts_data;bool_data;ui8_data;i8_data;ui16_data;i16_data); +long_data:(ts_data;ui32_data;i32_data;ui64_data;i64_data); +float_data:(ts_data;f16_data;f32_data;f64_data); +str_data:(ts_data;str_data;lstr_data;bin_data;lbin_data;fbin_data); 
+date_data:(ts_data;d32_data;d64_data;tstamp_data); +time_data:(ts_data;t32_data;t64_data;dec_data) +dur_data:(ts_data;dur_data;mint_data;dtint_data) // Pretty print the Arrow table populated from the array data -.arrowkdb.tb.prettyPrintTable[schema;array_data;options] +.arrowkdb.tb.prettyPrintTable[short_schema;short_data;options] +.arrowkdb.tb.prettyPrintTable[long_schema;long_data;options] +.arrowkdb.tb.prettyPrintTable[float_schema;float_data;options] +.arrowkdb.tb.prettyPrintTable[str_schema;str_data;options] +.arrowkdb.tb.prettyPrintTable[date_schema;date_data;options] +.arrowkdb.tb.prettyPrintTable[time_schema;time_data;options] +.arrowkdb.tb.prettyPrintTable[dur_schema;dur_data;options] options[`PARQUET_VERSION]:`V2.0 -.arrowkdb.pq.writeParquet["null_mapping.parquet";schema;array_data;options] +.arrowkdb.pq.writeParquet["null_mapping_short.parquet";short_schema;short_data;options] +.arrowkdb.pq.writeParquet["null_mapping_long.parquet";long_schema;long_data;options] +.arrowkdb.pq.writeParquet["null_mapping_float.parquet";float_schema;float_data;options] +.arrowkdb.pq.writeParquet["null_mapping_str.parquet";str_schema;str_data;options] +.arrowkdb.pq.writeParquet["null_mapping_date.parquet";date_schema;date_data;options] +.arrowkdb.pq.writeParquet["null_mapping_time.parquet";time_schema;time_data;options] From 768ed3effe1884b75993f3280e65b9466c1b5206 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 25 Jan 2023 19:59:38 +0300 Subject: [PATCH 025/276] Other fields removed from parquet --- examples/null_mapping.q | 105 ++++++++++++++++++---------------------- 1 file changed, 48 insertions(+), 57 deletions(-) diff --git a/examples/null_mapping.q b/examples/null_mapping.q index 1d8bb93..629672c 100644 --- a/examples/null_mapping.q +++ b/examples/null_mapping.q @@ -9,14 +9,12 @@ // Support null mapping short_opts:(`bool`uint8`int8`uint16`int16)!(0b;0x01;0x02;3h;4h); long_opts:(`uint32`int32`uint64`int64)!(5i;6i;7;8); 
-float_opts:(`float16`float32`float64)!(9h;1.23e;4.56); -//string_opts:(`string`large_string`binary`large_binary`fixed_binary)!("start";"stop";"x"$"alert";"x"$"acknowledge";0Ng); -string_opts:(`string`large_string`binary`large_binary)!("start";"stop";"alert";"acknowledge"); -date_opts:(`date32`date64`timestamp)!("i"$2006.07.21;"j"$2015.01.01D00:00:00.000000000;"j"$12:00:00.000000000); -//time_opts:(`time32`time64`decimal)!("i"$09:01:02.042;"j"$2015.01.01D00:00:00.000000000;"f"$7.89); -dur_opts:(`duration`month_interval`day_time_interval)!("j"$12:00:00.000000000;"i"$2006.07m;"j"$12:00:00.000000000); +float_opts:(`float32`float64`decimal)!(1.23e;4.56;7.89); +str_opts:(`string`large_string`binary`large_binary`fixed_binary)!("start";"stop";"x"$"alert";"x"$"acknowledge";0Ng); +time_opts:(`date32`date64`timestamp`time64`duration)!("i"$2006.07.21;"j"$2015.01.01D00:00:00.000000000;"j"$2011.01.01D00:00:00.000000000;"j"$12:00:00.000000000;"j"$12:00:00.000000000); +other_opts:(`float16`time32`month_interval`day_time_interval)!(9h;"i"$09:01:02.042;"i"$2006.07m;"j"$12:00:00.000000000); -options:(``NULL_MAPPING)!((::);short_opts,long_opts,float_opts,string_opts,date_opts,time_opts,interval_opts); +options:(``NULL_MAPPING)!((::);short_opts,long_opts,float_opts,str_opts,time_opts,other_opts); ts_dt:.arrowkdb.dt.timestamp[`nano]; @@ -32,25 +30,24 @@ i32_dt:.arrowkdb.dt.int32[]; ui64_dt:.arrowkdb.dt.uint64[]; i64_dt:.arrowkdb.dt.int64[]; -f16_dt:.arrowkdb.dt.float16[]; f32_dt:.arrowkdb.dt.float32[]; f64_dt:.arrowkdb.dt.float64[]; +dec_dt:.arrowkdb.dt.decimal128[38i;2i]; str_dt:.arrowkdb.dt.utf8[]; lstr_dt:.arrowkdb.dt.large_utf8[]; bin_dt:.arrowkdb.dt.binary[]; lbin_dt:.arrowkdb.dt.large_binary[]; -fbin_dt:.arrowkdb.dt.fixed_size_binary[2i]; +fbin_dt:.arrowkdb.dt.fixed_size_binary[16i]; d32_dt:.arrowkdb.dt.date32[]; d64_dt:.arrowkdb.dt.date64[]; tstamp_dt:.arrowkdb.dt.timestamp[`nano]; - -t32_dt:.arrowkdb.dt.time32[`milli]; t64_dt:.arrowkdb.dt.time64[`nano]; 
-dec_dt:.arrowkdb.dt.decimal128[38i;2i]; dur_dt:.arrowkdb.dt.duration[`milli]; +f16_dt:.arrowkdb.dt.float16[]; +t32_dt:.arrowkdb.dt.time32[`milli]; mint_dt:.arrowkdb.dt.month_interval[]; dtint_dt:.arrowkdb.dt.day_time_interval[]; @@ -68,9 +65,9 @@ i32_fd:.arrowkdb.fd.field[`int32;i32_dt]; ui64_fd:.arrowkdb.fd.field[`uint64;ui64_dt]; i64_fd:.arrowkdb.fd.field[`int64;i64_dt]; -f16_fd:.arrowkdb.fd.field[`float16;f16_dt]; f32_fd:.arrowkdb.fd.field[`float32;f32_dt]; f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; +dec_fd:.arrowkdb.fd.field[`decimal;dec_dt]; str_fd:.arrowkdb.fd.field[`string;str_dt]; lstr_fd:.arrowkdb.fd.field[`long_string;lstr_dt]; @@ -81,32 +78,29 @@ fbin_fd:.arrowkdb.fd.field[`fixed_binary;fbin_dt]; d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; d64_fd:.arrowkdb.fd.field[`date64;d64_dt]; tstamp_fd:.arrowkdb.fd.field[`timestamp;tstamp_dt]; - -t32_fd:.arrowkdb.fd.field[`time32;t32_dt]; t64_fd:.arrowkdb.fd.field[`time64;t64_dt]; -dec_fd:.arrowkdb.fd.field[`decimal;dec_dt]; - dur_fd:.arrowkdb.fd.field[`duration;dur_dt]; + +f16_fd:.arrowkdb.fd.field[`float16;f16_dt]; +t32_fd:.arrowkdb.fd.field[`time32;t32_dt]; mint_fd:.arrowkdb.fd.field[`month_interval;mint_dt]; dtint_fd:.arrowkdb.fd.field[`day_time_interval;dtint_dt]; // Create the schemas for the list of fields short_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,ui8_fd,i8_fd,ui16_fd,i16_fd)]; long_schema:.arrowkdb.sc.schema[(ts_fd,ui32_fd,i32_fd,ui64_fd,i64_fd)]; -float_schema:.arrowkdb.sc.schema[(ts_fd,f16_fd,f32_fd,f64_fd)]; +float_schema:.arrowkdb.sc.schema[(ts_fd,f32_fd,f64_fd,dec_fd)]; str_schema:.arrowkdb.sc.schema[(ts_fd,str_fd,lstr_fd,bin_fd,lbin_fd,fbin_fd)]; -date_schema:.arrowkdb.sc.schema[(ts_fd,d32_fd,d64_fd,tstamp_fd)]; -time_schema:.arrowkdb.sc.schema[(ts_fd,t32_fd,t64_fd,dec_fd)]; -dur_schema:.arrowkdb.sc.schema[ts_fd,dur_fd,mint_fd,dtint_fd] +time_schema:.arrowkdb.sc.schema[(ts_fd,d32_fd,d64_fd,tstamp_fd,t64_fd,dur_fd)]; +other_schema:.arrowkdb.sc.schema[(ts_fd,f16_fd,t32_fd,mint_fd,dtint_fd)]; // 
Print the schemas .arrowkdb.sc.printSchema[short_schema] .arrowkdb.sc.printSchema[long_schema] .arrowkdb.sc.printSchema[float_schema] -.arrowkdb.sc.printSchema[string_schema] -.arrowkdb.sc.printSchema[date_schema] +.arrowkdb.sc.printSchema[str_schema] .arrowkdb.sc.printSchema[time_schema] -.arrowkdb.sc.printSchema[dur_schema] +.arrowkdb.sc.printSchema[other_schema] //-----------------------// // Create the array data // @@ -135,12 +129,12 @@ ui64_data[2]:7; i64_data:N?100; i64_data[3]:8; -f16_data:N?100h; -f16_data[0]:9h; f32_data:N?100e; -f32_data[1]:1.23e; +f32_data[0]:1.23e; f64_data:N?100f; -f64_data[2]:4.56f; +f64_data[1]:4.56f; +dec_data:N?(10f); +dec_data[2]:7.89f str_data:N?("start";"stop";"alert";"acknowledge";""); str_data[0]:"start" @@ -150,52 +144,49 @@ bin_data:N?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); bin_data[2]:"x"$"alert" lbin_data:N?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); lbin_data[3]:"x"$"acknowledge" -fbin_data:N?("x"$"0123456789"); -fbin_data[4]:"x"$"5" - -d32_data:N?("i"$2006.07.21;"i"$2008.07.18;"i"$2012.07.16;"i"$2014.07.15;"i"$2016.07.11); -d32_data[1]:"i"$2006.07.21; -d64_data:N?("j"$2015.01.01D00:00:00.000000000;"j"$2017.01.01D00:00:00.000000000;"j"$2018.01.01D00:00:00.000000000;"j"$2019.01.01D00:00:00.000000000;"j"$2020.01.01D00:00:00.000000000); -d64_data[2]:"j"$2015.01.01D00:00:00.000000000; -tstamp_data:N?("j"$12:00:00.000000000;"j"$13:00:00.000000000;"j"$14:00:00.000000000;"j"$15:00:00.000000000;"j"$16:00:00.000000000); -tstamp_data[3]:"j"$12:00:00.000000000; - -t32_data:N?("i"$09:01:02.042;"i"$08:01:02.042;"i"$07:01:02.042;"i"$06:01:02.042;"i"$05:01:02.042); -t32_data[0]:"i"$09:01:02.042; -t64_data:N?("j"$2015.01.01D00:00:00.000000000;"j"$2016.01.01D00:00:00.000000000;"j"$2017.01.01D00:00:00.000000000;"j"$2018.01.01D00:00:00.000000000;"j"$2019.01.01D00:00:00.000000000); -t64_data[1]:"j"$2015.01.01D00:00:00.000000000; -dec_data:N?(10f); -dec_data[2]:7.89f +fbin_data:N?0Ng; 
+fbin_data[4]:0Ng; + +d32_data:N?(2006.07.21;2008.07.18;2012.07.16;2014.07.15;2016.07.11); +d32_data[0]:2006.07.21; +d64_data:N?(2015.01.01D00:00:00.000000000;2017.01.01D00:00:00.000000000;2018.01.01D00:00:00.000000000;2019.01.01D00:00:00.000000000;2020.01.01D00:00:00.000000000); +d64_data[1]:2015.01.01D00:00:00.000000000; +tstamp_data:N?(2015.01.01D00:00:00.000000000;2014.01.01D00:00:00.000000000;2013.01.01D00:00:00.000000000;2012.01.01D00:00:00.000000000;2011.01.01D00:00:00.000000000); +tstamp_data[2]:2011.01.01D00:00:00.000000000; +t64_data:N?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); +t64_data[3]:12:00:00.000000000; +dur_data:N?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); +dur_data[4]:12:00:00.000000000; -dur_data:N?("j"$12:00:00.000000000;"j"$13:00:00.000000000;"j"$14:00:00.000000000;"j"$15:00:00.000000000;"j"$16:00:00.000000000); -dur_data[0]:"j"$12:00:00.000000000 -mint_data:N?("i"$2006.07m;"i"$2006.06m;"i"$2006.05m;"i"$2006.04m;"i"$2006.03m); -mint_data[1]:"i"$2006.07m; -dtint_data:N?("j"$12:00:00.000000000;"j"$11:00:00.000000000;"j"$10:00:00.000000000;"j"$09:00:00.000000000;"j"$08:00:00.000000000); -dtint_data[2]:"j"$12:00:00.000000000; +f16_data:N?100h; +f16_data[0]:9h; +t32_data:N?(09:01:02.042;08:01:02.042;07:01:02.042;06:01:02.042;05:01:02.042); +t32_data[1]:09:01:02.042; +mint_data:N?(2006.07m;2006.06m;2006.05m;2006.04m;2006.03m); +mint_data[2]:2006.07m; +dtint_data:N?(12:00:00.000000000;11:00:00.000000000;10:00:00.000000000;09:00:00.000000000;08:00:00.000000000); +dtint_data[3]:12:00:00.000000000; // Combine the data for all columns short_data:(ts_data;bool_data;ui8_data;i8_data;ui16_data;i16_data); long_data:(ts_data;ui32_data;i32_data;ui64_data;i64_data); -float_data:(ts_data;f16_data;f32_data;f64_data); +float_data:(ts_data;f32_data;f64_data;dec_data); str_data:(ts_data;str_data;lstr_data;bin_data;lbin_data;fbin_data); 
-date_data:(ts_data;d32_data;d64_data;tstamp_data); -time_data:(ts_data;t32_data;t64_data;dec_data) -dur_data:(ts_data;dur_data;mint_data;dtint_data) +time_data:(ts_data;d32_data;d64_data;tstamp_data;t64_data;dur_data) +other_data:(ts_data;f16_data;t32_data;mint_data;dtint_data) // Pretty print the Arrow table populated from the array data +options[`DECIMAL128_AS_DOUBLE]:1 .arrowkdb.tb.prettyPrintTable[short_schema;short_data;options] .arrowkdb.tb.prettyPrintTable[long_schema;long_data;options] .arrowkdb.tb.prettyPrintTable[float_schema;float_data;options] .arrowkdb.tb.prettyPrintTable[str_schema;str_data;options] -.arrowkdb.tb.prettyPrintTable[date_schema;date_data;options] .arrowkdb.tb.prettyPrintTable[time_schema;time_data;options] -.arrowkdb.tb.prettyPrintTable[dur_schema;dur_data;options] +.arrowkdb.tb.prettyPrintTable[other_schema;other_data;options] options[`PARQUET_VERSION]:`V2.0 .arrowkdb.pq.writeParquet["null_mapping_short.parquet";short_schema;short_data;options] .arrowkdb.pq.writeParquet["null_mapping_long.parquet";long_schema;long_data;options] .arrowkdb.pq.writeParquet["null_mapping_float.parquet";float_schema;float_data;options] .arrowkdb.pq.writeParquet["null_mapping_str.parquet";str_schema;str_data;options] -.arrowkdb.pq.writeParquet["null_mapping_date.parquet";date_schema;date_data;options] .arrowkdb.pq.writeParquet["null_mapping_time.parquet";time_schema;time_data;options] From 99261e1e4f35259dcdd480baa19636064da1f862 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 19 Jan 2023 21:37:36 +0300 Subject: [PATCH 026/276] Propagate null mapping through supported types --- src/ArrayWriter.cpp | 185 ++++++++++++++++++++++--- src/KdbOptions.h | 321 ++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 460 insertions(+), 46 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index 41fa68c..c23277f 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -1,7 +1,9 @@ #include +#include #include #include #include 
+#include #include #include @@ -20,6 +22,18 @@ using namespace kx::arrowkdb; namespace { +//! Compares floating point numbers, because of unreliable direct compare +//! @param lhs - left-hand side value +//! @param rhs - right-hand side value +//! @return true if values are nearby +template +bool is_equal( T lhs, T rhs ) +{ + static const T epsilon = 2 * std::numeric_limits::epsilon(); + + return ::fabs(lhs -= rhs) <= epsilon; +} + shared_ptr GetBuilder(shared_ptr datatype); template @@ -434,29 +448,64 @@ template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto bool_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(bool_builder->AppendValues((uint8_t*)kG(k_array), k_array->n)); + if( type_overrides.null_mapping.have_uint8 ){ + std::vector null_bitmap( k_array->n ); + for( auto i = 0ll; i < k_array->n; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.uint8_null ^ kG( k_array )[i]; + } + PARQUET_THROW_NOT_OK( bool_builder->AppendValues( ( uint8_t* )kG( k_array ), k_array->n, null_bitmap ) ); + } + else { + PARQUET_THROW_NOT_OK(bool_builder->AppendValues((uint8_t*)kG(k_array), k_array->n)); + } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto uint8_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(uint8_builder->AppendValues((uint8_t*)kG(k_array), k_array->n)); + if( type_overrides.null_mapping.have_uint8 ){ + std::vector null_bitmap( k_array->n ); + for( auto i = 0ll; i < k_array->n; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.uint8_null ^ kG( k_array )[i]; + } + PARQUET_THROW_NOT_OK( uint8_builder->AppendValues( ( uint8_t* )kG( k_array ), k_array->n, null_bitmap ) ); + } + else { + PARQUET_THROW_NOT_OK(uint8_builder->AppendValues((uint8_t*)kG(k_array), k_array->n)); + } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, 
TypeMappingOverride& type_overrides) { auto int8_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(int8_builder->AppendValues((int8_t*)kG(k_array), k_array->n)); + if( type_overrides.null_mapping.have_int8 ){ + std::vector null_bitmap( k_array->n ); + for( auto i = 0ll; i < k_array->n; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.int8_null ^ kG( k_array )[i]; + } + PARQUET_THROW_NOT_OK( int8_builder->AppendValues( ( int8_t* )kG( k_array ), k_array->n, null_bitmap ) ); + } + else { + PARQUET_THROW_NOT_OK(int8_builder->AppendValues((int8_t*)kG(k_array), k_array->n)); + } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto uint16_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(uint16_builder->AppendValues((uint16_t*)kH(k_array), k_array->n)); - arrow::Status s; + if( type_overrides.null_mapping.have_uint16 ){ + std::vector null_bitmap( k_array->n ); + for( auto i = 0ll; i < k_array->n; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.uint16_null ^ kH( k_array )[i]; + } + PARQUET_THROW_NOT_OK( uint16_builder->AppendValues( ( uint16_t* )kH( k_array ), k_array->n, null_bitmap ) ); + } + else { + PARQUET_THROW_NOT_OK(uint16_builder->AppendValues((uint16_t*)kH(k_array), k_array->n)); + } } template<> @@ -479,7 +528,16 @@ template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto uint32_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(uint32_builder->AppendValues((uint32_t*)kI(k_array), k_array->n)); + if( type_overrides.null_mapping.have_uint32 ){ + std::vector null_bitmap( k_array->n ); + for( auto i = 0ll; i < k_array->n; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.uint32_null ^ kI( k_array )[i]; + } + PARQUET_THROW_NOT_OK( uint32_builder->AppendValues( ( uint32_t* )kI( k_array ), k_array->n, null_bitmap ) ); + } + else{ + 
PARQUET_THROW_NOT_OK(uint32_builder->AppendValues((uint32_t*)kI(k_array), k_array->n)); + } } template<> @@ -502,36 +560,80 @@ template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto uint64_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(uint64_builder->AppendValues((uint64_t*)kJ(k_array), k_array->n)); + if( type_overrides.null_mapping.have_uint64 ){ + std::vector null_bitmap( k_array->n ); + for( auto i = 0ll; i < k_array->n; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.uint64_null ^ kI( k_array )[i]; + } + PARQUET_THROW_NOT_OK( uint64_builder->AppendValues( ( uint64_t* )kI( k_array ), k_array->n, null_bitmap ) ); + } + else{ + PARQUET_THROW_NOT_OK(uint64_builder->AppendValues((uint64_t*)kJ(k_array), k_array->n)); + } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto int64_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(int64_builder->AppendValues((int64_t*)kJ(k_array), k_array->n)); + if( type_overrides.null_mapping.have_int64 ){ + std::vector null_bitmap( k_array->n ); + for( auto i = 0ll; i < k_array->n; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.int64_null ^ kI( k_array )[i]; + } + PARQUET_THROW_NOT_OK( int64_builder->AppendValues( ( int64_t* )kI( k_array ), k_array->n, null_bitmap ) ); + } + else{ + PARQUET_THROW_NOT_OK(int64_builder->AppendValues((int64_t*)kJ(k_array), k_array->n)); + } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { - arrow::HalfFloatType hft; auto hfl_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(hfl_builder->AppendValues((uint16_t*)kH(k_array), k_array->n)); + if( type_overrides.null_mapping.have_float16 ){ + std::vector null_bitmap( k_array->n ); + for( auto i = 0ll; i < k_array->n; ++i ){ + null_bitmap[i] = 
type_overrides.null_mapping.float16_null ^ kH( k_array )[i]; + } + PARQUET_THROW_NOT_OK( hfl_builder->AppendValues( ( uint16_t* )kH( k_array ), k_array->n, null_bitmap ) ); + } + else { + PARQUET_THROW_NOT_OK(hfl_builder->AppendValues((uint16_t*)kH(k_array), k_array->n)); + } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto fl_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(fl_builder->AppendValues(kE(k_array), k_array->n)); + if( type_overrides.null_mapping.have_float32 ){ + std::vector null_bitmap( k_array->n ); + for( auto i = 0ll; i < k_array->n; ++i ){ + null_bitmap[i] = is_equal( type_overrides.null_mapping.float32_null, kE( k_array )[i] ); + } + PARQUET_THROW_NOT_OK( fl_builder->AppendValues( kE( k_array ), k_array->n, null_bitmap ) ); + } + else { + PARQUET_THROW_NOT_OK(fl_builder->AppendValues(kE(k_array), k_array->n)); + } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto dbl_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(dbl_builder->AppendValues(kF(k_array), k_array->n)); + if( type_overrides.null_mapping.have_float64 ){ + std::vector null_bitmap( k_array->n ); + for( auto i = 0ll; i < k_array->n; ++i ){ + null_bitmap[i] = is_equal( type_overrides.null_mapping.float64_null, kF( k_array )[i] ); + } + PARQUET_THROW_NOT_OK( dbl_builder->AppendValues( kF( k_array ), k_array->n, null_bitmap ) ); + } + else { + PARQUET_THROW_NOT_OK(dbl_builder->AppendValues(kF(k_array), k_array->n)); + } } template<> @@ -607,7 +709,14 @@ void PopulateBuilder(shared_ptr datatype, for (auto i = 0; i < k_array->n; ++i) { K bin_data = kK(k_array)[i]; TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); - PARQUET_THROW_NOT_OK(bin_builder->Append(kG(bin_data), bin_data->n)); + if( type_overrides.null_mapping.have_binary + && 
type_overrides.null_mapping.binary_null.length() == bin_data->n + && !type_overrides.null_mapping.binary_null.compare( 0, bin_data->n, kG( bin_data ), bin_data->n ) ){ + PARQUET_THROW_NOT_OK( bin_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK(bin_builder->Append(kG(bin_data), bin_data->n)); + } } } @@ -618,7 +727,14 @@ void PopulateBuilder(shared_ptr data for (auto i = 0; i < k_array->n; ++i) { K bin_data = kK(k_array)[i]; TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); - PARQUET_THROW_NOT_OK(bin_builder->Append(kG(bin_data), bin_data->n)); + if( type_overrides.null_mapping.have_large_binary + && type_overrides.null_mapping.large_binary_null.length() == bin_data->n + && !type_overrides.null_mapping.large_binary_null.compare( 0, bin_data->n, kG( bin_data ), bin_data->n ) ){ + PARQUET_THROW_NOT_OK( bin_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK(bin_builder->Append(kG(bin_data), bin_data->n)); + } } } @@ -645,8 +761,15 @@ void PopulateBuilder(shared_ptr datatype, { TemporalConversion tc(datatype); auto d32_builder = static_cast(builder); - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(d32_builder->Append(tc.KdbToArrow(kI(k_array)[i]))); + for (auto i = 0; i < k_array->n; ++i){ + if( type_overrides.null_mapping.have_date32 + && type_overrides.null_mapping.date32_null == kI( k_array )[i] ){ + PARQUET_THROW_NOT_OK( d32_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK(d32_builder->Append(tc.KdbToArrow(kI(k_array)[i]))); + } + } } template<> @@ -655,7 +778,13 @@ void PopulateBuilder(shared_ptr datatype, TemporalConversion tc(datatype); auto d64_builder = static_cast(builder); for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(d64_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); + if( type_overrides.null_mapping.have_date64 + && type_overrides.null_mapping.date64_null == kJ( k_array )[i] ){ + PARQUET_THROW_NOT_OK( d64_builder->AppendNull() ); + } + else{ + 
PARQUET_THROW_NOT_OK(d64_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); + } } template<> @@ -725,15 +854,31 @@ template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto month_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(month_builder->AppendValues((int32_t*)kI(k_array), k_array->n)); + if( type_overrides.null_mapping.have_month_interval ){ + std::vector null_bitmap( k_array->n ); + for( auto i = 0ll; i < k_array->n; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.month_interval_null ^ kI( k_array )[i]; + } + PARQUET_THROW_NOT_OK( month_builder->AppendValues( ( int32_t* )kI( k_array ), k_array->n, null_bitmap ) ); + } + else{ + PARQUET_THROW_NOT_OK(month_builder->AppendValues((int32_t*)kI(k_array), k_array->n)); + } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto dt_builder = static_cast(builder); - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(dt_builder->Append(KTimespan_DayTimeInterval(kJ(k_array)[i]))); + for (auto i = 0; i < k_array->n; ++i){ + if( type_overrides.null_mapping.have_day_time_interval + && type_overrides.null_mapping.day_time_interval_null == kJ( k_array )[i] ){ + PARQUET_THROW_NOT_OK( dt_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK(dt_builder->Append(KTimespan_DayTimeInterval(kJ(k_array)[i]))); + } + } } template<> diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 509ddd5..5118e49 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -23,7 +23,7 @@ constexpr auto toUType( E enumerator ) noexcept template< typename E > struct ETraits { - using Names = std::map< E, std::string >; + using Names = std::map; static std::string name( E enumerator ) { @@ -33,7 +33,7 @@ struct ETraits return it->second; } - return "unknown"; + return "UNKNOWN"; } static std::string name( int index ) { return name( static_cast( index ) ); } @@ 
-57,10 +57,27 @@ namespace Options const std::string NULL_MAPPING = "NULL_MAPPING"; // Null mapping options + const std::string NM_NA = "na"; + const std::string NM_BOOLEAN = "boolean"; + const std::string NM_UINT_8 = "uint8"; + const std::string NM_INT_8 = "int8"; + const std::string NM_UINT_16 = "uint16"; const std::string NM_INT_16 = "int16"; + const std::string NM_UINT_32 = "uint32"; const std::string NM_INT_32 = "int32"; + const std::string NM_UINT_64 = "uint64"; + const std::string NM_INT_64 = "int64"; + const std::string NM_FLOAT_16 = "float16"; + const std::string NM_FLOAT_32 = "float32"; + const std::string NM_FLOAT_64 = "float64"; const std::string NM_STRING = "string"; const std::string NM_LARGE_STRING = "large_string"; + const std::string NM_BINARY = "binary"; + const std::string NM_LARGE_BINARY = "large_binary"; + const std::string NM_DATE_32 = "date32"; + const std::string NM_DATE_64 = "date64"; + const std::string NM_MONTH_INTERVAL = "month_interval"; + const std::string NM_DAY_TIME_INTERVAL = "day_time_interval"; const static std::set int_options = { PARQUET_CHUNK_SIZE, @@ -75,38 +92,133 @@ namespace Options NULL_MAPPING, }; const static std::set null_mapping_options = { - NM_INT_16, - NM_INT_32, - NM_STRING, - NM_LARGE_STRING + NM_NA + , NM_BOOLEAN + , NM_UINT_8 + , NM_INT_8 + , NM_UINT_16 + , NM_INT_16 + , NM_UINT_32 + , NM_INT_32 + , NM_UINT_64 + , NM_INT_64 + , NM_FLOAT_16 + , NM_FLOAT_32 + , NM_FLOAT_64 + , NM_STRING + , NM_LARGE_STRING + , NM_BINARY + , NM_LARGE_BINARY + , NM_DATE_32 + , NM_DATE_64 + , NM_MONTH_INTERVAL + , NM_DAY_TIME_INTERVAL }; struct NullMapping { enum class Type: int{ - INT_16 + NA + , BOOLEAN + , UINT_8 + , INT_8 + , UINT_16 + , INT_16 + , UINT_32 , INT_32 + , UINT_64 + , INT_64 + , FLOAT_16 + , FLOAT_32 + , FLOAT_64 , STRING , LARGE_STRING + , BINARY + , LARGE_BINARY + , DATE_32 + , DATE_64 + , MONTH_INTERVAL + , DAY_TIME_INTERVAL }; + bool have_na; + bool have_boolean; + bool have_uint8; + bool have_int8; + bool 
have_uint16; bool have_int16; - int16_t int16_null; + bool have_uint32; bool have_int32; - int32_t int32_null; + bool have_uint64; + bool have_int64; + bool have_float16; + bool have_float32; + bool have_float64; bool have_string; - std::string string_null; bool have_large_string; + bool have_binary; + bool have_large_binary; + bool have_date32; + bool have_date64; + bool have_month_interval; + bool have_day_time_interval; + + using Binary = std::basic_string; + + void* na_null = nullptr; + bool boolean_null; + + uint8_t uint8_null; + int8_t int8_null; + + uint16_t uint16_null; + int16_t int16_null; + + uint32_t uint32_null; + int32_t int32_null; + + uint64_t uint64_null; + int64_t int64_null; + + uint16_t float16_null; + float float32_null; + double float64_null; + + std::string string_null; std::string large_string_null; + Binary binary_null; + Binary large_binary_null; + + int32_t date32_null; + int64_t date64_null; + int32_t month_interval_null; + int64_t day_time_interval_null; }; } template<> inline const ETraits< Options::NullMapping::Type >::Names ETraits< Options::NullMapping::Type >::names { - { Options::NullMapping::Type::INT_16, Options::NM_INT_16 } + { Options::NullMapping::Type::NA, Options::NM_NA } + , { Options::NullMapping::Type::BOOLEAN, Options::NM_BOOLEAN } + , { Options::NullMapping::Type::UINT_8, Options::NM_UINT_8 } + , { Options::NullMapping::Type::INT_8, Options::NM_INT_8 } + , { Options::NullMapping::Type::UINT_16, Options::NM_UINT_16 } + , { Options::NullMapping::Type::INT_16, Options::NM_INT_16 } + , { Options::NullMapping::Type::UINT_32, Options::NM_UINT_32 } , { Options::NullMapping::Type::INT_32, Options::NM_INT_32 } + , { Options::NullMapping::Type::UINT_64, Options::NM_UINT_64 } + , { Options::NullMapping::Type::INT_64, Options::NM_INT_64 } + , { Options::NullMapping::Type::FLOAT_16, Options::NM_FLOAT_16 } + , { Options::NullMapping::Type::FLOAT_32, Options::NM_FLOAT_32 } + , { Options::NullMapping::Type::FLOAT_64, 
Options::NM_FLOAT_64 } , { Options::NullMapping::Type::STRING, Options::NM_STRING } , { Options::NullMapping::Type::LARGE_STRING, Options::NM_LARGE_STRING } + , { Options::NullMapping::Type::BINARY, Options::NM_BINARY } + , { Options::NullMapping::Type::LARGE_BINARY, Options::NM_LARGE_BINARY } + , { Options::NullMapping::Type::DATE_32, Options::NM_DATE_32 } + , { Options::NullMapping::Type::DATE_64, Options::NM_DATE_64 } + , { Options::NullMapping::Type::MONTH_INTERVAL, Options::NM_MONTH_INTERVAL } + , { Options::NullMapping::Type::DAY_TIME_INTERVAL, Options::NM_DAY_TIME_INTERVAL } }; // Helper class for reading dictionary of options @@ -182,14 +294,54 @@ class KdbOptions throw InvalidOption(("Unsupported NULL_MAPPING option '" + key + "'").c_str()); } K value = kK( values )[i]; - if( ETraits::name( NM::INT_16 ) == key && -KH == value->t ){ + if( ETraits::name( NM::BOOLEAN ) == key && -KG == value->t ){ + null_mapping_options.boolean_null = value->g; + null_mapping_options.have_boolean = true; + } + else if( ETraits::name( NM::UINT_8 ) == key && -KG == value->t ){ + null_mapping_options.uint8_null = value->g; + null_mapping_options.have_uint8 = true; + } + else if( ETraits::name( NM::INT_8 ) == key && -KG == value->t ){ + null_mapping_options.int8_null = value->g; + null_mapping_options.have_int8 = true; + } + else if( ETraits::name( NM::UINT_16 ) == key && -KH == value->t ){ + null_mapping_options.uint16_null = value->h; + null_mapping_options.have_uint16 = true; + } + else if( ETraits::name( NM::INT_16 ) == key && -KH == value->t ){ null_mapping_options.int16_null = value->h; null_mapping_options.have_int16 = true; } + else if( ETraits::name( NM::UINT_32 ) == key && -KI == value->t ){ + null_mapping_options.uint32_null = value->i; + null_mapping_options.have_uint32 = true; + } else if( ETraits::name( NM::INT_32 ) == key && -KI == value->t ){ null_mapping_options.int32_null = value->i; null_mapping_options.have_int32 = true; } + else if( ETraits::name( NM::UINT_64 
) == key && -KJ == value->t ){ + null_mapping_options.uint64_null = value->j; + null_mapping_options.have_uint64 = true; + } + else if( ETraits::name( NM::INT_64 ) == key && -KJ == value->t ){ + null_mapping_options.int64_null = value->j; + null_mapping_options.have_int64 = true; + } + else if( ETraits::name( NM::FLOAT_16 ) == key && -KH == value->t ){ + null_mapping_options.float16_null = value->h; + null_mapping_options.have_float16 = true; + } + else if( ETraits::name( NM::FLOAT_32 ) == key && -KE == value->t ){ + null_mapping_options.float32_null = value->e; + null_mapping_options.have_float32 = true; + } + else if( ETraits::name( NM::FLOAT_64 ) == key && -KF == value->t ){ + null_mapping_options.float64_null = value->f; + null_mapping_options.have_float64 = true; + } else if( ETraits::name( NM::STRING ) == key && KC == value->t ){ null_mapping_options.string_null.assign( (char*)kC( value ), value->n ); null_mapping_options.have_string = true; @@ -198,6 +350,30 @@ class KdbOptions null_mapping_options.large_string_null.assign( (char*)kC( value ), value->n ); null_mapping_options.have_large_string = true; } + else if( ETraits::name( NM::BINARY ) == key && KC == value->t ){ + null_mapping_options.binary_null.assign( kC( value ), value->n ); + null_mapping_options.have_binary = true; + } + else if( ETraits::name( NM::LARGE_BINARY ) == key && KC == value->t ){ + null_mapping_options.large_binary_null.assign( kC( value ), value->n ); + null_mapping_options.have_large_binary = true; + } + else if( ETraits::name( NM::DATE_32 ) == key && -KI == value->t ){ + null_mapping_options.date32_null = value->i; + null_mapping_options.have_date32 = true; + } + else if( ETraits::name( NM::DATE_64 ) == key && -KJ == value->t ){ + null_mapping_options.date64_null = value->j; + null_mapping_options.have_date64 = true; + } + else if( ETraits::name( NM::MONTH_INTERVAL ) == key && -KI == value->t ){ + null_mapping_options.month_interval_null = value->i; + 
null_mapping_options.have_month_interval = true; + } + else if( ETraits::name( NM::DAY_TIME_INTERVAL ) == key && -KJ == value->t ){ + null_mapping_options.day_time_interval_null = value->j; + null_mapping_options.have_day_time_interval = true; + } else if( 101 == value->t ){ // Ignore generic null, which may be used here to ensure mixed list of options } @@ -310,8 +486,12 @@ class KdbOptions } } - template - auto GetNullMappingOption( bool& result ); + template + auto GetNullMappingOption( bool& result ) { + result = true; + + return null_mapping_options.na_null; + } void GetNullMappingOptions( Options::NullMapping& null_mapping ) const { @@ -342,37 +522,126 @@ class KdbOptions }; template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ) -{ - result = null_mapping_options.have_int16; +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_boolean; + return null_mapping_options.boolean_null; +} + +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_uint8; + return null_mapping_options.uint8_null; +} + +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_int8; + return null_mapping_options.int8_null; +} + +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_uint16; + return null_mapping_options.uint16_null; +} +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_int16; return null_mapping_options.int16_null; } template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ) -{ - result = null_mapping_options.have_int32; +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_uint32; + return null_mapping_options.uint32_null; +} +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = 
null_mapping_options.have_int32; return null_mapping_options.int32_null; } template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ) -{ - result = null_mapping_options.have_string; +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_uint64; + return null_mapping_options.uint64_null; +} +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_int64; + return null_mapping_options.int64_null; +} + +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_float16; + return null_mapping_options.float16_null; +} + +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_float32; + return null_mapping_options.float32_null; +} + +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_float64; + return null_mapping_options.float64_null; +} + +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_string; return null_mapping_options.string_null; } template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ) -{ +inline auto KdbOptions::GetNullMappingOption( bool& result ){ result = null_mapping_options.have_large_string; - return null_mapping_options.large_string_null; } +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_binary; + return null_mapping_options.binary_null; +} + +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_large_binary; + return null_mapping_options.large_binary_null; +} + +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_date32; + return null_mapping_options.date32_null; +} + +template<> +inline auto 
KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_date64; + return null_mapping_options.date64_null; +} + +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_month_interval; + return null_mapping_options.month_interval_null; +} + +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_day_time_interval; + return null_mapping_options.day_time_interval_null; +} + + } // namespace arrowkdb } // namespace kx From 28d79031b7ca4a06fe495a45b592d6ee4adc5f5d Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 20 Jan 2023 13:48:14 +0300 Subject: [PATCH 027/276] Arrow types primacy --- src/ArrayWriter.cpp | 4 +- src/KdbOptions.h | 257 ++++++++++++-------------------------------- 2 files changed, 68 insertions(+), 193 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index c23277f..49756e0 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -448,10 +448,10 @@ template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto bool_builder = static_cast(builder); - if( type_overrides.null_mapping.have_uint8 ){ + if( type_overrides.null_mapping.have_boolean ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.uint8_null ^ kG( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.boolean_null ^ kG( k_array )[i]; } PARQUET_THROW_NOT_OK( bool_builder->AppendValues( ( uint8_t* )kG( k_array ), k_array->n, null_bitmap ) ); } diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 5118e49..7d5bb6d 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -9,7 +9,7 @@ #include #include "k.h" - +#include namespace kx { namespace arrowkdb { @@ -117,30 +117,6 @@ namespace Options struct NullMapping { - enum class Type: int{ - NA - , BOOLEAN - , UINT_8 - 
, INT_8 - , UINT_16 - , INT_16 - , UINT_32 - , INT_32 - , UINT_64 - , INT_64 - , FLOAT_16 - , FLOAT_32 - , FLOAT_64 - , STRING - , LARGE_STRING - , BINARY - , LARGE_BINARY - , DATE_32 - , DATE_64 - , MONTH_INTERVAL - , DAY_TIME_INTERVAL - }; - bool have_na; bool have_boolean; bool have_uint8; @@ -193,32 +169,56 @@ namespace Options int64_t date64_null; int32_t month_interval_null; int64_t day_time_interval_null; + + template + auto GetOption() const { return std::make_pair( true, na_null );} }; + + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_boolean, boolean_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_uint8, uint8_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_int8, int8_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_uint16, uint16_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_int16, int16_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_uint32, uint32_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_int32, int32_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_uint64, uint64_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_int64, int64_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_float16, float16_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_float32, float32_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_float64, float64_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_string, string_null ); } + template<> inline auto NullMapping::GetOption() const{ return 
std::make_pair( have_large_string, large_string_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_binary, binary_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_large_binary, large_binary_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_date32, date32_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_date64, date64_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_month_interval, month_interval_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_day_time_interval, day_time_interval_null ); } } template<> -inline const ETraits< Options::NullMapping::Type >::Names ETraits< Options::NullMapping::Type >::names { - { Options::NullMapping::Type::NA, Options::NM_NA } - , { Options::NullMapping::Type::BOOLEAN, Options::NM_BOOLEAN } - , { Options::NullMapping::Type::UINT_8, Options::NM_UINT_8 } - , { Options::NullMapping::Type::INT_8, Options::NM_INT_8 } - , { Options::NullMapping::Type::UINT_16, Options::NM_UINT_16 } - , { Options::NullMapping::Type::INT_16, Options::NM_INT_16 } - , { Options::NullMapping::Type::UINT_32, Options::NM_UINT_32 } - , { Options::NullMapping::Type::INT_32, Options::NM_INT_32 } - , { Options::NullMapping::Type::UINT_64, Options::NM_UINT_64 } - , { Options::NullMapping::Type::INT_64, Options::NM_INT_64 } - , { Options::NullMapping::Type::FLOAT_16, Options::NM_FLOAT_16 } - , { Options::NullMapping::Type::FLOAT_32, Options::NM_FLOAT_32 } - , { Options::NullMapping::Type::FLOAT_64, Options::NM_FLOAT_64 } - , { Options::NullMapping::Type::STRING, Options::NM_STRING } - , { Options::NullMapping::Type::LARGE_STRING, Options::NM_LARGE_STRING } - , { Options::NullMapping::Type::BINARY, Options::NM_BINARY } - , { Options::NullMapping::Type::LARGE_BINARY, Options::NM_LARGE_BINARY } - , { 
Options::NullMapping::Type::DATE_32, Options::NM_DATE_32 } - , { Options::NullMapping::Type::DATE_64, Options::NM_DATE_64 } - , { Options::NullMapping::Type::MONTH_INTERVAL, Options::NM_MONTH_INTERVAL } - , { Options::NullMapping::Type::DAY_TIME_INTERVAL, Options::NM_DAY_TIME_INTERVAL } +inline const ETraits::Names ETraits::names{ + { arrow::Type::NA, Options::NM_NA } + , { arrow::Type::BOOL, Options::NM_BOOLEAN } + , { arrow::Type::UINT8, Options::NM_UINT_8 } + , { arrow::Type::INT8, Options::NM_INT_8 } + , { arrow::Type::UINT16, Options::NM_UINT_16 } + , { arrow::Type::INT16, Options::NM_INT_16 } + , { arrow::Type::UINT32, Options::NM_UINT_32 } + , { arrow::Type::INT32, Options::NM_INT_32 } + , { arrow::Type::UINT64, Options::NM_UINT_64 } + , { arrow::Type::INT64, Options::NM_INT_64 } + , { arrow::Type::HALF_FLOAT, Options::NM_FLOAT_16 } + , { arrow::Type::FLOAT, Options::NM_FLOAT_32 } + , { arrow::Type::DOUBLE, Options::NM_FLOAT_64 } + , { arrow::Type::STRING, Options::NM_STRING } + , { arrow::Type::LARGE_STRING, Options::NM_LARGE_STRING } + , { arrow::Type::BINARY, Options::NM_BINARY } + , { arrow::Type::LARGE_BINARY, Options::NM_LARGE_BINARY } + , { arrow::Type::DATE32, Options::NM_DATE_32 } + , { arrow::Type::DATE64, Options::NM_DATE_64 } + , { arrow::Type::INTERVAL_MONTHS, Options::NM_MONTH_INTERVAL } + , { arrow::Type::INTERVAL_DAY_TIME, Options::NM_DAY_TIME_INTERVAL } }; // Helper class for reading dictionary of options @@ -278,7 +278,7 @@ class KdbOptions void PopulateNullMappingOptions( long long index, K dict ) { - using NM = Options::NullMapping::Type; + using NM = arrow::Type::type; K keys = kK( kK( dict )[index] )[0]; K values = kK( kK( dict )[index] )[1]; @@ -294,51 +294,51 @@ class KdbOptions throw InvalidOption(("Unsupported NULL_MAPPING option '" + key + "'").c_str()); } K value = kK( values )[i]; - if( ETraits::name( NM::BOOLEAN ) == key && -KG == value->t ){ + if( ETraits::name( NM::BOOL ) == key && -KG == value->t ){ 
null_mapping_options.boolean_null = value->g; null_mapping_options.have_boolean = true; } - else if( ETraits::name( NM::UINT_8 ) == key && -KG == value->t ){ + else if( ETraits::name( NM::UINT8 ) == key && -KG == value->t ){ null_mapping_options.uint8_null = value->g; null_mapping_options.have_uint8 = true; } - else if( ETraits::name( NM::INT_8 ) == key && -KG == value->t ){ + else if( ETraits::name( NM::INT8 ) == key && -KG == value->t ){ null_mapping_options.int8_null = value->g; null_mapping_options.have_int8 = true; } - else if( ETraits::name( NM::UINT_16 ) == key && -KH == value->t ){ + else if( ETraits::name( NM::UINT16 ) == key && -KH == value->t ){ null_mapping_options.uint16_null = value->h; null_mapping_options.have_uint16 = true; } - else if( ETraits::name( NM::INT_16 ) == key && -KH == value->t ){ + else if( ETraits::name( NM::INT16 ) == key && -KH == value->t ){ null_mapping_options.int16_null = value->h; null_mapping_options.have_int16 = true; } - else if( ETraits::name( NM::UINT_32 ) == key && -KI == value->t ){ + else if( ETraits::name( NM::UINT32 ) == key && -KI == value->t ){ null_mapping_options.uint32_null = value->i; null_mapping_options.have_uint32 = true; } - else if( ETraits::name( NM::INT_32 ) == key && -KI == value->t ){ + else if( ETraits::name( NM::INT32 ) == key && -KI == value->t ){ null_mapping_options.int32_null = value->i; null_mapping_options.have_int32 = true; } - else if( ETraits::name( NM::UINT_64 ) == key && -KJ == value->t ){ + else if( ETraits::name( NM::UINT64 ) == key && -KJ == value->t ){ null_mapping_options.uint64_null = value->j; null_mapping_options.have_uint64 = true; } - else if( ETraits::name( NM::INT_64 ) == key && -KJ == value->t ){ + else if( ETraits::name( NM::INT64 ) == key && -KJ == value->t ){ null_mapping_options.int64_null = value->j; null_mapping_options.have_int64 = true; } - else if( ETraits::name( NM::FLOAT_16 ) == key && -KH == value->t ){ + else if( ETraits::name( NM::HALF_FLOAT ) == key && -KH == 
value->t ){ null_mapping_options.float16_null = value->h; null_mapping_options.have_float16 = true; } - else if( ETraits::name( NM::FLOAT_32 ) == key && -KE == value->t ){ + else if( ETraits::name( NM::FLOAT ) == key && -KE == value->t ){ null_mapping_options.float32_null = value->e; null_mapping_options.have_float32 = true; } - else if( ETraits::name( NM::FLOAT_64 ) == key && -KF == value->t ){ + else if( ETraits::name( NM::DOUBLE ) == key && -KF == value->t ){ null_mapping_options.float64_null = value->f; null_mapping_options.have_float64 = true; } @@ -358,19 +358,19 @@ class KdbOptions null_mapping_options.large_binary_null.assign( kC( value ), value->n ); null_mapping_options.have_large_binary = true; } - else if( ETraits::name( NM::DATE_32 ) == key && -KI == value->t ){ + else if( ETraits::name( NM::DATE32 ) == key && -KI == value->t ){ null_mapping_options.date32_null = value->i; null_mapping_options.have_date32 = true; } - else if( ETraits::name( NM::DATE_64 ) == key && -KJ == value->t ){ + else if( ETraits::name( NM::DATE64 ) == key && -KJ == value->t ){ null_mapping_options.date64_null = value->j; null_mapping_options.have_date64 = true; } - else if( ETraits::name( NM::MONTH_INTERVAL ) == key && -KI == value->t ){ + else if( ETraits::name( NM::INTERVAL_MONTHS ) == key && -KI == value->t ){ null_mapping_options.month_interval_null = value->i; null_mapping_options.have_month_interval = true; } - else if( ETraits::name( NM::DAY_TIME_INTERVAL ) == key && -KJ == value->t ){ + else if( ETraits::name( NM::INTERVAL_DAY_TIME ) == key && -KJ == value->t ){ null_mapping_options.day_time_interval_null = value->j; null_mapping_options.have_day_time_interval = true; } @@ -486,12 +486,8 @@ class KdbOptions } } - template - auto GetNullMappingOption( bool& result ) { - result = true; - - return null_mapping_options.na_null; - } + template + auto GetNullMappingOption() const { return null_mapping_options.GetOption(); } void GetNullMappingOptions( Options::NullMapping& 
null_mapping ) const { @@ -521,127 +517,6 @@ class KdbOptions } }; -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_boolean; - return null_mapping_options.boolean_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_uint8; - return null_mapping_options.uint8_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_int8; - return null_mapping_options.int8_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_uint16; - return null_mapping_options.uint16_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_int16; - return null_mapping_options.int16_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_uint32; - return null_mapping_options.uint32_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_int32; - return null_mapping_options.int32_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_uint64; - return null_mapping_options.uint64_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_int64; - return null_mapping_options.int64_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_float16; - return null_mapping_options.float16_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_float32; - return null_mapping_options.float32_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = 
null_mapping_options.have_float64; - return null_mapping_options.float64_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_string; - return null_mapping_options.string_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_large_string; - return null_mapping_options.large_string_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_binary; - return null_mapping_options.binary_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_large_binary; - return null_mapping_options.large_binary_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_date32; - return null_mapping_options.date32_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_date64; - return null_mapping_options.date64_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_month_interval; - return null_mapping_options.month_interval_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_day_time_interval; - return null_mapping_options.day_time_interval_null; -} - - } // namespace arrowkdb } // namespace kx From 2a3adc3fec42a766b9c77ccdbbfc87e1475d4426 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 20 Jan 2023 16:32:55 +0300 Subject: [PATCH 028/276] Options for extended types support --- src/KdbOptions.h | 62 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 7d5bb6d..64ce207 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -74,8 +74,14 @@ 
namespace Options const std::string NM_LARGE_STRING = "large_string"; const std::string NM_BINARY = "binary"; const std::string NM_LARGE_BINARY = "large_binary"; + const std::string NM_FIXED_BINARY = "fixed_binary"; const std::string NM_DATE_32 = "date32"; const std::string NM_DATE_64 = "date64"; + const std::string NM_TIMESTAMP = "timestamp"; + const std::string NM_TIME_32 = "time32"; + const std::string NM_TIME_64 = "time64"; + const std::string NM_DECIMAL = "decimal"; + const std::string NM_DURATION = "duration"; const std::string NM_MONTH_INTERVAL = "month_interval"; const std::string NM_DAY_TIME_INTERVAL = "day_time_interval"; @@ -109,8 +115,14 @@ namespace Options , NM_LARGE_STRING , NM_BINARY , NM_LARGE_BINARY + , NM_FIXED_BINARY , NM_DATE_32 , NM_DATE_64 + , NM_TIMESTAMP + , NM_TIME_32 + , NM_TIME_64 + , NM_DECIMAL + , NM_DURATION , NM_MONTH_INTERVAL , NM_DAY_TIME_INTERVAL }; @@ -134,8 +146,14 @@ namespace Options bool have_large_string; bool have_binary; bool have_large_binary; + bool have_fixed_binary; bool have_date32; bool have_date64; + bool have_timestamp; + bool have_time32; + bool have_time64; + bool have_decimal; + bool have_duration; bool have_month_interval; bool have_day_time_interval; @@ -164,9 +182,15 @@ namespace Options std::string large_string_null; Binary binary_null; Binary large_binary_null; + Binary fixed_binary_null; int32_t date32_null; int64_t date64_null; + int64_t timestamp_null; + int32_t time32_null; + int64_t time64_null; + double decimal_null; + int64_t duration_null; int32_t month_interval_null; int64_t day_time_interval_null; @@ -190,11 +214,17 @@ namespace Options template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_large_string, large_string_null ); } template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_binary, binary_null ); } template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_large_binary, large_binary_null ); } + template<> inline 
auto NullMapping::GetOption() const{ return std::make_pair( have_fixed_binary, fixed_binary_null ); } template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_date32, date32_null ); } template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_date64, date64_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_timestamp, timestamp_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_time32, time32_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_time64, time64_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_decimal, decimal_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_duration, duration_null ); } template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_month_interval, month_interval_null ); } template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_day_time_interval, day_time_interval_null ); } -} +} // namespace Options template<> inline const ETraits::Names ETraits::names{ @@ -215,8 +245,14 @@ inline const ETraits::Names ETraits::names , { arrow::Type::LARGE_STRING, Options::NM_LARGE_STRING } , { arrow::Type::BINARY, Options::NM_BINARY } , { arrow::Type::LARGE_BINARY, Options::NM_LARGE_BINARY } + , { arrow::Type::FIXED_SIZE_BINARY, Options::NM_FIXED_BINARY } + , { arrow::Type::DATE32, Options::NM_DATE_32 } + , { arrow::Type::DATE64, Options::NM_DATE_64 } + , { arrow::Type::TIMESTAMP, Options::NM_TIMESTAMP } , { arrow::Type::DATE32, Options::NM_DATE_32 } , { arrow::Type::DATE64, Options::NM_DATE_64 } + , { arrow::Type::DECIMAL, Options::NM_DECIMAL } + , { arrow::Type::DURATION, Options::NM_DURATION } , { arrow::Type::INTERVAL_MONTHS, Options::NM_MONTH_INTERVAL } , { arrow::Type::INTERVAL_DAY_TIME, Options::NM_DAY_TIME_INTERVAL } }; @@ -358,6 
+394,10 @@ class KdbOptions null_mapping_options.large_binary_null.assign( kC( value ), value->n ); null_mapping_options.have_large_binary = true; } + else if( ETraits::name( NM::FIXED_SIZE_BINARY ) == key && KC == value->t ){ + null_mapping_options.fixed_binary_null.assign( kC( value ), value->n ); + null_mapping_options.have_fixed_binary = true; + } else if( ETraits::name( NM::DATE32 ) == key && -KI == value->t ){ null_mapping_options.date32_null = value->i; null_mapping_options.have_date32 = true; @@ -366,6 +406,26 @@ class KdbOptions null_mapping_options.date64_null = value->j; null_mapping_options.have_date64 = true; } + else if( ETraits::name( NM::TIMESTAMP ) == key && -KJ == value->t ){ + null_mapping_options.timestamp_null = value->j; + null_mapping_options.have_timestamp = true; + } + else if( ETraits::name( NM::TIME32 ) == key && -KI == value->t ){ + null_mapping_options.time32_null = value->i; + null_mapping_options.have_time32 = true; + } + else if( ETraits::name( NM::TIME64 ) == key && -KJ == value->t ){ + null_mapping_options.time64_null = value->j; + null_mapping_options.have_time64 = true; + } + else if( ETraits::name( NM::DECIMAL ) == key && -KF == value->t ){ + null_mapping_options.decimal_null = value->f; + null_mapping_options.have_decimal = true; + } + else if( ETraits::name( NM::DURATION ) == key && -KJ == value->t ){ + null_mapping_options.duration_null = value->j; + null_mapping_options.have_duration = true; + } else if( ETraits::name( NM::INTERVAL_MONTHS ) == key && -KI == value->t ){ null_mapping_options.month_interval_null = value->i; null_mapping_options.have_month_interval = true; From 26c420e11e5d4552fee485f605d3eaa947676643 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 20 Jan 2023 17:10:58 +0300 Subject: [PATCH 029/276] Supporting of extended types writing --- src/ArrayWriter.cpp | 83 ++++++++++++++++++++++++++++++++------------- 1 file changed, 60 insertions(+), 23 deletions(-) diff --git a/src/ArrayWriter.cpp 
b/src/ArrayWriter.cpp index 49756e0..e4d4bfd 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -31,7 +31,7 @@ bool is_equal( T lhs, T rhs ) { static const T epsilon = 2 * std::numeric_limits::epsilon(); - return ::fabs(lhs -= rhs) <= epsilon; + return ::fabs( lhs -= rhs ) <= epsilon; } shared_ptr GetBuilder(shared_ptr datatype); @@ -370,7 +370,7 @@ void PopulateListBuilder(shared_ptr datatype, K k_array, arrow: continue; // Delimit the start/end of each child list set - list_builder->Append(); + PARQUET_THROW_NOT_OK( list_builder->Append() ); if (datatype->id() == arrow::Type::FIXED_SIZE_LIST) { // Check each sub-list is the same length as the fixed size @@ -415,7 +415,7 @@ void PopulateUnionBuilder(shared_ptr datatype, K k_array, arrow // for this union value for (auto index = 0; index < kK(k_array)[0]->n; ++index) { int8_t live_type_id = kH(type_ids)[index]; - union_builder->Append(live_type_id); + PARQUET_THROW_NOT_OK( union_builder->Append(live_type_id) ); } // Populate each of the child builders from its kdb list, starting from 1 to @@ -563,9 +563,9 @@ void PopulateBuilder(shared_ptr datatype, if( type_overrides.null_mapping.have_uint64 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.uint64_null ^ kI( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.uint64_null ^ kJ( k_array )[i]; } - PARQUET_THROW_NOT_OK( uint64_builder->AppendValues( ( uint64_t* )kI( k_array ), k_array->n, null_bitmap ) ); + PARQUET_THROW_NOT_OK( uint64_builder->AppendValues( ( uint64_t* )kJ( k_array ), k_array->n, null_bitmap ) ); } else{ PARQUET_THROW_NOT_OK(uint64_builder->AppendValues((uint64_t*)kJ(k_array), k_array->n)); @@ -579,9 +579,9 @@ void PopulateBuilder(shared_ptr datatype, K if( type_overrides.null_mapping.have_int64 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.int64_null ^ kI( k_array 
)[i]; + null_bitmap[i] = type_overrides.null_mapping.int64_null ^ kJ( k_array )[i]; } - PARQUET_THROW_NOT_OK( int64_builder->AppendValues( ( int64_t* )kI( k_array ), k_array->n, null_bitmap ) ); + PARQUET_THROW_NOT_OK( int64_builder->AppendValues( ( int64_t* )kJ( k_array ), k_array->n, null_bitmap ) ); } else{ PARQUET_THROW_NOT_OK(int64_builder->AppendValues((int64_t*)kJ(k_array), k_array->n)); @@ -658,7 +658,7 @@ void PopulateBuilder(shared_ptr datatype, K str_data = kK( k_array )[i]; TYPE_CHECK_ITEM( str_data->t != KC, datatype->ToString(), KC, str_data->t ); if( type_overrides.null_mapping.have_string - && type_overrides.null_mapping.string_null.length() == str_data->n + && type_overrides.null_mapping.string_null.length() == static_cast( str_data->n ) && !type_overrides.null_mapping.string_null.compare( 0, str_data->n, ( char* )kG( str_data ), str_data->n ) ){ PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); } @@ -691,7 +691,7 @@ void PopulateBuilder(shared_ptr data K str_data = kK( k_array )[i]; TYPE_CHECK_ITEM( str_data->t != KC, datatype->ToString(), KC, str_data->t ); if( type_overrides.null_mapping.have_large_string - && type_overrides.null_mapping.large_string_null.length() == str_data->n + && type_overrides.null_mapping.large_string_null.length() == static_cast( str_data->n ) && !type_overrides.null_mapping.large_string_null.compare( 0, str_data->n, ( char* )kG( str_data ), str_data->n ) ){ PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); } @@ -710,7 +710,7 @@ void PopulateBuilder(shared_ptr datatype, K bin_data = kK(k_array)[i]; TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); if( type_overrides.null_mapping.have_binary - && type_overrides.null_mapping.binary_null.length() == bin_data->n + && type_overrides.null_mapping.binary_null.length() == static_cast( bin_data->n ) && !type_overrides.null_mapping.binary_null.compare( 0, bin_data->n, kG( bin_data ), bin_data->n ) ){ PARQUET_THROW_NOT_OK( bin_builder->AppendNull() ); } @@ 
-728,7 +728,7 @@ void PopulateBuilder(shared_ptr data K bin_data = kK(k_array)[i]; TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); if( type_overrides.null_mapping.have_large_binary - && type_overrides.null_mapping.large_binary_null.length() == bin_data->n + && type_overrides.null_mapping.large_binary_null.length() == static_cast( bin_data->n ) && !type_overrides.null_mapping.large_binary_null.compare( 0, bin_data->n, kG( bin_data ), bin_data->n ) ){ PARQUET_THROW_NOT_OK( bin_builder->AppendNull() ); } @@ -751,7 +751,14 @@ void PopulateBuilder(shared_ptr K bin_data = kK(k_array)[i]; TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); TYPE_CHECK_LENGTH(fixed_bin_builder->byte_width() != bin_data->n, builder->type()->ToString(), fixed_bin_builder->byte_width(), bin_data->n); - PARQUET_THROW_NOT_OK(fixed_bin_builder->Append(kG(bin_data))); + if( type_overrides.null_mapping.have_fixed_binary + && type_overrides.null_mapping.fixed_binary_null.length() == static_cast( bin_data->n ) + && !type_overrides.null_mapping.fixed_binary_null.compare( 0, bin_data->n, kG( bin_data ), bin_data->n ) ){ + PARQUET_THROW_NOT_OK( fixed_bin_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK(fixed_bin_builder->Append(kG(bin_data))); + } } } } @@ -794,7 +801,13 @@ void PopulateBuilder(shared_ptr datatyp auto ts_builder = static_cast(builder); auto timestamp_type = static_pointer_cast(datatype); for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(ts_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); + if( type_overrides.null_mapping.have_timestamp + && type_overrides.null_mapping.timestamp_null == kJ( k_array )[i] ){ + PARQUET_THROW_NOT_OK( ts_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK(ts_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); + } } template<> @@ -804,7 +817,13 @@ void PopulateBuilder(shared_ptr datatype, auto t32_builder = static_cast(builder); auto time32_type = static_pointer_cast(datatype); for 
(auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(t32_builder->Append(tc.KdbToArrow(kI(k_array)[i]))); + if( type_overrides.null_mapping.have_time32 + && type_overrides.null_mapping.time32_null == kI( k_array )[i] ){ + PARQUET_THROW_NOT_OK( t32_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK(t32_builder->Append(tc.KdbToArrow(kI(k_array)[i]))); + } } template<> @@ -814,7 +833,13 @@ void PopulateBuilder(shared_ptr datatype, auto t64_builder = static_cast(builder); auto time64_type = static_pointer_cast(datatype); for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(t64_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); + if( type_overrides.null_mapping.have_time64 + && type_overrides.null_mapping.time64_null == kJ( k_array )[i] ){ + PARQUET_THROW_NOT_OK( t64_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK(t64_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); + } } template<> @@ -824,10 +849,16 @@ void PopulateBuilder(shared_ptr datatype, auto dec_type = static_pointer_cast(datatype); for (auto i = 0; i < k_array->n; ++i) { if (type_overrides.decimal128_as_double) { - // Construct the decimal from a double - arrow::Decimal128 dec128; - PARQUET_ASSIGN_OR_THROW(dec128, arrow::Decimal128::FromReal(kF(k_array)[i], dec_type->precision(), dec_type->scale())); - PARQUET_THROW_NOT_OK(dec_builder->Append(dec128)); + if( type_overrides.null_mapping.have_decimal + && is_equal( type_overrides.null_mapping.decimal_null, kF( k_array )[i] ) ){ + PARQUET_THROW_NOT_OK( dec_builder->AppendNull() ); + } + else{ + // Construct the decimal from a double + arrow::Decimal128 dec128; + PARQUET_ASSIGN_OR_THROW(dec128, arrow::Decimal128::FromReal(kF(k_array)[i], dec_type->precision(), dec_type->scale())); + PARQUET_THROW_NOT_OK(dec_builder->Append(dec128)); + } } else { // Each decimal is a list of 16 bytes K k_dec = kK(k_array)[i]; @@ -847,7 +878,13 @@ void PopulateBuilder(shared_ptr datatype auto dur_builder = static_cast(builder); auto duration_type = 
static_pointer_cast(datatype); for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(dur_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); + if( type_overrides.null_mapping.have_duration + && type_overrides.null_mapping.duration_null == kJ( k_array )[i] ){ + PARQUET_THROW_NOT_OK( dur_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK(dur_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); + } } template<> @@ -917,7 +954,7 @@ void PopulateBuilder(shared_ptr datatype, K k continue; // Delimit the start/end of each child map set - map_builder->Append(); + PARQUET_THROW_NOT_OK( map_builder->Append() ); // Populate the child builders for this map set from the dictionary key/value lists auto k_dict = kK(k_array)[i]; @@ -948,7 +985,7 @@ void PopulateBuilder(shared_ptr datatype, // Delimit each struct value in the parent builder for (auto index = 0; index < kK(k_array)[0]->n; ++index) - struct_builder->Append(); + PARQUET_THROW_NOT_OK( struct_builder->Append() ); // Populate each of the field builders from its kdb list. Only count up to // the number of struct fields. 
Additional trailing data in the kdb mixed @@ -1113,7 +1150,7 @@ K prettyPrintArray(K datatype_id, K array, K options) auto arrow_array = MakeArray(datatype, array, type_overrides); auto options = arrow::PrettyPrintOptions(); string result; - arrow::PrettyPrint(*arrow_array, options, &result); + PARQUET_THROW_NOT_OK( arrow::PrettyPrint(*arrow_array, options, &result) ); return kp((S)result.c_str()); From 3eaeb87840b8ff8f62f21fa8124340e876f173c4 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 23 Jan 2023 17:47:37 +0300 Subject: [PATCH 030/276] Pull-request #6 changes, patch 5 https://github.com/KxSystems/arrowkdb/pull/6 --- src/ArrayWriter.cpp | 18 +++++++++--------- src/KdbOptions.h | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index e4d4bfd..e1ddd1b 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -451,7 +451,7 @@ void PopulateBuilder(shared_ptr datatype, K if( type_overrides.null_mapping.have_boolean ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.boolean_null ^ kG( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.boolean_null != kG( k_array )[i]; } PARQUET_THROW_NOT_OK( bool_builder->AppendValues( ( uint8_t* )kG( k_array ), k_array->n, null_bitmap ) ); } @@ -467,7 +467,7 @@ void PopulateBuilder(shared_ptr datatype, K if( type_overrides.null_mapping.have_uint8 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.uint8_null ^ kG( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.uint8_null != kG( k_array )[i]; } PARQUET_THROW_NOT_OK( uint8_builder->AppendValues( ( uint8_t* )kG( k_array ), k_array->n, null_bitmap ) ); } @@ -483,7 +483,7 @@ void PopulateBuilder(shared_ptr datatype, K if( type_overrides.null_mapping.have_int8 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < 
k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.int8_null ^ kG( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.int8_null != kG( k_array )[i]; } PARQUET_THROW_NOT_OK( int8_builder->AppendValues( ( int8_t* )kG( k_array ), k_array->n, null_bitmap ) ); } @@ -499,7 +499,7 @@ void PopulateBuilder(shared_ptr datatype, if( type_overrides.null_mapping.have_uint16 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.uint16_null ^ kH( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.uint16_null != kH( k_array )[i]; } PARQUET_THROW_NOT_OK( uint16_builder->AppendValues( ( uint16_t* )kH( k_array ), k_array->n, null_bitmap ) ); } @@ -531,7 +531,7 @@ void PopulateBuilder(shared_ptr datatype, if( type_overrides.null_mapping.have_uint32 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.uint32_null ^ kI( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.uint32_null != static_cast( kI( k_array )[i] ); } PARQUET_THROW_NOT_OK( uint32_builder->AppendValues( ( uint32_t* )kI( k_array ), k_array->n, null_bitmap ) ); } @@ -563,7 +563,7 @@ void PopulateBuilder(shared_ptr datatype, if( type_overrides.null_mapping.have_uint64 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.uint64_null ^ kJ( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.uint64_null != static_cast( kJ( k_array )[i] ); } PARQUET_THROW_NOT_OK( uint64_builder->AppendValues( ( uint64_t* )kJ( k_array ), k_array->n, null_bitmap ) ); } @@ -579,7 +579,7 @@ void PopulateBuilder(shared_ptr datatype, K if( type_overrides.null_mapping.have_int64 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.int64_null ^ kJ( k_array )[i]; + null_bitmap[i] = 
type_overrides.null_mapping.int64_null != kJ( k_array )[i]; } PARQUET_THROW_NOT_OK( int64_builder->AppendValues( ( int64_t* )kJ( k_array ), k_array->n, null_bitmap ) ); } @@ -595,7 +595,7 @@ void PopulateBuilder(shared_ptr dataty if( type_overrides.null_mapping.have_float16 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.float16_null ^ kH( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.float16_null != kH( k_array )[i]; } PARQUET_THROW_NOT_OK( hfl_builder->AppendValues( ( uint16_t* )kH( k_array ), k_array->n, null_bitmap ) ); } @@ -894,7 +894,7 @@ void PopulateBuilder(shared_ptr d if( type_overrides.null_mapping.have_month_interval ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.month_interval_null ^ kI( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.month_interval_null != kI( k_array )[i]; } PARQUET_THROW_NOT_OK( month_builder->AppendValues( ( int32_t* )kI( k_array ), k_array->n, null_bitmap ) ); } diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 64ce207..aa79596 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -58,7 +58,7 @@ namespace Options // Null mapping options const std::string NM_NA = "na"; - const std::string NM_BOOLEAN = "boolean"; + const std::string NM_BOOLEAN = "bool"; const std::string NM_UINT_8 = "uint8"; const std::string NM_INT_8 = "int8"; const std::string NM_UINT_16 = "uint16"; From df655b6788b275d5cd7bc304cffe75b7cbb69b97 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 25 Jan 2023 12:15:37 +0300 Subject: [PATCH 031/276] Fix of bool option initialized by -1h --- src/KdbOptions.h | 6 +++++- src/SchemaStore.cpp | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/KdbOptions.h b/src/KdbOptions.h index aa79596..1168306 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -330,7 +330,11 @@ class KdbOptions throw 
InvalidOption(("Unsupported NULL_MAPPING option '" + key + "'").c_str()); } K value = kK( values )[i]; - if( ETraits::name( NM::BOOL ) == key && -KG == value->t ){ + if( ETraits::name( NM::BOOL ) == key && -KB == value->t ){ + null_mapping_options.boolean_null = value->g; + null_mapping_options.have_boolean = true; + } + else if( ETraits::name( NM::BOOL ) == key && -KG == value->t ){ null_mapping_options.boolean_null = value->g; null_mapping_options.have_boolean = true; } diff --git a/src/SchemaStore.cpp b/src/SchemaStore.cpp index b95a1b9..da407e2 100644 --- a/src/SchemaStore.cpp +++ b/src/SchemaStore.cpp @@ -143,7 +143,7 @@ K inferSchema(K table) // Determine the arrow datatype for each data set K k_array_data = kK(dict)[1]; - assert(k_array_data->n == field_names.size()); + assert(static_cast( k_array_data->n ) == field_names.size()); arrow::FieldVector fields; for (auto i = 0ul; i < field_names.size(); ++i) { auto datatype = kx::arrowkdb::GetArrowType(kK(k_array_data)[i]); From 3be868bcf9f70c80b7c61aee83af7be2272e593e Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 25 Jan 2023 13:29:23 +0300 Subject: [PATCH 032/276] Support GUIDs of type 2h in Fixed Size Binarray --- src/ArrayWriter.cpp | 12 ++++++++++-- src/KdbOptions.h | 20 ++++++++++++++++++-- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index e1ddd1b..3b51605 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -744,8 +744,16 @@ void PopulateBuilder(shared_ptr bool is_guid = k_array->t == UU && datatype->id() == arrow::Type::FIXED_SIZE_BINARY && static_cast(builder)->byte_width() == sizeof(U); auto fixed_bin_builder = static_cast(builder); if (is_guid) { - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(fixed_bin_builder->Append((char*)&kU(k_array)[i])); + for (auto i = 0; i < k_array->n; ++i){ + if( type_overrides.null_mapping.have_fixed_binary + && type_overrides.null_mapping.fixed_binary_null.length() == 
sizeof( U ) + && !type_overrides.null_mapping.fixed_binary_null.compare( 0, sizeof( U ), &kU( k_array )[i].g[0], sizeof( U ) ) ){ + PARQUET_THROW_NOT_OK( fixed_bin_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK(fixed_bin_builder->Append((char*)&kU(k_array)[i])); + } + } } else { for (auto i = 0; i < k_array->n; ++i) { K bin_data = kK(k_array)[i]; diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 1168306..8fe82e0 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -249,8 +249,8 @@ inline const ETraits::Names ETraits::names , { arrow::Type::DATE32, Options::NM_DATE_32 } , { arrow::Type::DATE64, Options::NM_DATE_64 } , { arrow::Type::TIMESTAMP, Options::NM_TIMESTAMP } - , { arrow::Type::DATE32, Options::NM_DATE_32 } - , { arrow::Type::DATE64, Options::NM_DATE_64 } + , { arrow::Type::TIME32, Options::NM_TIME_32 } + , { arrow::Type::TIME64, Options::NM_TIME_64 } , { arrow::Type::DECIMAL, Options::NM_DECIMAL } , { arrow::Type::DURATION, Options::NM_DURATION } , { arrow::Type::INTERVAL_MONTHS, Options::NM_MONTH_INTERVAL } @@ -390,14 +390,30 @@ class KdbOptions null_mapping_options.large_string_null.assign( (char*)kC( value ), value->n ); null_mapping_options.have_large_string = true; } + else if( ETraits::name( NM::BINARY ) == key && KG == value->t ){ + null_mapping_options.binary_null.assign( kG( value ), value->n ); + null_mapping_options.have_binary = true; + } else if( ETraits::name( NM::BINARY ) == key && KC == value->t ){ null_mapping_options.binary_null.assign( kC( value ), value->n ); null_mapping_options.have_binary = true; } + else if( ETraits::name( NM::LARGE_BINARY ) == key && KG == value->t ){ + null_mapping_options.large_binary_null.assign( kG( value ), value->n ); + null_mapping_options.have_large_binary = true; + } else if( ETraits::name( NM::LARGE_BINARY ) == key && KC == value->t ){ null_mapping_options.large_binary_null.assign( kC( value ), value->n ); null_mapping_options.have_large_binary = true; } + else if( ETraits::name( 
NM::FIXED_SIZE_BINARY ) == key && -UU == value->t ){ + null_mapping_options.fixed_binary_null.assign( &kU( value )->g[0], sizeof( U ) ); + null_mapping_options.have_fixed_binary = true; + } + else if( ETraits::name( NM::FIXED_SIZE_BINARY ) == key && KG == value->t ){ + null_mapping_options.fixed_binary_null.assign( kG( value ), value->n ); + null_mapping_options.have_fixed_binary = true; + } else if( ETraits::name( NM::FIXED_SIZE_BINARY ) == key && KC == value->t ){ null_mapping_options.fixed_binary_null.assign( kC( value ), value->n ); null_mapping_options.have_fixed_binary = true; From 549a41b63e6c7c423f682c2273c3fe3b506427dd Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 25 Jan 2023 17:47:23 +0300 Subject: [PATCH 033/276] Reverse null_bitmap for floats --- src/ArrayWriter.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index 3b51605..4d298c1 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -611,7 +611,7 @@ void PopulateBuilder(shared_ptr datatype, K if( type_overrides.null_mapping.have_float32 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = is_equal( type_overrides.null_mapping.float32_null, kE( k_array )[i] ); + null_bitmap[i] = !is_equal( type_overrides.null_mapping.float32_null, kE( k_array )[i] ); } PARQUET_THROW_NOT_OK( fl_builder->AppendValues( kE( k_array ), k_array->n, null_bitmap ) ); } @@ -627,7 +627,7 @@ void PopulateBuilder(shared_ptr datatype, if( type_overrides.null_mapping.have_float64 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = is_equal( type_overrides.null_mapping.float64_null, kF( k_array )[i] ); + null_bitmap[i] = !is_equal( type_overrides.null_mapping.float64_null, kF( k_array )[i] ); } PARQUET_THROW_NOT_OK( dbl_builder->AppendValues( kF( k_array ), k_array->n, null_bitmap ) ); } From fedd82f7e99000476b71fd50c137220a9a8b60e2 Mon Sep 17 
00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 26 Jan 2023 21:23:11 +0300 Subject: [PATCH 034/276] Breaking apart null mapping option ifelses --- src/KdbOptions.h | 544 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 411 insertions(+), 133 deletions(-) diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 8fe82e0..35f0ef0 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -23,13 +24,11 @@ constexpr auto toUType( E enumerator ) noexcept template< typename E > struct ETraits { - using Names = std::map; + using Names = std::unordered_map; - static std::string name( E enumerator ) - { + static std::string name( E enumerator ){ auto it = names.find( enumerator ); - if( it != names.end() ) - { + if( it != names.end() ){ return it->second; } @@ -38,6 +37,17 @@ struct ETraits static std::string name( int index ) { return name( static_cast( index ) ); } + static auto value( const std::string& name ){ + auto it = std::find_if( names.begin(), names.end(), [&name]( const auto& value ){ + return name == value.second; + } ); + if( it != names.end() ){ + return it->first; + } + + return E( 0 ); + } + static const Names names; }; @@ -266,6 +276,12 @@ inline const ETraits::Names ETraits::names // 0 of -KS|-KJ|XD|KC class KdbOptions { +public: + template + inline void NullMappingOption( const std::string& key, K value ); + + using NullMappingHandler = void ( KdbOptions::* )( const std::string&, K ); + using NullMappingHandlers = std::unordered_map; private: Options::NullMapping null_mapping_options; std::map string_options; @@ -276,6 +292,7 @@ class KdbOptions const std::set& supported_dict_options; const std::set& supported_null_mapping_options; + static const NullMappingHandlers null_mapping_handlers; private: const std::string ToUpper(std::string str) const { @@ -314,151 +331,30 @@ class KdbOptions void PopulateNullMappingOptions( long long index, K dict ) { - using NM = 
arrow::Type::type; - K keys = kK( kK( dict )[index] )[0]; K values = kK( kK( dict )[index] )[1]; if( KS != keys->t ){ throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING keys (expected=11h), type=" + std::to_string( keys->t ) + "h" ); } if( 0 != values->t ){ - throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING values (extected=0h), type=" + std::to_string( keys->t ) + "h" ); + throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING values (extected=0h), type=" ); } for( auto i = 0ll; i < values->n; ++i ){ const std::string key = ToLower( kS( keys )[i] ); if( supported_null_mapping_options.find( key ) == supported_null_mapping_options.end() ){ - throw InvalidOption(("Unsupported NULL_MAPPING option '" + key + "'").c_str()); + throw InvalidOption( "Unsupported NULL_MAPPING option '" + key + "'" ); } K value = kK( values )[i]; - if( ETraits::name( NM::BOOL ) == key && -KB == value->t ){ - null_mapping_options.boolean_null = value->g; - null_mapping_options.have_boolean = true; - } - else if( ETraits::name( NM::BOOL ) == key && -KG == value->t ){ - null_mapping_options.boolean_null = value->g; - null_mapping_options.have_boolean = true; - } - else if( ETraits::name( NM::UINT8 ) == key && -KG == value->t ){ - null_mapping_options.uint8_null = value->g; - null_mapping_options.have_uint8 = true; - } - else if( ETraits::name( NM::INT8 ) == key && -KG == value->t ){ - null_mapping_options.int8_null = value->g; - null_mapping_options.have_int8 = true; - } - else if( ETraits::name( NM::UINT16 ) == key && -KH == value->t ){ - null_mapping_options.uint16_null = value->h; - null_mapping_options.have_uint16 = true; - } - else if( ETraits::name( NM::INT16 ) == key && -KH == value->t ){ - null_mapping_options.int16_null = value->h; - null_mapping_options.have_int16 = true; - } - else if( ETraits::name( NM::UINT32 ) == key && -KI == value->t ){ - null_mapping_options.uint32_null = value->i; - null_mapping_options.have_uint32 = true; - } - else if( 
ETraits::name( NM::INT32 ) == key && -KI == value->t ){ - null_mapping_options.int32_null = value->i; - null_mapping_options.have_int32 = true; - } - else if( ETraits::name( NM::UINT64 ) == key && -KJ == value->t ){ - null_mapping_options.uint64_null = value->j; - null_mapping_options.have_uint64 = true; - } - else if( ETraits::name( NM::INT64 ) == key && -KJ == value->t ){ - null_mapping_options.int64_null = value->j; - null_mapping_options.have_int64 = true; - } - else if( ETraits::name( NM::HALF_FLOAT ) == key && -KH == value->t ){ - null_mapping_options.float16_null = value->h; - null_mapping_options.have_float16 = true; - } - else if( ETraits::name( NM::FLOAT ) == key && -KE == value->t ){ - null_mapping_options.float32_null = value->e; - null_mapping_options.have_float32 = true; - } - else if( ETraits::name( NM::DOUBLE ) == key && -KF == value->t ){ - null_mapping_options.float64_null = value->f; - null_mapping_options.have_float64 = true; - } - else if( ETraits::name( NM::STRING ) == key && KC == value->t ){ - null_mapping_options.string_null.assign( (char*)kC( value ), value->n ); - null_mapping_options.have_string = true; - } - else if( ETraits::name( NM::LARGE_STRING ) == key && KC == value->t ){ - null_mapping_options.large_string_null.assign( (char*)kC( value ), value->n ); - null_mapping_options.have_large_string = true; - } - else if( ETraits::name( NM::BINARY ) == key && KG == value->t ){ - null_mapping_options.binary_null.assign( kG( value ), value->n ); - null_mapping_options.have_binary = true; - } - else if( ETraits::name( NM::BINARY ) == key && KC == value->t ){ - null_mapping_options.binary_null.assign( kC( value ), value->n ); - null_mapping_options.have_binary = true; - } - else if( ETraits::name( NM::LARGE_BINARY ) == key && KG == value->t ){ - null_mapping_options.large_binary_null.assign( kG( value ), value->n ); - null_mapping_options.have_large_binary = true; - } - else if( ETraits::name( NM::LARGE_BINARY ) == key && KC == value->t ){ - 
null_mapping_options.large_binary_null.assign( kC( value ), value->n ); - null_mapping_options.have_large_binary = true; - } - else if( ETraits::name( NM::FIXED_SIZE_BINARY ) == key && -UU == value->t ){ - null_mapping_options.fixed_binary_null.assign( &kU( value )->g[0], sizeof( U ) ); - null_mapping_options.have_fixed_binary = true; - } - else if( ETraits::name( NM::FIXED_SIZE_BINARY ) == key && KG == value->t ){ - null_mapping_options.fixed_binary_null.assign( kG( value ), value->n ); - null_mapping_options.have_fixed_binary = true; - } - else if( ETraits::name( NM::FIXED_SIZE_BINARY ) == key && KC == value->t ){ - null_mapping_options.fixed_binary_null.assign( kC( value ), value->n ); - null_mapping_options.have_fixed_binary = true; - } - else if( ETraits::name( NM::DATE32 ) == key && -KI == value->t ){ - null_mapping_options.date32_null = value->i; - null_mapping_options.have_date32 = true; - } - else if( ETraits::name( NM::DATE64 ) == key && -KJ == value->t ){ - null_mapping_options.date64_null = value->j; - null_mapping_options.have_date64 = true; - } - else if( ETraits::name( NM::TIMESTAMP ) == key && -KJ == value->t ){ - null_mapping_options.timestamp_null = value->j; - null_mapping_options.have_timestamp = true; - } - else if( ETraits::name( NM::TIME32 ) == key && -KI == value->t ){ - null_mapping_options.time32_null = value->i; - null_mapping_options.have_time32 = true; - } - else if( ETraits::name( NM::TIME64 ) == key && -KJ == value->t ){ - null_mapping_options.time64_null = value->j; - null_mapping_options.have_time64 = true; - } - else if( ETraits::name( NM::DECIMAL ) == key && -KF == value->t ){ - null_mapping_options.decimal_null = value->f; - null_mapping_options.have_decimal = true; - } - else if( ETraits::name( NM::DURATION ) == key && -KJ == value->t ){ - null_mapping_options.duration_null = value->j; - null_mapping_options.have_duration = true; - } - else if( ETraits::name( NM::INTERVAL_MONTHS ) == key && -KI == value->t ){ - 
null_mapping_options.month_interval_null = value->i; - null_mapping_options.have_month_interval = true; - } - else if( ETraits::name( NM::INTERVAL_DAY_TIME ) == key && -KJ == value->t ){ - null_mapping_options.day_time_interval_null = value->j; - null_mapping_options.have_day_time_interval = true; + arrow::Type::type mapping = ETraits::value( key ); + auto it = null_mapping_handlers.find( mapping ); + if( it != null_mapping_handlers.end() ){ + ( this->*it->second )( key, value ); } else if( 101 == value->t ){ // Ignore generic null, which may be used here to ensure mixed list of options } else{ - throw InvalidOption(("Unsupported KDB data type for NULL_MAPPING option '" + key + "', type=" + std::to_string( value->t ) + "h" ).c_str()); + throw InvalidOption( "Unhandled NULL_MAPPING option '" + key + "', type=" + std::to_string( keys->t ) + "h" ); } } } @@ -597,6 +493,388 @@ class KdbOptions } }; +inline void null_mapping_error( const std::string& key, K value ) +{ + std::string message = std::string( "Unsupported KDB data type for NULL_MAPPING option '") + .append( key ) + .append( "', type=" ) + .append( std::to_string( value->t ) ) + .append( "h" ); + + throw KdbOptions::InvalidOption( message ); +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + switch( value->t ){ + case -KB: + null_mapping_options.boolean_null = value->g; + null_mapping_options.have_boolean = true; + break; + case -KG: + null_mapping_options.boolean_null = value->g; + null_mapping_options.have_boolean = true; + break; + default: + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KG == value->t ){ + null_mapping_options.uint8_null = value->g; + null_mapping_options.have_uint8 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KG == value->t ){ 
+ null_mapping_options.int8_null = value->g; + null_mapping_options.have_int8 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KH == value->t ){ + null_mapping_options.uint16_null = value->h; + null_mapping_options.have_uint16 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KH == value->t ){ + null_mapping_options.int16_null = value->h; + null_mapping_options.have_int16 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KI == value->t ){ + null_mapping_options.uint32_null = value->i; + null_mapping_options.have_uint32 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KI == value->t ){ + null_mapping_options.int32_null = value->i; + null_mapping_options.have_int32 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KJ == value->t ){ + null_mapping_options.uint64_null = value->j; + null_mapping_options.have_uint64 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KJ == value->t ){ + null_mapping_options.int64_null = value->j; + null_mapping_options.have_int64 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KH == value->t ){ + null_mapping_options.float16_null = value->h; + null_mapping_options.have_float16 = true; + } + else{ + null_mapping_error( key, value ); + } +} + 
+template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KE == value->t ){ + null_mapping_options.float32_null = value->e; + null_mapping_options.have_float32 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KF == value->t ){ + null_mapping_options.float64_null = value->f; + null_mapping_options.have_float64 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KC == value->t ){ + null_mapping_options.string_null.assign( (char*)kC( value ), value->n ); + null_mapping_options.have_string = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KC == value->t ){ + null_mapping_options.large_string_null.assign( (char*)kC( value ), value->n ); + null_mapping_options.have_large_string = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + switch( value->t ){ + case KG: + null_mapping_options.binary_null.assign( kG( value ), value->n ); + null_mapping_options.have_binary = true; + break; + case KC: + null_mapping_options.binary_null.assign( kC( value ), value->n ); + null_mapping_options.have_binary = true; + break; + default: + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + switch( value->t ){ + case KG: + null_mapping_options.large_binary_null.assign( kG( value ), value->n ); + null_mapping_options.have_large_binary = true; + break; + case KC: + null_mapping_options.large_binary_null.assign( kC( value ), value->n ); + null_mapping_options.have_large_binary = true; + break; + default: + null_mapping_error( 
key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + switch( value->t ){ + case -UU: + null_mapping_options.fixed_binary_null.assign( &kU( value )->g[0], sizeof( U ) ); + null_mapping_options.have_fixed_binary = true; + break; + case KG: + null_mapping_options.fixed_binary_null.assign( kG( value ), value->n ); + null_mapping_options.have_fixed_binary = true; + break; + case KC: + null_mapping_options.fixed_binary_null.assign( kC( value ), value->n ); + null_mapping_options.have_fixed_binary = true; + break; + default: + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KI == value->t ){ + null_mapping_options.date32_null = value->i; + null_mapping_options.have_date32 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KJ == value->t ){ + null_mapping_options.date64_null = value->j; + null_mapping_options.have_date64 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KJ == value->t ){ + null_mapping_options.timestamp_null = value->j; + null_mapping_options.have_timestamp = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KI == value->t ){ + null_mapping_options.time32_null = value->i; + null_mapping_options.have_time32 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KJ == value->t ){ + null_mapping_options.time64_null = value->j; + null_mapping_options.have_time64 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void 
KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KF == value->t ){ + null_mapping_options.decimal_null = value->f; + null_mapping_options.have_decimal = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KJ == value->t ){ + null_mapping_options.duration_null = value->j; + null_mapping_options.have_duration = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KI == value->t ){ + null_mapping_options.month_interval_null = value->i; + null_mapping_options.have_month_interval = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KJ == value->t ){ + null_mapping_options.day_time_interval_null = value->j; + null_mapping_options.have_day_time_interval = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template +auto make_null_mapping() +{ + return std::make_pair( TypeId, &KdbOptions::NullMappingOption ); +} + +inline const KdbOptions::NullMappingHandlers KdbOptions::null_mapping_handlers = { + make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() +}; + } // namespace arrowkdb } // namespace kx From 7277e43edd20d63af0694f0595dbd47a7af9a3e5 Mon Sep 
17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 27 Jan 2023 11:46:00 +0300 Subject: [PATCH 035/276] Unifying supported null mapping options --- src/KdbOptions.h | 257 ++++++++++++++++++++++------------------------- 1 file changed, 118 insertions(+), 139 deletions(-) diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 35f0ef0..72a93d1 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -16,39 +16,48 @@ namespace kx { namespace arrowkdb { template -constexpr auto toUType( E enumerator ) noexcept +constexpr auto toUType( E option ) noexcept { - return static_cast>( enumerator ); + return static_cast>( option ); } template< typename E > struct ETraits { - using Names = std::unordered_map; + using Options = std::unordered_map; - static std::string name( E enumerator ){ - auto it = names.find( enumerator ); - if( it != names.end() ){ + static std::string mapping( E option ){ + auto it = options.find( option ); + if( it != options.end() ){ return it->second; } - return "UNKNOWN"; + return "unknown"; } - static std::string name( int index ) { return name( static_cast( index ) ); } + static std::string mapping( int option ) { return mapping( static_cast( option ) ); } - static auto value( const std::string& name ){ - auto it = std::find_if( names.begin(), names.end(), [&name]( const auto& value ){ - return name == value.second; + static std::set mappings(){ + std::set values; + transform( options.begin(), options.end(), std::inserter( values, end( values ) ), []( const auto& option ){ + return option.second; + } ); + + return values; + } + + static E option( const std::string& value ){ + auto it = std::find_if( options.begin(), options.end(), [&value]( const auto& option ){ + return value == option.second; } ); - if( it != names.end() ){ + if( it != options.end() ){ return it->first; } return E( 0 ); } - static const Names names; + static const Options options; }; // Supported options @@ -107,35 +116,6 @@ namespace Options const static std::set dict_options = { 
NULL_MAPPING, }; - const static std::set null_mapping_options = { - NM_NA - , NM_BOOLEAN - , NM_UINT_8 - , NM_INT_8 - , NM_UINT_16 - , NM_INT_16 - , NM_UINT_32 - , NM_INT_32 - , NM_UINT_64 - , NM_INT_64 - , NM_FLOAT_16 - , NM_FLOAT_32 - , NM_FLOAT_64 - , NM_STRING - , NM_LARGE_STRING - , NM_BINARY - , NM_LARGE_BINARY - , NM_FIXED_BINARY - , NM_DATE_32 - , NM_DATE_64 - , NM_TIMESTAMP - , NM_TIME_32 - , NM_TIME_64 - , NM_DECIMAL - , NM_DURATION - , NM_MONTH_INTERVAL - , NM_DAY_TIME_INTERVAL - }; struct NullMapping { @@ -205,7 +185,7 @@ namespace Options int64_t day_time_interval_null; template - auto GetOption() const { return std::make_pair( true, na_null );} + inline auto GetOption() const { return std::make_pair( true, na_null );} }; template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_boolean, boolean_null ); } @@ -237,34 +217,34 @@ namespace Options } // namespace Options template<> -inline const ETraits::Names ETraits::names{ - { arrow::Type::NA, Options::NM_NA } - , { arrow::Type::BOOL, Options::NM_BOOLEAN } - , { arrow::Type::UINT8, Options::NM_UINT_8 } - , { arrow::Type::INT8, Options::NM_INT_8 } - , { arrow::Type::UINT16, Options::NM_UINT_16 } - , { arrow::Type::INT16, Options::NM_INT_16 } - , { arrow::Type::UINT32, Options::NM_UINT_32 } - , { arrow::Type::INT32, Options::NM_INT_32 } - , { arrow::Type::UINT64, Options::NM_UINT_64 } - , { arrow::Type::INT64, Options::NM_INT_64 } - , { arrow::Type::HALF_FLOAT, Options::NM_FLOAT_16 } - , { arrow::Type::FLOAT, Options::NM_FLOAT_32 } - , { arrow::Type::DOUBLE, Options::NM_FLOAT_64 } - , { arrow::Type::STRING, Options::NM_STRING } - , { arrow::Type::LARGE_STRING, Options::NM_LARGE_STRING } - , { arrow::Type::BINARY, Options::NM_BINARY } - , { arrow::Type::LARGE_BINARY, Options::NM_LARGE_BINARY } - , { arrow::Type::FIXED_SIZE_BINARY, Options::NM_FIXED_BINARY } - , { arrow::Type::DATE32, Options::NM_DATE_32 } - , { arrow::Type::DATE64, Options::NM_DATE_64 } - , { 
arrow::Type::TIMESTAMP, Options::NM_TIMESTAMP } - , { arrow::Type::TIME32, Options::NM_TIME_32 } - , { arrow::Type::TIME64, Options::NM_TIME_64 } - , { arrow::Type::DECIMAL, Options::NM_DECIMAL } - , { arrow::Type::DURATION, Options::NM_DURATION } - , { arrow::Type::INTERVAL_MONTHS, Options::NM_MONTH_INTERVAL } - , { arrow::Type::INTERVAL_DAY_TIME, Options::NM_DAY_TIME_INTERVAL } +inline const ETraits::Options ETraits::options{ + { arrow::Type::NA, arrowkdb::Options::NM_NA } + , { arrow::Type::BOOL, arrowkdb::Options::NM_BOOLEAN } + , { arrow::Type::UINT8, arrowkdb::Options::NM_UINT_8 } + , { arrow::Type::INT8, arrowkdb::Options::NM_INT_8 } + , { arrow::Type::UINT16, arrowkdb::Options::NM_UINT_16 } + , { arrow::Type::INT16, arrowkdb::Options::NM_INT_16 } + , { arrow::Type::UINT32, arrowkdb::Options::NM_UINT_32 } + , { arrow::Type::INT32, arrowkdb::Options::NM_INT_32 } + , { arrow::Type::UINT64, arrowkdb::Options::NM_UINT_64 } + , { arrow::Type::INT64, arrowkdb::Options::NM_INT_64 } + , { arrow::Type::HALF_FLOAT, arrowkdb::Options::NM_FLOAT_16 } + , { arrow::Type::FLOAT, arrowkdb::Options::NM_FLOAT_32 } + , { arrow::Type::DOUBLE, arrowkdb::Options::NM_FLOAT_64 } + , { arrow::Type::STRING, arrowkdb::Options::NM_STRING } + , { arrow::Type::LARGE_STRING, arrowkdb::Options::NM_LARGE_STRING } + , { arrow::Type::BINARY, arrowkdb::Options::NM_BINARY } + , { arrow::Type::LARGE_BINARY, arrowkdb::Options::NM_LARGE_BINARY } + , { arrow::Type::FIXED_SIZE_BINARY, arrowkdb::Options::NM_FIXED_BINARY } + , { arrow::Type::DATE32, arrowkdb::Options::NM_DATE_32 } + , { arrow::Type::DATE64, arrowkdb::Options::NM_DATE_64 } + , { arrow::Type::TIMESTAMP, arrowkdb::Options::NM_TIMESTAMP } + , { arrow::Type::TIME32, arrowkdb::Options::NM_TIME_32 } + , { arrow::Type::TIME64, arrowkdb::Options::NM_TIME_64 } + , { arrow::Type::DECIMAL, arrowkdb::Options::NM_DECIMAL } + , { arrow::Type::DURATION, arrowkdb::Options::NM_DURATION } + , { arrow::Type::INTERVAL_MONTHS, 
arrowkdb::Options::NM_MONTH_INTERVAL } + , { arrow::Type::INTERVAL_DAY_TIME, arrowkdb::Options::NM_DAY_TIME_INTERVAL } }; // Helper class for reading dictionary of options @@ -276,12 +256,6 @@ inline const ETraits::Names ETraits::names // 0 of -KS|-KJ|XD|KC class KdbOptions { -public: - template - inline void NullMappingOption( const std::string& key, K value ); - - using NullMappingHandler = void ( KdbOptions::* )( const std::string&, K ); - using NullMappingHandlers = std::unordered_map; private: Options::NullMapping null_mapping_options; std::map string_options; @@ -292,6 +266,8 @@ class KdbOptions const std::set& supported_dict_options; const std::set& supported_null_mapping_options; + using NullMappingHandler = void ( KdbOptions::* )( const std::string&, K ); + using NullMappingHandlers = std::unordered_map; static const NullMappingHandlers null_mapping_handlers; private: const std::string ToUpper(std::string str) const @@ -337,7 +313,7 @@ class KdbOptions throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING keys (expected=11h), type=" + std::to_string( keys->t ) + "h" ); } if( 0 != values->t ){ - throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING values (extected=0h), type=" ); + throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING values (extected=0h), type=" + std::to_string( values->t ) + "h" ); } for( auto i = 0ll; i < values->n; ++i ){ const std::string key = ToLower( kS( keys )[i] ); @@ -345,8 +321,8 @@ class KdbOptions throw InvalidOption( "Unsupported NULL_MAPPING option '" + key + "'" ); } K value = kK( values )[i]; - arrow::Type::type mapping = ETraits::value( key ); - auto it = null_mapping_handlers.find( mapping ); + auto option = ETraits::option( key ); + auto it = null_mapping_handlers.find( option ); if( it != null_mapping_handlers.end() ){ ( this->*it->second )( key, value ); } @@ -429,7 +405,7 @@ class KdbOptions , const std::set supported_string_options_ , const std::set supported_int_options_ , const 
std::set& supported_dict_options_ = Options::dict_options - , const std::set& supported_null_mapping_options_ = Options::null_mapping_options ) + , const std::set& supported_null_mapping_options_ = ETraits::mappings() ) : null_mapping_options {0} , supported_string_options(supported_string_options_) , supported_int_options(supported_int_options_) @@ -462,6 +438,9 @@ class KdbOptions } } + template + inline void HandleNullMapping( const std::string& key, K value ); + template auto GetNullMappingOption() const { return null_mapping_options.GetOption(); } @@ -505,7 +484,7 @@ inline void null_mapping_error( const std::string& key, K value ) } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { switch( value->t ){ case -KB: @@ -522,7 +501,7 @@ inline void KdbOptions::NullMappingOption( const std::string& } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KG == value->t ){ null_mapping_options.uint8_null = value->g; @@ -534,7 +513,7 @@ inline void KdbOptions::NullMappingOption( const std::string } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KG == value->t ){ null_mapping_options.int8_null = value->g; @@ -546,7 +525,7 @@ inline void KdbOptions::NullMappingOption( const std::string& } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KH == value->t ){ null_mapping_options.uint16_null = value->h; @@ -558,7 +537,7 @@ inline void KdbOptions::NullMappingOption( const std::strin } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void 
KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KH == value->t ){ null_mapping_options.int16_null = value->h; @@ -570,7 +549,7 @@ inline void KdbOptions::NullMappingOption( const std::string } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KI == value->t ){ null_mapping_options.uint32_null = value->i; @@ -582,7 +561,7 @@ inline void KdbOptions::NullMappingOption( const std::strin } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KI == value->t ){ null_mapping_options.int32_null = value->i; @@ -594,7 +573,7 @@ inline void KdbOptions::NullMappingOption( const std::string } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KJ == value->t ){ null_mapping_options.uint64_null = value->j; @@ -606,7 +585,7 @@ inline void KdbOptions::NullMappingOption( const std::strin } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KJ == value->t ){ null_mapping_options.int64_null = value->j; @@ -618,7 +597,7 @@ inline void KdbOptions::NullMappingOption( const std::string } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KH == value->t ){ null_mapping_options.float16_null = value->h; @@ -630,7 +609,7 @@ inline void KdbOptions::NullMappingOption( const std::s } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KE == value->t 
){ null_mapping_options.float32_null = value->e; @@ -642,7 +621,7 @@ inline void KdbOptions::NullMappingOption( const std::string } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KF == value->t ){ null_mapping_options.float64_null = value->f; @@ -654,10 +633,10 @@ inline void KdbOptions::NullMappingOption( const std::strin } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( -KC == value->t ){ - null_mapping_options.string_null.assign( (char*)kC( value ), value->n ); + if( KC == value->t ){ + null_mapping_options.string_null.assign( ( char* )kC( value ), value->n ); null_mapping_options.have_string = true; } else{ @@ -666,10 +645,10 @@ inline void KdbOptions::NullMappingOption( const std::strin } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( -KC == value->t ){ - null_mapping_options.large_string_null.assign( (char*)kC( value ), value->n ); + if( KC == value->t ){ + null_mapping_options.large_string_null.assign( ( char* )kC( value ), value->n ); null_mapping_options.have_large_string = true; } else{ @@ -678,7 +657,7 @@ inline void KdbOptions::NullMappingOption( const std: } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { switch( value->t ){ case KG: @@ -695,7 +674,7 @@ inline void KdbOptions::NullMappingOption( const std::strin } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { switch( value->t ){ case KG: @@ -712,7 +691,7 @@ inline void 
KdbOptions::NullMappingOption( const std: } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { switch( value->t ){ case -UU: @@ -733,7 +712,7 @@ inline void KdbOptions::NullMappingOption( const } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KI == value->t ){ null_mapping_options.date32_null = value->i; @@ -745,7 +724,7 @@ inline void KdbOptions::NullMappingOption( const std::strin } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KJ == value->t ){ null_mapping_options.date64_null = value->j; @@ -757,7 +736,7 @@ inline void KdbOptions::NullMappingOption( const std::strin } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KJ == value->t ){ null_mapping_options.timestamp_null = value->j; @@ -769,7 +748,7 @@ inline void KdbOptions::NullMappingOption( const std::st } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KI == value->t ){ null_mapping_options.time32_null = value->i; @@ -781,7 +760,7 @@ inline void KdbOptions::NullMappingOption( const std::strin } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KJ == value->t ){ null_mapping_options.time64_null = value->j; @@ -793,7 +772,7 @@ inline void KdbOptions::NullMappingOption( const std::strin } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) 
+inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KF == value->t ){ null_mapping_options.decimal_null = value->f; @@ -805,7 +784,7 @@ inline void KdbOptions::NullMappingOption( const std::stri } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KJ == value->t ){ null_mapping_options.duration_null = value->j; @@ -817,7 +796,7 @@ inline void KdbOptions::NullMappingOption( const std::str } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KI == value->t ){ null_mapping_options.month_interval_null = value->i; @@ -829,7 +808,7 @@ inline void KdbOptions::NullMappingOption( const s } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KJ == value->t ){ null_mapping_options.day_time_interval_null = value->j; @@ -841,38 +820,38 @@ inline void KdbOptions::NullMappingOption( const } template -auto make_null_mapping() +auto make_handler() { - return std::make_pair( TypeId, &KdbOptions::NullMappingOption ); + return std::make_pair( TypeId, &KdbOptions::HandleNullMapping ); } inline const KdbOptions::NullMappingHandlers KdbOptions::null_mapping_handlers = { - make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , 
make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() + make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() }; } // namespace arrowkdb From b9982b1d7b5631f96e8b1094202c692b7222ec6f Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 30 Jan 2023 17:50:54 +0300 Subject: [PATCH 036/276] Patch #1, getting rid of NA fields --- src/ArrayWriter.cpp | 8 ++++---- src/KdbOptions.h | 13 ++++--------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index 4d298c1..a084ffd 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -451,7 +451,7 @@ void PopulateBuilder(shared_ptr datatype, K if( type_overrides.null_mapping.have_boolean ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.boolean_null != kG( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.boolean_null != static_cast( kG( k_array )[i] ); } PARQUET_THROW_NOT_OK( bool_builder->AppendValues( ( uint8_t* )kG( k_array ), k_array->n, null_bitmap ) ); } @@ -467,7 +467,7 @@ void PopulateBuilder(shared_ptr datatype, K if( type_overrides.null_mapping.have_uint8 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.uint8_null != kG( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.uint8_null != static_cast( kG( k_array )[i] ); } PARQUET_THROW_NOT_OK( uint8_builder->AppendValues( ( uint8_t* )kG( k_array ), 
k_array->n, null_bitmap ) ); } @@ -499,7 +499,7 @@ void PopulateBuilder(shared_ptr datatype, if( type_overrides.null_mapping.have_uint16 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.uint16_null != kH( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.uint16_null != static_cast( kH( k_array )[i] ); } PARQUET_THROW_NOT_OK( uint16_builder->AppendValues( ( uint16_t* )kH( k_array ), k_array->n, null_bitmap ) ); } @@ -595,7 +595,7 @@ void PopulateBuilder(shared_ptr dataty if( type_overrides.null_mapping.have_float16 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.float16_null != kH( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.float16_null != static_cast( kH( k_array )[i] ); } PARQUET_THROW_NOT_OK( hfl_builder->AppendValues( ( uint16_t* )kH( k_array ), k_array->n, null_bitmap ) ); } diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 72a93d1..9f14230 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -76,7 +76,6 @@ namespace Options const std::string NULL_MAPPING = "NULL_MAPPING"; // Null mapping options - const std::string NM_NA = "na"; const std::string NM_BOOLEAN = "bool"; const std::string NM_UINT_8 = "uint8"; const std::string NM_INT_8 = "int8"; @@ -119,7 +118,6 @@ namespace Options struct NullMapping { - bool have_na; bool have_boolean; bool have_uint8; bool have_int8; @@ -149,7 +147,6 @@ namespace Options using Binary = std::basic_string; - void* na_null = nullptr; bool boolean_null; uint8_t uint8_null; @@ -184,8 +181,8 @@ namespace Options int32_t month_interval_null; int64_t day_time_interval_null; - template - inline auto GetOption() const { return std::make_pair( true, na_null );} + template + inline auto GetOption() const; }; template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_boolean, boolean_null ); } @@ -218,8 +215,7 @@ 
namespace Options template<> inline const ETraits::Options ETraits::options{ - { arrow::Type::NA, arrowkdb::Options::NM_NA } - , { arrow::Type::BOOL, arrowkdb::Options::NM_BOOLEAN } + { arrow::Type::BOOL, arrowkdb::Options::NM_BOOLEAN } , { arrow::Type::UINT8, arrowkdb::Options::NM_UINT_8 } , { arrow::Type::INT8, arrowkdb::Options::NM_INT_8 } , { arrow::Type::UINT16, arrowkdb::Options::NM_UINT_16 } @@ -444,8 +440,7 @@ class KdbOptions template auto GetNullMappingOption() const { return null_mapping_options.GetOption(); } - void GetNullMappingOptions( Options::NullMapping& null_mapping ) const - { + void GetNullMappingOptions( Options::NullMapping& null_mapping ) const{ null_mapping = null_mapping_options; } From 0e589ee72ce7eed24d0786029f7d4bdd42828bf2 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 30 Jan 2023 20:57:51 +0300 Subject: [PATCH 037/276] Minor subset of readers --- src/ArrayReader.cpp | 50 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 10 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index ced60fd..d2a12c6 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -159,7 +159,15 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto int16_array = static_pointer_cast(array_data); - memcpy(kH(k_array), int16_array->raw_values(), int16_array->length() * sizeof(arrow::Int16Array::value_type)); + if( type_overrides.null_mapping.have_int16 ){ + for( auto i = 0ll; i < int16_array->length(); ++i ){ + kH( k_array )[i] = ( int16_array->IsNull( i ) * type_overrides.null_mapping.int16_null ) + + (!int16_array->IsNull( i ) * int16_array->Value( i ) ); + } + } + else { + memcpy(kH(k_array), int16_array->raw_values(), int16_array->length() * sizeof(arrow::Int16Array::value_type)); + } } template<> @@ -173,7 +181,15 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) 
{ auto int32_array = static_pointer_cast(array_data); - memcpy(kI(k_array), int32_array->raw_values(), int32_array->length() * sizeof(arrow::Int32Array::value_type)); + if( type_overrides.null_mapping.have_int32 ){ + for( auto i = 0ll; i < int32_array->length(); ++i ){ + kH( k_array )[i] = ( int32_array->IsNull( i ) * type_overrides.null_mapping.int32_null ) + + (!int32_array->IsNull( i ) * int32_array->Value( i ) ); + } + } + else { + memcpy(kI(k_array), int32_array->raw_values(), int32_array->length() * sizeof(arrow::Int32Array::value_type)); + } } template<> @@ -216,10 +232,17 @@ void AppendArray(shared_ptr array_data, K k_a { auto str_array = static_pointer_cast(array_data); for (auto i = 0; i < str_array->length(); ++i) { - auto str_data = str_array->GetString(i); - K k_str = ktn(KC, str_data.length()); - memcpy(kG(k_str), str_data.data(), str_data.length()); - kK(k_array)[index++] = k_str; + K k_str = nullptr; + if( type_overrides.null_mapping.have_string ){ + k_str = ktn( KC, type_overrides.null_mapping.string_null.length() ); + memcpy( kG(k_str), type_overrides.null_mapping.string_null.data(), type_overrides.null_mapping.string_null.length() ); + } + else{ + auto str_data = str_array->GetString(i); + k_str = ktn(KC, str_data.length()); + memcpy(kG( k_str ), str_data.data(), str_data.length()); + } + kK( k_array )[index++] = k_str; } } @@ -228,10 +251,17 @@ void AppendArray(shared_ptr array_data, { auto str_array = static_pointer_cast(array_data); for (auto i = 0; i < str_array->length(); ++i) { - auto str_data = str_array->GetString(i); - K k_str = ktn(KC, str_data.length()); - memcpy(kG(k_str), str_data.data(), str_data.length()); - kK(k_array)[index++] = k_str; + K k_str = nullptr; + if( type_overrides.null_mapping.have_large_string ){ + k_str = ktn( KC, type_overrides.null_mapping.large_string_null.length() ); + memcpy( kG( k_str ), type_overrides.null_mapping.large_string_null.data(), type_overrides.null_mapping.large_string_null.length() ); + } + else{ 
+ auto str_data = str_array->GetString(i); + k_str = ktn(KC, str_data.length()); + memcpy(kG(k_str), str_data.data(), str_data.length()); + } + kK( k_array )[index++] = k_str; } } From b08cd417ccd6d6d0cd33ba0c793242ae90dfde71 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 31 Jan 2023 11:40:18 +0300 Subject: [PATCH 038/276] Null mapping of several types --- src/ArrayReader.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index d2a12c6..d1413b9 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -183,7 +183,7 @@ void AppendArray(shared_ptr array_data, K k_ar auto int32_array = static_pointer_cast(array_data); if( type_overrides.null_mapping.have_int32 ){ for( auto i = 0ll; i < int32_array->length(); ++i ){ - kH( k_array )[i] = ( int32_array->IsNull( i ) * type_overrides.null_mapping.int32_null ) + kI( k_array )[i] = ( int32_array->IsNull( i ) * type_overrides.null_mapping.int32_null ) + (!int32_array->IsNull( i ) * int32_array->Value( i ) ); } } @@ -233,7 +233,7 @@ void AppendArray(shared_ptr array_data, K k_a auto str_array = static_pointer_cast(array_data); for (auto i = 0; i < str_array->length(); ++i) { K k_str = nullptr; - if( type_overrides.null_mapping.have_string ){ + if( type_overrides.null_mapping.have_string && str_array->IsNull( i ) ){ k_str = ktn( KC, type_overrides.null_mapping.string_null.length() ); memcpy( kG(k_str), type_overrides.null_mapping.string_null.data(), type_overrides.null_mapping.string_null.length() ); } @@ -252,7 +252,7 @@ void AppendArray(shared_ptr array_data, auto str_array = static_pointer_cast(array_data); for (auto i = 0; i < str_array->length(); ++i) { K k_str = nullptr; - if( type_overrides.null_mapping.have_large_string ){ + if( type_overrides.null_mapping.have_large_string && str_array->IsNull( i ) ){ k_str = ktn( KC, type_overrides.null_mapping.large_string_null.length() ); memcpy( kG( k_str ), 
type_overrides.null_mapping.large_string_null.data(), type_overrides.null_mapping.large_string_null.length() ); } From eb3b51ef9cfb31d79c6aa7acdc4646e352b468ff Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 31 Jan 2023 12:15:21 +0300 Subject: [PATCH 039/276] Replacing duplicating type accessors --- src/KdbOptions.h | 52 ++++++++++++++++-------------------------------- 1 file changed, 17 insertions(+), 35 deletions(-) diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 9f14230..ba9847b 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -481,16 +481,11 @@ inline void null_mapping_error( const std::string& key, K value ) template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - switch( value->t ){ - case -KB: - null_mapping_options.boolean_null = value->g; - null_mapping_options.have_boolean = true; - break; - case -KG: + if( value->t == -KB || value->t == -KG ){ null_mapping_options.boolean_null = value->g; null_mapping_options.have_boolean = true; - break; - default: + } + else{ null_mapping_error( key, value ); } } @@ -499,7 +494,7 @@ template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KG == value->t ){ - null_mapping_options.uint8_null = value->g; + null_mapping_options.uint8_null = static_cast( value->g ); null_mapping_options.have_uint8 = true; } else{ @@ -523,7 +518,7 @@ template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KH == value->t ){ - null_mapping_options.uint16_null = value->h; + null_mapping_options.uint16_null = static_cast( value->h ); null_mapping_options.have_uint16 = true; } else{ @@ -547,7 +542,7 @@ template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KI == value->t ){ - null_mapping_options.uint32_null = value->i; + null_mapping_options.uint32_null = static_cast( value->i ); null_mapping_options.have_uint32 = true; } else{ @@ -571,7 +566,7 @@ template<> inline void 
KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KJ == value->t ){ - null_mapping_options.uint64_null = value->j; + null_mapping_options.uint64_null = static_cast( value->j ); null_mapping_options.have_uint64 = true; } else{ @@ -595,7 +590,7 @@ template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KH == value->t ){ - null_mapping_options.float16_null = value->h; + null_mapping_options.float16_null = static_cast( value->h ); null_mapping_options.have_float16 = true; } else{ @@ -654,16 +649,11 @@ inline void KdbOptions::HandleNullMapping( const std: template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - switch( value->t ){ - case KG: - null_mapping_options.binary_null.assign( kG( value ), value->n ); - null_mapping_options.have_binary = true; - break; - case KC: - null_mapping_options.binary_null.assign( kC( value ), value->n ); - null_mapping_options.have_binary = true; - break; - default: + if( value->t == KG || value->t == KC ){ + null_mapping_options.binary_null.assign( kG( value ), value->n ); + null_mapping_options.have_binary = true; + } + else{ null_mapping_error( key, value ); } } @@ -671,16 +661,11 @@ inline void KdbOptions::HandleNullMapping( const std::strin template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - switch( value->t ){ - case KG: + if( value->t == KG || value->t == KC ){ null_mapping_options.large_binary_null.assign( kG( value ), value->n ); null_mapping_options.have_large_binary = true; - break; - case KC: - null_mapping_options.large_binary_null.assign( kC( value ), value->n ); - null_mapping_options.have_large_binary = true; - break; - default: + } + else{ null_mapping_error( key, value ); } } @@ -694,11 +679,8 @@ inline void KdbOptions::HandleNullMapping( const null_mapping_options.have_fixed_binary = true; break; case KG: - null_mapping_options.fixed_binary_null.assign( kG( value ), value->n ); - 
null_mapping_options.have_fixed_binary = true; - break; case KC: - null_mapping_options.fixed_binary_null.assign( kC( value ), value->n ); + null_mapping_options.fixed_binary_null.assign( kG( value ), value->n ); null_mapping_options.have_fixed_binary = true; break; default: From f96991e668a70099ee3a01b3121c99a4b762d6e8 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 31 Jan 2023 12:34:44 +0300 Subject: [PATCH 040/276] Temporal type options --- src/KdbOptions.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/KdbOptions.h b/src/KdbOptions.h index ba9847b..01b4afd 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -691,7 +691,7 @@ inline void KdbOptions::HandleNullMapping( const template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( -KI == value->t ){ + if( value->t == -KI || value->t == -KD ){ null_mapping_options.date32_null = value->i; null_mapping_options.have_date32 = true; } @@ -703,7 +703,7 @@ inline void KdbOptions::HandleNullMapping( const std::strin template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( -KJ == value->t ){ + if( value->t == -KJ || value->t == -KP ){ null_mapping_options.date64_null = value->j; null_mapping_options.have_date64 = true; } @@ -715,7 +715,7 @@ inline void KdbOptions::HandleNullMapping( const std::strin template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( -KJ == value->t ){ + if( value->t == -KJ || value->t == -KP ){ null_mapping_options.timestamp_null = value->j; null_mapping_options.have_timestamp = true; } @@ -727,7 +727,7 @@ inline void KdbOptions::HandleNullMapping( const std::st template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( -KI == value->t ){ + if( value->t == -KI || value->t == -KT ){ null_mapping_options.time32_null = value->i; null_mapping_options.have_time32 = true; } @@ -739,7 +739,7 @@ inline 
void KdbOptions::HandleNullMapping( const std::strin template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( -KJ == value->t ){ + if( value->t == -KJ || value->t == -KN ){ null_mapping_options.time64_null = value->j; null_mapping_options.have_time64 = true; } @@ -763,7 +763,7 @@ inline void KdbOptions::HandleNullMapping( const std::stri template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( -KJ == value->t ){ + if( value->t == -KJ || value->t == -KN ){ null_mapping_options.duration_null = value->j; null_mapping_options.have_duration = true; } @@ -775,7 +775,7 @@ inline void KdbOptions::HandleNullMapping( const std::str template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( -KI == value->t ){ + if( value->t == -KI || value->t == -KM ){ null_mapping_options.month_interval_null = value->i; null_mapping_options.have_month_interval = true; } @@ -787,7 +787,7 @@ inline void KdbOptions::HandleNullMapping( const s template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( -KJ == value->t ){ + if( value->t == -KJ || value->t == -KN ){ null_mapping_options.day_time_interval_null = value->j; null_mapping_options.have_day_time_interval = true; } From d27a16f91fb7ed085aa0e888bd7b46ad8329659a Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 31 Jan 2023 13:20:46 +0300 Subject: [PATCH 041/276] Replacing null_mapping integers with temporals --- examples/null_mapping.q | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/null_mapping.q b/examples/null_mapping.q index 629672c..55b3a37 100644 --- a/examples/null_mapping.q +++ b/examples/null_mapping.q @@ -11,8 +11,8 @@ short_opts:(`bool`uint8`int8`uint16`int16)!(0b;0x01;0x02;3h;4h); long_opts:(`uint32`int32`uint64`int64)!(5i;6i;7;8); float_opts:(`float32`float64`decimal)!(1.23e;4.56;7.89); 
str_opts:(`string`large_string`binary`large_binary`fixed_binary)!("start";"stop";"x"$"alert";"x"$"acknowledge";0Ng); -time_opts:(`date32`date64`timestamp`time64`duration)!("i"$2006.07.21;"j"$2015.01.01D00:00:00.000000000;"j"$2011.01.01D00:00:00.000000000;"j"$12:00:00.000000000;"j"$12:00:00.000000000); -other_opts:(`float16`time32`month_interval`day_time_interval)!(9h;"i"$09:01:02.042;"i"$2006.07m;"j"$12:00:00.000000000); +time_opts:(`date32`date64`timestamp`time64`duration)!(2006.07.21;2015.01.01D00:00:00.000000000;2011.01.01D00:00:00.000000000;12:00:00.000000000;12:00:00.000000000); +other_opts:(`float16`time32`month_interval`day_time_interval)!(9h;09:01:02.042;2006.07m;12:00:00.000000000); options:(``NULL_MAPPING)!((::);short_opts,long_opts,float_opts,str_opts,time_opts,other_opts); From f0b284da489c46ce34817b20658ed3783c7ba94b Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 31 Jan 2023 17:02:55 +0300 Subject: [PATCH 042/276] Integer types expansion --- src/ArrayReader.cpp | 62 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 55 insertions(+), 7 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index d1413b9..0d36b2e 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -138,21 +138,45 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint8_array = static_pointer_cast(array_data); - memcpy(kG(k_array), uint8_array->raw_values(), uint8_array->length() * sizeof(arrow::UInt8Array::value_type)); + if( type_overrides.null_mapping.have_uint8 ){ + for( auto i = 0ll; i < uint8_array->length(); ++i ){ + kG( k_array )[i] = ( uint8_array->IsNull( i ) * type_overrides.null_mapping.uint8_null ) + + ( !uint8_array->IsNull( i ) * uint8_array->Value( i ) ); + } + } + else { + memcpy(kG(k_array), uint8_array->raw_values(), uint8_array->length() * sizeof(arrow::UInt8Array::value_type)); + } } template<> void AppendArray(shared_ptr array_data, K k_array, 
size_t& index, TypeMappingOverride& type_overrides) { auto int8_array = static_pointer_cast(array_data); - memcpy(kG(k_array), int8_array->raw_values(), int8_array->length() * sizeof(arrow::Int8Array::value_type)); + if( type_overrides.null_mapping.have_int8 ){ + for( auto i = 0ll; i < int8_array->length(); ++i ){ + kG( k_array )[i] = ( int8_array->IsNull( i ) * type_overrides.null_mapping.int8_null ) + + ( !int8_array->IsNull( i ) * int8_array->Value( i ) ); + } + } + else { + memcpy(kG(k_array), int8_array->raw_values(), int8_array->length() * sizeof(arrow::Int8Array::value_type)); + } } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint16_array = static_pointer_cast(array_data); - memcpy(kH(k_array), uint16_array->raw_values(), uint16_array->length() * sizeof(arrow::UInt16Array::value_type)); + if( type_overrides.null_mapping.have_uint16 ){ + for( auto i = 0ll; i < uint16_array->length(); ++i ){ + kH( k_array )[i] = ( uint16_array->IsNull( i ) * type_overrides.null_mapping.uint16_null ) + + ( !uint16_array->IsNull( i ) * uint16_array->Value( i ) ); + } + } + else { + memcpy(kH(k_array), uint16_array->raw_values(), uint16_array->length() * sizeof(arrow::UInt16Array::value_type)); + } } template<> @@ -162,7 +186,7 @@ void AppendArray(shared_ptr array_data, K k_ar if( type_overrides.null_mapping.have_int16 ){ for( auto i = 0ll; i < int16_array->length(); ++i ){ kH( k_array )[i] = ( int16_array->IsNull( i ) * type_overrides.null_mapping.int16_null ) - + (!int16_array->IsNull( i ) * int16_array->Value( i ) ); + + ( !int16_array->IsNull( i ) * int16_array->Value( i ) ); } } else { @@ -174,7 +198,15 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint32_array = static_pointer_cast(array_data); - memcpy(kI(k_array), uint32_array->raw_values(), uint32_array->length() * sizeof(arrow::UInt32Array::value_type)); + if( 
type_overrides.null_mapping.have_uint32 ){ + for( auto i = 0ll; i < uint32_array->length(); ++i ){ + kI( k_array )[i] = ( uint32_array->IsNull( i ) * type_overrides.null_mapping.uint32_null ) + + ( !uint32_array->IsNull( i ) * uint32_array->Value( i ) ); + } + } + else { + memcpy(kI(k_array), uint32_array->raw_values(), uint32_array->length() * sizeof(arrow::UInt32Array::value_type)); + } } template<> @@ -196,14 +228,30 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint64_array = static_pointer_cast(array_data); - memcpy(kJ(k_array), uint64_array->raw_values(), uint64_array->length() * sizeof(arrow::UInt64Array::value_type)); + if( type_overrides.null_mapping.have_uint64 ){ + for( auto i = 0ll; i < uint64_array->length(); ++i ){ + kJ( k_array )[i] = ( uint64_array->IsNull( i ) * type_overrides.null_mapping.uint64_null ) + + ( !uint64_array->IsNull( i ) * uint64_array->Value( i ) ); + } + } + else { + memcpy(kJ(k_array), uint64_array->raw_values(), uint64_array->length() * sizeof(arrow::UInt64Array::value_type)); + } } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto int64_array = static_pointer_cast(array_data); - memcpy(kJ(k_array), int64_array->raw_values(), int64_array->length() * sizeof(arrow::Int64Array::value_type)); + if( type_overrides.null_mapping.have_int32 ){ + for( auto i = 0ll; i < int64_array->length(); ++i ){ + kJ( k_array )[i] = ( int64_array->IsNull( i ) * type_overrides.null_mapping.int64_null ) + + (!int64_array->IsNull( i ) * int64_array->Value( i ) ); + } + } + else { + memcpy(kJ(k_array), int64_array->raw_values(), int64_array->length() * sizeof(arrow::Int64Array::value_type)); + } } template<> From acb65a74cb5ed2820ac6acb9e1f3dfa757d3253b Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 31 Jan 2023 18:25:56 +0300 Subject: [PATCH 043/276] Boolean and floats expansion --- 
src/ArrayReader.cpp | 42 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 0d36b2e..41ffcd7 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -130,8 +130,11 @@ void AppendArray(shared_ptr array_data, K k_arr { auto bool_array = static_pointer_cast(array_data); // BooleanArray doesn't have a bulk reader since arrow BooleanType is only 1 bit - for (auto i = 0; i < bool_array->length(); ++i) - kG(k_array)[index++] = bool_array->Value(i); + for (auto i = 0; i < bool_array->length(); ++i){ + kG(k_array)[index++] = // preventing branch prediction failures + ( ( type_overrides.null_mapping.have_boolean && bool_array->IsNull( i ) ) * type_overrides.null_mapping.float16_null ) + + ( !( type_overrides.null_mapping.have_boolean && bool_array->IsNull( i ) ) * bool_array->Value( i ) ); + } } template<> @@ -258,21 +261,45 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto hfl_array = static_pointer_cast(array_data); - memcpy(kH(k_array), hfl_array->raw_values(), hfl_array->length() * sizeof(arrow::HalfFloatArray::value_type)); + if( type_overrides.null_mapping.have_float16 ){ + for( auto i = 0ll; i < hfl_array->length(); ++i ){ + kH( k_array )[i] = ( hfl_array->IsNull( i ) * type_overrides.null_mapping.float16_null ) + + ( !hfl_array->IsNull( i ) * hfl_array->Value( i ) ); + } + } + else { + memcpy(kH(k_array), hfl_array->raw_values(), hfl_array->length() * sizeof(arrow::HalfFloatArray::value_type)); + } } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto fl_array = static_pointer_cast(array_data); - memcpy(kE(k_array), fl_array->raw_values(), fl_array->length() * sizeof(arrow::FloatArray::value_type)); + if( type_overrides.null_mapping.have_float32 ){ + for( auto i = 0ll; i < fl_array->length(); ++i ){ + kE( k_array )[i] = 
( fl_array->IsNull( i ) * type_overrides.null_mapping.float32_null ) + + ( !fl_array->IsNull( i ) * fl_array->Value( i ) ); + } + } + else { + memcpy(kE(k_array), fl_array->raw_values(), fl_array->length() * sizeof(arrow::FloatArray::value_type)); + } } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto dbl_array = static_pointer_cast(array_data); - memcpy(kF(k_array), dbl_array->raw_values(), dbl_array->length() * sizeof(arrow::DoubleArray::value_type)); + if( type_overrides.null_mapping.have_float64 ){ + for( auto i = 0ll; i < dbl_array->length(); ++i ){ + kF( k_array )[i] = ( dbl_array->IsNull( i ) * type_overrides.null_mapping.float64_null ) + + ( !dbl_array->IsNull( i ) * dbl_array->Value( i ) ); + } + } + else { + memcpy(kF(k_array), dbl_array->raw_values(), dbl_array->length() * sizeof(arrow::DoubleArray::value_type)); + } } template<> @@ -406,7 +433,10 @@ void AppendArray(shared_ptr array_data, K k_ auto decimal = arrow::Decimal128(dec_array->Value(i)); if (type_overrides.decimal128_as_double) { // Convert the decimal to a double - auto dec_as_double = decimal.ToDouble(dec_type->scale()); + auto dec_as_double = + ( ( type_overrides.null_mapping.have_decimal && dec_array->IsNull( i ) ) * type_overrides.null_mapping.decimal_null ) + + ( !( type_overrides.null_mapping.have_decimal && dec_array->IsNull( i ) ) * decimal.ToDouble( dec_type->scale() ) ); + kF(k_array)[index++] = dec_as_double; } else { // Each decimal is a list of 16 bytes From 6e8a2fe12b5535fb090bec38efc96b3c908aaaac Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 1 Feb 2023 11:14:06 +0300 Subject: [PATCH 044/276] Strings and binaries expansion --- src/ArrayReader.cpp | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 41ffcd7..35f3021 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -345,9 +345,16 @@ 
void AppendArray(shared_ptr array_data, K k_a { auto bin_array = static_pointer_cast(array_data); for (auto i = 0; i < bin_array->length(); ++i) { - auto bin_data = bin_array->GetString(i); - K k_bin = ktn(KG, bin_data.length()); - memcpy(kG(k_bin), bin_data.data(), bin_data.length()); + K k_bin = nullptr; + if( type_overrides.null_mapping.have_binary && bin_array->IsNull( i ) ){ + k_bin = ktn( KG, type_overrides.null_mapping.binary_null.length() ); + memcpy( kG( k_bin ), type_overrides.null_mapping.binary_null.data(), type_overrides.null_mapping.binary_null.length() ); + } + else{ + auto bin_data = bin_array->GetString(i); + k_bin = ktn(KG, bin_data.length()); + memcpy(kG(k_bin), bin_data.data(), bin_data.length()); + } kK(k_array)[index++] = k_bin; } } @@ -357,9 +364,16 @@ void AppendArray(shared_ptr array_data, { auto bin_array = static_pointer_cast(array_data); for (auto i = 0; i < bin_array->length(); ++i) { - auto bin_data = bin_array->GetString(i); - K k_bin = ktn(KG, bin_data.length()); - memcpy(kG(k_bin), bin_data.data(), bin_data.length()); + K k_bin = nullptr; + if( type_overrides.null_mapping.have_large_binary && bin_array->IsNull( i ) ){ + k_bin = ktn( KG, type_overrides.null_mapping.large_binary_null.length() ); + memcpy( kG( k_bin ), type_overrides.null_mapping.large_binary_null.data(), type_overrides.null_mapping.large_binary_null.length() ); + } + else{ + auto bin_data = bin_array->GetString(i); + k_bin = ktn(KG, bin_data.length()); + memcpy(kG(k_bin), bin_data.data(), bin_data.length()); + } kK(k_array)[index++] = k_bin; } } @@ -369,9 +383,16 @@ void AppendArray(shared_ptr array_ { auto fixed_bin_array = static_pointer_cast(array_data); for (auto i = 0; i < fixed_bin_array->length(); ++i) { - auto bin_data = fixed_bin_array->GetString(i); - K k_bin = ktn(KG, bin_data.length()); - memcpy(kG(k_bin), bin_data.data(), bin_data.length()); + K k_bin = nullptr; + if( type_overrides.null_mapping.have_fixed_binary && fixed_bin_array->IsNull( i ) ){ + k_bin 
= ktn( KG, type_overrides.null_mapping.fixed_binary_null.length() ); + memcpy( kG( k_bin ), type_overrides.null_mapping.fixed_binary_null.data(), type_overrides.null_mapping.fixed_binary_null.length() ); + } + else{ + auto bin_data = fixed_bin_array->GetString(i); + k_bin = ktn(KG, bin_data.length()); + memcpy(kG(k_bin), bin_data.data(), bin_data.length()); + } kK(k_array)[index++] = k_bin; } } From f84153b47a0ac2a21f642e3f49857a5513b95685 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 1 Feb 2023 16:34:45 +0300 Subject: [PATCH 045/276] Temporals expansion --- src/ArrayReader.cpp | 61 +++++++++++++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 16 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 35f3021..d037b0e 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -132,7 +132,7 @@ void AppendArray(shared_ptr array_data, K k_arr // BooleanArray doesn't have a bulk reader since arrow BooleanType is only 1 bit for (auto i = 0; i < bool_array->length(); ++i){ kG(k_array)[index++] = // preventing branch prediction failures - ( ( type_overrides.null_mapping.have_boolean && bool_array->IsNull( i ) ) * type_overrides.null_mapping.float16_null ) + ( ( type_overrides.null_mapping.have_boolean && bool_array->IsNull( i ) ) * type_overrides.null_mapping.boolean_null ) + ( !( type_overrides.null_mapping.have_boolean && bool_array->IsNull( i ) ) * bool_array->Value( i ) ); } } @@ -402,8 +402,11 @@ void AppendArray(shared_ptr array_data, K k_a { TemporalConversion tc(array_data->type()); auto d32_array = static_pointer_cast(array_data); - for (auto i = 0; i < d32_array->length(); ++i) - kI(k_array)[index++] = tc.ArrowToKdb(d32_array->Value(i)); + for (auto i = 0; i < d32_array->length(); ++i){ + kI( k_array )[index++] = + ( ( type_overrides.null_mapping.have_date32 && d32_array->IsNull( i ) ) * type_overrides.null_mapping.date32_null ) + + ( !( type_overrides.null_mapping.have_date32 && d32_array->IsNull( i ) ) * 
tc.ArrowToKdb( d32_array->Value( i ) ) ); + } } template<> @@ -411,8 +414,11 @@ void AppendArray(shared_ptr array_data, K k_a { TemporalConversion tc(array_data->type()); auto d64_array = static_pointer_cast(array_data); - for (auto i = 0; i < d64_array->length(); ++i) - kJ(k_array)[index++] = tc.ArrowToKdb(d64_array->Value(i)); + for (auto i = 0; i < d64_array->length(); ++i){ + kJ( k_array )[index++] = + ( ( type_overrides.null_mapping.have_date64 && d64_array->IsNull( i ) ) * type_overrides.null_mapping.date64_null ) + + ( !( type_overrides.null_mapping.have_date64 && d64_array->IsNull( i ) ) * tc.ArrowToKdb( d64_array->Value( i ) ) ); + } } template<> @@ -421,8 +427,11 @@ void AppendArray(shared_ptr array_data, K TemporalConversion tc(array_data->type()); auto ts_array = static_pointer_cast(array_data); auto timestamp_type = static_pointer_cast(ts_array->type()); - for (auto i = 0; i < ts_array->length(); ++i) - kJ(k_array)[index++] = tc.ArrowToKdb(ts_array->Value(i)); + for (auto i = 0; i < ts_array->length(); ++i){ + kJ( k_array )[index++] = + ( ( type_overrides.null_mapping.have_timestamp && ts_array->IsNull( i ) ) * type_overrides.null_mapping.timestamp_null ) + + ( !( type_overrides.null_mapping.have_timestamp && ts_array->IsNull( i ) ) * tc.ArrowToKdb( ts_array->Value( i ) ) ); + } } template<> @@ -431,8 +440,11 @@ void AppendArray(shared_ptr array_data, K k_a TemporalConversion tc(array_data->type()); auto t32_array = static_pointer_cast(array_data); auto time32_type = static_pointer_cast(t32_array->type()); - for (auto i = 0; i < t32_array->length(); ++i) - kI(k_array)[index++] = tc.ArrowToKdb(t32_array->Value(i)); + for (auto i = 0; i < t32_array->length(); ++i){ + kI( k_array )[index++] = + ( ( type_overrides.null_mapping.have_time32 && t32_array->IsNull( i ) ) * type_overrides.null_mapping.time32_null ) + + ( !( type_overrides.null_mapping.have_time32 && t32_array->IsNull( i ) ) * tc.ArrowToKdb( t32_array->Value( i ) ) ); + } } template<> @@ -441,8 
+453,11 @@ void AppendArray(shared_ptr array_data, K k_a TemporalConversion tc(array_data->type()); auto t64_array = static_pointer_cast(array_data); auto time64_type = static_pointer_cast(t64_array->type()); - for (auto i = 0; i < t64_array->length(); ++i) - kJ(k_array)[index++] = tc.ArrowToKdb(t64_array->Value(i)); + for (auto i = 0; i < t64_array->length(); ++i){ + kJ( k_array )[index++] = + ( ( type_overrides.null_mapping.have_time64 && t64_array->IsNull( i ) ) * type_overrides.null_mapping.time64_null ) + + ( !( type_overrides.null_mapping.have_time64 && t64_array->IsNull( i ) ) * tc.ArrowToKdb( t64_array->Value( i ) ) ); + } } template<> @@ -474,23 +489,37 @@ void AppendArray(shared_ptr array_data, K k TemporalConversion tc(array_data->type()); auto dur_array = static_pointer_cast(array_data); auto duration_type = static_pointer_cast(dur_array->type()); - for (auto i = 0; i < dur_array->length(); ++i) - kJ(k_array)[index++] = tc.ArrowToKdb(dur_array->Value(i)); + for (auto i = 0; i < dur_array->length(); ++i){ + kJ( k_array )[index++] = + ( ( type_overrides.null_mapping.have_duration && dur_array->IsNull( i ) ) * type_overrides.null_mapping.duration_null ) + + ( !( type_overrides.null_mapping.have_duration && dur_array->IsNull( i ) ) * tc.ArrowToKdb( dur_array->Value( i ) ) ); + } } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto month_array = static_pointer_cast(array_data); - memcpy(kI(k_array), month_array->raw_values(), month_array->length() * sizeof(arrow::MonthIntervalArray::value_type)); + if( type_overrides.null_mapping.have_month_interval ){ + for( auto i = 0ll; i < month_array->length(); ++i ){ + kI( k_array )[i] = ( month_array->IsNull( i ) * type_overrides.null_mapping.month_interval_null ) + + ( !month_array->IsNull( i ) * month_array->Value( i ) ); + } + } + else { + memcpy(kI(k_array), month_array->raw_values(), month_array->length() * 
sizeof(arrow::MonthIntervalArray::value_type)); + } } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto dt_array = static_pointer_cast(array_data); - for (auto i = 0; i < dt_array->length(); ++i) - kJ(k_array)[index++] = DayTimeInterval_KTimespan(dt_array->Value(i)); + for (auto i = 0; i < dt_array->length(); ++i){ + kJ( k_array )[index++] = + ( ( type_overrides.null_mapping.have_day_time_interval && dt_array->IsNull( i ) ) * type_overrides.null_mapping.day_time_interval_null ) + + ( !( type_overrides.null_mapping.have_day_time_interval && dt_array->IsNull( i ) ) * DayTimeInterval_KTimespan( dt_array->Value( i ) ) ); + } } template<> From ce0424a6979af52b208aa631592cd7ad9bec6494 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 1 Feb 2023 16:53:22 +0300 Subject: [PATCH 046/276] Counting nulls for bulk copying --- src/ArrayReader.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index d037b0e..e748f14 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -141,7 +141,7 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint8_array = static_pointer_cast(array_data); - if( type_overrides.null_mapping.have_uint8 ){ + if( type_overrides.null_mapping.have_uint8 && uint8_array->null_count() ){ for( auto i = 0ll; i < uint8_array->length(); ++i ){ kG( k_array )[i] = ( uint8_array->IsNull( i ) * type_overrides.null_mapping.uint8_null ) + ( !uint8_array->IsNull( i ) * uint8_array->Value( i ) ); @@ -156,7 +156,7 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto int8_array = static_pointer_cast(array_data); - if( type_overrides.null_mapping.have_int8 ){ + if( type_overrides.null_mapping.have_int8 && int8_array->null_count() ){ for( auto i = 0ll; i < 
int8_array->length(); ++i ){ kG( k_array )[i] = ( int8_array->IsNull( i ) * type_overrides.null_mapping.int8_null ) + ( !int8_array->IsNull( i ) * int8_array->Value( i ) ); @@ -171,7 +171,7 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint16_array = static_pointer_cast(array_data); - if( type_overrides.null_mapping.have_uint16 ){ + if( type_overrides.null_mapping.have_uint16 && uint16_array->null_count() ){ for( auto i = 0ll; i < uint16_array->length(); ++i ){ kH( k_array )[i] = ( uint16_array->IsNull( i ) * type_overrides.null_mapping.uint16_null ) + ( !uint16_array->IsNull( i ) * uint16_array->Value( i ) ); @@ -186,7 +186,7 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto int16_array = static_pointer_cast(array_data); - if( type_overrides.null_mapping.have_int16 ){ + if( type_overrides.null_mapping.have_int16 && int16_array->null_count() ){ for( auto i = 0ll; i < int16_array->length(); ++i ){ kH( k_array )[i] = ( int16_array->IsNull( i ) * type_overrides.null_mapping.int16_null ) + ( !int16_array->IsNull( i ) * int16_array->Value( i ) ); @@ -201,7 +201,7 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint32_array = static_pointer_cast(array_data); - if( type_overrides.null_mapping.have_uint32 ){ + if( type_overrides.null_mapping.have_uint32 && uint32_array->null_count() ){ for( auto i = 0ll; i < uint32_array->length(); ++i ){ kI( k_array )[i] = ( uint32_array->IsNull( i ) * type_overrides.null_mapping.uint32_null ) + ( !uint32_array->IsNull( i ) * uint32_array->Value( i ) ); @@ -216,7 +216,7 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto int32_array = static_pointer_cast(array_data); - if( type_overrides.null_mapping.have_int32 ){ + if( 
type_overrides.null_mapping.have_int32 && int32_array->null_count() ){ for( auto i = 0ll; i < int32_array->length(); ++i ){ kI( k_array )[i] = ( int32_array->IsNull( i ) * type_overrides.null_mapping.int32_null ) + (!int32_array->IsNull( i ) * int32_array->Value( i ) ); @@ -231,7 +231,7 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint64_array = static_pointer_cast(array_data); - if( type_overrides.null_mapping.have_uint64 ){ + if( type_overrides.null_mapping.have_uint64 && uint64_array->null_count() ){ for( auto i = 0ll; i < uint64_array->length(); ++i ){ kJ( k_array )[i] = ( uint64_array->IsNull( i ) * type_overrides.null_mapping.uint64_null ) + ( !uint64_array->IsNull( i ) * uint64_array->Value( i ) ); @@ -246,7 +246,7 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto int64_array = static_pointer_cast(array_data); - if( type_overrides.null_mapping.have_int32 ){ + if( type_overrides.null_mapping.have_int32 && int64_array->null_count() ){ for( auto i = 0ll; i < int64_array->length(); ++i ){ kJ( k_array )[i] = ( int64_array->IsNull( i ) * type_overrides.null_mapping.int64_null ) + (!int64_array->IsNull( i ) * int64_array->Value( i ) ); @@ -261,7 +261,7 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto hfl_array = static_pointer_cast(array_data); - if( type_overrides.null_mapping.have_float16 ){ + if( type_overrides.null_mapping.have_float16 && hfl_array->null_count() ){ for( auto i = 0ll; i < hfl_array->length(); ++i ){ kH( k_array )[i] = ( hfl_array->IsNull( i ) * type_overrides.null_mapping.float16_null ) + ( !hfl_array->IsNull( i ) * hfl_array->Value( i ) ); @@ -276,7 +276,7 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto fl_array = static_pointer_cast(array_data); 
- if( type_overrides.null_mapping.have_float32 ){ + if( type_overrides.null_mapping.have_float32 && fl_array->null_count() ){ for( auto i = 0ll; i < fl_array->length(); ++i ){ kE( k_array )[i] = ( fl_array->IsNull( i ) * type_overrides.null_mapping.float32_null ) + ( !fl_array->IsNull( i ) * fl_array->Value( i ) ); @@ -291,7 +291,7 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto dbl_array = static_pointer_cast(array_data); - if( type_overrides.null_mapping.have_float64 ){ + if( type_overrides.null_mapping.have_float64 && dbl_array->null_count() ){ for( auto i = 0ll; i < dbl_array->length(); ++i ){ kF( k_array )[i] = ( dbl_array->IsNull( i ) * type_overrides.null_mapping.float64_null ) + ( !dbl_array->IsNull( i ) * dbl_array->Value( i ) ); From 34fb1454083ba0d930ff63349ca7c9f4d9b88d94 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 7 Feb 2023 10:32:35 +0300 Subject: [PATCH 047/276] Clean up option getters --- src/ArrayReader.cpp | 4 ++-- src/ArrayWriter.cpp | 14 -------------- src/HelperFunctions.h | 18 ++++++++++++++++++ src/KdbOptions.h | 32 -------------------------------- 4 files changed, 20 insertions(+), 48 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index e748f14..30e34b4 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -246,7 +246,7 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto int64_array = static_pointer_cast(array_data); - if( type_overrides.null_mapping.have_int32 && int64_array->null_count() ){ + if( type_overrides.null_mapping.have_int64 && int64_array->null_count() ){ for( auto i = 0ll; i < int64_array->length(); ++i ){ kJ( k_array )[i] = ( int64_array->IsNull( i ) * type_overrides.null_mapping.int64_null ) + (!int64_array->IsNull( i ) * int64_array->Value( i ) ); @@ -500,7 +500,7 @@ template<> void AppendArray(shared_ptr array_data, K k_array, 
size_t& index, TypeMappingOverride& type_overrides) { auto month_array = static_pointer_cast(array_data); - if( type_overrides.null_mapping.have_month_interval ){ + if( type_overrides.null_mapping.have_month_interval && month_array->null_count() ){ for( auto i = 0ll; i < month_array->length(); ++i ){ kI( k_array )[i] = ( month_array->IsNull( i ) * type_overrides.null_mapping.month_interval_null ) + ( !month_array->IsNull( i ) * month_array->Value( i ) ); diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index a084ffd..e29a197 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -1,9 +1,7 @@ #include -#include #include #include #include -#include #include #include @@ -22,18 +20,6 @@ using namespace kx::arrowkdb; namespace { -//! Compares floating point numbers, because of unreliable direct compare -//! @param lhs - left-hand side value -//! @param rhs - right-hand side value -//! @return true if values are nearby -template -bool is_equal( T lhs, T rhs ) -{ - static const T epsilon = 2 * std::numeric_limits::epsilon(); - - return ::fabs( lhs -= rhs ) <= epsilon; -} - shared_ptr GetBuilder(shared_ptr datatype); template diff --git a/src/HelperFunctions.h b/src/HelperFunctions.h index 201707a..f48f8f1 100644 --- a/src/HelperFunctions.h +++ b/src/HelperFunctions.h @@ -1,6 +1,8 @@ #ifndef __HELPER_FUNCTIONS_H__ #define __HELPER_FUNCTIONS_H__ +#include +#include #include #include @@ -71,6 +73,22 @@ bool IsKdbString(K str); const std::string GetKdbString(K str); +//////////////////// +// FLOATS COMPARE // +//////////////////// + +//! Compares floating point numbers, because of unreliable direct compare +//! @param lhs - left-hand side value +//! @param rhs - right-hand side value +//! 
@return true if values are nearby +template +inline bool is_equal( T lhs, T rhs ) +{ + static const T epsilon = 2 * std::numeric_limits::epsilon(); + + return ::fabs( lhs -= rhs ) <= epsilon; +} + ////////////////// // TYPE MAPPING // ////////////////// diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 01b4afd..d85989d 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -180,37 +180,8 @@ namespace Options int64_t duration_null; int32_t month_interval_null; int64_t day_time_interval_null; - - template - inline auto GetOption() const; }; - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_boolean, boolean_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_uint8, uint8_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_int8, int8_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_uint16, uint16_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_int16, int16_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_uint32, uint32_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_int32, int32_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_uint64, uint64_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_int64, int64_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_float16, float16_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_float32, float32_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_float64, float64_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_string, string_null ); } - template<> inline 
auto NullMapping::GetOption() const{ return std::make_pair( have_large_string, large_string_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_binary, binary_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_large_binary, large_binary_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_fixed_binary, fixed_binary_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_date32, date32_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_date64, date64_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_timestamp, timestamp_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_time32, time32_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_time64, time64_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_decimal, decimal_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_duration, duration_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_month_interval, month_interval_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_day_time_interval, day_time_interval_null ); } } // namespace Options template<> @@ -437,9 +408,6 @@ class KdbOptions template inline void HandleNullMapping( const std::string& key, K value ); - template - auto GetNullMappingOption() const { return null_mapping_options.GetOption(); } - void GetNullMappingOptions( Options::NullMapping& null_mapping ) const{ null_mapping = null_mapping_options; } From 78d9a8810f8fc244adf7387ef8f25f8145b2c149 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 7 Feb 2023 17:59:41 +0300 Subject: [PATCH 
048/276] Clean up option setters --- src/KdbOptions.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/KdbOptions.h b/src/KdbOptions.h index d85989d..6a8fc84 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -659,7 +659,7 @@ inline void KdbOptions::HandleNullMapping( const template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( value->t == -KI || value->t == -KD ){ + if( value->t == -KD ){ null_mapping_options.date32_null = value->i; null_mapping_options.have_date32 = true; } @@ -671,7 +671,7 @@ inline void KdbOptions::HandleNullMapping( const std::strin template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( value->t == -KJ || value->t == -KP ){ + if( value->t == -KP ){ null_mapping_options.date64_null = value->j; null_mapping_options.have_date64 = true; } @@ -683,7 +683,7 @@ inline void KdbOptions::HandleNullMapping( const std::strin template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( value->t == -KJ || value->t == -KP ){ + if( value->t == -KP ){ null_mapping_options.timestamp_null = value->j; null_mapping_options.have_timestamp = true; } @@ -695,7 +695,7 @@ inline void KdbOptions::HandleNullMapping( const std::st template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( value->t == -KI || value->t == -KT ){ + if( value->t == -KT ){ null_mapping_options.time32_null = value->i; null_mapping_options.have_time32 = true; } @@ -707,7 +707,7 @@ inline void KdbOptions::HandleNullMapping( const std::strin template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( value->t == -KJ || value->t == -KN ){ + if( value->t == -KN ){ null_mapping_options.time64_null = value->j; null_mapping_options.have_time64 = true; } @@ -731,7 +731,7 @@ inline void KdbOptions::HandleNullMapping( const std::stri template<> inline void 
KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( value->t == -KJ || value->t == -KN ){ + if( value->t == -KN ){ null_mapping_options.duration_null = value->j; null_mapping_options.have_duration = true; } @@ -743,7 +743,7 @@ inline void KdbOptions::HandleNullMapping( const std::str template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( value->t == -KI || value->t == -KM ){ + if( value->t == -KM ){ null_mapping_options.month_interval_null = value->i; null_mapping_options.have_month_interval = true; } @@ -755,7 +755,7 @@ inline void KdbOptions::HandleNullMapping( const s template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( value->t == -KJ || value->t == -KN ){ + if( value->t == -KN ){ null_mapping_options.day_time_interval_null = value->j; null_mapping_options.have_day_time_interval = true; } From 0c4f1c73312ae6427e30c43d9c5dc4c51e58023d Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 27 Jan 2023 18:18:45 +0300 Subject: [PATCH 049/276] Reading parquet data back and compare https://github.com/KxSystems/arrowkdb/pull/7 --- examples/null_mapping.q | 66 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 61 insertions(+), 5 deletions(-) diff --git a/examples/null_mapping.q b/examples/null_mapping.q index 55b3a37..dc43651 100644 --- a/examples/null_mapping.q +++ b/examples/null_mapping.q @@ -184,9 +184,65 @@ options[`DECIMAL128_AS_DOUBLE]:1 .arrowkdb.tb.prettyPrintTable[time_schema;time_data;options] .arrowkdb.tb.prettyPrintTable[other_schema;other_data;options] +//-------------------------// +// Example-1. 
Parquet file // +//-------------------------// + +// Write the schema and array data to a parquet file options[`PARQUET_VERSION]:`V2.0 -.arrowkdb.pq.writeParquet["null_mapping_short.parquet";short_schema;short_data;options] -.arrowkdb.pq.writeParquet["null_mapping_long.parquet";long_schema;long_data;options] -.arrowkdb.pq.writeParquet["null_mapping_float.parquet";float_schema;float_data;options] -.arrowkdb.pq.writeParquet["null_mapping_str.parquet";str_schema;str_data;options] -.arrowkdb.pq.writeParquet["null_mapping_time.parquet";time_schema;time_data;options] + +filename_short:"null_mapping_short.parquet" +filename_long:"null_mapping_long.parquet" +filename_float:"null_mapping_float.parquet" +filename_str:"null_mapping_str.parquet" +filename_time:"null_mapping_time.parquet" + +.arrowkdb.pq.writeParquet[filename_short;short_schema;short_data;options] +.arrowkdb.pq.writeParquet[filename_long;long_schema;long_data;options] +.arrowkdb.pq.writeParquet[filename_float;float_schema;float_data;options] +.arrowkdb.pq.writeParquet[filename_str;str_schema;str_data;options] +.arrowkdb.pq.writeParquet[filename_time;time_schema;time_data;options] + +show ls filename_short +show ls filename_long +show ls filename_float +show ls filename_str +show ls filename_time + +// Read the schema back and compare +new_short_schema:.arrowkdb.pq.readParquetSchema[filename_short]; +new_long_schema:.arrowkdb.pq.readParquetSchema[filename_long]; +new_float_schema:.arrowkdb.pq.readParquetSchema[filename_float]; +new_str_schema:.arrowkdb.pq.readParquetSchema[filename_str]; +new_time_schema:.arrowkdb.pq.readParquetSchema[filename_time]; + +show .arrowkdb.sc.equalSchemas[short_schema;new_short_schema] +show .arrowkdb.sc.equalSchemas[long_schema;new_long_schema] +show .arrowkdb.sc.equalSchemas[float_schema;new_float_schema] +show .arrowkdb.sc.equalSchemas[str_schema;new_str_schema] +show .arrowkdb.sc.equalSchemas[time_schema;new_time_schema] + +show short_schema~new_short_schema +show 
long_schema~new_long_schema +show float_schema~new_float_schema +show str_schema~new_str_schema +show time_schema~new_time_schema + +// Read the array data back and compare +new_short_data:.arrowkdb.pq.readParquetData[filename_short;::]; +new_long_data:.arrowkdb.pq.readParquetData[filename_long;::]; +new_float_data:.arrowkdb.pq.readParquetData[filename_float;::]; +new_str_data:.arrowkdb.pq.readParquetData[filename_str;::]; +new_time_data:.arrowkdb.pq.readParquetData[filename_time;::]; + +show short_data~new_short_data +show long_data~new_long_data +show float_data~new_float_data +show str_data~new_str_data +show time_data~new_time_data + +rm filename_short; +rm filename_long; +rm filename_float; +rm filename_str; +rm filename_time; From 9561916619008b89c22100fe4a259312bb746e2e Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 27 Jan 2023 19:21:25 +0300 Subject: [PATCH 050/276] Reloading Arrow IPC file --- examples/null_mapping.q | 100 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 92 insertions(+), 8 deletions(-) diff --git a/examples/null_mapping.q b/examples/null_mapping.q index dc43651..e8929de 100644 --- a/examples/null_mapping.q +++ b/examples/null_mapping.q @@ -1,3 +1,15 @@ +// null_mapping.q +// Examples of creating a schema supporting null mapping and using it to read/write parquet and arrow tables + +-1"\n+----------|| null_mapping.q ||----------+\n"; + +// import the arrowkdb library +\l arrowkdb.q + +// Filesystem functions for Linux/MacOS/Windows +ls:{[filename] $[.z.o like "w*";system "dir /b ",filename;system "ls ",filename]}; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + ///////////////////////// // CONSTRUCTED SCHEMAS // ///////////////////////// @@ -70,9 +82,9 @@ f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; dec_fd:.arrowkdb.fd.field[`decimal;dec_dt]; str_fd:.arrowkdb.fd.field[`string;str_dt]; -lstr_fd:.arrowkdb.fd.field[`long_string;lstr_dt]; 
+lstr_fd:.arrowkdb.fd.field[`large_string;lstr_dt]; bin_fd:.arrowkdb.fd.field[`binary;bin_dt]; -lbin_fd:.arrowkdb.fd.field[`long_binary;lbin_dt]; +lbin_fd:.arrowkdb.fd.field[`large_binary;lbin_dt]; fbin_fd:.arrowkdb.fd.field[`fixed_binary;fbin_dt]; d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; @@ -189,7 +201,7 @@ options[`DECIMAL128_AS_DOUBLE]:1 //-------------------------// // Write the schema and array data to a parquet file -options[`PARQUET_VERSION]:`V2.0 +options[`PARQUET_VERSION]:`V2.LATEST filename_short:"null_mapping_short.parquet" filename_long:"null_mapping_long.parquet" @@ -235,14 +247,86 @@ new_float_data:.arrowkdb.pq.readParquetData[filename_float;::]; new_str_data:.arrowkdb.pq.readParquetData[filename_str;::]; new_time_data:.arrowkdb.pq.readParquetData[filename_time;::]; -show short_data~new_short_data -show long_data~new_long_data -show float_data~new_float_data -show str_data~new_str_data -show time_data~new_time_data +//TODO: enable data comparison when reload mapping is ready +//show short_data~new_short_data +//show long_data~new_long_data +//show float_data~new_float_data +//show str_data~new_str_data +//show time_data~new_time_data + +rm filename_short; +rm filename_long; +rm filename_float; +rm filename_str; +rm filename_time; + +//---------------------------// +// Example-2. 
Arrow IPC file // +//---------------------------// + +// Write the schema and array data to an arrow file +filename_short:"null_mapping_short.arrow"; +filename_long:"null_mapping_long.arrow"; +filename_float:"null_mapping_float.arrow"; +filename_str:"null_mapping_str.arrow"; +filename_time:"null_mapping_time.arrow"; +filename_other:"null_mapping_other.arrow"; + +.arrowkdb.ipc.writeArrow[filename_short;short_schema;short_data;::]; +.arrowkdb.ipc.writeArrow[filename_long;long_schema;long_data;::]; +.arrowkdb.ipc.writeArrow[filename_float;float_schema;float_data;::]; +.arrowkdb.ipc.writeArrow[filename_str;str_schema;str_data;::]; +.arrowkdb.ipc.writeArrow[filename_time;time_schema;time_data;::]; +.arrowkdb.ipc.writeArrow[filename_other;other_schema;other_data;::]; + +show ls filename_short +show ls filename_long +show ls filename_float +show ls filename_str +show ls filename_time +show ls filename_other + +// Read the schema back and compare +new_short_schema:.arrowkdb.ipc.readArrowSchema[filename_short]; +new_long_schema:.arrowkdb.ipc.readArrowSchema[filename_long]; +new_float_schema:.arrowkdb.ipc.readArrowSchema[filename_float]; +new_str_schema:.arrowkdb.ipc.readArrowSchema[filename_str]; +new_time_schema:.arrowkdb.ipc.readArrowSchema[filename_time]; +new_other_schema:.arrowkdb.ipc.readArrowSchema[filename_other]; + +show .arrowkdb.sc.equalSchemas[short_schema;new_short_schema] +show .arrowkdb.sc.equalSchemas[long_schema;new_long_schema] +show .arrowkdb.sc.equalSchemas[float_schema;new_float_schema] +show .arrowkdb.sc.equalSchemas[str_schema;new_str_schema] +show .arrowkdb.sc.equalSchemas[time_schema;new_time_schema] +show .arrowkdb.sc.equalSchemas[other_schema;new_other_schema] + +show short_schema~new_short_schema +show long_schema~new_long_schema +show float_schema~new_float_schema +show str_schema~new_str_schema +show time_schema~new_time_schema +show other_schema~new_other_schema + +// Read the array data back and compare 
+new_short_data:.arrowkdb.ipc.readArrowData[filename;::]; +new_long_data:.arrowkdb.ipc.readArrowData[filename;::]; +new_float_data:.arrowkdb.ipc.readArrowData[filename;::]; +new_str_data:.arrowkdb.ipc.readArrowData[filename;::]; +new_time_data:.arrowkdb.ipc.readArrowData[filename;::]; +new_other_data:.arrowkdb.ipc.readArrowData[filename;::]; + +//TODO: enable data comparison when reload mapping is ready +//show short_data~new_short_data +//show long_data~new_long_data +//show float_data~new_float_data +//show str_data~new_str_data +//show time_data~new_time_data +//show other_data~new_other_data rm filename_short; rm filename_long; rm filename_float; rm filename_str; rm filename_time; +rm filename_other; From d881063329226dbb117310d74140acf97f14014a Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 30 Jan 2023 17:30:53 +0300 Subject: [PATCH 051/276] Example-2 Arrow IPC file is added --- examples/null_mapping.q | 244 ++++++++++++++++++++-------------------- 1 file changed, 121 insertions(+), 123 deletions(-) diff --git a/examples/null_mapping.q b/examples/null_mapping.q index e8929de..93d12de 100644 --- a/examples/null_mapping.q +++ b/examples/null_mapping.q @@ -184,149 +184,147 @@ short_data:(ts_data;bool_data;ui8_data;i8_data;ui16_data;i16_data); long_data:(ts_data;ui32_data;i32_data;ui64_data;i64_data); float_data:(ts_data;f32_data;f64_data;dec_data); str_data:(ts_data;str_data;lstr_data;bin_data;lbin_data;fbin_data); -time_data:(ts_data;d32_data;d64_data;tstamp_data;t64_data;dur_data) -other_data:(ts_data;f16_data;t32_data;mint_data;dtint_data) +time_data:(ts_data;d32_data;d64_data;tstamp_data;t64_data;dur_data); +other_data:(ts_data;f16_data;t32_data;mint_data;dtint_data); // Pretty print the Arrow table populated from the array data options[`DECIMAL128_AS_DOUBLE]:1 -.arrowkdb.tb.prettyPrintTable[short_schema;short_data;options] -.arrowkdb.tb.prettyPrintTable[long_schema;long_data;options] 
-.arrowkdb.tb.prettyPrintTable[float_schema;float_data;options] -.arrowkdb.tb.prettyPrintTable[str_schema;str_data;options] -.arrowkdb.tb.prettyPrintTable[time_schema;time_data;options] -.arrowkdb.tb.prettyPrintTable[other_schema;other_data;options] +.arrowkdb.tb.prettyPrintTable[short_schema;short_data;options]; +.arrowkdb.tb.prettyPrintTable[long_schema;long_data;options]; +.arrowkdb.tb.prettyPrintTable[float_schema;float_data;options]; +.arrowkdb.tb.prettyPrintTable[str_schema;str_data;options]; +.arrowkdb.tb.prettyPrintTable[time_schema;time_data;options]; +.arrowkdb.tb.prettyPrintTable[other_schema;other_data;options]; //-------------------------// // Example-1. Parquet file // //-------------------------// // Write the schema and array data to a parquet file -options[`PARQUET_VERSION]:`V2.LATEST - -filename_short:"null_mapping_short.parquet" -filename_long:"null_mapping_long.parquet" -filename_float:"null_mapping_float.parquet" -filename_str:"null_mapping_str.parquet" -filename_time:"null_mapping_time.parquet" - -.arrowkdb.pq.writeParquet[filename_short;short_schema;short_data;options] -.arrowkdb.pq.writeParquet[filename_long;long_schema;long_data;options] -.arrowkdb.pq.writeParquet[filename_float;float_schema;float_data;options] -.arrowkdb.pq.writeParquet[filename_str;str_schema;str_data;options] -.arrowkdb.pq.writeParquet[filename_time;time_schema;time_data;options] - -show ls filename_short -show ls filename_long -show ls filename_float -show ls filename_str -show ls filename_time +options[`PARQUET_VERSION]:`V2.0 + +parquet_short:"null_mapping_short.parquet" +parquet_long:"null_mapping_long.parquet" +parquet_float:"null_mapping_float.parquet" +parquet_str:"null_mapping_str.parquet" +parquet_time:"null_mapping_time.parquet" + +.arrowkdb.pq.writeParquet[parquet_short;short_schema;short_data;options]; +.arrowkdb.pq.writeParquet[parquet_long;long_schema;long_data;options]; +.arrowkdb.pq.writeParquet[parquet_float;float_schema;float_data;options]; 
+.arrowkdb.pq.writeParquet[parquet_str;str_schema;str_data;options]; +.arrowkdb.pq.writeParquet[parquet_time;time_schema;time_data;options]; + +show ls parquet_short +show ls parquet_long +show ls parquet_float +show ls parquet_str +show ls parquet_time // Read the schema back and compare -new_short_schema:.arrowkdb.pq.readParquetSchema[filename_short]; -new_long_schema:.arrowkdb.pq.readParquetSchema[filename_long]; -new_float_schema:.arrowkdb.pq.readParquetSchema[filename_float]; -new_str_schema:.arrowkdb.pq.readParquetSchema[filename_str]; -new_time_schema:.arrowkdb.pq.readParquetSchema[filename_time]; - -show .arrowkdb.sc.equalSchemas[short_schema;new_short_schema] -show .arrowkdb.sc.equalSchemas[long_schema;new_long_schema] -show .arrowkdb.sc.equalSchemas[float_schema;new_float_schema] -show .arrowkdb.sc.equalSchemas[str_schema;new_str_schema] -show .arrowkdb.sc.equalSchemas[time_schema;new_time_schema] - -show short_schema~new_short_schema -show long_schema~new_long_schema -show float_schema~new_float_schema -show str_schema~new_str_schema -show time_schema~new_time_schema +parquet_short_schema:.arrowkdb.pq.readParquetSchema[parquet_short]; +parquet_long_schema:.arrowkdb.pq.readParquetSchema[parquet_long]; +parquet_float_schema:.arrowkdb.pq.readParquetSchema[parquet_float]; +parquet_str_schema:.arrowkdb.pq.readParquetSchema[parquet_str]; +parquet_time_schema:.arrowkdb.pq.readParquetSchema[parquet_time]; + +show .arrowkdb.sc.equalSchemas[short_schema;parquet_short_schema] +show .arrowkdb.sc.equalSchemas[long_schema;parquet_long_schema] +show .arrowkdb.sc.equalSchemas[float_schema;parquet_float_schema] +show .arrowkdb.sc.equalSchemas[str_schema;parquet_str_schema] +show .arrowkdb.sc.equalSchemas[time_schema;parquet_time_schema] + +show short_schema~parquet_short_schema +show long_schema~parquet_long_schema +show float_schema~parquet_float_schema +show str_schema~parquet_str_schema +show time_schema~parquet_time_schema // Read the array data back and compare 
-new_short_data:.arrowkdb.pq.readParquetData[filename_short;::]; -new_long_data:.arrowkdb.pq.readParquetData[filename_long;::]; -new_float_data:.arrowkdb.pq.readParquetData[filename_float;::]; -new_str_data:.arrowkdb.pq.readParquetData[filename_str;::]; -new_time_data:.arrowkdb.pq.readParquetData[filename_time;::]; - -//TODO: enable data comparison when reload mapping is ready -//show short_data~new_short_data -//show long_data~new_long_data -//show float_data~new_float_data -//show str_data~new_str_data -//show time_data~new_time_data - -rm filename_short; -rm filename_long; -rm filename_float; -rm filename_str; -rm filename_time; +parquet_short_data:.arrowkdb.pq.readParquetData[parquet_short;::]; +parquet_long_data:.arrowkdb.pq.readParquetData[parquet_long;::]; +parquet_float_data:.arrowkdb.pq.readParquetData[parquet_float;(``DECIMAL128_AS_DOUBLE)!((::);1)]; +parquet_str_data:.arrowkdb.pq.readParquetData[parquet_str;::]; +parquet_time_data:.arrowkdb.pq.readParquetData[parquet_time;::]; + +show short_data~parquet_short_data +show long_data~parquet_long_data +show float_data~parquet_float_data +show str_data~parquet_str_data +show time_data~parquet_time_data + +rm parquet_short; +rm parquet_long; +rm parquet_float; +rm parquet_str; +rm parquet_time; //---------------------------// // Example-2. 
Arrow IPC file // //---------------------------// // Write the schema and array data to an arrow file -filename_short:"null_mapping_short.arrow"; -filename_long:"null_mapping_long.arrow"; -filename_float:"null_mapping_float.arrow"; -filename_str:"null_mapping_str.arrow"; -filename_time:"null_mapping_time.arrow"; -filename_other:"null_mapping_other.arrow"; - -.arrowkdb.ipc.writeArrow[filename_short;short_schema;short_data;::]; -.arrowkdb.ipc.writeArrow[filename_long;long_schema;long_data;::]; -.arrowkdb.ipc.writeArrow[filename_float;float_schema;float_data;::]; -.arrowkdb.ipc.writeArrow[filename_str;str_schema;str_data;::]; -.arrowkdb.ipc.writeArrow[filename_time;time_schema;time_data;::]; -.arrowkdb.ipc.writeArrow[filename_other;other_schema;other_data;::]; - -show ls filename_short -show ls filename_long -show ls filename_float -show ls filename_str -show ls filename_time -show ls filename_other +arrow_short:"null_mapping_short.arrow"; +arrow_long:"null_mapping_long.arrow"; +arrow_float:"null_mapping_float.arrow"; +arrow_str:"null_mapping_str.arrow"; +arrow_time:"null_mapping_time.arrow"; +arrow_other:"null_mapping_other.arrow"; + +.arrowkdb.ipc.writeArrow[arrow_short;short_schema;short_data;options]; +.arrowkdb.ipc.writeArrow[arrow_long;long_schema;long_data;options]; +.arrowkdb.ipc.writeArrow[arrow_float;float_schema;float_data;options]; +.arrowkdb.ipc.writeArrow[arrow_str;str_schema;str_data;options]; +.arrowkdb.ipc.writeArrow[arrow_time;time_schema;time_data;options]; +.arrowkdb.ipc.writeArrow[arrow_other;other_schema;other_data;options]; + +show ls arrow_short +show ls arrow_long +show ls arrow_float +show ls arrow_str +show ls arrow_time +show ls arrow_other // Read the schema back and compare -new_short_schema:.arrowkdb.ipc.readArrowSchema[filename_short]; -new_long_schema:.arrowkdb.ipc.readArrowSchema[filename_long]; -new_float_schema:.arrowkdb.ipc.readArrowSchema[filename_float]; -new_str_schema:.arrowkdb.ipc.readArrowSchema[filename_str]; 
-new_time_schema:.arrowkdb.ipc.readArrowSchema[filename_time]; -new_other_schema:.arrowkdb.ipc.readArrowSchema[filename_other]; - -show .arrowkdb.sc.equalSchemas[short_schema;new_short_schema] -show .arrowkdb.sc.equalSchemas[long_schema;new_long_schema] -show .arrowkdb.sc.equalSchemas[float_schema;new_float_schema] -show .arrowkdb.sc.equalSchemas[str_schema;new_str_schema] -show .arrowkdb.sc.equalSchemas[time_schema;new_time_schema] -show .arrowkdb.sc.equalSchemas[other_schema;new_other_schema] - -show short_schema~new_short_schema -show long_schema~new_long_schema -show float_schema~new_float_schema -show str_schema~new_str_schema -show time_schema~new_time_schema -show other_schema~new_other_schema +arrow_short_schema:.arrowkdb.ipc.readArrowSchema[arrow_short]; +arrow_long_schema:.arrowkdb.ipc.readArrowSchema[arrow_long]; +arrow_float_schema:.arrowkdb.ipc.readArrowSchema[arrow_float]; +arrow_str_schema:.arrowkdb.ipc.readArrowSchema[arrow_str]; +arrow_time_schema:.arrowkdb.ipc.readArrowSchema[arrow_time]; +arrow_other_schema:.arrowkdb.ipc.readArrowSchema[arrow_other]; + +show .arrowkdb.sc.equalSchemas[short_schema;arrow_short_schema] +show .arrowkdb.sc.equalSchemas[long_schema;arrow_long_schema] +show .arrowkdb.sc.equalSchemas[float_schema;arrow_float_schema] +show .arrowkdb.sc.equalSchemas[str_schema;arrow_str_schema] +show .arrowkdb.sc.equalSchemas[time_schema;arrow_time_schema] +show .arrowkdb.sc.equalSchemas[other_schema;arrow_other_schema] + +show short_schema~arrow_short_schema +show long_schema~arrow_long_schema +show float_schema~arrow_float_schema +show str_schema~arrow_str_schema +show time_schema~arrow_time_schema +show other_schema~arrow_other_schema // Read the array data back and compare -new_short_data:.arrowkdb.ipc.readArrowData[filename;::]; -new_long_data:.arrowkdb.ipc.readArrowData[filename;::]; -new_float_data:.arrowkdb.ipc.readArrowData[filename;::]; -new_str_data:.arrowkdb.ipc.readArrowData[filename;::]; 
-new_time_data:.arrowkdb.ipc.readArrowData[filename;::]; -new_other_data:.arrowkdb.ipc.readArrowData[filename;::]; - -//TODO: enable data comparison when reload mapping is ready -//show short_data~new_short_data -//show long_data~new_long_data -//show float_data~new_float_data -//show str_data~new_str_data -//show time_data~new_time_data -//show other_data~new_other_data - -rm filename_short; -rm filename_long; -rm filename_float; -rm filename_str; -rm filename_time; -rm filename_other; +arrow_short_data:.arrowkdb.ipc.readArrowData[arrow_short;::]; +arrow_long_data:.arrowkdb.ipc.readArrowData[arrow_long;::]; +arrow_float_data:.arrowkdb.ipc.readArrowData[arrow_float;(``DECIMAL128_AS_DOUBLE)!((::);1)]; +arrow_str_data:.arrowkdb.ipc.readArrowData[arrow_str;::]; +arrow_time_data:.arrowkdb.ipc.readArrowData[arrow_time;::]; +arrow_other_data:.arrowkdb.ipc.readArrowData[arrow_other;::]; + +show short_data~arrow_short_data +show long_data~arrow_long_data +show float_data~arrow_float_data +show str_data~arrow_str_data +show time_data~arrow_time_data +show other_data~arrow_other_data + +rm arrow_short; +rm arrow_long; +rm arrow_float; +rm arrow_str; +rm arrow_time; +rm arrow_other; From 201e124adc237275f338f0e7eeb16aa19ba115ab Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 1 Feb 2023 19:31:34 +0300 Subject: [PATCH 052/276] Mapping nulls when reading arrow and parquet data --- examples/null_mapping.q | 46 ++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/examples/null_mapping.q b/examples/null_mapping.q index 93d12de..952a0bf 100644 --- a/examples/null_mapping.q +++ b/examples/null_mapping.q @@ -28,9 +28,9 @@ other_opts:(`float16`time32`month_interval`day_time_interval)!(9h;09:01:02.042;2 options:(``NULL_MAPPING)!((::);short_opts,long_opts,float_opts,str_opts,time_opts,other_opts); +// Create the datatype identifiers ts_dt:.arrowkdb.dt.timestamp[`nano]; -// Create the datatype identifiers 
bool_dt:.arrowkdb.dt.boolean[]; ui8_dt:.arrowkdb.dt.uint8[]; i8_dt:.arrowkdb.dt.int8[]; @@ -107,12 +107,12 @@ time_schema:.arrowkdb.sc.schema[(ts_fd,d32_fd,d64_fd,tstamp_fd,t64_fd,dur_fd)]; other_schema:.arrowkdb.sc.schema[(ts_fd,f16_fd,t32_fd,mint_fd,dtint_fd)]; // Print the schemas -.arrowkdb.sc.printSchema[short_schema] -.arrowkdb.sc.printSchema[long_schema] +.arrowkdb.sc.printSchema[short_schema]; +.arrowkdb.sc.printSchema[long_schema]; .arrowkdb.sc.printSchema[float_schema] -.arrowkdb.sc.printSchema[str_schema] -.arrowkdb.sc.printSchema[time_schema] -.arrowkdb.sc.printSchema[other_schema] +.arrowkdb.sc.printSchema[str_schema]; +.arrowkdb.sc.printSchema[time_schema]; +.arrowkdb.sc.printSchema[other_schema]; //-----------------------// // Create the array data // @@ -145,7 +145,7 @@ f32_data:N?100e; f32_data[0]:1.23e; f64_data:N?100f; f64_data[1]:4.56f; -dec_data:N?(10f); +dec_data:{"F"$.Q.f[2]x} each N?(10f) dec_data[2]:7.89f str_data:N?("start";"stop";"alert";"acknowledge";""); @@ -203,11 +203,11 @@ options[`DECIMAL128_AS_DOUBLE]:1 // Write the schema and array data to a parquet file options[`PARQUET_VERSION]:`V2.0 -parquet_short:"null_mapping_short.parquet" -parquet_long:"null_mapping_long.parquet" -parquet_float:"null_mapping_float.parquet" -parquet_str:"null_mapping_str.parquet" -parquet_time:"null_mapping_time.parquet" +parquet_short:"null_mapping_short.parquet"; +parquet_long:"null_mapping_long.parquet"; +parquet_float:"null_mapping_float.parquet"; +parquet_str:"null_mapping_str.parquet"; +parquet_time:"null_mapping_time.parquet"; .arrowkdb.pq.writeParquet[parquet_short;short_schema;short_data;options]; .arrowkdb.pq.writeParquet[parquet_long;long_schema;long_data;options]; @@ -241,11 +241,11 @@ show str_schema~parquet_str_schema show time_schema~parquet_time_schema // Read the array data back and compare -parquet_short_data:.arrowkdb.pq.readParquetData[parquet_short;::]; -parquet_long_data:.arrowkdb.pq.readParquetData[parquet_long;::]; 
-parquet_float_data:.arrowkdb.pq.readParquetData[parquet_float;(``DECIMAL128_AS_DOUBLE)!((::);1)]; -parquet_str_data:.arrowkdb.pq.readParquetData[parquet_str;::]; -parquet_time_data:.arrowkdb.pq.readParquetData[parquet_time;::]; +parquet_short_data:.arrowkdb.pq.readParquetData[parquet_short;options]; +parquet_long_data:.arrowkdb.pq.readParquetData[parquet_long;options]; +parquet_float_data:.arrowkdb.pq.readParquetData[parquet_float;options]; +parquet_str_data:.arrowkdb.pq.readParquetData[parquet_str;options]; +parquet_time_data:.arrowkdb.pq.readParquetData[parquet_time;options]; show short_data~parquet_short_data show long_data~parquet_long_data @@ -308,12 +308,12 @@ show time_schema~arrow_time_schema show other_schema~arrow_other_schema // Read the array data back and compare -arrow_short_data:.arrowkdb.ipc.readArrowData[arrow_short;::]; -arrow_long_data:.arrowkdb.ipc.readArrowData[arrow_long;::]; -arrow_float_data:.arrowkdb.ipc.readArrowData[arrow_float;(``DECIMAL128_AS_DOUBLE)!((::);1)]; -arrow_str_data:.arrowkdb.ipc.readArrowData[arrow_str;::]; -arrow_time_data:.arrowkdb.ipc.readArrowData[arrow_time;::]; -arrow_other_data:.arrowkdb.ipc.readArrowData[arrow_other;::]; +arrow_short_data:.arrowkdb.ipc.readArrowData[arrow_short;options]; +arrow_long_data:.arrowkdb.ipc.readArrowData[arrow_long;options]; +arrow_float_data:.arrowkdb.ipc.readArrowData[arrow_float;options]; +arrow_str_data:.arrowkdb.ipc.readArrowData[arrow_str;options]; +arrow_time_data:.arrowkdb.ipc.readArrowData[arrow_time;options]; +arrow_other_data:.arrowkdb.ipc.readArrowData[arrow_other;options]; show short_data~arrow_short_data show long_data~arrow_long_data From bfaee2240a37590060efbe0414bddc1ec632fe76 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 2 Feb 2023 18:43:57 +0300 Subject: [PATCH 053/276] Arrow IPC stream example --- examples/null_mapping.q | 58 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/examples/null_mapping.q 
b/examples/null_mapping.q index 952a0bf..cd2fd77 100644 --- a/examples/null_mapping.q +++ b/examples/null_mapping.q @@ -328,3 +328,61 @@ rm arrow_float; rm arrow_str; rm arrow_time; rm arrow_other; + +//-----------------------------// +// Example-3. Arrow IPC stream // +//-----------------------------// + +// Serialize the schema and array data to an arrow stream +serialized_short:.arrowkdb.ipc.serializeArrow[short_schema;short_data;options]; +serialized_long:.arrowkdb.ipc.serializeArrow[long_schema;long_data;options]; +serialized_float:.arrowkdb.ipc.serializeArrow[float_schema;float_data;options]; +serialized_str:.arrowkdb.ipc.serializeArrow[str_schema;str_data;options]; +serialized_time:.arrowkdb.ipc.serializeArrow[time_schema;time_data;options]; +serialized_other:.arrowkdb.ipc.serializeArrow[other_schema;other_data;options]; +show serialized_short +show serialized_long +show serialized_float +show serialized_str +show serialized_time +show serialized_other + +// Parse the schema back abd compare +stream_short_schema:.arrowkdb.ipc.parseArrowSchema[serialized_short]; +stream_long_schema:.arrowkdb.ipc.parseArrowSchema[serialized_long]; +stream_float_schema:.arrowkdb.ipc.parseArrowSchema[serialized_float]; +stream_str_schema:.arrowkdb.ipc.parseArrowSchema[serialized_str]; +stream_time_schema:.arrowkdb.ipc.parseArrowSchema[serialized_time]; +stream_other_schema:.arrowkdb.ipc.parseArrowSchema[serialized_other]; +show .arrowkdb.sc.equalSchemas[short_schema;stream_short_schema] +show .arrowkdb.sc.equalSchemas[long_schema;stream_long_schema] +show .arrowkdb.sc.equalSchemas[float_schema;stream_float_schema] +show .arrowkdb.sc.equalSchemas[str_schema;stream_str_schema] +show .arrowkdb.sc.equalSchemas[time_schema;stream_time_schema] +show .arrowkdb.sc.equalSchemas[other_schema;stream_other_schema] +show short_schema~stream_short_schema +show long_schema~stream_long_schema +show float_schema~stream_float_schema +show str_schema~stream_str_schema +show 
time_schema~stream_time_schema +show other_schema~stream_other_schema + +// Parse the array data back and compare +stream_short_data:.arrowkdb.ipc.parseArrowData[serialized_short;options]; +stream_long_data:.arrowkdb.ipc.parseArrowData[serialized_long;options]; +stream_float_data:.arrowkdb.ipc.parseArrowData[serialized_float;options]; +stream_str_data:.arrowkdb.ipc.parseArrowData[serialized_str;options]; +stream_time_data:.arrowkdb.ipc.parseArrowData[serialized_time;options]; +stream_other_data:.arrowkdb.ipc.parseArrowData[serialized_other;options]; +show short_data~stream_short_data +show long_data~stream_long_data +show float_data~stream_float_data +show str_data~stream_str_data +show time_data~stream_time_data +show other_data~stream_other_data + + +-1 "\n+----------------------------------------+\n"; + +// Process off +exit 0; \ No newline at end of file From 53dcf998406f9fe4a81774e45d6f0c6c45d1451b Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 2 Feb 2023 20:31:56 +0300 Subject: [PATCH 054/276] Arrow only supported extra fields --- examples/null_mapping.q | 96 +++++++++++++++++++++++++++++------------ 1 file changed, 68 insertions(+), 28 deletions(-) diff --git a/examples/null_mapping.q b/examples/null_mapping.q index cd2fd77..627485d 100644 --- a/examples/null_mapping.q +++ b/examples/null_mapping.q @@ -18,15 +18,18 @@ rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; // Create the schema // //-------------------// -// Support null mapping +// Support null mapping in parquet and arrow short_opts:(`bool`uint8`int8`uint16`int16)!(0b;0x01;0x02;3h;4h); long_opts:(`uint32`int32`uint64`int64)!(5i;6i;7;8); float_opts:(`float32`float64`decimal)!(1.23e;4.56;7.89); -str_opts:(`string`large_string`binary`large_binary`fixed_binary)!("start";"stop";"x"$"alert";"x"$"acknowledge";0Ng); 
-time_opts:(`date32`date64`timestamp`time64`duration)!(2006.07.21;2015.01.01D00:00:00.000000000;2011.01.01D00:00:00.000000000;12:00:00.000000000;12:00:00.000000000); -other_opts:(`float16`time32`month_interval`day_time_interval)!(9h;09:01:02.042;2006.07m;12:00:00.000000000); +str_opts:(`string`binary`fixed_binary)!("start";"x"$"alert";0Ng); +time_opts:(`date32`timestamp`time64)!(2006.07.21;2011.01.01D00:00:00.000000000;12:00:00.000000000); -options:(``NULL_MAPPING)!((::);short_opts,long_opts,float_opts,str_opts,time_opts,other_opts); +// Support null mapping only in arrow +extra_opts:(`float16`large_string`large_binary`duration)!(9h;"stop";"x"$"acknowledge";12:00:00.000000000); +other_opts:(`date64`time32`month_interval`day_time_interval)!(2015.01.01D00:00:00.000000000;09:01:02.042;2006.07m;12:00:00.000000000); + +options:(``NULL_MAPPING)!((::);short_opts,long_opts,float_opts,str_opts,time_opts,extra_opts,other_opts); // Create the datatype identifiers ts_dt:.arrowkdb.dt.timestamp[`nano]; @@ -47,18 +50,19 @@ f64_dt:.arrowkdb.dt.float64[]; dec_dt:.arrowkdb.dt.decimal128[38i;2i]; str_dt:.arrowkdb.dt.utf8[]; -lstr_dt:.arrowkdb.dt.large_utf8[]; bin_dt:.arrowkdb.dt.binary[]; -lbin_dt:.arrowkdb.dt.large_binary[]; fbin_dt:.arrowkdb.dt.fixed_size_binary[16i]; d32_dt:.arrowkdb.dt.date32[]; -d64_dt:.arrowkdb.dt.date64[]; tstamp_dt:.arrowkdb.dt.timestamp[`nano]; t64_dt:.arrowkdb.dt.time64[`nano]; -dur_dt:.arrowkdb.dt.duration[`milli]; f16_dt:.arrowkdb.dt.float16[]; +lstr_dt:.arrowkdb.dt.large_utf8[]; +lbin_dt:.arrowkdb.dt.large_binary[]; +dur_dt:.arrowkdb.dt.duration[`milli]; + +d64_dt:.arrowkdb.dt.date64[]; t32_dt:.arrowkdb.dt.time32[`milli]; mint_dt:.arrowkdb.dt.month_interval[]; dtint_dt:.arrowkdb.dt.day_time_interval[]; @@ -82,18 +86,19 @@ f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; dec_fd:.arrowkdb.fd.field[`decimal;dec_dt]; str_fd:.arrowkdb.fd.field[`string;str_dt]; -lstr_fd:.arrowkdb.fd.field[`large_string;lstr_dt]; bin_fd:.arrowkdb.fd.field[`binary;bin_dt]; 
-lbin_fd:.arrowkdb.fd.field[`large_binary;lbin_dt]; fbin_fd:.arrowkdb.fd.field[`fixed_binary;fbin_dt]; d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; -d64_fd:.arrowkdb.fd.field[`date64;d64_dt]; tstamp_fd:.arrowkdb.fd.field[`timestamp;tstamp_dt]; t64_fd:.arrowkdb.fd.field[`time64;t64_dt]; -dur_fd:.arrowkdb.fd.field[`duration;dur_dt]; f16_fd:.arrowkdb.fd.field[`float16;f16_dt]; +lstr_fd:.arrowkdb.fd.field[`large_string;lstr_dt]; +lbin_fd:.arrowkdb.fd.field[`large_binary;lbin_dt]; +dur_fd:.arrowkdb.fd.field[`duration;dur_dt]; + +d64_fd:.arrowkdb.fd.field[`date64;d64_dt]; t32_fd:.arrowkdb.fd.field[`time32;t32_dt]; mint_fd:.arrowkdb.fd.field[`month_interval;mint_dt]; dtint_fd:.arrowkdb.fd.field[`day_time_interval;dtint_dt]; @@ -102,9 +107,11 @@ dtint_fd:.arrowkdb.fd.field[`day_time_interval;dtint_dt]; short_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,ui8_fd,i8_fd,ui16_fd,i16_fd)]; long_schema:.arrowkdb.sc.schema[(ts_fd,ui32_fd,i32_fd,ui64_fd,i64_fd)]; float_schema:.arrowkdb.sc.schema[(ts_fd,f32_fd,f64_fd,dec_fd)]; -str_schema:.arrowkdb.sc.schema[(ts_fd,str_fd,lstr_fd,bin_fd,lbin_fd,fbin_fd)]; -time_schema:.arrowkdb.sc.schema[(ts_fd,d32_fd,d64_fd,tstamp_fd,t64_fd,dur_fd)]; -other_schema:.arrowkdb.sc.schema[(ts_fd,f16_fd,t32_fd,mint_fd,dtint_fd)]; +str_schema:.arrowkdb.sc.schema[(ts_fd,str_fd,bin_fd,fbin_fd)]; +time_schema:.arrowkdb.sc.schema[(ts_fd,d32_fd,tstamp_fd,t64_fd)]; + +extra_schema:.arrowkdb.sc.schema[(ts_fd,f16_fd,lstr_fd,lbin_fd,dur_fd)]; +other_schema:.arrowkdb.sc.schema[(ts_fd,d64_fd,t32_fd,mint_fd,dtint_fd)]; // Print the schemas .arrowkdb.sc.printSchema[short_schema]; @@ -112,6 +119,8 @@ other_schema:.arrowkdb.sc.schema[(ts_fd,f16_fd,t32_fd,mint_fd,dtint_fd)]; .arrowkdb.sc.printSchema[float_schema] .arrowkdb.sc.printSchema[str_schema]; .arrowkdb.sc.printSchema[time_schema]; + +.arrowkdb.sc.printSchema[extra_schema]; .arrowkdb.sc.printSchema[other_schema]; //-----------------------// @@ -150,28 +159,29 @@ dec_data[2]:7.89f 
str_data:N?("start";"stop";"alert";"acknowledge";""); str_data[0]:"start" -lstr_data:N?("start";"stop";"alert";"acknowledge";""); -lstr_data[1]:"stop" bin_data:N?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); bin_data[2]:"x"$"alert" -lbin_data:N?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); -lbin_data[3]:"x"$"acknowledge" fbin_data:N?0Ng; fbin_data[4]:0Ng; d32_data:N?(2006.07.21;2008.07.18;2012.07.16;2014.07.15;2016.07.11); d32_data[0]:2006.07.21; -d64_data:N?(2015.01.01D00:00:00.000000000;2017.01.01D00:00:00.000000000;2018.01.01D00:00:00.000000000;2019.01.01D00:00:00.000000000;2020.01.01D00:00:00.000000000); -d64_data[1]:2015.01.01D00:00:00.000000000; tstamp_data:N?(2015.01.01D00:00:00.000000000;2014.01.01D00:00:00.000000000;2013.01.01D00:00:00.000000000;2012.01.01D00:00:00.000000000;2011.01.01D00:00:00.000000000); tstamp_data[2]:2011.01.01D00:00:00.000000000; t64_data:N?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); t64_data[3]:12:00:00.000000000; -dur_data:N?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); -dur_data[4]:12:00:00.000000000; f16_data:N?100h; f16_data[0]:9h; +lstr_data:N?("start";"stop";"alert";"acknowledge";""); +lstr_data[1]:"stop" +lbin_data:N?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); +lbin_data[3]:"x"$"acknowledge" +dur_data:N?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); +dur_data[4]:12:00:00.000000000; + +d64_data:N?(2015.01.01D00:00:00.000000000;2017.01.01D00:00:00.000000000;2018.01.01D00:00:00.000000000;2019.01.01D00:00:00.000000000;2020.01.01D00:00:00.000000000); +d64_data[1]:2015.01.01D00:00:00.000000000; t32_data:N?(09:01:02.042;08:01:02.042;07:01:02.042;06:01:02.042;05:01:02.042); t32_data[1]:09:01:02.042; mint_data:N?(2006.07m;2006.06m;2006.05m;2006.04m;2006.03m); @@ -183,17 +193,21 @@ dtint_data[3]:12:00:00.000000000; 
short_data:(ts_data;bool_data;ui8_data;i8_data;ui16_data;i16_data); long_data:(ts_data;ui32_data;i32_data;ui64_data;i64_data); float_data:(ts_data;f32_data;f64_data;dec_data); -str_data:(ts_data;str_data;lstr_data;bin_data;lbin_data;fbin_data); -time_data:(ts_data;d32_data;d64_data;tstamp_data;t64_data;dur_data); -other_data:(ts_data;f16_data;t32_data;mint_data;dtint_data); +str_data:(ts_data;str_data;bin_data;fbin_data); +time_data:(ts_data;d32_data;tstamp_data;t64_data); + +extra_data:(ts_data;f16_data;lstr_data;lbin_data;dur_data); +other_data:(ts_data;d64_data;t32_data;mint_data;dtint_data); // Pretty print the Arrow table populated from the array data options[`DECIMAL128_AS_DOUBLE]:1 + .arrowkdb.tb.prettyPrintTable[short_schema;short_data;options]; .arrowkdb.tb.prettyPrintTable[long_schema;long_data;options]; .arrowkdb.tb.prettyPrintTable[float_schema;float_data;options]; .arrowkdb.tb.prettyPrintTable[str_schema;str_data;options]; .arrowkdb.tb.prettyPrintTable[time_schema;time_data;options]; +.arrowkdb.tb.prettyPrintTable[extra_schema;extra_data;options]; .arrowkdb.tb.prettyPrintTable[other_schema;other_data;options]; //-------------------------// @@ -247,6 +261,8 @@ parquet_float_data:.arrowkdb.pq.readParquetData[parquet_float;options]; parquet_str_data:.arrowkdb.pq.readParquetData[parquet_str;options]; parquet_time_data:.arrowkdb.pq.readParquetData[parquet_time;options]; +parquet_str_data[3]:{0x0 sv x} each parquet_str_data[3] // Convert to GUIDs + show short_data~parquet_short_data show long_data~parquet_long_data show float_data~parquet_float_data @@ -269,6 +285,7 @@ arrow_long:"null_mapping_long.arrow"; arrow_float:"null_mapping_float.arrow"; arrow_str:"null_mapping_str.arrow"; arrow_time:"null_mapping_time.arrow"; +arrow_extra:"null_mapping_extra.arrow"; arrow_other:"null_mapping_other.arrow"; .arrowkdb.ipc.writeArrow[arrow_short;short_schema;short_data;options]; @@ -276,6 +293,7 @@ arrow_other:"null_mapping_other.arrow"; 
.arrowkdb.ipc.writeArrow[arrow_float;float_schema;float_data;options]; .arrowkdb.ipc.writeArrow[arrow_str;str_schema;str_data;options]; .arrowkdb.ipc.writeArrow[arrow_time;time_schema;time_data;options]; +.arrowkdb.ipc.writeArrow[arrow_extra;extra_schema;extra_data;options]; .arrowkdb.ipc.writeArrow[arrow_other;other_schema;other_data;options]; show ls arrow_short @@ -283,6 +301,7 @@ show ls arrow_long show ls arrow_float show ls arrow_str show ls arrow_time +show ls arrow_extra show ls arrow_other // Read the schema back and compare @@ -291,6 +310,7 @@ arrow_long_schema:.arrowkdb.ipc.readArrowSchema[arrow_long]; arrow_float_schema:.arrowkdb.ipc.readArrowSchema[arrow_float]; arrow_str_schema:.arrowkdb.ipc.readArrowSchema[arrow_str]; arrow_time_schema:.arrowkdb.ipc.readArrowSchema[arrow_time]; +arrow_extra_schema:.arrowkdb.ipc.readArrowSchema[arrow_extra]; arrow_other_schema:.arrowkdb.ipc.readArrowSchema[arrow_other]; show .arrowkdb.sc.equalSchemas[short_schema;arrow_short_schema] @@ -298,6 +318,7 @@ show .arrowkdb.sc.equalSchemas[long_schema;arrow_long_schema] show .arrowkdb.sc.equalSchemas[float_schema;arrow_float_schema] show .arrowkdb.sc.equalSchemas[str_schema;arrow_str_schema] show .arrowkdb.sc.equalSchemas[time_schema;arrow_time_schema] +show .arrowkdb.sc.equalSchemas[extra_schema;arrow_extra_schema] show .arrowkdb.sc.equalSchemas[other_schema;arrow_other_schema] show short_schema~arrow_short_schema @@ -305,6 +326,7 @@ show long_schema~arrow_long_schema show float_schema~arrow_float_schema show str_schema~arrow_str_schema show time_schema~arrow_time_schema +show extra_schema~arrow_extra_schema show other_schema~arrow_other_schema // Read the array data back and compare @@ -313,13 +335,17 @@ arrow_long_data:.arrowkdb.ipc.readArrowData[arrow_long;options]; arrow_float_data:.arrowkdb.ipc.readArrowData[arrow_float;options]; arrow_str_data:.arrowkdb.ipc.readArrowData[arrow_str;options]; arrow_time_data:.arrowkdb.ipc.readArrowData[arrow_time;options]; 
+arrow_extra_data:.arrowkdb.ipc.readArrowData[arrow_extra;options]; arrow_other_data:.arrowkdb.ipc.readArrowData[arrow_other;options]; +arrow_str_data[3]:{0x0 sv x} each arrow_str_data[3] // Convert to GUIDs + show short_data~arrow_short_data show long_data~arrow_long_data show float_data~arrow_float_data show str_data~arrow_str_data show time_data~arrow_time_data +show extra_data~arrow_extra_data show other_data~arrow_other_data rm arrow_short; @@ -327,6 +353,7 @@ rm arrow_long; rm arrow_float; rm arrow_str; rm arrow_time; +rm arrow_extra; rm arrow_other; //-----------------------------// @@ -339,12 +366,15 @@ serialized_long:.arrowkdb.ipc.serializeArrow[long_schema;long_data;options]; serialized_float:.arrowkdb.ipc.serializeArrow[float_schema;float_data;options]; serialized_str:.arrowkdb.ipc.serializeArrow[str_schema;str_data;options]; serialized_time:.arrowkdb.ipc.serializeArrow[time_schema;time_data;options]; +serialized_extra:.arrowkdb.ipc.serializeArrow[extra_schema;extra_data;options]; serialized_other:.arrowkdb.ipc.serializeArrow[other_schema;other_data;options]; + show serialized_short show serialized_long show serialized_float show serialized_str show serialized_time +show serialized_extra show serialized_other // Parse the schema back abd compare @@ -353,18 +383,23 @@ stream_long_schema:.arrowkdb.ipc.parseArrowSchema[serialized_long]; stream_float_schema:.arrowkdb.ipc.parseArrowSchema[serialized_float]; stream_str_schema:.arrowkdb.ipc.parseArrowSchema[serialized_str]; stream_time_schema:.arrowkdb.ipc.parseArrowSchema[serialized_time]; +stream_extra_schema:.arrowkdb.ipc.parseArrowSchema[serialized_extra]; stream_other_schema:.arrowkdb.ipc.parseArrowSchema[serialized_other]; + show .arrowkdb.sc.equalSchemas[short_schema;stream_short_schema] show .arrowkdb.sc.equalSchemas[long_schema;stream_long_schema] show .arrowkdb.sc.equalSchemas[float_schema;stream_float_schema] show .arrowkdb.sc.equalSchemas[str_schema;stream_str_schema] show 
.arrowkdb.sc.equalSchemas[time_schema;stream_time_schema] +show .arrowkdb.sc.equalSchemas[extra_schema;stream_extra_schema] show .arrowkdb.sc.equalSchemas[other_schema;stream_other_schema] + show short_schema~stream_short_schema show long_schema~stream_long_schema show float_schema~stream_float_schema show str_schema~stream_str_schema show time_schema~stream_time_schema +show extra_schema~stream_extra_schema show other_schema~stream_other_schema // Parse the array data back and compare @@ -373,16 +408,21 @@ stream_long_data:.arrowkdb.ipc.parseArrowData[serialized_long;options]; stream_float_data:.arrowkdb.ipc.parseArrowData[serialized_float;options]; stream_str_data:.arrowkdb.ipc.parseArrowData[serialized_str;options]; stream_time_data:.arrowkdb.ipc.parseArrowData[serialized_time;options]; +stream_extra_data:.arrowkdb.ipc.parseArrowData[serialized_extra;options]; stream_other_data:.arrowkdb.ipc.parseArrowData[serialized_other;options]; + +stream_str_data[3]:{0x0 sv x} each stream_str_data[3] // Convert to GUIDs + show short_data~stream_short_data show long_data~stream_long_data show float_data~stream_float_data show str_data~stream_str_data show time_data~stream_time_data +show extra_data~stream_extra_data show other_data~stream_other_data -1 "\n+----------------------------------------+\n"; // Process off -exit 0; \ No newline at end of file +exit 0; From d59edfcc049f886021f0ab891eee7e3b7b556287 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 8 Feb 2023 15:31:27 +0000 Subject: [PATCH 055/276] Null mapping of short integer fields --- .gitignore | 1 + tests/.gitignore | 1 + tests/null_mapping_short.t | 103 +++++++++++++++++++++++++++++++++++++ 3 files changed, 105 insertions(+) create mode 100644 tests/null_mapping_short.t diff --git a/.gitignore b/.gitignore index 18f4d15..3d6594b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ arrowkdb.code-workspace +.vscode/ build/ diff --git a/tests/.gitignore b/tests/.gitignore index 492b6a4..56b0696 
100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -1 +1,2 @@ test.q +null_mapping_short.q diff --git a/tests/null_mapping_short.t b/tests/null_mapping_short.t new file mode 100644 index 0000000..59dc02d --- /dev/null +++ b/tests/null_mapping_short.t @@ -0,0 +1,103 @@ +// null_mapping_short.t + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping ||----------+\n"; +short_opts:(`bool`uint8`int8`uint16`int16)!(0b;0x01;0x02;3h;4h); + +options:(``NULL_MAPPING)!((::);short_opts); + +N:5 + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +bool_dt:.arrowkdb.dt.boolean[]; +ui8_dt:.arrowkdb.dt.uint8[]; +i8_dt:.arrowkdb.dt.int8[]; +ui16_dt:.arrowkdb.dt.uint16[]; +i16_dt:.arrowkdb.dt.int16[]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +bool_fd:.arrowkdb.fd.field[`bool;bool_dt]; +ui8_fd:.arrowkdb.fd.field[`uint8;ui8_dt]; +i8_fd:.arrowkdb.fd.field[`int8;i8_dt]; +ui16_fd:.arrowkdb.fd.field[`uint16;ui16_dt]; +i16_fd:.arrowkdb.fd.field[`int16;i16_dt]; + +-1"\n+----------|| Create the schemas for the list of fields ||----------+\n"; +short_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,ui8_fd,i8_fd,ui16_fd,i16_fd)]; + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +bool_data:N?(0b;1b); +bool_data[0]:0b; +ui8_data:N?0x64; +ui8_data[1]:0x01; +i8_data:N?0x64; +i8_data[2]:0x02; +ui16_data:N?100h; +ui16_data[3]:3h; +i16_data:N?100h; +i16_data[4]:4h; + +-1"\n+----------|| Combine the data for all columns ||----------+\n"; +short_data:(ts_data;bool_data;ui8_data;i8_data;ui16_data;i16_data); + +-1"\n+----------|| Write the schema and array data to a parquet file 
||----------+\n"; +options[`PARQUET_VERSION]:`V2.0 + +parquet_short:"null_mapping_short.parquet"; +.arrowkdb.pq.writeParquet[parquet_short;short_schema;short_data;options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +parquet_short_schema:.arrowkdb.pq.readParquetSchema[parquet_short]; +.arrowkdb.sc.equalSchemas[short_schema;parquet_short_schema] +short_schema~parquet_short_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +parquet_short_data:.arrowkdb.pq.readParquetData[parquet_short;options]; +short_data~parquet_short_data +rm parquet_short; + +-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; +arrow_short:"null_mapping_short.arrow"; +.arrowkdb.ipc.writeArrow[arrow_short;short_schema;short_data;options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +arrow_short_schema:.arrowkdb.ipc.readArrowSchema[arrow_short]; +.arrowkdb.sc.equalSchemas[short_schema;arrow_short_schema] +short_schema~arrow_short_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +arrow_short_data:.arrowkdb.ipc.readArrowData[arrow_short;options]; +short_data~arrow_short_data +rm arrow_short; + +-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; +serialized_short:.arrowkdb.ipc.serializeArrow[short_schema;short_data;options]; + +-1"\n+----------|| Parse the schema back and compare ||----------+\n"; +stream_short_schema:.arrowkdb.ipc.parseArrowSchema[serialized_short]; +.arrowkdb.sc.equalSchemas[short_schema;stream_short_schema] +short_schema~stream_short_schema + +-1"\n+----------|| Parse the array data back and compare ||----------+\n"; +stream_short_data:.arrowkdb.ipc.parseArrowData[serialized_short;options]; +short_data~stream_short_data + + +-1 "\n+----------|| Test utils ||----------+\n"; + +show util.buildInfo[] +(type util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing 
||----------+\n"; \ No newline at end of file From 0983d89af6d21553d82e0b18c7b986fb9357e8e5 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 8 Feb 2023 16:26:45 +0000 Subject: [PATCH 056/276] Null mapping of long integer fields --- tests/.gitignore | 1 + tests/null_mapping_long.t | 99 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 tests/null_mapping_long.t diff --git a/tests/.gitignore b/tests/.gitignore index 56b0696..6adfd46 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -1,2 +1,3 @@ test.q null_mapping_short.q +null_mapping_long.q diff --git a/tests/null_mapping_long.t b/tests/null_mapping_long.t new file mode 100644 index 0000000..db4c2b5 --- /dev/null +++ b/tests/null_mapping_long.t @@ -0,0 +1,99 @@ +// null_mapping_long.q + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping ||----------+\n"; +long_opts:(`uint32`int32`uint64`int64)!(5i;6i;7;8); + +options:(``NULL_MAPPING)!((::);long_opts); + +N:5 + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +ui32_dt:.arrowkdb.dt.uint32[]; +i32_dt:.arrowkdb.dt.int32[]; +ui64_dt:.arrowkdb.dt.uint64[]; +i64_dt:.arrowkdb.dt.int64[]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +ui32_fd:.arrowkdb.fd.field[`uint32;ui32_dt]; +i32_fd:.arrowkdb.fd.field[`int32;i32_dt]; +ui64_fd:.arrowkdb.fd.field[`uint64;ui64_dt]; +i64_fd:.arrowkdb.fd.field[`int64;i64_dt]; + +-1"\n+----------|| Create the schemas for the list of fields ||----------+\n"; +long_schema:.arrowkdb.sc.schema[(ts_fd,ui32_fd,i32_fd,ui64_fd,i64_fd)]; + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc 
N?0p; + +ui32_data:N?100i; +ui32_data[0]:5i; +i32_data:N?100i; +i32_data[1]:6i; +ui64_data:N?100; +ui64_data[2]:7; +i64_data:N?100; +i64_data[3]:8; + +-1"\n+----------|| Combine the data for all columns ||----------+\n"; +long_data:(ts_data;ui32_data;i32_data;ui64_data;i64_data); + +-1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n"; +options[`PARQUET_VERSION]:`V2.0 + +parquet_long:"null_mapping_long.parquet"; +.arrowkdb.pq.writeParquet[parquet_long;long_schema;long_data;options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +parquet_long_schema:.arrowkdb.pq.readParquetSchema[parquet_long]; +.arrowkdb.sc.equalSchemas[long_schema;parquet_long_schema] +long_schema~parquet_long_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +parquet_long_data:.arrowkdb.pq.readParquetData[parquet_long;options]; +long_data~parquet_long_data +rm parquet_long; + +-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; +arrow_long:"null_mapping_long.arrow"; +.arrowkdb.ipc.writeArrow[arrow_long;long_schema;long_data;options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +arrow_long_schema:.arrowkdb.ipc.readArrowSchema[arrow_long]; +.arrowkdb.sc.equalSchemas[long_schema;arrow_long_schema] +long_schema~arrow_long_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +arrow_long_data:.arrowkdb.ipc.readArrowData[arrow_long;options]; +long_data~arrow_long_data +rm arrow_long; + +-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; +serialized_long:.arrowkdb.ipc.serializeArrow[long_schema;long_data;options]; + +-1"\n+----------|| Parse the schema back abd compare ||----------+\n"; +stream_long_schema:.arrowkdb.ipc.parseArrowSchema[serialized_long]; +.arrowkdb.sc.equalSchemas[long_schema;stream_long_schema] +long_schema~stream_long_schema + +-1"\n+----------|| Parse the 
array data back and compare ||----------+\n"; +stream_long_data:.arrowkdb.ipc.parseArrowData[serialized_long;options]; +long_data~stream_long_data + + +-1 "\n+----------|| Test utils ||----------+\n"; + +show util.buildInfo[] +(type util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; From 1c2a365ea9c048b0e20f1c4fe89da95fb0bc5918 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 8 Feb 2023 16:51:17 +0000 Subject: [PATCH 057/276] Null mapping of float fields --- tests/.gitignore | 1 + tests/null_mapping_float.t | 96 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+) create mode 100644 tests/null_mapping_float.t diff --git a/tests/.gitignore b/tests/.gitignore index 6adfd46..6ffd0d7 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -1,3 +1,4 @@ test.q null_mapping_short.q null_mapping_long.q +null_mapping_float.q diff --git a/tests/null_mapping_float.t b/tests/null_mapping_float.t new file mode 100644 index 0000000..96a2d26 --- /dev/null +++ b/tests/null_mapping_float.t @@ -0,0 +1,96 @@ +// null_mapping_float.q + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping ||----------+\n"; +float_opts:(`float32`float64`decimal)!(1.23e;4.56;7.89); + +options:(``NULL_MAPPING)!((::);float_opts); + +N:5 + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +f32_dt:.arrowkdb.dt.float32[]; +f64_dt:.arrowkdb.dt.float64[]; +dec_dt:.arrowkdb.dt.decimal128[38i;2i]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +f32_fd:.arrowkdb.fd.field[`float32;f32_dt]; +f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; +dec_fd:.arrowkdb.fd.field[`decimal;dec_dt]; + 
+-1"\n+----------|| Create the schemas for the list of fields ||----------+\n"; +float_schema:.arrowkdb.sc.schema[(ts_fd,f32_fd,f64_fd,dec_fd)]; + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +f32_data:N?100e; +f32_data[0]:1.23e; +f64_data:N?100f; +f64_data[1]:4.56f; +dec_data:{"F"$.Q.f[2]x} each N?(10f) +dec_data[2]:7.89f + +-1"\n+----------|| Combine the data for all columns ||----------+\n"; +float_data:(ts_data;f32_data;f64_data;dec_data); + +-1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n"; +options[`DECIMAL128_AS_DOUBLE]:1 +options[`PARQUET_VERSION]:`V2.0 + +parquet_float:"null_mapping_float.parquet"; +.arrowkdb.pq.writeParquet[parquet_float;float_schema;float_data;options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +parquet_float_schema:.arrowkdb.pq.readParquetSchema[parquet_float]; +.arrowkdb.sc.equalSchemas[float_schema;parquet_float_schema] +float_schema~parquet_float_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +parquet_float_data:.arrowkdb.pq.readParquetData[parquet_float;options]; +float_data~parquet_float_data +rm parquet_float; + +-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; +arrow_float:"null_mapping_float.arrow"; +.arrowkdb.ipc.writeArrow[arrow_float;float_schema;float_data;options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +arrow_float_schema:.arrowkdb.ipc.readArrowSchema[arrow_float]; +.arrowkdb.sc.equalSchemas[float_schema;arrow_float_schema] +float_schema~arrow_float_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +arrow_float_data:.arrowkdb.ipc.readArrowData[arrow_float;options]; +float_data~arrow_float_data +rm arrow_float; + +-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; 
+serialized_float:.arrowkdb.ipc.serializeArrow[float_schema;float_data;options]; + +-1"\n+----------|| Parse the schema back abd compare ||----------+\n"; +stream_float_schema:.arrowkdb.ipc.parseArrowSchema[serialized_float]; +.arrowkdb.sc.equalSchemas[float_schema;stream_float_schema] +float_schema~stream_float_schema + +-1"\n+----------|| Parse the array data back and compare ||----------+\n"; +stream_float_data:.arrowkdb.ipc.parseArrowData[serialized_float;options]; +float_data~stream_float_data + + +-1 "\n+----------|| Test utils ||----------+\n"; + +show util.buildInfo[] +(type util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; From c2a0f171cf6b629e7bef2872da8cd73b324f7751 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 8 Feb 2023 17:07:19 +0000 Subject: [PATCH 058/276] Null mapping of string fields --- tests/.gitignore | 1 + tests/null_mapping_str.t | 98 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 tests/null_mapping_str.t diff --git a/tests/.gitignore b/tests/.gitignore index 6ffd0d7..deb3a89 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -2,3 +2,4 @@ test.q null_mapping_short.q null_mapping_long.q null_mapping_float.q +null_mapping_str.q diff --git a/tests/null_mapping_str.t b/tests/null_mapping_str.t new file mode 100644 index 0000000..95bdd95 --- /dev/null +++ b/tests/null_mapping_str.t @@ -0,0 +1,98 @@ +// null_mapping_str.q + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping ||----------+\n"; +str_opts:(`string`binary`fixed_binary)!("start";"x"$"alert";0Ng); + +options:(``NULL_MAPPING)!((::);str_opts); + +N:5 + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; 
+ +str_dt:.arrowkdb.dt.utf8[]; +bin_dt:.arrowkdb.dt.binary[]; +fbin_dt:.arrowkdb.dt.fixed_size_binary[16i]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +str_fd:.arrowkdb.fd.field[`string;str_dt]; +bin_fd:.arrowkdb.fd.field[`binary;bin_dt]; +fbin_fd:.arrowkdb.fd.field[`fixed_binary;fbin_dt]; + +-1"\n+----------|| Create the schemas for the list of fields ||----------+\n"; +str_schema:.arrowkdb.sc.schema[(ts_fd,str_fd,bin_fd,fbin_fd)]; + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +str_data:N?("start";"stop";"alert";"acknowledge";""); +str_data[0]:"start" +bin_data:N?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); +bin_data[2]:"x"$"alert" +fbin_data:N?0Ng; +fbin_data[4]:0Ng; + +-1"\n+----------|| Combine the data for all columns ||----------+\n"; +str_data:(ts_data;str_data;bin_data;fbin_data); + +-1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n"; +options[`PARQUET_VERSION]:`V2.0 + +parquet_str:"null_mapping_str.parquet"; +.arrowkdb.pq.writeParquet[parquet_str;str_schema;str_data;options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +parquet_str_schema:.arrowkdb.pq.readParquetSchema[parquet_str]; +.arrowkdb.sc.equalSchemas[str_schema;parquet_str_schema] +str_schema~parquet_str_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +parquet_str_data:.arrowkdb.pq.readParquetData[parquet_str;options]; +parquet_str_data[3]:{0x0 sv x} each parquet_str_data[3] // Convert to GUIDs +str_data~parquet_str_data +rm parquet_str; + +-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; +arrow_str:"null_mapping_str.arrow"; +.arrowkdb.ipc.writeArrow[arrow_str;str_schema;str_data;options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +arrow_str_schema:.arrowkdb.ipc.readArrowSchema[arrow_str]; 
+.arrowkdb.sc.equalSchemas[str_schema;arrow_str_schema] +str_schema~arrow_str_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +arrow_str_data:.arrowkdb.ipc.readArrowData[arrow_str;options]; +arrow_str_data[3]:{0x0 sv x} each arrow_str_data[3] // Convert to GUIDs +str_data~arrow_str_data +rm arrow_str; + +-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; +serialized_str:.arrowkdb.ipc.serializeArrow[str_schema;str_data;options]; + +-1"\n+----------|| Parse the schema back abd compare ||----------+\n"; +stream_str_schema:.arrowkdb.ipc.parseArrowSchema[serialized_str]; +.arrowkdb.sc.equalSchemas[str_schema;stream_str_schema] +str_schema~stream_str_schema + +-1"\n+----------|| Parse the array data back and compare ||----------+\n"; +stream_str_data:.arrowkdb.ipc.parseArrowData[serialized_str;options]; +stream_str_data[3]:{0x0 sv x} each stream_str_data[3] // Convert to GUIDs +str_data~stream_str_data + + +-1 "\n+----------|| Test utils ||----------+\n"; + +show util.buildInfo[] +(type util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; From 4d4da57df9e19d216518460ff79edccb58ee0a2a Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 8 Feb 2023 17:29:41 +0000 Subject: [PATCH 059/276] Null mapping of temporal fields --- tests/.gitignore | 1 + tests/null_mapping_time.t | 89 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 tests/null_mapping_time.t diff --git a/tests/.gitignore b/tests/.gitignore index deb3a89..32522e9 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -3,3 +3,4 @@ null_mapping_short.q null_mapping_long.q null_mapping_float.q null_mapping_str.q +null_mapping_time.q diff --git a/tests/null_mapping_time.t b/tests/null_mapping_time.t new file mode 100644 index 0000000..e06c1f5 --- /dev/null +++ b/tests/null_mapping_time.t @@ -0,0 +1,89 @@ +// null_mapping_time.q + +-1"\n+----------|| Import the 
arrowkdb library ||----------+\n"; +\l arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping ||----------+\n"; +time_opts:(`date32`timestamp`time64)!(2006.07.21;2011.01.01D00:00:00.000000000;12:00:00.000000000); + +options:(``NULL_MAPPING)!((::);time_opts); + +N:5 + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +d32_dt:.arrowkdb.dt.date32[]; +tstamp_dt:.arrowkdb.dt.timestamp[`nano]; +t64_dt:.arrowkdb.dt.time64[`nano]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; +tstamp_fd:.arrowkdb.fd.field[`timestamp;tstamp_dt]; +t64_fd:.arrowkdb.fd.field[`time64;t64_dt]; + +-1"\n+----------|| Create the schemas for the list of fields ||----------+\n"; +time_schema:.arrowkdb.sc.schema[(ts_fd,d32_fd,tstamp_fd,t64_fd)]; + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +d32_data:N?(2006.07.21;2008.07.18;2012.07.16;2014.07.15;2016.07.11); +d32_data[0]:2006.07.21; +tstamp_data:N?(2015.01.01D00:00:00.000000000;2014.01.01D00:00:00.000000000;2013.01.01D00:00:00.000000000;2012.01.01D00:00:00.000000000;2011.01.01D00:00:00.000000000); +tstamp_data[2]:2011.01.01D00:00:00.000000000; +t64_data:N?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); +t64_data[3]:12:00:00.000000000; + +-1"\n+----------|| Combine the data for all columns ||----------+\n"; +time_data:(ts_data;d32_data;tstamp_data;t64_data); + +-1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n"; +options[`PARQUET_VERSION]:`V2.0 + +parquet_time:"null_mapping_time.parquet"; +.arrowkdb.pq.writeParquet[parquet_time;time_schema;time_data;options]; + +-1"\n+----------|| 
Read the schema back and compare ||----------+\n"; +parquet_time_schema:.arrowkdb.pq.readParquetSchema[parquet_time]; +.arrowkdb.sc.equalSchemas[time_schema;parquet_time_schema] +time_schema~parquet_time_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +parquet_time_data:.arrowkdb.pq.readParquetData[parquet_time;options]; +time_data~parquet_time_data +rm parquet_time; + +-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; +arrow_time:"null_mapping_time.arrow"; +.arrowkdb.ipc.writeArrow[arrow_time;time_schema;time_data;options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +arrow_time_schema:.arrowkdb.ipc.readArrowSchema[arrow_time]; +.arrowkdb.sc.equalSchemas[time_schema;arrow_time_schema] +time_schema~arrow_time_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +arrow_time_data:.arrowkdb.ipc.readArrowData[arrow_time;options]; +time_data~arrow_time_data +rm arrow_time; + +-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; +serialized_time:.arrowkdb.ipc.serializeArrow[time_schema;time_data;options]; + +-1"\n+----------|| Parse the schema back abd compare ||----------+\n"; +stream_time_schema:.arrowkdb.ipc.parseArrowSchema[serialized_time]; +.arrowkdb.sc.equalSchemas[time_schema;stream_time_schema] +time_schema~stream_time_schema + +-1"\n+----------|| Parse the array data back and compare ||----------+\n"; +stream_time_data:.arrowkdb.ipc.parseArrowData[serialized_time;options]; +time_data~stream_time_data + + +-1 "\n+----------------------------------------+\n"; \ No newline at end of file From 795a595825d93f2ba72b759032572b78ed7d4f8d Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 8 Feb 2023 17:53:02 +0000 Subject: [PATCH 060/276] Null mapping of extra fields, unsupported by parquet --- tests/.gitignore | 1 + tests/null_mapping_extra.t | 84 ++++++++++++++++++++++++++++++++++++++ 2 
files changed, 85 insertions(+) create mode 100644 tests/null_mapping_extra.t diff --git a/tests/.gitignore b/tests/.gitignore index 32522e9..e80b02e 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -4,3 +4,4 @@ null_mapping_long.q null_mapping_float.q null_mapping_str.q null_mapping_time.q +null_mapping_extra.q diff --git a/tests/null_mapping_extra.t b/tests/null_mapping_extra.t new file mode 100644 index 0000000..4e6405d --- /dev/null +++ b/tests/null_mapping_extra.t @@ -0,0 +1,84 @@ +// null_mapping_extra.q + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping only in arrow ||----------+\n"; +extra_opts:(`float16`large_string`large_binary`duration)!(9h;"stop";"x"$"acknowledge";12:00:00.000000000); + +options:(``NULL_MAPPING)!((::);extra_opts); + +N:5 + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +f16_dt:.arrowkdb.dt.float16[]; +lstr_dt:.arrowkdb.dt.large_utf8[]; +lbin_dt:.arrowkdb.dt.large_binary[]; +dur_dt:.arrowkdb.dt.duration[`milli]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +f16_fd:.arrowkdb.fd.field[`float16;f16_dt]; +lstr_fd:.arrowkdb.fd.field[`large_string;lstr_dt]; +lbin_fd:.arrowkdb.fd.field[`large_binary;lbin_dt]; +dur_fd:.arrowkdb.fd.field[`duration;dur_dt]; + +-1"\n+----------|| Create the schemas for the list of fields ||----------+\n"; +extra_schema:.arrowkdb.sc.schema[(ts_fd,f16_fd,lstr_fd,lbin_fd,dur_fd)]; + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +f16_data:N?100h; +f16_data[0]:9h; +lstr_data:N?("start";"stop";"alert";"acknowledge";""); +lstr_data[1]:"stop" 
+lbin_data:N?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); +lbin_data[3]:"x"$"acknowledge" +dur_data:N?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); +dur_data[4]:12:00:00.000000000; + +-1"\n+----------|| Combine the data for all columns ||----------+\n"; +extra_data:(ts_data;f16_data;lstr_data;lbin_data;dur_data); + +-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; +arrow_extra:"null_mapping_extra.arrow"; +.arrowkdb.ipc.writeArrow[arrow_extra;extra_schema;extra_data;options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +arrow_extra_schema:.arrowkdb.ipc.readArrowSchema[arrow_extra]; +.arrowkdb.sc.equalSchemas[extra_schema;arrow_extra_schema] +extra_schema~arrow_extra_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +arrow_extra_data:.arrowkdb.ipc.readArrowData[arrow_extra;options]; +extra_data~arrow_extra_data +rm arrow_extra; + +-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; +serialized_extra:.arrowkdb.ipc.serializeArrow[extra_schema;extra_data;options]; + +-1"\n+----------|| Parse the schema back abd compare ||----------+\n"; +stream_extra_schema:.arrowkdb.ipc.parseArrowSchema[serialized_extra]; +.arrowkdb.sc.equalSchemas[extra_schema;stream_extra_schema] +extra_schema~stream_extra_schema + +-1"\n+----------|| Parse the array data back and compare ||----------+\n"; +stream_extra_data:.arrowkdb.ipc.parseArrowData[serialized_extra;options]; + +extra_data~stream_extra_data + + +-1 "\n+----------|| Test utils ||----------+\n"; + +show util.buildInfo[] +(type util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; From 0bbb2b49c2f8e7ff5eb0d76c57fe09a9c8eb61f9 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 8 Feb 2023 18:02:25 +0000 Subject: [PATCH 061/276] Null mapping of other fields, unsupported by parquet ---
tests/.gitignore | 1 + tests/null_mapping_other.t | 83 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) create mode 100644 tests/null_mapping_other.t diff --git a/tests/.gitignore b/tests/.gitignore index e80b02e..6b77dee 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -5,3 +5,4 @@ null_mapping_float.q null_mapping_str.q null_mapping_time.q null_mapping_extra.q +null_mapping_other.q diff --git a/tests/null_mapping_other.t b/tests/null_mapping_other.t new file mode 100644 index 0000000..bff0dc3 --- /dev/null +++ b/tests/null_mapping_other.t @@ -0,0 +1,83 @@ +// null_mapping_other.q + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping only in arrow ||----------+\n"; +other_opts:(`date64`time32`month_interval`day_time_interval)!(2015.01.01D00:00:00.000000000;09:01:02.042;2006.07m;12:00:00.000000000); + +options:(``NULL_MAPPING)!((::);other_opts); + +N:5 + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +d64_dt:.arrowkdb.dt.date64[]; +t32_dt:.arrowkdb.dt.time32[`milli]; +mint_dt:.arrowkdb.dt.month_interval[]; +dtint_dt:.arrowkdb.dt.day_time_interval[]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +d64_fd:.arrowkdb.fd.field[`date64;d64_dt]; +t32_fd:.arrowkdb.fd.field[`time32;t32_dt]; +mint_fd:.arrowkdb.fd.field[`month_interval;mint_dt]; +dtint_fd:.arrowkdb.fd.field[`day_time_interval;dtint_dt]; + +-1"\n+----------|| Create the schemas for the list of fields ||----------+\n"; +other_schema:.arrowkdb.sc.schema[(ts_fd,d64_fd,t32_fd,mint_fd,dtint_fd)]; + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + 
+d64_data:N?(2015.01.01D00:00:00.000000000;2017.01.01D00:00:00.000000000;2018.01.01D00:00:00.000000000;2019.01.01D00:00:00.000000000;2020.01.01D00:00:00.000000000); +d64_data[1]:2015.01.01D00:00:00.000000000; +t32_data:N?(09:01:02.042;08:01:02.042;07:01:02.042;06:01:02.042;05:01:02.042); +t32_data[1]:09:01:02.042; +mint_data:N?(2006.07m;2006.06m;2006.05m;2006.04m;2006.03m); +mint_data[2]:2006.07m; +dtint_data:N?(12:00:00.000000000;11:00:00.000000000;10:00:00.000000000;09:00:00.000000000;08:00:00.000000000); +dtint_data[3]:12:00:00.000000000; + +-1"\n+----------|| Combine the data for all columns ||----------+\n"; +other_data:(ts_data;d64_data;t32_data;mint_data;dtint_data); + +-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; +arrow_other:"null_mapping_other.arrow"; +.arrowkdb.ipc.writeArrow[arrow_other;other_schema;other_data;options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +arrow_other_schema:.arrowkdb.ipc.readArrowSchema[arrow_other]; +.arrowkdb.sc.equalSchemas[other_schema;arrow_other_schema] +other_schema~arrow_other_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +arrow_other_data:.arrowkdb.ipc.readArrowData[arrow_other;options]; +other_data~arrow_other_data +rm arrow_other; + +-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; +serialized_other:.arrowkdb.ipc.serializeArrow[other_schema;other_data;options]; + +-1"\n+----------|| Parse the schema back abd compare ||----------+\n"; +stream_other_schema:.arrowkdb.ipc.parseArrowSchema[serialized_other]; +.arrowkdb.sc.equalSchemas[other_schema;stream_other_schema] +other_schema~stream_other_schema + +-1"\n+----------|| Parse the array data back and compare ||----------+\n"; +stream_other_data:.arrowkdb.ipc.parseArrowData[serialized_other;options]; +other_data~stream_other_data + + +-1 "\n+----------|| Test utils ||----------+\n"; + +show util.buildInfo[] +(type 
util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; From c3a18c87671f69661382111bde28e73710690fe8 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 13 Feb 2023 15:51:18 +0000 Subject: [PATCH 062/276] Disabled OSX and Windows builds --- .travis.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 7deb2e9..3084fc5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,9 +4,9 @@ jobs: os: linux - dist: focal os: linux - - osx_image: xcode12.5 - os: osx - - os: windows +# - osx_image: xcode12.5 +# os: osx +# - os: windows language: c compiler: gcc os: linux From 570549ad3e09740ba6cab8905b15af465d03203c Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 13 Feb 2023 16:16:07 +0000 Subject: [PATCH 063/276] Extra namespace for build info --- tests/null_mapping_extra.t | 4 ++-- tests/null_mapping_float.t | 4 ++-- tests/null_mapping_long.t | 4 ++-- tests/null_mapping_other.t | 4 ++-- tests/null_mapping_short.t | 6 +++--- tests/null_mapping_str.t | 4 ++-- tests/null_mapping_time.t | 8 +++++++- 7 files changed, 20 insertions(+), 14 deletions(-) diff --git a/tests/null_mapping_extra.t b/tests/null_mapping_extra.t index 4e6405d..b269370 100644 --- a/tests/null_mapping_extra.t +++ b/tests/null_mapping_extra.t @@ -77,8 +77,8 @@ extra_data~stream_extra_data -1 "\n+----------|| Test utils ||----------+\n"; -show util.buildInfo[] -(type util.buildInfo[])~99h +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h -1 "\n+----------|| Finished testing ||----------+\n"; diff --git a/tests/null_mapping_float.t b/tests/null_mapping_float.t index 96a2d26..d036af0 100644 --- a/tests/null_mapping_float.t +++ b/tests/null_mapping_float.t @@ -89,8 +89,8 @@ float_data~stream_float_data -1 "\n+----------|| Test utils ||----------+\n"; -show util.buildInfo[] -(type util.buildInfo[])~99h +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h -1 "\n+----------|| 
Finished testing ||----------+\n"; diff --git a/tests/null_mapping_long.t b/tests/null_mapping_long.t index db4c2b5..fcc1541 100644 --- a/tests/null_mapping_long.t +++ b/tests/null_mapping_long.t @@ -92,8 +92,8 @@ long_data~stream_long_data -1 "\n+----------|| Test utils ||----------+\n"; -show util.buildInfo[] -(type util.buildInfo[])~99h +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h -1 "\n+----------|| Finished testing ||----------+\n"; diff --git a/tests/null_mapping_other.t b/tests/null_mapping_other.t index bff0dc3..05cb069 100644 --- a/tests/null_mapping_other.t +++ b/tests/null_mapping_other.t @@ -76,8 +76,8 @@ other_data~stream_other_data -1 "\n+----------|| Test utils ||----------+\n"; -show util.buildInfo[] -(type util.buildInfo[])~99h +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h -1 "\n+----------|| Finished testing ||----------+\n"; diff --git a/tests/null_mapping_short.t b/tests/null_mapping_short.t index 59dc02d..f2c8816 100644 --- a/tests/null_mapping_short.t +++ b/tests/null_mapping_short.t @@ -96,8 +96,8 @@ short_data~stream_short_data -1 "\n+----------|| Test utils ||----------+\n"; -show util.buildInfo[] -(type util.buildInfo[])~99h +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h --1 "\n+----------|| Finished testing ||----------+\n"; \ No newline at end of file +-1 "\n+----------|| Finished testing ||----------+\n"; diff --git a/tests/null_mapping_str.t b/tests/null_mapping_str.t index 95bdd95..8220eac 100644 --- a/tests/null_mapping_str.t +++ b/tests/null_mapping_str.t @@ -91,8 +91,8 @@ str_data~stream_str_data -1 "\n+----------|| Test utils ||----------+\n"; -show util.buildInfo[] -(type util.buildInfo[])~99h +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h -1 "\n+----------|| Finished testing ||----------+\n"; diff --git a/tests/null_mapping_time.t b/tests/null_mapping_time.t index e06c1f5..d24914b 100644 --- a/tests/null_mapping_time.t +++ 
b/tests/null_mapping_time.t @@ -86,4 +86,10 @@ stream_time_data:.arrowkdb.ipc.parseArrowData[serialized_time;options]; time_data~stream_time_data --1 "\n+----------------------------------------+\n"; \ No newline at end of file +-1 "\n+----------|| Test utils ||----------+\n"; + +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; From 7b6462c28dea3f2364ae33a07aec78c736f12283 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 13 Feb 2023 16:35:07 +0000 Subject: [PATCH 064/276] Adjusting travis root repo folder --- tests/null_mapping_extra.t | 2 +- tests/null_mapping_float.t | 2 +- tests/null_mapping_long.t | 2 +- tests/null_mapping_other.t | 2 +- tests/null_mapping_short.t | 2 +- tests/null_mapping_str.t | 2 +- tests/null_mapping_time.t | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/null_mapping_extra.t b/tests/null_mapping_extra.t index b269370..3ac2e1b 100644 --- a/tests/null_mapping_extra.t +++ b/tests/null_mapping_extra.t @@ -1,7 +1,7 @@ // null_mapping_extra.q -1"\n+----------|| Import the arrowkdb library ||----------+\n"; -\l arrowkdb.q +\l q/arrowkdb.q -1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; diff --git a/tests/null_mapping_float.t b/tests/null_mapping_float.t index d036af0..e61b9ed 100644 --- a/tests/null_mapping_float.t +++ b/tests/null_mapping_float.t @@ -1,7 +1,7 @@ // null_mapping_float.q -1"\n+----------|| Import the arrowkdb library ||----------+\n"; -\l arrowkdb.q +\l q/arrowkdb.q -1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; diff --git a/tests/null_mapping_long.t b/tests/null_mapping_long.t index fcc1541..6c64a4b 100644 --- a/tests/null_mapping_long.t +++ b/tests/null_mapping_long.t @@ -1,7 +1,7 @@ // 
null_mapping_long.q -1"\n+----------|| Import the arrowkdb library ||----------+\n"; -\l arrowkdb.q +\l q/arrowkdb.q -1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; diff --git a/tests/null_mapping_other.t b/tests/null_mapping_other.t index 05cb069..6228a28 100644 --- a/tests/null_mapping_other.t +++ b/tests/null_mapping_other.t @@ -1,7 +1,7 @@ // null_mapping_other.q -1"\n+----------|| Import the arrowkdb library ||----------+\n"; -\l arrowkdb.q +\l q/arrowkdb.q -1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; diff --git a/tests/null_mapping_short.t b/tests/null_mapping_short.t index f2c8816..1c4dfec 100644 --- a/tests/null_mapping_short.t +++ b/tests/null_mapping_short.t @@ -1,7 +1,7 @@ // null_mapping_short.t -1"\n+----------|| Import the arrowkdb library ||----------+\n"; -\l arrowkdb.q +\l q/arrowkdb.q -1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; diff --git a/tests/null_mapping_str.t b/tests/null_mapping_str.t index 8220eac..295d533 100644 --- a/tests/null_mapping_str.t +++ b/tests/null_mapping_str.t @@ -1,7 +1,7 @@ // null_mapping_str.q -1"\n+----------|| Import the arrowkdb library ||----------+\n"; -\l arrowkdb.q +\l q/arrowkdb.q -1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; diff --git a/tests/null_mapping_time.t b/tests/null_mapping_time.t index d24914b..c006ed4 100644 --- a/tests/null_mapping_time.t +++ b/tests/null_mapping_time.t @@ -1,7 +1,7 @@ // null_mapping_time.q -1"\n+----------|| Import the arrowkdb library ||----------+\n"; -\l arrowkdb.q +\l q/arrowkdb.q -1"\n+----------|| Filesystem functions 
for Linux/MacOS/Windows ||----------+\n"; rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; From 79fdb6c4a631794e38878a3bd330afda117d8acb Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 13 Feb 2023 17:08:27 +0000 Subject: [PATCH 065/276] Renaming to prevent caching of datatypes, fields and schemas --- tests/.gitignore | 2 +- tests/{test.t => basic.t} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename tests/{test.t => basic.t} (100%) diff --git a/tests/.gitignore b/tests/.gitignore index 6b77dee..c0832d1 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -1,4 +1,4 @@ -test.q +basic.q null_mapping_short.q null_mapping_long.q null_mapping_float.q diff --git a/tests/test.t b/tests/basic.t similarity index 100% rename from tests/test.t rename to tests/basic.t From ec4046e465bf8163adb1ab9cd0424aea63b10628 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 9 Feb 2023 14:55:44 +0300 Subject: [PATCH 066/276] Example for bitmap reading option --- examples/null_bitmap.q | 87 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 examples/null_bitmap.q diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q new file mode 100644 index 0000000..68f1bfd --- /dev/null +++ b/examples/null_bitmap.q @@ -0,0 +1,87 @@ +// null_bitmap.q +// Example of exposing null bitmap as a separate structure to kdb + +-1"\n+----------|| null_bitmap.q ||----------+\n"; + +// import the arrowkdb library +\l arrowkdb.q + +// Filesystem functions for Linux/MacOS/Windows +ls:{[filename] $[.z.o like "w*";system "dir /b ",filename;system "ls ",filename]}; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +///////////////////////// +// CONSTRUCTED SCHEMAS // +///////////////////////// + +//-------------------// +// Create the schema // +//-------------------// + +// Support null mapping +bitmap_opts:(`bool`int32`float64`string`date32)!(0b;1i;2.34;"start";2006.07.21); 
+ +options:(``NULL_MAPPING)!((::);bitmap_opts); + +// Create the datatype identifiers +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +bool_dt:.arrowkdb.dt.boolean[]; +i32_dt:.arrowkdb.dt.int32[]; +f64_dt:.arrowkdb.dt.float64[]; +str_dt:.arrowkdb.dt.utf8[]; +d32_dt:.arrowkdb.dt.date32[]; + +// Create the field identifiers +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +bool_fd:.arrowkdb.fd.field[`bool;bool_dt]; +i32_fd:.arrowkdb.fd.field[`int32;i32_dt]; +f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; +str_fd:.arrowkdb.fd.field[`string;str_dt]; +d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; + +// Create the schemas for the list of fields +bitmap_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,i32_fd,f64_fd,str_fd,d32_fd)]; + +// Print the schema +.arrowkdb.sc.printSchema[bitmap_schema]; + +// Create data for each column in the table +ts_data:asc N?0p; + +bool_data:N?(0b;1b); +bool_data[0]:0b; +i32_data:N?100i; +i32_data[1]:1i; +f64_data:N?100f; +f64_data[2]:2.34f; +str_data:N?("start";"stop";"alert";"acknowledge";""); +str_data[3]:"start" +d32_data:N?(2006.07.21;2005.07.18;2004.07.16;2003.07.15;2002.07.11); +d32_data[4]:2006.07.21; + +// Combine the data for all columns +bitmap_data:(ts_data;bool_data;i32_data;f64_data;str_data;d32_data); + +// Pretty print the Arrow table populated from the bitmap data +.arrowkdb.tb.prettyPrintTable[bitmap_schema;bitmap_data;options]; + +//-------------------------// +// Example-1. 
Parquet file // +//-------------------------// + +// Write the schema and array data to a parquet file +options[`PARQUET_VERSION]:`V2.0 + +parquet_bitmap:"null_bitmap.parquet"; +.arrowkdb.pq.writeParquet[parquet_bitmap;bitmap_schema;bitmap_data;options]; +show ls parquet_bitmap + +// Read the parquet file into another table +parquet_table:.arrowkdb.pq.readParquetToTable[parquet_bitmap;(``WITH_NULL_BITMAP)!((::);1)]; +.arrowkdb.tb.prettyPrintTableFromTable[parquet_table;::]; + +// Compare the kdb+ tables +show bitmap_data~parquet_table +//rm parquet_bitmap; \ No newline at end of file From d40b07844d7a0c6cd842c3721854f8f603cf3329 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 10 Feb 2023 13:01:32 +0300 Subject: [PATCH 067/276] Example of reading Parquet's data with null bitmap --- examples/null_bitmap.q | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index 68f1bfd..9f3faeb 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -72,16 +72,13 @@ bitmap_data:(ts_data;bool_data;i32_data;f64_data;str_data;d32_data); //-------------------------// // Write the schema and array data to a parquet file -options[`PARQUET_VERSION]:`V2.0 +options[`PARQUET_VERSION]:`V2.0; parquet_bitmap:"null_bitmap.parquet"; .arrowkdb.pq.writeParquet[parquet_bitmap;bitmap_schema;bitmap_data;options]; show ls parquet_bitmap -// Read the parquet file into another table -parquet_table:.arrowkdb.pq.readParquetToTable[parquet_bitmap;(``WITH_NULL_BITMAP)!((::);1)]; -.arrowkdb.tb.prettyPrintTableFromTable[parquet_table;::]; - -// Compare the kdb+ tables -show bitmap_data~parquet_table -//rm parquet_bitmap; \ No newline at end of file +// Read the array data back and compare +parquet_bitmap_data:.arrowkdb.pq.readParquetData[parquet_bitmap;(``WITH_NULL_BITMAP)!((::);1)]; +show bitmap_data~parquet_bitmap_data +rm parquet_bitmap; From 4e187adc4838b72dc17dd0edba1a3dc6e573aa57 Mon Sep 17 00:00:00 2001 
From: Vyacheslav Grechin Date: Fri, 10 Feb 2023 13:04:17 +0300 Subject: [PATCH 068/276] Null bitmap reader --- src/ArrayReader.cpp | 36 ++++++++++++++++++++++++++++++++++++ src/ArrayReader.h | 24 ++++++++++++++++++++++++ src/KdbOptions.h | 2 ++ src/TableData.cpp | 14 ++++++++++++++ 4 files changed, 76 insertions(+) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 30e34b4..3a3e336 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -634,6 +634,30 @@ void AppendArray(shared_ptr array_data, K k_array, size_t& index, } } +void AppendNullBitmap( shared_ptr array_data, K k_bitmap, size_t& index ) +{ + auto type_id = array_data->type_id(); + const uint8_t* null_data = array_data->null_bitmap_data(); + if( null_data == nullptr || array_data->null_count() == 0 + || arrow::Type::LIST == type_id + || arrow::Type::LARGE_LIST == type_id + || arrow::Type::FIXED_SIZE_LIST == type_id + || arrow::Type::MAP == type_id + || arrow::Type::STRUCT == type_id + || arrow::Type::SPARSE_UNION == type_id + || arrow::Type::DENSE_UNION == type_id + || arrow::Type::DICTIONARY == type_id ){ + memset( &kG( k_bitmap )[index], 0, array_data->length() ); + index += array_data->length(); + } + else{ + for(auto i = 0; i < array_data->length(); ++i ){ + kG( k_bitmap )[index] = null_data[index]; + ++index; + } + } +} + K InitKdbForArray(shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides) { switch (datatype->id()) { @@ -696,6 +720,18 @@ K ReadChunkedArray(shared_ptr chunked_array, TypeMappingOve return k_array; } +K ReadChunkedNullBitmap( shared_ptr chunked_array, TypeMappingOverride& type_overrides ) +{ + auto bitmapDatatype = std::make_shared(); + K k_bitmap = InitKdbForArray( bitmapDatatype, chunked_array->length(), type_overrides ); + size_t index = 0; + for( auto i = 0; i < chunked_array->num_chunks(); ++i ){ + AppendNullBitmap( chunked_array->chunk( i ), k_bitmap, index ); + } + + return k_bitmap; +} + } // namespace arrowkdb } // namspace kx diff --git 
a/src/ArrayReader.h b/src/ArrayReader.h index 3298190..16c72b5 100644 --- a/src/ArrayReader.h +++ b/src/ArrayReader.h @@ -26,6 +26,22 @@ namespace arrowkdb { */ void AppendArray(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides); + +/** + * @brief Appends null bitmap data from an arrow array into an existing kdb boolean + * list starting at the specified index. + * + * @param array_data The arrow array from which to source the data. The entire + * array will be appended. + * @param k_bitmap The kdb boolean list that the data should be inserted into. + * This list needs to have been created with the correct length by the calling + * function. + * @param index The index into the kdb list at which the appending should + * begin. Index will be updated to account for the new offset by adding the + * length of the array array. +*/ +void AppendNullBitmap( std::shared_ptr array_data, K k_bitmap, size_t& index ); + /** * @brief Copies and converts an arrow array to a kdb list * @@ -45,6 +61,14 @@ K ReadArray(std::shared_ptr array, TypeMappingOverride& type_overr */ K ReadChunkedArray(std::shared_ptr chunked_array, TypeMappingOverride& type_overrides); +/** + * @brief Extracts nulls bitmap of an arrow array into a boolean kdb list + * + * @param chunked_array The chunked array to be converted + * @return A kdb list representing the nulls bitmap +*/ +K ReadChunkedNullBitmap( std::shared_ptr chunked_array, TypeMappingOverride& type_overrides ); + /** * @brief Creates a kdb list of the correct type and specified length according * to the arrow datatype. 
For the arrow struct/union datatypes this includes diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 6a8fc84..6e7d59f 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -68,6 +68,7 @@ namespace Options const std::string PARQUET_MULTITHREADED_READ = "PARQUET_MULTITHREADED_READ"; const std::string USE_MMAP = "USE_MMAP"; const std::string DECIMAL128_AS_DOUBLE = "DECIMAL128_AS_DOUBLE"; + const std::string WITH_NULL_BITMAP = "WITH_NULL_BITMAP"; // String options const std::string PARQUET_VERSION = "PARQUET_VERSION"; @@ -108,6 +109,7 @@ namespace Options PARQUET_MULTITHREADED_READ, USE_MMAP, DECIMAL128_AS_DOUBLE, + WITH_NULL_BITMAP }; const static std::set string_options = { PARQUET_VERSION, diff --git a/src/TableData.cpp b/src/TableData.cpp index 0deeb88..22db1f2 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -294,6 +294,20 @@ K readParquetData(K parquet_file, K options) kK(data)[i] = kx::arrowkdb::ReadChunkedArray(chunked_array, type_overrides); } + int64_t with_null_bitmap = 0; + read_options.GetIntOption( kx::arrowkdb::Options::WITH_NULL_BITMAP, with_null_bitmap ); + if( with_null_bitmap ){ + K bitmap = ktn( 0, col_num ); + for( auto i = 0; i < col_num; ++i ){ + auto chunked_array = table->column( i ); + kK( bitmap )[i] = kx::arrowkdb::ReadChunkedNullBitmap( chunked_array, type_overrides ); + } + K array = data; + data = ktn( 0, 2 ); + kK( data )[0] = array; + kK( data )[1] = bitmap; + } + return data; KDB_EXCEPTION_CATCH; From 2d8e79ae3e861c0b0b32583a5e0c31d3a2c92fe9 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 10 Feb 2023 20:25:18 +0300 Subject: [PATCH 069/276] Null bitmap debugging changes --- src/ArrayReader.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 3a3e336..16c1043 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -637,8 +637,7 @@ void AppendArray(shared_ptr array_data, K k_array, size_t& index, void AppendNullBitmap( 
shared_ptr array_data, K k_bitmap, size_t& index ) { auto type_id = array_data->type_id(); - const uint8_t* null_data = array_data->null_bitmap_data(); - if( null_data == nullptr || array_data->null_count() == 0 + if( array_data->null_count() == 0 || arrow::Type::LIST == type_id || arrow::Type::LARGE_LIST == type_id || arrow::Type::FIXED_SIZE_LIST == type_id @@ -651,8 +650,8 @@ void AppendNullBitmap( shared_ptr array_data, K k_bitmap, size_t& index += array_data->length(); } else{ - for(auto i = 0; i < array_data->length(); ++i ){ - kG( k_bitmap )[index] = null_data[index]; + for( auto i = 0; i < array_data->length(); ++i ){ + kG( k_bitmap )[index] = array_data->IsNull( index ); ++index; } } @@ -722,8 +721,8 @@ K ReadChunkedArray(shared_ptr chunked_array, TypeMappingOve K ReadChunkedNullBitmap( shared_ptr chunked_array, TypeMappingOverride& type_overrides ) { - auto bitmapDatatype = std::make_shared(); - K k_bitmap = InitKdbForArray( bitmapDatatype, chunked_array->length(), type_overrides ); + auto boolean = std::make_shared(); + K k_bitmap = InitKdbForArray( boolean, chunked_array->length(), type_overrides ); size_t index = 0; for( auto i = 0; i < chunked_array->num_chunks(); ++i ){ AppendNullBitmap( chunked_array->chunk( i ), k_bitmap, index ); From 0c3f2a40789087faa0ac3c8fb9dcf5fc71b58a7c Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 10 Feb 2023 20:26:41 +0300 Subject: [PATCH 070/276] Bitmap test improved --- examples/null_bitmap.q | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index 9f3faeb..b45e347 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -79,6 +79,12 @@ parquet_bitmap:"null_bitmap.parquet"; show ls parquet_bitmap // Read the array data back and compare -parquet_bitmap_data:.arrowkdb.pq.readParquetData[parquet_bitmap;(``WITH_NULL_BITMAP)!((::);1)]; -show bitmap_data~parquet_bitmap_data +options[`WITH_NULL_BITMAP]:1; 
+parquet_bitmap_data:.arrowkdb.pq.readParquetData[parquet_bitmap;options]; +show bitmap_data~first parquet_bitmap_data + +nulls_data:1b,(N-1)?1b +bitmap_nulls:{x rotate nulls_data} each neg til {x-1} count bitmap_data +parquet_bitmap_nulls:last parquet_bitmap_data +show bitmap_nulls~bitmap_nulls & sublist[{1-x} count parquet_bitmap_nulls;parquet_bitmap_nulls] rm parquet_bitmap; From 36948e6a9d2cf00fcb3053c465b41c6285698b18 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 13 Feb 2023 13:50:48 +0300 Subject: [PATCH 071/276] Example of reading of Null's bitmap from Arrow --- examples/null_bitmap.q | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index b45e347..6bfd3b3 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -83,8 +83,29 @@ options[`WITH_NULL_BITMAP]:1; parquet_bitmap_data:.arrowkdb.pq.readParquetData[parquet_bitmap;options]; show bitmap_data~first parquet_bitmap_data -nulls_data:1b,(N-1)?1b -bitmap_nulls:{x rotate nulls_data} each neg til {x-1} count bitmap_data -parquet_bitmap_nulls:last parquet_bitmap_data +nulls_data:1b,(N-1)?1b; +bitmap_nulls:{x rotate nulls_data} each neg til {x-1} count bitmap_data; +parquet_bitmap_nulls:last parquet_bitmap_data; show bitmap_nulls~bitmap_nulls & sublist[{1-x} count parquet_bitmap_nulls;parquet_bitmap_nulls] rm parquet_bitmap; + +//---------------------------// +// Example-2. 
Arrow IPC file // +//---------------------------// + +// Write the schema and array data to an arrow file +arrow_bitmap:"null_bitmap.arrow"; +.arrowkdb.ipc.writeArrow[arrow_bitmap;bitmap_schema;bitmap_data;options]; +show ls arrow_bitmap + +// Read the schema back and compare +arrow_bitmap_schema:.arrowkdb.ipc.readArrowSchema[arrow_bitmap]; +show .arrowkdb.sc.equalSchemas[bitmap_schema;arrow_bitmap_schema] +show bitmap_schema~arrow_bitmap_schema + +// Read the array data back and compare +arrow_bitmap_data:.arrowkdb.ipc.readArrowData[arrow_bitmap;options]; +show bitmap_data~first arrow_bitmap_data +arrow_bitmap_nulls:last arrow_bitmap_data; +show bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bitmap_nulls] +rm arrow_bitmap; From 0bd5e43383776e0479617ae63715fff46819c1af Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 13 Feb 2023 13:52:29 +0300 Subject: [PATCH 072/276] Read Arraw data with Null's bitmap --- src/TableData.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/TableData.cpp b/src/TableData.cpp index 22db1f2..39ed95a 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -561,6 +561,23 @@ K readArrowData(K arrow_file, K options) kK(data)[i] = kx::arrowkdb::ReadChunkedArray(chunked_array, type_overrides); } + int64_t with_null_bitmap = 0; + read_options.GetIntOption( kx::arrowkdb::Options::WITH_NULL_BITMAP, with_null_bitmap ); + if( with_null_bitmap ){ + K bitmap = ktn( 0, col_num ); + for( auto i = 0; i < col_num; ++i ){ + arrow::ArrayVector column_arrays; + for (auto batch : all_batches) + column_arrays.push_back(batch->column(i)); + auto chunked_array = std::make_shared(column_arrays); + kK( bitmap )[i] = kx::arrowkdb::ReadChunkedNullBitmap( chunked_array, type_overrides ); + } + K array = data; + data = ktn( 0, 2 ); + kK( data )[0] = array; + kK( data )[1] = bitmap; + } + return data; KDB_EXCEPTION_CATCH; From c1cdbaee665a31d402a5843bde17b34770a1b389 Mon Sep 17 00:00:00 2001 From: 
Vyacheslav Grechin Date: Mon, 13 Feb 2023 20:45:27 +0300 Subject: [PATCH 073/276] Arrow stream example for null bitmap --- examples/null_bitmap.q | 28 +++++++++++++++++++++++++++- examples/null_mapping.q | 2 +- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index 6bfd3b3..c8a02fd 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -4,7 +4,7 @@ -1"\n+----------|| null_bitmap.q ||----------+\n"; // import the arrowkdb library -\l arrowkdb.q +\l q/arrowkdb.q // Filesystem functions for Linux/MacOS/Windows ls:{[filename] $[.z.o like "w*";system "dir /b ",filename;system "ls ",filename]}; @@ -109,3 +109,29 @@ show bitmap_data~first arrow_bitmap_data arrow_bitmap_nulls:last arrow_bitmap_data; show bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bitmap_nulls] rm arrow_bitmap; + +//-----------------------------// +// Example-3. Arrow IPC stream // +//-----------------------------// + +// Serialize the schema and array data to an arrow stream +serialized_bitmap:.arrowkdb.ipc.serializeArrow[bitmap_schema;bitmap_data;options]; +show serialized_bitmap + +// Parse the schema back abd compare +stream_bitmap_schema:.arrowkdb.ipc.parseArrowSchema[serialized_bitmap]; +show .arrowkdb.sc.equalSchemas[bitmap_schema;stream_bitmap_schema] +show bitmap_schema~stream_bitmap_schema + +// Parse the array data back and compare +stream_bitmap_data:.arrowkdb.ipc.parseArrowData[serialized_bitmap;options]; +show bitmap_data~first stream_bitmap_data + +stream_bitmap_nulls:last stream_bitmap_data; +show bitmap_nulls~bitmap_nulls & sublist[{1-x} count stream_bitmap_nulls;stream_bitmap_nulls] + + +-1 "\n+----------------------------------------+\n"; + +// Process off +exit 0; diff --git a/examples/null_mapping.q b/examples/null_mapping.q index 627485d..470dd29 100644 --- a/examples/null_mapping.q +++ b/examples/null_mapping.q @@ -4,7 +4,7 @@ -1"\n+----------|| null_mapping.q ||----------+\n"; // 
import the arrowkdb library -\l arrowkdb.q +\l q/arrowkdb.q // Filesystem functions for Linux/MacOS/Windows ls:{[filename] $[.z.o like "w*";system "dir /b ",filename;system "ls ",filename]}; From ef2f8523468ded5325cd6e4d5f726598fd94533e Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 14 Feb 2023 10:55:36 +0300 Subject: [PATCH 074/276] Read Arrow stream with null bitmap --- src/TableData.cpp | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/TableData.cpp b/src/TableData.cpp index 39ed95a..5d84bf9 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -567,9 +567,9 @@ K readArrowData(K arrow_file, K options) K bitmap = ktn( 0, col_num ); for( auto i = 0; i < col_num; ++i ){ arrow::ArrayVector column_arrays; - for (auto batch : all_batches) - column_arrays.push_back(batch->column(i)); - auto chunked_array = std::make_shared(column_arrays); + for( auto batch: all_batches ) + column_arrays.push_back( batch->column( i ) ); + auto chunked_array = std::make_shared( column_arrays ); kK( bitmap )[i] = kx::arrowkdb::ReadChunkedNullBitmap( chunked_array, type_overrides ); } K array = data; @@ -695,6 +695,23 @@ K parseArrowData(K char_array, K options) kK(data)[i] = kx::arrowkdb::ReadChunkedArray(chunked_array, type_overrides); } + int64_t with_null_bitmap = 0; + read_options.GetIntOption( kx::arrowkdb::Options::WITH_NULL_BITMAP, with_null_bitmap ); + if( with_null_bitmap ){ + K bitmap = ktn( 0, col_num ); + for( auto i = 0; i < col_num; ++i ){ + arrow::ArrayVector column_arrays; + for( auto batch: all_batches ) + column_arrays.push_back( batch->column( i ) ); + auto chunked_array = std::make_shared( column_arrays ); + kK( bitmap )[i] = kx::arrowkdb::ReadChunkedNullBitmap( chunked_array, type_overrides ); + } + K array = data; + data = ktn( 0, 2 ); + kK( data )[0] = array; + kK( data )[1] = bitmap; + } + return data; KDB_EXCEPTION_CATCH; From b9a7edd2ba093f6ae6b1c067e59fbcd52ac62d4c Mon Sep 17 00:00:00 2001 From: 
Vyacheslav Grechin Date: Tue, 14 Feb 2023 11:59:35 +0300 Subject: [PATCH 075/276] Cleanup compiler warnings --- src/KdbOptions.h | 133 ++++++++++++++++++----------------------------- 1 file changed, 52 insertions(+), 81 deletions(-) diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 6e7d59f..c60e6e0 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -15,51 +15,6 @@ namespace kx { namespace arrowkdb { -template -constexpr auto toUType( E option ) noexcept -{ - return static_cast>( option ); -} - -template< typename E > -struct ETraits -{ - using Options = std::unordered_map; - - static std::string mapping( E option ){ - auto it = options.find( option ); - if( it != options.end() ){ - return it->second; - } - - return "unknown"; - } - - static std::string mapping( int option ) { return mapping( static_cast( option ) ); } - - static std::set mappings(){ - std::set values; - transform( options.begin(), options.end(), std::inserter( values, end( values ) ), []( const auto& option ){ - return option.second; - } ); - - return values; - } - - static E option( const std::string& value ){ - auto it = std::find_if( options.begin(), options.end(), [&value]( const auto& option ){ - return value == option.second; - } ); - if( it != options.end() ){ - return it->first; - } - - return E( 0 ); - } - - static const Options options; -}; - // Supported options namespace Options { @@ -186,36 +141,6 @@ namespace Options } // namespace Options -template<> -inline const ETraits::Options ETraits::options{ - { arrow::Type::BOOL, arrowkdb::Options::NM_BOOLEAN } - , { arrow::Type::UINT8, arrowkdb::Options::NM_UINT_8 } - , { arrow::Type::INT8, arrowkdb::Options::NM_INT_8 } - , { arrow::Type::UINT16, arrowkdb::Options::NM_UINT_16 } - , { arrow::Type::INT16, arrowkdb::Options::NM_INT_16 } - , { arrow::Type::UINT32, arrowkdb::Options::NM_UINT_32 } - , { arrow::Type::INT32, arrowkdb::Options::NM_INT_32 } - , { arrow::Type::UINT64, arrowkdb::Options::NM_UINT_64 } - , { arrow::Type::INT64, 
arrowkdb::Options::NM_INT_64 } - , { arrow::Type::HALF_FLOAT, arrowkdb::Options::NM_FLOAT_16 } - , { arrow::Type::FLOAT, arrowkdb::Options::NM_FLOAT_32 } - , { arrow::Type::DOUBLE, arrowkdb::Options::NM_FLOAT_64 } - , { arrow::Type::STRING, arrowkdb::Options::NM_STRING } - , { arrow::Type::LARGE_STRING, arrowkdb::Options::NM_LARGE_STRING } - , { arrow::Type::BINARY, arrowkdb::Options::NM_BINARY } - , { arrow::Type::LARGE_BINARY, arrowkdb::Options::NM_LARGE_BINARY } - , { arrow::Type::FIXED_SIZE_BINARY, arrowkdb::Options::NM_FIXED_BINARY } - , { arrow::Type::DATE32, arrowkdb::Options::NM_DATE_32 } - , { arrow::Type::DATE64, arrowkdb::Options::NM_DATE_64 } - , { arrow::Type::TIMESTAMP, arrowkdb::Options::NM_TIMESTAMP } - , { arrow::Type::TIME32, arrowkdb::Options::NM_TIME_32 } - , { arrow::Type::TIME64, arrowkdb::Options::NM_TIME_64 } - , { arrow::Type::DECIMAL, arrowkdb::Options::NM_DECIMAL } - , { arrow::Type::DURATION, arrowkdb::Options::NM_DURATION } - , { arrow::Type::INTERVAL_MONTHS, arrowkdb::Options::NM_MONTH_INTERVAL } - , { arrow::Type::INTERVAL_DAY_TIME, arrowkdb::Options::NM_DAY_TIME_INTERVAL } -}; - // Helper class for reading dictionary of options // // Dictionary key: KS @@ -233,10 +158,12 @@ class KdbOptions const std::set& supported_string_options; const std::set& supported_int_options; const std::set& supported_dict_options; - const std::set& supported_null_mapping_options; + std::set supported_null_mapping_options; using NullMappingHandler = void ( KdbOptions::* )( const std::string&, K ); using NullMappingHandlers = std::unordered_map; + const std::unordered_map null_mapping_types; + static const NullMappingHandlers null_mapping_handlers; private: const std::string ToUpper(std::string str) const @@ -290,7 +217,7 @@ class KdbOptions throw InvalidOption( "Unsupported NULL_MAPPING option '" + key + "'" ); } K value = kK( values )[i]; - auto option = ETraits::option( key ); + auto option = GetNullMappingType( key ); auto it = 
null_mapping_handlers.find( option ); if( it != null_mapping_handlers.end() ){ ( this->*it->second )( key, value ); @@ -373,14 +300,46 @@ class KdbOptions K options , const std::set supported_string_options_ , const std::set supported_int_options_ - , const std::set& supported_dict_options_ = Options::dict_options - , const std::set& supported_null_mapping_options_ = ETraits::mappings() ) + , const std::set& supported_dict_options_ = Options::dict_options ) : null_mapping_options {0} , supported_string_options(supported_string_options_) , supported_int_options(supported_int_options_) , supported_dict_options( supported_dict_options_ ) - , supported_null_mapping_options( supported_null_mapping_options_ ) + , null_mapping_types { + { arrow::Type::BOOL, arrowkdb::Options::NM_BOOLEAN } + , { arrow::Type::UINT8, arrowkdb::Options::NM_UINT_8 } + , { arrow::Type::INT8, arrowkdb::Options::NM_INT_8 } + , { arrow::Type::UINT16, arrowkdb::Options::NM_UINT_16 } + , { arrow::Type::INT16, arrowkdb::Options::NM_INT_16 } + , { arrow::Type::UINT32, arrowkdb::Options::NM_UINT_32 } + , { arrow::Type::INT32, arrowkdb::Options::NM_INT_32 } + , { arrow::Type::UINT64, arrowkdb::Options::NM_UINT_64 } + , { arrow::Type::INT64, arrowkdb::Options::NM_INT_64 } + , { arrow::Type::HALF_FLOAT, arrowkdb::Options::NM_FLOAT_16 } + , { arrow::Type::FLOAT, arrowkdb::Options::NM_FLOAT_32 } + , { arrow::Type::DOUBLE, arrowkdb::Options::NM_FLOAT_64 } + , { arrow::Type::STRING, arrowkdb::Options::NM_STRING } + , { arrow::Type::LARGE_STRING, arrowkdb::Options::NM_LARGE_STRING } + , { arrow::Type::BINARY, arrowkdb::Options::NM_BINARY } + , { arrow::Type::LARGE_BINARY, arrowkdb::Options::NM_LARGE_BINARY } + , { arrow::Type::FIXED_SIZE_BINARY, arrowkdb::Options::NM_FIXED_BINARY } + , { arrow::Type::DATE32, arrowkdb::Options::NM_DATE_32 } + , { arrow::Type::DATE64, arrowkdb::Options::NM_DATE_64 } + , { arrow::Type::TIMESTAMP, arrowkdb::Options::NM_TIMESTAMP } + , { arrow::Type::TIME32, 
arrowkdb::Options::NM_TIME_32 } + , { arrow::Type::TIME64, arrowkdb::Options::NM_TIME_64 } + , { arrow::Type::DECIMAL, arrowkdb::Options::NM_DECIMAL } + , { arrow::Type::DURATION, arrowkdb::Options::NM_DURATION } + , { arrow::Type::INTERVAL_MONTHS, arrowkdb::Options::NM_MONTH_INTERVAL } + , { arrow::Type::INTERVAL_DAY_TIME, arrowkdb::Options::NM_DAY_TIME_INTERVAL } } { + std::transform( + null_mapping_types.begin() + , null_mapping_types.end() + , std::inserter( supported_null_mapping_options, end( supported_null_mapping_options ) ) + , []( const auto& value ){ + return value.second; + } ); if (options != NULL && options->t != 101) { if (options->t != 99) throw InvalidOption("options not -99h"); @@ -410,8 +369,20 @@ class KdbOptions template inline void HandleNullMapping( const std::string& key, K value ); + arrow::Type::type GetNullMappingType( const std::string& option ) + { + auto it = std::find_if( null_mapping_types.begin(), null_mapping_types.end(), [&option]( const auto& value ){ + return option == value.second; + } ); + if( it != null_mapping_types.end() ){ + return it->first; + } + + return arrow::Type::NA; + } + void GetNullMappingOptions( Options::NullMapping& null_mapping ) const{ - null_mapping = null_mapping_options; + null_mapping = null_mapping_options; } bool GetStringOption(const std::string key, std::string& result) const From b7a4992fd51177871b97bedbc086455b42957acd Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 14 Feb 2023 16:04:23 +0000 Subject: [PATCH 076/276] Null bitmap test for Travis CI --- tests/.gitignore | 1 + tests/null_bitmap.t | 109 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 110 insertions(+) create mode 100644 tests/null_bitmap.t diff --git a/tests/.gitignore b/tests/.gitignore index c0832d1..9efe47e 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -1,4 +1,5 @@ basic.q +null_bitmap.q null_mapping_short.q null_mapping_long.q null_mapping_float.q diff --git a/tests/null_bitmap.t 
b/tests/null_bitmap.t new file mode 100644 index 0000000..3a15b1d --- /dev/null +++ b/tests/null_bitmap.t @@ -0,0 +1,109 @@ +// null_bitmap.t + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l q/arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping ||----------+\n"; +bitmap_opts:(`bool`int32`float64`string`date32)!(0b;1i;2.34;"start";2006.07.21); + +options:(``NULL_MAPPING)!((::);bitmap_opts); + +N:5 + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +bool_dt:.arrowkdb.dt.boolean[]; +i32_dt:.arrowkdb.dt.int32[]; +f64_dt:.arrowkdb.dt.float64[]; +str_dt:.arrowkdb.dt.utf8[]; +d32_dt:.arrowkdb.dt.date32[]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +bool_fd:.arrowkdb.fd.field[`bool;bool_dt]; +i32_fd:.arrowkdb.fd.field[`int32;i32_dt]; +f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; +str_fd:.arrowkdb.fd.field[`string;str_dt]; +d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; + +-1"\n+----------|| Create the schemas for the list of fields ||----------+\n"; +bitmap_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,i32_fd,f64_fd,str_fd,d32_fd)]; + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +bool_data:N?(0b;1b); +bool_data[0]:0b; +i32_data:N?100i; +i32_data[1]:1i; +f64_data:N?100f; +f64_data[2]:2.34f; +str_data:N?("start";"stop";"alert";"acknowledge";""); +str_data[3]:"start" +d32_data:N?(2006.07.21;2005.07.18;2004.07.16;2003.07.15;2002.07.11); +d32_data[4]:2006.07.21; + +-1"\n+----------|| Combine the data for all columns ||----------+\n"; +bitmap_data:(ts_data;bool_data;i32_data;f64_data;str_data;d32_data); + +-1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n"; 
+options[`PARQUET_VERSION]:`V2.0; + +parquet_bitmap:"null_bitmap.parquet"; +.arrowkdb.pq.writeParquet[parquet_bitmap;bitmap_schema;bitmap_data;options]; + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +options[`WITH_NULL_BITMAP]:1; +parquet_bitmap_data:.arrowkdb.pq.readParquetData[parquet_bitmap;options]; +bitmap_data~first parquet_bitmap_data + +nulls_data:1b,(N-1)?1b; +bitmap_nulls:{x rotate nulls_data} each neg til {x-1} count bitmap_data; +parquet_bitmap_nulls:last parquet_bitmap_data; +bitmap_nulls~bitmap_nulls & sublist[{1-x} count parquet_bitmap_nulls;parquet_bitmap_nulls] +rm parquet_bitmap; + +-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; +arrow_bitmap:"null_bitmap.arrow"; +.arrowkdb.ipc.writeArrow[arrow_bitmap;bitmap_schema;bitmap_data;options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +arrow_bitmap_schema:.arrowkdb.ipc.readArrowSchema[arrow_bitmap]; +.arrowkdb.sc.equalSchemas[bitmap_schema;arrow_bitmap_schema] +bitmap_schema~arrow_bitmap_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +arrow_bitmap_data:.arrowkdb.ipc.readArrowData[arrow_bitmap;options]; +bitmap_data~first arrow_bitmap_data +arrow_bitmap_nulls:last arrow_bitmap_data; +bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bitmap_nulls] +rm arrow_bitmap; + +-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; +serialized_bitmap:.arrowkdb.ipc.serializeArrow[bitmap_schema;bitmap_data;options]; + +-1"\n+----------|| Parse the schema back abd compare ||----------+\n"; +stream_bitmap_schema:.arrowkdb.ipc.parseArrowSchema[serialized_bitmap]; +.arrowkdb.sc.equalSchemas[bitmap_schema;stream_bitmap_schema] +bitmap_schema~stream_bitmap_schema + +-1"\n+----------|| Parse the array data back and compare ||----------+\n"; +stream_bitmap_data:.arrowkdb.ipc.parseArrowData[serialized_bitmap;options]; 
+bitmap_data~first stream_bitmap_data + +stream_bitmap_nulls:last stream_bitmap_data; +bitmap_nulls~bitmap_nulls & sublist[{1-x} count stream_bitmap_nulls;stream_bitmap_nulls] + + +-1 "\n+----------|| Test utils ||----------+\n"; + +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; From e4f2a0d504f251d4a6567590513f665865fdc871 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 15 Feb 2023 10:49:26 +0300 Subject: [PATCH 077/276] Continue warning clean up --- src/KdbOptions.cpp | 274 +++++++++++++++++++++++++++++++++++++++++++++ src/KdbOptions.h | 263 +++---------------------------------------- 2 files changed, 288 insertions(+), 249 deletions(-) create mode 100644 src/KdbOptions.cpp diff --git a/src/KdbOptions.cpp b/src/KdbOptions.cpp new file mode 100644 index 0000000..75c903c --- /dev/null +++ b/src/KdbOptions.cpp @@ -0,0 +1,274 @@ +#include "KdbOptions.h" + +namespace{ + +template +auto make_handler() +{ + return std::make_pair( TypeId, &kx::arrowkdb::KdbOptions::HandleNullMapping ); +} + +} // namespace + +namespace kx { + +namespace arrowkdb { + +const KdbOptions::NullMappingHandlers KdbOptions::null_mapping_handlers = { + make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() +}; + +KdbOptions::KdbOptions( + K options + , const std::set& supported_string_options_ + , const std::set& supported_int_options_ + , const std::set& supported_dict_options_ ) + : null_mapping_options {0} + , supported_string_options(supported_string_options_) + , 
supported_int_options(supported_int_options_) + , supported_dict_options( supported_dict_options_ ) + , null_mapping_types { + { arrow::Type::BOOL, arrowkdb::Options::NM_BOOLEAN } + , { arrow::Type::UINT8, arrowkdb::Options::NM_UINT_8 } + , { arrow::Type::INT8, arrowkdb::Options::NM_INT_8 } + , { arrow::Type::UINT16, arrowkdb::Options::NM_UINT_16 } + , { arrow::Type::INT16, arrowkdb::Options::NM_INT_16 } + , { arrow::Type::UINT32, arrowkdb::Options::NM_UINT_32 } + , { arrow::Type::INT32, arrowkdb::Options::NM_INT_32 } + , { arrow::Type::UINT64, arrowkdb::Options::NM_UINT_64 } + , { arrow::Type::INT64, arrowkdb::Options::NM_INT_64 } + , { arrow::Type::HALF_FLOAT, arrowkdb::Options::NM_FLOAT_16 } + , { arrow::Type::FLOAT, arrowkdb::Options::NM_FLOAT_32 } + , { arrow::Type::DOUBLE, arrowkdb::Options::NM_FLOAT_64 } + , { arrow::Type::STRING, arrowkdb::Options::NM_STRING } + , { arrow::Type::LARGE_STRING, arrowkdb::Options::NM_LARGE_STRING } + , { arrow::Type::BINARY, arrowkdb::Options::NM_BINARY } + , { arrow::Type::LARGE_BINARY, arrowkdb::Options::NM_LARGE_BINARY } + , { arrow::Type::FIXED_SIZE_BINARY, arrowkdb::Options::NM_FIXED_BINARY } + , { arrow::Type::DATE32, arrowkdb::Options::NM_DATE_32 } + , { arrow::Type::DATE64, arrowkdb::Options::NM_DATE_64 } + , { arrow::Type::TIMESTAMP, arrowkdb::Options::NM_TIMESTAMP } + , { arrow::Type::TIME32, arrowkdb::Options::NM_TIME_32 } + , { arrow::Type::TIME64, arrowkdb::Options::NM_TIME_64 } + , { arrow::Type::DECIMAL, arrowkdb::Options::NM_DECIMAL } + , { arrow::Type::DURATION, arrowkdb::Options::NM_DURATION } + , { arrow::Type::INTERVAL_MONTHS, arrowkdb::Options::NM_MONTH_INTERVAL } + , { arrow::Type::INTERVAL_DAY_TIME, arrowkdb::Options::NM_DAY_TIME_INTERVAL } } +{ + std::transform( + null_mapping_types.begin() + , null_mapping_types.end() + , std::inserter( supported_null_mapping_options, end( supported_null_mapping_options ) ) + , []( const auto& value ){ + return value.second; + } ); + if (options != NULL && options->t 
!= 101) { + if (options->t != 99) + throw InvalidOption("options not -99h"); + K keys = kK(options)[0]; + if (keys->t != KS) + throw InvalidOption("options keys not 11h"); + K values = kK(options)[1]; + switch (values->t) { + case KJ: + PopulateIntOptions(keys, values); + break; + case KS: + PopulateStringOptions(keys, values); + break; + case XD: + PopulateDictOptions(keys, values); + break; + case 0: + PopulateMixedOptions(keys, values); + break; + default: + throw InvalidOption("options values not 7|11|0h"); + } + } +} + +const std::string KdbOptions::ToUpper(std::string str) const +{ + std::string upper; + for (auto i : str) + upper.push_back((unsigned char)std::toupper(i)); + return upper; +} + +const std::string KdbOptions::ToLower( std::string str ) const +{ + std::transform( str.begin(), str.end(), str.begin(), ::tolower ); + + return str; +} + +void KdbOptions::PopulateIntOptions(K keys, K values) +{ + for (auto i = 0ll; i < values->n; ++i) { + const std::string key = ToUpper(kS(keys)[i]); + if (supported_int_options.find(key) == supported_int_options.end()) + throw InvalidOption(("Unsupported int option '" + key + "'").c_str()); + int_options[key] = kJ(values)[i]; + } +} + +void KdbOptions::PopulateStringOptions(K keys, K values) +{ + for (auto i = 0ll; i < values->n; ++i) { + const std::string key = ToUpper(kS(keys)[i]); + if (supported_string_options.find(key) == supported_string_options.end()) + throw InvalidOption(("Unsupported string option '" + key + "'").c_str()); + string_options[key] = ToUpper(kS(values)[i]); + } +} + +void KdbOptions::PopulateNullMappingOptions( long long index, K dict ) +{ + K keys = kK( kK( dict )[index] )[0]; + K values = kK( kK( dict )[index] )[1]; + if( KS != keys->t ){ + throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING keys (expected=11h), type=" + std::to_string( keys->t ) + "h" ); + } + if( 0 != values->t ){ + throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING values (extected=0h), type=" + 
std::to_string( values->t ) + "h" ); + } + for( auto i = 0ll; i < values->n; ++i ){ + const std::string key = ToLower( kS( keys )[i] ); + if( supported_null_mapping_options.find( key ) == supported_null_mapping_options.end() ){ + throw InvalidOption( "Unsupported NULL_MAPPING option '" + key + "'" ); + } + K value = kK( values )[i]; + auto option = GetNullMappingType( key ); + auto it = null_mapping_handlers.find( option ); + if( it != null_mapping_handlers.end() ){ + ( this->*it->second )( key, value ); + } + else if( 101 == value->t ){ + // Ignore generic null, which may be used here to ensure mixed list of options + } + else{ + throw InvalidOption( "Unhandled NULL_MAPPING option '" + key + "', type=" + std::to_string( keys->t ) + "h" ); + } + } +} + +void KdbOptions::PopulateDictOptions( K keys, K values ) +{ + for( auto i = 0ll; i < values->n; ++i ) { + const std::string key = ToUpper( kS( keys )[i] ); + if( supported_dict_options.find( key ) == supported_dict_options.end() ){ + throw InvalidOption(("Unsupported dict option '" + key + "'").c_str()); + } + if( Options::NULL_MAPPING == key ) + { + PopulateNullMappingOptions( i, values ); + } + } +} + +void KdbOptions::PopulateMixedOptions(K keys, K values) +{ + for (auto i = 0ll; i < values->n; ++i) { + const std::string key = ToUpper(kS(keys)[i]); + K value = kK(values)[i]; + switch (value->t) { + case -KJ: + if (supported_int_options.find(key) == supported_int_options.end()) + throw InvalidOption(("Unsupported int option '" + key + "'").c_str()); + int_options[key] = value->j; + break; + case -KS: + if (supported_string_options.find(key) == supported_string_options.end()) + throw InvalidOption(("Unsupported string option '" + key + "'").c_str()); + string_options[key] = ToUpper(value->s); + break; + case KC: + { + if (supported_string_options.find(key) == supported_string_options.end()) + throw InvalidOption(("Unsupported string option '" + key + "'").c_str()); + string_options[key] = 
ToUpper(std::string((char*)kG(value), value->n)); + break; + } + case XD: + { + if( supported_dict_options.find( key ) == supported_dict_options.end() ){ + throw InvalidOption(("Unsupported dict option '" + key + "'").c_str()); + } + if( Options::NULL_MAPPING == key ) + { + PopulateNullMappingOptions( i, values ); + } + break; + } + case 101: + // Ignore :: + break; + default: + throw InvalidOption(("option '" + key + "' value not -7|-11|10h").c_str()); + } + } +} + +arrow::Type::type KdbOptions::GetNullMappingType( const std::string& option ) +{ + auto it = std::find_if( null_mapping_types.begin(), null_mapping_types.end(), [&option]( const auto& value ){ + return option == value.second; + } ); + if( it != null_mapping_types.end() ){ + return it->first; + } + + return arrow::Type::NA; +} + +bool KdbOptions::GetStringOption(const std::string key, std::string& result) const +{ + const auto it = string_options.find(key); + if (it == string_options.end()) + return false; + else { + result = it->second; + return true; + } +} + +bool KdbOptions::GetIntOption(const std::string key, int64_t& result) const +{ + const auto it = int_options.find(key); + if (it == int_options.end()) + return false; + else { + result = it->second; + return true; + } +} + +} // namespace arrowkdb + +} // kx diff --git a/src/KdbOptions.h b/src/KdbOptions.h index c60e6e0..9bde4f8 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -166,127 +166,19 @@ class KdbOptions static const NullMappingHandlers null_mapping_handlers; private: - const std::string ToUpper(std::string str) const - { - std::string upper; - for (auto i : str) - upper.push_back((unsigned char)std::toupper(i)); - return upper; - } + const std::string ToUpper(std::string str) const; - const std::string ToLower( std::string str ) const - { - std::transform( str.begin(), str.end(), str.begin(), ::tolower ); + const std::string ToLower( std::string str ) const; - return str; - } + void PopulateIntOptions(K keys, K values); - void 
PopulateIntOptions(K keys, K values) - { - for (auto i = 0ll; i < values->n; ++i) { - const std::string key = ToUpper(kS(keys)[i]); - if (supported_int_options.find(key) == supported_int_options.end()) - throw InvalidOption(("Unsupported int option '" + key + "'").c_str()); - int_options[key] = kJ(values)[i]; - } - } + void PopulateStringOptions(K keys, K values); - void PopulateStringOptions(K keys, K values) - { - for (auto i = 0ll; i < values->n; ++i) { - const std::string key = ToUpper(kS(keys)[i]); - if (supported_string_options.find(key) == supported_string_options.end()) - throw InvalidOption(("Unsupported string option '" + key + "'").c_str()); - string_options[key] = ToUpper(kS(values)[i]); - } - } + void PopulateNullMappingOptions( long long index, K dict ); - void PopulateNullMappingOptions( long long index, K dict ) - { - K keys = kK( kK( dict )[index] )[0]; - K values = kK( kK( dict )[index] )[1]; - if( KS != keys->t ){ - throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING keys (expected=11h), type=" + std::to_string( keys->t ) + "h" ); - } - if( 0 != values->t ){ - throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING values (extected=0h), type=" + std::to_string( values->t ) + "h" ); - } - for( auto i = 0ll; i < values->n; ++i ){ - const std::string key = ToLower( kS( keys )[i] ); - if( supported_null_mapping_options.find( key ) == supported_null_mapping_options.end() ){ - throw InvalidOption( "Unsupported NULL_MAPPING option '" + key + "'" ); - } - K value = kK( values )[i]; - auto option = GetNullMappingType( key ); - auto it = null_mapping_handlers.find( option ); - if( it != null_mapping_handlers.end() ){ - ( this->*it->second )( key, value ); - } - else if( 101 == value->t ){ - // Ignore generic null, which may be used here to ensure mixed list of options - } - else{ - throw InvalidOption( "Unhandled NULL_MAPPING option '" + key + "', type=" + std::to_string( keys->t ) + "h" ); - } - } - } - - void PopulateDictOptions( K 
keys, K values ) - { - for( auto i = 0ll; i < values->n; ++i ) { - const std::string key = ToUpper( kS( keys )[i] ); - if( supported_dict_options.find( key ) == supported_dict_options.end() ){ - throw InvalidOption(("Unsupported dict option '" + key + "'").c_str()); - } - if( Options::NULL_MAPPING == key ) - { - PopulateNullMappingOptions( i, values ); - } - } - } - - void PopulateMixedOptions(K keys, K values) - { - for (auto i = 0ll; i < values->n; ++i) { - const std::string key = ToUpper(kS(keys)[i]); - K value = kK(values)[i]; - switch (value->t) { - case -KJ: - if (supported_int_options.find(key) == supported_int_options.end()) - throw InvalidOption(("Unsupported int option '" + key + "'").c_str()); - int_options[key] = value->j; - break; - case -KS: - if (supported_string_options.find(key) == supported_string_options.end()) - throw InvalidOption(("Unsupported string option '" + key + "'").c_str()); - string_options[key] = ToUpper(value->s); - break; - case KC: - { - if (supported_string_options.find(key) == supported_string_options.end()) - throw InvalidOption(("Unsupported string option '" + key + "'").c_str()); - string_options[key] = ToUpper(std::string((char*)kG(value), value->n)); - break; - } - case XD: - { - if( supported_dict_options.find( key ) == supported_dict_options.end() ){ - throw InvalidOption(("Unsupported dict option '" + key + "'").c_str()); - } - if( Options::NULL_MAPPING == key ) - { - PopulateNullMappingOptions( i, values ); - } - break; - } - case 101: - // Ignore :: - break; - default: - throw InvalidOption(("option '" + key + "' value not -7|-11|10h").c_str()); - } - } - } + void PopulateDictOptions( K keys, K values ); + + void PopulateMixedOptions(K keys, K values); public: class InvalidOption : public std::invalid_argument @@ -298,114 +190,22 @@ class KdbOptions KdbOptions( K options - , const std::set supported_string_options_ - , const std::set supported_int_options_ - , const std::set& supported_dict_options_ = 
Options::dict_options ) - : null_mapping_options {0} - , supported_string_options(supported_string_options_) - , supported_int_options(supported_int_options_) - , supported_dict_options( supported_dict_options_ ) - , null_mapping_types { - { arrow::Type::BOOL, arrowkdb::Options::NM_BOOLEAN } - , { arrow::Type::UINT8, arrowkdb::Options::NM_UINT_8 } - , { arrow::Type::INT8, arrowkdb::Options::NM_INT_8 } - , { arrow::Type::UINT16, arrowkdb::Options::NM_UINT_16 } - , { arrow::Type::INT16, arrowkdb::Options::NM_INT_16 } - , { arrow::Type::UINT32, arrowkdb::Options::NM_UINT_32 } - , { arrow::Type::INT32, arrowkdb::Options::NM_INT_32 } - , { arrow::Type::UINT64, arrowkdb::Options::NM_UINT_64 } - , { arrow::Type::INT64, arrowkdb::Options::NM_INT_64 } - , { arrow::Type::HALF_FLOAT, arrowkdb::Options::NM_FLOAT_16 } - , { arrow::Type::FLOAT, arrowkdb::Options::NM_FLOAT_32 } - , { arrow::Type::DOUBLE, arrowkdb::Options::NM_FLOAT_64 } - , { arrow::Type::STRING, arrowkdb::Options::NM_STRING } - , { arrow::Type::LARGE_STRING, arrowkdb::Options::NM_LARGE_STRING } - , { arrow::Type::BINARY, arrowkdb::Options::NM_BINARY } - , { arrow::Type::LARGE_BINARY, arrowkdb::Options::NM_LARGE_BINARY } - , { arrow::Type::FIXED_SIZE_BINARY, arrowkdb::Options::NM_FIXED_BINARY } - , { arrow::Type::DATE32, arrowkdb::Options::NM_DATE_32 } - , { arrow::Type::DATE64, arrowkdb::Options::NM_DATE_64 } - , { arrow::Type::TIMESTAMP, arrowkdb::Options::NM_TIMESTAMP } - , { arrow::Type::TIME32, arrowkdb::Options::NM_TIME_32 } - , { arrow::Type::TIME64, arrowkdb::Options::NM_TIME_64 } - , { arrow::Type::DECIMAL, arrowkdb::Options::NM_DECIMAL } - , { arrow::Type::DURATION, arrowkdb::Options::NM_DURATION } - , { arrow::Type::INTERVAL_MONTHS, arrowkdb::Options::NM_MONTH_INTERVAL } - , { arrow::Type::INTERVAL_DAY_TIME, arrowkdb::Options::NM_DAY_TIME_INTERVAL } } - { - std::transform( - null_mapping_types.begin() - , null_mapping_types.end() - , std::inserter( supported_null_mapping_options, end( 
supported_null_mapping_options ) ) - , []( const auto& value ){ - return value.second; - } ); - if (options != NULL && options->t != 101) { - if (options->t != 99) - throw InvalidOption("options not -99h"); - K keys = kK(options)[0]; - if (keys->t != KS) - throw InvalidOption("options keys not 11h"); - K values = kK(options)[1]; - switch (values->t) { - case KJ: - PopulateIntOptions(keys, values); - break; - case KS: - PopulateStringOptions(keys, values); - break; - case XD: - PopulateDictOptions(keys, values); - break; - case 0: - PopulateMixedOptions(keys, values); - break; - default: - throw InvalidOption("options values not 7|11|0h"); - } - } - } + , const std::set& supported_string_options_ + , const std::set& supported_int_options_ + , const std::set& supported_dict_options_ = Options::dict_options ); template inline void HandleNullMapping( const std::string& key, K value ); - arrow::Type::type GetNullMappingType( const std::string& option ) - { - auto it = std::find_if( null_mapping_types.begin(), null_mapping_types.end(), [&option]( const auto& value ){ - return option == value.second; - } ); - if( it != null_mapping_types.end() ){ - return it->first; - } - - return arrow::Type::NA; - } + arrow::Type::type GetNullMappingType( const std::string& option ); void GetNullMappingOptions( Options::NullMapping& null_mapping ) const{ null_mapping = null_mapping_options; } - bool GetStringOption(const std::string key, std::string& result) const - { - const auto it = string_options.find(key); - if (it == string_options.end()) - return false; - else { - result = it->second; - return true; - } - } + bool GetStringOption(const std::string key, std::string& result) const; - bool GetIntOption(const std::string key, int64_t& result) const - { - const auto it = int_options.find(key); - if (it == int_options.end()) - return false; - else { - result = it->second; - return true; - } - } + bool GetIntOption(const std::string key, int64_t& result) const; }; inline void 
null_mapping_error( const std::string& key, K value ) @@ -737,41 +537,6 @@ inline void KdbOptions::HandleNullMapping( const } } -template -auto make_handler() -{ - return std::make_pair( TypeId, &KdbOptions::HandleNullMapping ); -} - -inline const KdbOptions::NullMappingHandlers KdbOptions::null_mapping_handlers = { - make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() -}; - } // namespace arrowkdb } // namespace kx From 74cce544b23debd8293876f00a14a076de06dcd4 Mon Sep 17 00:00:00 2001 From: Stephen Taylor Date: Wed, 15 Feb 2023 18:50:40 +0000 Subject: [PATCH 078/276] Move/merge documentation to `docs` folder in source repo --- README.md | 24 +-- apache_arrow.png | Bin 0 -> 3153 bytes docs/arrow-types.md | 10 - docs/examples.md | 15 +- docs/index.md | 26 +-- docs/reference.md | 470 ++++++++++++++++++++++---------------------- 6 files changed, 252 insertions(+), 293 deletions(-) create mode 100644 apache_arrow.png diff --git a/README.md b/README.md index bd271ef..47aca47 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # arrowkdb +![Arrow](apache_arrow.png) + [![GitHub release (latest by date)](https://img.shields.io/github/v/release/kxsystems/arrowkdb?include_prereleases)](https://github.com/kxsystems/arrowkdb/releases) [![Travis (.com) branch](https://travis-ci.com/KxSystems/arrowkdb.svg?branch=main)](https://travis-ci.com/KxSystems/arrowkdb) @@ -17,7 +19,7 @@ This is part of the [*Fusion for kdb+*](http://code.kx.com/q/interfaces/fusion/) ## New to kdb+ ? 
-Kdb+ is the world's fastest time-series database, optimized for ingesting, analyzing and storing massive amounts of structured data. To get started with kdb+, please visit https://code.kx.com/q/learn/ for downloads and developer information. For general information, visit https://kx.com/ +Kdb+ is the world's fastest time-series database, optimized for ingesting, analyzing and storing massive amounts of structured data. To get started with kdb+, please visit https://code.kx.com/q/ for downloads and developer information. For general information, visit https://kx.com/ @@ -48,7 +50,7 @@ Conversely, Arrow is an in-memory format meant for direct and efficient use for -### Third-Party Library Installation +### Third-party library installation #### Linux @@ -61,7 +63,7 @@ sudo apt install -y -V libarrow-dev=9.0.0-1 sudo apt install -y -V libparquet-dev=9.0.0-1 ``` -#### MacOS +#### macOS Follow the instructions [here](https://arrow.apache.org/install/#c-and-glib-c-packages-on-homebrew) to install `apache-arrow` using Homebrew. @@ -76,7 +78,7 @@ C:\Git> git clone https://github.com/apache/arrow.git C:\Git> cd arrow ``` -Switch to the 9.0.0 tag: +Switch to the `9.0.0` tag: ```bash C:\Git\arrow> git checkout refs/tags/apache-arrow-9.0.0 -- @@ -118,12 +120,12 @@ C:\Git\arrow\cpp\build> MKLINK %QHOME%\w64\parquet.dll %BUILD_HOME%\bin\parquet. It is recommended that a user install this interface through a release. This is completed in a number of steps: -1. Ensure you have downloaded/installed the Arrow C++ API following the instructions [here](https://github.com/KxSystems/arrowkdb#third-party-library-installation). -2. Download a release from [here](https://github.com/KxSystems/arrowkdb/releases) for your system architecture. +1. Ensure you have downloaded/installed the Arrow C++ API following the [instructions](#third-party-library-installation). +2. [Download a release](releases) for your system architecture. 3. 
Install script `arrowkdb.q` to `$QHOME`, and binary file `lib/arrowkdb.(so|dll)` to `$QHOME/[mlw](64)`, by executing the following from the Release directory: ```bash -## Linux/MacOS +## Linux/macOS chmod +x install.sh && ./install.sh ## Windows @@ -139,7 +141,7 @@ In order to successfully build and install this interface from source, the follo 1. `ARROW_INSTALL` = Location of the Arrow C++ API release (only required if Arrow is not installed globally on the system, e.g. on Windows where Arrow was built from source) 2. `QHOME` = Q installation directory (directory containing `q.k`) -From a shell prompt (on Linux/MacOS) or Visual Studio command prompt (on Windows), clone the `arrowkdb` source from github: +From a shell prompt (on Linux/macOS) or Visual Studio command prompt (on Windows), clone the `arrowkdb` source from github: ```bash git clone https://github.com/KxSystems/arrowkdb.git @@ -175,7 +177,7 @@ cmake --build . --config Release --target install ## Documentation -Documentation outlining the functionality available for this interface can be found [here](https://code.kx.com/q/interfaces/arrow/). +Documentation outlining the functionality available for this interface can be found in the [`docs`](docs/index.md) folder. @@ -183,6 +185,6 @@ Documentation outlining the functionality available for this interface can be fo The arrowkdb interface is provided here under an Apache 2.0 license. -If you find issues with the interface or have feature requests, please consider raising an issue [here](https://github.com/KxSystems/arrowkdb/issues). +If you find issues with the interface or have feature requests, please consider [raising an issue](https://github.com/KxSystems/arrowkdb/issues). -If you wish to contribute to this project, please follow the contributing guide [here](https://github.com/KxSystems/arrowkdb/blob/main/CONTRIBUTING.md). +If you wish to contribute to this project, please follow the [contribution guide](CONTRIBUTING.md). 
diff --git a/apache_arrow.png b/apache_arrow.png new file mode 100644 index 0000000000000000000000000000000000000000..4168ed13d5cb883147a5885d702d1cd889b43a8c GIT binary patch literal 3153 zcmV-X46gHuP)6 z8r|L9qN1XOg@sE?OTxm!At52k%E~G#D)aO6U0q##e0&lT5@={>baZr$jg2ueF}S$6 ztE;P#k&$?Kco!EJB_$;e4h{mr?*;$>3yw)dK~#90?Va&o@-7g^M`c=OT55jF%vsh` z|NlSu-GGBTIM`l2sawx|eyGMyg!kA12U|rU5{X12kw_#Gi9|=#!dO8>!dOg?cM#=uOl$^d{;Wx=?Q;1HFN|f!;vfKo{yQWTEG&Tj+V}7P?Sx z7C_HZ2k2Sq09~jz%0bUjchGax9dx1IBo949-9yh%_t1rUgAnvIbp$<49YGiBd12^D z>KJ;GI)*OPvm($_)Cu$ybplJ++A&nSS-QxBl?)C1^3J*^NrOFe|n zQV*dYmioZ{o(?L_*I{y{)!tTRIx@c}^~vGD=zLwAP7!pDdIX)L9zj1O^+&ZLMf+nG zw^j<)WX4K8+qzs{n52!WeoWQGS{$X)%6BgBvb(M?`t7}m=@&I~^ zm9j&NgC3QOgC0@mpdX6*SQW1ZD?q>gs1H7iuW@~}R%V%eSQ~W@`YnelEZ8M$6i{cO zABg(hCHNJ9{#lpib739$R3YQ;L1!CrNS)^yQolGF^=Z|R*Gl)Dmi?Q$s)i?(%bDsn zPN84SN0sK~r#>^Tk525^`BofIKkpXx%h&41&9eS;p+3s8ywLsTu5Qv#b)hJ9D9Nin zV|3zOIz6PW_J#KAl@?#S<#IRHy{fAH`N9kgk#+anOVLB>quQ4gLp>&6_$^M4df0fS zMty+ZJL(&oZCMcLrA^)x;P8v-SZg!U zKZS()DJ0ZSA)%hvdOAJ8igxd)j~*T#ZfDn5%&F^(ySuxao9g`S_491IEk8Dw;lqS? 
zTH46jBmxDK_d8RCuk9O7f2d3FJev{Yt0$c=&EWc0TYf}Y_l>#@b1j%tuk81~6C1uF zXK`kO8)AVT@_e#MWR5#FX(VjFu!hSb(!r@;Gkm@;o9PpCivN0Am4t~3?}#$8Siz~C zO(0ilX3|Jocn&{_)T_4s2AR&|b1CS04a6#CLJ_X-84GCakM@`}Ca$XRK{f)>C+cot zoq4D`30*HS86{0{BVF?r%ODT*)sIZt$R;J8Ebotpx$=3&C)3dVeUUa%BbU)r!ops= zhv);v5=@XQ0hgZIYu=l-zVKPbFFD9ScU?f*#M>-U;(w%fHe)7O`9}AKb4MGW23w4h zLFK+s|MWR-qm%H>^NG>?&F}Jo7N{Z{zx~QyMt?WG>Q7B~E}_&0;5YoBaq2A44H&0{ z>W7D!;~b;27x9Tz`1No+WNhqZbc{=|n7Hp;lT2I|=mvC-49fbPsZTH3I==By?-+b9 z9xaRmKn!$0A~J_B_6Ri-N8OBrW+7m-Z_bT^;6iXS3a36$C;2^uXF2H3+fHZ>#O6Pn zsHZj#FrM_n)iyW7o?HT2E;lKKQF{K3Mcwx>k2Jq96u?3UT(&!7p>MDacpK72z*oHJ zBe*W$)SZLEIm2;7ojOl_YqEK;tv8&!v(VpZEm-JKaoumi*o5Xz5e7ivqG{PU4P2(O zsZt&HFhiTm*tXs<6!7hq4mRibfKJ|FULX)PH_#pii1!9 z3;mVave3asGlZxMt@I3=xUSbYbuy-c=5O!H$Okg&G<6&IyK(4(@Er#|oS$>hsSVJg z9bidPAx>cW)d1Q6e-Rg$i}bEob-}~H3}o+J0RM#us2hsFP=|r;Y)roh(COxCa47bM zfqO$A*swd?3_6w!t)B_&GU{$yhaPV2pIV@yfDrm{?#K1vj1<;`hNIAihYnUY-mfn-vWOWMVHDV3Rfh8)&f98I!)&d!*W_gM)W} z)Rzk`UE7Ui3F@>}C8*P78W)eF`kse@h{qT+AQlJ+=B|je;-PvPR!~76k^wy6uoF07^smLD-~;a|G?6Lv!+3?p-wC+V>MJ zUMgDP`UIkpjv=?Sg66=q{DE4Gq8Ql^-;yKpg-Pg>D3@3+0Go#nmj*hRiR)jc-jVyi zTJ9BY$NUd#6huRwS)*?|aP?CCV2;-|qmr1+2 zb(OiR#*mg1Pu6+p^!xZS;Y&AT(meFXAfqkx@9(Gt8#tQ_>w0z7x42HqRl^2g?~x5a z-|cqGr5g#-z6es6n@U^Gh95FQk8&QF+26X`xC|1rVO`MQFh6KJk3$(Si$e>QbP>aC zq9FOu#4Tf!pOL!A1H>e@I0tY#Gd2%io%Jnl*5~O!V?rLALU&7Oq`6Vb6UGK8=3*}e zS)oJ42&&tnVx9FZsES|k2dwX)iUrGcTr&=XvWyKMWrV!F)1ht8T9Y8JDw`|zgUo-r zou?*j6ixu$HDf$jR*g1o;_0<0Uz8=AW>=L^H5if+JC-M-S#a$)1c`945)Gp-Brwu=3^kS4c?pnyv5;Z zx+u%0`&UQh!@3>!ExyYN(qYpf3@Mmxbq`?)n5-h3q&Evz3;Zrfl<4o1Y*@GBzQwob zGCTpdvJC|=(A`=Ke3rp92yGBJBKnjO+QCTTm=07ftlM?p;@L(W6V%#Pnt|?Jtabbh z^x`J4Nl=fr3A;X+6wLL*nHqK7w;%%1nJGQPL3dL`i;TrjL2mVj_lY!QjTp zECv4{^8>YWH#Cc%40P?-gaSC|KFDrDRV=4FYNPe$f;(C>Zg#n4@!C)9nGYCs-?w;d z%;TQazreCBF>$kXv(Cptceev^&MQ}=&mAA5UA@*1kG=*WmThzask$3K?}o0XlgZ?V zU+LHb-}39!2jBABB+QR5@h!)fwDaRjm>*x_TaGVj=f{^YKfc7b9ADDTk1t_MIL zne)(vIVWa;Ct}V)kC?O2g*mGjdd!@G9y8~l3v*u4kb_|!LuZ&X(1m$IA#{#;1f63Z 
zLl@>L1rLt1%tPob^9Z^yPl`h4nFr8$<{@-po)&?gU{0YYmc}Ck7%n9@qa|&IU z=d^9X979hsC(sY@ev&>;NZ1j73q8-=LKo&O^wEO3f!@H}JaO|P%v)&Fg1Ls?#9TxFBh1@q z(}H;oy^(qS{MCIY{)ce)wQ0e;g5J!$f-cNks!Tuh<#pNS#hKIAuQdJV)#g=RMe`eZ rDy?NsBoc{4B9TZW5{X2jU!ea1xVn;`Jh1e^00000NkvXXu0mjfQpz;z literal 0 HcmV?d00001 diff --git a/docs/arrow-types.md b/docs/arrow-types.md index 2e10258..368f28a 100644 --- a/docs/arrow-types.md +++ b/docs/arrow-types.md @@ -1,18 +1,8 @@ ---- -title: Type mapping between Arrow and kdb+ -description: The data layout of an Arrow table is defined by its schema. -author: Neal McDonnell -date: February 2021 ---- # Type mapping between Arrow and kdb+ The data layout of an Arrow table is defined by its schema. The schema is composed from a list of fields, one for each column in the table. The field describes the name of the column and its datatype. This page examines each of these and details how they are mapped in kdb+. -:fontawesome-brands-github: -[KxSystems/arrowkdb](https://github.com/KxSystems/arrowkdb) - - ## Arrow datatypes Currently Arrow supports over 35 datatypes including concrete, parameterized and nested datatypes. diff --git a/docs/examples.md b/docs/examples.md index 6d93c7c..eff81d2 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -1,9 +1,3 @@ ---- -title: Example usage of interface | Arrow/Parquet interface -description: Examples of how to read and write Parquet files, Arrow files and Arrow streams from a kdb+ session -author: Neal McDonnell -date: February 2021 ---- # Example usage of interface _Examples of how to read and write Parquet files, Arrow files and Arrow streams from a kdb+ session_ @@ -11,9 +5,6 @@ _Examples of how to read and write Parquet files, Arrow files and Arrow streams The repository has examples with more functionality. 
-:fontawesome-brands-github: -[KxSystems/arrowkdb/examples](https://github.com/KxSystems/arrowkdb/tree/master/examples) - ## Inferred schemas @@ -542,6 +533,6 @@ multi_comments: It is left as an exercise to write the schema and array data to Parquet or Arrow files. -??? tip "Remember to use Parquet v2.0" - - Otherwise the `timestamp(ns)` datatype will be converted to `timestamp(us)` resulting in a loss of precision. \ No newline at end of file +> :warning: **Remember to use Parquet v2.0** +> +> Otherwise the `timestamp(ns)` datatype will be converted to `timestamp(us)` resulting in a loss of precision. diff --git a/docs/index.md b/docs/index.md index 2b087d3..f8cf3e5 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,16 +1,6 @@ ---- -title: Using Apache Arrow/Parquet data with kdb+ -description: Apache Arrow is a software-development platform for building high-performance applications that process and transport large datasets -author: Neal McDonnell -date: February 2021 ---- -![Arrow](../img/apache_arrow.png) +![Arrow](../apache_arrow.png) # Using Apache Arrow/Parquet data with kdb+ -:fontawesome-brands-github: -[KxSystems/arrowkdb](https://github.com/KxSystems/arrowkdb) - - [Apache Arrow](https://arrow.apache.org/) is a software-development platform for building high-performance applications that process and transport large datasets. It is designed to both improve the performance of analytical algorithms and the efficiency of moving data from one system (or programming language to another). @@ -57,22 +47,18 @@ Users can read and write Arrow tables created from kdb+ data using: Separate APIs are provided where the Arrow table is either created from a kdb+ table using an inferred schema or from an Arrow schema and the table’s list of array data. -:fontawesome-regular-hand-point-right: +:point_right: [API reference](reference.md)
-:fontawesome-regular-hand-point-right: +:point_right: [Example implementations](examples.md)
-:fontawesome-brands-github: -[Install guide](https://github.com/KxSystems/arrowkdb#installation) +[Install guide](../README.md#installation) ## Project The `arrowkdb` interface is published under an Apache 2.0 license. -:fontawesome-brands-github: -[Raise an issue](https://github.com/KxSystems/arrowkdb/issues) -
-:fontawesome-brands-github: -[Contribute](https://github.com/KxSystems/arrowkdb/blob/master/CONTRIBUTING.md) +- [Raise an issue](https://github.com/KxSystems/arrowkdb/issues) +- [Contribute](../CONTRIBUTING.md) diff --git a/docs/reference.md b/docs/reference.md index d33c10f..d90bf9b 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -1,17 +1,7 @@ ---- -title: 'Function reference | Arrow/Parquet interface' -description: 'These functions are exposed within the .arrowkdb namespace, allowing users to convert data between the Arrow/Parquet and kdb+' -author: Neal McDonnell -date: February 2021 ---- # Function reference These functions are exposed within the `.arrowkdb` namespace, allowing users to convert data between the Arrow/Parquet and kdb+. -:fontawesome-brands-github: -[KxSystems/arrowkdb](https://github.com/KxSystems/arrowkdb) - -
.arrowkdb **Arrow/Parquet interface** @@ -194,11 +184,11 @@ These functions are exposed within the `.arrowkdb` namespace, allowing users to ## Datatype constructors -### **`dt.na`** +### `dt.na` *Create a NULL datatype* -```syntax +```txt .arrowkdb.dt.na[] ``` @@ -211,11 +201,11 @@ q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.na[];(();();());::] 3 nulls ``` -### **`dt.boolean`** +### `dt.boolean` *Create a boolean datatype* -```syntax +```txt .arrowkdb.dt.boolean[] ``` @@ -232,17 +222,17 @@ q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.boolean[];(010b);::] ] ``` -### **`dt.int8`** +### `dt.int8` *Create an int8 datatype* -```syntax +```txt .arrowkdb.dt.int8[] ``` -??? note "kdb+ type 10h can be written to an `int8` array" - - The is supported on the writing path only. Reading from an int8 array returns a 4h list +> **kdb+ type 10h can be written to an `int8` array** +> +> This is supported on the writing path only. Reading from an int8 array returns a 4h list Returns the datatype identifier @@ -257,11 +247,11 @@ q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.int8[];(0x102030);::] ] ``` -### **`dt.int16`** +### `dt.int16` *Create an int16 datatype* -```syntax +```txt .arrowkdb.dt.int16[] ``` @@ -278,11 +268,11 @@ q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.int16[];(11 22 33h);::] ] ``` -### **`dt.int32`** +### `dt.int32` *Create an int32 datatype* -```syntax +```txt .arrowkdb.dt.int32[] ``` @@ -299,11 +289,11 @@ q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.int32[];(11 22 33i);::] ] ``` -### **`dt.int64`** +### `dt.int64` *Create an int64 datatype* -```syntax +```txt .arrowkdb.dt.int64[] ``` @@ -320,11 +310,11 @@ q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.int64[];(11 22 33j);::] ] ``` -### **`dt.uint8`** +### `dt.uint8` *Create an uint8 datatype* -```syntax +```txt .arrowkdb.dt.uint8[] ``` @@ -341,11 +331,11 @@ q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.uint8[];(0x102030);::] ] ``` -### **`dt.uint16`** +### `dt.uint16` *Create an uint16 datatype* -```syntax +```txt
.arrowkdb.dt.uint16[] ``` @@ -362,19 +352,19 @@ q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.uint16[];(11 22 33h);::] ] ``` -### **`dt.uint32`** +### `dt.uint32` *Create an uint32 datatype* -```syntax +```txt .arrowkdb.dt.uint32[] ``` Returns the datatype identifier -??? warning "`uint32` datatype is supported by Parquet v2.0 only, being changed to `int64` otherwise" - - The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) +> :warning: **`uint32` datatype is supported by Parquet v2.0 only, being changed to `int64` otherwise** +> +> The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) ```q q).arrowkdb.dt.printDatatype[.arrowkdb.dt.uint32[]] @@ -387,11 +377,11 @@ q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.uint32[];(11 22 33i);::] ] ``` -### **`dt.uint64`** +### `dt.uint64` *Create an uint64 datatype* -```syntax +```txt .arrowkdb.dt.uint64[] ``` @@ -408,19 +398,19 @@ q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.uint64[];(11 22 33j);::] ] ``` -### **`dt.float16`** +### `dt.float16` *Create a float16 (represented as uint16_t) datatype* -```syntax +```txt .arrowkdb.dt.float16[] ``` Returns the datatype identifier -???
warning "`float16` datatype is not supported by Parquet" - - The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) +> :warning: **`float16` datatype is not supported by Parquet** +> +> The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) ```q q).arrowkdb.dt.printDatatype[.arrowkdb.dt.float16[]] @@ -433,11 +423,11 @@ q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.float16[];(11 22 33h);::] ] ``` -### **`dt.float32`** +### `dt.float32` *Create a float32 datatype* -```syntax +```txt .arrowkdb.dt.float32[] ``` @@ -454,11 +444,11 @@ q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.float32[];(1.1 2.2 3.3e);::] ] ``` -### **`dt.float64`** +### `dt.float64` *Create a float64 datatype* -```syntax +```txt .arrowkdb.dt.float64[] ``` @@ -475,11 +465,11 @@ q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.float64[];(1.1 2.2 3.3f);::] ] ``` -### **`dt.time32`** +### `dt.time32` *Create a 32-bit time (units since midnight with specified granularity) datatype* -```syntax +```txt .arrowkdb.dt.time32[time_unit] ``` @@ -500,11 +490,11 @@ q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.time32[`MILLI];(01:00:00.100 02:00: ] ``` -### **`dt.time64`** +### `dt.time64` *Create a 64-bit time (units since midnight with specified granularity) datatype* -```syntax +```txt .arrowkdb.dt.time64[time_unit] ``` @@ -525,11 +515,11 @@ q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.time64[`NANO];(0D01:00:00.100000001 ] ``` -### **`dt.timestamp`** +### `dt.timestamp` *Create a 64-bit timestamp (units since UNIX epoch with specified granularity) datatype* -```syntax +```txt .arrowkdb.dt.timestamp[time_unit] ``` @@ -537,9 +527,9 @@ 
Where `time_unit` is the time unit string: SECOND, MILLI, MICRO or NANO returns the datatype identifier -??? warning "`timestamp(nano)` datatype is supported by Parquet v2.0 only, being mapped to `timestamp(milli)` otherwise" - - The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) +> :warning: **`timestamp(nano)` datatype is supported by Parquet v2.0 only, being mapped to `timestamp(milli)` otherwise** +> +> The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) ```q q).arrowkdb.dt.printDatatype[.arrowkdb.dt.timestamp[`NANO]] @@ -554,11 +544,11 @@ q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.timestamp[`NANO];(2001.01.01D00:00: ] ``` -### **`dt.date32`** +### `dt.date32` *Create a 32-bit date (days since UNIX epoch) datatype* -```syntax +```txt .arrowkdb.dt.date32[] ``` @@ -575,19 +565,19 @@ q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.date32[];(2001.01.01 2002.02.02 200 ] ``` -### **`dt.date64`** +### `dt.date64` *Create a 64-bit date (milliseconds since UNIX epoch) datatype* -```syntax +```txt .arrowkdb.dt.date64[] ``` Returns the datatype identifier -??? 
warning "`date64` datatype is changed to `date32(days)` by Parquet" - - The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) +> :warning: **`date64` datatype is changed to `date32(days)` by Parquet** +> +> The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) ```q q).arrowkdb.dt.printDatatype[.arrowkdb.dt.date64[]] @@ -600,19 +590,19 @@ q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.date64[];(2001.01.01D00:00:00.00000 ] ``` -### **`dt.month_interval`** +### `dt.month_interval` *Create a 32-bit interval (described as a number of months, similar to YEAR_MONTH in SQL) datatype* -```syntax +```txt .arrowkdb.dt.month_interval[] ``` Returns the datatype identifier -??? 
warning "`month_interval` datatype is not supported by Parquet" - - The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) +> :warning: **`month_interval` datatype is not supported by Parquet** +> +> The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) ```q q).arrowkdb.dt.printDatatype[.arrowkdb.dt.month_interval[]] @@ -625,19 +615,19 @@ q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.month_interval[];(2001.01m,2002.02m ] ``` -### **`dt.day_time_interval`** +### `dt.day_time_interval` *Create a 64-bit interval (described as a number of days and milliseconds, similar to DAY_TIME in SQL) datatype* -```syntax +```txt .arrowkdb.dt.day_time_interval[] ``` Returns the datatype identifier -??? 
warning "`day_time_interval` datatype is not supported by Parquet" - - The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) +> :warning: **`day_time_interval` datatype is not supported by Parquet** +> +> The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) ```q q).arrowkdb.dt.printDatatype[.arrowkdb.dt.day_time_interval[]] @@ -650,11 +640,11 @@ q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.day_time_interval[];(0D01:00:00.100 ] ``` -### **`dt.duration`** +### `dt.duration` *Create a 64-bit duration (measured in units of specified granularity) datatype* -```syntax +```txt .arrowkdb.dt.duration[time_unit] ``` @@ -662,9 +652,9 @@ Where `time_unit` is the time unit string: SECOND, MILLI, MICRO or NANO returns the datatype identifier -??? 
warning "`duration` datatype is not supported by Parquet" - - The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) +> :warning: **`duration` datatype is not supported by Parquet** +> +> The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) ```q q).arrowkdb.dt.printDatatype[.arrowkdb.dt.duration[`NANO]] @@ -679,11 +669,11 @@ q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.duration[`NANO];(0D01:00:00.1000000 ] ``` -### **`dt.binary`** +### `dt.binary` *Create a variable length bytes datatype* -```syntax +```txt .arrowkdb.dt.binary[] ``` @@ -700,19 +690,19 @@ q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.binary[];(enlist 0x11;0x2222;0x3333 ] ``` -### **`dt.utf8`** +### `dt.utf8` *Create a UTF8 variable length string datatype* -```syntax +```txt .arrowkdb.dt.utf8[] ``` Returns the datatype identifier -??? note "kdb+ type 11h can be written to an `utf8` array" - - The is supported on the writing path only. Reading from an utf8 array returns a mixed list of 10h +> :warning: **kdb+ type 11h can be written to an `utf8` array** +> +> This is supported on the writing path only. Reading from an utf8 array returns a mixed list of 10h ```q q).arrowkdb.dt.printDatatype[.arrowkdb.dt.utf8[]] @@ -725,19 +715,19 @@ q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.utf8[];(enlist "a";"bb";"ccc");::] ] ``` -### **`dt.large_binary`** +### `dt.large_binary` *Create a large (64-bit offsets) variable length bytes datatype* -```syntax +```txt .arrowkdb.dt.large_binary[] ``` Returns the datatype identifier -???
warning "`large_binary` datatype is not supported by Parquet" - - The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) +> :warning: **`large_binary` datatype is not supported by Parquet** +> +> The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) ```q q).arrowkdb.dt.printDatatype[.arrowkdb.dt.large_binary[]] @@ -750,19 +740,19 @@ q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.large_binary[];(enlist 0x11;0x2222; ] ``` -### **`dt.large_utf8`** +### `dt.large_utf8` *Create a large (64-bit offsets) UTF8 variable length string datatype* -```syntax +```txt .arrowkdb.dt.large_utf8[] ``` Returns the datatype identifier -??? 
warning "`large_utf8` datatype is not supported by Parquet" - - The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) +> :warning: **`large_utf8` datatype is not supported by Parquet** +> +> The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) ```q q).arrowkdb.dt.printDatatype[.arrowkdb.dt.large_utf8[]] @@ -775,11 +765,11 @@ q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.large_utf8[];(enlist "a";"bb";"ccc" ] ``` -### **`dt.fixed_size_binary`** +### `dt.fixed_size_binary` *Create a fixed width bytes datatype* -```syntax +```txt .arrowkdb.dt.fixed_size_binary[byte_width] ``` @@ -787,9 +777,9 @@ Where `byte_width` is the int32 fixed size byte width (each value in the array o returns the datatype identifier -??? note "kdb+ type 2h can be written to a `fixed_size_binary(16)` array" - - The is supported on the writing path only. Reading from a fixed_size_binary array returns a mixed list of 4h +> :warning: **kdb+ type 2h can be written to a `fixed_size_binary(16)` array** +> +> This is supported on the writing path only.
Reading from a fixed_size_binary array returns a mixed list of 4h ```q q).arrowkdb.dt.printDatatype[.arrowkdb.dt.fixed_size_binary[2i]] @@ -804,11 +794,11 @@ q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.fixed_size_binary[2i];(0x1111;0x222 ] ``` -### **`dt.decimal128`** +### `dt.decimal128` *Create a 128-bit integer (with precision and scale in twos complement) datatype* -```syntax +```txt .arrowkdb.dt.decimal128[precision;scale] ``` @@ -834,11 +824,11 @@ q).arrowkdb.ar.prettyPrintArray[.arrowkdb.dt.decimal128[38i;2i];(0x0000000000000 q) // With little endian twos complement the decimal128 values are 0, minimum positive, maximum negative ``` -### **`dt.list`** +### `dt.list` *Create a list datatype, specified in terms of its child datatype* -```syntax +```txt .arrowkdb.dt.list[child_datatype_id] ``` @@ -869,11 +859,11 @@ q).arrowkdb.ar.prettyPrintArray[list_datatype;((enlist 1);(2 2);(3 3 3));::] ] ``` -### **`dt.large_list`** +### `dt.large_list` *Create a large (64-bit offsets) list datatype, specified in terms of its child datatype* -```syntax +```txt .arrowkdb.dt.large_list[child_datatype_id] ``` @@ -904,11 +894,11 @@ q).arrowkdb.ar.prettyPrintArray[list_datatype;((enlist 1);(2 2);(3 3 3));::] ] ``` -### **`dt.fixed_size_list`** +### `dt.fixed_size_list` *Create a fixed size list datatype, specified in terms of its child datatype* -```syntax +```txt .arrowkdb.dt.fixed_size_list[child_datatype_id;list_size] ``` @@ -919,9 +909,9 @@ Where: returns the datatype identifier -??? 
warning "`fixed_size_list` datatype is changed to `list` by Parquet" - - The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) +> :warning: **`fixed_size_list` datatype is changed to `list` by Parquet** +> +> The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) ```q q)list_datatype:.arrowkdb.dt.fixed_size_list[.arrowkdb.dt.int64[];2i] @@ -948,11 +938,11 @@ q).arrowkdb.ar.prettyPrintArray[list_datatype;((1 1);(2 2);(3 3));::] ] ``` -### **`dt.map`** +### `dt.map` *Create a map datatype, specified in terms of its key and item child datatypes* -```syntax +```txt .arrowkdb.dt.map[key_datatype_id;item_datatype_id] ``` @@ -1007,11 +997,11 @@ q).arrowkdb.ar.prettyPrintArray[map_datatype;((enlist 1)!(enlist 1f);(2 2)!(2 2f ] ``` -### **`dt.struct`** +### `dt.struct` *Create a struct datatype, specified in terms of the field identifiers of its children* -```syntax +```txt .arrowkdb.dt.struct[field_ids] ``` @@ -1049,11 +1039,11 @@ q).arrowkdb.ar.prettyPrintArray[struct_datatype;((1 2 3);("aa";"bb";"cc"));::] q) // By slicing across the lists the logical struct values are: (1,"aa"); (2,"bb"); (3,"cc") ``` -### **`dt.sparse_union`** +### `dt.sparse_union` *Create a sparse union datatype, specified in terms of the field identifiers of its children* -```syntax +```txt .arrowkdb.dt.sparse_union[field_ids] ``` @@ -1063,9 +1053,9 @@ returns the datatype identifier An arrow union array is similar to a struct array except that it has an additional type_id array which identifies the live field in each union value set. -??? 
warning "`sparse_union` datatype is not supported by Parquet" - - The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) +> :warning: **`sparse_union` datatype is not supported by Parquet** +> +> The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) ```q q)field_one:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -1102,11 +1092,11 @@ q).arrowkdb.ar.prettyPrintArray[union_datatype;((1 0 1h);(1 2 3);("aa";"bb";"cc" q) // Looking up the type_id array the logical union values are: "aa", 2, "cc" ``` -### **`dt.dense_union`** +### `dt.dense_union` *Create a dense union datatype, specified in terms of the field identifiers of its children* -```syntax +```txt .arrowkdb.dt.dense_union[field_ids] ``` @@ -1116,9 +1106,9 @@ returns the datatype identifier An arrow union array is similar to a struct array except that it has an additional type_id array which identifies the live field in each union value set. -??? 
warning "`dense_union` datatype is not supported by Parquet" - - The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) +> :warning: **`dense_union` datatype is not supported by Parquet** +> +> The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) ```q q)field_one:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -1164,7 +1154,7 @@ q) // Looking up the type_id array the logical union values are: "aa", 2, "cc" *Create a dictionary datatype specified in terms of its value and index datatypes, similar to pandas categorical* -```syntax +```txt .arrowkdb.dt.dictionary[value_datatype_id;index_datatype_id] ``` @@ -1175,9 +1165,9 @@ Where: returns the datatype identifier -??? 
warning "Only the categorical interpretation of a `dictionary` datatype array is saved by Parquet" - - The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) +> :warning: **Only the categorical interpretation of a `dictionary` datatype array is saved by Parquet** +> +> The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) ```q q)dict_datatype:.arrowkdb.dt.dictionary[.arrowkdb.dt.utf8[];.arrowkdb.dt.int64[]] @@ -1211,7 +1201,7 @@ q) // The categorical interpretation of the dictionary (looking up the values se *Infer and construct a datatype from a kdb+ list* -```syntax +```txt .arrowkdb.dt.inferDatatype[list] ``` @@ -1234,7 +1224,7 @@ string *Return the base name of a datatype, ignoring any parameters or child datatypes/fields* -```syntax +```txt .arrowkdb.dt.datatypeName[datatype_id] ``` @@ -1253,7 +1243,7 @@ q).arrowkdb.dt.datatypeName[.arrowkdb.dt.fixed_size_binary[4i]] *Return the TimeUnit of a time32/time64/timestamp/duration datatype* -```syntax +```txt .arrowkdb.dt.getTimeUnit[datatype_id] ``` @@ -1270,7 +1260,7 @@ q).arrowkdb.dt.getTimeUnit[.arrowkdb.dt.timestamp[`NANO]] *Return the byte_width of a fixed_size_binary datatype* -```syntax +```txt .arrowkdb.dt.getByteWidth[datatype_id] ``` @@ -1287,7 +1277,7 @@ q).arrowkdb.dt.getByteWidth[.arrowkdb.dt.fixed_size_binary[4i]] *Returns the list_size of a fixed_size_list datatype* -```syntax +```txt .arrowkdb.dt.getListSize[datatype_id] ``` @@ -1304,7 +1294,7 @@ q).arrowkdb.dt.getListSize[.arrowkdb.dt.fixed_size_list[.arrowkdb.dt.int64[];4i] *Return the precision and scale of a decimal128 datatype* -```syntax +```txt 
.arrowkdb.dt.getPrecisionScale[datatype_id] ``` @@ -1322,7 +1312,7 @@ q).arrowkdb.dt.getPrecisionScale[.arrowkdb.dt.decimal128[38i;2i]] *Return the child datatype identifier of a list/large_list/fixed_size_list datatype* -```syntax +```txt .arrowkdb.dt.getListDatatype[datatype_id] ``` @@ -1340,7 +1330,7 @@ int64 *Return the key and item child datatype identifiers of a map datatype* -```syntax +```txt .arrowkdb.dt.getMapDatatypes[datatype_id] ``` @@ -1361,7 +1351,7 @@ double *Return the value and index child datatype identifiers of a dictionary datatype* -```syntax +```txt .arrowkdb.dt.getDictionaryDatatypes[datatype_id] ``` @@ -1382,7 +1372,7 @@ int64 *Return the list of child field identifiers of a struct/spare_union/dense_union datatype* -```syntax +```txt .arrowkdb.dt.getChildFields[datatype_id] ``` @@ -1407,7 +1397,7 @@ utf8_field: string not null *Display user-readable information for a datatype, including parameters and nested child datatypes* -```syntax +```txt .arrowkdb.dt.printDatatype[datatype_id] ``` @@ -1416,9 +1406,9 @@ Where `datatype_id` is the identifier of the datatype, 1. prints datatype information to stdout 1. returns generic null -??? warning "For debugging use only" - - The information is generated by the `arrow::DataType::ToString()` functionality and displayed on stdout to preserve formatting and indentation. +> :warning: **For debugging use only** +> +> The information is generated by the `arrow::DataType::ToString()` functionality and displayed on stdout to preserve formatting and indentation. 
```q q).arrowkdb.dt.printDatatype[.arrowkdb.dt.fixed_size_list[.arrowkdb.dt.int64[];4i]] @@ -1429,7 +1419,7 @@ fixed_size_list[4] *Return the list of identifiers for all datatypes held in the DatatypeStore* -```syntax +```txt .arrowkdb.dt.listDatatypes[] ``` @@ -1451,7 +1441,7 @@ double *Remove a datatype from the DatatypeStore* -```syntax +```txt .arrowkdb.dt.removeDatatype[datatype_id] ``` @@ -1475,7 +1465,7 @@ q).arrowkdb.dt.listDatatypes[] *Check if two datatypes are logically equal, including parameters and nested child datatypes* -```syntax +```txt .arrowkdb.dt.equalDatatypes[first_datatype_id;second_datatype_id] ``` @@ -1510,7 +1500,7 @@ q).arrowkdb.dt.equalDatatypes[.arrowkdb.dt.list[.arrowkdb.dt.int64[]];.arrowkdb. *Create a field instance from its name and datatype* -```syntax +```txt .arrowkdb.fd.field[field_name;datatype_id] ``` @@ -1532,7 +1522,7 @@ int_field: int64 not null _Name of a field_ -```syntax +```txt .arrowkdb.fd.fieldName[field_id] ``` @@ -1550,7 +1540,7 @@ q).arrowkdb.fd.fieldName[field] _Datatype of a field_ -```syntax +```txt .arrowkdb.fd.fieldDatatype[field_id] ``` @@ -1571,7 +1561,7 @@ int64 *Display user readable information for a field, including name and datatype* -```syntax +```txt .arrowkdb.fd.printField[field_id] ``` @@ -1580,9 +1570,9 @@ Where `field_id` is the identifier of the field, 1. prints field information to stdout 1. returns generic null -??? warning "For debugging use only" - - The information is generated by the `arrow::Field::ToString()` functionality and displayed on stdout to preserve formatting and indentation. +> :warning: **For debugging use only** +> +> The information is generated by the `arrow::Field::ToString()` functionality and displayed on stdout to preserve formatting and indentation. 
```q q).arrowkdb.fd.printField[.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]]] @@ -1593,7 +1583,7 @@ int_field: int64 not null _List of identifiers for all fields held in the FieldStore_ -```syntax +```txt .arrowkdb.fd.listFields[] ``` @@ -1615,7 +1605,7 @@ float_field: double not null *Remove a field from the FieldStore* -```syntax +```txt .arrowkdb.fd.removeField[field_id] ``` @@ -1639,7 +1629,7 @@ q).arrowkdb.fd.listFields[] *Check if two fields are logically equal, including names and datatypes* -```syntax +```txt .arrowkdb.fd.equalDatatypes[first_field_id;second_field_id] ``` @@ -1669,7 +1659,7 @@ q).arrowkdb.fd.equalFields[.arrowkdb.fd.field[`f1;int_dt];.arrowkdb.fd.field[`f1 *Create a schema instance from a list of field identifiers* -```syntax +```txt .arrowkdb.sc.schema[field_ids] ``` @@ -1689,7 +1679,7 @@ float_field: double not null *Infer and construct a schema based on a kdb+ table* -```syntax +```txt .arrowkdb.sc.inferSchema[table] ``` @@ -1697,9 +1687,9 @@ Where `table` is a kdb+ table or dictionary returns the schema identifier -??? warning "Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors" - - Each column in the table is mapped to a field in the schema. The column name is used as the field name and the column’s kdb+ type is mapped to an Arrow datatype as as described [here](#inferred-datatypes). +> :warning: **Inferred schemas only support a subset of the Arrow datatypes and are considerably less flexible than creating them with the datatype/field/schema constructors** +> +> Each column in the table is mapped to a field in the schema. The column name is used as the field name and the column’s kdb+ type is mapped to an Arrow datatype as described [here](#inferred-datatypes).
```q q)schema_from_table:.arrowkdb.sc.inferSchema[([] int_field:(1 2 3); float_field:(4 5 6f); str_field:("aa";"bb";"cc"))] @@ -1715,7 +1705,7 @@ str_field: string *Return the list of field identifiers used by a schema* -```syntax +```txt .arrowkdb.sc.schemaFields[schema_id] ``` @@ -1740,7 +1730,7 @@ float_field: double not null *Display user readable information for a schema, including its fields and their order* -```syntax +```txt .arrowkdb.sc.printSchema[schema_id] ``` @@ -1749,9 +1739,9 @@ Where `schema_id` is the identifier of the schema, 1. prints schema information to stdout 1. returns generic null -??? warning "For debugging use only" - - The information is generated by the `arrow::Schema::ToString()` functionality and displayed on stdout to preserve formatting and indentation. +> :warning: **For debugging use only** +> +> The information is generated by the `arrow::Schema::ToString()` functionality and displayed on stdout to preserve formatting and indentation. ```q q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -1768,7 +1758,7 @@ str_field: string not null *Return the list of identifiers for all schemas held in the SchemaStore* -```syntax +```txt .arrowkdb.sc.listSchemas[] ``` @@ -1789,7 +1779,7 @@ q).arrowkdb.sc.listSchemas[] *Remove a schema from the SchemaStore* -```syntax +```txt .arrowkdb.sc.removeSchema[schema_id] ``` @@ -1815,7 +1805,7 @@ q).arrowkdb.sc.listSchemas[] *Check if two schemas are logically equal, including their fields and the fields' order* -```syntax +```txt .arrowkdb.sc.equalSchemas[first_schema_id;second_schema_id] ``` @@ -1849,7 +1839,7 @@ q).arrowkdb.sc.equalSchemas[.arrowkdb.sc.schema[(f1,f2)];.arrowkdb.sc.schema[(f2 *Convert a kdb+ list to an Arrow array and pretty print the array* -```syntax +```txt .arrowkdb.ar.prettyPrintArray[datatype_id;list;options] ``` @@ -1857,7 +1847,7 @@ Where: - `datatype_id` is the datatype identifier of the array - `list` is the kdb+ list data to be displayed -- `options` is a kdb+ 
dictionary of options or generic null (::) to use defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or mixed list of -7|-11|4h. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`. the function @@ -1868,9 +1858,9 @@ Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -??? warning "For debugging use only" - - The information is generated by the `arrow::PrettyPrint()` functionality and displayed on stdout to preserve formatting and indentation. +> :warning: **For debugging use only** +> +> The information is generated by the `arrow::PrettyPrint()` functionality and displayed on stdout to preserve formatting and indentation. ```q q)int_datatype:.arrowkdb.dt.int64[] @@ -1886,14 +1876,14 @@ q).arrowkdb.ar.prettyPrintArray[int_datatype;(1 2 3j);::] *Convert a kdb+ list to an Arrow array and pretty print the array, inferring the datatype from the kdb+ list type* -```syntax +```txt .arrowkdb.ar.prettyPrintArrayFromList[list;options] ``` Where: - `list` is the kdb+ list data to be displayed -- `options` is reserved for future use - specify generic null (::) +- `options` is reserved for future use - specify generic null (`::`) the function @@ -1902,9 +1892,9 @@ the function The kdb+ list type is mapped to an Arrow datatype as described [here](#inferreddatatypes). -??? warning "For debugging use only" - - The information is generated by the `arrow::PrettyPrint()` functionality and displayed on stdout to preserve formatting and indentation. +> :warning: **For debugging use only** +> +> The information is generated by the `arrow::PrettyPrint()` functionality and displayed on stdout to preserve formatting and indentation. 
```q q).arrowkdb.ar.prettyPrintArrayFromList[(1 2 3j);::] @@ -1929,7 +1919,7 @@ Where: - `schema_id` is the schema identifier of the table - `array_data` is a mixed list of array data -- `options` is a kdb+ dictionary of options or generic null (::) to use defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or mixed list of -7|-11|4h. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list; values list can be `7h`, `11h` or mixed list of -7|-11|4h the function @@ -1942,9 +1932,9 @@ Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -??? warning "For debugging use only" - - The information is generated by the `arrow::Table::ToString()` functionality and displayed on stdout to preserve formatting and indentation. +> :warning: **For debugging use only** +> +> The information is generated by the `arrow::Table::ToString()` functionality and displayed on stdout to preserve formatting and indentation. ```q q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -1986,14 +1976,14 @@ str_field: *Convert a kdb+ table to an Arrow table and pretty print the table, inferring the schema from the kdb+ table structure* -```syntax +```txt .arrowkdb.tb.prettyPrintTableFromTable[table;options] ``` Where: - `table` is a kdb+ table -- `options` is reserved for future use - specify generic null (::) +- `options` is reserved for future use - specify generic null (`::`) the function @@ -2002,13 +1992,13 @@ the function Each column in the table is mapped to a field in the schema. The column name is used as the field name and the column’s kdb+ type is mapped to an Arrow datatype as as described [here](#inferreddatatypes). -??? 
warning "Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors" - - Each column in the table is mapped to a field in the schema. The column name is used as the field name and the column’s kdb+ type is mapped to an Arrow datatype as as described [here](#inferred-datatypes). +> :warning: **Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors** +> +> Each column in the table is mapped to a field in the schema. The column name is used as the field name and the column’s kdb+ type is mapped to an Arrow datatype as as described [here](#inferred-datatypes). -??? warning "For debugging use only" - - The information is generated by the `arrow::Table::ToString()` functionality and displayed on stdout to preserve formatting and indentation. +> :warning: **For debugging use only** +> +> The information is generated by the `arrow::Table::ToString()` functionality and displayed on stdout to preserve formatting and indentation. ```q q).arrowkdb.tb.prettyPrintTableFromTable[([] int_field:(1 2 3); float_field:(4 5 6f); str_field:("aa";"bb";"cc"));::] @@ -2048,7 +2038,7 @@ str_field: *Convert a kdb+ mixed list of array data to an Arrow table and write to a Parquet file* -```syntax +```txt .arrowkdb.pq.writeParquet[parquet_file;schema_id;array_data;options] ``` @@ -2057,7 +2047,7 @@ Where: - `parquet_file` is a string containing the Parquet file name - `schema_id` is the schema identifier to use for the table - `array_data` is a mixed list of array data -- `options` is a kdb+ dictionary of options or generic null (::) to use defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or mixed list of -7|-11|4h. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. 
Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`. returns generic null on success @@ -2069,9 +2059,9 @@ Supported options: - `PARQUET_VERSION` - Select the Parquet format version: `V1.0`, `V2.0`, `V2.4`, `V2.6` or `V2.LATEST`. Later versions are more fully featured but may be incompatible with older Parquet implementations. Default `V1.0` - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -??? warning "The Parquet format is compressed and designed for for maximum space efficiency which may cause a performance overhead compared to Arrow. Parquet is also less fully featured than Arrow which can result in schema limitations" - - The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) +> :warning: **The Parquet format is compressed and designed for maximum space efficiency which may cause a performance overhead compared to Arrow.
Parquet is also less fully featured than Arrow which can result in schema limitations** +> +> The Parquet file format is less fully featured compared to Arrow and consequently the Arrow/Parquet file writer currently does not support some datatypes or represents them using a different datatype as described [here](#parquet-datatype-limitations) ```q q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -2089,7 +2079,7 @@ q)array_data~read_data *Convert a kdb+ table to an Arrow table and write to a Parquet file, inferring the schema from the kdb+ table structure* -```syntax +```txt .arrowkdb.pq.writeParquetFromTable[parquet_file;table;options] ``` @@ -2097,7 +2087,7 @@ Where: - `parquet_file` is a string containing the Parquet file name - `table` is a kdb+ table -- `options` is a kdb+ dictionary of options or generic null (::) to use defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or mixed list of -7|-11|4h. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`. returns generic null on success @@ -2106,9 +2096,9 @@ Supported options: - `PARQUET_CHUNK_SIZE` - Controls the approximate size of encoded data pages within a column chunk. Long, default 1MB. - `PARQUET_VERSION` - Select the Parquet format version: `V1.0`, `V2.0`, `V2.4`, `V2.6` or `V2.LATEST`. Later versions are more fully featured but may be incompatible with older Parquet implementations. Default `V1.0` -??? warning "Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors" - - Each column in the table is mapped to a field in the schema. The column name is used as the field name and the column’s kdb+ type is mapped to an Arrow datatype as as described [here](#inferred-datatypes). 
+> :warning: **Inferred schemas only support a subset of the Arrow datatypes and are considerably less flexible than creating them with the datatype/field/schema constructors** +> +> Each column in the table is mapped to a field in the schema. The column name is used as the field name and the column’s kdb+ type is mapped to an Arrow datatype as described [here](#inferred-datatypes). ```q q)table:([] int_field:(1 2 3); float_field:(4 5 6f); str_field:("aa";"bb";"cc")) @@ -2122,7 +2112,7 @@ q)read_table~table *Read the schema from a Parquet file* -```syntax +```txt .arrowkdb.pq.readParquetSchema[parquet_file] ``` @@ -2145,14 +2135,14 @@ q).arrowkdb.sc.equalSchemas[schema;.arrowkdb.pq.readParquetSchema["file.parquet" *Read an Arrow table from a Parquet file and convert to a kdb+ mixed list of array data* -```syntax +```txt .arrowkdb.pq.readParquetData[parquet_file;options] ``` Where: - `parquet_file` is a string containing the Parquet file name -- `options` is a kdb+ dictionary of options or generic null (::) to use defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or mixed list of -7|-11|4h. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`. returns the array data @@ -2178,7 +2168,7 @@ q)array_data~read_data *Read a single column from a Parquet file and convert to a kdb+ list* -```syntax +```txt .arrowkdb.pq.readParquetColumn[parquet_file;column_index;options] ``` @@ -2186,7 +2176,7 @@ Where: - `parquet_file` is a string containing the Parquet file name - `column_index` is the index of the column to read, relative to the schema field order -- `options` is a kdb+ dictionary of options or generic null (::) to use defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or mixed list of -7|-11|4h. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults.
Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`. returns the array’s data @@ -2210,14 +2200,14 @@ q)col1~array_data[1] *Read an Arrow table from a Parquet file and convert to a kdb+ table* -```syntax +```txt .arrowkdb.pq.readParquetToTable[parquet_file;options] ``` Where: - `parquet_file` is a string containing the Parquet file name -- `options` is a kdb+ dictionary of options or generic null (::) to use defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or mixed list of -7|-11|4h. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`. returns the kdb+ table @@ -2241,7 +2231,7 @@ q)read_table~table *Read the number of row groups used by a Parquet file* -```syntax +```txt .arrowkdb.pq.readParquetNumRowGroups[parquet_file] ``` @@ -2260,16 +2250,16 @@ q).arrowkdb.pq.readParquetNumRowGroups["file.parquet"] *Read a set of row groups from a Parquet file into an Arrow table then convert to a kdb+ mixed list of array data* -```syntax +```txt .arrowkdb.pq.readParquetRowGroups[parquet_file;row_groups;columns;options] ``` Where: - `parquet_file` is a string containing the Parquet file name -- `row_groups` is an integer list (6h) of row groups indices to read, or generic null (::) to read all row groups -- `columns` is an integer list (6h) of column indices to read, or generic null (::) to read all columns -- `options` is a kdb+ dictionary of options or generic null (::) to use defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or mixed list of -7|-11|4h. +- `row_groups` is an integer list (6h) of row groups indices to read, or generic null (`::`) to read all row groups +- `columns` is an integer list (6h) of column indices to read, or generic null (`::`) to read all columns +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. 
Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`. returns the array data @@ -2294,16 +2284,16 @@ q)count first .arrowkdb.pq.readParquetRowGroups["file.parquet";1 2i;enlist 0i;:: *Read a set of row groups from a Parquet file into an Arrow table then convert to a kdb+ table* -```syntax +```txt .arrowkdb.pq.readParquetRowGroupsToTable[parquet_file;row_groups;columns;options] ``` Where: - `parquet_file` is a string containing the Parquet file name -- `row_groups` is an integer list (6h) of row groups indices to read, or generic null (::) to read all row groups -- `columns` is an integer list (6h) of column indices to read, or generic null (::) to read all columns -- `options` is a kdb+ dictionary of options or generic null (::) to use defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or mixed list of -7|-11|4h. +- `row_groups` is an integer list (6h) of row groups indices to read, or generic null (`::`) to read all row groups +- `columns` is an integer list (6h) of column indices to read, or generic null (`::`) to read all columns +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`. returns the kdb+ table @@ -2330,7 +2320,7 @@ q)count .arrowkdb.pq.readParquetRowGroupsToTable["file.parquet";1 2i;enlist 0i;: *Convert a kdb+ mixed list of array data to an Arrow table and write to an Arrow file* -```syntax +```txt .arrowkdb.ipc.writeArrow[arrow_file;schema_id;array_data;options] ``` @@ -2339,7 +2329,7 @@ Where: - `arrow_file` is a string containing the Arrow file name - `schema_id` is the schema identifier to use for the table - `array_data` is a mixed list of array data -- `options` is a kdb+ dictionary of options or generic null (::) to use defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or mixed list of -7|-11|4h. 
+- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`. returns generic null on success @@ -2365,7 +2355,7 @@ q)read_data~array_data *Convert a kdb+ table to an Arrow table and write to an Arrow file, inferring the schema from the kdb+ table structure* -```syntax +```txt .arrowkdb.ipc.writeArrowFromTable[arrow_file;table;options] ``` @@ -2373,13 +2363,13 @@ Where: - `arrow_file` is a string containing the Arrow file name - `table` is a kdb+ table -- `options` is reserved for future use - specify generic null (::) +- `options` is reserved for future use - specify generic null (`::`) returns generic null on success -??? warning "Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors" - - Each column in the table is mapped to a field in the schema. The column name is used as the field name and the column’s kdb+ type is mapped to an Arrow datatype as as described [here](#inferred-datatypes). +> :warning: **Inferred schemas only support a subset of the Arrow datatypes and are considerably less flexible than creating them with the datatype/field/schema constructors** +> +> Each column in the table is mapped to a field in the schema. The column name is used as the field name and the column’s kdb+ type is mapped to an Arrow datatype as described [here](#inferred-datatypes).
```q q)table:([] int_field:(1 2 3); float_field:(4 5 6f); str_field:("aa";"bb";"cc")) @@ -2393,7 +2383,7 @@ q)read_table~table *Read the schema from an Arrow file* -```syntax +```txt .arrowkdb.ipc.readArrowSchema[arrow_file] ``` @@ -2416,14 +2406,14 @@ q).arrowkdb.sc.equalSchemas[schema;.arrowkdb.ipc.readArrowSchema["file.arrow"]] *Read an Arrow table from an Arrow file and convert to a kdb+ mixed list of array data* -```syntax +```txt .arrowkdb.ipc.readArrowData[arrow_file;options] ``` Where: - `arrow_file` is a string containing the Arrow file name -- `options` is a kdb+ dictionary of options or generic null (::) to use defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or mixed list of -7|-11|4h. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`. returns the array data @@ -2448,14 +2438,14 @@ q)read_data~array_data *Read an Arrow table from an Arrow file and convert to a kdb+ table* -```syntax +```txt .arrowkdb.ipc.readArrowToTable[arrow_file;options] ``` Where: - `arrow_file` is a string containing the Arrow file name -- `options` is a kdb+ dictionary of options or generic null (::) to use defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or mixed list of -7|-11|4h. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`. 
returns the kdb+ table @@ -2480,7 +2470,7 @@ q)read_table~table *Convert a kdb+ mixed list of array data to an Arrow table and serialize to an Arrow stream* -```syntax +```txt .arrowkdb.ipc.serializeArrow[schema_id;array_data;options] ``` @@ -2488,7 +2478,7 @@ Where: - `schema_id` is the schema identifier to use for the table - `array_data` is a mixed list of array data -- `options` is a kdb+ dictionary of options or generic null (::) to use defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or mixed list of -7|-11|4h. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`. returns a byte list containing the serialized stream data @@ -2514,20 +2504,20 @@ q)read_data~array_data *Convert a kdb+ table to an Arrow table and serialize to an Arrow stream, inferring the schema from the kdb+ table structure* -```syntax +```txt .arrowkdb.ipc.serializeArrowFromTable[table;options] ``` Where: - `table` is a kdb+ table -- `options` is reserved for future use - specify generic null (::) +- `options` is reserved for future use - specify generic null (`::`) returns a byte list containing the serialized stream data -??? warning "Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors" - - Each column in the table is mapped to a field in the schema. The column name is used as the field name and the column’s kdb+ type is mapped to an Arrow datatype as as described [here](#inferred-datatypes). +> :warning: **Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors** +> +> Each column in the table is mapped to a field in the schema. 
The column name is used as the field name and the column’s kdb+ type is mapped to an Arrow datatype as described [here](#inferred-datatypes). ```q q)table:([] int_field:(1 2 3); float_field:(4 5 6f); str_field:("aa";"bb";"cc")) @@ -2541,7 +2531,7 @@ q)new_table~table *Parse the schema from an Arrow stream* -```syntax +```txt .arrowkdb.ipc.parseArrowSchema[serialized] ``` @@ -2564,14 +2554,14 @@ q).arrowkdb.sc.equalSchemas[schema;.arrowkdb.ipc.parseArrowSchema[serialized]] *Parse an Arrow table from an Arrow stream and convert to a kdb+ mixed list of array data* -```syntax +```txt .arrowkdb.ipc.parseArrowData[serialized;options] ``` Where: - `serialized` is a byte list containing the serialized stream data -- `options` is a kdb+ dictionary of options or generic null (::) to use defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or mixed list of -7|-11|4h. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`. returns the array data @@ -2595,14 +2585,14 @@ q)read_data~array_data *Parse an Arrow table from an Arrow file and convert to a kdb+ table* -```syntax +```txt .arrowkdb.ipc.parseArrowToTable[serialized;options] ``` Where: - `serialized` is a byte list containing the serialized stream data -- `options` is a kdb+ dictionary of options or generic null (::) to use defaults. Dictionary key must be a 11h list. Values list can be 7h, 11h or mixed list of -7|-11|4h. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`.
returns the kdb+ table @@ -2626,7 +2616,7 @@ q)new_table~table *Return build information regarding the in use Arrow library* -```syntax +```txt .arrowkdb.util.buildInfo[] ``` From 25b531e2e21fd33df0347e8b4b1e77e6e3b34283 Mon Sep 17 00:00:00 2001 From: Stephen Taylor Date: Wed, 15 Feb 2023 18:54:26 +0000 Subject: [PATCH 079/276] fix link to Issues --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 47aca47..dc5e95f 100644 --- a/README.md +++ b/README.md @@ -121,7 +121,7 @@ C:\Git\arrow\cpp\build> MKLINK %QHOME%\w64\parquet.dll %BUILD_HOME%\bin\parquet. It is recommended that a user install this interface through a release. This is completed in a number of steps: 1. Ensure you have downloaded/installed the Arrow C++ API following the [instructions](#third-party-library-installation). -2. [Download a release](releases) for your system architecture. +2. [Download a release](https://github.com/KxSystems/arrowkdb/releases) for your system architecture. 3. Install script `arrowkdb.q` to `$QHOME`, and binary file `lib/arrowkdb.(so|dll)` to `$QHOME/[mlw](64)`, by executing the following from the Release directory: ```bash From aef592ae86b0a0d04496071d318578ae9b5b0761 Mon Sep 17 00:00:00 2001 From: Stephen Taylor Date: Wed, 15 Feb 2023 19:14:21 +0000 Subject: [PATCH 080/276] fix interface index --- docs/reference.md | 344 ++++++++++++++++++++++------------------------ 1 file changed, 166 insertions(+), 178 deletions(-) diff --git a/docs/reference.md b/docs/reference.md index d90bf9b..fc14189 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -3,184 +3,172 @@ These functions are exposed within the `.arrowkdb` namespace, allowing users to convert data between the Arrow/Parquet and kdb+. -
-.arrowkdb **Arrow/Parquet interface** -[Datatype constructors](#datatype-constructors) - [dt.na](#dtna) Create a NULL datatype - [dt.boolean](#dtboolean) Create a boolean datatype - [dt.int8](#dtint8) Create an int8 datatype - [dt.int16](#dtint16) Create an int16 datatype - [dt.int32](#dtint32) Create an int32 datatype - [dt.int64](#dtint64) Create an int64 datatype - [dt.uint8](#dtuint8) Create an uint8 datatype - [dt.uint16](#dtuint16) Create an uint16 datatype - [dt.uint32](#dtuint32) Create an uint32 datatype - [dt.uint64](#dtuint64) Create an uint64 datatype - [dt.float16](#dtfloat16) Create a float16 (represented as uint16_t) datatype - [dt.float32](#dtfloat32) Create a float32 datatype - [dt.float64](#dtfloat64) Create a float64 datatype - [dt.time32](#dttime32) Create a 32-bit time (units since midnight with specified - granularity) datatype - [dt.time64](#dttime64) Create a 64-bit time (units since midnight with specified - granularity) datatype - [dt.timestamp](#dttimestamp) Create a 64-bit timestamp (units since UNIX epoch with - specified granularity) datatype - [dt.date32](#dtdate32) Create a 32-bit date (days since UNIX epoch) datatype - [dt.date64](#dtdate64) Create a 64-bit date (milliseconds since UNIX epoch) - datatype - [dt.month_interval](#dtmonth_interval) Create a 32-bit interval (described as a number of months, - similar to YEAR_MONTH in SQL) datatype - [dt.day_time_interval](#dtday_time_interval) Create a 64-bit interval (described as a number of days - and milliseconds, similar to DAY_TIME in SQL) datatype - [dt.duration](#dtduration) Create a 64-bit duration (measured in units of specified - granularity) datatype - [dt.binary](#dtbinary) Create a variable length bytes datatype - [dt.utf8](#dtutf8) Create a UTF8 variable length string datatype - [dt.large_binary](#dtlarge_binary) Create a large (64-bit offsets) variable length bytes - datatype - [dt.large_utf8](#dtlarge_utf8) Create a large (64-bit offsets) UTF8 variable length - string 
datatype - [dt.fixed_size_binary](#dtfixed_size_binary) Create a fixed width bytes datatype - [dt.decimal128](#dtdecimal128) Create a 128-bit integer (with precision and scale in - twos complement) datatype - [dt.list](#dtlist) Create a list datatype, specified in terms of its child - datatype - [dt.large_list](#dtlarge_list) Create a large (64-bit offsets) list datatype, specified - in terms of its child datatype - [dt.fixed_size_list](#dt_fixed_size_list) Create a fixed size list datatype, specified in terms of - its child datatype - [dt.map](#dtmap) Create a map datatype, specified in terms of its key and - item child datatypes - [dt.struct](#dtstruct) Create a struct datatype, specified in terms of the field - identifiers of its children - [dt.sparse_union](#dtsparse_union) Create a sparse union datatype, specified in terms of the - field identifiers of its children - [dt.dense_union](#dtdense_union) Create a dense union datatype, specified in terms of the - field identifiers of its children - [dt.dictionary](#dtdictionary) Create a dictionary datatype specified in terms of its - value and index datatypes, similar to pandas categorical - [dt.inferDatatype](#dtinferDatatype) Infer and construct a datatype from a kdb+ list - -[Datatype inspection](#datatype-inspection) - [dt.datatypeName](#dtdatatypename) Return the base name of a datatype, ignoring any - parameters or child datatypes/fields - [dt.getTimeUnit](#dtgettimeunit) Return the TimeUnit of a time32/time64/timestamp/duration - datatype - [dt.getByteWidth](#dtgetbytewidth) Return the byte_width of a fixed_size_binary datatype - [dt.getListSize](#dtgetlistsize) Returns the list_size of a fixed_size_list datatype - [dt.getPrecisionScale](#dtgetprecisionscale) Return the precision and scale of a decimal128 datatype - [dt.getListDatatype](#dtgetlistdatatype) Return the child datatype identifier of a - list/large_list/fixed_size_list datatype - [dt.getMapDatatypes](#dtgetmapdatatypes) Return the key and item 
child datatype identifiers of a - map datatype - [dt.getDictionaryDatatypes](#dtgetdictionarydatatypes) Return the value and index child datatype identifiers of a - dictionary datatype - [dt.getChildFields](#dtgetchildfields) Return the list of child field identifiers of a - struct/spare_union/dense_union datatype - -[Datatype management](#datatype-management) - [dt.printDatatype](#dtprintdatatype) Display user readable information for a datatype, - including parameters and nested child datatypes - [dt.listDatatypes](#dtlistdatatypes) Return the list of identifiers for all datatypes held in - the DatatypeStore - [dt.removeDatatype](#dtremovedatatype) Remove a datatype from the DatatypeStore - [dt.equalDatatypes](#dtequaldatatypes) Check if two datatypes are logically equal, including - parameters and nested child datatypes - -[Field Constructor](#field-constructor) - [fd.field](#fdfield) Create a field instance from its name and datatype - -[Field Inspection](#field-inspection) - [fd.fieldName](#fdfieldname) Return the name of a field - [fd.fieldDatatype](#fdfielddatatype) Return the datatype of a field - -[Field management](#field-management) - [fd.printField](#fdprintfield) Display user readable information for a field, including - name and datatype - [fd.listFields](#fdlistfields) Return the list of identifiers for all fields held in the - FieldStore - [fd.removeField](#fdremovefield) Remove a field from the FieldStore - [fd.equalFields](#fdequalfields) Check if two fields are logically equal, including names - and datatypes - -[Schema constructors](#schema-constructors) - [sc.schema](#scschema) Create a schema instance from a list of field identifiers - [sc.inferSchema](#scinferschema) Infer and construct a schema based on a kdb+ table - -[Schema inspection](#schema-inspection) - [sc.schemaFields](#scschemafields) Return the list of field identifiers used by a schema - -[Schema management](#schema-management) - [sc.printSchema](#scprintschema) Display user 
readable information for a schema, including - its fields and their order - [sc.listSchemas](#sclistschemas) Return the list of identifiers for all schemas held in the - SchemaStore - [sc.removeSchema](#scremoveschema) Remove a schema from the SchemaStore - [sc.equalSchemas](#scequalschemas) Check if two schemas are logically equal, including their - fields and the fields' order - -[Array data](#array-data) - [ar.prettyPrintArray](#arprettyprintarray) Convert a kdb+ list to an Arrow array and pretty print the - array - [ar.prettyPrintArrayFromList](#arprettyprintarrayfromlist) Convert a kdb+ list to an Arrow array and pretty print the - array, inferring the datatype from the kdb+ list type - - -[Table data](#table-data) - [tb.prettyPrintTable](#tbprettyprinttable) Convert a kdb+ mixed list of array data to an Arrow table - and pretty print the table - [tb.prettyPrintTableFromTable](#tbprettyprinttablefromtable) Convert a kdb+ table to an Arrow table and pretty print - the table, inferring the schema from the kdb+ table - structure - -[Parquet files](#parquet-files) - [pq.writeParquet](#pqwriteparquet) Convert a kdb+ mixed list of array data to an Arrow table - and write to a Parquet file - [pq.writeParquetFromTable](#pqwriteparquetfromtable) Convert a kdb+ table to an Arrow table and write to a - Parquet file, inferring the schema from the kdb+ table - structure - [pq.readParquetSchema](#pqreadparquetschema) Read the schema from a Parquet file - [pq.readParquetData](#pqreadparquetdata) Read an Arrow table from a Parquet file and convert to a - kdb+ mixed list of array data - [pq.readParquetColumn](#pqreadparquetcolumn) Read a single column from a Parquet file and convert to a - kdb+ list - [pq.readParquetToTable](#pqreadparquettotable) Read an Arrow table from a Parquet file and convert to a - kdb+ table - [pq.readParquetNumRowGroups](#pqreadparquetnumrowgroups) Read the number of row groups used by a Parquet file - 
[pq.readParquetRowGroups](#pqreadparquetrowgroups) Read a set of row groups from a Parquet file into an Arrow - table then convert to a kdb+ mixed list of array data - [pq.readParquetRowGroupsToTable](#pqreadparquetrowgroupstotable) Read a set of row groups from a Parquet file into an Arrow - table then convert to a kdb+ table - -[Arrow IPC files](#arrow-ipc-files) - [ipc.writeArrow](#ipcwritearrow) Convert a kdb+ mixed list of array data to an Arrow table - and write to an Arrow file - [ipc.writeArrowFromTable](#ipcwritearrowfromtable) Convert a kdb+ table to an Arrow table and write to an - Arrow file, inferring the schema from the kdb+ table - structure - [ipc.readArrowSchema](#ipcreadarrowschema) Read the schema from an Arrow file - [ipc.readArrowData](#ipcreadarrowdata) Read an Arrow table from an Arrow file and convert to a - kdb+ mixed list of array data - [ipc.readArrowToTable](#ipcreadarrowtotable) Read an Arrow table from an Arrow file and convert to a - kdb+ table - -[Arrow IPC streams](#arrow-ipc-streams) - [ipc.serializeArrow](#ipcserializearrow) Convert a kdb+ mixed list of array data to an Arrow table - and serialize to an Arrow stream - [ipc.serializeArrowFromTable](#ipcserializearrowfromtable) Convert a kdb+ table to an Arrow table and serialize to an - Arrow stream, inferring the schema from the kdb+ table - structure - [ipc.parseArrowSchema](#ipcparsearrowschema) Parse the schema from an Arrow stream - [ipc.parseArrowData](#ipcparsearrowdata) Parse an Arrow table from an Arrow stream and convert to a - kdb+ mixed list of array data - [ipc.parseArrowToTable](#ipcparsearrowtotable) Parse an Arrow table from an Arrow file and convert to a - kdb+ table - -[Utilities](#utilities) - [util.buildInfo](#utilbuildinfo) Return build information regarding the in use Arrow - library - -
+## `.arrowkdb` Arrow/Parquet interface + +### [Datatype constructors](#datatype-constructors) + +object | use +-------|------- +[`dt.na`](#dtna) | Create a NULL datatype +[`dt.boolean`](#dtboolean) | Create a boolean datatype +[`dt.int8`](#dtint8) | Create an int8 datatype +[`dt.int16`](#dtint16) | Create an int16 datatype +[`dt.int32`](#dtint32) | Create an int32 datatype +[`dt.int64`](#dtint64) | Create an int64 datatype +[`dt.uint8`](#dtuint8) | Create an uint8 datatype +[`dt.uint16`](#dtuint16) | Create an uint16 datatype +[`dt.uint32`](#dtuint32) | Create an uint32 datatype +[`dt.uint64`](#dtuint64) | Create an uint64 datatype +[`dt.float16`](#dtfloat16) | Create a float16 (represented as uint16_t) datatype +[`dt.float32`](#dtfloat32) | Create a float32 datatype +[`dt.float64`](#dtfloat64) | Create a float64 datatype +[`dt.time32`](#dttime32) | Create a 32-bit time (units since midnight with specified granularity) datatype +[`dt.time64`](#dttime64) | Create a 64-bit time (units since midnight with specified granularity) datatype +[`dt.timestamp`](#dttimestamp) | Create a 64-bit timestamp (units since UNIX epoch with specified granularity) datatype +[`dt.date32`](#dtdate32) | Create a 32-bit date (days since UNIX epoch) datatype +[`dt.date64`](#dtdate64) | Create a 64-bit date (milliseconds since UNIX epoch) datatype +[`dt.month_interval`](#dtmonth_interval) | Create a 32-bit interval (described as a number of months, similar to YEAR_MONTH in SQL) datatype +[`dt.day_time_interval`](#dtday_time_interval) | Create a 64-bit interval (described as a number of days and milliseconds, similar to DAY_TIME in SQL) datatype +[`dt.duration`](#dtduration) | Create a 64-bit duration (measured in units of specified granularity) datatype +[`dt.binary`](#dtbinary) | Create a variable length bytes datatype +[`dt.utf8`](#dtutf8) | Create a UTF8 variable length string datatype +[`dt.large_binary`](#dtlarge_binary) | Create a large (64-bit offsets) variable length bytes datatype 
+[`dt.large_utf8`](#dtlarge_utf8) | Create a large (64-bit offsets) UTF8 variable length string datatype +[`dt.fixed_size_binary`](#dtfixed_size_binary) | Create a fixed width bytes datatype +[`dt.decimal128`](#dtdecimal128) | Create a 128-bit integer (with precision and scale in twos complement) datatype +[`dt.list`](#dtlist) | Create a list datatype, specified in terms of its child datatype +[`dt.large_list`](#dtlarge_list) | Create a large (64-bit offsets) list datatype, specified in terms of its child datatype +[`dt.fixed_size_list`](#dt_fixed_size_list) | Create a fixed size list datatype, specified in terms of its child datatype +[`dt.map`](#dtmap) | Create a map datatype, specified in terms of its key and item child datatypes +[`dt.struct`](#dtstruct) | Create a struct datatype, specified in terms of the field identifiers of its children +[`dt.sparse_union`](#dtsparse_union) | Create a sparse union datatype, specified in terms of the field identifiers of its children +[`dt.dense_union`](#dtdense_union) | Create a dense union datatype, specified in terms of the field identifiers of its children +[`dt.dictionary`](#dtdictionary) | Create a dictionary datatype specified in terms of its value and index datatypes, similar to pandas categorical +[`dt.inferDatatype`](#dtinferDatatype) | Infer and construct a datatype from a kdb+ list + +### [Datatype inspection](#datatype-inspection) + +object | use +-------|------- +[`dt.datatypeName`](#dtdatatypename) | Return the base name of a datatype, ignoring any parameters or child datatypes/fields +[`dt.getTimeUnit`](#dtgettimeunit) | Return the TimeUnit of a time32/time64/timestamp/duration datatype +[`dt.getByteWidth`](#dtgetbytewidth) | Return the byte_width of a fixed_size_binary datatype +[`dt.getListSize`](#dtgetlistsize) | Returns the list_size of a fixed_size_list datatype +[`dt.getPrecisionScale`](#dtgetprecisionscale) | Return the precision and scale of a decimal128 datatype 
+[`dt.getListDatatype`](#dtgetlistdatatype) | Return the child datatype identifier of a list/large_list/fixed_size_list datatype +[`dt.getMapDatatypes`](#dtgetmapdatatypes) | Return the key and item child datatype identifiers of a map datatype +[`dt.getDictionaryDatatypes`](#dtgetdictionarydatatypes) | Return the value and index child datatype identifiers of a dictionary datatype +[`dt.getChildFields`](#dtgetchildfields) | Return the list of child field identifiers of a struct/sparse_union/dense_union datatype + +### [Datatype management](#datatype-management) + +object | use +-------|------- +[`dt.printDatatype`](#dtprintdatatype) | Display user readable information for a datatype, including parameters and nested child datatypes +[`dt.listDatatypes`](#dtlistdatatypes) | Return the list of identifiers for all datatypes held in the DatatypeStore +[`dt.removeDatatype`](#dtremovedatatype) | Remove a datatype from the DatatypeStore +[`dt.equalDatatypes`](#dtequaldatatypes) | Check if two datatypes are logically equal, including parameters and nested child datatypes + +### [Field Constructor](#field-constructor) + +object | use +-------|------- +[`fd.field`](#fdfield) | Create a field instance from its name and datatype + +### [Field Inspection](#field-inspection) + +object | use +-------|------- +[`fd.fieldName`](#fdfieldname) | Return the name of a field +[`fd.fieldDatatype`](#fdfielddatatype) | Return the datatype of a field + +### [Field management](#field-management) + +object | use +-------|------- +[`fd.printField`](#fdprintfield) | Display user readable information for a field, including name and datatype +[`fd.listFields`](#fdlistfields) | Return the list of identifiers for all fields held in the FieldStore +[`fd.removeField`](#fdremovefield) | Remove a field from the FieldStore +[`fd.equalFields`](#fdequalfields) | Check if two fields are logically equal, including names and datatypes + +### [Schema constructors](#schema-constructors) + +object | use
+-------|------- +[`sc.schema`](#scschema) | Create a schema instance from a list of field identifiers +[`sc.inferSchema`](#scinferschema) | Infer and construct a schema based on a kdb+ table + +### [Schema inspection](#schema-inspection) + +object | use +-------|------- +[`sc.schemaFields`](#scschemafields) | Return the list of field identifiers used by a schema + +### [Schema management](#schema-management) + +object | use +-------|------- +[`sc.printSchema`](#scprintschema) | Display user readable information for a schema, including its fields and their order +[`sc.listSchemas`](#sclistschemas) | Return the list of identifiers for all schemas held in the SchemaStore +[`sc.removeSchema`](#scremoveschema) | Remove a schema from the SchemaStore +[`sc.equalSchemas`](#scequalschemas) | Check if two schemas are logically equal, including their fields and the fields' order + +### [Array data](#array-data) + +object | use +-------|------- +[`ar.prettyPrintArray`](#arprettyprintarray) | Convert a kdb+ list to an Arrow array and pretty print the array +[`ar.prettyPrintArrayFromList`](#arprettyprintarrayfromlist) | Convert a kdb+ list to an Arrow array and pretty-print the array, inferring the datatype from the kdb+ list type + + +### [Table data](#table-data) + +object | use +-------|------- +[`tb.prettyPrintTable`](#tbprettyprinttable) | Convert a kdb+ mixed list of array data to an Arrow table and pretty print the table +[`tb.prettyPrintTableFromTable`](#tbprettyprinttablefromtable) | Convert a kdb+ table to an Arrow table and pretty print the table, inferring the schema from the kdb+ table structure + +### [Parquet files](#parquet-files) + +object | use +-------|------- +[`pq.writeParquet`](#pqwriteparquet) | Convert a kdb+ mixed list of array data to an Arrow table and write to a Parquet file +[`pq.writeParquetFromTable`](#pqwriteparquetfromtable) | Convert a kdb+ table to an Arrow table and write to a Parquet file, inferring the schema from the kdb+ table structure 
+[`pq.readParquetSchema`](#pqreadparquetschema) | Read the schema from a Parquet file +[`pq.readParquetData`](#pqreadparquetdata) | Read an Arrow table from a Parquet file and convert to a kdb+ mixed list of array data +[`pq.readParquetColumn`](#pqreadparquetcolumn) | Read a single column from a Parquet file and convert to a kdb+ list +[`pq.readParquetToTable`](#pqreadparquettotable) | Read an Arrow table from a Parquet file and convert to a kdb+ table +[`pq.readParquetNumRowGroups`](#pqreadparquetnumrowgroups) | Read the number of row groups used by a Parquet file +[`pq.readParquetRowGroups`](#pqreadparquetrowgroups) | Read a set of row groups from a Parquet file into an Arrow table then convert to a kdb+ mixed list of array data +[`pq.readParquetRowGroupsToTable`](#pqreadparquetrowgroupstotable) | Read a set of row groups from a Parquet file into an Arrow table then convert to a kdb+ table + +### [Arrow IPC files](#arrow-ipc-files) + +object | use +-------|------- +[`ipc.writeArrow`](#ipcwritearrow) | Convert a kdb+ mixed list of array data to an Arrow table and write to an Arrow file +[`ipc.writeArrowFromTable`](#ipcwritearrowfromtable) | Convert a kdb+ table to an Arrow table and write to an Arrow file, inferring the schema from the kdb+ table structure +[`ipc.readArrowSchema`](#ipcreadarrowschema) | Read the schema from an Arrow file +[`ipc.readArrowData`](#ipcreadarrowdata) | Read an Arrow table from an Arrow file and convert to a kdb+ mixed list of array data +[`ipc.readArrowToTable`](#ipcreadarrowtotable) | Read an Arrow table from an Arrow file and convert to a kdb+ table + +### [Arrow IPC streams](#arrow-ipc-streams) + +object | use +-------|------- +[`ipc.serializeArrow`](#ipcserializearrow) | Convert a kdb+ mixed list of array data to an Arrow table and serialize to an Arrow stream +[`ipc.serializeArrowFromTable`](#ipcserializearrowfromtable) | Convert a kdb+ table to an Arrow table and serialize to an Arrow stream, inferring the schema from the kdb+ 
table structure +[`ipc.parseArrowSchema`](#ipcparsearrowschema) | Parse the schema from an Arrow stream +[`ipc.parseArrowData`](#ipcparsearrowdata) | Parse an Arrow table from an Arrow stream and convert to a kdb+ mixed list of array data +[`ipc.parseArrowToTable`](#ipcparsearrowtotable) | Parse an Arrow table from an Arrow file and convert to a kdb+ table + +### [Utilities](#utilities) + +object | use +-------|------- +[`util.buildInfo`](#utilbuildinfo) | Return build information regarding the in use Arrow library + + ## Datatype constructors From 85f08a4af4844c1785cc75a7ff110987a63997de Mon Sep 17 00:00:00 2001 From: Stephen Taylor Date: Wed, 15 Feb 2023 19:21:46 +0000 Subject: [PATCH 081/276] fix index table --- docs/reference.md | 87 ++++++++--------------------------------------- 1 file changed, 15 insertions(+), 72 deletions(-) diff --git a/docs/reference.md b/docs/reference.md index fc14189..0af5cb1 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -5,10 +5,10 @@ These functions are exposed within the `.arrowkdb` namespace, allowing users to ## `.arrowkdb` Arrow/Parquet interface -### [Datatype constructors](#datatype-constructors) object | use -------|------- +
**[Datatype constructors](#datatype-constructors)** [`dt.na`](#dtna) | Create a NULL datatype [`dt.boolean`](#dtboolean) | Create a boolean datatype [`dt.int8`](#dtint8) | Create an int8 datatype @@ -45,11 +45,7 @@ object | use [`dt.dense_union`](#dtdense_union) | Create a dense union datatype, specified in terms of the field identifiers of its children [`dt.dictionary`](#dtdictionary) | Create a dictionary datatype specified in terms of its value and index datatypes, similar to pandas categorical [`dt.inferDatatype`](#dtinferdatatype) | Infer and construct a datatype from a kdb+ list - -### [Datatype inspection](#datatype-inspection) - -object | use --------|------- +
**[Datatype inspection](#datatype-inspection)** [`dt.datatypeName`](#dtdatatypename) | Return the base name of a datatype, ignoring any parameters or child datatypes/fields [`dt.getTimeUnit`](#dtgettimeunit) | Return the TimeUnit of a time32/time64/timestamp/duration datatype [`dt.getByteWidth`](#dtgetbytewidth) | Return the byte_width of a fixed_size_binary datatype @@ -59,79 +55,38 @@ object | use [`dt.getMapDatatypes`](#dtgetmapdatatypes) | Return the key and item child datatype identifiers of a map datatype [`dt.getDictionaryDatatypes`](#dtgetdictionarydatatypes) | Return the value and index child datatype identifiers of a dictionary datatype [`dt.getChildFields`](#dtgetchildfields) | Return the list of child field identifiers of a struct/sparse_union/dense_union datatype - -### [Datatype management](#datatype-management) - -object | use --------|------- +
**[Datatype management](#datatype-management)** [`dt.printDatatype`](#dtprintdatatype) | Display user readable information for a datatype, including parameters and nested child datatypes [`dt.listDatatypes`](#dtlistdatatypes) | Return the list of identifiers for all datatypes held in the DatatypeStore [`dt.removeDatatype`](#dtremovedatatype) | Remove a datatype from the DatatypeStore [`dt.equalDatatypes`](#dtequaldatatypes) | Check if two datatypes are logically equal, including parameters and nested child datatypes - -### [Field Constructor](#field-constructor) - -object | use --------|------- +
**[Field Constructor](#field-constructor)** [`fd.field`](#fdfield) | Create a field instance from its name and datatype - -### [Field Inspection](#field-inspection) - -object | use --------|------- +
**[Field Inspection](#field-inspection)** [`fd.fieldName`](#fdfieldname) | Return the name of a field [`fd.fieldDatatype`](#fdfielddatatype) | Return the datatype of a field - -### [Field management](#field-management) - -object | use --------|------- +
**[Field management](#field-management)** [`fd.printField`](#fdprintfield) | Display user readable information for a field, including name and datatype [`fd.listFields`](#fdlistfields) | Return the list of identifiers for all fields held in the FieldStore [`fd.removeField`](#fdremovefield) | Remove a field from the FieldStore [`fd.equalFields`](#fdequalfields) | Check if two fields are logically equal, including names and datatypes - -### [Schema constructors](#schema-constructors) - -object | use --------|------- +
**[Schema constructors](#schema-constructors)** [`sc.schema`](#scschema) | Create a schema instance from a list of field identifiers [`sc.inferSchema`](#scinferschema) | Infer and construct a schema based on a kdb+ table - -### [Schema inspection](#schema-inspection) - -object | use --------|------- +
**[Schema inspection](#schema-inspection)** [`sc.schemaFields`](#scschemafields) | Return the list of field identifiers used by a schema - -### [Schema management](#schema-management) - -object | use --------|------- +
**[Schema management](#schema-management)** [`sc.printSchema`](#scprintschema) | Display user readable information for a schema, including its fields and their order [`sc.listSchemas`](#sclistschemas) | Return the list of identifiers for all schemas held in the SchemaStore [`sc.removeSchema`](#scremoveschema) | Remove a schema from the SchemaStore [`sc.equalSchemas`](#scequalschemas) | Check if two schemas are logically equal, including their fields and the fields' order - -### [Array data](#array-data) - -object | use --------|------- +
**[Array data](#array-data)** [`ar.prettyPrintArray`](#arprettyprintarray) | Convert a kdb+ list to an Arrow array and pretty print the array [`ar.prettyPrintArrayFromList`](#arprettyprintarrayfromlist) | Convert a kdb+ list to an Arrow array and pretty-print the array, inferring the datatype from the kdb+ list type - - -### [Table data](#table-data) - -object | use --------|------- +
**[Table data](#table-data)** [`tb.prettyPrintTable`](#tbprettyprinttable) | Convert a kdb+ mixed list of array data to an Arrow table and pretty print the table [`tb.prettyPrintTableFromTable`](#tbprettyprinttablefromtable) | Convert a kdb+ table to an Arrow table and pretty print the table, inferring the schema from the kdb+ table structure - -### [Parquet files](#parquet-files) - -object | use --------|------- +
**[Parquet files](#parquet-files)** [`pq.writeParquet`](#pqwriteparquet) | Convert a kdb+ mixed list of array data to an Arrow table and write to a Parquet file [`pq.writeParquetFromTable`](#pqwriteparquetfromtable) | Convert a kdb+ table to an Arrow table and write to a Parquet file, inferring the schema from the kdb+ table structure [`pq.readParquetSchema`](#pqreadparquetschema) | Read the schema from a Parquet file @@ -141,31 +96,19 @@ object | use [`pq.readParquetNumRowGroups`](#pqreadparquetnumrowgroups) | Read the number of row groups used by a Parquet file [`pq.readParquetRowGroups`](#pqreadparquetrowgroups) | Read a set of row groups from a Parquet file into an Arrow table then convert to a kdb+ mixed list of array data [`pq.readParquetRowGroupsToTable`](#pqreadparquetrowgroupstotable) | Read a set of row groups from a Parquet file into an Arrow table then convert to a kdb+ table - -### [Arrow IPC files](#arrow-ipc-files) - -object | use --------|------- +
**[Arrow IPC files](#arrow-ipc-files)** [`ipc.writeArrow`](#ipcwritearrow) | Convert a kdb+ mixed list of array data to an Arrow table and write to an Arrow file [`ipc.writeArrowFromTable`](#ipcwritearrowfromtable) | Convert a kdb+ table to an Arrow table and write to an Arrow file, inferring the schema from the kdb+ table structure [`ipc.readArrowSchema`](#ipcreadarrowschema) | Read the schema from an Arrow file [`ipc.readArrowData`](#ipcreadarrowdata) | Read an Arrow table from an Arrow file and convert to a kdb+ mixed list of array data [`ipc.readArrowToTable`](#ipcreadarrowtotable) | Read an Arrow table from an Arrow file and convert to a kdb+ table - -### [Arrow IPC streams](#arrow-ipc-streams) - -object | use --------|------- +
**[Arrow IPC streams](#arrow-ipc-streams)** [`ipc.serializeArrow`](#ipcserializearrow) | Convert a kdb+ mixed list of array data to an Arrow table and serialize to an Arrow stream [`ipc.serializeArrowFromTable`](#ipcserializearrowfromtable) | Convert a kdb+ table to an Arrow table and serialize to an Arrow stream, inferring the schema from the kdb+ table structure [`ipc.parseArrowSchema`](#ipcparsearrowschema) | Parse the schema from an Arrow stream [`ipc.parseArrowData`](#ipcparsearrowdata) | Parse an Arrow table from an Arrow stream and convert to a kdb+ mixed list of array data [`ipc.parseArrowToTable`](#ipcparsearrowtotable) | Parse an Arrow table from an Arrow stream and convert to a kdb+ table - -### [Utilities](#utilities) - -object | use --------|------- +
**[Utilities](#utilities)** [`util.buildInfo`](#utilbuildinfo) | Return build information regarding the in use Arrow library From 9d8620984ef78f74535e714615dd88243b712f42 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 17 Feb 2023 16:48:46 +0300 Subject: [PATCH 082/276] Writing and serialization of chunked tables --- src/ArrayWriter.cpp | 45 +++++++++++++---- src/ArrayWriter.h | 9 ++++ src/HelperFunctions.h | 13 +++++ src/KdbOptions.h | 2 + src/TableData.cpp | 111 +++++++++++++++++++++++++++++++++++------- 5 files changed, 152 insertions(+), 28 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index e29a197..adb73a7 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -449,32 +449,38 @@ void PopulateBuilder(shared_ptr datatype, K template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto uint8_builder = static_cast(builder); if( type_overrides.null_mapping.have_uint8 ){ - std::vector null_bitmap( k_array->n ); - for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.uint8_null != static_cast( kG( k_array )[i] ); + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.uint8_null != static_cast( kG( k_array )[i+offset] ); } - PARQUET_THROW_NOT_OK( uint8_builder->AppendValues( ( uint8_t* )kG( k_array ), k_array->n, null_bitmap ) ); + PARQUET_THROW_NOT_OK( uint8_builder->AppendValues( ( uint8_t* )&kG( k_array )[offset], length, null_bitmap ) ); } else { - PARQUET_THROW_NOT_OK(uint8_builder->AppendValues((uint8_t*)kG(k_array), k_array->n)); + PARQUET_THROW_NOT_OK( uint8_builder->AppendValues( ( uint8_t* )&kG( k_array )[offset], length ) ); } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, 
TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto int8_builder = static_cast(builder); if( type_overrides.null_mapping.have_int8 ){ - std::vector null_bitmap( k_array->n ); - for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.int8_null != kG( k_array )[i]; + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.int8_null != kG( k_array )[i+offset]; } - PARQUET_THROW_NOT_OK( int8_builder->AppendValues( ( int8_t* )kG( k_array ), k_array->n, null_bitmap ) ); + PARQUET_THROW_NOT_OK( int8_builder->AppendValues( ( int8_t* )&kG( k_array )[offset], length, null_bitmap ) ); } else { - PARQUET_THROW_NOT_OK(int8_builder->AppendValues((int8_t*)kG(k_array), k_array->n)); + PARQUET_THROW_NOT_OK(int8_builder->AppendValues((int8_t*)&kG(k_array)[offset], length)); } } @@ -1120,6 +1126,25 @@ shared_ptr MakeArray(shared_ptr datatype, K k_arr return array; } +shared_ptr MakeChunkedArray( + shared_ptr datatype + , K k_array + , TypeMappingOverride& type_overrides ) +{ + type_overrides.chunk_offset = 0; + vector> chunks; + int64_t num_chunks = type_overrides.NumChunks( k_array->n ); + for( int64_t i = 0; i < num_chunks; ++i ){ + auto array = MakeArray( datatype, k_array, type_overrides ); + chunks.push_back( array ); + type_overrides.chunk_offset += type_overrides.chunk_length; + } + + auto chunked_array = make_shared( move( chunks ) ); + + return chunked_array; +} + } // namespace arrowkdb } // namespace kx diff --git a/src/ArrayWriter.h b/src/ArrayWriter.h index 53a9b1b..e73aede 100644 --- a/src/ArrayWriter.h +++ b/src/ArrayWriter.h @@ -29,6 +29,15 @@ void PopulateBuilder(std::shared_ptr datatype, K k_array, arrow */ std::shared_ptr MakeArray(std::shared_ptr datatype, K k_array, TypeMappingOverride& type_overrides); +/** + * @brief Copies and converts a kdb list to an 
arrow chunked array + * + * @param datatype The datatype to use when creating the arrow array + * @param k_array The kdb list from which to source the data + * @return The arrow array +*/ +std::shared_ptr MakeChunkedArray( std::shared_ptr datatype, K k_array, TypeMappingOverride& type_overrides ); + } // namespace arrowkdb } // namespace kx diff --git a/src/HelperFunctions.h b/src/HelperFunctions.h index f48f8f1..001e46f 100644 --- a/src/HelperFunctions.h +++ b/src/HelperFunctions.h @@ -99,8 +99,21 @@ typedef signed char KdbType; { int64_t decimal128_as_double = 0; Options::NullMapping null_mapping; + int64_t chunk_offset = 0; + int64_t chunk_length = 0; + TypeMappingOverride(void) {}; TypeMappingOverride(const KdbOptions& options); + + int64_t NumChunks( long long array_length ) { return !chunk_length ? 1 + : array_length / chunk_length + ( array_length % chunk_length ? 1 : 0 ); + } + std::pair GetChunk( long long array_length ){ + int64_t offset = chunk_length ? chunk_offset : 0; + int64_t length = std::min( array_length - offset, chunk_length ? 
chunk_length : array_length ); + + return std::make_pair( offset, length ); + } }; /** diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 9bde4f8..c836747 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -19,6 +19,7 @@ namespace arrowkdb { namespace Options { // Int options + const std::string ARROW_CHUNK_ROWS = "ARROW_CHUNK_ROWS"; const std::string PARQUET_CHUNK_SIZE = "PARQUET_CHUNK_SIZE"; const std::string PARQUET_MULTITHREADED_READ = "PARQUET_MULTITHREADED_READ"; const std::string USE_MMAP = "USE_MMAP"; @@ -60,6 +61,7 @@ namespace Options const std::string NM_DAY_TIME_INTERVAL = "day_time_interval"; const static std::set int_options = { + ARROW_CHUNK_ROWS, PARQUET_CHUNK_SIZE, PARQUET_MULTITHREADED_READ, USE_MMAP, diff --git a/src/TableData.cpp b/src/TableData.cpp index 5d84bf9..cab590f 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -63,6 +63,31 @@ std::vector> MakeArrays(std::shared_ptr> MakeChunkedArrays( + std::shared_ptr schema + , K array_data + , kx::arrowkdb::TypeMappingOverride& type_overrides ) +{ + if( array_data->t != 0 ) + throw kx::arrowkdb::TypeCheck( "array_data not mixed list" ); + if( array_data->n < schema->num_fields() ) + throw kx::arrowkdb::TypeCheck( "array_data length less than number of schema fields" ); + std::vector> chunked_arrays; + if( array_data->t == 0 && array_data->n == 0 ){ + // Empty table + } + else{ + // Only count up to the number of schema fields. 
Additional trailing data + // in the kdb mixed list is ignored (to allow for ::) + for( auto i = 0; i < schema->num_fields(); ++i ){ + auto k_array = kK( array_data )[i]; + chunked_arrays.push_back( kx::arrowkdb::MakeChunkedArray( schema->field(i)->type(), k_array, type_overrides ) ); + } + } + + return chunked_arrays; +} + // Create a an arrow table from the arrow schema and mixed list of kdb array objects std::shared_ptr MakeTable(std::shared_ptr schema, K array_data, kx::arrowkdb::TypeMappingOverride& type_overrides) { @@ -453,19 +478,44 @@ K writeArrow(K arrow_file, K schema_id, K array_data, K options) std::shared_ptr writer; PARQUET_ASSIGN_OR_THROW(writer, arrow::ipc::MakeFileWriter(outfile.get(), schema)); - auto arrays = MakeArrays(schema, array_data, type_overrides); + // Chunk size + read_options.GetIntOption( kx::arrowkdb::Options::ARROW_CHUNK_ROWS, type_overrides.chunk_length ); + + auto check_length = []( const auto& arrays ){ + // Check all arrays are same length + int64_t len = -1; + for (auto i : arrays) { + if (len == -1) + len = i->length(); + else if (len != i->length()) + return -1l; + } + + return len; + }; + + if( !type_overrides.chunk_length ){ // arrow not chunked + auto arrays = MakeArrays(schema, array_data, type_overrides); - // Check all arrays are same length - int64_t len = -1; - for (auto i : arrays) { - if (len == -1) - len = i->length(); - else if (len != i->length()) + auto len = check_length( arrays ); + if( len < 0 ){ return krr((S)"unequal length arrays"); + } + + auto batch = arrow::RecordBatch::Make(schema, len, arrays); + PARQUET_THROW_NOT_OK(writer->WriteRecordBatch(*batch)); } + else{ + auto chunked_arrays = MakeChunkedArrays( schema, array_data, type_overrides ); - auto batch = arrow::RecordBatch::Make(schema, len, arrays); - PARQUET_THROW_NOT_OK(writer->WriteRecordBatch(*batch)); + auto len = check_length( chunked_arrays ); + if( len < 0 ){ + return krr((S)"unequal length arrays"); + } + + auto table = arrow::Table::Make( 
schema, chunked_arrays ); + PARQUET_THROW_NOT_OK( writer->WriteTable( *table ) ); + } PARQUET_THROW_NOT_OK(writer->Close()); @@ -607,19 +657,44 @@ K serializeArrow(K schema_id, K array_data, K options) sink.reset(new arrow::io::BufferOutputStream(buffer)); PARQUET_ASSIGN_OR_THROW(writer, arrow::ipc::MakeStreamWriter(sink.get(), schema)); - auto arrays = MakeArrays(schema, array_data, type_overrides); + // Chunk size + read_options.GetIntOption( kx::arrowkdb::Options::ARROW_CHUNK_ROWS, type_overrides.chunk_length ); + + auto check_length = []( const auto& arrays ){ + // Check all arrays are same length + int64_t len = -1; + for (auto i : arrays) { + if (len == -1) + len = i->length(); + else if (len != i->length()) + return -1l; + } + + return len; + }; - // Check all arrays are same length - int64_t len = -1; - for (auto i : arrays) { - if (len == -1) - len = i->length(); - else if (len != i->length()) + if( !type_overrides.chunk_length ){ // arrow not chunked + auto arrays = MakeArrays(schema, array_data, type_overrides); + + auto len = check_length( arrays ); + if( len < 0 ){ return krr((S)"unequal length arrays"); + } + + auto batch = arrow::RecordBatch::Make(schema, len, arrays); + PARQUET_THROW_NOT_OK(writer->WriteRecordBatch(*batch)); } + else{ + auto chunked_arrays = MakeChunkedArrays( schema, array_data, type_overrides ); - auto batch = arrow::RecordBatch::Make(schema, len, arrays); - PARQUET_THROW_NOT_OK(writer->WriteRecordBatch(*batch)); + auto len = check_length( chunked_arrays ); + if( len < 0 ){ + return krr((S)"unequal length arrays"); + } + + auto table = arrow::Table::Make( schema, chunked_arrays ); + PARQUET_THROW_NOT_OK( writer->WriteTable( *table ) ); + } PARQUET_THROW_NOT_OK(writer->Close()); std::shared_ptr final_buffer; From 25c7c08070f5f60efeae6c269af00aa4b4310777 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 20 Feb 2023 19:45:08 +0300 Subject: [PATCH 083/276] Example of batching array data --- examples/batching_tables.q | 44 
++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 examples/batching_tables.q diff --git a/examples/batching_tables.q b/examples/batching_tables.q new file mode 100644 index 0000000..e98a87f --- /dev/null +++ b/examples/batching_tables.q @@ -0,0 +1,44 @@ +// batching_tables.q +// Examples of creating a schema supporting null mapping and using it to read/write parquet and arrow tables + +-1"\n+----------|| batching_tables.q ||----------+\n"; + +// import the arrowkdb library +\l q/arrowkdb.q + +// Filesystem functions for Linux/MacOS/Windows +ls:{[filename] $[.z.o like "w*";system "dir /b ",filename;system "ls ",filename]}; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +//-------------------// +// Create the table // +//-------------------// + +// Support batching of large tables + +// Create data for a large column in the table +batching_table:([]col:2147483652#0x00) +.arrowkdb.ts.writeReadArray[.arrowkdb.dt.int8[];batching_table`col;::] + +// Write the batching table data to a parquet file +batching_options:(``PARQUET_VERSION)!((::);`V2.0) + +parquet_batching:"batching_table.parquet"; +.arrowkdb.pq.writeParquetFromTable[parquet_batching;batching_table;batching_options] +show ls parquet_batching +//rm parquet_batching + +// Write the batching array data to an arrow file +batching_options[`ARROW_CHUNK_ROWS]:214748365 + +arrow_batching:"batching_table.arrow"; +.arrowkdb.ipc.writeArrowFromTable[arrow_batching;batching_table;batching_options] +show ls arrow_batching +//rm arrow_batching; + +// Serialize the batching array data to an arrow stream +serialized_batching:.arrowkdb.ipc.serializeArrowFromTable[batching_table;batching_options]; +show serialized_batching + + +-1 "\n+----------------------------------------+\n"; From 46e837947c733c4af05724c906b755cd2e98ac47 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 20 Feb 2023 20:54:26 +0300 Subject: [PATCH 084/276] Batching through 
numeric types --- src/ArrayWriter.cpp | 132 +++++++++++++++++++++++++++----------------- 1 file changed, 81 insertions(+), 51 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index adb73a7..1616d0a 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -433,16 +433,19 @@ void PopulateBuilder(shared_ptr datatype, K k_ template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto bool_builder = static_cast(builder); if( type_overrides.null_mapping.have_boolean ){ - std::vector null_bitmap( k_array->n ); - for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.boolean_null != static_cast( kG( k_array )[i] ); + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.boolean_null != static_cast( kG( k_array )[i+offset] ); } - PARQUET_THROW_NOT_OK( bool_builder->AppendValues( ( uint8_t* )kG( k_array ), k_array->n, null_bitmap ) ); + PARQUET_THROW_NOT_OK( bool_builder->AppendValues( ( uint8_t* )&kG( k_array )[offset], length, null_bitmap ) ); } else { - PARQUET_THROW_NOT_OK(bool_builder->AppendValues((uint8_t*)kG(k_array), k_array->n)); + PARQUET_THROW_NOT_OK( bool_builder->AppendValues( ( uint8_t* )&kG( k_array )[offset], length ) ); } } @@ -480,151 +483,178 @@ void PopulateBuilder(shared_ptr datatype, K PARQUET_THROW_NOT_OK( int8_builder->AppendValues( ( int8_t* )&kG( k_array )[offset], length, null_bitmap ) ); } else { - PARQUET_THROW_NOT_OK(int8_builder->AppendValues((int8_t*)&kG(k_array)[offset], length)); + PARQUET_THROW_NOT_OK( int8_builder->AppendValues( ( int8_t* )&kG( k_array )[offset], length ) ); } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = 
type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto uint16_builder = static_cast(builder); if( type_overrides.null_mapping.have_uint16 ){ - std::vector null_bitmap( k_array->n ); - for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.uint16_null != static_cast( kH( k_array )[i] ); + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.uint16_null != static_cast( kH( k_array )[i+offset] ); } - PARQUET_THROW_NOT_OK( uint16_builder->AppendValues( ( uint16_t* )kH( k_array ), k_array->n, null_bitmap ) ); + PARQUET_THROW_NOT_OK( uint16_builder->AppendValues( ( uint16_t* )&kH( k_array )[offset], length, null_bitmap ) ); } else { - PARQUET_THROW_NOT_OK(uint16_builder->AppendValues((uint16_t*)kH(k_array), k_array->n)); + PARQUET_THROW_NOT_OK( uint16_builder->AppendValues( ( uint16_t* )&kH( k_array )[offset], length ) ); } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto int16_builder = static_cast(builder); if( type_overrides.null_mapping.have_int16 ){ - std::vector null_bitmap( k_array->n ); - for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.int16_null != kH( k_array )[i]; + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.int16_null != kH( k_array )[i+offset]; } - PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )kH( k_array ), k_array->n, null_bitmap ) ); + PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )&kH( k_array )[offset], length, null_bitmap ) ); } else { - PARQUET_THROW_NOT_OK(int16_builder->AppendValues((int16_t*)kH(k_array), k_array->n)); + 
PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )&kH( k_array )[offset], length) ); } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto uint32_builder = static_cast(builder); if( type_overrides.null_mapping.have_uint32 ){ - std::vector null_bitmap( k_array->n ); - for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.uint32_null != static_cast( kI( k_array )[i] ); + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.uint32_null != static_cast( kI( k_array )[i+offset] ); } - PARQUET_THROW_NOT_OK( uint32_builder->AppendValues( ( uint32_t* )kI( k_array ), k_array->n, null_bitmap ) ); + PARQUET_THROW_NOT_OK( uint32_builder->AppendValues( ( uint32_t* )&kI( k_array )[offset], length, null_bitmap ) ); } else{ - PARQUET_THROW_NOT_OK(uint32_builder->AppendValues((uint32_t*)kI(k_array), k_array->n)); + PARQUET_THROW_NOT_OK( uint32_builder->AppendValues( ( uint32_t* )&kI( k_array )[offset], length ) ); } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto int32_builder = static_cast(builder); if( type_overrides.null_mapping.have_int32 ){ - std::vector null_bitmap( k_array->n ); - for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.int32_null != kI( k_array )[i]; + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.int32_null != kI( k_array )[i+offset]; } - PARQUET_THROW_NOT_OK( int32_builder->AppendValues( ( int32_t* )kI( k_array ), k_array->n, 
null_bitmap ) ); + PARQUET_THROW_NOT_OK( int32_builder->AppendValues( ( int32_t* )&kI( k_array )[offset], length, null_bitmap ) ); } else{ - PARQUET_THROW_NOT_OK(int32_builder->AppendValues((int32_t*)kI(k_array), k_array->n)); + PARQUET_THROW_NOT_OK( int32_builder->AppendValues( ( int32_t* )&kI( k_array )[offset], length ) ); } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto uint64_builder = static_cast(builder); if( type_overrides.null_mapping.have_uint64 ){ - std::vector null_bitmap( k_array->n ); - for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.uint64_null != static_cast( kJ( k_array )[i] ); + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.uint64_null != static_cast( kJ( k_array )[i+offset] ); } - PARQUET_THROW_NOT_OK( uint64_builder->AppendValues( ( uint64_t* )kJ( k_array ), k_array->n, null_bitmap ) ); + PARQUET_THROW_NOT_OK( uint64_builder->AppendValues( ( uint64_t* )&kJ( k_array )[offset], length, null_bitmap ) ); } else{ - PARQUET_THROW_NOT_OK(uint64_builder->AppendValues((uint64_t*)kJ(k_array), k_array->n)); + PARQUET_THROW_NOT_OK( uint64_builder->AppendValues( ( uint64_t* )&kJ( k_array )[offset], length ) ); } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto int64_builder = static_cast(builder); if( type_overrides.null_mapping.have_int64 ){ - std::vector null_bitmap( k_array->n ); - for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.int64_null != kJ( k_array )[i]; + std::vector null_bitmap( 
length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.int64_null != kJ( k_array )[i+offset]; } - PARQUET_THROW_NOT_OK( int64_builder->AppendValues( ( int64_t* )kJ( k_array ), k_array->n, null_bitmap ) ); + PARQUET_THROW_NOT_OK( int64_builder->AppendValues( ( int64_t* )&kJ( k_array )[offset], length, null_bitmap ) ); } else{ - PARQUET_THROW_NOT_OK(int64_builder->AppendValues((int64_t*)kJ(k_array), k_array->n)); + PARQUET_THROW_NOT_OK( int64_builder->AppendValues( ( int64_t* )&kJ( k_array )[offset], length ) ); } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto hfl_builder = static_cast(builder); if( type_overrides.null_mapping.have_float16 ){ - std::vector null_bitmap( k_array->n ); - for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.float16_null != static_cast( kH( k_array )[i] ); + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.float16_null != static_cast( kH( k_array )[i+offset] ); } - PARQUET_THROW_NOT_OK( hfl_builder->AppendValues( ( uint16_t* )kH( k_array ), k_array->n, null_bitmap ) ); + PARQUET_THROW_NOT_OK( hfl_builder->AppendValues( ( uint16_t* )&kH( k_array )[offset], length, null_bitmap ) ); } else { - PARQUET_THROW_NOT_OK(hfl_builder->AppendValues((uint16_t*)kH(k_array), k_array->n)); + PARQUET_THROW_NOT_OK( hfl_builder->AppendValues( ( uint16_t* )&kH( k_array )[offset], length ) ); } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto fl_builder = static_cast(builder); if( 
type_overrides.null_mapping.have_float32 ){ - std::vector null_bitmap( k_array->n ); - for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = !is_equal( type_overrides.null_mapping.float32_null, kE( k_array )[i] ); + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = !is_equal( type_overrides.null_mapping.float32_null, kE( k_array )[i+offset] ); } - PARQUET_THROW_NOT_OK( fl_builder->AppendValues( kE( k_array ), k_array->n, null_bitmap ) ); + PARQUET_THROW_NOT_OK( fl_builder->AppendValues( &kE( k_array )[offset], length, null_bitmap ) ); } else { - PARQUET_THROW_NOT_OK(fl_builder->AppendValues(kE(k_array), k_array->n)); + PARQUET_THROW_NOT_OK( fl_builder->AppendValues( &kE( k_array )[offset], length ) ); } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto dbl_builder = static_cast(builder); if( type_overrides.null_mapping.have_float64 ){ - std::vector null_bitmap( k_array->n ); - for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = !is_equal( type_overrides.null_mapping.float64_null, kF( k_array )[i] ); + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = !is_equal( type_overrides.null_mapping.float64_null, kF( k_array )[i+offset] ); } - PARQUET_THROW_NOT_OK( dbl_builder->AppendValues( kF( k_array ), k_array->n, null_bitmap ) ); + PARQUET_THROW_NOT_OK( dbl_builder->AppendValues( &kF( k_array )[offset], length, null_bitmap ) ); } else { - PARQUET_THROW_NOT_OK(dbl_builder->AppendValues(kF(k_array), k_array->n)); + PARQUET_THROW_NOT_OK( dbl_builder->AppendValues( &kF( k_array )[offset], length ) ); } } From b237ce92976e4f934a070f14994e273501f61d6a Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 20 Feb 2023 21:47:06 +0300 Subject: [PATCH 085/276] 
Batching through string types --- src/ArrayWriter.cpp | 53 +++++++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 19 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index 1616d0a..9a4520e 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -661,23 +661,26 @@ void PopulateBuilder(shared_ptr datatype, template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto str_builder = static_cast(builder); bool is_symbol = k_array->t == KS && (datatype->id() == arrow::Type::STRING || datatype->id() == arrow::Type::LARGE_STRING); if( is_symbol ){ // Populate from symbol list - for( auto i = 0ll; i < k_array->n; ++i ){ + for( auto i = 0ll; i < length; ++i ){ if( type_overrides.null_mapping.have_string - && type_overrides.null_mapping.string_null == kS( k_array )[i] ){ + && type_overrides.null_mapping.string_null == kS( k_array )[i+offset] ){ PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); } else{ - PARQUET_THROW_NOT_OK( str_builder->Append( kS( k_array )[i] ) ); + PARQUET_THROW_NOT_OK( str_builder->Append( kS( k_array )[i+offset] ) ); } } } else { // Populate from mixed list of char lists - for( auto i = 0ll; i < k_array->n; ++i ){ - K str_data = kK( k_array )[i]; + for( auto i = 0ll; i < length; ++i ){ + K str_data = kK( k_array )[i+offset]; TYPE_CHECK_ITEM( str_data->t != KC, datatype->ToString(), KC, str_data->t ); if( type_overrides.null_mapping.have_string && type_overrides.null_mapping.string_null.length() == static_cast( str_data->n ) @@ -694,23 +697,26 @@ void PopulateBuilder(shared_ptr datatype, template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t 
length = chunk.second; auto str_builder = static_cast(builder); bool is_symbol = k_array->t == KS && (datatype->id() == arrow::Type::STRING || datatype->id() == arrow::Type::LARGE_STRING); if( is_symbol ){ // Populate from symbol list - for( auto i = 0ll; i < k_array->n; ++i ){ + for( auto i = 0ll; i < length; ++i ){ if( type_overrides.null_mapping.have_large_string - && type_overrides.null_mapping.large_string_null == kS( k_array )[i] ){ + && type_overrides.null_mapping.large_string_null == kS( k_array )[i+offset] ){ PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); } else{ - PARQUET_THROW_NOT_OK( str_builder->Append( kS( k_array )[i] ) ); + PARQUET_THROW_NOT_OK( str_builder->Append( kS( k_array )[i+offset] ) ); } } } else { // Populate from mixed list of char lists - for( auto i = 0ll; i < k_array->n; ++i ){ - K str_data = kK( k_array )[i]; + for( auto i = 0ll; i < length; ++i ){ + K str_data = kK( k_array )[i+offset]; TYPE_CHECK_ITEM( str_data->t != KC, datatype->ToString(), KC, str_data->t ); if( type_overrides.null_mapping.have_large_string && type_overrides.null_mapping.large_string_null.length() == static_cast( str_data->n ) @@ -727,9 +733,12 @@ void PopulateBuilder(shared_ptr data template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto bin_builder = static_cast(builder); - for (auto i = 0; i < k_array->n; ++i) { - K bin_data = kK(k_array)[i]; + for( auto i = 0; i < length; ++i ){ + K bin_data = kK( k_array )[i+offset]; TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); if( type_overrides.null_mapping.have_binary && type_overrides.null_mapping.binary_null.length() == static_cast( bin_data->n ) @@ -745,9 +754,12 @@ void PopulateBuilder(shared_ptr datatype, template<> void PopulateBuilder(shared_ptr datatype, K k_array, 
arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto bin_builder = static_cast(builder); - for (auto i = 0; i < k_array->n; ++i) { - K bin_data = kK(k_array)[i]; + for( auto i = 0; i < length; ++i ){ + K bin_data = kK( k_array )[i+offset]; TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); if( type_overrides.null_mapping.have_large_binary && type_overrides.null_mapping.large_binary_null.length() == static_cast( bin_data->n ) @@ -763,22 +775,25 @@ void PopulateBuilder(shared_ptr data template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; bool is_guid = k_array->t == UU && datatype->id() == arrow::Type::FIXED_SIZE_BINARY && static_cast(builder)->byte_width() == sizeof(U); auto fixed_bin_builder = static_cast(builder); if (is_guid) { - for (auto i = 0; i < k_array->n; ++i){ + for (auto i = 0; i < length; ++i){ if( type_overrides.null_mapping.have_fixed_binary && type_overrides.null_mapping.fixed_binary_null.length() == sizeof( U ) - && !type_overrides.null_mapping.fixed_binary_null.compare( 0, sizeof( U ), &kU( k_array )[i].g[0], sizeof( U ) ) ){ + && !type_overrides.null_mapping.fixed_binary_null.compare( 0, sizeof( U ), &kU( k_array )[i+offset].g[0], sizeof( U ) ) ){ PARQUET_THROW_NOT_OK( fixed_bin_builder->AppendNull() ); } else{ - PARQUET_THROW_NOT_OK(fixed_bin_builder->Append((char*)&kU(k_array)[i])); + PARQUET_THROW_NOT_OK( fixed_bin_builder->Append( ( char* )&kU( k_array )[i+offset] ) ); } } } else { - for (auto i = 0; i < k_array->n; ++i) { - K bin_data = kK(k_array)[i]; + for (auto i = 0; i < length; ++i) { + K bin_data = kK(k_array)[i+offset]; TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, 
bin_data->t); TYPE_CHECK_LENGTH(fixed_bin_builder->byte_width() != bin_data->n, builder->type()->ToString(), fixed_bin_builder->byte_width(), bin_data->n); if( type_overrides.null_mapping.have_fixed_binary From 1da6aa5650f1abadce10decd3a264c0262a31e14 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 20 Feb 2023 22:19:18 +0300 Subject: [PATCH 086/276] Batching through temporal types --- src/ArrayWriter.cpp | 96 ++++++++++++++++++++++++++++++--------------- 1 file changed, 64 insertions(+), 32 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index 9a4520e..9ea8d68 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -781,7 +781,7 @@ void PopulateBuilder(shared_ptr bool is_guid = k_array->t == UU && datatype->id() == arrow::Type::FIXED_SIZE_BINARY && static_cast(builder)->byte_width() == sizeof(U); auto fixed_bin_builder = static_cast(builder); if (is_guid) { - for (auto i = 0; i < length; ++i){ + for( auto i = 0; i < length; ++i ){ if( type_overrides.null_mapping.have_fixed_binary && type_overrides.null_mapping.fixed_binary_null.length() == sizeof( U ) && !type_overrides.null_mapping.fixed_binary_null.compare( 0, sizeof( U ), &kU( k_array )[i+offset].g[0], sizeof( U ) ) ){ @@ -792,7 +792,7 @@ void PopulateBuilder(shared_ptr } } } else { - for (auto i = 0; i < length; ++i) { + for( auto i = 0; i < length; ++i ){ K bin_data = kK(k_array)[i+offset]; TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); TYPE_CHECK_LENGTH(fixed_bin_builder->byte_width() != bin_data->n, builder->type()->ToString(), fixed_bin_builder->byte_width(), bin_data->n); @@ -811,15 +811,18 @@ void PopulateBuilder(shared_ptr template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; TemporalConversion tc(datatype); auto d32_builder = 
static_cast(builder); - for (auto i = 0; i < k_array->n; ++i){ + for( auto i = 0; i < length; ++i ){ if( type_overrides.null_mapping.have_date32 - && type_overrides.null_mapping.date32_null == kI( k_array )[i] ){ + && type_overrides.null_mapping.date32_null == kI( k_array )[i+offset] ){ PARQUET_THROW_NOT_OK( d32_builder->AppendNull() ); } else{ - PARQUET_THROW_NOT_OK(d32_builder->Append(tc.KdbToArrow(kI(k_array)[i]))); + PARQUET_THROW_NOT_OK( d32_builder->Append( tc.KdbToArrow( kI( k_array )[i+offset] ) ) ); } } } @@ -827,86 +830,105 @@ void PopulateBuilder(shared_ptr datatype, template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; TemporalConversion tc(datatype); auto d64_builder = static_cast(builder); - for (auto i = 0; i < k_array->n; ++i) + for( auto i = 0; i < length; ++i ){ if( type_overrides.null_mapping.have_date64 - && type_overrides.null_mapping.date64_null == kJ( k_array )[i] ){ + && type_overrides.null_mapping.date64_null == kJ( k_array )[i+offset] ){ PARQUET_THROW_NOT_OK( d64_builder->AppendNull() ); } else{ - PARQUET_THROW_NOT_OK(d64_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); + PARQUET_THROW_NOT_OK( d64_builder->Append( tc.KdbToArrow( kJ( k_array )[i+offset] ) ) ); } + } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; TemporalConversion tc(datatype); auto ts_builder = static_cast(builder); auto timestamp_type = static_pointer_cast(datatype); - for (auto i = 0; i < k_array->n; ++i) + for( auto i = 0; i < length; ++i ){ if( type_overrides.null_mapping.have_timestamp - && type_overrides.null_mapping.timestamp_null == kJ( k_array )[i] ){ + && 
type_overrides.null_mapping.timestamp_null == kJ( k_array )[i+offset] ){ PARQUET_THROW_NOT_OK( ts_builder->AppendNull() ); } else{ - PARQUET_THROW_NOT_OK(ts_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); + PARQUET_THROW_NOT_OK( ts_builder->Append( tc.KdbToArrow( kJ( k_array )[i+offset] ) ) ); } + } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; TemporalConversion tc(datatype); auto t32_builder = static_cast(builder); auto time32_type = static_pointer_cast(datatype); - for (auto i = 0; i < k_array->n; ++i) + for( auto i = 0; i < length; ++i ){ if( type_overrides.null_mapping.have_time32 - && type_overrides.null_mapping.time32_null == kI( k_array )[i] ){ + && type_overrides.null_mapping.time32_null == kI( k_array )[i+offset] ){ PARQUET_THROW_NOT_OK( t32_builder->AppendNull() ); } else{ - PARQUET_THROW_NOT_OK(t32_builder->Append(tc.KdbToArrow(kI(k_array)[i]))); + PARQUET_THROW_NOT_OK( t32_builder->Append( tc.KdbToArrow( kI( k_array )[i+offset] ) ) ); } + } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; TemporalConversion tc(datatype); auto t64_builder = static_cast(builder); auto time64_type = static_pointer_cast(datatype); - for (auto i = 0; i < k_array->n; ++i) + for( auto i = 0; i < length; ++i ){ if( type_overrides.null_mapping.have_time64 - && type_overrides.null_mapping.time64_null == kJ( k_array )[i] ){ + && type_overrides.null_mapping.time64_null == kJ( k_array )[i+offset] ){ PARQUET_THROW_NOT_OK( t64_builder->AppendNull() ); } else{ - PARQUET_THROW_NOT_OK(t64_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); + PARQUET_THROW_NOT_OK( t64_builder->Append( 
tc.KdbToArrow( kJ( k_array )[i+offset] ) ) ); } + } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto dec_builder = static_cast(builder); auto dec_type = static_pointer_cast(datatype); - for (auto i = 0; i < k_array->n; ++i) { + for (auto i = 0; i < length; ++i) { if (type_overrides.decimal128_as_double) { if( type_overrides.null_mapping.have_decimal - && is_equal( type_overrides.null_mapping.decimal_null, kF( k_array )[i] ) ){ + && is_equal( type_overrides.null_mapping.decimal_null, kF( k_array )[i+offset] ) ){ PARQUET_THROW_NOT_OK( dec_builder->AppendNull() ); } else{ // Construct the decimal from a double arrow::Decimal128 dec128; - PARQUET_ASSIGN_OR_THROW(dec128, arrow::Decimal128::FromReal(kF(k_array)[i], dec_type->precision(), dec_type->scale())); + PARQUET_ASSIGN_OR_THROW(dec128, arrow::Decimal128::FromReal(kF(k_array)[i+offset], dec_type->precision(), dec_type->scale())); PARQUET_THROW_NOT_OK(dec_builder->Append(dec128)); } } else { // Each decimal is a list of 16 bytes - K k_dec = kK(k_array)[i]; + K k_dec = kK(k_array)[i+offset]; TYPE_CHECK_LENGTH(k_dec->n != 16, datatype->ToString(), 16, k_dec->n); TYPE_CHECK_ITEM(k_dec->t != KG, datatype->ToString(), KG, k_dec->t); @@ -919,46 +941,56 @@ void PopulateBuilder(shared_ptr datatype, template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; TemporalConversion tc(datatype); auto dur_builder = static_cast(builder); auto duration_type = static_pointer_cast(datatype); - for (auto i = 0; i < k_array->n; ++i) + for( auto i = 0; i < length; ++i ){ if( type_overrides.null_mapping.have_duration - && 
type_overrides.null_mapping.duration_null == kJ( k_array )[i] ){ + && type_overrides.null_mapping.duration_null == kJ( k_array )[i+offset] ){ PARQUET_THROW_NOT_OK( dur_builder->AppendNull() ); } else{ - PARQUET_THROW_NOT_OK(dur_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); + PARQUET_THROW_NOT_OK( dur_builder->Append( tc.KdbToArrow( kJ( k_array )[i+offset] ) ) ); } + } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto month_builder = static_cast(builder); if( type_overrides.null_mapping.have_month_interval ){ - std::vector null_bitmap( k_array->n ); - for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.month_interval_null != kI( k_array )[i]; + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.month_interval_null != kI( k_array )[i+offset]; } - PARQUET_THROW_NOT_OK( month_builder->AppendValues( ( int32_t* )kI( k_array ), k_array->n, null_bitmap ) ); + PARQUET_THROW_NOT_OK( month_builder->AppendValues( ( int32_t* )&kI( k_array )[offset], length, null_bitmap ) ); } else{ - PARQUET_THROW_NOT_OK(month_builder->AppendValues((int32_t*)kI(k_array), k_array->n)); + PARQUET_THROW_NOT_OK( month_builder->AppendValues( ( int32_t* )&kI( k_array )[offset], length ) ); } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto dt_builder = static_cast(builder); - for (auto i = 0; i < k_array->n; ++i){ + for (auto i = 0; i < length; ++i){ if( type_overrides.null_mapping.have_day_time_interval - && type_overrides.null_mapping.day_time_interval_null == kJ( k_array )[i] 
){ + && type_overrides.null_mapping.day_time_interval_null == kJ( k_array )[i+offset] ){ PARQUET_THROW_NOT_OK( dt_builder->AppendNull() ); } else{ - PARQUET_THROW_NOT_OK(dt_builder->Append(KTimespan_DayTimeInterval(kJ(k_array)[i]))); + PARQUET_THROW_NOT_OK( dt_builder->Append( KTimespan_DayTimeInterval( kJ( k_array )[i+offset] ) ) ); } } } From 192216cbaa232ae6045bb094a787665145fba2e1 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 24 Feb 2023 11:36:37 +0300 Subject: [PATCH 087/276] Example of nested nulls bitmap --- examples/null_bitmap.q | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index c8a02fd..afe805c 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -20,8 +20,9 @@ rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; // Support null mapping bitmap_opts:(`bool`int32`float64`string`date32)!(0b;1i;2.34;"start";2006.07.21); +nested_opts:(`uint16`float32`binary`time64)!(9h;8.76e;"x"$"alert";12:00:00.000000000); -options:(``NULL_MAPPING)!((::);bitmap_opts); +options:(``NULL_MAPPING)!((::);bitmap_opts,nested_opts); // Create the datatype identifiers ts_dt:.arrowkdb.dt.timestamp[`nano]; @@ -32,6 +33,12 @@ f64_dt:.arrowkdb.dt.float64[]; str_dt:.arrowkdb.dt.utf8[]; d32_dt:.arrowkdb.dt.date32[]; +ui16_dt:.arrowkdb.dt.uint16[]; + +f32_dt:.arrowkdb.dt.float32[]; +bin_dt:.arrowkdb.dt.binary[]; +t64_dt:.arrowkdb.dt.time64[`nano]; + // Create the field identifiers ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; @@ -41,12 +48,37 @@ f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; str_fd:.arrowkdb.fd.field[`string;str_dt]; d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; +ui16_fd:.arrowkdb.fd.field[`uint16;ui16_dt]; + +f32_fd:.arrowkdb.fd.field[`float32;f32_dt]; +bin_fd:.arrowkdb.fd.field[`binary;bin_dt]; +t64_fd:.arrowkdb.fd.field[`time64;t64_dt]; + +// Create a list datatype, using the uint16 datatype as its child 
+list_dt:.arrowkdb.dt.list[ui16_dt]; +.arrowkdb.dt.printDatatype[list_dt] + +// Create a field containing the list datatype +list_fd:.arrowkdb.fd.field[`list_field;list_dt]; + +// Create a struct datatype using the float32, binary and time64 fields as its children +struct_dt:.arrowkdb.dt.struct[(f32_fd,bin_dt,t64_dt)]; + +// Create a field containing the struct datatype +struct_fd:.arrowkdb.fd.field[`struct_field;struct_dt]; +.arrowkdb.dt.printDatatype[struct_dt] + // Create the schemas for the list of fields bitmap_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,i32_fd,f64_fd,str_fd,d32_fd)]; +// Create the schema containing the list and struct fields +nested_schema:.arrowkdb.sc.schema[(ts_fd,struct_dt)]; + // Print the schema .arrowkdb.sc.printSchema[bitmap_schema]; +.arrowkdb.sc.printSchema[nested_schema]; + // Create data for each column in the table ts_data:asc N?0p; From e08b456d528fa6b133dca27c7e28b2ca4780afad Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 24 Feb 2023 12:59:29 +0300 Subject: [PATCH 088/276] Example of writing of nested parquet --- examples/null_bitmap.q | 66 +++++++++++++++++++++++++++++++++++------- 1 file changed, 56 insertions(+), 10 deletions(-) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index afe805c..20af51a 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -20,7 +20,7 @@ rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; // Support null mapping bitmap_opts:(`bool`int32`float64`string`date32)!(0b;1i;2.34;"start";2006.07.21); -nested_opts:(`uint16`float32`binary`time64)!(9h;8.76e;"x"$"alert";12:00:00.000000000); +nested_opts:(`uint16`float32`binary`time64)!(9h;8.76e;"x"$"acknowledge";00:00:00.123456789); options:(``NULL_MAPPING)!((::);bitmap_opts,nested_opts); @@ -56,7 +56,6 @@ t64_fd:.arrowkdb.fd.field[`time64;t64_dt]; // Create a list datatype, using the uint16 datatype as its child list_dt:.arrowkdb.dt.list[ui16_dt]; -.arrowkdb.dt.printDatatype[list_dt] // 
Create a field containing the list datatype list_fd:.arrowkdb.fd.field[`list_field;list_dt]; @@ -66,17 +65,18 @@ struct_dt:.arrowkdb.dt.struct[(f32_fd,bin_dt,t64_dt)]; // Create a field containing the struct datatype struct_fd:.arrowkdb.fd.field[`struct_field;struct_dt]; -.arrowkdb.dt.printDatatype[struct_dt] // Create the schemas for the list of fields bitmap_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,i32_fd,f64_fd,str_fd,d32_fd)]; // Create the schema containing the list and struct fields -nested_schema:.arrowkdb.sc.schema[(ts_fd,struct_dt)]; +nested_schema:.arrowkdb.sc.schema[(list_fd,struct_dt)]; // Print the schema +-1"\nBitmap schema:"; .arrowkdb.sc.printSchema[bitmap_schema]; +-1"\nNested schema:"; .arrowkdb.sc.printSchema[nested_schema]; // Create data for each column in the table @@ -93,12 +93,34 @@ str_data[3]:"start" d32_data:N?(2006.07.21;2005.07.18;2004.07.16;2003.07.15;2002.07.11); d32_data[4]:2006.07.21; +// Create the data for each of the struct child fields +f32_data:3?100e; +f32_data[0]:8.76e; +bin_data:3?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); +bin_data[1]:"x"$"acknowledge" +t64_data:3?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); +t64_data[2]:00:00:00.123456789; + // Combine the data for all columns bitmap_data:(ts_data;bool_data;i32_data;f64_data;str_data;d32_data); +// Create the data for the list array +list_data:(enlist (9h);(8h;7h);(6h;5h;4h)); + +// Create the data for the struct array from its child arrays +struct_data:(f32_data;bin_data;t64_data); + +// Combine the array data for the list and struct columns +nested_data:(list_data;struct_data); + // Pretty print the Arrow table populated from the bitmap data +-1"\nBitmap table:"; .arrowkdb.tb.prettyPrintTable[bitmap_schema;bitmap_data;options]; +// Show the array data as an arrow table +-1"\nNested table:"; +.arrowkdb.tb.prettyPrintTable[nested_schema;nested_data;options] + //-------------------------// // 
Example-1. Parquet file // //-------------------------// @@ -106,20 +128,44 @@ bitmap_data:(ts_data;bool_data;i32_data;f64_data;str_data;d32_data); // Write the schema and array data to a parquet file options[`PARQUET_VERSION]:`V2.0; -parquet_bitmap:"null_bitmap.parquet"; -.arrowkdb.pq.writeParquet[parquet_bitmap;bitmap_schema;bitmap_data;options]; -show ls parquet_bitmap +parquet_null_bitmap:"null_bitmap.parquet"; +parquet_nested_bitmap:"nested_bitmap.parquet"; + +.arrowkdb.pq.writeParquet[parquet_null_bitmap;bitmap_schema;bitmap_data;options]; +.arrowkdb.pq.writeParquet[parquet_nested_bitmap;nested_schema;nested_data;options]; + +show ls parquet_null_bitmap +show ls parquet_nested_bitmap // Read the array data back and compare options[`WITH_NULL_BITMAP]:1; -parquet_bitmap_data:.arrowkdb.pq.readParquetData[parquet_bitmap;options]; + +// Read the schema back and compare +parquet_bitmap_schema:.arrowkdb.pq.readParquetSchema[parquet_null_bitmap]; +parquet_nested_schema:.arrowkdb.pq.readParquetSchema[parquet_nested_bitmap]; + +show .arrowkdb.sc.equalSchemas[bitmap_schema;parquet_bitmap_schema] +show .arrowkdb.sc.equalSchemas[nested_schema;parquet_nested_schema] + +show bitmap_schema~parquet_bitmap_schema +show nested_schema~parquet_nested_schema + +parquet_bitmap_data:.arrowkdb.pq.readParquetData[parquet_null_bitmap;options]; +parquet_nested_data:.arrowkdb.pq.readParquetData[parquet_nested_bitmap;options]; + show bitmap_data~first parquet_bitmap_data +show nested_data~first parquet_nested_data nulls_data:1b,(N-1)?1b; bitmap_nulls:{x rotate nulls_data} each neg til {x-1} count bitmap_data; + parquet_bitmap_nulls:last parquet_bitmap_data; +parquet_nested_nulls:last parquet_nested_data; + show bitmap_nulls~bitmap_nulls & sublist[{1-x} count parquet_bitmap_nulls;parquet_bitmap_nulls] -rm parquet_bitmap; + +rm parquet_null_bitmap; +rm parquet_nested_bitmap; //---------------------------// // Example-2. 
Arrow IPC file // @@ -166,4 +212,4 @@ show bitmap_nulls~bitmap_nulls & sublist[{1-x} count stream_bitmap_nulls;stream_ -1 "\n+----------------------------------------+\n"; // Process off -exit 0; +//exit 0; From 9f0c7fbb615db2ecff50a56b1b2d1b72fc7a8aea Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 24 Feb 2023 13:31:03 +0300 Subject: [PATCH 089/276] Example of writing of nested arrow file --- examples/null_bitmap.q | 53 +++++++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index 20af51a..f7be20d 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -172,44 +172,75 @@ rm parquet_nested_bitmap; //---------------------------// // Write the schema and array data to an arrow file -arrow_bitmap:"null_bitmap.arrow"; -.arrowkdb.ipc.writeArrow[arrow_bitmap;bitmap_schema;bitmap_data;options]; -show ls arrow_bitmap +arrow_null_bitmap:"null_bitmap.arrow"; +arrow_nested_bitmap:"nested_bitmap.arrow"; + +.arrowkdb.ipc.writeArrow[arrow_null_bitmap;bitmap_schema;bitmap_data;options]; +.arrowkdb.ipc.writeArrow[arrow_nested_bitmap;nested_schema;nested_data;options]; + +show ls arrow_null_bitmap +show ls arrow_nested_bitmap // Read the schema back and compare -arrow_bitmap_schema:.arrowkdb.ipc.readArrowSchema[arrow_bitmap]; +arrow_bitmap_schema:.arrowkdb.ipc.readArrowSchema[arrow_null_bitmap]; +arrow_nested_schema:.arrowkdb.ipc.readArrowSchema[arrow_nested_bitmap]; + show .arrowkdb.sc.equalSchemas[bitmap_schema;arrow_bitmap_schema] +show .arrowkdb.sc.equalSchemas[nested_schema;arrow_nested_schema] + show bitmap_schema~arrow_bitmap_schema +show nested_schema~arrow_nested_schema // Read the array data back and compare -arrow_bitmap_data:.arrowkdb.ipc.readArrowData[arrow_bitmap;options]; +arrow_bitmap_data:.arrowkdb.ipc.readArrowData[arrow_null_bitmap;options]; +arrow_nested_data:.arrowkdb.ipc.readArrowData[arrow_nested_bitmap;options]; + show bitmap_data~first 
arrow_bitmap_data +show nested_data~first arrow_nested_data + arrow_bitmap_nulls:last arrow_bitmap_data; +arrow_nested_nulls:last arrow_nested_data; + show bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bitmap_nulls] -rm arrow_bitmap; + +rm arrow_null_bitmap; +rm arrow_nested_bitmap; //-----------------------------// // Example-3. Arrow IPC stream // //-----------------------------// // Serialize the schema and array data to an arrow stream -serialized_bitmap:.arrowkdb.ipc.serializeArrow[bitmap_schema;bitmap_data;options]; -show serialized_bitmap +serialized_null_bitmap:.arrowkdb.ipc.serializeArrow[bitmap_schema;bitmap_data;options]; +serialized_nested_bitmap:.arrowkdb.ipc.serializeArrow[nested_schema;nested_data;options]; + +show serialized_null_bitmap +show serialized_nested_bitmap // Parse the schema back abd compare -stream_bitmap_schema:.arrowkdb.ipc.parseArrowSchema[serialized_bitmap]; +stream_bitmap_schema:.arrowkdb.ipc.parseArrowSchema[serialized_null_bitmap]; +stream_nested_schema:.arrowkdb.ipc.parseArrowSchema[serialized_nested_bitmap]; + show .arrowkdb.sc.equalSchemas[bitmap_schema;stream_bitmap_schema] +show .arrowkdb.sc.equalSchemas[nested_schema;stream_nested_schema] + show bitmap_schema~stream_bitmap_schema +show nested_schema~stream_nested_schema // Parse the array data back and compare -stream_bitmap_data:.arrowkdb.ipc.parseArrowData[serialized_bitmap;options]; +stream_bitmap_data:.arrowkdb.ipc.parseArrowData[serialized_null_bitmap;options]; +stream_nested_data:.arrowkdb.ipc.parseArrowData[serialized_nested_bitmap;options]; + show bitmap_data~first stream_bitmap_data +show nested_data~first stream_nested_data stream_bitmap_nulls:last stream_bitmap_data; +stream_nested_nulls:last stream_nested_data; + show bitmap_nulls~bitmap_nulls & sublist[{1-x} count stream_bitmap_nulls;stream_bitmap_nulls] -1 "\n+----------------------------------------+\n"; // Process off -//exit 0; +exit 0; From 
b24367d6a6c1c783bd0c404142c49d52c0b8ca1d Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 27 Feb 2023 21:39:24 +0300 Subject: [PATCH 090/276] Example of nested null bitmap validation --- examples/null_bitmap.q | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index f7be20d..687b6fa 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -158,11 +158,16 @@ show nested_data~first parquet_nested_data nulls_data:1b,(N-1)?1b; bitmap_nulls:{x rotate nulls_data} each neg til {x-1} count bitmap_data; +nested_list_nulls:((::;1b);(::;0b;0b);(::;0b;0b;0b)) +nested_struct_nulls:((::;1b;0b;0b);(::;0b;1b;0b);(::;0b;0b;1b)) parquet_bitmap_nulls:last parquet_bitmap_data; -parquet_nested_nulls:last parquet_nested_data; +parquet_list_nulls:first parquet_nested_data[1] +parquet_struct_nulls:last parquet_nested_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count parquet_bitmap_nulls;parquet_bitmap_nulls] +nested_list_nulls~{(::),x} each parquet_list_nulls +nested_struct_nulls~{(::),x} each parquet_struct_nulls rm parquet_null_bitmap; rm parquet_nested_bitmap; @@ -199,9 +204,12 @@ show bitmap_data~first arrow_bitmap_data show nested_data~first arrow_nested_data arrow_bitmap_nulls:last arrow_bitmap_data; -arrow_nested_nulls:last arrow_nested_data; +arrow_list_nulls:first parquet_nested_data[1] +arrow_struct_nulls:last parquet_nested_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bitmap_nulls] +nested_list_nulls~{(::),x} each arrow_list_nulls +nested_struct_nulls~{(::),x} each arrow_struct_nulls rm arrow_null_bitmap; rm arrow_nested_bitmap; @@ -235,10 +243,12 @@ show bitmap_data~first stream_bitmap_data show nested_data~first stream_nested_data stream_bitmap_nulls:last stream_bitmap_data; -stream_nested_nulls:last stream_nested_data; +stream_list_nulls:first parquet_nested_data[1] +stream_struct_nulls:last parquet_nested_data[1] show 
bitmap_nulls~bitmap_nulls & sublist[{1-x} count stream_bitmap_nulls;stream_bitmap_nulls] - +nested_list_nulls~{(::),x} each stream_list_nulls +nested_struct_nulls~{(::),x} each stream_struct_nulls -1 "\n+----------------------------------------+\n"; From 58a4e862ba03482ac5d6a1b183f1daa716ef87de Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 27 Feb 2023 21:46:16 +0300 Subject: [PATCH 091/276] Recurse through constituents of nested types --- src/ArrayReader.cpp | 114 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 97 insertions(+), 17 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 16c1043..72d3636 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -616,6 +616,90 @@ unordered_map ArrayHandlers { , make_array_handler() }; +template +K MakeNullBitmap( shared_ptr array_data, size_t& index ); + +template<> +K MakeNullBitmap( shared_ptr array_data, size_t& index ) +{ + auto slice_array = static_pointer_cast( array_data )->value_slice( index ); + + size_t counter = 0; + auto length = slice_array->length(); + K k_bitmap = knk( length ); + auto slice = slice_array->Slice( 0, length ); + AppendNullBitmap( slice, k_bitmap, counter ); + + return k_bitmap; +} + +template<> +K MakeNullBitmap( shared_ptr array_data, size_t& index ) +{ + return nullptr; +} + +template<> +K MakeNullBitmap( shared_ptr array_data, size_t& index ) +{ + return nullptr; +} + +template<> +K MakeNullBitmap( shared_ptr array_data, size_t& index ) +{ + return nullptr; +} + +template<> +K MakeNullBitmap( shared_ptr array_data, size_t& index ) +{ + auto struct_array = static_pointer_cast( array_data ); + + size_t counter = 0; + auto num_fields = struct_array->type()->num_fields(); + K k_bitmap = knk( num_fields ); + auto field = struct_array->field( index ); + AppendNullBitmap( field, k_bitmap, counter ); + + return k_bitmap; +} + +template<> +K MakeNullBitmap( shared_ptr array_data, size_t& index ) +{ + return nullptr; +} + +template<> +K 
MakeNullBitmap( shared_ptr array_data, size_t& index ) +{ + return nullptr; +} + +template<> +K MakeNullBitmap( shared_ptr array_data, size_t& index ) +{ + return nullptr; +} + +template +auto make_null_bitmap_handler() +{ + return make_pair( TypeId, &MakeNullBitmap ); +} + +unordered_map array_data, size_t& index )>> null_bitmap_handlers{ + make_null_bitmap_handler() + , make_null_bitmap_handler() + , make_null_bitmap_handler() + , make_null_bitmap_handler() + , make_null_bitmap_handler() + , make_null_bitmap_handler() + , make_null_bitmap_handler() + , make_null_bitmap_handler() +}; + } // namespace namespace kx { @@ -637,23 +721,18 @@ void AppendArray(shared_ptr array_data, K k_array, size_t& index, void AppendNullBitmap( shared_ptr array_data, K k_bitmap, size_t& index ) { auto type_id = array_data->type_id(); - if( array_data->null_count() == 0 - || arrow::Type::LIST == type_id - || arrow::Type::LARGE_LIST == type_id - || arrow::Type::FIXED_SIZE_LIST == type_id - || arrow::Type::MAP == type_id - || arrow::Type::STRUCT == type_id - || arrow::Type::SPARSE_UNION == type_id - || arrow::Type::DENSE_UNION == type_id - || arrow::Type::DICTIONARY == type_id ){ - memset( &kG( k_bitmap )[index], 0, array_data->length() ); - index += array_data->length(); - } - else{ - for( auto i = 0; i < array_data->length(); ++i ){ - kG( k_bitmap )[index] = array_data->IsNull( index ); - ++index; + auto length = array_data->length(); + for( auto i = 0; i < length; ++i ){ + if( array_data->IsNull( i ) ){ + kK( k_bitmap )[index] = kb( true ); + } + else if( null_bitmap_handlers.find( type_id ) != null_bitmap_handlers.end() ){ + kK( k_bitmap )[index] = null_bitmap_handlers[type_id]( array_data, index ); } + else{ + kK( k_bitmap )[index] = kb( false ); + } + ++index; } } @@ -722,7 +801,8 @@ K ReadChunkedArray(shared_ptr chunked_array, TypeMappingOve K ReadChunkedNullBitmap( shared_ptr chunked_array, TypeMappingOverride& type_overrides ) { auto boolean = std::make_shared(); - K k_bitmap = 
InitKdbForArray( boolean, chunked_array->length(), type_overrides ); + K k_bitmap = knk( chunked_array->length() ); + size_t index = 0; for( auto i = 0; i < chunked_array->num_chunks(); ++i ){ AppendNullBitmap( chunked_array->chunk( i ), k_bitmap, index ); From d0fcbcd9555236633b2e445f3c612da3604d6339 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 28 Feb 2023 11:52:05 +0300 Subject: [PATCH 092/276] Unit-tests of nested null bitmap for Travis CI --- tests/.gitignore | 1 + tests/nested_null_bitmap.t | 142 +++++++++++++++++++++++++++++++++++++ 2 files changed, 143 insertions(+) create mode 100644 tests/nested_null_bitmap.t diff --git a/tests/.gitignore b/tests/.gitignore index 9efe47e..d96539e 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -1,5 +1,6 @@ basic.q null_bitmap.q +nested_null_bitmap.q null_mapping_short.q null_mapping_long.q null_mapping_float.q diff --git a/tests/nested_null_bitmap.t b/tests/nested_null_bitmap.t new file mode 100644 index 0000000..0b7b037 --- /dev/null +++ b/tests/nested_null_bitmap.t @@ -0,0 +1,142 @@ +// nested_null_bitmap.t + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l q/arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping ||----------+\n"; +nested_opts:(`uint16`float32`binary`time64)!(9h;8.76e;"x"$"acknowledge";00:00:00.123456789); + +options:(``NULL_MAPPING)!((::);nested_opts); + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +ui16_dt:.arrowkdb.dt.uint16[]; + +f32_dt:.arrowkdb.dt.float32[]; +bin_dt:.arrowkdb.dt.binary[]; +t64_dt:.arrowkdb.dt.time64[`nano]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +ui16_fd:.arrowkdb.fd.field[`uint16;ui16_dt]; + +f32_fd:.arrowkdb.fd.field[`float32;f32_dt]; 
+bin_fd:.arrowkdb.fd.field[`binary;bin_dt]; +t64_fd:.arrowkdb.fd.field[`time64;t64_dt]; + +-1"\n+----------|| Create a list datatype, using the uint16 datatype as its child ||----------+\n"; +list_dt:.arrowkdb.dt.list[ui16_dt]; + +-1"\n+----------|| Create a field containing the list datatype ||----------+\n"; +list_fd:.arrowkdb.fd.field[`list_field;list_dt]; + +-1"\n+----------|| Create a struct datatype using the float32, binary and time64 fields as its children ||----------+\n"; +struct_dt:.arrowkdb.dt.struct[(f32_fd,bin_dt,t64_dt)]; + +-1"\n+----------|| Create a field containing the struct datatype ||----------+\n"; +struct_fd:.arrowkdb.fd.field[`struct_field;struct_dt]; + +-1"\n+----------|| Create the schema containing the list and struct fields ||----------+\n"; +nested_schema:.arrowkdb.sc.schema[(list_fd,struct_dt)]; + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +-1"\n+----------|| Create the data for each of the struct child fields ||----------+\n"; +f32_data:3?100e; +f32_data[0]:8.76e; +bin_data:3?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); +bin_data[1]:"x"$"acknowledge" +t64_data:3?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); +t64_data[2]:00:00:00.123456789; + +-1"\n+----------|| Create the data for the list array ||----------+\n"; +list_data:(enlist (9h);(8h;7h);(6h;5h;4h)); + +-1"\n+----------|| Create the data for the struct array from its child arrays ||----------+\n"; +struct_data:(f32_data;bin_data;t64_data); + +-1"\n+----------|| Combine the array data for the list and struct columns ||----------+\n"; +nested_data:(list_data;struct_data); + +-1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n"; +options[`PARQUET_VERSION]:`V2.0; + +parquet_nested_bitmap:"nested_bitmap.parquet"; +.arrowkdb.pq.writeParquet[parquet_nested_bitmap;nested_schema;nested_data;options]; + +-1"\n+----------|| Read 
the array data back and compare ||----------+\n"; +options[`WITH_NULL_BITMAP]:1; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +parquet_nested_schema:.arrowkdb.pq.readParquetSchema[parquet_nested_bitmap]; +.arrowkdb.sc.equalSchemas[nested_schema;parquet_nested_schema] +nested_schema~parquet_nested_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +parquet_nested_data:.arrowkdb.pq.readParquetData[parquet_nested_bitmap;options]; +nested_data~first parquet_nested_data + +-1"\n+----------|| Compare nested null bitmaps ||----------+\n"; +nested_list_nulls:((::;1b);(::;0b;0b);(::;0b;0b;0b)) +nested_struct_nulls:((::;1b;0b;0b);(::;0b;1b;0b);(::;0b;0b;1b)) + +parquet_list_nulls:first parquet_nested_data[1] +parquet_struct_nulls:last parquet_nested_data[1] +nested_list_nulls~{(::),x} each parquet_list_nulls +nested_struct_nulls~{(::),x} each parquet_struct_nulls + +rm parquet_nested_bitmap; + +-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; +arrow_nested_bitmap:"nested_bitmap.arrow"; +.arrowkdb.ipc.writeArrow[arrow_nested_bitmap;nested_schema;nested_data;options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +arrow_nested_schema:.arrowkdb.ipc.readArrowSchema[arrow_nested_bitmap]; +.arrowkdb.sc.equalSchemas[nested_schema;arrow_nested_schema] +nested_schema~arrow_nested_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +arrow_nested_data:.arrowkdb.ipc.readArrowData[arrow_nested_bitmap;options]; +nested_data~first arrow_nested_data + +-1"\n+----------|| Compare nested null bitmaps ||----------+\n"; +arrow_list_nulls:first parquet_nested_data[1] +arrow_struct_nulls:last parquet_nested_data[1] +nested_list_nulls~{(::),x} each arrow_list_nulls +nested_struct_nulls~{(::),x} each arrow_struct_nulls + +rm arrow_nested_bitmap; + +-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; 
+serialized_nested_bitmap:.arrowkdb.ipc.serializeArrow[nested_schema;nested_data;options]; + +-1"\n+----------|| Parse the schema back abd compare ||----------+\n"; +stream_nested_schema:.arrowkdb.ipc.parseArrowSchema[serialized_nested_bitmap]; +.arrowkdb.sc.equalSchemas[nested_schema;stream_nested_schema] +nested_schema~stream_nested_schema + +-1"\n+----------|| Parse the array data back and compare ||----------+\n"; +stream_nested_data:.arrowkdb.ipc.parseArrowData[serialized_nested_bitmap;options]; +nested_data~first stream_nested_data + +-1"\n+----------|| Compare nested null bitmaps ||----------+\n"; +stream_list_nulls:first parquet_nested_data[1] +stream_struct_nulls:last parquet_nested_data[1] +nested_list_nulls~{(::),x} each stream_list_nulls +nested_struct_nulls~{(::),x} each stream_struct_nulls + + +-1 "\n+----------|| Test utils ||----------+\n"; + +.arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; From 9a0f937692a0e4cbc147ab4880ca326df70c67be Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 28 Feb 2023 12:34:06 +0300 Subject: [PATCH 093/276] Break null bitmap traversal of lists for simple types --- src/ArrayReader.cpp | 101 +++++++++++++++++++++++++------------------- src/ArrayReader.h | 31 +++++++------- 2 files changed, 72 insertions(+), 60 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 72d3636..a8ef3d1 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -616,80 +616,93 @@ unordered_map ArrayHandlers { , make_array_handler() }; +using BitmapHandler = K (*) (shared_ptr array_data, size_t& index ); + +extern unordered_map null_bitmap_handlers; + template -K MakeNullBitmap( shared_ptr array_data, size_t& index ); +K AppendNullBitmap( shared_ptr array_data, size_t& index ); template<> -K MakeNullBitmap( shared_ptr array_data, size_t& index ) +K AppendNullBitmap( shared_ptr array_data, size_t& index ) { - auto slice_array = 
static_pointer_cast( array_data )->value_slice( index ); + K k_bitmap = nullptr; + auto slice_array = static_pointer_cast( array_data )->value_slice( index ); + auto length = slice_array->length(); + if( null_bitmap_handlers.find( slice_array->type_id() ) == null_bitmap_handlers.end() ){ + k_bitmap = ktn( KB, length ); + for( int i = 0ll; i < length; ++i ){ + kG( k_bitmap )[i] = slice_array->IsNull( i ); + } + } + else{ size_t counter = 0; - auto length = slice_array->length(); - K k_bitmap = knk( length ); + k_bitmap = knk( length ); auto slice = slice_array->Slice( 0, length ); - AppendNullBitmap( slice, k_bitmap, counter ); + InitKdbNullBitmap( slice, k_bitmap, counter ); + } - return k_bitmap; + return k_bitmap; } template<> -K MakeNullBitmap( shared_ptr array_data, size_t& index ) +K AppendNullBitmap( shared_ptr array_data, size_t& index ) { - return nullptr; + return nullptr; } template<> -K MakeNullBitmap( shared_ptr array_data, size_t& index ) +K AppendNullBitmap( shared_ptr array_data, size_t& index ) { - return nullptr; + return nullptr; } template<> -K MakeNullBitmap( shared_ptr array_data, size_t& index ) +K AppendNullBitmap( shared_ptr array_data, size_t& index ) { - return nullptr; + return nullptr; } template<> -K MakeNullBitmap( shared_ptr array_data, size_t& index ) +K AppendNullBitmap( shared_ptr array_data, size_t& index ) { - auto struct_array = static_pointer_cast( array_data ); + auto struct_array = static_pointer_cast( array_data ); - size_t counter = 0; - auto num_fields = struct_array->type()->num_fields(); - K k_bitmap = knk( num_fields ); - auto field = struct_array->field( index ); - AppendNullBitmap( field, k_bitmap, counter ); + size_t counter = 0; + auto num_fields = struct_array->type()->num_fields(); + K k_bitmap = knk( num_fields ); + auto field = struct_array->field( index ); + InitKdbNullBitmap( field, k_bitmap, counter ); - return k_bitmap; + return k_bitmap; } template<> -K MakeNullBitmap( shared_ptr array_data, size_t& index ) 
+K AppendNullBitmap( shared_ptr array_data, size_t& index ) { - return nullptr; + return nullptr; } template<> -K MakeNullBitmap( shared_ptr array_data, size_t& index ) +K AppendNullBitmap( shared_ptr array_data, size_t& index ) { - return nullptr; + return nullptr; } template<> -K MakeNullBitmap( shared_ptr array_data, size_t& index ) +K AppendNullBitmap( shared_ptr array_data, size_t& index ) { - return nullptr; + return nullptr; } template auto make_null_bitmap_handler() { - return make_pair( TypeId, &MakeNullBitmap ); + return make_pair( TypeId, &AppendNullBitmap ); } -unordered_map array_data, size_t& index )>> null_bitmap_handlers{ +unordered_map null_bitmap_handlers{ make_null_bitmap_handler() , make_null_bitmap_handler() , make_null_bitmap_handler() @@ -718,22 +731,22 @@ void AppendArray(shared_ptr array_data, K k_array, size_t& index, } } -void AppendNullBitmap( shared_ptr array_data, K k_bitmap, size_t& index ) +void InitKdbNullBitmap( shared_ptr array_data, K k_bitmap, size_t& index ) { - auto type_id = array_data->type_id(); - auto length = array_data->length(); - for( auto i = 0; i < length; ++i ){ - if( array_data->IsNull( i ) ){ - kK( k_bitmap )[index] = kb( true ); - } - else if( null_bitmap_handlers.find( type_id ) != null_bitmap_handlers.end() ){ - kK( k_bitmap )[index] = null_bitmap_handlers[type_id]( array_data, index ); - } - else{ - kK( k_bitmap )[index] = kb( false ); - } - ++index; + auto type_id = array_data->type_id(); + auto length = array_data->length(); + for( auto i = 0; i < length; ++i ){ + if( array_data->IsNull( i ) ){ + kK( k_bitmap )[index] = kb( true ); } + else if( null_bitmap_handlers.find( type_id ) != null_bitmap_handlers.end() ){ + kK( k_bitmap )[index] = null_bitmap_handlers[type_id]( array_data, index ); + } + else{ + kK( k_bitmap )[index] = kb( false ); + } + ++index; + } } K InitKdbForArray(shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides) @@ -805,7 +818,7 @@ K ReadChunkedNullBitmap( shared_ptr 
chunked_array, TypeMapp size_t index = 0; for( auto i = 0; i < chunked_array->num_chunks(); ++i ){ - AppendNullBitmap( chunked_array->chunk( i ), k_bitmap, index ); + InitKdbNullBitmap( chunked_array->chunk( i ), k_bitmap, index ); } return k_bitmap; diff --git a/src/ArrayReader.h b/src/ArrayReader.h index 16c72b5..9ed0386 100644 --- a/src/ArrayReader.h +++ b/src/ArrayReader.h @@ -26,22 +26,6 @@ namespace arrowkdb { */ void AppendArray(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides); - -/** - * @brief Appends null bitmap data from an arrow array into an existing kdb boolean - * list starting at the specified index. - * - * @param array_data The arrow array from which to source the data. The entire - * array will be appended. - * @param k_bitmap The kdb boolean list that the data should be inserted into. - * This list needs to have been created with the correct length by the calling - * function. - * @param index The index into the kdb list at which the appending should - * begin. Index will be updated to account for the new offset by adding the - * length of the array array. -*/ -void AppendNullBitmap( std::shared_ptr array_data, K k_bitmap, size_t& index ); - /** * @brief Copies and converts an arrow array to a kdb list * @@ -80,6 +64,21 @@ K ReadChunkedNullBitmap( std::shared_ptr chunked_array, Typ */ K InitKdbForArray(std::shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides); +/** + * @brief Appends null bitmap data from an arrow array into an existing kdb boolean + * list starting at the specified index. + * + * @param array_data The arrow array from which to source the data. The entire + * array will be appended. + * @param k_bitmap The kdb boolean list that the data should be inserted into. + * This list needs to have been created with the correct length by the calling + * function. + * @param index The index into the kdb list at which the appending should + * begin. 
Index will be updated to account for the new offset by adding the + * length of the array array. +*/ +void InitKdbNullBitmap( std::shared_ptr array_data, K k_bitmap, size_t& index ); + } // namespace arrowkdb } // namespace kx From 89d0829c589c1ac7223ca531f6c679f18981e720 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 28 Feb 2023 14:21:46 +0300 Subject: [PATCH 094/276] Simple lists for validating nested nulls --- examples/null_bitmap.q | 10 +++++----- tests/nested_null_bitmap.t | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index 687b6fa..e36822e 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -158,7 +158,7 @@ show nested_data~first parquet_nested_data nulls_data:1b,(N-1)?1b; bitmap_nulls:{x rotate nulls_data} each neg til {x-1} count bitmap_data; -nested_list_nulls:((::;1b);(::;0b;0b);(::;0b;0b;0b)) +nested_list_nulls:(enlist 1b;00b;000b) nested_struct_nulls:((::;1b;0b;0b);(::;0b;1b;0b);(::;0b;0b;1b)) parquet_bitmap_nulls:last parquet_bitmap_data; @@ -166,7 +166,7 @@ parquet_list_nulls:first parquet_nested_data[1] parquet_struct_nulls:last parquet_nested_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count parquet_bitmap_nulls;parquet_bitmap_nulls] -nested_list_nulls~{(::),x} each parquet_list_nulls +nested_list_nulls~parquet_list_nulls nested_struct_nulls~{(::),x} each parquet_struct_nulls rm parquet_null_bitmap; @@ -208,7 +208,7 @@ arrow_list_nulls:first parquet_nested_data[1] arrow_struct_nulls:last parquet_nested_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bitmap_nulls] -nested_list_nulls~{(::),x} each arrow_list_nulls +nested_list_nulls~arrow_list_nulls nested_struct_nulls~{(::),x} each arrow_struct_nulls rm arrow_null_bitmap; @@ -247,10 +247,10 @@ stream_list_nulls:first parquet_nested_data[1] stream_struct_nulls:last parquet_nested_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count 
stream_bitmap_nulls;stream_bitmap_nulls] -nested_list_nulls~{(::),x} each stream_list_nulls +nested_list_nulls~stream_list_nulls nested_struct_nulls~{(::),x} each stream_struct_nulls -1 "\n+----------------------------------------+\n"; // Process off -exit 0; +//exit 0; diff --git a/tests/nested_null_bitmap.t b/tests/nested_null_bitmap.t index 0b7b037..4d9fde1 100644 --- a/tests/nested_null_bitmap.t +++ b/tests/nested_null_bitmap.t @@ -83,12 +83,12 @@ parquet_nested_data:.arrowkdb.pq.readParquetData[parquet_nested_bitmap;options]; nested_data~first parquet_nested_data -1"\n+----------|| Compare nested null bitmaps ||----------+\n"; -nested_list_nulls:((::;1b);(::;0b;0b);(::;0b;0b;0b)) +nested_list_nulls:(enlist 1b;00b;000b) nested_struct_nulls:((::;1b;0b;0b);(::;0b;1b;0b);(::;0b;0b;1b)) parquet_list_nulls:first parquet_nested_data[1] parquet_struct_nulls:last parquet_nested_data[1] -nested_list_nulls~{(::),x} each parquet_list_nulls +nested_list_nulls~parquet_list_nulls nested_struct_nulls~{(::),x} each parquet_struct_nulls rm parquet_nested_bitmap; @@ -109,7 +109,7 @@ nested_data~first arrow_nested_data -1"\n+----------|| Compare nested null bitmaps ||----------+\n"; arrow_list_nulls:first parquet_nested_data[1] arrow_struct_nulls:last parquet_nested_data[1] -nested_list_nulls~{(::),x} each arrow_list_nulls +nested_list_nulls~arrow_list_nulls nested_struct_nulls~{(::),x} each arrow_struct_nulls rm arrow_nested_bitmap; @@ -129,7 +129,7 @@ nested_data~first stream_nested_data -1"\n+----------|| Compare nested null bitmaps ||----------+\n"; stream_list_nulls:first parquet_nested_data[1] stream_struct_nulls:last parquet_nested_data[1] -nested_list_nulls~{(::),x} each stream_list_nulls +nested_list_nulls~stream_list_nulls nested_struct_nulls~{(::),x} each stream_struct_nulls From c1200a0d1801dc928aa2b9c84188d16ea01a6b24 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 28 Feb 2023 20:06:04 +0300 Subject: [PATCH 095/276] Simple arrays for nested nulls --- 
src/ArrayReader.cpp | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index a8ef3d1..9c694b3 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -626,22 +626,14 @@ K AppendNullBitmap( shared_ptr array_data, size_t& index ); template<> K AppendNullBitmap( shared_ptr array_data, size_t& index ) { - K k_bitmap = nullptr; + string strArray = array_data->ToString(); auto slice_array = static_pointer_cast( array_data )->value_slice( index ); auto length = slice_array->length(); - if( null_bitmap_handlers.find( slice_array->type_id() ) == null_bitmap_handlers.end() ){ - k_bitmap = ktn( KB, length ); - for( int i = 0ll; i < length; ++i ){ - kG( k_bitmap )[i] = slice_array->IsNull( i ); - } - } - else{ - size_t counter = 0; - k_bitmap = knk( length ); - auto slice = slice_array->Slice( 0, length ); - InitKdbNullBitmap( slice, k_bitmap, counter ); - } + size_t counter = 0; + K k_bitmap = ktn( KB, length ); + auto slice = slice_array->Slice( 0, length ); + InitKdbNullBitmap( slice, k_bitmap, counter ); return k_bitmap; } @@ -667,11 +659,12 @@ K AppendNullBitmap( shared_ptr array_data, size_ template<> K AppendNullBitmap( shared_ptr array_data, size_t& index ) { + string strArray = array_data->ToString(); auto struct_array = static_pointer_cast( array_data ); size_t counter = 0; auto num_fields = struct_array->type()->num_fields(); - K k_bitmap = knk( num_fields ); + K k_bitmap = ktn( KB, num_fields ); auto field = struct_array->field( index ); InitKdbNullBitmap( field, k_bitmap, counter ); @@ -735,17 +728,19 @@ void InitKdbNullBitmap( shared_ptr array_data, K k_bitmap, size_t& { auto type_id = array_data->type_id(); auto length = array_data->length(); - for( auto i = 0; i < length; ++i ){ - if( array_data->IsNull( i ) ){ - kK( k_bitmap )[index] = kb( true ); + + string strArray = array_data->ToString(); + if( null_bitmap_handlers.find( type_id ) == 
null_bitmap_handlers.end() ){ + for( int i = 0ll; i < length; ++i ){ + kG( k_bitmap )[index] = array_data->IsNull( i ); + ++index; } - else if( null_bitmap_handlers.find( type_id ) != null_bitmap_handlers.end() ){ + } + else{ + for( int i = 0ll; i < length; ++i ){ kK( k_bitmap )[index] = null_bitmap_handlers[type_id]( array_data, index ); + ++index; } - else{ - kK( k_bitmap )[index] = kb( false ); - } - ++index; } } From dca28183b7b9314d3aa628d813b748a683167618 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 28 Feb 2023 20:49:19 +0300 Subject: [PATCH 096/276] Deep nested structures handled --- src/ArrayReader.cpp | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 9c694b3..67d0326 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -626,12 +626,14 @@ K AppendNullBitmap( shared_ptr array_data, size_t& index ); template<> K AppendNullBitmap( shared_ptr array_data, size_t& index ) { - string strArray = array_data->ToString(); auto slice_array = static_pointer_cast( array_data )->value_slice( index ); auto length = slice_array->length(); + auto type_id = slice_array->type_id(); size_t counter = 0; - K k_bitmap = ktn( KB, length ); + K k_bitmap = ( null_bitmap_handlers.find( type_id ) == null_bitmap_handlers.end() ) + ? 
ktn( KB, length ) + : knk( length ); auto slice = slice_array->Slice( 0, length ); InitKdbNullBitmap( slice, k_bitmap, counter ); @@ -659,13 +661,15 @@ K AppendNullBitmap( shared_ptr array_data, size_ template<> K AppendNullBitmap( shared_ptr array_data, size_t& index ) { - string strArray = array_data->ToString(); auto struct_array = static_pointer_cast( array_data ); - - size_t counter = 0; auto num_fields = struct_array->type()->num_fields(); - K k_bitmap = ktn( KB, num_fields ); auto field = struct_array->field( index ); + auto type_id = field->type_id(); + + size_t counter = 0; + K k_bitmap = ( null_bitmap_handlers.find( type_id ) == null_bitmap_handlers.end() ) + ? ktn( KB, num_fields ) + : knk( num_fields ); InitKdbNullBitmap( field, k_bitmap, counter ); return k_bitmap; @@ -729,18 +733,14 @@ void InitKdbNullBitmap( shared_ptr array_data, K k_bitmap, size_t& auto type_id = array_data->type_id(); auto length = array_data->length(); - string strArray = array_data->ToString(); - if( null_bitmap_handlers.find( type_id ) == null_bitmap_handlers.end() ){ - for( int i = 0ll; i < length; ++i ){ + for( int i = 0ll; i < length; ++i ){ + if( null_bitmap_handlers.find( type_id ) == null_bitmap_handlers.end() ){ kG( k_bitmap )[index] = array_data->IsNull( i ); - ++index; } - } - else{ - for( int i = 0ll; i < length; ++i ){ + else{ kK( k_bitmap )[index] = null_bitmap_handlers[type_id]( array_data, index ); - ++index; } + ++index; } } @@ -808,8 +808,11 @@ K ReadChunkedArray(shared_ptr chunked_array, TypeMappingOve K ReadChunkedNullBitmap( shared_ptr chunked_array, TypeMappingOverride& type_overrides ) { - auto boolean = std::make_shared(); - K k_bitmap = knk( chunked_array->length() ); + auto length = chunked_array->length(); + auto type_id = chunked_array->type()->id(); + K k_bitmap = ( null_bitmap_handlers.find( type_id ) == null_bitmap_handlers.end() ) + ? 
ktn( KB, length ) + : knk( length ); size_t index = 0; for( auto i = 0; i < chunked_array->num_chunks(); ++i ){ From 3860473035b7cf0a13c724d788f899de436f25ca Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 28 Feb 2023 20:50:33 +0300 Subject: [PATCH 097/276] Unit-tests nesting performance --- examples/null_bitmap.q | 54 ++++++++--------- tests/.gitignore | 2 +- tests/crucial_null_bitmap.t | 114 ++++++++++++++++++++++++++++++++++++ tests/nested_null_bitmap.t | 42 ++++++------- tests/null_bitmap.t | 109 ---------------------------------- 5 files changed, 164 insertions(+), 157 deletions(-) create mode 100644 tests/crucial_null_bitmap.t delete mode 100644 tests/null_bitmap.t diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index e36822e..c10a5b0 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -22,7 +22,7 @@ rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; bitmap_opts:(`bool`int32`float64`string`date32)!(0b;1i;2.34;"start";2006.07.21); nested_opts:(`uint16`float32`binary`time64)!(9h;8.76e;"x"$"acknowledge";00:00:00.123456789); -options:(``NULL_MAPPING)!((::);bitmap_opts,nested_opts); +nested_options:(``NULL_MAPPING)!((::);bitmap_opts,nested_opts); // Create the datatype identifiers ts_dt:.arrowkdb.dt.timestamp[`nano]; @@ -70,7 +70,7 @@ struct_fd:.arrowkdb.fd.field[`struct_field;struct_dt]; bitmap_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,i32_fd,f64_fd,str_fd,d32_fd)]; // Create the schema containing the list and struct fields -nested_schema:.arrowkdb.sc.schema[(list_fd,struct_dt)]; +nested_schema:.arrowkdb.sc.schema[(list_fd,struct_fd)]; // Print the schema -1"\nBitmap schema:"; @@ -105,7 +105,7 @@ t64_data[2]:00:00:00.123456789; bitmap_data:(ts_data;bool_data;i32_data;f64_data;str_data;d32_data); // Create the data for the list array -list_data:(enlist (9h);(8h;7h);(6h;5h;4h)); +list_data:(enlist 9h;(8h;7h);(6h;5h;4h)); // Create the data for the struct array from its child arrays 
struct_data:(f32_data;bin_data;t64_data); @@ -115,30 +115,30 @@ nested_data:(list_data;struct_data); // Pretty print the Arrow table populated from the bitmap data -1"\nBitmap table:"; -.arrowkdb.tb.prettyPrintTable[bitmap_schema;bitmap_data;options]; +.arrowkdb.tb.prettyPrintTable[bitmap_schema;bitmap_data;nested_options]; // Show the array data as an arrow table -1"\nNested table:"; -.arrowkdb.tb.prettyPrintTable[nested_schema;nested_data;options] +.arrowkdb.tb.prettyPrintTable[nested_schema;nested_data;nested_options] //-------------------------// // Example-1. Parquet file // //-------------------------// // Write the schema and array data to a parquet file -options[`PARQUET_VERSION]:`V2.0; +nested_options[`PARQUET_VERSION]:`V2.0; parquet_null_bitmap:"null_bitmap.parquet"; parquet_nested_bitmap:"nested_bitmap.parquet"; -.arrowkdb.pq.writeParquet[parquet_null_bitmap;bitmap_schema;bitmap_data;options]; -.arrowkdb.pq.writeParquet[parquet_nested_bitmap;nested_schema;nested_data;options]; +.arrowkdb.pq.writeParquet[parquet_null_bitmap;bitmap_schema;bitmap_data;nested_options]; +.arrowkdb.pq.writeParquet[parquet_nested_bitmap;nested_schema;nested_data;nested_options]; show ls parquet_null_bitmap show ls parquet_nested_bitmap // Read the array data back and compare -options[`WITH_NULL_BITMAP]:1; +nested_options[`WITH_NULL_BITMAP]:1; // Read the schema back and compare parquet_bitmap_schema:.arrowkdb.pq.readParquetSchema[parquet_null_bitmap]; @@ -150,8 +150,8 @@ show .arrowkdb.sc.equalSchemas[nested_schema;parquet_nested_schema] show bitmap_schema~parquet_bitmap_schema show nested_schema~parquet_nested_schema -parquet_bitmap_data:.arrowkdb.pq.readParquetData[parquet_null_bitmap;options]; -parquet_nested_data:.arrowkdb.pq.readParquetData[parquet_nested_bitmap;options]; +parquet_bitmap_data:.arrowkdb.pq.readParquetData[parquet_null_bitmap;nested_options]; +parquet_nested_data:.arrowkdb.pq.readParquetData[parquet_nested_bitmap;nested_options]; show bitmap_data~first 
parquet_bitmap_data show nested_data~first parquet_nested_data @@ -159,7 +159,7 @@ show nested_data~first parquet_nested_data nulls_data:1b,(N-1)?1b; bitmap_nulls:{x rotate nulls_data} each neg til {x-1} count bitmap_data; nested_list_nulls:(enlist 1b;00b;000b) -nested_struct_nulls:((::;1b;0b;0b);(::;0b;1b;0b);(::;0b;0b;1b)) +nested_struct_nulls:(100b;010b;001b) parquet_bitmap_nulls:last parquet_bitmap_data; parquet_list_nulls:first parquet_nested_data[1] @@ -167,7 +167,7 @@ parquet_struct_nulls:last parquet_nested_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count parquet_bitmap_nulls;parquet_bitmap_nulls] nested_list_nulls~parquet_list_nulls -nested_struct_nulls~{(::),x} each parquet_struct_nulls +nested_struct_nulls~parquet_struct_nulls rm parquet_null_bitmap; rm parquet_nested_bitmap; @@ -180,8 +180,8 @@ rm parquet_nested_bitmap; arrow_null_bitmap:"null_bitmap.arrow"; arrow_nested_bitmap:"nested_bitmap.arrow"; -.arrowkdb.ipc.writeArrow[arrow_null_bitmap;bitmap_schema;bitmap_data;options]; -.arrowkdb.ipc.writeArrow[arrow_nested_bitmap;nested_schema;nested_data;options]; +.arrowkdb.ipc.writeArrow[arrow_null_bitmap;bitmap_schema;bitmap_data;nested_options]; +.arrowkdb.ipc.writeArrow[arrow_nested_bitmap;nested_schema;nested_data;nested_options]; show ls arrow_null_bitmap show ls arrow_nested_bitmap @@ -197,19 +197,19 @@ show bitmap_schema~arrow_bitmap_schema show nested_schema~arrow_nested_schema // Read the array data back and compare -arrow_bitmap_data:.arrowkdb.ipc.readArrowData[arrow_null_bitmap;options]; -arrow_nested_data:.arrowkdb.ipc.readArrowData[arrow_nested_bitmap;options]; +arrow_bitmap_data:.arrowkdb.ipc.readArrowData[arrow_null_bitmap;nested_options]; +arrow_nested_data:.arrowkdb.ipc.readArrowData[arrow_nested_bitmap;nested_options]; show bitmap_data~first arrow_bitmap_data show nested_data~first arrow_nested_data arrow_bitmap_nulls:last arrow_bitmap_data; -arrow_list_nulls:first parquet_nested_data[1] -arrow_struct_nulls:last 
parquet_nested_data[1] +arrow_list_nulls:first arrow_nested_data[1] +arrow_struct_nulls:last arrow_nested_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bitmap_nulls] nested_list_nulls~arrow_list_nulls -nested_struct_nulls~{(::),x} each arrow_struct_nulls +nested_struct_nulls~arrow_struct_nulls rm arrow_null_bitmap; rm arrow_nested_bitmap; @@ -219,8 +219,8 @@ rm arrow_nested_bitmap; //-----------------------------// // Serialize the schema and array data to an arrow stream -serialized_null_bitmap:.arrowkdb.ipc.serializeArrow[bitmap_schema;bitmap_data;options]; -serialized_nested_bitmap:.arrowkdb.ipc.serializeArrow[nested_schema;nested_data;options]; +serialized_null_bitmap:.arrowkdb.ipc.serializeArrow[bitmap_schema;bitmap_data;nested_options]; +serialized_nested_bitmap:.arrowkdb.ipc.serializeArrow[nested_schema;nested_data;nested_options]; show serialized_null_bitmap show serialized_nested_bitmap @@ -236,19 +236,19 @@ show bitmap_schema~stream_bitmap_schema show nested_schema~stream_nested_schema // Parse the array data back and compare -stream_bitmap_data:.arrowkdb.ipc.parseArrowData[serialized_null_bitmap;options]; -stream_nested_data:.arrowkdb.ipc.parseArrowData[serialized_nested_bitmap;options]; +stream_bitmap_data:.arrowkdb.ipc.parseArrowData[serialized_null_bitmap;nested_options]; +stream_nested_data:.arrowkdb.ipc.parseArrowData[serialized_nested_bitmap;nested_options]; show bitmap_data~first stream_bitmap_data show nested_data~first stream_nested_data stream_bitmap_nulls:last stream_bitmap_data; -stream_list_nulls:first parquet_nested_data[1] -stream_struct_nulls:last parquet_nested_data[1] +stream_list_nulls:first stream_nested_data[1] +stream_struct_nulls:last stream_nested_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count stream_bitmap_nulls;stream_bitmap_nulls] nested_list_nulls~stream_list_nulls -nested_struct_nulls~{(::),x} each stream_struct_nulls +nested_struct_nulls~stream_struct_nulls -1 
"\n+----------------------------------------+\n"; diff --git a/tests/.gitignore b/tests/.gitignore index d96539e..01c7b10 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -1,5 +1,5 @@ basic.q -null_bitmap.q +crucial_null_bitmap.q nested_null_bitmap.q null_mapping_short.q null_mapping_long.q diff --git a/tests/crucial_null_bitmap.t b/tests/crucial_null_bitmap.t new file mode 100644 index 0000000..a56c576 --- /dev/null +++ b/tests/crucial_null_bitmap.t @@ -0,0 +1,114 @@ +// crucial_null_bitmap.t + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l q/arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping ||----------+\n"; +crucial_opts:(`bool`int32`float64`string`date32)!(0b;1i;2.34;"start";2006.07.21); + +crucial_options:(``NULL_MAPPING)!((::);crucial_opts); + +N:5 + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +bool_dt:.arrowkdb.dt.boolean[]; +i32_dt:.arrowkdb.dt.int32[]; +f64_dt:.arrowkdb.dt.float64[]; +str_dt:.arrowkdb.dt.utf8[]; +d32_dt:.arrowkdb.dt.date32[]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +bool_fd:.arrowkdb.fd.field[`bool;bool_dt]; +i32_fd:.arrowkdb.fd.field[`int32;i32_dt]; +f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; +str_fd:.arrowkdb.fd.field[`string;str_dt]; +d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; + +-1"\n+----------|| Create the schemas for the list of fields ||----------+\n"; +crucial_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,i32_fd,f64_fd,str_fd,d32_fd)]; + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +bool_data:N?(0b;1b); +bool_data[0]:0b; +i32_data:N?100i; +i32_data[1]:1i; +f64_data:N?100f; +f64_data[2]:2.34f; +str_data:N?("start";"stop";"alert";"acknowledge";""); 
+str_data[3]:"start" +d32_data:N?(2006.07.21;2005.07.18;2004.07.16;2003.07.15;2002.07.11); +d32_data[4]:2006.07.21; + +-1"\n+----------|| Combine the data for all columns ||----------+\n"; +crucial_data:(ts_data;bool_data;i32_data;f64_data;str_data;d32_data); + +-1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n"; +crucial_options[`PARQUET_VERSION]:`V2.0; + +parquet_crucial_bitmap:"null_bitmap.parquet"; +.arrowkdb.pq.writeParquet[parquet_crucial_bitmap;crucial_schema;crucial_data;crucial_options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +parquet_crucial_schema:.arrowkdb.pq.readParquetSchema[parquet_crucial_bitmap]; +.arrowkdb.sc.equalSchemas[crucial_schema;parquet_crucial_schema] +crucial_schema~parquet_crucial_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +crucial_options[`WITH_NULL_BITMAP]:1; +parquet_crucial_data:.arrowkdb.pq.readParquetData[parquet_crucial_bitmap;crucial_options]; +crucial_data~first parquet_crucial_data + +nulls_data:1b,(N-1)?1b; +crucial_nulls:{x rotate nulls_data} each neg til {x-1} count crucial_data; +parquet_crucial_nulls:last parquet_crucial_data; +crucial_nulls~crucial_nulls & sublist[{1-x} count parquet_crucial_nulls;parquet_crucial_nulls] +rm parquet_crucial_bitmap; + +-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; +arrow_crucial_bitmap:"null_bitmap.arrow"; +.arrowkdb.ipc.writeArrow[arrow_crucial_bitmap;crucial_schema;crucial_data;crucial_options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +arrow_crucial_schema:.arrowkdb.ipc.readArrowSchema[arrow_crucial_bitmap]; +.arrowkdb.sc.equalSchemas[crucial_schema;arrow_crucial_schema] +crucial_schema~arrow_crucial_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +arrow_crucial_data:.arrowkdb.ipc.readArrowData[arrow_crucial_bitmap;crucial_options]; +crucial_data~first arrow_crucial_data 
+arrow_crucial_nulls:last arrow_crucial_data; +crucial_nulls~crucial_nulls & sublist[{1-x} count arrow_crucial_nulls;arrow_crucial_nulls] +rm arrow_crucial_bitmap; + +-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; +serialized_bitmap:.arrowkdb.ipc.serializeArrow[crucial_schema;crucial_data;crucial_options]; + +-1"\n+----------|| Parse the schema back abd compare ||----------+\n"; +stream_crucial_schema:.arrowkdb.ipc.parseArrowSchema[serialized_bitmap]; +.arrowkdb.sc.equalSchemas[crucial_schema;stream_crucial_schema] +crucial_schema~stream_crucial_schema + +-1"\n+----------|| Parse the array data back and compare ||----------+\n"; +stream_crucial_data:.arrowkdb.ipc.parseArrowData[serialized_bitmap;crucial_options]; +crucial_data~first stream_crucial_data + +stream_crucial_nulls:last stream_crucial_data; +crucial_nulls~crucial_nulls & sublist[{1-x} count stream_crucial_nulls;stream_crucial_nulls] + + +-1 "\n+----------|| Test utils ||----------+\n"; + +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; diff --git a/tests/nested_null_bitmap.t b/tests/nested_null_bitmap.t index 4d9fde1..a12bdf6 100644 --- a/tests/nested_null_bitmap.t +++ b/tests/nested_null_bitmap.t @@ -9,7 +9,9 @@ rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; -1"\n+----------|| Support null mapping ||----------+\n"; nested_opts:(`uint16`float32`binary`time64)!(9h;8.76e;"x"$"acknowledge";00:00:00.123456789); -options:(``NULL_MAPPING)!((::);nested_opts); +nested_options:(``NULL_MAPPING)!((::);nested_opts); + +N:5 -1"\n+----------|| Create the datatype identifiers ||----------+\n"; ts_dt:.arrowkdb.dt.timestamp[`nano]; @@ -42,7 +44,7 @@ struct_dt:.arrowkdb.dt.struct[(f32_fd,bin_dt,t64_dt)]; struct_fd:.arrowkdb.fd.field[`struct_field;struct_dt]; -1"\n+----------|| Create the schema containing the list and struct fields ||----------+\n"; 
-nested_schema:.arrowkdb.sc.schema[(list_fd,struct_dt)]; +nested_schema:.arrowkdb.sc.schema[(list_fd,struct_fd)]; -1"\n+----------|| Create data for each column in the table ||----------+\n"; ts_data:asc N?0p; @@ -56,7 +58,7 @@ t64_data:3?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.00 t64_data[2]:00:00:00.123456789; -1"\n+----------|| Create the data for the list array ||----------+\n"; -list_data:(enlist (9h);(8h;7h);(6h;5h;4h)); +list_data:(enlist 9h;(8h;7h);(6h;5h;4h)); -1"\n+----------|| Create the data for the struct array from its child arrays ||----------+\n"; struct_data:(f32_data;bin_data;t64_data); @@ -65,13 +67,13 @@ struct_data:(f32_data;bin_data;t64_data); nested_data:(list_data;struct_data); -1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n"; -options[`PARQUET_VERSION]:`V2.0; +nested_options[`PARQUET_VERSION]:`V2.0; parquet_nested_bitmap:"nested_bitmap.parquet"; -.arrowkdb.pq.writeParquet[parquet_nested_bitmap;nested_schema;nested_data;options]; +.arrowkdb.pq.writeParquet[parquet_nested_bitmap;nested_schema;nested_data;nested_options]; --1"\n+----------|| Read the array data back and compare ||----------+\n"; -options[`WITH_NULL_BITMAP]:1; +-1"\n+----------|| Read the array back and compare ||----------+\n"; +nested_options[`WITH_NULL_BITMAP]:1; -1"\n+----------|| Read the schema back and compare ||----------+\n"; parquet_nested_schema:.arrowkdb.pq.readParquetSchema[parquet_nested_bitmap]; @@ -79,23 +81,23 @@ parquet_nested_schema:.arrowkdb.pq.readParquetSchema[parquet_nested_bitmap]; nested_schema~parquet_nested_schema -1"\n+----------|| Read the array data back and compare ||----------+\n"; -parquet_nested_data:.arrowkdb.pq.readParquetData[parquet_nested_bitmap;options]; +parquet_nested_data:.arrowkdb.pq.readParquetData[parquet_nested_bitmap;nested_options]; nested_data~first parquet_nested_data -1"\n+----------|| Compare nested null bitmaps ||----------+\n"; nested_list_nulls:(enlist 
1b;00b;000b) -nested_struct_nulls:((::;1b;0b;0b);(::;0b;1b;0b);(::;0b;0b;1b)) +nested_struct_nulls:(100b;010b;001b) parquet_list_nulls:first parquet_nested_data[1] parquet_struct_nulls:last parquet_nested_data[1] nested_list_nulls~parquet_list_nulls -nested_struct_nulls~{(::),x} each parquet_struct_nulls +nested_struct_nulls~parquet_struct_nulls rm parquet_nested_bitmap; -1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; arrow_nested_bitmap:"nested_bitmap.arrow"; -.arrowkdb.ipc.writeArrow[arrow_nested_bitmap;nested_schema;nested_data;options]; +.arrowkdb.ipc.writeArrow[arrow_nested_bitmap;nested_schema;nested_data;nested_options]; -1"\n+----------|| Read the schema back and compare ||----------+\n"; arrow_nested_schema:.arrowkdb.ipc.readArrowSchema[arrow_nested_bitmap]; @@ -103,19 +105,19 @@ arrow_nested_schema:.arrowkdb.ipc.readArrowSchema[arrow_nested_bitmap]; nested_schema~arrow_nested_schema -1"\n+----------|| Read the array data back and compare ||----------+\n"; -arrow_nested_data:.arrowkdb.ipc.readArrowData[arrow_nested_bitmap;options]; +arrow_nested_data:.arrowkdb.ipc.readArrowData[arrow_nested_bitmap;nested_options]; nested_data~first arrow_nested_data -1"\n+----------|| Compare nested null bitmaps ||----------+\n"; -arrow_list_nulls:first parquet_nested_data[1] -arrow_struct_nulls:last parquet_nested_data[1] +arrow_list_nulls:first arrow_nested_data[1] +arrow_struct_nulls:last arrow_nested_data[1] nested_list_nulls~arrow_list_nulls -nested_struct_nulls~{(::),x} each arrow_struct_nulls +nested_struct_nulls~arrow_struct_nulls rm arrow_nested_bitmap; -1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; -serialized_nested_bitmap:.arrowkdb.ipc.serializeArrow[nested_schema;nested_data;options]; +serialized_nested_bitmap:.arrowkdb.ipc.serializeArrow[nested_schema;nested_data;nested_options]; -1"\n+----------|| Parse the schema back abd compare ||----------+\n"; 
stream_nested_schema:.arrowkdb.ipc.parseArrowSchema[serialized_nested_bitmap]; @@ -123,14 +125,14 @@ stream_nested_schema:.arrowkdb.ipc.parseArrowSchema[serialized_nested_bitmap]; nested_schema~stream_nested_schema -1"\n+----------|| Parse the array data back and compare ||----------+\n"; -stream_nested_data:.arrowkdb.ipc.parseArrowData[serialized_nested_bitmap;options]; +stream_nested_data:.arrowkdb.ipc.parseArrowData[serialized_nested_bitmap;nested_options]; nested_data~first stream_nested_data -1"\n+----------|| Compare nested null bitmaps ||----------+\n"; -stream_list_nulls:first parquet_nested_data[1] -stream_struct_nulls:last parquet_nested_data[1] +stream_list_nulls:first stream_nested_data[1] +stream_struct_nulls:last stream_nested_data[1] nested_list_nulls~stream_list_nulls -nested_struct_nulls~{(::),x} each stream_struct_nulls +nested_struct_nulls~stream_struct_nulls -1 "\n+----------|| Test utils ||----------+\n"; diff --git a/tests/null_bitmap.t b/tests/null_bitmap.t deleted file mode 100644 index 3a15b1d..0000000 --- a/tests/null_bitmap.t +++ /dev/null @@ -1,109 +0,0 @@ -// null_bitmap.t - --1"\n+----------|| Import the arrowkdb library ||----------+\n"; -\l q/arrowkdb.q - --1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; -rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; - --1"\n+----------|| Support null mapping ||----------+\n"; -bitmap_opts:(`bool`int32`float64`string`date32)!(0b;1i;2.34;"start";2006.07.21); - -options:(``NULL_MAPPING)!((::);bitmap_opts); - -N:5 - --1"\n+----------|| Create the datatype identifiers ||----------+\n"; -ts_dt:.arrowkdb.dt.timestamp[`nano]; - -bool_dt:.arrowkdb.dt.boolean[]; -i32_dt:.arrowkdb.dt.int32[]; -f64_dt:.arrowkdb.dt.float64[]; -str_dt:.arrowkdb.dt.utf8[]; -d32_dt:.arrowkdb.dt.date32[]; - --1"\n+----------|| Create the field identifiers ||----------+\n"; -ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; - -bool_fd:.arrowkdb.fd.field[`bool;bool_dt]; 
-i32_fd:.arrowkdb.fd.field[`int32;i32_dt]; -f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; -str_fd:.arrowkdb.fd.field[`string;str_dt]; -d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; - --1"\n+----------|| Create the schemas for the list of fields ||----------+\n"; -bitmap_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,i32_fd,f64_fd,str_fd,d32_fd)]; - --1"\n+----------|| Create data for each column in the table ||----------+\n"; -ts_data:asc N?0p; - -bool_data:N?(0b;1b); -bool_data[0]:0b; -i32_data:N?100i; -i32_data[1]:1i; -f64_data:N?100f; -f64_data[2]:2.34f; -str_data:N?("start";"stop";"alert";"acknowledge";""); -str_data[3]:"start" -d32_data:N?(2006.07.21;2005.07.18;2004.07.16;2003.07.15;2002.07.11); -d32_data[4]:2006.07.21; - --1"\n+----------|| Combine the data for all columns ||----------+\n"; -bitmap_data:(ts_data;bool_data;i32_data;f64_data;str_data;d32_data); - --1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n"; -options[`PARQUET_VERSION]:`V2.0; - -parquet_bitmap:"null_bitmap.parquet"; -.arrowkdb.pq.writeParquet[parquet_bitmap;bitmap_schema;bitmap_data;options]; - --1"\n+----------|| Read the array data back and compare ||----------+\n"; -options[`WITH_NULL_BITMAP]:1; -parquet_bitmap_data:.arrowkdb.pq.readParquetData[parquet_bitmap;options]; -bitmap_data~first parquet_bitmap_data - -nulls_data:1b,(N-1)?1b; -bitmap_nulls:{x rotate nulls_data} each neg til {x-1} count bitmap_data; -parquet_bitmap_nulls:last parquet_bitmap_data; -bitmap_nulls~bitmap_nulls & sublist[{1-x} count parquet_bitmap_nulls;parquet_bitmap_nulls] -rm parquet_bitmap; - --1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; -arrow_bitmap:"null_bitmap.arrow"; -.arrowkdb.ipc.writeArrow[arrow_bitmap;bitmap_schema;bitmap_data;options]; - --1"\n+----------|| Read the schema back and compare ||----------+\n"; -arrow_bitmap_schema:.arrowkdb.ipc.readArrowSchema[arrow_bitmap]; -.arrowkdb.sc.equalSchemas[bitmap_schema;arrow_bitmap_schema] 
-bitmap_schema~arrow_bitmap_schema - --1"\n+----------|| Read the array data back and compare ||----------+\n"; -arrow_bitmap_data:.arrowkdb.ipc.readArrowData[arrow_bitmap;options]; -bitmap_data~first arrow_bitmap_data -arrow_bitmap_nulls:last arrow_bitmap_data; -bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bitmap_nulls] -rm arrow_bitmap; - --1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; -serialized_bitmap:.arrowkdb.ipc.serializeArrow[bitmap_schema;bitmap_data;options]; - --1"\n+----------|| Parse the schema back abd compare ||----------+\n"; -stream_bitmap_schema:.arrowkdb.ipc.parseArrowSchema[serialized_bitmap]; -.arrowkdb.sc.equalSchemas[bitmap_schema;stream_bitmap_schema] -bitmap_schema~stream_bitmap_schema - --1"\n+----------|| Parse the array data back and compare ||----------+\n"; -stream_bitmap_data:.arrowkdb.ipc.parseArrowData[serialized_bitmap;options]; -bitmap_data~first stream_bitmap_data - -stream_bitmap_nulls:last stream_bitmap_data; -bitmap_nulls~bitmap_nulls & sublist[{1-x} count stream_bitmap_nulls;stream_bitmap_nulls] - - --1 "\n+----------|| Test utils ||----------+\n"; - -show .arrowkdb.util.buildInfo[] -(type .arrowkdb.util.buildInfo[])~99h - - --1 "\n+----------|| Finished testing ||----------+\n"; From 70ed27f095f9d7748b9d44196587e82b844963b0 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 28 Feb 2023 22:31:26 +0300 Subject: [PATCH 098/276] Rearrange tests by features --- .gitignore | 1 + .travis.yml | 2 +- tests/null_bitmap/.gitignore | 2 ++ tests/{ => null_bitmap}/crucial_null_bitmap.t | 0 tests/{ => null_bitmap}/nested_null_bitmap.t | 2 +- tests/null_mapping/.gitignore | 7 +++++++ tests/{ => null_mapping}/null_mapping_extra.t | 0 tests/{ => null_mapping}/null_mapping_float.t | 0 tests/{ => null_mapping}/null_mapping_long.t | 0 tests/{ => null_mapping}/null_mapping_other.t | 0 tests/{ => null_mapping}/null_mapping_short.t | 0 tests/{ => 
null_mapping}/null_mapping_str.t | 0 tests/{ => null_mapping}/null_mapping_time.t | 0 13 files changed, 12 insertions(+), 2 deletions(-) create mode 100644 tests/null_bitmap/.gitignore rename tests/{ => null_bitmap}/crucial_null_bitmap.t (100%) rename tests/{ => null_bitmap}/nested_null_bitmap.t (99%) create mode 100644 tests/null_mapping/.gitignore rename tests/{ => null_mapping}/null_mapping_extra.t (100%) rename tests/{ => null_mapping}/null_mapping_float.t (100%) rename tests/{ => null_mapping}/null_mapping_long.t (100%) rename tests/{ => null_mapping}/null_mapping_other.t (100%) rename tests/{ => null_mapping}/null_mapping_short.t (100%) rename tests/{ => null_mapping}/null_mapping_str.t (100%) rename tests/{ => null_mapping}/null_mapping_time.t (100%) diff --git a/.gitignore b/.gitignore index 3d6594b..688988a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ arrowkdb.code-workspace .vscode/ build/ +test.q diff --git a/.travis.yml b/.travis.yml index 3084fc5..546e880 100644 --- a/.travis.yml +++ b/.travis.yml @@ -82,7 +82,7 @@ before_install: script: - if [[ $TESTS == "True" && "x$OD" != "x" && "x$QLIC_KC" != "x" ]]; then curl -o test.q -L https://github.com/KxSystems/hdf5/raw/master/test.q; - q test.q tests/ -q; + q test.q tests -q && q test.q tests/null_mapping -q && q test.q tests/null_bitmap -q; fi - if [[ $TRAVIS_OS_NAME == "windows" && $BUILD == "True" ]]; then 7z a -tzip -r $FILE_NAME ./cmake/$FILE_ROOT/*; diff --git a/tests/null_bitmap/.gitignore b/tests/null_bitmap/.gitignore new file mode 100644 index 0000000..b002a2d --- /dev/null +++ b/tests/null_bitmap/.gitignore @@ -0,0 +1,2 @@ +crucial_null_bitmap.q +nested_null_bitmap.q diff --git a/tests/crucial_null_bitmap.t b/tests/null_bitmap/crucial_null_bitmap.t similarity index 100% rename from tests/crucial_null_bitmap.t rename to tests/null_bitmap/crucial_null_bitmap.t diff --git a/tests/nested_null_bitmap.t b/tests/null_bitmap/nested_null_bitmap.t similarity index 99% rename from 
tests/nested_null_bitmap.t rename to tests/null_bitmap/nested_null_bitmap.t index a12bdf6..6184c1e 100644 --- a/tests/nested_null_bitmap.t +++ b/tests/null_bitmap/nested_null_bitmap.t @@ -137,7 +137,7 @@ nested_struct_nulls~stream_struct_nulls -1 "\n+----------|| Test utils ||----------+\n"; -.arrowkdb.util.buildInfo[] +show .arrowkdb.util.buildInfo[] (type .arrowkdb.util.buildInfo[])~99h diff --git a/tests/null_mapping/.gitignore b/tests/null_mapping/.gitignore new file mode 100644 index 0000000..ff2bc45 --- /dev/null +++ b/tests/null_mapping/.gitignore @@ -0,0 +1,7 @@ +null_mapping_short.q +null_mapping_long.q +null_mapping_float.q +null_mapping_str.q +null_mapping_time.q +null_mapping_extra.q +null_mapping_other.q diff --git a/tests/null_mapping_extra.t b/tests/null_mapping/null_mapping_extra.t similarity index 100% rename from tests/null_mapping_extra.t rename to tests/null_mapping/null_mapping_extra.t diff --git a/tests/null_mapping_float.t b/tests/null_mapping/null_mapping_float.t similarity index 100% rename from tests/null_mapping_float.t rename to tests/null_mapping/null_mapping_float.t diff --git a/tests/null_mapping_long.t b/tests/null_mapping/null_mapping_long.t similarity index 100% rename from tests/null_mapping_long.t rename to tests/null_mapping/null_mapping_long.t diff --git a/tests/null_mapping_other.t b/tests/null_mapping/null_mapping_other.t similarity index 100% rename from tests/null_mapping_other.t rename to tests/null_mapping/null_mapping_other.t diff --git a/tests/null_mapping_short.t b/tests/null_mapping/null_mapping_short.t similarity index 100% rename from tests/null_mapping_short.t rename to tests/null_mapping/null_mapping_short.t diff --git a/tests/null_mapping_str.t b/tests/null_mapping/null_mapping_str.t similarity index 100% rename from tests/null_mapping_str.t rename to tests/null_mapping/null_mapping_str.t diff --git a/tests/null_mapping_time.t b/tests/null_mapping/null_mapping_time.t similarity index 100% rename from 
tests/null_mapping_time.t rename to tests/null_mapping/null_mapping_time.t From c95685d0bdaaf21ede35f06d449491b6763afd15 Mon Sep 17 00:00:00 2001 From: nmcdonnell-kx Date: Wed, 1 Mar 2023 16:14:19 +0000 Subject: [PATCH 099/276] KXI-0 Windows build fixes --- src/KdbOptions.cpp | 2 ++ src/TableData.cpp | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/KdbOptions.cpp b/src/KdbOptions.cpp index 75c903c..ce6f325 100644 --- a/src/KdbOptions.cpp +++ b/src/KdbOptions.cpp @@ -1,3 +1,5 @@ +#include + #include "KdbOptions.h" namespace{ diff --git a/src/TableData.cpp b/src/TableData.cpp index cab590f..f076f6f 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -481,7 +481,7 @@ K writeArrow(K arrow_file, K schema_id, K array_data, K options) // Chunk size read_options.GetIntOption( kx::arrowkdb::Options::ARROW_CHUNK_ROWS, type_overrides.chunk_length ); - auto check_length = []( const auto& arrays ){ + auto check_length = []( const auto& arrays ) -> int64_t { // Check all arrays are same length int64_t len = -1; for (auto i : arrays) { @@ -660,7 +660,7 @@ K serializeArrow(K schema_id, K array_data, K options) // Chunk size read_options.GetIntOption( kx::arrowkdb::Options::ARROW_CHUNK_ROWS, type_overrides.chunk_length ); - auto check_length = []( const auto& arrays ){ + auto check_length = []( const auto& arrays ) -> int64_t { // Check all arrays are same length int64_t len = -1; for (auto i : arrays) { From f35fdcf2029201ac9c9cb1fdc13832ad41b34d85 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 2 Mar 2023 14:19:56 +0300 Subject: [PATCH 100/276] Hot fix for struct dimensions --- src/ArrayReader.cpp | 40 +++++++++++++++++++++++++--------------- src/ArrayReader.h | 2 +- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 67d0326..7eb79aa 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -633,9 +633,10 @@ K AppendNullBitmap( shared_ptr array_data, size size_t counter = 
0; K k_bitmap = ( null_bitmap_handlers.find( type_id ) == null_bitmap_handlers.end() ) ? ktn( KB, length ) - : knk( length ); + : knk( 0 ); auto slice = slice_array->Slice( 0, length ); - InitKdbNullBitmap( slice, k_bitmap, counter ); + InitKdbNullBitmap( slice, &k_bitmap, counter ); + ++index; return k_bitmap; } @@ -661,16 +662,24 @@ K AppendNullBitmap( shared_ptr array_data, size_ template<> K AppendNullBitmap( shared_ptr array_data, size_t& index ) { + size_t length = 0; auto struct_array = static_pointer_cast( array_data ); auto num_fields = struct_array->type()->num_fields(); - auto field = struct_array->field( index ); - auto type_id = field->type_id(); - size_t counter = 0; - K k_bitmap = ( null_bitmap_handlers.find( type_id ) == null_bitmap_handlers.end() ) - ? ktn( KB, num_fields ) - : knk( num_fields ); - InitKdbNullBitmap( field, k_bitmap, counter ); + K k_bitmap = knk( num_fields ); + for( int i = 0; i < num_fields; ++i ){ + auto field = struct_array->field( i ); + auto type_id = field->type_id(); + length = field->length(); + + size_t counter = 0; + kK( k_bitmap )[i] = ( null_bitmap_handlers.find( type_id ) == null_bitmap_handlers.end() ) + ? 
ktn( KB, length ) + : knk( 0 ); + InitKdbNullBitmap( field, &kK( k_bitmap )[i], counter ); + } + + index += length; return k_bitmap; } @@ -728,19 +737,20 @@ void AppendArray(shared_ptr array_data, K k_array, size_t& index, } } -void InitKdbNullBitmap( shared_ptr array_data, K k_bitmap, size_t& index ) +void InitKdbNullBitmap( shared_ptr array_data, K* k_bitmap, size_t& index ) { auto type_id = array_data->type_id(); auto length = array_data->length(); for( int i = 0ll; i < length; ++i ){ if( null_bitmap_handlers.find( type_id ) == null_bitmap_handlers.end() ){ - kG( k_bitmap )[index] = array_data->IsNull( i ); + kG( *k_bitmap )[index++] = array_data->IsNull( i ); } else{ - kK( k_bitmap )[index] = null_bitmap_handlers[type_id]( array_data, index ); + auto pos = index; + *k_bitmap = jk( k_bitmap, null_bitmap_handlers[type_id]( array_data, index ) ); + i += index - pos - 1; } - ++index; } } @@ -812,11 +822,11 @@ K ReadChunkedNullBitmap( shared_ptr chunked_array, TypeMapp auto type_id = chunked_array->type()->id(); K k_bitmap = ( null_bitmap_handlers.find( type_id ) == null_bitmap_handlers.end() ) ? ktn( KB, length ) - : knk( length ); + : knk( 0 ); size_t index = 0; for( auto i = 0; i < chunked_array->num_chunks(); ++i ){ - InitKdbNullBitmap( chunked_array->chunk( i ), k_bitmap, index ); + InitKdbNullBitmap( chunked_array->chunk( i ), &k_bitmap, index ); } return k_bitmap; diff --git a/src/ArrayReader.h b/src/ArrayReader.h index 9ed0386..2fdd5b8 100644 --- a/src/ArrayReader.h +++ b/src/ArrayReader.h @@ -77,7 +77,7 @@ K InitKdbForArray(std::shared_ptr datatype, size_t length, Type * begin. Index will be updated to account for the new offset by adding the * length of the array array. 
*/ -void InitKdbNullBitmap( std::shared_ptr array_data, K k_bitmap, size_t& index ); +void InitKdbNullBitmap( std::shared_ptr array_data, K* k_bitmap, size_t& index ); } // namespace arrowkdb } // namespace kx From 28407ecf5ebbc73b5f0b12ded93a7495a30507ae Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 2 Mar 2023 14:31:10 +0300 Subject: [PATCH 101/276] Struct dimension reshaping --- examples/null_bitmap.q | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index c10a5b0..0db6d7e 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -94,18 +94,18 @@ d32_data:N?(2006.07.21;2005.07.18;2004.07.16;2003.07.15;2002.07.11); d32_data[4]:2006.07.21; // Create the data for each of the struct child fields -f32_data:3?100e; +f32_data:5?100e; f32_data[0]:8.76e; -bin_data:3?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); +bin_data:5?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); bin_data[1]:"x"$"acknowledge" -t64_data:3?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); +t64_data:5?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); t64_data[2]:00:00:00.123456789; // Combine the data for all columns bitmap_data:(ts_data;bool_data;i32_data;f64_data;str_data;d32_data); // Create the data for the list array -list_data:(enlist 9h;(8h;7h);(6h;5h;4h)); +list_data:(enlist 9h;(8h;7h);(6h;5h;4h);(1h;2h;3h;4h);(5h;6h;7h;8h;9h)); // Create the data for the struct array from its child arrays struct_data:(f32_data;bin_data;t64_data); @@ -158,8 +158,8 @@ show nested_data~first parquet_nested_data nulls_data:1b,(N-1)?1b; bitmap_nulls:{x rotate nulls_data} each neg til {x-1} count bitmap_data; -nested_list_nulls:(enlist 1b;00b;000b) -nested_struct_nulls:(100b;010b;001b) +nested_list_nulls:(enlist 1b;00b;000b;0000b;00001b) +nested_struct_nulls:(10000b;01000b;00100b) 
parquet_bitmap_nulls:last parquet_bitmap_data; parquet_list_nulls:first parquet_nested_data[1] @@ -167,7 +167,7 @@ parquet_struct_nulls:last parquet_nested_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count parquet_bitmap_nulls;parquet_bitmap_nulls] nested_list_nulls~parquet_list_nulls -nested_struct_nulls~parquet_struct_nulls +nested_struct_nulls~parquet_struct_nulls[0] rm parquet_null_bitmap; rm parquet_nested_bitmap; @@ -209,7 +209,7 @@ arrow_struct_nulls:last arrow_nested_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bitmap_nulls] nested_list_nulls~arrow_list_nulls -nested_struct_nulls~arrow_struct_nulls +nested_struct_nulls~arrow_struct_nulls[0] rm arrow_null_bitmap; rm arrow_nested_bitmap; @@ -248,7 +248,7 @@ stream_struct_nulls:last stream_nested_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count stream_bitmap_nulls;stream_bitmap_nulls] nested_list_nulls~stream_list_nulls -nested_struct_nulls~stream_struct_nulls +nested_struct_nulls~stream_struct_nulls[0] -1 "\n+----------------------------------------+\n"; From 173f74597463a3ef84e8a06b7bb5612fb1368186 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 2 Mar 2023 15:44:31 +0300 Subject: [PATCH 102/276] Updating unit-test for optimal struct shape --- tests/null_bitmap/nested_null_bitmap.t | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/null_bitmap/nested_null_bitmap.t b/tests/null_bitmap/nested_null_bitmap.t index 6184c1e..57f4bae 100644 --- a/tests/null_bitmap/nested_null_bitmap.t +++ b/tests/null_bitmap/nested_null_bitmap.t @@ -50,15 +50,15 @@ nested_schema:.arrowkdb.sc.schema[(list_fd,struct_fd)]; ts_data:asc N?0p; -1"\n+----------|| Create the data for each of the struct child fields ||----------+\n"; -f32_data:3?100e; +f32_data:5?100e; f32_data[0]:8.76e; -bin_data:3?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); 
+bin_data:5?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); bin_data[1]:"x"$"acknowledge" -t64_data:3?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); +t64_data:5?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); t64_data[2]:00:00:00.123456789; -1"\n+----------|| Create the data for the list array ||----------+\n"; -list_data:(enlist 9h;(8h;7h);(6h;5h;4h)); +list_data:(enlist 9h;(8h;7h);(6h;5h;4h);(1h;2h;3h;4h);(5h;6h;7h;8h;9h)); -1"\n+----------|| Create the data for the struct array from its child arrays ||----------+\n"; struct_data:(f32_data;bin_data;t64_data); @@ -85,13 +85,13 @@ parquet_nested_data:.arrowkdb.pq.readParquetData[parquet_nested_bitmap;nested_op nested_data~first parquet_nested_data -1"\n+----------|| Compare nested null bitmaps ||----------+\n"; -nested_list_nulls:(enlist 1b;00b;000b) -nested_struct_nulls:(100b;010b;001b) +nested_list_nulls:(enlist 1b;00b;000b;0000b;00001b) +nested_struct_nulls:(10000b;01000b;00100b) parquet_list_nulls:first parquet_nested_data[1] parquet_struct_nulls:last parquet_nested_data[1] nested_list_nulls~parquet_list_nulls -nested_struct_nulls~parquet_struct_nulls +nested_struct_nulls~parquet_struct_nulls[0] rm parquet_nested_bitmap; @@ -112,7 +112,7 @@ nested_data~first arrow_nested_data arrow_list_nulls:first arrow_nested_data[1] arrow_struct_nulls:last arrow_nested_data[1] nested_list_nulls~arrow_list_nulls -nested_struct_nulls~arrow_struct_nulls +nested_struct_nulls~arrow_struct_nulls[0] rm arrow_nested_bitmap; @@ -132,7 +132,7 @@ nested_data~first stream_nested_data stream_list_nulls:first stream_nested_data[1] stream_struct_nulls:last stream_nested_data[1] nested_list_nulls~stream_list_nulls -nested_struct_nulls~stream_struct_nulls +nested_struct_nulls~stream_struct_nulls[0] -1 "\n+----------|| Test utils ||----------+\n"; From b3f3f58766d005cdc7d7245c97a6a21532153c3c Mon Sep 17 00:00:00 2001 From: 
Vyacheslav Grechin Date: Thu, 2 Mar 2023 18:56:20 +0300 Subject: [PATCH 103/276] Map and dictionary examples --- examples/null_bitmap.q | 161 +++++++++++++++++++++++++++-------------- 1 file changed, 106 insertions(+), 55 deletions(-) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index 0db6d7e..eedb6ca 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -20,9 +20,10 @@ rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; // Support null mapping bitmap_opts:(`bool`int32`float64`string`date32)!(0b;1i;2.34;"start";2006.07.21); -nested_opts:(`uint16`float32`binary`time64)!(9h;8.76e;"x"$"acknowledge";00:00:00.123456789); +nested_struct_opts:(`uint16`float32`binary`time64)!(9h;8.76e;"x"$"acknowledge";00:00:00.123456789); +nested_dict_opts:(enlist `int64)!(enlist 5); -nested_options:(``NULL_MAPPING)!((::);bitmap_opts,nested_opts); +nested_options:(``NULL_MAPPING)!((::);bitmap_opts,nested_struct_opts,nested_dict_opts); // Create the datatype identifiers ts_dt:.arrowkdb.dt.timestamp[`nano]; @@ -34,11 +35,18 @@ str_dt:.arrowkdb.dt.utf8[]; d32_dt:.arrowkdb.dt.date32[]; ui16_dt:.arrowkdb.dt.uint16[]; +// Create a list datatype, using the uint16 datatype as its child +list_dt:.arrowkdb.dt.list[ui16_dt]; f32_dt:.arrowkdb.dt.float32[]; bin_dt:.arrowkdb.dt.binary[]; t64_dt:.arrowkdb.dt.time64[`nano]; +i64_dt:.arrowkdb.dt.int64[]; + +// Create a map datatype using the i16_dt as the key and dec_dt as its values +map_dt:.arrowkdb.dt.map[i64_dt;f64_dt] + // Create the field identifiers ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; @@ -49,35 +57,38 @@ str_fd:.arrowkdb.fd.field[`string;str_dt]; d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; ui16_fd:.arrowkdb.fd.field[`uint16;ui16_dt]; +// Create a field containing the list datatype +list_fd:.arrowkdb.fd.field[`list_field;list_dt]; f32_fd:.arrowkdb.fd.field[`float32;f32_dt]; bin_fd:.arrowkdb.fd.field[`binary;bin_dt]; t64_fd:.arrowkdb.fd.field[`time64;t64_dt]; - -// Create a list datatype, 
using the uint16 datatype as its child -list_dt:.arrowkdb.dt.list[ui16_dt]; - -// Create a field containing the list datatype -list_fd:.arrowkdb.fd.field[`list_field;list_dt]; - // Create a struct datatype using the float32, binary and time64 fields as its children -struct_dt:.arrowkdb.dt.struct[(f32_fd,bin_dt,t64_dt)]; - +struct_dt:.arrowkdb.dt.struct[(f32_fd,bin_fd,t64_fd)]; // Create a field containing the struct datatype struct_fd:.arrowkdb.fd.field[`struct_field;struct_dt]; +// Create a field containing the map datatype +map_fd:.arrowkdb.fd.field[`map;map_dt]; + // Create the schemas for the list of fields bitmap_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,i32_fd,f64_fd,str_fd,d32_fd)]; // Create the schema containing the list and struct fields -nested_schema:.arrowkdb.sc.schema[(list_fd,struct_fd)]; +struct_schema:.arrowkdb.sc.schema[(list_fd,struct_fd)]; + +// Create the schema containing the large list, dictionary and sparce union fields +dict_schema:.arrowkdb.sc.schema[(enlist map_fd)]; // Print the schema -1"\nBitmap schema:"; .arrowkdb.sc.printSchema[bitmap_schema]; --1"\nNested schema:"; -.arrowkdb.sc.printSchema[nested_schema]; +-1"\nStruct schema:"; +.arrowkdb.sc.printSchema[struct_schema]; + +-1"\nDict schema:"; +.arrowkdb.sc.printSchema[dict_schema]; // Create data for each column in the table ts_data:asc N?0p; @@ -104,22 +115,27 @@ t64_data[2]:00:00:00.123456789; // Combine the data for all columns bitmap_data:(ts_data;bool_data;i32_data;f64_data;str_data;d32_data); -// Create the data for the list array -list_data:(enlist 9h;(8h;7h);(6h;5h;4h);(1h;2h;3h;4h);(5h;6h;7h;8h;9h)); +list_array:(enlist 9h;(8h;7h);(6h;5h;4h);(1h;2h;3h;4h);(5h;6h;7h;8h;9h)); +struct_array:(f32_data;bin_data;t64_data); +// Combine the array data for the list and struct columns +struct_data:(list_array;struct_array); -// Create the data for the struct array from its child arrays -struct_data:(f32_data;bin_data;t64_data); +map_data:((enlist 1)!(enlist 1f);(2 2)!(2 2.34f);(3 3 
3)!(3 3 3f)) // Combine the array data for the list and struct columns -nested_data:(list_data;struct_data); +dict_data:(enlist map_data); // Pretty print the Arrow table populated from the bitmap data -1"\nBitmap table:"; .arrowkdb.tb.prettyPrintTable[bitmap_schema;bitmap_data;nested_options]; // Show the array data as an arrow table --1"\nNested table:"; -.arrowkdb.tb.prettyPrintTable[nested_schema;nested_data;nested_options] +-1"\nStruct table:"; +.arrowkdb.tb.prettyPrintTable[struct_schema;struct_data;nested_options] + +// Show the array data as an arrow table +-1"\nDict table:"; +.arrowkdb.tb.prettyPrintTable[dict_schema;dict_data;nested_options] //-------------------------// // Example-1. Parquet file // @@ -129,48 +145,61 @@ nested_data:(list_data;struct_data); nested_options[`PARQUET_VERSION]:`V2.0; parquet_null_bitmap:"null_bitmap.parquet"; -parquet_nested_bitmap:"nested_bitmap.parquet"; +parquet_nested_struct:"nested_struct.parquet"; +parquet_nested_dict:"nested_dict.parquet"; .arrowkdb.pq.writeParquet[parquet_null_bitmap;bitmap_schema;bitmap_data;nested_options]; -.arrowkdb.pq.writeParquet[parquet_nested_bitmap;nested_schema;nested_data;nested_options]; +.arrowkdb.pq.writeParquet[parquet_nested_struct;struct_schema;struct_data;nested_options]; +.arrowkdb.pq.writeParquet[parquet_nested_dict;dict_schema;dict_data;nested_options]; show ls parquet_null_bitmap -show ls parquet_nested_bitmap +show ls parquet_nested_struct +show ls parquet_nested_dict -// Read the array data back and compare +// Read the schema back and compare nested_options[`WITH_NULL_BITMAP]:1; -// Read the schema back and compare parquet_bitmap_schema:.arrowkdb.pq.readParquetSchema[parquet_null_bitmap]; -parquet_nested_schema:.arrowkdb.pq.readParquetSchema[parquet_nested_bitmap]; +parquet_struct_schema:.arrowkdb.pq.readParquetSchema[parquet_nested_struct]; +parquet_dict_schema:.arrowkdb.pq.readParquetSchema[parquet_nested_dict]; show 
.arrowkdb.sc.equalSchemas[bitmap_schema;parquet_bitmap_schema] -show .arrowkdb.sc.equalSchemas[nested_schema;parquet_nested_schema] +show .arrowkdb.sc.equalSchemas[struct_schema;parquet_struct_schema] +show .arrowkdb.sc.equalSchemas[dict_schema;parquet_dict_schema] show bitmap_schema~parquet_bitmap_schema -show nested_schema~parquet_nested_schema +show struct_schema~parquet_struct_schema +show dict_schema~parquet_dict_schema +// Read the array data back and compare parquet_bitmap_data:.arrowkdb.pq.readParquetData[parquet_null_bitmap;nested_options]; -parquet_nested_data:.arrowkdb.pq.readParquetData[parquet_nested_bitmap;nested_options]; +parquet_struct_data:.arrowkdb.pq.readParquetData[parquet_nested_struct;nested_options]; +parquet_dict_data:.arrowkdb.pq.readParquetData[parquet_nested_dict;nested_options]; show bitmap_data~first parquet_bitmap_data -show nested_data~first parquet_nested_data +show struct_data~first parquet_struct_data +show dict_data~first parquet_dict_data +// Compare null bitmaps of parquet data nulls_data:1b,(N-1)?1b; bitmap_nulls:{x rotate nulls_data} each neg til {x-1} count bitmap_data; nested_list_nulls:(enlist 1b;00b;000b;0000b;00001b) nested_struct_nulls:(10000b;01000b;00100b) +nested_dict_nulls:((enlist 0b)!(enlist 0b);00b!01b;000b!000b) parquet_bitmap_nulls:last parquet_bitmap_data; -parquet_list_nulls:first parquet_nested_data[1] -parquet_struct_nulls:last parquet_nested_data[1] +parquet_list_nulls:first parquet_struct_data[1] +parquet_struct_nulls:last parquet_struct_data[1] +parquet_dict_nulls:parquet_dict_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count parquet_bitmap_nulls;parquet_bitmap_nulls] nested_list_nulls~parquet_list_nulls nested_struct_nulls~parquet_struct_nulls[0] +nested_dict_nulls~parquet_dict_nulls[0][0] rm parquet_null_bitmap; -rm parquet_nested_bitmap; +rm parquet_nested_struct; +rm parquet_nested_dict; //---------------------------// // Example-2. 
Arrow IPC file // @@ -178,41 +207,53 @@ rm parquet_nested_bitmap; // Write the schema and array data to an arrow file arrow_null_bitmap:"null_bitmap.arrow"; -arrow_nested_bitmap:"nested_bitmap.arrow"; +arrow_struct_bitmap:"nested_bitmap.arrow"; +arrow_dict_bitmap:"nested_dict.arrow"; .arrowkdb.ipc.writeArrow[arrow_null_bitmap;bitmap_schema;bitmap_data;nested_options]; -.arrowkdb.ipc.writeArrow[arrow_nested_bitmap;nested_schema;nested_data;nested_options]; +.arrowkdb.ipc.writeArrow[arrow_struct_bitmap;struct_schema;struct_data;nested_options]; +.arrowkdb.ipc.writeArrow[arrow_dict_bitmap;dict_schema;dict_data;nested_options]; show ls arrow_null_bitmap -show ls arrow_nested_bitmap +show ls arrow_struct_bitmap +show ls arrow_dict_bitmap // Read the schema back and compare arrow_bitmap_schema:.arrowkdb.ipc.readArrowSchema[arrow_null_bitmap]; -arrow_nested_schema:.arrowkdb.ipc.readArrowSchema[arrow_nested_bitmap]; +arrow_struct_schema:.arrowkdb.ipc.readArrowSchema[arrow_struct_bitmap]; +arrow_dict_schema:.arrowkdb.ipc.readArrowSchema[arrow_dict_bitmap]; show .arrowkdb.sc.equalSchemas[bitmap_schema;arrow_bitmap_schema] -show .arrowkdb.sc.equalSchemas[nested_schema;arrow_nested_schema] +show .arrowkdb.sc.equalSchemas[struct_schema;arrow_struct_schema] +show .arrowkdb.sc.equalSchemas[dict_schema;arrow_dict_schema] show bitmap_schema~arrow_bitmap_schema -show nested_schema~arrow_nested_schema +show struct_schema~arrow_struct_schema +show dict_schema~arrow_dict_schema // Read the array data back and compare arrow_bitmap_data:.arrowkdb.ipc.readArrowData[arrow_null_bitmap;nested_options]; -arrow_nested_data:.arrowkdb.ipc.readArrowData[arrow_nested_bitmap;nested_options]; +arrow_struct_data:.arrowkdb.ipc.readArrowData[arrow_struct_bitmap;nested_options]; +arrow_dict_data:.arrowkdb.ipc.readArrowData[arrow_dict_bitmap;nested_options]; show bitmap_data~first arrow_bitmap_data -show nested_data~first arrow_nested_data +show struct_data~first arrow_struct_data +show dict_data~first 
arrow_dict_data +// Compare null bitmaps of arrow data arrow_bitmap_nulls:last arrow_bitmap_data; -arrow_list_nulls:first arrow_nested_data[1] -arrow_struct_nulls:last arrow_nested_data[1] +arrow_list_nulls:first arrow_struct_data[1] +arrow_struct_nulls:last arrow_struct_data[1] +arrow_dict_nulls:arrow_dict_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bitmap_nulls] nested_list_nulls~arrow_list_nulls nested_struct_nulls~arrow_struct_nulls[0] +nested_dict_nulls~arrow_dict_nulls[0][0] rm arrow_null_bitmap; -rm arrow_nested_bitmap; +rm arrow_struct_bitmap; +rm arrow_dict_bitmap; //-----------------------------// // Example-3. Arrow IPC stream // @@ -220,37 +261,47 @@ rm arrow_nested_bitmap; // Serialize the schema and array data to an arrow stream serialized_null_bitmap:.arrowkdb.ipc.serializeArrow[bitmap_schema;bitmap_data;nested_options]; -serialized_nested_bitmap:.arrowkdb.ipc.serializeArrow[nested_schema;nested_data;nested_options]; +serialized_nested_struct:.arrowkdb.ipc.serializeArrow[struct_schema;struct_data;nested_options]; +serialized_nested_dict:.arrowkdb.ipc.serializeArrow[dict_schema;dict_data;nested_options]; show serialized_null_bitmap -show serialized_nested_bitmap +show serialized_nested_struct +show serialized_nested_dict // Parse the schema back abd compare stream_bitmap_schema:.arrowkdb.ipc.parseArrowSchema[serialized_null_bitmap]; -stream_nested_schema:.arrowkdb.ipc.parseArrowSchema[serialized_nested_bitmap]; +stream_struct_schema:.arrowkdb.ipc.parseArrowSchema[serialized_nested_struct]; +stream_dict_schema:.arrowkdb.ipc.parseArrowSchema[serialized_nested_dict]; show .arrowkdb.sc.equalSchemas[bitmap_schema;stream_bitmap_schema] -show .arrowkdb.sc.equalSchemas[nested_schema;stream_nested_schema] +show .arrowkdb.sc.equalSchemas[struct_schema;stream_struct_schema] +show .arrowkdb.sc.equalSchemas[dict_schema;stream_dict_schema] show bitmap_schema~stream_bitmap_schema -show nested_schema~stream_nested_schema 
+show struct_schema~stream_struct_schema +show dict_schema~stream_dict_schema // Parse the array data back and compare stream_bitmap_data:.arrowkdb.ipc.parseArrowData[serialized_null_bitmap;nested_options]; -stream_nested_data:.arrowkdb.ipc.parseArrowData[serialized_nested_bitmap;nested_options]; +stream_struct_data:.arrowkdb.ipc.parseArrowData[serialized_nested_struct;nested_options]; +stream_dict_data:.arrowkdb.ipc.parseArrowData[serialized_nested_dict;nested_options]; show bitmap_data~first stream_bitmap_data -show nested_data~first stream_nested_data +show struct_data~first stream_struct_data +show dict_data~first stream_dict_data +// Compare null bitmaps of stream data stream_bitmap_nulls:last stream_bitmap_data; -stream_list_nulls:first stream_nested_data[1] -stream_struct_nulls:last stream_nested_data[1] +stream_list_nulls:first stream_struct_data[1] +stream_struct_nulls:last stream_struct_data[1] +stream_dict_nulls:stream_dict_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count stream_bitmap_nulls;stream_bitmap_nulls] nested_list_nulls~stream_list_nulls nested_struct_nulls~stream_struct_nulls[0] +nested_dict_nulls~stream_dict_nulls[0][0] -1 "\n+----------------------------------------+\n"; // Process off -//exit 0; +exit 0; From 10783f4d63e6f051668623704fcddacc1848abdf Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 2 Mar 2023 22:02:38 +0300 Subject: [PATCH 104/276] Mapping nulls into nested map --- src/ArrayReader.cpp | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 7eb79aa..c64e949 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -656,7 +656,37 @@ K AppendNullBitmap( shared_ptr array template<> K AppendNullBitmap( shared_ptr array_data, size_t& index ) { - return nullptr; + auto map_array = static_pointer_cast( array_data ); + auto keys = map_array->keys(); + auto items = map_array->items(); + auto keys_type_id = 
keys->type_id(); + auto items_type_id = items->type_id(); + auto length = map_array->length(); + + K k_bitmap = knk( length ); + for( auto i = 0; i < length; ++i ){ + auto keys_slice = keys->Slice( map_array->value_offset( i ), map_array->value_length( i ) ); + auto items_slice = items->Slice( map_array->value_offset( i ), map_array->value_length( i ) ); + auto keys_length = keys_slice->length(); + auto items_length = items_slice->length(); + + K k_keys = ( null_bitmap_handlers.find( keys_type_id ) == null_bitmap_handlers.end() ) + ? ktn( KB, keys_length ) + : knk( 0 ); + K k_items = ( null_bitmap_handlers.find( items_type_id ) == null_bitmap_handlers.end() ) + ? ktn( KB, items_length ) + : knk( 0 ); + + size_t keys_counter = 0; + size_t items_counter = 0; + InitKdbNullBitmap( keys_slice, &k_keys, keys_counter ); + InitKdbNullBitmap( items_slice, &k_items, items_counter ); + kK( k_bitmap )[i] = xD( k_keys, k_items ); + } + + index += length; + + return k_bitmap; } template<> From 4f6b10f79344635c2e9283ec0769d7260173193b Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 3 Mar 2023 11:49:01 +0300 Subject: [PATCH 105/276] Unit-test for null bitmaps in associative arrays --- .gitignore | 2 + tests/.gitignore | 1 + tests/null_bitmap/.gitignore | 1 + tests/null_bitmap/glossary_null_bitmap.t | 116 +++++++++++++++++++++++ 4 files changed, 120 insertions(+) create mode 100644 tests/null_bitmap/glossary_null_bitmap.t diff --git a/.gitignore b/.gitignore index 688988a..701d371 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ arrowkdb.code-workspace .vscode/ build/ test.q +unit.q +*.user diff --git a/tests/.gitignore b/tests/.gitignore index 01c7b10..b0b3e83 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -1,5 +1,6 @@ basic.q crucial_null_bitmap.q +glossary_null_bitmap.q nested_null_bitmap.q null_mapping_short.q null_mapping_long.q diff --git a/tests/null_bitmap/.gitignore b/tests/null_bitmap/.gitignore index b002a2d..3857116 100644 --- 
a/tests/null_bitmap/.gitignore +++ b/tests/null_bitmap/.gitignore @@ -1,2 +1,3 @@ crucial_null_bitmap.q +glossary_null_bitmap.q nested_null_bitmap.q diff --git a/tests/null_bitmap/glossary_null_bitmap.t b/tests/null_bitmap/glossary_null_bitmap.t new file mode 100644 index 0000000..15396da --- /dev/null +++ b/tests/null_bitmap/glossary_null_bitmap.t @@ -0,0 +1,116 @@ +// glossary_null_bitmap.q + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l q/arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping ||----------+\n"; +glossary_opts:(`int64`float64)!(5;2.34); + +glossary_options:(``NULL_MAPPING)!((::);glossary_opts); + +N:5 + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +i64_dt:.arrowkdb.dt.int64[]; +f64_dt:.arrowkdb.dt.float64[]; + +-1"\n+----------|| Create a map datatype using the i16_dt as the key and dec_dt as its values ||----------+\n"; +map_dt:.arrowkdb.dt.map[i64_dt;f64_dt] + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +i64_fd:.arrowkdb.fd.field[`int64;i64_dt]; +f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; + +-1"\n+----------|| Create a field containing the map datatype ||----------+\n"; +map_fd:.arrowkdb.fd.field[`map;map_dt]; + +-1"\n+----------|| Create the schema containing the large list, dictionary and sparce union fields ||----------+\n"; +glossary_schema:.arrowkdb.sc.schema[(enlist map_fd)]; + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +i64_data:N?100i; +i64_data[0]:1i; +f64_data:N?100f; +f64_data[1]:2.34f; + +map_data:((enlist 1)!(enlist 1f);(2 2)!(2 2.34f);(3 3 3)!(3 3 3f)) + +-1"\n+----------|| Combine the array data for the glossary columns ||----------+\n"; 
+glossary_data:(enlist map_data); + +-1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n"; +glossary_options[`PARQUET_VERSION]:`V2.0; + +parquet_glossary_bitmap:"glossary_bitmap.parquet"; +.arrowkdb.pq.writeParquet[parquet_glossary_bitmap;glossary_schema;glossary_data;glossary_options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +glossary_options[`WITH_NULL_BITMAP]:1; + +parquet_glossary_schema:.arrowkdb.pq.readParquetSchema[parquet_glossary_bitmap]; +.arrowkdb.sc.equalSchemas[glossary_schema;parquet_glossary_schema] +glossary_schema~parquet_glossary_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +parquet_glossary_data:.arrowkdb.pq.readParquetData[parquet_glossary_bitmap;glossary_options]; +glossary_data~first parquet_glossary_data + +-1"\n+----------|| Compare null bitmaps of parquet data ||----------+\n"; +glossary_nulls:((enlist 0b)!(enlist 0b);00b!01b;000b!000b) +parquet_glossary_nulls:parquet_glossary_data[1] +glossary_nulls~parquet_glossary_nulls[0][0] + +rm parquet_glossary_bitmap; + +-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; +arrow_glossary_bitmap:"nested_map.arrow"; +.arrowkdb.ipc.writeArrow[arrow_glossary_bitmap;glossary_schema;glossary_data;glossary_options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +arrow_glossary_schema:.arrowkdb.ipc.readArrowSchema[arrow_glossary_bitmap]; +.arrowkdb.sc.equalSchemas[glossary_schema;arrow_glossary_schema] +glossary_schema~arrow_glossary_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +arrow_glossary_data:.arrowkdb.ipc.readArrowData[arrow_glossary_bitmap;glossary_options]; +glossary_data~first arrow_glossary_data + +-1"\n+----------|| Compare null bitmaps of arrow data ||----------+\n"; +arrow_glossary_nulls:arrow_glossary_data[1] +glossary_nulls~arrow_glossary_nulls[0][0] + +rm arrow_glossary_bitmap; + 
+-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; +serialized_glossary:.arrowkdb.ipc.serializeArrow[glossary_schema;glossary_data;glossary_options]; + +-1"\n+----------|| Parse the schema back abd compare ||----------+\n"; +stream_glossary_schema:.arrowkdb.ipc.parseArrowSchema[serialized_glossary]; +.arrowkdb.sc.equalSchemas[glossary_schema;stream_glossary_schema] +glossary_schema~stream_glossary_schema + +-1"\n+----------|| Parse the array data back and compare ||----------+\n"; +stream_glossary_data:.arrowkdb.ipc.parseArrowData[serialized_glossary;glossary_options]; +glossary_data~first stream_glossary_data + +-1"\n+----------|| Compare null bitmaps of stream data ||----------+\n"; +stream_glossary_nulls:stream_glossary_data[1] +glossary_nulls~stream_glossary_nulls[0][0] + + +-1 "\n+----------|| Test utils ||----------+\n"; + +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; From 6999c0a0c42dfe3bfa3bb27f71ae7ec3af7dfd19 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 3 Mar 2023 14:44:10 +0300 Subject: [PATCH 106/276] Example for dictionary bitmap --- examples/null_bitmap.q | 43 +++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index eedb6ca..a532c50 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -45,6 +45,7 @@ t64_dt:.arrowkdb.dt.time64[`nano]; i64_dt:.arrowkdb.dt.int64[]; // Create a map datatype using the i16_dt as the key and dec_dt as its values +dict_dt:.arrowkdb.dt.dictionary[str_dt;i64_dt] map_dt:.arrowkdb.dt.map[i64_dt;f64_dt] // Create the field identifiers @@ -69,6 +70,7 @@ struct_dt:.arrowkdb.dt.struct[(f32_fd,bin_fd,t64_fd)]; struct_fd:.arrowkdb.fd.field[`struct_field;struct_dt]; // Create a field containing the map datatype +dict_fd:.arrowkdb.fd.field[`dictionary;dict_dt] 
map_fd:.arrowkdb.fd.field[`map;map_dt]; // Create the schemas for the list of fields @@ -78,7 +80,7 @@ bitmap_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,i32_fd,f64_fd,str_fd,d32_fd)]; struct_schema:.arrowkdb.sc.schema[(list_fd,struct_fd)]; // Create the schema containing the large list, dictionary and sparce union fields -dict_schema:.arrowkdb.sc.schema[(enlist map_fd)]; +dict_schema:.arrowkdb.sc.schema[(dict_fd, map_fd)]; // Print the schema -1"\nBitmap schema:"; @@ -120,10 +122,11 @@ struct_array:(f32_data;bin_data;t64_data); // Combine the array data for the list and struct columns struct_data:(list_array;struct_array); +dict_data:(("aa";"bb";"cc");(2 0 1)) map_data:((enlist 1)!(enlist 1f);(2 2)!(2 2.34f);(3 3 3)!(3 3 3f)) // Combine the array data for the list and struct columns -dict_data:(enlist map_data); +dict_data:(dict_data;map_data); // Pretty print the Arrow table populated from the bitmap data -1"\nBitmap table:"; @@ -161,15 +164,12 @@ nested_options[`WITH_NULL_BITMAP]:1; parquet_bitmap_schema:.arrowkdb.pq.readParquetSchema[parquet_null_bitmap]; parquet_struct_schema:.arrowkdb.pq.readParquetSchema[parquet_nested_struct]; -parquet_dict_schema:.arrowkdb.pq.readParquetSchema[parquet_nested_dict]; show .arrowkdb.sc.equalSchemas[bitmap_schema;parquet_bitmap_schema] show .arrowkdb.sc.equalSchemas[struct_schema;parquet_struct_schema] -show .arrowkdb.sc.equalSchemas[dict_schema;parquet_dict_schema] show bitmap_schema~parquet_bitmap_schema show struct_schema~parquet_struct_schema -show dict_schema~parquet_dict_schema // Read the array data back and compare parquet_bitmap_data:.arrowkdb.pq.readParquetData[parquet_null_bitmap;nested_options]; @@ -178,14 +178,16 @@ parquet_dict_data:.arrowkdb.pq.readParquetData[parquet_nested_dict;nested_option show bitmap_data~first parquet_bitmap_data show struct_data~first parquet_struct_data -show dict_data~first parquet_dict_data +show first[dict_data[0]]~asc first parquet_dict_data[0] +show last[dict_data]~last 
parquet_dict_data[0] // Compare null bitmaps of parquet data nulls_data:1b,(N-1)?1b; bitmap_nulls:{x rotate nulls_data} each neg til {x-1} count bitmap_data; -nested_list_nulls:(enlist 1b;00b;000b;0000b;00001b) -nested_struct_nulls:(10000b;01000b;00100b) -nested_dict_nulls:((enlist 0b)!(enlist 0b);00b!01b;000b!000b) +nested_list_nulls:(enlist 1b;00b;000b;0000b;00001b); +nested_struct_nulls:(10000b;01000b;00100b); +nested_dict_nulls:(000b;000b); +nested_map_nulls:((enlist 0b)!(enlist 0b);00b!01b;000b!000b); parquet_bitmap_nulls:last parquet_bitmap_data; parquet_list_nulls:first parquet_struct_data[1] @@ -193,9 +195,10 @@ parquet_struct_nulls:last parquet_struct_data[1] parquet_dict_nulls:parquet_dict_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count parquet_bitmap_nulls;parquet_bitmap_nulls] -nested_list_nulls~parquet_list_nulls -nested_struct_nulls~parquet_struct_nulls[0] -nested_dict_nulls~parquet_dict_nulls[0][0] +show nested_list_nulls~parquet_list_nulls +show nested_struct_nulls~parquet_struct_nulls[0] +show nested_dict_nulls[0]~parquet_dict_nulls[0] +show nested_map_nulls~last[parquet_dict_nulls][0] rm parquet_null_bitmap; rm parquet_nested_struct; @@ -247,9 +250,10 @@ arrow_struct_nulls:last arrow_struct_data[1] arrow_dict_nulls:arrow_dict_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bitmap_nulls] -nested_list_nulls~arrow_list_nulls -nested_struct_nulls~arrow_struct_nulls[0] -nested_dict_nulls~arrow_dict_nulls[0][0] +show nested_list_nulls~arrow_list_nulls +show nested_struct_nulls~arrow_struct_nulls[0] +show nested_dict_nulls~first[arrow_dict_nulls][0] +show nested_map_nulls~last[arrow_dict_nulls][0] rm arrow_null_bitmap; rm arrow_struct_bitmap; @@ -297,11 +301,12 @@ stream_struct_nulls:last stream_struct_data[1] stream_dict_nulls:stream_dict_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count stream_bitmap_nulls;stream_bitmap_nulls] -nested_list_nulls~stream_list_nulls 
-nested_struct_nulls~stream_struct_nulls[0] -nested_dict_nulls~stream_dict_nulls[0][0] +show nested_list_nulls~stream_list_nulls +show nested_struct_nulls~stream_struct_nulls[0] +show nested_dict_nulls~first[stream_dict_nulls][0] +show nested_map_nulls~last[stream_dict_nulls][0] -1 "\n+----------------------------------------+\n"; // Process off -exit 0; +//exit 0; From e684c984f55dc165bd827d3a30cf1eb8f95f4d30 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 3 Mar 2023 14:45:54 +0300 Subject: [PATCH 107/276] Mapping nulls into nested dictionary --- src/ArrayReader.cpp | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index c64e949..d446b21 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -666,13 +666,13 @@ K AppendNullBitmap( shared_ptr array_data, size_ K k_bitmap = knk( length ); for( auto i = 0; i < length; ++i ){ auto keys_slice = keys->Slice( map_array->value_offset( i ), map_array->value_length( i ) ); - auto items_slice = items->Slice( map_array->value_offset( i ), map_array->value_length( i ) ); auto keys_length = keys_slice->length(); - auto items_length = items_slice->length(); - K k_keys = ( null_bitmap_handlers.find( keys_type_id ) == null_bitmap_handlers.end() ) ? ktn( KB, keys_length ) : knk( 0 ); + + auto items_slice = items->Slice( map_array->value_offset( i ), map_array->value_length( i ) ); + auto items_length = items_slice->length(); K k_items = ( null_bitmap_handlers.find( items_type_id ) == null_bitmap_handlers.end() ) ? 
ktn( KB, items_length ) : knk( 0 ); @@ -729,7 +729,32 @@ K AppendNullBitmap( shared_ptr array_dat template<> K AppendNullBitmap( shared_ptr array_data, size_t& index ) { - return nullptr; + auto dictionary_array = static_pointer_cast( array_data ); + auto length = dictionary_array->length(); + + auto items = dictionary_array->dictionary(); + auto items_type_id = items->type_id(); + auto items_length = items->length(); + K k_items = ( null_bitmap_handlers.find( items_type_id ) == null_bitmap_handlers.end() ) + ? ktn( KB, items_length ) + : knk( 0 ); + + auto indices = dictionary_array->indices(); + auto indices_type_id = indices->type_id(); + auto indices_length = indices->length(); + K k_indices = ( null_bitmap_handlers.find( indices_type_id ) == null_bitmap_handlers.end() ) + ? ktn( KB, indices_length ) + : knk( 0 ); + + size_t items_counter = 0; + size_t indices_counter = 0; + InitKdbNullBitmap( items, &k_items, items_counter ); + InitKdbNullBitmap( indices, &k_indices, indices_counter ); + + K k_bitmap = knk( 2, k_items, k_indices ); + index += length; + + return k_bitmap; } template From 1edaacc7469ce3319d44439fb90de8f302766b90 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 3 Mar 2023 18:31:54 +0300 Subject: [PATCH 108/276] Example for unions --- examples/null_bitmap.q | 77 ++++++++++++++++++++++++++++++++---------- 1 file changed, 60 insertions(+), 17 deletions(-) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index a532c50..5af3844 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -35,8 +35,6 @@ str_dt:.arrowkdb.dt.utf8[]; d32_dt:.arrowkdb.dt.date32[]; ui16_dt:.arrowkdb.dt.uint16[]; -// Create a list datatype, using the uint16 datatype as its child -list_dt:.arrowkdb.dt.list[ui16_dt]; f32_dt:.arrowkdb.dt.float32[]; bin_dt:.arrowkdb.dt.binary[]; @@ -44,10 +42,6 @@ t64_dt:.arrowkdb.dt.time64[`nano]; i64_dt:.arrowkdb.dt.int64[]; -// Create a map datatype using the i16_dt as the key and dec_dt as its values 
-dict_dt:.arrowkdb.dt.dictionary[str_dt;i64_dt] -map_dt:.arrowkdb.dt.map[i64_dt;f64_dt] - // Create the field identifiers ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; @@ -58,30 +52,45 @@ str_fd:.arrowkdb.fd.field[`string;str_dt]; d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; ui16_fd:.arrowkdb.fd.field[`uint16;ui16_dt]; -// Create a field containing the list datatype -list_fd:.arrowkdb.fd.field[`list_field;list_dt]; f32_fd:.arrowkdb.fd.field[`float32;f32_dt]; bin_fd:.arrowkdb.fd.field[`binary;bin_dt]; t64_fd:.arrowkdb.fd.field[`time64;t64_dt]; -// Create a struct datatype using the float32, binary and time64 fields as its children -struct_dt:.arrowkdb.dt.struct[(f32_fd,bin_fd,t64_fd)]; + +i64_fd:.arrowkdb.fd.field[`int64;i64_dt]; + +// Create a field containing the list datatype +list_dt:.arrowkdb.dt.list[ui16_dt]; +list_fd:.arrowkdb.fd.field[`list_field;list_dt]; + // Create a field containing the struct datatype +struct_dt:.arrowkdb.dt.struct[(f32_fd,bin_fd,t64_fd)]; struct_fd:.arrowkdb.fd.field[`struct_field;struct_dt]; -// Create a field containing the map datatype +// Create fields containing dictionary datatypes +dict_dt:.arrowkdb.dt.dictionary[str_dt;i64_dt] dict_fd:.arrowkdb.fd.field[`dictionary;dict_dt] +map_dt:.arrowkdb.dt.map[i64_dt;f64_dt] map_fd:.arrowkdb.fd.field[`map;map_dt]; -// Create the schemas for the list of fields +// Create fields containing union datatypes +sparse_dt:.arrowkdb.dt.sparse_union[(i64_fd,f64_fd)] +sparse_fd:.arrowkdb.fd.field[`sparse_union;sparse_dt] +dense_dt:.arrowkdb.dt.dense_union[(i64_fd,f64_fd)] +dense_fd:.arrowkdb.fd.field[`dense_union;dense_dt] + +// Create the schemas for primitive fields bitmap_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,i32_fd,f64_fd,str_fd,d32_fd)]; // Create the schema containing the list and struct fields struct_schema:.arrowkdb.sc.schema[(list_fd,struct_fd)]; -// Create the schema containing the large list, dictionary and sparce union fields +// Create the schema containing the dictionary and map fields 
dict_schema:.arrowkdb.sc.schema[(dict_fd, map_fd)]; +// Create the schema containing the sparce and dense union fields +union_schema:.arrowkdb.sc.schema[(sparse_fd, dense_fd)] + // Print the schema -1"\nBitmap schema:"; .arrowkdb.sc.printSchema[bitmap_schema]; @@ -92,6 +101,9 @@ dict_schema:.arrowkdb.sc.schema[(dict_fd, map_fd)]; -1"\nDict schema:"; .arrowkdb.sc.printSchema[dict_schema]; +-1"\nUnion schema:"; +.arrowkdb.sc.printSchema[union_schema]; + // Create data for each column in the table ts_data:asc N?0p; @@ -114,19 +126,22 @@ bin_data[1]:"x"$"acknowledge" t64_data:5?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); t64_data[2]:00:00:00.123456789; -// Combine the data for all columns +// Combine the data for primitive columns bitmap_data:(ts_data;bool_data;i32_data;f64_data;str_data;d32_data); +// Combine the array data for the list and struct columns list_array:(enlist 9h;(8h;7h);(6h;5h;4h);(1h;2h;3h;4h);(5h;6h;7h;8h;9h)); struct_array:(f32_data;bin_data;t64_data); -// Combine the array data for the list and struct columns struct_data:(list_array;struct_array); +// Combine the array data for the list and struct columns dict_data:(("aa";"bb";"cc");(2 0 1)) map_data:((enlist 1)!(enlist 1f);(2 2)!(2 2.34f);(3 3 3)!(3 3 3f)) +dict_data:(dict_data;map_data); // Combine the array data for the list and struct columns -dict_data:(dict_data;map_data); +sparse_data:dense_data:(0 1 0h;1 2 3;4 5 6f) +union_data:(sparse_data;dense_data) // Pretty print the Arrow table populated from the bitmap data -1"\nBitmap table:"; @@ -140,6 +155,10 @@ dict_data:(dict_data;map_data); -1"\nDict table:"; .arrowkdb.tb.prettyPrintTable[dict_schema;dict_data;nested_options] +// Show the array data as an arrow table +-1"\nUnion table:"; +.arrowkdb.tb.prettyPrintTable[union_schema;union_data;nested_options] + //-------------------------// // Example-1. 
Parquet file // //-------------------------// @@ -150,6 +169,7 @@ nested_options[`PARQUET_VERSION]:`V2.0; parquet_null_bitmap:"null_bitmap.parquet"; parquet_nested_struct:"nested_struct.parquet"; parquet_nested_dict:"nested_dict.parquet"; +parquet_nested_union:"nested_union.parquet"; .arrowkdb.pq.writeParquet[parquet_null_bitmap;bitmap_schema;bitmap_data;nested_options]; .arrowkdb.pq.writeParquet[parquet_nested_struct;struct_schema;struct_data;nested_options]; @@ -188,6 +208,7 @@ nested_list_nulls:(enlist 1b;00b;000b;0000b;00001b); nested_struct_nulls:(10000b;01000b;00100b); nested_dict_nulls:(000b;000b); nested_map_nulls:((enlist 0b)!(enlist 0b);00b!01b;000b!000b); +nested_union_nulls:((7h;9h);000b;000b); parquet_bitmap_nulls:last parquet_bitmap_data; parquet_list_nulls:first parquet_struct_data[1] @@ -212,52 +233,64 @@ rm parquet_nested_dict; arrow_null_bitmap:"null_bitmap.arrow"; arrow_struct_bitmap:"nested_bitmap.arrow"; arrow_dict_bitmap:"nested_dict.arrow"; +arrow_union_bitmap:"nested_union.arrow"; .arrowkdb.ipc.writeArrow[arrow_null_bitmap;bitmap_schema;bitmap_data;nested_options]; .arrowkdb.ipc.writeArrow[arrow_struct_bitmap;struct_schema;struct_data;nested_options]; .arrowkdb.ipc.writeArrow[arrow_dict_bitmap;dict_schema;dict_data;nested_options]; +.arrowkdb.ipc.writeArrow[arrow_union_bitmap;union_schema;union_data;nested_options]; show ls arrow_null_bitmap show ls arrow_struct_bitmap show ls arrow_dict_bitmap +show ls arrow_union_bitmap // Read the schema back and compare arrow_bitmap_schema:.arrowkdb.ipc.readArrowSchema[arrow_null_bitmap]; arrow_struct_schema:.arrowkdb.ipc.readArrowSchema[arrow_struct_bitmap]; arrow_dict_schema:.arrowkdb.ipc.readArrowSchema[arrow_dict_bitmap]; +arrow_union_schema:.arrowkdb.ipc.readArrowSchema[arrow_union_bitmap]; show .arrowkdb.sc.equalSchemas[bitmap_schema;arrow_bitmap_schema] show .arrowkdb.sc.equalSchemas[struct_schema;arrow_struct_schema] show .arrowkdb.sc.equalSchemas[dict_schema;arrow_dict_schema] +show 
.arrowkdb.sc.equalSchemas[union_schema;arrow_union_schema] show bitmap_schema~arrow_bitmap_schema show struct_schema~arrow_struct_schema show dict_schema~arrow_dict_schema +show union_schema~arrow_union_schema // Read the array data back and compare arrow_bitmap_data:.arrowkdb.ipc.readArrowData[arrow_null_bitmap;nested_options]; arrow_struct_data:.arrowkdb.ipc.readArrowData[arrow_struct_bitmap;nested_options]; arrow_dict_data:.arrowkdb.ipc.readArrowData[arrow_dict_bitmap;nested_options]; +arrow_union_data:.arrowkdb.ipc.readArrowData[arrow_union_bitmap;nested_options]; show bitmap_data~first arrow_bitmap_data show struct_data~first arrow_struct_data show dict_data~first arrow_dict_data +show union_data~first arrow_union_data // Compare null bitmaps of arrow data arrow_bitmap_nulls:last arrow_bitmap_data; arrow_list_nulls:first arrow_struct_data[1] arrow_struct_nulls:last arrow_struct_data[1] arrow_dict_nulls:arrow_dict_data[1] +arrow_union_nulls:arrow_union_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bitmap_nulls] show nested_list_nulls~arrow_list_nulls show nested_struct_nulls~arrow_struct_nulls[0] show nested_dict_nulls~first[arrow_dict_nulls][0] show nested_map_nulls~last[arrow_dict_nulls][0] +show nested_union_nulls~arrow_union_nulls[0][0] +show nested_union_nulls~arrow_union_nulls[1][0] rm arrow_null_bitmap; rm arrow_struct_bitmap; rm arrow_dict_bitmap; +rm arrow_union_bitmap; //-----------------------------// // Example-3. 
Arrow IPC stream // @@ -267,46 +300,56 @@ rm arrow_dict_bitmap; serialized_null_bitmap:.arrowkdb.ipc.serializeArrow[bitmap_schema;bitmap_data;nested_options]; serialized_nested_struct:.arrowkdb.ipc.serializeArrow[struct_schema;struct_data;nested_options]; serialized_nested_dict:.arrowkdb.ipc.serializeArrow[dict_schema;dict_data;nested_options]; +serialized_nested_union:.arrowkdb.ipc.serializeArrow[union_schema;union_data;nested_options]; show serialized_null_bitmap show serialized_nested_struct show serialized_nested_dict +show serialized_nested_union // Parse the schema back abd compare stream_bitmap_schema:.arrowkdb.ipc.parseArrowSchema[serialized_null_bitmap]; stream_struct_schema:.arrowkdb.ipc.parseArrowSchema[serialized_nested_struct]; stream_dict_schema:.arrowkdb.ipc.parseArrowSchema[serialized_nested_dict]; +stream_union_schema:.arrowkdb.ipc.parseArrowSchema[serialized_nested_union]; show .arrowkdb.sc.equalSchemas[bitmap_schema;stream_bitmap_schema] show .arrowkdb.sc.equalSchemas[struct_schema;stream_struct_schema] show .arrowkdb.sc.equalSchemas[dict_schema;stream_dict_schema] +show .arrowkdb.sc.equalSchemas[union_schema;stream_union_schema] show bitmap_schema~stream_bitmap_schema show struct_schema~stream_struct_schema show dict_schema~stream_dict_schema +show union_schema~stream_union_schema // Parse the array data back and compare stream_bitmap_data:.arrowkdb.ipc.parseArrowData[serialized_null_bitmap;nested_options]; stream_struct_data:.arrowkdb.ipc.parseArrowData[serialized_nested_struct;nested_options]; stream_dict_data:.arrowkdb.ipc.parseArrowData[serialized_nested_dict;nested_options]; +stream_union_data:.arrowkdb.ipc.parseArrowData[serialized_nested_union;nested_options]; show bitmap_data~first stream_bitmap_data show struct_data~first stream_struct_data show dict_data~first stream_dict_data +show union_data~first stream_union_data // Compare null bitmaps of stream data stream_bitmap_nulls:last stream_bitmap_data; stream_list_nulls:first 
stream_struct_data[1] stream_struct_nulls:last stream_struct_data[1] stream_dict_nulls:stream_dict_data[1] +stream_union_nulls:stream_union_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count stream_bitmap_nulls;stream_bitmap_nulls] show nested_list_nulls~stream_list_nulls show nested_struct_nulls~stream_struct_nulls[0] show nested_dict_nulls~first[stream_dict_nulls][0] show nested_map_nulls~last[stream_dict_nulls][0] +show nested_union_nulls~stream_union_nulls[0][0] +show nested_union_nulls~stream_union_nulls[1][0] -1 "\n+----------------------------------------+\n"; // Process off -//exit 0; +exit 0; From 994c50b18b5c304a045a3cd684157dfd34c6b97c Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 3 Mar 2023 18:32:34 +0300 Subject: [PATCH 109/276] Mapping nulls into unions --- src/ArrayReader.cpp | 110 +++++++++++++++++++++++++++++--------------- 1 file changed, 72 insertions(+), 38 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index d446b21..31268a8 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -616,22 +616,20 @@ unordered_map ArrayHandlers { , make_array_handler() }; -using BitmapHandler = K (*) (shared_ptr array_data, size_t& index ); +using NestedHandler = K (*) (shared_ptr array_data, size_t& index ); -extern unordered_map null_bitmap_handlers; +extern unordered_map NestedHandlers; -template -K AppendNullBitmap( shared_ptr array_data, size_t& index ); -template<> -K AppendNullBitmap( shared_ptr array_data, size_t& index ) +template +K AppendNestedList( shared_ptr array_data, size_t& index ) { - auto slice_array = static_pointer_cast( array_data )->value_slice( index ); + auto slice_array = static_pointer_cast( array_data )->value_slice( index ); auto length = slice_array->length(); auto type_id = slice_array->type_id(); size_t counter = 0; - K k_bitmap = ( null_bitmap_handlers.find( type_id ) == null_bitmap_handlers.end() ) + K k_bitmap = ( NestedHandlers.find( type_id ) == NestedHandlers.end() ) ? 
ktn( KB, length ) : knk( 0 ); auto slice = slice_array->Slice( 0, length ); @@ -641,20 +639,29 @@ K AppendNullBitmap( shared_ptr array_data, size return k_bitmap; } +template +K AppendNested( shared_ptr array_data, size_t& index ); + +template<> +K AppendNested( shared_ptr array_data, size_t& index ) +{ + return AppendNestedList( array_data, index ); +} + template<> -K AppendNullBitmap( shared_ptr array_data, size_t& index ) +K AppendNested( shared_ptr array_data, size_t& index ) { - return nullptr; + return AppendNestedList( array_data, index ); } template<> -K AppendNullBitmap( shared_ptr array_data, size_t& index ) +K AppendNested( shared_ptr array_data, size_t& index ) { - return nullptr; + return AppendNestedList( array_data, index ); } template<> -K AppendNullBitmap( shared_ptr array_data, size_t& index ) +K AppendNested( shared_ptr array_data, size_t& index ) { auto map_array = static_pointer_cast( array_data ); auto keys = map_array->keys(); @@ -667,13 +674,13 @@ K AppendNullBitmap( shared_ptr array_data, size_ for( auto i = 0; i < length; ++i ){ auto keys_slice = keys->Slice( map_array->value_offset( i ), map_array->value_length( i ) ); auto keys_length = keys_slice->length(); - K k_keys = ( null_bitmap_handlers.find( keys_type_id ) == null_bitmap_handlers.end() ) + K k_keys = ( NestedHandlers.find( keys_type_id ) == NestedHandlers.end() ) ? ktn( KB, keys_length ) : knk( 0 ); auto items_slice = items->Slice( map_array->value_offset( i ), map_array->value_length( i ) ); auto items_length = items_slice->length(); - K k_items = ( null_bitmap_handlers.find( items_type_id ) == null_bitmap_handlers.end() ) + K k_items = ( NestedHandlers.find( items_type_id ) == NestedHandlers.end() ) ? 
ktn( KB, items_length ) : knk( 0 ); @@ -690,7 +697,7 @@ K AppendNullBitmap( shared_ptr array_data, size_ } template<> -K AppendNullBitmap( shared_ptr array_data, size_t& index ) +K AppendNested( shared_ptr array_data, size_t& index ) { size_t length = 0; auto struct_array = static_pointer_cast( array_data ); @@ -703,7 +710,7 @@ K AppendNullBitmap( shared_ptr array_data, si length = field->length(); size_t counter = 0; - kK( k_bitmap )[i] = ( null_bitmap_handlers.find( type_id ) == null_bitmap_handlers.end() ) + kK( k_bitmap )[i] = ( NestedHandlers.find( type_id ) == NestedHandlers.end() ) ? ktn( KB, length ) : knk( 0 ); InitKdbNullBitmap( field, &kK( k_bitmap )[i], counter ); @@ -715,19 +722,46 @@ K AppendNullBitmap( shared_ptr array_data, si } template<> -K AppendNullBitmap( shared_ptr array_data, size_t& index ) +K AppendNested( shared_ptr array_data, size_t& index ) { - return nullptr; + auto union_array = static_pointer_cast( array_data ); + auto length = union_array->length(); + auto num_fields = union_array->num_fields(); + + K type_ids = ktn( KH, num_fields ); + for( int i = 0; i < num_fields; ++i ){ + kH( type_ids )[i] = union_array->child_id( i ); + } + K k_bitmap = knk( num_fields + 1, type_ids ); + + for( int i = 0; i < num_fields; ++i ){ + auto field_array = union_array->field( i ); + auto type_id = field_array->type_id(); + auto field_length = field_array->length(); + + TypeMappingOverride type_overrides; + kH( type_ids )[i] = kx::arrowkdb::GetKdbType( field_array->type(), type_overrides ); + K k_field = ( NestedHandlers.find( type_id ) == NestedHandlers.end() ) + ? 
ktn( KB, field_length ) + : knk( 0 ); + + size_t counter = 0; + InitKdbNullBitmap( field_array, &k_field, counter ); + kK( k_bitmap )[i+1] = k_field; + } + index += length; + + return k_bitmap; } template<> -K AppendNullBitmap( shared_ptr array_data, size_t& index ) +K AppendNested( shared_ptr array_data, size_t& index ) { - return nullptr; + return NestedHandlers[arrow::Type::SPARSE_UNION]( array_data, index ); } template<> -K AppendNullBitmap( shared_ptr array_data, size_t& index ) +K AppendNested( shared_ptr array_data, size_t& index ) { auto dictionary_array = static_pointer_cast( array_data ); auto length = dictionary_array->length(); @@ -735,14 +769,14 @@ K AppendNullBitmap( shared_ptr array_data auto items = dictionary_array->dictionary(); auto items_type_id = items->type_id(); auto items_length = items->length(); - K k_items = ( null_bitmap_handlers.find( items_type_id ) == null_bitmap_handlers.end() ) + K k_items = ( NestedHandlers.find( items_type_id ) == NestedHandlers.end() ) ? ktn( KB, items_length ) : knk( 0 ); auto indices = dictionary_array->indices(); auto indices_type_id = indices->type_id(); auto indices_length = indices->length(); - K k_indices = ( null_bitmap_handlers.find( indices_type_id ) == null_bitmap_handlers.end() ) + K k_indices = ( NestedHandlers.find( indices_type_id ) == NestedHandlers.end() ) ? 
ktn( KB, indices_length ) : knk( 0 ); @@ -758,20 +792,20 @@ K AppendNullBitmap( shared_ptr array_data } template -auto make_null_bitmap_handler() +auto make_nested_handler() { - return make_pair( TypeId, &AppendNullBitmap ); + return make_pair( TypeId, &AppendNested ); } -unordered_map null_bitmap_handlers{ - make_null_bitmap_handler() - , make_null_bitmap_handler() - , make_null_bitmap_handler() - , make_null_bitmap_handler() - , make_null_bitmap_handler() - , make_null_bitmap_handler() - , make_null_bitmap_handler() - , make_null_bitmap_handler() +unordered_map NestedHandlers{ + make_nested_handler() + , make_nested_handler() + , make_nested_handler() + , make_nested_handler() + , make_nested_handler() + , make_nested_handler() + , make_nested_handler() + , make_nested_handler() }; } // namespace @@ -798,12 +832,12 @@ void InitKdbNullBitmap( shared_ptr array_data, K* k_bitmap, size_t auto length = array_data->length(); for( int i = 0ll; i < length; ++i ){ - if( null_bitmap_handlers.find( type_id ) == null_bitmap_handlers.end() ){ + if( NestedHandlers.find( type_id ) == NestedHandlers.end() ){ kG( *k_bitmap )[index++] = array_data->IsNull( i ); } else{ auto pos = index; - *k_bitmap = jk( k_bitmap, null_bitmap_handlers[type_id]( array_data, index ) ); + *k_bitmap = jk( k_bitmap, NestedHandlers[type_id]( array_data, index ) ); i += index - pos - 1; } } @@ -875,7 +909,7 @@ K ReadChunkedNullBitmap( shared_ptr chunked_array, TypeMapp { auto length = chunked_array->length(); auto type_id = chunked_array->type()->id(); - K k_bitmap = ( null_bitmap_handlers.find( type_id ) == null_bitmap_handlers.end() ) + K k_bitmap = ( NestedHandlers.find( type_id ) == NestedHandlers.end() ) ? 
ktn( KB, length ) : knk( 0 ); From 19be7a30102d367fea661c94d1d436cd2637d021 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 6 Mar 2023 11:36:12 +0300 Subject: [PATCH 110/276] Union key types overriding for decimals --- src/ArrayReader.cpp | 59 +++++++++++++++++++++------------------------ src/ArrayReader.h | 9 ++++++- 2 files changed, 35 insertions(+), 33 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 31268a8..5568bd3 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -616,13 +616,12 @@ unordered_map ArrayHandlers { , make_array_handler() }; -using NestedHandler = K (*) (shared_ptr array_data, size_t& index ); +using NestedHandler = K ( * )(shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ); extern unordered_map NestedHandlers; - template -K AppendNestedList( shared_ptr array_data, size_t& index ) +K AppendNestedList( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ) { auto slice_array = static_pointer_cast( array_data )->value_slice( index ); auto length = slice_array->length(); @@ -633,35 +632,35 @@ K AppendNestedList( shared_ptr array_data, size_t& index ) ? 
ktn( KB, length ) : knk( 0 ); auto slice = slice_array->Slice( 0, length ); - InitKdbNullBitmap( slice, &k_bitmap, counter ); + InitKdbNullBitmap( slice, &k_bitmap, counter, type_overrides ); ++index; return k_bitmap; } template -K AppendNested( shared_ptr array_data, size_t& index ); +K AppendNested( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ); template<> -K AppendNested( shared_ptr array_data, size_t& index ) +K AppendNested( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ) { - return AppendNestedList( array_data, index ); + return AppendNestedList( array_data, index, type_overrides ); } template<> -K AppendNested( shared_ptr array_data, size_t& index ) +K AppendNested( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ) { - return AppendNestedList( array_data, index ); + return AppendNestedList( array_data, index, type_overrides ); } template<> -K AppendNested( shared_ptr array_data, size_t& index ) +K AppendNested( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ) { - return AppendNestedList( array_data, index ); + return AppendNestedList( array_data, index, type_overrides ); } template<> -K AppendNested( shared_ptr array_data, size_t& index ) +K AppendNested( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ) { auto map_array = static_pointer_cast( array_data ); auto keys = map_array->keys(); @@ -686,18 +685,17 @@ K AppendNested( shared_ptr array_data, size_t& i size_t keys_counter = 0; size_t items_counter = 0; - InitKdbNullBitmap( keys_slice, &k_keys, keys_counter ); - InitKdbNullBitmap( items_slice, &k_items, items_counter ); + InitKdbNullBitmap( keys_slice, &k_keys, keys_counter, type_overrides ); + InitKdbNullBitmap( items_slice, &k_items, items_counter, type_overrides ); kK( k_bitmap )[i] = xD( k_keys, k_items ); } - index += length; return k_bitmap; } template<> -K AppendNested( shared_ptr array_data, size_t& index ) +K 
AppendNested( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ) { size_t length = 0; auto struct_array = static_pointer_cast( array_data ); @@ -713,16 +711,15 @@ K AppendNested( shared_ptr array_data, size_t kK( k_bitmap )[i] = ( NestedHandlers.find( type_id ) == NestedHandlers.end() ) ? ktn( KB, length ) : knk( 0 ); - InitKdbNullBitmap( field, &kK( k_bitmap )[i], counter ); + InitKdbNullBitmap( field, &kK( k_bitmap )[i], counter, type_overrides ); } - index += length; return k_bitmap; } template<> -K AppendNested( shared_ptr array_data, size_t& index ) +K AppendNested( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ) { auto union_array = static_pointer_cast( array_data ); auto length = union_array->length(); @@ -732,21 +729,20 @@ K AppendNested( shared_ptr array_data, for( int i = 0; i < num_fields; ++i ){ kH( type_ids )[i] = union_array->child_id( i ); } - K k_bitmap = knk( num_fields + 1, type_ids ); + K k_bitmap = knk( num_fields + 1, type_ids ); for( int i = 0; i < num_fields; ++i ){ auto field_array = union_array->field( i ); auto type_id = field_array->type_id(); auto field_length = field_array->length(); - TypeMappingOverride type_overrides; kH( type_ids )[i] = kx::arrowkdb::GetKdbType( field_array->type(), type_overrides ); K k_field = ( NestedHandlers.find( type_id ) == NestedHandlers.end() ) ? 
ktn( KB, field_length ) : knk( 0 ); size_t counter = 0; - InitKdbNullBitmap( field_array, &k_field, counter ); + InitKdbNullBitmap( field_array, &k_field, counter, type_overrides ); kK( k_bitmap )[i+1] = k_field; } index += length; @@ -755,16 +751,15 @@ K AppendNested( shared_ptr array_data, } template<> -K AppendNested( shared_ptr array_data, size_t& index ) +K AppendNested( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ) { - return NestedHandlers[arrow::Type::SPARSE_UNION]( array_data, index ); + return NestedHandlers[arrow::Type::SPARSE_UNION]( array_data, index, type_overrides ); } template<> -K AppendNested( shared_ptr array_data, size_t& index ) +K AppendNested( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ) { auto dictionary_array = static_pointer_cast( array_data ); - auto length = dictionary_array->length(); auto items = dictionary_array->dictionary(); auto items_type_id = items->type_id(); @@ -782,11 +777,11 @@ K AppendNested( shared_ptr array_data, si size_t items_counter = 0; size_t indices_counter = 0; - InitKdbNullBitmap( items, &k_items, items_counter ); - InitKdbNullBitmap( indices, &k_indices, indices_counter ); + InitKdbNullBitmap( items, &k_items, items_counter, type_overrides ); + InitKdbNullBitmap( indices, &k_indices, indices_counter, type_overrides ); K k_bitmap = knk( 2, k_items, k_indices ); - index += length; + index += dictionary_array->length(); return k_bitmap; } @@ -826,7 +821,7 @@ void AppendArray(shared_ptr array_data, K k_array, size_t& index, } } -void InitKdbNullBitmap( shared_ptr array_data, K* k_bitmap, size_t& index ) +void InitKdbNullBitmap( shared_ptr array_data, K* k_bitmap, size_t& index, TypeMappingOverride& type_overrides ) { auto type_id = array_data->type_id(); auto length = array_data->length(); @@ -837,7 +832,7 @@ void InitKdbNullBitmap( shared_ptr array_data, K* k_bitmap, size_t } else{ auto pos = index; - *k_bitmap = jk( k_bitmap, NestedHandlers[type_id]( 
array_data, index ) ); + *k_bitmap = jk( k_bitmap, NestedHandlers[type_id]( array_data, index, type_overrides ) ); i += index - pos - 1; } } @@ -915,7 +910,7 @@ K ReadChunkedNullBitmap( shared_ptr chunked_array, TypeMapp size_t index = 0; for( auto i = 0; i < chunked_array->num_chunks(); ++i ){ - InitKdbNullBitmap( chunked_array->chunk( i ), &k_bitmap, index ); + InitKdbNullBitmap( chunked_array->chunk( i ), &k_bitmap, index, type_overrides ); } return k_bitmap; diff --git a/src/ArrayReader.h b/src/ArrayReader.h index 2fdd5b8..fbb2662 100644 --- a/src/ArrayReader.h +++ b/src/ArrayReader.h @@ -21,6 +21,7 @@ namespace arrowkdb { * list needs to have been created with the correct length by the calling * function. * @param index The index into the kdb list at which the appending should + * @param type_overrides Overrides for type mappings configured by KdbOptions * begin. Index will be updated to account for the new offset by adding the * length of the array array. */ @@ -30,6 +31,7 @@ void AppendArray(std::shared_ptr array_data, K k_array, size_t& in * @brief Copies and converts an arrow array to a kdb list * * @param array The arrow array to be converted + * @param type_overrides Overrides for type mappings configured by KdbOptions * @return A kdb list represented the arrow array */ K ReadArray(std::shared_ptr array, TypeMappingOverride& type_overrides); @@ -41,6 +43,7 @@ K ReadArray(std::shared_ptr array, TypeMappingOverride& type_overr * into the list. 
* * @param chunked_array The chunked array to be converted + * @param type_overrides Overrides for type mappings configured by KdbOptions * @return A kdb list representing the chunked array */ K ReadChunkedArray(std::shared_ptr chunked_array, TypeMappingOverride& type_overrides); @@ -49,6 +52,7 @@ K ReadChunkedArray(std::shared_ptr chunked_array, TypeMappi * @brief Extracts nulls bitmap of an arrow array into a boolean kdb list * * @param chunked_array The chunked array to be converted + * @param type_overrides Overrides for type mappings configured by KdbOptions * @return A kdb list representing the nulls bitmap */ K ReadChunkedNullBitmap( std::shared_ptr chunked_array, TypeMappingOverride& type_overrides ); @@ -60,6 +64,7 @@ K ReadChunkedNullBitmap( std::shared_ptr chunked_array, Typ * * @param datatype The arrow datatype to be stored in the kdb list * @param length The required length of the kdb list + * @param type_overrides Overrides for type mappings configured by KdbOptions * @return Newly created kdb list */ K InitKdbForArray(std::shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides); @@ -76,8 +81,10 @@ K InitKdbForArray(std::shared_ptr datatype, size_t length, Type * @param index The index into the kdb list at which the appending should * begin. Index will be updated to account for the new offset by adding the * length of the array array. 
+ * @param type_overrides Overrides for type mappings configured by KdbOptions + * In null bitmap is used for overriding key types of unions */ -void InitKdbNullBitmap( std::shared_ptr array_data, K* k_bitmap, size_t& index ); +void InitKdbNullBitmap( std::shared_ptr array_data, K* k_bitmap, size_t& index, TypeMappingOverride& type_overrides ); } // namespace arrowkdb } // namespace kx From 7e049422ddbaa51568f0e86006fc9b526a791645 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 6 Mar 2023 12:09:05 +0300 Subject: [PATCH 111/276] Unit-test for union null bitmaps --- examples/null_bitmap.q | 10 ++- tests/.gitignore | 1 + tests/null_bitmap/.gitignore | 3 - tests/null_bitmap/union_null_bitmap.t | 98 +++++++++++++++++++++++++++ tests/null_mapping/.gitignore | 7 -- 5 files changed, 106 insertions(+), 13 deletions(-) delete mode 100644 tests/null_bitmap/.gitignore create mode 100644 tests/null_bitmap/union_null_bitmap.t delete mode 100644 tests/null_mapping/.gitignore diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index 5af3844..742220d 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -126,6 +126,10 @@ bin_data[1]:"x"$"acknowledge" t64_data:5?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); t64_data[2]:00:00:00.123456789; +// Create the data for the union child fields +i64_data:N?100; +i64_data[0]:1; + // Combine the data for primitive columns bitmap_data:(ts_data;bool_data;i32_data;f64_data;str_data;d32_data); @@ -140,7 +144,7 @@ map_data:((enlist 1)!(enlist 1f);(2 2)!(2 2.34f);(3 3 3)!(3 3 3f)) dict_data:(dict_data;map_data); // Combine the array data for the list and struct columns -sparse_data:dense_data:(0 1 0h;1 2 3;4 5 6f) +sparse_data:dense_data:(0 1 0h;5 2 3;4 2.34 6f) union_data:(sparse_data;dense_data) // Pretty print the Arrow table populated from the bitmap data @@ -208,7 +212,7 @@ nested_list_nulls:(enlist 1b;00b;000b;0000b;00001b); 
nested_struct_nulls:(10000b;01000b;00100b); nested_dict_nulls:(000b;000b); nested_map_nulls:((enlist 0b)!(enlist 0b);00b!01b;000b!000b); -nested_union_nulls:((7h;9h);000b;000b); +nested_union_nulls:((0 1 0h);100b;010b); parquet_bitmap_nulls:last parquet_bitmap_data; parquet_list_nulls:first parquet_struct_data[1] @@ -231,7 +235,7 @@ rm parquet_nested_dict; // Write the schema and array data to an arrow file arrow_null_bitmap:"null_bitmap.arrow"; -arrow_struct_bitmap:"nested_bitmap.arrow"; +arrow_struct_bitmap:"nested_struct.arrow"; arrow_dict_bitmap:"nested_dict.arrow"; arrow_union_bitmap:"nested_union.arrow"; diff --git a/tests/.gitignore b/tests/.gitignore index b0b3e83..5415741 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -2,6 +2,7 @@ basic.q crucial_null_bitmap.q glossary_null_bitmap.q nested_null_bitmap.q +union_null_bitmap.q null_mapping_short.q null_mapping_long.q null_mapping_float.q diff --git a/tests/null_bitmap/.gitignore b/tests/null_bitmap/.gitignore deleted file mode 100644 index 3857116..0000000 --- a/tests/null_bitmap/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -crucial_null_bitmap.q -glossary_null_bitmap.q -nested_null_bitmap.q diff --git a/tests/null_bitmap/union_null_bitmap.t b/tests/null_bitmap/union_null_bitmap.t new file mode 100644 index 0000000..2f6580e --- /dev/null +++ b/tests/null_bitmap/union_null_bitmap.t @@ -0,0 +1,98 @@ +// union_null_bitmap.q + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l q/arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping ||----------+\n"; +nested_union_opts:(`float64`int64)!(2.34;5); +union_options:(``NULL_MAPPING)!((::);nested_union_opts); +N:5 + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +f64_dt:.arrowkdb.dt.float64[]; +i64_dt:.arrowkdb.dt.int64[]; 
+ +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; +i64_fd:.arrowkdb.fd.field[`int64;i64_dt]; + +-1"\n+----------|| Create fields containing union datatypes ||----------+\n"; +sparse_dt:.arrowkdb.dt.sparse_union[(i64_fd,f64_fd)] +sparse_fd:.arrowkdb.fd.field[`sparse_union;sparse_dt] +dense_dt:.arrowkdb.dt.dense_union[(i64_fd,f64_fd)] +dense_fd:.arrowkdb.fd.field[`dense_union;dense_dt] + +-1"\n+----------|| Create the schema containing the sparce and dense union fields ||----------+\n"; +union_schema:.arrowkdb.sc.schema[(sparse_fd, dense_fd)] + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +f64_data:N?100f; +f64_data[0]:2.34f; +i64_data:N?100h; +i64_data[1]:5h; + +-1"\n+----------|| Create the data the union child fields ||----------+\n"; +i64_data:N?100; +i64_data[0]:1; + +-1"\n+----------|| Combine the array data for the list and struct columns ||----------+\n"; +sparse_data:dense_data:(0 1 0h;5 2 3;4 2.34 6f) +union_data:(sparse_data;dense_data) + +-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; +union_options[`WITH_NULL_BITMAP]:1; +arrow_union_bitmap:"nested_union.arrow"; +.arrowkdb.ipc.writeArrow[arrow_union_bitmap;union_schema;union_data;union_options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +arrow_union_schema:.arrowkdb.ipc.readArrowSchema[arrow_union_bitmap]; +.arrowkdb.sc.equalSchemas[union_schema;arrow_union_schema] +union_schema~arrow_union_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +arrow_union_data:.arrowkdb.ipc.readArrowData[arrow_union_bitmap;union_options]; +union_data~first arrow_union_data + +-1"\n+----------|| Compare null bitmaps of arrow data ||----------+\n"; +nested_union_nulls:((0 1 0h);100b;010b); + +arrow_union_nulls:arrow_union_data[1] 
+nested_union_nulls~arrow_union_nulls[0][0] +nested_union_nulls~arrow_union_nulls[1][0] + +rm arrow_union_bitmap; + +-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; +serialized_nested_union:.arrowkdb.ipc.serializeArrow[union_schema;union_data;union_options]; + +-1"\n+----------|| Parse the schema back abd compare ||----------+\n"; +stream_union_schema:.arrowkdb.ipc.parseArrowSchema[serialized_nested_union]; +.arrowkdb.sc.equalSchemas[union_schema;stream_union_schema] +union_schema~stream_union_schema + +-1"\n+----------|| Parse the array data back and compare ||----------+\n"; +stream_union_data:.arrowkdb.ipc.parseArrowData[serialized_nested_union;union_options]; +union_data~first stream_union_data + +-1"\n+----------|| Compare null bitmaps of stream data ||----------+\n"; +stream_union_nulls:stream_union_data[1] +nested_union_nulls~stream_union_nulls[0][0] +nested_union_nulls~stream_union_nulls[1][0] + + +-1 "\n+----------|| Test utils ||----------+\n"; + +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; diff --git a/tests/null_mapping/.gitignore b/tests/null_mapping/.gitignore deleted file mode 100644 index ff2bc45..0000000 --- a/tests/null_mapping/.gitignore +++ /dev/null @@ -1,7 +0,0 @@ -null_mapping_short.q -null_mapping_long.q -null_mapping_float.q -null_mapping_str.q -null_mapping_time.q -null_mapping_extra.q -null_mapping_other.q From 2814325ff999ea02dd4c72ed49be4b15404de6c5 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 6 Mar 2023 13:55:25 +0300 Subject: [PATCH 112/276] Prevent null mapping of map keys --- src/ArrayWriter.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index 9ea8d68..27d9b41 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -1036,7 +1036,11 @@ void PopulateBuilder(shared_ptr datatype, K k // Populate the child builders for this map set 
from the dictionary key/value lists auto k_dict = kK(k_array)[i]; TYPE_CHECK_ITEM(99 != k_dict->t, datatype->ToString(), 99, k_dict->t); + + auto items_null_mapping = type_overrides.null_mapping; + type_overrides.null_mapping = Options::NullMapping {0}; PopulateBuilder(key_builder->type(), kK(k_dict)[0], key_builder, type_overrides); + type_overrides.null_mapping = items_null_mapping; PopulateBuilder(item_builder->type(), kK(k_dict)[1], item_builder, type_overrides); } } From 8b6229f800d2910330ac859f5fee403e4db69e9e Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 6 Mar 2023 15:14:11 +0300 Subject: [PATCH 113/276] Fix reading of union type IDs --- src/ArrayReader.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 5568bd3..c7e8f3a 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -725,8 +725,9 @@ K AppendNested( shared_ptr array_data, auto length = union_array->length(); auto num_fields = union_array->num_fields(); - K type_ids = ktn( KH, num_fields ); - for( int i = 0; i < num_fields; ++i ){ + // The type_id array is represented as a KH list at the start of the parent mixed list. + K type_ids = ktn( KH, length ); + for( int i = 0; i < length; ++i ){ kH( type_ids )[i] = union_array->child_id( i ); } @@ -736,7 +737,6 @@ K AppendNested( shared_ptr array_data, auto type_id = field_array->type_id(); auto field_length = field_array->length(); - kH( type_ids )[i] = kx::arrowkdb::GetKdbType( field_array->type(), type_overrides ); K k_field = ( NestedHandlers.find( type_id ) == NestedHandlers.end() ) ? 
ktn( KB, field_length ) : knk( 0 ); @@ -760,6 +760,7 @@ template<> K AppendNested( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ) { auto dictionary_array = static_pointer_cast( array_data ); + auto length = dictionary_array->length(); auto items = dictionary_array->dictionary(); auto items_type_id = items->type_id(); @@ -781,7 +782,7 @@ K AppendNested( shared_ptr array_data, si InitKdbNullBitmap( indices, &k_indices, indices_counter, type_overrides ); K k_bitmap = knk( 2, k_items, k_indices ); - index += dictionary_array->length(); + index += length; return k_bitmap; } From 8b9e3d031f17f3414803f3843d621089b7b1b85c Mon Sep 17 00:00:00 2001 From: nmcdonnell-kx Date: Tue, 7 Mar 2023 18:44:30 +0000 Subject: [PATCH 114/276] KXI-22441 Read arrow null bitmap as separate structure * Reuse existing logic for recursion to populate null bitmaps --- src/ArrayReader.cpp | 284 +++++++++++++----------------------------- src/ArrayReader.h | 6 +- src/HelperFunctions.h | 8 ++ src/TableData.cpp | 6 +- 4 files changed, 103 insertions(+), 201 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index c7e8f3a..a58c800 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -21,25 +21,28 @@ using namespace kx::arrowkdb; namespace { +typedef std::function array_data, TypeMappingOverride& type_overrides)> ReadArrayCommon; +typedef std::function array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides)> AppendArrayCommon; + // An arrow list array is a nested set of child lists. This is represented in // kdb as a mixed list for the parent list array containing a set of sub-lists, // one for each of the list value sets. 
template -void AppendList(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +void AppendList(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides, ReadArrayCommon read_array) { for (auto i = 0; i < array_data->length(); ++i) { // Slice the parent array to get the list value set at the specified index auto value_slice = static_pointer_cast(array_data)->value_slice(i); // Recursively populate the kdb parent mixed list from that slice - kK(k_array)[index++] = ReadArray(value_slice, type_overrides); + kK(k_array)[index++] = read_array(value_slice, type_overrides); } } // An arrow map array is a nested set of key/item paired child arrays. This is // represented in kdb as a mixed list for the parent map array, with a // dictionary for each map value set. -void AppendMap(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +void AppendMap(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides, ReadArrayCommon read_array) { auto map_array = static_pointer_cast(array_data); auto keys = map_array->keys(); @@ -51,7 +54,7 @@ void AppendMap(shared_ptr array_data, K k_array, size_t& index, Ty auto items_slice = items->Slice(map_array->value_offset(i), map_array->value_length(i)); // Recursively populate the kdb parent mixed list with a dictionary // populated from those slices - kK(k_array)[index++] = xD(ReadArray(keys_slice, type_overrides), ReadArray(items_slice, type_overrides)); + kK(k_array)[index++] = xD(read_array(keys_slice, type_overrides), read_array(items_slice, type_overrides)); } } @@ -60,7 +63,7 @@ void AppendMap(shared_ptr array_data, K k_array, size_t& index, Ty // value is obtaining by slicing across all the child arrays at a given index. // This is represented in kdb as a mixed list for the parent struct array, // containing child lists for each field in the struct. 
-void AppendStruct(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +void AppendStruct(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides, AppendArrayCommon append_array) { auto struct_array = static_pointer_cast(array_data); auto num_fields = struct_array->type()->num_fields(); @@ -69,7 +72,7 @@ void AppendStruct(shared_ptr array_data, K k_array, size_t& index, // Only advance the index into the kdb mixed list at the end once all child // lists have been populated from the same initial index auto temp_index = index; - AppendArray(field_array, kK(k_array)[i], temp_index, type_overrides); + append_array(field_array, kK(k_array)[i], temp_index, type_overrides); } index += array_data->length(); } @@ -77,7 +80,7 @@ void AppendStruct(shared_ptr array_data, K k_array, size_t& index, // An arrow union array is similar to a struct array except that it has an // additional type id array which identifies the live field in each union value // set. -void AppendUnion(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +void AppendUnion(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides, AppendArrayCommon append_array) { auto union_array = static_pointer_cast(array_data); @@ -93,14 +96,14 @@ void AppendUnion(shared_ptr array_data, K k_array, size_t& index, // Only advance the index into the kdb mixed list at the end once all child // lists have been populated from the same initial index auto temp_index = index; - AppendArray(field_array, kK(k_array)[i + 1], temp_index, type_overrides); + append_array(field_array, kK(k_array)[i + 1], temp_index, type_overrides); } index += array_data->length(); } // An arrow dictionary array is represented in kdb as a mixed list for the // parent dictionary array containing the values and indicies sub-lists. 
-void AppendDictionary(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +void AppendDictionary(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides, ReadArrayCommon read_array) { auto dictionary_array = static_pointer_cast(array_data); @@ -108,9 +111,9 @@ void AppendDictionary(shared_ptr array_data, K k_array, size_t& in // two child arrays could be a different length to each other and the parent // dictionary array which makes it difficult to preallocate the kdb lists of // the correct length. - K values = ReadArray(dictionary_array->dictionary(), type_overrides); + K values = read_array(dictionary_array->dictionary(), type_overrides); jv(&kK(k_array)[0], values); - K indices = ReadArray(dictionary_array->indices(), type_overrides); + K indices = read_array(dictionary_array->indices(), type_overrides); jv(&kK(k_array)[1], indices); } @@ -525,37 +528,37 @@ void AppendArray(shared_ptr array_ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - AppendList(array_data, k_array, index, type_overrides); + AppendList(array_data, k_array, index, type_overrides, kx::arrowkdb::ReadArray); } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - AppendList(array_data, k_array, index, type_overrides); + AppendList(array_data, k_array, index, type_overrides, kx::arrowkdb::ReadArray); } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - AppendList(array_data, k_array, index, type_overrides); + AppendList(array_data, k_array, index, type_overrides, kx::arrowkdb::ReadArray); } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - AppendMap(array_data, k_array, index, type_overrides); + AppendMap(array_data, k_array, index, type_overrides, 
kx::arrowkdb::ReadArray); } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - AppendStruct(array_data, k_array, index, type_overrides); + AppendStruct(array_data, k_array, index, type_overrides, kx::arrowkdb::AppendArray); } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - AppendUnion(array_data, k_array, index, type_overrides); + AppendUnion(array_data, k_array, index, type_overrides, kx::arrowkdb::AppendArray); } template<> @@ -567,7 +570,7 @@ void AppendArray(shared_ptr array_data, template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - AppendDictionary(array_data, k_array, index, type_overrides); + AppendDictionary(array_data, k_array, index, type_overrides, kx::arrowkdb::ReadArray); } using ArrayHandler = void (*) (shared_ptr, K, size_t&, TypeMappingOverride&); @@ -616,192 +619,76 @@ unordered_map ArrayHandlers { , make_array_handler() }; -using NestedHandler = K ( * )(shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ); - -extern unordered_map NestedHandlers; - -template -K AppendNestedList( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ) -{ - auto slice_array = static_pointer_cast( array_data )->value_slice( index ); - auto length = slice_array->length(); - auto type_id = slice_array->type_id(); +using NullBitmapHandler = void ( * )(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides); - size_t counter = 0; - K k_bitmap = ( NestedHandlers.find( type_id ) == NestedHandlers.end() ) - ? 
ktn( KB, length ) - : knk( 0 ); - auto slice = slice_array->Slice( 0, length ); - InitKdbNullBitmap( slice, &k_bitmap, counter, type_overrides ); - ++index; - - return k_bitmap; -} +extern unordered_map NullBitmapHandlers; template -K AppendNested( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ); +void AppendArrayNullBitmap(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides); template<> -K AppendNested( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ) +void AppendArrayNullBitmap(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - return AppendNestedList( array_data, index, type_overrides ); + AppendList(array_data, k_array, index, type_overrides, kx::arrowkdb::ReadArrayNullBitmap); } template<> -K AppendNested( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ) +void AppendArrayNullBitmap(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - return AppendNestedList( array_data, index, type_overrides ); + AppendList(array_data, k_array, index, type_overrides, kx::arrowkdb::ReadArrayNullBitmap); } template<> -K AppendNested( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ) +void AppendArrayNullBitmap(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - return AppendNestedList( array_data, index, type_overrides ); + AppendList(array_data, k_array, index, type_overrides, kx::arrowkdb::ReadArrayNullBitmap); } template<> -K AppendNested( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ) +void AppendArrayNullBitmap(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto map_array = static_pointer_cast( array_data ); - auto keys = map_array->keys(); - auto items = map_array->items(); - auto keys_type_id = keys->type_id(); - auto items_type_id = items->type_id(); 
- auto length = map_array->length(); - - K k_bitmap = knk( length ); - for( auto i = 0; i < length; ++i ){ - auto keys_slice = keys->Slice( map_array->value_offset( i ), map_array->value_length( i ) ); - auto keys_length = keys_slice->length(); - K k_keys = ( NestedHandlers.find( keys_type_id ) == NestedHandlers.end() ) - ? ktn( KB, keys_length ) - : knk( 0 ); - - auto items_slice = items->Slice( map_array->value_offset( i ), map_array->value_length( i ) ); - auto items_length = items_slice->length(); - K k_items = ( NestedHandlers.find( items_type_id ) == NestedHandlers.end() ) - ? ktn( KB, items_length ) - : knk( 0 ); - - size_t keys_counter = 0; - size_t items_counter = 0; - InitKdbNullBitmap( keys_slice, &k_keys, keys_counter, type_overrides ); - InitKdbNullBitmap( items_slice, &k_items, items_counter, type_overrides ); - kK( k_bitmap )[i] = xD( k_keys, k_items ); - } - index += length; - - return k_bitmap; + AppendMap(array_data, k_array, index, type_overrides, kx::arrowkdb::ReadArrayNullBitmap); } template<> -K AppendNested( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ) +void AppendArrayNullBitmap(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - size_t length = 0; - auto struct_array = static_pointer_cast( array_data ); - auto num_fields = struct_array->type()->num_fields(); - - K k_bitmap = knk( num_fields ); - for( int i = 0; i < num_fields; ++i ){ - auto field = struct_array->field( i ); - auto type_id = field->type_id(); - length = field->length(); - - size_t counter = 0; - kK( k_bitmap )[i] = ( NestedHandlers.find( type_id ) == NestedHandlers.end() ) - ? 
ktn( KB, length ) - : knk( 0 ); - InitKdbNullBitmap( field, &kK( k_bitmap )[i], counter, type_overrides ); - } - index += length; - - return k_bitmap; + AppendStruct(array_data, k_array, index, type_overrides, kx::arrowkdb::AppendArrayNullBitmap); } template<> -K AppendNested( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ) +void AppendArrayNullBitmap(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto union_array = static_pointer_cast( array_data ); - auto length = union_array->length(); - auto num_fields = union_array->num_fields(); - - // The type_id array is represented as a KH list at the start of the parent mixed list. - K type_ids = ktn( KH, length ); - for( int i = 0; i < length; ++i ){ - kH( type_ids )[i] = union_array->child_id( i ); - } - - K k_bitmap = knk( num_fields + 1, type_ids ); - for( int i = 0; i < num_fields; ++i ){ - auto field_array = union_array->field( i ); - auto type_id = field_array->type_id(); - auto field_length = field_array->length(); - - K k_field = ( NestedHandlers.find( type_id ) == NestedHandlers.end() ) - ? 
ktn( KB, field_length ) - : knk( 0 ); - - size_t counter = 0; - InitKdbNullBitmap( field_array, &k_field, counter, type_overrides ); - kK( k_bitmap )[i+1] = k_field; - } - index += length; - - return k_bitmap; + AppendUnion(array_data, k_array, index, type_overrides, kx::arrowkdb::AppendArrayNullBitmap); } template<> -K AppendNested( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ) +void AppendArrayNullBitmap(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - return NestedHandlers[arrow::Type::SPARSE_UNION]( array_data, index, type_overrides ); + AppendArray(array_data, k_array, index, type_overrides); } template<> -K AppendNested( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ) +void AppendArrayNullBitmap(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto dictionary_array = static_pointer_cast( array_data ); - auto length = dictionary_array->length(); - - auto items = dictionary_array->dictionary(); - auto items_type_id = items->type_id(); - auto items_length = items->length(); - K k_items = ( NestedHandlers.find( items_type_id ) == NestedHandlers.end() ) - ? ktn( KB, items_length ) - : knk( 0 ); - - auto indices = dictionary_array->indices(); - auto indices_type_id = indices->type_id(); - auto indices_length = indices->length(); - K k_indices = ( NestedHandlers.find( indices_type_id ) == NestedHandlers.end() ) - ? 
ktn( KB, indices_length ) - : knk( 0 ); - - size_t items_counter = 0; - size_t indices_counter = 0; - InitKdbNullBitmap( items, &k_items, items_counter, type_overrides ); - InitKdbNullBitmap( indices, &k_indices, indices_counter, type_overrides ); - - K k_bitmap = knk( 2, k_items, k_indices ); - index += length; - - return k_bitmap; + AppendDictionary(array_data, k_array, index, type_overrides, kx::arrowkdb::ReadArrayNullBitmap); } template -auto make_nested_handler() +auto make_append_array_null_bitmap_handler() { - return make_pair( TypeId, &AppendNested ); + return make_pair( TypeId, &AppendArrayNullBitmap ); } -unordered_map NestedHandlers{ - make_nested_handler() - , make_nested_handler() - , make_nested_handler() - , make_nested_handler() - , make_nested_handler() - , make_nested_handler() - , make_nested_handler() - , make_nested_handler() +unordered_map NullBitmapHandlers{ + make_append_array_null_bitmap_handler() + , make_append_array_null_bitmap_handler() + , make_append_array_null_bitmap_handler() + , make_append_array_null_bitmap_handler() + , make_append_array_null_bitmap_handler() + , make_append_array_null_bitmap_handler() + , make_append_array_null_bitmap_handler() + , make_append_array_null_bitmap_handler() }; } // namespace @@ -822,24 +709,28 @@ void AppendArray(shared_ptr array_data, K k_array, size_t& index, } } -void InitKdbNullBitmap( shared_ptr array_data, K* k_bitmap, size_t& index, TypeMappingOverride& type_overrides ) +void AppendArrayNullBitmap(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto type_id = array_data->type_id(); - auto length = array_data->length(); - - for( int i = 0ll; i < length; ++i ){ - if( NestedHandlers.find( type_id ) == NestedHandlers.end() ){ - kG( *k_bitmap )[index++] = array_data->IsNull( i ); - } - else{ - auto pos = index; - *k_bitmap = jk( k_bitmap, NestedHandlers[type_id]( array_data, index, type_overrides ) ); - i += index - pos - 1; - } + if 
(NullBitmapHandlers.find(type_id) == NullBitmapHandlers.end()) + { + for (int i = 0ll; i < array_data->length(); ++i) + kG(k_array)[index++] = array_data->IsNull(i); + } else + { + NullBitmapHandlers[type_id](array_data, k_array, index, type_overrides); } } -K InitKdbForArray(shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides) +KdbType GetKdbTypeNullBitmap(std::shared_ptr datatype, TypeMappingOverride& type_overrides) +{ + if (NullBitmapHandlers.find(datatype->id()) == NullBitmapHandlers.end()) + return KB; + else + return 0; +} + +K InitKdbForArray(shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides, GetKdbTypeCommon get_kdb_type) { switch (datatype->id()) { case arrow::Type::STRUCT: @@ -849,7 +740,7 @@ K InitKdbForArray(shared_ptr datatype, size_t length, TypeMappi K result = knk(num_fields); for (auto i = 0; i < num_fields; ++i) { auto field = datatype->field(i); - kK(result)[i] = InitKdbForArray(field->type(), length, type_overrides); + kK(result)[i] = InitKdbForArray(field->type(), length, type_overrides, get_kdb_type); } return result; } @@ -862,7 +753,7 @@ K InitKdbForArray(shared_ptr datatype, size_t length, TypeMappi kK(result)[0] = ktn(KH, length); // type_id list for (auto i = 0; i < num_fields; ++i) { auto field = datatype->field(i); - kK(result)[i + 1] = InitKdbForArray(field->type(), length, type_overrides); + kK(result)[i + 1] = InitKdbForArray(field->type(), length, type_overrides, get_kdb_type); } return result; } @@ -874,47 +765,48 @@ K InitKdbForArray(shared_ptr datatype, size_t length, TypeMappi // Do not preallocate the child lists since AppendDictionary has to join to the // indicies and values lists - kK(result)[0] = InitKdbForArray(dictionary_type->value_type(), 0, type_overrides); - kK(result)[1] = InitKdbForArray(dictionary_type->index_type(), 0, type_overrides); + kK(result)[0] = InitKdbForArray(dictionary_type->value_type(), 0, type_overrides, get_kdb_type); + kK(result)[1] = 
InitKdbForArray(dictionary_type->index_type(), 0, type_overrides, get_kdb_type); return result; } default: - return ktn(GetKdbType(datatype, type_overrides), length); + return ktn(get_kdb_type(datatype, type_overrides), length); } } K ReadArray(shared_ptr array, TypeMappingOverride& type_overrides) { - K k_array = InitKdbForArray(array->type(), array->length(), type_overrides); + K k_array = InitKdbForArray(array->type(), array->length(), type_overrides, GetKdbType); size_t index = 0; AppendArray(array, k_array, index, type_overrides); return k_array; } +K ReadArrayNullBitmap(shared_ptr array, TypeMappingOverride& type_overrides) +{ + K k_array = InitKdbForArray(array->type(), array->length(), type_overrides, GetKdbTypeNullBitmap); + size_t index = 0; + AppendArrayNullBitmap(array, k_array, index, type_overrides); + return k_array; +} + K ReadChunkedArray(shared_ptr chunked_array, TypeMappingOverride& type_overrides) { - K k_array = InitKdbForArray(chunked_array->type(), chunked_array->length(), type_overrides); + K k_array = InitKdbForArray(chunked_array->type(), chunked_array->length(), type_overrides, GetKdbType); size_t index = 0; for (auto j = 0; j < chunked_array->num_chunks(); ++j) AppendArray(chunked_array->chunk(j), k_array, index, type_overrides); return k_array; } -K ReadChunkedNullBitmap( shared_ptr chunked_array, TypeMappingOverride& type_overrides ) +K ReadChunkedArrayNullBitmap(shared_ptr chunked_array, TypeMappingOverride& type_overrides) { - auto length = chunked_array->length(); - auto type_id = chunked_array->type()->id(); - K k_bitmap = ( NestedHandlers.find( type_id ) == NestedHandlers.end() ) - ? 
ktn( KB, length ) - : knk( 0 ); - + K k_array = InitKdbForArray(chunked_array->type(), chunked_array->length(), type_overrides, GetKdbTypeNullBitmap); size_t index = 0; - for( auto i = 0; i < chunked_array->num_chunks(); ++i ){ - InitKdbNullBitmap( chunked_array->chunk( i ), &k_bitmap, index, type_overrides ); - } - - return k_bitmap; + for (auto j = 0; j < chunked_array->num_chunks(); ++j) + AppendArrayNullBitmap(chunked_array->chunk(j), k_array, index, type_overrides); + return k_array; } } // namespace arrowkdb diff --git a/src/ArrayReader.h b/src/ArrayReader.h index fbb2662..22b11d2 100644 --- a/src/ArrayReader.h +++ b/src/ArrayReader.h @@ -26,6 +26,7 @@ namespace arrowkdb { * length of the array array. */ void AppendArray(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides); +void AppendArrayNullBitmap(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides); /** * @brief Copies and converts an arrow array to a kdb list @@ -35,6 +36,7 @@ void AppendArray(std::shared_ptr array_data, K k_array, size_t& in * @return A kdb list represented the arrow array */ K ReadArray(std::shared_ptr array, TypeMappingOverride& type_overrides); +K ReadArrayNullBitmap(std::shared_ptr array, TypeMappingOverride& type_overrides); /** * @brief An arrow chunked array is a set of sub-arrays which are logically but not @@ -55,7 +57,7 @@ K ReadChunkedArray(std::shared_ptr chunked_array, TypeMappi * @param type_overrides Overrides for type mappings configured by KdbOptions * @return A kdb list representing the nulls bitmap */ -K ReadChunkedNullBitmap( std::shared_ptr chunked_array, TypeMappingOverride& type_overrides ); +K ReadChunkedArrayNullBitmap( std::shared_ptr chunked_array, TypeMappingOverride& type_overrides ); /** * @brief Creates a kdb list of the correct type and specified length according @@ -67,7 +69,7 @@ K ReadChunkedNullBitmap( std::shared_ptr chunked_array, Typ * @param type_overrides Overrides for 
type mappings configured by KdbOptions * @return Newly created kdb list */ -K InitKdbForArray(std::shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides); +K InitKdbForArray(std::shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides, GetKdbTypeCommon get_kdb_type); /** * @brief Appends null bitmap data from an arrow array into an existing kdb boolean diff --git a/src/HelperFunctions.h b/src/HelperFunctions.h index 001e46f..36ac03e 100644 --- a/src/HelperFunctions.h +++ b/src/HelperFunctions.h @@ -89,6 +89,7 @@ inline bool is_equal( T lhs, T rhs ) return ::fabs( lhs -= rhs ) <= epsilon; } + ////////////////// // TYPE MAPPING // ////////////////// @@ -170,6 +171,13 @@ KdbType GetKdbType(std::shared_ptr datatype, TypeMappingOverrid */ std::shared_ptr GetArrowType(K k_array); + +/////////////////////// +// FUNCTION HANDLERS // +/////////////////////// + +typedef std::function datatype, TypeMappingOverride& type_overrides)> GetKdbTypeCommon; + } // namespace arrowkdb } // namespace kx diff --git a/src/TableData.cpp b/src/TableData.cpp index f076f6f..4b76ae6 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -325,7 +325,7 @@ K readParquetData(K parquet_file, K options) K bitmap = ktn( 0, col_num ); for( auto i = 0; i < col_num; ++i ){ auto chunked_array = table->column( i ); - kK( bitmap )[i] = kx::arrowkdb::ReadChunkedNullBitmap( chunked_array, type_overrides ); + kK( bitmap )[i] = kx::arrowkdb::ReadChunkedArrayNullBitmap( chunked_array, type_overrides ); } K array = data; data = ktn( 0, 2 ); @@ -620,7 +620,7 @@ K readArrowData(K arrow_file, K options) for( auto batch: all_batches ) column_arrays.push_back( batch->column( i ) ); auto chunked_array = std::make_shared( column_arrays ); - kK( bitmap )[i] = kx::arrowkdb::ReadChunkedNullBitmap( chunked_array, type_overrides ); + kK( bitmap )[i] = kx::arrowkdb::ReadChunkedArrayNullBitmap( chunked_array, type_overrides ); } K array = data; data = ktn( 0, 2 ); @@ -779,7 +779,7 @@ K 
parseArrowData(K char_array, K options) for( auto batch: all_batches ) column_arrays.push_back( batch->column( i ) ); auto chunked_array = std::make_shared( column_arrays ); - kK( bitmap )[i] = kx::arrowkdb::ReadChunkedNullBitmap( chunked_array, type_overrides ); + kK( bitmap )[i] = kx::arrowkdb::ReadChunkedArrayNullBitmap( chunked_array, type_overrides ); } K array = data; data = ktn( 0, 2 ); From 6121a7e4d37294740c5ecb4b889ca90aff35a91a Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 8 Mar 2023 13:46:16 +0300 Subject: [PATCH 115/276] Unit-test for structs usability improvement --- examples/null_bitmap.q | 6 +++--- tests/null_bitmap/nested_null_bitmap.t | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index 742220d..135c2ab 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -221,7 +221,7 @@ parquet_dict_nulls:parquet_dict_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count parquet_bitmap_nulls;parquet_bitmap_nulls] show nested_list_nulls~parquet_list_nulls -show nested_struct_nulls~parquet_struct_nulls[0] +show nested_struct_nulls~parquet_struct_nulls show nested_dict_nulls[0]~parquet_dict_nulls[0] show nested_map_nulls~last[parquet_dict_nulls][0] @@ -285,7 +285,7 @@ arrow_union_nulls:arrow_union_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bitmap_nulls] show nested_list_nulls~arrow_list_nulls -show nested_struct_nulls~arrow_struct_nulls[0] +show nested_struct_nulls~arrow_struct_nulls show nested_dict_nulls~first[arrow_dict_nulls][0] show nested_map_nulls~last[arrow_dict_nulls][0] show nested_union_nulls~arrow_union_nulls[0][0] @@ -347,7 +347,7 @@ stream_union_nulls:stream_union_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count stream_bitmap_nulls;stream_bitmap_nulls] show nested_list_nulls~stream_list_nulls -show nested_struct_nulls~stream_struct_nulls[0] +show nested_struct_nulls~stream_struct_nulls 
show nested_dict_nulls~first[stream_dict_nulls][0] show nested_map_nulls~last[stream_dict_nulls][0] show nested_union_nulls~stream_union_nulls[0][0] diff --git a/tests/null_bitmap/nested_null_bitmap.t b/tests/null_bitmap/nested_null_bitmap.t index 57f4bae..a3f7706 100644 --- a/tests/null_bitmap/nested_null_bitmap.t +++ b/tests/null_bitmap/nested_null_bitmap.t @@ -91,7 +91,7 @@ nested_struct_nulls:(10000b;01000b;00100b) parquet_list_nulls:first parquet_nested_data[1] parquet_struct_nulls:last parquet_nested_data[1] nested_list_nulls~parquet_list_nulls -nested_struct_nulls~parquet_struct_nulls[0] +nested_struct_nulls~parquet_struct_nulls rm parquet_nested_bitmap; @@ -112,7 +112,7 @@ nested_data~first arrow_nested_data arrow_list_nulls:first arrow_nested_data[1] arrow_struct_nulls:last arrow_nested_data[1] nested_list_nulls~arrow_list_nulls -nested_struct_nulls~arrow_struct_nulls[0] +nested_struct_nulls~arrow_struct_nulls rm arrow_nested_bitmap; @@ -132,7 +132,7 @@ nested_data~first stream_nested_data stream_list_nulls:first stream_nested_data[1] stream_struct_nulls:last stream_nested_data[1] nested_list_nulls~stream_list_nulls -nested_struct_nulls~stream_struct_nulls[0] +nested_struct_nulls~stream_struct_nulls -1 "\n+----------|| Test utils ||----------+\n"; From cf809e3a8b94f49cdbfc67aa1b4253004ea1ada7 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 8 Mar 2023 13:46:47 +0300 Subject: [PATCH 116/276] Struct usability improvement, joining sublists --- src/ArrayReader.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index c7e8f3a..7e72a8a 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -831,6 +831,11 @@ void InitKdbNullBitmap( shared_ptr array_data, K* k_bitmap, size_t if( NestedHandlers.find( type_id ) == NestedHandlers.end() ){ kG( *k_bitmap )[index++] = array_data->IsNull( i ); } + else if( arrow::Type::STRUCT == type_id ){ + auto pos = index; + *k_bitmap = jv( k_bitmap, 
NestedHandlers[type_id]( array_data, index, type_overrides ) ); + i += index - pos - 1; + } else{ auto pos = index; *k_bitmap = jk( k_bitmap, NestedHandlers[type_id]( array_data, index, type_overrides ) ); From e53c7d9404bafd86bb06040095a049b5fee7a3b7 Mon Sep 17 00:00:00 2001 From: nmcdonnell-kx Date: Wed, 8 Mar 2023 12:29:14 +0000 Subject: [PATCH 117/276] KXI-22441 Read arrow null bitmap as separate structure * Bugfix to AppendArrayNullBitmap * Fix unit tests --- src/ArrayReader.cpp | 2 +- tests/null_bitmap/glossary_null_bitmap.t | 12 +++++------- tests/null_bitmap/nested_null_bitmap.t | 6 +++--- tests/null_bitmap/union_null_bitmap.t | 12 ++++-------- 4 files changed, 13 insertions(+), 19 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index a58c800..d958d0d 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -665,7 +665,7 @@ void AppendArrayNullBitmap(shared_ptr a template<> void AppendArrayNullBitmap(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - AppendArray(array_data, k_array, index, type_overrides); + AppendArrayNullBitmap(array_data, k_array, index, type_overrides); } template<> diff --git a/tests/null_bitmap/glossary_null_bitmap.t b/tests/null_bitmap/glossary_null_bitmap.t index 15396da..2d6d299 100644 --- a/tests/null_bitmap/glossary_null_bitmap.t +++ b/tests/null_bitmap/glossary_null_bitmap.t @@ -65,9 +65,9 @@ parquet_glossary_data:.arrowkdb.pq.readParquetData[parquet_glossary_bitmap;gloss glossary_data~first parquet_glossary_data -1"\n+----------|| Compare null bitmaps of parquet data ||----------+\n"; -glossary_nulls:((enlist 0b)!(enlist 0b);00b!01b;000b!000b) -parquet_glossary_nulls:parquet_glossary_data[1] -glossary_nulls~parquet_glossary_nulls[0][0] +null_data:((enlist 0b)!(enlist 0b);00b!01b;000b!000b) +glossary_nulls:enlist null_data +glossary_nulls~last parquet_glossary_data rm parquet_glossary_bitmap; @@ -85,8 +85,7 @@ 
arrow_glossary_data:.arrowkdb.ipc.readArrowData[arrow_glossary_bitmap;glossary_o glossary_data~first arrow_glossary_data -1"\n+----------|| Compare null bitmaps of arrow data ||----------+\n"; -arrow_glossary_nulls:arrow_glossary_data[1] -glossary_nulls~arrow_glossary_nulls[0][0] +glossary_nulls~last arrow_glossary_data rm arrow_glossary_bitmap; @@ -103,8 +102,7 @@ stream_glossary_data:.arrowkdb.ipc.parseArrowData[serialized_glossary;glossary_o glossary_data~first stream_glossary_data -1"\n+----------|| Compare null bitmaps of stream data ||----------+\n"; -stream_glossary_nulls:stream_glossary_data[1] -glossary_nulls~stream_glossary_nulls[0][0] +glossary_nulls~last stream_glossary_data -1 "\n+----------|| Test utils ||----------+\n"; diff --git a/tests/null_bitmap/nested_null_bitmap.t b/tests/null_bitmap/nested_null_bitmap.t index 57f4bae..a3f7706 100644 --- a/tests/null_bitmap/nested_null_bitmap.t +++ b/tests/null_bitmap/nested_null_bitmap.t @@ -91,7 +91,7 @@ nested_struct_nulls:(10000b;01000b;00100b) parquet_list_nulls:first parquet_nested_data[1] parquet_struct_nulls:last parquet_nested_data[1] nested_list_nulls~parquet_list_nulls -nested_struct_nulls~parquet_struct_nulls[0] +nested_struct_nulls~parquet_struct_nulls rm parquet_nested_bitmap; @@ -112,7 +112,7 @@ nested_data~first arrow_nested_data arrow_list_nulls:first arrow_nested_data[1] arrow_struct_nulls:last arrow_nested_data[1] nested_list_nulls~arrow_list_nulls -nested_struct_nulls~arrow_struct_nulls[0] +nested_struct_nulls~arrow_struct_nulls rm arrow_nested_bitmap; @@ -132,7 +132,7 @@ nested_data~first stream_nested_data stream_list_nulls:first stream_nested_data[1] stream_struct_nulls:last stream_nested_data[1] nested_list_nulls~stream_list_nulls -nested_struct_nulls~stream_struct_nulls[0] +nested_struct_nulls~stream_struct_nulls -1 "\n+----------|| Test utils ||----------+\n"; diff --git a/tests/null_bitmap/union_null_bitmap.t b/tests/null_bitmap/union_null_bitmap.t index 2f6580e..867e85c 100644 --- 
a/tests/null_bitmap/union_null_bitmap.t +++ b/tests/null_bitmap/union_null_bitmap.t @@ -63,11 +63,9 @@ arrow_union_data:.arrowkdb.ipc.readArrowData[arrow_union_bitmap;union_options]; union_data~first arrow_union_data -1"\n+----------|| Compare null bitmaps of arrow data ||----------+\n"; -nested_union_nulls:((0 1 0h);100b;010b); - -arrow_union_nulls:arrow_union_data[1] -nested_union_nulls~arrow_union_nulls[0][0] -nested_union_nulls~arrow_union_nulls[1][0] +sparse_nulls:dense_nulls:((0 1 0h);100b;010b); +union_nulls:(sparse_nulls;dense_nulls); +union_nulls~last arrow_union_data rm arrow_union_bitmap; @@ -84,9 +82,7 @@ stream_union_data:.arrowkdb.ipc.parseArrowData[serialized_nested_union;union_opt union_data~first stream_union_data -1"\n+----------|| Compare null bitmaps of stream data ||----------+\n"; -stream_union_nulls:stream_union_data[1] -nested_union_nulls~stream_union_nulls[0][0] -nested_union_nulls~stream_union_nulls[1][0] +union_nulls~last stream_union_data -1 "\n+----------|| Test utils ||----------+\n"; From bd6aafa230bc40f88cfd3f56f958da533f58a0a7 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 8 Mar 2023 19:36:07 +0300 Subject: [PATCH 118/276] Markdown formatting of high-level design document of Null Mapping feature --- docs/null-mapping.md | 125 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 docs/null-mapping.md diff --git a/docs/null-mapping.md b/docs/null-mapping.md new file mode 100644 index 0000000..afdbb42 --- /dev/null +++ b/docs/null-mapping.md @@ -0,0 +1,125 @@ +# Arrowkdb null mapping + +## Background + +The basic unit for storing data in arrow is an array.  
Each array contains:  + +- Datatype identifier  + +- Length  + +- Block of data (length as above) and accessors  + +- A null bitmap (length as above) + +Arrowkdb converts an arrow array to a kdb list and vice versa with type mapping as required:  + +- Simple datatypes (ints, floats) are memcpy-ed  + +- Temporal datatypes are copied one item at a time with the appropriate epoch offsetting and scaling  + +- String and binary datatypes are copied into a mixed list of char or byte lists  + +- Nested datatypes (list, map, struct, union, dictionaries) are represented by a mixed list of sublists, depending on the child datatypes (using recursion to populate the child lists) + + Full details are provided here https://code.kx.com/q/interfaces/arrow/arrow-types/ + +An arrow table is a set of such arrays, which arrowkdb converts to a mixed list of lists, one per column (although in the case of the nested datatypes one column can be represented by a further set of lists).  By decorating with the field names from the schema, this mixed list of array data then becomes a kdb table. + +# + +# Mapping arrow nulls to kdb nulls + +## Approach + +Currently the simple datatype arrays are memcpy-ed like: + +```cpp + case arrow::Type::UINT16: + { + auto uint16_array = std::static_pointer_cast(array_data); + memcpy(kH(k_array), uint16_array->raw_values(), uint16_array->length() * sizeof(arrow::UInt16Array::value_type)); + break; + } +``` + +A simple approach to map arrow nulls to kdb nulls is to change this to: + +```cpp + case arrow::Type::INT16: + { + auto int16_array = std::static_pointer_cast(array_data); + + for (auto i = 0; i < int16_array->length(); ++i) + if (int16_array->IsNull(i)) + kH(k_array)[i] = INT16_MIN; + else + kH(k_array)[i] = int16_array->Value(i); + break; + } +``` + +The issue with this is that it would result in a significant drop in performance due to inevitable failures in branch prediction.  
However, with some arithmetic trickery the same functionality can be modelled without a branch: + +```cpp + case arrow::Type::INT16: + { + auto int16_array = std::static_pointer_cast(array_data); + for (auto i = 0; i < int16_array->length(); ++i) + kH(k_array)[i] = (int16_array->IsNull(i) * INT16_MIN) + (!int16_array->IsNull(i) * int16_array->Value(i)); + break; + } +``` + +Although there would still be a loss of performance (memcpy copies 64 bits at a time so doing an item by item copy where the datatype < 64 bits will be slower, plus the overhead of indexing into the null bitmap), that loss should not be significant.  + +Note: the examples above refer to the reader functions but similar functionality would be provided for the writer functions which would perform the reverse operation (setting the arrow null bitmap if the value is a kdb null). + +## Considerations + +The bigger problem is that unlike arrow, not all kdb types have a null value.  Also, those that do overload one value in the range (typically INT_MIN or FLOAT_MIN).  For example: + +- Each item in an arrow boolean array can be 0b, 1b or NULL.  Kdb has no boolean null.  Similarly it doesn't have a byte null.  + +- Unlike arrow, kdb can't distinguish between a null string and empty string.  Similarly it can't distinguish between the " " character and null.  + +- Int16 value of -32768 (a valid non-null value in arrow).  In kdb -32768 == 0Nh (null).  Similarly for the other int, float and temporal types  + +- Etc. + +Therefore mapping arrow nulls to kdb nulls is going to result in corner cases which can't be represented accurately.  However, the type mapping elsewhere in arrowkdb already has corner cases: + +- db has no unsigned integer types.  Arrow unsigned ints are represented in kdb as signed ints.  Therefore if the top bit of an unsigned is set in arrow, it will display as a negative number in kdb.  + +- Converting from kdb temporals to arrow temporals can result in a loss of precision. 
+ +A compromise would be to allow the user to specify how arrowkdb should map nulls.  Each reader and writer function in arrowkdb takes an options dictionary.  A new `NULL_MAPPING option would be added which allows the user to specify whether an arrow datatype should be null mapped and what value to use for null.  For example: + +```q +q)options`NULL_MAPPING +int16 | 0Nh +int32 | 0Ni +int64 | 0N +float32 | 0Ne +float64 | 0n +date32 | 0Nd +date64 | 0Np +month_interval | 0Nm +day_time_interval| 0Nn +timestamp | 0Np +time32 | 0Nt +time64 | 0Nn +duration | 0Nn +utf8 | "" +binary | `byte$() +``` + +The type of each value in this dictionary must be the atomic type of the corresponding list representation for that datatype.  Where a datatype isn't present in this dictionary, arrowkdb would ignore the null bitmap (as per the existing behaviour). + +Note: There is no null mapping for arrow arrays which use nested datatypes (list, map, struct, union, dictionaries) where the parent array contains null values.  For example, an array with a struct datatype in arrow can have either null child field values or the parent struct value could be null.  Arrowkdb will only map nulls for the child fields using the above mapping. + +Conclusions + +Mapping arrow nulls to kdb nulls is considerably easier to implement and more intuitive for a kdb user. + +Therefore, subject to review of this document by users, the better choice is null mapping. 
From 212f63afbd7115a010208121c2ae7447cd538275 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 8 Mar 2023 19:36:26 +0300 Subject: [PATCH 119/276] Markdown formatting of high-level design document of Null Bitmap feature --- docs/null-bitmap.md | 76 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 docs/null-bitmap.md diff --git a/docs/null-bitmap.md b/docs/null-bitmap.md new file mode 100644 index 0000000..392fa88 --- /dev/null +++ b/docs/null-bitmap.md @@ -0,0 +1,76 @@ +# Arrowkdb null bitmap + +## Problem + +Arrowkdb ignores the null bitmap when reading or writing an arrow array.  Even if the null bitmap is set for an array item, arrowkdb still reads its value from the array data.  This was done for a couple of reasons:  + +- Kdb doesn't have proper distinct null values for its types.  Using the kdb null values will result in some strange corner cases.  The only way to do this properly would be to expose the array data separately to the null bitmap (in line with how arrow represents nulls).  But this would make the API more complex.  + +- Mapping to kdb nulls would hurt the performance.  For simple types (ints, floats, etc.) arrowkdb bulk copies the entire arrow array into a kdb list using memcpy.  Having to check every array item for null then either use its value or the closest kdb null would require processing one item at a time.  + +However, null support in arrowkdb has been requested by users so potential implementations are to be considered. + +# Exposing the null bitmap to the kdb user + +## Approach + +Arrowkdb represents an arrow table (which is a set of arrow arrays) as a mixed list of lists.  
This is then decorated with the field names from the schema to create a kdb table, similar to: + +```q +q)field_names:`col1`col2`col3 +q)array_data:(3?0i;`float$3?0;3?0p) +q)GetTable:{flip field_names!array_data} +q)GetTable[] +col1 col2 col3 +----------------------------------------------------- +-239800692 -1.16675e+18 2003.05.24D03:45:53.202889856 +-930424766 4.413091e+18 2001.07.22D09:51:37.461634128 +1760748068 2.89119e+18 2001.07.26D01:03:47.039068936 +``` + +In order to avoid the limitations described above with kdb nulls, an alternative approach is to expose the null bitmap as a separate structure to kdb (more in line with how arrow represents nulls): + +```q +q)null_bitmaps:(3?0b;3?0b;3?0b) +q)GetTableWithNulls:{((flip field_names!array_data);(flip field_names!null_bitmaps))} +q)GetTableWithNulls[] ++`col1`col2`col3!(-239800692 -930424766 1760748068i;-1.16675e+18 4.413091e+18.. ++`col1`col2`col3!(011b;110b;010b) +q)first GetTableWithNulls[] +col1 col2 col3 +----------------------------------------------------- +-239800692 -1.16675e+18 2003.05.24D03:45:53.202889856 +-930424766 4.413091e+18 2001.07.22D09:51:37.461634128 +1760748068 2.89119e+18 2001.07.26D01:03:47.039068936 +q)last GetTableWithNulls[] +col1 col2 col3 +-------------- +0 1 0 +1 1 1 +1 0 0 +``` + +The shape of the null bitmap structure would be exactly the same as the data structure.  It is then left to the user to interpret the two structures as appropriate for their application. + +Each reader function in arrowkdb takes an options dictionary.  A new `WITH_NULL_BITMAP option would be added.  When this option is set the reader functions then return a two item mixed list (the data values and null bitmap): + +```q +q)read_data_structures:.arrowkdb.pq.readParquetToTable["file.parquet";(enlist `WITH_NULL_BITMAP)!(enlist 1)] +q)read_data_structures ++`col1`col2`col3!(-239800692 -930424766 1760748068i;-1.16675e+18 4.413091e+18.. 
++`col1`col2`col3!(011b;110b;010b) +``` + +Note: it would not be possible to support the null bitmap with the writer functions without a significant rework of arrowkdb.  This is because arrow arrays are built append only, it would not be possible to populate the values with a first pass (as done currently) then populate the null bitmap with a second pass.  Rather it would be necessary to populate the data values and null bitmap in a single pass which is not possible with the current design. + +## Considerations + +This approach results in an overly complicated API and would be unintuitive for kdb users (who are more familiar with and expect kdb nulls).  + +Furthermore, how would the null bitmap be used in a kdb application?  If its only purpose is to populate the data structure with kdb nulls then it will suffer the same limitations as having arrowkdb do this mapping, while introducing unnecessary complexity.  + +Note: Since the null bitmap structure and data structure must have the same shape, arrow arrays which use nested datatypes (list, map, struct, union, dictionaries) where the parent array contains null values cannot be represented.  For example, an array with a struct datatype in arrow can have either null child field values or the parent struct value could be null.  The null bitmap structure will only reflect the null bitmap of the child field datatypes. + +## Conclusions + +Exposing the null bitmap to the kdb user, while closer to how arrow represents null, makes the API overly complex and it isn't clear whether there is a clear use case where the null bitmap could be well utilised in kdb.  Also, given the additional complexity of exposing the null bitmap, there may be other issues or corner cases which only become evident during development. 
From c8523b92452d49c961ed2668c6f9924a2927c92e Mon Sep 17 00:00:00 2001 From: vgrechin-kx <122450037+vgrechin-kx@users.noreply.github.com> Date: Wed, 8 Mar 2023 21:41:56 +0300 Subject: [PATCH 120/276] Update docs/null-mapping.md Co-authored-by: LizNorris <120571649+LizNorris@users.noreply.github.com> --- docs/null-mapping.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/null-mapping.md b/docs/null-mapping.md index afdbb42..6796c6d 100644 --- a/docs/null-mapping.md +++ b/docs/null-mapping.md @@ -77,7 +77,7 @@ Note: the examples above refer to the reader functions but similar functionality ## Considerations -The bigger problem is that unlike arrow, not all kdb types have a null value.  Also, those that do overload one value in the range (typically INT_MIN or FLOAT_MIN).  For example: +The bigger problem is that unlike arrow, not all kdb types have a null value.  Also, those that do just overload one value in the range (typically INT_MIN or FLOAT_MIN).  For example: - Each item in an arrow boolean array can be 0b, 1b or NULL.  Kdb has no boolean null.  Similarly it doesn't have a byte null.  
From db7a3e43f11bae6c94a995972eb6cf7258b39d5d Mon Sep 17 00:00:00 2001 From: vgrechin-kx <122450037+vgrechin-kx@users.noreply.github.com> Date: Wed, 8 Mar 2023 21:42:12 +0300 Subject: [PATCH 121/276] Update docs/null-mapping.md Co-authored-by: LizNorris <120571649+LizNorris@users.noreply.github.com> --- docs/null-mapping.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/null-mapping.md b/docs/null-mapping.md index 6796c6d..06cefbb 100644 --- a/docs/null-mapping.md +++ b/docs/null-mapping.md @@ -22,7 +22,7 @@ Arrowkdb converts an arrow array to a kdb list and vice versa with type mapping - Nested datatypes (list, map, struct, union, dictionaries) are represented by a mixed list of sublists, depending on the child datatypes (using recursion to populate the child lists) - Full details are provided here https://code.kx.com/q/interfaces/arrow/arrow-types/ +Full details are provided here https://code.kx.com/q/interfaces/arrow/arrow-types/ An arrow table is a set of such arrays, which arrowkdb converts to a mixed list of lists, one per column (although in the case of the nested datatypes one column can be represented by a further set of lists).  By decorating with the field names from the schema, this mixed list of array data then becomes a kdb table. 
From 2484d9c58c37e7b991f3dd582a5e92d32e6eda10 Mon Sep 17 00:00:00 2001 From: vgrechin-kx <122450037+vgrechin-kx@users.noreply.github.com> Date: Wed, 8 Mar 2023 21:42:21 +0300 Subject: [PATCH 122/276] Update docs/null-mapping.md Co-authored-by: LizNorris <120571649+LizNorris@users.noreply.github.com> --- docs/null-mapping.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/null-mapping.md b/docs/null-mapping.md index 06cefbb..a6edba0 100644 --- a/docs/null-mapping.md +++ b/docs/null-mapping.md @@ -32,7 +32,7 @@ An arrow table is a set of such arrays, which arrowkdb converts to a mixed list ## Approach -Currently the simple datatype arrays are memcpy-ed like: +Currently the simple datatype arrays are memcpy-ed as follows: ```cpp case arrow::Type::UINT16: From a65fb83bb6b64ef8b78645a6a01a311f83d67b46 Mon Sep 17 00:00:00 2001 From: vgrechin-kx <122450037+vgrechin-kx@users.noreply.github.com> Date: Wed, 8 Mar 2023 21:43:02 +0300 Subject: [PATCH 123/276] Update docs/null-mapping.md Co-authored-by: LizNorris <120571649+LizNorris@users.noreply.github.com> --- docs/null-mapping.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/null-mapping.md b/docs/null-mapping.md index a6edba0..2409c8d 100644 --- a/docs/null-mapping.md +++ b/docs/null-mapping.md @@ -79,7 +79,8 @@ Note: the examples above refer to the reader functions but similar functionality The bigger problem is that unlike arrow, not all kdb types have a null value.  Also, those that do just overload one value in the range (typically INT_MIN or FLOAT_MIN).  For example: -- Each item in an arrow boolean array can be 0b, 1b or NULL.  Kdb has no boolean null.  Similarly it doesn't have a byte null.  +- Each item in an arrow boolean array can be 0b, 1b or NULL.  kdb has no boolean null.  +- kdb doesn't have a byte null.  - Unlike arrow, kdb can't distinguish between a null string and empty string.  Similarly it can't distinguish between the " " character and null.  
From 5afc284467f0a6215161f3169aeb13299f824766 Mon Sep 17 00:00:00 2001 From: vgrechin-kx <122450037+vgrechin-kx@users.noreply.github.com> Date: Wed, 8 Mar 2023 21:43:42 +0300 Subject: [PATCH 124/276] Update docs/null-mapping.md Co-authored-by: LizNorris <120571649+LizNorris@users.noreply.github.com> --- docs/null-mapping.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/null-mapping.md b/docs/null-mapping.md index 2409c8d..814a8cb 100644 --- a/docs/null-mapping.md +++ b/docs/null-mapping.md @@ -94,7 +94,9 @@ Therefore mapping arrow nulls to kdb nulls is going to result in corner cases wh - Converting from kdb temporals to arrow temporals can result in a loss of precision. -A compromise would be to allow the user to specify how arrowkdb should map nulls.  Each reader and writer function in arrowkdb takes an options dictionary.  A new `NULL_MAPPING option would be added which allows the user to specify whether an arrow datatype should be null mapped and what value to use for null.  For example: +A compromise would be to allow the user to specify how arrowkdb should map nulls.  Each reader and writer function in arrowkdb takes an options dictionary.  A new `NULL_MAPPING` option would be added which allows the user to specify whether an arrow datatype should be null mapped and what value to use for null.  
+ +For example: ```q q)options`NULL_MAPPING From 74c437e326f296c20a8f7ce2a239fb0c1697e32f Mon Sep 17 00:00:00 2001 From: vgrechin-kx <122450037+vgrechin-kx@users.noreply.github.com> Date: Wed, 8 Mar 2023 21:43:56 +0300 Subject: [PATCH 125/276] Update docs/null-mapping.md Co-authored-by: LizNorris <120571649+LizNorris@users.noreply.github.com> --- docs/null-mapping.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/null-mapping.md b/docs/null-mapping.md index 814a8cb..37ad0c6 100644 --- a/docs/null-mapping.md +++ b/docs/null-mapping.md @@ -121,7 +121,7 @@ The type of each value in this dictionary must be the atomic type of the corresp Note: There is no null mapping for arrow arrays which use nested datatypes (list, map, struct, union, dictionaries) where the parent array contains null values.  For example, an array with a struct datatype in arrow can have either null child field values or the parent struct value could be null.  Arrowkdb will only map nulls for the child fields using the above mapping. -Conclusions +## Conclusions Mapping arrow nulls to kdb nulls is considerably easier to implement and more intuitive for a kdb user. 
From 4b01cb6b607935cb00178c09be52fe51c3036c47 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 8 Mar 2023 21:59:53 +0300 Subject: [PATCH 126/276] Update docs 1 --- docs/null-mapping.md | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/docs/null-mapping.md b/docs/null-mapping.md index 37ad0c6..7270d7f 100644 --- a/docs/null-mapping.md +++ b/docs/null-mapping.md @@ -21,12 +21,8 @@ Arrowkdb converts an arrow array to a kdb list and vice versa with type mapping - String and binary datatypes are copied into a mixed list of char or byte lists  - Nested datatypes (list, map, struct, union, dictionaries) are represented by a mixed list of sublists, depending on the child datatypes (using recursion to populate the child lists) - -Full details are provided here https://code.kx.com/q/interfaces/arrow/arrow-types/ - -An arrow table is a set of such arrays, which arrowkdb converts to a mixed list of lists, one per column (although in the case of the nested datatypes one column can be represented by a further set of lists).  By decorating with the field names from the schema, this mixed list of array data then becomes a kdb table. -# +Full details are provided here https://code.kx.com/q/interfaces/arrow/arrow-types/ # Mapping arrow nulls to kdb nulls @@ -77,16 +73,15 @@ Note: the examples above refer to the reader functions but similar functionality ## Considerations -The bigger problem is that unlike arrow, not all kdb types have a null value.  Also, those that do just overload one value in the range (typically INT_MIN or FLOAT_MIN).  For example: +The bigger problem is that unlike arrow, not all kdb types have a null value.  Also, those that do just overload one value in the range (typically INT_MIN or FLOAT_MIN).  -- Each item in an arrow boolean array can be 0b, 1b or NULL.  kdb has no boolean null.  -- kdb doesn't have a byte null.  +For example: -- Unlike arrow, kdb can't distinguish between a null string and empty string.  
Similarly it can't distinguish between the " " character and null.  +- Each item in an arrow boolean array can be 0b, 1b or NULL.  kdb has no boolean null.  -- Int16 value of -32768 (a valid non-null value in arrow).  In kdb -32768 == 0Nh (null).  Similarly for the other int, float and temporal types  +- kdb doesn't have a byte null.  -- Etc. +- Unlike arrow, kdb can't distinguish between a null string and empty string.  Similarly it can't distinguish between the " " character and null. Therefore mapping arrow nulls to kdb nulls is going to result in corner cases which can't be represented accurately.  However, the type mapping elsewhere in arrowkdb already has corner cases: From c418b64d2fba2b2297e12d66393b51a158d91fec Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 8 Mar 2023 20:49:25 +0300 Subject: [PATCH 127/276] Map usability improvement --- src/ArrayReader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 7e72a8a..c43cd28 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -831,7 +831,7 @@ void InitKdbNullBitmap( shared_ptr array_data, K* k_bitmap, size_t if( NestedHandlers.find( type_id ) == NestedHandlers.end() ){ kG( *k_bitmap )[index++] = array_data->IsNull( i ); } - else if( arrow::Type::STRUCT == type_id ){ + else if( arrow::Type::STRUCT == type_id || arrow::Type::MAP == type_id ){ auto pos = index; *k_bitmap = jv( k_bitmap, NestedHandlers[type_id]( array_data, index, type_overrides ) ); i += index - pos - 1; From 9d0752baa570894e9ddf99c1d3e0c14428abe660 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 8 Mar 2023 20:50:24 +0300 Subject: [PATCH 128/276] Unit-test for map usability improvement --- examples/batching_tables.q | 7 +++- examples/null_bitmap.q | 8 ++-- tests/null_bitmap/glossary_null_bitmap.t | 52 +++++++++--------------- 3 files changed, 28 insertions(+), 39 deletions(-) diff --git a/examples/batching_tables.q b/examples/batching_tables.q 
index e98a87f..1078c3d 100644 --- a/examples/batching_tables.q +++ b/examples/batching_tables.q @@ -26,7 +26,7 @@ batching_options:(``PARQUET_VERSION)!((::);`V2.0) parquet_batching:"batching_table.parquet"; .arrowkdb.pq.writeParquetFromTable[parquet_batching;batching_table;batching_options] show ls parquet_batching -//rm parquet_batching +rm parquet_batching // Write the batching array data to an arrow file batching_options[`ARROW_CHUNK_ROWS]:214748365 @@ -34,7 +34,7 @@ batching_options[`ARROW_CHUNK_ROWS]:214748365 arrow_batching:"batching_table.arrow"; .arrowkdb.ipc.writeArrowFromTable[arrow_batching;batching_table;batching_options] show ls arrow_batching -//rm arrow_batching; +rm arrow_batching; // Serialize the batching array data to an arrow stream serialized_batching:.arrowkdb.ipc.serializeArrowFromTable[batching_table;batching_options]; @@ -42,3 +42,6 @@ show serialized_batching -1 "\n+----------------------------------------+\n"; + +// Process off +exit 0; diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index 135c2ab..66fae45 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -223,7 +223,7 @@ show bitmap_nulls~bitmap_nulls & sublist[{1-x} count parquet_bitmap_nulls;parque show nested_list_nulls~parquet_list_nulls show nested_struct_nulls~parquet_struct_nulls show nested_dict_nulls[0]~parquet_dict_nulls[0] -show nested_map_nulls~last[parquet_dict_nulls][0] +show nested_map_nulls~last[parquet_dict_nulls] rm parquet_null_bitmap; rm parquet_nested_struct; @@ -287,7 +287,7 @@ show bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bi show nested_list_nulls~arrow_list_nulls show nested_struct_nulls~arrow_struct_nulls show nested_dict_nulls~first[arrow_dict_nulls][0] -show nested_map_nulls~last[arrow_dict_nulls][0] +show nested_map_nulls~last[arrow_dict_nulls] show nested_union_nulls~arrow_union_nulls[0][0] show nested_union_nulls~arrow_union_nulls[1][0] @@ -349,11 +349,11 @@ show bitmap_nulls~bitmap_nulls & 
sublist[{1-x} count stream_bitmap_nulls;stream_ show nested_list_nulls~stream_list_nulls show nested_struct_nulls~stream_struct_nulls show nested_dict_nulls~first[stream_dict_nulls][0] -show nested_map_nulls~last[stream_dict_nulls][0] +show nested_map_nulls~last[stream_dict_nulls] show nested_union_nulls~stream_union_nulls[0][0] show nested_union_nulls~stream_union_nulls[1][0] -1 "\n+----------------------------------------+\n"; // Process off -exit 0; +//exit 0; diff --git a/tests/null_bitmap/glossary_null_bitmap.t b/tests/null_bitmap/glossary_null_bitmap.t index 15396da..8c0eab5 100644 --- a/tests/null_bitmap/glossary_null_bitmap.t +++ b/tests/null_bitmap/glossary_null_bitmap.t @@ -16,66 +16,49 @@ N:5 -1"\n+----------|| Create the datatype identifiers ||----------+\n"; ts_dt:.arrowkdb.dt.timestamp[`nano]; +str_dt:.arrowkdb.dt.utf8[]; i64_dt:.arrowkdb.dt.int64[]; f64_dt:.arrowkdb.dt.float64[]; --1"\n+----------|| Create a map datatype using the i16_dt as the key and dec_dt as its values ||----------+\n"; -map_dt:.arrowkdb.dt.map[i64_dt;f64_dt] - -1"\n+----------|| Create the field identifiers ||----------+\n"; ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; +str_fd:.arrowkdb.fd.field[`string;str_dt]; i64_fd:.arrowkdb.fd.field[`int64;i64_dt]; f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; --1"\n+----------|| Create a field containing the map datatype ||----------+\n"; +-1"\n+----------|| Create a field containing glossary datatypes ||----------+\n"; +dict_dt:.arrowkdb.dt.dictionary[str_dt;i64_dt] +dict_fd:.arrowkdb.fd.field[`dictionary;dict_dt] +map_dt:.arrowkdb.dt.map[i64_dt;f64_dt] map_fd:.arrowkdb.fd.field[`map;map_dt]; -1"\n+----------|| Create the schema containing the large list, dictionary and sparce union fields ||----------+\n"; -glossary_schema:.arrowkdb.sc.schema[(enlist map_fd)]; +glossary_schema:.arrowkdb.sc.schema[(dict_fd, map_fd)]; -1"\n+----------|| Create data for each column in the table ||----------+\n"; ts_data:asc N?0p; 
+str_data:N?("start";"stop";"alert";"acknowledge";""); +str_data[0]:"start" i64_data:N?100i; i64_data[0]:1i; f64_data:N?100f; f64_data[1]:2.34f; +dict_data:(("aa";"bb";"cc");(2 0 1)) map_data:((enlist 1)!(enlist 1f);(2 2)!(2 2.34f);(3 3 3)!(3 3 3f)) -1"\n+----------|| Combine the array data for the glossary columns ||----------+\n"; -glossary_data:(enlist map_data); - --1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n"; -glossary_options[`PARQUET_VERSION]:`V2.0; - -parquet_glossary_bitmap:"glossary_bitmap.parquet"; -.arrowkdb.pq.writeParquet[parquet_glossary_bitmap;glossary_schema;glossary_data;glossary_options]; - --1"\n+----------|| Read the schema back and compare ||----------+\n"; -glossary_options[`WITH_NULL_BITMAP]:1; - -parquet_glossary_schema:.arrowkdb.pq.readParquetSchema[parquet_glossary_bitmap]; -.arrowkdb.sc.equalSchemas[glossary_schema;parquet_glossary_schema] -glossary_schema~parquet_glossary_schema - --1"\n+----------|| Read the array data back and compare ||----------+\n"; -parquet_glossary_data:.arrowkdb.pq.readParquetData[parquet_glossary_bitmap;glossary_options]; -glossary_data~first parquet_glossary_data - --1"\n+----------|| Compare null bitmaps of parquet data ||----------+\n"; -glossary_nulls:((enlist 0b)!(enlist 0b);00b!01b;000b!000b) -parquet_glossary_nulls:parquet_glossary_data[1] -glossary_nulls~parquet_glossary_nulls[0][0] - -rm parquet_glossary_bitmap; +glossary_data:(dict_data;map_data); -1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; arrow_glossary_bitmap:"nested_map.arrow"; .arrowkdb.ipc.writeArrow[arrow_glossary_bitmap;glossary_schema;glossary_data;glossary_options]; -1"\n+----------|| Read the schema back and compare ||----------+\n"; +glossary_options[`WITH_NULL_BITMAP]:1; + arrow_glossary_schema:.arrowkdb.ipc.readArrowSchema[arrow_glossary_bitmap]; .arrowkdb.sc.equalSchemas[glossary_schema;arrow_glossary_schema] glossary_schema~arrow_glossary_schema @@ -85,8 
+68,11 @@ arrow_glossary_data:.arrowkdb.ipc.readArrowData[arrow_glossary_bitmap;glossary_o glossary_data~first arrow_glossary_data -1"\n+----------|| Compare null bitmaps of arrow data ||----------+\n"; +nested_dict_nulls:(000b;000b); +nested_map_nulls:((enlist 0b)!(enlist 0b);00b!01b;000b!000b) arrow_glossary_nulls:arrow_glossary_data[1] -glossary_nulls~arrow_glossary_nulls[0][0] +nested_dict_nulls~first arrow_glossary_nulls +nested_map_nulls~last arrow_glossary_nulls rm arrow_glossary_bitmap; @@ -104,8 +90,8 @@ glossary_data~first stream_glossary_data -1"\n+----------|| Compare null bitmaps of stream data ||----------+\n"; stream_glossary_nulls:stream_glossary_data[1] -glossary_nulls~stream_glossary_nulls[0][0] - +nested_dict_nulls~first arrow_glossary_nulls +nested_map_nulls~last arrow_glossary_nulls -1 "\n+----------|| Test utils ||----------+\n"; From 7a53fcce77ece6cb4189e196190345f72b101a1a Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 9 Mar 2023 10:15:53 +0300 Subject: [PATCH 129/276] Union usability improvement --- src/ArrayReader.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index c43cd28..7c894a5 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -831,14 +831,14 @@ void InitKdbNullBitmap( shared_ptr array_data, K* k_bitmap, size_t if( NestedHandlers.find( type_id ) == NestedHandlers.end() ){ kG( *k_bitmap )[index++] = array_data->IsNull( i ); } - else if( arrow::Type::STRUCT == type_id || arrow::Type::MAP == type_id ){ - auto pos = index; - *k_bitmap = jv( k_bitmap, NestedHandlers[type_id]( array_data, index, type_overrides ) ); - i += index - pos - 1; + else if( arrow::Type::LIST == type_id || arrow::Type::LARGE_LIST == type_id || arrow::Type::FIXED_SIZE_LIST == type_id ){ + auto pos = index; + *k_bitmap = jk( k_bitmap, NestedHandlers[type_id]( array_data, index, type_overrides ) ); + i += index - pos - 1; } else{ auto pos = index; - *k_bitmap = jk( 
k_bitmap, NestedHandlers[type_id]( array_data, index, type_overrides ) ); + *k_bitmap = jv( k_bitmap, NestedHandlers[type_id]( array_data, index, type_overrides ) ); i += index - pos - 1; } } From 9b210218c6fd7a6d521b411effa1f7a4f3d8ee88 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 9 Mar 2023 10:16:24 +0300 Subject: [PATCH 130/276] Unit-test for union usability imporovement --- examples/null_bitmap.q | 12 ++++++------ ...{nested_null_bitmap.t => formation_null_bitmap.t} | 0 tests/null_bitmap/union_null_bitmap.t | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) rename tests/null_bitmap/{nested_null_bitmap.t => formation_null_bitmap.t} (100%) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index 66fae45..f01a394 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -286,10 +286,10 @@ arrow_union_nulls:arrow_union_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bitmap_nulls] show nested_list_nulls~arrow_list_nulls show nested_struct_nulls~arrow_struct_nulls -show nested_dict_nulls~first[arrow_dict_nulls][0] +show nested_dict_nulls~first[arrow_dict_nulls] show nested_map_nulls~last[arrow_dict_nulls] -show nested_union_nulls~arrow_union_nulls[0][0] -show nested_union_nulls~arrow_union_nulls[1][0] +show nested_union_nulls~arrow_union_nulls[0] +show nested_union_nulls~arrow_union_nulls[1] rm arrow_null_bitmap; rm arrow_struct_bitmap; @@ -348,10 +348,10 @@ stream_union_nulls:stream_union_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count stream_bitmap_nulls;stream_bitmap_nulls] show nested_list_nulls~stream_list_nulls show nested_struct_nulls~stream_struct_nulls -show nested_dict_nulls~first[stream_dict_nulls][0] +show nested_dict_nulls~first[stream_dict_nulls] show nested_map_nulls~last[stream_dict_nulls] -show nested_union_nulls~stream_union_nulls[0][0] -show nested_union_nulls~stream_union_nulls[1][0] +show nested_union_nulls~stream_union_nulls[0] +show 
nested_union_nulls~stream_union_nulls[1] -1 "\n+----------------------------------------+\n"; diff --git a/tests/null_bitmap/nested_null_bitmap.t b/tests/null_bitmap/formation_null_bitmap.t similarity index 100% rename from tests/null_bitmap/nested_null_bitmap.t rename to tests/null_bitmap/formation_null_bitmap.t diff --git a/tests/null_bitmap/union_null_bitmap.t b/tests/null_bitmap/union_null_bitmap.t index 2f6580e..2812dcf 100644 --- a/tests/null_bitmap/union_null_bitmap.t +++ b/tests/null_bitmap/union_null_bitmap.t @@ -66,8 +66,8 @@ union_data~first arrow_union_data nested_union_nulls:((0 1 0h);100b;010b); arrow_union_nulls:arrow_union_data[1] -nested_union_nulls~arrow_union_nulls[0][0] -nested_union_nulls~arrow_union_nulls[1][0] +nested_union_nulls~arrow_union_nulls[0] +nested_union_nulls~arrow_union_nulls[1] rm arrow_union_bitmap; @@ -85,8 +85,8 @@ union_data~first stream_union_data -1"\n+----------|| Compare null bitmaps of stream data ||----------+\n"; stream_union_nulls:stream_union_data[1] -nested_union_nulls~stream_union_nulls[0][0] -nested_union_nulls~stream_union_nulls[1][0] +nested_union_nulls~stream_union_nulls[0] +nested_union_nulls~stream_union_nulls[1] -1 "\n+----------|| Test utils ||----------+\n"; From 9840002e92e59bc6fd2bcbf3fbfd1d4efc4be35d Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 9 Mar 2023 12:11:13 +0000 Subject: [PATCH 131/276] Adjusting more realistic feature usage examples --- docs/null-bitmap.md | 2 +- docs/null-mapping.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) mode change 100644 => 100755 docs/null-bitmap.md mode change 100644 => 100755 docs/null-mapping.md diff --git a/docs/null-bitmap.md b/docs/null-bitmap.md old mode 100644 new mode 100755 index 392fa88..f334faf --- a/docs/null-bitmap.md +++ b/docs/null-bitmap.md @@ -55,7 +55,7 @@ The shape of the null bitmap structure would be exactly the same as the data str Each reader function in arrowkdb takes an options dictionary.  
A new `WITH_NULL_BITMAP option would be added.  When this option is set the reader functions then return a two item mixed list (the data values and null bitmap): ```q -q)read_data_structures:.arrowkdb.pq.readParquetToTable["file.parquet";(enlist `WITH_NULL_BITMAP)!(enlist 1)] +q)read_data_structures:.arrowkdb.pq.readParquetToTable["file.parquet";(``WITH_NULL_BITMAP)!((::);1)] q)read_data_structures +`col1`col2`col3!(-239800692 -930424766 1760748068i;-1.16675e+18 4.413091e+18.. +`col1`col2`col3!(011b;110b;010b) diff --git a/docs/null-mapping.md b/docs/null-mapping.md old mode 100644 new mode 100755 index 7270d7f..53745e3 --- a/docs/null-mapping.md +++ b/docs/null-mapping.md @@ -94,7 +94,7 @@ A compromise would be to allow the user to specify how arrowkdb should map nulls For example: ```q -q)options`NULL_MAPPING +q)options[`NULL_MAPPING] int16 | 0Nh int32 | 0Ni int64 | 0N From 1cb360d0f9ba8c0d339decd4522b2ffd378e73b0 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 10 Mar 2023 13:53:21 +0000 Subject: [PATCH 132/276] Fixing conclusions --- docs/null-bitmap.md | 2 +- docs/null-mapping.md | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/docs/null-bitmap.md b/docs/null-bitmap.md index f334faf..dded155 100755 --- a/docs/null-bitmap.md +++ b/docs/null-bitmap.md @@ -73,4 +73,4 @@ Note: Since the null bitmap structure and data structure must have the same shap ## Conclusions -Exposing the null bitmap to the kdb user, while closer to how arrow represents null, makes the API overly complex and it isn't clear whether there is a clear use case where the null bitmap could be well utilised in kdb.  Also, given the additional complexity of exposing the null bitmap, there may be other issues or corner cases which only become evident during development. 
+Exposing the null bitmap to the kdb user is closer to how arrow represents nulls and, although it makes the API more complex, there is a clear use case where the null bitmap could be well utilised in kdb.  Also, enabling the more intuitive null mapping feature in parallel may improve the user experience. diff --git a/docs/null-mapping.md b/docs/null-mapping.md index 53745e3..ec74880 100755 --- a/docs/null-mapping.md +++ b/docs/null-mapping.md @@ -118,6 +118,4 @@ Note: There is no null mapping for arrow arrays which use nested datatypes (list ## Conclusions -Mapping arrow nulls to kdb nulls is considerably easier to implement and more intuitive for a kdb user. - -Therefore, subject to review of this document by users, the better choice is null mapping. +Mapping arrow nulls to kdb nulls is considerably easier to implement and more intuitive for a kdb user. However, the user may achieve better precision by combining this method with exposing the null bitmap. From e9bf95bd0a502c9bf0a12e25eb764fc508af1e7b Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 5 Jan 2023 17:10:43 +0000 Subject: [PATCH 133/276] Array reader decomposition --- src/ArrayReader.cpp | 565 +++++++++++++++++++++++++------------------- 1 file changed, 316 insertions(+), 249 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 0456986..255c664 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -15,29 +16,27 @@ #include "HelperFunctions.h" #include "TypeCheck.h" - -namespace kx { -namespace arrowkdb { +namespace { // An arrow list array is a nested set of child lists. This is represented in // kdb as a mixed list for the parent list array containing a set of sub-lists, // one for each of the list value sets. 
template -void AppendList(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +void AppendList(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) { for (auto i = 0; i < array_data->length(); ++i) { // Slice the parent array to get the list value set at the specified index auto value_slice = std::static_pointer_cast(array_data)->value_slice(i); // Recursively populate the kdb parent mixed list from that slice - kK(k_array)[index++] = ReadArray(value_slice, type_overrides); + kK(k_array)[index++] = kx::arrowkdb::ReadArray(value_slice, type_overrides); } } // An arrow map array is a nested set of key/item paired child arrays. This is // represented in kdb as a mixed list for the parent map array, with a // dictionary for each map value set. -void AppendMap(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +void AppendMap(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) { auto map_array = std::static_pointer_cast(array_data); auto keys = map_array->keys(); @@ -49,7 +48,7 @@ void AppendMap(std::shared_ptr array_data, K k_array, size_t& inde auto items_slice = items->Slice(map_array->value_offset(i), map_array->value_length(i)); // Recursively populate the kdb parent mixed list with a dictionary // populated from those slices - kK(k_array)[index++] = xD(ReadArray(keys_slice, type_overrides), ReadArray(items_slice, type_overrides)); + kK(k_array)[index++] = xD(kx::arrowkdb::ReadArray(keys_slice, type_overrides), kx::arrowkdb::ReadArray(items_slice, type_overrides)); } } @@ -58,7 +57,7 @@ void AppendMap(std::shared_ptr array_data, K k_array, size_t& inde // value is obtaining by slicing across all the child arrays at a given index. // This is represented in kdb as a mixed list for the parent struct array, // containing child lists for each field in the struct. 
-void AppendStruct(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +void AppendStruct(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) { auto struct_array = std::static_pointer_cast(array_data); auto num_fields = struct_array->type()->num_fields(); @@ -75,7 +74,7 @@ void AppendStruct(std::shared_ptr array_data, K k_array, size_t& i // An arrow union array is similar to a struct array except that it has an // additional type id array which identifies the live field in each union value // set. -void AppendUnion(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +void AppendUnion(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) { auto union_array = std::static_pointer_cast(array_data); @@ -98,7 +97,7 @@ void AppendUnion(std::shared_ptr array_data, K k_array, size_t& in // An arrow dictionary array is represented in kdb as a mixed list for the // parent dictionary array containing the values and indicies sub-lists. -void AppendDictionary(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +void AppendDictionary(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) { auto dictionary_array = std::static_pointer_cast(array_data); @@ -106,259 +105,327 @@ void AppendDictionary(std::shared_ptr array_data, K k_array, size_ // two child arrays could be a different length to each other and the parent // dictionary array which makes it difficult to preallocate the kdb lists of // the correct length. 
- K values = ReadArray(dictionary_array->dictionary(), type_overrides); + K values = kx::arrowkdb::ReadArray(dictionary_array->dictionary(), type_overrides); jv(&kK(k_array)[0], values); - K indices = ReadArray(dictionary_array->indices(), type_overrides); + K indices = kx::arrowkdb::ReadArray(dictionary_array->indices(), type_overrides); jv(&kK(k_array)[1], indices); } -void AppendArray(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +void AppendArray_NA(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) { - switch (array_data->type_id()) { - case arrow::Type::NA: - { - auto null_array = std::static_pointer_cast(array_data); - for (auto i = 0; i < null_array->length(); ++i) - kK(k_array)[index++] = knk(0); - break; - } - case arrow::Type::BOOL: - { - auto bool_array = std::static_pointer_cast(array_data); - // BooleanArray doesn't have a bulk reader since arrow BooleanType is only 1 bit - for (auto i = 0; i < bool_array->length(); ++i) - kG(k_array)[index++] = bool_array->Value(i); - break; - } - case arrow::Type::UINT8: - { - auto uint8_array = std::static_pointer_cast(array_data); - memcpy(kG(k_array), uint8_array->raw_values(), uint8_array->length() * sizeof(arrow::UInt8Array::value_type)); - break; - } - case arrow::Type::INT8: - { - auto int8_array = std::static_pointer_cast(array_data); - memcpy(kG(k_array), int8_array->raw_values(), int8_array->length() * sizeof(arrow::Int8Array::value_type)); - break; - } - case arrow::Type::UINT16: - { - auto uint16_array = std::static_pointer_cast(array_data); - memcpy(kH(k_array), uint16_array->raw_values(), uint16_array->length() * sizeof(arrow::UInt16Array::value_type)); - break; - } - case arrow::Type::INT16: - { - auto int16_array = std::static_pointer_cast(array_data); - memcpy(kH(k_array), int16_array->raw_values(), int16_array->length() * sizeof(arrow::Int16Array::value_type)); - break; - } - case arrow::Type::UINT32: 
- { - auto uint32_array = std::static_pointer_cast(array_data); - memcpy(kI(k_array), uint32_array->raw_values(), uint32_array->length() * sizeof(arrow::UInt32Array::value_type)); - break; - } - case arrow::Type::INT32: - { - auto int32_array = std::static_pointer_cast(array_data); - memcpy(kI(k_array), int32_array->raw_values(), int32_array->length() * sizeof(arrow::Int32Array::value_type)); - break; - } - case arrow::Type::UINT64: - { - auto uint64_array = std::static_pointer_cast(array_data); - memcpy(kJ(k_array), uint64_array->raw_values(), uint64_array->length() * sizeof(arrow::UInt64Array::value_type)); - break; - } - case arrow::Type::INT64: - { - auto int64_array = std::static_pointer_cast(array_data); - memcpy(kJ(k_array), int64_array->raw_values(), int64_array->length() * sizeof(arrow::Int64Array::value_type)); - break; - } - case arrow::Type::HALF_FLOAT: - { - auto hfl_array = std::static_pointer_cast(array_data); - memcpy(kH(k_array), hfl_array->raw_values(), hfl_array->length() * sizeof(arrow::HalfFloatArray::value_type)); - break; - } - case arrow::Type::FLOAT: - { - auto fl_array = std::static_pointer_cast(array_data); - memcpy(kE(k_array), fl_array->raw_values(), fl_array->length() * sizeof(arrow::FloatArray::value_type)); - break; - } - case arrow::Type::DOUBLE: - { - auto dbl_array = std::static_pointer_cast(array_data); - memcpy(kF(k_array), dbl_array->raw_values(), dbl_array->length() * sizeof(arrow::DoubleArray::value_type)); - break; - } - case arrow::Type::STRING: - { - auto str_array = std::static_pointer_cast(array_data); - for (auto i = 0; i < str_array->length(); ++i) { - auto str_data = str_array->GetString(i); - K k_str = ktn(KC, str_data.length()); - memcpy(kG(k_str), str_data.data(), str_data.length()); - kK(k_array)[index++] = k_str; - } - break; - } - case arrow::Type::LARGE_STRING: - { - auto str_array = std::static_pointer_cast(array_data); - for (auto i = 0; i < str_array->length(); ++i) { - auto str_data = 
str_array->GetString(i); - K k_str = ktn(KC, str_data.length()); - memcpy(kG(k_str), str_data.data(), str_data.length()); - kK(k_array)[index++] = k_str; - } - break; - } - case arrow::Type::BINARY: - { - auto bin_array = std::static_pointer_cast(array_data); - for (auto i = 0; i < bin_array->length(); ++i) { - auto bin_data = bin_array->GetString(i); - K k_bin = ktn(KG, bin_data.length()); - memcpy(kG(k_bin), bin_data.data(), bin_data.length()); - kK(k_array)[index++] = k_bin; - } - break; - } - case arrow::Type::LARGE_BINARY: - { - auto bin_array = std::static_pointer_cast(array_data); - for (auto i = 0; i < bin_array->length(); ++i) { - auto bin_data = bin_array->GetString(i); - K k_bin = ktn(KG, bin_data.length()); - memcpy(kG(k_bin), bin_data.data(), bin_data.length()); - kK(k_array)[index++] = k_bin; - } - break; - } - case arrow::Type::FIXED_SIZE_BINARY: - { - auto fixed_bin_array = std::static_pointer_cast(array_data); - for (auto i = 0; i < fixed_bin_array->length(); ++i) { - auto bin_data = fixed_bin_array->GetString(i); - K k_bin = ktn(KG, bin_data.length()); - memcpy(kG(k_bin), bin_data.data(), bin_data.length()); - kK(k_array)[index++] = k_bin; - } - break; - } - case arrow::Type::DATE32: - { - TemporalConversion tc(array_data->type()); - auto d32_array = std::static_pointer_cast(array_data); - for (auto i = 0; i < d32_array->length(); ++i) - kI(k_array)[index++] = tc.ArrowToKdb(d32_array->Value(i)); - break; + auto null_array = std::static_pointer_cast(array_data); + for (auto i = 0; i < null_array->length(); ++i) + kK(k_array)[index++] = knk(0); +} + +void AppendArray_BOOL(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto bool_array = std::static_pointer_cast(array_data); + // BooleanArray doesn't have a bulk reader since arrow BooleanType is only 1 bit + for (auto i = 0; i < bool_array->length(); ++i) + kG(k_array)[index++] = bool_array->Value(i); +} + +void 
AppendArray_UINT8(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto uint8_array = std::static_pointer_cast(array_data); + memcpy(kG(k_array), uint8_array->raw_values(), uint8_array->length() * sizeof(arrow::UInt8Array::value_type)); +} + +void AppendArray_INT8(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto int8_array = std::static_pointer_cast(array_data); + memcpy(kG(k_array), int8_array->raw_values(), int8_array->length() * sizeof(arrow::Int8Array::value_type)); +} + +void AppendArray_UINT16(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto uint16_array = std::static_pointer_cast(array_data); + memcpy(kH(k_array), uint16_array->raw_values(), uint16_array->length() * sizeof(arrow::UInt16Array::value_type)); +} + +void AppendArray_INT16(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto int16_array = std::static_pointer_cast(array_data); + memcpy(kH(k_array), int16_array->raw_values(), int16_array->length() * sizeof(arrow::Int16Array::value_type)); +} + +void AppendArray_UINT32(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto uint32_array = std::static_pointer_cast(array_data); + memcpy(kI(k_array), uint32_array->raw_values(), uint32_array->length() * sizeof(arrow::UInt32Array::value_type)); +} + +void AppendArray_INT32(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto int32_array = std::static_pointer_cast(array_data); + memcpy(kI(k_array), int32_array->raw_values(), int32_array->length() * sizeof(arrow::Int32Array::value_type)); +} + +void AppendArray_UINT64(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto 
uint64_array = std::static_pointer_cast(array_data); + memcpy(kJ(k_array), uint64_array->raw_values(), uint64_array->length() * sizeof(arrow::UInt64Array::value_type)); +} + +void AppendArray_INT64(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto int64_array = std::static_pointer_cast(array_data); + memcpy(kJ(k_array), int64_array->raw_values(), int64_array->length() * sizeof(arrow::Int64Array::value_type)); +} + +void AppendArray_HALF_FLOAT(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto hfl_array = std::static_pointer_cast(array_data); + memcpy(kH(k_array), hfl_array->raw_values(), hfl_array->length() * sizeof(arrow::HalfFloatArray::value_type)); +} + +void AppendArray_FLOAT(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto fl_array = std::static_pointer_cast(array_data); + memcpy(kE(k_array), fl_array->raw_values(), fl_array->length() * sizeof(arrow::FloatArray::value_type)); +} + +void AppendArray_DOUBLE(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto dbl_array = std::static_pointer_cast(array_data); + memcpy(kF(k_array), dbl_array->raw_values(), dbl_array->length() * sizeof(arrow::DoubleArray::value_type)); +} + +void AppendArray_STRING(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto str_array = std::static_pointer_cast(array_data); + for (auto i = 0; i < str_array->length(); ++i) { + auto str_data = str_array->GetString(i); + K k_str = ktn(KC, str_data.length()); + memcpy(kG(k_str), str_data.data(), str_data.length()); + kK(k_array)[index++] = k_str; } - case arrow::Type::DATE64: - { - TemporalConversion tc(array_data->type()); - auto d64_array = std::static_pointer_cast(array_data); - for (auto i = 0; i < d64_array->length(); ++i) - 
kJ(k_array)[index++] = tc.ArrowToKdb(d64_array->Value(i)); - break; +} + +void AppendArray_LARGE_STRING(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto str_array = std::static_pointer_cast(array_data); + for (auto i = 0; i < str_array->length(); ++i) { + auto str_data = str_array->GetString(i); + K k_str = ktn(KC, str_data.length()); + memcpy(kG(k_str), str_data.data(), str_data.length()); + kK(k_array)[index++] = k_str; } - case arrow::Type::TIMESTAMP: - { - TemporalConversion tc(array_data->type()); - auto ts_array = std::static_pointer_cast(array_data); - auto timestamp_type = std::static_pointer_cast(ts_array->type()); - for (auto i = 0; i < ts_array->length(); ++i) - kJ(k_array)[index++] = tc.ArrowToKdb(ts_array->Value(i)); - break; +} + +void AppendArray_BINARY(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto bin_array = std::static_pointer_cast(array_data); + for (auto i = 0; i < bin_array->length(); ++i) { + auto bin_data = bin_array->GetString(i); + K k_bin = ktn(KG, bin_data.length()); + memcpy(kG(k_bin), bin_data.data(), bin_data.length()); + kK(k_array)[index++] = k_bin; } - case arrow::Type::TIME32: - { - TemporalConversion tc(array_data->type()); - auto t32_array = std::static_pointer_cast(array_data); - auto time32_type = std::static_pointer_cast(t32_array->type()); - for (auto i = 0; i < t32_array->length(); ++i) - kI(k_array)[index++] = tc.ArrowToKdb(t32_array->Value(i)); - break; +} + +void AppendArray_LARGE_BINARY(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto bin_array = std::static_pointer_cast(array_data); + for (auto i = 0; i < bin_array->length(); ++i) { + auto bin_data = bin_array->GetString(i); + K k_bin = ktn(KG, bin_data.length()); + memcpy(kG(k_bin), bin_data.data(), bin_data.length()); + kK(k_array)[index++] = k_bin; } - case 
arrow::Type::TIME64: - { - TemporalConversion tc(array_data->type()); - auto t64_array = std::static_pointer_cast(array_data); - auto time64_type = std::static_pointer_cast(t64_array->type()); - for (auto i = 0; i < t64_array->length(); ++i) - kJ(k_array)[index++] = tc.ArrowToKdb(t64_array->Value(i)); - break; +} + +void AppendArray_FIXED_SIZE_BINARY(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto fixed_bin_array = std::static_pointer_cast(array_data); + for (auto i = 0; i < fixed_bin_array->length(); ++i) { + auto bin_data = fixed_bin_array->GetString(i); + K k_bin = ktn(KG, bin_data.length()); + memcpy(kG(k_bin), bin_data.data(), bin_data.length()); + kK(k_array)[index++] = k_bin; } - case arrow::Type::DECIMAL: - { - auto dec_array = std::static_pointer_cast(array_data); - auto dec_type = std::static_pointer_cast(dec_array->type()); - for (auto i = 0; i < dec_array->length(); ++i) { - auto decimal = arrow::Decimal128(dec_array->Value(i)); - if (type_overrides.decimal128_as_double) { - // Convert the decimal to a double - auto dec_as_double = decimal.ToDouble(dec_type->scale()); - kF(k_array)[index++] = dec_as_double; - } else { - // Each decimal is a list of 16 bytes - K k_dec = ktn(KG, 16); - decimal.ToBytes(kG(k_dec)); - kK(k_array)[index++] = k_dec; - } +} + +void AppendArray_DATE32(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + kx::arrowkdb::TemporalConversion tc(array_data->type()); + auto d32_array = std::static_pointer_cast(array_data); + for (auto i = 0; i < d32_array->length(); ++i) + kI(k_array)[index++] = tc.ArrowToKdb(d32_array->Value(i)); +} + +void AppendArray_DATE64(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + kx::arrowkdb::TemporalConversion tc(array_data->type()); + auto d64_array = std::static_pointer_cast(array_data); + for (auto i = 0; i < 
d64_array->length(); ++i) + kJ(k_array)[index++] = tc.ArrowToKdb(d64_array->Value(i)); +} + +void AppendArray_TIMESTAMP(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + kx::arrowkdb::TemporalConversion tc(array_data->type()); + auto ts_array = std::static_pointer_cast(array_data); + auto timestamp_type = std::static_pointer_cast(ts_array->type()); + for (auto i = 0; i < ts_array->length(); ++i) + kJ(k_array)[index++] = tc.ArrowToKdb(ts_array->Value(i)); +} + +void AppendArray_TIME32(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + kx::arrowkdb::TemporalConversion tc(array_data->type()); + auto t32_array = std::static_pointer_cast(array_data); + auto time32_type = std::static_pointer_cast(t32_array->type()); + for (auto i = 0; i < t32_array->length(); ++i) + kI(k_array)[index++] = tc.ArrowToKdb(t32_array->Value(i)); +} + +void AppendArray_TIME64(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + kx::arrowkdb::TemporalConversion tc(array_data->type()); + auto t64_array = std::static_pointer_cast(array_data); + auto time64_type = std::static_pointer_cast(t64_array->type()); + for (auto i = 0; i < t64_array->length(); ++i) + kJ(k_array)[index++] = tc.ArrowToKdb(t64_array->Value(i)); +} + +void AppendArray_DECIMAL(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto dec_array = std::static_pointer_cast(array_data); + auto dec_type = std::static_pointer_cast(dec_array->type()); + for (auto i = 0; i < dec_array->length(); ++i) { + auto decimal = arrow::Decimal128(dec_array->Value(i)); + if (type_overrides.decimal128_as_double) { + // Convert the decimal to a double + auto dec_as_double = decimal.ToDouble(dec_type->scale()); + kF(k_array)[index++] = dec_as_double; + } else { + // Each decimal is a list of 16 bytes + K k_dec = ktn(KG, 
16); + decimal.ToBytes(kG(k_dec)); + kK(k_array)[index++] = k_dec; } - break; - } - case arrow::Type::DURATION: - { - TemporalConversion tc(array_data->type()); - auto dur_array = std::static_pointer_cast(array_data); - auto duration_type = std::static_pointer_cast(dur_array->type()); - for (auto i = 0; i < dur_array->length(); ++i) - kJ(k_array)[index++] = tc.ArrowToKdb(dur_array->Value(i)); - break; } - case arrow::Type::INTERVAL_MONTHS: +} + +void AppendArray_DURATION(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + kx::arrowkdb::TemporalConversion tc(array_data->type()); + auto dur_array = std::static_pointer_cast(array_data); + auto duration_type = std::static_pointer_cast(dur_array->type()); + for (auto i = 0; i < dur_array->length(); ++i) + kJ(k_array)[index++] = tc.ArrowToKdb(dur_array->Value(i)); +} + +void AppendArray_INTERVAL_MONTHS(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto month_array = std::static_pointer_cast(array_data); + memcpy(kI(k_array), month_array->raw_values(), month_array->length() * sizeof(arrow::MonthIntervalArray::value_type)); +} + +void AppendArray_INTERVAL_DAY_TIME(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + auto dt_array = std::static_pointer_cast(array_data); + for (auto i = 0; i < dt_array->length(); ++i) + kJ(k_array)[index++] = kx::arrowkdb::DayTimeInterval_KTimespan(dt_array->Value(i)); +} + +void AppendArray_LIST(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + AppendList(array_data, k_array, index, type_overrides); +} + +void AppendArray_LARGE_LIST(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + AppendList(array_data, k_array, index, type_overrides); +} + +void AppendArray_FIXED_SIZE_LIST(std::shared_ptr 
array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + AppendList(array_data, k_array, index, type_overrides); +} + +void AppendArray_MAP(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + AppendMap(array_data, k_array, index, type_overrides); +} + +void AppendArray_STRUCT(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + AppendStruct(array_data, k_array, index, type_overrides); +} + +void AppendArray_SPARSE_UNION(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + AppendUnion(array_data, k_array, index, type_overrides); +} + +void AppendArray_DENSE_UNION(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + AppendArray_SPARSE_UNION(array_data, k_array, index, type_overrides); +} + +void AppendArray_DICTIONARY(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +{ + AppendDictionary(array_data, k_array, index, type_overrides); +} + +using ArrayHandler = void (*) (std::shared_ptr, K, size_t&, kx::arrowkdb::TypeMappingOverride&); + +std::unordered_map ArrayHandlers { + std::make_pair( arrow::Type::NA, &AppendArray_NA ) + , std::make_pair( arrow::Type::BOOL, &AppendArray_BOOL ) + , std::make_pair( arrow::Type::UINT8, &AppendArray_UINT8 ) + , std::make_pair( arrow::Type::INT8, &AppendArray_INT8 ) + , std::make_pair( arrow::Type::UINT16, &AppendArray_UINT16 ) + , std::make_pair( arrow::Type::INT16, &AppendArray_INT16 ) + , std::make_pair( arrow::Type::UINT32, &AppendArray_UINT32 ) + , std::make_pair( arrow::Type::INT32, &AppendArray_INT32 ) + , std::make_pair( arrow::Type::UINT64, &AppendArray_UINT64 ) + , std::make_pair( arrow::Type::INT64, &AppendArray_INT64 ) + , std::make_pair( arrow::Type::HALF_FLOAT, &AppendArray_HALF_FLOAT ) + , std::make_pair( 
arrow::Type::FLOAT, &AppendArray_FLOAT ) + , std::make_pair( arrow::Type::DOUBLE, &AppendArray_DOUBLE ) + , std::make_pair( arrow::Type::STRING, &AppendArray_STRING ) + , std::make_pair( arrow::Type::LARGE_STRING, &AppendArray_LARGE_STRING ) + , std::make_pair( arrow::Type::BINARY, &AppendArray_BINARY ) + , std::make_pair( arrow::Type::LARGE_BINARY, &AppendArray_LARGE_BINARY ) + , std::make_pair( arrow::Type::FIXED_SIZE_BINARY, &AppendArray_FIXED_SIZE_BINARY ) + , std::make_pair( arrow::Type::DATE32, &AppendArray_DATE32 ) + , std::make_pair( arrow::Type::DATE64, &AppendArray_DATE64 ) + , std::make_pair( arrow::Type::TIMESTAMP, &AppendArray_TIMESTAMP ) + , std::make_pair( arrow::Type::TIME32, &AppendArray_TIME32 ) + , std::make_pair( arrow::Type::TIME64, &AppendArray_TIME64 ) + , std::make_pair( arrow::Type::DECIMAL, &AppendArray_DECIMAL ) + , std::make_pair( arrow::Type::DURATION, &AppendArray_DURATION ) + , std::make_pair( arrow::Type::INTERVAL_MONTHS, &AppendArray_INTERVAL_MONTHS ) + , std::make_pair( arrow::Type::INTERVAL_DAY_TIME, &AppendArray_INTERVAL_DAY_TIME ) + , std::make_pair( arrow::Type::LIST, &AppendArray_LIST ) + , std::make_pair( arrow::Type::LARGE_LIST, &AppendArray_LARGE_LIST ) + , std::make_pair( arrow::Type::FIXED_SIZE_LIST, &AppendArray_FIXED_SIZE_LIST ) + , std::make_pair( arrow::Type::MAP, &AppendArray_MAP ) + , std::make_pair( arrow::Type::STRUCT, &AppendArray_STRUCT ) + , std::make_pair( arrow::Type::SPARSE_UNION, &AppendArray_SPARSE_UNION ) + , std::make_pair( arrow::Type::DENSE_UNION, &AppendArray_DENSE_UNION ) + , std::make_pair( arrow::Type::DICTIONARY, &AppendArray_DICTIONARY ) +}; + +} // namespace + +namespace kx { +namespace arrowkdb { + +void AppendArray(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +{ + auto type_id = array_data->type_id(); + if( ArrayHandlers.find( type_id ) == ArrayHandlers.end() ) { - auto month_array = std::static_pointer_cast(array_data); - memcpy(kI(k_array), 
month_array->raw_values(), month_array->length() * sizeof(arrow::MonthIntervalArray::value_type)); - break; + TYPE_CHECK_UNSUPPORTED(array_data->type()->ToString()); } - case arrow::Type::INTERVAL_DAY_TIME: + else { - auto dt_array = std::static_pointer_cast(array_data); - for (auto i = 0; i < dt_array->length(); ++i) - kJ(k_array)[index++] = DayTimeInterval_KTimespan(dt_array->Value(i)); - break; - } - case arrow::Type::LIST: - AppendList(array_data, k_array, index, type_overrides); - break; - case arrow::Type::LARGE_LIST: - AppendList(array_data, k_array, index, type_overrides); - break; - case arrow::Type::FIXED_SIZE_LIST: - AppendList(array_data, k_array, index, type_overrides); - break; - case arrow::Type::MAP: - AppendMap(array_data, k_array, index, type_overrides); - break; - case arrow::Type::STRUCT: - AppendStruct(array_data, k_array, index, type_overrides); - break; - case arrow::Type::SPARSE_UNION: - case arrow::Type::DENSE_UNION: - AppendUnion(array_data, k_array, index, type_overrides); - break; - case arrow::Type::DICTIONARY: - AppendDictionary(array_data, k_array, index, type_overrides); - break; - default: - TYPE_CHECK_UNSUPPORTED(array_data->type()->ToString()); + ArrayHandlers[type_id]( array_data, k_array, index, type_overrides ); } } @@ -450,4 +517,4 @@ K writeReadArray(K datatype_id, K array, K options) return kx::arrowkdb::ReadArray(arrow_array, type_overrides); KDB_EXCEPTION_CATCH; -} \ No newline at end of file +} From 1ab31058bfa5869a36453fae49c5a34a12ae7e4f Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 5 Jan 2023 17:25:22 +0000 Subject: [PATCH 134/276] Using implicit namespaces --- src/ArrayReader.cpp | 275 ++++++++++++++++++++++---------------------- 1 file changed, 139 insertions(+), 136 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 255c664..217af1e 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -16,29 +16,32 @@ #include "HelperFunctions.h" #include "TypeCheck.h" +using namespace 
std; +using namespace kx::arrowkdb; + namespace { // An arrow list array is a nested set of child lists. This is represented in // kdb as a mixed list for the parent list array containing a set of sub-lists, // one for each of the list value sets. template -void AppendList(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendList(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { for (auto i = 0; i < array_data->length(); ++i) { // Slice the parent array to get the list value set at the specified index - auto value_slice = std::static_pointer_cast(array_data)->value_slice(i); + auto value_slice = static_pointer_cast(array_data)->value_slice(i); // Recursively populate the kdb parent mixed list from that slice - kK(k_array)[index++] = kx::arrowkdb::ReadArray(value_slice, type_overrides); + kK(k_array)[index++] = ReadArray(value_slice, type_overrides); } } // An arrow map array is a nested set of key/item paired child arrays. This is // represented in kdb as a mixed list for the parent map array, with a // dictionary for each map value set. 
-void AppendMap(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendMap(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto map_array = std::static_pointer_cast(array_data); + auto map_array = static_pointer_cast(array_data); auto keys = map_array->keys(); auto items = map_array->items(); for (auto i = 0; i < array_data->length(); ++i) { @@ -48,7 +51,7 @@ void AppendMap(std::shared_ptr array_data, K k_array, size_t& inde auto items_slice = items->Slice(map_array->value_offset(i), map_array->value_length(i)); // Recursively populate the kdb parent mixed list with a dictionary // populated from those slices - kK(k_array)[index++] = xD(kx::arrowkdb::ReadArray(keys_slice, type_overrides), kx::arrowkdb::ReadArray(items_slice, type_overrides)); + kK(k_array)[index++] = xD(ReadArray(keys_slice, type_overrides), ReadArray(items_slice, type_overrides)); } } @@ -57,9 +60,9 @@ void AppendMap(std::shared_ptr array_data, K k_array, size_t& inde // value is obtaining by slicing across all the child arrays at a given index. // This is represented in kdb as a mixed list for the parent struct array, // containing child lists for each field in the struct. -void AppendStruct(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendStruct(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto struct_array = std::static_pointer_cast(array_data); + auto struct_array = static_pointer_cast(array_data); auto num_fields = struct_array->type()->num_fields(); for (auto i = 0; i < num_fields; ++i) { auto field_array = struct_array->field(i); @@ -74,9 +77,9 @@ void AppendStruct(std::shared_ptr array_data, K k_array, size_t& i // An arrow union array is similar to a struct array except that it has an // additional type id array which identifies the live field in each union value // set. 
-void AppendUnion(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendUnion(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto union_array = std::static_pointer_cast(array_data); + auto union_array = static_pointer_cast(array_data); // The type_id array is represented as a KH list at the start of the parent mixed list. K type_ids = kK(k_array)[0]; @@ -97,104 +100,104 @@ void AppendUnion(std::shared_ptr array_data, K k_array, size_t& in // An arrow dictionary array is represented in kdb as a mixed list for the // parent dictionary array containing the values and indicies sub-lists. -void AppendDictionary(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendDictionary(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto dictionary_array = std::static_pointer_cast(array_data); + auto dictionary_array = static_pointer_cast(array_data); // Append the dictionary and indicies arrays. Have to use a join since the // two child arrays could be a different length to each other and the parent // dictionary array which makes it difficult to preallocate the kdb lists of // the correct length. 
- K values = kx::arrowkdb::ReadArray(dictionary_array->dictionary(), type_overrides); + K values = ReadArray(dictionary_array->dictionary(), type_overrides); jv(&kK(k_array)[0], values); - K indices = kx::arrowkdb::ReadArray(dictionary_array->indices(), type_overrides); + K indices = ReadArray(dictionary_array->indices(), type_overrides); jv(&kK(k_array)[1], indices); } -void AppendArray_NA(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_NA(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto null_array = std::static_pointer_cast(array_data); + auto null_array = static_pointer_cast(array_data); for (auto i = 0; i < null_array->length(); ++i) kK(k_array)[index++] = knk(0); } -void AppendArray_BOOL(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_BOOL(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto bool_array = std::static_pointer_cast(array_data); + auto bool_array = static_pointer_cast(array_data); // BooleanArray doesn't have a bulk reader since arrow BooleanType is only 1 bit for (auto i = 0; i < bool_array->length(); ++i) kG(k_array)[index++] = bool_array->Value(i); } -void AppendArray_UINT8(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_UINT8(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto uint8_array = std::static_pointer_cast(array_data); + auto uint8_array = static_pointer_cast(array_data); memcpy(kG(k_array), uint8_array->raw_values(), uint8_array->length() * sizeof(arrow::UInt8Array::value_type)); } -void AppendArray_INT8(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_INT8(shared_ptr array_data, K k_array, size_t& index, 
TypeMappingOverride& type_overrides) { - auto int8_array = std::static_pointer_cast(array_data); + auto int8_array = static_pointer_cast(array_data); memcpy(kG(k_array), int8_array->raw_values(), int8_array->length() * sizeof(arrow::Int8Array::value_type)); } -void AppendArray_UINT16(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_UINT16(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto uint16_array = std::static_pointer_cast(array_data); + auto uint16_array = static_pointer_cast(array_data); memcpy(kH(k_array), uint16_array->raw_values(), uint16_array->length() * sizeof(arrow::UInt16Array::value_type)); } -void AppendArray_INT16(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_INT16(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto int16_array = std::static_pointer_cast(array_data); + auto int16_array = static_pointer_cast(array_data); memcpy(kH(k_array), int16_array->raw_values(), int16_array->length() * sizeof(arrow::Int16Array::value_type)); } -void AppendArray_UINT32(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_UINT32(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto uint32_array = std::static_pointer_cast(array_data); + auto uint32_array = static_pointer_cast(array_data); memcpy(kI(k_array), uint32_array->raw_values(), uint32_array->length() * sizeof(arrow::UInt32Array::value_type)); } -void AppendArray_INT32(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_INT32(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto int32_array = std::static_pointer_cast(array_data); + auto int32_array = 
static_pointer_cast(array_data); memcpy(kI(k_array), int32_array->raw_values(), int32_array->length() * sizeof(arrow::Int32Array::value_type)); } -void AppendArray_UINT64(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_UINT64(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto uint64_array = std::static_pointer_cast(array_data); + auto uint64_array = static_pointer_cast(array_data); memcpy(kJ(k_array), uint64_array->raw_values(), uint64_array->length() * sizeof(arrow::UInt64Array::value_type)); } -void AppendArray_INT64(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_INT64(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto int64_array = std::static_pointer_cast(array_data); + auto int64_array = static_pointer_cast(array_data); memcpy(kJ(k_array), int64_array->raw_values(), int64_array->length() * sizeof(arrow::Int64Array::value_type)); } -void AppendArray_HALF_FLOAT(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_HALF_FLOAT(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto hfl_array = std::static_pointer_cast(array_data); + auto hfl_array = static_pointer_cast(array_data); memcpy(kH(k_array), hfl_array->raw_values(), hfl_array->length() * sizeof(arrow::HalfFloatArray::value_type)); } -void AppendArray_FLOAT(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_FLOAT(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto fl_array = std::static_pointer_cast(array_data); + auto fl_array = static_pointer_cast(array_data); memcpy(kE(k_array), fl_array->raw_values(), fl_array->length() * 
sizeof(arrow::FloatArray::value_type)); } -void AppendArray_DOUBLE(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_DOUBLE(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto dbl_array = std::static_pointer_cast(array_data); + auto dbl_array = static_pointer_cast(array_data); memcpy(kF(k_array), dbl_array->raw_values(), dbl_array->length() * sizeof(arrow::DoubleArray::value_type)); } -void AppendArray_STRING(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_STRING(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto str_array = std::static_pointer_cast(array_data); + auto str_array = static_pointer_cast(array_data); for (auto i = 0; i < str_array->length(); ++i) { auto str_data = str_array->GetString(i); K k_str = ktn(KC, str_data.length()); @@ -203,9 +206,9 @@ void AppendArray_STRING(std::shared_ptr array_data, K k_array, siz } } -void AppendArray_LARGE_STRING(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_LARGE_STRING(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto str_array = std::static_pointer_cast(array_data); + auto str_array = static_pointer_cast(array_data); for (auto i = 0; i < str_array->length(); ++i) { auto str_data = str_array->GetString(i); K k_str = ktn(KC, str_data.length()); @@ -214,9 +217,9 @@ void AppendArray_LARGE_STRING(std::shared_ptr array_data, K k_arra } } -void AppendArray_BINARY(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_BINARY(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto bin_array = std::static_pointer_cast(array_data); + auto bin_array = 
static_pointer_cast(array_data); for (auto i = 0; i < bin_array->length(); ++i) { auto bin_data = bin_array->GetString(i); K k_bin = ktn(KG, bin_data.length()); @@ -225,9 +228,9 @@ void AppendArray_BINARY(std::shared_ptr array_data, K k_array, siz } } -void AppendArray_LARGE_BINARY(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_LARGE_BINARY(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto bin_array = std::static_pointer_cast(array_data); + auto bin_array = static_pointer_cast(array_data); for (auto i = 0; i < bin_array->length(); ++i) { auto bin_data = bin_array->GetString(i); K k_bin = ktn(KG, bin_data.length()); @@ -236,9 +239,9 @@ void AppendArray_LARGE_BINARY(std::shared_ptr array_data, K k_arra } } -void AppendArray_FIXED_SIZE_BINARY(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_FIXED_SIZE_BINARY(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto fixed_bin_array = std::static_pointer_cast(array_data); + auto fixed_bin_array = static_pointer_cast(array_data); for (auto i = 0; i < fixed_bin_array->length(); ++i) { auto bin_data = fixed_bin_array->GetString(i); K k_bin = ktn(KG, bin_data.length()); @@ -247,53 +250,53 @@ void AppendArray_FIXED_SIZE_BINARY(std::shared_ptr array_data, K k } } -void AppendArray_DATE32(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_DATE32(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - kx::arrowkdb::TemporalConversion tc(array_data->type()); - auto d32_array = std::static_pointer_cast(array_data); + TemporalConversion tc(array_data->type()); + auto d32_array = static_pointer_cast(array_data); for (auto i = 0; i < d32_array->length(); ++i) kI(k_array)[index++] = 
tc.ArrowToKdb(d32_array->Value(i)); } -void AppendArray_DATE64(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_DATE64(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - kx::arrowkdb::TemporalConversion tc(array_data->type()); - auto d64_array = std::static_pointer_cast(array_data); + TemporalConversion tc(array_data->type()); + auto d64_array = static_pointer_cast(array_data); for (auto i = 0; i < d64_array->length(); ++i) kJ(k_array)[index++] = tc.ArrowToKdb(d64_array->Value(i)); } -void AppendArray_TIMESTAMP(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_TIMESTAMP(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - kx::arrowkdb::TemporalConversion tc(array_data->type()); - auto ts_array = std::static_pointer_cast(array_data); - auto timestamp_type = std::static_pointer_cast(ts_array->type()); + TemporalConversion tc(array_data->type()); + auto ts_array = static_pointer_cast(array_data); + auto timestamp_type = static_pointer_cast(ts_array->type()); for (auto i = 0; i < ts_array->length(); ++i) kJ(k_array)[index++] = tc.ArrowToKdb(ts_array->Value(i)); } -void AppendArray_TIME32(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_TIME32(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - kx::arrowkdb::TemporalConversion tc(array_data->type()); - auto t32_array = std::static_pointer_cast(array_data); - auto time32_type = std::static_pointer_cast(t32_array->type()); + TemporalConversion tc(array_data->type()); + auto t32_array = static_pointer_cast(array_data); + auto time32_type = static_pointer_cast(t32_array->type()); for (auto i = 0; i < t32_array->length(); ++i) kI(k_array)[index++] = tc.ArrowToKdb(t32_array->Value(i)); } -void 
AppendArray_TIME64(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_TIME64(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - kx::arrowkdb::TemporalConversion tc(array_data->type()); - auto t64_array = std::static_pointer_cast(array_data); - auto time64_type = std::static_pointer_cast(t64_array->type()); + TemporalConversion tc(array_data->type()); + auto t64_array = static_pointer_cast(array_data); + auto time64_type = static_pointer_cast(t64_array->type()); for (auto i = 0; i < t64_array->length(); ++i) kJ(k_array)[index++] = tc.ArrowToKdb(t64_array->Value(i)); } -void AppendArray_DECIMAL(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_DECIMAL(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto dec_array = std::static_pointer_cast(array_data); - auto dec_type = std::static_pointer_cast(dec_array->type()); + auto dec_array = static_pointer_cast(array_data); + auto dec_type = static_pointer_cast(dec_array->type()); for (auto i = 0; i < dec_array->length(); ++i) { auto decimal = arrow::Decimal128(dec_array->Value(i)); if (type_overrides.decimal128_as_double) { @@ -309,106 +312,106 @@ void AppendArray_DECIMAL(std::shared_ptr array_data, K k_array, si } } -void AppendArray_DURATION(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_DURATION(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - kx::arrowkdb::TemporalConversion tc(array_data->type()); - auto dur_array = std::static_pointer_cast(array_data); - auto duration_type = std::static_pointer_cast(dur_array->type()); + TemporalConversion tc(array_data->type()); + auto dur_array = static_pointer_cast(array_data); + auto duration_type = static_pointer_cast(dur_array->type()); for 
(auto i = 0; i < dur_array->length(); ++i) kJ(k_array)[index++] = tc.ArrowToKdb(dur_array->Value(i)); } -void AppendArray_INTERVAL_MONTHS(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_INTERVAL_MONTHS(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto month_array = std::static_pointer_cast(array_data); + auto month_array = static_pointer_cast(array_data); memcpy(kI(k_array), month_array->raw_values(), month_array->length() * sizeof(arrow::MonthIntervalArray::value_type)); } -void AppendArray_INTERVAL_DAY_TIME(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_INTERVAL_DAY_TIME(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto dt_array = std::static_pointer_cast(array_data); + auto dt_array = static_pointer_cast(array_data); for (auto i = 0; i < dt_array->length(); ++i) - kJ(k_array)[index++] = kx::arrowkdb::DayTimeInterval_KTimespan(dt_array->Value(i)); + kJ(k_array)[index++] = DayTimeInterval_KTimespan(dt_array->Value(i)); } -void AppendArray_LIST(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_LIST(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { AppendList(array_data, k_array, index, type_overrides); } -void AppendArray_LARGE_LIST(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_LARGE_LIST(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { AppendList(array_data, k_array, index, type_overrides); } -void AppendArray_FIXED_SIZE_LIST(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_FIXED_SIZE_LIST(shared_ptr array_data, K k_array, size_t& 
index, TypeMappingOverride& type_overrides) { AppendList(array_data, k_array, index, type_overrides); } -void AppendArray_MAP(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_MAP(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { AppendMap(array_data, k_array, index, type_overrides); } -void AppendArray_STRUCT(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_STRUCT(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { AppendStruct(array_data, k_array, index, type_overrides); } -void AppendArray_SPARSE_UNION(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_SPARSE_UNION(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { AppendUnion(array_data, k_array, index, type_overrides); } -void AppendArray_DENSE_UNION(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_DENSE_UNION(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { AppendArray_SPARSE_UNION(array_data, k_array, index, type_overrides); } -void AppendArray_DICTIONARY(std::shared_ptr array_data, K k_array, size_t& index, kx::arrowkdb::TypeMappingOverride& type_overrides) +void AppendArray_DICTIONARY(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { AppendDictionary(array_data, k_array, index, type_overrides); } -using ArrayHandler = void (*) (std::shared_ptr, K, size_t&, kx::arrowkdb::TypeMappingOverride&); - -std::unordered_map ArrayHandlers { - std::make_pair( arrow::Type::NA, &AppendArray_NA ) - , std::make_pair( arrow::Type::BOOL, &AppendArray_BOOL ) - , std::make_pair( arrow::Type::UINT8, &AppendArray_UINT8 ) - , std::make_pair( 
arrow::Type::INT8, &AppendArray_INT8 ) - , std::make_pair( arrow::Type::UINT16, &AppendArray_UINT16 ) - , std::make_pair( arrow::Type::INT16, &AppendArray_INT16 ) - , std::make_pair( arrow::Type::UINT32, &AppendArray_UINT32 ) - , std::make_pair( arrow::Type::INT32, &AppendArray_INT32 ) - , std::make_pair( arrow::Type::UINT64, &AppendArray_UINT64 ) - , std::make_pair( arrow::Type::INT64, &AppendArray_INT64 ) - , std::make_pair( arrow::Type::HALF_FLOAT, &AppendArray_HALF_FLOAT ) - , std::make_pair( arrow::Type::FLOAT, &AppendArray_FLOAT ) - , std::make_pair( arrow::Type::DOUBLE, &AppendArray_DOUBLE ) - , std::make_pair( arrow::Type::STRING, &AppendArray_STRING ) - , std::make_pair( arrow::Type::LARGE_STRING, &AppendArray_LARGE_STRING ) - , std::make_pair( arrow::Type::BINARY, &AppendArray_BINARY ) - , std::make_pair( arrow::Type::LARGE_BINARY, &AppendArray_LARGE_BINARY ) - , std::make_pair( arrow::Type::FIXED_SIZE_BINARY, &AppendArray_FIXED_SIZE_BINARY ) - , std::make_pair( arrow::Type::DATE32, &AppendArray_DATE32 ) - , std::make_pair( arrow::Type::DATE64, &AppendArray_DATE64 ) - , std::make_pair( arrow::Type::TIMESTAMP, &AppendArray_TIMESTAMP ) - , std::make_pair( arrow::Type::TIME32, &AppendArray_TIME32 ) - , std::make_pair( arrow::Type::TIME64, &AppendArray_TIME64 ) - , std::make_pair( arrow::Type::DECIMAL, &AppendArray_DECIMAL ) - , std::make_pair( arrow::Type::DURATION, &AppendArray_DURATION ) - , std::make_pair( arrow::Type::INTERVAL_MONTHS, &AppendArray_INTERVAL_MONTHS ) - , std::make_pair( arrow::Type::INTERVAL_DAY_TIME, &AppendArray_INTERVAL_DAY_TIME ) - , std::make_pair( arrow::Type::LIST, &AppendArray_LIST ) - , std::make_pair( arrow::Type::LARGE_LIST, &AppendArray_LARGE_LIST ) - , std::make_pair( arrow::Type::FIXED_SIZE_LIST, &AppendArray_FIXED_SIZE_LIST ) - , std::make_pair( arrow::Type::MAP, &AppendArray_MAP ) - , std::make_pair( arrow::Type::STRUCT, &AppendArray_STRUCT ) - , std::make_pair( arrow::Type::SPARSE_UNION, &AppendArray_SPARSE_UNION ) - , 
std::make_pair( arrow::Type::DENSE_UNION, &AppendArray_DENSE_UNION ) - , std::make_pair( arrow::Type::DICTIONARY, &AppendArray_DICTIONARY ) +using ArrayHandler = void (*) (shared_ptr, K, size_t&, TypeMappingOverride&); + +unordered_map ArrayHandlers { + make_pair( arrow::Type::NA, &AppendArray_NA ) + , make_pair( arrow::Type::BOOL, &AppendArray_BOOL ) + , make_pair( arrow::Type::UINT8, &AppendArray_UINT8 ) + , make_pair( arrow::Type::INT8, &AppendArray_INT8 ) + , make_pair( arrow::Type::UINT16, &AppendArray_UINT16 ) + , make_pair( arrow::Type::INT16, &AppendArray_INT16 ) + , make_pair( arrow::Type::UINT32, &AppendArray_UINT32 ) + , make_pair( arrow::Type::INT32, &AppendArray_INT32 ) + , make_pair( arrow::Type::UINT64, &AppendArray_UINT64 ) + , make_pair( arrow::Type::INT64, &AppendArray_INT64 ) + , make_pair( arrow::Type::HALF_FLOAT, &AppendArray_HALF_FLOAT ) + , make_pair( arrow::Type::FLOAT, &AppendArray_FLOAT ) + , make_pair( arrow::Type::DOUBLE, &AppendArray_DOUBLE ) + , make_pair( arrow::Type::STRING, &AppendArray_STRING ) + , make_pair( arrow::Type::LARGE_STRING, &AppendArray_LARGE_STRING ) + , make_pair( arrow::Type::BINARY, &AppendArray_BINARY ) + , make_pair( arrow::Type::LARGE_BINARY, &AppendArray_LARGE_BINARY ) + , make_pair( arrow::Type::FIXED_SIZE_BINARY, &AppendArray_FIXED_SIZE_BINARY ) + , make_pair( arrow::Type::DATE32, &AppendArray_DATE32 ) + , make_pair( arrow::Type::DATE64, &AppendArray_DATE64 ) + , make_pair( arrow::Type::TIMESTAMP, &AppendArray_TIMESTAMP ) + , make_pair( arrow::Type::TIME32, &AppendArray_TIME32 ) + , make_pair( arrow::Type::TIME64, &AppendArray_TIME64 ) + , make_pair( arrow::Type::DECIMAL, &AppendArray_DECIMAL ) + , make_pair( arrow::Type::DURATION, &AppendArray_DURATION ) + , make_pair( arrow::Type::INTERVAL_MONTHS, &AppendArray_INTERVAL_MONTHS ) + , make_pair( arrow::Type::INTERVAL_DAY_TIME, &AppendArray_INTERVAL_DAY_TIME ) + , make_pair( arrow::Type::LIST, &AppendArray_LIST ) + , make_pair( arrow::Type::LARGE_LIST, 
&AppendArray_LARGE_LIST ) + , make_pair( arrow::Type::FIXED_SIZE_LIST, &AppendArray_FIXED_SIZE_LIST ) + , make_pair( arrow::Type::MAP, &AppendArray_MAP ) + , make_pair( arrow::Type::STRUCT, &AppendArray_STRUCT ) + , make_pair( arrow::Type::SPARSE_UNION, &AppendArray_SPARSE_UNION ) + , make_pair( arrow::Type::DENSE_UNION, &AppendArray_DENSE_UNION ) + , make_pair( arrow::Type::DICTIONARY, &AppendArray_DICTIONARY ) }; } // namespace @@ -416,7 +419,7 @@ std::unordered_map ArrayHandlers { namespace kx { namespace arrowkdb { -void AppendArray(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto type_id = array_data->type_id(); if( ArrayHandlers.find( type_id ) == ArrayHandlers.end() ) @@ -429,7 +432,7 @@ void AppendArray(std::shared_ptr array_data, K k_array, size_t& in } } -K InitKdbForArray(std::shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides) +K InitKdbForArray(shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides) { switch (datatype->id()) { case arrow::Type::STRUCT: @@ -459,7 +462,7 @@ K InitKdbForArray(std::shared_ptr datatype, size_t length, Type case arrow::Type::DICTIONARY: { // Arrow dictionary becomes a two item mixed list - auto dictionary_type = std::static_pointer_cast(datatype); + auto dictionary_type = static_pointer_cast(datatype); K result = ktn(0, 2); // Do not preallocate the child lists since AppendDictionary has to join to the @@ -474,7 +477,7 @@ K InitKdbForArray(std::shared_ptr datatype, size_t length, Type } } -K ReadArray(std::shared_ptr array, TypeMappingOverride& type_overrides) +K ReadArray(shared_ptr array, TypeMappingOverride& type_overrides) { K k_array = InitKdbForArray(array->type(), array->length(), type_overrides); size_t index = 0; @@ -482,7 +485,7 @@ K ReadArray(std::shared_ptr array, TypeMappingOverride& type_overr return k_array; } -K 
ReadChunkedArray(std::shared_ptr chunked_array, TypeMappingOverride& type_overrides) +K ReadChunkedArray(shared_ptr chunked_array, TypeMappingOverride& type_overrides) { K k_array = InitKdbForArray(chunked_array->type(), chunked_array->length(), type_overrides); size_t index = 0; @@ -502,19 +505,19 @@ K writeReadArray(K datatype_id, K array, K options) if (datatype_id->t != -KI) return krr((S)"datatype_id not -6h"); - auto datatype = kx::arrowkdb::GetDatatypeStore()->Find(datatype_id->i); + auto datatype = GetDatatypeStore()->Find(datatype_id->i); if (!datatype) return krr((S)"datatype not found"); // Parse the options - auto read_options = kx::arrowkdb::KdbOptions(options, kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); + auto read_options = KdbOptions(options, Options::string_options, Options::int_options); // Type mapping overrides - kx::arrowkdb::TypeMappingOverride type_overrides{ read_options }; + TypeMappingOverride type_overrides{ read_options }; - auto arrow_array = kx::arrowkdb::MakeArray(datatype, array, type_overrides); + auto arrow_array = MakeArray(datatype, array, type_overrides); - return kx::arrowkdb::ReadArray(arrow_array, type_overrides); + return ReadArray(arrow_array, type_overrides); KDB_EXCEPTION_CATCH; } From 6c756bafa05e0d074cbd0262e4392768c522ab1d Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 6 Jan 2023 13:11:27 +0000 Subject: [PATCH 135/276] Generalizing arrow reader handlers --- src/ArrayReader.cpp | 186 +++++++++++++++++++++++++++----------------- 1 file changed, 115 insertions(+), 71 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 217af1e..ced60fd 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -114,14 +114,19 @@ void AppendDictionary(shared_ptr array_data, K k_array, size_t& in jv(&kK(k_array)[1], indices); } -void AppendArray_NA(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template +void AppendArray(shared_ptr 
array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides); + +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto null_array = static_pointer_cast(array_data); for (auto i = 0; i < null_array->length(); ++i) kK(k_array)[index++] = knk(0); } -void AppendArray_BOOL(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto bool_array = static_pointer_cast(array_data); // BooleanArray doesn't have a bulk reader since arrow BooleanType is only 1 bit @@ -129,73 +134,85 @@ void AppendArray_BOOL(shared_ptr array_data, K k_array, size_t& in kG(k_array)[index++] = bool_array->Value(i); } -void AppendArray_UINT8(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint8_array = static_pointer_cast(array_data); memcpy(kG(k_array), uint8_array->raw_values(), uint8_array->length() * sizeof(arrow::UInt8Array::value_type)); } -void AppendArray_INT8(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto int8_array = static_pointer_cast(array_data); memcpy(kG(k_array), int8_array->raw_values(), int8_array->length() * sizeof(arrow::Int8Array::value_type)); } -void AppendArray_UINT16(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint16_array = static_pointer_cast(array_data); memcpy(kH(k_array), uint16_array->raw_values(), uint16_array->length() * sizeof(arrow::UInt16Array::value_type)); 
} -void AppendArray_INT16(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto int16_array = static_pointer_cast(array_data); memcpy(kH(k_array), int16_array->raw_values(), int16_array->length() * sizeof(arrow::Int16Array::value_type)); } -void AppendArray_UINT32(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint32_array = static_pointer_cast(array_data); memcpy(kI(k_array), uint32_array->raw_values(), uint32_array->length() * sizeof(arrow::UInt32Array::value_type)); } -void AppendArray_INT32(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto int32_array = static_pointer_cast(array_data); memcpy(kI(k_array), int32_array->raw_values(), int32_array->length() * sizeof(arrow::Int32Array::value_type)); } -void AppendArray_UINT64(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint64_array = static_pointer_cast(array_data); memcpy(kJ(k_array), uint64_array->raw_values(), uint64_array->length() * sizeof(arrow::UInt64Array::value_type)); } -void AppendArray_INT64(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto int64_array = static_pointer_cast(array_data); memcpy(kJ(k_array), int64_array->raw_values(), int64_array->length() * sizeof(arrow::Int64Array::value_type)); } -void 
AppendArray_HALF_FLOAT(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto hfl_array = static_pointer_cast(array_data); memcpy(kH(k_array), hfl_array->raw_values(), hfl_array->length() * sizeof(arrow::HalfFloatArray::value_type)); } -void AppendArray_FLOAT(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto fl_array = static_pointer_cast(array_data); memcpy(kE(k_array), fl_array->raw_values(), fl_array->length() * sizeof(arrow::FloatArray::value_type)); } -void AppendArray_DOUBLE(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto dbl_array = static_pointer_cast(array_data); memcpy(kF(k_array), dbl_array->raw_values(), dbl_array->length() * sizeof(arrow::DoubleArray::value_type)); } -void AppendArray_STRING(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto str_array = static_pointer_cast(array_data); for (auto i = 0; i < str_array->length(); ++i) { @@ -206,7 +223,8 @@ void AppendArray_STRING(shared_ptr array_data, K k_array, size_t& } } -void AppendArray_LARGE_STRING(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto str_array = static_pointer_cast(array_data); for (auto i = 0; i < str_array->length(); ++i) { @@ -217,7 +235,8 @@ void AppendArray_LARGE_STRING(shared_ptr array_data, K k_array, 
si } } -void AppendArray_BINARY(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto bin_array = static_pointer_cast(array_data); for (auto i = 0; i < bin_array->length(); ++i) { @@ -228,7 +247,8 @@ void AppendArray_BINARY(shared_ptr array_data, K k_array, size_t& } } -void AppendArray_LARGE_BINARY(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto bin_array = static_pointer_cast(array_data); for (auto i = 0; i < bin_array->length(); ++i) { @@ -239,7 +259,8 @@ void AppendArray_LARGE_BINARY(shared_ptr array_data, K k_array, si } } -void AppendArray_FIXED_SIZE_BINARY(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto fixed_bin_array = static_pointer_cast(array_data); for (auto i = 0; i < fixed_bin_array->length(); ++i) { @@ -250,7 +271,8 @@ void AppendArray_FIXED_SIZE_BINARY(shared_ptr array_data, K k_arra } } -void AppendArray_DATE32(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { TemporalConversion tc(array_data->type()); auto d32_array = static_pointer_cast(array_data); @@ -258,7 +280,8 @@ void AppendArray_DATE32(shared_ptr array_data, K k_array, size_t& kI(k_array)[index++] = tc.ArrowToKdb(d32_array->Value(i)); } -void AppendArray_DATE64(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { 
TemporalConversion tc(array_data->type()); auto d64_array = static_pointer_cast(array_data); @@ -266,7 +289,8 @@ void AppendArray_DATE64(shared_ptr array_data, K k_array, size_t& kJ(k_array)[index++] = tc.ArrowToKdb(d64_array->Value(i)); } -void AppendArray_TIMESTAMP(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { TemporalConversion tc(array_data->type()); auto ts_array = static_pointer_cast(array_data); @@ -275,7 +299,8 @@ void AppendArray_TIMESTAMP(shared_ptr array_data, K k_array, size_ kJ(k_array)[index++] = tc.ArrowToKdb(ts_array->Value(i)); } -void AppendArray_TIME32(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { TemporalConversion tc(array_data->type()); auto t32_array = static_pointer_cast(array_data); @@ -284,7 +309,8 @@ void AppendArray_TIME32(shared_ptr array_data, K k_array, size_t& kI(k_array)[index++] = tc.ArrowToKdb(t32_array->Value(i)); } -void AppendArray_TIME64(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { TemporalConversion tc(array_data->type()); auto t64_array = static_pointer_cast(array_data); @@ -293,7 +319,8 @@ void AppendArray_TIME64(shared_ptr array_data, K k_array, size_t& kJ(k_array)[index++] = tc.ArrowToKdb(t64_array->Value(i)); } -void AppendArray_DECIMAL(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto dec_array = static_pointer_cast(array_data); auto dec_type = static_pointer_cast(dec_array->type()); @@ -312,7 
+339,8 @@ void AppendArray_DECIMAL(shared_ptr array_data, K k_array, size_t& } } -void AppendArray_DURATION(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { TemporalConversion tc(array_data->type()); auto dur_array = static_pointer_cast(array_data); @@ -321,97 +349,113 @@ void AppendArray_DURATION(shared_ptr array_data, K k_array, size_t kJ(k_array)[index++] = tc.ArrowToKdb(dur_array->Value(i)); } -void AppendArray_INTERVAL_MONTHS(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto month_array = static_pointer_cast(array_data); memcpy(kI(k_array), month_array->raw_values(), month_array->length() * sizeof(arrow::MonthIntervalArray::value_type)); } -void AppendArray_INTERVAL_DAY_TIME(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto dt_array = static_pointer_cast(array_data); for (auto i = 0; i < dt_array->length(); ++i) kJ(k_array)[index++] = DayTimeInterval_KTimespan(dt_array->Value(i)); } -void AppendArray_LIST(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { AppendList(array_data, k_array, index, type_overrides); } -void AppendArray_LARGE_LIST(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { AppendList(array_data, k_array, index, type_overrides); } -void AppendArray_FIXED_SIZE_LIST(shared_ptr 
array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { AppendList(array_data, k_array, index, type_overrides); } -void AppendArray_MAP(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { AppendMap(array_data, k_array, index, type_overrides); } -void AppendArray_STRUCT(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { AppendStruct(array_data, k_array, index, type_overrides); } -void AppendArray_SPARSE_UNION(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { AppendUnion(array_data, k_array, index, type_overrides); } -void AppendArray_DENSE_UNION(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - AppendArray_SPARSE_UNION(array_data, k_array, index, type_overrides); + AppendArray(array_data, k_array, index, type_overrides); } -void AppendArray_DICTIONARY(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) +template<> +void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { AppendDictionary(array_data, k_array, index, type_overrides); } using ArrayHandler = void (*) (shared_ptr, K, size_t&, TypeMappingOverride&); +template +auto make_array_handler() +{ + return make_pair( TypeId, &AppendArray ); +} + unordered_map ArrayHandlers { - make_pair( 
arrow::Type::NA, &AppendArray_NA ) - , make_pair( arrow::Type::BOOL, &AppendArray_BOOL ) - , make_pair( arrow::Type::UINT8, &AppendArray_UINT8 ) - , make_pair( arrow::Type::INT8, &AppendArray_INT8 ) - , make_pair( arrow::Type::UINT16, &AppendArray_UINT16 ) - , make_pair( arrow::Type::INT16, &AppendArray_INT16 ) - , make_pair( arrow::Type::UINT32, &AppendArray_UINT32 ) - , make_pair( arrow::Type::INT32, &AppendArray_INT32 ) - , make_pair( arrow::Type::UINT64, &AppendArray_UINT64 ) - , make_pair( arrow::Type::INT64, &AppendArray_INT64 ) - , make_pair( arrow::Type::HALF_FLOAT, &AppendArray_HALF_FLOAT ) - , make_pair( arrow::Type::FLOAT, &AppendArray_FLOAT ) - , make_pair( arrow::Type::DOUBLE, &AppendArray_DOUBLE ) - , make_pair( arrow::Type::STRING, &AppendArray_STRING ) - , make_pair( arrow::Type::LARGE_STRING, &AppendArray_LARGE_STRING ) - , make_pair( arrow::Type::BINARY, &AppendArray_BINARY ) - , make_pair( arrow::Type::LARGE_BINARY, &AppendArray_LARGE_BINARY ) - , make_pair( arrow::Type::FIXED_SIZE_BINARY, &AppendArray_FIXED_SIZE_BINARY ) - , make_pair( arrow::Type::DATE32, &AppendArray_DATE32 ) - , make_pair( arrow::Type::DATE64, &AppendArray_DATE64 ) - , make_pair( arrow::Type::TIMESTAMP, &AppendArray_TIMESTAMP ) - , make_pair( arrow::Type::TIME32, &AppendArray_TIME32 ) - , make_pair( arrow::Type::TIME64, &AppendArray_TIME64 ) - , make_pair( arrow::Type::DECIMAL, &AppendArray_DECIMAL ) - , make_pair( arrow::Type::DURATION, &AppendArray_DURATION ) - , make_pair( arrow::Type::INTERVAL_MONTHS, &AppendArray_INTERVAL_MONTHS ) - , make_pair( arrow::Type::INTERVAL_DAY_TIME, &AppendArray_INTERVAL_DAY_TIME ) - , make_pair( arrow::Type::LIST, &AppendArray_LIST ) - , make_pair( arrow::Type::LARGE_LIST, &AppendArray_LARGE_LIST ) - , make_pair( arrow::Type::FIXED_SIZE_LIST, &AppendArray_FIXED_SIZE_LIST ) - , make_pair( arrow::Type::MAP, &AppendArray_MAP ) - , make_pair( arrow::Type::STRUCT, &AppendArray_STRUCT ) - , make_pair( arrow::Type::SPARSE_UNION, 
&AppendArray_SPARSE_UNION ) - , make_pair( arrow::Type::DENSE_UNION, &AppendArray_DENSE_UNION ) - , make_pair( arrow::Type::DICTIONARY, &AppendArray_DICTIONARY ) + make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() + , make_array_handler() }; } // namespace From e4a89a30a68a75a50658abb74f831d4820ca2864 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 6 Jan 2023 13:13:47 +0000 Subject: [PATCH 136/276] Ignoring service files --- .gitignore | 2 ++ tests/.gitignore | 1 + 2 files changed, 3 insertions(+) create mode 100644 .gitignore create mode 100644 tests/.gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..18f4d15 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +arrowkdb.code-workspace +build/ diff --git a/tests/.gitignore b/tests/.gitignore new file mode 100644 index 0000000..492b6a4 --- /dev/null +++ b/tests/.gitignore @@ -0,0 +1 @@ +test.q From fbb50807aeac9b18170b24cb23c5bbf2a0095358 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 6 Jan 2023 14:35:29 +0000 Subject: [PATCH 137/276] Array builder decomposition --- src/ArrayWriter.cpp | 424 ++++++++++++++++++++++++++++++++------------ 1 file changed, 309 insertions(+), 115 
deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index 602c764..77ca309 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -13,9 +14,306 @@ #include "HelperFunctions.h" #include "TypeCheck.h" +using namespace std; +using namespace kx::arrowkdb; -namespace kx { -namespace arrowkdb { +namespace +{ + +std::shared_ptr GetBuilder(std::shared_ptr datatype); + +template +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool); + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> 
+shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(datatype, pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(datatype, pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(datatype, pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(datatype, pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(datatype, pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(datatype, pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return std::make_shared(pool); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + // The parent list datatype details 
the child datatype so construct the child + // builder and use it to initialise the parent list builder + auto list_type = std::static_pointer_cast(datatype); + auto value_builder = GetBuilder(list_type->value_type()); + + // Construct the correct listbuilder + if (datatype->id() == arrow::Type::LIST) + return std::make_shared(pool, value_builder); + else if (datatype->id() == arrow::Type::LARGE_LIST) + return std::make_shared(pool, value_builder); + else + return std::make_shared(pool, value_builder, datatype); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return GetBuilder( datatype, pool ); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return GetBuilder( datatype, pool ); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + // The parent map datatype details the key/item child datatypes so construct + // builders for both and use these to initialise the parent map builder + auto map_type = std::static_pointer_cast(datatype); + auto key_builder = GetBuilder(map_type->key_type()); + auto item_builder = GetBuilder(map_type->item_type()); + return std::make_shared(pool, key_builder, item_builder); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + auto struct_type = std::static_pointer_cast(datatype); + + // Iterate through all the fields in the struct constructing and adding each + // field's builder into a vector + auto fields = struct_type->fields(); + std::vector> field_builders; + for (auto field : fields) + field_builders.push_back(GetBuilder(field->type())); + + // Construct the parent struct builder from this vector of all the child + // builders + return std::make_shared(datatype, pool, field_builders); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + auto union_type = std::static_pointer_cast(datatype); + + // Iterate through all the fields in the 
union constructing and adding each + // field's builder into a vector + auto fields = union_type->fields(); + std::vector> field_builders; + for (auto field : fields) + field_builders.push_back(GetBuilder(field->type())); + + // Construct the parent union builder from this vector of all the child + // builders + if (datatype->id() == arrow::Type::SPARSE_UNION) + return std::make_shared(pool, field_builders, datatype); + else + return std::make_shared(pool, field_builders, datatype); +} + +template<> +shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) +{ + return GetBuilder( datatype, pool ); +} + +using BuilderHandler = shared_ptr ( * ) ( shared_ptr, arrow::MemoryPool* ); + +template +auto make_builder_handler() +{ + return make_pair( TypeId, &GetBuilder ); +} + +unordered_map BuilderHandlers { + make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() + , make_builder_handler() +}; // Constructs and returns the correct arrow array builder for the specified // datatype. @@ -23,126 +321,22 @@ namespace arrowkdb { // This handles all datatypes except Dictionary which is handled separately. 
std::shared_ptr GetBuilder(std::shared_ptr datatype) { + auto type_id = datatype->id(); arrow::MemoryPool* pool = arrow::default_memory_pool(); - switch (datatype->id()) { - case arrow::Type::NA: - return std::make_shared(pool); - case arrow::Type::BOOL: - return std::make_shared(pool); - case arrow::Type::UINT8: - return std::make_shared(pool); - case arrow::Type::INT8: - return std::make_shared(pool); - case arrow::Type::UINT16: - return std::make_shared(pool); - case arrow::Type::INT16: - return std::make_shared(pool); - case arrow::Type::UINT32: - return std::make_shared(pool); - case arrow::Type::INT32: - return std::make_shared(pool); - case arrow::Type::UINT64: - return std::make_shared(pool); - case arrow::Type::INT64: - return std::make_shared(pool); - case arrow::Type::HALF_FLOAT: - return std::make_shared(pool); - case arrow::Type::FLOAT: - return std::make_shared(pool); - case arrow::Type::DOUBLE: - return std::make_shared(pool); - case arrow::Type::STRING: - return std::make_shared(pool); - case arrow::Type::LARGE_STRING: - return std::make_shared(pool); - case arrow::Type::BINARY: - return std::make_shared(pool); - case arrow::Type::LARGE_BINARY: - return std::make_shared(pool); - case arrow::Type::FIXED_SIZE_BINARY: - return std::make_shared(datatype, pool); - case arrow::Type::DATE32: - return std::make_shared(pool); - case arrow::Type::DATE64: - return std::make_shared(pool); - case arrow::Type::TIMESTAMP: - return std::make_shared(datatype, pool); - case arrow::Type::TIME32: - return std::make_shared(datatype, pool); - case arrow::Type::TIME64: - return std::make_shared(datatype, pool); - case arrow::Type::DECIMAL: - return std::make_shared(datatype, pool); - case arrow::Type::DURATION: - return std::make_shared(datatype, pool); - case arrow::Type::INTERVAL_MONTHS: - return std::make_shared(pool); - case arrow::Type::INTERVAL_DAY_TIME: - return std::make_shared(pool); - case arrow::Type::LIST: - case arrow::Type::LARGE_LIST: - case 
arrow::Type::FIXED_SIZE_LIST: + if( BuilderHandlers.find( type_id ) == BuilderHandlers.end() ) { - // The parent list datatype details the child datatype so construct the child - // builder and use it to initialise the parent list builder - auto list_type = std::static_pointer_cast(datatype); - auto value_builder = GetBuilder(list_type->value_type()); - - // Construct the correct listbuilder - if (datatype->id() == arrow::Type::LIST) - return std::make_shared(pool, value_builder); - else if (datatype->id() == arrow::Type::LARGE_LIST) - return std::make_shared(pool, value_builder); - else - return std::make_shared(pool, value_builder, datatype); + TYPE_CHECK_UNSUPPORTED(datatype->ToString()); } - case arrow::Type::MAP: + else { - // The parent map datatype details the key/item child datatypes so construct - // builders for both and use these to initialise the parent map builder - auto map_type = std::static_pointer_cast(datatype); - auto key_builder = GetBuilder(map_type->key_type()); - auto item_builder = GetBuilder(map_type->item_type()); - return std::make_shared(pool, key_builder, item_builder); + return BuilderHandlers[type_id]( datatype, pool ); } - case arrow::Type::STRUCT: - { - auto struct_type = std::static_pointer_cast(datatype); +} - // Iterate through all the fields in the struct constructing and adding each - // field's builder into a vector - auto fields = struct_type->fields(); - std::vector> field_builders; - for (auto field : fields) - field_builders.push_back(GetBuilder(field->type())); +} // namespace - // Construct the parent struct builder from this vector of all the child - // builders - return std::make_shared(datatype, pool, field_builders); - } - case arrow::Type::SPARSE_UNION: - case arrow::Type::DENSE_UNION: - { - auto union_type = std::static_pointer_cast(datatype); - - // Iterate through all the fields in the union constructing and adding each - // field's builder into a vector - auto fields = union_type->fields(); - std::vector> 
field_builders; - for (auto field : fields) - field_builders.push_back(GetBuilder(field->type())); - - // Construct the parent union builder from this vector of all the child - // builders - if (datatype->id() == arrow::Type::SPARSE_UNION) - return std::make_shared(pool, field_builders, datatype); - else - return std::make_shared(pool, field_builders, datatype); - } - default: - TYPE_CHECK_UNSUPPORTED(datatype->ToString()); - } -} +namespace kx { +namespace arrowkdb { // Populate a list/large_list/fixed_size_list builder // From e218a48843c4ef95cac6c96f38f7694da77c2837 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 6 Jan 2023 15:35:42 +0000 Subject: [PATCH 138/276] Populate builder decomposition --- src/ArrayWriter.cpp | 845 +++++++++++++++++++++++++------------------- 1 file changed, 474 insertions(+), 371 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index 77ca309..ad05e33 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -20,7 +20,7 @@ using namespace kx::arrowkdb; namespace { -std::shared_ptr GetBuilder(std::shared_ptr datatype); +shared_ptr GetBuilder(shared_ptr datatype); template shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool); @@ -28,163 +28,163 @@ shared_ptr GetBuilder(shared_ptr datatype, template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return 
make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(datatype, pool); + return make_shared(datatype, pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - 
return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(datatype, pool); + return make_shared(datatype, pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(datatype, pool); + return make_shared(datatype, pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(datatype, pool); + return make_shared(datatype, pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(datatype, pool); + return make_shared(datatype, pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(datatype, pool); + return make_shared(datatype, pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - return std::make_shared(pool); + return make_shared(pool); } template<> @@ -192,16 +192,16 @@ shared_ptr GetBuilder(shared_ptr(datatype); + auto list_type = static_pointer_cast(datatype); auto value_builder = GetBuilder(list_type->value_type()); // Construct the correct listbuilder if (datatype->id() == arrow::Type::LIST) - return std::make_shared(pool, value_builder); + return make_shared(pool, value_builder); else if (datatype->id() == arrow::Type::LARGE_LIST) - return std::make_shared(pool, value_builder); + return make_shared(pool, value_builder); else - return std::make_shared(pool, value_builder, datatype); + return make_shared(pool, value_builder, datatype); } template<> @@ -221,47 +221,47 @@ shared_ptr 
GetBuilder(shared_ptr(datatype); + auto map_type = static_pointer_cast(datatype); auto key_builder = GetBuilder(map_type->key_type()); auto item_builder = GetBuilder(map_type->item_type()); - return std::make_shared(pool, key_builder, item_builder); + return make_shared(pool, key_builder, item_builder); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - auto struct_type = std::static_pointer_cast(datatype); + auto struct_type = static_pointer_cast(datatype); // Iterate through all the fields in the struct constructing and adding each // field's builder into a vector auto fields = struct_type->fields(); - std::vector> field_builders; + vector> field_builders; for (auto field : fields) field_builders.push_back(GetBuilder(field->type())); // Construct the parent struct builder from this vector of all the child // builders - return std::make_shared(datatype, pool, field_builders); + return make_shared(datatype, pool, field_builders); } template<> shared_ptr GetBuilder(shared_ptr datatype, arrow::MemoryPool* pool) { - auto union_type = std::static_pointer_cast(datatype); + auto union_type = static_pointer_cast(datatype); // Iterate through all the fields in the union constructing and adding each // field's builder into a vector auto fields = union_type->fields(); - std::vector> field_builders; + vector> field_builders; for (auto field : fields) field_builders.push_back(GetBuilder(field->type())); // Construct the parent union builder from this vector of all the child // builders if (datatype->id() == arrow::Type::SPARSE_UNION) - return std::make_shared(pool, field_builders, datatype); + return make_shared(pool, field_builders, datatype); else - return std::make_shared(pool, field_builders, datatype); + return make_shared(pool, field_builders, datatype); } template<> @@ -319,7 +319,7 @@ unordered_map BuilderHandlers { // datatype. // // This handles all datatypes except Dictionary which is handled separately. 
-std::shared_ptr GetBuilder(std::shared_ptr datatype) +shared_ptr GetBuilder(shared_ptr datatype) { auto type_id = datatype->id(); arrow::MemoryPool* pool = arrow::default_memory_pool(); @@ -335,8 +335,8 @@ std::shared_ptr GetBuilder(std::shared_ptr } // namespace -namespace kx { -namespace arrowkdb { +namespace +{ // Populate a list/large_list/fixed_size_list builder // @@ -344,7 +344,7 @@ namespace arrowkdb { // kdb as a mixed list for the parent list array containing a set of sub-lists, // one for each of the list value sets. template -void PopulateListBuilder(std::shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +void PopulateListBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { // Get the value builder from the parent list builder auto list_builder = static_cast(builder); @@ -361,7 +361,7 @@ void PopulateListBuilder(std::shared_ptr datatype, K k_array, a if (datatype->id() == arrow::Type::FIXED_SIZE_LIST) { // Check each sub-list is the same length as the fixed size K list_data = kK(k_array)[i]; - auto fixed_list_type = std::static_pointer_cast(datatype); + auto fixed_list_type = static_pointer_cast(datatype); TYPE_CHECK_LENGTH(fixed_list_type->list_size() != list_data->n, datatype->ToString(), fixed_list_type->list_size(), list_data->n); } @@ -376,12 +376,12 @@ void PopulateListBuilder(std::shared_ptr datatype, K k_array, a // additional type id array which identifies the live field in each union value // set. 
template -void PopulateUnionBuilder(std::shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +void PopulateUnionBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { // Check that the mixed list length is at least one greater (the additional // first sub-list contains the union type_ids) than the number of union // fields - auto union_type = std::static_pointer_cast(datatype); + auto union_type = static_pointer_cast(datatype); const auto min_length = union_type->num_fields() + 1; TYPE_CHECK_LENGTH(min_length > k_array->n, datatype->ToString(), min_length, k_array->n); @@ -393,7 +393,7 @@ void PopulateUnionBuilder(std::shared_ptr datatype, K k_array, // Get all the child builders from the parent union builder auto union_builder = static_cast(builder); - std::vector> child_builders; + vector> child_builders; for (auto i = 0; i < union_builder->num_children(); ++i) child_builders.push_back(union_builder->child_builder(i)); @@ -420,364 +420,467 @@ void PopulateUnionBuilder(std::shared_ptr datatype, K k_array, throw TypeCheck("Mismatched union list lengths"); } -// Populates data values from a kdb list into the specified array builder. 
-void PopulateBuilder(std::shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +template +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides); + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { - // Special cases for: - // symbol - string or large_string - // guid - fixed_size_binary(16) - // char - uint8 - bool is_symbol = k_array->t == KS && (datatype->id() == arrow::Type::STRING || datatype->id() == arrow::Type::LARGE_STRING); - bool is_guid = k_array->t == UU && datatype->id() == arrow::Type::FIXED_SIZE_BINARY && static_cast(builder)->byte_width() == sizeof(U); - bool is_char = k_array->t == KC && (datatype->id() == arrow::Type::UINT8 || datatype->id() == arrow::Type::INT8); + auto null_builder = static_cast(builder); + PARQUET_THROW_NOT_OK(null_builder->AppendNulls(k_array->n)); +} - // Type check the kdb structure - if (!is_symbol && !is_guid && !is_char) - TYPE_CHECK_ARRAY(kx::arrowkdb::GetKdbType(datatype, type_overrides) != k_array->t, datatype->ToString(), kx::arrowkdb::GetKdbType(datatype, type_overrides), k_array->t); +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto bool_builder = static_cast(builder); + PARQUET_THROW_NOT_OK(bool_builder->AppendValues((uint8_t*)kG(k_array), k_array->n)); +} - switch (datatype->id()) { - case arrow::Type::NA: - { - auto null_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(null_builder->AppendNulls(k_array->n)); - break; - } - case arrow::Type::BOOL: - { - auto bool_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(bool_builder->AppendValues((uint8_t*)kG(k_array), k_array->n)); - break; - } - case arrow::Type::UINT8: - { - auto uint8_builder = static_cast(builder); - 
PARQUET_THROW_NOT_OK(uint8_builder->AppendValues((uint8_t*)kG(k_array), k_array->n)); - break; - } - case arrow::Type::INT8: - { - auto int8_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(int8_builder->AppendValues((int8_t*)kG(k_array), k_array->n)); - break; - } - case arrow::Type::UINT16: - { - auto uint16_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(uint16_builder->AppendValues((uint16_t*)kH(k_array), k_array->n)); - arrow::Status s; - break; - } - case arrow::Type::INT16: - { - auto int16_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(int16_builder->AppendValues((int16_t*)kH(k_array), k_array->n)); - break; - } - case arrow::Type::UINT32: - { - auto uint32_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(uint32_builder->AppendValues((uint32_t*)kI(k_array), k_array->n)); - break; - } - case arrow::Type::INT32: - { - auto int32_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(int32_builder->AppendValues((int32_t*)kI(k_array), k_array->n)); - break; - } - case arrow::Type::UINT64: - { - auto uint64_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(uint64_builder->AppendValues((uint64_t*)kJ(k_array), k_array->n)); - break; - } - case arrow::Type::INT64: - { - auto int64_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(int64_builder->AppendValues((int64_t*)kJ(k_array), k_array->n)); - break; - } - case arrow::Type::HALF_FLOAT: - { - arrow::HalfFloatType hft; - auto hfl_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(hfl_builder->AppendValues((uint16_t*)kH(k_array), k_array->n)); - break; - } - case arrow::Type::FLOAT: - { - auto fl_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(fl_builder->AppendValues(kE(k_array), k_array->n)); - break; - } - case arrow::Type::DOUBLE: - { - auto dbl_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(dbl_builder->AppendValues(kF(k_array), k_array->n)); - break; - } - case arrow::Type::STRING: - { - auto str_builder = static_cast(builder); - if (is_symbol) { - // 
Populate from symbol list - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(str_builder->Append(kS(k_array)[i])); - } else { - // Populate from mixed list of char lists - for (auto i = 0; i < k_array->n; ++i) { - K str_data = kK(k_array)[i]; - TYPE_CHECK_ITEM(str_data->t != KC, datatype->ToString(), KC, str_data->t); - PARQUET_THROW_NOT_OK(str_builder->Append(kG(str_data), str_data->n)); - } - } - break; - } - case arrow::Type::LARGE_STRING: - { - auto str_builder = static_cast(builder); - if (is_symbol) { - // Populate from symbol list - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(str_builder->Append(kS(k_array)[i])); - } else { - // Populate from mixed list of char lists - for (auto i = 0; i < k_array->n; ++i) { - K str_data = kK(k_array)[i]; - TYPE_CHECK_ITEM(str_data->t != KC, datatype->ToString(), KC, str_data->t); - PARQUET_THROW_NOT_OK(str_builder->Append(kG(str_data), str_data->n)); - } - } - break; - } - case arrow::Type::BINARY: - { - auto bin_builder = static_cast(builder); +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto uint8_builder = static_cast(builder); + PARQUET_THROW_NOT_OK(uint8_builder->AppendValues((uint8_t*)kG(k_array), k_array->n)); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto int8_builder = static_cast(builder); + PARQUET_THROW_NOT_OK(int8_builder->AppendValues((int8_t*)kG(k_array), k_array->n)); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto uint16_builder = static_cast(builder); + PARQUET_THROW_NOT_OK(uint16_builder->AppendValues((uint16_t*)kH(k_array), k_array->n)); + arrow::Status s; +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& 
type_overrides) +{ + auto int16_builder = static_cast(builder); + PARQUET_THROW_NOT_OK(int16_builder->AppendValues((int16_t*)kH(k_array), k_array->n)); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto uint32_builder = static_cast(builder); + PARQUET_THROW_NOT_OK(uint32_builder->AppendValues((uint32_t*)kI(k_array), k_array->n)); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto int32_builder = static_cast(builder); + PARQUET_THROW_NOT_OK(int32_builder->AppendValues((int32_t*)kI(k_array), k_array->n)); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto uint64_builder = static_cast(builder); + PARQUET_THROW_NOT_OK(uint64_builder->AppendValues((uint64_t*)kJ(k_array), k_array->n)); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto int64_builder = static_cast(builder); + PARQUET_THROW_NOT_OK(int64_builder->AppendValues((int64_t*)kJ(k_array), k_array->n)); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + arrow::HalfFloatType hft; + auto hfl_builder = static_cast(builder); + PARQUET_THROW_NOT_OK(hfl_builder->AppendValues((uint16_t*)kH(k_array), k_array->n)); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto fl_builder = static_cast(builder); + PARQUET_THROW_NOT_OK(fl_builder->AppendValues(kE(k_array), k_array->n)); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto dbl_builder = static_cast(builder); + 
PARQUET_THROW_NOT_OK(dbl_builder->AppendValues(kF(k_array), k_array->n)); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + bool is_symbol = k_array->t == KS && (datatype->id() == arrow::Type::STRING || datatype->id() == arrow::Type::LARGE_STRING); + auto str_builder = static_cast(builder); + if (is_symbol) { + // Populate from symbol list + for (auto i = 0; i < k_array->n; ++i) + PARQUET_THROW_NOT_OK(str_builder->Append(kS(k_array)[i])); + } else { + // Populate from mixed list of char lists for (auto i = 0; i < k_array->n; ++i) { - K bin_data = kK(k_array)[i]; - TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); - PARQUET_THROW_NOT_OK(bin_builder->Append(kG(bin_data), bin_data->n)); + K str_data = kK(k_array)[i]; + TYPE_CHECK_ITEM(str_data->t != KC, datatype->ToString(), KC, str_data->t); + PARQUET_THROW_NOT_OK(str_builder->Append(kG(str_data), str_data->n)); } - break; } - case arrow::Type::LARGE_BINARY: - { - auto bin_builder = static_cast(builder); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + bool is_symbol = k_array->t == KS && (datatype->id() == arrow::Type::STRING || datatype->id() == arrow::Type::LARGE_STRING); + auto str_builder = static_cast(builder); + if (is_symbol) { + // Populate from symbol list + for (auto i = 0; i < k_array->n; ++i) + PARQUET_THROW_NOT_OK(str_builder->Append(kS(k_array)[i])); + } else { + // Populate from mixed list of char lists for (auto i = 0; i < k_array->n; ++i) { - K bin_data = kK(k_array)[i]; - TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); - PARQUET_THROW_NOT_OK(bin_builder->Append(kG(bin_data), bin_data->n)); + K str_data = kK(k_array)[i]; + TYPE_CHECK_ITEM(str_data->t != KC, datatype->ToString(), KC, str_data->t); + PARQUET_THROW_NOT_OK(str_builder->Append(kG(str_data), 
str_data->n)); } - break; } - case arrow::Type::FIXED_SIZE_BINARY: - { - auto fixed_bin_builder = static_cast(builder); - if (is_guid) { - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(fixed_bin_builder->Append((char*)&kU(k_array)[i])); - } else { - for (auto i = 0; i < k_array->n; ++i) { - K bin_data = kK(k_array)[i]; - TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); - TYPE_CHECK_LENGTH(fixed_bin_builder->byte_width() != bin_data->n, builder->type()->ToString(), fixed_bin_builder->byte_width(), bin_data->n); - PARQUET_THROW_NOT_OK(fixed_bin_builder->Append(kG(bin_data))); - } - } - break; - } - case arrow::Type::DATE32: - { - TemporalConversion tc(datatype); - auto d32_builder = static_cast(builder); - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(d32_builder->Append(tc.KdbToArrow(kI(k_array)[i]))); - break; - } - case arrow::Type::DATE64: - { - TemporalConversion tc(datatype); - auto d64_builder = static_cast(builder); - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(d64_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); - break; - } - case arrow::Type::TIMESTAMP: - { - TemporalConversion tc(datatype); - auto ts_builder = static_cast(builder); - auto timestamp_type = std::static_pointer_cast(datatype); - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(ts_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); - break; +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto bin_builder = static_cast(builder); + for (auto i = 0; i < k_array->n; ++i) { + K bin_data = kK(k_array)[i]; + TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); + PARQUET_THROW_NOT_OK(bin_builder->Append(kG(bin_data), bin_data->n)); } - case arrow::Type::TIME32: - { - TemporalConversion tc(datatype); - auto t32_builder = static_cast(builder); - auto time32_type = std::static_pointer_cast(datatype); - for 
(auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(t32_builder->Append(tc.KdbToArrow(kI(k_array)[i]))); - break; +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto bin_builder = static_cast(builder); + for (auto i = 0; i < k_array->n; ++i) { + K bin_data = kK(k_array)[i]; + TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); + PARQUET_THROW_NOT_OK(bin_builder->Append(kG(bin_data), bin_data->n)); } - case arrow::Type::TIME64: - { - TemporalConversion tc(datatype); - auto t64_builder = static_cast(builder); - auto time64_type = std::static_pointer_cast(datatype); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + bool is_guid = k_array->t == UU && datatype->id() == arrow::Type::FIXED_SIZE_BINARY && static_cast(builder)->byte_width() == sizeof(U); + auto fixed_bin_builder = static_cast(builder); + if (is_guid) { for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(t64_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); - break; - } - case arrow::Type::DECIMAL: - { - auto dec_builder = static_cast(builder); - auto dec_type = std::static_pointer_cast(datatype); + PARQUET_THROW_NOT_OK(fixed_bin_builder->Append((char*)&kU(k_array)[i])); + } else { for (auto i = 0; i < k_array->n; ++i) { - if (type_overrides.decimal128_as_double) { - // Construct the decimal from a double - arrow::Decimal128 dec128; - PARQUET_ASSIGN_OR_THROW(dec128, arrow::Decimal128::FromReal(kF(k_array)[i], dec_type->precision(), dec_type->scale())); - PARQUET_THROW_NOT_OK(dec_builder->Append(dec128)); - } else { - // Each decimal is a list of 16 bytes - K k_dec = kK(k_array)[i]; - TYPE_CHECK_LENGTH(k_dec->n != 16, datatype->ToString(), 16, k_dec->n); - TYPE_CHECK_ITEM(k_dec->t != KG, datatype->ToString(), KG, k_dec->t); - - arrow::Decimal128 dec128((const uint8_t*)kG(k_dec)); - 
PARQUET_THROW_NOT_OK(dec_builder->Append(dec128)); - } + K bin_data = kK(k_array)[i]; + TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); + TYPE_CHECK_LENGTH(fixed_bin_builder->byte_width() != bin_data->n, builder->type()->ToString(), fixed_bin_builder->byte_width(), bin_data->n); + PARQUET_THROW_NOT_OK(fixed_bin_builder->Append(kG(bin_data))); } - break; } - case arrow::Type::DURATION: - { - TemporalConversion tc(datatype); - auto dur_builder = static_cast(builder); - auto duration_type = std::static_pointer_cast(datatype); - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(dur_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); - break; - } - case arrow::Type::INTERVAL_MONTHS: - { - auto month_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(month_builder->AppendValues((int32_t*)kI(k_array), k_array->n)); - break; - } - case arrow::Type::INTERVAL_DAY_TIME: - { - auto dt_builder = static_cast(builder); - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(dt_builder->Append(KTimespan_DayTimeInterval(kJ(k_array)[i]))); - break; - } - case arrow::Type::LIST: - PopulateListBuilder(datatype, k_array, builder, type_overrides); - break; - case arrow::Type::LARGE_LIST: - PopulateListBuilder(datatype, k_array, builder, type_overrides); - break; - case arrow::Type::FIXED_SIZE_LIST: - PopulateListBuilder(datatype, k_array, builder, type_overrides); - break; - case arrow::Type::MAP: - { - // An arrow map array is a nested set of key/item paired child arrays. This - // is represented in kdb as a mixed list for the parent map array, with a - // dictionary for each map value set. 
- // - // Get the key and item builders from the parent map builder - auto map_builder = static_cast(builder); - auto key_builder = map_builder->key_builder(); - auto item_builder = map_builder->item_builder(); +} - for (auto i = 0; i < k_array->n; ++i) { - // Ignore any mixed list items set to :: - if (kK(k_array)[i]->t == 101) - continue; - - // Delimit the start/end of each child map set - map_builder->Append(); - - // Populate the child builders for this map set from the dictionary key/value lists - auto k_dict = kK(k_array)[i]; - TYPE_CHECK_ITEM(99 != k_dict->t, datatype->ToString(), 99, k_dict->t); - PopulateBuilder(key_builder->type(), kK(k_dict)[0], key_builder, type_overrides); - PopulateBuilder(item_builder->type(), kK(k_dict)[1], item_builder, type_overrides); +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + TemporalConversion tc(datatype); + auto d32_builder = static_cast(builder); + for (auto i = 0; i < k_array->n; ++i) + PARQUET_THROW_NOT_OK(d32_builder->Append(tc.KdbToArrow(kI(k_array)[i]))); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + TemporalConversion tc(datatype); + auto d64_builder = static_cast(builder); + for (auto i = 0; i < k_array->n; ++i) + PARQUET_THROW_NOT_OK(d64_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + TemporalConversion tc(datatype); + auto ts_builder = static_cast(builder); + auto timestamp_type = static_pointer_cast(datatype); + for (auto i = 0; i < k_array->n; ++i) + PARQUET_THROW_NOT_OK(ts_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + TemporalConversion 
tc(datatype); + auto t32_builder = static_cast(builder); + auto time32_type = static_pointer_cast(datatype); + for (auto i = 0; i < k_array->n; ++i) + PARQUET_THROW_NOT_OK(t32_builder->Append(tc.KdbToArrow(kI(k_array)[i]))); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + TemporalConversion tc(datatype); + auto t64_builder = static_cast(builder); + auto time64_type = static_pointer_cast(datatype); + for (auto i = 0; i < k_array->n; ++i) + PARQUET_THROW_NOT_OK(t64_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto dec_builder = static_cast(builder); + auto dec_type = static_pointer_cast(datatype); + for (auto i = 0; i < k_array->n; ++i) { + if (type_overrides.decimal128_as_double) { + // Construct the decimal from a double + arrow::Decimal128 dec128; + PARQUET_ASSIGN_OR_THROW(dec128, arrow::Decimal128::FromReal(kF(k_array)[i], dec_type->precision(), dec_type->scale())); + PARQUET_THROW_NOT_OK(dec_builder->Append(dec128)); + } else { + // Each decimal is a list of 16 bytes + K k_dec = kK(k_array)[i]; + TYPE_CHECK_LENGTH(k_dec->n != 16, datatype->ToString(), 16, k_dec->n); + TYPE_CHECK_ITEM(k_dec->t != KG, datatype->ToString(), KG, k_dec->t); + + arrow::Decimal128 dec128((const uint8_t*)kG(k_dec)); + PARQUET_THROW_NOT_OK(dec_builder->Append(dec128)); } - break; } +} - case arrow::Type::STRUCT: - { - // An arrow struct array is a logical grouping of child arrays with each - // child array corresponding to one of the fields in the struct. A single - // struct value is obtaining by slicing across all the child arrays at a - // given index. This is represented in kdb as a mixed list for the parent - // struct array, containing child lists for each field in the struct. 
- // - // Check that the mixed list length is at least equal to the number of struct fields - auto struct_type = std::static_pointer_cast(datatype); - TYPE_CHECK_LENGTH(struct_type->num_fields() > k_array->n, datatype->ToString(), struct_type->num_fields(), k_array->n); - - // Get all the field builders from the parent struct builder - auto struct_builder = static_cast(builder); - std::vector field_builders; - for (auto i = 0; i < struct_builder->num_fields(); ++i) - field_builders.push_back(struct_builder->field_builder(i)); - - // Delimit each struct value in the parent builder - for (auto index = 0; index < kK(k_array)[0]->n; ++index) - struct_builder->Append(); - - // Populate each of the field builders from its kdb list. Only count up to - // the number of struct fields. Additional trailing data in the kdb mixed - // list is ignored (to allow for ::) - for (auto i = 0; i < struct_type->num_fields(); ++i) - PopulateBuilder(field_builders[i]->type(), kK(k_array)[i], field_builders[i], type_overrides); - - // Check that all the populated field builders have the same length. 
- for (auto it : field_builders) - if (it->length() != struct_builder->length()) - throw TypeCheck("Mismatched struct list lengths"); - - break; +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + TemporalConversion tc(datatype); + auto dur_builder = static_cast(builder); + auto duration_type = static_pointer_cast(datatype); + for (auto i = 0; i < k_array->n; ++i) + PARQUET_THROW_NOT_OK(dur_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto month_builder = static_cast(builder); + PARQUET_THROW_NOT_OK(month_builder->AppendValues((int32_t*)kI(k_array), k_array->n)); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + auto dt_builder = static_cast(builder); + for (auto i = 0; i < k_array->n; ++i) + PARQUET_THROW_NOT_OK(dt_builder->Append(KTimespan_DayTimeInterval(kJ(k_array)[i]))); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + PopulateListBuilder(datatype, k_array, builder, type_overrides); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + PopulateListBuilder(datatype, k_array, builder, type_overrides); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + PopulateListBuilder(datatype, k_array, builder, type_overrides); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + // An arrow map array is a nested set of key/item paired child arrays. 
This + // is represented in kdb as a mixed list for the parent map array, with a + // dictionary for each map value set. + // + // Get the key and item builders from the parent map builder + auto map_builder = static_cast(builder); + auto key_builder = map_builder->key_builder(); + auto item_builder = map_builder->item_builder(); + + for (auto i = 0; i < k_array->n; ++i) { + // Ignore any mixed list items set to :: + if (kK(k_array)[i]->t == 101) + continue; + + // Delimit the start/end of each child map set + map_builder->Append(); + + // Populate the child builders for this map set from the dictionary key/value lists + auto k_dict = kK(k_array)[i]; + TYPE_CHECK_ITEM(99 != k_dict->t, datatype->ToString(), 99, k_dict->t); + PopulateBuilder(key_builder->type(), kK(k_dict)[0], key_builder, type_overrides); + PopulateBuilder(item_builder->type(), kK(k_dict)[1], item_builder, type_overrides); } - case arrow::Type::SPARSE_UNION: - PopulateUnionBuilder(datatype, k_array, builder, type_overrides); - break; - case arrow::Type::DENSE_UNION: - PopulateUnionBuilder(datatype, k_array, builder, type_overrides); - break; - default: +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + // An arrow struct array is a logical grouping of child arrays with each + // child array corresponding to one of the fields in the struct. A single + // struct value is obtaining by slicing across all the child arrays at a + // given index. This is represented in kdb as a mixed list for the parent + // struct array, containing child lists for each field in the struct. 
+ // + // Check that the mixed list length is at least equal to the number of struct fields + auto struct_type = static_pointer_cast(datatype); + TYPE_CHECK_LENGTH(struct_type->num_fields() > k_array->n, datatype->ToString(), struct_type->num_fields(), k_array->n); + + // Get all the field builders from the parent struct builder + auto struct_builder = static_cast(builder); + vector field_builders; + for (auto i = 0; i < struct_builder->num_fields(); ++i) + field_builders.push_back(struct_builder->field_builder(i)); + + // Delimit each struct value in the parent builder + for (auto index = 0; index < kK(k_array)[0]->n; ++index) + struct_builder->Append(); + + // Populate each of the field builders from its kdb list. Only count up to + // the number of struct fields. Additional trailing data in the kdb mixed + // list is ignored (to allow for ::) + for (auto i = 0; i < struct_type->num_fields(); ++i) + PopulateBuilder(field_builders[i]->type(), kK(k_array)[i], field_builders[i], type_overrides); + + // Check that all the populated field builders have the same length. 
+ for (auto it : field_builders) + if (it->length() != struct_builder->length()) + throw TypeCheck("Mismatched struct list lengths"); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + PopulateUnionBuilder(datatype, k_array, builder, type_overrides); +} + +template<> +void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + PopulateUnionBuilder(datatype, k_array, builder, type_overrides); +} + +using PopulateHandler = void ( * ) ( shared_ptr, K, arrow::ArrayBuilder*, TypeMappingOverride& ); + +template +auto make_populate_handler() +{ + return make_pair( TypeId, &PopulateBuilder ); +} + +unordered_map PopulateHandlers { + make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() + , make_populate_handler() +}; + +} // namespace + +namespace kx { +namespace arrowkdb { + +// Populates data values from a kdb list into the specified array builder. 
+void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) +{ + // Special cases for: + // symbol - string or large_string + // guid - fixed_size_binary(16) + // char - uint8 + bool is_symbol = k_array->t == KS && (datatype->id() == arrow::Type::STRING || datatype->id() == arrow::Type::LARGE_STRING); + bool is_guid = k_array->t == UU && datatype->id() == arrow::Type::FIXED_SIZE_BINARY && static_cast(builder)->byte_width() == sizeof(U); + bool is_char = k_array->t == KC && (datatype->id() == arrow::Type::UINT8 || datatype->id() == arrow::Type::INT8); + + // Type check the kdb structure + if (!is_symbol && !is_guid && !is_char) + TYPE_CHECK_ARRAY(GetKdbType(datatype, type_overrides) != k_array->t, datatype->ToString(), GetKdbType(datatype, type_overrides), k_array->t); + + auto type_id = datatype->id(); + if( PopulateHandlers.find( type_id ) == PopulateHandlers.end() ) + { TYPE_CHECK_UNSUPPORTED(datatype->ToString()); } + else + { + PopulateHandlers[type_id]( datatype, k_array, builder, type_overrides ); + } } // Construct a dictionary array from its values and indicies arrays. // // This is represented in kdb as a mixed list for the parent dictionary array // containing the values and indicies sub-lists. 
-std::shared_ptr MakeDictionary(std::shared_ptr datatype, K k_array, TypeMappingOverride& type_overrides) +shared_ptr MakeDictionary(shared_ptr datatype, K k_array, TypeMappingOverride& type_overrides) { K values = kK(k_array)[0]; K indicies = kK(k_array)[1]; - auto dictionary_type = std::static_pointer_cast(datatype); + auto dictionary_type = static_pointer_cast(datatype); // Recursively construct the values and indicies arrays auto values_array = MakeArray(dictionary_type->value_type(), values, type_overrides); auto indicies_array = MakeArray(dictionary_type->index_type(), indicies, type_overrides); - std::shared_ptr result; + shared_ptr result; PARQUET_ASSIGN_OR_THROW(result, arrow::DictionaryArray::FromArrays(datatype, indicies_array, values_array)); return result; } -std::shared_ptr MakeArray(std::shared_ptr datatype, K k_array, TypeMappingOverride& type_overrides) +shared_ptr MakeArray(shared_ptr datatype, K k_array, TypeMappingOverride& type_overrides) { // DictionaryBuilder works in quite an unusual and non-standard way so just // construct the dictionary array directly @@ -790,7 +893,7 @@ std::shared_ptr MakeArray(std::shared_ptr datatyp PopulateBuilder(datatype, k_array, builder.get(), type_overrides); // Finalise the builder into the arrow array - std::shared_ptr array; + shared_ptr array; PARQUET_THROW_NOT_OK(builder->Finish(&array)); return array; } @@ -806,19 +909,19 @@ K prettyPrintArray(K datatype_id, K array, K options) if (datatype_id->t != -KI) return krr((S)"datatype_id not -6h"); - auto datatype = kx::arrowkdb::GetDatatypeStore()->Find(datatype_id->i); + auto datatype = GetDatatypeStore()->Find(datatype_id->i); if (!datatype) return krr((S)"datatype not found"); // Parse the options - auto read_options = kx::arrowkdb::KdbOptions(options, kx::arrowkdb::Options::string_options, kx::arrowkdb::Options::int_options); + auto read_options = KdbOptions(options, Options::string_options, Options::int_options); // Type mapping overrides - 
kx::arrowkdb::TypeMappingOverride type_overrides{ read_options }; + TypeMappingOverride type_overrides{ read_options }; - auto arrow_array = kx::arrowkdb::MakeArray(datatype, array, type_overrides); + auto arrow_array = MakeArray(datatype, array, type_overrides); auto options = arrow::PrettyPrintOptions(); - std::string result; + string result; arrow::PrettyPrint(*arrow_array, options, &result); return kp((S)result.c_str()); From ddc3b9d8b07d09c8278ab259eebe3a20c3ddbbc1 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 11 Jan 2023 13:30:47 +0000 Subject: [PATCH 139/276] Bump C++17 standard version to pass Mac checks on Travis --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 29f623d..33bfada 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,9 @@ project(arrowkdb CXX) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -DKXVER=3") set(CMAKE_CXX_STANDARD 14) +IF(APPLE) + set(CMAKE_CXX_STANDARD 17) +endif() set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) From c374963913430e68f7aa061bae79bd53e714a282 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 10 Jan 2023 12:49:42 +0300 Subject: [PATCH 140/276] Dict options populating for null mapping --- src/KdbOptions.h | 85 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 82 insertions(+), 3 deletions(-) diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 1af50a8..dd79a62 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -25,6 +25,13 @@ namespace Options // String options const std::string PARQUET_VERSION = "PARQUET_VERSION"; + // Dict options + const std::string NULL_MAPPING = "NULL_MAPPING"; + + // Null mapping options + const std::string NM_INT_16 = "int16"; + const std::string NM_INT_32 = "int32"; + const static std::set int_options = { PARQUET_CHUNK_SIZE, PARQUET_MULTITHREADED_READ, @@ -34,6 +41,13 @@ namespace Options const static std::set string_options = { PARQUET_VERSION, }; + const static 
std::set dict_options = { + NULL_MAPPING, + }; + const static std::set null_mapping_options = { + NM_INT_16, + NM_INT_32, + }; } @@ -42,15 +56,19 @@ namespace Options // Dictionary key: KS // Dictionary value: KS or // KJ or -// 0 of -KS|-KJ|KC +// XD or +// 0 of -KS|-KJ|XD|KC class KdbOptions { private: + std::map null_mapping_options; std::map string_options; std::map int_options; const std::set& supported_string_options; const std::set& supported_int_options; + const std::set& supported_dict_options; + const std::set& supported_null_mapping_options; private: const std::string ToUpper(std::string str) const @@ -81,6 +99,26 @@ class KdbOptions } } + void PopulateDictOptions( K keys, K values ) + { + for( auto i = 0ll; i < values->n; ++i ) { + const std::string key = ToUpper( kS( keys )[i] ); + if( supported_dict_options.find( key ) == supported_dict_options.end() ){ + throw InvalidOption(("Unsupported dict option '" + key + "'").c_str()); + } + + K dict = kK( values )[0]; + K options = kK( values )[1]; + for( auto j = 0ll; j < options->n; ++j ) { + const std::string option = ToUpper( kS( dict )[j] ); + if( supported_null_mapping_options.find( key ) == supported_null_mapping_options.end() ){ + throw InvalidOption(("Unsupported '" + key + "' option '" + option + "'").c_str()); + } + null_mapping_options[option] = ToUpper( kS( options )[j] ); + } + } + } + void PopulateMixedOptions(K keys, K values) { for (auto i = 0ll; i < values->n; ++i) { @@ -104,6 +142,22 @@ class KdbOptions string_options[key] = ToUpper(std::string((char*)kG(value), value->n)); break; } + case XD: + { + if( supported_dict_options.find( key ) == supported_dict_options.end() ){ + throw InvalidOption(("Unsupported dict option '" + key + "'").c_str()); + } + K dict = kK( values )[0]; + K options = kK( values )[1]; + for( auto j = 0ll; j < options->n; ++j ) { + const std::string option = ToUpper( kS( dict )[j] ); + if( supported_null_mapping_options.find( key ) == supported_null_mapping_options.end() 
){ + throw InvalidOption(("Unsupported '" + key + "' option '" + option + "'").c_str()); + } + null_mapping_options[option] = ToUpper( kS( options )[j] ); + } + break; + } case 101: // Ignore :: break; @@ -121,8 +175,16 @@ class KdbOptions {}; }; - KdbOptions(K options, const std::set supported_string_options_, const std::set supported_int_options_) : - supported_string_options(supported_string_options_), supported_int_options(supported_int_options_) + KdbOptions( + K options + , const std::set supported_string_options_ + , const std::set supported_int_options_ + , const std::set& supported_dict_options_ = std::set {} + , const std::set& supported_null_mapping_options_ = std::set {} ) + : supported_string_options(supported_string_options_) + , supported_int_options(supported_int_options_) + , supported_dict_options( supported_dict_options_ ) + , supported_null_mapping_options( supported_null_mapping_options_ ) { if (options != NULL && options->t != 101) { if (options->t != 99) @@ -138,6 +200,9 @@ class KdbOptions case KS: PopulateStringOptions(keys, values); break; + case XD: + PopulateDictOptions(keys, values); + break; case 0: PopulateMixedOptions(keys, values); break; @@ -147,6 +212,20 @@ class KdbOptions } } + bool GetNullMappingOption( const std::string key, std::string& result ) const + { + const auto it = null_mapping_options.find( key ); + if( it == null_mapping_options.end() ) + { + return false; + } + else + { + result = it->second; + return true; + } + } + bool GetStringOption(const std::string key, std::string& result) const { const auto it = string_options.find(key); From fa83f88cce4e5f77f6ddceed4cd37cd3ce93633f Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 12 Jan 2023 12:34:30 +0300 Subject: [PATCH 141/276] Supporting of nested dictionaries --- src/KdbOptions.h | 51 ++++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/src/KdbOptions.h b/src/KdbOptions.h index dd79a62..1a92fed 
100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -29,8 +29,8 @@ namespace Options const std::string NULL_MAPPING = "NULL_MAPPING"; // Null mapping options - const std::string NM_INT_16 = "int16"; - const std::string NM_INT_32 = "int32"; + const std::string NM_INT_16 = "INT16"; + const std::string NM_INT_32 = "INT32"; const static std::set int_options = { PARQUET_CHUNK_SIZE, @@ -99,6 +99,30 @@ class KdbOptions } } + void PopulateNullMappingOptions( long long index, K dict ) + { + K keys = kK( kK( dict )[index] )[0]; + K values = kK( kK( dict )[index] )[1]; + for( auto i = 0ll; i < values->n; ++i ){ + const std::string key = ToUpper( kS( keys )[i] ); + if( supported_null_mapping_options.find( key ) == supported_null_mapping_options.end() ){ + throw InvalidOption(("Unsupported NULL_MAPPING option '" + key + "'").c_str()); + } + switch( kK( values )[i]->t ) + { + case -KH: + null_mapping_options[key]; // = kK( values )[j]->h; to_string? variant?? + break; + case -KI: + null_mapping_options[key]; // = kK( values )[j]->i; to_string? variant?? 
+ break; + case 0: + null_mapping_options[key] = ToUpper( kS( values )[i] ); + break; + }; + } + } + void PopulateDictOptions( K keys, K values ) { for( auto i = 0ll; i < values->n; ++i ) { @@ -106,15 +130,9 @@ class KdbOptions if( supported_dict_options.find( key ) == supported_dict_options.end() ){ throw InvalidOption(("Unsupported dict option '" + key + "'").c_str()); } - - K dict = kK( values )[0]; - K options = kK( values )[1]; - for( auto j = 0ll; j < options->n; ++j ) { - const std::string option = ToUpper( kS( dict )[j] ); - if( supported_null_mapping_options.find( key ) == supported_null_mapping_options.end() ){ - throw InvalidOption(("Unsupported '" + key + "' option '" + option + "'").c_str()); - } - null_mapping_options[option] = ToUpper( kS( options )[j] ); + if( Options::NULL_MAPPING == key ) + { + PopulateNullMappingOptions( i, values ); } } } @@ -147,14 +165,9 @@ class KdbOptions if( supported_dict_options.find( key ) == supported_dict_options.end() ){ throw InvalidOption(("Unsupported dict option '" + key + "'").c_str()); } - K dict = kK( values )[0]; - K options = kK( values )[1]; - for( auto j = 0ll; j < options->n; ++j ) { - const std::string option = ToUpper( kS( dict )[j] ); - if( supported_null_mapping_options.find( key ) == supported_null_mapping_options.end() ){ - throw InvalidOption(("Unsupported '" + key + "' option '" + option + "'").c_str()); - } - null_mapping_options[option] = ToUpper( kS( options )[j] ); + if( Options::NULL_MAPPING == key ) + { + PopulateNullMappingOptions( i, values ); } break; } From 18f6f779460b72ec67d25db803b9727af7f16763 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 12 Jan 2023 20:40:00 +0300 Subject: [PATCH 142/276] Arrow builder null mapping overriding --- src/ArrayWriter.cpp | 11 ++++++- src/HelperFunctions.cpp | 1 + src/HelperFunctions.h | 1 + src/KdbOptions.h | 64 +++++++++++++++++++++++++++++++---------- 4 files changed, 61 insertions(+), 16 deletions(-) diff --git a/src/ArrayWriter.cpp 
b/src/ArrayWriter.cpp index ad05e33..83ca5ce 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -463,7 +463,16 @@ template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto int16_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(int16_builder->AppendValues((int16_t*)kH(k_array), k_array->n)); + if( type_overrides.null_mapping.have_int16 ){ + auto null_bitmap = std::unique_ptr( new uint8_t[k_array->n] ); + for( auto i = 0; i < k_array->n; ++i ){ + null_bitmap[i] = !( kH( k_array )[i] ^ type_overrides.null_mapping.int16_null ); + PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )kH( k_array ), k_array->n, null_bitmap.get() ) ); + } + } + else { + PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )kH( k_array), k_array->n ) ); + } } template<> diff --git a/src/HelperFunctions.cpp b/src/HelperFunctions.cpp index 5ade109..868cf07 100644 --- a/src/HelperFunctions.cpp +++ b/src/HelperFunctions.cpp @@ -148,6 +148,7 @@ const std::string GetKdbString(K str) TypeMappingOverride::TypeMappingOverride(const KdbOptions& options) { options.GetIntOption(Options::DECIMAL128_AS_DOUBLE, decimal128_as_double); + options.GetNullMappingOptions( null_mapping ); } KdbType GetKdbType(std::shared_ptr datatype, TypeMappingOverride& type_overrides) diff --git a/src/HelperFunctions.h b/src/HelperFunctions.h index d6faaef..201707a 100644 --- a/src/HelperFunctions.h +++ b/src/HelperFunctions.h @@ -80,6 +80,7 @@ typedef signed char KdbType; struct TypeMappingOverride { int64_t decimal128_as_double = 0; + Options::NullMapping null_mapping; TypeMappingOverride(void) {}; TypeMappingOverride(const KdbOptions& options); }; diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 1a92fed..5ba9b66 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -31,6 +31,7 @@ namespace Options // Null mapping options const std::string NM_INT_16 = "INT16"; const std::string NM_INT_32 = "INT32"; + 
const std::string NM_SYMBOL = "SYMBOL"; const static std::set int_options = { PARQUET_CHUNK_SIZE, @@ -47,6 +48,17 @@ namespace Options const static std::set null_mapping_options = { NM_INT_16, NM_INT_32, + NM_SYMBOL + }; + + struct NullMapping + { + bool have_int16; + int16_t int16_null; + bool have_int32; + int32_t int32_null; + bool have_symbol; + std::string symbol_null; }; } @@ -61,7 +73,7 @@ namespace Options class KdbOptions { private: - std::map null_mapping_options; + Options::NullMapping null_mapping_options; std::map string_options; std::map int_options; @@ -111,13 +123,16 @@ class KdbOptions switch( kK( values )[i]->t ) { case -KH: - null_mapping_options[key]; // = kK( values )[j]->h; to_string? variant?? + null_mapping_options.have_int16 = true; + null_mapping_options.int16_null = kK( values )[i]->h; break; case -KI: - null_mapping_options[key]; // = kK( values )[j]->i; to_string? variant?? + null_mapping_options.int32_null = true; + null_mapping_options.int32_null = kK( values )[i]->i; break; case 0: - null_mapping_options[key] = ToUpper( kS( values )[i] ); + null_mapping_options.have_symbol = true; + null_mapping_options.symbol_null = ToUpper( kS( values )[i] ); break; }; } @@ -225,18 +240,12 @@ class KdbOptions } } - bool GetNullMappingOption( const std::string key, std::string& result ) const + template + auto GetNullMappingOption( bool& result ); + + void GetNullMappingOptions( Options::NullMapping& null_mapping ) const { - const auto it = null_mapping_options.find( key ); - if( it == null_mapping_options.end() ) - { - return false; - } - else - { - result = it->second; - return true; - } + null_mapping = null_mapping_options; } bool GetStringOption(const std::string key, std::string& result) const @@ -262,6 +271,31 @@ class KdbOptions } }; +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ) +{ + result = null_mapping_options.have_int16; + + return null_mapping_options.int16_null; +} + +template<> +inline auto 
KdbOptions::GetNullMappingOption( bool& result ) +{ + result = null_mapping_options.have_int32; + + return null_mapping_options.int32_null; +} + +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ) +{ + result = null_mapping_options.have_symbol; + + return null_mapping_options.symbol_null; +} + + } // namespace arrowkdb } // namespace kx From 2677c56bcd4ac86df1e73f82c00ddf2ac6cb419f Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 13 Jan 2023 17:27:37 +0300 Subject: [PATCH 143/276] Arrow string builder with mapping of nulls --- src/ArrayWriter.cpp | 53 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index 83ca5ce..e26c2ae 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -467,8 +467,8 @@ void PopulateBuilder(shared_ptr datatype, K auto null_bitmap = std::unique_ptr( new uint8_t[k_array->n] ); for( auto i = 0; i < k_array->n; ++i ){ null_bitmap[i] = !( kH( k_array )[i] ^ type_overrides.null_mapping.int16_null ); - PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )kH( k_array ), k_array->n, null_bitmap.get() ) ); } + PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )kH( k_array ), k_array->n, null_bitmap.get() ) ); } else { PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )kH( k_array), k_array->n ) ); @@ -486,7 +486,16 @@ template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto int32_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(int32_builder->AppendValues((int32_t*)kI(k_array), k_array->n)); + if( type_overrides.null_mapping.have_int32 ){ + auto null_bitmap = std::unique_ptr( new uint8_t[k_array->n] ); + for( auto i = 0; i < k_array->n; ++i ){ + null_bitmap[i] = !( kH( k_array )[i] ^ type_overrides.null_mapping.int32_null ); + } + PARQUET_THROW_NOT_OK( int32_builder->AppendValues( ( int32_t* 
)kH( k_array ), k_array->n, null_bitmap.get() ) ); + } + else{ + PARQUET_THROW_NOT_OK(int32_builder->AppendValues((int32_t*)kI(k_array), k_array->n)); + } } template<> @@ -528,12 +537,23 @@ void PopulateBuilder(shared_ptr datatype, template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { - bool is_symbol = k_array->t == KS && (datatype->id() == arrow::Type::STRING || datatype->id() == arrow::Type::LARGE_STRING); auto str_builder = static_cast(builder); - if (is_symbol) { - // Populate from symbol list - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(str_builder->Append(kS(k_array)[i])); + if( is_symbol ){ + if( type_overrides.null_mapping.have_symbol ){ + for( auto i = 0; i < k_array->n; ++i ){ + if( type_overrides.null_mapping.symbol_null == kS( k_array )[i] ){ + PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK( str_builder->Append( kS( k_array )[i] ) ); + } + } + } + else{ + // Populate from symbol list + for (auto i = 0; i < k_array->n; ++i) + PARQUET_THROW_NOT_OK(str_builder->Append(kS(k_array)[i])); + } } else { // Populate from mixed list of char lists for (auto i = 0; i < k_array->n; ++i) { @@ -547,12 +567,23 @@ void PopulateBuilder(shared_ptr datatype, template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { - bool is_symbol = k_array->t == KS && (datatype->id() == arrow::Type::STRING || datatype->id() == arrow::Type::LARGE_STRING); auto str_builder = static_cast(builder); if (is_symbol) { - // Populate from symbol list - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(str_builder->Append(kS(k_array)[i])); + if( type_overrides.null_mapping.have_symbol ){ + for( auto i = 0; i < k_array->n; ++i ){ + if( type_overrides.null_mapping.symbol_null == kS( k_array )[i] ){ + PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); + } + else{ + 
PARQUET_THROW_NOT_OK( str_builder->Append( kS( k_array )[i] ) ); + } + } + } + else{ + // Populate from symbol list + for (auto i = 0; i < k_array->n; ++i) + PARQUET_THROW_NOT_OK(str_builder->Append(kS(k_array)[i])); + } } else { // Populate from mixed list of char lists for (auto i = 0; i < k_array->n; ++i) { From 92e1cd34154e14b1235783262c7d142af3995eeb Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 13 Jan 2023 19:18:57 +0300 Subject: [PATCH 144/276] More granular mappings for each arrow datatype --- src/ArrayWriter.cpp | 8 ++-- src/KdbOptions.h | 100 ++++++++++++++++++++++++++++++++++---------- 2 files changed, 83 insertions(+), 25 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index e26c2ae..d7977b0 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -539,9 +539,9 @@ void PopulateBuilder(shared_ptr datatype, { auto str_builder = static_cast(builder); if( is_symbol ){ - if( type_overrides.null_mapping.have_symbol ){ + if( type_overrides.null_mapping.have_string ){ for( auto i = 0; i < k_array->n; ++i ){ - if( type_overrides.null_mapping.symbol_null == kS( k_array )[i] ){ + if( type_overrides.null_mapping.string_null == kS( k_array )[i] ){ PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); } else{ @@ -569,9 +569,9 @@ void PopulateBuilder(shared_ptr data { auto str_builder = static_cast(builder); if (is_symbol) { - if( type_overrides.null_mapping.have_symbol ){ + if( type_overrides.null_mapping.have_large_string ){ for( auto i = 0; i < k_array->n; ++i ){ - if( type_overrides.null_mapping.symbol_null == kS( k_array )[i] ){ + if( type_overrides.null_mapping.large_string_null == kS( k_array )[i] ){ PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); } else{ diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 5ba9b66..1a32ecf 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -13,6 +13,33 @@ namespace kx { namespace arrowkdb { +template +constexpr auto toUType( E enumerator ) noexcept +{ + return static_cast>( 
enumerator ); +} + +template< typename E > +struct ETraits +{ + using Names = std::map< E, std::string >; + + static std::string name( E enumerator ) + { + auto it = names.find( enumerator ); + if( it != names.end() ) + { + return it->second; + } + + return "unknown"; + } + + static std::string name( int index ) { return name( static_cast( index ) ); } + + static const Names names; +}; + // Supported options namespace Options { @@ -31,7 +58,8 @@ namespace Options // Null mapping options const std::string NM_INT_16 = "INT16"; const std::string NM_INT_32 = "INT32"; - const std::string NM_SYMBOL = "SYMBOL"; + const std::string NM_STRING = "STRING"; + const std::string NM_LARGE_STRING = "LARGE_STRING"; const static std::set int_options = { PARQUET_CHUNK_SIZE, @@ -48,20 +76,37 @@ namespace Options const static std::set null_mapping_options = { NM_INT_16, NM_INT_32, - NM_SYMBOL + NM_STRING, + NM_LARGE_STRING }; struct NullMapping { + enum class Type: int{ + INT_16 + , INT_32 + , STRING + , LARGE_STRING + }; + bool have_int16; int16_t int16_null; bool have_int32; int32_t int32_null; - bool have_symbol; - std::string symbol_null; + bool have_string; + std::string string_null; + bool have_large_string; + std::string large_string_null; }; } +template<> +inline const ETraits< Options::NullMapping::Type >::Names ETraits< Options::NullMapping::Type >::names { + { Options::NullMapping::Type::INT_16, Options::NM_INT_16 } + , { Options::NullMapping::Type::INT_32, Options::NM_INT_32 } + , { Options::NullMapping::Type::STRING, Options::NM_STRING } + , { Options::NullMapping::Type::LARGE_STRING, Options::NM_LARGE_STRING } +}; // Helper class for reading dictionary of options // @@ -113,6 +158,8 @@ class KdbOptions void PopulateNullMappingOptions( long long index, K dict ) { + using NM = Options::NullMapping::Type; + K keys = kK( kK( dict )[index] )[0]; K values = kK( kK( dict )[index] )[1]; for( auto i = 0ll; i < values->n; ++i ){ @@ -120,21 +167,25 @@ class KdbOptions if( 
supported_null_mapping_options.find( key ) == supported_null_mapping_options.end() ){ throw InvalidOption(("Unsupported NULL_MAPPING option '" + key + "'").c_str()); } - switch( kK( values )[i]->t ) - { - case -KH: + if( ETraits::name( NM::INT_16 ) == key && -KH == kK( values )[i]->t ){ null_mapping_options.have_int16 = true; null_mapping_options.int16_null = kK( values )[i]->h; - break; - case -KI: + } + else if( ETraits::name( NM::INT_32 ) == key && -KI == kK( values )[i]->t ){ null_mapping_options.int32_null = true; null_mapping_options.int32_null = kK( values )[i]->i; - break; - case 0: - null_mapping_options.have_symbol = true; - null_mapping_options.symbol_null = ToUpper( kS( values )[i] ); - break; - }; + } + else if( ETraits::name( NM::STRING ) == key && 0 == kK( values )[i]->t ){ + null_mapping_options.have_string = true; + null_mapping_options.string_null = ToUpper( kS( values )[i] ); + } + else if( ETraits::name( NM::LARGE_STRING ) == key && 0 == kK( values )[i]->t ){ + null_mapping_options.have_large_string = true; + null_mapping_options.large_string_null = ToUpper( kS( values )[i] ); + } + else{ + throw InvalidOption(("Unsupported KDB data type for NULL_MAPPING option '" + key + "'").c_str()); + } } } @@ -240,7 +291,7 @@ class KdbOptions } } - template + template auto GetNullMappingOption( bool& result ); void GetNullMappingOptions( Options::NullMapping& null_mapping ) const @@ -272,7 +323,7 @@ class KdbOptions }; template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ) +inline auto KdbOptions::GetNullMappingOption( bool& result ) { result = null_mapping_options.have_int16; @@ -280,7 +331,7 @@ inline auto KdbOptions::GetNullMappingOption( bool& result ) } template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ) +inline auto KdbOptions::GetNullMappingOption( bool& result ) { result = null_mapping_options.have_int32; @@ -288,13 +339,20 @@ inline auto KdbOptions::GetNullMappingOption( bool& result ) } template<> -inline 
auto KdbOptions::GetNullMappingOption( bool& result ) +inline auto KdbOptions::GetNullMappingOption( bool& result ) { - result = null_mapping_options.have_symbol; + result = null_mapping_options.have_string; - return null_mapping_options.symbol_null; + return null_mapping_options.string_null; } +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ) +{ + result = null_mapping_options.have_large_string; + + return null_mapping_options.large_string_null; +} } // namespace arrowkdb } // namespace kx From e247bf7e5a27bdb750f114fc017f993fa6ea6739 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 16 Jan 2023 11:35:57 +0300 Subject: [PATCH 145/276] Default initialization of options --- src/KdbOptions.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 1a32ecf..3470adc 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -258,9 +258,10 @@ class KdbOptions K options , const std::set supported_string_options_ , const std::set supported_int_options_ - , const std::set& supported_dict_options_ = std::set {} - , const std::set& supported_null_mapping_options_ = std::set {} ) - : supported_string_options(supported_string_options_) + , const std::set& supported_dict_options_ = Options::dict_options + , const std::set& supported_null_mapping_options_ = Options::null_mapping_options ) + : null_mapping_options {0} + , supported_string_options(supported_string_options_) , supported_int_options(supported_int_options_) , supported_dict_options( supported_dict_options_ ) , supported_null_mapping_options( supported_null_mapping_options_ ) From 0e3d56af83685625c0bd1caeb9886b92f3a3a41c Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 16 Jan 2023 14:32:15 +0300 Subject: [PATCH 146/276] Integer writer debugging --- src/ArrayWriter.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index d7977b0..2883f1c 100644 --- 
a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -489,9 +489,9 @@ void PopulateBuilder(shared_ptr datatype, K if( type_overrides.null_mapping.have_int32 ){ auto null_bitmap = std::unique_ptr( new uint8_t[k_array->n] ); for( auto i = 0; i < k_array->n; ++i ){ - null_bitmap[i] = !( kH( k_array )[i] ^ type_overrides.null_mapping.int32_null ); + null_bitmap[i] = !( kI( k_array )[i] ^ type_overrides.null_mapping.int32_null ); } - PARQUET_THROW_NOT_OK( int32_builder->AppendValues( ( int32_t* )kH( k_array ), k_array->n, null_bitmap.get() ) ); + PARQUET_THROW_NOT_OK( int32_builder->AppendValues( ( int32_t* )kI( k_array ), k_array->n, null_bitmap.get() ) ); } else{ PARQUET_THROW_NOT_OK(int32_builder->AppendValues((int32_t*)kI(k_array), k_array->n)); From 4bb906db88782da0bddef08ce5dc527481029af2 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 16 Jan 2023 17:46:25 +0300 Subject: [PATCH 147/276] String writer debugging --- src/ArrayWriter.cpp | 92 +++++++++++++++++++++++++-------------------- src/KdbOptions.h | 19 +++++----- 2 files changed, 61 insertions(+), 50 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index 2883f1c..b5d1bf1 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -464,11 +464,14 @@ void PopulateBuilder(shared_ptr datatype, K { auto int16_builder = static_cast(builder); if( type_overrides.null_mapping.have_int16 ){ - auto null_bitmap = std::unique_ptr( new uint8_t[k_array->n] ); for( auto i = 0; i < k_array->n; ++i ){ - null_bitmap[i] = !( kH( k_array )[i] ^ type_overrides.null_mapping.int16_null ); + if( type_overrides.null_mapping.int16_null == kH( k_array )[i]){ + PARQUET_THROW_NOT_OK( int16_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )&kH( k_array )[i], 1 ) ); + } } - PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )kH( k_array ), k_array->n, null_bitmap.get() ) ); } else { PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )kH( 
k_array), k_array->n ) ); @@ -486,12 +489,17 @@ template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto int32_builder = static_cast(builder); + type_overrides.null_mapping.have_int32 = true; + type_overrides.null_mapping.int32_null = -2147483648; if( type_overrides.null_mapping.have_int32 ){ - auto null_bitmap = std::unique_ptr( new uint8_t[k_array->n] ); for( auto i = 0; i < k_array->n; ++i ){ - null_bitmap[i] = !( kI( k_array )[i] ^ type_overrides.null_mapping.int32_null ); + if( type_overrides.null_mapping.int32_null == kI( k_array )[i] ){ + PARQUET_THROW_NOT_OK( int32_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK( int32_builder->AppendValues( ( int32_t* )&kI( k_array )[i], 1 ) ); + } } - PARQUET_THROW_NOT_OK( int32_builder->AppendValues( ( int32_t* )kI( k_array ), k_array->n, null_bitmap.get() ) ); } else{ PARQUET_THROW_NOT_OK(int32_builder->AppendValues((int32_t*)kI(k_array), k_array->n)); @@ -539,27 +547,28 @@ void PopulateBuilder(shared_ptr datatype, { auto str_builder = static_cast(builder); if( is_symbol ){ - if( type_overrides.null_mapping.have_string ){ - for( auto i = 0; i < k_array->n; ++i ){ - if( type_overrides.null_mapping.string_null == kS( k_array )[i] ){ - PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); - } - else{ - PARQUET_THROW_NOT_OK( str_builder->Append( kS( k_array )[i] ) ); - } + // Populate from symbol list + for( auto i = 0ll; i < k_array->n; ++i ){ + if( type_overrides.null_mapping.have_string + && type_overrides.null_mapping.string_null == kS( k_array )[i] ){ + PARQUET_THROW_NOT_OK( str_builder->AppendEmptyValue() ); + } + else{ + PARQUET_THROW_NOT_OK( str_builder->Append( kS( k_array )[i] ) ); } - } - else{ - // Populate from symbol list - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(str_builder->Append(kS(k_array)[i])); } } else { // Populate from mixed list of char lists - for (auto i = 0; i < k_array->n; ++i) { - K 
str_data = kK(k_array)[i]; - TYPE_CHECK_ITEM(str_data->t != KC, datatype->ToString(), KC, str_data->t); - PARQUET_THROW_NOT_OK(str_builder->Append(kG(str_data), str_data->n)); + for( auto i = 0ll; i < k_array->n; ++i ){ + K str_data = kK( k_array )[i]; + TYPE_CHECK_ITEM( str_data->t != KC, datatype->ToString(), KC, str_data->t ); + if( type_overrides.null_mapping.have_string + && type_overrides.null_mapping.string_null == std::string( ( char* )kG( str_data ), str_data->n ) ){ + PARQUET_THROW_NOT_OK( str_builder->AppendEmptyValue() ); + } + else{ + PARQUET_THROW_NOT_OK( str_builder->Append( kG( str_data ), str_data->n ) ); + } } } } @@ -568,28 +577,29 @@ template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto str_builder = static_cast(builder); - if (is_symbol) { - if( type_overrides.null_mapping.have_large_string ){ - for( auto i = 0; i < k_array->n; ++i ){ - if( type_overrides.null_mapping.large_string_null == kS( k_array )[i] ){ - PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); - } - else{ - PARQUET_THROW_NOT_OK( str_builder->Append( kS( k_array )[i] ) ); - } + if( is_symbol ){ + // Populate from symbol list + for( auto i = 0ll; i < k_array->n; ++i ){ + if( type_overrides.null_mapping.have_large_string + && type_overrides.null_mapping.large_string_null == kS( k_array )[i] ){ + PARQUET_THROW_NOT_OK( str_builder->AppendEmptyValue() ); + } + else{ + PARQUET_THROW_NOT_OK( str_builder->Append( kS( k_array )[i] ) ); } - } - else{ - // Populate from symbol list - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(str_builder->Append(kS(k_array)[i])); } } else { // Populate from mixed list of char lists - for (auto i = 0; i < k_array->n; ++i) { - K str_data = kK(k_array)[i]; - TYPE_CHECK_ITEM(str_data->t != KC, datatype->ToString(), KC, str_data->t); - PARQUET_THROW_NOT_OK(str_builder->Append(kG(str_data), str_data->n)); + for( auto i = 0ll; i < k_array->n; ++i ){ + K str_data 
= kK( k_array )[i]; + TYPE_CHECK_ITEM( str_data->t != KC, datatype->ToString(), KC, str_data->t ); + if( type_overrides.null_mapping.have_string + && type_overrides.null_mapping.string_null == std::string( ( char* )kG( str_data ), str_data->n ) ){ + PARQUET_THROW_NOT_OK( str_builder->AppendEmptyValue() ); + } + else{ + PARQUET_THROW_NOT_OK( str_builder->Append( kG( str_data ), str_data->n ) ); + } } } } diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 3470adc..4858b8e 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -167,21 +167,22 @@ class KdbOptions if( supported_null_mapping_options.find( key ) == supported_null_mapping_options.end() ){ throw InvalidOption(("Unsupported NULL_MAPPING option '" + key + "'").c_str()); } - if( ETraits::name( NM::INT_16 ) == key && -KH == kK( values )[i]->t ){ + K value = kK( values )[i]; + if( ETraits::name( NM::INT_16 ) == key && -KH == value->t ){ + null_mapping_options.int16_null = value->h; null_mapping_options.have_int16 = true; - null_mapping_options.int16_null = kK( values )[i]->h; } - else if( ETraits::name( NM::INT_32 ) == key && -KI == kK( values )[i]->t ){ - null_mapping_options.int32_null = true; - null_mapping_options.int32_null = kK( values )[i]->i; + else if( ETraits::name( NM::INT_32 ) == key && -KI == value->t ){ + null_mapping_options.int32_null = value->i; + null_mapping_options.have_int32 = true; } - else if( ETraits::name( NM::STRING ) == key && 0 == kK( values )[i]->t ){ + else if( ETraits::name( NM::STRING ) == key && KC == value->t ){ + null_mapping_options.string_null.assign( (char*)kC( value ), value->n ); null_mapping_options.have_string = true; - null_mapping_options.string_null = ToUpper( kS( values )[i] ); } - else if( ETraits::name( NM::LARGE_STRING ) == key && 0 == kK( values )[i]->t ){ + else if( ETraits::name( NM::LARGE_STRING ) == key && KC == value->t ){ + null_mapping_options.large_string_null.assign( (char*)kC( value ), value->n ); null_mapping_options.have_large_string = true; - 
null_mapping_options.large_string_null = ToUpper( kS( values )[i] ); } else{ throw InvalidOption(("Unsupported KDB data type for NULL_MAPPING option '" + key + "'").c_str()); From d7be8aad2c9cbf81e467e1d1c64acac2616aa6ad Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 17 Jan 2023 15:53:03 +0300 Subject: [PATCH 148/276] Enable null supporting fields --- src/ArrayWriter.cpp | 8 ++++---- src/FieldStore.cpp | 10 ++-------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index b5d1bf1..88d8e86 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -551,7 +551,7 @@ void PopulateBuilder(shared_ptr datatype, for( auto i = 0ll; i < k_array->n; ++i ){ if( type_overrides.null_mapping.have_string && type_overrides.null_mapping.string_null == kS( k_array )[i] ){ - PARQUET_THROW_NOT_OK( str_builder->AppendEmptyValue() ); + PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); } else{ PARQUET_THROW_NOT_OK( str_builder->Append( kS( k_array )[i] ) ); @@ -564,7 +564,7 @@ void PopulateBuilder(shared_ptr datatype, TYPE_CHECK_ITEM( str_data->t != KC, datatype->ToString(), KC, str_data->t ); if( type_overrides.null_mapping.have_string && type_overrides.null_mapping.string_null == std::string( ( char* )kG( str_data ), str_data->n ) ){ - PARQUET_THROW_NOT_OK( str_builder->AppendEmptyValue() ); + PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); } else{ PARQUET_THROW_NOT_OK( str_builder->Append( kG( str_data ), str_data->n ) ); @@ -582,7 +582,7 @@ void PopulateBuilder(shared_ptr data for( auto i = 0ll; i < k_array->n; ++i ){ if( type_overrides.null_mapping.have_large_string && type_overrides.null_mapping.large_string_null == kS( k_array )[i] ){ - PARQUET_THROW_NOT_OK( str_builder->AppendEmptyValue() ); + PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); } else{ PARQUET_THROW_NOT_OK( str_builder->Append( kS( k_array )[i] ) ); @@ -595,7 +595,7 @@ void PopulateBuilder(shared_ptr data TYPE_CHECK_ITEM( str_data->t != 
KC, datatype->ToString(), KC, str_data->t ); if( type_overrides.null_mapping.have_string && type_overrides.null_mapping.string_null == std::string( ( char* )kG( str_data ), str_data->n ) ){ - PARQUET_THROW_NOT_OK( str_builder->AppendEmptyValue() ); + PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); } else{ PARQUET_THROW_NOT_OK( str_builder->Append( kG( str_data ), str_data->n ) ); diff --git a/src/FieldStore.cpp b/src/FieldStore.cpp index 53cf6b0..04af9f5 100644 --- a/src/FieldStore.cpp +++ b/src/FieldStore.cpp @@ -108,11 +108,5 @@ K field(K field_name, K datatype_id) if (!datatype) return krr((S)"datatype not found"); - // Converting between kdb nulls are arrow nulls would incur a massive - // performance hit (up to 10x worse with trival datatypes that could otherwise - // be memcpy'ed). Also, not all kdb types have a null value, e.g. KB, KG, KS, - // 0 of KC, 0 of KG, etc. So don't allow fields to be created as nullable - // (other than NA type which is all nulls). - bool nullable = datatype->id() == arrow::Type::NA; - return ki(kx::arrowkdb::GetFieldStore()->Add(arrow::field(kx::arrowkdb::GetKdbString(field_name), datatype, nullable))); -} \ No newline at end of file + return ki(kx::arrowkdb::GetFieldStore()->Add(arrow::field(kx::arrowkdb::GetKdbString(field_name), datatype, true))); +} From 70265390a4185e66d8357f126ba6a15855d09428 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 17 Jan 2023 15:53:28 +0300 Subject: [PATCH 149/276] Null mapping example --- examples/null_mapping.q | 52 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 examples/null_mapping.q diff --git a/examples/null_mapping.q b/examples/null_mapping.q new file mode 100644 index 0000000..2b1b31c --- /dev/null +++ b/examples/null_mapping.q @@ -0,0 +1,52 @@ +///////////////////////// +// CONSTRUCTED SCHEMAS // +///////////////////////// + +//-------------------// +// Create the schema // +//-------------------// + +// Create the datatype 
identifiers +ts_dt:.arrowkdb.dt.timestamp[`nano]; +f64_dt:.arrowkdb.dt.float64[]; +i32_dt:.arrowkdb.dt.int32[]; +bool_dt:.arrowkdb.dt.boolean[]; +str_dt:.arrowkdb.dt.utf8[]; + +// Create the field identifiers +tstamp_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; +temp_fd:.arrowkdb.fd.field[`temperature;f64_dt]; +fill_fd:.arrowkdb.fd.field[`fill_level;i32_dt]; +pump_fd:.arrowkdb.fd.field[`pump_status;bool_dt]; +comment_fd:.arrowkdb.fd.field[`comment;str_dt]; + +// Create the schema for the list of fields +schema:.arrowkdb.sc.schema[(tstamp_fd,temp_fd,fill_fd,pump_fd,comment_fd)]; + +// Print the schema +.arrowkdb.sc.printSchema[schema] + +//-----------------------// +// Create the array data // +//-----------------------// + +// Create data for each column in the table +tstamp_data:asc N?0p; +temp_data:N?100f; +fill_data:N?100i; +fill_data[0]:0Ni +pump_data:N?0b; +comment_data:N?("start";"stop";"alert";"acknowledge";""); + +// Combine the data for all columns +array_data:(tstamp_data;temp_data;fill_data;pump_data;comment_data); + +// Support null mapping +null_opts:(`int16`int32`string)!(0Nh;0Ni;"start") +options:(``NULL_MAPPING)!((::);null_opts) + +// Pretty print the Arrow table populated from the array data +.arrowkdb.tb.prettyPrintTable[schema;array_data;options] + +options[`PARQUET_VERSION]:`V2.0 +.arrowkdb.pq.writeParquet["null_mapping.parquet";schema;array_data;options] From 24b91c447569ab2420bee66c650253794e5218df Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 18 Jan 2023 11:29:54 +0300 Subject: [PATCH 150/276] Batch operations for integers --- src/ArrayWriter.cpp | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index 88d8e86..584c77f 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -464,17 +464,14 @@ void PopulateBuilder(shared_ptr datatype, K { auto int16_builder = static_cast(builder); if( type_overrides.null_mapping.have_int16 ){ - for( 
auto i = 0; i < k_array->n; ++i ){ - if( type_overrides.null_mapping.int16_null == kH( k_array )[i]){ - PARQUET_THROW_NOT_OK( int16_builder->AppendNull() ); - } - else{ - PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )&kH( k_array )[i], 1 ) ); - } + std::vector null_bitmap( k_array->n ); + for( auto i = 0ll; i < k_array->n; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.int16_null ^ kH( k_array )[i]; } + PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )kH( k_array ), k_array->n, null_bitmap ) ); } else { - PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )kH( k_array), k_array->n ) ); + PARQUET_THROW_NOT_OK(int16_builder->AppendValues((int16_t*)kH(k_array), k_array->n)); } } @@ -489,17 +486,12 @@ template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto int32_builder = static_cast(builder); - type_overrides.null_mapping.have_int32 = true; - type_overrides.null_mapping.int32_null = -2147483648; if( type_overrides.null_mapping.have_int32 ){ - for( auto i = 0; i < k_array->n; ++i ){ - if( type_overrides.null_mapping.int32_null == kI( k_array )[i] ){ - PARQUET_THROW_NOT_OK( int32_builder->AppendNull() ); - } - else{ - PARQUET_THROW_NOT_OK( int32_builder->AppendValues( ( int32_t* )&kI( k_array )[i], 1 ) ); - } + std::vector null_bitmap( k_array->n ); + for( auto i = 0ll; i < k_array->n; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.int32_null ^ kI( k_array )[i]; } + PARQUET_THROW_NOT_OK( int32_builder->AppendValues( ( int32_t* )kI( k_array ), k_array->n, null_bitmap ) ); } else{ PARQUET_THROW_NOT_OK(int32_builder->AppendValues((int32_t*)kI(k_array), k_array->n)); @@ -546,6 +538,7 @@ template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto str_builder = static_cast(builder); + bool is_symbol = k_array->t == KS && (datatype->id() == arrow::Type::STRING || 
datatype->id() == arrow::Type::LARGE_STRING); if( is_symbol ){ // Populate from symbol list for( auto i = 0ll; i < k_array->n; ++i ){ @@ -563,7 +556,8 @@ void PopulateBuilder(shared_ptr datatype, K str_data = kK( k_array )[i]; TYPE_CHECK_ITEM( str_data->t != KC, datatype->ToString(), KC, str_data->t ); if( type_overrides.null_mapping.have_string - && type_overrides.null_mapping.string_null == std::string( ( char* )kG( str_data ), str_data->n ) ){ + && type_overrides.null_mapping.string_null.length() == str_data->n + && !type_overrides.null_mapping.string_null.compare( 0, str_data->n, ( char* )kG( str_data ), str_data->n ) ){ PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); } else{ @@ -577,6 +571,7 @@ template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto str_builder = static_cast(builder); + bool is_symbol = k_array->t == KS && (datatype->id() == arrow::Type::STRING || datatype->id() == arrow::Type::LARGE_STRING); if( is_symbol ){ // Populate from symbol list for( auto i = 0ll; i < k_array->n; ++i ){ @@ -593,8 +588,9 @@ void PopulateBuilder(shared_ptr data for( auto i = 0ll; i < k_array->n; ++i ){ K str_data = kK( k_array )[i]; TYPE_CHECK_ITEM( str_data->t != KC, datatype->ToString(), KC, str_data->t ); - if( type_overrides.null_mapping.have_string - && type_overrides.null_mapping.string_null == std::string( ( char* )kG( str_data ), str_data->n ) ){ + if( type_overrides.null_mapping.have_large_string + && type_overrides.null_mapping.large_string_null.length() == str_data->n + && !type_overrides.null_mapping.large_string_null.compare( 0, str_data->n, ( char* )kG( str_data ), str_data->n ) ){ PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); } else{ From 4bb5bf268a5be602368b435133b9404516969380 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 23 Jan 2023 17:47:37 +0300 Subject: [PATCH 151/276] Pull-request #6 changes, patch 1 https://github.com/KxSystems/arrowkdb/pull/6 
--- src/ArrayWriter.cpp | 4 ++-- src/KdbOptions.h | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index 584c77f..41fa68c 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -466,7 +466,7 @@ void PopulateBuilder(shared_ptr datatype, K if( type_overrides.null_mapping.have_int16 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.int16_null ^ kH( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.int16_null != kH( k_array )[i]; } PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )kH( k_array ), k_array->n, null_bitmap ) ); } @@ -489,7 +489,7 @@ void PopulateBuilder(shared_ptr datatype, K if( type_overrides.null_mapping.have_int32 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.int32_null ^ kI( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.int32_null != kI( k_array )[i]; } PARQUET_THROW_NOT_OK( int32_builder->AppendValues( ( int32_t* )kI( k_array ), k_array->n, null_bitmap ) ); } diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 4858b8e..95573d8 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -162,6 +162,12 @@ class KdbOptions K keys = kK( kK( dict )[index] )[0]; K values = kK( kK( dict )[index] )[1]; + if( KS != keys->t ){ + throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING keys" ); + } + if( 0 != values->t ){ + throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING values" ); + } for( auto i = 0ll; i < values->n; ++i ){ const std::string key = ToUpper( kS( keys )[i] ); if( supported_null_mapping_options.find( key ) == supported_null_mapping_options.end() ){ @@ -184,6 +190,9 @@ class KdbOptions null_mapping_options.large_string_null.assign( (char*)kC( value ), value->n ); null_mapping_options.have_large_string = true; } + else if( 101 == value->t ){ + // 
Ignore generic null, which may be used here to ensure mixed list of options + } else{ throw InvalidOption(("Unsupported KDB data type for NULL_MAPPING option '" + key + "'").c_str()); } From a60df05df32d85448879682c5b6d12e0a044224f Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 24 Jan 2023 10:32:03 +0000 Subject: [PATCH 152/276] Pull-request #6 changes, patch 2 https://github.com/KxSystems/arrowkdb/pull/6 --- src/KdbOptions.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 95573d8..cf46f3e 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -163,10 +163,10 @@ class KdbOptions K keys = kK( kK( dict )[index] )[0]; K values = kK( kK( dict )[index] )[1]; if( KS != keys->t ){ - throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING keys" ); + throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING keys (expected=11h), type=" + std::to_string( keys->t ) ); } if( 0 != values->t ){ - throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING values" ); + throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING values (extected=0), type=" + std::to_string( keys->t ) ); } for( auto i = 0ll; i < values->n; ++i ){ const std::string key = ToUpper( kS( keys )[i] ); @@ -194,7 +194,7 @@ class KdbOptions // Ignore generic null, which may be used here to ensure mixed list of options } else{ - throw InvalidOption(("Unsupported KDB data type for NULL_MAPPING option '" + key + "'").c_str()); + throw InvalidOption(("Unsupported KDB data type for NULL_MAPPING option '" + key + "', type=" + std::to_string( value->t ) ).c_str()); } } } From f83a7c418c85e7294ce5fb128d34edd6c306d6b9 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 24 Jan 2023 13:47:23 +0000 Subject: [PATCH 153/276] Pull-request #6 changes, patch 3 --- src/KdbOptions.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/KdbOptions.h b/src/KdbOptions.h index cf46f3e..c4bf1ae 
100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -163,10 +163,10 @@ class KdbOptions K keys = kK( kK( dict )[index] )[0]; K values = kK( kK( dict )[index] )[1]; if( KS != keys->t ){ - throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING keys (expected=11h), type=" + std::to_string( keys->t ) ); + throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING keys (expected=11h), type=" + std::to_string( keys->t ) + "h" ); } if( 0 != values->t ){ - throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING values (extected=0), type=" + std::to_string( keys->t ) ); + throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING values (extected=0h), type=" + std::to_string( keys->t ) + "h" ); } for( auto i = 0ll; i < values->n; ++i ){ const std::string key = ToUpper( kS( keys )[i] ); @@ -194,7 +194,7 @@ class KdbOptions // Ignore generic null, which may be used here to ensure mixed list of options } else{ - throw InvalidOption(("Unsupported KDB data type for NULL_MAPPING option '" + key + "', type=" + std::to_string( value->t ) ).c_str()); + throw InvalidOption(("Unsupported KDB data type for NULL_MAPPING option '" + key + "', type=" + std::to_string( value->t ) + "h" ).c_str()); } } } From 92b442d601e182743df53220ef27e5ba54e76d80 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 24 Jan 2023 14:30:41 +0000 Subject: [PATCH 154/276] Pull-request #6 changes, patch 4 --- src/KdbOptions.h | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/KdbOptions.h b/src/KdbOptions.h index c4bf1ae..509ddd5 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -6,6 +6,7 @@ #include #include #include +#include #include "k.h" @@ -56,10 +57,10 @@ namespace Options const std::string NULL_MAPPING = "NULL_MAPPING"; // Null mapping options - const std::string NM_INT_16 = "INT16"; - const std::string NM_INT_32 = "INT32"; - const std::string NM_STRING = "STRING"; - const std::string NM_LARGE_STRING = 
"LARGE_STRING"; + const std::string NM_INT_16 = "int16"; + const std::string NM_INT_32 = "int32"; + const std::string NM_STRING = "string"; + const std::string NM_LARGE_STRING = "large_string"; const static std::set int_options = { PARQUET_CHUNK_SIZE, @@ -136,6 +137,13 @@ class KdbOptions return upper; } + const std::string ToLower( std::string str ) const + { + std::transform( str.begin(), str.end(), str.begin(), ::tolower ); + + return str; + } + void PopulateIntOptions(K keys, K values) { for (auto i = 0ll; i < values->n; ++i) { @@ -169,7 +177,7 @@ class KdbOptions throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING values (extected=0h), type=" + std::to_string( keys->t ) + "h" ); } for( auto i = 0ll; i < values->n; ++i ){ - const std::string key = ToUpper( kS( keys )[i] ); + const std::string key = ToLower( kS( keys )[i] ); if( supported_null_mapping_options.find( key ) == supported_null_mapping_options.end() ){ throw InvalidOption(("Unsupported NULL_MAPPING option '" + key + "'").c_str()); } From 007d5dc8bf3995056bde88f2274269e3b8f2ef65 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 23 Jan 2023 19:06:33 +0300 Subject: [PATCH 155/276] Null mapping example for all supported fields --- examples/null_mapping.q | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/examples/null_mapping.q b/examples/null_mapping.q index 2b1b31c..b825751 100644 --- a/examples/null_mapping.q +++ b/examples/null_mapping.q @@ -6,11 +6,36 @@ // Create the schema // //-------------------// +// Support null mapping +short_opts:(`bool`uint8`int8`uint16`int16)!(0b;0x01;0x02;3h;4h) +long_opts:(`uint32`int32`uint64`int64)!(5i;6i;7;8) +float_opts:(`float16`float32`float64)!(9h;1.23e;4.56) +string_opts:(`string`large_string`binary`large_binary`fixed_binary)!("start";"stop";"x"$"alert";"x"$"acknowledge";0Ng) +date_opts:(`date32`date64`timestamp)!("i"$2006.07.21;"j"$2015.01.01D00:00:00.000000000;"j"$12:00:00.000000000) 
+time_opts:(`time32`time64`decimal`duration)!("i"$09:01:02.042;"j"$2015.01.01D00:00:00.000000000;"f"$7.89;"j"$12:00:00.000000000) +interval_opts:(`month_interval`day_time_interval)!("i"$2006.07m;"j"$12:00:00.000000000) + +options:(``NULL_MAPPING)!((::);short_opts,long_opts,float_opts,string_opts,date_opts,time_opts,interval_opts) + // Create the datatype identifiers -ts_dt:.arrowkdb.dt.timestamp[`nano]; -f64_dt:.arrowkdb.dt.float64[]; -i32_dt:.arrowkdb.dt.int32[]; bool_dt:.arrowkdb.dt.boolean[]; +ui8_dt:.arrowkdb.dt.uint8[]; +i8_dt:.arrowkdb.dt.int8[]; +ui16_dt:.arrowkdb.dt.uint16[]; +i16_dt:.arrowkdb.dt.int16[]; + +ui32_dt:.arrowkdb.dt.uint32[]; +i32_dt:.arrowkdb.dt.int32[]; +ui64_dt:.arrowkdb.dt.uint64[]; +i64_dt:.arrowkdb.dt.int64[]; + +f64_dt:.arrowkdb.dt.float16[]; +f64_dt:.arrowkdb.dt.float64[]; +f64_dt:.arrowkdb.dt.float64[]; + + + +ts_dt:.arrowkdb.dt.timestamp[`nano]; str_dt:.arrowkdb.dt.utf8[]; // Create the field identifiers @@ -41,10 +66,6 @@ comment_data:N?("start";"stop";"alert";"acknowledge";""); // Combine the data for all columns array_data:(tstamp_data;temp_data;fill_data;pump_data;comment_data); -// Support null mapping -null_opts:(`int16`int32`string)!(0Nh;0Ni;"start") -options:(``NULL_MAPPING)!((::);null_opts) - // Pretty print the Arrow table populated from the array data .arrowkdb.tb.prettyPrintTable[schema;array_data;options] From e626d1856118f75df96d13665b6d209bfb1a90d4 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 24 Jan 2023 21:56:23 +0300 Subject: [PATCH 156/276] Writing parquet files for null mapping --- examples/null_mapping.q | 192 +++++++++++++++++++++++++++++++++------- 1 file changed, 160 insertions(+), 32 deletions(-) diff --git a/examples/null_mapping.q b/examples/null_mapping.q index b825751..1d8bb93 100644 --- a/examples/null_mapping.q +++ b/examples/null_mapping.q @@ -7,15 +7,18 @@ //-------------------// // Support null mapping -short_opts:(`bool`uint8`int8`uint16`int16)!(0b;0x01;0x02;3h;4h) 
-long_opts:(`uint32`int32`uint64`int64)!(5i;6i;7;8) -float_opts:(`float16`float32`float64)!(9h;1.23e;4.56) -string_opts:(`string`large_string`binary`large_binary`fixed_binary)!("start";"stop";"x"$"alert";"x"$"acknowledge";0Ng) -date_opts:(`date32`date64`timestamp)!("i"$2006.07.21;"j"$2015.01.01D00:00:00.000000000;"j"$12:00:00.000000000) -time_opts:(`time32`time64`decimal`duration)!("i"$09:01:02.042;"j"$2015.01.01D00:00:00.000000000;"f"$7.89;"j"$12:00:00.000000000) -interval_opts:(`month_interval`day_time_interval)!("i"$2006.07m;"j"$12:00:00.000000000) +short_opts:(`bool`uint8`int8`uint16`int16)!(0b;0x01;0x02;3h;4h); +long_opts:(`uint32`int32`uint64`int64)!(5i;6i;7;8); +float_opts:(`float16`float32`float64)!(9h;1.23e;4.56); +//string_opts:(`string`large_string`binary`large_binary`fixed_binary)!("start";"stop";"x"$"alert";"x"$"acknowledge";0Ng); +string_opts:(`string`large_string`binary`large_binary)!("start";"stop";"alert";"acknowledge"); +date_opts:(`date32`date64`timestamp)!("i"$2006.07.21;"j"$2015.01.01D00:00:00.000000000;"j"$12:00:00.000000000); +//time_opts:(`time32`time64`decimal)!("i"$09:01:02.042;"j"$2015.01.01D00:00:00.000000000;"f"$7.89); +dur_opts:(`duration`month_interval`day_time_interval)!("j"$12:00:00.000000000;"i"$2006.07m;"j"$12:00:00.000000000); -options:(``NULL_MAPPING)!((::);short_opts,long_opts,float_opts,string_opts,date_opts,time_opts,interval_opts) +options:(``NULL_MAPPING)!((::);short_opts,long_opts,float_opts,string_opts,date_opts,time_opts,interval_opts); + +ts_dt:.arrowkdb.dt.timestamp[`nano]; // Create the datatype identifiers bool_dt:.arrowkdb.dt.boolean[]; @@ -29,45 +32,170 @@ i32_dt:.arrowkdb.dt.int32[]; ui64_dt:.arrowkdb.dt.uint64[]; i64_dt:.arrowkdb.dt.int64[]; -f64_dt:.arrowkdb.dt.float16[]; -f64_dt:.arrowkdb.dt.float64[]; +f16_dt:.arrowkdb.dt.float16[]; +f32_dt:.arrowkdb.dt.float32[]; f64_dt:.arrowkdb.dt.float64[]; - - -ts_dt:.arrowkdb.dt.timestamp[`nano]; str_dt:.arrowkdb.dt.utf8[]; +lstr_dt:.arrowkdb.dt.large_utf8[]; 
+bin_dt:.arrowkdb.dt.binary[]; +lbin_dt:.arrowkdb.dt.large_binary[]; +fbin_dt:.arrowkdb.dt.fixed_size_binary[2i]; -// Create the field identifiers -tstamp_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; -temp_fd:.arrowkdb.fd.field[`temperature;f64_dt]; -fill_fd:.arrowkdb.fd.field[`fill_level;i32_dt]; -pump_fd:.arrowkdb.fd.field[`pump_status;bool_dt]; -comment_fd:.arrowkdb.fd.field[`comment;str_dt]; +d32_dt:.arrowkdb.dt.date32[]; +d64_dt:.arrowkdb.dt.date64[]; +tstamp_dt:.arrowkdb.dt.timestamp[`nano]; -// Create the schema for the list of fields -schema:.arrowkdb.sc.schema[(tstamp_fd,temp_fd,fill_fd,pump_fd,comment_fd)]; +t32_dt:.arrowkdb.dt.time32[`milli]; +t64_dt:.arrowkdb.dt.time64[`nano]; +dec_dt:.arrowkdb.dt.decimal128[38i;2i]; +dur_dt:.arrowkdb.dt.duration[`milli]; -// Print the schema -.arrowkdb.sc.printSchema[schema] +mint_dt:.arrowkdb.dt.month_interval[]; +dtint_dt:.arrowkdb.dt.day_time_interval[]; + +// Create the field identifiers +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +bool_fd:.arrowkdb.fd.field[`bool;bool_dt]; +ui8_fd:.arrowkdb.fd.field[`uint8;ui8_dt]; +i8_fd:.arrowkdb.fd.field[`int8;i8_dt]; +ui16_fd:.arrowkdb.fd.field[`uint16;ui16_dt]; +i16_fd:.arrowkdb.fd.field[`int16;i16_dt]; + +ui32_fd:.arrowkdb.fd.field[`uint32;ui32_dt]; +i32_fd:.arrowkdb.fd.field[`int32;i32_dt]; +ui64_fd:.arrowkdb.fd.field[`uint64;ui64_dt]; +i64_fd:.arrowkdb.fd.field[`int64;i64_dt]; + +f16_fd:.arrowkdb.fd.field[`float16;f16_dt]; +f32_fd:.arrowkdb.fd.field[`float32;f32_dt]; +f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; + +str_fd:.arrowkdb.fd.field[`string;str_dt]; +lstr_fd:.arrowkdb.fd.field[`long_string;lstr_dt]; +bin_fd:.arrowkdb.fd.field[`binary;bin_dt]; +lbin_fd:.arrowkdb.fd.field[`long_binary;lbin_dt]; +fbin_fd:.arrowkdb.fd.field[`fixed_binary;fbin_dt]; + +d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; +d64_fd:.arrowkdb.fd.field[`date64;d64_dt]; +tstamp_fd:.arrowkdb.fd.field[`timestamp;tstamp_dt]; + +t32_fd:.arrowkdb.fd.field[`time32;t32_dt]; +t64_fd:.arrowkdb.fd.field[`time64;t64_dt]; 
+dec_fd:.arrowkdb.fd.field[`decimal;dec_dt]; + +dur_fd:.arrowkdb.fd.field[`duration;dur_dt]; +mint_fd:.arrowkdb.fd.field[`month_interval;mint_dt]; +dtint_fd:.arrowkdb.fd.field[`day_time_interval;dtint_dt]; + +// Create the schemas for the list of fields +short_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,ui8_fd,i8_fd,ui16_fd,i16_fd)]; +long_schema:.arrowkdb.sc.schema[(ts_fd,ui32_fd,i32_fd,ui64_fd,i64_fd)]; +float_schema:.arrowkdb.sc.schema[(ts_fd,f16_fd,f32_fd,f64_fd)]; +str_schema:.arrowkdb.sc.schema[(ts_fd,str_fd,lstr_fd,bin_fd,lbin_fd,fbin_fd)]; +date_schema:.arrowkdb.sc.schema[(ts_fd,d32_fd,d64_fd,tstamp_fd)]; +time_schema:.arrowkdb.sc.schema[(ts_fd,t32_fd,t64_fd,dec_fd)]; +dur_schema:.arrowkdb.sc.schema[ts_fd,dur_fd,mint_fd,dtint_fd] + +// Print the schemas +.arrowkdb.sc.printSchema[short_schema] +.arrowkdb.sc.printSchema[long_schema] +.arrowkdb.sc.printSchema[float_schema] +.arrowkdb.sc.printSchema[string_schema] +.arrowkdb.sc.printSchema[date_schema] +.arrowkdb.sc.printSchema[time_schema] +.arrowkdb.sc.printSchema[dur_schema] //-----------------------// // Create the array data // //-----------------------// // Create data for each column in the table -tstamp_data:asc N?0p; -temp_data:N?100f; -fill_data:N?100i; -fill_data[0]:0Ni -pump_data:N?0b; -comment_data:N?("start";"stop";"alert";"acknowledge";""); +ts_data:asc N?0p; + +bool_data:N?(0b;1b); +bool_data[0]:0b; +ui8_data:N?0x64; +ui8_data[1]:0x01; +i8_data:N?0x64; +i8_data[2]:0x02; +ui16_data:N?100h; +ui16_data[3]:3h; +i16_data:N?100h; +i16_data[4]:4h; + +ui32_data:N?100i; +ui32_data[0]:5i; +i32_data:N?100i; +i32_data[1]:6i; +ui64_data:N?100; +ui64_data[2]:7; +i64_data:N?100; +i64_data[3]:8; + +f16_data:N?100h; +f16_data[0]:9h; +f32_data:N?100e; +f32_data[1]:1.23e; +f64_data:N?100f; +f64_data[2]:4.56f; + +str_data:N?("start";"stop";"alert";"acknowledge";""); +str_data[0]:"start" +lstr_data:N?("start";"stop";"alert";"acknowledge";""); +lstr_data[1]:"stop" 
+bin_data:N?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); +bin_data[2]:"x"$"alert" +lbin_data:N?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); +lbin_data[3]:"x"$"acknowledge" +fbin_data:N?("x"$"0123456789"); +fbin_data[4]:"x"$"5" + +d32_data:N?("i"$2006.07.21;"i"$2008.07.18;"i"$2012.07.16;"i"$2014.07.15;"i"$2016.07.11); +d32_data[1]:"i"$2006.07.21; +d64_data:N?("j"$2015.01.01D00:00:00.000000000;"j"$2017.01.01D00:00:00.000000000;"j"$2018.01.01D00:00:00.000000000;"j"$2019.01.01D00:00:00.000000000;"j"$2020.01.01D00:00:00.000000000); +d64_data[2]:"j"$2015.01.01D00:00:00.000000000; +tstamp_data:N?("j"$12:00:00.000000000;"j"$13:00:00.000000000;"j"$14:00:00.000000000;"j"$15:00:00.000000000;"j"$16:00:00.000000000); +tstamp_data[3]:"j"$12:00:00.000000000; + +t32_data:N?("i"$09:01:02.042;"i"$08:01:02.042;"i"$07:01:02.042;"i"$06:01:02.042;"i"$05:01:02.042); +t32_data[0]:"i"$09:01:02.042; +t64_data:N?("j"$2015.01.01D00:00:00.000000000;"j"$2016.01.01D00:00:00.000000000;"j"$2017.01.01D00:00:00.000000000;"j"$2018.01.01D00:00:00.000000000;"j"$2019.01.01D00:00:00.000000000); +t64_data[1]:"j"$2015.01.01D00:00:00.000000000; +dec_data:N?(10f); +dec_data[2]:7.89f + +dur_data:N?("j"$12:00:00.000000000;"j"$13:00:00.000000000;"j"$14:00:00.000000000;"j"$15:00:00.000000000;"j"$16:00:00.000000000); +dur_data[0]:"j"$12:00:00.000000000 +mint_data:N?("i"$2006.07m;"i"$2006.06m;"i"$2006.05m;"i"$2006.04m;"i"$2006.03m); +mint_data[1]:"i"$2006.07m; +dtint_data:N?("j"$12:00:00.000000000;"j"$11:00:00.000000000;"j"$10:00:00.000000000;"j"$09:00:00.000000000;"j"$08:00:00.000000000); +dtint_data[2]:"j"$12:00:00.000000000; // Combine the data for all columns -array_data:(tstamp_data;temp_data;fill_data;pump_data;comment_data); +short_data:(ts_data;bool_data;ui8_data;i8_data;ui16_data;i16_data); +long_data:(ts_data;ui32_data;i32_data;ui64_data;i64_data); +float_data:(ts_data;f16_data;f32_data;f64_data); +str_data:(ts_data;str_data;lstr_data;bin_data;lbin_data;fbin_data); 
+date_data:(ts_data;d32_data;d64_data;tstamp_data); +time_data:(ts_data;t32_data;t64_data;dec_data) +dur_data:(ts_data;dur_data;mint_data;dtint_data) // Pretty print the Arrow table populated from the array data -.arrowkdb.tb.prettyPrintTable[schema;array_data;options] +.arrowkdb.tb.prettyPrintTable[short_schema;short_data;options] +.arrowkdb.tb.prettyPrintTable[long_schema;long_data;options] +.arrowkdb.tb.prettyPrintTable[float_schema;float_data;options] +.arrowkdb.tb.prettyPrintTable[str_schema;str_data;options] +.arrowkdb.tb.prettyPrintTable[date_schema;date_data;options] +.arrowkdb.tb.prettyPrintTable[time_schema;time_data;options] +.arrowkdb.tb.prettyPrintTable[dur_schema;dur_data;options] options[`PARQUET_VERSION]:`V2.0 -.arrowkdb.pq.writeParquet["null_mapping.parquet";schema;array_data;options] +.arrowkdb.pq.writeParquet["null_mapping_short.parquet";short_schema;short_data;options] +.arrowkdb.pq.writeParquet["null_mapping_long.parquet";long_schema;long_data;options] +.arrowkdb.pq.writeParquet["null_mapping_float.parquet";float_schema;float_data;options] +.arrowkdb.pq.writeParquet["null_mapping_str.parquet";str_schema;str_data;options] +.arrowkdb.pq.writeParquet["null_mapping_date.parquet";date_schema;date_data;options] +.arrowkdb.pq.writeParquet["null_mapping_time.parquet";time_schema;time_data;options] From 58da5ebd76954ac477d314b78226039f6e35a6d9 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 25 Jan 2023 19:59:38 +0300 Subject: [PATCH 157/276] Other fields removed from parquet --- examples/null_mapping.q | 105 ++++++++++++++++++---------------------- 1 file changed, 48 insertions(+), 57 deletions(-) diff --git a/examples/null_mapping.q b/examples/null_mapping.q index 1d8bb93..629672c 100644 --- a/examples/null_mapping.q +++ b/examples/null_mapping.q @@ -9,14 +9,12 @@ // Support null mapping short_opts:(`bool`uint8`int8`uint16`int16)!(0b;0x01;0x02;3h;4h); long_opts:(`uint32`int32`uint64`int64)!(5i;6i;7;8); 
-float_opts:(`float16`float32`float64)!(9h;1.23e;4.56); -//string_opts:(`string`large_string`binary`large_binary`fixed_binary)!("start";"stop";"x"$"alert";"x"$"acknowledge";0Ng); -string_opts:(`string`large_string`binary`large_binary)!("start";"stop";"alert";"acknowledge"); -date_opts:(`date32`date64`timestamp)!("i"$2006.07.21;"j"$2015.01.01D00:00:00.000000000;"j"$12:00:00.000000000); -//time_opts:(`time32`time64`decimal)!("i"$09:01:02.042;"j"$2015.01.01D00:00:00.000000000;"f"$7.89); -dur_opts:(`duration`month_interval`day_time_interval)!("j"$12:00:00.000000000;"i"$2006.07m;"j"$12:00:00.000000000); +float_opts:(`float32`float64`decimal)!(1.23e;4.56;7.89); +str_opts:(`string`large_string`binary`large_binary`fixed_binary)!("start";"stop";"x"$"alert";"x"$"acknowledge";0Ng); +time_opts:(`date32`date64`timestamp`time64`duration)!("i"$2006.07.21;"j"$2015.01.01D00:00:00.000000000;"j"$2011.01.01D00:00:00.000000000;"j"$12:00:00.000000000;"j"$12:00:00.000000000); +other_opts:(`float16`time32`month_interval`day_time_interval)!(9h;"i"$09:01:02.042;"i"$2006.07m;"j"$12:00:00.000000000); -options:(``NULL_MAPPING)!((::);short_opts,long_opts,float_opts,string_opts,date_opts,time_opts,interval_opts); +options:(``NULL_MAPPING)!((::);short_opts,long_opts,float_opts,str_opts,time_opts,other_opts); ts_dt:.arrowkdb.dt.timestamp[`nano]; @@ -32,25 +30,24 @@ i32_dt:.arrowkdb.dt.int32[]; ui64_dt:.arrowkdb.dt.uint64[]; i64_dt:.arrowkdb.dt.int64[]; -f16_dt:.arrowkdb.dt.float16[]; f32_dt:.arrowkdb.dt.float32[]; f64_dt:.arrowkdb.dt.float64[]; +dec_dt:.arrowkdb.dt.decimal128[38i;2i]; str_dt:.arrowkdb.dt.utf8[]; lstr_dt:.arrowkdb.dt.large_utf8[]; bin_dt:.arrowkdb.dt.binary[]; lbin_dt:.arrowkdb.dt.large_binary[]; -fbin_dt:.arrowkdb.dt.fixed_size_binary[2i]; +fbin_dt:.arrowkdb.dt.fixed_size_binary[16i]; d32_dt:.arrowkdb.dt.date32[]; d64_dt:.arrowkdb.dt.date64[]; tstamp_dt:.arrowkdb.dt.timestamp[`nano]; - -t32_dt:.arrowkdb.dt.time32[`milli]; t64_dt:.arrowkdb.dt.time64[`nano]; 
-dec_dt:.arrowkdb.dt.decimal128[38i;2i]; dur_dt:.arrowkdb.dt.duration[`milli]; +f16_dt:.arrowkdb.dt.float16[]; +t32_dt:.arrowkdb.dt.time32[`milli]; mint_dt:.arrowkdb.dt.month_interval[]; dtint_dt:.arrowkdb.dt.day_time_interval[]; @@ -68,9 +65,9 @@ i32_fd:.arrowkdb.fd.field[`int32;i32_dt]; ui64_fd:.arrowkdb.fd.field[`uint64;ui64_dt]; i64_fd:.arrowkdb.fd.field[`int64;i64_dt]; -f16_fd:.arrowkdb.fd.field[`float16;f16_dt]; f32_fd:.arrowkdb.fd.field[`float32;f32_dt]; f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; +dec_fd:.arrowkdb.fd.field[`decimal;dec_dt]; str_fd:.arrowkdb.fd.field[`string;str_dt]; lstr_fd:.arrowkdb.fd.field[`long_string;lstr_dt]; @@ -81,32 +78,29 @@ fbin_fd:.arrowkdb.fd.field[`fixed_binary;fbin_dt]; d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; d64_fd:.arrowkdb.fd.field[`date64;d64_dt]; tstamp_fd:.arrowkdb.fd.field[`timestamp;tstamp_dt]; - -t32_fd:.arrowkdb.fd.field[`time32;t32_dt]; t64_fd:.arrowkdb.fd.field[`time64;t64_dt]; -dec_fd:.arrowkdb.fd.field[`decimal;dec_dt]; - dur_fd:.arrowkdb.fd.field[`duration;dur_dt]; + +f16_fd:.arrowkdb.fd.field[`float16;f16_dt]; +t32_fd:.arrowkdb.fd.field[`time32;t32_dt]; mint_fd:.arrowkdb.fd.field[`month_interval;mint_dt]; dtint_fd:.arrowkdb.fd.field[`day_time_interval;dtint_dt]; // Create the schemas for the list of fields short_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,ui8_fd,i8_fd,ui16_fd,i16_fd)]; long_schema:.arrowkdb.sc.schema[(ts_fd,ui32_fd,i32_fd,ui64_fd,i64_fd)]; -float_schema:.arrowkdb.sc.schema[(ts_fd,f16_fd,f32_fd,f64_fd)]; +float_schema:.arrowkdb.sc.schema[(ts_fd,f32_fd,f64_fd,dec_fd)]; str_schema:.arrowkdb.sc.schema[(ts_fd,str_fd,lstr_fd,bin_fd,lbin_fd,fbin_fd)]; -date_schema:.arrowkdb.sc.schema[(ts_fd,d32_fd,d64_fd,tstamp_fd)]; -time_schema:.arrowkdb.sc.schema[(ts_fd,t32_fd,t64_fd,dec_fd)]; -dur_schema:.arrowkdb.sc.schema[ts_fd,dur_fd,mint_fd,dtint_fd] +time_schema:.arrowkdb.sc.schema[(ts_fd,d32_fd,d64_fd,tstamp_fd,t64_fd,dur_fd)]; +other_schema:.arrowkdb.sc.schema[(ts_fd,f16_fd,t32_fd,mint_fd,dtint_fd)]; // 
Print the schemas .arrowkdb.sc.printSchema[short_schema] .arrowkdb.sc.printSchema[long_schema] .arrowkdb.sc.printSchema[float_schema] -.arrowkdb.sc.printSchema[string_schema] -.arrowkdb.sc.printSchema[date_schema] +.arrowkdb.sc.printSchema[str_schema] .arrowkdb.sc.printSchema[time_schema] -.arrowkdb.sc.printSchema[dur_schema] +.arrowkdb.sc.printSchema[other_schema] //-----------------------// // Create the array data // @@ -135,12 +129,12 @@ ui64_data[2]:7; i64_data:N?100; i64_data[3]:8; -f16_data:N?100h; -f16_data[0]:9h; f32_data:N?100e; -f32_data[1]:1.23e; +f32_data[0]:1.23e; f64_data:N?100f; -f64_data[2]:4.56f; +f64_data[1]:4.56f; +dec_data:N?(10f); +dec_data[2]:7.89f str_data:N?("start";"stop";"alert";"acknowledge";""); str_data[0]:"start" @@ -150,52 +144,49 @@ bin_data:N?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); bin_data[2]:"x"$"alert" lbin_data:N?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); lbin_data[3]:"x"$"acknowledge" -fbin_data:N?("x"$"0123456789"); -fbin_data[4]:"x"$"5" - -d32_data:N?("i"$2006.07.21;"i"$2008.07.18;"i"$2012.07.16;"i"$2014.07.15;"i"$2016.07.11); -d32_data[1]:"i"$2006.07.21; -d64_data:N?("j"$2015.01.01D00:00:00.000000000;"j"$2017.01.01D00:00:00.000000000;"j"$2018.01.01D00:00:00.000000000;"j"$2019.01.01D00:00:00.000000000;"j"$2020.01.01D00:00:00.000000000); -d64_data[2]:"j"$2015.01.01D00:00:00.000000000; -tstamp_data:N?("j"$12:00:00.000000000;"j"$13:00:00.000000000;"j"$14:00:00.000000000;"j"$15:00:00.000000000;"j"$16:00:00.000000000); -tstamp_data[3]:"j"$12:00:00.000000000; - -t32_data:N?("i"$09:01:02.042;"i"$08:01:02.042;"i"$07:01:02.042;"i"$06:01:02.042;"i"$05:01:02.042); -t32_data[0]:"i"$09:01:02.042; -t64_data:N?("j"$2015.01.01D00:00:00.000000000;"j"$2016.01.01D00:00:00.000000000;"j"$2017.01.01D00:00:00.000000000;"j"$2018.01.01D00:00:00.000000000;"j"$2019.01.01D00:00:00.000000000); -t64_data[1]:"j"$2015.01.01D00:00:00.000000000; -dec_data:N?(10f); -dec_data[2]:7.89f +fbin_data:N?0Ng; 
+fbin_data[4]:0Ng; + +d32_data:N?(2006.07.21;2008.07.18;2012.07.16;2014.07.15;2016.07.11); +d32_data[0]:2006.07.21; +d64_data:N?(2015.01.01D00:00:00.000000000;2017.01.01D00:00:00.000000000;2018.01.01D00:00:00.000000000;2019.01.01D00:00:00.000000000;2020.01.01D00:00:00.000000000); +d64_data[1]:2015.01.01D00:00:00.000000000; +tstamp_data:N?(2015.01.01D00:00:00.000000000;2014.01.01D00:00:00.000000000;2013.01.01D00:00:00.000000000;2012.01.01D00:00:00.000000000;2011.01.01D00:00:00.000000000); +tstamp_data[2]:2011.01.01D00:00:00.000000000; +t64_data:N?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); +t64_data[3]:12:00:00.000000000; +dur_data:N?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); +dur_data[4]:12:00:00.000000000; -dur_data:N?("j"$12:00:00.000000000;"j"$13:00:00.000000000;"j"$14:00:00.000000000;"j"$15:00:00.000000000;"j"$16:00:00.000000000); -dur_data[0]:"j"$12:00:00.000000000 -mint_data:N?("i"$2006.07m;"i"$2006.06m;"i"$2006.05m;"i"$2006.04m;"i"$2006.03m); -mint_data[1]:"i"$2006.07m; -dtint_data:N?("j"$12:00:00.000000000;"j"$11:00:00.000000000;"j"$10:00:00.000000000;"j"$09:00:00.000000000;"j"$08:00:00.000000000); -dtint_data[2]:"j"$12:00:00.000000000; +f16_data:N?100h; +f16_data[0]:9h; +t32_data:N?(09:01:02.042;08:01:02.042;07:01:02.042;06:01:02.042;05:01:02.042); +t32_data[1]:09:01:02.042; +mint_data:N?(2006.07m;2006.06m;2006.05m;2006.04m;2006.03m); +mint_data[2]:2006.07m; +dtint_data:N?(12:00:00.000000000;11:00:00.000000000;10:00:00.000000000;09:00:00.000000000;08:00:00.000000000); +dtint_data[3]:12:00:00.000000000; // Combine the data for all columns short_data:(ts_data;bool_data;ui8_data;i8_data;ui16_data;i16_data); long_data:(ts_data;ui32_data;i32_data;ui64_data;i64_data); -float_data:(ts_data;f16_data;f32_data;f64_data); +float_data:(ts_data;f32_data;f64_data;dec_data); str_data:(ts_data;str_data;lstr_data;bin_data;lbin_data;fbin_data); 
-date_data:(ts_data;d32_data;d64_data;tstamp_data); -time_data:(ts_data;t32_data;t64_data;dec_data) -dur_data:(ts_data;dur_data;mint_data;dtint_data) +time_data:(ts_data;d32_data;d64_data;tstamp_data;t64_data;dur_data) +other_data:(ts_data;f16_data;t32_data;mint_data;dtint_data) // Pretty print the Arrow table populated from the array data +options[`DECIMAL128_AS_DOUBLE]:1 .arrowkdb.tb.prettyPrintTable[short_schema;short_data;options] .arrowkdb.tb.prettyPrintTable[long_schema;long_data;options] .arrowkdb.tb.prettyPrintTable[float_schema;float_data;options] .arrowkdb.tb.prettyPrintTable[str_schema;str_data;options] -.arrowkdb.tb.prettyPrintTable[date_schema;date_data;options] .arrowkdb.tb.prettyPrintTable[time_schema;time_data;options] -.arrowkdb.tb.prettyPrintTable[dur_schema;dur_data;options] +.arrowkdb.tb.prettyPrintTable[other_schema;other_data;options] options[`PARQUET_VERSION]:`V2.0 .arrowkdb.pq.writeParquet["null_mapping_short.parquet";short_schema;short_data;options] .arrowkdb.pq.writeParquet["null_mapping_long.parquet";long_schema;long_data;options] .arrowkdb.pq.writeParquet["null_mapping_float.parquet";float_schema;float_data;options] .arrowkdb.pq.writeParquet["null_mapping_str.parquet";str_schema;str_data;options] -.arrowkdb.pq.writeParquet["null_mapping_date.parquet";date_schema;date_data;options] .arrowkdb.pq.writeParquet["null_mapping_time.parquet";time_schema;time_data;options] From 86413f78b7ef0d0b375a87055b5dcd983f754dfe Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 19 Jan 2023 21:37:36 +0300 Subject: [PATCH 158/276] Propagate null mapping through supported types --- src/ArrayWriter.cpp | 185 ++++++++++++++++++++++--- src/KdbOptions.h | 321 ++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 460 insertions(+), 46 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index 41fa68c..c23277f 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -1,7 +1,9 @@ #include +#include #include #include #include 
+#include #include #include @@ -20,6 +22,18 @@ using namespace kx::arrowkdb; namespace { +//! Compares floating point numbers, because of unreliable direct compare +//! @param lhs - left-hand side value +//! @param rhs - right-hand side value +//! @return true if values are nearby +template +bool is_equal( T lhs, T rhs ) +{ + static const T epsilon = 2 * std::numeric_limits::epsilon(); + + return ::fabs(lhs -= rhs) <= epsilon; +} + shared_ptr GetBuilder(shared_ptr datatype); template @@ -434,29 +448,64 @@ template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto bool_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(bool_builder->AppendValues((uint8_t*)kG(k_array), k_array->n)); + if( type_overrides.null_mapping.have_uint8 ){ + std::vector null_bitmap( k_array->n ); + for( auto i = 0ll; i < k_array->n; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.uint8_null ^ kG( k_array )[i]; + } + PARQUET_THROW_NOT_OK( bool_builder->AppendValues( ( uint8_t* )kG( k_array ), k_array->n, null_bitmap ) ); + } + else { + PARQUET_THROW_NOT_OK(bool_builder->AppendValues((uint8_t*)kG(k_array), k_array->n)); + } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto uint8_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(uint8_builder->AppendValues((uint8_t*)kG(k_array), k_array->n)); + if( type_overrides.null_mapping.have_uint8 ){ + std::vector null_bitmap( k_array->n ); + for( auto i = 0ll; i < k_array->n; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.uint8_null ^ kG( k_array )[i]; + } + PARQUET_THROW_NOT_OK( uint8_builder->AppendValues( ( uint8_t* )kG( k_array ), k_array->n, null_bitmap ) ); + } + else { + PARQUET_THROW_NOT_OK(uint8_builder->AppendValues((uint8_t*)kG(k_array), k_array->n)); + } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, 
TypeMappingOverride& type_overrides) { auto int8_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(int8_builder->AppendValues((int8_t*)kG(k_array), k_array->n)); + if( type_overrides.null_mapping.have_int8 ){ + std::vector null_bitmap( k_array->n ); + for( auto i = 0ll; i < k_array->n; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.int8_null ^ kG( k_array )[i]; + } + PARQUET_THROW_NOT_OK( int8_builder->AppendValues( ( int8_t* )kG( k_array ), k_array->n, null_bitmap ) ); + } + else { + PARQUET_THROW_NOT_OK(int8_builder->AppendValues((int8_t*)kG(k_array), k_array->n)); + } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto uint16_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(uint16_builder->AppendValues((uint16_t*)kH(k_array), k_array->n)); - arrow::Status s; + if( type_overrides.null_mapping.have_uint16 ){ + std::vector null_bitmap( k_array->n ); + for( auto i = 0ll; i < k_array->n; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.uint16_null ^ kH( k_array )[i]; + } + PARQUET_THROW_NOT_OK( uint16_builder->AppendValues( ( uint16_t* )kH( k_array ), k_array->n, null_bitmap ) ); + } + else { + PARQUET_THROW_NOT_OK(uint16_builder->AppendValues((uint16_t*)kH(k_array), k_array->n)); + } } template<> @@ -479,7 +528,16 @@ template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto uint32_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(uint32_builder->AppendValues((uint32_t*)kI(k_array), k_array->n)); + if( type_overrides.null_mapping.have_uint32 ){ + std::vector null_bitmap( k_array->n ); + for( auto i = 0ll; i < k_array->n; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.uint32_null ^ kI( k_array )[i]; + } + PARQUET_THROW_NOT_OK( uint32_builder->AppendValues( ( uint32_t* )kI( k_array ), k_array->n, null_bitmap ) ); + } + else{ + 
PARQUET_THROW_NOT_OK(uint32_builder->AppendValues((uint32_t*)kI(k_array), k_array->n)); + } } template<> @@ -502,36 +560,80 @@ template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto uint64_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(uint64_builder->AppendValues((uint64_t*)kJ(k_array), k_array->n)); + if( type_overrides.null_mapping.have_uint64 ){ + std::vector null_bitmap( k_array->n ); + for( auto i = 0ll; i < k_array->n; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.uint64_null ^ kI( k_array )[i]; + } + PARQUET_THROW_NOT_OK( uint64_builder->AppendValues( ( uint64_t* )kI( k_array ), k_array->n, null_bitmap ) ); + } + else{ + PARQUET_THROW_NOT_OK(uint64_builder->AppendValues((uint64_t*)kJ(k_array), k_array->n)); + } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto int64_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(int64_builder->AppendValues((int64_t*)kJ(k_array), k_array->n)); + if( type_overrides.null_mapping.have_int64 ){ + std::vector null_bitmap( k_array->n ); + for( auto i = 0ll; i < k_array->n; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.int64_null ^ kI( k_array )[i]; + } + PARQUET_THROW_NOT_OK( int64_builder->AppendValues( ( int64_t* )kI( k_array ), k_array->n, null_bitmap ) ); + } + else{ + PARQUET_THROW_NOT_OK(int64_builder->AppendValues((int64_t*)kJ(k_array), k_array->n)); + } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { - arrow::HalfFloatType hft; auto hfl_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(hfl_builder->AppendValues((uint16_t*)kH(k_array), k_array->n)); + if( type_overrides.null_mapping.have_float16 ){ + std::vector null_bitmap( k_array->n ); + for( auto i = 0ll; i < k_array->n; ++i ){ + null_bitmap[i] = 
type_overrides.null_mapping.float16_null ^ kH( k_array )[i]; + } + PARQUET_THROW_NOT_OK( hfl_builder->AppendValues( ( uint16_t* )kH( k_array ), k_array->n, null_bitmap ) ); + } + else { + PARQUET_THROW_NOT_OK(hfl_builder->AppendValues((uint16_t*)kH(k_array), k_array->n)); + } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto fl_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(fl_builder->AppendValues(kE(k_array), k_array->n)); + if( type_overrides.null_mapping.have_float32 ){ + std::vector null_bitmap( k_array->n ); + for( auto i = 0ll; i < k_array->n; ++i ){ + null_bitmap[i] = is_equal( type_overrides.null_mapping.float32_null, kE( k_array )[i] ); + } + PARQUET_THROW_NOT_OK( fl_builder->AppendValues( kE( k_array ), k_array->n, null_bitmap ) ); + } + else { + PARQUET_THROW_NOT_OK(fl_builder->AppendValues(kE(k_array), k_array->n)); + } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto dbl_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(dbl_builder->AppendValues(kF(k_array), k_array->n)); + if( type_overrides.null_mapping.have_float64 ){ + std::vector null_bitmap( k_array->n ); + for( auto i = 0ll; i < k_array->n; ++i ){ + null_bitmap[i] = is_equal( type_overrides.null_mapping.float64_null, kF( k_array )[i] ); + } + PARQUET_THROW_NOT_OK( dbl_builder->AppendValues( kF( k_array ), k_array->n, null_bitmap ) ); + } + else { + PARQUET_THROW_NOT_OK(dbl_builder->AppendValues(kF(k_array), k_array->n)); + } } template<> @@ -607,7 +709,14 @@ void PopulateBuilder(shared_ptr datatype, for (auto i = 0; i < k_array->n; ++i) { K bin_data = kK(k_array)[i]; TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); - PARQUET_THROW_NOT_OK(bin_builder->Append(kG(bin_data), bin_data->n)); + if( type_overrides.null_mapping.have_binary + && 
type_overrides.null_mapping.binary_null.length() == bin_data->n + && !type_overrides.null_mapping.binary_null.compare( 0, bin_data->n, kG( bin_data ), bin_data->n ) ){ + PARQUET_THROW_NOT_OK( bin_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK(bin_builder->Append(kG(bin_data), bin_data->n)); + } } } @@ -618,7 +727,14 @@ void PopulateBuilder(shared_ptr data for (auto i = 0; i < k_array->n; ++i) { K bin_data = kK(k_array)[i]; TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); - PARQUET_THROW_NOT_OK(bin_builder->Append(kG(bin_data), bin_data->n)); + if( type_overrides.null_mapping.have_large_binary + && type_overrides.null_mapping.large_binary_null.length() == bin_data->n + && !type_overrides.null_mapping.large_binary_null.compare( 0, bin_data->n, kG( bin_data ), bin_data->n ) ){ + PARQUET_THROW_NOT_OK( bin_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK(bin_builder->Append(kG(bin_data), bin_data->n)); + } } } @@ -645,8 +761,15 @@ void PopulateBuilder(shared_ptr datatype, { TemporalConversion tc(datatype); auto d32_builder = static_cast(builder); - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(d32_builder->Append(tc.KdbToArrow(kI(k_array)[i]))); + for (auto i = 0; i < k_array->n; ++i){ + if( type_overrides.null_mapping.have_date32 + && type_overrides.null_mapping.date32_null == kI( k_array )[i] ){ + PARQUET_THROW_NOT_OK( d32_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK(d32_builder->Append(tc.KdbToArrow(kI(k_array)[i]))); + } + } } template<> @@ -655,7 +778,13 @@ void PopulateBuilder(shared_ptr datatype, TemporalConversion tc(datatype); auto d64_builder = static_cast(builder); for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(d64_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); + if( type_overrides.null_mapping.have_date64 + && type_overrides.null_mapping.date64_null == kJ( k_array )[i] ){ + PARQUET_THROW_NOT_OK( d64_builder->AppendNull() ); + } + else{ + 
PARQUET_THROW_NOT_OK(d64_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); + } } template<> @@ -725,15 +854,31 @@ template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto month_builder = static_cast(builder); - PARQUET_THROW_NOT_OK(month_builder->AppendValues((int32_t*)kI(k_array), k_array->n)); + if( type_overrides.null_mapping.have_month_interval ){ + std::vector null_bitmap( k_array->n ); + for( auto i = 0ll; i < k_array->n; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.month_interval_null ^ kI( k_array )[i]; + } + PARQUET_THROW_NOT_OK( month_builder->AppendValues( ( int32_t* )kI( k_array ), k_array->n, null_bitmap ) ); + } + else{ + PARQUET_THROW_NOT_OK(month_builder->AppendValues((int32_t*)kI(k_array), k_array->n)); + } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto dt_builder = static_cast(builder); - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(dt_builder->Append(KTimespan_DayTimeInterval(kJ(k_array)[i]))); + for (auto i = 0; i < k_array->n; ++i){ + if( type_overrides.null_mapping.have_day_time_interval + && type_overrides.null_mapping.day_time_interval_null == kJ( k_array )[i] ){ + PARQUET_THROW_NOT_OK( dt_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK(dt_builder->Append(KTimespan_DayTimeInterval(kJ(k_array)[i]))); + } + } } template<> diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 509ddd5..5118e49 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -23,7 +23,7 @@ constexpr auto toUType( E enumerator ) noexcept template< typename E > struct ETraits { - using Names = std::map< E, std::string >; + using Names = std::map; static std::string name( E enumerator ) { @@ -33,7 +33,7 @@ struct ETraits return it->second; } - return "unknown"; + return "UNKNOWN"; } static std::string name( int index ) { return name( static_cast( index ) ); } @@ 
-57,10 +57,27 @@ namespace Options const std::string NULL_MAPPING = "NULL_MAPPING"; // Null mapping options + const std::string NM_NA = "na"; + const std::string NM_BOOLEAN = "boolean"; + const std::string NM_UINT_8 = "uint8"; + const std::string NM_INT_8 = "int8"; + const std::string NM_UINT_16 = "uint16"; const std::string NM_INT_16 = "int16"; + const std::string NM_UINT_32 = "uint32"; const std::string NM_INT_32 = "int32"; + const std::string NM_UINT_64 = "uint64"; + const std::string NM_INT_64 = "int64"; + const std::string NM_FLOAT_16 = "float16"; + const std::string NM_FLOAT_32 = "float32"; + const std::string NM_FLOAT_64 = "float64"; const std::string NM_STRING = "string"; const std::string NM_LARGE_STRING = "large_string"; + const std::string NM_BINARY = "binary"; + const std::string NM_LARGE_BINARY = "large_binary"; + const std::string NM_DATE_32 = "date32"; + const std::string NM_DATE_64 = "date64"; + const std::string NM_MONTH_INTERVAL = "month_interval"; + const std::string NM_DAY_TIME_INTERVAL = "day_time_interval"; const static std::set int_options = { PARQUET_CHUNK_SIZE, @@ -75,38 +92,133 @@ namespace Options NULL_MAPPING, }; const static std::set null_mapping_options = { - NM_INT_16, - NM_INT_32, - NM_STRING, - NM_LARGE_STRING + NM_NA + , NM_BOOLEAN + , NM_UINT_8 + , NM_INT_8 + , NM_UINT_16 + , NM_INT_16 + , NM_UINT_32 + , NM_INT_32 + , NM_UINT_64 + , NM_INT_64 + , NM_FLOAT_16 + , NM_FLOAT_32 + , NM_FLOAT_64 + , NM_STRING + , NM_LARGE_STRING + , NM_BINARY + , NM_LARGE_BINARY + , NM_DATE_32 + , NM_DATE_64 + , NM_MONTH_INTERVAL + , NM_DAY_TIME_INTERVAL }; struct NullMapping { enum class Type: int{ - INT_16 + NA + , BOOLEAN + , UINT_8 + , INT_8 + , UINT_16 + , INT_16 + , UINT_32 , INT_32 + , UINT_64 + , INT_64 + , FLOAT_16 + , FLOAT_32 + , FLOAT_64 , STRING , LARGE_STRING + , BINARY + , LARGE_BINARY + , DATE_32 + , DATE_64 + , MONTH_INTERVAL + , DAY_TIME_INTERVAL }; + bool have_na; + bool have_boolean; + bool have_uint8; + bool have_int8; + bool 
have_uint16; bool have_int16; - int16_t int16_null; + bool have_uint32; bool have_int32; - int32_t int32_null; + bool have_uint64; + bool have_int64; + bool have_float16; + bool have_float32; + bool have_float64; bool have_string; - std::string string_null; bool have_large_string; + bool have_binary; + bool have_large_binary; + bool have_date32; + bool have_date64; + bool have_month_interval; + bool have_day_time_interval; + + using Binary = std::basic_string; + + void* na_null = nullptr; + bool boolean_null; + + uint8_t uint8_null; + int8_t int8_null; + + uint16_t uint16_null; + int16_t int16_null; + + uint32_t uint32_null; + int32_t int32_null; + + uint64_t uint64_null; + int64_t int64_null; + + uint16_t float16_null; + float float32_null; + double float64_null; + + std::string string_null; std::string large_string_null; + Binary binary_null; + Binary large_binary_null; + + int32_t date32_null; + int64_t date64_null; + int32_t month_interval_null; + int64_t day_time_interval_null; }; } template<> inline const ETraits< Options::NullMapping::Type >::Names ETraits< Options::NullMapping::Type >::names { - { Options::NullMapping::Type::INT_16, Options::NM_INT_16 } + { Options::NullMapping::Type::NA, Options::NM_NA } + , { Options::NullMapping::Type::BOOLEAN, Options::NM_BOOLEAN } + , { Options::NullMapping::Type::UINT_8, Options::NM_UINT_8 } + , { Options::NullMapping::Type::INT_8, Options::NM_INT_8 } + , { Options::NullMapping::Type::UINT_16, Options::NM_UINT_16 } + , { Options::NullMapping::Type::INT_16, Options::NM_INT_16 } + , { Options::NullMapping::Type::UINT_32, Options::NM_UINT_32 } , { Options::NullMapping::Type::INT_32, Options::NM_INT_32 } + , { Options::NullMapping::Type::UINT_64, Options::NM_UINT_64 } + , { Options::NullMapping::Type::INT_64, Options::NM_INT_64 } + , { Options::NullMapping::Type::FLOAT_16, Options::NM_FLOAT_16 } + , { Options::NullMapping::Type::FLOAT_32, Options::NM_FLOAT_32 } + , { Options::NullMapping::Type::FLOAT_64, 
Options::NM_FLOAT_64 } , { Options::NullMapping::Type::STRING, Options::NM_STRING } , { Options::NullMapping::Type::LARGE_STRING, Options::NM_LARGE_STRING } + , { Options::NullMapping::Type::BINARY, Options::NM_BINARY } + , { Options::NullMapping::Type::LARGE_BINARY, Options::NM_LARGE_BINARY } + , { Options::NullMapping::Type::DATE_32, Options::NM_DATE_32 } + , { Options::NullMapping::Type::DATE_64, Options::NM_DATE_64 } + , { Options::NullMapping::Type::MONTH_INTERVAL, Options::NM_MONTH_INTERVAL } + , { Options::NullMapping::Type::DAY_TIME_INTERVAL, Options::NM_DAY_TIME_INTERVAL } }; // Helper class for reading dictionary of options @@ -182,14 +294,54 @@ class KdbOptions throw InvalidOption(("Unsupported NULL_MAPPING option '" + key + "'").c_str()); } K value = kK( values )[i]; - if( ETraits::name( NM::INT_16 ) == key && -KH == value->t ){ + if( ETraits::name( NM::BOOLEAN ) == key && -KG == value->t ){ + null_mapping_options.boolean_null = value->g; + null_mapping_options.have_boolean = true; + } + else if( ETraits::name( NM::UINT_8 ) == key && -KG == value->t ){ + null_mapping_options.uint8_null = value->g; + null_mapping_options.have_uint8 = true; + } + else if( ETraits::name( NM::INT_8 ) == key && -KG == value->t ){ + null_mapping_options.int8_null = value->g; + null_mapping_options.have_int8 = true; + } + else if( ETraits::name( NM::UINT_16 ) == key && -KH == value->t ){ + null_mapping_options.uint16_null = value->h; + null_mapping_options.have_uint16 = true; + } + else if( ETraits::name( NM::INT_16 ) == key && -KH == value->t ){ null_mapping_options.int16_null = value->h; null_mapping_options.have_int16 = true; } + else if( ETraits::name( NM::UINT_32 ) == key && -KI == value->t ){ + null_mapping_options.uint32_null = value->i; + null_mapping_options.have_uint32 = true; + } else if( ETraits::name( NM::INT_32 ) == key && -KI == value->t ){ null_mapping_options.int32_null = value->i; null_mapping_options.have_int32 = true; } + else if( ETraits::name( NM::UINT_64 
) == key && -KJ == value->t ){ + null_mapping_options.uint64_null = value->j; + null_mapping_options.have_uint64 = true; + } + else if( ETraits::name( NM::INT_64 ) == key && -KJ == value->t ){ + null_mapping_options.int64_null = value->j; + null_mapping_options.have_int64 = true; + } + else if( ETraits::name( NM::FLOAT_16 ) == key && -KH == value->t ){ + null_mapping_options.float16_null = value->h; + null_mapping_options.have_float16 = true; + } + else if( ETraits::name( NM::FLOAT_32 ) == key && -KE == value->t ){ + null_mapping_options.float32_null = value->e; + null_mapping_options.have_float32 = true; + } + else if( ETraits::name( NM::FLOAT_64 ) == key && -KF == value->t ){ + null_mapping_options.float64_null = value->f; + null_mapping_options.have_float64 = true; + } else if( ETraits::name( NM::STRING ) == key && KC == value->t ){ null_mapping_options.string_null.assign( (char*)kC( value ), value->n ); null_mapping_options.have_string = true; @@ -198,6 +350,30 @@ class KdbOptions null_mapping_options.large_string_null.assign( (char*)kC( value ), value->n ); null_mapping_options.have_large_string = true; } + else if( ETraits::name( NM::BINARY ) == key && KC == value->t ){ + null_mapping_options.binary_null.assign( kC( value ), value->n ); + null_mapping_options.have_binary = true; + } + else if( ETraits::name( NM::LARGE_BINARY ) == key && KC == value->t ){ + null_mapping_options.large_binary_null.assign( kC( value ), value->n ); + null_mapping_options.have_large_binary = true; + } + else if( ETraits::name( NM::DATE_32 ) == key && -KI == value->t ){ + null_mapping_options.date32_null = value->i; + null_mapping_options.have_date32 = true; + } + else if( ETraits::name( NM::DATE_64 ) == key && -KJ == value->t ){ + null_mapping_options.date64_null = value->j; + null_mapping_options.have_date64 = true; + } + else if( ETraits::name( NM::MONTH_INTERVAL ) == key && -KI == value->t ){ + null_mapping_options.month_interval_null = value->i; + 
null_mapping_options.have_month_interval = true; + } + else if( ETraits::name( NM::DAY_TIME_INTERVAL ) == key && -KJ == value->t ){ + null_mapping_options.day_time_interval_null = value->j; + null_mapping_options.have_day_time_interval = true; + } else if( 101 == value->t ){ // Ignore generic null, which may be used here to ensure mixed list of options } @@ -310,8 +486,12 @@ class KdbOptions } } - template - auto GetNullMappingOption( bool& result ); + template + auto GetNullMappingOption( bool& result ) { + result = true; + + return null_mapping_options.na_null; + } void GetNullMappingOptions( Options::NullMapping& null_mapping ) const { @@ -342,37 +522,126 @@ class KdbOptions }; template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ) -{ - result = null_mapping_options.have_int16; +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_boolean; + return null_mapping_options.boolean_null; +} + +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_uint8; + return null_mapping_options.uint8_null; +} + +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_int8; + return null_mapping_options.int8_null; +} + +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_uint16; + return null_mapping_options.uint16_null; +} +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_int16; return null_mapping_options.int16_null; } template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ) -{ - result = null_mapping_options.have_int32; +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_uint32; + return null_mapping_options.uint32_null; +} +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = 
null_mapping_options.have_int32; return null_mapping_options.int32_null; } template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ) -{ - result = null_mapping_options.have_string; +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_uint64; + return null_mapping_options.uint64_null; +} +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_int64; + return null_mapping_options.int64_null; +} + +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_float16; + return null_mapping_options.float16_null; +} + +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_float32; + return null_mapping_options.float32_null; +} + +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_float64; + return null_mapping_options.float64_null; +} + +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_string; return null_mapping_options.string_null; } template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ) -{ +inline auto KdbOptions::GetNullMappingOption( bool& result ){ result = null_mapping_options.have_large_string; - return null_mapping_options.large_string_null; } +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_binary; + return null_mapping_options.binary_null; +} + +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_large_binary; + return null_mapping_options.large_binary_null; +} + +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_date32; + return null_mapping_options.date32_null; +} + +template<> +inline auto 
KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_date64; + return null_mapping_options.date64_null; +} + +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_month_interval; + return null_mapping_options.month_interval_null; +} + +template<> +inline auto KdbOptions::GetNullMappingOption( bool& result ){ + result = null_mapping_options.have_day_time_interval; + return null_mapping_options.day_time_interval_null; +} + + } // namespace arrowkdb } // namespace kx From cc7e2b42a330e04b08a8b5963e12e8206c9cd9e9 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 20 Jan 2023 13:48:14 +0300 Subject: [PATCH 159/276] Arrow types primacy --- src/ArrayWriter.cpp | 4 +- src/KdbOptions.h | 257 ++++++++++++-------------------------------- 2 files changed, 68 insertions(+), 193 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index c23277f..49756e0 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -448,10 +448,10 @@ template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { auto bool_builder = static_cast(builder); - if( type_overrides.null_mapping.have_uint8 ){ + if( type_overrides.null_mapping.have_boolean ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.uint8_null ^ kG( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.boolean_null ^ kG( k_array )[i]; } PARQUET_THROW_NOT_OK( bool_builder->AppendValues( ( uint8_t* )kG( k_array ), k_array->n, null_bitmap ) ); } diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 5118e49..7d5bb6d 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -9,7 +9,7 @@ #include #include "k.h" - +#include namespace kx { namespace arrowkdb { @@ -117,30 +117,6 @@ namespace Options struct NullMapping { - enum class Type: int{ - NA - , BOOLEAN - , UINT_8 - 
, INT_8 - , UINT_16 - , INT_16 - , UINT_32 - , INT_32 - , UINT_64 - , INT_64 - , FLOAT_16 - , FLOAT_32 - , FLOAT_64 - , STRING - , LARGE_STRING - , BINARY - , LARGE_BINARY - , DATE_32 - , DATE_64 - , MONTH_INTERVAL - , DAY_TIME_INTERVAL - }; - bool have_na; bool have_boolean; bool have_uint8; @@ -193,32 +169,56 @@ namespace Options int64_t date64_null; int32_t month_interval_null; int64_t day_time_interval_null; + + template + auto GetOption() const { return std::make_pair( true, na_null );} }; + + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_boolean, boolean_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_uint8, uint8_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_int8, int8_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_uint16, uint16_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_int16, int16_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_uint32, uint32_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_int32, int32_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_uint64, uint64_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_int64, int64_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_float16, float16_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_float32, float32_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_float64, float64_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_string, string_null ); } + template<> inline auto NullMapping::GetOption() const{ return 
std::make_pair( have_large_string, large_string_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_binary, binary_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_large_binary, large_binary_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_date32, date32_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_date64, date64_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_month_interval, month_interval_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_day_time_interval, day_time_interval_null ); } } template<> -inline const ETraits< Options::NullMapping::Type >::Names ETraits< Options::NullMapping::Type >::names { - { Options::NullMapping::Type::NA, Options::NM_NA } - , { Options::NullMapping::Type::BOOLEAN, Options::NM_BOOLEAN } - , { Options::NullMapping::Type::UINT_8, Options::NM_UINT_8 } - , { Options::NullMapping::Type::INT_8, Options::NM_INT_8 } - , { Options::NullMapping::Type::UINT_16, Options::NM_UINT_16 } - , { Options::NullMapping::Type::INT_16, Options::NM_INT_16 } - , { Options::NullMapping::Type::UINT_32, Options::NM_UINT_32 } - , { Options::NullMapping::Type::INT_32, Options::NM_INT_32 } - , { Options::NullMapping::Type::UINT_64, Options::NM_UINT_64 } - , { Options::NullMapping::Type::INT_64, Options::NM_INT_64 } - , { Options::NullMapping::Type::FLOAT_16, Options::NM_FLOAT_16 } - , { Options::NullMapping::Type::FLOAT_32, Options::NM_FLOAT_32 } - , { Options::NullMapping::Type::FLOAT_64, Options::NM_FLOAT_64 } - , { Options::NullMapping::Type::STRING, Options::NM_STRING } - , { Options::NullMapping::Type::LARGE_STRING, Options::NM_LARGE_STRING } - , { Options::NullMapping::Type::BINARY, Options::NM_BINARY } - , { Options::NullMapping::Type::LARGE_BINARY, Options::NM_LARGE_BINARY } - , { 
Options::NullMapping::Type::DATE_32, Options::NM_DATE_32 } - , { Options::NullMapping::Type::DATE_64, Options::NM_DATE_64 } - , { Options::NullMapping::Type::MONTH_INTERVAL, Options::NM_MONTH_INTERVAL } - , { Options::NullMapping::Type::DAY_TIME_INTERVAL, Options::NM_DAY_TIME_INTERVAL } +inline const ETraits::Names ETraits::names{ + { arrow::Type::NA, Options::NM_NA } + , { arrow::Type::BOOL, Options::NM_BOOLEAN } + , { arrow::Type::UINT8, Options::NM_UINT_8 } + , { arrow::Type::INT8, Options::NM_INT_8 } + , { arrow::Type::UINT16, Options::NM_UINT_16 } + , { arrow::Type::INT16, Options::NM_INT_16 } + , { arrow::Type::UINT32, Options::NM_UINT_32 } + , { arrow::Type::INT32, Options::NM_INT_32 } + , { arrow::Type::UINT64, Options::NM_UINT_64 } + , { arrow::Type::INT64, Options::NM_INT_64 } + , { arrow::Type::HALF_FLOAT, Options::NM_FLOAT_16 } + , { arrow::Type::FLOAT, Options::NM_FLOAT_32 } + , { arrow::Type::DOUBLE, Options::NM_FLOAT_64 } + , { arrow::Type::STRING, Options::NM_STRING } + , { arrow::Type::LARGE_STRING, Options::NM_LARGE_STRING } + , { arrow::Type::BINARY, Options::NM_BINARY } + , { arrow::Type::LARGE_BINARY, Options::NM_LARGE_BINARY } + , { arrow::Type::DATE32, Options::NM_DATE_32 } + , { arrow::Type::DATE64, Options::NM_DATE_64 } + , { arrow::Type::INTERVAL_MONTHS, Options::NM_MONTH_INTERVAL } + , { arrow::Type::INTERVAL_DAY_TIME, Options::NM_DAY_TIME_INTERVAL } }; // Helper class for reading dictionary of options @@ -278,7 +278,7 @@ class KdbOptions void PopulateNullMappingOptions( long long index, K dict ) { - using NM = Options::NullMapping::Type; + using NM = arrow::Type::type; K keys = kK( kK( dict )[index] )[0]; K values = kK( kK( dict )[index] )[1]; @@ -294,51 +294,51 @@ class KdbOptions throw InvalidOption(("Unsupported NULL_MAPPING option '" + key + "'").c_str()); } K value = kK( values )[i]; - if( ETraits::name( NM::BOOLEAN ) == key && -KG == value->t ){ + if( ETraits::name( NM::BOOL ) == key && -KG == value->t ){ 
null_mapping_options.boolean_null = value->g; null_mapping_options.have_boolean = true; } - else if( ETraits::name( NM::UINT_8 ) == key && -KG == value->t ){ + else if( ETraits::name( NM::UINT8 ) == key && -KG == value->t ){ null_mapping_options.uint8_null = value->g; null_mapping_options.have_uint8 = true; } - else if( ETraits::name( NM::INT_8 ) == key && -KG == value->t ){ + else if( ETraits::name( NM::INT8 ) == key && -KG == value->t ){ null_mapping_options.int8_null = value->g; null_mapping_options.have_int8 = true; } - else if( ETraits::name( NM::UINT_16 ) == key && -KH == value->t ){ + else if( ETraits::name( NM::UINT16 ) == key && -KH == value->t ){ null_mapping_options.uint16_null = value->h; null_mapping_options.have_uint16 = true; } - else if( ETraits::name( NM::INT_16 ) == key && -KH == value->t ){ + else if( ETraits::name( NM::INT16 ) == key && -KH == value->t ){ null_mapping_options.int16_null = value->h; null_mapping_options.have_int16 = true; } - else if( ETraits::name( NM::UINT_32 ) == key && -KI == value->t ){ + else if( ETraits::name( NM::UINT32 ) == key && -KI == value->t ){ null_mapping_options.uint32_null = value->i; null_mapping_options.have_uint32 = true; } - else if( ETraits::name( NM::INT_32 ) == key && -KI == value->t ){ + else if( ETraits::name( NM::INT32 ) == key && -KI == value->t ){ null_mapping_options.int32_null = value->i; null_mapping_options.have_int32 = true; } - else if( ETraits::name( NM::UINT_64 ) == key && -KJ == value->t ){ + else if( ETraits::name( NM::UINT64 ) == key && -KJ == value->t ){ null_mapping_options.uint64_null = value->j; null_mapping_options.have_uint64 = true; } - else if( ETraits::name( NM::INT_64 ) == key && -KJ == value->t ){ + else if( ETraits::name( NM::INT64 ) == key && -KJ == value->t ){ null_mapping_options.int64_null = value->j; null_mapping_options.have_int64 = true; } - else if( ETraits::name( NM::FLOAT_16 ) == key && -KH == value->t ){ + else if( ETraits::name( NM::HALF_FLOAT ) == key && -KH == 
value->t ){ null_mapping_options.float16_null = value->h; null_mapping_options.have_float16 = true; } - else if( ETraits::name( NM::FLOAT_32 ) == key && -KE == value->t ){ + else if( ETraits::name( NM::FLOAT ) == key && -KE == value->t ){ null_mapping_options.float32_null = value->e; null_mapping_options.have_float32 = true; } - else if( ETraits::name( NM::FLOAT_64 ) == key && -KF == value->t ){ + else if( ETraits::name( NM::DOUBLE ) == key && -KF == value->t ){ null_mapping_options.float64_null = value->f; null_mapping_options.have_float64 = true; } @@ -358,19 +358,19 @@ class KdbOptions null_mapping_options.large_binary_null.assign( kC( value ), value->n ); null_mapping_options.have_large_binary = true; } - else if( ETraits::name( NM::DATE_32 ) == key && -KI == value->t ){ + else if( ETraits::name( NM::DATE32 ) == key && -KI == value->t ){ null_mapping_options.date32_null = value->i; null_mapping_options.have_date32 = true; } - else if( ETraits::name( NM::DATE_64 ) == key && -KJ == value->t ){ + else if( ETraits::name( NM::DATE64 ) == key && -KJ == value->t ){ null_mapping_options.date64_null = value->j; null_mapping_options.have_date64 = true; } - else if( ETraits::name( NM::MONTH_INTERVAL ) == key && -KI == value->t ){ + else if( ETraits::name( NM::INTERVAL_MONTHS ) == key && -KI == value->t ){ null_mapping_options.month_interval_null = value->i; null_mapping_options.have_month_interval = true; } - else if( ETraits::name( NM::DAY_TIME_INTERVAL ) == key && -KJ == value->t ){ + else if( ETraits::name( NM::INTERVAL_DAY_TIME ) == key && -KJ == value->t ){ null_mapping_options.day_time_interval_null = value->j; null_mapping_options.have_day_time_interval = true; } @@ -486,12 +486,8 @@ class KdbOptions } } - template - auto GetNullMappingOption( bool& result ) { - result = true; - - return null_mapping_options.na_null; - } + template + auto GetNullMappingOption() const { return null_mapping_options.GetOption(); } void GetNullMappingOptions( Options::NullMapping& 
null_mapping ) const { @@ -521,127 +517,6 @@ class KdbOptions } }; -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_boolean; - return null_mapping_options.boolean_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_uint8; - return null_mapping_options.uint8_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_int8; - return null_mapping_options.int8_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_uint16; - return null_mapping_options.uint16_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_int16; - return null_mapping_options.int16_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_uint32; - return null_mapping_options.uint32_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_int32; - return null_mapping_options.int32_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_uint64; - return null_mapping_options.uint64_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_int64; - return null_mapping_options.int64_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_float16; - return null_mapping_options.float16_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_float32; - return null_mapping_options.float32_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = 
null_mapping_options.have_float64; - return null_mapping_options.float64_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_string; - return null_mapping_options.string_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_large_string; - return null_mapping_options.large_string_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_binary; - return null_mapping_options.binary_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_large_binary; - return null_mapping_options.large_binary_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_date32; - return null_mapping_options.date32_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_date64; - return null_mapping_options.date64_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_month_interval; - return null_mapping_options.month_interval_null; -} - -template<> -inline auto KdbOptions::GetNullMappingOption( bool& result ){ - result = null_mapping_options.have_day_time_interval; - return null_mapping_options.day_time_interval_null; -} - - } // namespace arrowkdb } // namespace kx From eb28da2040e4dbb85c886c45a03f8bb095dfc941 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 20 Jan 2023 16:32:55 +0300 Subject: [PATCH 160/276] Options for extended types support --- src/KdbOptions.h | 62 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 7d5bb6d..64ce207 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -74,8 +74,14 @@ 
namespace Options const std::string NM_LARGE_STRING = "large_string"; const std::string NM_BINARY = "binary"; const std::string NM_LARGE_BINARY = "large_binary"; + const std::string NM_FIXED_BINARY = "fixed_binary"; const std::string NM_DATE_32 = "date32"; const std::string NM_DATE_64 = "date64"; + const std::string NM_TIMESTAMP = "timestamp"; + const std::string NM_TIME_32 = "time32"; + const std::string NM_TIME_64 = "time64"; + const std::string NM_DECIMAL = "decimal"; + const std::string NM_DURATION = "duration"; const std::string NM_MONTH_INTERVAL = "month_interval"; const std::string NM_DAY_TIME_INTERVAL = "day_time_interval"; @@ -109,8 +115,14 @@ namespace Options , NM_LARGE_STRING , NM_BINARY , NM_LARGE_BINARY + , NM_FIXED_BINARY , NM_DATE_32 , NM_DATE_64 + , NM_TIMESTAMP + , NM_TIME_32 + , NM_TIME_64 + , NM_DECIMAL + , NM_DURATION , NM_MONTH_INTERVAL , NM_DAY_TIME_INTERVAL }; @@ -134,8 +146,14 @@ namespace Options bool have_large_string; bool have_binary; bool have_large_binary; + bool have_fixed_binary; bool have_date32; bool have_date64; + bool have_timestamp; + bool have_time32; + bool have_time64; + bool have_decimal; + bool have_duration; bool have_month_interval; bool have_day_time_interval; @@ -164,9 +182,15 @@ namespace Options std::string large_string_null; Binary binary_null; Binary large_binary_null; + Binary fixed_binary_null; int32_t date32_null; int64_t date64_null; + int64_t timestamp_null; + int32_t time32_null; + int64_t time64_null; + double decimal_null; + int64_t duration_null; int32_t month_interval_null; int64_t day_time_interval_null; @@ -190,11 +214,17 @@ namespace Options template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_large_string, large_string_null ); } template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_binary, binary_null ); } template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_large_binary, large_binary_null ); } + template<> inline 
auto NullMapping::GetOption() const{ return std::make_pair( have_fixed_binary, fixed_binary_null ); } template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_date32, date32_null ); } template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_date64, date64_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_timestamp, timestamp_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_time32, time32_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_time64, time64_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_decimal, decimal_null ); } + template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_duration, duration_null ); } template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_month_interval, month_interval_null ); } template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_day_time_interval, day_time_interval_null ); } -} +} // namespace Options template<> inline const ETraits::Names ETraits::names{ @@ -215,8 +245,14 @@ inline const ETraits::Names ETraits::names , { arrow::Type::LARGE_STRING, Options::NM_LARGE_STRING } , { arrow::Type::BINARY, Options::NM_BINARY } , { arrow::Type::LARGE_BINARY, Options::NM_LARGE_BINARY } + , { arrow::Type::FIXED_SIZE_BINARY, Options::NM_FIXED_BINARY } + , { arrow::Type::DATE32, Options::NM_DATE_32 } + , { arrow::Type::DATE64, Options::NM_DATE_64 } + , { arrow::Type::TIMESTAMP, Options::NM_TIMESTAMP } , { arrow::Type::DATE32, Options::NM_DATE_32 } , { arrow::Type::DATE64, Options::NM_DATE_64 } + , { arrow::Type::DECIMAL, Options::NM_DECIMAL } + , { arrow::Type::DURATION, Options::NM_DURATION } , { arrow::Type::INTERVAL_MONTHS, Options::NM_MONTH_INTERVAL } , { arrow::Type::INTERVAL_DAY_TIME, Options::NM_DAY_TIME_INTERVAL } }; @@ -358,6 
+394,10 @@ class KdbOptions null_mapping_options.large_binary_null.assign( kC( value ), value->n ); null_mapping_options.have_large_binary = true; } + else if( ETraits::name( NM::FIXED_SIZE_BINARY ) == key && KC == value->t ){ + null_mapping_options.fixed_binary_null.assign( kC( value ), value->n ); + null_mapping_options.have_fixed_binary = true; + } else if( ETraits::name( NM::DATE32 ) == key && -KI == value->t ){ null_mapping_options.date32_null = value->i; null_mapping_options.have_date32 = true; @@ -366,6 +406,26 @@ class KdbOptions null_mapping_options.date64_null = value->j; null_mapping_options.have_date64 = true; } + else if( ETraits::name( NM::TIMESTAMP ) == key && -KJ == value->t ){ + null_mapping_options.timestamp_null = value->j; + null_mapping_options.have_timestamp = true; + } + else if( ETraits::name( NM::TIME32 ) == key && -KI == value->t ){ + null_mapping_options.time32_null = value->i; + null_mapping_options.have_time32 = true; + } + else if( ETraits::name( NM::TIME64 ) == key && -KJ == value->t ){ + null_mapping_options.time64_null = value->j; + null_mapping_options.have_time64 = true; + } + else if( ETraits::name( NM::DECIMAL ) == key && -KF == value->t ){ + null_mapping_options.decimal_null = value->f; + null_mapping_options.have_decimal = true; + } + else if( ETraits::name( NM::DURATION ) == key && -KJ == value->t ){ + null_mapping_options.duration_null = value->j; + null_mapping_options.have_duration = true; + } else if( ETraits::name( NM::INTERVAL_MONTHS ) == key && -KI == value->t ){ null_mapping_options.month_interval_null = value->i; null_mapping_options.have_month_interval = true; From 116a19afbf0ba063372a1fc7adfa959559a25d27 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 20 Jan 2023 17:10:58 +0300 Subject: [PATCH 161/276] Supporting of extended types writing --- src/ArrayWriter.cpp | 83 ++++++++++++++++++++++++++++++++------------- 1 file changed, 60 insertions(+), 23 deletions(-) diff --git a/src/ArrayWriter.cpp 
b/src/ArrayWriter.cpp index 49756e0..e4d4bfd 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -31,7 +31,7 @@ bool is_equal( T lhs, T rhs ) { static const T epsilon = 2 * std::numeric_limits::epsilon(); - return ::fabs(lhs -= rhs) <= epsilon; + return ::fabs( lhs -= rhs ) <= epsilon; } shared_ptr GetBuilder(shared_ptr datatype); @@ -370,7 +370,7 @@ void PopulateListBuilder(shared_ptr datatype, K k_array, arrow: continue; // Delimit the start/end of each child list set - list_builder->Append(); + PARQUET_THROW_NOT_OK( list_builder->Append() ); if (datatype->id() == arrow::Type::FIXED_SIZE_LIST) { // Check each sub-list is the same length as the fixed size @@ -415,7 +415,7 @@ void PopulateUnionBuilder(shared_ptr datatype, K k_array, arrow // for this union value for (auto index = 0; index < kK(k_array)[0]->n; ++index) { int8_t live_type_id = kH(type_ids)[index]; - union_builder->Append(live_type_id); + PARQUET_THROW_NOT_OK( union_builder->Append(live_type_id) ); } // Populate each of the child builders from its kdb list, starting from 1 to @@ -563,9 +563,9 @@ void PopulateBuilder(shared_ptr datatype, if( type_overrides.null_mapping.have_uint64 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.uint64_null ^ kI( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.uint64_null ^ kJ( k_array )[i]; } - PARQUET_THROW_NOT_OK( uint64_builder->AppendValues( ( uint64_t* )kI( k_array ), k_array->n, null_bitmap ) ); + PARQUET_THROW_NOT_OK( uint64_builder->AppendValues( ( uint64_t* )kJ( k_array ), k_array->n, null_bitmap ) ); } else{ PARQUET_THROW_NOT_OK(uint64_builder->AppendValues((uint64_t*)kJ(k_array), k_array->n)); @@ -579,9 +579,9 @@ void PopulateBuilder(shared_ptr datatype, K if( type_overrides.null_mapping.have_int64 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.int64_null ^ kI( k_array 
)[i]; + null_bitmap[i] = type_overrides.null_mapping.int64_null ^ kJ( k_array )[i]; } - PARQUET_THROW_NOT_OK( int64_builder->AppendValues( ( int64_t* )kI( k_array ), k_array->n, null_bitmap ) ); + PARQUET_THROW_NOT_OK( int64_builder->AppendValues( ( int64_t* )kJ( k_array ), k_array->n, null_bitmap ) ); } else{ PARQUET_THROW_NOT_OK(int64_builder->AppendValues((int64_t*)kJ(k_array), k_array->n)); @@ -658,7 +658,7 @@ void PopulateBuilder(shared_ptr datatype, K str_data = kK( k_array )[i]; TYPE_CHECK_ITEM( str_data->t != KC, datatype->ToString(), KC, str_data->t ); if( type_overrides.null_mapping.have_string - && type_overrides.null_mapping.string_null.length() == str_data->n + && type_overrides.null_mapping.string_null.length() == static_cast( str_data->n ) && !type_overrides.null_mapping.string_null.compare( 0, str_data->n, ( char* )kG( str_data ), str_data->n ) ){ PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); } @@ -691,7 +691,7 @@ void PopulateBuilder(shared_ptr data K str_data = kK( k_array )[i]; TYPE_CHECK_ITEM( str_data->t != KC, datatype->ToString(), KC, str_data->t ); if( type_overrides.null_mapping.have_large_string - && type_overrides.null_mapping.large_string_null.length() == str_data->n + && type_overrides.null_mapping.large_string_null.length() == static_cast( str_data->n ) && !type_overrides.null_mapping.large_string_null.compare( 0, str_data->n, ( char* )kG( str_data ), str_data->n ) ){ PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); } @@ -710,7 +710,7 @@ void PopulateBuilder(shared_ptr datatype, K bin_data = kK(k_array)[i]; TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); if( type_overrides.null_mapping.have_binary - && type_overrides.null_mapping.binary_null.length() == bin_data->n + && type_overrides.null_mapping.binary_null.length() == static_cast( bin_data->n ) && !type_overrides.null_mapping.binary_null.compare( 0, bin_data->n, kG( bin_data ), bin_data->n ) ){ PARQUET_THROW_NOT_OK( bin_builder->AppendNull() ); } @@ 
-728,7 +728,7 @@ void PopulateBuilder(shared_ptr data K bin_data = kK(k_array)[i]; TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); if( type_overrides.null_mapping.have_large_binary - && type_overrides.null_mapping.large_binary_null.length() == bin_data->n + && type_overrides.null_mapping.large_binary_null.length() == static_cast( bin_data->n ) && !type_overrides.null_mapping.large_binary_null.compare( 0, bin_data->n, kG( bin_data ), bin_data->n ) ){ PARQUET_THROW_NOT_OK( bin_builder->AppendNull() ); } @@ -751,7 +751,14 @@ void PopulateBuilder(shared_ptr K bin_data = kK(k_array)[i]; TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); TYPE_CHECK_LENGTH(fixed_bin_builder->byte_width() != bin_data->n, builder->type()->ToString(), fixed_bin_builder->byte_width(), bin_data->n); - PARQUET_THROW_NOT_OK(fixed_bin_builder->Append(kG(bin_data))); + if( type_overrides.null_mapping.have_fixed_binary + && type_overrides.null_mapping.fixed_binary_null.length() == static_cast( bin_data->n ) + && !type_overrides.null_mapping.fixed_binary_null.compare( 0, bin_data->n, kG( bin_data ), bin_data->n ) ){ + PARQUET_THROW_NOT_OK( fixed_bin_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK(fixed_bin_builder->Append(kG(bin_data))); + } } } } @@ -794,7 +801,13 @@ void PopulateBuilder(shared_ptr datatyp auto ts_builder = static_cast(builder); auto timestamp_type = static_pointer_cast(datatype); for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(ts_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); + if( type_overrides.null_mapping.have_timestamp + && type_overrides.null_mapping.timestamp_null == kJ( k_array )[i] ){ + PARQUET_THROW_NOT_OK( ts_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK(ts_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); + } } template<> @@ -804,7 +817,13 @@ void PopulateBuilder(shared_ptr datatype, auto t32_builder = static_cast(builder); auto time32_type = static_pointer_cast(datatype); for 
(auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(t32_builder->Append(tc.KdbToArrow(kI(k_array)[i]))); + if( type_overrides.null_mapping.have_time32 + && type_overrides.null_mapping.time32_null == kI( k_array )[i] ){ + PARQUET_THROW_NOT_OK( t32_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK(t32_builder->Append(tc.KdbToArrow(kI(k_array)[i]))); + } } template<> @@ -814,7 +833,13 @@ void PopulateBuilder(shared_ptr datatype, auto t64_builder = static_cast(builder); auto time64_type = static_pointer_cast(datatype); for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(t64_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); + if( type_overrides.null_mapping.have_time64 + && type_overrides.null_mapping.time64_null == kJ( k_array )[i] ){ + PARQUET_THROW_NOT_OK( t64_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK(t64_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); + } } template<> @@ -824,10 +849,16 @@ void PopulateBuilder(shared_ptr datatype, auto dec_type = static_pointer_cast(datatype); for (auto i = 0; i < k_array->n; ++i) { if (type_overrides.decimal128_as_double) { - // Construct the decimal from a double - arrow::Decimal128 dec128; - PARQUET_ASSIGN_OR_THROW(dec128, arrow::Decimal128::FromReal(kF(k_array)[i], dec_type->precision(), dec_type->scale())); - PARQUET_THROW_NOT_OK(dec_builder->Append(dec128)); + if( type_overrides.null_mapping.have_decimal + && is_equal( type_overrides.null_mapping.decimal_null, kF( k_array )[i] ) ){ + PARQUET_THROW_NOT_OK( dec_builder->AppendNull() ); + } + else{ + // Construct the decimal from a double + arrow::Decimal128 dec128; + PARQUET_ASSIGN_OR_THROW(dec128, arrow::Decimal128::FromReal(kF(k_array)[i], dec_type->precision(), dec_type->scale())); + PARQUET_THROW_NOT_OK(dec_builder->Append(dec128)); + } } else { // Each decimal is a list of 16 bytes K k_dec = kK(k_array)[i]; @@ -847,7 +878,13 @@ void PopulateBuilder(shared_ptr datatype auto dur_builder = static_cast(builder); auto duration_type = 
static_pointer_cast(datatype); for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(dur_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); + if( type_overrides.null_mapping.have_duration + && type_overrides.null_mapping.duration_null == kJ( k_array )[i] ){ + PARQUET_THROW_NOT_OK( dur_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK(dur_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); + } } template<> @@ -917,7 +954,7 @@ void PopulateBuilder(shared_ptr datatype, K k continue; // Delimit the start/end of each child map set - map_builder->Append(); + PARQUET_THROW_NOT_OK( map_builder->Append() ); // Populate the child builders for this map set from the dictionary key/value lists auto k_dict = kK(k_array)[i]; @@ -948,7 +985,7 @@ void PopulateBuilder(shared_ptr datatype, // Delimit each struct value in the parent builder for (auto index = 0; index < kK(k_array)[0]->n; ++index) - struct_builder->Append(); + PARQUET_THROW_NOT_OK( struct_builder->Append() ); // Populate each of the field builders from its kdb list. Only count up to // the number of struct fields. 
Additional trailing data in the kdb mixed @@ -1113,7 +1150,7 @@ K prettyPrintArray(K datatype_id, K array, K options) auto arrow_array = MakeArray(datatype, array, type_overrides); auto options = arrow::PrettyPrintOptions(); string result; - arrow::PrettyPrint(*arrow_array, options, &result); + PARQUET_THROW_NOT_OK( arrow::PrettyPrint(*arrow_array, options, &result) ); return kp((S)result.c_str()); From 57c03c3a0d5e82f22ed14cfdea1eb05940290293 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 23 Jan 2023 17:47:37 +0300 Subject: [PATCH 162/276] Pull-request #6 changes, patch 5 https://github.com/KxSystems/arrowkdb/pull/6 --- src/ArrayWriter.cpp | 18 +++++++++--------- src/KdbOptions.h | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index e4d4bfd..e1ddd1b 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -451,7 +451,7 @@ void PopulateBuilder(shared_ptr datatype, K if( type_overrides.null_mapping.have_boolean ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.boolean_null ^ kG( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.boolean_null != kG( k_array )[i]; } PARQUET_THROW_NOT_OK( bool_builder->AppendValues( ( uint8_t* )kG( k_array ), k_array->n, null_bitmap ) ); } @@ -467,7 +467,7 @@ void PopulateBuilder(shared_ptr datatype, K if( type_overrides.null_mapping.have_uint8 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.uint8_null ^ kG( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.uint8_null != kG( k_array )[i]; } PARQUET_THROW_NOT_OK( uint8_builder->AppendValues( ( uint8_t* )kG( k_array ), k_array->n, null_bitmap ) ); } @@ -483,7 +483,7 @@ void PopulateBuilder(shared_ptr datatype, K if( type_overrides.null_mapping.have_int8 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < 
k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.int8_null ^ kG( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.int8_null != kG( k_array )[i]; } PARQUET_THROW_NOT_OK( int8_builder->AppendValues( ( int8_t* )kG( k_array ), k_array->n, null_bitmap ) ); } @@ -499,7 +499,7 @@ void PopulateBuilder(shared_ptr datatype, if( type_overrides.null_mapping.have_uint16 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.uint16_null ^ kH( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.uint16_null != kH( k_array )[i]; } PARQUET_THROW_NOT_OK( uint16_builder->AppendValues( ( uint16_t* )kH( k_array ), k_array->n, null_bitmap ) ); } @@ -531,7 +531,7 @@ void PopulateBuilder(shared_ptr datatype, if( type_overrides.null_mapping.have_uint32 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.uint32_null ^ kI( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.uint32_null != static_cast( kI( k_array )[i] ); } PARQUET_THROW_NOT_OK( uint32_builder->AppendValues( ( uint32_t* )kI( k_array ), k_array->n, null_bitmap ) ); } @@ -563,7 +563,7 @@ void PopulateBuilder(shared_ptr datatype, if( type_overrides.null_mapping.have_uint64 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.uint64_null ^ kJ( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.uint64_null != static_cast( kJ( k_array )[i] ); } PARQUET_THROW_NOT_OK( uint64_builder->AppendValues( ( uint64_t* )kJ( k_array ), k_array->n, null_bitmap ) ); } @@ -579,7 +579,7 @@ void PopulateBuilder(shared_ptr datatype, K if( type_overrides.null_mapping.have_int64 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.int64_null ^ kJ( k_array )[i]; + null_bitmap[i] = 
type_overrides.null_mapping.int64_null != kJ( k_array )[i]; } PARQUET_THROW_NOT_OK( int64_builder->AppendValues( ( int64_t* )kJ( k_array ), k_array->n, null_bitmap ) ); } @@ -595,7 +595,7 @@ void PopulateBuilder(shared_ptr dataty if( type_overrides.null_mapping.have_float16 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.float16_null ^ kH( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.float16_null != kH( k_array )[i]; } PARQUET_THROW_NOT_OK( hfl_builder->AppendValues( ( uint16_t* )kH( k_array ), k_array->n, null_bitmap ) ); } @@ -894,7 +894,7 @@ void PopulateBuilder(shared_ptr d if( type_overrides.null_mapping.have_month_interval ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.month_interval_null ^ kI( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.month_interval_null != kI( k_array )[i]; } PARQUET_THROW_NOT_OK( month_builder->AppendValues( ( int32_t* )kI( k_array ), k_array->n, null_bitmap ) ); } diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 64ce207..aa79596 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -58,7 +58,7 @@ namespace Options // Null mapping options const std::string NM_NA = "na"; - const std::string NM_BOOLEAN = "boolean"; + const std::string NM_BOOLEAN = "bool"; const std::string NM_UINT_8 = "uint8"; const std::string NM_INT_8 = "int8"; const std::string NM_UINT_16 = "uint16"; From 70aebae8d03867e2b604254fc8874996ad88cde8 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 25 Jan 2023 12:15:37 +0300 Subject: [PATCH 163/276] Fix of bool option initialized by -1h --- src/KdbOptions.h | 6 +++++- src/SchemaStore.cpp | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/KdbOptions.h b/src/KdbOptions.h index aa79596..1168306 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -330,7 +330,11 @@ class KdbOptions throw 
InvalidOption(("Unsupported NULL_MAPPING option '" + key + "'").c_str()); } K value = kK( values )[i]; - if( ETraits::name( NM::BOOL ) == key && -KG == value->t ){ + if( ETraits::name( NM::BOOL ) == key && -KB == value->t ){ + null_mapping_options.boolean_null = value->g; + null_mapping_options.have_boolean = true; + } + else if( ETraits::name( NM::BOOL ) == key && -KG == value->t ){ null_mapping_options.boolean_null = value->g; null_mapping_options.have_boolean = true; } diff --git a/src/SchemaStore.cpp b/src/SchemaStore.cpp index b95a1b9..da407e2 100644 --- a/src/SchemaStore.cpp +++ b/src/SchemaStore.cpp @@ -143,7 +143,7 @@ K inferSchema(K table) // Determine the arrow datatype for each data set K k_array_data = kK(dict)[1]; - assert(k_array_data->n == field_names.size()); + assert(static_cast( k_array_data->n ) == field_names.size()); arrow::FieldVector fields; for (auto i = 0ul; i < field_names.size(); ++i) { auto datatype = kx::arrowkdb::GetArrowType(kK(k_array_data)[i]); From 72a7e9401e324c90cf28d99856adb64fdc0a4229 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 25 Jan 2023 13:29:23 +0300 Subject: [PATCH 164/276] Support GUIDs of type 2h in Fixed Size Binarray --- src/ArrayWriter.cpp | 12 ++++++++++-- src/KdbOptions.h | 20 ++++++++++++++++++-- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index e1ddd1b..3b51605 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -744,8 +744,16 @@ void PopulateBuilder(shared_ptr bool is_guid = k_array->t == UU && datatype->id() == arrow::Type::FIXED_SIZE_BINARY && static_cast(builder)->byte_width() == sizeof(U); auto fixed_bin_builder = static_cast(builder); if (is_guid) { - for (auto i = 0; i < k_array->n; ++i) - PARQUET_THROW_NOT_OK(fixed_bin_builder->Append((char*)&kU(k_array)[i])); + for (auto i = 0; i < k_array->n; ++i){ + if( type_overrides.null_mapping.have_fixed_binary + && type_overrides.null_mapping.fixed_binary_null.length() == 
sizeof( U ) + && !type_overrides.null_mapping.fixed_binary_null.compare( 0, sizeof( U ), &kU( k_array )[i].g[0], sizeof( U ) ) ){ + PARQUET_THROW_NOT_OK( fixed_bin_builder->AppendNull() ); + } + else{ + PARQUET_THROW_NOT_OK(fixed_bin_builder->Append((char*)&kU(k_array)[i])); + } + } } else { for (auto i = 0; i < k_array->n; ++i) { K bin_data = kK(k_array)[i]; diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 1168306..8fe82e0 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -249,8 +249,8 @@ inline const ETraits::Names ETraits::names , { arrow::Type::DATE32, Options::NM_DATE_32 } , { arrow::Type::DATE64, Options::NM_DATE_64 } , { arrow::Type::TIMESTAMP, Options::NM_TIMESTAMP } - , { arrow::Type::DATE32, Options::NM_DATE_32 } - , { arrow::Type::DATE64, Options::NM_DATE_64 } + , { arrow::Type::TIME32, Options::NM_TIME_32 } + , { arrow::Type::TIME64, Options::NM_TIME_64 } , { arrow::Type::DECIMAL, Options::NM_DECIMAL } , { arrow::Type::DURATION, Options::NM_DURATION } , { arrow::Type::INTERVAL_MONTHS, Options::NM_MONTH_INTERVAL } @@ -390,14 +390,30 @@ class KdbOptions null_mapping_options.large_string_null.assign( (char*)kC( value ), value->n ); null_mapping_options.have_large_string = true; } + else if( ETraits::name( NM::BINARY ) == key && KG == value->t ){ + null_mapping_options.binary_null.assign( kG( value ), value->n ); + null_mapping_options.have_binary = true; + } else if( ETraits::name( NM::BINARY ) == key && KC == value->t ){ null_mapping_options.binary_null.assign( kC( value ), value->n ); null_mapping_options.have_binary = true; } + else if( ETraits::name( NM::LARGE_BINARY ) == key && KG == value->t ){ + null_mapping_options.large_binary_null.assign( kG( value ), value->n ); + null_mapping_options.have_large_binary = true; + } else if( ETraits::name( NM::LARGE_BINARY ) == key && KC == value->t ){ null_mapping_options.large_binary_null.assign( kC( value ), value->n ); null_mapping_options.have_large_binary = true; } + else if( ETraits::name( 
NM::FIXED_SIZE_BINARY ) == key && -UU == value->t ){ + null_mapping_options.fixed_binary_null.assign( &kU( value )->g[0], sizeof( U ) ); + null_mapping_options.have_fixed_binary = true; + } + else if( ETraits::name( NM::FIXED_SIZE_BINARY ) == key && KG == value->t ){ + null_mapping_options.fixed_binary_null.assign( kG( value ), value->n ); + null_mapping_options.have_fixed_binary = true; + } else if( ETraits::name( NM::FIXED_SIZE_BINARY ) == key && KC == value->t ){ null_mapping_options.fixed_binary_null.assign( kC( value ), value->n ); null_mapping_options.have_fixed_binary = true; From d7f3432b778cb224f3d56bcba39a381de99d5b0f Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 25 Jan 2023 17:47:23 +0300 Subject: [PATCH 165/276] Reverse null_bitmap for floats --- src/ArrayWriter.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index 3b51605..4d298c1 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -611,7 +611,7 @@ void PopulateBuilder(shared_ptr datatype, K if( type_overrides.null_mapping.have_float32 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = is_equal( type_overrides.null_mapping.float32_null, kE( k_array )[i] ); + null_bitmap[i] = !is_equal( type_overrides.null_mapping.float32_null, kE( k_array )[i] ); } PARQUET_THROW_NOT_OK( fl_builder->AppendValues( kE( k_array ), k_array->n, null_bitmap ) ); } @@ -627,7 +627,7 @@ void PopulateBuilder(shared_ptr datatype, if( type_overrides.null_mapping.have_float64 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = is_equal( type_overrides.null_mapping.float64_null, kF( k_array )[i] ); + null_bitmap[i] = !is_equal( type_overrides.null_mapping.float64_null, kF( k_array )[i] ); } PARQUET_THROW_NOT_OK( dbl_builder->AppendValues( kF( k_array ), k_array->n, null_bitmap ) ); } From b6a29f78b86b762f3eae9ee9d2b93d77c15bfc07 Mon Sep 17 
00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 26 Jan 2023 21:23:11 +0300 Subject: [PATCH 166/276] Breaking apart null mapping option ifelses --- src/KdbOptions.h | 544 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 411 insertions(+), 133 deletions(-) diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 8fe82e0..35f0ef0 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -23,13 +24,11 @@ constexpr auto toUType( E enumerator ) noexcept template< typename E > struct ETraits { - using Names = std::map; + using Names = std::unordered_map; - static std::string name( E enumerator ) - { + static std::string name( E enumerator ){ auto it = names.find( enumerator ); - if( it != names.end() ) - { + if( it != names.end() ){ return it->second; } @@ -38,6 +37,17 @@ struct ETraits static std::string name( int index ) { return name( static_cast( index ) ); } + static auto value( const std::string& name ){ + auto it = std::find_if( names.begin(), names.end(), [&name]( const auto& value ){ + return name == value.second; + } ); + if( it != names.end() ){ + return it->first; + } + + return E( 0 ); + } + static const Names names; }; @@ -266,6 +276,12 @@ inline const ETraits::Names ETraits::names // 0 of -KS|-KJ|XD|KC class KdbOptions { +public: + template + inline void NullMappingOption( const std::string& key, K value ); + + using NullMappingHandler = void ( KdbOptions::* )( const std::string&, K ); + using NullMappingHandlers = std::unordered_map; private: Options::NullMapping null_mapping_options; std::map string_options; @@ -276,6 +292,7 @@ class KdbOptions const std::set& supported_dict_options; const std::set& supported_null_mapping_options; + static const NullMappingHandlers null_mapping_handlers; private: const std::string ToUpper(std::string str) const { @@ -314,151 +331,30 @@ class KdbOptions void PopulateNullMappingOptions( long long index, K dict ) { - using NM = 
arrow::Type::type; - K keys = kK( kK( dict )[index] )[0]; K values = kK( kK( dict )[index] )[1]; if( KS != keys->t ){ throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING keys (expected=11h), type=" + std::to_string( keys->t ) + "h" ); } if( 0 != values->t ){ - throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING values (extected=0h), type=" + std::to_string( keys->t ) + "h" ); + throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING values (extected=0h), type=" ); } for( auto i = 0ll; i < values->n; ++i ){ const std::string key = ToLower( kS( keys )[i] ); if( supported_null_mapping_options.find( key ) == supported_null_mapping_options.end() ){ - throw InvalidOption(("Unsupported NULL_MAPPING option '" + key + "'").c_str()); + throw InvalidOption( "Unsupported NULL_MAPPING option '" + key + "'" ); } K value = kK( values )[i]; - if( ETraits::name( NM::BOOL ) == key && -KB == value->t ){ - null_mapping_options.boolean_null = value->g; - null_mapping_options.have_boolean = true; - } - else if( ETraits::name( NM::BOOL ) == key && -KG == value->t ){ - null_mapping_options.boolean_null = value->g; - null_mapping_options.have_boolean = true; - } - else if( ETraits::name( NM::UINT8 ) == key && -KG == value->t ){ - null_mapping_options.uint8_null = value->g; - null_mapping_options.have_uint8 = true; - } - else if( ETraits::name( NM::INT8 ) == key && -KG == value->t ){ - null_mapping_options.int8_null = value->g; - null_mapping_options.have_int8 = true; - } - else if( ETraits::name( NM::UINT16 ) == key && -KH == value->t ){ - null_mapping_options.uint16_null = value->h; - null_mapping_options.have_uint16 = true; - } - else if( ETraits::name( NM::INT16 ) == key && -KH == value->t ){ - null_mapping_options.int16_null = value->h; - null_mapping_options.have_int16 = true; - } - else if( ETraits::name( NM::UINT32 ) == key && -KI == value->t ){ - null_mapping_options.uint32_null = value->i; - null_mapping_options.have_uint32 = true; - } - else if( 
ETraits::name( NM::INT32 ) == key && -KI == value->t ){ - null_mapping_options.int32_null = value->i; - null_mapping_options.have_int32 = true; - } - else if( ETraits::name( NM::UINT64 ) == key && -KJ == value->t ){ - null_mapping_options.uint64_null = value->j; - null_mapping_options.have_uint64 = true; - } - else if( ETraits::name( NM::INT64 ) == key && -KJ == value->t ){ - null_mapping_options.int64_null = value->j; - null_mapping_options.have_int64 = true; - } - else if( ETraits::name( NM::HALF_FLOAT ) == key && -KH == value->t ){ - null_mapping_options.float16_null = value->h; - null_mapping_options.have_float16 = true; - } - else if( ETraits::name( NM::FLOAT ) == key && -KE == value->t ){ - null_mapping_options.float32_null = value->e; - null_mapping_options.have_float32 = true; - } - else if( ETraits::name( NM::DOUBLE ) == key && -KF == value->t ){ - null_mapping_options.float64_null = value->f; - null_mapping_options.have_float64 = true; - } - else if( ETraits::name( NM::STRING ) == key && KC == value->t ){ - null_mapping_options.string_null.assign( (char*)kC( value ), value->n ); - null_mapping_options.have_string = true; - } - else if( ETraits::name( NM::LARGE_STRING ) == key && KC == value->t ){ - null_mapping_options.large_string_null.assign( (char*)kC( value ), value->n ); - null_mapping_options.have_large_string = true; - } - else if( ETraits::name( NM::BINARY ) == key && KG == value->t ){ - null_mapping_options.binary_null.assign( kG( value ), value->n ); - null_mapping_options.have_binary = true; - } - else if( ETraits::name( NM::BINARY ) == key && KC == value->t ){ - null_mapping_options.binary_null.assign( kC( value ), value->n ); - null_mapping_options.have_binary = true; - } - else if( ETraits::name( NM::LARGE_BINARY ) == key && KG == value->t ){ - null_mapping_options.large_binary_null.assign( kG( value ), value->n ); - null_mapping_options.have_large_binary = true; - } - else if( ETraits::name( NM::LARGE_BINARY ) == key && KC == value->t ){ - 
null_mapping_options.large_binary_null.assign( kC( value ), value->n ); - null_mapping_options.have_large_binary = true; - } - else if( ETraits::name( NM::FIXED_SIZE_BINARY ) == key && -UU == value->t ){ - null_mapping_options.fixed_binary_null.assign( &kU( value )->g[0], sizeof( U ) ); - null_mapping_options.have_fixed_binary = true; - } - else if( ETraits::name( NM::FIXED_SIZE_BINARY ) == key && KG == value->t ){ - null_mapping_options.fixed_binary_null.assign( kG( value ), value->n ); - null_mapping_options.have_fixed_binary = true; - } - else if( ETraits::name( NM::FIXED_SIZE_BINARY ) == key && KC == value->t ){ - null_mapping_options.fixed_binary_null.assign( kC( value ), value->n ); - null_mapping_options.have_fixed_binary = true; - } - else if( ETraits::name( NM::DATE32 ) == key && -KI == value->t ){ - null_mapping_options.date32_null = value->i; - null_mapping_options.have_date32 = true; - } - else if( ETraits::name( NM::DATE64 ) == key && -KJ == value->t ){ - null_mapping_options.date64_null = value->j; - null_mapping_options.have_date64 = true; - } - else if( ETraits::name( NM::TIMESTAMP ) == key && -KJ == value->t ){ - null_mapping_options.timestamp_null = value->j; - null_mapping_options.have_timestamp = true; - } - else if( ETraits::name( NM::TIME32 ) == key && -KI == value->t ){ - null_mapping_options.time32_null = value->i; - null_mapping_options.have_time32 = true; - } - else if( ETraits::name( NM::TIME64 ) == key && -KJ == value->t ){ - null_mapping_options.time64_null = value->j; - null_mapping_options.have_time64 = true; - } - else if( ETraits::name( NM::DECIMAL ) == key && -KF == value->t ){ - null_mapping_options.decimal_null = value->f; - null_mapping_options.have_decimal = true; - } - else if( ETraits::name( NM::DURATION ) == key && -KJ == value->t ){ - null_mapping_options.duration_null = value->j; - null_mapping_options.have_duration = true; - } - else if( ETraits::name( NM::INTERVAL_MONTHS ) == key && -KI == value->t ){ - 
null_mapping_options.month_interval_null = value->i; - null_mapping_options.have_month_interval = true; - } - else if( ETraits::name( NM::INTERVAL_DAY_TIME ) == key && -KJ == value->t ){ - null_mapping_options.day_time_interval_null = value->j; - null_mapping_options.have_day_time_interval = true; + arrow::Type::type mapping = ETraits::value( key ); + auto it = null_mapping_handlers.find( mapping ); + if( it != null_mapping_handlers.end() ){ + ( this->*it->second )( key, value ); } else if( 101 == value->t ){ // Ignore generic null, which may be used here to ensure mixed list of options } else{ - throw InvalidOption(("Unsupported KDB data type for NULL_MAPPING option '" + key + "', type=" + std::to_string( value->t ) + "h" ).c_str()); + throw InvalidOption( "Unhandled NULL_MAPPING option '" + key + "', type=" + std::to_string( keys->t ) + "h" ); } } } @@ -597,6 +493,388 @@ class KdbOptions } }; +inline void null_mapping_error( const std::string& key, K value ) +{ + std::string message = std::string( "Unsupported KDB data type for NULL_MAPPING option '") + .append( key ) + .append( "', type=" ) + .append( std::to_string( value->t ) ) + .append( "h" ); + + throw KdbOptions::InvalidOption( message ); +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + switch( value->t ){ + case -KB: + null_mapping_options.boolean_null = value->g; + null_mapping_options.have_boolean = true; + break; + case -KG: + null_mapping_options.boolean_null = value->g; + null_mapping_options.have_boolean = true; + break; + default: + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KG == value->t ){ + null_mapping_options.uint8_null = value->g; + null_mapping_options.have_uint8 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KG == value->t ){ 
+ null_mapping_options.int8_null = value->g; + null_mapping_options.have_int8 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KH == value->t ){ + null_mapping_options.uint16_null = value->h; + null_mapping_options.have_uint16 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KH == value->t ){ + null_mapping_options.int16_null = value->h; + null_mapping_options.have_int16 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KI == value->t ){ + null_mapping_options.uint32_null = value->i; + null_mapping_options.have_uint32 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KI == value->t ){ + null_mapping_options.int32_null = value->i; + null_mapping_options.have_int32 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KJ == value->t ){ + null_mapping_options.uint64_null = value->j; + null_mapping_options.have_uint64 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KJ == value->t ){ + null_mapping_options.int64_null = value->j; + null_mapping_options.have_int64 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KH == value->t ){ + null_mapping_options.float16_null = value->h; + null_mapping_options.have_float16 = true; + } + else{ + null_mapping_error( key, value ); + } +} + 
+template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KE == value->t ){ + null_mapping_options.float32_null = value->e; + null_mapping_options.have_float32 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KF == value->t ){ + null_mapping_options.float64_null = value->f; + null_mapping_options.have_float64 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KC == value->t ){ + null_mapping_options.string_null.assign( (char*)kC( value ), value->n ); + null_mapping_options.have_string = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KC == value->t ){ + null_mapping_options.large_string_null.assign( (char*)kC( value ), value->n ); + null_mapping_options.have_large_string = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + switch( value->t ){ + case KG: + null_mapping_options.binary_null.assign( kG( value ), value->n ); + null_mapping_options.have_binary = true; + break; + case KC: + null_mapping_options.binary_null.assign( kC( value ), value->n ); + null_mapping_options.have_binary = true; + break; + default: + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + switch( value->t ){ + case KG: + null_mapping_options.large_binary_null.assign( kG( value ), value->n ); + null_mapping_options.have_large_binary = true; + break; + case KC: + null_mapping_options.large_binary_null.assign( kC( value ), value->n ); + null_mapping_options.have_large_binary = true; + break; + default: + null_mapping_error( 
key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + switch( value->t ){ + case -UU: + null_mapping_options.fixed_binary_null.assign( &kU( value )->g[0], sizeof( U ) ); + null_mapping_options.have_fixed_binary = true; + break; + case KG: + null_mapping_options.fixed_binary_null.assign( kG( value ), value->n ); + null_mapping_options.have_fixed_binary = true; + break; + case KC: + null_mapping_options.fixed_binary_null.assign( kC( value ), value->n ); + null_mapping_options.have_fixed_binary = true; + break; + default: + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KI == value->t ){ + null_mapping_options.date32_null = value->i; + null_mapping_options.have_date32 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KJ == value->t ){ + null_mapping_options.date64_null = value->j; + null_mapping_options.have_date64 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KJ == value->t ){ + null_mapping_options.timestamp_null = value->j; + null_mapping_options.have_timestamp = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KI == value->t ){ + null_mapping_options.time32_null = value->i; + null_mapping_options.have_time32 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KJ == value->t ){ + null_mapping_options.time64_null = value->j; + null_mapping_options.have_time64 = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void 
KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KF == value->t ){ + null_mapping_options.decimal_null = value->f; + null_mapping_options.have_decimal = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KJ == value->t ){ + null_mapping_options.duration_null = value->j; + null_mapping_options.have_duration = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KI == value->t ){ + null_mapping_options.month_interval_null = value->i; + null_mapping_options.have_month_interval = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template<> +inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +{ + if( -KJ == value->t ){ + null_mapping_options.day_time_interval_null = value->j; + null_mapping_options.have_day_time_interval = true; + } + else{ + null_mapping_error( key, value ); + } +} + +template +auto make_null_mapping() +{ + return std::make_pair( TypeId, &KdbOptions::NullMappingOption ); +} + +inline const KdbOptions::NullMappingHandlers KdbOptions::null_mapping_handlers = { + make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() + , make_null_mapping() +}; + } // namespace arrowkdb } // namespace kx From c5c4f1b8704978b0d01ca791d50e5d3e033eafd9 Mon Sep 
17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 27 Jan 2023 11:46:00 +0300 Subject: [PATCH 167/276] Unifying supported null mapping options --- src/KdbOptions.h | 257 ++++++++++++++++++++++------------------------- 1 file changed, 118 insertions(+), 139 deletions(-) diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 35f0ef0..72a93d1 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -16,39 +16,48 @@ namespace kx { namespace arrowkdb { template -constexpr auto toUType( E enumerator ) noexcept +constexpr auto toUType( E option ) noexcept { - return static_cast>( enumerator ); + return static_cast>( option ); } template< typename E > struct ETraits { - using Names = std::unordered_map; + using Options = std::unordered_map; - static std::string name( E enumerator ){ - auto it = names.find( enumerator ); - if( it != names.end() ){ + static std::string mapping( E option ){ + auto it = options.find( option ); + if( it != options.end() ){ return it->second; } - return "UNKNOWN"; + return "unknown"; } - static std::string name( int index ) { return name( static_cast( index ) ); } + static std::string mapping( int option ) { return mapping( static_cast( option ) ); } - static auto value( const std::string& name ){ - auto it = std::find_if( names.begin(), names.end(), [&name]( const auto& value ){ - return name == value.second; + static std::set mappings(){ + std::set values; + transform( options.begin(), options.end(), std::inserter( values, end( values ) ), []( const auto& option ){ + return option.second; + } ); + + return values; + } + + static E option( const std::string& value ){ + auto it = std::find_if( options.begin(), options.end(), [&value]( const auto& option ){ + return value == option.second; } ); - if( it != names.end() ){ + if( it != options.end() ){ return it->first; } return E( 0 ); } - static const Names names; + static const Options options; }; // Supported options @@ -107,35 +116,6 @@ namespace Options const static std::set dict_options = { 
NULL_MAPPING, }; - const static std::set null_mapping_options = { - NM_NA - , NM_BOOLEAN - , NM_UINT_8 - , NM_INT_8 - , NM_UINT_16 - , NM_INT_16 - , NM_UINT_32 - , NM_INT_32 - , NM_UINT_64 - , NM_INT_64 - , NM_FLOAT_16 - , NM_FLOAT_32 - , NM_FLOAT_64 - , NM_STRING - , NM_LARGE_STRING - , NM_BINARY - , NM_LARGE_BINARY - , NM_FIXED_BINARY - , NM_DATE_32 - , NM_DATE_64 - , NM_TIMESTAMP - , NM_TIME_32 - , NM_TIME_64 - , NM_DECIMAL - , NM_DURATION - , NM_MONTH_INTERVAL - , NM_DAY_TIME_INTERVAL - }; struct NullMapping { @@ -205,7 +185,7 @@ namespace Options int64_t day_time_interval_null; template - auto GetOption() const { return std::make_pair( true, na_null );} + inline auto GetOption() const { return std::make_pair( true, na_null );} }; template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_boolean, boolean_null ); } @@ -237,34 +217,34 @@ namespace Options } // namespace Options template<> -inline const ETraits::Names ETraits::names{ - { arrow::Type::NA, Options::NM_NA } - , { arrow::Type::BOOL, Options::NM_BOOLEAN } - , { arrow::Type::UINT8, Options::NM_UINT_8 } - , { arrow::Type::INT8, Options::NM_INT_8 } - , { arrow::Type::UINT16, Options::NM_UINT_16 } - , { arrow::Type::INT16, Options::NM_INT_16 } - , { arrow::Type::UINT32, Options::NM_UINT_32 } - , { arrow::Type::INT32, Options::NM_INT_32 } - , { arrow::Type::UINT64, Options::NM_UINT_64 } - , { arrow::Type::INT64, Options::NM_INT_64 } - , { arrow::Type::HALF_FLOAT, Options::NM_FLOAT_16 } - , { arrow::Type::FLOAT, Options::NM_FLOAT_32 } - , { arrow::Type::DOUBLE, Options::NM_FLOAT_64 } - , { arrow::Type::STRING, Options::NM_STRING } - , { arrow::Type::LARGE_STRING, Options::NM_LARGE_STRING } - , { arrow::Type::BINARY, Options::NM_BINARY } - , { arrow::Type::LARGE_BINARY, Options::NM_LARGE_BINARY } - , { arrow::Type::FIXED_SIZE_BINARY, Options::NM_FIXED_BINARY } - , { arrow::Type::DATE32, Options::NM_DATE_32 } - , { arrow::Type::DATE64, Options::NM_DATE_64 } - , { 
arrow::Type::TIMESTAMP, Options::NM_TIMESTAMP } - , { arrow::Type::TIME32, Options::NM_TIME_32 } - , { arrow::Type::TIME64, Options::NM_TIME_64 } - , { arrow::Type::DECIMAL, Options::NM_DECIMAL } - , { arrow::Type::DURATION, Options::NM_DURATION } - , { arrow::Type::INTERVAL_MONTHS, Options::NM_MONTH_INTERVAL } - , { arrow::Type::INTERVAL_DAY_TIME, Options::NM_DAY_TIME_INTERVAL } +inline const ETraits::Options ETraits::options{ + { arrow::Type::NA, arrowkdb::Options::NM_NA } + , { arrow::Type::BOOL, arrowkdb::Options::NM_BOOLEAN } + , { arrow::Type::UINT8, arrowkdb::Options::NM_UINT_8 } + , { arrow::Type::INT8, arrowkdb::Options::NM_INT_8 } + , { arrow::Type::UINT16, arrowkdb::Options::NM_UINT_16 } + , { arrow::Type::INT16, arrowkdb::Options::NM_INT_16 } + , { arrow::Type::UINT32, arrowkdb::Options::NM_UINT_32 } + , { arrow::Type::INT32, arrowkdb::Options::NM_INT_32 } + , { arrow::Type::UINT64, arrowkdb::Options::NM_UINT_64 } + , { arrow::Type::INT64, arrowkdb::Options::NM_INT_64 } + , { arrow::Type::HALF_FLOAT, arrowkdb::Options::NM_FLOAT_16 } + , { arrow::Type::FLOAT, arrowkdb::Options::NM_FLOAT_32 } + , { arrow::Type::DOUBLE, arrowkdb::Options::NM_FLOAT_64 } + , { arrow::Type::STRING, arrowkdb::Options::NM_STRING } + , { arrow::Type::LARGE_STRING, arrowkdb::Options::NM_LARGE_STRING } + , { arrow::Type::BINARY, arrowkdb::Options::NM_BINARY } + , { arrow::Type::LARGE_BINARY, arrowkdb::Options::NM_LARGE_BINARY } + , { arrow::Type::FIXED_SIZE_BINARY, arrowkdb::Options::NM_FIXED_BINARY } + , { arrow::Type::DATE32, arrowkdb::Options::NM_DATE_32 } + , { arrow::Type::DATE64, arrowkdb::Options::NM_DATE_64 } + , { arrow::Type::TIMESTAMP, arrowkdb::Options::NM_TIMESTAMP } + , { arrow::Type::TIME32, arrowkdb::Options::NM_TIME_32 } + , { arrow::Type::TIME64, arrowkdb::Options::NM_TIME_64 } + , { arrow::Type::DECIMAL, arrowkdb::Options::NM_DECIMAL } + , { arrow::Type::DURATION, arrowkdb::Options::NM_DURATION } + , { arrow::Type::INTERVAL_MONTHS, 
arrowkdb::Options::NM_MONTH_INTERVAL } + , { arrow::Type::INTERVAL_DAY_TIME, arrowkdb::Options::NM_DAY_TIME_INTERVAL } }; // Helper class for reading dictionary of options @@ -276,12 +256,6 @@ inline const ETraits::Names ETraits::names // 0 of -KS|-KJ|XD|KC class KdbOptions { -public: - template - inline void NullMappingOption( const std::string& key, K value ); - - using NullMappingHandler = void ( KdbOptions::* )( const std::string&, K ); - using NullMappingHandlers = std::unordered_map; private: Options::NullMapping null_mapping_options; std::map string_options; @@ -292,6 +266,8 @@ class KdbOptions const std::set& supported_dict_options; const std::set& supported_null_mapping_options; + using NullMappingHandler = void ( KdbOptions::* )( const std::string&, K ); + using NullMappingHandlers = std::unordered_map; static const NullMappingHandlers null_mapping_handlers; private: const std::string ToUpper(std::string str) const @@ -337,7 +313,7 @@ class KdbOptions throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING keys (expected=11h), type=" + std::to_string( keys->t ) + "h" ); } if( 0 != values->t ){ - throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING values (extected=0h), type=" ); + throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING values (extected=0h), type=" + std::to_string( values->t ) + "h" ); } for( auto i = 0ll; i < values->n; ++i ){ const std::string key = ToLower( kS( keys )[i] ); @@ -345,8 +321,8 @@ class KdbOptions throw InvalidOption( "Unsupported NULL_MAPPING option '" + key + "'" ); } K value = kK( values )[i]; - arrow::Type::type mapping = ETraits::value( key ); - auto it = null_mapping_handlers.find( mapping ); + auto option = ETraits::option( key ); + auto it = null_mapping_handlers.find( option ); if( it != null_mapping_handlers.end() ){ ( this->*it->second )( key, value ); } @@ -429,7 +405,7 @@ class KdbOptions , const std::set supported_string_options_ , const std::set supported_int_options_ , const 
std::set& supported_dict_options_ = Options::dict_options - , const std::set& supported_null_mapping_options_ = Options::null_mapping_options ) + , const std::set& supported_null_mapping_options_ = ETraits::mappings() ) : null_mapping_options {0} , supported_string_options(supported_string_options_) , supported_int_options(supported_int_options_) @@ -462,6 +438,9 @@ class KdbOptions } } + template + inline void HandleNullMapping( const std::string& key, K value ); + template auto GetNullMappingOption() const { return null_mapping_options.GetOption(); } @@ -505,7 +484,7 @@ inline void null_mapping_error( const std::string& key, K value ) } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { switch( value->t ){ case -KB: @@ -522,7 +501,7 @@ inline void KdbOptions::NullMappingOption( const std::string& } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KG == value->t ){ null_mapping_options.uint8_null = value->g; @@ -534,7 +513,7 @@ inline void KdbOptions::NullMappingOption( const std::string } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KG == value->t ){ null_mapping_options.int8_null = value->g; @@ -546,7 +525,7 @@ inline void KdbOptions::NullMappingOption( const std::string& } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KH == value->t ){ null_mapping_options.uint16_null = value->h; @@ -558,7 +537,7 @@ inline void KdbOptions::NullMappingOption( const std::strin } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void 
KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KH == value->t ){ null_mapping_options.int16_null = value->h; @@ -570,7 +549,7 @@ inline void KdbOptions::NullMappingOption( const std::string } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KI == value->t ){ null_mapping_options.uint32_null = value->i; @@ -582,7 +561,7 @@ inline void KdbOptions::NullMappingOption( const std::strin } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KI == value->t ){ null_mapping_options.int32_null = value->i; @@ -594,7 +573,7 @@ inline void KdbOptions::NullMappingOption( const std::string } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KJ == value->t ){ null_mapping_options.uint64_null = value->j; @@ -606,7 +585,7 @@ inline void KdbOptions::NullMappingOption( const std::strin } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KJ == value->t ){ null_mapping_options.int64_null = value->j; @@ -618,7 +597,7 @@ inline void KdbOptions::NullMappingOption( const std::string } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KH == value->t ){ null_mapping_options.float16_null = value->h; @@ -630,7 +609,7 @@ inline void KdbOptions::NullMappingOption( const std::s } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KE == value->t 
){ null_mapping_options.float32_null = value->e; @@ -642,7 +621,7 @@ inline void KdbOptions::NullMappingOption( const std::string } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KF == value->t ){ null_mapping_options.float64_null = value->f; @@ -654,10 +633,10 @@ inline void KdbOptions::NullMappingOption( const std::strin } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( -KC == value->t ){ - null_mapping_options.string_null.assign( (char*)kC( value ), value->n ); + if( KC == value->t ){ + null_mapping_options.string_null.assign( ( char* )kC( value ), value->n ); null_mapping_options.have_string = true; } else{ @@ -666,10 +645,10 @@ inline void KdbOptions::NullMappingOption( const std::strin } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( -KC == value->t ){ - null_mapping_options.large_string_null.assign( (char*)kC( value ), value->n ); + if( KC == value->t ){ + null_mapping_options.large_string_null.assign( ( char* )kC( value ), value->n ); null_mapping_options.have_large_string = true; } else{ @@ -678,7 +657,7 @@ inline void KdbOptions::NullMappingOption( const std: } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { switch( value->t ){ case KG: @@ -695,7 +674,7 @@ inline void KdbOptions::NullMappingOption( const std::strin } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { switch( value->t ){ case KG: @@ -712,7 +691,7 @@ inline void 
KdbOptions::NullMappingOption( const std: } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { switch( value->t ){ case -UU: @@ -733,7 +712,7 @@ inline void KdbOptions::NullMappingOption( const } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KI == value->t ){ null_mapping_options.date32_null = value->i; @@ -745,7 +724,7 @@ inline void KdbOptions::NullMappingOption( const std::strin } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KJ == value->t ){ null_mapping_options.date64_null = value->j; @@ -757,7 +736,7 @@ inline void KdbOptions::NullMappingOption( const std::strin } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KJ == value->t ){ null_mapping_options.timestamp_null = value->j; @@ -769,7 +748,7 @@ inline void KdbOptions::NullMappingOption( const std::st } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KI == value->t ){ null_mapping_options.time32_null = value->i; @@ -781,7 +760,7 @@ inline void KdbOptions::NullMappingOption( const std::strin } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KJ == value->t ){ null_mapping_options.time64_null = value->j; @@ -793,7 +772,7 @@ inline void KdbOptions::NullMappingOption( const std::strin } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) 
+inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KF == value->t ){ null_mapping_options.decimal_null = value->f; @@ -805,7 +784,7 @@ inline void KdbOptions::NullMappingOption( const std::stri } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KJ == value->t ){ null_mapping_options.duration_null = value->j; @@ -817,7 +796,7 @@ inline void KdbOptions::NullMappingOption( const std::str } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KI == value->t ){ null_mapping_options.month_interval_null = value->i; @@ -829,7 +808,7 @@ inline void KdbOptions::NullMappingOption( const s } template<> -inline void KdbOptions::NullMappingOption( const std::string& key, K value ) +inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KJ == value->t ){ null_mapping_options.day_time_interval_null = value->j; @@ -841,38 +820,38 @@ inline void KdbOptions::NullMappingOption( const } template -auto make_null_mapping() +auto make_handler() { - return std::make_pair( TypeId, &KdbOptions::NullMappingOption ); + return std::make_pair( TypeId, &KdbOptions::HandleNullMapping ); } inline const KdbOptions::NullMappingHandlers KdbOptions::null_mapping_handlers = { - make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , 
make_null_mapping() - , make_null_mapping() - , make_null_mapping() - , make_null_mapping() + make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() }; } // namespace arrowkdb From 958696b8089cff317fa98496340d692b1fa09728 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 30 Jan 2023 17:50:54 +0300 Subject: [PATCH 168/276] Patch #1, getting rid of NA fields --- src/ArrayWriter.cpp | 8 ++++---- src/KdbOptions.h | 13 ++++--------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index 4d298c1..a084ffd 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -451,7 +451,7 @@ void PopulateBuilder(shared_ptr datatype, K if( type_overrides.null_mapping.have_boolean ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.boolean_null != kG( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.boolean_null != static_cast( kG( k_array )[i] ); } PARQUET_THROW_NOT_OK( bool_builder->AppendValues( ( uint8_t* )kG( k_array ), k_array->n, null_bitmap ) ); } @@ -467,7 +467,7 @@ void PopulateBuilder(shared_ptr datatype, K if( type_overrides.null_mapping.have_uint8 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.uint8_null != kG( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.uint8_null != static_cast( kG( k_array )[i] ); } PARQUET_THROW_NOT_OK( uint8_builder->AppendValues( ( uint8_t* )kG( k_array ), 
k_array->n, null_bitmap ) ); } @@ -499,7 +499,7 @@ void PopulateBuilder(shared_ptr datatype, if( type_overrides.null_mapping.have_uint16 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.uint16_null != kH( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.uint16_null != static_cast( kH( k_array )[i] ); } PARQUET_THROW_NOT_OK( uint16_builder->AppendValues( ( uint16_t* )kH( k_array ), k_array->n, null_bitmap ) ); } @@ -595,7 +595,7 @@ void PopulateBuilder(shared_ptr dataty if( type_overrides.null_mapping.have_float16 ){ std::vector null_bitmap( k_array->n ); for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.float16_null != kH( k_array )[i]; + null_bitmap[i] = type_overrides.null_mapping.float16_null != static_cast( kH( k_array )[i] ); } PARQUET_THROW_NOT_OK( hfl_builder->AppendValues( ( uint16_t* )kH( k_array ), k_array->n, null_bitmap ) ); } diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 72a93d1..9f14230 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -76,7 +76,6 @@ namespace Options const std::string NULL_MAPPING = "NULL_MAPPING"; // Null mapping options - const std::string NM_NA = "na"; const std::string NM_BOOLEAN = "bool"; const std::string NM_UINT_8 = "uint8"; const std::string NM_INT_8 = "int8"; @@ -119,7 +118,6 @@ namespace Options struct NullMapping { - bool have_na; bool have_boolean; bool have_uint8; bool have_int8; @@ -149,7 +147,6 @@ namespace Options using Binary = std::basic_string; - void* na_null = nullptr; bool boolean_null; uint8_t uint8_null; @@ -184,8 +181,8 @@ namespace Options int32_t month_interval_null; int64_t day_time_interval_null; - template - inline auto GetOption() const { return std::make_pair( true, na_null );} + template + inline auto GetOption() const; }; template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_boolean, boolean_null ); } @@ -218,8 +215,7 @@ 
namespace Options template<> inline const ETraits::Options ETraits::options{ - { arrow::Type::NA, arrowkdb::Options::NM_NA } - , { arrow::Type::BOOL, arrowkdb::Options::NM_BOOLEAN } + { arrow::Type::BOOL, arrowkdb::Options::NM_BOOLEAN } , { arrow::Type::UINT8, arrowkdb::Options::NM_UINT_8 } , { arrow::Type::INT8, arrowkdb::Options::NM_INT_8 } , { arrow::Type::UINT16, arrowkdb::Options::NM_UINT_16 } @@ -444,8 +440,7 @@ class KdbOptions template auto GetNullMappingOption() const { return null_mapping_options.GetOption(); } - void GetNullMappingOptions( Options::NullMapping& null_mapping ) const - { + void GetNullMappingOptions( Options::NullMapping& null_mapping ) const{ null_mapping = null_mapping_options; } From 900e9f397a1aae83fabea0f3ca821900fb094210 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 30 Jan 2023 20:57:51 +0300 Subject: [PATCH 169/276] Minor subset of readers --- src/ArrayReader.cpp | 50 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 10 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index ced60fd..d2a12c6 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -159,7 +159,15 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto int16_array = static_pointer_cast(array_data); - memcpy(kH(k_array), int16_array->raw_values(), int16_array->length() * sizeof(arrow::Int16Array::value_type)); + if( type_overrides.null_mapping.have_int16 ){ + for( auto i = 0ll; i < int16_array->length(); ++i ){ + kH( k_array )[i] = ( int16_array->IsNull( i ) * type_overrides.null_mapping.int16_null ) + + (!int16_array->IsNull( i ) * int16_array->Value( i ) ); + } + } + else { + memcpy(kH(k_array), int16_array->raw_values(), int16_array->length() * sizeof(arrow::Int16Array::value_type)); + } } template<> @@ -173,7 +181,15 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) 
{ auto int32_array = static_pointer_cast(array_data); - memcpy(kI(k_array), int32_array->raw_values(), int32_array->length() * sizeof(arrow::Int32Array::value_type)); + if( type_overrides.null_mapping.have_int32 ){ + for( auto i = 0ll; i < int32_array->length(); ++i ){ + kH( k_array )[i] = ( int32_array->IsNull( i ) * type_overrides.null_mapping.int32_null ) + + (!int32_array->IsNull( i ) * int32_array->Value( i ) ); + } + } + else { + memcpy(kI(k_array), int32_array->raw_values(), int32_array->length() * sizeof(arrow::Int32Array::value_type)); + } } template<> @@ -216,10 +232,17 @@ void AppendArray(shared_ptr array_data, K k_a { auto str_array = static_pointer_cast(array_data); for (auto i = 0; i < str_array->length(); ++i) { - auto str_data = str_array->GetString(i); - K k_str = ktn(KC, str_data.length()); - memcpy(kG(k_str), str_data.data(), str_data.length()); - kK(k_array)[index++] = k_str; + K k_str = nullptr; + if( type_overrides.null_mapping.have_string ){ + k_str = ktn( KC, type_overrides.null_mapping.string_null.length() ); + memcpy( kG(k_str), type_overrides.null_mapping.string_null.data(), type_overrides.null_mapping.string_null.length() ); + } + else{ + auto str_data = str_array->GetString(i); + k_str = ktn(KC, str_data.length()); + memcpy(kG( k_str ), str_data.data(), str_data.length()); + } + kK( k_array )[index++] = k_str; } } @@ -228,10 +251,17 @@ void AppendArray(shared_ptr array_data, { auto str_array = static_pointer_cast(array_data); for (auto i = 0; i < str_array->length(); ++i) { - auto str_data = str_array->GetString(i); - K k_str = ktn(KC, str_data.length()); - memcpy(kG(k_str), str_data.data(), str_data.length()); - kK(k_array)[index++] = k_str; + K k_str = nullptr; + if( type_overrides.null_mapping.have_large_string ){ + k_str = ktn( KC, type_overrides.null_mapping.large_string_null.length() ); + memcpy( kG( k_str ), type_overrides.null_mapping.large_string_null.data(), type_overrides.null_mapping.large_string_null.length() ); + } + else{ 
+ auto str_data = str_array->GetString(i); + k_str = ktn(KC, str_data.length()); + memcpy(kG(k_str), str_data.data(), str_data.length()); + } + kK( k_array )[index++] = k_str; } } From 17ab0dfc7cf7ca4f817d13722b16a1dd9385ad83 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 31 Jan 2023 11:40:18 +0300 Subject: [PATCH 170/276] Null mapping of several types --- src/ArrayReader.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index d2a12c6..d1413b9 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -183,7 +183,7 @@ void AppendArray(shared_ptr array_data, K k_ar auto int32_array = static_pointer_cast(array_data); if( type_overrides.null_mapping.have_int32 ){ for( auto i = 0ll; i < int32_array->length(); ++i ){ - kH( k_array )[i] = ( int32_array->IsNull( i ) * type_overrides.null_mapping.int32_null ) + kI( k_array )[i] = ( int32_array->IsNull( i ) * type_overrides.null_mapping.int32_null ) + (!int32_array->IsNull( i ) * int32_array->Value( i ) ); } } @@ -233,7 +233,7 @@ void AppendArray(shared_ptr array_data, K k_a auto str_array = static_pointer_cast(array_data); for (auto i = 0; i < str_array->length(); ++i) { K k_str = nullptr; - if( type_overrides.null_mapping.have_string ){ + if( type_overrides.null_mapping.have_string && str_array->IsNull( i ) ){ k_str = ktn( KC, type_overrides.null_mapping.string_null.length() ); memcpy( kG(k_str), type_overrides.null_mapping.string_null.data(), type_overrides.null_mapping.string_null.length() ); } @@ -252,7 +252,7 @@ void AppendArray(shared_ptr array_data, auto str_array = static_pointer_cast(array_data); for (auto i = 0; i < str_array->length(); ++i) { K k_str = nullptr; - if( type_overrides.null_mapping.have_large_string ){ + if( type_overrides.null_mapping.have_large_string && str_array->IsNull( i ) ){ k_str = ktn( KC, type_overrides.null_mapping.large_string_null.length() ); memcpy( kG( k_str ), 
type_overrides.null_mapping.large_string_null.data(), type_overrides.null_mapping.large_string_null.length() ); } From 50a8e462a02cb51ea82e9954ede6f55e54d9e698 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 31 Jan 2023 12:15:21 +0300 Subject: [PATCH 171/276] Replacing duplicating type accessors --- src/KdbOptions.h | 52 ++++++++++++++++-------------------------------- 1 file changed, 17 insertions(+), 35 deletions(-) diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 9f14230..ba9847b 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -481,16 +481,11 @@ inline void null_mapping_error( const std::string& key, K value ) template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - switch( value->t ){ - case -KB: - null_mapping_options.boolean_null = value->g; - null_mapping_options.have_boolean = true; - break; - case -KG: + if( value->t == -KB || value->t == -KG ){ null_mapping_options.boolean_null = value->g; null_mapping_options.have_boolean = true; - break; - default: + } + else{ null_mapping_error( key, value ); } } @@ -499,7 +494,7 @@ template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KG == value->t ){ - null_mapping_options.uint8_null = value->g; + null_mapping_options.uint8_null = static_cast( value->g ); null_mapping_options.have_uint8 = true; } else{ @@ -523,7 +518,7 @@ template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KH == value->t ){ - null_mapping_options.uint16_null = value->h; + null_mapping_options.uint16_null = static_cast( value->h ); null_mapping_options.have_uint16 = true; } else{ @@ -547,7 +542,7 @@ template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KI == value->t ){ - null_mapping_options.uint32_null = value->i; + null_mapping_options.uint32_null = static_cast( value->i ); null_mapping_options.have_uint32 = true; } else{ @@ -571,7 +566,7 @@ template<> inline void 
KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KJ == value->t ){ - null_mapping_options.uint64_null = value->j; + null_mapping_options.uint64_null = static_cast( value->j ); null_mapping_options.have_uint64 = true; } else{ @@ -595,7 +590,7 @@ template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { if( -KH == value->t ){ - null_mapping_options.float16_null = value->h; + null_mapping_options.float16_null = static_cast( value->h ); null_mapping_options.have_float16 = true; } else{ @@ -654,16 +649,11 @@ inline void KdbOptions::HandleNullMapping( const std: template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - switch( value->t ){ - case KG: - null_mapping_options.binary_null.assign( kG( value ), value->n ); - null_mapping_options.have_binary = true; - break; - case KC: - null_mapping_options.binary_null.assign( kC( value ), value->n ); - null_mapping_options.have_binary = true; - break; - default: + if( value->t == KG || value->t == KC ){ + null_mapping_options.binary_null.assign( kG( value ), value->n ); + null_mapping_options.have_binary = true; + } + else{ null_mapping_error( key, value ); } } @@ -671,16 +661,11 @@ inline void KdbOptions::HandleNullMapping( const std::strin template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - switch( value->t ){ - case KG: + if( value->t == KG || value->t == KC ){ null_mapping_options.large_binary_null.assign( kG( value ), value->n ); null_mapping_options.have_large_binary = true; - break; - case KC: - null_mapping_options.large_binary_null.assign( kC( value ), value->n ); - null_mapping_options.have_large_binary = true; - break; - default: + } + else{ null_mapping_error( key, value ); } } @@ -694,11 +679,8 @@ inline void KdbOptions::HandleNullMapping( const null_mapping_options.have_fixed_binary = true; break; case KG: - null_mapping_options.fixed_binary_null.assign( kG( value ), value->n ); - 
null_mapping_options.have_fixed_binary = true; - break; case KC: - null_mapping_options.fixed_binary_null.assign( kC( value ), value->n ); + null_mapping_options.fixed_binary_null.assign( kG( value ), value->n ); null_mapping_options.have_fixed_binary = true; break; default: From 06038323455fb04f6ec3498f7cc700071874f4bf Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 31 Jan 2023 12:34:44 +0300 Subject: [PATCH 172/276] Temporal type options --- src/KdbOptions.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/KdbOptions.h b/src/KdbOptions.h index ba9847b..01b4afd 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -691,7 +691,7 @@ inline void KdbOptions::HandleNullMapping( const template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( -KI == value->t ){ + if( value->t == -KI || value->t == -KD ){ null_mapping_options.date32_null = value->i; null_mapping_options.have_date32 = true; } @@ -703,7 +703,7 @@ inline void KdbOptions::HandleNullMapping( const std::strin template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( -KJ == value->t ){ + if( value->t == -KJ || value->t == -KP ){ null_mapping_options.date64_null = value->j; null_mapping_options.have_date64 = true; } @@ -715,7 +715,7 @@ inline void KdbOptions::HandleNullMapping( const std::strin template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( -KJ == value->t ){ + if( value->t == -KJ || value->t == -KP ){ null_mapping_options.timestamp_null = value->j; null_mapping_options.have_timestamp = true; } @@ -727,7 +727,7 @@ inline void KdbOptions::HandleNullMapping( const std::st template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( -KI == value->t ){ + if( value->t == -KI || value->t == -KT ){ null_mapping_options.time32_null = value->i; null_mapping_options.have_time32 = true; } @@ -739,7 +739,7 @@ inline 
void KdbOptions::HandleNullMapping( const std::strin template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( -KJ == value->t ){ + if( value->t == -KJ || value->t == -KN ){ null_mapping_options.time64_null = value->j; null_mapping_options.have_time64 = true; } @@ -763,7 +763,7 @@ inline void KdbOptions::HandleNullMapping( const std::stri template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( -KJ == value->t ){ + if( value->t == -KJ || value->t == -KN ){ null_mapping_options.duration_null = value->j; null_mapping_options.have_duration = true; } @@ -775,7 +775,7 @@ inline void KdbOptions::HandleNullMapping( const std::str template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( -KI == value->t ){ + if( value->t == -KI || value->t == -KM ){ null_mapping_options.month_interval_null = value->i; null_mapping_options.have_month_interval = true; } @@ -787,7 +787,7 @@ inline void KdbOptions::HandleNullMapping( const s template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( -KJ == value->t ){ + if( value->t == -KJ || value->t == -KN ){ null_mapping_options.day_time_interval_null = value->j; null_mapping_options.have_day_time_interval = true; } From 7eeaaa2e31f2771cdb3d21f3002af9bf4925f6a7 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 31 Jan 2023 13:20:46 +0300 Subject: [PATCH 173/276] Replacing null_mapping integers with temporals --- examples/null_mapping.q | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/null_mapping.q b/examples/null_mapping.q index 629672c..55b3a37 100644 --- a/examples/null_mapping.q +++ b/examples/null_mapping.q @@ -11,8 +11,8 @@ short_opts:(`bool`uint8`int8`uint16`int16)!(0b;0x01;0x02;3h;4h); long_opts:(`uint32`int32`uint64`int64)!(5i;6i;7;8); float_opts:(`float32`float64`decimal)!(1.23e;4.56;7.89); 
str_opts:(`string`large_string`binary`large_binary`fixed_binary)!("start";"stop";"x"$"alert";"x"$"acknowledge";0Ng); -time_opts:(`date32`date64`timestamp`time64`duration)!("i"$2006.07.21;"j"$2015.01.01D00:00:00.000000000;"j"$2011.01.01D00:00:00.000000000;"j"$12:00:00.000000000;"j"$12:00:00.000000000); -other_opts:(`float16`time32`month_interval`day_time_interval)!(9h;"i"$09:01:02.042;"i"$2006.07m;"j"$12:00:00.000000000); +time_opts:(`date32`date64`timestamp`time64`duration)!(2006.07.21;2015.01.01D00:00:00.000000000;2011.01.01D00:00:00.000000000;12:00:00.000000000;12:00:00.000000000); +other_opts:(`float16`time32`month_interval`day_time_interval)!(9h;09:01:02.042;2006.07m;12:00:00.000000000); options:(``NULL_MAPPING)!((::);short_opts,long_opts,float_opts,str_opts,time_opts,other_opts); From ed747b5e9c2d5a7147dd8015025969c37a38bee8 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 31 Jan 2023 17:02:55 +0300 Subject: [PATCH 174/276] Integer types expansion --- src/ArrayReader.cpp | 62 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 55 insertions(+), 7 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index d1413b9..0d36b2e 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -138,21 +138,45 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint8_array = static_pointer_cast(array_data); - memcpy(kG(k_array), uint8_array->raw_values(), uint8_array->length() * sizeof(arrow::UInt8Array::value_type)); + if( type_overrides.null_mapping.have_uint8 ){ + for( auto i = 0ll; i < uint8_array->length(); ++i ){ + kG( k_array )[i] = ( uint8_array->IsNull( i ) * type_overrides.null_mapping.uint8_null ) + + ( !uint8_array->IsNull( i ) * uint8_array->Value( i ) ); + } + } + else { + memcpy(kG(k_array), uint8_array->raw_values(), uint8_array->length() * sizeof(arrow::UInt8Array::value_type)); + } } template<> void AppendArray(shared_ptr array_data, K k_array, 
size_t& index, TypeMappingOverride& type_overrides) { auto int8_array = static_pointer_cast(array_data); - memcpy(kG(k_array), int8_array->raw_values(), int8_array->length() * sizeof(arrow::Int8Array::value_type)); + if( type_overrides.null_mapping.have_int8 ){ + for( auto i = 0ll; i < int8_array->length(); ++i ){ + kG( k_array )[i] = ( int8_array->IsNull( i ) * type_overrides.null_mapping.int8_null ) + + ( !int8_array->IsNull( i ) * int8_array->Value( i ) ); + } + } + else { + memcpy(kG(k_array), int8_array->raw_values(), int8_array->length() * sizeof(arrow::Int8Array::value_type)); + } } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint16_array = static_pointer_cast(array_data); - memcpy(kH(k_array), uint16_array->raw_values(), uint16_array->length() * sizeof(arrow::UInt16Array::value_type)); + if( type_overrides.null_mapping.have_uint16 ){ + for( auto i = 0ll; i < uint16_array->length(); ++i ){ + kH( k_array )[i] = ( uint16_array->IsNull( i ) * type_overrides.null_mapping.uint16_null ) + + ( !uint16_array->IsNull( i ) * uint16_array->Value( i ) ); + } + } + else { + memcpy(kH(k_array), uint16_array->raw_values(), uint16_array->length() * sizeof(arrow::UInt16Array::value_type)); + } } template<> @@ -162,7 +186,7 @@ void AppendArray(shared_ptr array_data, K k_ar if( type_overrides.null_mapping.have_int16 ){ for( auto i = 0ll; i < int16_array->length(); ++i ){ kH( k_array )[i] = ( int16_array->IsNull( i ) * type_overrides.null_mapping.int16_null ) - + (!int16_array->IsNull( i ) * int16_array->Value( i ) ); + + ( !int16_array->IsNull( i ) * int16_array->Value( i ) ); } } else { @@ -174,7 +198,15 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint32_array = static_pointer_cast(array_data); - memcpy(kI(k_array), uint32_array->raw_values(), uint32_array->length() * sizeof(arrow::UInt32Array::value_type)); + if( 
type_overrides.null_mapping.have_uint32 ){ + for( auto i = 0ll; i < uint32_array->length(); ++i ){ + kI( k_array )[i] = ( uint32_array->IsNull( i ) * type_overrides.null_mapping.uint32_null ) + + ( !uint32_array->IsNull( i ) * uint32_array->Value( i ) ); + } + } + else { + memcpy(kI(k_array), uint32_array->raw_values(), uint32_array->length() * sizeof(arrow::UInt32Array::value_type)); + } } template<> @@ -196,14 +228,30 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint64_array = static_pointer_cast(array_data); - memcpy(kJ(k_array), uint64_array->raw_values(), uint64_array->length() * sizeof(arrow::UInt64Array::value_type)); + if( type_overrides.null_mapping.have_uint64 ){ + for( auto i = 0ll; i < uint64_array->length(); ++i ){ + kJ( k_array )[i] = ( uint64_array->IsNull( i ) * type_overrides.null_mapping.uint64_null ) + + ( !uint64_array->IsNull( i ) * uint64_array->Value( i ) ); + } + } + else { + memcpy(kJ(k_array), uint64_array->raw_values(), uint64_array->length() * sizeof(arrow::UInt64Array::value_type)); + } } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto int64_array = static_pointer_cast(array_data); - memcpy(kJ(k_array), int64_array->raw_values(), int64_array->length() * sizeof(arrow::Int64Array::value_type)); + if( type_overrides.null_mapping.have_int32 ){ + for( auto i = 0ll; i < int64_array->length(); ++i ){ + kJ( k_array )[i] = ( int64_array->IsNull( i ) * type_overrides.null_mapping.int64_null ) + + (!int64_array->IsNull( i ) * int64_array->Value( i ) ); + } + } + else { + memcpy(kJ(k_array), int64_array->raw_values(), int64_array->length() * sizeof(arrow::Int64Array::value_type)); + } } template<> From dcb8a164be32b14fd9e370f6e20469b77780e84a Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 31 Jan 2023 18:25:56 +0300 Subject: [PATCH 175/276] Boolean and floats expansion --- 
src/ArrayReader.cpp | 42 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 0d36b2e..41ffcd7 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -130,8 +130,11 @@ void AppendArray(shared_ptr array_data, K k_arr { auto bool_array = static_pointer_cast(array_data); // BooleanArray doesn't have a bulk reader since arrow BooleanType is only 1 bit - for (auto i = 0; i < bool_array->length(); ++i) - kG(k_array)[index++] = bool_array->Value(i); + for (auto i = 0; i < bool_array->length(); ++i){ + kG(k_array)[index++] = // preventing branch prediction failures + ( ( type_overrides.null_mapping.have_boolean && bool_array->IsNull( i ) ) * type_overrides.null_mapping.float16_null ) + + ( !( type_overrides.null_mapping.have_boolean && bool_array->IsNull( i ) ) * bool_array->Value( i ) ); + } } template<> @@ -258,21 +261,45 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto hfl_array = static_pointer_cast(array_data); - memcpy(kH(k_array), hfl_array->raw_values(), hfl_array->length() * sizeof(arrow::HalfFloatArray::value_type)); + if( type_overrides.null_mapping.have_float16 ){ + for( auto i = 0ll; i < hfl_array->length(); ++i ){ + kH( k_array )[i] = ( hfl_array->IsNull( i ) * type_overrides.null_mapping.float16_null ) + + ( !hfl_array->IsNull( i ) * hfl_array->Value( i ) ); + } + } + else { + memcpy(kH(k_array), hfl_array->raw_values(), hfl_array->length() * sizeof(arrow::HalfFloatArray::value_type)); + } } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto fl_array = static_pointer_cast(array_data); - memcpy(kE(k_array), fl_array->raw_values(), fl_array->length() * sizeof(arrow::FloatArray::value_type)); + if( type_overrides.null_mapping.have_float32 ){ + for( auto i = 0ll; i < fl_array->length(); ++i ){ + kE( k_array )[i] = 
( fl_array->IsNull( i ) * type_overrides.null_mapping.float32_null ) + + ( !fl_array->IsNull( i ) * fl_array->Value( i ) ); + } + } + else { + memcpy(kE(k_array), fl_array->raw_values(), fl_array->length() * sizeof(arrow::FloatArray::value_type)); + } } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto dbl_array = static_pointer_cast(array_data); - memcpy(kF(k_array), dbl_array->raw_values(), dbl_array->length() * sizeof(arrow::DoubleArray::value_type)); + if( type_overrides.null_mapping.have_float64 ){ + for( auto i = 0ll; i < dbl_array->length(); ++i ){ + kF( k_array )[i] = ( dbl_array->IsNull( i ) * type_overrides.null_mapping.float64_null ) + + ( !dbl_array->IsNull( i ) * dbl_array->Value( i ) ); + } + } + else { + memcpy(kF(k_array), dbl_array->raw_values(), dbl_array->length() * sizeof(arrow::DoubleArray::value_type)); + } } template<> @@ -406,7 +433,10 @@ void AppendArray(shared_ptr array_data, K k_ auto decimal = arrow::Decimal128(dec_array->Value(i)); if (type_overrides.decimal128_as_double) { // Convert the decimal to a double - auto dec_as_double = decimal.ToDouble(dec_type->scale()); + auto dec_as_double = + ( ( type_overrides.null_mapping.have_decimal && dec_array->IsNull( i ) ) * type_overrides.null_mapping.decimal_null ) + + ( !( type_overrides.null_mapping.have_decimal && dec_array->IsNull( i ) ) * decimal.ToDouble( dec_type->scale() ) ); + kF(k_array)[index++] = dec_as_double; } else { // Each decimal is a list of 16 bytes From fc6f4b6cf456e5a90b31f29d5c3729da30c1302e Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 1 Feb 2023 11:14:06 +0300 Subject: [PATCH 176/276] Strings and binaries expansion --- src/ArrayReader.cpp | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 41ffcd7..35f3021 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -345,9 +345,16 @@ 
void AppendArray(shared_ptr array_data, K k_a { auto bin_array = static_pointer_cast(array_data); for (auto i = 0; i < bin_array->length(); ++i) { - auto bin_data = bin_array->GetString(i); - K k_bin = ktn(KG, bin_data.length()); - memcpy(kG(k_bin), bin_data.data(), bin_data.length()); + K k_bin = nullptr; + if( type_overrides.null_mapping.have_binary && bin_array->IsNull( i ) ){ + k_bin = ktn( KG, type_overrides.null_mapping.binary_null.length() ); + memcpy( kG( k_bin ), type_overrides.null_mapping.binary_null.data(), type_overrides.null_mapping.binary_null.length() ); + } + else{ + auto bin_data = bin_array->GetString(i); + k_bin = ktn(KG, bin_data.length()); + memcpy(kG(k_bin), bin_data.data(), bin_data.length()); + } kK(k_array)[index++] = k_bin; } } @@ -357,9 +364,16 @@ void AppendArray(shared_ptr array_data, { auto bin_array = static_pointer_cast(array_data); for (auto i = 0; i < bin_array->length(); ++i) { - auto bin_data = bin_array->GetString(i); - K k_bin = ktn(KG, bin_data.length()); - memcpy(kG(k_bin), bin_data.data(), bin_data.length()); + K k_bin = nullptr; + if( type_overrides.null_mapping.have_large_binary && bin_array->IsNull( i ) ){ + k_bin = ktn( KG, type_overrides.null_mapping.large_binary_null.length() ); + memcpy( kG( k_bin ), type_overrides.null_mapping.large_binary_null.data(), type_overrides.null_mapping.large_binary_null.length() ); + } + else{ + auto bin_data = bin_array->GetString(i); + k_bin = ktn(KG, bin_data.length()); + memcpy(kG(k_bin), bin_data.data(), bin_data.length()); + } kK(k_array)[index++] = k_bin; } } @@ -369,9 +383,16 @@ void AppendArray(shared_ptr array_ { auto fixed_bin_array = static_pointer_cast(array_data); for (auto i = 0; i < fixed_bin_array->length(); ++i) { - auto bin_data = fixed_bin_array->GetString(i); - K k_bin = ktn(KG, bin_data.length()); - memcpy(kG(k_bin), bin_data.data(), bin_data.length()); + K k_bin = nullptr; + if( type_overrides.null_mapping.have_fixed_binary && fixed_bin_array->IsNull( i ) ){ + k_bin 
= ktn( KG, type_overrides.null_mapping.fixed_binary_null.length() ); + memcpy( kG( k_bin ), type_overrides.null_mapping.fixed_binary_null.data(), type_overrides.null_mapping.fixed_binary_null.length() ); + } + else{ + auto bin_data = fixed_bin_array->GetString(i); + k_bin = ktn(KG, bin_data.length()); + memcpy(kG(k_bin), bin_data.data(), bin_data.length()); + } kK(k_array)[index++] = k_bin; } } From f423c874b9403ae91bffbaa276a1d107d78182da Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 1 Feb 2023 16:34:45 +0300 Subject: [PATCH 177/276] Temporals expansion --- src/ArrayReader.cpp | 61 +++++++++++++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 16 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 35f3021..d037b0e 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -132,7 +132,7 @@ void AppendArray(shared_ptr array_data, K k_arr // BooleanArray doesn't have a bulk reader since arrow BooleanType is only 1 bit for (auto i = 0; i < bool_array->length(); ++i){ kG(k_array)[index++] = // preventing branch prediction failures - ( ( type_overrides.null_mapping.have_boolean && bool_array->IsNull( i ) ) * type_overrides.null_mapping.float16_null ) + ( ( type_overrides.null_mapping.have_boolean && bool_array->IsNull( i ) ) * type_overrides.null_mapping.boolean_null ) + ( !( type_overrides.null_mapping.have_boolean && bool_array->IsNull( i ) ) * bool_array->Value( i ) ); } } @@ -402,8 +402,11 @@ void AppendArray(shared_ptr array_data, K k_a { TemporalConversion tc(array_data->type()); auto d32_array = static_pointer_cast(array_data); - for (auto i = 0; i < d32_array->length(); ++i) - kI(k_array)[index++] = tc.ArrowToKdb(d32_array->Value(i)); + for (auto i = 0; i < d32_array->length(); ++i){ + kI( k_array )[index++] = + ( ( type_overrides.null_mapping.have_date32 && d32_array->IsNull( i ) ) * type_overrides.null_mapping.date32_null ) + + ( !( type_overrides.null_mapping.have_date32 && d32_array->IsNull( i ) ) * 
tc.ArrowToKdb( d32_array->Value( i ) ) ); + } } template<> @@ -411,8 +414,11 @@ void AppendArray(shared_ptr array_data, K k_a { TemporalConversion tc(array_data->type()); auto d64_array = static_pointer_cast(array_data); - for (auto i = 0; i < d64_array->length(); ++i) - kJ(k_array)[index++] = tc.ArrowToKdb(d64_array->Value(i)); + for (auto i = 0; i < d64_array->length(); ++i){ + kJ( k_array )[index++] = + ( ( type_overrides.null_mapping.have_date64 && d64_array->IsNull( i ) ) * type_overrides.null_mapping.date64_null ) + + ( !( type_overrides.null_mapping.have_date64 && d64_array->IsNull( i ) ) * tc.ArrowToKdb( d64_array->Value( i ) ) ); + } } template<> @@ -421,8 +427,11 @@ void AppendArray(shared_ptr array_data, K TemporalConversion tc(array_data->type()); auto ts_array = static_pointer_cast(array_data); auto timestamp_type = static_pointer_cast(ts_array->type()); - for (auto i = 0; i < ts_array->length(); ++i) - kJ(k_array)[index++] = tc.ArrowToKdb(ts_array->Value(i)); + for (auto i = 0; i < ts_array->length(); ++i){ + kJ( k_array )[index++] = + ( ( type_overrides.null_mapping.have_timestamp && ts_array->IsNull( i ) ) * type_overrides.null_mapping.timestamp_null ) + + ( !( type_overrides.null_mapping.have_timestamp && ts_array->IsNull( i ) ) * tc.ArrowToKdb( ts_array->Value( i ) ) ); + } } template<> @@ -431,8 +440,11 @@ void AppendArray(shared_ptr array_data, K k_a TemporalConversion tc(array_data->type()); auto t32_array = static_pointer_cast(array_data); auto time32_type = static_pointer_cast(t32_array->type()); - for (auto i = 0; i < t32_array->length(); ++i) - kI(k_array)[index++] = tc.ArrowToKdb(t32_array->Value(i)); + for (auto i = 0; i < t32_array->length(); ++i){ + kI( k_array )[index++] = + ( ( type_overrides.null_mapping.have_time32 && t32_array->IsNull( i ) ) * type_overrides.null_mapping.time32_null ) + + ( !( type_overrides.null_mapping.have_time32 && t32_array->IsNull( i ) ) * tc.ArrowToKdb( t32_array->Value( i ) ) ); + } } template<> @@ -441,8 
+453,11 @@ void AppendArray(shared_ptr array_data, K k_a TemporalConversion tc(array_data->type()); auto t64_array = static_pointer_cast(array_data); auto time64_type = static_pointer_cast(t64_array->type()); - for (auto i = 0; i < t64_array->length(); ++i) - kJ(k_array)[index++] = tc.ArrowToKdb(t64_array->Value(i)); + for (auto i = 0; i < t64_array->length(); ++i){ + kJ( k_array )[index++] = + ( ( type_overrides.null_mapping.have_time64 && t64_array->IsNull( i ) ) * type_overrides.null_mapping.time64_null ) + + ( !( type_overrides.null_mapping.have_time64 && t64_array->IsNull( i ) ) * tc.ArrowToKdb( t64_array->Value( i ) ) ); + } } template<> @@ -474,23 +489,37 @@ void AppendArray(shared_ptr array_data, K k TemporalConversion tc(array_data->type()); auto dur_array = static_pointer_cast(array_data); auto duration_type = static_pointer_cast(dur_array->type()); - for (auto i = 0; i < dur_array->length(); ++i) - kJ(k_array)[index++] = tc.ArrowToKdb(dur_array->Value(i)); + for (auto i = 0; i < dur_array->length(); ++i){ + kJ( k_array )[index++] = + ( ( type_overrides.null_mapping.have_duration && dur_array->IsNull( i ) ) * type_overrides.null_mapping.duration_null ) + + ( !( type_overrides.null_mapping.have_duration && dur_array->IsNull( i ) ) * tc.ArrowToKdb( dur_array->Value( i ) ) ); + } } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto month_array = static_pointer_cast(array_data); - memcpy(kI(k_array), month_array->raw_values(), month_array->length() * sizeof(arrow::MonthIntervalArray::value_type)); + if( type_overrides.null_mapping.have_month_interval ){ + for( auto i = 0ll; i < month_array->length(); ++i ){ + kI( k_array )[i] = ( month_array->IsNull( i ) * type_overrides.null_mapping.month_interval_null ) + + ( !month_array->IsNull( i ) * month_array->Value( i ) ); + } + } + else { + memcpy(kI(k_array), month_array->raw_values(), month_array->length() * 
sizeof(arrow::MonthIntervalArray::value_type)); + } } template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto dt_array = static_pointer_cast(array_data); - for (auto i = 0; i < dt_array->length(); ++i) - kJ(k_array)[index++] = DayTimeInterval_KTimespan(dt_array->Value(i)); + for (auto i = 0; i < dt_array->length(); ++i){ + kJ( k_array )[index++] = + ( ( type_overrides.null_mapping.have_day_time_interval && dt_array->IsNull( i ) ) * type_overrides.null_mapping.day_time_interval_null ) + + ( !( type_overrides.null_mapping.have_day_time_interval && dt_array->IsNull( i ) ) * DayTimeInterval_KTimespan( dt_array->Value( i ) ) ); + } } template<> From fb3faa802a465c60b03c8b55542478e0bf7e74c9 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 1 Feb 2023 16:53:22 +0300 Subject: [PATCH 178/276] Counting nulls for bulk copying --- src/ArrayReader.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index d037b0e..e748f14 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -141,7 +141,7 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint8_array = static_pointer_cast(array_data); - if( type_overrides.null_mapping.have_uint8 ){ + if( type_overrides.null_mapping.have_uint8 && uint8_array->null_count() ){ for( auto i = 0ll; i < uint8_array->length(); ++i ){ kG( k_array )[i] = ( uint8_array->IsNull( i ) * type_overrides.null_mapping.uint8_null ) + ( !uint8_array->IsNull( i ) * uint8_array->Value( i ) ); @@ -156,7 +156,7 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto int8_array = static_pointer_cast(array_data); - if( type_overrides.null_mapping.have_int8 ){ + if( type_overrides.null_mapping.have_int8 && int8_array->null_count() ){ for( auto i = 0ll; i < 
int8_array->length(); ++i ){ kG( k_array )[i] = ( int8_array->IsNull( i ) * type_overrides.null_mapping.int8_null ) + ( !int8_array->IsNull( i ) * int8_array->Value( i ) ); @@ -171,7 +171,7 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint16_array = static_pointer_cast(array_data); - if( type_overrides.null_mapping.have_uint16 ){ + if( type_overrides.null_mapping.have_uint16 && uint16_array->null_count() ){ for( auto i = 0ll; i < uint16_array->length(); ++i ){ kH( k_array )[i] = ( uint16_array->IsNull( i ) * type_overrides.null_mapping.uint16_null ) + ( !uint16_array->IsNull( i ) * uint16_array->Value( i ) ); @@ -186,7 +186,7 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto int16_array = static_pointer_cast(array_data); - if( type_overrides.null_mapping.have_int16 ){ + if( type_overrides.null_mapping.have_int16 && int16_array->null_count() ){ for( auto i = 0ll; i < int16_array->length(); ++i ){ kH( k_array )[i] = ( int16_array->IsNull( i ) * type_overrides.null_mapping.int16_null ) + ( !int16_array->IsNull( i ) * int16_array->Value( i ) ); @@ -201,7 +201,7 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint32_array = static_pointer_cast(array_data); - if( type_overrides.null_mapping.have_uint32 ){ + if( type_overrides.null_mapping.have_uint32 && uint32_array->null_count() ){ for( auto i = 0ll; i < uint32_array->length(); ++i ){ kI( k_array )[i] = ( uint32_array->IsNull( i ) * type_overrides.null_mapping.uint32_null ) + ( !uint32_array->IsNull( i ) * uint32_array->Value( i ) ); @@ -216,7 +216,7 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto int32_array = static_pointer_cast(array_data); - if( type_overrides.null_mapping.have_int32 ){ + if( 
type_overrides.null_mapping.have_int32 && int32_array->null_count() ){ for( auto i = 0ll; i < int32_array->length(); ++i ){ kI( k_array )[i] = ( int32_array->IsNull( i ) * type_overrides.null_mapping.int32_null ) + (!int32_array->IsNull( i ) * int32_array->Value( i ) ); @@ -231,7 +231,7 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto uint64_array = static_pointer_cast(array_data); - if( type_overrides.null_mapping.have_uint64 ){ + if( type_overrides.null_mapping.have_uint64 && uint64_array->null_count() ){ for( auto i = 0ll; i < uint64_array->length(); ++i ){ kJ( k_array )[i] = ( uint64_array->IsNull( i ) * type_overrides.null_mapping.uint64_null ) + ( !uint64_array->IsNull( i ) * uint64_array->Value( i ) ); @@ -246,7 +246,7 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto int64_array = static_pointer_cast(array_data); - if( type_overrides.null_mapping.have_int32 ){ + if( type_overrides.null_mapping.have_int32 && int64_array->null_count() ){ for( auto i = 0ll; i < int64_array->length(); ++i ){ kJ( k_array )[i] = ( int64_array->IsNull( i ) * type_overrides.null_mapping.int64_null ) + (!int64_array->IsNull( i ) * int64_array->Value( i ) ); @@ -261,7 +261,7 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto hfl_array = static_pointer_cast(array_data); - if( type_overrides.null_mapping.have_float16 ){ + if( type_overrides.null_mapping.have_float16 && hfl_array->null_count() ){ for( auto i = 0ll; i < hfl_array->length(); ++i ){ kH( k_array )[i] = ( hfl_array->IsNull( i ) * type_overrides.null_mapping.float16_null ) + ( !hfl_array->IsNull( i ) * hfl_array->Value( i ) ); @@ -276,7 +276,7 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto fl_array = static_pointer_cast(array_data); 
- if( type_overrides.null_mapping.have_float32 ){ + if( type_overrides.null_mapping.have_float32 && fl_array->null_count() ){ for( auto i = 0ll; i < fl_array->length(); ++i ){ kE( k_array )[i] = ( fl_array->IsNull( i ) * type_overrides.null_mapping.float32_null ) + ( !fl_array->IsNull( i ) * fl_array->Value( i ) ); @@ -291,7 +291,7 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto dbl_array = static_pointer_cast(array_data); - if( type_overrides.null_mapping.have_float64 ){ + if( type_overrides.null_mapping.have_float64 && dbl_array->null_count() ){ for( auto i = 0ll; i < dbl_array->length(); ++i ){ kF( k_array )[i] = ( dbl_array->IsNull( i ) * type_overrides.null_mapping.float64_null ) + ( !dbl_array->IsNull( i ) * dbl_array->Value( i ) ); From 83b0b590035c6530d5efa5deb00603efe23610da Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 7 Feb 2023 10:32:35 +0300 Subject: [PATCH 179/276] Clean up option getters --- src/ArrayReader.cpp | 4 ++-- src/ArrayWriter.cpp | 14 -------------- src/HelperFunctions.h | 18 ++++++++++++++++++ src/KdbOptions.h | 32 -------------------------------- 4 files changed, 20 insertions(+), 48 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index e748f14..30e34b4 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -246,7 +246,7 @@ template<> void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { auto int64_array = static_pointer_cast(array_data); - if( type_overrides.null_mapping.have_int32 && int64_array->null_count() ){ + if( type_overrides.null_mapping.have_int64 && int64_array->null_count() ){ for( auto i = 0ll; i < int64_array->length(); ++i ){ kJ( k_array )[i] = ( int64_array->IsNull( i ) * type_overrides.null_mapping.int64_null ) + (!int64_array->IsNull( i ) * int64_array->Value( i ) ); @@ -500,7 +500,7 @@ template<> void AppendArray(shared_ptr array_data, K k_array, 
size_t& index, TypeMappingOverride& type_overrides) { auto month_array = static_pointer_cast(array_data); - if( type_overrides.null_mapping.have_month_interval ){ + if( type_overrides.null_mapping.have_month_interval && month_array->null_count() ){ for( auto i = 0ll; i < month_array->length(); ++i ){ kI( k_array )[i] = ( month_array->IsNull( i ) * type_overrides.null_mapping.month_interval_null ) + ( !month_array->IsNull( i ) * month_array->Value( i ) ); diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index a084ffd..e29a197 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -1,9 +1,7 @@ #include -#include #include #include #include -#include #include #include @@ -22,18 +20,6 @@ using namespace kx::arrowkdb; namespace { -//! Compares floating point numbers, because of unreliable direct compare -//! @param lhs - left-hand side value -//! @param rhs - right-hand side value -//! @return true if values are nearby -template -bool is_equal( T lhs, T rhs ) -{ - static const T epsilon = 2 * std::numeric_limits::epsilon(); - - return ::fabs( lhs -= rhs ) <= epsilon; -} - shared_ptr GetBuilder(shared_ptr datatype); template diff --git a/src/HelperFunctions.h b/src/HelperFunctions.h index 201707a..f48f8f1 100644 --- a/src/HelperFunctions.h +++ b/src/HelperFunctions.h @@ -1,6 +1,8 @@ #ifndef __HELPER_FUNCTIONS_H__ #define __HELPER_FUNCTIONS_H__ +#include +#include #include #include @@ -71,6 +73,22 @@ bool IsKdbString(K str); const std::string GetKdbString(K str); +//////////////////// +// FLOATS COMPARE // +//////////////////// + +//! Compares floating point numbers, because of unreliable direct compare +//! @param lhs - left-hand side value +//! @param rhs - right-hand side value +//! 
@return true if values are nearby +template +inline bool is_equal( T lhs, T rhs ) +{ + static const T epsilon = 2 * std::numeric_limits::epsilon(); + + return ::fabs( lhs -= rhs ) <= epsilon; +} + ////////////////// // TYPE MAPPING // ////////////////// diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 01b4afd..d85989d 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -180,37 +180,8 @@ namespace Options int64_t duration_null; int32_t month_interval_null; int64_t day_time_interval_null; - - template - inline auto GetOption() const; }; - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_boolean, boolean_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_uint8, uint8_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_int8, int8_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_uint16, uint16_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_int16, int16_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_uint32, uint32_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_int32, int32_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_uint64, uint64_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_int64, int64_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_float16, float16_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_float32, float32_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_float64, float64_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_string, string_null ); } - template<> inline 
auto NullMapping::GetOption() const{ return std::make_pair( have_large_string, large_string_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_binary, binary_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_large_binary, large_binary_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_fixed_binary, fixed_binary_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_date32, date32_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_date64, date64_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_timestamp, timestamp_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_time32, time32_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_time64, time64_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_decimal, decimal_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_duration, duration_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_month_interval, month_interval_null ); } - template<> inline auto NullMapping::GetOption() const{ return std::make_pair( have_day_time_interval, day_time_interval_null ); } } // namespace Options template<> @@ -437,9 +408,6 @@ class KdbOptions template inline void HandleNullMapping( const std::string& key, K value ); - template - auto GetNullMappingOption() const { return null_mapping_options.GetOption(); } - void GetNullMappingOptions( Options::NullMapping& null_mapping ) const{ null_mapping = null_mapping_options; } From ebdea7648436ad82dd765ed67595478a300ac1ba Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 7 Feb 2023 17:59:41 +0300 Subject: [PATCH 
180/276] Clean up option setters --- src/KdbOptions.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/KdbOptions.h b/src/KdbOptions.h index d85989d..6a8fc84 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -659,7 +659,7 @@ inline void KdbOptions::HandleNullMapping( const template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( value->t == -KI || value->t == -KD ){ + if( value->t == -KD ){ null_mapping_options.date32_null = value->i; null_mapping_options.have_date32 = true; } @@ -671,7 +671,7 @@ inline void KdbOptions::HandleNullMapping( const std::strin template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( value->t == -KJ || value->t == -KP ){ + if( value->t == -KP ){ null_mapping_options.date64_null = value->j; null_mapping_options.have_date64 = true; } @@ -683,7 +683,7 @@ inline void KdbOptions::HandleNullMapping( const std::strin template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( value->t == -KJ || value->t == -KP ){ + if( value->t == -KP ){ null_mapping_options.timestamp_null = value->j; null_mapping_options.have_timestamp = true; } @@ -695,7 +695,7 @@ inline void KdbOptions::HandleNullMapping( const std::st template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( value->t == -KI || value->t == -KT ){ + if( value->t == -KT ){ null_mapping_options.time32_null = value->i; null_mapping_options.have_time32 = true; } @@ -707,7 +707,7 @@ inline void KdbOptions::HandleNullMapping( const std::strin template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( value->t == -KJ || value->t == -KN ){ + if( value->t == -KN ){ null_mapping_options.time64_null = value->j; null_mapping_options.have_time64 = true; } @@ -731,7 +731,7 @@ inline void KdbOptions::HandleNullMapping( const std::stri template<> inline void 
KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( value->t == -KJ || value->t == -KN ){ + if( value->t == -KN ){ null_mapping_options.duration_null = value->j; null_mapping_options.have_duration = true; } @@ -743,7 +743,7 @@ inline void KdbOptions::HandleNullMapping( const std::str template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( value->t == -KI || value->t == -KM ){ + if( value->t == -KM ){ null_mapping_options.month_interval_null = value->i; null_mapping_options.have_month_interval = true; } @@ -755,7 +755,7 @@ inline void KdbOptions::HandleNullMapping( const s template<> inline void KdbOptions::HandleNullMapping( const std::string& key, K value ) { - if( value->t == -KJ || value->t == -KN ){ + if( value->t == -KN ){ null_mapping_options.day_time_interval_null = value->j; null_mapping_options.have_day_time_interval = true; } From f95155a6dcf61a5d35eecaa0f442dba34748e33b Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 27 Jan 2023 18:18:45 +0300 Subject: [PATCH 181/276] Reading parquet data back and compare https://github.com/KxSystems/arrowkdb/pull/7 --- examples/null_mapping.q | 66 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 61 insertions(+), 5 deletions(-) diff --git a/examples/null_mapping.q b/examples/null_mapping.q index 55b3a37..dc43651 100644 --- a/examples/null_mapping.q +++ b/examples/null_mapping.q @@ -184,9 +184,65 @@ options[`DECIMAL128_AS_DOUBLE]:1 .arrowkdb.tb.prettyPrintTable[time_schema;time_data;options] .arrowkdb.tb.prettyPrintTable[other_schema;other_data;options] +//-------------------------// +// Example-1. 
Parquet file // +//-------------------------// + +// Write the schema and array data to a parquet file options[`PARQUET_VERSION]:`V2.0 -.arrowkdb.pq.writeParquet["null_mapping_short.parquet";short_schema;short_data;options] -.arrowkdb.pq.writeParquet["null_mapping_long.parquet";long_schema;long_data;options] -.arrowkdb.pq.writeParquet["null_mapping_float.parquet";float_schema;float_data;options] -.arrowkdb.pq.writeParquet["null_mapping_str.parquet";str_schema;str_data;options] -.arrowkdb.pq.writeParquet["null_mapping_time.parquet";time_schema;time_data;options] + +filename_short:"null_mapping_short.parquet" +filename_long:"null_mapping_long.parquet" +filename_float:"null_mapping_float.parquet" +filename_str:"null_mapping_str.parquet" +filename_time:"null_mapping_time.parquet" + +.arrowkdb.pq.writeParquet[filename_short;short_schema;short_data;options] +.arrowkdb.pq.writeParquet[filename_long;long_schema;long_data;options] +.arrowkdb.pq.writeParquet[filename_float;float_schema;float_data;options] +.arrowkdb.pq.writeParquet[filename_str;str_schema;str_data;options] +.arrowkdb.pq.writeParquet[filename_time;time_schema;time_data;options] + +show ls filename_short +show ls filename_long +show ls filename_float +show ls filename_str +show ls filename_time + +// Read the schema back and compare +new_short_schema:.arrowkdb.pq.readParquetSchema[filename_short]; +new_long_schema:.arrowkdb.pq.readParquetSchema[filename_long]; +new_float_schema:.arrowkdb.pq.readParquetSchema[filename_float]; +new_str_schema:.arrowkdb.pq.readParquetSchema[filename_str]; +new_time_schema:.arrowkdb.pq.readParquetSchema[filename_time]; + +show .arrowkdb.sc.equalSchemas[short_schema;new_short_schema] +show .arrowkdb.sc.equalSchemas[long_schema;new_long_schema] +show .arrowkdb.sc.equalSchemas[float_schema;new_float_schema] +show .arrowkdb.sc.equalSchemas[str_schema;new_str_schema] +show .arrowkdb.sc.equalSchemas[time_schema;new_time_schema] + +show short_schema~new_short_schema +show 
long_schema~new_long_schema +show float_schema~new_float_schema +show str_schema~new_str_schema +show time_schema~new_time_schema + +// Read the array data back and compare +new_short_data:.arrowkdb.pq.readParquetData[filename_short;::]; +new_long_data:.arrowkdb.pq.readParquetData[filename_long;::]; +new_float_data:.arrowkdb.pq.readParquetData[filename_float;::]; +new_str_data:.arrowkdb.pq.readParquetData[filename_str;::]; +new_time_data:.arrowkdb.pq.readParquetData[filename_time;::]; + +show short_data~new_short_data +show long_data~new_long_data +show float_data~new_float_data +show str_data~new_str_data +show time_data~new_time_data + +rm filename_short; +rm filename_long; +rm filename_float; +rm filename_str; +rm filename_time; From 2a6248962f144ee54c782f5b94ef48bcc41e62f5 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 27 Jan 2023 19:21:25 +0300 Subject: [PATCH 182/276] Reloading Arrow IPC file --- examples/null_mapping.q | 100 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 92 insertions(+), 8 deletions(-) diff --git a/examples/null_mapping.q b/examples/null_mapping.q index dc43651..e8929de 100644 --- a/examples/null_mapping.q +++ b/examples/null_mapping.q @@ -1,3 +1,15 @@ +// null_mapping.q +// Examples of creating a schema supporting null mapping and using it to read/write parquet and arrow tables + +-1"\n+----------|| null_mapping.q ||----------+\n"; + +// import the arrowkdb library +\l arrowkdb.q + +// Filesystem functions for Linux/MacOS/Windows +ls:{[filename] $[.z.o like "w*";system "dir /b ",filename;system "ls ",filename]}; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + ///////////////////////// // CONSTRUCTED SCHEMAS // ///////////////////////// @@ -70,9 +82,9 @@ f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; dec_fd:.arrowkdb.fd.field[`decimal;dec_dt]; str_fd:.arrowkdb.fd.field[`string;str_dt]; -lstr_fd:.arrowkdb.fd.field[`long_string;lstr_dt]; 
+lstr_fd:.arrowkdb.fd.field[`large_string;lstr_dt]; bin_fd:.arrowkdb.fd.field[`binary;bin_dt]; -lbin_fd:.arrowkdb.fd.field[`long_binary;lbin_dt]; +lbin_fd:.arrowkdb.fd.field[`large_binary;lbin_dt]; fbin_fd:.arrowkdb.fd.field[`fixed_binary;fbin_dt]; d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; @@ -189,7 +201,7 @@ options[`DECIMAL128_AS_DOUBLE]:1 //-------------------------// // Write the schema and array data to a parquet file -options[`PARQUET_VERSION]:`V2.0 +options[`PARQUET_VERSION]:`V2.LATEST filename_short:"null_mapping_short.parquet" filename_long:"null_mapping_long.parquet" @@ -235,14 +247,86 @@ new_float_data:.arrowkdb.pq.readParquetData[filename_float;::]; new_str_data:.arrowkdb.pq.readParquetData[filename_str;::]; new_time_data:.arrowkdb.pq.readParquetData[filename_time;::]; -show short_data~new_short_data -show long_data~new_long_data -show float_data~new_float_data -show str_data~new_str_data -show time_data~new_time_data +//TODO: enable data comparison when reload mapping is ready +//show short_data~new_short_data +//show long_data~new_long_data +//show float_data~new_float_data +//show str_data~new_str_data +//show time_data~new_time_data + +rm filename_short; +rm filename_long; +rm filename_float; +rm filename_str; +rm filename_time; + +//---------------------------// +// Example-2. 
Arrow IPC file // +//---------------------------// + +// Write the schema and array data to an arrow file +filename_short:"null_mapping_short.arrow"; +filename_long:"null_mapping_long.arrow"; +filename_float:"null_mapping_float.arrow"; +filename_str:"null_mapping_str.arrow"; +filename_time:"null_mapping_time.arrow"; +filename_other:"null_mapping_other.arrow"; + +.arrowkdb.ipc.writeArrow[filename_short;short_schema;short_data;::]; +.arrowkdb.ipc.writeArrow[filename_long;long_schema;long_data;::]; +.arrowkdb.ipc.writeArrow[filename_float;float_schema;float_data;::]; +.arrowkdb.ipc.writeArrow[filename_str;str_schema;str_data;::]; +.arrowkdb.ipc.writeArrow[filename_time;time_schema;time_data;::]; +.arrowkdb.ipc.writeArrow[filename_other;other_schema;other_data;::]; + +show ls filename_short +show ls filename_long +show ls filename_float +show ls filename_str +show ls filename_time +show ls filename_other + +// Read the schema back and compare +new_short_schema:.arrowkdb.ipc.readArrowSchema[filename_short]; +new_long_schema:.arrowkdb.ipc.readArrowSchema[filename_long]; +new_float_schema:.arrowkdb.ipc.readArrowSchema[filename_float]; +new_str_schema:.arrowkdb.ipc.readArrowSchema[filename_str]; +new_time_schema:.arrowkdb.ipc.readArrowSchema[filename_time]; +new_other_schema:.arrowkdb.ipc.readArrowSchema[filename_other]; + +show .arrowkdb.sc.equalSchemas[short_schema;new_short_schema] +show .arrowkdb.sc.equalSchemas[long_schema;new_long_schema] +show .arrowkdb.sc.equalSchemas[float_schema;new_float_schema] +show .arrowkdb.sc.equalSchemas[str_schema;new_str_schema] +show .arrowkdb.sc.equalSchemas[time_schema;new_time_schema] +show .arrowkdb.sc.equalSchemas[other_schema;new_other_schema] + +show short_schema~new_short_schema +show long_schema~new_long_schema +show float_schema~new_float_schema +show str_schema~new_str_schema +show time_schema~new_time_schema +show other_schema~new_other_schema + +// Read the array data back and compare 
+new_short_data:.arrowkdb.ipc.readArrowData[filename;::]; +new_long_data:.arrowkdb.ipc.readArrowData[filename;::]; +new_float_data:.arrowkdb.ipc.readArrowData[filename;::]; +new_str_data:.arrowkdb.ipc.readArrowData[filename;::]; +new_time_data:.arrowkdb.ipc.readArrowData[filename;::]; +new_other_data:.arrowkdb.ipc.readArrowData[filename;::]; + +//TODO: enable data comparison when reload mapping is ready +//show short_data~new_short_data +//show long_data~new_long_data +//show float_data~new_float_data +//show str_data~new_str_data +//show time_data~new_time_data +//show other_data~new_other_data rm filename_short; rm filename_long; rm filename_float; rm filename_str; rm filename_time; +rm filename_other; From e9f25eeb79fc706c529fda1bcb718fce4e19657e Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 30 Jan 2023 17:30:53 +0300 Subject: [PATCH 183/276] Example-2 Arrow IPC file is added --- examples/null_mapping.q | 244 ++++++++++++++++++++-------------------- 1 file changed, 121 insertions(+), 123 deletions(-) diff --git a/examples/null_mapping.q b/examples/null_mapping.q index e8929de..93d12de 100644 --- a/examples/null_mapping.q +++ b/examples/null_mapping.q @@ -184,149 +184,147 @@ short_data:(ts_data;bool_data;ui8_data;i8_data;ui16_data;i16_data); long_data:(ts_data;ui32_data;i32_data;ui64_data;i64_data); float_data:(ts_data;f32_data;f64_data;dec_data); str_data:(ts_data;str_data;lstr_data;bin_data;lbin_data;fbin_data); -time_data:(ts_data;d32_data;d64_data;tstamp_data;t64_data;dur_data) -other_data:(ts_data;f16_data;t32_data;mint_data;dtint_data) +time_data:(ts_data;d32_data;d64_data;tstamp_data;t64_data;dur_data); +other_data:(ts_data;f16_data;t32_data;mint_data;dtint_data); // Pretty print the Arrow table populated from the array data options[`DECIMAL128_AS_DOUBLE]:1 -.arrowkdb.tb.prettyPrintTable[short_schema;short_data;options] -.arrowkdb.tb.prettyPrintTable[long_schema;long_data;options] 
-.arrowkdb.tb.prettyPrintTable[float_schema;float_data;options] -.arrowkdb.tb.prettyPrintTable[str_schema;str_data;options] -.arrowkdb.tb.prettyPrintTable[time_schema;time_data;options] -.arrowkdb.tb.prettyPrintTable[other_schema;other_data;options] +.arrowkdb.tb.prettyPrintTable[short_schema;short_data;options]; +.arrowkdb.tb.prettyPrintTable[long_schema;long_data;options]; +.arrowkdb.tb.prettyPrintTable[float_schema;float_data;options]; +.arrowkdb.tb.prettyPrintTable[str_schema;str_data;options]; +.arrowkdb.tb.prettyPrintTable[time_schema;time_data;options]; +.arrowkdb.tb.prettyPrintTable[other_schema;other_data;options]; //-------------------------// // Example-1. Parquet file // //-------------------------// // Write the schema and array data to a parquet file -options[`PARQUET_VERSION]:`V2.LATEST - -filename_short:"null_mapping_short.parquet" -filename_long:"null_mapping_long.parquet" -filename_float:"null_mapping_float.parquet" -filename_str:"null_mapping_str.parquet" -filename_time:"null_mapping_time.parquet" - -.arrowkdb.pq.writeParquet[filename_short;short_schema;short_data;options] -.arrowkdb.pq.writeParquet[filename_long;long_schema;long_data;options] -.arrowkdb.pq.writeParquet[filename_float;float_schema;float_data;options] -.arrowkdb.pq.writeParquet[filename_str;str_schema;str_data;options] -.arrowkdb.pq.writeParquet[filename_time;time_schema;time_data;options] - -show ls filename_short -show ls filename_long -show ls filename_float -show ls filename_str -show ls filename_time +options[`PARQUET_VERSION]:`V2.0 + +parquet_short:"null_mapping_short.parquet" +parquet_long:"null_mapping_long.parquet" +parquet_float:"null_mapping_float.parquet" +parquet_str:"null_mapping_str.parquet" +parquet_time:"null_mapping_time.parquet" + +.arrowkdb.pq.writeParquet[parquet_short;short_schema;short_data;options]; +.arrowkdb.pq.writeParquet[parquet_long;long_schema;long_data;options]; +.arrowkdb.pq.writeParquet[parquet_float;float_schema;float_data;options]; 
+.arrowkdb.pq.writeParquet[parquet_str;str_schema;str_data;options]; +.arrowkdb.pq.writeParquet[parquet_time;time_schema;time_data;options]; + +show ls parquet_short +show ls parquet_long +show ls parquet_float +show ls parquet_str +show ls parquet_time // Read the schema back and compare -new_short_schema:.arrowkdb.pq.readParquetSchema[filename_short]; -new_long_schema:.arrowkdb.pq.readParquetSchema[filename_long]; -new_float_schema:.arrowkdb.pq.readParquetSchema[filename_float]; -new_str_schema:.arrowkdb.pq.readParquetSchema[filename_str]; -new_time_schema:.arrowkdb.pq.readParquetSchema[filename_time]; - -show .arrowkdb.sc.equalSchemas[short_schema;new_short_schema] -show .arrowkdb.sc.equalSchemas[long_schema;new_long_schema] -show .arrowkdb.sc.equalSchemas[float_schema;new_float_schema] -show .arrowkdb.sc.equalSchemas[str_schema;new_str_schema] -show .arrowkdb.sc.equalSchemas[time_schema;new_time_schema] - -show short_schema~new_short_schema -show long_schema~new_long_schema -show float_schema~new_float_schema -show str_schema~new_str_schema -show time_schema~new_time_schema +parquet_short_schema:.arrowkdb.pq.readParquetSchema[parquet_short]; +parquet_long_schema:.arrowkdb.pq.readParquetSchema[parquet_long]; +parquet_float_schema:.arrowkdb.pq.readParquetSchema[parquet_float]; +parquet_str_schema:.arrowkdb.pq.readParquetSchema[parquet_str]; +parquet_time_schema:.arrowkdb.pq.readParquetSchema[parquet_time]; + +show .arrowkdb.sc.equalSchemas[short_schema;parquet_short_schema] +show .arrowkdb.sc.equalSchemas[long_schema;parquet_long_schema] +show .arrowkdb.sc.equalSchemas[float_schema;parquet_float_schema] +show .arrowkdb.sc.equalSchemas[str_schema;parquet_str_schema] +show .arrowkdb.sc.equalSchemas[time_schema;parquet_time_schema] + +show short_schema~parquet_short_schema +show long_schema~parquet_long_schema +show float_schema~parquet_float_schema +show str_schema~parquet_str_schema +show time_schema~parquet_time_schema // Read the array data back and compare 
-new_short_data:.arrowkdb.pq.readParquetData[filename_short;::]; -new_long_data:.arrowkdb.pq.readParquetData[filename_long;::]; -new_float_data:.arrowkdb.pq.readParquetData[filename_float;::]; -new_str_data:.arrowkdb.pq.readParquetData[filename_str;::]; -new_time_data:.arrowkdb.pq.readParquetData[filename_time;::]; - -//TODO: enable data comparison when reload mapping is ready -//show short_data~new_short_data -//show long_data~new_long_data -//show float_data~new_float_data -//show str_data~new_str_data -//show time_data~new_time_data - -rm filename_short; -rm filename_long; -rm filename_float; -rm filename_str; -rm filename_time; +parquet_short_data:.arrowkdb.pq.readParquetData[parquet_short;::]; +parquet_long_data:.arrowkdb.pq.readParquetData[parquet_long;::]; +parquet_float_data:.arrowkdb.pq.readParquetData[parquet_float;(``DECIMAL128_AS_DOUBLE)!((::);1)]; +parquet_str_data:.arrowkdb.pq.readParquetData[parquet_str;::]; +parquet_time_data:.arrowkdb.pq.readParquetData[parquet_time;::]; + +show short_data~parquet_short_data +show long_data~parquet_long_data +show float_data~parquet_float_data +show str_data~parquet_str_data +show time_data~parquet_time_data + +rm parquet_short; +rm parquet_long; +rm parquet_float; +rm parquet_str; +rm parquet_time; //---------------------------// // Example-2. 
Arrow IPC file // //---------------------------// // Write the schema and array data to an arrow file -filename_short:"null_mapping_short.arrow"; -filename_long:"null_mapping_long.arrow"; -filename_float:"null_mapping_float.arrow"; -filename_str:"null_mapping_str.arrow"; -filename_time:"null_mapping_time.arrow"; -filename_other:"null_mapping_other.arrow"; - -.arrowkdb.ipc.writeArrow[filename_short;short_schema;short_data;::]; -.arrowkdb.ipc.writeArrow[filename_long;long_schema;long_data;::]; -.arrowkdb.ipc.writeArrow[filename_float;float_schema;float_data;::]; -.arrowkdb.ipc.writeArrow[filename_str;str_schema;str_data;::]; -.arrowkdb.ipc.writeArrow[filename_time;time_schema;time_data;::]; -.arrowkdb.ipc.writeArrow[filename_other;other_schema;other_data;::]; - -show ls filename_short -show ls filename_long -show ls filename_float -show ls filename_str -show ls filename_time -show ls filename_other +arrow_short:"null_mapping_short.arrow"; +arrow_long:"null_mapping_long.arrow"; +arrow_float:"null_mapping_float.arrow"; +arrow_str:"null_mapping_str.arrow"; +arrow_time:"null_mapping_time.arrow"; +arrow_other:"null_mapping_other.arrow"; + +.arrowkdb.ipc.writeArrow[arrow_short;short_schema;short_data;options]; +.arrowkdb.ipc.writeArrow[arrow_long;long_schema;long_data;options]; +.arrowkdb.ipc.writeArrow[arrow_float;float_schema;float_data;options]; +.arrowkdb.ipc.writeArrow[arrow_str;str_schema;str_data;options]; +.arrowkdb.ipc.writeArrow[arrow_time;time_schema;time_data;options]; +.arrowkdb.ipc.writeArrow[arrow_other;other_schema;other_data;options]; + +show ls arrow_short +show ls arrow_long +show ls arrow_float +show ls arrow_str +show ls arrow_time +show ls arrow_other // Read the schema back and compare -new_short_schema:.arrowkdb.ipc.readArrowSchema[filename_short]; -new_long_schema:.arrowkdb.ipc.readArrowSchema[filename_long]; -new_float_schema:.arrowkdb.ipc.readArrowSchema[filename_float]; -new_str_schema:.arrowkdb.ipc.readArrowSchema[filename_str]; 
-new_time_schema:.arrowkdb.ipc.readArrowSchema[filename_time]; -new_other_schema:.arrowkdb.ipc.readArrowSchema[filename_other]; - -show .arrowkdb.sc.equalSchemas[short_schema;new_short_schema] -show .arrowkdb.sc.equalSchemas[long_schema;new_long_schema] -show .arrowkdb.sc.equalSchemas[float_schema;new_float_schema] -show .arrowkdb.sc.equalSchemas[str_schema;new_str_schema] -show .arrowkdb.sc.equalSchemas[time_schema;new_time_schema] -show .arrowkdb.sc.equalSchemas[other_schema;new_other_schema] - -show short_schema~new_short_schema -show long_schema~new_long_schema -show float_schema~new_float_schema -show str_schema~new_str_schema -show time_schema~new_time_schema -show other_schema~new_other_schema +arrow_short_schema:.arrowkdb.ipc.readArrowSchema[arrow_short]; +arrow_long_schema:.arrowkdb.ipc.readArrowSchema[arrow_long]; +arrow_float_schema:.arrowkdb.ipc.readArrowSchema[arrow_float]; +arrow_str_schema:.arrowkdb.ipc.readArrowSchema[arrow_str]; +arrow_time_schema:.arrowkdb.ipc.readArrowSchema[arrow_time]; +arrow_other_schema:.arrowkdb.ipc.readArrowSchema[arrow_other]; + +show .arrowkdb.sc.equalSchemas[short_schema;arrow_short_schema] +show .arrowkdb.sc.equalSchemas[long_schema;arrow_long_schema] +show .arrowkdb.sc.equalSchemas[float_schema;arrow_float_schema] +show .arrowkdb.sc.equalSchemas[str_schema;arrow_str_schema] +show .arrowkdb.sc.equalSchemas[time_schema;arrow_time_schema] +show .arrowkdb.sc.equalSchemas[other_schema;arrow_other_schema] + +show short_schema~arrow_short_schema +show long_schema~arrow_long_schema +show float_schema~arrow_float_schema +show str_schema~arrow_str_schema +show time_schema~arrow_time_schema +show other_schema~arrow_other_schema // Read the array data back and compare -new_short_data:.arrowkdb.ipc.readArrowData[filename;::]; -new_long_data:.arrowkdb.ipc.readArrowData[filename;::]; -new_float_data:.arrowkdb.ipc.readArrowData[filename;::]; -new_str_data:.arrowkdb.ipc.readArrowData[filename;::]; 
-new_time_data:.arrowkdb.ipc.readArrowData[filename;::]; -new_other_data:.arrowkdb.ipc.readArrowData[filename;::]; - -//TODO: enable data comparison when reload mapping is ready -//show short_data~new_short_data -//show long_data~new_long_data -//show float_data~new_float_data -//show str_data~new_str_data -//show time_data~new_time_data -//show other_data~new_other_data - -rm filename_short; -rm filename_long; -rm filename_float; -rm filename_str; -rm filename_time; -rm filename_other; +arrow_short_data:.arrowkdb.ipc.readArrowData[arrow_short;::]; +arrow_long_data:.arrowkdb.ipc.readArrowData[arrow_long;::]; +arrow_float_data:.arrowkdb.ipc.readArrowData[arrow_float;(``DECIMAL128_AS_DOUBLE)!((::);1)]; +arrow_str_data:.arrowkdb.ipc.readArrowData[arrow_str;::]; +arrow_time_data:.arrowkdb.ipc.readArrowData[arrow_time;::]; +arrow_other_data:.arrowkdb.ipc.readArrowData[arrow_other;::]; + +show short_data~arrow_short_data +show long_data~arrow_long_data +show float_data~arrow_float_data +show str_data~arrow_str_data +show time_data~arrow_time_data +show other_data~arrow_other_data + +rm arrow_short; +rm arrow_long; +rm arrow_float; +rm arrow_str; +rm arrow_time; +rm arrow_other; From 307b40de71c7cf55237f2e8683b42bcd87a61cba Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 1 Feb 2023 19:31:34 +0300 Subject: [PATCH 184/276] Mapping nulls when reading arrow and parquet data --- examples/null_mapping.q | 46 ++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/examples/null_mapping.q b/examples/null_mapping.q index 93d12de..952a0bf 100644 --- a/examples/null_mapping.q +++ b/examples/null_mapping.q @@ -28,9 +28,9 @@ other_opts:(`float16`time32`month_interval`day_time_interval)!(9h;09:01:02.042;2 options:(``NULL_MAPPING)!((::);short_opts,long_opts,float_opts,str_opts,time_opts,other_opts); +// Create the datatype identifiers ts_dt:.arrowkdb.dt.timestamp[`nano]; -// Create the datatype identifiers 
bool_dt:.arrowkdb.dt.boolean[]; ui8_dt:.arrowkdb.dt.uint8[]; i8_dt:.arrowkdb.dt.int8[]; @@ -107,12 +107,12 @@ time_schema:.arrowkdb.sc.schema[(ts_fd,d32_fd,d64_fd,tstamp_fd,t64_fd,dur_fd)]; other_schema:.arrowkdb.sc.schema[(ts_fd,f16_fd,t32_fd,mint_fd,dtint_fd)]; // Print the schemas -.arrowkdb.sc.printSchema[short_schema] -.arrowkdb.sc.printSchema[long_schema] +.arrowkdb.sc.printSchema[short_schema]; +.arrowkdb.sc.printSchema[long_schema]; .arrowkdb.sc.printSchema[float_schema] -.arrowkdb.sc.printSchema[str_schema] -.arrowkdb.sc.printSchema[time_schema] -.arrowkdb.sc.printSchema[other_schema] +.arrowkdb.sc.printSchema[str_schema]; +.arrowkdb.sc.printSchema[time_schema]; +.arrowkdb.sc.printSchema[other_schema]; //-----------------------// // Create the array data // @@ -145,7 +145,7 @@ f32_data:N?100e; f32_data[0]:1.23e; f64_data:N?100f; f64_data[1]:4.56f; -dec_data:N?(10f); +dec_data:{"F"$.Q.f[2]x} each N?(10f) dec_data[2]:7.89f str_data:N?("start";"stop";"alert";"acknowledge";""); @@ -203,11 +203,11 @@ options[`DECIMAL128_AS_DOUBLE]:1 // Write the schema and array data to a parquet file options[`PARQUET_VERSION]:`V2.0 -parquet_short:"null_mapping_short.parquet" -parquet_long:"null_mapping_long.parquet" -parquet_float:"null_mapping_float.parquet" -parquet_str:"null_mapping_str.parquet" -parquet_time:"null_mapping_time.parquet" +parquet_short:"null_mapping_short.parquet"; +parquet_long:"null_mapping_long.parquet"; +parquet_float:"null_mapping_float.parquet"; +parquet_str:"null_mapping_str.parquet"; +parquet_time:"null_mapping_time.parquet"; .arrowkdb.pq.writeParquet[parquet_short;short_schema;short_data;options]; .arrowkdb.pq.writeParquet[parquet_long;long_schema;long_data;options]; @@ -241,11 +241,11 @@ show str_schema~parquet_str_schema show time_schema~parquet_time_schema // Read the array data back and compare -parquet_short_data:.arrowkdb.pq.readParquetData[parquet_short;::]; -parquet_long_data:.arrowkdb.pq.readParquetData[parquet_long;::]; 
-parquet_float_data:.arrowkdb.pq.readParquetData[parquet_float;(``DECIMAL128_AS_DOUBLE)!((::);1)]; -parquet_str_data:.arrowkdb.pq.readParquetData[parquet_str;::]; -parquet_time_data:.arrowkdb.pq.readParquetData[parquet_time;::]; +parquet_short_data:.arrowkdb.pq.readParquetData[parquet_short;options]; +parquet_long_data:.arrowkdb.pq.readParquetData[parquet_long;options]; +parquet_float_data:.arrowkdb.pq.readParquetData[parquet_float;options]; +parquet_str_data:.arrowkdb.pq.readParquetData[parquet_str;options]; +parquet_time_data:.arrowkdb.pq.readParquetData[parquet_time;options]; show short_data~parquet_short_data show long_data~parquet_long_data @@ -308,12 +308,12 @@ show time_schema~arrow_time_schema show other_schema~arrow_other_schema // Read the array data back and compare -arrow_short_data:.arrowkdb.ipc.readArrowData[arrow_short;::]; -arrow_long_data:.arrowkdb.ipc.readArrowData[arrow_long;::]; -arrow_float_data:.arrowkdb.ipc.readArrowData[arrow_float;(``DECIMAL128_AS_DOUBLE)!((::);1)]; -arrow_str_data:.arrowkdb.ipc.readArrowData[arrow_str;::]; -arrow_time_data:.arrowkdb.ipc.readArrowData[arrow_time;::]; -arrow_other_data:.arrowkdb.ipc.readArrowData[arrow_other;::]; +arrow_short_data:.arrowkdb.ipc.readArrowData[arrow_short;options]; +arrow_long_data:.arrowkdb.ipc.readArrowData[arrow_long;options]; +arrow_float_data:.arrowkdb.ipc.readArrowData[arrow_float;options]; +arrow_str_data:.arrowkdb.ipc.readArrowData[arrow_str;options]; +arrow_time_data:.arrowkdb.ipc.readArrowData[arrow_time;options]; +arrow_other_data:.arrowkdb.ipc.readArrowData[arrow_other;options]; show short_data~arrow_short_data show long_data~arrow_long_data From bb8cdab21e8a11fc9e570b68c344165f51642230 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 2 Feb 2023 18:43:57 +0300 Subject: [PATCH 185/276] Arrow IPC stream example --- examples/null_mapping.q | 58 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/examples/null_mapping.q 
b/examples/null_mapping.q index 952a0bf..cd2fd77 100644 --- a/examples/null_mapping.q +++ b/examples/null_mapping.q @@ -328,3 +328,61 @@ rm arrow_float; rm arrow_str; rm arrow_time; rm arrow_other; + +//-----------------------------// +// Example-3. Arrow IPC stream // +//-----------------------------// + +// Serialize the schema and array data to an arrow stream +serialized_short:.arrowkdb.ipc.serializeArrow[short_schema;short_data;options]; +serialized_long:.arrowkdb.ipc.serializeArrow[long_schema;long_data;options]; +serialized_float:.arrowkdb.ipc.serializeArrow[float_schema;float_data;options]; +serialized_str:.arrowkdb.ipc.serializeArrow[str_schema;str_data;options]; +serialized_time:.arrowkdb.ipc.serializeArrow[time_schema;time_data;options]; +serialized_other:.arrowkdb.ipc.serializeArrow[other_schema;other_data;options]; +show serialized_short +show serialized_long +show serialized_float +show serialized_str +show serialized_time +show serialized_other + +// Parse the schema back abd compare +stream_short_schema:.arrowkdb.ipc.parseArrowSchema[serialized_short]; +stream_long_schema:.arrowkdb.ipc.parseArrowSchema[serialized_long]; +stream_float_schema:.arrowkdb.ipc.parseArrowSchema[serialized_float]; +stream_str_schema:.arrowkdb.ipc.parseArrowSchema[serialized_str]; +stream_time_schema:.arrowkdb.ipc.parseArrowSchema[serialized_time]; +stream_other_schema:.arrowkdb.ipc.parseArrowSchema[serialized_other]; +show .arrowkdb.sc.equalSchemas[short_schema;stream_short_schema] +show .arrowkdb.sc.equalSchemas[long_schema;stream_long_schema] +show .arrowkdb.sc.equalSchemas[float_schema;stream_float_schema] +show .arrowkdb.sc.equalSchemas[str_schema;stream_str_schema] +show .arrowkdb.sc.equalSchemas[time_schema;stream_time_schema] +show .arrowkdb.sc.equalSchemas[other_schema;stream_other_schema] +show short_schema~stream_short_schema +show long_schema~stream_long_schema +show float_schema~stream_float_schema +show str_schema~stream_str_schema +show 
time_schema~stream_time_schema +show other_schema~stream_other_schema + +// Parse the array data back and compare +stream_short_data:.arrowkdb.ipc.parseArrowData[serialized_short;options]; +stream_long_data:.arrowkdb.ipc.parseArrowData[serialized_long;options]; +stream_float_data:.arrowkdb.ipc.parseArrowData[serialized_float;options]; +stream_str_data:.arrowkdb.ipc.parseArrowData[serialized_str;options]; +stream_time_data:.arrowkdb.ipc.parseArrowData[serialized_time;options]; +stream_other_data:.arrowkdb.ipc.parseArrowData[serialized_other;options]; +show short_data~stream_short_data +show long_data~stream_long_data +show float_data~stream_float_data +show str_data~stream_str_data +show time_data~stream_time_data +show other_data~stream_other_data + + +-1 "\n+----------------------------------------+\n"; + +// Process off +exit 0; \ No newline at end of file From 2a51c00da4192c336bdbde9c2b55941403952114 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 2 Feb 2023 20:31:56 +0300 Subject: [PATCH 186/276] Arrow only supported extra fields --- examples/null_mapping.q | 96 +++++++++++++++++++++++++++++------------ 1 file changed, 68 insertions(+), 28 deletions(-) diff --git a/examples/null_mapping.q b/examples/null_mapping.q index cd2fd77..627485d 100644 --- a/examples/null_mapping.q +++ b/examples/null_mapping.q @@ -18,15 +18,18 @@ rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; // Create the schema // //-------------------// -// Support null mapping +// Support null mapping in parquet and arrow short_opts:(`bool`uint8`int8`uint16`int16)!(0b;0x01;0x02;3h;4h); long_opts:(`uint32`int32`uint64`int64)!(5i;6i;7;8); float_opts:(`float32`float64`decimal)!(1.23e;4.56;7.89); -str_opts:(`string`large_string`binary`large_binary`fixed_binary)!("start";"stop";"x"$"alert";"x"$"acknowledge";0Ng); 
-time_opts:(`date32`date64`timestamp`time64`duration)!(2006.07.21;2015.01.01D00:00:00.000000000;2011.01.01D00:00:00.000000000;12:00:00.000000000;12:00:00.000000000); -other_opts:(`float16`time32`month_interval`day_time_interval)!(9h;09:01:02.042;2006.07m;12:00:00.000000000); +str_opts:(`string`binary`fixed_binary)!("start";"x"$"alert";0Ng); +time_opts:(`date32`timestamp`time64)!(2006.07.21;2011.01.01D00:00:00.000000000;12:00:00.000000000); -options:(``NULL_MAPPING)!((::);short_opts,long_opts,float_opts,str_opts,time_opts,other_opts); +// Support null mapping only in arrow +extra_opts:(`float16`large_string`large_binary`duration)!(9h;"stop";"x"$"acknowledge";12:00:00.000000000); +other_opts:(`date64`time32`month_interval`day_time_interval)!(2015.01.01D00:00:00.000000000;09:01:02.042;2006.07m;12:00:00.000000000); + +options:(``NULL_MAPPING)!((::);short_opts,long_opts,float_opts,str_opts,time_opts,extra_opts,other_opts); // Create the datatype identifiers ts_dt:.arrowkdb.dt.timestamp[`nano]; @@ -47,18 +50,19 @@ f64_dt:.arrowkdb.dt.float64[]; dec_dt:.arrowkdb.dt.decimal128[38i;2i]; str_dt:.arrowkdb.dt.utf8[]; -lstr_dt:.arrowkdb.dt.large_utf8[]; bin_dt:.arrowkdb.dt.binary[]; -lbin_dt:.arrowkdb.dt.large_binary[]; fbin_dt:.arrowkdb.dt.fixed_size_binary[16i]; d32_dt:.arrowkdb.dt.date32[]; -d64_dt:.arrowkdb.dt.date64[]; tstamp_dt:.arrowkdb.dt.timestamp[`nano]; t64_dt:.arrowkdb.dt.time64[`nano]; -dur_dt:.arrowkdb.dt.duration[`milli]; f16_dt:.arrowkdb.dt.float16[]; +lstr_dt:.arrowkdb.dt.large_utf8[]; +lbin_dt:.arrowkdb.dt.large_binary[]; +dur_dt:.arrowkdb.dt.duration[`milli]; + +d64_dt:.arrowkdb.dt.date64[]; t32_dt:.arrowkdb.dt.time32[`milli]; mint_dt:.arrowkdb.dt.month_interval[]; dtint_dt:.arrowkdb.dt.day_time_interval[]; @@ -82,18 +86,19 @@ f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; dec_fd:.arrowkdb.fd.field[`decimal;dec_dt]; str_fd:.arrowkdb.fd.field[`string;str_dt]; -lstr_fd:.arrowkdb.fd.field[`large_string;lstr_dt]; bin_fd:.arrowkdb.fd.field[`binary;bin_dt]; 
-lbin_fd:.arrowkdb.fd.field[`large_binary;lbin_dt]; fbin_fd:.arrowkdb.fd.field[`fixed_binary;fbin_dt]; d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; -d64_fd:.arrowkdb.fd.field[`date64;d64_dt]; tstamp_fd:.arrowkdb.fd.field[`timestamp;tstamp_dt]; t64_fd:.arrowkdb.fd.field[`time64;t64_dt]; -dur_fd:.arrowkdb.fd.field[`duration;dur_dt]; f16_fd:.arrowkdb.fd.field[`float16;f16_dt]; +lstr_fd:.arrowkdb.fd.field[`large_string;lstr_dt]; +lbin_fd:.arrowkdb.fd.field[`large_binary;lbin_dt]; +dur_fd:.arrowkdb.fd.field[`duration;dur_dt]; + +d64_fd:.arrowkdb.fd.field[`date64;d64_dt]; t32_fd:.arrowkdb.fd.field[`time32;t32_dt]; mint_fd:.arrowkdb.fd.field[`month_interval;mint_dt]; dtint_fd:.arrowkdb.fd.field[`day_time_interval;dtint_dt]; @@ -102,9 +107,11 @@ dtint_fd:.arrowkdb.fd.field[`day_time_interval;dtint_dt]; short_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,ui8_fd,i8_fd,ui16_fd,i16_fd)]; long_schema:.arrowkdb.sc.schema[(ts_fd,ui32_fd,i32_fd,ui64_fd,i64_fd)]; float_schema:.arrowkdb.sc.schema[(ts_fd,f32_fd,f64_fd,dec_fd)]; -str_schema:.arrowkdb.sc.schema[(ts_fd,str_fd,lstr_fd,bin_fd,lbin_fd,fbin_fd)]; -time_schema:.arrowkdb.sc.schema[(ts_fd,d32_fd,d64_fd,tstamp_fd,t64_fd,dur_fd)]; -other_schema:.arrowkdb.sc.schema[(ts_fd,f16_fd,t32_fd,mint_fd,dtint_fd)]; +str_schema:.arrowkdb.sc.schema[(ts_fd,str_fd,bin_fd,fbin_fd)]; +time_schema:.arrowkdb.sc.schema[(ts_fd,d32_fd,tstamp_fd,t64_fd)]; + +extra_schema:.arrowkdb.sc.schema[(ts_fd,f16_fd,lstr_fd,lbin_fd,dur_fd)]; +other_schema:.arrowkdb.sc.schema[(ts_fd,d64_fd,t32_fd,mint_fd,dtint_fd)]; // Print the schemas .arrowkdb.sc.printSchema[short_schema]; @@ -112,6 +119,8 @@ other_schema:.arrowkdb.sc.schema[(ts_fd,f16_fd,t32_fd,mint_fd,dtint_fd)]; .arrowkdb.sc.printSchema[float_schema] .arrowkdb.sc.printSchema[str_schema]; .arrowkdb.sc.printSchema[time_schema]; + +.arrowkdb.sc.printSchema[extra_schema]; .arrowkdb.sc.printSchema[other_schema]; //-----------------------// @@ -150,28 +159,29 @@ dec_data[2]:7.89f 
str_data:N?("start";"stop";"alert";"acknowledge";""); str_data[0]:"start" -lstr_data:N?("start";"stop";"alert";"acknowledge";""); -lstr_data[1]:"stop" bin_data:N?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); bin_data[2]:"x"$"alert" -lbin_data:N?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); -lbin_data[3]:"x"$"acknowledge" fbin_data:N?0Ng; fbin_data[4]:0Ng; d32_data:N?(2006.07.21;2008.07.18;2012.07.16;2014.07.15;2016.07.11); d32_data[0]:2006.07.21; -d64_data:N?(2015.01.01D00:00:00.000000000;2017.01.01D00:00:00.000000000;2018.01.01D00:00:00.000000000;2019.01.01D00:00:00.000000000;2020.01.01D00:00:00.000000000); -d64_data[1]:2015.01.01D00:00:00.000000000; tstamp_data:N?(2015.01.01D00:00:00.000000000;2014.01.01D00:00:00.000000000;2013.01.01D00:00:00.000000000;2012.01.01D00:00:00.000000000;2011.01.01D00:00:00.000000000); tstamp_data[2]:2011.01.01D00:00:00.000000000; t64_data:N?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); t64_data[3]:12:00:00.000000000; -dur_data:N?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); -dur_data[4]:12:00:00.000000000; f16_data:N?100h; f16_data[0]:9h; +lstr_data:N?("start";"stop";"alert";"acknowledge";""); +lstr_data[1]:"stop" +lbin_data:N?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); +lbin_data[3]:"x"$"acknowledge" +dur_data:N?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); +dur_data[4]:12:00:00.000000000; + +d64_data:N?(2015.01.01D00:00:00.000000000;2017.01.01D00:00:00.000000000;2018.01.01D00:00:00.000000000;2019.01.01D00:00:00.000000000;2020.01.01D00:00:00.000000000); +d64_data[1]:2015.01.01D00:00:00.000000000; t32_data:N?(09:01:02.042;08:01:02.042;07:01:02.042;06:01:02.042;05:01:02.042); t32_data[1]:09:01:02.042; mint_data:N?(2006.07m;2006.06m;2006.05m;2006.04m;2006.03m); @@ -183,17 +193,21 @@ dtint_data[3]:12:00:00.000000000; 
short_data:(ts_data;bool_data;ui8_data;i8_data;ui16_data;i16_data); long_data:(ts_data;ui32_data;i32_data;ui64_data;i64_data); float_data:(ts_data;f32_data;f64_data;dec_data); -str_data:(ts_data;str_data;lstr_data;bin_data;lbin_data;fbin_data); -time_data:(ts_data;d32_data;d64_data;tstamp_data;t64_data;dur_data); -other_data:(ts_data;f16_data;t32_data;mint_data;dtint_data); +str_data:(ts_data;str_data;bin_data;fbin_data); +time_data:(ts_data;d32_data;tstamp_data;t64_data); + +extra_data:(ts_data;f16_data;lstr_data;lbin_data;dur_data); +other_data:(ts_data;d64_data;t32_data;mint_data;dtint_data); // Pretty print the Arrow table populated from the array data options[`DECIMAL128_AS_DOUBLE]:1 + .arrowkdb.tb.prettyPrintTable[short_schema;short_data;options]; .arrowkdb.tb.prettyPrintTable[long_schema;long_data;options]; .arrowkdb.tb.prettyPrintTable[float_schema;float_data;options]; .arrowkdb.tb.prettyPrintTable[str_schema;str_data;options]; .arrowkdb.tb.prettyPrintTable[time_schema;time_data;options]; +.arrowkdb.tb.prettyPrintTable[extra_schema;extra_data;options]; .arrowkdb.tb.prettyPrintTable[other_schema;other_data;options]; //-------------------------// @@ -247,6 +261,8 @@ parquet_float_data:.arrowkdb.pq.readParquetData[parquet_float;options]; parquet_str_data:.arrowkdb.pq.readParquetData[parquet_str;options]; parquet_time_data:.arrowkdb.pq.readParquetData[parquet_time;options]; +parquet_str_data[3]:{0x0 sv x} each parquet_str_data[3] // Convert to GUIDs + show short_data~parquet_short_data show long_data~parquet_long_data show float_data~parquet_float_data @@ -269,6 +285,7 @@ arrow_long:"null_mapping_long.arrow"; arrow_float:"null_mapping_float.arrow"; arrow_str:"null_mapping_str.arrow"; arrow_time:"null_mapping_time.arrow"; +arrow_extra:"null_mapping_extra.arrow"; arrow_other:"null_mapping_other.arrow"; .arrowkdb.ipc.writeArrow[arrow_short;short_schema;short_data;options]; @@ -276,6 +293,7 @@ arrow_other:"null_mapping_other.arrow"; 
.arrowkdb.ipc.writeArrow[arrow_float;float_schema;float_data;options]; .arrowkdb.ipc.writeArrow[arrow_str;str_schema;str_data;options]; .arrowkdb.ipc.writeArrow[arrow_time;time_schema;time_data;options]; +.arrowkdb.ipc.writeArrow[arrow_extra;extra_schema;extra_data;options]; .arrowkdb.ipc.writeArrow[arrow_other;other_schema;other_data;options]; show ls arrow_short @@ -283,6 +301,7 @@ show ls arrow_long show ls arrow_float show ls arrow_str show ls arrow_time +show ls arrow_extra show ls arrow_other // Read the schema back and compare @@ -291,6 +310,7 @@ arrow_long_schema:.arrowkdb.ipc.readArrowSchema[arrow_long]; arrow_float_schema:.arrowkdb.ipc.readArrowSchema[arrow_float]; arrow_str_schema:.arrowkdb.ipc.readArrowSchema[arrow_str]; arrow_time_schema:.arrowkdb.ipc.readArrowSchema[arrow_time]; +arrow_extra_schema:.arrowkdb.ipc.readArrowSchema[arrow_extra]; arrow_other_schema:.arrowkdb.ipc.readArrowSchema[arrow_other]; show .arrowkdb.sc.equalSchemas[short_schema;arrow_short_schema] @@ -298,6 +318,7 @@ show .arrowkdb.sc.equalSchemas[long_schema;arrow_long_schema] show .arrowkdb.sc.equalSchemas[float_schema;arrow_float_schema] show .arrowkdb.sc.equalSchemas[str_schema;arrow_str_schema] show .arrowkdb.sc.equalSchemas[time_schema;arrow_time_schema] +show .arrowkdb.sc.equalSchemas[extra_schema;arrow_extra_schema] show .arrowkdb.sc.equalSchemas[other_schema;arrow_other_schema] show short_schema~arrow_short_schema @@ -305,6 +326,7 @@ show long_schema~arrow_long_schema show float_schema~arrow_float_schema show str_schema~arrow_str_schema show time_schema~arrow_time_schema +show extra_schema~arrow_extra_schema show other_schema~arrow_other_schema // Read the array data back and compare @@ -313,13 +335,17 @@ arrow_long_data:.arrowkdb.ipc.readArrowData[arrow_long;options]; arrow_float_data:.arrowkdb.ipc.readArrowData[arrow_float;options]; arrow_str_data:.arrowkdb.ipc.readArrowData[arrow_str;options]; arrow_time_data:.arrowkdb.ipc.readArrowData[arrow_time;options]; 
+arrow_extra_data:.arrowkdb.ipc.readArrowData[arrow_extra;options]; arrow_other_data:.arrowkdb.ipc.readArrowData[arrow_other;options]; +arrow_str_data[3]:{0x0 sv x} each arrow_str_data[3] // Convert to GUIDs + show short_data~arrow_short_data show long_data~arrow_long_data show float_data~arrow_float_data show str_data~arrow_str_data show time_data~arrow_time_data +show extra_data~arrow_extra_data show other_data~arrow_other_data rm arrow_short; @@ -327,6 +353,7 @@ rm arrow_long; rm arrow_float; rm arrow_str; rm arrow_time; +rm arrow_extra; rm arrow_other; //-----------------------------// @@ -339,12 +366,15 @@ serialized_long:.arrowkdb.ipc.serializeArrow[long_schema;long_data;options]; serialized_float:.arrowkdb.ipc.serializeArrow[float_schema;float_data;options]; serialized_str:.arrowkdb.ipc.serializeArrow[str_schema;str_data;options]; serialized_time:.arrowkdb.ipc.serializeArrow[time_schema;time_data;options]; +serialized_extra:.arrowkdb.ipc.serializeArrow[extra_schema;extra_data;options]; serialized_other:.arrowkdb.ipc.serializeArrow[other_schema;other_data;options]; + show serialized_short show serialized_long show serialized_float show serialized_str show serialized_time +show serialized_extra show serialized_other // Parse the schema back abd compare @@ -353,18 +383,23 @@ stream_long_schema:.arrowkdb.ipc.parseArrowSchema[serialized_long]; stream_float_schema:.arrowkdb.ipc.parseArrowSchema[serialized_float]; stream_str_schema:.arrowkdb.ipc.parseArrowSchema[serialized_str]; stream_time_schema:.arrowkdb.ipc.parseArrowSchema[serialized_time]; +stream_extra_schema:.arrowkdb.ipc.parseArrowSchema[serialized_extra]; stream_other_schema:.arrowkdb.ipc.parseArrowSchema[serialized_other]; + show .arrowkdb.sc.equalSchemas[short_schema;stream_short_schema] show .arrowkdb.sc.equalSchemas[long_schema;stream_long_schema] show .arrowkdb.sc.equalSchemas[float_schema;stream_float_schema] show .arrowkdb.sc.equalSchemas[str_schema;stream_str_schema] show 
.arrowkdb.sc.equalSchemas[time_schema;stream_time_schema] +show .arrowkdb.sc.equalSchemas[extra_schema;stream_extra_schema] show .arrowkdb.sc.equalSchemas[other_schema;stream_other_schema] + show short_schema~stream_short_schema show long_schema~stream_long_schema show float_schema~stream_float_schema show str_schema~stream_str_schema show time_schema~stream_time_schema +show extra_schema~stream_extra_schema show other_schema~stream_other_schema // Parse the array data back and compare @@ -373,16 +408,21 @@ stream_long_data:.arrowkdb.ipc.parseArrowData[serialized_long;options]; stream_float_data:.arrowkdb.ipc.parseArrowData[serialized_float;options]; stream_str_data:.arrowkdb.ipc.parseArrowData[serialized_str;options]; stream_time_data:.arrowkdb.ipc.parseArrowData[serialized_time;options]; +stream_extra_data:.arrowkdb.ipc.parseArrowData[serialized_extra;options]; stream_other_data:.arrowkdb.ipc.parseArrowData[serialized_other;options]; + +stream_str_data[3]:{0x0 sv x} each stream_str_data[3] // Convert to GUIDs + show short_data~stream_short_data show long_data~stream_long_data show float_data~stream_float_data show str_data~stream_str_data show time_data~stream_time_data +show extra_data~stream_extra_data show other_data~stream_other_data -1 "\n+----------------------------------------+\n"; // Process off -exit 0; \ No newline at end of file +exit 0; From 8064610b8dc4c214f24f03ddd4ed81ab2fc4794c Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 8 Feb 2023 15:31:27 +0000 Subject: [PATCH 187/276] Null mapping of short integer fields --- .gitignore | 1 + tests/.gitignore | 1 + tests/null_mapping_short.t | 103 +++++++++++++++++++++++++++++++++++++ 3 files changed, 105 insertions(+) create mode 100644 tests/null_mapping_short.t diff --git a/.gitignore b/.gitignore index 18f4d15..3d6594b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ arrowkdb.code-workspace +.vscode/ build/ diff --git a/tests/.gitignore b/tests/.gitignore index 492b6a4..56b0696 
100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -1 +1,2 @@ test.q +null_mapping_short.q diff --git a/tests/null_mapping_short.t b/tests/null_mapping_short.t new file mode 100644 index 0000000..59dc02d --- /dev/null +++ b/tests/null_mapping_short.t @@ -0,0 +1,103 @@ +// null_mapping_short.t + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping ||----------+\n"; +short_opts:(`bool`uint8`int8`uint16`int16)!(0b;0x01;0x02;3h;4h); + +options:(``NULL_MAPPING)!((::);short_opts); + +N:5 + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +bool_dt:.arrowkdb.dt.boolean[]; +ui8_dt:.arrowkdb.dt.uint8[]; +i8_dt:.arrowkdb.dt.int8[]; +ui16_dt:.arrowkdb.dt.uint16[]; +i16_dt:.arrowkdb.dt.int16[]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +bool_fd:.arrowkdb.fd.field[`bool;bool_dt]; +ui8_fd:.arrowkdb.fd.field[`uint8;ui8_dt]; +i8_fd:.arrowkdb.fd.field[`int8;i8_dt]; +ui16_fd:.arrowkdb.fd.field[`uint16;ui16_dt]; +i16_fd:.arrowkdb.fd.field[`int16;i16_dt]; + +-1"\n+----------|| Create the schemas for the list of fields ||----------+\n"; +short_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,ui8_fd,i8_fd,ui16_fd,i16_fd)]; + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +bool_data:N?(0b;1b); +bool_data[0]:0b; +ui8_data:N?0x64; +ui8_data[1]:0x01; +i8_data:N?0x64; +i8_data[2]:0x02; +ui16_data:N?100h; +ui16_data[3]:3h; +i16_data:N?100h; +i16_data[4]:4h; + +-1"\n+----------|| Combine the data for all columns ||----------+\n"; +short_data:(ts_data;bool_data;ui8_data;i8_data;ui16_data;i16_data); + +-1"\n+----------|| Write the schema and array data to a parquet file 
||----------+\n"; +options[`PARQUET_VERSION]:`V2.0 + +parquet_short:"null_mapping_short.parquet"; +.arrowkdb.pq.writeParquet[parquet_short;short_schema;short_data;options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +parquet_short_schema:.arrowkdb.pq.readParquetSchema[parquet_short]; +.arrowkdb.sc.equalSchemas[short_schema;parquet_short_schema] +short_schema~parquet_short_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +parquet_short_data:.arrowkdb.pq.readParquetData[parquet_short;options]; +short_data~parquet_short_data +rm parquet_short; + +-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; +arrow_short:"null_mapping_short.arrow"; +.arrowkdb.ipc.writeArrow[arrow_short;short_schema;short_data;options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +arrow_short_schema:.arrowkdb.ipc.readArrowSchema[arrow_short]; +.arrowkdb.sc.equalSchemas[short_schema;arrow_short_schema] +short_schema~arrow_short_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +arrow_short_data:.arrowkdb.ipc.readArrowData[arrow_short;options]; +short_data~arrow_short_data +rm arrow_short; + +-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; +serialized_short:.arrowkdb.ipc.serializeArrow[short_schema;short_data;options]; + +-1"\n+----------|| Parse the schema back and compare ||----------+\n"; +stream_short_schema:.arrowkdb.ipc.parseArrowSchema[serialized_short]; +.arrowkdb.sc.equalSchemas[short_schema;stream_short_schema] +short_schema~stream_short_schema + +-1"\n+----------|| Parse the array data back and compare ||----------+\n"; +stream_short_data:.arrowkdb.ipc.parseArrowData[serialized_short;options]; +short_data~stream_short_data + + +-1 "\n+----------|| Test utils ||----------+\n"; + +show util.buildInfo[] +(type util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing 
||----------+\n"; \ No newline at end of file From e72f440c10e09045b2cea0ca1ca723136d7df6bf Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 8 Feb 2023 16:26:45 +0000 Subject: [PATCH 188/276] Null mapping of long integer fields --- tests/.gitignore | 1 + tests/null_mapping_long.t | 99 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 tests/null_mapping_long.t diff --git a/tests/.gitignore b/tests/.gitignore index 56b0696..6adfd46 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -1,2 +1,3 @@ test.q null_mapping_short.q +null_mapping_long.q diff --git a/tests/null_mapping_long.t b/tests/null_mapping_long.t new file mode 100644 index 0000000..db4c2b5 --- /dev/null +++ b/tests/null_mapping_long.t @@ -0,0 +1,99 @@ +// null_mapping_long.q + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping ||----------+\n"; +long_opts:(`uint32`int32`uint64`int64)!(5i;6i;7;8); + +options:(``NULL_MAPPING)!((::);long_opts); + +N:5 + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +ui32_dt:.arrowkdb.dt.uint32[]; +i32_dt:.arrowkdb.dt.int32[]; +ui64_dt:.arrowkdb.dt.uint64[]; +i64_dt:.arrowkdb.dt.int64[]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +ui32_fd:.arrowkdb.fd.field[`uint32;ui32_dt]; +i32_fd:.arrowkdb.fd.field[`int32;i32_dt]; +ui64_fd:.arrowkdb.fd.field[`uint64;ui64_dt]; +i64_fd:.arrowkdb.fd.field[`int64;i64_dt]; + +-1"\n+----------|| Create the schemas for the list of fields ||----------+\n"; +long_schema:.arrowkdb.sc.schema[(ts_fd,ui32_fd,i32_fd,ui64_fd,i64_fd)]; + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc 
N?0p; + +ui32_data:N?100i; +ui32_data[0]:5i; +i32_data:N?100i; +i32_data[1]:6i; +ui64_data:N?100; +ui64_data[2]:7; +i64_data:N?100; +i64_data[3]:8; + +-1"\n+----------|| Combine the data for all columns ||----------+\n"; +long_data:(ts_data;ui32_data;i32_data;ui64_data;i64_data); + +-1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n"; +options[`PARQUET_VERSION]:`V2.0 + +parquet_long:"null_mapping_long.parquet"; +.arrowkdb.pq.writeParquet[parquet_long;long_schema;long_data;options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +parquet_long_schema:.arrowkdb.pq.readParquetSchema[parquet_long]; +.arrowkdb.sc.equalSchemas[long_schema;parquet_long_schema] +long_schema~parquet_long_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +parquet_long_data:.arrowkdb.pq.readParquetData[parquet_long;options]; +long_data~parquet_long_data +rm parquet_long; + +-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; +arrow_long:"null_mapping_long.arrow"; +.arrowkdb.ipc.writeArrow[arrow_long;long_schema;long_data;options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +arrow_long_schema:.arrowkdb.ipc.readArrowSchema[arrow_long]; +.arrowkdb.sc.equalSchemas[long_schema;arrow_long_schema] +long_schema~arrow_long_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +arrow_long_data:.arrowkdb.ipc.readArrowData[arrow_long;options]; +long_data~arrow_long_data +rm arrow_long; + +-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; +serialized_long:.arrowkdb.ipc.serializeArrow[long_schema;long_data;options]; + +-1"\n+----------|| Parse the schema back abd compare ||----------+\n"; +stream_long_schema:.arrowkdb.ipc.parseArrowSchema[serialized_long]; +.arrowkdb.sc.equalSchemas[long_schema;stream_long_schema] +long_schema~stream_long_schema + +-1"\n+----------|| Parse the 
array data back and compare ||----------+\n"; +stream_long_data:.arrowkdb.ipc.parseArrowData[serialized_long;options]; +long_data~stream_long_data + + +-1 "\n+----------|| Test utils ||----------+\n"; + +show util.buildInfo[] +(type util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; From cbd66a5954d181c9a798ef422df9b3fcaccfcb8c Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 8 Feb 2023 16:51:17 +0000 Subject: [PATCH 189/276] Null mapping of float fields --- tests/.gitignore | 1 + tests/null_mapping_float.t | 96 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+) create mode 100644 tests/null_mapping_float.t diff --git a/tests/.gitignore b/tests/.gitignore index 6adfd46..6ffd0d7 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -1,3 +1,4 @@ test.q null_mapping_short.q null_mapping_long.q +null_mapping_float.q diff --git a/tests/null_mapping_float.t b/tests/null_mapping_float.t new file mode 100644 index 0000000..96a2d26 --- /dev/null +++ b/tests/null_mapping_float.t @@ -0,0 +1,96 @@ +// null_mapping_float.q + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping ||----------+\n"; +float_opts:(`float32`float64`decimal)!(1.23e;4.56;7.89); + +options:(``NULL_MAPPING)!((::);float_opts); + +N:5 + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +f32_dt:.arrowkdb.dt.float32[]; +f64_dt:.arrowkdb.dt.float64[]; +dec_dt:.arrowkdb.dt.decimal128[38i;2i]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +f32_fd:.arrowkdb.fd.field[`float32;f32_dt]; +f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; +dec_fd:.arrowkdb.fd.field[`decimal;dec_dt]; + 
+-1"\n+----------|| Create the schemas for the list of fields ||----------+\n"; +float_schema:.arrowkdb.sc.schema[(ts_fd,f32_fd,f64_fd,dec_fd)]; + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +f32_data:N?100e; +f32_data[0]:1.23e; +f64_data:N?100f; +f64_data[1]:4.56f; +dec_data:{"F"$.Q.f[2]x} each N?(10f) +dec_data[2]:7.89f + +-1"\n+----------|| Combine the data for all columns ||----------+\n"; +float_data:(ts_data;f32_data;f64_data;dec_data); + +-1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n"; +options[`DECIMAL128_AS_DOUBLE]:1 +options[`PARQUET_VERSION]:`V2.0 + +parquet_float:"null_mapping_float.parquet"; +.arrowkdb.pq.writeParquet[parquet_float;float_schema;float_data;options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +parquet_float_schema:.arrowkdb.pq.readParquetSchema[parquet_float]; +.arrowkdb.sc.equalSchemas[float_schema;parquet_float_schema] +float_schema~parquet_float_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +parquet_float_data:.arrowkdb.pq.readParquetData[parquet_float;options]; +float_data~parquet_float_data +rm parquet_float; + +-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; +arrow_float:"null_mapping_float.arrow"; +.arrowkdb.ipc.writeArrow[arrow_float;float_schema;float_data;options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +arrow_float_schema:.arrowkdb.ipc.readArrowSchema[arrow_float]; +.arrowkdb.sc.equalSchemas[float_schema;arrow_float_schema] +float_schema~arrow_float_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +arrow_float_data:.arrowkdb.ipc.readArrowData[arrow_float;options]; +float_data~arrow_float_data +rm arrow_float; + +-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; 
+serialized_float:.arrowkdb.ipc.serializeArrow[float_schema;float_data;options]; + +-1"\n+----------|| Parse the schema back abd compare ||----------+\n"; +stream_float_schema:.arrowkdb.ipc.parseArrowSchema[serialized_float]; +.arrowkdb.sc.equalSchemas[float_schema;stream_float_schema] +float_schema~stream_float_schema + +-1"\n+----------|| Parse the array data back and compare ||----------+\n"; +stream_float_data:.arrowkdb.ipc.parseArrowData[serialized_float;options]; +float_data~stream_float_data + + +-1 "\n+----------|| Test utils ||----------+\n"; + +show util.buildInfo[] +(type util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; From 5406e0c4f9e893013ca1891f75c8915292b54ae3 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 8 Feb 2023 17:07:19 +0000 Subject: [PATCH 190/276] Null mapping of string fields --- tests/.gitignore | 1 + tests/null_mapping_str.t | 98 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 tests/null_mapping_str.t diff --git a/tests/.gitignore b/tests/.gitignore index 6ffd0d7..deb3a89 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -2,3 +2,4 @@ test.q null_mapping_short.q null_mapping_long.q null_mapping_float.q +null_mapping_str.q diff --git a/tests/null_mapping_str.t b/tests/null_mapping_str.t new file mode 100644 index 0000000..95bdd95 --- /dev/null +++ b/tests/null_mapping_str.t @@ -0,0 +1,98 @@ +// null_mapping_str.q + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping ||----------+\n"; +str_opts:(`string`binary`fixed_binary)!("start";"x"$"alert";0Ng); + +options:(``NULL_MAPPING)!((::);str_opts); + +N:5 + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; 
+ +str_dt:.arrowkdb.dt.utf8[]; +bin_dt:.arrowkdb.dt.binary[]; +fbin_dt:.arrowkdb.dt.fixed_size_binary[16i]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +str_fd:.arrowkdb.fd.field[`string;str_dt]; +bin_fd:.arrowkdb.fd.field[`binary;bin_dt]; +fbin_fd:.arrowkdb.fd.field[`fixed_binary;fbin_dt]; + +-1"\n+----------|| Create the schemas for the list of fields ||----------+\n"; +str_schema:.arrowkdb.sc.schema[(ts_fd,str_fd,bin_fd,fbin_fd)]; + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +str_data:N?("start";"stop";"alert";"acknowledge";""); +str_data[0]:"start" +bin_data:N?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); +bin_data[2]:"x"$"alert" +fbin_data:N?0Ng; +fbin_data[4]:0Ng; + +-1"\n+----------|| Combine the data for all columns ||----------+\n"; +str_data:(ts_data;str_data;bin_data;fbin_data); + +-1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n"; +options[`PARQUET_VERSION]:`V2.0 + +parquet_str:"null_mapping_str.parquet"; +.arrowkdb.pq.writeParquet[parquet_str;str_schema;str_data;options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +parquet_str_schema:.arrowkdb.pq.readParquetSchema[parquet_str]; +.arrowkdb.sc.equalSchemas[str_schema;parquet_str_schema] +str_schema~parquet_str_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +parquet_str_data:.arrowkdb.pq.readParquetData[parquet_str;options]; +parquet_str_data[3]:{0x0 sv x} each parquet_str_data[3] // Convert to GUIDs +str_data~parquet_str_data +rm parquet_str; + +-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; +arrow_str:"null_mapping_str.arrow"; +.arrowkdb.ipc.writeArrow[arrow_str;str_schema;str_data;options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +arrow_str_schema:.arrowkdb.ipc.readArrowSchema[arrow_str]; 
+.arrowkdb.sc.equalSchemas[str_schema;arrow_str_schema] +str_schema~arrow_str_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +arrow_str_data:.arrowkdb.ipc.readArrowData[arrow_str;options]; +arrow_str_data[3]:{0x0 sv x} each arrow_str_data[3] // Convert to GUIDs +str_data~arrow_str_data +rm arrow_str; + +-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; +serialized_str:.arrowkdb.ipc.serializeArrow[str_schema;str_data;options]; + +-1"\n+----------|| Parse the schema back abd compare ||----------+\n"; +stream_str_schema:.arrowkdb.ipc.parseArrowSchema[serialized_str]; +.arrowkdb.sc.equalSchemas[str_schema;stream_str_schema] +str_schema~stream_str_schema + +-1"\n+----------|| Parse the array data back and compare ||----------+\n"; +stream_str_data:.arrowkdb.ipc.parseArrowData[serialized_str;options]; +stream_str_data[3]:{0x0 sv x} each stream_str_data[3] // Convert to GUIDs +str_data~stream_str_data + + +-1 "\n+----------|| Test utils ||----------+\n"; + +show util.buildInfo[] +(type util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; From b012b6c8babe44a17fbbcb85f138b4e9fb609250 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 8 Feb 2023 17:29:41 +0000 Subject: [PATCH 191/276] Null mapping of temporal fields --- tests/.gitignore | 1 + tests/null_mapping_time.t | 89 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 tests/null_mapping_time.t diff --git a/tests/.gitignore b/tests/.gitignore index deb3a89..32522e9 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -3,3 +3,4 @@ null_mapping_short.q null_mapping_long.q null_mapping_float.q null_mapping_str.q +null_mapping_time.q diff --git a/tests/null_mapping_time.t b/tests/null_mapping_time.t new file mode 100644 index 0000000..e06c1f5 --- /dev/null +++ b/tests/null_mapping_time.t @@ -0,0 +1,89 @@ +// null_mapping_time.q + +-1"\n+----------|| Import the 
arrowkdb library ||----------+\n"; +\l arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping ||----------+\n"; +time_opts:(`date32`timestamp`time64)!(2006.07.21;2011.01.01D00:00:00.000000000;12:00:00.000000000); + +options:(``NULL_MAPPING)!((::);time_opts); + +N:5 + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +d32_dt:.arrowkdb.dt.date32[]; +tstamp_dt:.arrowkdb.dt.timestamp[`nano]; +t64_dt:.arrowkdb.dt.time64[`nano]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; +tstamp_fd:.arrowkdb.fd.field[`timestamp;tstamp_dt]; +t64_fd:.arrowkdb.fd.field[`time64;t64_dt]; + +-1"\n+----------|| Create the schemas for the list of fields ||----------+\n"; +time_schema:.arrowkdb.sc.schema[(ts_fd,d32_fd,tstamp_fd,t64_fd)]; + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +d32_data:N?(2006.07.21;2008.07.18;2012.07.16;2014.07.15;2016.07.11); +d32_data[0]:2006.07.21; +tstamp_data:N?(2015.01.01D00:00:00.000000000;2014.01.01D00:00:00.000000000;2013.01.01D00:00:00.000000000;2012.01.01D00:00:00.000000000;2011.01.01D00:00:00.000000000); +tstamp_data[2]:2011.01.01D00:00:00.000000000; +t64_data:N?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); +t64_data[3]:12:00:00.000000000; + +-1"\n+----------|| Combine the data for all columns ||----------+\n"; +time_data:(ts_data;d32_data;tstamp_data;t64_data); + +-1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n"; +options[`PARQUET_VERSION]:`V2.0 + +parquet_time:"null_mapping_time.parquet"; +.arrowkdb.pq.writeParquet[parquet_time;time_schema;time_data;options]; + +-1"\n+----------|| 
Read the schema back and compare ||----------+\n"; +parquet_time_schema:.arrowkdb.pq.readParquetSchema[parquet_time]; +.arrowkdb.sc.equalSchemas[time_schema;parquet_time_schema] +time_schema~parquet_time_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +parquet_time_data:.arrowkdb.pq.readParquetData[parquet_time;options]; +time_data~parquet_time_data +rm parquet_time; + +-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; +arrow_time:"null_mapping_time.arrow"; +.arrowkdb.ipc.writeArrow[arrow_time;time_schema;time_data;options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +arrow_time_schema:.arrowkdb.ipc.readArrowSchema[arrow_time]; +.arrowkdb.sc.equalSchemas[time_schema;arrow_time_schema] +time_schema~arrow_time_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +arrow_time_data:.arrowkdb.ipc.readArrowData[arrow_time;options]; +time_data~arrow_time_data +rm arrow_time; + +-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; +serialized_time:.arrowkdb.ipc.serializeArrow[time_schema;time_data;options]; + +-1"\n+----------|| Parse the schema back abd compare ||----------+\n"; +stream_time_schema:.arrowkdb.ipc.parseArrowSchema[serialized_time]; +.arrowkdb.sc.equalSchemas[time_schema;stream_time_schema] +time_schema~stream_time_schema + +-1"\n+----------|| Parse the array data back and compare ||----------+\n"; +stream_time_data:.arrowkdb.ipc.parseArrowData[serialized_time;options]; +time_data~stream_time_data + + +-1 "\n+----------------------------------------+\n"; \ No newline at end of file From 0a1fdadcbc87b96c09802c6014f1d9f1feb5e744 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 8 Feb 2023 17:53:02 +0000 Subject: [PATCH 192/276] Null mapping of extra fields, unsupported by parquet --- tests/.gitignore | 1 + tests/null_mapping_extra.t | 84 ++++++++++++++++++++++++++++++++++++++ 2 
files changed, 85 insertions(+) create mode 100644 tests/null_mapping_extra.t diff --git a/tests/.gitignore b/tests/.gitignore index 32522e9..e80b02e 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -4,3 +4,4 @@ null_mapping_long.q null_mapping_float.q null_mapping_str.q null_mapping_time.q +null_mapping_extra.q diff --git a/tests/null_mapping_extra.t b/tests/null_mapping_extra.t new file mode 100644 index 0000000..4e6405d --- /dev/null +++ b/tests/null_mapping_extra.t @@ -0,0 +1,84 @@ +// null_mapping_extra.q + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping only in arrow ||----------+\n"; +extra_opts:(`float16`large_string`large_binary`duration)!(9h;"stop";"x"$"acknowledge";12:00:00.000000000); + +options:(``NULL_MAPPING)!((::);extra_opts); + +N:5 + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +f16_dt:.arrowkdb.dt.float16[]; +lstr_dt:.arrowkdb.dt.large_utf8[]; +lbin_dt:.arrowkdb.dt.large_binary[]; +dur_dt:.arrowkdb.dt.duration[`milli]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +f16_fd:.arrowkdb.fd.field[`float16;f16_dt]; +lstr_fd:.arrowkdb.fd.field[`large_string;lstr_dt]; +lbin_fd:.arrowkdb.fd.field[`large_binary;lbin_dt]; +dur_fd:.arrowkdb.fd.field[`duration;dur_dt]; + +-1"\n+----------|| Create the schemas for the list of fields ||----------+\n"; +extra_schema:.arrowkdb.sc.schema[(ts_fd,f16_fd,lstr_fd,lbin_fd,dur_fd)]; + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +f16_data:N?100h; +f16_data[0]:9h; +lstr_data:N?("start";"stop";"alert";"acknowledge";""); +lstr_data[1]:"stop" 
+lbin_data:N?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); +lbin_data[3]:"x"$"acknowledge" +dur_data:N?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); +dur_data[4]:12:00:00.000000000; + +-1"\n+----------|| Combine the data for all columns ||----------+\n"; +extra_data:(ts_data;f16_data;lstr_data;lbin_data;dur_data); + +-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; +arrow_extra:"null_mapping_extra.arrow"; +.arrowkdb.ipc.writeArrow[arrow_extra;extra_schema;extra_data;options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +arrow_extra_schema:.arrowkdb.ipc.readArrowSchema[arrow_extra]; +.arrowkdb.sc.equalSchemas[extra_schema;arrow_extra_schema] +extra_schema~arrow_extra_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +arrow_extra_data:.arrowkdb.ipc.readArrowData[arrow_extra;options]; +extra_data~arrow_extra_data +rm arrow_extra; + +-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; +serialized_extra:.arrowkdb.ipc.serializeArrow[extra_schema;extra_data;options]; + +-1"\n+----------|| Parse the schema back abd compare ||----------+\n"; +stream_extra_schema:.arrowkdb.ipc.parseArrowSchema[serialized_extra]; +.arrowkdb.sc.equalSchemas[extra_schema;stream_extra_schema] +extra_schema~stream_extra_schema + +-1"\n+----------|| Parse the array data back and compare ||----------+\n"; +stream_extra_data:.arrowkdb.ipc.parseArrowData[serialized_extra;options]; + +extra_data~stream_extra_data + + +-1 "\n+----------|| Test utils ||----------+\n"; + +show util.buildInfo[] +(type util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; From 17c033aa74d0cc6f1cb6f27c96fd2e2ba4fd6a16 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 8 Feb 2023 18:02:25 +0000 Subject: [PATCH 193/276] Null mapping of other fields, unsupported by parquet --- 
tests/.gitignore | 1 + tests/null_mapping_other.t | 83 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) create mode 100644 tests/null_mapping_other.t diff --git a/tests/.gitignore b/tests/.gitignore index e80b02e..6b77dee 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -5,3 +5,4 @@ null_mapping_float.q null_mapping_str.q null_mapping_time.q null_mapping_extra.q +null_mapping_other.q diff --git a/tests/null_mapping_other.t b/tests/null_mapping_other.t new file mode 100644 index 0000000..bff0dc3 --- /dev/null +++ b/tests/null_mapping_other.t @@ -0,0 +1,83 @@ +// null_mapping_other.q + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping only in arrow ||----------+\n"; +other_opts:(`date64`time32`month_interval`day_time_interval)!(2015.01.01D00:00:00.000000000;09:01:02.042;2006.07m;12:00:00.000000000); + +options:(``NULL_MAPPING)!((::);other_opts); + +N:5 + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +d64_dt:.arrowkdb.dt.date64[]; +t32_dt:.arrowkdb.dt.time32[`milli]; +mint_dt:.arrowkdb.dt.month_interval[]; +dtint_dt:.arrowkdb.dt.day_time_interval[]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +d64_fd:.arrowkdb.fd.field[`date64;d64_dt]; +t32_fd:.arrowkdb.fd.field[`time32;t32_dt]; +mint_fd:.arrowkdb.fd.field[`month_interval;mint_dt]; +dtint_fd:.arrowkdb.fd.field[`day_time_interval;dtint_dt]; + +-1"\n+----------|| Create the schemas for the list of fields ||----------+\n"; +other_schema:.arrowkdb.sc.schema[(ts_fd,d64_fd,t32_fd,mint_fd,dtint_fd)]; + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + 
+d64_data:N?(2015.01.01D00:00:00.000000000;2017.01.01D00:00:00.000000000;2018.01.01D00:00:00.000000000;2019.01.01D00:00:00.000000000;2020.01.01D00:00:00.000000000); +d64_data[1]:2015.01.01D00:00:00.000000000; +t32_data:N?(09:01:02.042;08:01:02.042;07:01:02.042;06:01:02.042;05:01:02.042); +t32_data[1]:09:01:02.042; +mint_data:N?(2006.07m;2006.06m;2006.05m;2006.04m;2006.03m); +mint_data[2]:2006.07m; +dtint_data:N?(12:00:00.000000000;11:00:00.000000000;10:00:00.000000000;09:00:00.000000000;08:00:00.000000000); +dtint_data[3]:12:00:00.000000000; + +-1"\n+----------|| Combine the data for all columns ||----------+\n"; +other_data:(ts_data;d64_data;t32_data;mint_data;dtint_data); + +-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; +arrow_other:"null_mapping_other.arrow"; +.arrowkdb.ipc.writeArrow[arrow_other;other_schema;other_data;options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +arrow_other_schema:.arrowkdb.ipc.readArrowSchema[arrow_other]; +.arrowkdb.sc.equalSchemas[other_schema;arrow_other_schema] +other_schema~arrow_other_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +arrow_other_data:.arrowkdb.ipc.readArrowData[arrow_other;options]; +other_data~arrow_other_data +rm arrow_other; + +-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; +serialized_other:.arrowkdb.ipc.serializeArrow[other_schema;other_data;options]; + +-1"\n+----------|| Parse the schema back abd compare ||----------+\n"; +stream_other_schema:.arrowkdb.ipc.parseArrowSchema[serialized_other]; +.arrowkdb.sc.equalSchemas[other_schema;stream_other_schema] +other_schema~stream_other_schema + +-1"\n+----------|| Parse the array data back and compare ||----------+\n"; +stream_other_data:.arrowkdb.ipc.parseArrowData[serialized_other;options]; +other_data~stream_other_data + + +-1 "\n+----------|| Test utils ||----------+\n"; + +show util.buildInfo[] +(type 
util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; From d3e2bc03e6664f01592c2b529042e7d8a11d4e37 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 13 Feb 2023 15:51:18 +0000 Subject: [PATCH 194/276] Disabled OSX and Windows builds --- .travis.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 7deb2e9..3084fc5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,9 +4,9 @@ jobs: os: linux - dist: focal os: linux - - osx_image: xcode12.5 - os: osx - - os: windows +# - osx_image: xcode12.5 +# os: osx +# - os: windows language: c compiler: gcc os: linux From 5ac249d7874b0920e9e99f8153bf5efe13deb71d Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 13 Feb 2023 16:16:07 +0000 Subject: [PATCH 195/276] Extra namespace for build info --- tests/null_mapping_extra.t | 4 ++-- tests/null_mapping_float.t | 4 ++-- tests/null_mapping_long.t | 4 ++-- tests/null_mapping_other.t | 4 ++-- tests/null_mapping_short.t | 6 +++--- tests/null_mapping_str.t | 4 ++-- tests/null_mapping_time.t | 8 +++++++- 7 files changed, 20 insertions(+), 14 deletions(-) diff --git a/tests/null_mapping_extra.t b/tests/null_mapping_extra.t index 4e6405d..b269370 100644 --- a/tests/null_mapping_extra.t +++ b/tests/null_mapping_extra.t @@ -77,8 +77,8 @@ extra_data~stream_extra_data -1 "\n+----------|| Test utils ||----------+\n"; -show util.buildInfo[] -(type util.buildInfo[])~99h +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h -1 "\n+----------|| Finished testing ||----------+\n"; diff --git a/tests/null_mapping_float.t b/tests/null_mapping_float.t index 96a2d26..d036af0 100644 --- a/tests/null_mapping_float.t +++ b/tests/null_mapping_float.t @@ -89,8 +89,8 @@ float_data~stream_float_data -1 "\n+----------|| Test utils ||----------+\n"; -show util.buildInfo[] -(type util.buildInfo[])~99h +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h -1 "\n+----------|| 
Finished testing ||----------+\n"; diff --git a/tests/null_mapping_long.t b/tests/null_mapping_long.t index db4c2b5..fcc1541 100644 --- a/tests/null_mapping_long.t +++ b/tests/null_mapping_long.t @@ -92,8 +92,8 @@ long_data~stream_long_data -1 "\n+----------|| Test utils ||----------+\n"; -show util.buildInfo[] -(type util.buildInfo[])~99h +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h -1 "\n+----------|| Finished testing ||----------+\n"; diff --git a/tests/null_mapping_other.t b/tests/null_mapping_other.t index bff0dc3..05cb069 100644 --- a/tests/null_mapping_other.t +++ b/tests/null_mapping_other.t @@ -76,8 +76,8 @@ other_data~stream_other_data -1 "\n+----------|| Test utils ||----------+\n"; -show util.buildInfo[] -(type util.buildInfo[])~99h +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h -1 "\n+----------|| Finished testing ||----------+\n"; diff --git a/tests/null_mapping_short.t b/tests/null_mapping_short.t index 59dc02d..f2c8816 100644 --- a/tests/null_mapping_short.t +++ b/tests/null_mapping_short.t @@ -96,8 +96,8 @@ short_data~stream_short_data -1 "\n+----------|| Test utils ||----------+\n"; -show util.buildInfo[] -(type util.buildInfo[])~99h +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h --1 "\n+----------|| Finished testing ||----------+\n"; \ No newline at end of file +-1 "\n+----------|| Finished testing ||----------+\n"; diff --git a/tests/null_mapping_str.t b/tests/null_mapping_str.t index 95bdd95..8220eac 100644 --- a/tests/null_mapping_str.t +++ b/tests/null_mapping_str.t @@ -91,8 +91,8 @@ str_data~stream_str_data -1 "\n+----------|| Test utils ||----------+\n"; -show util.buildInfo[] -(type util.buildInfo[])~99h +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h -1 "\n+----------|| Finished testing ||----------+\n"; diff --git a/tests/null_mapping_time.t b/tests/null_mapping_time.t index e06c1f5..d24914b 100644 --- a/tests/null_mapping_time.t +++ 
b/tests/null_mapping_time.t @@ -86,4 +86,10 @@ stream_time_data:.arrowkdb.ipc.parseArrowData[serialized_time;options]; time_data~stream_time_data --1 "\n+----------------------------------------+\n"; \ No newline at end of file +-1 "\n+----------|| Test utils ||----------+\n"; + +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; From 0ccac8881cf804cf76b3361949c02d7312472c6f Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 13 Feb 2023 16:35:07 +0000 Subject: [PATCH 196/276] Adjusting travis root repo folder --- tests/null_mapping_extra.t | 2 +- tests/null_mapping_float.t | 2 +- tests/null_mapping_long.t | 2 +- tests/null_mapping_other.t | 2 +- tests/null_mapping_short.t | 2 +- tests/null_mapping_str.t | 2 +- tests/null_mapping_time.t | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/null_mapping_extra.t b/tests/null_mapping_extra.t index b269370..3ac2e1b 100644 --- a/tests/null_mapping_extra.t +++ b/tests/null_mapping_extra.t @@ -1,7 +1,7 @@ // null_mapping_extra.q -1"\n+----------|| Import the arrowkdb library ||----------+\n"; -\l arrowkdb.q +\l q/arrowkdb.q -1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; diff --git a/tests/null_mapping_float.t b/tests/null_mapping_float.t index d036af0..e61b9ed 100644 --- a/tests/null_mapping_float.t +++ b/tests/null_mapping_float.t @@ -1,7 +1,7 @@ // null_mapping_float.q -1"\n+----------|| Import the arrowkdb library ||----------+\n"; -\l arrowkdb.q +\l q/arrowkdb.q -1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; diff --git a/tests/null_mapping_long.t b/tests/null_mapping_long.t index fcc1541..6c64a4b 100644 --- a/tests/null_mapping_long.t +++ b/tests/null_mapping_long.t @@ -1,7 +1,7 @@ // 
null_mapping_long.q -1"\n+----------|| Import the arrowkdb library ||----------+\n"; -\l arrowkdb.q +\l q/arrowkdb.q -1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; diff --git a/tests/null_mapping_other.t b/tests/null_mapping_other.t index 05cb069..6228a28 100644 --- a/tests/null_mapping_other.t +++ b/tests/null_mapping_other.t @@ -1,7 +1,7 @@ // null_mapping_other.q -1"\n+----------|| Import the arrowkdb library ||----------+\n"; -\l arrowkdb.q +\l q/arrowkdb.q -1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; diff --git a/tests/null_mapping_short.t b/tests/null_mapping_short.t index f2c8816..1c4dfec 100644 --- a/tests/null_mapping_short.t +++ b/tests/null_mapping_short.t @@ -1,7 +1,7 @@ // null_mapping_short.t -1"\n+----------|| Import the arrowkdb library ||----------+\n"; -\l arrowkdb.q +\l q/arrowkdb.q -1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; diff --git a/tests/null_mapping_str.t b/tests/null_mapping_str.t index 8220eac..295d533 100644 --- a/tests/null_mapping_str.t +++ b/tests/null_mapping_str.t @@ -1,7 +1,7 @@ // null_mapping_str.q -1"\n+----------|| Import the arrowkdb library ||----------+\n"; -\l arrowkdb.q +\l q/arrowkdb.q -1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; diff --git a/tests/null_mapping_time.t b/tests/null_mapping_time.t index d24914b..c006ed4 100644 --- a/tests/null_mapping_time.t +++ b/tests/null_mapping_time.t @@ -1,7 +1,7 @@ // null_mapping_time.q -1"\n+----------|| Import the arrowkdb library ||----------+\n"; -\l arrowkdb.q +\l q/arrowkdb.q -1"\n+----------|| Filesystem functions 
for Linux/MacOS/Windows ||----------+\n"; rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; From 06231ffbd3c71c0bf60688e6b241991d54940b98 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 13 Feb 2023 17:08:27 +0000 Subject: [PATCH 197/276] Renaming to prevent caching of datatypes, fields and schemas --- tests/.gitignore | 2 +- tests/{test.t => basic.t} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename tests/{test.t => basic.t} (100%) diff --git a/tests/.gitignore b/tests/.gitignore index 6b77dee..c0832d1 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -1,4 +1,4 @@ -test.q +basic.q null_mapping_short.q null_mapping_long.q null_mapping_float.q diff --git a/tests/test.t b/tests/basic.t similarity index 100% rename from tests/test.t rename to tests/basic.t From e61b247d26938eb6593772cf08093f79c47c3e47 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 9 Feb 2023 14:55:44 +0300 Subject: [PATCH 198/276] Example for bitmap reading option --- examples/null_bitmap.q | 87 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 examples/null_bitmap.q diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q new file mode 100644 index 0000000..68f1bfd --- /dev/null +++ b/examples/null_bitmap.q @@ -0,0 +1,87 @@ +// null_bitmap.q +// Example of exposing null bitmap as a separate structure to kdb + +-1"\n+----------|| null_bitmap.q ||----------+\n"; + +// import the arrowkdb library +\l arrowkdb.q + +// Filesystem functions for Linux/MacOS/Windows +ls:{[filename] $[.z.o like "w*";system "dir /b ",filename;system "ls ",filename]}; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +///////////////////////// +// CONSTRUCTED SCHEMAS // +///////////////////////// + +//-------------------// +// Create the schema // +//-------------------// + +// Support null mapping +bitmap_opts:(`bool`int32`float64`string`date32)!(0b;1i;2.34;"start";2006.07.21); 
+ +options:(``NULL_MAPPING)!((::);bitmap_opts); + +// Create the datatype identifiers +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +bool_dt:.arrowkdb.dt.boolean[]; +i32_dt:.arrowkdb.dt.int32[]; +f64_dt:.arrowkdb.dt.float64[]; +str_dt:.arrowkdb.dt.utf8[]; +d32_dt:.arrowkdb.dt.date32[]; + +// Create the field identifiers +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +bool_fd:.arrowkdb.fd.field[`bool;bool_dt]; +i32_fd:.arrowkdb.fd.field[`int32;i32_dt]; +f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; +str_fd:.arrowkdb.fd.field[`string;str_dt]; +d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; + +// Create the schemas for the list of fields +bitmap_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,i32_fd,f64_fd,str_fd,d32_fd)]; + +// Print the schema +.arrowkdb.sc.printSchema[bitmap_schema]; + +// Create data for each column in the table +ts_data:asc N?0p; + +bool_data:N?(0b;1b); +bool_data[0]:0b; +i32_data:N?100i; +i32_data[1]:1i; +f64_data:N?100f; +f64_data[2]:2.34f; +str_data:N?("start";"stop";"alert";"acknowledge";""); +str_data[3]:"start" +d32_data:N?(2006.07.21;2005.07.18;2004.07.16;2003.07.15;2002.07.11); +d32_data[4]:2006.07.21; + +// Combine the data for all columns +bitmap_data:(ts_data;bool_data;i32_data;f64_data;str_data;d32_data); + +// Pretty print the Arrow table populated from the bitmap data +.arrowkdb.tb.prettyPrintTable[bitmap_schema;bitmap_data;options]; + +//-------------------------// +// Example-1. 
Parquet file // +//-------------------------// + +// Write the schema and array data to a parquet file +options[`PARQUET_VERSION]:`V2.0 + +parquet_bitmap:"null_bitmap.parquet"; +.arrowkdb.pq.writeParquet[parquet_bitmap;bitmap_schema;bitmap_data;options]; +show ls parquet_bitmap + +// Read the parquet file into another table +parquet_table:.arrowkdb.pq.readParquetToTable[parquet_bitmap;(``WITH_NULL_BITMAP)!((::);1)]; +.arrowkdb.tb.prettyPrintTableFromTable[parquet_table;::]; + +// Compare the kdb+ tables +show bitmap_data~parquet_table +//rm parquet_bitmap; \ No newline at end of file From 761a32abce356bf8d6a29fc6cf7e04f76616aa91 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 10 Feb 2023 13:01:32 +0300 Subject: [PATCH 199/276] Example of reading Parquet's data with null bitmap --- examples/null_bitmap.q | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index 68f1bfd..9f3faeb 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -72,16 +72,13 @@ bitmap_data:(ts_data;bool_data;i32_data;f64_data;str_data;d32_data); //-------------------------// // Write the schema and array data to a parquet file -options[`PARQUET_VERSION]:`V2.0 +options[`PARQUET_VERSION]:`V2.0; parquet_bitmap:"null_bitmap.parquet"; .arrowkdb.pq.writeParquet[parquet_bitmap;bitmap_schema;bitmap_data;options]; show ls parquet_bitmap -// Read the parquet file into another table -parquet_table:.arrowkdb.pq.readParquetToTable[parquet_bitmap;(``WITH_NULL_BITMAP)!((::);1)]; -.arrowkdb.tb.prettyPrintTableFromTable[parquet_table;::]; - -// Compare the kdb+ tables -show bitmap_data~parquet_table -//rm parquet_bitmap; \ No newline at end of file +// Read the array data back and compare +parquet_bitmap_data:.arrowkdb.pq.readParquetData[parquet_bitmap;(``WITH_NULL_BITMAP)!((::);1)]; +show bitmap_data~parquet_bitmap_data +rm parquet_bitmap; From 40dbc063c705b9e8739c7f4fe11c94f9142885f5 Mon Sep 17 00:00:00 2001 
From: Vyacheslav Grechin Date: Fri, 10 Feb 2023 13:04:17 +0300 Subject: [PATCH 200/276] Null bitmap reader --- src/ArrayReader.cpp | 36 ++++++++++++++++++++++++++++++++++++ src/ArrayReader.h | 24 ++++++++++++++++++++++++ src/KdbOptions.h | 2 ++ src/TableData.cpp | 14 ++++++++++++++ 4 files changed, 76 insertions(+) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 30e34b4..3a3e336 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -634,6 +634,30 @@ void AppendArray(shared_ptr array_data, K k_array, size_t& index, } } +void AppendNullBitmap( shared_ptr array_data, K k_bitmap, size_t& index ) +{ + auto type_id = array_data->type_id(); + const uint8_t* null_data = array_data->null_bitmap_data(); + if( null_data == nullptr || array_data->null_count() == 0 + || arrow::Type::LIST == type_id + || arrow::Type::LARGE_LIST == type_id + || arrow::Type::FIXED_SIZE_LIST == type_id + || arrow::Type::MAP == type_id + || arrow::Type::STRUCT == type_id + || arrow::Type::SPARSE_UNION == type_id + || arrow::Type::DENSE_UNION == type_id + || arrow::Type::DICTIONARY == type_id ){ + memset( &kG( k_bitmap )[index], 0, array_data->length() ); + index += array_data->length(); + } + else{ + for(auto i = 0; i < array_data->length(); ++i ){ + kG( k_bitmap )[index] = null_data[index]; + ++index; + } + } +} + K InitKdbForArray(shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides) { switch (datatype->id()) { @@ -696,6 +720,18 @@ K ReadChunkedArray(shared_ptr chunked_array, TypeMappingOve return k_array; } +K ReadChunkedNullBitmap( shared_ptr chunked_array, TypeMappingOverride& type_overrides ) +{ + auto bitmapDatatype = std::make_shared(); + K k_bitmap = InitKdbForArray( bitmapDatatype, chunked_array->length(), type_overrides ); + size_t index = 0; + for( auto i = 0; i < chunked_array->num_chunks(); ++i ){ + AppendNullBitmap( chunked_array->chunk( i ), k_bitmap, index ); + } + + return k_bitmap; +} + } // namespace arrowkdb } // namspace kx diff --git 
a/src/ArrayReader.h b/src/ArrayReader.h index 3298190..16c72b5 100644 --- a/src/ArrayReader.h +++ b/src/ArrayReader.h @@ -26,6 +26,22 @@ namespace arrowkdb { */ void AppendArray(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides); + +/** + * @brief Appends null bitmap data from an arrow array into an existing kdb boolean + * list starting at the specified index. + * + * @param array_data The arrow array from which to source the data. The entire + * array will be appended. + * @param k_bitmap The kdb boolean list that the data should be inserted into. + * This list needs to have been created with the correct length by the calling + * function. + * @param index The index into the kdb list at which the appending should + * begin. Index will be updated to account for the new offset by adding the + * length of the array array. +*/ +void AppendNullBitmap( std::shared_ptr array_data, K k_bitmap, size_t& index ); + /** * @brief Copies and converts an arrow array to a kdb list * @@ -45,6 +61,14 @@ K ReadArray(std::shared_ptr array, TypeMappingOverride& type_overr */ K ReadChunkedArray(std::shared_ptr chunked_array, TypeMappingOverride& type_overrides); +/** + * @brief Extracts nulls bitmap of an arrow array into a boolean kdb list + * + * @param chunked_array The chunked array to be converted + * @return A kdb list representing the nulls bitmap +*/ +K ReadChunkedNullBitmap( std::shared_ptr chunked_array, TypeMappingOverride& type_overrides ); + /** * @brief Creates a kdb list of the correct type and specified length according * to the arrow datatype. 
For the arrow struct/union datatypes this includes diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 6a8fc84..6e7d59f 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -68,6 +68,7 @@ namespace Options const std::string PARQUET_MULTITHREADED_READ = "PARQUET_MULTITHREADED_READ"; const std::string USE_MMAP = "USE_MMAP"; const std::string DECIMAL128_AS_DOUBLE = "DECIMAL128_AS_DOUBLE"; + const std::string WITH_NULL_BITMAP = "WITH_NULL_BITMAP"; // String options const std::string PARQUET_VERSION = "PARQUET_VERSION"; @@ -108,6 +109,7 @@ namespace Options PARQUET_MULTITHREADED_READ, USE_MMAP, DECIMAL128_AS_DOUBLE, + WITH_NULL_BITMAP }; const static std::set string_options = { PARQUET_VERSION, diff --git a/src/TableData.cpp b/src/TableData.cpp index 0deeb88..22db1f2 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -294,6 +294,20 @@ K readParquetData(K parquet_file, K options) kK(data)[i] = kx::arrowkdb::ReadChunkedArray(chunked_array, type_overrides); } + int64_t with_null_bitmap = 0; + read_options.GetIntOption( kx::arrowkdb::Options::WITH_NULL_BITMAP, with_null_bitmap ); + if( with_null_bitmap ){ + K bitmap = ktn( 0, col_num ); + for( auto i = 0; i < col_num; ++i ){ + auto chunked_array = table->column( i ); + kK( bitmap )[i] = kx::arrowkdb::ReadChunkedNullBitmap( chunked_array, type_overrides ); + } + K array = data; + data = ktn( 0, 2 ); + kK( data )[0] = array; + kK( data )[1] = bitmap; + } + return data; KDB_EXCEPTION_CATCH; From d1080093fd41c414bff2128818618dd3db3fcbe4 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 10 Feb 2023 20:25:18 +0300 Subject: [PATCH 201/276] Null bitmap debugging changes --- src/ArrayReader.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 3a3e336..16c1043 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -637,8 +637,7 @@ void AppendArray(shared_ptr array_data, K k_array, size_t& index, void AppendNullBitmap( 
shared_ptr array_data, K k_bitmap, size_t& index ) { auto type_id = array_data->type_id(); - const uint8_t* null_data = array_data->null_bitmap_data(); - if( null_data == nullptr || array_data->null_count() == 0 + if( array_data->null_count() == 0 || arrow::Type::LIST == type_id || arrow::Type::LARGE_LIST == type_id || arrow::Type::FIXED_SIZE_LIST == type_id @@ -651,8 +650,8 @@ void AppendNullBitmap( shared_ptr array_data, K k_bitmap, size_t& index += array_data->length(); } else{ - for(auto i = 0; i < array_data->length(); ++i ){ - kG( k_bitmap )[index] = null_data[index]; + for( auto i = 0; i < array_data->length(); ++i ){ + kG( k_bitmap )[index] = array_data->IsNull( index ); ++index; } } @@ -722,8 +721,8 @@ K ReadChunkedArray(shared_ptr chunked_array, TypeMappingOve K ReadChunkedNullBitmap( shared_ptr chunked_array, TypeMappingOverride& type_overrides ) { - auto bitmapDatatype = std::make_shared(); - K k_bitmap = InitKdbForArray( bitmapDatatype, chunked_array->length(), type_overrides ); + auto boolean = std::make_shared(); + K k_bitmap = InitKdbForArray( boolean, chunked_array->length(), type_overrides ); size_t index = 0; for( auto i = 0; i < chunked_array->num_chunks(); ++i ){ AppendNullBitmap( chunked_array->chunk( i ), k_bitmap, index ); From 9f4c6758e38243499befcea8f847b8c698ea6609 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 10 Feb 2023 20:26:41 +0300 Subject: [PATCH 202/276] Bitmap test improved --- examples/null_bitmap.q | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index 9f3faeb..b45e347 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -79,6 +79,12 @@ parquet_bitmap:"null_bitmap.parquet"; show ls parquet_bitmap // Read the array data back and compare -parquet_bitmap_data:.arrowkdb.pq.readParquetData[parquet_bitmap;(``WITH_NULL_BITMAP)!((::);1)]; -show bitmap_data~parquet_bitmap_data +options[`WITH_NULL_BITMAP]:1; 
+parquet_bitmap_data:.arrowkdb.pq.readParquetData[parquet_bitmap;options]; +show bitmap_data~first parquet_bitmap_data + +nulls_data:1b,(N-1)?1b +bitmap_nulls:{x rotate nulls_data} each neg til {x-1} count bitmap_data +parquet_bitmap_nulls:last parquet_bitmap_data +show bitmap_nulls~bitmap_nulls & sublist[{1-x} count parquet_bitmap_nulls;parquet_bitmap_nulls] rm parquet_bitmap; From 4ebe721d0b30c3bc96e6a41b0bcb83bf9dcf57ed Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 13 Feb 2023 13:50:48 +0300 Subject: [PATCH 203/276] Example of reading of Null's bitmap from Arrow --- examples/null_bitmap.q | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index b45e347..6bfd3b3 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -83,8 +83,29 @@ options[`WITH_NULL_BITMAP]:1; parquet_bitmap_data:.arrowkdb.pq.readParquetData[parquet_bitmap;options]; show bitmap_data~first parquet_bitmap_data -nulls_data:1b,(N-1)?1b -bitmap_nulls:{x rotate nulls_data} each neg til {x-1} count bitmap_data -parquet_bitmap_nulls:last parquet_bitmap_data +nulls_data:1b,(N-1)?1b; +bitmap_nulls:{x rotate nulls_data} each neg til {x-1} count bitmap_data; +parquet_bitmap_nulls:last parquet_bitmap_data; show bitmap_nulls~bitmap_nulls & sublist[{1-x} count parquet_bitmap_nulls;parquet_bitmap_nulls] rm parquet_bitmap; + +//---------------------------// +// Example-2. 
Arrow IPC file // +//---------------------------// + +// Write the schema and array data to an arrow file +arrow_bitmap:"null_bitmap.arrow"; +.arrowkdb.ipc.writeArrow[arrow_bitmap;bitmap_schema;bitmap_data;options]; +show ls arrow_bitmap + +// Read the schema back and compare +arrow_bitmap_schema:.arrowkdb.ipc.readArrowSchema[arrow_bitmap]; +show .arrowkdb.sc.equalSchemas[bitmap_schema;arrow_bitmap_schema] +show bitmap_schema~arrow_bitmap_schema + +// Read the array data back and compare +arrow_bitmap_data:.arrowkdb.ipc.readArrowData[arrow_bitmap;options]; +show bitmap_data~first arrow_bitmap_data +arrow_bitmap_nulls:last arrow_bitmap_data; +show bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bitmap_nulls] +rm arrow_bitmap; From a29346a1b39ec75b8da3ac744daf6c793485ec72 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 13 Feb 2023 13:52:29 +0300 Subject: [PATCH 204/276] Read Arraw data with Null's bitmap --- src/TableData.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/TableData.cpp b/src/TableData.cpp index 22db1f2..39ed95a 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -561,6 +561,23 @@ K readArrowData(K arrow_file, K options) kK(data)[i] = kx::arrowkdb::ReadChunkedArray(chunked_array, type_overrides); } + int64_t with_null_bitmap = 0; + read_options.GetIntOption( kx::arrowkdb::Options::WITH_NULL_BITMAP, with_null_bitmap ); + if( with_null_bitmap ){ + K bitmap = ktn( 0, col_num ); + for( auto i = 0; i < col_num; ++i ){ + arrow::ArrayVector column_arrays; + for (auto batch : all_batches) + column_arrays.push_back(batch->column(i)); + auto chunked_array = std::make_shared(column_arrays); + kK( bitmap )[i] = kx::arrowkdb::ReadChunkedNullBitmap( chunked_array, type_overrides ); + } + K array = data; + data = ktn( 0, 2 ); + kK( data )[0] = array; + kK( data )[1] = bitmap; + } + return data; KDB_EXCEPTION_CATCH; From b0bf5d819870129af8596725e8be02ffcd74e4f9 Mon Sep 17 00:00:00 2001 From: 
Vyacheslav Grechin Date: Mon, 13 Feb 2023 20:45:27 +0300 Subject: [PATCH 205/276] Arrow stream example for null bitmap --- examples/null_bitmap.q | 28 +++++++++++++++++++++++++++- examples/null_mapping.q | 2 +- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index 6bfd3b3..c8a02fd 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -4,7 +4,7 @@ -1"\n+----------|| null_bitmap.q ||----------+\n"; // import the arrowkdb library -\l arrowkdb.q +\l q/arrowkdb.q // Filesystem functions for Linux/MacOS/Windows ls:{[filename] $[.z.o like "w*";system "dir /b ",filename;system "ls ",filename]}; @@ -109,3 +109,29 @@ show bitmap_data~first arrow_bitmap_data arrow_bitmap_nulls:last arrow_bitmap_data; show bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bitmap_nulls] rm arrow_bitmap; + +//-----------------------------// +// Example-3. Arrow IPC stream // +//-----------------------------// + +// Serialize the schema and array data to an arrow stream +serialized_bitmap:.arrowkdb.ipc.serializeArrow[bitmap_schema;bitmap_data;options]; +show serialized_bitmap + +// Parse the schema back abd compare +stream_bitmap_schema:.arrowkdb.ipc.parseArrowSchema[serialized_bitmap]; +show .arrowkdb.sc.equalSchemas[bitmap_schema;stream_bitmap_schema] +show bitmap_schema~stream_bitmap_schema + +// Parse the array data back and compare +stream_bitmap_data:.arrowkdb.ipc.parseArrowData[serialized_bitmap;options]; +show bitmap_data~first stream_bitmap_data + +stream_bitmap_nulls:last stream_bitmap_data; +show bitmap_nulls~bitmap_nulls & sublist[{1-x} count stream_bitmap_nulls;stream_bitmap_nulls] + + +-1 "\n+----------------------------------------+\n"; + +// Process off +exit 0; diff --git a/examples/null_mapping.q b/examples/null_mapping.q index 627485d..470dd29 100644 --- a/examples/null_mapping.q +++ b/examples/null_mapping.q @@ -4,7 +4,7 @@ -1"\n+----------|| null_mapping.q ||----------+\n"; // 
import the arrowkdb library -\l arrowkdb.q +\l q/arrowkdb.q // Filesystem functions for Linux/MacOS/Windows ls:{[filename] $[.z.o like "w*";system "dir /b ",filename;system "ls ",filename]}; From fedb1ce2dbe55520272631e6e9c7752a706282d9 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 14 Feb 2023 10:55:36 +0300 Subject: [PATCH 206/276] Read Arrow stream with null bitmap --- src/TableData.cpp | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/TableData.cpp b/src/TableData.cpp index 39ed95a..5d84bf9 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -567,9 +567,9 @@ K readArrowData(K arrow_file, K options) K bitmap = ktn( 0, col_num ); for( auto i = 0; i < col_num; ++i ){ arrow::ArrayVector column_arrays; - for (auto batch : all_batches) - column_arrays.push_back(batch->column(i)); - auto chunked_array = std::make_shared(column_arrays); + for( auto batch: all_batches ) + column_arrays.push_back( batch->column( i ) ); + auto chunked_array = std::make_shared( column_arrays ); kK( bitmap )[i] = kx::arrowkdb::ReadChunkedNullBitmap( chunked_array, type_overrides ); } K array = data; @@ -695,6 +695,23 @@ K parseArrowData(K char_array, K options) kK(data)[i] = kx::arrowkdb::ReadChunkedArray(chunked_array, type_overrides); } + int64_t with_null_bitmap = 0; + read_options.GetIntOption( kx::arrowkdb::Options::WITH_NULL_BITMAP, with_null_bitmap ); + if( with_null_bitmap ){ + K bitmap = ktn( 0, col_num ); + for( auto i = 0; i < col_num; ++i ){ + arrow::ArrayVector column_arrays; + for( auto batch: all_batches ) + column_arrays.push_back( batch->column( i ) ); + auto chunked_array = std::make_shared( column_arrays ); + kK( bitmap )[i] = kx::arrowkdb::ReadChunkedNullBitmap( chunked_array, type_overrides ); + } + K array = data; + data = ktn( 0, 2 ); + kK( data )[0] = array; + kK( data )[1] = bitmap; + } + return data; KDB_EXCEPTION_CATCH; From 9c6566e39484599fb8a75bc8e7ca623f1d721424 Mon Sep 17 00:00:00 2001 From: 
Vyacheslav Grechin Date: Tue, 14 Feb 2023 11:59:35 +0300 Subject: [PATCH 207/276] Cleanup compiler warnings --- src/KdbOptions.h | 133 ++++++++++++++++++----------------------------- 1 file changed, 52 insertions(+), 81 deletions(-) diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 6e7d59f..c60e6e0 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -15,51 +15,6 @@ namespace kx { namespace arrowkdb { -template -constexpr auto toUType( E option ) noexcept -{ - return static_cast>( option ); -} - -template< typename E > -struct ETraits -{ - using Options = std::unordered_map; - - static std::string mapping( E option ){ - auto it = options.find( option ); - if( it != options.end() ){ - return it->second; - } - - return "unknown"; - } - - static std::string mapping( int option ) { return mapping( static_cast( option ) ); } - - static std::set mappings(){ - std::set values; - transform( options.begin(), options.end(), std::inserter( values, end( values ) ), []( const auto& option ){ - return option.second; - } ); - - return values; - } - - static E option( const std::string& value ){ - auto it = std::find_if( options.begin(), options.end(), [&value]( const auto& option ){ - return value == option.second; - } ); - if( it != options.end() ){ - return it->first; - } - - return E( 0 ); - } - - static const Options options; -}; - // Supported options namespace Options { @@ -186,36 +141,6 @@ namespace Options } // namespace Options -template<> -inline const ETraits::Options ETraits::options{ - { arrow::Type::BOOL, arrowkdb::Options::NM_BOOLEAN } - , { arrow::Type::UINT8, arrowkdb::Options::NM_UINT_8 } - , { arrow::Type::INT8, arrowkdb::Options::NM_INT_8 } - , { arrow::Type::UINT16, arrowkdb::Options::NM_UINT_16 } - , { arrow::Type::INT16, arrowkdb::Options::NM_INT_16 } - , { arrow::Type::UINT32, arrowkdb::Options::NM_UINT_32 } - , { arrow::Type::INT32, arrowkdb::Options::NM_INT_32 } - , { arrow::Type::UINT64, arrowkdb::Options::NM_UINT_64 } - , { arrow::Type::INT64, 
arrowkdb::Options::NM_INT_64 } - , { arrow::Type::HALF_FLOAT, arrowkdb::Options::NM_FLOAT_16 } - , { arrow::Type::FLOAT, arrowkdb::Options::NM_FLOAT_32 } - , { arrow::Type::DOUBLE, arrowkdb::Options::NM_FLOAT_64 } - , { arrow::Type::STRING, arrowkdb::Options::NM_STRING } - , { arrow::Type::LARGE_STRING, arrowkdb::Options::NM_LARGE_STRING } - , { arrow::Type::BINARY, arrowkdb::Options::NM_BINARY } - , { arrow::Type::LARGE_BINARY, arrowkdb::Options::NM_LARGE_BINARY } - , { arrow::Type::FIXED_SIZE_BINARY, arrowkdb::Options::NM_FIXED_BINARY } - , { arrow::Type::DATE32, arrowkdb::Options::NM_DATE_32 } - , { arrow::Type::DATE64, arrowkdb::Options::NM_DATE_64 } - , { arrow::Type::TIMESTAMP, arrowkdb::Options::NM_TIMESTAMP } - , { arrow::Type::TIME32, arrowkdb::Options::NM_TIME_32 } - , { arrow::Type::TIME64, arrowkdb::Options::NM_TIME_64 } - , { arrow::Type::DECIMAL, arrowkdb::Options::NM_DECIMAL } - , { arrow::Type::DURATION, arrowkdb::Options::NM_DURATION } - , { arrow::Type::INTERVAL_MONTHS, arrowkdb::Options::NM_MONTH_INTERVAL } - , { arrow::Type::INTERVAL_DAY_TIME, arrowkdb::Options::NM_DAY_TIME_INTERVAL } -}; - // Helper class for reading dictionary of options // // Dictionary key: KS @@ -233,10 +158,12 @@ class KdbOptions const std::set& supported_string_options; const std::set& supported_int_options; const std::set& supported_dict_options; - const std::set& supported_null_mapping_options; + std::set supported_null_mapping_options; using NullMappingHandler = void ( KdbOptions::* )( const std::string&, K ); using NullMappingHandlers = std::unordered_map; + const std::unordered_map null_mapping_types; + static const NullMappingHandlers null_mapping_handlers; private: const std::string ToUpper(std::string str) const @@ -290,7 +217,7 @@ class KdbOptions throw InvalidOption( "Unsupported NULL_MAPPING option '" + key + "'" ); } K value = kK( values )[i]; - auto option = ETraits::option( key ); + auto option = GetNullMappingType( key ); auto it = 
null_mapping_handlers.find( option ); if( it != null_mapping_handlers.end() ){ ( this->*it->second )( key, value ); @@ -373,14 +300,46 @@ class KdbOptions K options , const std::set supported_string_options_ , const std::set supported_int_options_ - , const std::set& supported_dict_options_ = Options::dict_options - , const std::set& supported_null_mapping_options_ = ETraits::mappings() ) + , const std::set& supported_dict_options_ = Options::dict_options ) : null_mapping_options {0} , supported_string_options(supported_string_options_) , supported_int_options(supported_int_options_) , supported_dict_options( supported_dict_options_ ) - , supported_null_mapping_options( supported_null_mapping_options_ ) + , null_mapping_types { + { arrow::Type::BOOL, arrowkdb::Options::NM_BOOLEAN } + , { arrow::Type::UINT8, arrowkdb::Options::NM_UINT_8 } + , { arrow::Type::INT8, arrowkdb::Options::NM_INT_8 } + , { arrow::Type::UINT16, arrowkdb::Options::NM_UINT_16 } + , { arrow::Type::INT16, arrowkdb::Options::NM_INT_16 } + , { arrow::Type::UINT32, arrowkdb::Options::NM_UINT_32 } + , { arrow::Type::INT32, arrowkdb::Options::NM_INT_32 } + , { arrow::Type::UINT64, arrowkdb::Options::NM_UINT_64 } + , { arrow::Type::INT64, arrowkdb::Options::NM_INT_64 } + , { arrow::Type::HALF_FLOAT, arrowkdb::Options::NM_FLOAT_16 } + , { arrow::Type::FLOAT, arrowkdb::Options::NM_FLOAT_32 } + , { arrow::Type::DOUBLE, arrowkdb::Options::NM_FLOAT_64 } + , { arrow::Type::STRING, arrowkdb::Options::NM_STRING } + , { arrow::Type::LARGE_STRING, arrowkdb::Options::NM_LARGE_STRING } + , { arrow::Type::BINARY, arrowkdb::Options::NM_BINARY } + , { arrow::Type::LARGE_BINARY, arrowkdb::Options::NM_LARGE_BINARY } + , { arrow::Type::FIXED_SIZE_BINARY, arrowkdb::Options::NM_FIXED_BINARY } + , { arrow::Type::DATE32, arrowkdb::Options::NM_DATE_32 } + , { arrow::Type::DATE64, arrowkdb::Options::NM_DATE_64 } + , { arrow::Type::TIMESTAMP, arrowkdb::Options::NM_TIMESTAMP } + , { arrow::Type::TIME32, 
arrowkdb::Options::NM_TIME_32 } + , { arrow::Type::TIME64, arrowkdb::Options::NM_TIME_64 } + , { arrow::Type::DECIMAL, arrowkdb::Options::NM_DECIMAL } + , { arrow::Type::DURATION, arrowkdb::Options::NM_DURATION } + , { arrow::Type::INTERVAL_MONTHS, arrowkdb::Options::NM_MONTH_INTERVAL } + , { arrow::Type::INTERVAL_DAY_TIME, arrowkdb::Options::NM_DAY_TIME_INTERVAL } } { + std::transform( + null_mapping_types.begin() + , null_mapping_types.end() + , std::inserter( supported_null_mapping_options, end( supported_null_mapping_options ) ) + , []( const auto& value ){ + return value.second; + } ); if (options != NULL && options->t != 101) { if (options->t != 99) throw InvalidOption("options not -99h"); @@ -410,8 +369,20 @@ class KdbOptions template inline void HandleNullMapping( const std::string& key, K value ); + arrow::Type::type GetNullMappingType( const std::string& option ) + { + auto it = std::find_if( null_mapping_types.begin(), null_mapping_types.end(), [&option]( const auto& value ){ + return option == value.second; + } ); + if( it != null_mapping_types.end() ){ + return it->first; + } + + return arrow::Type::NA; + } + void GetNullMappingOptions( Options::NullMapping& null_mapping ) const{ - null_mapping = null_mapping_options; + null_mapping = null_mapping_options; } bool GetStringOption(const std::string key, std::string& result) const From be9353f9a4e4e06829e12d31a93ebe17093d602e Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 14 Feb 2023 16:04:23 +0000 Subject: [PATCH 208/276] Null bitmap test for Travis CI --- tests/.gitignore | 1 + tests/null_bitmap.t | 109 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 110 insertions(+) create mode 100644 tests/null_bitmap.t diff --git a/tests/.gitignore b/tests/.gitignore index c0832d1..9efe47e 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -1,4 +1,5 @@ basic.q +null_bitmap.q null_mapping_short.q null_mapping_long.q null_mapping_float.q diff --git a/tests/null_bitmap.t 
b/tests/null_bitmap.t new file mode 100644 index 0000000..3a15b1d --- /dev/null +++ b/tests/null_bitmap.t @@ -0,0 +1,109 @@ +// null_bitmap.t + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l q/arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping ||----------+\n"; +bitmap_opts:(`bool`int32`float64`string`date32)!(0b;1i;2.34;"start";2006.07.21); + +options:(``NULL_MAPPING)!((::);bitmap_opts); + +N:5 + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +bool_dt:.arrowkdb.dt.boolean[]; +i32_dt:.arrowkdb.dt.int32[]; +f64_dt:.arrowkdb.dt.float64[]; +str_dt:.arrowkdb.dt.utf8[]; +d32_dt:.arrowkdb.dt.date32[]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +bool_fd:.arrowkdb.fd.field[`bool;bool_dt]; +i32_fd:.arrowkdb.fd.field[`int32;i32_dt]; +f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; +str_fd:.arrowkdb.fd.field[`string;str_dt]; +d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; + +-1"\n+----------|| Create the schemas for the list of fields ||----------+\n"; +bitmap_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,i32_fd,f64_fd,str_fd,d32_fd)]; + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +bool_data:N?(0b;1b); +bool_data[0]:0b; +i32_data:N?100i; +i32_data[1]:1i; +f64_data:N?100f; +f64_data[2]:2.34f; +str_data:N?("start";"stop";"alert";"acknowledge";""); +str_data[3]:"start" +d32_data:N?(2006.07.21;2005.07.18;2004.07.16;2003.07.15;2002.07.11); +d32_data[4]:2006.07.21; + +-1"\n+----------|| Combine the data for all columns ||----------+\n"; +bitmap_data:(ts_data;bool_data;i32_data;f64_data;str_data;d32_data); + +-1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n"; 
+options[`PARQUET_VERSION]:`V2.0; + +parquet_bitmap:"null_bitmap.parquet"; +.arrowkdb.pq.writeParquet[parquet_bitmap;bitmap_schema;bitmap_data;options]; + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +options[`WITH_NULL_BITMAP]:1; +parquet_bitmap_data:.arrowkdb.pq.readParquetData[parquet_bitmap;options]; +bitmap_data~first parquet_bitmap_data + +nulls_data:1b,(N-1)?1b; +bitmap_nulls:{x rotate nulls_data} each neg til {x-1} count bitmap_data; +parquet_bitmap_nulls:last parquet_bitmap_data; +bitmap_nulls~bitmap_nulls & sublist[{1-x} count parquet_bitmap_nulls;parquet_bitmap_nulls] +rm parquet_bitmap; + +-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; +arrow_bitmap:"null_bitmap.arrow"; +.arrowkdb.ipc.writeArrow[arrow_bitmap;bitmap_schema;bitmap_data;options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +arrow_bitmap_schema:.arrowkdb.ipc.readArrowSchema[arrow_bitmap]; +.arrowkdb.sc.equalSchemas[bitmap_schema;arrow_bitmap_schema] +bitmap_schema~arrow_bitmap_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +arrow_bitmap_data:.arrowkdb.ipc.readArrowData[arrow_bitmap;options]; +bitmap_data~first arrow_bitmap_data +arrow_bitmap_nulls:last arrow_bitmap_data; +bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bitmap_nulls] +rm arrow_bitmap; + +-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; +serialized_bitmap:.arrowkdb.ipc.serializeArrow[bitmap_schema;bitmap_data;options]; + +-1"\n+----------|| Parse the schema back abd compare ||----------+\n"; +stream_bitmap_schema:.arrowkdb.ipc.parseArrowSchema[serialized_bitmap]; +.arrowkdb.sc.equalSchemas[bitmap_schema;stream_bitmap_schema] +bitmap_schema~stream_bitmap_schema + +-1"\n+----------|| Parse the array data back and compare ||----------+\n"; +stream_bitmap_data:.arrowkdb.ipc.parseArrowData[serialized_bitmap;options]; 
+bitmap_data~first stream_bitmap_data + +stream_bitmap_nulls:last stream_bitmap_data; +bitmap_nulls~bitmap_nulls & sublist[{1-x} count stream_bitmap_nulls;stream_bitmap_nulls] + + +-1 "\n+----------|| Test utils ||----------+\n"; + +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; From 456463783e5a325495bc443b25f6f439637cd8df Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 15 Feb 2023 10:49:26 +0300 Subject: [PATCH 209/276] Continue warning clean up --- src/KdbOptions.cpp | 274 +++++++++++++++++++++++++++++++++++++++++++++ src/KdbOptions.h | 263 +++---------------------------------------- 2 files changed, 288 insertions(+), 249 deletions(-) create mode 100644 src/KdbOptions.cpp diff --git a/src/KdbOptions.cpp b/src/KdbOptions.cpp new file mode 100644 index 0000000..75c903c --- /dev/null +++ b/src/KdbOptions.cpp @@ -0,0 +1,274 @@ +#include "KdbOptions.h" + +namespace{ + +template +auto make_handler() +{ + return std::make_pair( TypeId, &kx::arrowkdb::KdbOptions::HandleNullMapping ); +} + +} // namespace + +namespace kx { + +namespace arrowkdb { + +const KdbOptions::NullMappingHandlers KdbOptions::null_mapping_handlers = { + make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() + , make_handler() +}; + +KdbOptions::KdbOptions( + K options + , const std::set& supported_string_options_ + , const std::set& supported_int_options_ + , const std::set& supported_dict_options_ ) + : null_mapping_options {0} + , supported_string_options(supported_string_options_) + , 
supported_int_options(supported_int_options_) + , supported_dict_options( supported_dict_options_ ) + , null_mapping_types { + { arrow::Type::BOOL, arrowkdb::Options::NM_BOOLEAN } + , { arrow::Type::UINT8, arrowkdb::Options::NM_UINT_8 } + , { arrow::Type::INT8, arrowkdb::Options::NM_INT_8 } + , { arrow::Type::UINT16, arrowkdb::Options::NM_UINT_16 } + , { arrow::Type::INT16, arrowkdb::Options::NM_INT_16 } + , { arrow::Type::UINT32, arrowkdb::Options::NM_UINT_32 } + , { arrow::Type::INT32, arrowkdb::Options::NM_INT_32 } + , { arrow::Type::UINT64, arrowkdb::Options::NM_UINT_64 } + , { arrow::Type::INT64, arrowkdb::Options::NM_INT_64 } + , { arrow::Type::HALF_FLOAT, arrowkdb::Options::NM_FLOAT_16 } + , { arrow::Type::FLOAT, arrowkdb::Options::NM_FLOAT_32 } + , { arrow::Type::DOUBLE, arrowkdb::Options::NM_FLOAT_64 } + , { arrow::Type::STRING, arrowkdb::Options::NM_STRING } + , { arrow::Type::LARGE_STRING, arrowkdb::Options::NM_LARGE_STRING } + , { arrow::Type::BINARY, arrowkdb::Options::NM_BINARY } + , { arrow::Type::LARGE_BINARY, arrowkdb::Options::NM_LARGE_BINARY } + , { arrow::Type::FIXED_SIZE_BINARY, arrowkdb::Options::NM_FIXED_BINARY } + , { arrow::Type::DATE32, arrowkdb::Options::NM_DATE_32 } + , { arrow::Type::DATE64, arrowkdb::Options::NM_DATE_64 } + , { arrow::Type::TIMESTAMP, arrowkdb::Options::NM_TIMESTAMP } + , { arrow::Type::TIME32, arrowkdb::Options::NM_TIME_32 } + , { arrow::Type::TIME64, arrowkdb::Options::NM_TIME_64 } + , { arrow::Type::DECIMAL, arrowkdb::Options::NM_DECIMAL } + , { arrow::Type::DURATION, arrowkdb::Options::NM_DURATION } + , { arrow::Type::INTERVAL_MONTHS, arrowkdb::Options::NM_MONTH_INTERVAL } + , { arrow::Type::INTERVAL_DAY_TIME, arrowkdb::Options::NM_DAY_TIME_INTERVAL } } +{ + std::transform( + null_mapping_types.begin() + , null_mapping_types.end() + , std::inserter( supported_null_mapping_options, end( supported_null_mapping_options ) ) + , []( const auto& value ){ + return value.second; + } ); + if (options != NULL && options->t 
!= 101) { + if (options->t != 99) + throw InvalidOption("options not -99h"); + K keys = kK(options)[0]; + if (keys->t != KS) + throw InvalidOption("options keys not 11h"); + K values = kK(options)[1]; + switch (values->t) { + case KJ: + PopulateIntOptions(keys, values); + break; + case KS: + PopulateStringOptions(keys, values); + break; + case XD: + PopulateDictOptions(keys, values); + break; + case 0: + PopulateMixedOptions(keys, values); + break; + default: + throw InvalidOption("options values not 7|11|0h"); + } + } +} + +const std::string KdbOptions::ToUpper(std::string str) const +{ + std::string upper; + for (auto i : str) + upper.push_back((unsigned char)std::toupper(i)); + return upper; +} + +const std::string KdbOptions::ToLower( std::string str ) const +{ + std::transform( str.begin(), str.end(), str.begin(), ::tolower ); + + return str; +} + +void KdbOptions::PopulateIntOptions(K keys, K values) +{ + for (auto i = 0ll; i < values->n; ++i) { + const std::string key = ToUpper(kS(keys)[i]); + if (supported_int_options.find(key) == supported_int_options.end()) + throw InvalidOption(("Unsupported int option '" + key + "'").c_str()); + int_options[key] = kJ(values)[i]; + } +} + +void KdbOptions::PopulateStringOptions(K keys, K values) +{ + for (auto i = 0ll; i < values->n; ++i) { + const std::string key = ToUpper(kS(keys)[i]); + if (supported_string_options.find(key) == supported_string_options.end()) + throw InvalidOption(("Unsupported string option '" + key + "'").c_str()); + string_options[key] = ToUpper(kS(values)[i]); + } +} + +void KdbOptions::PopulateNullMappingOptions( long long index, K dict ) +{ + K keys = kK( kK( dict )[index] )[0]; + K values = kK( kK( dict )[index] )[1]; + if( KS != keys->t ){ + throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING keys (expected=11h), type=" + std::to_string( keys->t ) + "h" ); + } + if( 0 != values->t ){ + throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING values (extected=0h), type=" + 
std::to_string( values->t ) + "h" ); + } + for( auto i = 0ll; i < values->n; ++i ){ + const std::string key = ToLower( kS( keys )[i] ); + if( supported_null_mapping_options.find( key ) == supported_null_mapping_options.end() ){ + throw InvalidOption( "Unsupported NULL_MAPPING option '" + key + "'" ); + } + K value = kK( values )[i]; + auto option = GetNullMappingType( key ); + auto it = null_mapping_handlers.find( option ); + if( it != null_mapping_handlers.end() ){ + ( this->*it->second )( key, value ); + } + else if( 101 == value->t ){ + // Ignore generic null, which may be used here to ensure mixed list of options + } + else{ + throw InvalidOption( "Unhandled NULL_MAPPING option '" + key + "', type=" + std::to_string( keys->t ) + "h" ); + } + } +} + +void KdbOptions::PopulateDictOptions( K keys, K values ) +{ + for( auto i = 0ll; i < values->n; ++i ) { + const std::string key = ToUpper( kS( keys )[i] ); + if( supported_dict_options.find( key ) == supported_dict_options.end() ){ + throw InvalidOption(("Unsupported dict option '" + key + "'").c_str()); + } + if( Options::NULL_MAPPING == key ) + { + PopulateNullMappingOptions( i, values ); + } + } +} + +void KdbOptions::PopulateMixedOptions(K keys, K values) +{ + for (auto i = 0ll; i < values->n; ++i) { + const std::string key = ToUpper(kS(keys)[i]); + K value = kK(values)[i]; + switch (value->t) { + case -KJ: + if (supported_int_options.find(key) == supported_int_options.end()) + throw InvalidOption(("Unsupported int option '" + key + "'").c_str()); + int_options[key] = value->j; + break; + case -KS: + if (supported_string_options.find(key) == supported_string_options.end()) + throw InvalidOption(("Unsupported string option '" + key + "'").c_str()); + string_options[key] = ToUpper(value->s); + break; + case KC: + { + if (supported_string_options.find(key) == supported_string_options.end()) + throw InvalidOption(("Unsupported string option '" + key + "'").c_str()); + string_options[key] = 
ToUpper(std::string((char*)kG(value), value->n)); + break; + } + case XD: + { + if( supported_dict_options.find( key ) == supported_dict_options.end() ){ + throw InvalidOption(("Unsupported dict option '" + key + "'").c_str()); + } + if( Options::NULL_MAPPING == key ) + { + PopulateNullMappingOptions( i, values ); + } + break; + } + case 101: + // Ignore :: + break; + default: + throw InvalidOption(("option '" + key + "' value not -7|-11|10h").c_str()); + } + } +} + +arrow::Type::type KdbOptions::GetNullMappingType( const std::string& option ) +{ + auto it = std::find_if( null_mapping_types.begin(), null_mapping_types.end(), [&option]( const auto& value ){ + return option == value.second; + } ); + if( it != null_mapping_types.end() ){ + return it->first; + } + + return arrow::Type::NA; +} + +bool KdbOptions::GetStringOption(const std::string key, std::string& result) const +{ + const auto it = string_options.find(key); + if (it == string_options.end()) + return false; + else { + result = it->second; + return true; + } +} + +bool KdbOptions::GetIntOption(const std::string key, int64_t& result) const +{ + const auto it = int_options.find(key); + if (it == int_options.end()) + return false; + else { + result = it->second; + return true; + } +} + +} // namespace arrowkdb + +} // kx diff --git a/src/KdbOptions.h b/src/KdbOptions.h index c60e6e0..9bde4f8 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -166,127 +166,19 @@ class KdbOptions static const NullMappingHandlers null_mapping_handlers; private: - const std::string ToUpper(std::string str) const - { - std::string upper; - for (auto i : str) - upper.push_back((unsigned char)std::toupper(i)); - return upper; - } + const std::string ToUpper(std::string str) const; - const std::string ToLower( std::string str ) const - { - std::transform( str.begin(), str.end(), str.begin(), ::tolower ); + const std::string ToLower( std::string str ) const; - return str; - } + void PopulateIntOptions(K keys, K values); - void 
PopulateIntOptions(K keys, K values) - { - for (auto i = 0ll; i < values->n; ++i) { - const std::string key = ToUpper(kS(keys)[i]); - if (supported_int_options.find(key) == supported_int_options.end()) - throw InvalidOption(("Unsupported int option '" + key + "'").c_str()); - int_options[key] = kJ(values)[i]; - } - } + void PopulateStringOptions(K keys, K values); - void PopulateStringOptions(K keys, K values) - { - for (auto i = 0ll; i < values->n; ++i) { - const std::string key = ToUpper(kS(keys)[i]); - if (supported_string_options.find(key) == supported_string_options.end()) - throw InvalidOption(("Unsupported string option '" + key + "'").c_str()); - string_options[key] = ToUpper(kS(values)[i]); - } - } + void PopulateNullMappingOptions( long long index, K dict ); - void PopulateNullMappingOptions( long long index, K dict ) - { - K keys = kK( kK( dict )[index] )[0]; - K values = kK( kK( dict )[index] )[1]; - if( KS != keys->t ){ - throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING keys (expected=11h), type=" + std::to_string( keys->t ) + "h" ); - } - if( 0 != values->t ){ - throw InvalidOption( "Unsupported KDB data type for NULL_MAPPING values (extected=0h), type=" + std::to_string( values->t ) + "h" ); - } - for( auto i = 0ll; i < values->n; ++i ){ - const std::string key = ToLower( kS( keys )[i] ); - if( supported_null_mapping_options.find( key ) == supported_null_mapping_options.end() ){ - throw InvalidOption( "Unsupported NULL_MAPPING option '" + key + "'" ); - } - K value = kK( values )[i]; - auto option = GetNullMappingType( key ); - auto it = null_mapping_handlers.find( option ); - if( it != null_mapping_handlers.end() ){ - ( this->*it->second )( key, value ); - } - else if( 101 == value->t ){ - // Ignore generic null, which may be used here to ensure mixed list of options - } - else{ - throw InvalidOption( "Unhandled NULL_MAPPING option '" + key + "', type=" + std::to_string( keys->t ) + "h" ); - } - } - } - - void PopulateDictOptions( K 
keys, K values ) - { - for( auto i = 0ll; i < values->n; ++i ) { - const std::string key = ToUpper( kS( keys )[i] ); - if( supported_dict_options.find( key ) == supported_dict_options.end() ){ - throw InvalidOption(("Unsupported dict option '" + key + "'").c_str()); - } - if( Options::NULL_MAPPING == key ) - { - PopulateNullMappingOptions( i, values ); - } - } - } - - void PopulateMixedOptions(K keys, K values) - { - for (auto i = 0ll; i < values->n; ++i) { - const std::string key = ToUpper(kS(keys)[i]); - K value = kK(values)[i]; - switch (value->t) { - case -KJ: - if (supported_int_options.find(key) == supported_int_options.end()) - throw InvalidOption(("Unsupported int option '" + key + "'").c_str()); - int_options[key] = value->j; - break; - case -KS: - if (supported_string_options.find(key) == supported_string_options.end()) - throw InvalidOption(("Unsupported string option '" + key + "'").c_str()); - string_options[key] = ToUpper(value->s); - break; - case KC: - { - if (supported_string_options.find(key) == supported_string_options.end()) - throw InvalidOption(("Unsupported string option '" + key + "'").c_str()); - string_options[key] = ToUpper(std::string((char*)kG(value), value->n)); - break; - } - case XD: - { - if( supported_dict_options.find( key ) == supported_dict_options.end() ){ - throw InvalidOption(("Unsupported dict option '" + key + "'").c_str()); - } - if( Options::NULL_MAPPING == key ) - { - PopulateNullMappingOptions( i, values ); - } - break; - } - case 101: - // Ignore :: - break; - default: - throw InvalidOption(("option '" + key + "' value not -7|-11|10h").c_str()); - } - } - } + void PopulateDictOptions( K keys, K values ); + + void PopulateMixedOptions(K keys, K values); public: class InvalidOption : public std::invalid_argument @@ -298,114 +190,22 @@ class KdbOptions KdbOptions( K options - , const std::set supported_string_options_ - , const std::set supported_int_options_ - , const std::set& supported_dict_options_ = 
Options::dict_options ) - : null_mapping_options {0} - , supported_string_options(supported_string_options_) - , supported_int_options(supported_int_options_) - , supported_dict_options( supported_dict_options_ ) - , null_mapping_types { - { arrow::Type::BOOL, arrowkdb::Options::NM_BOOLEAN } - , { arrow::Type::UINT8, arrowkdb::Options::NM_UINT_8 } - , { arrow::Type::INT8, arrowkdb::Options::NM_INT_8 } - , { arrow::Type::UINT16, arrowkdb::Options::NM_UINT_16 } - , { arrow::Type::INT16, arrowkdb::Options::NM_INT_16 } - , { arrow::Type::UINT32, arrowkdb::Options::NM_UINT_32 } - , { arrow::Type::INT32, arrowkdb::Options::NM_INT_32 } - , { arrow::Type::UINT64, arrowkdb::Options::NM_UINT_64 } - , { arrow::Type::INT64, arrowkdb::Options::NM_INT_64 } - , { arrow::Type::HALF_FLOAT, arrowkdb::Options::NM_FLOAT_16 } - , { arrow::Type::FLOAT, arrowkdb::Options::NM_FLOAT_32 } - , { arrow::Type::DOUBLE, arrowkdb::Options::NM_FLOAT_64 } - , { arrow::Type::STRING, arrowkdb::Options::NM_STRING } - , { arrow::Type::LARGE_STRING, arrowkdb::Options::NM_LARGE_STRING } - , { arrow::Type::BINARY, arrowkdb::Options::NM_BINARY } - , { arrow::Type::LARGE_BINARY, arrowkdb::Options::NM_LARGE_BINARY } - , { arrow::Type::FIXED_SIZE_BINARY, arrowkdb::Options::NM_FIXED_BINARY } - , { arrow::Type::DATE32, arrowkdb::Options::NM_DATE_32 } - , { arrow::Type::DATE64, arrowkdb::Options::NM_DATE_64 } - , { arrow::Type::TIMESTAMP, arrowkdb::Options::NM_TIMESTAMP } - , { arrow::Type::TIME32, arrowkdb::Options::NM_TIME_32 } - , { arrow::Type::TIME64, arrowkdb::Options::NM_TIME_64 } - , { arrow::Type::DECIMAL, arrowkdb::Options::NM_DECIMAL } - , { arrow::Type::DURATION, arrowkdb::Options::NM_DURATION } - , { arrow::Type::INTERVAL_MONTHS, arrowkdb::Options::NM_MONTH_INTERVAL } - , { arrow::Type::INTERVAL_DAY_TIME, arrowkdb::Options::NM_DAY_TIME_INTERVAL } } - { - std::transform( - null_mapping_types.begin() - , null_mapping_types.end() - , std::inserter( supported_null_mapping_options, end( 
supported_null_mapping_options ) ) - , []( const auto& value ){ - return value.second; - } ); - if (options != NULL && options->t != 101) { - if (options->t != 99) - throw InvalidOption("options not -99h"); - K keys = kK(options)[0]; - if (keys->t != KS) - throw InvalidOption("options keys not 11h"); - K values = kK(options)[1]; - switch (values->t) { - case KJ: - PopulateIntOptions(keys, values); - break; - case KS: - PopulateStringOptions(keys, values); - break; - case XD: - PopulateDictOptions(keys, values); - break; - case 0: - PopulateMixedOptions(keys, values); - break; - default: - throw InvalidOption("options values not 7|11|0h"); - } - } - } + , const std::set& supported_string_options_ + , const std::set& supported_int_options_ + , const std::set& supported_dict_options_ = Options::dict_options ); template inline void HandleNullMapping( const std::string& key, K value ); - arrow::Type::type GetNullMappingType( const std::string& option ) - { - auto it = std::find_if( null_mapping_types.begin(), null_mapping_types.end(), [&option]( const auto& value ){ - return option == value.second; - } ); - if( it != null_mapping_types.end() ){ - return it->first; - } - - return arrow::Type::NA; - } + arrow::Type::type GetNullMappingType( const std::string& option ); void GetNullMappingOptions( Options::NullMapping& null_mapping ) const{ null_mapping = null_mapping_options; } - bool GetStringOption(const std::string key, std::string& result) const - { - const auto it = string_options.find(key); - if (it == string_options.end()) - return false; - else { - result = it->second; - return true; - } - } + bool GetStringOption(const std::string key, std::string& result) const; - bool GetIntOption(const std::string key, int64_t& result) const - { - const auto it = int_options.find(key); - if (it == int_options.end()) - return false; - else { - result = it->second; - return true; - } - } + bool GetIntOption(const std::string key, int64_t& result) const; }; inline void 
null_mapping_error( const std::string& key, K value ) @@ -737,41 +537,6 @@ inline void KdbOptions::HandleNullMapping( const } } -template -auto make_handler() -{ - return std::make_pair( TypeId, &KdbOptions::HandleNullMapping ); -} - -inline const KdbOptions::NullMappingHandlers KdbOptions::null_mapping_handlers = { - make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() - , make_handler() -}; - } // namespace arrowkdb } // namespace kx From c4ed0e70e77fb47af9e4dd355b9220209b0b8e89 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 17 Feb 2023 16:48:46 +0300 Subject: [PATCH 210/276] Writing and serialization of chunked tables --- src/ArrayWriter.cpp | 45 +++++++++++++---- src/ArrayWriter.h | 9 ++++ src/HelperFunctions.h | 13 +++++ src/KdbOptions.h | 2 + src/TableData.cpp | 111 +++++++++++++++++++++++++++++++++++------- 5 files changed, 152 insertions(+), 28 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index e29a197..adb73a7 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -449,32 +449,38 @@ void PopulateBuilder(shared_ptr datatype, K template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto uint8_builder = static_cast(builder); if( type_overrides.null_mapping.have_uint8 ){ - std::vector null_bitmap( k_array->n ); - for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.uint8_null != 
static_cast( kG( k_array )[i] ); + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.uint8_null != static_cast( kG( k_array )[i+offset] ); } - PARQUET_THROW_NOT_OK( uint8_builder->AppendValues( ( uint8_t* )kG( k_array ), k_array->n, null_bitmap ) ); + PARQUET_THROW_NOT_OK( uint8_builder->AppendValues( ( uint8_t* )&kG( k_array )[offset], length, null_bitmap ) ); } else { - PARQUET_THROW_NOT_OK(uint8_builder->AppendValues((uint8_t*)kG(k_array), k_array->n)); + PARQUET_THROW_NOT_OK( uint8_builder->AppendValues( ( uint8_t* )&kG( k_array )[offset], length ) ); } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto int8_builder = static_cast(builder); if( type_overrides.null_mapping.have_int8 ){ - std::vector null_bitmap( k_array->n ); - for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.int8_null != kG( k_array )[i]; + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.int8_null != kG( k_array )[i+offset]; } - PARQUET_THROW_NOT_OK( int8_builder->AppendValues( ( int8_t* )kG( k_array ), k_array->n, null_bitmap ) ); + PARQUET_THROW_NOT_OK( int8_builder->AppendValues( ( int8_t* )&kG( k_array )[offset], length, null_bitmap ) ); } else { - PARQUET_THROW_NOT_OK(int8_builder->AppendValues((int8_t*)kG(k_array), k_array->n)); + PARQUET_THROW_NOT_OK(int8_builder->AppendValues((int8_t*)&kG(k_array)[offset], length)); } } @@ -1120,6 +1126,25 @@ shared_ptr MakeArray(shared_ptr datatype, K k_arr return array; } +shared_ptr MakeChunkedArray( + shared_ptr datatype + , K k_array + , TypeMappingOverride& type_overrides ) +{ + type_overrides.chunk_offset = 0; + vector> chunks; + int64_t num_chunks = 
type_overrides.NumChunks( k_array->n ); + for( int64_t i = 0; i < num_chunks; ++i ){ + auto array = MakeArray( datatype, k_array, type_overrides ); + chunks.push_back( array ); + type_overrides.chunk_offset += type_overrides.chunk_length; + } + + auto chunked_array = make_shared( move( chunks ) ); + + return chunked_array; +} + } // namespace arrowkdb } // namespace kx diff --git a/src/ArrayWriter.h b/src/ArrayWriter.h index 53a9b1b..e73aede 100644 --- a/src/ArrayWriter.h +++ b/src/ArrayWriter.h @@ -29,6 +29,15 @@ void PopulateBuilder(std::shared_ptr datatype, K k_array, arrow */ std::shared_ptr MakeArray(std::shared_ptr datatype, K k_array, TypeMappingOverride& type_overrides); +/** + * @brief Copies and converts a kdb list to an arrow chunked array + * + * @param datatype The datatype to use when creating the arrow array + * @param k_array The kdb list from which to source the data + * @return The arrow array +*/ +std::shared_ptr MakeChunkedArray( std::shared_ptr datatype, K k_array, TypeMappingOverride& type_overrides ); + } // namespace arrowkdb } // namespace kx diff --git a/src/HelperFunctions.h b/src/HelperFunctions.h index f48f8f1..001e46f 100644 --- a/src/HelperFunctions.h +++ b/src/HelperFunctions.h @@ -99,8 +99,21 @@ typedef signed char KdbType; { int64_t decimal128_as_double = 0; Options::NullMapping null_mapping; + int64_t chunk_offset = 0; + int64_t chunk_length = 0; + TypeMappingOverride(void) {}; TypeMappingOverride(const KdbOptions& options); + + int64_t NumChunks( long long array_length ) { return !chunk_length ? 1 + : array_length / chunk_length + ( array_length % chunk_length ? 1 : 0 ); + } + std::pair GetChunk( long long array_length ){ + int64_t offset = chunk_length ? chunk_offset : 0; + int64_t length = std::min( array_length - offset, chunk_length ? 
chunk_length : array_length ); + + return std::make_pair( offset, length ); + } }; /** diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 9bde4f8..c836747 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -19,6 +19,7 @@ namespace arrowkdb { namespace Options { // Int options + const std::string ARROW_CHUNK_ROWS = "ARROW_CHUNK_ROWS"; const std::string PARQUET_CHUNK_SIZE = "PARQUET_CHUNK_SIZE"; const std::string PARQUET_MULTITHREADED_READ = "PARQUET_MULTITHREADED_READ"; const std::string USE_MMAP = "USE_MMAP"; @@ -60,6 +61,7 @@ namespace Options const std::string NM_DAY_TIME_INTERVAL = "day_time_interval"; const static std::set int_options = { + ARROW_CHUNK_ROWS, PARQUET_CHUNK_SIZE, PARQUET_MULTITHREADED_READ, USE_MMAP, diff --git a/src/TableData.cpp b/src/TableData.cpp index 5d84bf9..cab590f 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -63,6 +63,31 @@ std::vector> MakeArrays(std::shared_ptr> MakeChunkedArrays( + std::shared_ptr schema + , K array_data + , kx::arrowkdb::TypeMappingOverride& type_overrides ) +{ + if( array_data->t != 0 ) + throw kx::arrowkdb::TypeCheck( "array_data not mixed list" ); + if( array_data->n < schema->num_fields() ) + throw kx::arrowkdb::TypeCheck( "array_data length less than number of schema fields" ); + std::vector> chunked_arrays; + if( array_data->t == 0 && array_data->n == 0 ){ + // Empty table + } + else{ + // Only count up to the number of schema fields. 
Additional trailing data + // in the kdb mixed list is ignored (to allow for ::) + for( auto i = 0; i < schema->num_fields(); ++i ){ + auto k_array = kK( array_data )[i]; + chunked_arrays.push_back( kx::arrowkdb::MakeChunkedArray( schema->field(i)->type(), k_array, type_overrides ) ); + } + } + + return chunked_arrays; +} + // Create a an arrow table from the arrow schema and mixed list of kdb array objects std::shared_ptr MakeTable(std::shared_ptr schema, K array_data, kx::arrowkdb::TypeMappingOverride& type_overrides) { @@ -453,19 +478,44 @@ K writeArrow(K arrow_file, K schema_id, K array_data, K options) std::shared_ptr writer; PARQUET_ASSIGN_OR_THROW(writer, arrow::ipc::MakeFileWriter(outfile.get(), schema)); - auto arrays = MakeArrays(schema, array_data, type_overrides); + // Chunk size + read_options.GetIntOption( kx::arrowkdb::Options::ARROW_CHUNK_ROWS, type_overrides.chunk_length ); + + auto check_length = []( const auto& arrays ){ + // Check all arrays are same length + int64_t len = -1; + for (auto i : arrays) { + if (len == -1) + len = i->length(); + else if (len != i->length()) + return -1l; + } + + return len; + }; + + if( !type_overrides.chunk_length ){ // arrow not chunked + auto arrays = MakeArrays(schema, array_data, type_overrides); - // Check all arrays are same length - int64_t len = -1; - for (auto i : arrays) { - if (len == -1) - len = i->length(); - else if (len != i->length()) + auto len = check_length( arrays ); + if( len < 0 ){ return krr((S)"unequal length arrays"); + } + + auto batch = arrow::RecordBatch::Make(schema, len, arrays); + PARQUET_THROW_NOT_OK(writer->WriteRecordBatch(*batch)); } + else{ + auto chunked_arrays = MakeChunkedArrays( schema, array_data, type_overrides ); - auto batch = arrow::RecordBatch::Make(schema, len, arrays); - PARQUET_THROW_NOT_OK(writer->WriteRecordBatch(*batch)); + auto len = check_length( chunked_arrays ); + if( len < 0 ){ + return krr((S)"unequal length arrays"); + } + + auto table = arrow::Table::Make( 
schema, chunked_arrays ); + PARQUET_THROW_NOT_OK( writer->WriteTable( *table ) ); + } PARQUET_THROW_NOT_OK(writer->Close()); @@ -607,19 +657,44 @@ K serializeArrow(K schema_id, K array_data, K options) sink.reset(new arrow::io::BufferOutputStream(buffer)); PARQUET_ASSIGN_OR_THROW(writer, arrow::ipc::MakeStreamWriter(sink.get(), schema)); - auto arrays = MakeArrays(schema, array_data, type_overrides); + // Chunk size + read_options.GetIntOption( kx::arrowkdb::Options::ARROW_CHUNK_ROWS, type_overrides.chunk_length ); + + auto check_length = []( const auto& arrays ){ + // Check all arrays are same length + int64_t len = -1; + for (auto i : arrays) { + if (len == -1) + len = i->length(); + else if (len != i->length()) + return -1l; + } + + return len; + }; - // Check all arrays are same length - int64_t len = -1; - for (auto i : arrays) { - if (len == -1) - len = i->length(); - else if (len != i->length()) + if( !type_overrides.chunk_length ){ // arrow not chunked + auto arrays = MakeArrays(schema, array_data, type_overrides); + + auto len = check_length( arrays ); + if( len < 0 ){ return krr((S)"unequal length arrays"); + } + + auto batch = arrow::RecordBatch::Make(schema, len, arrays); + PARQUET_THROW_NOT_OK(writer->WriteRecordBatch(*batch)); } + else{ + auto chunked_arrays = MakeChunkedArrays( schema, array_data, type_overrides ); - auto batch = arrow::RecordBatch::Make(schema, len, arrays); - PARQUET_THROW_NOT_OK(writer->WriteRecordBatch(*batch)); + auto len = check_length( chunked_arrays ); + if( len < 0 ){ + return krr((S)"unequal length arrays"); + } + + auto table = arrow::Table::Make( schema, chunked_arrays ); + PARQUET_THROW_NOT_OK( writer->WriteTable( *table ) ); + } PARQUET_THROW_NOT_OK(writer->Close()); std::shared_ptr final_buffer; From 89c31c86353f3ef5bd4a47b41fe2f9d8d18e4db9 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 20 Feb 2023 19:45:08 +0300 Subject: [PATCH 211/276] Example of batching array data --- examples/batching_tables.q | 44 
++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 examples/batching_tables.q diff --git a/examples/batching_tables.q b/examples/batching_tables.q new file mode 100644 index 0000000..e98a87f --- /dev/null +++ b/examples/batching_tables.q @@ -0,0 +1,44 @@ +// batching_tables.q +// Examples of creating a schema supporting null mapping and using it to read/write parquet and arrow tables + +-1"\n+----------|| batching_tables.q ||----------+\n"; + +// import the arrowkdb library +\l q/arrowkdb.q + +// Filesystem functions for Linux/MacOS/Windows +ls:{[filename] $[.z.o like "w*";system "dir /b ",filename;system "ls ",filename]}; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +//-------------------// +// Create the table // +//-------------------// + +// Support batching of large tables + +// Create data for a large column in the table +batching_table:([]col:2147483652#0x00) +.arrowkdb.ts.writeReadArray[.arrowkdb.dt.int8[];batching_table`col;::] + +// Write the batching table data to a parquet file +batching_options:(``PARQUET_VERSION)!((::);`V2.0) + +parquet_batching:"batching_table.parquet"; +.arrowkdb.pq.writeParquetFromTable[parquet_batching;batching_table;batching_options] +show ls parquet_batching +//rm parquet_batching + +// Write the batching array data to an arrow file +batching_options[`ARROW_CHUNK_ROWS]:214748365 + +arrow_batching:"batching_table.arrow"; +.arrowkdb.ipc.writeArrowFromTable[arrow_batching;batching_table;batching_options] +show ls arrow_batching +//rm arrow_batching; + +// Serialize the batching array data to an arrow stream +serialized_batching:.arrowkdb.ipc.serializeArrowFromTable[batching_table;batching_options]; +show serialized_batching + + +-1 "\n+----------------------------------------+\n"; From 9ff5eb25067aa6d738eb68dfa7f6f4b70389533f Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 20 Feb 2023 20:54:26 +0300 Subject: [PATCH 212/276] Batching through 
numeric types --- src/ArrayWriter.cpp | 132 +++++++++++++++++++++++++++----------------- 1 file changed, 81 insertions(+), 51 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index adb73a7..1616d0a 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -433,16 +433,19 @@ void PopulateBuilder(shared_ptr datatype, K k_ template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto bool_builder = static_cast(builder); if( type_overrides.null_mapping.have_boolean ){ - std::vector null_bitmap( k_array->n ); - for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.boolean_null != static_cast( kG( k_array )[i] ); + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.boolean_null != static_cast( kG( k_array )[i+offset] ); } - PARQUET_THROW_NOT_OK( bool_builder->AppendValues( ( uint8_t* )kG( k_array ), k_array->n, null_bitmap ) ); + PARQUET_THROW_NOT_OK( bool_builder->AppendValues( ( uint8_t* )&kG( k_array )[offset], length, null_bitmap ) ); } else { - PARQUET_THROW_NOT_OK(bool_builder->AppendValues((uint8_t*)kG(k_array), k_array->n)); + PARQUET_THROW_NOT_OK( bool_builder->AppendValues( ( uint8_t* )&kG( k_array )[offset], length ) ); } } @@ -480,151 +483,178 @@ void PopulateBuilder(shared_ptr datatype, K PARQUET_THROW_NOT_OK( int8_builder->AppendValues( ( int8_t* )&kG( k_array )[offset], length, null_bitmap ) ); } else { - PARQUET_THROW_NOT_OK(int8_builder->AppendValues((int8_t*)&kG(k_array)[offset], length)); + PARQUET_THROW_NOT_OK( int8_builder->AppendValues( ( int8_t* )&kG( k_array )[offset], length ) ); } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = 
type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto uint16_builder = static_cast(builder); if( type_overrides.null_mapping.have_uint16 ){ - std::vector null_bitmap( k_array->n ); - for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.uint16_null != static_cast( kH( k_array )[i] ); + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.uint16_null != static_cast( kH( k_array )[i+offset] ); } - PARQUET_THROW_NOT_OK( uint16_builder->AppendValues( ( uint16_t* )kH( k_array ), k_array->n, null_bitmap ) ); + PARQUET_THROW_NOT_OK( uint16_builder->AppendValues( ( uint16_t* )&kH( k_array )[offset], length, null_bitmap ) ); } else { - PARQUET_THROW_NOT_OK(uint16_builder->AppendValues((uint16_t*)kH(k_array), k_array->n)); + PARQUET_THROW_NOT_OK( uint16_builder->AppendValues( ( uint16_t* )&kH( k_array )[offset], length ) ); } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto int16_builder = static_cast(builder); if( type_overrides.null_mapping.have_int16 ){ - std::vector null_bitmap( k_array->n ); - for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.int16_null != kH( k_array )[i]; + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.int16_null != kH( k_array )[i+offset]; } - PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )kH( k_array ), k_array->n, null_bitmap ) ); + PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )&kH( k_array )[offset], length, null_bitmap ) ); } else { - PARQUET_THROW_NOT_OK(int16_builder->AppendValues((int16_t*)kH(k_array), k_array->n)); + 
PARQUET_THROW_NOT_OK( int16_builder->AppendValues( ( int16_t* )&kH( k_array )[offset], length) ); } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto uint32_builder = static_cast(builder); if( type_overrides.null_mapping.have_uint32 ){ - std::vector null_bitmap( k_array->n ); - for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.uint32_null != static_cast( kI( k_array )[i] ); + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.uint32_null != static_cast( kI( k_array )[i+offset] ); } - PARQUET_THROW_NOT_OK( uint32_builder->AppendValues( ( uint32_t* )kI( k_array ), k_array->n, null_bitmap ) ); + PARQUET_THROW_NOT_OK( uint32_builder->AppendValues( ( uint32_t* )&kI( k_array )[offset], length, null_bitmap ) ); } else{ - PARQUET_THROW_NOT_OK(uint32_builder->AppendValues((uint32_t*)kI(k_array), k_array->n)); + PARQUET_THROW_NOT_OK( uint32_builder->AppendValues( ( uint32_t* )&kI( k_array )[offset], length ) ); } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto int32_builder = static_cast(builder); if( type_overrides.null_mapping.have_int32 ){ - std::vector null_bitmap( k_array->n ); - for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.int32_null != kI( k_array )[i]; + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.int32_null != kI( k_array )[i+offset]; } - PARQUET_THROW_NOT_OK( int32_builder->AppendValues( ( int32_t* )kI( k_array ), k_array->n, 
null_bitmap ) ); + PARQUET_THROW_NOT_OK( int32_builder->AppendValues( ( int32_t* )&kI( k_array )[offset], length, null_bitmap ) ); } else{ - PARQUET_THROW_NOT_OK(int32_builder->AppendValues((int32_t*)kI(k_array), k_array->n)); + PARQUET_THROW_NOT_OK( int32_builder->AppendValues( ( int32_t* )&kI( k_array )[offset], length ) ); } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto uint64_builder = static_cast(builder); if( type_overrides.null_mapping.have_uint64 ){ - std::vector null_bitmap( k_array->n ); - for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.uint64_null != static_cast( kJ( k_array )[i] ); + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.uint64_null != static_cast( kJ( k_array )[i+offset] ); } - PARQUET_THROW_NOT_OK( uint64_builder->AppendValues( ( uint64_t* )kJ( k_array ), k_array->n, null_bitmap ) ); + PARQUET_THROW_NOT_OK( uint64_builder->AppendValues( ( uint64_t* )&kJ( k_array )[offset], length, null_bitmap ) ); } else{ - PARQUET_THROW_NOT_OK(uint64_builder->AppendValues((uint64_t*)kJ(k_array), k_array->n)); + PARQUET_THROW_NOT_OK( uint64_builder->AppendValues( ( uint64_t* )&kJ( k_array )[offset], length ) ); } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto int64_builder = static_cast(builder); if( type_overrides.null_mapping.have_int64 ){ - std::vector null_bitmap( k_array->n ); - for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.int64_null != kJ( k_array )[i]; + std::vector null_bitmap( 
length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.int64_null != kJ( k_array )[i+offset]; } - PARQUET_THROW_NOT_OK( int64_builder->AppendValues( ( int64_t* )kJ( k_array ), k_array->n, null_bitmap ) ); + PARQUET_THROW_NOT_OK( int64_builder->AppendValues( ( int64_t* )&kJ( k_array )[offset], length, null_bitmap ) ); } else{ - PARQUET_THROW_NOT_OK(int64_builder->AppendValues((int64_t*)kJ(k_array), k_array->n)); + PARQUET_THROW_NOT_OK( int64_builder->AppendValues( ( int64_t* )&kJ( k_array )[offset], length ) ); } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto hfl_builder = static_cast(builder); if( type_overrides.null_mapping.have_float16 ){ - std::vector null_bitmap( k_array->n ); - for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.float16_null != static_cast( kH( k_array )[i] ); + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.float16_null != static_cast( kH( k_array )[i+offset] ); } - PARQUET_THROW_NOT_OK( hfl_builder->AppendValues( ( uint16_t* )kH( k_array ), k_array->n, null_bitmap ) ); + PARQUET_THROW_NOT_OK( hfl_builder->AppendValues( ( uint16_t* )&kH( k_array )[offset], length, null_bitmap ) ); } else { - PARQUET_THROW_NOT_OK(hfl_builder->AppendValues((uint16_t*)kH(k_array), k_array->n)); + PARQUET_THROW_NOT_OK( hfl_builder->AppendValues( ( uint16_t* )&kH( k_array )[offset], length ) ); } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto fl_builder = static_cast(builder); if( 
type_overrides.null_mapping.have_float32 ){ - std::vector null_bitmap( k_array->n ); - for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = !is_equal( type_overrides.null_mapping.float32_null, kE( k_array )[i] ); + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = !is_equal( type_overrides.null_mapping.float32_null, kE( k_array )[i+offset] ); } - PARQUET_THROW_NOT_OK( fl_builder->AppendValues( kE( k_array ), k_array->n, null_bitmap ) ); + PARQUET_THROW_NOT_OK( fl_builder->AppendValues( &kE( k_array )[offset], length, null_bitmap ) ); } else { - PARQUET_THROW_NOT_OK(fl_builder->AppendValues(kE(k_array), k_array->n)); + PARQUET_THROW_NOT_OK( fl_builder->AppendValues( &kE( k_array )[offset], length ) ); } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto dbl_builder = static_cast(builder); if( type_overrides.null_mapping.have_float64 ){ - std::vector null_bitmap( k_array->n ); - for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = !is_equal( type_overrides.null_mapping.float64_null, kF( k_array )[i] ); + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = !is_equal( type_overrides.null_mapping.float64_null, kF( k_array )[i+offset] ); } - PARQUET_THROW_NOT_OK( dbl_builder->AppendValues( kF( k_array ), k_array->n, null_bitmap ) ); + PARQUET_THROW_NOT_OK( dbl_builder->AppendValues( &kF( k_array )[offset], length, null_bitmap ) ); } else { - PARQUET_THROW_NOT_OK(dbl_builder->AppendValues(kF(k_array), k_array->n)); + PARQUET_THROW_NOT_OK( dbl_builder->AppendValues( &kF( k_array )[offset], length ) ); } } From 68f2475d257bb011121f388f4a16cf2566824f9c Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 20 Feb 2023 21:47:06 +0300 Subject: [PATCH 213/276] 
Batching through string types --- src/ArrayWriter.cpp | 53 +++++++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 19 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index 1616d0a..9a4520e 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -661,23 +661,26 @@ void PopulateBuilder(shared_ptr datatype, template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto str_builder = static_cast(builder); bool is_symbol = k_array->t == KS && (datatype->id() == arrow::Type::STRING || datatype->id() == arrow::Type::LARGE_STRING); if( is_symbol ){ // Populate from symbol list - for( auto i = 0ll; i < k_array->n; ++i ){ + for( auto i = 0ll; i < length; ++i ){ if( type_overrides.null_mapping.have_string - && type_overrides.null_mapping.string_null == kS( k_array )[i] ){ + && type_overrides.null_mapping.string_null == kS( k_array )[i+offset] ){ PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); } else{ - PARQUET_THROW_NOT_OK( str_builder->Append( kS( k_array )[i] ) ); + PARQUET_THROW_NOT_OK( str_builder->Append( kS( k_array )[i+offset] ) ); } } } else { // Populate from mixed list of char lists - for( auto i = 0ll; i < k_array->n; ++i ){ - K str_data = kK( k_array )[i]; + for( auto i = 0ll; i < length; ++i ){ + K str_data = kK( k_array )[i+offset]; TYPE_CHECK_ITEM( str_data->t != KC, datatype->ToString(), KC, str_data->t ); if( type_overrides.null_mapping.have_string && type_overrides.null_mapping.string_null.length() == static_cast( str_data->n ) @@ -694,23 +697,26 @@ void PopulateBuilder(shared_ptr datatype, template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t 
length = chunk.second; auto str_builder = static_cast(builder); bool is_symbol = k_array->t == KS && (datatype->id() == arrow::Type::STRING || datatype->id() == arrow::Type::LARGE_STRING); if( is_symbol ){ // Populate from symbol list - for( auto i = 0ll; i < k_array->n; ++i ){ + for( auto i = 0ll; i < length; ++i ){ if( type_overrides.null_mapping.have_large_string - && type_overrides.null_mapping.large_string_null == kS( k_array )[i] ){ + && type_overrides.null_mapping.large_string_null == kS( k_array )[i+offset] ){ PARQUET_THROW_NOT_OK( str_builder->AppendNull() ); } else{ - PARQUET_THROW_NOT_OK( str_builder->Append( kS( k_array )[i] ) ); + PARQUET_THROW_NOT_OK( str_builder->Append( kS( k_array )[i+offset] ) ); } } } else { // Populate from mixed list of char lists - for( auto i = 0ll; i < k_array->n; ++i ){ - K str_data = kK( k_array )[i]; + for( auto i = 0ll; i < length; ++i ){ + K str_data = kK( k_array )[i+offset]; TYPE_CHECK_ITEM( str_data->t != KC, datatype->ToString(), KC, str_data->t ); if( type_overrides.null_mapping.have_large_string && type_overrides.null_mapping.large_string_null.length() == static_cast( str_data->n ) @@ -727,9 +733,12 @@ void PopulateBuilder(shared_ptr data template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto bin_builder = static_cast(builder); - for (auto i = 0; i < k_array->n; ++i) { - K bin_data = kK(k_array)[i]; + for( auto i = 0; i < length; ++i ){ + K bin_data = kK( k_array )[i+offset]; TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); if( type_overrides.null_mapping.have_binary && type_overrides.null_mapping.binary_null.length() == static_cast( bin_data->n ) @@ -745,9 +754,12 @@ void PopulateBuilder(shared_ptr datatype, template<> void PopulateBuilder(shared_ptr datatype, K k_array, 
arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto bin_builder = static_cast(builder); - for (auto i = 0; i < k_array->n; ++i) { - K bin_data = kK(k_array)[i]; + for( auto i = 0; i < length; ++i ){ + K bin_data = kK( k_array )[i+offset]; TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); if( type_overrides.null_mapping.have_large_binary && type_overrides.null_mapping.large_binary_null.length() == static_cast( bin_data->n ) @@ -763,22 +775,25 @@ void PopulateBuilder(shared_ptr data template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; bool is_guid = k_array->t == UU && datatype->id() == arrow::Type::FIXED_SIZE_BINARY && static_cast(builder)->byte_width() == sizeof(U); auto fixed_bin_builder = static_cast(builder); if (is_guid) { - for (auto i = 0; i < k_array->n; ++i){ + for (auto i = 0; i < length; ++i){ if( type_overrides.null_mapping.have_fixed_binary && type_overrides.null_mapping.fixed_binary_null.length() == sizeof( U ) - && !type_overrides.null_mapping.fixed_binary_null.compare( 0, sizeof( U ), &kU( k_array )[i].g[0], sizeof( U ) ) ){ + && !type_overrides.null_mapping.fixed_binary_null.compare( 0, sizeof( U ), &kU( k_array )[i+offset].g[0], sizeof( U ) ) ){ PARQUET_THROW_NOT_OK( fixed_bin_builder->AppendNull() ); } else{ - PARQUET_THROW_NOT_OK(fixed_bin_builder->Append((char*)&kU(k_array)[i])); + PARQUET_THROW_NOT_OK( fixed_bin_builder->Append( ( char* )&kU( k_array )[i+offset] ) ); } } } else { - for (auto i = 0; i < k_array->n; ++i) { - K bin_data = kK(k_array)[i]; + for (auto i = 0; i < length; ++i) { + K bin_data = kK(k_array)[i+offset]; TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, 
bin_data->t); TYPE_CHECK_LENGTH(fixed_bin_builder->byte_width() != bin_data->n, builder->type()->ToString(), fixed_bin_builder->byte_width(), bin_data->n); if( type_overrides.null_mapping.have_fixed_binary From e22d3617350b5086046c79bde9b96d6577bcdf9a Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 20 Feb 2023 22:19:18 +0300 Subject: [PATCH 214/276] Batching through temporal types --- src/ArrayWriter.cpp | 96 ++++++++++++++++++++++++++++++--------------- 1 file changed, 64 insertions(+), 32 deletions(-) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index 9a4520e..9ea8d68 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -781,7 +781,7 @@ void PopulateBuilder(shared_ptr bool is_guid = k_array->t == UU && datatype->id() == arrow::Type::FIXED_SIZE_BINARY && static_cast(builder)->byte_width() == sizeof(U); auto fixed_bin_builder = static_cast(builder); if (is_guid) { - for (auto i = 0; i < length; ++i){ + for( auto i = 0; i < length; ++i ){ if( type_overrides.null_mapping.have_fixed_binary && type_overrides.null_mapping.fixed_binary_null.length() == sizeof( U ) && !type_overrides.null_mapping.fixed_binary_null.compare( 0, sizeof( U ), &kU( k_array )[i+offset].g[0], sizeof( U ) ) ){ @@ -792,7 +792,7 @@ void PopulateBuilder(shared_ptr } } } else { - for (auto i = 0; i < length; ++i) { + for( auto i = 0; i < length; ++i ){ K bin_data = kK(k_array)[i+offset]; TYPE_CHECK_ITEM(bin_data->t != KG, datatype->ToString(), KG, bin_data->t); TYPE_CHECK_LENGTH(fixed_bin_builder->byte_width() != bin_data->n, builder->type()->ToString(), fixed_bin_builder->byte_width(), bin_data->n); @@ -811,15 +811,18 @@ void PopulateBuilder(shared_ptr template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; TemporalConversion tc(datatype); auto d32_builder = 
static_cast(builder); - for (auto i = 0; i < k_array->n; ++i){ + for( auto i = 0; i < length; ++i ){ if( type_overrides.null_mapping.have_date32 - && type_overrides.null_mapping.date32_null == kI( k_array )[i] ){ + && type_overrides.null_mapping.date32_null == kI( k_array )[i+offset] ){ PARQUET_THROW_NOT_OK( d32_builder->AppendNull() ); } else{ - PARQUET_THROW_NOT_OK(d32_builder->Append(tc.KdbToArrow(kI(k_array)[i]))); + PARQUET_THROW_NOT_OK( d32_builder->Append( tc.KdbToArrow( kI( k_array )[i+offset] ) ) ); } } } @@ -827,86 +830,105 @@ void PopulateBuilder(shared_ptr datatype, template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; TemporalConversion tc(datatype); auto d64_builder = static_cast(builder); - for (auto i = 0; i < k_array->n; ++i) + for( auto i = 0; i < length; ++i ){ if( type_overrides.null_mapping.have_date64 - && type_overrides.null_mapping.date64_null == kJ( k_array )[i] ){ + && type_overrides.null_mapping.date64_null == kJ( k_array )[i+offset] ){ PARQUET_THROW_NOT_OK( d64_builder->AppendNull() ); } else{ - PARQUET_THROW_NOT_OK(d64_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); + PARQUET_THROW_NOT_OK( d64_builder->Append( tc.KdbToArrow( kJ( k_array )[i+offset] ) ) ); } + } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; TemporalConversion tc(datatype); auto ts_builder = static_cast(builder); auto timestamp_type = static_pointer_cast(datatype); - for (auto i = 0; i < k_array->n; ++i) + for( auto i = 0; i < length; ++i ){ if( type_overrides.null_mapping.have_timestamp - && type_overrides.null_mapping.timestamp_null == kJ( k_array )[i] ){ + && 
type_overrides.null_mapping.timestamp_null == kJ( k_array )[i+offset] ){ PARQUET_THROW_NOT_OK( ts_builder->AppendNull() ); } else{ - PARQUET_THROW_NOT_OK(ts_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); + PARQUET_THROW_NOT_OK( ts_builder->Append( tc.KdbToArrow( kJ( k_array )[i+offset] ) ) ); } + } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; TemporalConversion tc(datatype); auto t32_builder = static_cast(builder); auto time32_type = static_pointer_cast(datatype); - for (auto i = 0; i < k_array->n; ++i) + for( auto i = 0; i < length; ++i ){ if( type_overrides.null_mapping.have_time32 - && type_overrides.null_mapping.time32_null == kI( k_array )[i] ){ + && type_overrides.null_mapping.time32_null == kI( k_array )[i+offset] ){ PARQUET_THROW_NOT_OK( t32_builder->AppendNull() ); } else{ - PARQUET_THROW_NOT_OK(t32_builder->Append(tc.KdbToArrow(kI(k_array)[i]))); + PARQUET_THROW_NOT_OK( t32_builder->Append( tc.KdbToArrow( kI( k_array )[i+offset] ) ) ); } + } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; TemporalConversion tc(datatype); auto t64_builder = static_cast(builder); auto time64_type = static_pointer_cast(datatype); - for (auto i = 0; i < k_array->n; ++i) + for( auto i = 0; i < length; ++i ){ if( type_overrides.null_mapping.have_time64 - && type_overrides.null_mapping.time64_null == kJ( k_array )[i] ){ + && type_overrides.null_mapping.time64_null == kJ( k_array )[i+offset] ){ PARQUET_THROW_NOT_OK( t64_builder->AppendNull() ); } else{ - PARQUET_THROW_NOT_OK(t64_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); + PARQUET_THROW_NOT_OK( t64_builder->Append( 
tc.KdbToArrow( kJ( k_array )[i+offset] ) ) ); } + } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto dec_builder = static_cast(builder); auto dec_type = static_pointer_cast(datatype); - for (auto i = 0; i < k_array->n; ++i) { + for (auto i = 0; i < length; ++i) { if (type_overrides.decimal128_as_double) { if( type_overrides.null_mapping.have_decimal - && is_equal( type_overrides.null_mapping.decimal_null, kF( k_array )[i] ) ){ + && is_equal( type_overrides.null_mapping.decimal_null, kF( k_array )[i+offset] ) ){ PARQUET_THROW_NOT_OK( dec_builder->AppendNull() ); } else{ // Construct the decimal from a double arrow::Decimal128 dec128; - PARQUET_ASSIGN_OR_THROW(dec128, arrow::Decimal128::FromReal(kF(k_array)[i], dec_type->precision(), dec_type->scale())); + PARQUET_ASSIGN_OR_THROW(dec128, arrow::Decimal128::FromReal(kF(k_array)[i+offset], dec_type->precision(), dec_type->scale())); PARQUET_THROW_NOT_OK(dec_builder->Append(dec128)); } } else { // Each decimal is a list of 16 bytes - K k_dec = kK(k_array)[i]; + K k_dec = kK(k_array)[i+offset]; TYPE_CHECK_LENGTH(k_dec->n != 16, datatype->ToString(), 16, k_dec->n); TYPE_CHECK_ITEM(k_dec->t != KG, datatype->ToString(), KG, k_dec->t); @@ -919,46 +941,56 @@ void PopulateBuilder(shared_ptr datatype, template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; TemporalConversion tc(datatype); auto dur_builder = static_cast(builder); auto duration_type = static_pointer_cast(datatype); - for (auto i = 0; i < k_array->n; ++i) + for( auto i = 0; i < length; ++i ){ if( type_overrides.null_mapping.have_duration - && 
type_overrides.null_mapping.duration_null == kJ( k_array )[i] ){ + && type_overrides.null_mapping.duration_null == kJ( k_array )[i+offset] ){ PARQUET_THROW_NOT_OK( dur_builder->AppendNull() ); } else{ - PARQUET_THROW_NOT_OK(dur_builder->Append(tc.KdbToArrow(kJ(k_array)[i]))); + PARQUET_THROW_NOT_OK( dur_builder->Append( tc.KdbToArrow( kJ( k_array )[i+offset] ) ) ); } + } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto month_builder = static_cast(builder); if( type_overrides.null_mapping.have_month_interval ){ - std::vector null_bitmap( k_array->n ); - for( auto i = 0ll; i < k_array->n; ++i ){ - null_bitmap[i] = type_overrides.null_mapping.month_interval_null != kI( k_array )[i]; + std::vector null_bitmap( length ); + for( auto i = 0ll; i < length; ++i ){ + null_bitmap[i] = type_overrides.null_mapping.month_interval_null != kI( k_array )[i+offset]; } - PARQUET_THROW_NOT_OK( month_builder->AppendValues( ( int32_t* )kI( k_array ), k_array->n, null_bitmap ) ); + PARQUET_THROW_NOT_OK( month_builder->AppendValues( ( int32_t* )&kI( k_array )[offset], length, null_bitmap ) ); } else{ - PARQUET_THROW_NOT_OK(month_builder->AppendValues((int32_t*)kI(k_array), k_array->n)); + PARQUET_THROW_NOT_OK( month_builder->AppendValues( ( int32_t* )&kI( k_array )[offset], length ) ); } } template<> void PopulateBuilder(shared_ptr datatype, K k_array, arrow::ArrayBuilder* builder, TypeMappingOverride& type_overrides) { + auto chunk = type_overrides.GetChunk( k_array->n ); + int64_t offset = chunk.first; + int64_t length = chunk.second; auto dt_builder = static_cast(builder); - for (auto i = 0; i < k_array->n; ++i){ + for (auto i = 0; i < length; ++i){ if( type_overrides.null_mapping.have_day_time_interval - && type_overrides.null_mapping.day_time_interval_null == kJ( k_array )[i] 
){ + && type_overrides.null_mapping.day_time_interval_null == kJ( k_array )[i+offset] ){ PARQUET_THROW_NOT_OK( dt_builder->AppendNull() ); } else{ - PARQUET_THROW_NOT_OK(dt_builder->Append(KTimespan_DayTimeInterval(kJ(k_array)[i]))); + PARQUET_THROW_NOT_OK( dt_builder->Append( KTimespan_DayTimeInterval( kJ( k_array )[i+offset] ) ) ); } } } From 3b2b6f0714c43dea81a9512c199d005f5e217d7e Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 8 Mar 2023 18:21:33 +0000 Subject: [PATCH 215/276] Including docs with release --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 3084fc5..56238b5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -89,9 +89,9 @@ script: elif [[ $BUILD == "True" && ( $TRAVIS_OS_NAME == "linux" || $TRAVIS_OS_NAME == "osx" ) ]]; then tar -zcvf $FILE_NAME -C cmake/$FILE_ROOT .; elif [[ $TRAVIS_OS_NAME == "windows" ]]; then - 7z a -tzip $FILE_NAME README.md install.bat LICENSE q examples proto; + 7z a -tzip $FILE_NAME README.md install.bat LICENSE q docs examples proto; elif [[ $TRAVIS_OS_NAME == "linux" || $TRAVIS_OS_NAME == "osx" ]]; then - tar -zcvf $FILE_NAME README.md install.sh LICENSE q examples proto; + tar -zcvf $FILE_NAME README.md install.sh LICENSE q docs examples proto; fi deploy: From 0390b02ede34c10f95d1b4cf19a95ab1d85579c0 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 24 Feb 2023 11:36:37 +0300 Subject: [PATCH 216/276] Example of nested nulls bitmap --- examples/null_bitmap.q | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index c8a02fd..afe805c 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -20,8 +20,9 @@ rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; // Support null mapping bitmap_opts:(`bool`int32`float64`string`date32)!(0b;1i;2.34;"start";2006.07.21); 
+nested_opts:(`uint16`float32`binary`time64)!(9h;8.76e;"x"$"alert";12:00:00.000000000); -options:(``NULL_MAPPING)!((::);bitmap_opts); +options:(``NULL_MAPPING)!((::);bitmap_opts,nested_opts); // Create the datatype identifiers ts_dt:.arrowkdb.dt.timestamp[`nano]; @@ -32,6 +33,12 @@ f64_dt:.arrowkdb.dt.float64[]; str_dt:.arrowkdb.dt.utf8[]; d32_dt:.arrowkdb.dt.date32[]; +ui16_dt:.arrowkdb.dt.uint16[]; + +f32_dt:.arrowkdb.dt.float32[]; +bin_dt:.arrowkdb.dt.binary[]; +t64_dt:.arrowkdb.dt.time64[`nano]; + // Create the field identifiers ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; @@ -41,12 +48,37 @@ f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; str_fd:.arrowkdb.fd.field[`string;str_dt]; d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; +ui16_fd:.arrowkdb.fd.field[`uint16;ui16_dt]; + +f32_fd:.arrowkdb.fd.field[`float32;f32_dt]; +bin_fd:.arrowkdb.fd.field[`binary;bin_dt]; +t64_fd:.arrowkdb.fd.field[`time64;t64_dt]; + +// Create a list datatype, using the uint16 datatype as its child +list_dt:.arrowkdb.dt.list[ui16_dt]; +.arrowkdb.dt.printDatatype[list_dt] + +// Create a field containing the list datatype +list_fd:.arrowkdb.fd.field[`list_field;list_dt]; + +// Create a struct datatype using the float32, binary and time64 fields as its children +struct_dt:.arrowkdb.dt.struct[(f32_fd,bin_dt,t64_dt)]; + +// Create a field containing the struct datatype +struct_fd:.arrowkdb.fd.field[`struct_field;struct_dt]; +.arrowkdb.dt.printDatatype[struct_dt] + // Create the schemas for the list of fields bitmap_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,i32_fd,f64_fd,str_fd,d32_fd)]; +// Create the schema containing the list and struct fields +nested_schema:.arrowkdb.sc.schema[(ts_fd,struct_dt)]; + // Print the schema .arrowkdb.sc.printSchema[bitmap_schema]; +.arrowkdb.sc.printSchema[nested_schema]; + // Create data for each column in the table ts_data:asc N?0p; From ede1b2251564fb3bf3b2d5eeefcdede6b15ffc19 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 24 Feb 2023 12:59:29 +0300 
Subject: [PATCH 217/276] Example of writing of nested parquet --- examples/null_bitmap.q | 66 +++++++++++++++++++++++++++++++++++------- 1 file changed, 56 insertions(+), 10 deletions(-) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index afe805c..20af51a 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -20,7 +20,7 @@ rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; // Support null mapping bitmap_opts:(`bool`int32`float64`string`date32)!(0b;1i;2.34;"start";2006.07.21); -nested_opts:(`uint16`float32`binary`time64)!(9h;8.76e;"x"$"alert";12:00:00.000000000); +nested_opts:(`uint16`float32`binary`time64)!(9h;8.76e;"x"$"acknowledge";00:00:00.123456789); options:(``NULL_MAPPING)!((::);bitmap_opts,nested_opts); @@ -56,7 +56,6 @@ t64_fd:.arrowkdb.fd.field[`time64;t64_dt]; // Create a list datatype, using the uint16 datatype as its child list_dt:.arrowkdb.dt.list[ui16_dt]; -.arrowkdb.dt.printDatatype[list_dt] // Create a field containing the list datatype list_fd:.arrowkdb.fd.field[`list_field;list_dt]; @@ -66,17 +65,18 @@ struct_dt:.arrowkdb.dt.struct[(f32_fd,bin_dt,t64_dt)]; // Create a field containing the struct datatype struct_fd:.arrowkdb.fd.field[`struct_field;struct_dt]; -.arrowkdb.dt.printDatatype[struct_dt] // Create the schemas for the list of fields bitmap_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,i32_fd,f64_fd,str_fd,d32_fd)]; // Create the schema containing the list and struct fields -nested_schema:.arrowkdb.sc.schema[(ts_fd,struct_dt)]; +nested_schema:.arrowkdb.sc.schema[(list_fd,struct_dt)]; // Print the schema +-1"\nBitmap schema:"; .arrowkdb.sc.printSchema[bitmap_schema]; +-1"\nNested schema:"; .arrowkdb.sc.printSchema[nested_schema]; // Create data for each column in the table @@ -93,12 +93,34 @@ str_data[3]:"start" d32_data:N?(2006.07.21;2005.07.18;2004.07.16;2003.07.15;2002.07.11); d32_data[4]:2006.07.21; +// Create the data for each of the struct child fields +f32_data:3?100e; 
+f32_data[0]:8.76e; +bin_data:3?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); +bin_data[1]:"x"$"acknowledge" +t64_data:3?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); +t64_data[2]:00:00:00.123456789; + // Combine the data for all columns bitmap_data:(ts_data;bool_data;i32_data;f64_data;str_data;d32_data); +// Create the data for the list array +list_data:(enlist (9h);(8h;7h);(6h;5h;4h)); + +// Create the data for the struct array from its child arrays +struct_data:(f32_data;bin_data;t64_data); + +// Combine the array data for the list and struct columns +nested_data:(list_data;struct_data); + // Pretty print the Arrow table populated from the bitmap data +-1"\nBitmap table:"; .arrowkdb.tb.prettyPrintTable[bitmap_schema;bitmap_data;options]; +// Show the array data as an arrow table +-1"\nNested table:"; +.arrowkdb.tb.prettyPrintTable[nested_schema;nested_data;options] + //-------------------------// // Example-1. Parquet file // //-------------------------// @@ -106,20 +128,44 @@ bitmap_data:(ts_data;bool_data;i32_data;f64_data;str_data;d32_data); // Write the schema and array data to a parquet file options[`PARQUET_VERSION]:`V2.0; -parquet_bitmap:"null_bitmap.parquet"; -.arrowkdb.pq.writeParquet[parquet_bitmap;bitmap_schema;bitmap_data;options]; -show ls parquet_bitmap +parquet_null_bitmap:"null_bitmap.parquet"; +parquet_nested_bitmap:"nested_bitmap.parquet"; + +.arrowkdb.pq.writeParquet[parquet_null_bitmap;bitmap_schema;bitmap_data;options]; +.arrowkdb.pq.writeParquet[parquet_nested_bitmap;nested_schema;nested_data;options]; + +show ls parquet_null_bitmap +show ls parquet_nested_bitmap // Read the array data back and compare options[`WITH_NULL_BITMAP]:1; -parquet_bitmap_data:.arrowkdb.pq.readParquetData[parquet_bitmap;options]; + +// Read the schema back and compare +parquet_bitmap_schema:.arrowkdb.pq.readParquetSchema[parquet_null_bitmap]; 
+parquet_nested_schema:.arrowkdb.pq.readParquetSchema[parquet_nested_bitmap]; + +show .arrowkdb.sc.equalSchemas[bitmap_schema;parquet_bitmap_schema] +show .arrowkdb.sc.equalSchemas[nested_schema;parquet_nested_schema] + +show bitmap_schema~parquet_bitmap_schema +show nested_schema~parquet_nested_schema + +parquet_bitmap_data:.arrowkdb.pq.readParquetData[parquet_null_bitmap;options]; +parquet_nested_data:.arrowkdb.pq.readParquetData[parquet_nested_bitmap;options]; + show bitmap_data~first parquet_bitmap_data +show nested_data~first parquet_nested_data nulls_data:1b,(N-1)?1b; bitmap_nulls:{x rotate nulls_data} each neg til {x-1} count bitmap_data; + parquet_bitmap_nulls:last parquet_bitmap_data; +parquet_nested_nulls:last parquet_nested_data; + show bitmap_nulls~bitmap_nulls & sublist[{1-x} count parquet_bitmap_nulls;parquet_bitmap_nulls] -rm parquet_bitmap; + +rm parquet_null_bitmap; +rm parquet_nested_bitmap; //---------------------------// // Example-2. Arrow IPC file // @@ -166,4 +212,4 @@ show bitmap_nulls~bitmap_nulls & sublist[{1-x} count stream_bitmap_nulls;stream_ -1 "\n+----------------------------------------+\n"; // Process off -exit 0; +//exit 0; From 9d450bd940e52b207e05d3018d9e1e7be3c9395d Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 24 Feb 2023 13:31:03 +0300 Subject: [PATCH 218/276] Example of writing of nested arrow file --- examples/null_bitmap.q | 53 +++++++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index 20af51a..f7be20d 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -172,44 +172,75 @@ rm parquet_nested_bitmap; //---------------------------// // Write the schema and array data to an arrow file -arrow_bitmap:"null_bitmap.arrow"; -.arrowkdb.ipc.writeArrow[arrow_bitmap;bitmap_schema;bitmap_data;options]; -show ls arrow_bitmap +arrow_null_bitmap:"null_bitmap.arrow"; +arrow_nested_bitmap:"nested_bitmap.arrow"; + 
+.arrowkdb.ipc.writeArrow[arrow_null_bitmap;bitmap_schema;bitmap_data;options]; +.arrowkdb.ipc.writeArrow[arrow_nested_bitmap;nested_schema;nested_data;options]; + +show ls arrow_null_bitmap +show ls arrow_nested_bitmap // Read the schema back and compare -arrow_bitmap_schema:.arrowkdb.ipc.readArrowSchema[arrow_bitmap]; +arrow_bitmap_schema:.arrowkdb.ipc.readArrowSchema[arrow_null_bitmap]; +arrow_nested_schema:.arrowkdb.ipc.readArrowSchema[arrow_nested_bitmap]; + show .arrowkdb.sc.equalSchemas[bitmap_schema;arrow_bitmap_schema] +show .arrowkdb.sc.equalSchemas[nested_schema;arrow_nested_schema] + show bitmap_schema~arrow_bitmap_schema +show nested_schema~arrow_nested_schema // Read the array data back and compare -arrow_bitmap_data:.arrowkdb.ipc.readArrowData[arrow_bitmap;options]; +arrow_bitmap_data:.arrowkdb.ipc.readArrowData[arrow_null_bitmap;options]; +arrow_nested_data:.arrowkdb.ipc.readArrowData[arrow_nested_bitmap;options]; + show bitmap_data~first arrow_bitmap_data +show nested_data~first arrow_nested_data + arrow_bitmap_nulls:last arrow_bitmap_data; +arrow_nested_nulls:last arrow_nested_data; + show bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bitmap_nulls] -rm arrow_bitmap; + +rm arrow_null_bitmap; +rm arrow_nested_bitmap; //-----------------------------// // Example-3. 
Arrow IPC stream // //-----------------------------// // Serialize the schema and array data to an arrow stream -serialized_bitmap:.arrowkdb.ipc.serializeArrow[bitmap_schema;bitmap_data;options]; -show serialized_bitmap +serialized_null_bitmap:.arrowkdb.ipc.serializeArrow[bitmap_schema;bitmap_data;options]; +serialized_nested_bitmap:.arrowkdb.ipc.serializeArrow[nested_schema;nested_data;options]; + +show serialized_null_bitmap +show serialized_nested_bitmap // Parse the schema back abd compare -stream_bitmap_schema:.arrowkdb.ipc.parseArrowSchema[serialized_bitmap]; +stream_bitmap_schema:.arrowkdb.ipc.parseArrowSchema[serialized_null_bitmap]; +stream_nested_schema:.arrowkdb.ipc.parseArrowSchema[serialized_nested_bitmap]; + show .arrowkdb.sc.equalSchemas[bitmap_schema;stream_bitmap_schema] +show .arrowkdb.sc.equalSchemas[nested_schema;stream_nested_schema] + show bitmap_schema~stream_bitmap_schema +show nested_schema~stream_nested_schema // Parse the array data back and compare -stream_bitmap_data:.arrowkdb.ipc.parseArrowData[serialized_bitmap;options]; +stream_bitmap_data:.arrowkdb.ipc.parseArrowData[serialized_null_bitmap;options]; +stream_nested_data:.arrowkdb.ipc.parseArrowData[serialized_nested_bitmap;options]; + show bitmap_data~first stream_bitmap_data +show nested_data~first stream_nested_data stream_bitmap_nulls:last stream_bitmap_data; +stream_nested_nulls:last stream_nested_data; + show bitmap_nulls~bitmap_nulls & sublist[{1-x} count stream_bitmap_nulls;stream_bitmap_nulls] -1 "\n+----------------------------------------+\n"; // Process off -//exit 0; +exit 0; From 39874d3e6c19696f80fb40c19f9b4ab24db9bf94 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 27 Feb 2023 21:39:24 +0300 Subject: [PATCH 219/276] Example of nested null bitmap validation --- examples/null_bitmap.q | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index f7be20d..687b6fa 100644 --- 
a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -158,11 +158,16 @@ show nested_data~first parquet_nested_data nulls_data:1b,(N-1)?1b; bitmap_nulls:{x rotate nulls_data} each neg til {x-1} count bitmap_data; +nested_list_nulls:((::;1b);(::;0b;0b);(::;0b;0b;0b)) +nested_struct_nulls:((::;1b;0b;0b);(::;0b;1b;0b);(::;0b;0b;1b)) parquet_bitmap_nulls:last parquet_bitmap_data; -parquet_nested_nulls:last parquet_nested_data; +parquet_list_nulls:first parquet_nested_data[1] +parquet_struct_nulls:last parquet_nested_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count parquet_bitmap_nulls;parquet_bitmap_nulls] +nested_list_nulls~{(::),x} each parquet_list_nulls +nested_struct_nulls~{(::),x} each parquet_struct_nulls rm parquet_null_bitmap; rm parquet_nested_bitmap; @@ -199,9 +204,12 @@ show bitmap_data~first arrow_bitmap_data show nested_data~first arrow_nested_data arrow_bitmap_nulls:last arrow_bitmap_data; -arrow_nested_nulls:last arrow_nested_data; +arrow_list_nulls:first parquet_nested_data[1] +arrow_struct_nulls:last parquet_nested_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bitmap_nulls] +nested_list_nulls~{(::),x} each arrow_list_nulls +nested_struct_nulls~{(::),x} each arrow_struct_nulls rm arrow_null_bitmap; rm arrow_nested_bitmap; @@ -235,10 +243,12 @@ show bitmap_data~first stream_bitmap_data show nested_data~first stream_nested_data stream_bitmap_nulls:last stream_bitmap_data; -stream_nested_nulls:last stream_nested_data; +stream_list_nulls:first parquet_nested_data[1] +stream_struct_nulls:last parquet_nested_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count stream_bitmap_nulls;stream_bitmap_nulls] - +nested_list_nulls~{(::),x} each stream_list_nulls +nested_struct_nulls~{(::),x} each stream_struct_nulls -1 "\n+----------------------------------------+\n"; From cb4bb7795f25c4e93d7bd38a590e98e9f6f1a5d9 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 27 Feb 2023 21:46:16 +0300 
Subject: [PATCH 220/276] Recurse through constituents of nested types --- src/ArrayReader.cpp | 114 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 97 insertions(+), 17 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 16c1043..72d3636 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -616,6 +616,90 @@ unordered_map ArrayHandlers { , make_array_handler() }; +template +K MakeNullBitmap( shared_ptr array_data, size_t& index ); + +template<> +K MakeNullBitmap( shared_ptr array_data, size_t& index ) +{ + auto slice_array = static_pointer_cast( array_data )->value_slice( index ); + + size_t counter = 0; + auto length = slice_array->length(); + K k_bitmap = knk( length ); + auto slice = slice_array->Slice( 0, length ); + AppendNullBitmap( slice, k_bitmap, counter ); + + return k_bitmap; +} + +template<> +K MakeNullBitmap( shared_ptr array_data, size_t& index ) +{ + return nullptr; +} + +template<> +K MakeNullBitmap( shared_ptr array_data, size_t& index ) +{ + return nullptr; +} + +template<> +K MakeNullBitmap( shared_ptr array_data, size_t& index ) +{ + return nullptr; +} + +template<> +K MakeNullBitmap( shared_ptr array_data, size_t& index ) +{ + auto struct_array = static_pointer_cast( array_data ); + + size_t counter = 0; + auto num_fields = struct_array->type()->num_fields(); + K k_bitmap = knk( num_fields ); + auto field = struct_array->field( index ); + AppendNullBitmap( field, k_bitmap, counter ); + + return k_bitmap; +} + +template<> +K MakeNullBitmap( shared_ptr array_data, size_t& index ) +{ + return nullptr; +} + +template<> +K MakeNullBitmap( shared_ptr array_data, size_t& index ) +{ + return nullptr; +} + +template<> +K MakeNullBitmap( shared_ptr array_data, size_t& index ) +{ + return nullptr; +} + +template +auto make_null_bitmap_handler() +{ + return make_pair( TypeId, &MakeNullBitmap ); +} + +unordered_map array_data, size_t& index )>> null_bitmap_handlers{ + make_null_bitmap_handler() + , 
make_null_bitmap_handler() + , make_null_bitmap_handler() + , make_null_bitmap_handler() + , make_null_bitmap_handler() + , make_null_bitmap_handler() + , make_null_bitmap_handler() + , make_null_bitmap_handler() +}; + } // namespace namespace kx { @@ -637,23 +721,18 @@ void AppendArray(shared_ptr array_data, K k_array, size_t& index, void AppendNullBitmap( shared_ptr array_data, K k_bitmap, size_t& index ) { auto type_id = array_data->type_id(); - if( array_data->null_count() == 0 - || arrow::Type::LIST == type_id - || arrow::Type::LARGE_LIST == type_id - || arrow::Type::FIXED_SIZE_LIST == type_id - || arrow::Type::MAP == type_id - || arrow::Type::STRUCT == type_id - || arrow::Type::SPARSE_UNION == type_id - || arrow::Type::DENSE_UNION == type_id - || arrow::Type::DICTIONARY == type_id ){ - memset( &kG( k_bitmap )[index], 0, array_data->length() ); - index += array_data->length(); - } - else{ - for( auto i = 0; i < array_data->length(); ++i ){ - kG( k_bitmap )[index] = array_data->IsNull( index ); - ++index; + auto length = array_data->length(); + for( auto i = 0; i < length; ++i ){ + if( array_data->IsNull( i ) ){ + kK( k_bitmap )[index] = kb( true ); + } + else if( null_bitmap_handlers.find( type_id ) != null_bitmap_handlers.end() ){ + kK( k_bitmap )[index] = null_bitmap_handlers[type_id]( array_data, index ); } + else{ + kK( k_bitmap )[index] = kb( false ); + } + ++index; } } @@ -722,7 +801,8 @@ K ReadChunkedArray(shared_ptr chunked_array, TypeMappingOve K ReadChunkedNullBitmap( shared_ptr chunked_array, TypeMappingOverride& type_overrides ) { auto boolean = std::make_shared(); - K k_bitmap = InitKdbForArray( boolean, chunked_array->length(), type_overrides ); + K k_bitmap = knk( chunked_array->length() ); + size_t index = 0; for( auto i = 0; i < chunked_array->num_chunks(); ++i ){ AppendNullBitmap( chunked_array->chunk( i ), k_bitmap, index ); From 76d3f0ac561fa12ea9ccc89dd2011737462d05ac Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 28 Feb 2023 
11:52:05 +0300 Subject: [PATCH 221/276] Unit-tests of nested null bitmap for Travis CI --- tests/.gitignore | 1 + tests/nested_null_bitmap.t | 142 +++++++++++++++++++++++++++++++++++++ 2 files changed, 143 insertions(+) create mode 100644 tests/nested_null_bitmap.t diff --git a/tests/.gitignore b/tests/.gitignore index 9efe47e..d96539e 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -1,5 +1,6 @@ basic.q null_bitmap.q +nested_null_bitmap.q null_mapping_short.q null_mapping_long.q null_mapping_float.q diff --git a/tests/nested_null_bitmap.t b/tests/nested_null_bitmap.t new file mode 100644 index 0000000..0b7b037 --- /dev/null +++ b/tests/nested_null_bitmap.t @@ -0,0 +1,142 @@ +// nested_null_bitmap.t + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l q/arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping ||----------+\n"; +nested_opts:(`uint16`float32`binary`time64)!(9h;8.76e;"x"$"acknowledge";00:00:00.123456789); + +options:(``NULL_MAPPING)!((::);nested_opts); + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +ui16_dt:.arrowkdb.dt.uint16[]; + +f32_dt:.arrowkdb.dt.float32[]; +bin_dt:.arrowkdb.dt.binary[]; +t64_dt:.arrowkdb.dt.time64[`nano]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +ui16_fd:.arrowkdb.fd.field[`uint16;ui16_dt]; + +f32_fd:.arrowkdb.fd.field[`float32;f32_dt]; +bin_fd:.arrowkdb.fd.field[`binary;bin_dt]; +t64_fd:.arrowkdb.fd.field[`time64;t64_dt]; + +-1"\n+----------|| Create a list datatype, using the uint16 datatype as its child ||----------+\n"; +list_dt:.arrowkdb.dt.list[ui16_dt]; + +-1"\n+----------|| Create a field containing the list datatype ||----------+\n"; +list_fd:.arrowkdb.fd.field[`list_field;list_dt]; + 
+-1"\n+----------|| Create a struct datatype using the float32, binary and time64 fields as its children ||----------+\n"; +struct_dt:.arrowkdb.dt.struct[(f32_fd,bin_dt,t64_dt)]; + +-1"\n+----------|| Create a field containing the struct datatype ||----------+\n"; +struct_fd:.arrowkdb.fd.field[`struct_field;struct_dt]; + +-1"\n+----------|| Create the schema containing the list and struct fields ||----------+\n"; +nested_schema:.arrowkdb.sc.schema[(list_fd,struct_dt)]; + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +-1"\n+----------|| Create the data for each of the struct child fields ||----------+\n"; +f32_data:3?100e; +f32_data[0]:8.76e; +bin_data:3?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); +bin_data[1]:"x"$"acknowledge" +t64_data:3?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); +t64_data[2]:00:00:00.123456789; + +-1"\n+----------|| Create the data for the list array ||----------+\n"; +list_data:(enlist (9h);(8h;7h);(6h;5h;4h)); + +-1"\n+----------|| Create the data for the struct array from its child arrays ||----------+\n"; +struct_data:(f32_data;bin_data;t64_data); + +-1"\n+----------|| Combine the array data for the list and struct columns ||----------+\n"; +nested_data:(list_data;struct_data); + +-1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n"; +options[`PARQUET_VERSION]:`V2.0; + +parquet_nested_bitmap:"nested_bitmap.parquet"; +.arrowkdb.pq.writeParquet[parquet_nested_bitmap;nested_schema;nested_data;options]; + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +options[`WITH_NULL_BITMAP]:1; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +parquet_nested_schema:.arrowkdb.pq.readParquetSchema[parquet_nested_bitmap]; +.arrowkdb.sc.equalSchemas[nested_schema;parquet_nested_schema] +nested_schema~parquet_nested_schema + +-1"\n+----------|| Read the 
array data back and compare ||----------+\n"; +parquet_nested_data:.arrowkdb.pq.readParquetData[parquet_nested_bitmap;options]; +nested_data~first parquet_nested_data + +-1"\n+----------|| Compare nested null bitmaps ||----------+\n"; +nested_list_nulls:((::;1b);(::;0b;0b);(::;0b;0b;0b)) +nested_struct_nulls:((::;1b;0b;0b);(::;0b;1b;0b);(::;0b;0b;1b)) + +parquet_list_nulls:first parquet_nested_data[1] +parquet_struct_nulls:last parquet_nested_data[1] +nested_list_nulls~{(::),x} each parquet_list_nulls +nested_struct_nulls~{(::),x} each parquet_struct_nulls + +rm parquet_nested_bitmap; + +-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; +arrow_nested_bitmap:"nested_bitmap.arrow"; +.arrowkdb.ipc.writeArrow[arrow_nested_bitmap;nested_schema;nested_data;options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +arrow_nested_schema:.arrowkdb.ipc.readArrowSchema[arrow_nested_bitmap]; +.arrowkdb.sc.equalSchemas[nested_schema;arrow_nested_schema] +nested_schema~arrow_nested_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +arrow_nested_data:.arrowkdb.ipc.readArrowData[arrow_nested_bitmap;options]; +nested_data~first arrow_nested_data + +-1"\n+----------|| Compare nested null bitmaps ||----------+\n"; +arrow_list_nulls:first parquet_nested_data[1] +arrow_struct_nulls:last parquet_nested_data[1] +nested_list_nulls~{(::),x} each arrow_list_nulls +nested_struct_nulls~{(::),x} each arrow_struct_nulls + +rm arrow_nested_bitmap; + +-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; +serialized_nested_bitmap:.arrowkdb.ipc.serializeArrow[nested_schema;nested_data;options]; + +-1"\n+----------|| Parse the schema back abd compare ||----------+\n"; +stream_nested_schema:.arrowkdb.ipc.parseArrowSchema[serialized_nested_bitmap]; +.arrowkdb.sc.equalSchemas[nested_schema;stream_nested_schema] +nested_schema~stream_nested_schema + +-1"\n+----------|| 
Parse the array data back and compare ||----------+\n"; +stream_nested_data:.arrowkdb.ipc.parseArrowData[serialized_nested_bitmap;options]; +nested_data~first stream_nested_data + +-1"\n+----------|| Compare nested null bitmaps ||----------+\n"; +stream_list_nulls:first parquet_nested_data[1] +stream_struct_nulls:last parquet_nested_data[1] +nested_list_nulls~{(::),x} each stream_list_nulls +nested_struct_nulls~{(::),x} each stream_struct_nulls + + +-1 "\n+----------|| Test utils ||----------+\n"; + +.arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; From d48d7a1f46b3b4bebb474f9f6dd5247dabe9c1b4 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 28 Feb 2023 12:34:06 +0300 Subject: [PATCH 222/276] Break null bitmap traversal of lists for simple types --- src/ArrayReader.cpp | 101 +++++++++++++++++++++++++------------------- src/ArrayReader.h | 31 +++++++------- 2 files changed, 72 insertions(+), 60 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 72d3636..a8ef3d1 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -616,80 +616,93 @@ unordered_map ArrayHandlers { , make_array_handler() }; +using BitmapHandler = K (*) (shared_ptr array_data, size_t& index ); + +extern unordered_map null_bitmap_handlers; + template -K MakeNullBitmap( shared_ptr array_data, size_t& index ); +K AppendNullBitmap( shared_ptr array_data, size_t& index ); template<> -K MakeNullBitmap( shared_ptr array_data, size_t& index ) +K AppendNullBitmap( shared_ptr array_data, size_t& index ) { - auto slice_array = static_pointer_cast( array_data )->value_slice( index ); + K k_bitmap = nullptr; + auto slice_array = static_pointer_cast( array_data )->value_slice( index ); + auto length = slice_array->length(); + if( null_bitmap_handlers.find( slice_array->type_id() ) == null_bitmap_handlers.end() ){ + k_bitmap = ktn( KB, length ); + for( int i = 0ll; i < length; ++i ){ + kG( k_bitmap 
)[i] = slice_array->IsNull( i ); + } + } + else{ size_t counter = 0; - auto length = slice_array->length(); - K k_bitmap = knk( length ); + k_bitmap = knk( length ); auto slice = slice_array->Slice( 0, length ); - AppendNullBitmap( slice, k_bitmap, counter ); + InitKdbNullBitmap( slice, k_bitmap, counter ); + } - return k_bitmap; + return k_bitmap; } template<> -K MakeNullBitmap( shared_ptr array_data, size_t& index ) +K AppendNullBitmap( shared_ptr array_data, size_t& index ) { - return nullptr; + return nullptr; } template<> -K MakeNullBitmap( shared_ptr array_data, size_t& index ) +K AppendNullBitmap( shared_ptr array_data, size_t& index ) { - return nullptr; + return nullptr; } template<> -K MakeNullBitmap( shared_ptr array_data, size_t& index ) +K AppendNullBitmap( shared_ptr array_data, size_t& index ) { - return nullptr; + return nullptr; } template<> -K MakeNullBitmap( shared_ptr array_data, size_t& index ) +K AppendNullBitmap( shared_ptr array_data, size_t& index ) { - auto struct_array = static_pointer_cast( array_data ); + auto struct_array = static_pointer_cast( array_data ); - size_t counter = 0; - auto num_fields = struct_array->type()->num_fields(); - K k_bitmap = knk( num_fields ); - auto field = struct_array->field( index ); - AppendNullBitmap( field, k_bitmap, counter ); + size_t counter = 0; + auto num_fields = struct_array->type()->num_fields(); + K k_bitmap = knk( num_fields ); + auto field = struct_array->field( index ); + InitKdbNullBitmap( field, k_bitmap, counter ); - return k_bitmap; + return k_bitmap; } template<> -K MakeNullBitmap( shared_ptr array_data, size_t& index ) +K AppendNullBitmap( shared_ptr array_data, size_t& index ) { - return nullptr; + return nullptr; } template<> -K MakeNullBitmap( shared_ptr array_data, size_t& index ) +K AppendNullBitmap( shared_ptr array_data, size_t& index ) { - return nullptr; + return nullptr; } template<> -K MakeNullBitmap( shared_ptr array_data, size_t& index ) +K AppendNullBitmap( shared_ptr 
array_data, size_t& index ) { - return nullptr; + return nullptr; } template auto make_null_bitmap_handler() { - return make_pair( TypeId, &MakeNullBitmap ); + return make_pair( TypeId, &AppendNullBitmap ); } -unordered_map array_data, size_t& index )>> null_bitmap_handlers{ +unordered_map null_bitmap_handlers{ make_null_bitmap_handler() , make_null_bitmap_handler() , make_null_bitmap_handler() @@ -718,22 +731,22 @@ void AppendArray(shared_ptr array_data, K k_array, size_t& index, } } -void AppendNullBitmap( shared_ptr array_data, K k_bitmap, size_t& index ) +void InitKdbNullBitmap( shared_ptr array_data, K k_bitmap, size_t& index ) { - auto type_id = array_data->type_id(); - auto length = array_data->length(); - for( auto i = 0; i < length; ++i ){ - if( array_data->IsNull( i ) ){ - kK( k_bitmap )[index] = kb( true ); - } - else if( null_bitmap_handlers.find( type_id ) != null_bitmap_handlers.end() ){ - kK( k_bitmap )[index] = null_bitmap_handlers[type_id]( array_data, index ); - } - else{ - kK( k_bitmap )[index] = kb( false ); - } - ++index; + auto type_id = array_data->type_id(); + auto length = array_data->length(); + for( auto i = 0; i < length; ++i ){ + if( array_data->IsNull( i ) ){ + kK( k_bitmap )[index] = kb( true ); } + else if( null_bitmap_handlers.find( type_id ) != null_bitmap_handlers.end() ){ + kK( k_bitmap )[index] = null_bitmap_handlers[type_id]( array_data, index ); + } + else{ + kK( k_bitmap )[index] = kb( false ); + } + ++index; + } } K InitKdbForArray(shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides) @@ -805,7 +818,7 @@ K ReadChunkedNullBitmap( shared_ptr chunked_array, TypeMapp size_t index = 0; for( auto i = 0; i < chunked_array->num_chunks(); ++i ){ - AppendNullBitmap( chunked_array->chunk( i ), k_bitmap, index ); + InitKdbNullBitmap( chunked_array->chunk( i ), k_bitmap, index ); } return k_bitmap; diff --git a/src/ArrayReader.h b/src/ArrayReader.h index 16c72b5..9ed0386 100644 --- a/src/ArrayReader.h +++ 
b/src/ArrayReader.h @@ -26,22 +26,6 @@ namespace arrowkdb { */ void AppendArray(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides); - -/** - * @brief Appends null bitmap data from an arrow array into an existing kdb boolean - * list starting at the specified index. - * - * @param array_data The arrow array from which to source the data. The entire - * array will be appended. - * @param k_bitmap The kdb boolean list that the data should be inserted into. - * This list needs to have been created with the correct length by the calling - * function. - * @param index The index into the kdb list at which the appending should - * begin. Index will be updated to account for the new offset by adding the - * length of the array array. -*/ -void AppendNullBitmap( std::shared_ptr array_data, K k_bitmap, size_t& index ); - /** * @brief Copies and converts an arrow array to a kdb list * @@ -80,6 +64,21 @@ K ReadChunkedNullBitmap( std::shared_ptr chunked_array, Typ */ K InitKdbForArray(std::shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides); +/** + * @brief Appends null bitmap data from an arrow array into an existing kdb boolean + * list starting at the specified index. + * + * @param array_data The arrow array from which to source the data. The entire + * array will be appended. + * @param k_bitmap The kdb boolean list that the data should be inserted into. + * This list needs to have been created with the correct length by the calling + * function. + * @param index The index into the kdb list at which the appending should + * begin. Index will be updated to account for the new offset by adding the + * length of the array array. 
+*/ +void InitKdbNullBitmap( std::shared_ptr array_data, K k_bitmap, size_t& index ); + } // namespace arrowkdb } // namespace kx From ba9383dbbf40ed4815946d52b5155b9d7205008d Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 28 Feb 2023 14:21:46 +0300 Subject: [PATCH 223/276] Simple lists for validating nested nulls --- examples/null_bitmap.q | 10 +++++----- tests/nested_null_bitmap.t | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index 687b6fa..e36822e 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -158,7 +158,7 @@ show nested_data~first parquet_nested_data nulls_data:1b,(N-1)?1b; bitmap_nulls:{x rotate nulls_data} each neg til {x-1} count bitmap_data; -nested_list_nulls:((::;1b);(::;0b;0b);(::;0b;0b;0b)) +nested_list_nulls:(enlist 1b;00b;000b) nested_struct_nulls:((::;1b;0b;0b);(::;0b;1b;0b);(::;0b;0b;1b)) parquet_bitmap_nulls:last parquet_bitmap_data; @@ -166,7 +166,7 @@ parquet_list_nulls:first parquet_nested_data[1] parquet_struct_nulls:last parquet_nested_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count parquet_bitmap_nulls;parquet_bitmap_nulls] -nested_list_nulls~{(::),x} each parquet_list_nulls +nested_list_nulls~parquet_list_nulls nested_struct_nulls~{(::),x} each parquet_struct_nulls rm parquet_null_bitmap; @@ -208,7 +208,7 @@ arrow_list_nulls:first parquet_nested_data[1] arrow_struct_nulls:last parquet_nested_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bitmap_nulls] -nested_list_nulls~{(::),x} each arrow_list_nulls +nested_list_nulls~arrow_list_nulls nested_struct_nulls~{(::),x} each arrow_struct_nulls rm arrow_null_bitmap; @@ -247,10 +247,10 @@ stream_list_nulls:first parquet_nested_data[1] stream_struct_nulls:last parquet_nested_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count stream_bitmap_nulls;stream_bitmap_nulls] -nested_list_nulls~{(::),x} each stream_list_nulls 
+nested_list_nulls~stream_list_nulls nested_struct_nulls~{(::),x} each stream_struct_nulls -1 "\n+----------------------------------------+\n"; // Process off -exit 0; +//exit 0; diff --git a/tests/nested_null_bitmap.t b/tests/nested_null_bitmap.t index 0b7b037..4d9fde1 100644 --- a/tests/nested_null_bitmap.t +++ b/tests/nested_null_bitmap.t @@ -83,12 +83,12 @@ parquet_nested_data:.arrowkdb.pq.readParquetData[parquet_nested_bitmap;options]; nested_data~first parquet_nested_data -1"\n+----------|| Compare nested null bitmaps ||----------+\n"; -nested_list_nulls:((::;1b);(::;0b;0b);(::;0b;0b;0b)) +nested_list_nulls:(enlist 1b;00b;000b) nested_struct_nulls:((::;1b;0b;0b);(::;0b;1b;0b);(::;0b;0b;1b)) parquet_list_nulls:first parquet_nested_data[1] parquet_struct_nulls:last parquet_nested_data[1] -nested_list_nulls~{(::),x} each parquet_list_nulls +nested_list_nulls~parquet_list_nulls nested_struct_nulls~{(::),x} each parquet_struct_nulls rm parquet_nested_bitmap; @@ -109,7 +109,7 @@ nested_data~first arrow_nested_data -1"\n+----------|| Compare nested null bitmaps ||----------+\n"; arrow_list_nulls:first parquet_nested_data[1] arrow_struct_nulls:last parquet_nested_data[1] -nested_list_nulls~{(::),x} each arrow_list_nulls +nested_list_nulls~arrow_list_nulls nested_struct_nulls~{(::),x} each arrow_struct_nulls rm arrow_nested_bitmap; @@ -129,7 +129,7 @@ nested_data~first stream_nested_data -1"\n+----------|| Compare nested null bitmaps ||----------+\n"; stream_list_nulls:first parquet_nested_data[1] stream_struct_nulls:last parquet_nested_data[1] -nested_list_nulls~{(::),x} each stream_list_nulls +nested_list_nulls~stream_list_nulls nested_struct_nulls~{(::),x} each stream_struct_nulls From b7f170b3f69a0afc0a74d79759a1e8e50eba7bb4 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 28 Feb 2023 20:06:04 +0300 Subject: [PATCH 224/276] Simple arrays for nested nulls --- src/ArrayReader.cpp | 39 +++++++++++++++++---------------------- 1 file changed, 17 
insertions(+), 22 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index a8ef3d1..9c694b3 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -626,22 +626,14 @@ K AppendNullBitmap( shared_ptr array_data, size_t& index ); template<> K AppendNullBitmap( shared_ptr array_data, size_t& index ) { - K k_bitmap = nullptr; + string strArray = array_data->ToString(); auto slice_array = static_pointer_cast( array_data )->value_slice( index ); auto length = slice_array->length(); - if( null_bitmap_handlers.find( slice_array->type_id() ) == null_bitmap_handlers.end() ){ - k_bitmap = ktn( KB, length ); - for( int i = 0ll; i < length; ++i ){ - kG( k_bitmap )[i] = slice_array->IsNull( i ); - } - } - else{ - size_t counter = 0; - k_bitmap = knk( length ); - auto slice = slice_array->Slice( 0, length ); - InitKdbNullBitmap( slice, k_bitmap, counter ); - } + size_t counter = 0; + K k_bitmap = ktn( KB, length ); + auto slice = slice_array->Slice( 0, length ); + InitKdbNullBitmap( slice, k_bitmap, counter ); return k_bitmap; } @@ -667,11 +659,12 @@ K AppendNullBitmap( shared_ptr array_data, size_ template<> K AppendNullBitmap( shared_ptr array_data, size_t& index ) { + string strArray = array_data->ToString(); auto struct_array = static_pointer_cast( array_data ); size_t counter = 0; auto num_fields = struct_array->type()->num_fields(); - K k_bitmap = knk( num_fields ); + K k_bitmap = ktn( KB, num_fields ); auto field = struct_array->field( index ); InitKdbNullBitmap( field, k_bitmap, counter ); @@ -735,17 +728,19 @@ void InitKdbNullBitmap( shared_ptr array_data, K k_bitmap, size_t& { auto type_id = array_data->type_id(); auto length = array_data->length(); - for( auto i = 0; i < length; ++i ){ - if( array_data->IsNull( i ) ){ - kK( k_bitmap )[index] = kb( true ); + + string strArray = array_data->ToString(); + if( null_bitmap_handlers.find( type_id ) == null_bitmap_handlers.end() ){ + for( int i = 0ll; i < length; ++i ){ + kG( k_bitmap )[index] = 
array_data->IsNull( i ); + ++index; } - else if( null_bitmap_handlers.find( type_id ) != null_bitmap_handlers.end() ){ + } + else{ + for( int i = 0ll; i < length; ++i ){ kK( k_bitmap )[index] = null_bitmap_handlers[type_id]( array_data, index ); + ++index; } - else{ - kK( k_bitmap )[index] = kb( false ); - } - ++index; } } From 15abcad5fd468d28bf621c97332d8a0e61fd9116 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 28 Feb 2023 20:49:19 +0300 Subject: [PATCH 225/276] Deep nested structures handled --- src/ArrayReader.cpp | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 9c694b3..67d0326 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -626,12 +626,14 @@ K AppendNullBitmap( shared_ptr array_data, size_t& index ); template<> K AppendNullBitmap( shared_ptr array_data, size_t& index ) { - string strArray = array_data->ToString(); auto slice_array = static_pointer_cast( array_data )->value_slice( index ); auto length = slice_array->length(); + auto type_id = slice_array->type_id(); size_t counter = 0; - K k_bitmap = ktn( KB, length ); + K k_bitmap = ( null_bitmap_handlers.find( type_id ) == null_bitmap_handlers.end() ) + ? ktn( KB, length ) + : knk( length ); auto slice = slice_array->Slice( 0, length ); InitKdbNullBitmap( slice, k_bitmap, counter ); @@ -659,13 +661,15 @@ K AppendNullBitmap( shared_ptr array_data, size_ template<> K AppendNullBitmap( shared_ptr array_data, size_t& index ) { - string strArray = array_data->ToString(); auto struct_array = static_pointer_cast( array_data ); - - size_t counter = 0; auto num_fields = struct_array->type()->num_fields(); - K k_bitmap = ktn( KB, num_fields ); auto field = struct_array->field( index ); + auto type_id = field->type_id(); + + size_t counter = 0; + K k_bitmap = ( null_bitmap_handlers.find( type_id ) == null_bitmap_handlers.end() ) + ? 
ktn( KB, num_fields ) + : knk( num_fields ); InitKdbNullBitmap( field, k_bitmap, counter ); return k_bitmap; @@ -729,18 +733,14 @@ void InitKdbNullBitmap( shared_ptr array_data, K k_bitmap, size_t& auto type_id = array_data->type_id(); auto length = array_data->length(); - string strArray = array_data->ToString(); - if( null_bitmap_handlers.find( type_id ) == null_bitmap_handlers.end() ){ - for( int i = 0ll; i < length; ++i ){ + for( int i = 0ll; i < length; ++i ){ + if( null_bitmap_handlers.find( type_id ) == null_bitmap_handlers.end() ){ kG( k_bitmap )[index] = array_data->IsNull( i ); - ++index; } - } - else{ - for( int i = 0ll; i < length; ++i ){ + else{ kK( k_bitmap )[index] = null_bitmap_handlers[type_id]( array_data, index ); - ++index; } + ++index; } } @@ -808,8 +808,11 @@ K ReadChunkedArray(shared_ptr chunked_array, TypeMappingOve K ReadChunkedNullBitmap( shared_ptr chunked_array, TypeMappingOverride& type_overrides ) { - auto boolean = std::make_shared(); - K k_bitmap = knk( chunked_array->length() ); + auto length = chunked_array->length(); + auto type_id = chunked_array->type()->id(); + K k_bitmap = ( null_bitmap_handlers.find( type_id ) == null_bitmap_handlers.end() ) + ? 
ktn( KB, length ) + : knk( length ); size_t index = 0; for( auto i = 0; i < chunked_array->num_chunks(); ++i ){ From 6303dbfc54836e0e482c6cc4c87fae0d8e40cc47 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 28 Feb 2023 20:50:33 +0300 Subject: [PATCH 226/276] Unit-tests nesting performance --- examples/null_bitmap.q | 54 ++++++++--------- tests/.gitignore | 2 +- tests/crucial_null_bitmap.t | 114 ++++++++++++++++++++++++++++++++++++ tests/nested_null_bitmap.t | 42 ++++++------- tests/null_bitmap.t | 109 ---------------------------------- 5 files changed, 164 insertions(+), 157 deletions(-) create mode 100644 tests/crucial_null_bitmap.t delete mode 100644 tests/null_bitmap.t diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index e36822e..c10a5b0 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -22,7 +22,7 @@ rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; bitmap_opts:(`bool`int32`float64`string`date32)!(0b;1i;2.34;"start";2006.07.21); nested_opts:(`uint16`float32`binary`time64)!(9h;8.76e;"x"$"acknowledge";00:00:00.123456789); -options:(``NULL_MAPPING)!((::);bitmap_opts,nested_opts); +nested_options:(``NULL_MAPPING)!((::);bitmap_opts,nested_opts); // Create the datatype identifiers ts_dt:.arrowkdb.dt.timestamp[`nano]; @@ -70,7 +70,7 @@ struct_fd:.arrowkdb.fd.field[`struct_field;struct_dt]; bitmap_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,i32_fd,f64_fd,str_fd,d32_fd)]; // Create the schema containing the list and struct fields -nested_schema:.arrowkdb.sc.schema[(list_fd,struct_dt)]; +nested_schema:.arrowkdb.sc.schema[(list_fd,struct_fd)]; // Print the schema -1"\nBitmap schema:"; @@ -105,7 +105,7 @@ t64_data[2]:00:00:00.123456789; bitmap_data:(ts_data;bool_data;i32_data;f64_data;str_data;d32_data); // Create the data for the list array -list_data:(enlist (9h);(8h;7h);(6h;5h;4h)); +list_data:(enlist 9h;(8h;7h);(6h;5h;4h)); // Create the data for the struct array from its child arrays 
struct_data:(f32_data;bin_data;t64_data); @@ -115,30 +115,30 @@ nested_data:(list_data;struct_data); // Pretty print the Arrow table populated from the bitmap data -1"\nBitmap table:"; -.arrowkdb.tb.prettyPrintTable[bitmap_schema;bitmap_data;options]; +.arrowkdb.tb.prettyPrintTable[bitmap_schema;bitmap_data;nested_options]; // Show the array data as an arrow table -1"\nNested table:"; -.arrowkdb.tb.prettyPrintTable[nested_schema;nested_data;options] +.arrowkdb.tb.prettyPrintTable[nested_schema;nested_data;nested_options] //-------------------------// // Example-1. Parquet file // //-------------------------// // Write the schema and array data to a parquet file -options[`PARQUET_VERSION]:`V2.0; +nested_options[`PARQUET_VERSION]:`V2.0; parquet_null_bitmap:"null_bitmap.parquet"; parquet_nested_bitmap:"nested_bitmap.parquet"; -.arrowkdb.pq.writeParquet[parquet_null_bitmap;bitmap_schema;bitmap_data;options]; -.arrowkdb.pq.writeParquet[parquet_nested_bitmap;nested_schema;nested_data;options]; +.arrowkdb.pq.writeParquet[parquet_null_bitmap;bitmap_schema;bitmap_data;nested_options]; +.arrowkdb.pq.writeParquet[parquet_nested_bitmap;nested_schema;nested_data;nested_options]; show ls parquet_null_bitmap show ls parquet_nested_bitmap // Read the array data back and compare -options[`WITH_NULL_BITMAP]:1; +nested_options[`WITH_NULL_BITMAP]:1; // Read the schema back and compare parquet_bitmap_schema:.arrowkdb.pq.readParquetSchema[parquet_null_bitmap]; @@ -150,8 +150,8 @@ show .arrowkdb.sc.equalSchemas[nested_schema;parquet_nested_schema] show bitmap_schema~parquet_bitmap_schema show nested_schema~parquet_nested_schema -parquet_bitmap_data:.arrowkdb.pq.readParquetData[parquet_null_bitmap;options]; -parquet_nested_data:.arrowkdb.pq.readParquetData[parquet_nested_bitmap;options]; +parquet_bitmap_data:.arrowkdb.pq.readParquetData[parquet_null_bitmap;nested_options]; +parquet_nested_data:.arrowkdb.pq.readParquetData[parquet_nested_bitmap;nested_options]; show bitmap_data~first 
parquet_bitmap_data show nested_data~first parquet_nested_data @@ -159,7 +159,7 @@ show nested_data~first parquet_nested_data nulls_data:1b,(N-1)?1b; bitmap_nulls:{x rotate nulls_data} each neg til {x-1} count bitmap_data; nested_list_nulls:(enlist 1b;00b;000b) -nested_struct_nulls:((::;1b;0b;0b);(::;0b;1b;0b);(::;0b;0b;1b)) +nested_struct_nulls:(100b;010b;001b) parquet_bitmap_nulls:last parquet_bitmap_data; parquet_list_nulls:first parquet_nested_data[1] @@ -167,7 +167,7 @@ parquet_struct_nulls:last parquet_nested_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count parquet_bitmap_nulls;parquet_bitmap_nulls] nested_list_nulls~parquet_list_nulls -nested_struct_nulls~{(::),x} each parquet_struct_nulls +nested_struct_nulls~parquet_struct_nulls rm parquet_null_bitmap; rm parquet_nested_bitmap; @@ -180,8 +180,8 @@ rm parquet_nested_bitmap; arrow_null_bitmap:"null_bitmap.arrow"; arrow_nested_bitmap:"nested_bitmap.arrow"; -.arrowkdb.ipc.writeArrow[arrow_null_bitmap;bitmap_schema;bitmap_data;options]; -.arrowkdb.ipc.writeArrow[arrow_nested_bitmap;nested_schema;nested_data;options]; +.arrowkdb.ipc.writeArrow[arrow_null_bitmap;bitmap_schema;bitmap_data;nested_options]; +.arrowkdb.ipc.writeArrow[arrow_nested_bitmap;nested_schema;nested_data;nested_options]; show ls arrow_null_bitmap show ls arrow_nested_bitmap @@ -197,19 +197,19 @@ show bitmap_schema~arrow_bitmap_schema show nested_schema~arrow_nested_schema // Read the array data back and compare -arrow_bitmap_data:.arrowkdb.ipc.readArrowData[arrow_null_bitmap;options]; -arrow_nested_data:.arrowkdb.ipc.readArrowData[arrow_nested_bitmap;options]; +arrow_bitmap_data:.arrowkdb.ipc.readArrowData[arrow_null_bitmap;nested_options]; +arrow_nested_data:.arrowkdb.ipc.readArrowData[arrow_nested_bitmap;nested_options]; show bitmap_data~first arrow_bitmap_data show nested_data~first arrow_nested_data arrow_bitmap_nulls:last arrow_bitmap_data; -arrow_list_nulls:first parquet_nested_data[1] -arrow_struct_nulls:last 
parquet_nested_data[1] +arrow_list_nulls:first arrow_nested_data[1] +arrow_struct_nulls:last arrow_nested_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bitmap_nulls] nested_list_nulls~arrow_list_nulls -nested_struct_nulls~{(::),x} each arrow_struct_nulls +nested_struct_nulls~arrow_struct_nulls rm arrow_null_bitmap; rm arrow_nested_bitmap; @@ -219,8 +219,8 @@ rm arrow_nested_bitmap; //-----------------------------// // Serialize the schema and array data to an arrow stream -serialized_null_bitmap:.arrowkdb.ipc.serializeArrow[bitmap_schema;bitmap_data;options]; -serialized_nested_bitmap:.arrowkdb.ipc.serializeArrow[nested_schema;nested_data;options]; +serialized_null_bitmap:.arrowkdb.ipc.serializeArrow[bitmap_schema;bitmap_data;nested_options]; +serialized_nested_bitmap:.arrowkdb.ipc.serializeArrow[nested_schema;nested_data;nested_options]; show serialized_null_bitmap show serialized_nested_bitmap @@ -236,19 +236,19 @@ show bitmap_schema~stream_bitmap_schema show nested_schema~stream_nested_schema // Parse the array data back and compare -stream_bitmap_data:.arrowkdb.ipc.parseArrowData[serialized_null_bitmap;options]; -stream_nested_data:.arrowkdb.ipc.parseArrowData[serialized_nested_bitmap;options]; +stream_bitmap_data:.arrowkdb.ipc.parseArrowData[serialized_null_bitmap;nested_options]; +stream_nested_data:.arrowkdb.ipc.parseArrowData[serialized_nested_bitmap;nested_options]; show bitmap_data~first stream_bitmap_data show nested_data~first stream_nested_data stream_bitmap_nulls:last stream_bitmap_data; -stream_list_nulls:first parquet_nested_data[1] -stream_struct_nulls:last parquet_nested_data[1] +stream_list_nulls:first stream_nested_data[1] +stream_struct_nulls:last stream_nested_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count stream_bitmap_nulls;stream_bitmap_nulls] nested_list_nulls~stream_list_nulls -nested_struct_nulls~{(::),x} each stream_struct_nulls +nested_struct_nulls~stream_struct_nulls -1 
"\n+----------------------------------------+\n"; diff --git a/tests/.gitignore b/tests/.gitignore index d96539e..01c7b10 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -1,5 +1,5 @@ basic.q -null_bitmap.q +crucial_null_bitmap.q nested_null_bitmap.q null_mapping_short.q null_mapping_long.q diff --git a/tests/crucial_null_bitmap.t b/tests/crucial_null_bitmap.t new file mode 100644 index 0000000..a56c576 --- /dev/null +++ b/tests/crucial_null_bitmap.t @@ -0,0 +1,114 @@ +// crucial_null_bitmap.t + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l q/arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping ||----------+\n"; +crucial_opts:(`bool`int32`float64`string`date32)!(0b;1i;2.34;"start";2006.07.21); + +crucial_options:(``NULL_MAPPING)!((::);crucial_opts); + +N:5 + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +bool_dt:.arrowkdb.dt.boolean[]; +i32_dt:.arrowkdb.dt.int32[]; +f64_dt:.arrowkdb.dt.float64[]; +str_dt:.arrowkdb.dt.utf8[]; +d32_dt:.arrowkdb.dt.date32[]; + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +bool_fd:.arrowkdb.fd.field[`bool;bool_dt]; +i32_fd:.arrowkdb.fd.field[`int32;i32_dt]; +f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; +str_fd:.arrowkdb.fd.field[`string;str_dt]; +d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; + +-1"\n+----------|| Create the schemas for the list of fields ||----------+\n"; +crucial_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,i32_fd,f64_fd,str_fd,d32_fd)]; + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +bool_data:N?(0b;1b); +bool_data[0]:0b; +i32_data:N?100i; +i32_data[1]:1i; +f64_data:N?100f; +f64_data[2]:2.34f; +str_data:N?("start";"stop";"alert";"acknowledge";""); 
+str_data[3]:"start" +d32_data:N?(2006.07.21;2005.07.18;2004.07.16;2003.07.15;2002.07.11); +d32_data[4]:2006.07.21; + +-1"\n+----------|| Combine the data for all columns ||----------+\n"; +crucial_data:(ts_data;bool_data;i32_data;f64_data;str_data;d32_data); + +-1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n"; +crucial_options[`PARQUET_VERSION]:`V2.0; + +parquet_crucial_bitmap:"null_bitmap.parquet"; +.arrowkdb.pq.writeParquet[parquet_crucial_bitmap;crucial_schema;crucial_data;crucial_options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +parquet_crucial_schema:.arrowkdb.pq.readParquetSchema[parquet_crucial_bitmap]; +.arrowkdb.sc.equalSchemas[crucial_schema;parquet_crucial_schema] +crucial_schema~parquet_crucial_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +crucial_options[`WITH_NULL_BITMAP]:1; +parquet_crucial_data:.arrowkdb.pq.readParquetData[parquet_crucial_bitmap;crucial_options]; +crucial_data~first parquet_crucial_data + +nulls_data:1b,(N-1)?1b; +crucial_nulls:{x rotate nulls_data} each neg til {x-1} count crucial_data; +parquet_crucial_nulls:last parquet_crucial_data; +crucial_nulls~crucial_nulls & sublist[{1-x} count parquet_crucial_nulls;parquet_crucial_nulls] +rm parquet_crucial_bitmap; + +-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; +arrow_crucial_bitmap:"null_bitmap.arrow"; +.arrowkdb.ipc.writeArrow[arrow_crucial_bitmap;crucial_schema;crucial_data;crucial_options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +arrow_crucial_schema:.arrowkdb.ipc.readArrowSchema[arrow_crucial_bitmap]; +.arrowkdb.sc.equalSchemas[crucial_schema;arrow_crucial_schema] +crucial_schema~arrow_crucial_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +arrow_crucial_data:.arrowkdb.ipc.readArrowData[arrow_crucial_bitmap;crucial_options]; +crucial_data~first arrow_crucial_data 
+arrow_crucial_nulls:last arrow_crucial_data; +crucial_nulls~crucial_nulls & sublist[{1-x} count arrow_crucial_nulls;arrow_crucial_nulls] +rm arrow_crucial_bitmap; + +-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; +serialized_bitmap:.arrowkdb.ipc.serializeArrow[crucial_schema;crucial_data;crucial_options]; + +-1"\n+----------|| Parse the schema back abd compare ||----------+\n"; +stream_crucial_schema:.arrowkdb.ipc.parseArrowSchema[serialized_bitmap]; +.arrowkdb.sc.equalSchemas[crucial_schema;stream_crucial_schema] +crucial_schema~stream_crucial_schema + +-1"\n+----------|| Parse the array data back and compare ||----------+\n"; +stream_crucial_data:.arrowkdb.ipc.parseArrowData[serialized_bitmap;crucial_options]; +crucial_data~first stream_crucial_data + +stream_crucial_nulls:last stream_crucial_data; +crucial_nulls~crucial_nulls & sublist[{1-x} count stream_crucial_nulls;stream_crucial_nulls] + + +-1 "\n+----------|| Test utils ||----------+\n"; + +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; diff --git a/tests/nested_null_bitmap.t b/tests/nested_null_bitmap.t index 4d9fde1..a12bdf6 100644 --- a/tests/nested_null_bitmap.t +++ b/tests/nested_null_bitmap.t @@ -9,7 +9,9 @@ rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; -1"\n+----------|| Support null mapping ||----------+\n"; nested_opts:(`uint16`float32`binary`time64)!(9h;8.76e;"x"$"acknowledge";00:00:00.123456789); -options:(``NULL_MAPPING)!((::);nested_opts); +nested_options:(``NULL_MAPPING)!((::);nested_opts); + +N:5 -1"\n+----------|| Create the datatype identifiers ||----------+\n"; ts_dt:.arrowkdb.dt.timestamp[`nano]; @@ -42,7 +44,7 @@ struct_dt:.arrowkdb.dt.struct[(f32_fd,bin_dt,t64_dt)]; struct_fd:.arrowkdb.fd.field[`struct_field;struct_dt]; -1"\n+----------|| Create the schema containing the list and struct fields ||----------+\n"; 
-nested_schema:.arrowkdb.sc.schema[(list_fd,struct_dt)]; +nested_schema:.arrowkdb.sc.schema[(list_fd,struct_fd)]; -1"\n+----------|| Create data for each column in the table ||----------+\n"; ts_data:asc N?0p; @@ -56,7 +58,7 @@ t64_data:3?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.00 t64_data[2]:00:00:00.123456789; -1"\n+----------|| Create the data for the list array ||----------+\n"; -list_data:(enlist (9h);(8h;7h);(6h;5h;4h)); +list_data:(enlist 9h;(8h;7h);(6h;5h;4h)); -1"\n+----------|| Create the data for the struct array from its child arrays ||----------+\n"; struct_data:(f32_data;bin_data;t64_data); @@ -65,13 +67,13 @@ struct_data:(f32_data;bin_data;t64_data); nested_data:(list_data;struct_data); -1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n"; -options[`PARQUET_VERSION]:`V2.0; +nested_options[`PARQUET_VERSION]:`V2.0; parquet_nested_bitmap:"nested_bitmap.parquet"; -.arrowkdb.pq.writeParquet[parquet_nested_bitmap;nested_schema;nested_data;options]; +.arrowkdb.pq.writeParquet[parquet_nested_bitmap;nested_schema;nested_data;nested_options]; --1"\n+----------|| Read the array data back and compare ||----------+\n"; -options[`WITH_NULL_BITMAP]:1; +-1"\n+----------|| Read the array back and compare ||----------+\n"; +nested_options[`WITH_NULL_BITMAP]:1; -1"\n+----------|| Read the schema back and compare ||----------+\n"; parquet_nested_schema:.arrowkdb.pq.readParquetSchema[parquet_nested_bitmap]; @@ -79,23 +81,23 @@ parquet_nested_schema:.arrowkdb.pq.readParquetSchema[parquet_nested_bitmap]; nested_schema~parquet_nested_schema -1"\n+----------|| Read the array data back and compare ||----------+\n"; -parquet_nested_data:.arrowkdb.pq.readParquetData[parquet_nested_bitmap;options]; +parquet_nested_data:.arrowkdb.pq.readParquetData[parquet_nested_bitmap;nested_options]; nested_data~first parquet_nested_data -1"\n+----------|| Compare nested null bitmaps ||----------+\n"; nested_list_nulls:(enlist 
1b;00b;000b) -nested_struct_nulls:((::;1b;0b;0b);(::;0b;1b;0b);(::;0b;0b;1b)) +nested_struct_nulls:(100b;010b;001b) parquet_list_nulls:first parquet_nested_data[1] parquet_struct_nulls:last parquet_nested_data[1] nested_list_nulls~parquet_list_nulls -nested_struct_nulls~{(::),x} each parquet_struct_nulls +nested_struct_nulls~parquet_struct_nulls rm parquet_nested_bitmap; -1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; arrow_nested_bitmap:"nested_bitmap.arrow"; -.arrowkdb.ipc.writeArrow[arrow_nested_bitmap;nested_schema;nested_data;options]; +.arrowkdb.ipc.writeArrow[arrow_nested_bitmap;nested_schema;nested_data;nested_options]; -1"\n+----------|| Read the schema back and compare ||----------+\n"; arrow_nested_schema:.arrowkdb.ipc.readArrowSchema[arrow_nested_bitmap]; @@ -103,19 +105,19 @@ arrow_nested_schema:.arrowkdb.ipc.readArrowSchema[arrow_nested_bitmap]; nested_schema~arrow_nested_schema -1"\n+----------|| Read the array data back and compare ||----------+\n"; -arrow_nested_data:.arrowkdb.ipc.readArrowData[arrow_nested_bitmap;options]; +arrow_nested_data:.arrowkdb.ipc.readArrowData[arrow_nested_bitmap;nested_options]; nested_data~first arrow_nested_data -1"\n+----------|| Compare nested null bitmaps ||----------+\n"; -arrow_list_nulls:first parquet_nested_data[1] -arrow_struct_nulls:last parquet_nested_data[1] +arrow_list_nulls:first arrow_nested_data[1] +arrow_struct_nulls:last arrow_nested_data[1] nested_list_nulls~arrow_list_nulls -nested_struct_nulls~{(::),x} each arrow_struct_nulls +nested_struct_nulls~arrow_struct_nulls rm arrow_nested_bitmap; -1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; -serialized_nested_bitmap:.arrowkdb.ipc.serializeArrow[nested_schema;nested_data;options]; +serialized_nested_bitmap:.arrowkdb.ipc.serializeArrow[nested_schema;nested_data;nested_options]; -1"\n+----------|| Parse the schema back abd compare ||----------+\n"; 
stream_nested_schema:.arrowkdb.ipc.parseArrowSchema[serialized_nested_bitmap]; @@ -123,14 +125,14 @@ stream_nested_schema:.arrowkdb.ipc.parseArrowSchema[serialized_nested_bitmap]; nested_schema~stream_nested_schema -1"\n+----------|| Parse the array data back and compare ||----------+\n"; -stream_nested_data:.arrowkdb.ipc.parseArrowData[serialized_nested_bitmap;options]; +stream_nested_data:.arrowkdb.ipc.parseArrowData[serialized_nested_bitmap;nested_options]; nested_data~first stream_nested_data -1"\n+----------|| Compare nested null bitmaps ||----------+\n"; -stream_list_nulls:first parquet_nested_data[1] -stream_struct_nulls:last parquet_nested_data[1] +stream_list_nulls:first stream_nested_data[1] +stream_struct_nulls:last stream_nested_data[1] nested_list_nulls~stream_list_nulls -nested_struct_nulls~{(::),x} each stream_struct_nulls +nested_struct_nulls~stream_struct_nulls -1 "\n+----------|| Test utils ||----------+\n"; diff --git a/tests/null_bitmap.t b/tests/null_bitmap.t deleted file mode 100644 index 3a15b1d..0000000 --- a/tests/null_bitmap.t +++ /dev/null @@ -1,109 +0,0 @@ -// null_bitmap.t - --1"\n+----------|| Import the arrowkdb library ||----------+\n"; -\l q/arrowkdb.q - --1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; -rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; - --1"\n+----------|| Support null mapping ||----------+\n"; -bitmap_opts:(`bool`int32`float64`string`date32)!(0b;1i;2.34;"start";2006.07.21); - -options:(``NULL_MAPPING)!((::);bitmap_opts); - -N:5 - --1"\n+----------|| Create the datatype identifiers ||----------+\n"; -ts_dt:.arrowkdb.dt.timestamp[`nano]; - -bool_dt:.arrowkdb.dt.boolean[]; -i32_dt:.arrowkdb.dt.int32[]; -f64_dt:.arrowkdb.dt.float64[]; -str_dt:.arrowkdb.dt.utf8[]; -d32_dt:.arrowkdb.dt.date32[]; - --1"\n+----------|| Create the field identifiers ||----------+\n"; -ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; - -bool_fd:.arrowkdb.fd.field[`bool;bool_dt]; 
-i32_fd:.arrowkdb.fd.field[`int32;i32_dt]; -f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; -str_fd:.arrowkdb.fd.field[`string;str_dt]; -d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; - --1"\n+----------|| Create the schemas for the list of fields ||----------+\n"; -bitmap_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,i32_fd,f64_fd,str_fd,d32_fd)]; - --1"\n+----------|| Create data for each column in the table ||----------+\n"; -ts_data:asc N?0p; - -bool_data:N?(0b;1b); -bool_data[0]:0b; -i32_data:N?100i; -i32_data[1]:1i; -f64_data:N?100f; -f64_data[2]:2.34f; -str_data:N?("start";"stop";"alert";"acknowledge";""); -str_data[3]:"start" -d32_data:N?(2006.07.21;2005.07.18;2004.07.16;2003.07.15;2002.07.11); -d32_data[4]:2006.07.21; - --1"\n+----------|| Combine the data for all columns ||----------+\n"; -bitmap_data:(ts_data;bool_data;i32_data;f64_data;str_data;d32_data); - --1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n"; -options[`PARQUET_VERSION]:`V2.0; - -parquet_bitmap:"null_bitmap.parquet"; -.arrowkdb.pq.writeParquet[parquet_bitmap;bitmap_schema;bitmap_data;options]; - --1"\n+----------|| Read the array data back and compare ||----------+\n"; -options[`WITH_NULL_BITMAP]:1; -parquet_bitmap_data:.arrowkdb.pq.readParquetData[parquet_bitmap;options]; -bitmap_data~first parquet_bitmap_data - -nulls_data:1b,(N-1)?1b; -bitmap_nulls:{x rotate nulls_data} each neg til {x-1} count bitmap_data; -parquet_bitmap_nulls:last parquet_bitmap_data; -bitmap_nulls~bitmap_nulls & sublist[{1-x} count parquet_bitmap_nulls;parquet_bitmap_nulls] -rm parquet_bitmap; - --1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; -arrow_bitmap:"null_bitmap.arrow"; -.arrowkdb.ipc.writeArrow[arrow_bitmap;bitmap_schema;bitmap_data;options]; - --1"\n+----------|| Read the schema back and compare ||----------+\n"; -arrow_bitmap_schema:.arrowkdb.ipc.readArrowSchema[arrow_bitmap]; -.arrowkdb.sc.equalSchemas[bitmap_schema;arrow_bitmap_schema] 
-bitmap_schema~arrow_bitmap_schema - --1"\n+----------|| Read the array data back and compare ||----------+\n"; -arrow_bitmap_data:.arrowkdb.ipc.readArrowData[arrow_bitmap;options]; -bitmap_data~first arrow_bitmap_data -arrow_bitmap_nulls:last arrow_bitmap_data; -bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bitmap_nulls] -rm arrow_bitmap; - --1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; -serialized_bitmap:.arrowkdb.ipc.serializeArrow[bitmap_schema;bitmap_data;options]; - --1"\n+----------|| Parse the schema back abd compare ||----------+\n"; -stream_bitmap_schema:.arrowkdb.ipc.parseArrowSchema[serialized_bitmap]; -.arrowkdb.sc.equalSchemas[bitmap_schema;stream_bitmap_schema] -bitmap_schema~stream_bitmap_schema - --1"\n+----------|| Parse the array data back and compare ||----------+\n"; -stream_bitmap_data:.arrowkdb.ipc.parseArrowData[serialized_bitmap;options]; -bitmap_data~first stream_bitmap_data - -stream_bitmap_nulls:last stream_bitmap_data; -bitmap_nulls~bitmap_nulls & sublist[{1-x} count stream_bitmap_nulls;stream_bitmap_nulls] - - --1 "\n+----------|| Test utils ||----------+\n"; - -show .arrowkdb.util.buildInfo[] -(type .arrowkdb.util.buildInfo[])~99h - - --1 "\n+----------|| Finished testing ||----------+\n"; From e683e9f0fdb12ec990f9ebf38ca58fcc725b04a2 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 28 Feb 2023 22:31:26 +0300 Subject: [PATCH 227/276] Rearrange tests by features --- .gitignore | 1 + .travis.yml | 2 +- tests/null_bitmap/.gitignore | 2 ++ tests/{ => null_bitmap}/crucial_null_bitmap.t | 0 tests/{ => null_bitmap}/nested_null_bitmap.t | 2 +- tests/null_mapping/.gitignore | 7 +++++++ tests/{ => null_mapping}/null_mapping_extra.t | 0 tests/{ => null_mapping}/null_mapping_float.t | 0 tests/{ => null_mapping}/null_mapping_long.t | 0 tests/{ => null_mapping}/null_mapping_other.t | 0 tests/{ => null_mapping}/null_mapping_short.t | 0 tests/{ => 
null_mapping}/null_mapping_str.t | 0 tests/{ => null_mapping}/null_mapping_time.t | 0 13 files changed, 12 insertions(+), 2 deletions(-) create mode 100644 tests/null_bitmap/.gitignore rename tests/{ => null_bitmap}/crucial_null_bitmap.t (100%) rename tests/{ => null_bitmap}/nested_null_bitmap.t (99%) create mode 100644 tests/null_mapping/.gitignore rename tests/{ => null_mapping}/null_mapping_extra.t (100%) rename tests/{ => null_mapping}/null_mapping_float.t (100%) rename tests/{ => null_mapping}/null_mapping_long.t (100%) rename tests/{ => null_mapping}/null_mapping_other.t (100%) rename tests/{ => null_mapping}/null_mapping_short.t (100%) rename tests/{ => null_mapping}/null_mapping_str.t (100%) rename tests/{ => null_mapping}/null_mapping_time.t (100%) diff --git a/.gitignore b/.gitignore index 3d6594b..688988a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ arrowkdb.code-workspace .vscode/ build/ +test.q diff --git a/.travis.yml b/.travis.yml index 56238b5..69269a2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -82,7 +82,7 @@ before_install: script: - if [[ $TESTS == "True" && "x$OD" != "x" && "x$QLIC_KC" != "x" ]]; then curl -o test.q -L https://github.com/KxSystems/hdf5/raw/master/test.q; - q test.q tests/ -q; + q test.q tests -q && q test.q tests/null_mapping -q && q test.q tests/null_bitmap -q; fi - if [[ $TRAVIS_OS_NAME == "windows" && $BUILD == "True" ]]; then 7z a -tzip -r $FILE_NAME ./cmake/$FILE_ROOT/*; diff --git a/tests/null_bitmap/.gitignore b/tests/null_bitmap/.gitignore new file mode 100644 index 0000000..b002a2d --- /dev/null +++ b/tests/null_bitmap/.gitignore @@ -0,0 +1,2 @@ +crucial_null_bitmap.q +nested_null_bitmap.q diff --git a/tests/crucial_null_bitmap.t b/tests/null_bitmap/crucial_null_bitmap.t similarity index 100% rename from tests/crucial_null_bitmap.t rename to tests/null_bitmap/crucial_null_bitmap.t diff --git a/tests/nested_null_bitmap.t b/tests/null_bitmap/nested_null_bitmap.t similarity index 99% rename from 
tests/nested_null_bitmap.t rename to tests/null_bitmap/nested_null_bitmap.t index a12bdf6..6184c1e 100644 --- a/tests/nested_null_bitmap.t +++ b/tests/null_bitmap/nested_null_bitmap.t @@ -137,7 +137,7 @@ nested_struct_nulls~stream_struct_nulls -1 "\n+----------|| Test utils ||----------+\n"; -.arrowkdb.util.buildInfo[] +show .arrowkdb.util.buildInfo[] (type .arrowkdb.util.buildInfo[])~99h diff --git a/tests/null_mapping/.gitignore b/tests/null_mapping/.gitignore new file mode 100644 index 0000000..ff2bc45 --- /dev/null +++ b/tests/null_mapping/.gitignore @@ -0,0 +1,7 @@ +null_mapping_short.q +null_mapping_long.q +null_mapping_float.q +null_mapping_str.q +null_mapping_time.q +null_mapping_extra.q +null_mapping_other.q diff --git a/tests/null_mapping_extra.t b/tests/null_mapping/null_mapping_extra.t similarity index 100% rename from tests/null_mapping_extra.t rename to tests/null_mapping/null_mapping_extra.t diff --git a/tests/null_mapping_float.t b/tests/null_mapping/null_mapping_float.t similarity index 100% rename from tests/null_mapping_float.t rename to tests/null_mapping/null_mapping_float.t diff --git a/tests/null_mapping_long.t b/tests/null_mapping/null_mapping_long.t similarity index 100% rename from tests/null_mapping_long.t rename to tests/null_mapping/null_mapping_long.t diff --git a/tests/null_mapping_other.t b/tests/null_mapping/null_mapping_other.t similarity index 100% rename from tests/null_mapping_other.t rename to tests/null_mapping/null_mapping_other.t diff --git a/tests/null_mapping_short.t b/tests/null_mapping/null_mapping_short.t similarity index 100% rename from tests/null_mapping_short.t rename to tests/null_mapping/null_mapping_short.t diff --git a/tests/null_mapping_str.t b/tests/null_mapping/null_mapping_str.t similarity index 100% rename from tests/null_mapping_str.t rename to tests/null_mapping/null_mapping_str.t diff --git a/tests/null_mapping_time.t b/tests/null_mapping/null_mapping_time.t similarity index 100% rename from 
tests/null_mapping_time.t rename to tests/null_mapping/null_mapping_time.t From 2341d48d9a815b2b88e3bd14975bdca35d49b307 Mon Sep 17 00:00:00 2001 From: nmcdonnell-kx Date: Wed, 1 Mar 2023 16:14:19 +0000 Subject: [PATCH 228/276] KXI-0 Windows build fixes --- src/KdbOptions.cpp | 2 ++ src/TableData.cpp | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/KdbOptions.cpp b/src/KdbOptions.cpp index 75c903c..ce6f325 100644 --- a/src/KdbOptions.cpp +++ b/src/KdbOptions.cpp @@ -1,3 +1,5 @@ +#include + #include "KdbOptions.h" namespace{ diff --git a/src/TableData.cpp b/src/TableData.cpp index cab590f..f076f6f 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -481,7 +481,7 @@ K writeArrow(K arrow_file, K schema_id, K array_data, K options) // Chunk size read_options.GetIntOption( kx::arrowkdb::Options::ARROW_CHUNK_ROWS, type_overrides.chunk_length ); - auto check_length = []( const auto& arrays ){ + auto check_length = []( const auto& arrays ) -> int64_t { // Check all arrays are same length int64_t len = -1; for (auto i : arrays) { @@ -660,7 +660,7 @@ K serializeArrow(K schema_id, K array_data, K options) // Chunk size read_options.GetIntOption( kx::arrowkdb::Options::ARROW_CHUNK_ROWS, type_overrides.chunk_length ); - auto check_length = []( const auto& arrays ){ + auto check_length = []( const auto& arrays ) -> int64_t { // Check all arrays are same length int64_t len = -1; for (auto i : arrays) { From 28277461edc77d4b3d80af6ce5f72454a385d23e Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 2 Mar 2023 14:19:56 +0300 Subject: [PATCH 229/276] Hot fix for struct dimensions --- src/ArrayReader.cpp | 40 +++++++++++++++++++++++++--------------- src/ArrayReader.h | 2 +- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 67d0326..7eb79aa 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -633,9 +633,10 @@ K AppendNullBitmap( shared_ptr array_data, size size_t counter = 
0; K k_bitmap = ( null_bitmap_handlers.find( type_id ) == null_bitmap_handlers.end() ) ? ktn( KB, length ) - : knk( length ); + : knk( 0 ); auto slice = slice_array->Slice( 0, length ); - InitKdbNullBitmap( slice, k_bitmap, counter ); + InitKdbNullBitmap( slice, &k_bitmap, counter ); + ++index; return k_bitmap; } @@ -661,16 +662,24 @@ K AppendNullBitmap( shared_ptr array_data, size_ template<> K AppendNullBitmap( shared_ptr array_data, size_t& index ) { + size_t length = 0; auto struct_array = static_pointer_cast( array_data ); auto num_fields = struct_array->type()->num_fields(); - auto field = struct_array->field( index ); - auto type_id = field->type_id(); - size_t counter = 0; - K k_bitmap = ( null_bitmap_handlers.find( type_id ) == null_bitmap_handlers.end() ) - ? ktn( KB, num_fields ) - : knk( num_fields ); - InitKdbNullBitmap( field, k_bitmap, counter ); + K k_bitmap = knk( num_fields ); + for( int i = 0; i < num_fields; ++i ){ + auto field = struct_array->field( i ); + auto type_id = field->type_id(); + length = field->length(); + + size_t counter = 0; + kK( k_bitmap )[i] = ( null_bitmap_handlers.find( type_id ) == null_bitmap_handlers.end() ) + ? 
ktn( KB, length ) + : knk( 0 ); + InitKdbNullBitmap( field, &kK( k_bitmap )[i], counter ); + } + + index += length; return k_bitmap; } @@ -728,19 +737,20 @@ void AppendArray(shared_ptr array_data, K k_array, size_t& index, } } -void InitKdbNullBitmap( shared_ptr array_data, K k_bitmap, size_t& index ) +void InitKdbNullBitmap( shared_ptr array_data, K* k_bitmap, size_t& index ) { auto type_id = array_data->type_id(); auto length = array_data->length(); for( int i = 0ll; i < length; ++i ){ if( null_bitmap_handlers.find( type_id ) == null_bitmap_handlers.end() ){ - kG( k_bitmap )[index] = array_data->IsNull( i ); + kG( *k_bitmap )[index++] = array_data->IsNull( i ); } else{ - kK( k_bitmap )[index] = null_bitmap_handlers[type_id]( array_data, index ); + auto pos = index; + *k_bitmap = jk( k_bitmap, null_bitmap_handlers[type_id]( array_data, index ) ); + i += index - pos - 1; } - ++index; } } @@ -812,11 +822,11 @@ K ReadChunkedNullBitmap( shared_ptr chunked_array, TypeMapp auto type_id = chunked_array->type()->id(); K k_bitmap = ( null_bitmap_handlers.find( type_id ) == null_bitmap_handlers.end() ) ? ktn( KB, length ) - : knk( length ); + : knk( 0 ); size_t index = 0; for( auto i = 0; i < chunked_array->num_chunks(); ++i ){ - InitKdbNullBitmap( chunked_array->chunk( i ), k_bitmap, index ); + InitKdbNullBitmap( chunked_array->chunk( i ), &k_bitmap, index ); } return k_bitmap; diff --git a/src/ArrayReader.h b/src/ArrayReader.h index 9ed0386..2fdd5b8 100644 --- a/src/ArrayReader.h +++ b/src/ArrayReader.h @@ -77,7 +77,7 @@ K InitKdbForArray(std::shared_ptr datatype, size_t length, Type * begin. Index will be updated to account for the new offset by adding the * length of the array array. 
*/ -void InitKdbNullBitmap( std::shared_ptr array_data, K k_bitmap, size_t& index ); +void InitKdbNullBitmap( std::shared_ptr array_data, K* k_bitmap, size_t& index ); } // namespace arrowkdb } // namespace kx From 3db62f6a49c9769c553d404ddd72f2e310c90963 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 2 Mar 2023 14:31:10 +0300 Subject: [PATCH 230/276] Struct dimension reshaping --- examples/null_bitmap.q | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index c10a5b0..0db6d7e 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -94,18 +94,18 @@ d32_data:N?(2006.07.21;2005.07.18;2004.07.16;2003.07.15;2002.07.11); d32_data[4]:2006.07.21; // Create the data for each of the struct child fields -f32_data:3?100e; +f32_data:5?100e; f32_data[0]:8.76e; -bin_data:3?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); +bin_data:5?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); bin_data[1]:"x"$"acknowledge" -t64_data:3?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); +t64_data:5?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); t64_data[2]:00:00:00.123456789; // Combine the data for all columns bitmap_data:(ts_data;bool_data;i32_data;f64_data;str_data;d32_data); // Create the data for the list array -list_data:(enlist 9h;(8h;7h);(6h;5h;4h)); +list_data:(enlist 9h;(8h;7h);(6h;5h;4h);(1h;2h;3h;4h);(5h;6h;7h;8h;9h)); // Create the data for the struct array from its child arrays struct_data:(f32_data;bin_data;t64_data); @@ -158,8 +158,8 @@ show nested_data~first parquet_nested_data nulls_data:1b,(N-1)?1b; bitmap_nulls:{x rotate nulls_data} each neg til {x-1} count bitmap_data; -nested_list_nulls:(enlist 1b;00b;000b) -nested_struct_nulls:(100b;010b;001b) +nested_list_nulls:(enlist 1b;00b;000b;0000b;00001b) +nested_struct_nulls:(10000b;01000b;00100b) 
parquet_bitmap_nulls:last parquet_bitmap_data; parquet_list_nulls:first parquet_nested_data[1] @@ -167,7 +167,7 @@ parquet_struct_nulls:last parquet_nested_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count parquet_bitmap_nulls;parquet_bitmap_nulls] nested_list_nulls~parquet_list_nulls -nested_struct_nulls~parquet_struct_nulls +nested_struct_nulls~parquet_struct_nulls[0] rm parquet_null_bitmap; rm parquet_nested_bitmap; @@ -209,7 +209,7 @@ arrow_struct_nulls:last arrow_nested_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bitmap_nulls] nested_list_nulls~arrow_list_nulls -nested_struct_nulls~arrow_struct_nulls +nested_struct_nulls~arrow_struct_nulls[0] rm arrow_null_bitmap; rm arrow_nested_bitmap; @@ -248,7 +248,7 @@ stream_struct_nulls:last stream_nested_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count stream_bitmap_nulls;stream_bitmap_nulls] nested_list_nulls~stream_list_nulls -nested_struct_nulls~stream_struct_nulls +nested_struct_nulls~stream_struct_nulls[0] -1 "\n+----------------------------------------+\n"; From 03888bbe931d34ef8882f49a8d6e71d0eeafff14 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 2 Mar 2023 15:44:31 +0300 Subject: [PATCH 231/276] Updating unit-test for optimal struct shape --- tests/null_bitmap/nested_null_bitmap.t | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/null_bitmap/nested_null_bitmap.t b/tests/null_bitmap/nested_null_bitmap.t index 6184c1e..57f4bae 100644 --- a/tests/null_bitmap/nested_null_bitmap.t +++ b/tests/null_bitmap/nested_null_bitmap.t @@ -50,15 +50,15 @@ nested_schema:.arrowkdb.sc.schema[(list_fd,struct_fd)]; ts_data:asc N?0p; -1"\n+----------|| Create the data for each of the struct child fields ||----------+\n"; -f32_data:3?100e; +f32_data:5?100e; f32_data[0]:8.76e; -bin_data:3?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); 
+bin_data:5?("x"$"start";"x"$"stop";"x"$"alert";"x"$"acknowledge";"x"$""); bin_data[1]:"x"$"acknowledge" -t64_data:3?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); +t64_data:5?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); t64_data[2]:00:00:00.123456789; -1"\n+----------|| Create the data for the list array ||----------+\n"; -list_data:(enlist 9h;(8h;7h);(6h;5h;4h)); +list_data:(enlist 9h;(8h;7h);(6h;5h;4h);(1h;2h;3h;4h);(5h;6h;7h;8h;9h)); -1"\n+----------|| Create the data for the struct array from its child arrays ||----------+\n"; struct_data:(f32_data;bin_data;t64_data); @@ -85,13 +85,13 @@ parquet_nested_data:.arrowkdb.pq.readParquetData[parquet_nested_bitmap;nested_op nested_data~first parquet_nested_data -1"\n+----------|| Compare nested null bitmaps ||----------+\n"; -nested_list_nulls:(enlist 1b;00b;000b) -nested_struct_nulls:(100b;010b;001b) +nested_list_nulls:(enlist 1b;00b;000b;0000b;00001b) +nested_struct_nulls:(10000b;01000b;00100b) parquet_list_nulls:first parquet_nested_data[1] parquet_struct_nulls:last parquet_nested_data[1] nested_list_nulls~parquet_list_nulls -nested_struct_nulls~parquet_struct_nulls +nested_struct_nulls~parquet_struct_nulls[0] rm parquet_nested_bitmap; @@ -112,7 +112,7 @@ nested_data~first arrow_nested_data arrow_list_nulls:first arrow_nested_data[1] arrow_struct_nulls:last arrow_nested_data[1] nested_list_nulls~arrow_list_nulls -nested_struct_nulls~arrow_struct_nulls +nested_struct_nulls~arrow_struct_nulls[0] rm arrow_nested_bitmap; @@ -132,7 +132,7 @@ nested_data~first stream_nested_data stream_list_nulls:first stream_nested_data[1] stream_struct_nulls:last stream_nested_data[1] nested_list_nulls~stream_list_nulls -nested_struct_nulls~stream_struct_nulls +nested_struct_nulls~stream_struct_nulls[0] -1 "\n+----------|| Test utils ||----------+\n"; From 44453f2ad9bfd8fe02b968ea78b5355050e12c8d Mon Sep 17 00:00:00 2001 From: 
Vyacheslav Grechin Date: Thu, 2 Mar 2023 18:56:20 +0300 Subject: [PATCH 232/276] Map and dictionary examples --- examples/null_bitmap.q | 161 +++++++++++++++++++++++++++-------------- 1 file changed, 106 insertions(+), 55 deletions(-) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index 0db6d7e..eedb6ca 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -20,9 +20,10 @@ rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; // Support null mapping bitmap_opts:(`bool`int32`float64`string`date32)!(0b;1i;2.34;"start";2006.07.21); -nested_opts:(`uint16`float32`binary`time64)!(9h;8.76e;"x"$"acknowledge";00:00:00.123456789); +nested_struct_opts:(`uint16`float32`binary`time64)!(9h;8.76e;"x"$"acknowledge";00:00:00.123456789); +nested_dict_opts:(enlist `int64)!(enlist 5); -nested_options:(``NULL_MAPPING)!((::);bitmap_opts,nested_opts); +nested_options:(``NULL_MAPPING)!((::);bitmap_opts,nested_struct_opts,nested_dict_opts); // Create the datatype identifiers ts_dt:.arrowkdb.dt.timestamp[`nano]; @@ -34,11 +35,18 @@ str_dt:.arrowkdb.dt.utf8[]; d32_dt:.arrowkdb.dt.date32[]; ui16_dt:.arrowkdb.dt.uint16[]; +// Create a list datatype, using the uint16 datatype as its child +list_dt:.arrowkdb.dt.list[ui16_dt]; f32_dt:.arrowkdb.dt.float32[]; bin_dt:.arrowkdb.dt.binary[]; t64_dt:.arrowkdb.dt.time64[`nano]; +i64_dt:.arrowkdb.dt.int64[]; + +// Create a map datatype using the i16_dt as the key and dec_dt as its values +map_dt:.arrowkdb.dt.map[i64_dt;f64_dt] + // Create the field identifiers ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; @@ -49,35 +57,38 @@ str_fd:.arrowkdb.fd.field[`string;str_dt]; d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; ui16_fd:.arrowkdb.fd.field[`uint16;ui16_dt]; +// Create a field containing the list datatype +list_fd:.arrowkdb.fd.field[`list_field;list_dt]; f32_fd:.arrowkdb.fd.field[`float32;f32_dt]; bin_fd:.arrowkdb.fd.field[`binary;bin_dt]; t64_fd:.arrowkdb.fd.field[`time64;t64_dt]; - -// Create a list datatype, 
using the uint16 datatype as its child -list_dt:.arrowkdb.dt.list[ui16_dt]; - -// Create a field containing the list datatype -list_fd:.arrowkdb.fd.field[`list_field;list_dt]; - // Create a struct datatype using the float32, binary and time64 fields as its children -struct_dt:.arrowkdb.dt.struct[(f32_fd,bin_dt,t64_dt)]; - +struct_dt:.arrowkdb.dt.struct[(f32_fd,bin_fd,t64_fd)]; // Create a field containing the struct datatype struct_fd:.arrowkdb.fd.field[`struct_field;struct_dt]; +// Create a field containing the map datatype +map_fd:.arrowkdb.fd.field[`map;map_dt]; + // Create the schemas for the list of fields bitmap_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,i32_fd,f64_fd,str_fd,d32_fd)]; // Create the schema containing the list and struct fields -nested_schema:.arrowkdb.sc.schema[(list_fd,struct_fd)]; +struct_schema:.arrowkdb.sc.schema[(list_fd,struct_fd)]; + +// Create the schema containing the large list, dictionary and sparce union fields +dict_schema:.arrowkdb.sc.schema[(enlist map_fd)]; // Print the schema -1"\nBitmap schema:"; .arrowkdb.sc.printSchema[bitmap_schema]; --1"\nNested schema:"; -.arrowkdb.sc.printSchema[nested_schema]; +-1"\nStruct schema:"; +.arrowkdb.sc.printSchema[struct_schema]; + +-1"\nDict schema:"; +.arrowkdb.sc.printSchema[dict_schema]; // Create data for each column in the table ts_data:asc N?0p; @@ -104,22 +115,27 @@ t64_data[2]:00:00:00.123456789; // Combine the data for all columns bitmap_data:(ts_data;bool_data;i32_data;f64_data;str_data;d32_data); -// Create the data for the list array -list_data:(enlist 9h;(8h;7h);(6h;5h;4h);(1h;2h;3h;4h);(5h;6h;7h;8h;9h)); +list_array:(enlist 9h;(8h;7h);(6h;5h;4h);(1h;2h;3h;4h);(5h;6h;7h;8h;9h)); +struct_array:(f32_data;bin_data;t64_data); +// Combine the array data for the list and struct columns +struct_data:(list_array;struct_array); -// Create the data for the struct array from its child arrays -struct_data:(f32_data;bin_data;t64_data); +map_data:((enlist 1)!(enlist 1f);(2 2)!(2 2.34f);(3 3 
3)!(3 3 3f)) // Combine the array data for the list and struct columns -nested_data:(list_data;struct_data); +dict_data:(enlist map_data); // Pretty print the Arrow table populated from the bitmap data -1"\nBitmap table:"; .arrowkdb.tb.prettyPrintTable[bitmap_schema;bitmap_data;nested_options]; // Show the array data as an arrow table --1"\nNested table:"; -.arrowkdb.tb.prettyPrintTable[nested_schema;nested_data;nested_options] +-1"\nStruct table:"; +.arrowkdb.tb.prettyPrintTable[struct_schema;struct_data;nested_options] + +// Show the array data as an arrow table +-1"\nDict table:"; +.arrowkdb.tb.prettyPrintTable[dict_schema;dict_data;nested_options] //-------------------------// // Example-1. Parquet file // @@ -129,48 +145,61 @@ nested_data:(list_data;struct_data); nested_options[`PARQUET_VERSION]:`V2.0; parquet_null_bitmap:"null_bitmap.parquet"; -parquet_nested_bitmap:"nested_bitmap.parquet"; +parquet_nested_struct:"nested_struct.parquet"; +parquet_nested_dict:"nested_dict.parquet"; .arrowkdb.pq.writeParquet[parquet_null_bitmap;bitmap_schema;bitmap_data;nested_options]; -.arrowkdb.pq.writeParquet[parquet_nested_bitmap;nested_schema;nested_data;nested_options]; +.arrowkdb.pq.writeParquet[parquet_nested_struct;struct_schema;struct_data;nested_options]; +.arrowkdb.pq.writeParquet[parquet_nested_dict;dict_schema;dict_data;nested_options]; show ls parquet_null_bitmap -show ls parquet_nested_bitmap +show ls parquet_nested_struct +show ls parquet_nested_dict -// Read the array data back and compare +// Read the schema back and compare nested_options[`WITH_NULL_BITMAP]:1; -// Read the schema back and compare parquet_bitmap_schema:.arrowkdb.pq.readParquetSchema[parquet_null_bitmap]; -parquet_nested_schema:.arrowkdb.pq.readParquetSchema[parquet_nested_bitmap]; +parquet_struct_schema:.arrowkdb.pq.readParquetSchema[parquet_nested_struct]; +parquet_dict_schema:.arrowkdb.pq.readParquetSchema[parquet_nested_dict]; show 
.arrowkdb.sc.equalSchemas[bitmap_schema;parquet_bitmap_schema] -show .arrowkdb.sc.equalSchemas[nested_schema;parquet_nested_schema] +show .arrowkdb.sc.equalSchemas[struct_schema;parquet_struct_schema] +show .arrowkdb.sc.equalSchemas[dict_schema;parquet_dict_schema] show bitmap_schema~parquet_bitmap_schema -show nested_schema~parquet_nested_schema +show struct_schema~parquet_struct_schema +show dict_schema~parquet_dict_schema +// Read the array data back and compare parquet_bitmap_data:.arrowkdb.pq.readParquetData[parquet_null_bitmap;nested_options]; -parquet_nested_data:.arrowkdb.pq.readParquetData[parquet_nested_bitmap;nested_options]; +parquet_struct_data:.arrowkdb.pq.readParquetData[parquet_nested_struct;nested_options]; +parquet_dict_data:.arrowkdb.pq.readParquetData[parquet_nested_dict;nested_options]; show bitmap_data~first parquet_bitmap_data -show nested_data~first parquet_nested_data +show struct_data~first parquet_struct_data +show dict_data~first parquet_dict_data +// Compare null bitmaps of parquet data nulls_data:1b,(N-1)?1b; bitmap_nulls:{x rotate nulls_data} each neg til {x-1} count bitmap_data; nested_list_nulls:(enlist 1b;00b;000b;0000b;00001b) nested_struct_nulls:(10000b;01000b;00100b) +nested_dict_nulls:((enlist 0b)!(enlist 0b);00b!01b;000b!000b) parquet_bitmap_nulls:last parquet_bitmap_data; -parquet_list_nulls:first parquet_nested_data[1] -parquet_struct_nulls:last parquet_nested_data[1] +parquet_list_nulls:first parquet_struct_data[1] +parquet_struct_nulls:last parquet_struct_data[1] +parquet_dict_nulls:parquet_dict_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count parquet_bitmap_nulls;parquet_bitmap_nulls] nested_list_nulls~parquet_list_nulls nested_struct_nulls~parquet_struct_nulls[0] +nested_dict_nulls~parquet_dict_nulls[0][0] rm parquet_null_bitmap; -rm parquet_nested_bitmap; +rm parquet_nested_struct; +rm parquet_nested_dict; //---------------------------// // Example-2. 
Arrow IPC file // @@ -178,41 +207,53 @@ rm parquet_nested_bitmap; // Write the schema and array data to an arrow file arrow_null_bitmap:"null_bitmap.arrow"; -arrow_nested_bitmap:"nested_bitmap.arrow"; +arrow_struct_bitmap:"nested_bitmap.arrow"; +arrow_dict_bitmap:"nested_dict.arrow"; .arrowkdb.ipc.writeArrow[arrow_null_bitmap;bitmap_schema;bitmap_data;nested_options]; -.arrowkdb.ipc.writeArrow[arrow_nested_bitmap;nested_schema;nested_data;nested_options]; +.arrowkdb.ipc.writeArrow[arrow_struct_bitmap;struct_schema;struct_data;nested_options]; +.arrowkdb.ipc.writeArrow[arrow_dict_bitmap;dict_schema;dict_data;nested_options]; show ls arrow_null_bitmap -show ls arrow_nested_bitmap +show ls arrow_struct_bitmap +show ls arrow_dict_bitmap // Read the schema back and compare arrow_bitmap_schema:.arrowkdb.ipc.readArrowSchema[arrow_null_bitmap]; -arrow_nested_schema:.arrowkdb.ipc.readArrowSchema[arrow_nested_bitmap]; +arrow_struct_schema:.arrowkdb.ipc.readArrowSchema[arrow_struct_bitmap]; +arrow_dict_schema:.arrowkdb.ipc.readArrowSchema[arrow_dict_bitmap]; show .arrowkdb.sc.equalSchemas[bitmap_schema;arrow_bitmap_schema] -show .arrowkdb.sc.equalSchemas[nested_schema;arrow_nested_schema] +show .arrowkdb.sc.equalSchemas[struct_schema;arrow_struct_schema] +show .arrowkdb.sc.equalSchemas[dict_schema;arrow_dict_schema] show bitmap_schema~arrow_bitmap_schema -show nested_schema~arrow_nested_schema +show struct_schema~arrow_struct_schema +show dict_schema~arrow_dict_schema // Read the array data back and compare arrow_bitmap_data:.arrowkdb.ipc.readArrowData[arrow_null_bitmap;nested_options]; -arrow_nested_data:.arrowkdb.ipc.readArrowData[arrow_nested_bitmap;nested_options]; +arrow_struct_data:.arrowkdb.ipc.readArrowData[arrow_struct_bitmap;nested_options]; +arrow_dict_data:.arrowkdb.ipc.readArrowData[arrow_dict_bitmap;nested_options]; show bitmap_data~first arrow_bitmap_data -show nested_data~first arrow_nested_data +show struct_data~first arrow_struct_data +show dict_data~first 
arrow_dict_data +// Compare null bitmaps of arrow data arrow_bitmap_nulls:last arrow_bitmap_data; -arrow_list_nulls:first arrow_nested_data[1] -arrow_struct_nulls:last arrow_nested_data[1] +arrow_list_nulls:first arrow_struct_data[1] +arrow_struct_nulls:last arrow_struct_data[1] +arrow_dict_nulls:arrow_dict_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bitmap_nulls] nested_list_nulls~arrow_list_nulls nested_struct_nulls~arrow_struct_nulls[0] +nested_dict_nulls~arrow_dict_nulls[0][0] rm arrow_null_bitmap; -rm arrow_nested_bitmap; +rm arrow_struct_bitmap; +rm arrow_dict_bitmap; //-----------------------------// // Example-3. Arrow IPC stream // @@ -220,37 +261,47 @@ rm arrow_nested_bitmap; // Serialize the schema and array data to an arrow stream serialized_null_bitmap:.arrowkdb.ipc.serializeArrow[bitmap_schema;bitmap_data;nested_options]; -serialized_nested_bitmap:.arrowkdb.ipc.serializeArrow[nested_schema;nested_data;nested_options]; +serialized_nested_struct:.arrowkdb.ipc.serializeArrow[struct_schema;struct_data;nested_options]; +serialized_nested_dict:.arrowkdb.ipc.serializeArrow[dict_schema;dict_data;nested_options]; show serialized_null_bitmap -show serialized_nested_bitmap +show serialized_nested_struct +show serialized_nested_dict // Parse the schema back abd compare stream_bitmap_schema:.arrowkdb.ipc.parseArrowSchema[serialized_null_bitmap]; -stream_nested_schema:.arrowkdb.ipc.parseArrowSchema[serialized_nested_bitmap]; +stream_struct_schema:.arrowkdb.ipc.parseArrowSchema[serialized_nested_struct]; +stream_dict_schema:.arrowkdb.ipc.parseArrowSchema[serialized_nested_dict]; show .arrowkdb.sc.equalSchemas[bitmap_schema;stream_bitmap_schema] -show .arrowkdb.sc.equalSchemas[nested_schema;stream_nested_schema] +show .arrowkdb.sc.equalSchemas[struct_schema;stream_struct_schema] +show .arrowkdb.sc.equalSchemas[dict_schema;stream_dict_schema] show bitmap_schema~stream_bitmap_schema -show nested_schema~stream_nested_schema 
+show struct_schema~stream_struct_schema +show dict_schema~stream_dict_schema // Parse the array data back and compare stream_bitmap_data:.arrowkdb.ipc.parseArrowData[serialized_null_bitmap;nested_options]; -stream_nested_data:.arrowkdb.ipc.parseArrowData[serialized_nested_bitmap;nested_options]; +stream_struct_data:.arrowkdb.ipc.parseArrowData[serialized_nested_struct;nested_options]; +stream_dict_data:.arrowkdb.ipc.parseArrowData[serialized_nested_dict;nested_options]; show bitmap_data~first stream_bitmap_data -show nested_data~first stream_nested_data +show struct_data~first stream_struct_data +show dict_data~first stream_dict_data +// Compare null bitmaps of stream data stream_bitmap_nulls:last stream_bitmap_data; -stream_list_nulls:first stream_nested_data[1] -stream_struct_nulls:last stream_nested_data[1] +stream_list_nulls:first stream_struct_data[1] +stream_struct_nulls:last stream_struct_data[1] +stream_dict_nulls:stream_dict_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count stream_bitmap_nulls;stream_bitmap_nulls] nested_list_nulls~stream_list_nulls nested_struct_nulls~stream_struct_nulls[0] +nested_dict_nulls~stream_dict_nulls[0][0] -1 "\n+----------------------------------------+\n"; // Process off -//exit 0; +exit 0; From ab5e145071994f6c656584878f318aedc60b24c8 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 2 Mar 2023 22:02:38 +0300 Subject: [PATCH 233/276] Mapping nulls into nested map --- src/ArrayReader.cpp | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 7eb79aa..c64e949 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -656,7 +656,37 @@ K AppendNullBitmap( shared_ptr array template<> K AppendNullBitmap( shared_ptr array_data, size_t& index ) { - return nullptr; + auto map_array = static_pointer_cast( array_data ); + auto keys = map_array->keys(); + auto items = map_array->items(); + auto keys_type_id = 
keys->type_id(); + auto items_type_id = items->type_id(); + auto length = map_array->length(); + + K k_bitmap = knk( length ); + for( auto i = 0; i < length; ++i ){ + auto keys_slice = keys->Slice( map_array->value_offset( i ), map_array->value_length( i ) ); + auto items_slice = items->Slice( map_array->value_offset( i ), map_array->value_length( i ) ); + auto keys_length = keys_slice->length(); + auto items_length = items_slice->length(); + + K k_keys = ( null_bitmap_handlers.find( keys_type_id ) == null_bitmap_handlers.end() ) + ? ktn( KB, keys_length ) + : knk( 0 ); + K k_items = ( null_bitmap_handlers.find( items_type_id ) == null_bitmap_handlers.end() ) + ? ktn( KB, items_length ) + : knk( 0 ); + + size_t keys_counter = 0; + size_t items_counter = 0; + InitKdbNullBitmap( keys_slice, &k_keys, keys_counter ); + InitKdbNullBitmap( items_slice, &k_items, items_counter ); + kK( k_bitmap )[i] = xD( k_keys, k_items ); + } + + index += length; + + return k_bitmap; } template<> From f4f1dfc52086bd192dbf16668344b12994e9a75f Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 3 Mar 2023 11:49:01 +0300 Subject: [PATCH 234/276] Unit-test for null bitmaps in associative arrays --- .gitignore | 2 + tests/.gitignore | 1 + tests/null_bitmap/.gitignore | 1 + tests/null_bitmap/glossary_null_bitmap.t | 116 +++++++++++++++++++++++ 4 files changed, 120 insertions(+) create mode 100644 tests/null_bitmap/glossary_null_bitmap.t diff --git a/.gitignore b/.gitignore index 688988a..701d371 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ arrowkdb.code-workspace .vscode/ build/ test.q +unit.q +*.user diff --git a/tests/.gitignore b/tests/.gitignore index 01c7b10..b0b3e83 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -1,5 +1,6 @@ basic.q crucial_null_bitmap.q +glossary_null_bitmap.q nested_null_bitmap.q null_mapping_short.q null_mapping_long.q diff --git a/tests/null_bitmap/.gitignore b/tests/null_bitmap/.gitignore index b002a2d..3857116 100644 --- 
a/tests/null_bitmap/.gitignore +++ b/tests/null_bitmap/.gitignore @@ -1,2 +1,3 @@ crucial_null_bitmap.q +glossary_null_bitmap.q nested_null_bitmap.q diff --git a/tests/null_bitmap/glossary_null_bitmap.t b/tests/null_bitmap/glossary_null_bitmap.t new file mode 100644 index 0000000..15396da --- /dev/null +++ b/tests/null_bitmap/glossary_null_bitmap.t @@ -0,0 +1,116 @@ +// glossary_null_bitmap.q + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l q/arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping ||----------+\n"; +glossary_opts:(`int64`float64)!(5;2.34); + +glossary_options:(``NULL_MAPPING)!((::);glossary_opts); + +N:5 + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +i64_dt:.arrowkdb.dt.int64[]; +f64_dt:.arrowkdb.dt.float64[]; + +-1"\n+----------|| Create a map datatype using the i16_dt as the key and dec_dt as its values ||----------+\n"; +map_dt:.arrowkdb.dt.map[i64_dt;f64_dt] + +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +i64_fd:.arrowkdb.fd.field[`int64;i64_dt]; +f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; + +-1"\n+----------|| Create a field containing the map datatype ||----------+\n"; +map_fd:.arrowkdb.fd.field[`map;map_dt]; + +-1"\n+----------|| Create the schema containing the large list, dictionary and sparce union fields ||----------+\n"; +glossary_schema:.arrowkdb.sc.schema[(enlist map_fd)]; + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +i64_data:N?100i; +i64_data[0]:1i; +f64_data:N?100f; +f64_data[1]:2.34f; + +map_data:((enlist 1)!(enlist 1f);(2 2)!(2 2.34f);(3 3 3)!(3 3 3f)) + +-1"\n+----------|| Combine the array data for the glossary columns ||----------+\n"; 
+glossary_data:(enlist map_data); + +-1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n"; +glossary_options[`PARQUET_VERSION]:`V2.0; + +parquet_glossary_bitmap:"glossary_bitmap.parquet"; +.arrowkdb.pq.writeParquet[parquet_glossary_bitmap;glossary_schema;glossary_data;glossary_options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +glossary_options[`WITH_NULL_BITMAP]:1; + +parquet_glossary_schema:.arrowkdb.pq.readParquetSchema[parquet_glossary_bitmap]; +.arrowkdb.sc.equalSchemas[glossary_schema;parquet_glossary_schema] +glossary_schema~parquet_glossary_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +parquet_glossary_data:.arrowkdb.pq.readParquetData[parquet_glossary_bitmap;glossary_options]; +glossary_data~first parquet_glossary_data + +-1"\n+----------|| Compare null bitmaps of parquet data ||----------+\n"; +glossary_nulls:((enlist 0b)!(enlist 0b);00b!01b;000b!000b) +parquet_glossary_nulls:parquet_glossary_data[1] +glossary_nulls~parquet_glossary_nulls[0][0] + +rm parquet_glossary_bitmap; + +-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; +arrow_glossary_bitmap:"nested_map.arrow"; +.arrowkdb.ipc.writeArrow[arrow_glossary_bitmap;glossary_schema;glossary_data;glossary_options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +arrow_glossary_schema:.arrowkdb.ipc.readArrowSchema[arrow_glossary_bitmap]; +.arrowkdb.sc.equalSchemas[glossary_schema;arrow_glossary_schema] +glossary_schema~arrow_glossary_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +arrow_glossary_data:.arrowkdb.ipc.readArrowData[arrow_glossary_bitmap;glossary_options]; +glossary_data~first arrow_glossary_data + +-1"\n+----------|| Compare null bitmaps of arrow data ||----------+\n"; +arrow_glossary_nulls:arrow_glossary_data[1] +glossary_nulls~arrow_glossary_nulls[0][0] + +rm arrow_glossary_bitmap; + 
+-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; +serialized_glossary:.arrowkdb.ipc.serializeArrow[glossary_schema;glossary_data;glossary_options]; + +-1"\n+----------|| Parse the schema back abd compare ||----------+\n"; +stream_glossary_schema:.arrowkdb.ipc.parseArrowSchema[serialized_glossary]; +.arrowkdb.sc.equalSchemas[glossary_schema;stream_glossary_schema] +glossary_schema~stream_glossary_schema + +-1"\n+----------|| Parse the array data back and compare ||----------+\n"; +stream_glossary_data:.arrowkdb.ipc.parseArrowData[serialized_glossary;glossary_options]; +glossary_data~first stream_glossary_data + +-1"\n+----------|| Compare null bitmaps of stream data ||----------+\n"; +stream_glossary_nulls:stream_glossary_data[1] +glossary_nulls~stream_glossary_nulls[0][0] + + +-1 "\n+----------|| Test utils ||----------+\n"; + +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; From 1072e9f0c1e1571425a94c1cce06042f6f6c5df2 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 3 Mar 2023 14:44:10 +0300 Subject: [PATCH 235/276] Example for dictionary bitmap --- examples/null_bitmap.q | 43 +++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index eedb6ca..a532c50 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -45,6 +45,7 @@ t64_dt:.arrowkdb.dt.time64[`nano]; i64_dt:.arrowkdb.dt.int64[]; // Create a map datatype using the i16_dt as the key and dec_dt as its values +dict_dt:.arrowkdb.dt.dictionary[str_dt;i64_dt] map_dt:.arrowkdb.dt.map[i64_dt;f64_dt] // Create the field identifiers @@ -69,6 +70,7 @@ struct_dt:.arrowkdb.dt.struct[(f32_fd,bin_fd,t64_fd)]; struct_fd:.arrowkdb.fd.field[`struct_field;struct_dt]; // Create a field containing the map datatype +dict_fd:.arrowkdb.fd.field[`dictionary;dict_dt] 
map_fd:.arrowkdb.fd.field[`map;map_dt]; // Create the schemas for the list of fields @@ -78,7 +80,7 @@ bitmap_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,i32_fd,f64_fd,str_fd,d32_fd)]; struct_schema:.arrowkdb.sc.schema[(list_fd,struct_fd)]; // Create the schema containing the large list, dictionary and sparce union fields -dict_schema:.arrowkdb.sc.schema[(enlist map_fd)]; +dict_schema:.arrowkdb.sc.schema[(dict_fd, map_fd)]; // Print the schema -1"\nBitmap schema:"; @@ -120,10 +122,11 @@ struct_array:(f32_data;bin_data;t64_data); // Combine the array data for the list and struct columns struct_data:(list_array;struct_array); +dict_data:(("aa";"bb";"cc");(2 0 1)) map_data:((enlist 1)!(enlist 1f);(2 2)!(2 2.34f);(3 3 3)!(3 3 3f)) // Combine the array data for the list and struct columns -dict_data:(enlist map_data); +dict_data:(dict_data;map_data); // Pretty print the Arrow table populated from the bitmap data -1"\nBitmap table:"; @@ -161,15 +164,12 @@ nested_options[`WITH_NULL_BITMAP]:1; parquet_bitmap_schema:.arrowkdb.pq.readParquetSchema[parquet_null_bitmap]; parquet_struct_schema:.arrowkdb.pq.readParquetSchema[parquet_nested_struct]; -parquet_dict_schema:.arrowkdb.pq.readParquetSchema[parquet_nested_dict]; show .arrowkdb.sc.equalSchemas[bitmap_schema;parquet_bitmap_schema] show .arrowkdb.sc.equalSchemas[struct_schema;parquet_struct_schema] -show .arrowkdb.sc.equalSchemas[dict_schema;parquet_dict_schema] show bitmap_schema~parquet_bitmap_schema show struct_schema~parquet_struct_schema -show dict_schema~parquet_dict_schema // Read the array data back and compare parquet_bitmap_data:.arrowkdb.pq.readParquetData[parquet_null_bitmap;nested_options]; @@ -178,14 +178,16 @@ parquet_dict_data:.arrowkdb.pq.readParquetData[parquet_nested_dict;nested_option show bitmap_data~first parquet_bitmap_data show struct_data~first parquet_struct_data -show dict_data~first parquet_dict_data +show first[dict_data[0]]~asc first parquet_dict_data[0] +show last[dict_data]~last 
parquet_dict_data[0] // Compare null bitmaps of parquet data nulls_data:1b,(N-1)?1b; bitmap_nulls:{x rotate nulls_data} each neg til {x-1} count bitmap_data; -nested_list_nulls:(enlist 1b;00b;000b;0000b;00001b) -nested_struct_nulls:(10000b;01000b;00100b) -nested_dict_nulls:((enlist 0b)!(enlist 0b);00b!01b;000b!000b) +nested_list_nulls:(enlist 1b;00b;000b;0000b;00001b); +nested_struct_nulls:(10000b;01000b;00100b); +nested_dict_nulls:(000b;000b); +nested_map_nulls:((enlist 0b)!(enlist 0b);00b!01b;000b!000b); parquet_bitmap_nulls:last parquet_bitmap_data; parquet_list_nulls:first parquet_struct_data[1] @@ -193,9 +195,10 @@ parquet_struct_nulls:last parquet_struct_data[1] parquet_dict_nulls:parquet_dict_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count parquet_bitmap_nulls;parquet_bitmap_nulls] -nested_list_nulls~parquet_list_nulls -nested_struct_nulls~parquet_struct_nulls[0] -nested_dict_nulls~parquet_dict_nulls[0][0] +show nested_list_nulls~parquet_list_nulls +show nested_struct_nulls~parquet_struct_nulls[0] +show nested_dict_nulls[0]~parquet_dict_nulls[0] +show nested_map_nulls~last[parquet_dict_nulls][0] rm parquet_null_bitmap; rm parquet_nested_struct; @@ -247,9 +250,10 @@ arrow_struct_nulls:last arrow_struct_data[1] arrow_dict_nulls:arrow_dict_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bitmap_nulls] -nested_list_nulls~arrow_list_nulls -nested_struct_nulls~arrow_struct_nulls[0] -nested_dict_nulls~arrow_dict_nulls[0][0] +show nested_list_nulls~arrow_list_nulls +show nested_struct_nulls~arrow_struct_nulls[0] +show nested_dict_nulls~first[arrow_dict_nulls][0] +show nested_map_nulls~last[arrow_dict_nulls][0] rm arrow_null_bitmap; rm arrow_struct_bitmap; @@ -297,11 +301,12 @@ stream_struct_nulls:last stream_struct_data[1] stream_dict_nulls:stream_dict_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count stream_bitmap_nulls;stream_bitmap_nulls] -nested_list_nulls~stream_list_nulls 
-nested_struct_nulls~stream_struct_nulls[0] -nested_dict_nulls~stream_dict_nulls[0][0] +show nested_list_nulls~stream_list_nulls +show nested_struct_nulls~stream_struct_nulls[0] +show nested_dict_nulls~first[stream_dict_nulls][0] +show nested_map_nulls~last[stream_dict_nulls][0] -1 "\n+----------------------------------------+\n"; // Process off -exit 0; +//exit 0; From 38feaa63bb5a425358d07e2639ae568cf979c801 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 3 Mar 2023 14:45:54 +0300 Subject: [PATCH 236/276] Mapping nulls into nested dictionary --- src/ArrayReader.cpp | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index c64e949..d446b21 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -666,13 +666,13 @@ K AppendNullBitmap( shared_ptr array_data, size_ K k_bitmap = knk( length ); for( auto i = 0; i < length; ++i ){ auto keys_slice = keys->Slice( map_array->value_offset( i ), map_array->value_length( i ) ); - auto items_slice = items->Slice( map_array->value_offset( i ), map_array->value_length( i ) ); auto keys_length = keys_slice->length(); - auto items_length = items_slice->length(); - K k_keys = ( null_bitmap_handlers.find( keys_type_id ) == null_bitmap_handlers.end() ) ? ktn( KB, keys_length ) : knk( 0 ); + + auto items_slice = items->Slice( map_array->value_offset( i ), map_array->value_length( i ) ); + auto items_length = items_slice->length(); K k_items = ( null_bitmap_handlers.find( items_type_id ) == null_bitmap_handlers.end() ) ? 
ktn( KB, items_length ) : knk( 0 ); @@ -729,7 +729,32 @@ K AppendNullBitmap( shared_ptr array_dat template<> K AppendNullBitmap( shared_ptr array_data, size_t& index ) { - return nullptr; + auto dictionary_array = static_pointer_cast( array_data ); + auto length = dictionary_array->length(); + + auto items = dictionary_array->dictionary(); + auto items_type_id = items->type_id(); + auto items_length = items->length(); + K k_items = ( null_bitmap_handlers.find( items_type_id ) == null_bitmap_handlers.end() ) + ? ktn( KB, items_length ) + : knk( 0 ); + + auto indices = dictionary_array->indices(); + auto indices_type_id = indices->type_id(); + auto indices_length = indices->length(); + K k_indices = ( null_bitmap_handlers.find( indices_type_id ) == null_bitmap_handlers.end() ) + ? ktn( KB, indices_length ) + : knk( 0 ); + + size_t items_counter = 0; + size_t indices_counter = 0; + InitKdbNullBitmap( items, &k_items, items_counter ); + InitKdbNullBitmap( indices, &k_indices, indices_counter ); + + K k_bitmap = knk( 2, k_items, k_indices ); + index += length; + + return k_bitmap; } template From f3ad1ccf5c714c32a6e42c50fade38c97fcfedf8 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 3 Mar 2023 18:31:54 +0300 Subject: [PATCH 237/276] Example for unions --- examples/null_bitmap.q | 77 ++++++++++++++++++++++++++++++++---------- 1 file changed, 60 insertions(+), 17 deletions(-) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index a532c50..5af3844 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -35,8 +35,6 @@ str_dt:.arrowkdb.dt.utf8[]; d32_dt:.arrowkdb.dt.date32[]; ui16_dt:.arrowkdb.dt.uint16[]; -// Create a list datatype, using the uint16 datatype as its child -list_dt:.arrowkdb.dt.list[ui16_dt]; f32_dt:.arrowkdb.dt.float32[]; bin_dt:.arrowkdb.dt.binary[]; @@ -44,10 +42,6 @@ t64_dt:.arrowkdb.dt.time64[`nano]; i64_dt:.arrowkdb.dt.int64[]; -// Create a map datatype using the i16_dt as the key and dec_dt as its values 
-dict_dt:.arrowkdb.dt.dictionary[str_dt;i64_dt] -map_dt:.arrowkdb.dt.map[i64_dt;f64_dt] - // Create the field identifiers ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; @@ -58,30 +52,45 @@ str_fd:.arrowkdb.fd.field[`string;str_dt]; d32_fd:.arrowkdb.fd.field[`date32;d32_dt]; ui16_fd:.arrowkdb.fd.field[`uint16;ui16_dt]; -// Create a field containing the list datatype -list_fd:.arrowkdb.fd.field[`list_field;list_dt]; f32_fd:.arrowkdb.fd.field[`float32;f32_dt]; bin_fd:.arrowkdb.fd.field[`binary;bin_dt]; t64_fd:.arrowkdb.fd.field[`time64;t64_dt]; -// Create a struct datatype using the float32, binary and time64 fields as its children -struct_dt:.arrowkdb.dt.struct[(f32_fd,bin_fd,t64_fd)]; + +i64_fd:.arrowkdb.fd.field[`int64;i64_dt]; + +// Create a field containing the list datatype +list_dt:.arrowkdb.dt.list[ui16_dt]; +list_fd:.arrowkdb.fd.field[`list_field;list_dt]; + // Create a field containing the struct datatype +struct_dt:.arrowkdb.dt.struct[(f32_fd,bin_fd,t64_fd)]; struct_fd:.arrowkdb.fd.field[`struct_field;struct_dt]; -// Create a field containing the map datatype +// Create fields containing dictionary datatypes +dict_dt:.arrowkdb.dt.dictionary[str_dt;i64_dt] dict_fd:.arrowkdb.fd.field[`dictionary;dict_dt] +map_dt:.arrowkdb.dt.map[i64_dt;f64_dt] map_fd:.arrowkdb.fd.field[`map;map_dt]; -// Create the schemas for the list of fields +// Create fields containing union datatypes +sparse_dt:.arrowkdb.dt.sparse_union[(i64_fd,f64_fd)] +sparse_fd:.arrowkdb.fd.field[`sparse_union;sparse_dt] +dense_dt:.arrowkdb.dt.dense_union[(i64_fd,f64_fd)] +dense_fd:.arrowkdb.fd.field[`dense_union;dense_dt] + +// Create the schemas for primitive fields bitmap_schema:.arrowkdb.sc.schema[(ts_fd,bool_fd,i32_fd,f64_fd,str_fd,d32_fd)]; // Create the schema containing the list and struct fields struct_schema:.arrowkdb.sc.schema[(list_fd,struct_fd)]; -// Create the schema containing the large list, dictionary and sparce union fields +// Create the schema containing the dictionary and map fields 
dict_schema:.arrowkdb.sc.schema[(dict_fd, map_fd)]; +// Create the schema containing the sparce and dense union fields +union_schema:.arrowkdb.sc.schema[(sparse_fd, dense_fd)] + // Print the schema -1"\nBitmap schema:"; .arrowkdb.sc.printSchema[bitmap_schema]; @@ -92,6 +101,9 @@ dict_schema:.arrowkdb.sc.schema[(dict_fd, map_fd)]; -1"\nDict schema:"; .arrowkdb.sc.printSchema[dict_schema]; +-1"\nUnion schema:"; +.arrowkdb.sc.printSchema[union_schema]; + // Create data for each column in the table ts_data:asc N?0p; @@ -114,19 +126,22 @@ bin_data[1]:"x"$"acknowledge" t64_data:5?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); t64_data[2]:00:00:00.123456789; -// Combine the data for all columns +// Combine the data for primitive columns bitmap_data:(ts_data;bool_data;i32_data;f64_data;str_data;d32_data); +// Combine the array data for the list and struct columns list_array:(enlist 9h;(8h;7h);(6h;5h;4h);(1h;2h;3h;4h);(5h;6h;7h;8h;9h)); struct_array:(f32_data;bin_data;t64_data); -// Combine the array data for the list and struct columns struct_data:(list_array;struct_array); +// Combine the array data for the list and struct columns dict_data:(("aa";"bb";"cc");(2 0 1)) map_data:((enlist 1)!(enlist 1f);(2 2)!(2 2.34f);(3 3 3)!(3 3 3f)) +dict_data:(dict_data;map_data); // Combine the array data for the list and struct columns -dict_data:(dict_data;map_data); +sparse_data:dense_data:(0 1 0h;1 2 3;4 5 6f) +union_data:(sparse_data;dense_data) // Pretty print the Arrow table populated from the bitmap data -1"\nBitmap table:"; @@ -140,6 +155,10 @@ dict_data:(dict_data;map_data); -1"\nDict table:"; .arrowkdb.tb.prettyPrintTable[dict_schema;dict_data;nested_options] +// Show the array data as an arrow table +-1"\nUnion table:"; +.arrowkdb.tb.prettyPrintTable[union_schema;union_data;nested_options] + //-------------------------// // Example-1. 
Parquet file // //-------------------------// @@ -150,6 +169,7 @@ nested_options[`PARQUET_VERSION]:`V2.0; parquet_null_bitmap:"null_bitmap.parquet"; parquet_nested_struct:"nested_struct.parquet"; parquet_nested_dict:"nested_dict.parquet"; +parquet_nested_union:"nested_union.parquet"; .arrowkdb.pq.writeParquet[parquet_null_bitmap;bitmap_schema;bitmap_data;nested_options]; .arrowkdb.pq.writeParquet[parquet_nested_struct;struct_schema;struct_data;nested_options]; @@ -188,6 +208,7 @@ nested_list_nulls:(enlist 1b;00b;000b;0000b;00001b); nested_struct_nulls:(10000b;01000b;00100b); nested_dict_nulls:(000b;000b); nested_map_nulls:((enlist 0b)!(enlist 0b);00b!01b;000b!000b); +nested_union_nulls:((7h;9h);000b;000b); parquet_bitmap_nulls:last parquet_bitmap_data; parquet_list_nulls:first parquet_struct_data[1] @@ -212,52 +233,64 @@ rm parquet_nested_dict; arrow_null_bitmap:"null_bitmap.arrow"; arrow_struct_bitmap:"nested_bitmap.arrow"; arrow_dict_bitmap:"nested_dict.arrow"; +arrow_union_bitmap:"nested_union.arrow"; .arrowkdb.ipc.writeArrow[arrow_null_bitmap;bitmap_schema;bitmap_data;nested_options]; .arrowkdb.ipc.writeArrow[arrow_struct_bitmap;struct_schema;struct_data;nested_options]; .arrowkdb.ipc.writeArrow[arrow_dict_bitmap;dict_schema;dict_data;nested_options]; +.arrowkdb.ipc.writeArrow[arrow_union_bitmap;union_schema;union_data;nested_options]; show ls arrow_null_bitmap show ls arrow_struct_bitmap show ls arrow_dict_bitmap +show ls arrow_union_bitmap // Read the schema back and compare arrow_bitmap_schema:.arrowkdb.ipc.readArrowSchema[arrow_null_bitmap]; arrow_struct_schema:.arrowkdb.ipc.readArrowSchema[arrow_struct_bitmap]; arrow_dict_schema:.arrowkdb.ipc.readArrowSchema[arrow_dict_bitmap]; +arrow_union_schema:.arrowkdb.ipc.readArrowSchema[arrow_union_bitmap]; show .arrowkdb.sc.equalSchemas[bitmap_schema;arrow_bitmap_schema] show .arrowkdb.sc.equalSchemas[struct_schema;arrow_struct_schema] show .arrowkdb.sc.equalSchemas[dict_schema;arrow_dict_schema] +show 
.arrowkdb.sc.equalSchemas[union_schema;arrow_union_schema] show bitmap_schema~arrow_bitmap_schema show struct_schema~arrow_struct_schema show dict_schema~arrow_dict_schema +show union_schema~arrow_union_schema // Read the array data back and compare arrow_bitmap_data:.arrowkdb.ipc.readArrowData[arrow_null_bitmap;nested_options]; arrow_struct_data:.arrowkdb.ipc.readArrowData[arrow_struct_bitmap;nested_options]; arrow_dict_data:.arrowkdb.ipc.readArrowData[arrow_dict_bitmap;nested_options]; +arrow_union_data:.arrowkdb.ipc.readArrowData[arrow_union_bitmap;nested_options]; show bitmap_data~first arrow_bitmap_data show struct_data~first arrow_struct_data show dict_data~first arrow_dict_data +show union_data~first arrow_union_data // Compare null bitmaps of arrow data arrow_bitmap_nulls:last arrow_bitmap_data; arrow_list_nulls:first arrow_struct_data[1] arrow_struct_nulls:last arrow_struct_data[1] arrow_dict_nulls:arrow_dict_data[1] +arrow_union_nulls:arrow_union_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bitmap_nulls] show nested_list_nulls~arrow_list_nulls show nested_struct_nulls~arrow_struct_nulls[0] show nested_dict_nulls~first[arrow_dict_nulls][0] show nested_map_nulls~last[arrow_dict_nulls][0] +show nested_union_nulls~arrow_union_nulls[0][0] +show nested_union_nulls~arrow_union_nulls[1][0] rm arrow_null_bitmap; rm arrow_struct_bitmap; rm arrow_dict_bitmap; +rm arrow_union_bitmap; //-----------------------------// // Example-3. 
Arrow IPC stream // @@ -267,46 +300,56 @@ rm arrow_dict_bitmap; serialized_null_bitmap:.arrowkdb.ipc.serializeArrow[bitmap_schema;bitmap_data;nested_options]; serialized_nested_struct:.arrowkdb.ipc.serializeArrow[struct_schema;struct_data;nested_options]; serialized_nested_dict:.arrowkdb.ipc.serializeArrow[dict_schema;dict_data;nested_options]; +serialized_nested_union:.arrowkdb.ipc.serializeArrow[union_schema;union_data;nested_options]; show serialized_null_bitmap show serialized_nested_struct show serialized_nested_dict +show serialized_nested_union // Parse the schema back abd compare stream_bitmap_schema:.arrowkdb.ipc.parseArrowSchema[serialized_null_bitmap]; stream_struct_schema:.arrowkdb.ipc.parseArrowSchema[serialized_nested_struct]; stream_dict_schema:.arrowkdb.ipc.parseArrowSchema[serialized_nested_dict]; +stream_union_schema:.arrowkdb.ipc.parseArrowSchema[serialized_nested_union]; show .arrowkdb.sc.equalSchemas[bitmap_schema;stream_bitmap_schema] show .arrowkdb.sc.equalSchemas[struct_schema;stream_struct_schema] show .arrowkdb.sc.equalSchemas[dict_schema;stream_dict_schema] +show .arrowkdb.sc.equalSchemas[union_schema;stream_union_schema] show bitmap_schema~stream_bitmap_schema show struct_schema~stream_struct_schema show dict_schema~stream_dict_schema +show union_schema~stream_union_schema // Parse the array data back and compare stream_bitmap_data:.arrowkdb.ipc.parseArrowData[serialized_null_bitmap;nested_options]; stream_struct_data:.arrowkdb.ipc.parseArrowData[serialized_nested_struct;nested_options]; stream_dict_data:.arrowkdb.ipc.parseArrowData[serialized_nested_dict;nested_options]; +stream_union_data:.arrowkdb.ipc.parseArrowData[serialized_nested_union;nested_options]; show bitmap_data~first stream_bitmap_data show struct_data~first stream_struct_data show dict_data~first stream_dict_data +show union_data~first stream_union_data // Compare null bitmaps of stream data stream_bitmap_nulls:last stream_bitmap_data; stream_list_nulls:first 
stream_struct_data[1] stream_struct_nulls:last stream_struct_data[1] stream_dict_nulls:stream_dict_data[1] +stream_union_nulls:stream_union_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count stream_bitmap_nulls;stream_bitmap_nulls] show nested_list_nulls~stream_list_nulls show nested_struct_nulls~stream_struct_nulls[0] show nested_dict_nulls~first[stream_dict_nulls][0] show nested_map_nulls~last[stream_dict_nulls][0] +show nested_union_nulls~stream_union_nulls[0][0] +show nested_union_nulls~stream_union_nulls[1][0] -1 "\n+----------------------------------------+\n"; // Process off -//exit 0; +exit 0; From a0738139d892679a86ac55b8ec0cef6f4fca4569 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Fri, 3 Mar 2023 18:32:34 +0300 Subject: [PATCH 238/276] Mapping nulls into unions --- src/ArrayReader.cpp | 110 +++++++++++++++++++++++++++++--------------- 1 file changed, 72 insertions(+), 38 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index d446b21..31268a8 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -616,22 +616,20 @@ unordered_map ArrayHandlers { , make_array_handler() }; -using BitmapHandler = K (*) (shared_ptr array_data, size_t& index ); +using NestedHandler = K (*) (shared_ptr array_data, size_t& index ); -extern unordered_map null_bitmap_handlers; +extern unordered_map NestedHandlers; -template -K AppendNullBitmap( shared_ptr array_data, size_t& index ); -template<> -K AppendNullBitmap( shared_ptr array_data, size_t& index ) +template +K AppendNestedList( shared_ptr array_data, size_t& index ) { - auto slice_array = static_pointer_cast( array_data )->value_slice( index ); + auto slice_array = static_pointer_cast( array_data )->value_slice( index ); auto length = slice_array->length(); auto type_id = slice_array->type_id(); size_t counter = 0; - K k_bitmap = ( null_bitmap_handlers.find( type_id ) == null_bitmap_handlers.end() ) + K k_bitmap = ( NestedHandlers.find( type_id ) == NestedHandlers.end() ) ? 
ktn( KB, length ) : knk( 0 ); auto slice = slice_array->Slice( 0, length ); @@ -641,20 +639,29 @@ K AppendNullBitmap( shared_ptr array_data, size return k_bitmap; } +template +K AppendNested( shared_ptr array_data, size_t& index ); + +template<> +K AppendNested( shared_ptr array_data, size_t& index ) +{ + return AppendNestedList( array_data, index ); +} + template<> -K AppendNullBitmap( shared_ptr array_data, size_t& index ) +K AppendNested( shared_ptr array_data, size_t& index ) { - return nullptr; + return AppendNestedList( array_data, index ); } template<> -K AppendNullBitmap( shared_ptr array_data, size_t& index ) +K AppendNested( shared_ptr array_data, size_t& index ) { - return nullptr; + return AppendNestedList( array_data, index ); } template<> -K AppendNullBitmap( shared_ptr array_data, size_t& index ) +K AppendNested( shared_ptr array_data, size_t& index ) { auto map_array = static_pointer_cast( array_data ); auto keys = map_array->keys(); @@ -667,13 +674,13 @@ K AppendNullBitmap( shared_ptr array_data, size_ for( auto i = 0; i < length; ++i ){ auto keys_slice = keys->Slice( map_array->value_offset( i ), map_array->value_length( i ) ); auto keys_length = keys_slice->length(); - K k_keys = ( null_bitmap_handlers.find( keys_type_id ) == null_bitmap_handlers.end() ) + K k_keys = ( NestedHandlers.find( keys_type_id ) == NestedHandlers.end() ) ? ktn( KB, keys_length ) : knk( 0 ); auto items_slice = items->Slice( map_array->value_offset( i ), map_array->value_length( i ) ); auto items_length = items_slice->length(); - K k_items = ( null_bitmap_handlers.find( items_type_id ) == null_bitmap_handlers.end() ) + K k_items = ( NestedHandlers.find( items_type_id ) == NestedHandlers.end() ) ? 
ktn( KB, items_length ) : knk( 0 ); @@ -690,7 +697,7 @@ K AppendNullBitmap( shared_ptr array_data, size_ } template<> -K AppendNullBitmap( shared_ptr array_data, size_t& index ) +K AppendNested( shared_ptr array_data, size_t& index ) { size_t length = 0; auto struct_array = static_pointer_cast( array_data ); @@ -703,7 +710,7 @@ K AppendNullBitmap( shared_ptr array_data, si length = field->length(); size_t counter = 0; - kK( k_bitmap )[i] = ( null_bitmap_handlers.find( type_id ) == null_bitmap_handlers.end() ) + kK( k_bitmap )[i] = ( NestedHandlers.find( type_id ) == NestedHandlers.end() ) ? ktn( KB, length ) : knk( 0 ); InitKdbNullBitmap( field, &kK( k_bitmap )[i], counter ); @@ -715,19 +722,46 @@ K AppendNullBitmap( shared_ptr array_data, si } template<> -K AppendNullBitmap( shared_ptr array_data, size_t& index ) +K AppendNested( shared_ptr array_data, size_t& index ) { - return nullptr; + auto union_array = static_pointer_cast( array_data ); + auto length = union_array->length(); + auto num_fields = union_array->num_fields(); + + K type_ids = ktn( KH, num_fields ); + for( int i = 0; i < num_fields; ++i ){ + kH( type_ids )[i] = union_array->child_id( i ); + } + K k_bitmap = knk( num_fields + 1, type_ids ); + + for( int i = 0; i < num_fields; ++i ){ + auto field_array = union_array->field( i ); + auto type_id = field_array->type_id(); + auto field_length = field_array->length(); + + TypeMappingOverride type_overrides; + kH( type_ids )[i] = kx::arrowkdb::GetKdbType( field_array->type(), type_overrides ); + K k_field = ( NestedHandlers.find( type_id ) == NestedHandlers.end() ) + ? 
ktn( KB, field_length ) + : knk( 0 ); + + size_t counter = 0; + InitKdbNullBitmap( field_array, &k_field, counter ); + kK( k_bitmap )[i+1] = k_field; + } + index += length; + + return k_bitmap; } template<> -K AppendNullBitmap( shared_ptr array_data, size_t& index ) +K AppendNested( shared_ptr array_data, size_t& index ) { - return nullptr; + return NestedHandlers[arrow::Type::SPARSE_UNION]( array_data, index ); } template<> -K AppendNullBitmap( shared_ptr array_data, size_t& index ) +K AppendNested( shared_ptr array_data, size_t& index ) { auto dictionary_array = static_pointer_cast( array_data ); auto length = dictionary_array->length(); @@ -735,14 +769,14 @@ K AppendNullBitmap( shared_ptr array_data auto items = dictionary_array->dictionary(); auto items_type_id = items->type_id(); auto items_length = items->length(); - K k_items = ( null_bitmap_handlers.find( items_type_id ) == null_bitmap_handlers.end() ) + K k_items = ( NestedHandlers.find( items_type_id ) == NestedHandlers.end() ) ? ktn( KB, items_length ) : knk( 0 ); auto indices = dictionary_array->indices(); auto indices_type_id = indices->type_id(); auto indices_length = indices->length(); - K k_indices = ( null_bitmap_handlers.find( indices_type_id ) == null_bitmap_handlers.end() ) + K k_indices = ( NestedHandlers.find( indices_type_id ) == NestedHandlers.end() ) ? 
ktn( KB, indices_length ) : knk( 0 ); @@ -758,20 +792,20 @@ K AppendNullBitmap( shared_ptr array_data } template -auto make_null_bitmap_handler() +auto make_nested_handler() { - return make_pair( TypeId, &AppendNullBitmap ); + return make_pair( TypeId, &AppendNested ); } -unordered_map null_bitmap_handlers{ - make_null_bitmap_handler() - , make_null_bitmap_handler() - , make_null_bitmap_handler() - , make_null_bitmap_handler() - , make_null_bitmap_handler() - , make_null_bitmap_handler() - , make_null_bitmap_handler() - , make_null_bitmap_handler() +unordered_map NestedHandlers{ + make_nested_handler() + , make_nested_handler() + , make_nested_handler() + , make_nested_handler() + , make_nested_handler() + , make_nested_handler() + , make_nested_handler() + , make_nested_handler() }; } // namespace @@ -798,12 +832,12 @@ void InitKdbNullBitmap( shared_ptr array_data, K* k_bitmap, size_t auto length = array_data->length(); for( int i = 0ll; i < length; ++i ){ - if( null_bitmap_handlers.find( type_id ) == null_bitmap_handlers.end() ){ + if( NestedHandlers.find( type_id ) == NestedHandlers.end() ){ kG( *k_bitmap )[index++] = array_data->IsNull( i ); } else{ auto pos = index; - *k_bitmap = jk( k_bitmap, null_bitmap_handlers[type_id]( array_data, index ) ); + *k_bitmap = jk( k_bitmap, NestedHandlers[type_id]( array_data, index ) ); i += index - pos - 1; } } @@ -875,7 +909,7 @@ K ReadChunkedNullBitmap( shared_ptr chunked_array, TypeMapp { auto length = chunked_array->length(); auto type_id = chunked_array->type()->id(); - K k_bitmap = ( null_bitmap_handlers.find( type_id ) == null_bitmap_handlers.end() ) + K k_bitmap = ( NestedHandlers.find( type_id ) == NestedHandlers.end() ) ? 
ktn( KB, length ) : knk( 0 ); From fae9103a15090b8267c92eacb564b0e25531440a Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 6 Mar 2023 11:36:12 +0300 Subject: [PATCH 239/276] Union key types overriding for decimals --- src/ArrayReader.cpp | 59 +++++++++++++++++++++------------------------ src/ArrayReader.h | 9 ++++++- 2 files changed, 35 insertions(+), 33 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 31268a8..5568bd3 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -616,13 +616,12 @@ unordered_map ArrayHandlers { , make_array_handler() }; -using NestedHandler = K (*) (shared_ptr array_data, size_t& index ); +using NestedHandler = K ( * )(shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ); extern unordered_map NestedHandlers; - template -K AppendNestedList( shared_ptr array_data, size_t& index ) +K AppendNestedList( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ) { auto slice_array = static_pointer_cast( array_data )->value_slice( index ); auto length = slice_array->length(); @@ -633,35 +632,35 @@ K AppendNestedList( shared_ptr array_data, size_t& index ) ? 
ktn( KB, length ) : knk( 0 ); auto slice = slice_array->Slice( 0, length ); - InitKdbNullBitmap( slice, &k_bitmap, counter ); + InitKdbNullBitmap( slice, &k_bitmap, counter, type_overrides ); ++index; return k_bitmap; } template -K AppendNested( shared_ptr array_data, size_t& index ); +K AppendNested( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ); template<> -K AppendNested( shared_ptr array_data, size_t& index ) +K AppendNested( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ) { - return AppendNestedList( array_data, index ); + return AppendNestedList( array_data, index, type_overrides ); } template<> -K AppendNested( shared_ptr array_data, size_t& index ) +K AppendNested( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ) { - return AppendNestedList( array_data, index ); + return AppendNestedList( array_data, index, type_overrides ); } template<> -K AppendNested( shared_ptr array_data, size_t& index ) +K AppendNested( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ) { - return AppendNestedList( array_data, index ); + return AppendNestedList( array_data, index, type_overrides ); } template<> -K AppendNested( shared_ptr array_data, size_t& index ) +K AppendNested( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ) { auto map_array = static_pointer_cast( array_data ); auto keys = map_array->keys(); @@ -686,18 +685,17 @@ K AppendNested( shared_ptr array_data, size_t& i size_t keys_counter = 0; size_t items_counter = 0; - InitKdbNullBitmap( keys_slice, &k_keys, keys_counter ); - InitKdbNullBitmap( items_slice, &k_items, items_counter ); + InitKdbNullBitmap( keys_slice, &k_keys, keys_counter, type_overrides ); + InitKdbNullBitmap( items_slice, &k_items, items_counter, type_overrides ); kK( k_bitmap )[i] = xD( k_keys, k_items ); } - index += length; return k_bitmap; } template<> -K AppendNested( shared_ptr array_data, size_t& index ) +K 
AppendNested( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ) { size_t length = 0; auto struct_array = static_pointer_cast( array_data ); @@ -713,16 +711,15 @@ K AppendNested( shared_ptr array_data, size_t kK( k_bitmap )[i] = ( NestedHandlers.find( type_id ) == NestedHandlers.end() ) ? ktn( KB, length ) : knk( 0 ); - InitKdbNullBitmap( field, &kK( k_bitmap )[i], counter ); + InitKdbNullBitmap( field, &kK( k_bitmap )[i], counter, type_overrides ); } - index += length; return k_bitmap; } template<> -K AppendNested( shared_ptr array_data, size_t& index ) +K AppendNested( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ) { auto union_array = static_pointer_cast( array_data ); auto length = union_array->length(); @@ -732,21 +729,20 @@ K AppendNested( shared_ptr array_data, for( int i = 0; i < num_fields; ++i ){ kH( type_ids )[i] = union_array->child_id( i ); } - K k_bitmap = knk( num_fields + 1, type_ids ); + K k_bitmap = knk( num_fields + 1, type_ids ); for( int i = 0; i < num_fields; ++i ){ auto field_array = union_array->field( i ); auto type_id = field_array->type_id(); auto field_length = field_array->length(); - TypeMappingOverride type_overrides; kH( type_ids )[i] = kx::arrowkdb::GetKdbType( field_array->type(), type_overrides ); K k_field = ( NestedHandlers.find( type_id ) == NestedHandlers.end() ) ? 
ktn( KB, field_length ) : knk( 0 ); size_t counter = 0; - InitKdbNullBitmap( field_array, &k_field, counter ); + InitKdbNullBitmap( field_array, &k_field, counter, type_overrides ); kK( k_bitmap )[i+1] = k_field; } index += length; @@ -755,16 +751,15 @@ K AppendNested( shared_ptr array_data, } template<> -K AppendNested( shared_ptr array_data, size_t& index ) +K AppendNested( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ) { - return NestedHandlers[arrow::Type::SPARSE_UNION]( array_data, index ); + return NestedHandlers[arrow::Type::SPARSE_UNION]( array_data, index, type_overrides ); } template<> -K AppendNested( shared_ptr array_data, size_t& index ) +K AppendNested( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ) { auto dictionary_array = static_pointer_cast( array_data ); - auto length = dictionary_array->length(); auto items = dictionary_array->dictionary(); auto items_type_id = items->type_id(); @@ -782,11 +777,11 @@ K AppendNested( shared_ptr array_data, si size_t items_counter = 0; size_t indices_counter = 0; - InitKdbNullBitmap( items, &k_items, items_counter ); - InitKdbNullBitmap( indices, &k_indices, indices_counter ); + InitKdbNullBitmap( items, &k_items, items_counter, type_overrides ); + InitKdbNullBitmap( indices, &k_indices, indices_counter, type_overrides ); K k_bitmap = knk( 2, k_items, k_indices ); - index += length; + index += dictionary_array->length(); return k_bitmap; } @@ -826,7 +821,7 @@ void AppendArray(shared_ptr array_data, K k_array, size_t& index, } } -void InitKdbNullBitmap( shared_ptr array_data, K* k_bitmap, size_t& index ) +void InitKdbNullBitmap( shared_ptr array_data, K* k_bitmap, size_t& index, TypeMappingOverride& type_overrides ) { auto type_id = array_data->type_id(); auto length = array_data->length(); @@ -837,7 +832,7 @@ void InitKdbNullBitmap( shared_ptr array_data, K* k_bitmap, size_t } else{ auto pos = index; - *k_bitmap = jk( k_bitmap, NestedHandlers[type_id]( 
array_data, index ) ); + *k_bitmap = jk( k_bitmap, NestedHandlers[type_id]( array_data, index, type_overrides ) ); i += index - pos - 1; } } @@ -915,7 +910,7 @@ K ReadChunkedNullBitmap( shared_ptr chunked_array, TypeMapp size_t index = 0; for( auto i = 0; i < chunked_array->num_chunks(); ++i ){ - InitKdbNullBitmap( chunked_array->chunk( i ), &k_bitmap, index ); + InitKdbNullBitmap( chunked_array->chunk( i ), &k_bitmap, index, type_overrides ); } return k_bitmap; diff --git a/src/ArrayReader.h b/src/ArrayReader.h index 2fdd5b8..fbb2662 100644 --- a/src/ArrayReader.h +++ b/src/ArrayReader.h @@ -21,6 +21,7 @@ namespace arrowkdb { * list needs to have been created with the correct length by the calling * function. * @param index The index into the kdb list at which the appending should + * @param type_overrides Overrides for type mappings configured by KdbOptions * begin. Index will be updated to account for the new offset by adding the * length of the array array. */ @@ -30,6 +31,7 @@ void AppendArray(std::shared_ptr array_data, K k_array, size_t& in * @brief Copies and converts an arrow array to a kdb list * * @param array The arrow array to be converted + * @param type_overrides Overrides for type mappings configured by KdbOptions * @return A kdb list represented the arrow array */ K ReadArray(std::shared_ptr array, TypeMappingOverride& type_overrides); @@ -41,6 +43,7 @@ K ReadArray(std::shared_ptr array, TypeMappingOverride& type_overr * into the list. 
* * @param chunked_array The chunked array to be converted + * @param type_overrides Overrides for type mappings configured by KdbOptions * @return A kdb list representing the chunked array */ K ReadChunkedArray(std::shared_ptr chunked_array, TypeMappingOverride& type_overrides); @@ -49,6 +52,7 @@ K ReadChunkedArray(std::shared_ptr chunked_array, TypeMappi * @brief Extracts nulls bitmap of an arrow array into a boolean kdb list * * @param chunked_array The chunked array to be converted + * @param type_overrides Overrides for type mappings configured by KdbOptions * @return A kdb list representing the nulls bitmap */ K ReadChunkedNullBitmap( std::shared_ptr chunked_array, TypeMappingOverride& type_overrides ); @@ -60,6 +64,7 @@ K ReadChunkedNullBitmap( std::shared_ptr chunked_array, Typ * * @param datatype The arrow datatype to be stored in the kdb list * @param length The required length of the kdb list + * @param type_overrides Overrides for type mappings configured by KdbOptions * @return Newly created kdb list */ K InitKdbForArray(std::shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides); @@ -76,8 +81,10 @@ K InitKdbForArray(std::shared_ptr datatype, size_t length, Type * @param index The index into the kdb list at which the appending should * begin. Index will be updated to account for the new offset by adding the * length of the array array. 
+ * @param type_overrides Overrides for type mappings configured by KdbOptions + * In null bitmap is used for overriding key types of unions */ -void InitKdbNullBitmap( std::shared_ptr array_data, K* k_bitmap, size_t& index ); +void InitKdbNullBitmap( std::shared_ptr array_data, K* k_bitmap, size_t& index, TypeMappingOverride& type_overrides ); } // namespace arrowkdb } // namespace kx From 47fc9405597cdf28bf9a29c4e28ddafb2bf82b41 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 6 Mar 2023 12:09:05 +0300 Subject: [PATCH 240/276] Unit-test for union null bitmaps --- examples/null_bitmap.q | 10 ++- tests/.gitignore | 1 + tests/null_bitmap/.gitignore | 3 - tests/null_bitmap/union_null_bitmap.t | 98 +++++++++++++++++++++++++++ tests/null_mapping/.gitignore | 7 -- 5 files changed, 106 insertions(+), 13 deletions(-) delete mode 100644 tests/null_bitmap/.gitignore create mode 100644 tests/null_bitmap/union_null_bitmap.t delete mode 100644 tests/null_mapping/.gitignore diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index 5af3844..742220d 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -126,6 +126,10 @@ bin_data[1]:"x"$"acknowledge" t64_data:5?(12:00:00.000000000;13:00:00.000000000;14:00:00.000000000;15:00:00.000000000;16:00:00.000000000); t64_data[2]:00:00:00.123456789; +// Create the data for the union child fields +i64_data:N?100; +i64_data[0]:1; + // Combine the data for primitive columns bitmap_data:(ts_data;bool_data;i32_data;f64_data;str_data;d32_data); @@ -140,7 +144,7 @@ map_data:((enlist 1)!(enlist 1f);(2 2)!(2 2.34f);(3 3 3)!(3 3 3f)) dict_data:(dict_data;map_data); // Combine the array data for the list and struct columns -sparse_data:dense_data:(0 1 0h;1 2 3;4 5 6f) +sparse_data:dense_data:(0 1 0h;5 2 3;4 2.34 6f) union_data:(sparse_data;dense_data) // Pretty print the Arrow table populated from the bitmap data @@ -208,7 +212,7 @@ nested_list_nulls:(enlist 1b;00b;000b;0000b;00001b); 
nested_struct_nulls:(10000b;01000b;00100b); nested_dict_nulls:(000b;000b); nested_map_nulls:((enlist 0b)!(enlist 0b);00b!01b;000b!000b); -nested_union_nulls:((7h;9h);000b;000b); +nested_union_nulls:((0 1 0h);100b;010b); parquet_bitmap_nulls:last parquet_bitmap_data; parquet_list_nulls:first parquet_struct_data[1] @@ -231,7 +235,7 @@ rm parquet_nested_dict; // Write the schema and array data to an arrow file arrow_null_bitmap:"null_bitmap.arrow"; -arrow_struct_bitmap:"nested_bitmap.arrow"; +arrow_struct_bitmap:"nested_struct.arrow"; arrow_dict_bitmap:"nested_dict.arrow"; arrow_union_bitmap:"nested_union.arrow"; diff --git a/tests/.gitignore b/tests/.gitignore index b0b3e83..5415741 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -2,6 +2,7 @@ basic.q crucial_null_bitmap.q glossary_null_bitmap.q nested_null_bitmap.q +union_null_bitmap.q null_mapping_short.q null_mapping_long.q null_mapping_float.q diff --git a/tests/null_bitmap/.gitignore b/tests/null_bitmap/.gitignore deleted file mode 100644 index 3857116..0000000 --- a/tests/null_bitmap/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -crucial_null_bitmap.q -glossary_null_bitmap.q -nested_null_bitmap.q diff --git a/tests/null_bitmap/union_null_bitmap.t b/tests/null_bitmap/union_null_bitmap.t new file mode 100644 index 0000000..2f6580e --- /dev/null +++ b/tests/null_bitmap/union_null_bitmap.t @@ -0,0 +1,98 @@ +// union_null_bitmap.q + +-1"\n+----------|| Import the arrowkdb library ||----------+\n"; +\l q/arrowkdb.q + +-1"\n+----------|| Filesystem functions for Linux/MacOS/Windows ||----------+\n"; +rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; + +-1"\n+----------|| Support null mapping ||----------+\n"; +nested_union_opts:(`float64`int64)!(2.34;5); +union_options:(``NULL_MAPPING)!((::);nested_union_opts); +N:5 + +-1"\n+----------|| Create the datatype identifiers ||----------+\n"; +ts_dt:.arrowkdb.dt.timestamp[`nano]; + +f64_dt:.arrowkdb.dt.float64[]; +i64_dt:.arrowkdb.dt.int64[]; 
+ +-1"\n+----------|| Create the field identifiers ||----------+\n"; +ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; + +f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; +i64_fd:.arrowkdb.fd.field[`int64;i64_dt]; + +-1"\n+----------|| Create fields containing union datatypes ||----------+\n"; +sparse_dt:.arrowkdb.dt.sparse_union[(i64_fd,f64_fd)] +sparse_fd:.arrowkdb.fd.field[`sparse_union;sparse_dt] +dense_dt:.arrowkdb.dt.dense_union[(i64_fd,f64_fd)] +dense_fd:.arrowkdb.fd.field[`dense_union;dense_dt] + +-1"\n+----------|| Create the schema containing the sparce and dense union fields ||----------+\n"; +union_schema:.arrowkdb.sc.schema[(sparse_fd, dense_fd)] + +-1"\n+----------|| Create data for each column in the table ||----------+\n"; +ts_data:asc N?0p; + +f64_data:N?100f; +f64_data[0]:2.34f; +i64_data:N?100h; +i64_data[1]:5h; + +-1"\n+----------|| Create the data the union child fields ||----------+\n"; +i64_data:N?100; +i64_data[0]:1; + +-1"\n+----------|| Combine the array data for the list and struct columns ||----------+\n"; +sparse_data:dense_data:(0 1 0h;5 2 3;4 2.34 6f) +union_data:(sparse_data;dense_data) + +-1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; +union_options[`WITH_NULL_BITMAP]:1; +arrow_union_bitmap:"nested_union.arrow"; +.arrowkdb.ipc.writeArrow[arrow_union_bitmap;union_schema;union_data;union_options]; + +-1"\n+----------|| Read the schema back and compare ||----------+\n"; +arrow_union_schema:.arrowkdb.ipc.readArrowSchema[arrow_union_bitmap]; +.arrowkdb.sc.equalSchemas[union_schema;arrow_union_schema] +union_schema~arrow_union_schema + +-1"\n+----------|| Read the array data back and compare ||----------+\n"; +arrow_union_data:.arrowkdb.ipc.readArrowData[arrow_union_bitmap;union_options]; +union_data~first arrow_union_data + +-1"\n+----------|| Compare null bitmaps of arrow data ||----------+\n"; +nested_union_nulls:((0 1 0h);100b;010b); + +arrow_union_nulls:arrow_union_data[1] 
+nested_union_nulls~arrow_union_nulls[0][0] +nested_union_nulls~arrow_union_nulls[1][0] + +rm arrow_union_bitmap; + +-1"\n+----------|| Serialize the schema and array data to an arrow stream ||----------+\n"; +serialized_nested_union:.arrowkdb.ipc.serializeArrow[union_schema;union_data;union_options]; + +-1"\n+----------|| Parse the schema back abd compare ||----------+\n"; +stream_union_schema:.arrowkdb.ipc.parseArrowSchema[serialized_nested_union]; +.arrowkdb.sc.equalSchemas[union_schema;stream_union_schema] +union_schema~stream_union_schema + +-1"\n+----------|| Parse the array data back and compare ||----------+\n"; +stream_union_data:.arrowkdb.ipc.parseArrowData[serialized_nested_union;union_options]; +union_data~first stream_union_data + +-1"\n+----------|| Compare null bitmaps of stream data ||----------+\n"; +stream_union_nulls:stream_union_data[1] +nested_union_nulls~stream_union_nulls[0][0] +nested_union_nulls~stream_union_nulls[1][0] + + +-1 "\n+----------|| Test utils ||----------+\n"; + +show .arrowkdb.util.buildInfo[] +(type .arrowkdb.util.buildInfo[])~99h + + +-1 "\n+----------|| Finished testing ||----------+\n"; diff --git a/tests/null_mapping/.gitignore b/tests/null_mapping/.gitignore deleted file mode 100644 index ff2bc45..0000000 --- a/tests/null_mapping/.gitignore +++ /dev/null @@ -1,7 +0,0 @@ -null_mapping_short.q -null_mapping_long.q -null_mapping_float.q -null_mapping_str.q -null_mapping_time.q -null_mapping_extra.q -null_mapping_other.q From 8f18ecc33add2d52dea53410fba8f02694f6abbe Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 6 Mar 2023 13:55:25 +0300 Subject: [PATCH 241/276] Prevent null mapping of map keys --- src/ArrayWriter.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/ArrayWriter.cpp b/src/ArrayWriter.cpp index 9ea8d68..27d9b41 100644 --- a/src/ArrayWriter.cpp +++ b/src/ArrayWriter.cpp @@ -1036,7 +1036,11 @@ void PopulateBuilder(shared_ptr datatype, K k // Populate the child builders for this map set 
from the dictionary key/value lists auto k_dict = kK(k_array)[i]; TYPE_CHECK_ITEM(99 != k_dict->t, datatype->ToString(), 99, k_dict->t); + + auto items_null_mapping = type_overrides.null_mapping; + type_overrides.null_mapping = Options::NullMapping {0}; PopulateBuilder(key_builder->type(), kK(k_dict)[0], key_builder, type_overrides); + type_overrides.null_mapping = items_null_mapping; PopulateBuilder(item_builder->type(), kK(k_dict)[1], item_builder, type_overrides); } } From 83de44631a02e546534fe9ce66a4bd7099c6ed10 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Mon, 6 Mar 2023 15:14:11 +0300 Subject: [PATCH 242/276] Fix reading of union type IDs --- src/ArrayReader.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 5568bd3..c7e8f3a 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -725,8 +725,9 @@ K AppendNested( shared_ptr array_data, auto length = union_array->length(); auto num_fields = union_array->num_fields(); - K type_ids = ktn( KH, num_fields ); - for( int i = 0; i < num_fields; ++i ){ + // The type_id array is represented as a KH list at the start of the parent mixed list. + K type_ids = ktn( KH, length ); + for( int i = 0; i < length; ++i ){ kH( type_ids )[i] = union_array->child_id( i ); } @@ -736,7 +737,6 @@ K AppendNested( shared_ptr array_data, auto type_id = field_array->type_id(); auto field_length = field_array->length(); - kH( type_ids )[i] = kx::arrowkdb::GetKdbType( field_array->type(), type_overrides ); K k_field = ( NestedHandlers.find( type_id ) == NestedHandlers.end() ) ? 
ktn( KB, field_length ) : knk( 0 ); @@ -760,6 +760,7 @@ template<> K AppendNested( shared_ptr array_data, size_t& index, TypeMappingOverride& type_overrides ) { auto dictionary_array = static_pointer_cast( array_data ); + auto length = dictionary_array->length(); auto items = dictionary_array->dictionary(); auto items_type_id = items->type_id(); @@ -781,7 +782,7 @@ K AppendNested( shared_ptr array_data, si InitKdbNullBitmap( indices, &k_indices, indices_counter, type_overrides ); K k_bitmap = knk( 2, k_items, k_indices ); - index += dictionary_array->length(); + index += length; return k_bitmap; } From bbbba534c67943358f78785f4030dc91308a1961 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 8 Mar 2023 13:46:16 +0300 Subject: [PATCH 243/276] Unit-test for structs usability improvement --- examples/null_bitmap.q | 6 +++--- tests/null_bitmap/nested_null_bitmap.t | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index 742220d..135c2ab 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -221,7 +221,7 @@ parquet_dict_nulls:parquet_dict_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count parquet_bitmap_nulls;parquet_bitmap_nulls] show nested_list_nulls~parquet_list_nulls -show nested_struct_nulls~parquet_struct_nulls[0] +show nested_struct_nulls~parquet_struct_nulls show nested_dict_nulls[0]~parquet_dict_nulls[0] show nested_map_nulls~last[parquet_dict_nulls][0] @@ -285,7 +285,7 @@ arrow_union_nulls:arrow_union_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bitmap_nulls] show nested_list_nulls~arrow_list_nulls -show nested_struct_nulls~arrow_struct_nulls[0] +show nested_struct_nulls~arrow_struct_nulls show nested_dict_nulls~first[arrow_dict_nulls][0] show nested_map_nulls~last[arrow_dict_nulls][0] show nested_union_nulls~arrow_union_nulls[0][0] @@ -347,7 +347,7 @@ stream_union_nulls:stream_union_data[1] show 
bitmap_nulls~bitmap_nulls & sublist[{1-x} count stream_bitmap_nulls;stream_bitmap_nulls] show nested_list_nulls~stream_list_nulls -show nested_struct_nulls~stream_struct_nulls[0] +show nested_struct_nulls~stream_struct_nulls show nested_dict_nulls~first[stream_dict_nulls][0] show nested_map_nulls~last[stream_dict_nulls][0] show nested_union_nulls~stream_union_nulls[0][0] diff --git a/tests/null_bitmap/nested_null_bitmap.t b/tests/null_bitmap/nested_null_bitmap.t index 57f4bae..a3f7706 100644 --- a/tests/null_bitmap/nested_null_bitmap.t +++ b/tests/null_bitmap/nested_null_bitmap.t @@ -91,7 +91,7 @@ nested_struct_nulls:(10000b;01000b;00100b) parquet_list_nulls:first parquet_nested_data[1] parquet_struct_nulls:last parquet_nested_data[1] nested_list_nulls~parquet_list_nulls -nested_struct_nulls~parquet_struct_nulls[0] +nested_struct_nulls~parquet_struct_nulls rm parquet_nested_bitmap; @@ -112,7 +112,7 @@ nested_data~first arrow_nested_data arrow_list_nulls:first arrow_nested_data[1] arrow_struct_nulls:last arrow_nested_data[1] nested_list_nulls~arrow_list_nulls -nested_struct_nulls~arrow_struct_nulls[0] +nested_struct_nulls~arrow_struct_nulls rm arrow_nested_bitmap; @@ -132,7 +132,7 @@ nested_data~first stream_nested_data stream_list_nulls:first stream_nested_data[1] stream_struct_nulls:last stream_nested_data[1] nested_list_nulls~stream_list_nulls -nested_struct_nulls~stream_struct_nulls[0] +nested_struct_nulls~stream_struct_nulls -1 "\n+----------|| Test utils ||----------+\n"; From aa67a6d4de1c54a13e3f1186000713b7c0f307f2 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 8 Mar 2023 13:46:47 +0300 Subject: [PATCH 244/276] Struct usability improvement, joining sublists --- src/ArrayReader.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index c7e8f3a..7e72a8a 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -831,6 +831,11 @@ void InitKdbNullBitmap( shared_ptr array_data, K* k_bitmap, size_t 
if( NestedHandlers.find( type_id ) == NestedHandlers.end() ){ kG( *k_bitmap )[index++] = array_data->IsNull( i ); } + else if( arrow::Type::STRUCT == type_id ){ + auto pos = index; + *k_bitmap = jv( k_bitmap, NestedHandlers[type_id]( array_data, index, type_overrides ) ); + i += index - pos - 1; + } else{ auto pos = index; *k_bitmap = jk( k_bitmap, NestedHandlers[type_id]( array_data, index, type_overrides ) ); From d5d2ae9376435ca413214ce2c31b24b3a47e3ce2 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 8 Mar 2023 20:49:25 +0300 Subject: [PATCH 245/276] Map usability improvement --- src/ArrayReader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index 7e72a8a..c43cd28 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -831,7 +831,7 @@ void InitKdbNullBitmap( shared_ptr array_data, K* k_bitmap, size_t if( NestedHandlers.find( type_id ) == NestedHandlers.end() ){ kG( *k_bitmap )[index++] = array_data->IsNull( i ); } - else if( arrow::Type::STRUCT == type_id ){ + else if( arrow::Type::STRUCT == type_id || arrow::Type::MAP == type_id ){ auto pos = index; *k_bitmap = jv( k_bitmap, NestedHandlers[type_id]( array_data, index, type_overrides ) ); i += index - pos - 1; From 99bb483192932972031224b83202cda5c3761672 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Wed, 8 Mar 2023 20:50:24 +0300 Subject: [PATCH 246/276] Unit-test for map usability improvement --- examples/batching_tables.q | 7 +++- examples/null_bitmap.q | 8 ++-- tests/null_bitmap/glossary_null_bitmap.t | 52 +++++++++--------------- 3 files changed, 28 insertions(+), 39 deletions(-) diff --git a/examples/batching_tables.q b/examples/batching_tables.q index e98a87f..1078c3d 100644 --- a/examples/batching_tables.q +++ b/examples/batching_tables.q @@ -26,7 +26,7 @@ batching_options:(``PARQUET_VERSION)!((::);`V2.0) parquet_batching:"batching_table.parquet"; 
.arrowkdb.pq.writeParquetFromTable[parquet_batching;batching_table;batching_options] show ls parquet_batching -//rm parquet_batching +rm parquet_batching // Write the batching array data to an arrow file batching_options[`ARROW_CHUNK_ROWS]:214748365 @@ -34,7 +34,7 @@ batching_options[`ARROW_CHUNK_ROWS]:214748365 arrow_batching:"batching_table.arrow"; .arrowkdb.ipc.writeArrowFromTable[arrow_batching;batching_table;batching_options] show ls arrow_batching -//rm arrow_batching; +rm arrow_batching; // Serialize the batching array data to an arrow stream serialized_batching:.arrowkdb.ipc.serializeArrowFromTable[batching_table;batching_options]; @@ -42,3 +42,6 @@ show serialized_batching -1 "\n+----------------------------------------+\n"; + +// Process off +exit 0; diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index 135c2ab..66fae45 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -223,7 +223,7 @@ show bitmap_nulls~bitmap_nulls & sublist[{1-x} count parquet_bitmap_nulls;parque show nested_list_nulls~parquet_list_nulls show nested_struct_nulls~parquet_struct_nulls show nested_dict_nulls[0]~parquet_dict_nulls[0] -show nested_map_nulls~last[parquet_dict_nulls][0] +show nested_map_nulls~last[parquet_dict_nulls] rm parquet_null_bitmap; rm parquet_nested_struct; @@ -287,7 +287,7 @@ show bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bi show nested_list_nulls~arrow_list_nulls show nested_struct_nulls~arrow_struct_nulls show nested_dict_nulls~first[arrow_dict_nulls][0] -show nested_map_nulls~last[arrow_dict_nulls][0] +show nested_map_nulls~last[arrow_dict_nulls] show nested_union_nulls~arrow_union_nulls[0][0] show nested_union_nulls~arrow_union_nulls[1][0] @@ -349,11 +349,11 @@ show bitmap_nulls~bitmap_nulls & sublist[{1-x} count stream_bitmap_nulls;stream_ show nested_list_nulls~stream_list_nulls show nested_struct_nulls~stream_struct_nulls show nested_dict_nulls~first[stream_dict_nulls][0] -show 
nested_map_nulls~last[stream_dict_nulls][0] +show nested_map_nulls~last[stream_dict_nulls] show nested_union_nulls~stream_union_nulls[0][0] show nested_union_nulls~stream_union_nulls[1][0] -1 "\n+----------------------------------------+\n"; // Process off -exit 0; +//exit 0; diff --git a/tests/null_bitmap/glossary_null_bitmap.t b/tests/null_bitmap/glossary_null_bitmap.t index 15396da..8c0eab5 100644 --- a/tests/null_bitmap/glossary_null_bitmap.t +++ b/tests/null_bitmap/glossary_null_bitmap.t @@ -16,66 +16,49 @@ N:5 -1"\n+----------|| Create the datatype identifiers ||----------+\n"; ts_dt:.arrowkdb.dt.timestamp[`nano]; +str_dt:.arrowkdb.dt.utf8[]; i64_dt:.arrowkdb.dt.int64[]; f64_dt:.arrowkdb.dt.float64[]; --1"\n+----------|| Create a map datatype using the i16_dt as the key and dec_dt as its values ||----------+\n"; -map_dt:.arrowkdb.dt.map[i64_dt;f64_dt] - -1"\n+----------|| Create the field identifiers ||----------+\n"; ts_fd:.arrowkdb.fd.field[`tstamp;ts_dt]; +str_fd:.arrowkdb.fd.field[`string;str_dt]; i64_fd:.arrowkdb.fd.field[`int64;i64_dt]; f64_fd:.arrowkdb.fd.field[`float64;f64_dt]; --1"\n+----------|| Create a field containing the map datatype ||----------+\n"; +-1"\n+----------|| Create a field containing glossary datatypes ||----------+\n"; +dict_dt:.arrowkdb.dt.dictionary[str_dt;i64_dt] +dict_fd:.arrowkdb.fd.field[`dictionary;dict_dt] +map_dt:.arrowkdb.dt.map[i64_dt;f64_dt] map_fd:.arrowkdb.fd.field[`map;map_dt]; -1"\n+----------|| Create the schema containing the large list, dictionary and sparce union fields ||----------+\n"; -glossary_schema:.arrowkdb.sc.schema[(enlist map_fd)]; +glossary_schema:.arrowkdb.sc.schema[(dict_fd, map_fd)]; -1"\n+----------|| Create data for each column in the table ||----------+\n"; ts_data:asc N?0p; +str_data:N?("start";"stop";"alert";"acknowledge";""); +str_data[0]:"start" i64_data:N?100i; i64_data[0]:1i; f64_data:N?100f; f64_data[1]:2.34f; +dict_data:(("aa";"bb";"cc");(2 0 1)) map_data:((enlist 1)!(enlist 1f);(2 2)!(2 
2.34f);(3 3 3)!(3 3 3f)) -1"\n+----------|| Combine the array data for the glossary columns ||----------+\n"; -glossary_data:(enlist map_data); - --1"\n+----------|| Write the schema and array data to a parquet file ||----------+\n"; -glossary_options[`PARQUET_VERSION]:`V2.0; - -parquet_glossary_bitmap:"glossary_bitmap.parquet"; -.arrowkdb.pq.writeParquet[parquet_glossary_bitmap;glossary_schema;glossary_data;glossary_options]; - --1"\n+----------|| Read the schema back and compare ||----------+\n"; -glossary_options[`WITH_NULL_BITMAP]:1; - -parquet_glossary_schema:.arrowkdb.pq.readParquetSchema[parquet_glossary_bitmap]; -.arrowkdb.sc.equalSchemas[glossary_schema;parquet_glossary_schema] -glossary_schema~parquet_glossary_schema - --1"\n+----------|| Read the array data back and compare ||----------+\n"; -parquet_glossary_data:.arrowkdb.pq.readParquetData[parquet_glossary_bitmap;glossary_options]; -glossary_data~first parquet_glossary_data - --1"\n+----------|| Compare null bitmaps of parquet data ||----------+\n"; -glossary_nulls:((enlist 0b)!(enlist 0b);00b!01b;000b!000b) -parquet_glossary_nulls:parquet_glossary_data[1] -glossary_nulls~parquet_glossary_nulls[0][0] - -rm parquet_glossary_bitmap; +glossary_data:(dict_data;map_data); -1"\n+----------|| Write the schema and array data to an arrow file ||----------+\n"; arrow_glossary_bitmap:"nested_map.arrow"; .arrowkdb.ipc.writeArrow[arrow_glossary_bitmap;glossary_schema;glossary_data;glossary_options]; -1"\n+----------|| Read the schema back and compare ||----------+\n"; +glossary_options[`WITH_NULL_BITMAP]:1; + arrow_glossary_schema:.arrowkdb.ipc.readArrowSchema[arrow_glossary_bitmap]; .arrowkdb.sc.equalSchemas[glossary_schema;arrow_glossary_schema] glossary_schema~arrow_glossary_schema @@ -85,8 +68,11 @@ arrow_glossary_data:.arrowkdb.ipc.readArrowData[arrow_glossary_bitmap;glossary_o glossary_data~first arrow_glossary_data -1"\n+----------|| Compare null bitmaps of arrow data ||----------+\n"; 
+nested_dict_nulls:(000b;000b); +nested_map_nulls:((enlist 0b)!(enlist 0b);00b!01b;000b!000b) arrow_glossary_nulls:arrow_glossary_data[1] -glossary_nulls~arrow_glossary_nulls[0][0] +nested_dict_nulls~first arrow_glossary_nulls +nested_map_nulls~last arrow_glossary_nulls rm arrow_glossary_bitmap; @@ -104,8 +90,8 @@ glossary_data~first stream_glossary_data -1"\n+----------|| Compare null bitmaps of stream data ||----------+\n"; stream_glossary_nulls:stream_glossary_data[1] -glossary_nulls~stream_glossary_nulls[0][0] - +nested_dict_nulls~first stream_glossary_nulls +nested_map_nulls~last stream_glossary_nulls -1 "\n+----------|| Test utils ||----------+\n"; From deed3f6d85778d56b4cb1624630510c6a213da80 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 9 Mar 2023 10:15:53 +0300 Subject: [PATCH 247/276] Union usability improvement --- src/ArrayReader.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index c43cd28..7c894a5 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -831,14 +831,14 @@ void InitKdbNullBitmap( shared_ptr array_data, K* k_bitmap, size_t if( NestedHandlers.find( type_id ) == NestedHandlers.end() ){ kG( *k_bitmap )[index++] = array_data->IsNull( i ); } - else if( arrow::Type::STRUCT == type_id || arrow::Type::MAP == type_id ){ - auto pos = index; - *k_bitmap = jv( k_bitmap, NestedHandlers[type_id]( array_data, index, type_overrides ) ); - i += index - pos - 1; + else if( arrow::Type::LIST == type_id || arrow::Type::LARGE_LIST == type_id || arrow::Type::FIXED_SIZE_LIST == type_id ){ + auto pos = index; + *k_bitmap = jk( k_bitmap, NestedHandlers[type_id]( array_data, index, type_overrides ) ); + i += index - pos - 1; } else{ auto pos = index; - *k_bitmap = jk( k_bitmap, NestedHandlers[type_id]( array_data, index, type_overrides ) ); + *k_bitmap = jv( k_bitmap, NestedHandlers[type_id]( array_data, index, type_overrides ) ); i += index - pos - 1; } } From 
3f54c103c7ba633d7a3ba6046e3e52bafc237843 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Thu, 9 Mar 2023 10:16:24 +0300 Subject: [PATCH 248/276] Unit-test for union usability imporovement --- examples/null_bitmap.q | 12 ++++++------ ...{nested_null_bitmap.t => formation_null_bitmap.t} | 0 tests/null_bitmap/union_null_bitmap.t | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) rename tests/null_bitmap/{nested_null_bitmap.t => formation_null_bitmap.t} (100%) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index 66fae45..f01a394 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -286,10 +286,10 @@ arrow_union_nulls:arrow_union_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count arrow_bitmap_nulls;arrow_bitmap_nulls] show nested_list_nulls~arrow_list_nulls show nested_struct_nulls~arrow_struct_nulls -show nested_dict_nulls~first[arrow_dict_nulls][0] +show nested_dict_nulls~first[arrow_dict_nulls] show nested_map_nulls~last[arrow_dict_nulls] -show nested_union_nulls~arrow_union_nulls[0][0] -show nested_union_nulls~arrow_union_nulls[1][0] +show nested_union_nulls~arrow_union_nulls[0] +show nested_union_nulls~arrow_union_nulls[1] rm arrow_null_bitmap; rm arrow_struct_bitmap; @@ -348,10 +348,10 @@ stream_union_nulls:stream_union_data[1] show bitmap_nulls~bitmap_nulls & sublist[{1-x} count stream_bitmap_nulls;stream_bitmap_nulls] show nested_list_nulls~stream_list_nulls show nested_struct_nulls~stream_struct_nulls -show nested_dict_nulls~first[stream_dict_nulls][0] +show nested_dict_nulls~first[stream_dict_nulls] show nested_map_nulls~last[stream_dict_nulls] -show nested_union_nulls~stream_union_nulls[0][0] -show nested_union_nulls~stream_union_nulls[1][0] +show nested_union_nulls~stream_union_nulls[0] +show nested_union_nulls~stream_union_nulls[1] -1 "\n+----------------------------------------+\n"; diff --git a/tests/null_bitmap/nested_null_bitmap.t b/tests/null_bitmap/formation_null_bitmap.t similarity index 
100% rename from tests/null_bitmap/nested_null_bitmap.t rename to tests/null_bitmap/formation_null_bitmap.t diff --git a/tests/null_bitmap/union_null_bitmap.t b/tests/null_bitmap/union_null_bitmap.t index 2f6580e..2812dcf 100644 --- a/tests/null_bitmap/union_null_bitmap.t +++ b/tests/null_bitmap/union_null_bitmap.t @@ -66,8 +66,8 @@ union_data~first arrow_union_data nested_union_nulls:((0 1 0h);100b;010b); arrow_union_nulls:arrow_union_data[1] -nested_union_nulls~arrow_union_nulls[0][0] -nested_union_nulls~arrow_union_nulls[1][0] +nested_union_nulls~arrow_union_nulls[0] +nested_union_nulls~arrow_union_nulls[1] rm arrow_union_bitmap; @@ -85,8 +85,8 @@ union_data~first stream_union_data -1"\n+----------|| Compare null bitmaps of stream data ||----------+\n"; stream_union_nulls:stream_union_data[1] -nested_union_nulls~stream_union_nulls[0][0] -nested_union_nulls~stream_union_nulls[1][0] +nested_union_nulls~stream_union_nulls[0] +nested_union_nulls~stream_union_nulls[1] -1 "\n+----------|| Test utils ||----------+\n"; From 4637a4bfa80946fe93c29ebb9ad491ce97abf1e4 Mon Sep 17 00:00:00 2001 From: nmcdonnell-kx Date: Mon, 13 Mar 2023 12:08:11 +0000 Subject: [PATCH 249/276] KXI-22441 Recurse top level child lists of LIST, MAP, STRUCT, UNION, DICT * Need to support null bitmaps with readParquetRowGroups * Need to support null bitmaps with the xxxToTable q bindings --- q/arrowkdb.q | 36 ++++++++++++++++++++++++++++++++---- src/TableData.cpp | 14 ++++++++++++++ 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/q/arrowkdb.q b/q/arrowkdb.q index 8fd01f4..b3d56cd 100644 --- a/q/arrowkdb.q +++ b/q/arrowkdb.q @@ -112,11 +112,25 @@ pq.writeParquet:`arrowkdb 2:(`writeParquet;4); pq.writeParquetFromTable:{[filename;table;options] pq.writeParquet[filename;sc.inferSchema[table];value flip table;options]}; pq.readParquetSchema:`arrowkdb 2:(`readParquetSchema;1); pq.readParquetData:`arrowkdb 2:(`readParquetData;2); -pq.readParquetToTable:{[filename;options] flip (fd.fieldName 
each sc.schemaFields[pq.readParquetSchema[filename]])!(pq.readParquetData[filename;options])}; +pq.readParquetToTable:{[filename;options] + fields:fd.fieldName each sc.schemaFields[pq.readParquetSchema[filename]]; + data:pq.readParquetData[filename;options]; + $[1~options`WITH_NULL_BITMAP; + (flip fields!first data;flip fields!last data); + flip fields!data + ] + }; pq.readParquetColumn:`arrowkdb 2:(`readParquetColumn;3); pq.readParquetNumRowGroups:`arrowkdb 2:(`readParquetNumRowGroups;1); pq.readParquetRowGroups:`arrowkdb 2:(`readParquetRowGroups;4); -pq.readParquetRowGroupsToTable:{[filename;row_groups;columns;options] flip (fd.fieldName each sc.schemaFields[pq.readParquetSchema[filename]](columns))!(pq.readParquetRowGroups[filename;row_groups;columns;options])}; +pq.readParquetRowGroupsToTable:{[filename;row_groups;columns;options] + fields:fd.fieldName each sc.schemaFields[pq.readParquetSchema[filename]](columns); + data:pq.readParquetRowGroups[filename;row_groups;columns;options]; + $[1~options`WITH_NULL_BITMAP; + (flip fields!first data;flip fields!last data); + flip fields!data + ] + }; // arrow files @@ -124,7 +138,14 @@ ipc.writeArrow:`arrowkdb 2:(`writeArrow;4); ipc.writeArrowFromTable:{[filename;table;options] ipc.writeArrow[filename;sc.inferSchema[table];value flip table;options]}; ipc.readArrowSchema:`arrowkdb 2:(`readArrowSchema;1); ipc.readArrowData:`arrowkdb 2:(`readArrowData;2); -ipc.readArrowToTable:{[filename;options] flip (fd.fieldName each sc.schemaFields[ipc.readArrowSchema[filename]])!(ipc.readArrowData[filename;options])}; +ipc.readArrowToTable:{[filename;options] + fields:fd.fieldName each sc.schemaFields[ipc.readArrowSchema[filename]]; + data:ipc.readArrowData[filename;options]; + $[1~options`WITH_NULL_BITMAP; + (flip fields!first data;flip fields!last data); + flip fields!data + ] + }; // arrow streams @@ -132,7 +153,14 @@ ipc.serializeArrow:`arrowkdb 2:(`serializeArrow;3); ipc.serializeArrowFromTable:{[table;options] 
ipc.serializeArrow[sc.inferSchema[table];value flip table;options]}; ipc.parseArrowSchema:`arrowkdb 2:(`parseArrowSchema;1); ipc.parseArrowData:`arrowkdb 2:(`parseArrowData;2); -ipc.parseArrowToTable:{[serialized;options] flip (fd.fieldName each sc.schemaFields[ipc.parseArrowSchema[serialized]])!(ipc.parseArrowData[serialized;options])}; +ipc.parseArrowToTable:{[serialized;options] + fields:fd.fieldName each sc.schemaFields[ipc.parseArrowSchema[serialized]]; + data:ipc.parseArrowData[serialized;options]; + $[1~options`WITH_NULL_BITMAP; + (flip fields!first data;flip fields!last data); + flip fields!data + ] + }; // utils diff --git a/src/TableData.cpp b/src/TableData.cpp index 4b76ae6..b9ba2e9 100644 --- a/src/TableData.cpp +++ b/src/TableData.cpp @@ -446,6 +446,20 @@ K readParquetRowGroups(K parquet_file, K row_groups, K columns, K options) kK(data)[i] = kx::arrowkdb::ReadChunkedArray(chunked_array, type_overrides); } + int64_t with_null_bitmap = 0; + read_options.GetIntOption(kx::arrowkdb::Options::WITH_NULL_BITMAP, with_null_bitmap); + if (with_null_bitmap) { + K bitmap = ktn(0, col_num); + for (auto i = 0; i < col_num; ++i) { + auto chunked_array = table->column(i); + kK(bitmap)[i] = kx::arrowkdb::ReadChunkedArrayNullBitmap(chunked_array, type_overrides); + } + K array = data; + data = ktn(0, 2); + kK(data)[0] = array; + kK(data)[1] = bitmap; + } + return data; KDB_EXCEPTION_CATCH; From a7a114fb33d1c487420256d3d5759cc7036b9748 Mon Sep 17 00:00:00 2001 From: nmcdonnell-kx Date: Mon, 13 Mar 2023 12:16:17 +0000 Subject: [PATCH 250/276] KXI-22441 Recurse top level child lists of LIST, MAP, STRUCT, UNION, DICT * Some more tidy up --- src/ArrayReader.h | 34 ---------------------------------- 1 file changed, 34 deletions(-) diff --git a/src/ArrayReader.h b/src/ArrayReader.h index 18f0cef..8a0c4f2 100644 --- a/src/ArrayReader.h +++ b/src/ArrayReader.h @@ -71,40 +71,6 @@ K ReadChunkedArrayNullBitmap( std::shared_ptr chunked_array */ K InitKdbForArray(std::shared_ptr 
datatype, size_t length, TypeMappingOverride& type_overrides, GetKdbTypeCommon get_kdb_type); -/** - * @brief Appends null bitmap data from an arrow array into an existing kdb boolean - * list starting at the specified index. - * - * @param array_data The arrow array from which to source the data. The entire - * array will be appended. - * @param k_bitmap The kdb boolean list that the data should be inserted into. - * This list needs to have been created with the correct length by the calling - * function. - * @param index The index into the kdb list at which the appending should - * begin. Index will be updated to account for the new offset by adding the - * length of the array array. - * @param type_overrides Overrides for type mappings configured by KdbOptions - * In null bitmap is used for overriding key types of unions -*/ -void InitKdbNullBitmap( std::shared_ptr array_data, K* k_bitmap, size_t& index, TypeMappingOverride& type_overrides ); - -/** - * @brief Appends null bitmap data from an arrow array into an existing kdb boolean - * list starting at the specified index. - * - * @param array_data The arrow array from which to source the data. The entire - * array will be appended. - * @param k_bitmap The kdb boolean list that the data should be inserted into. - * This list needs to have been created with the correct length by the calling - * function. - * @param index The index into the kdb list at which the appending should - * begin. Index will be updated to account for the new offset by adding the - * length of the array array. 
- * @param type_overrides Overrides for type mappings configured by KdbOptions - * In null bitmap is used for overriding key types of unions -*/ -void InitKdbNullBitmap( std::shared_ptr array_data, K* k_bitmap, size_t& index, TypeMappingOverride& type_overrides ); - } // namespace arrowkdb } // namespace kx From ea0cfcd8fee7baffde07ab2830ee09c3c5fccbbd Mon Sep 17 00:00:00 2001 From: nmcdonnell-kx Date: Mon, 13 Mar 2023 14:31:11 +0000 Subject: [PATCH 251/276] KXI-22441 Recurse top level child lists of LIST, MAP, STRUCT, UNION, DICT * Don't use std::function * Use lookup for InitKdbForArray --- src/ArrayReader.cpp | 137 +++++++++++++++++++++++++----------------- src/HelperFunctions.h | 2 +- 2 files changed, 84 insertions(+), 55 deletions(-) diff --git a/src/ArrayReader.cpp b/src/ArrayReader.cpp index d958d0d..353d709 100644 --- a/src/ArrayReader.cpp +++ b/src/ArrayReader.cpp @@ -21,8 +21,8 @@ using namespace kx::arrowkdb; namespace { -typedef std::function array_data, TypeMappingOverride& type_overrides)> ReadArrayCommon; -typedef std::function array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides)> AppendArrayCommon; +typedef K(*ReadArrayCommon)(std::shared_ptr array_data, TypeMappingOverride& type_overrides); +typedef void(*AppendArrayCommon)(std::shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides); // An arrow list array is a nested set of child lists. 
This is represented in // kdb as a mixed list for the parent list array containing a set of sub-lists, @@ -691,6 +691,75 @@ unordered_map NullBitmapHandlers{ , make_append_array_null_bitmap_handler() }; +typedef K(*InitKdbForArrayHandler)(shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides, GetKdbTypeCommon get_kdb_type); + +extern unordered_map InitKdbForArrayHandlers; + +template +K InitKdbForArray(shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides, GetKdbTypeCommon get_kdb_type); + +template<> +K InitKdbForArray(shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides, GetKdbTypeCommon get_kdb_type) +{ + // Arrow struct becomes a mixed list of lists so create necessary lists + auto num_fields = datatype->num_fields(); + K result = knk(num_fields); + for (auto i = 0; i < num_fields; ++i) { + auto field = datatype->field(i); + kK(result)[i] = InitKdbForArray(field->type(), length, type_overrides, get_kdb_type); + } + return result; +} + +template<> +K InitKdbForArray(shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides, GetKdbTypeCommon get_kdb_type) +{ + // Arrow union becomes a mixed list of type_id list plus the child lists + auto num_fields = datatype->num_fields(); + K result = knk(num_fields + 1); + kK(result)[0] = ktn(KH, length); // type_id list + for (auto i = 0; i < num_fields; ++i) { + auto field = datatype->field(i); + kK(result)[i + 1] = InitKdbForArray(field->type(), length, type_overrides, get_kdb_type); + } + return result; +} + +template<> +K InitKdbForArray(shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides, GetKdbTypeCommon get_kdb_type) +{ + return InitKdbForArray(datatype, length, type_overrides, get_kdb_type); +} + +template<> +K InitKdbForArray(shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides, GetKdbTypeCommon get_kdb_type) +{ + // Arrow dictionary becomes a two item mixed list + auto dictionary_type = 
static_pointer_cast(datatype); + K result = ktn(0, 2); + + // Do not preallocate the child lists since AppendDictionary has to join to the + // indicies and values lists + kK(result)[0] = InitKdbForArray(dictionary_type->value_type(), 0, type_overrides, get_kdb_type); + kK(result)[1] = InitKdbForArray(dictionary_type->index_type(), 0, type_overrides, get_kdb_type); + + return result; +} + +template +auto make_init_kdb_for_array_handler() +{ + return make_pair(TypeId, &InitKdbForArray); +} + +unordered_map InitKdbForArrayHandlers{ + make_init_kdb_for_array_handler() + , make_init_kdb_for_array_handler() + , make_init_kdb_for_array_handler() + , make_init_kdb_for_array_handler() +}; + + } // namespace namespace kx { @@ -698,27 +767,22 @@ namespace arrowkdb { void AppendArray(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto type_id = array_data->type_id(); - if( ArrayHandlers.find( type_id ) == ArrayHandlers.end() ) - { + auto lookup = ArrayHandlers.find(array_data->type_id()); + if (lookup == ArrayHandlers.end()) { TYPE_CHECK_UNSUPPORTED(array_data->type()->ToString()); - } - else - { - ArrayHandlers[type_id]( array_data, k_array, index, type_overrides ); + } else { + lookup->second(array_data, k_array, index, type_overrides); } } void AppendArrayNullBitmap(shared_ptr array_data, K k_array, size_t& index, TypeMappingOverride& type_overrides) { - auto type_id = array_data->type_id(); - if (NullBitmapHandlers.find(type_id) == NullBitmapHandlers.end()) - { + auto lookup = NullBitmapHandlers.find(array_data->type_id()); + if (lookup == NullBitmapHandlers.end()) { for (int i = 0ll; i < array_data->length(); ++i) kG(k_array)[index++] = array_data->IsNull(i); - } else - { - NullBitmapHandlers[type_id](array_data, k_array, index, type_overrides); + } else { + lookup->second(array_data, k_array, index, type_overrides); } } @@ -732,45 +796,10 @@ KdbType GetKdbTypeNullBitmap(std::shared_ptr datatype, TypeMapp K 
InitKdbForArray(shared_ptr datatype, size_t length, TypeMappingOverride& type_overrides, GetKdbTypeCommon get_kdb_type) { - switch (datatype->id()) { - case arrow::Type::STRUCT: - { - // Arrow struct becomes a mixed list of lists so create necessary lists - auto num_fields = datatype->num_fields(); - K result = knk(num_fields); - for (auto i = 0; i < num_fields; ++i) { - auto field = datatype->field(i); - kK(result)[i] = InitKdbForArray(field->type(), length, type_overrides, get_kdb_type); - } - return result; - } - case arrow::Type::SPARSE_UNION: - case arrow::Type::DENSE_UNION: - { - // Arrow union becomes a mixed list of type_id list plus the child lists - auto num_fields = datatype->num_fields(); - K result = knk(num_fields + 1); - kK(result)[0] = ktn(KH, length); // type_id list - for (auto i = 0; i < num_fields; ++i) { - auto field = datatype->field(i); - kK(result)[i + 1] = InitKdbForArray(field->type(), length, type_overrides, get_kdb_type); - } - return result; - } - case arrow::Type::DICTIONARY: - { - // Arrow dictionary becomes a two item mixed list - auto dictionary_type = static_pointer_cast(datatype); - K result = ktn(0, 2); - - // Do not preallocate the child lists since AppendDictionary has to join to the - // indicies and values lists - kK(result)[0] = InitKdbForArray(dictionary_type->value_type(), 0, type_overrides, get_kdb_type); - kK(result)[1] = InitKdbForArray(dictionary_type->index_type(), 0, type_overrides, get_kdb_type); - - return result; - } - default: + auto lookup = InitKdbForArrayHandlers.find(datatype->id()); + if (lookup != InitKdbForArrayHandlers.end()) { + return lookup->second(datatype, length, type_overrides, get_kdb_type); + } else { return ktn(get_kdb_type(datatype, type_overrides), length); } } diff --git a/src/HelperFunctions.h b/src/HelperFunctions.h index 36ac03e..63a4cef 100644 --- a/src/HelperFunctions.h +++ b/src/HelperFunctions.h @@ -176,7 +176,7 @@ std::shared_ptr GetArrowType(K k_array); // FUNCTION HANDLERS // 
/////////////////////// -typedef std::function datatype, TypeMappingOverride& type_overrides)> GetKdbTypeCommon; +typedef KdbType(*GetKdbTypeCommon)(std::shared_ptr datatype, TypeMappingOverride& type_overrides); } // namespace arrowkdb } // namespace kx From f4c4b77e13466f1921a3e08cb3ba53070e765598 Mon Sep 17 00:00:00 2001 From: nmcdonnell-kx Date: Mon, 13 Mar 2023 14:35:31 +0000 Subject: [PATCH 252/276] KXI-22441 Recurse top level child lists of LIST, MAP, STRUCT, UNION, DICT * Add Liz's docs review updates --- docs/null-bitmap.md | 52 +++++++++++------------- docs/null-mapping.md | 95 ++++++-------------------------------------- 2 files changed, 37 insertions(+), 110 deletions(-) diff --git a/docs/null-bitmap.md b/docs/null-bitmap.md index dded155..a6550a4 100755 --- a/docs/null-bitmap.md +++ b/docs/null-bitmap.md @@ -2,17 +2,21 @@ ## Problem -Arrowkdb ignores the null bitmap when reading or writing an arrow array.  Even if the null bitmap is set for an array item, arrowkdb still reads its value from the array data.  This was done for a couple of reasons:  +Previously arrowkdb ignored the null bitmap when reading or writing an arrow array. This was due to the following reasons:  -- Kdb doesn't have proper distinct null values for its types.  Using the kdb null values will result in some strange corner cases.  The only way to do this properly would be to expose the array data separately to the null bitmap (in line with how arrow represents nulls).  But this would make the API more complex.  +- Using the kdb null values will result in some strange corner cases. -- Mapping to kdb nulls would hurt the performance.  For simple types (ints, floats, etc.) arrowkdb bulk copies the entire arrow array into a kdb list using memcpy.  Having to check every array item for null then either use its value or the closest kdb null would require processing one item at a time.  +- Mapping to kdb nulls would hurt the performance. 
-However, null support in arrowkdb has been requested by users so potential implementations are to be considered. + Users have requested that arrowkdb provides a null bitmap when reading an arrow array.so that the user can use this array in their applications. -# Exposing the null bitmap to the kdb user +When reading an arrow array using arrowkdb the user can now choose to return the null bitmap as well as the data values. The shape of the null bitmap structure is exactly the same as the data structure.  It is then left to the user to interpret the two structures as appropriate for their application. + +Note: there is currently no support for null bitmap with the writer functions.  + + +## Implementation -## Approach Arrowkdb represents an arrow table (which is a set of arrow arrays) as a mixed list of lists.  This is then decorated with the field names from the schema to create a kdb table, similar to: @@ -28,7 +32,17 @@ col1 col2 col3 1760748068 2.89119e+18 2001.07.26D01:03:47.039068936 ``` -In order to avoid the limitations described above with kdb nulls, an alternative approach is to expose the null bitmap as a separate structure to kdb (more in line with how arrow represents nulls): + +Each reader function in arrowkdb takes an options dictionary.  A new `WITH_NULL_BITMAP option has been added.  When this option is set the reader functions return a two item mixed list, rather than one (the data values and null bitmap): + +```q +q)read_data_structures:.arrowkdb.pq.readParquetToTable["file.parquet";(``WITH_NULL_BITMAP)!((::);1)] +q)read_data_structures ++`col1`col2`col3!(-239800692 -930424766 1760748068i;-1.16675e+18 4.413091e+18.. ++`col1`col2`col3!(011b;110b;010b) +``` + +The null bitmap is a separate structure to kdb: ```q q)null_bitmaps:(3?0b;3?0b;3?0b) @@ -50,27 +64,9 @@ col1 col2 col3 1 0 0 ``` -The shape of the null bitmap structure would be exactly the same as the data structure.  
It is then left to the user to interpret the two structures as appropriate for their application. - -Each reader function in arrowkdb takes an options dictionary.  A new `WITH_NULL_BITMAP option would be added.  When this option is set the reader functions then return a two item mixed list (the data values and null bitmap): - -```q -q)read_data_structures:.arrowkdb.pq.readParquetToTable["file.parquet";(``WITH_NULL_BITMAP)!((::);1)] -q)read_data_structures -+`col1`col2`col3!(-239800692 -930424766 1760748068i;-1.16675e+18 4.413091e+18.. -+`col1`col2`col3!(011b;110b;010b) -``` - -Note: it would not be possible to support the null bitmap with the writer functions without a significant rework of arrowkdb.  This is because arrow arrays are built append only, it would not be possible to populate the values with a first pass (as done currently) then populate the null bitmap with a second pass.  Rather it would be necessary to populate the data values and null bitmap in a single pass which is not possible with the current design. - -## Considerations - -This approach results in an overly complicated API and would be unintuitive for kdb users (who are more familiar with and expect kdb nulls).  - -Furthermore, how would the null bitmap be used in a kdb application?  If its only purpose is to populate the data structure with kdb nulls then it will suffer the same limitations as having arrowkdb do this mapping, while introducing unnecessary complexity.  -Note: Since the null bitmap structure and data structure must have the same shape, arrow arrays which use nested datatypes (list, map, struct, union, dictionaries) where the parent array contains null values cannot be represented.  For example, an array with a struct datatype in arrow can have either null child field values or the parent struct value could be null.  The null bitmap structure will only reflect the null bitmap of the child field datatypes. 
+## Limitations -## Conclusions +- The use of a null bitmap with the writer functions is not supported.  -Exposing the null bitmap to the kdb user, while closer to how arrow represents null and makes the API more complex forms understanding that there is a clear use case where the null bitmap could be well utilised in kdb.  Also, more intuitive null mapping feature enabled in parallel may improve user experience. +- Since the null bitmap structure and data structure must have the same shape, arrow arrays which use nested datatypes (list, map, struct, union, dictionaries) where the parent array contains null values cannot be represented.  For example, an array with a struct datatype in arrow can have either null child field values or the parent struct value could be null.  The null bitmap structure will only reflect the null bitmap of the child field datatypes. diff --git a/docs/null-mapping.md b/docs/null-mapping.md index ec74880..5cefbe0 100755 --- a/docs/null-mapping.md +++ b/docs/null-mapping.md @@ -1,79 +1,10 @@ # Arrowkdb null mapping -## Background +## Problem -The basic unit for storing data in arrow is an array.  Each array contains:  +Previously arrowkdb ignored the null bitmap when reading or writing an arrow array. Users have requested that arrowkdb maps arrow nulls into kdb. 
-- Datatype identifier  - -- Length  - -- Block of data (length as above) and accessors  - -- A null bitmap (length as above) - -Arrowkdb converts an arrow array to a kdb list and vice versa with type mapping as required:  - -- Simple datatypes (ints, floats) are memcpy-ed  - -- Temporal datatypes are copied one item at a time with the appropriate epoch offsetting and scaling  - -- String and binary datatypes are copied into a mixed list of char or byte lists  - -- Nested datatypes (list, map, struct, union, dictionaries) are represented by a mixed list of sublists, depending on the child datatypes (using recursion to populate the child lists) - -Full details are provided here https://code.kx.com/q/interfaces/arrow/arrow-types/ - -# Mapping arrow nulls to kdb nulls - -## Approach - -Currently the simple datatype arrays are memcpy-ed as follows: - -```cpp - case arrow::Type::UINT16: - { - auto uint16_array = std::static_pointer_cast(array_data); - memcpy(kH(k_array), uint16_array->raw_values(), uint16_array->length() * sizeof(arrow::UInt16Array::value_type)); - break; - } -``` - -A simple approach to map arrow nulls to kdb nulls is to change this to: - -```cpp - case arrow::Type::INT16: - { - auto int16_array = std::static_pointer_cast(array_data); - - for (auto i = 0; i < int16_array->length(); ++i) - if (int16_array->IsNull(i)) - kH(k_array)[i] = INT16_MIN; - else - kH(k_array)[i] = int16_array->Value(i); - break; - } -``` - -The issue with this is that it would result in a significant drop in performance due to inevitable failures in branch prediction.  
However, with some arithmetic trickery the same functionality can be modelled without a branch: - -```cpp - case arrow::Type::INT16: - { - auto int16_array = std::static_pointer_cast(array_data); - for (auto i = 0; i < int16_array->length(); ++i) - kH(k_array)[i] = (int16_array->IsNull(i) * INT16_MIN) + (!int16_array->IsNull(i) * int16_array->Value(i)); - break; - } -``` - -Although there would still be a loss of performance (memcpy copies 64 bits at a time so doing an item by item copy where the datatype < 64 bits will be slower, plus the overhead of indexing into the null bitmap), that loss should not be significant.  - -Note: the examples above refer to the reader functions but similar functionality would be provided for the writer functions which would perform the reverse operation (setting the arrow null bitmap if the value is a kdb null). - -## Considerations - -The bigger problem is that unlike arrow, not all kdb types have a null value.  Also, those that do just overload one value in the range (typically INT_MIN or FLOAT_MIN).  +Unlike arrow, not all kdb types have a null value and those that do overload one value in the range (typically INT_MIN or FLOAT_MIN).  For example: @@ -81,15 +12,14 @@ For example: - kdb doesn't have a byte null.  -- Unlike arrow, kdb can't distinguish between a null string and empty string.  Similarly it can't distinguish between the " " character and null. - -Therefore mapping arrow nulls to kdb nulls is going to result in corner cases which can't be represented accurately.  However, the type mapping elsewhere in arrowkdb already has corner cases: +- Unlike arrow, kdb can't distinguish between: + - a null string and empty string.  + - the " " character and null. -- db has no unsigned integer types.  Arrow unsigned ints are represented in kdb as signed ints.  Therefore if the top bit of an unsigned is set in arrow, it will display as a negative number in kdb.  
-- Converting from kdb temporals to arrow temporals can result in a loss of precision. +## Implementation -A compromise would be to allow the user to specify how arrowkdb should map nulls.  Each reader and writer function in arrowkdb takes an options dictionary.  A new `NULL_MAPPING` option would be added which allows the user to specify whether an arrow datatype should be null mapped and what value to use for null.  +When reading and writing an arrow array using arrowkdb the user can now choose whether to map arrow nulls. Each reader and writer function in arrowkdb takes an options dictionary.  A new `NULL_MAPPING option has been added which allows the user to specify whether an arrow datatype should be null mapped and what value to use for null in kdb.  For example: @@ -112,10 +42,11 @@ utf8 | "" binary | `byte$() ``` -The type of each value in this dictionary must be the atomic type of the corresponding list representation for that datatype.  Where a datatype isn't present in this dictionary, arrowkdb would ignore the null bitmap (as per the existing behaviour). +The type of each value in this dictionary must be the atomic type of the corresponding list representation for that datatype.  Where a datatype isn't present in this dictionary, arrowkdb will ignore the null bitmap (as per the previous behaviour). + -Note: There is no null mapping for arrow arrays which use nested datatypes (list, map, struct, union, dictionaries) where the parent array contains null values.  For example, an array with a struct datatype in arrow can have either null child field values or the parent struct value could be null.  Arrowkdb will only map nulls for the child fields using the above mapping. +## Limitations -## Conclusions +- There is no null mapping for arrow arrays which use nested datatypes (list, map, struct, union, dictionaries) where the parent array contains null values.  
For example, an array with a struct datatype in arrow can have either null child field values or the parent struct value could be null.  Arrowkdb will only map nulls for the child fields using the above mapping. -Mapping arrow nulls to kdb nulls is considerably easier to implement and more intuitive for a kdb user. While the user may achive better precision combining the method with exposing of null bitmap. +- There is a loss of performance when choosing to map nulls, but this should not be significant.  From 01ac2650f56779abfc4f703f41d903e641e352d1 Mon Sep 17 00:00:00 2001 From: nmcdonnell-kx Date: Mon, 13 Mar 2023 15:17:48 +0000 Subject: [PATCH 253/276] KXI-22441 Recurse top level child lists of LIST, MAP, STRUCT, UNION, DICT * null mappings should be `utf8 and `large_utf8 (to match datatype constructors) - not `string and `large_string * Fix tests for above * Fix examples for above * Null examples didn't run - N wasn't set --- examples/null_bitmap.q | 5 ++++- examples/null_mapping.q | 7 +++++-- src/KdbOptions.h | 4 ++-- tests/null_bitmap/crucial_null_bitmap.t | 2 +- tests/null_mapping/null_mapping_extra.t | 2 +- tests/null_mapping/null_mapping_str.t | 2 +- 6 files changed, 14 insertions(+), 8 deletions(-) diff --git a/examples/null_bitmap.q b/examples/null_bitmap.q index f01a394..9dbb7b4 100644 --- a/examples/null_bitmap.q +++ b/examples/null_bitmap.q @@ -19,7 +19,7 @@ rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; //-------------------// // Support null mapping -bitmap_opts:(`bool`int32`float64`string`date32)!(0b;1i;2.34;"start";2006.07.21); +bitmap_opts:(`bool`int32`float64`utf8`date32)!(0b;1i;2.34;"start";2006.07.21); nested_struct_opts:(`uint16`float32`binary`time64)!(9h;8.76e;"x"$"acknowledge";00:00:00.123456789); nested_dict_opts:(enlist `int64)!(enlist 5); @@ -104,6 +104,9 @@ union_schema:.arrowkdb.sc.schema[(sparse_fd, dense_fd)] -1"\nUnion schema:"; .arrowkdb.sc.printSchema[union_schema]; +// Number of items in each 
array +N:10 + // Create data for each column in the table ts_data:asc N?0p; diff --git a/examples/null_mapping.q b/examples/null_mapping.q index 470dd29..6b4e9d7 100644 --- a/examples/null_mapping.q +++ b/examples/null_mapping.q @@ -22,11 +22,11 @@ rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; short_opts:(`bool`uint8`int8`uint16`int16)!(0b;0x01;0x02;3h;4h); long_opts:(`uint32`int32`uint64`int64)!(5i;6i;7;8); float_opts:(`float32`float64`decimal)!(1.23e;4.56;7.89); -str_opts:(`string`binary`fixed_binary)!("start";"x"$"alert";0Ng); +str_opts:(`utf8`binary`fixed_binary)!("start";"x"$"alert";0Ng); time_opts:(`date32`timestamp`time64)!(2006.07.21;2011.01.01D00:00:00.000000000;12:00:00.000000000); // Support null mapping only in arrow -extra_opts:(`float16`large_string`large_binary`duration)!(9h;"stop";"x"$"acknowledge";12:00:00.000000000); +extra_opts:(`float16`large_utf8`large_binary`duration)!(9h;"stop";"x"$"acknowledge";12:00:00.000000000); other_opts:(`date64`time32`month_interval`day_time_interval)!(2015.01.01D00:00:00.000000000;09:01:02.042;2006.07m;12:00:00.000000000); options:(``NULL_MAPPING)!((::);short_opts,long_opts,float_opts,str_opts,time_opts,extra_opts,other_opts); @@ -127,6 +127,9 @@ other_schema:.arrowkdb.sc.schema[(ts_fd,d64_fd,t32_fd,mint_fd,dtint_fd)]; // Create the array data // //-----------------------// +// Number of items in each array +N:10 + // Create data for each column in the table ts_data:asc N?0p; diff --git a/src/KdbOptions.h b/src/KdbOptions.h index c836747..40ac9a1 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -45,8 +45,8 @@ namespace Options const std::string NM_FLOAT_16 = "float16"; const std::string NM_FLOAT_32 = "float32"; const std::string NM_FLOAT_64 = "float64"; - const std::string NM_STRING = "string"; - const std::string NM_LARGE_STRING = "large_string"; + const std::string NM_STRING = "utf8"; + const std::string NM_LARGE_STRING = "large_utf8"; const std::string NM_BINARY = "binary"; 
const std::string NM_LARGE_BINARY = "large_binary"; const std::string NM_FIXED_BINARY = "fixed_binary"; diff --git a/tests/null_bitmap/crucial_null_bitmap.t b/tests/null_bitmap/crucial_null_bitmap.t index a56c576..8443408 100644 --- a/tests/null_bitmap/crucial_null_bitmap.t +++ b/tests/null_bitmap/crucial_null_bitmap.t @@ -7,7 +7,7 @@ rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; -1"\n+----------|| Support null mapping ||----------+\n"; -crucial_opts:(`bool`int32`float64`string`date32)!(0b;1i;2.34;"start";2006.07.21); +crucial_opts:(`bool`int32`float64`utf8`date32)!(0b;1i;2.34;"start";2006.07.21); crucial_options:(``NULL_MAPPING)!((::);crucial_opts); diff --git a/tests/null_mapping/null_mapping_extra.t b/tests/null_mapping/null_mapping_extra.t index 3ac2e1b..87695d2 100644 --- a/tests/null_mapping/null_mapping_extra.t +++ b/tests/null_mapping/null_mapping_extra.t @@ -7,7 +7,7 @@ rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; -1"\n+----------|| Support null mapping only in arrow ||----------+\n"; -extra_opts:(`float16`large_string`large_binary`duration)!(9h;"stop";"x"$"acknowledge";12:00:00.000000000); +extra_opts:(`float16`large_utf8`large_binary`duration)!(9h;"stop";"x"$"acknowledge";12:00:00.000000000); options:(``NULL_MAPPING)!((::);extra_opts); diff --git a/tests/null_mapping/null_mapping_str.t b/tests/null_mapping/null_mapping_str.t index 295d533..5151c50 100644 --- a/tests/null_mapping/null_mapping_str.t +++ b/tests/null_mapping/null_mapping_str.t @@ -7,7 +7,7 @@ rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; -1"\n+----------|| Support null mapping ||----------+\n"; -str_opts:(`string`binary`fixed_binary)!("start";"x"$"alert";0Ng); +str_opts:(`utf8`binary`fixed_binary)!("start";"x"$"alert";0Ng); options:(``NULL_MAPPING)!((::);str_opts); From 8d71f49323aaad6d1a06f834d5a0e7c67b51e481 Mon Sep 17 00:00:00 2001 From: nmcdonnell-kx Date: Mon, 13 Mar 2023 15:55:13 
+0000 Subject: [PATCH 254/276] KXI-22441 Recurse top level child lists of LIST, MAP, STRUCT, UNION, DICT * another NULL mapping: fixed_binary > fixed_size_binary --- examples/null_mapping.q | 2 +- src/KdbOptions.h | 2 +- tests/null_mapping/null_mapping_str.t | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/null_mapping.q b/examples/null_mapping.q index 6b4e9d7..be083f6 100644 --- a/examples/null_mapping.q +++ b/examples/null_mapping.q @@ -22,7 +22,7 @@ rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; short_opts:(`bool`uint8`int8`uint16`int16)!(0b;0x01;0x02;3h;4h); long_opts:(`uint32`int32`uint64`int64)!(5i;6i;7;8); float_opts:(`float32`float64`decimal)!(1.23e;4.56;7.89); -str_opts:(`utf8`binary`fixed_binary)!("start";"x"$"alert";0Ng); +str_opts:(`utf8`binary`fixed_size_binary)!("start";"x"$"alert";0Ng); time_opts:(`date32`timestamp`time64)!(2006.07.21;2011.01.01D00:00:00.000000000;12:00:00.000000000); // Support null mapping only in arrow diff --git a/src/KdbOptions.h b/src/KdbOptions.h index 40ac9a1..80254b6 100644 --- a/src/KdbOptions.h +++ b/src/KdbOptions.h @@ -49,7 +49,7 @@ namespace Options const std::string NM_LARGE_STRING = "large_utf8"; const std::string NM_BINARY = "binary"; const std::string NM_LARGE_BINARY = "large_binary"; - const std::string NM_FIXED_BINARY = "fixed_binary"; + const std::string NM_FIXED_BINARY = "fixed_size_binary"; const std::string NM_DATE_32 = "date32"; const std::string NM_DATE_64 = "date64"; const std::string NM_TIMESTAMP = "timestamp"; diff --git a/tests/null_mapping/null_mapping_str.t b/tests/null_mapping/null_mapping_str.t index 5151c50..349c38f 100644 --- a/tests/null_mapping/null_mapping_str.t +++ b/tests/null_mapping/null_mapping_str.t @@ -7,7 +7,7 @@ rm:{[filename] $[.z.o like "w*";system "del ",filename;system "rm ",filename]}; -1"\n+----------|| Support null mapping ||----------+\n"; -str_opts:(`utf8`binary`fixed_binary)!("start";"x"$"alert";0Ng); 
+str_opts:(`utf8`binary`fixed_size_binary)!("start";"x"$"alert";0Ng); options:(``NULL_MAPPING)!((::);str_opts); From 2a47fe16c64a12dffb874a86c41c656d9170e085 Mon Sep 17 00:00:00 2001 From: nmcdonnell-kx Date: Mon, 13 Mar 2023 16:13:13 +0000 Subject: [PATCH 255/276] KXI-22441 Recurse top level child lists of LIST, MAP, STRUCT, UNION, DICT * Inferred schemas should be able to contains nullable fields --- src/SchemaStore.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/SchemaStore.cpp b/src/SchemaStore.cpp index da407e2..03635f3 100644 --- a/src/SchemaStore.cpp +++ b/src/SchemaStore.cpp @@ -149,13 +149,7 @@ K inferSchema(K table) auto datatype = kx::arrowkdb::GetArrowType(kK(k_array_data)[i]); // Construct each arrow field - // Converting between kdb nulls are arrow nulls would incur a massive - // performance hit (up to 10x worse with trival datatypes that could otherwise - // be memcpy'ed). Also, not all kdb types have a null value, e.g. KB, KG, KS, - // 0 of KC, 0 of KG, etc. So don't allow fields to be created as nullable - // (other than NA type which is all nulls). 
- bool nullable = datatype->id() == arrow::Type::NA; - fields.push_back(arrow::field(field_names[i], datatype, nullable)); + fields.push_back(arrow::field(field_names[i], datatype, true)); } // Create the schema with these fields From baa7679313aa36bc5b80c11ae333434853da1796 Mon Sep 17 00:00:00 2001 From: nmcdonnell-kx Date: Mon, 13 Mar 2023 16:56:54 +0000 Subject: [PATCH 256/276] KXI-22441 Recurse top level child lists of LIST, MAP, STRUCT, UNION, DICT * Clean up null-mapping.md and null-bitmap.md * Update reference.md with null options --- docs/null-bitmap.md | 58 ++++++++++++--------------- docs/null-mapping.md | 53 +++++++++++++++--------- docs/reference.md | 95 ++++++++++++++++++++++++++++++-------------- 3 files changed, 126 insertions(+), 80 deletions(-) diff --git a/docs/null-bitmap.md b/docs/null-bitmap.md index a6550a4..f5f5ae1 100755 --- a/docs/null-bitmap.md +++ b/docs/null-bitmap.md @@ -8,7 +8,7 @@ Previously arrowkdb ignored the null bitmap when reading or writing an arrow arr - Mapping to kdb nulls would hurt the performance. - Users have requested that arrowkdb provides a null bitmap when reading an arrow array.so that the user can use this array in their applications. + Users have requested that arrowkdb provides a null bitmap when reading an arrow array so that the user can use this array in their applications. When reading an arrow array using arrowkdb the user can now choose to return the null bitmap as well as the data values. The shape of the null bitmap structure is exactly the same as the data structure.  It is then left to the user to interpret the two structures as appropriate for their application. @@ -17,51 +17,45 @@ Note: there is currently no support for null bitmap with the writer functions.  ## Implementation +The null bitmap feature is supported when reading: -Arrowkdb represents an arrow table (which is a set of arrow arrays) as a mixed list of lists.  
This is then decorated with the field names from the schema to create a kdb table, similar to: +* Parquet files +* Arrow IPC files +* Arrow IPC streams + +To demonstrate it we first use the null mapping support to create a Parquet file containing nulls (although you can read null bitmaps from files generated by other writers such as PyArrow): ```q -q)field_names:`col1`col2`col3 -q)array_data:(3?0i;`float$3?0;3?0p) -q)GetTable:{flip field_names!array_data} -q)GetTable[] -col1 col2 col3 ------------------------------------------------------ --239800692 -1.16675e+18 2003.05.24D03:45:53.202889856 --930424766 4.413091e+18 2001.07.22D09:51:37.461634128 -1760748068 2.89119e+18 2001.07.26D01:03:47.039068936 +q)options:(``NULL_MAPPING)!(::;`bool`uint8`int8`uint16`int16`uint32`int32`uint64`int64`float16`float32`float64`date32`date64`month_interval`day_time_interval`timestamp`time32`time64`duration`utf8`large_utf8`binary`large_binary`fixed_size_binary!(0b;0x00;0x00;0Nh;0Nh;0Ni;0Ni;0N;0N;0Nh;0Ne;0Nf;0Nd;0Np;0Nm;0Nn;0Np;0Nt;0Nn;0Nn;"";"";`byte$"";`byte$"";`byte$"")) +q)table:([]col1:0N 1 2; col2:1.1 0n 2.2; col3:("aa"; "bb"; "")) +q).arrowkdb.pq.writeParquetFromTable["file.parquet";table;options] ``` Each reader function in arrowkdb takes an options dictionary.  A new `WITH_NULL_BITMAP option has been added.  When this option is set the reader functions return a two item mixed list, rather than one (the data values and null bitmap): ```q -q)read_data_structures:.arrowkdb.pq.readParquetToTable["file.parquet";(``WITH_NULL_BITMAP)!((::);1)] -q)read_data_structures -+`col1`col2`col3!(-239800692 -930424766 1760748068i;-1.16675e+18 4.413091e+18.. 
-+`col1`col2`col3!(011b;110b;010b) +q)read_results:.arrowkdb.pq.readParquetToTable["file.parquet";(enlist `WITH_NULL_BITMAP)!enlist 1] +q)read_results ++`col1`col2`col3!(0 1 2;1.1 0n 2.2;("aa";"bb";"")) ++`col1`col2`col3!(100b;000b;001b) ``` The null bitmap is a separate structure to kdb: ```q -q)null_bitmaps:(3?0b;3?0b;3?0b) -q)GetTableWithNulls:{((flip field_names!array_data);(flip field_names!null_bitmaps))} -q)GetTableWithNulls[] -+`col1`col2`col3!(-239800692 -930424766 1760748068i;-1.16675e+18 4.413091e+18.. -+`col1`col2`col3!(011b;110b;010b) -q)first GetTableWithNulls[] -col1 col2 col3 ------------------------------------------------------ --239800692 -1.16675e+18 2003.05.24D03:45:53.202889856 --930424766 4.413091e+18 2001.07.22D09:51:37.461634128 -1760748068 2.89119e+18 2001.07.26D01:03:47.039068936 -q)last GetTableWithNulls[] -col1 col2 col3 --------------- -0 1 0 -1 1 1 -1 0 0 +q)first read_results +col1 col2 col3 +-------------- +0 1.1 "aa" +1 "bb" +2 2.2 "" +q)last read_results +col1 col2 col3 +-------------- +1 0 0 +0 0 0 +0 0 1 ``` diff --git a/docs/null-mapping.md b/docs/null-mapping.md index 5cefbe0..589ff6b 100755 --- a/docs/null-mapping.md +++ b/docs/null-mapping.md @@ -4,7 +4,7 @@ Previously arrowkdb ignored the null bitmap when reading or writing an arrow array. Users have requested that arrowkdb maps arrow nulls into kdb. -Unlike arrow, not all kdb types have a null value and those that do overload one value in the range (typically INT_MIN or FLOAT_MIN).  +Unlike arrow, not all kdb types have a null value and those that do overload one value in the range (the 0N* values typically map to INT_MIN or FLOAT_MIN).  For example: @@ -19,27 +19,42 @@ For example: ## Implementation -When reading and writing an arrow array using arrowkdb the user can now choose whether to map arrow nulls. Each reader and writer function in arrowkdb takes an options dictionary.  
A new `NULL_MAPPING option has been added which allows the user to specify whether an arrow datatype should be null mapped and what value to use for null in kdb.  +When reading and writing an arrow array using arrowkdb the user can now choose whether to map arrow nulls. Each reader and writer function in arrowkdb takes an options dictionary.  A new `NULL_MAPPING option containing a dictionary of datatypes > null values has been added which allows the user to specify whether an arrow datatype should be null mapped and what value to use for null in kdb. -For example: +> :warning: **An identify function (::) may be required in the options dictionary values** +> +> The options dictionary values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. Therefore if the only set option is NULL_MAPPING, an additional empty key and corresponding value identity function (::) must be included in the options. + +The following Arrow datatype are supported, along with possible null mapping values: ```q -q)options[`NULL_MAPPING] -int16 | 0Nh -int32 | 0Ni -int64 | 0N -float32 | 0Ne -float64 | 0n -date32 | 0Nd -date64 | 0Np -month_interval | 0Nm -day_time_interval| 0Nn -timestamp | 0Np -time32 | 0Nt -time64 | 0Nn -duration | 0Nn -utf8 | "" -binary | `byte$() +q)options:(``NULL_MAPPING)!(::;`bool`uint8`int8`uint16`int16`uint32`int32`uint64`int64`float16`float32`float64`date32`date64`month_interval`day_time_interval`timestamp`time32`time64`duration`utf8`large_utf8`binary`large_binary`fixed_size_binary!(0b;0x00;0x00;0Nh;0Nh;0Ni;0Ni;0N;0N;0Nh;0Ne;0Nf;0Nd;0Np;0Nm;0Nn;0Np;0Nt;0Nn;0Nn;"";"";`byte$"";`byte$"";`byte$"")) +q)options`NULL_MAPPING +bool | 0b +uint8 | 0x00 +int8 | 0x00 +uint16 | 0Nh +int16 | 0Nh +uint32 | 0Ni +int32 | 0Ni +uint64 | 0N +int64 | 0N +float16 | 0Nh +float32 | 0Ne +float64 | 0n +date32 | 0Nd +date64 | 0Np +month_interval | 0Nm +day_time_interval| 0Nn +timestamp | 0Np +time32 | 0Nt +time64 | 0Nn +duration | 0Nn +utf8 | "" +large_utf8 | "" +binary | `byte$() 
+large_binary | `byte$() +fixed_size_binary| `byte$() ``` The type of each value in this dictionary must be the atomic type of the corresponding list representation for that datatype.  Where a datatype isn't present in this dictionary, arrowkdb will ignore the null bitmap (as per the previous behaviour). diff --git a/docs/reference.md b/docs/reference.md index 0af5cb1..1c65832 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -1602,8 +1602,8 @@ returns the schema identifier q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] q)f2:.arrowkdb.fd.field[`float_field;.arrowkdb.dt.float64[]] q).arrowkdb.sc.printSchema[.arrowkdb.sc.schema[(f1,f2)]] -int_field: int64 not null -float_field: double not null +int_field: int64 +float_field: double ``` ### `sc.inferSchema` @@ -1648,11 +1648,9 @@ returns list of field identifiers used by the schema q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] q)f2:.arrowkdb.fd.field[`float_field;.arrowkdb.dt.float64[]] q)schema:.arrowkdb.sc.schema[(f1,f2)] -q).arrowkdb.fd.printField each .arrowkdb.sc.schemaFields[schema] -int_field: int64 not null -float_field: double not null -:: -:: +q).arrowkdb.fd.printField each .arrowkdb.sc.schemaFields[schema]; +int_field: int64 +float_field: double ``` ## Schema management @@ -1680,9 +1678,9 @@ q)f2:.arrowkdb.fd.field[`float_field;.arrowkdb.dt.float64[]] q)f3:.arrowkdb.fd.field[`str_field;.arrowkdb.dt.utf8[]] q)schema:.arrowkdb.sc.schema[(f1,f2,f3)] q).arrowkdb.sc.printSchema[schema] -int_field: int64 not null -float_field: double not null -str_field: string not null +int_field: int64 +float_field: double +str_field: string ``` ### `sc.listSchemas` @@ -1778,7 +1776,7 @@ Where: - `datatype_id` is the datatype identifier of the array - `list` is the kdb+ list data to be displayed -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`. 
+- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. the function @@ -1788,6 +1786,7 @@ the function Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. > :warning: **For debugging use only** > @@ -1814,13 +1813,17 @@ q).arrowkdb.ar.prettyPrintArray[int_datatype;(1 2 3j);::] Where: - `list` is the kdb+ list data to be displayed -- `options` is reserved for future use - specify generic null (`::`) +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. the function 1. prints array contents to stdout 1. returns generic null +Supported options: + +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. + The kdb+ list type is mapped to an Arrow datatype as described [here](#inferreddatatypes). > :warning: **For debugging use only** @@ -1850,7 +1853,7 @@ Where: - `schema_id` is the schema identifier of the table - `array_data` is a mixed list of array data -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list; values list can be `7h`, `11h` or mixed list of -7|-11|4h +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. 
Dictionary key must be a `11h` list; values list can be `7h`, `11h` or mixed list of -7|-11|4|99h the function @@ -1862,6 +1865,7 @@ The mixed list of Arrow array data should be ordered in schema field number and Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. > :warning: **For debugging use only** > @@ -1914,7 +1918,7 @@ str_field: Where: - `table` is a kdb+ table -- `options` is reserved for future use - specify generic null (`::`) +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list; values list can be `7h`, `11h` or mixed list of -7|-11|4|99h the function @@ -1923,6 +1927,10 @@ the function Each column in the table is mapped to a field in the schema. The column name is used as the field name and the column’s kdb+ type is mapped to an Arrow datatype as as described [here](#inferreddatatypes). +Supported options: + +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. + > :warning: **Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors** > > Each column in the table is mapped to a field in the schema. The column name is used as the field name and the column’s kdb+ type is mapped to an Arrow datatype as as described [here](#inferred-datatypes). @@ -1978,7 +1986,7 @@ Where: - `parquet_file` is a string containing the Parquet file name - `schema_id` is the schema identifier to use for the table - `array_data` is a mixed list of array data -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. 
Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. returns generic null on success @@ -1989,6 +1997,7 @@ Supported options: - `PARQUET_CHUNK_SIZE` - Controls the approximate size of encoded data pages within a column chunk. Long, default 1MB. - `PARQUET_VERSION` - Select the Parquet format version: `V1.0`, `V2.0`, `V2.4`, `V2.6` or `V2.LATEST`. Later versions are more fully featured but may be incompatible with older Parquet implementations. Default `V1.0` - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. > :warning: **The Parquet format is compressed and designed for for maximum space efficiency which may cause a performance overhead compared to Arrow. Parquet is also less fully featured than Arrow which can result in schema limitations** > @@ -2018,7 +2027,7 @@ Where: - `parquet_file` is a string containing the Parquet file name - `table` is a kdb+ table -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. returns generic null on success @@ -2026,6 +2035,7 @@ Supported options: - `PARQUET_CHUNK_SIZE` - Controls the approximate size of encoded data pages within a column chunk. Long, default 1MB. - `PARQUET_VERSION` - Select the Parquet format version: `V1.0`, `V2.0`, `V2.4`, `V2.6` or `V2.LATEST`. 
Later versions are more fully featured but may be incompatible with older Parquet implementations. Default `V1.0` +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. > :warning: **Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors** > @@ -2073,7 +2083,7 @@ q).arrowkdb.sc.equalSchemas[schema;.arrowkdb.pq.readParquetSchema["file.parquet" Where: - `parquet_file` is a string containing the Parquet file name -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. returns the array data @@ -2082,6 +2092,8 @@ Supported options: - `PARQUET_MULTITHREADED_READ` - Flag indicating whether the Parquet reader should run in multithreaded mode. This can improve performance by processing multiple columns in parallel. Long, default 0. - `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](#null-bitmap) for more details. Long, default 0. 
```q q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -2107,13 +2119,14 @@ Where: - `parquet_file` is a string containing the Parquet file name - `column_index` is the index of the column to read, relative to the schema field order -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. returns the array’s data Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. ```q q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -2138,7 +2151,7 @@ q)col1~array_data[1] Where: - `parquet_file` is a string containing the Parquet file name -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. returns the kdb+ table @@ -2149,6 +2162,8 @@ Supported options: - `PARQUET_MULTITHREADED_READ` - Flag indicating whether the Parquet reader should run in multithreaded mode. This can improve performance by processing multiple columns in parallel. Long, default 0. - `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. 
- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](#null-bitmap) for more details. Long, default 0. ```q q)table:([] int_field:(1 2 3); float_field:(4 5 6f); str_field:("aa";"bb";"cc")) @@ -2190,7 +2205,7 @@ Where: - `parquet_file` is a string containing the Parquet file name - `row_groups` is an integer list (6h) of row groups indices to read, or generic null (`::`) to read all row groups - `columns` is an integer list (6h) of column indices to read, or generic null (`::`) to read all columns -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. returns the array data @@ -2199,6 +2214,8 @@ Supported options: - `PARQUET_MULTITHREADED_READ` - Flag indicating whether the Parquet reader should run in multithreaded mode. This can improve performance by processing multiple columns in parallel. Long, default 0. - `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. 
+- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](#null-bitmap) for more details. Long, default 0. ```q q)table:([]a:10000000#0;b:10000000#1) @@ -2224,7 +2241,7 @@ Where: - `parquet_file` is a string containing the Parquet file name - `row_groups` is an integer list (6h) of row groups indices to read, or generic null (`::`) to read all row groups - `columns` is an integer list (6h) of column indices to read, or generic null (`::`) to read all columns -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. returns the kdb+ table @@ -2233,6 +2250,8 @@ Supported options: - `PARQUET_MULTITHREADED_READ` - Flag indicating whether the Parquet reader should run in multithreaded mode. This can improve performance by processing multiple columns in parallel. Long, default 0. - `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](#null-bitmap) for more details. Long, default 0. 
```q q)table:([]a:10000000#0;b:10000000#1) @@ -2260,7 +2279,7 @@ Where: - `arrow_file` is a string containing the Arrow file name - `schema_id` is the schema identifier to use for the table - `array_data` is a mixed list of array data -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. returns generic null on success @@ -2269,6 +2288,7 @@ The mixed list of Arrow array data should be ordered in schema field number and Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. ```q q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -2294,10 +2314,14 @@ Where: - `arrow_file` is a string containing the Arrow file name - `table` is a kdb+ table -- `options` is reserved for future use - specify generic null (`::`) +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. returns generic null on success +Supported options: + +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. + > :warning: **Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors** > > Each column in the table is mapped to a field in the schema. 
The column name is used as the field name and the column’s kdb+ type is mapped to an Arrow datatype as as described [here](#inferred-datatypes). @@ -2344,7 +2368,7 @@ q).arrowkdb.sc.equalSchemas[schema;.arrowkdb.ipc.readArrowSchema["file.arrow"]] Where: - `arrow_file` is a string containing the Arrow file name -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. returns the array data @@ -2352,6 +2376,8 @@ Supported options: - `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](#null-bitmap) for more details. Long, default 0. ```q q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -2376,7 +2402,7 @@ q)read_data~array_data Where: - `arrow_file` is a string containing the Arrow file name -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. 
returns the kdb+ table @@ -2386,6 +2412,8 @@ Supported options: - `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](#null-bitmap) for more details. Long, default 0. ```q q)table:([] int_field:(1 2 3); float_field:(4 5 6f); str_field:("aa";"bb";"cc")) @@ -2409,7 +2437,7 @@ Where: - `schema_id` is the schema identifier to use for the table - `array_data` is a mixed list of array data -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. returns a byte list containing the serialized stream data @@ -2418,6 +2446,7 @@ The mixed list of Arrow array data should be ordered in schema field number and Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. 
```q q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -2442,10 +2471,14 @@ q)read_data~array_data Where: - `table` is a kdb+ table -- `options` is reserved for future use - specify generic null (`::`) +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. returns a byte list containing the serialized stream data +Supported options: + +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. + > :warning: **Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors** > > Each column in the table is mapped to a field in the schema. The column name is used as the field name and the column’s kdb+ type is mapped to an Arrow datatype as as described [here](#inferred-datatypes). @@ -2492,13 +2525,15 @@ q).arrowkdb.sc.equalSchemas[schema;.arrowkdb.ipc.parseArrowSchema[serialized]] Where: - `serialized` is a byte list containing the serialized stream data -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. returns the array data Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. 
See [here](#null-bitmap) for more details. Long, default 0. ```q q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -2523,7 +2558,7 @@ q)read_data~array_data Where: - `serialized` is a byte list containing the serialized stream data -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4h`. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. returns the kdb+ table @@ -2532,6 +2567,8 @@ Each schema field name is used as the column name and the Arrow array data is us Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](#null-bitmap) for more details. Long, default 0. 
```q q)table:([] int_field:(1 2 3); float_field:(4 5 6f); str_field:("aa";"bb";"cc")) From 7a059e7efbe6569ae6777ec279f88ee56bd2ef2a Mon Sep 17 00:00:00 2001 From: nmcdonnell-kx Date: Mon, 13 Mar 2023 17:01:35 +0000 Subject: [PATCH 257/276] KXI-22441 Recurse top level child lists of LIST, MAP, STRUCT, UNION, DICT * markdown fixes --- docs/reference.md | 54 +++++++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/docs/reference.md b/docs/reference.md index 1c65832..4dc2967 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -1786,7 +1786,7 @@ the function Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. > :warning: **For debugging use only** > @@ -1822,7 +1822,7 @@ the function Supported options: -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. The kdb+ list type is mapped to an Arrow datatype as described [here](#inferreddatatypes). @@ -1865,7 +1865,7 @@ The mixed list of Arrow array data should be ordered in schema field number and Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. 
See [here](../null-mapping.md) for more details. > :warning: **For debugging use only** > @@ -1929,7 +1929,7 @@ Each column in the table is mapped to a field in the schema. The column name is Supported options: -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. > :warning: **Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors** > @@ -1997,7 +1997,7 @@ Supported options: - `PARQUET_CHUNK_SIZE` - Controls the approximate size of encoded data pages within a column chunk. Long, default 1MB. - `PARQUET_VERSION` - Select the Parquet format version: `V1.0`, `V2.0`, `V2.4`, `V2.6` or `V2.LATEST`. Later versions are more fully featured but may be incompatible with older Parquet implementations. Default `V1.0` - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. > :warning: **The Parquet format is compressed and designed for for maximum space efficiency which may cause a performance overhead compared to Arrow. Parquet is also less fully featured than Arrow which can result in schema limitations** > @@ -2035,7 +2035,7 @@ Supported options: - `PARQUET_CHUNK_SIZE` - Controls the approximate size of encoded data pages within a column chunk. Long, default 1MB. - `PARQUET_VERSION` - Select the Parquet format version: `V1.0`, `V2.0`, `V2.4`, `V2.6` or `V2.LATEST`. 
Later versions are more fully featured but may be incompatible with older Parquet implementations. Default `V1.0` -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. > :warning: **Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors** > @@ -2092,8 +2092,8 @@ Supported options: - `PARQUET_MULTITHREADED_READ` - Flag indicating whether the Parquet reader should run in multithreaded mode. This can improve performance by processing multiple columns in parallel. Long, default 0. - `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. -- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](#null-bitmap) for more details. Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](../null-bitmap.md) for more details. Long, default 0. 
```q q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -2126,7 +2126,7 @@ returns the array’s data Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. ```q q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -2162,8 +2162,8 @@ Supported options: - `PARQUET_MULTITHREADED_READ` - Flag indicating whether the Parquet reader should run in multithreaded mode. This can improve performance by processing multiple columns in parallel. Long, default 0. - `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. -- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](#null-bitmap) for more details. Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](../null-bitmap.md) for more details. Long, default 0. 
```q q)table:([] int_field:(1 2 3); float_field:(4 5 6f); str_field:("aa";"bb";"cc")) @@ -2214,8 +2214,8 @@ Supported options: - `PARQUET_MULTITHREADED_READ` - Flag indicating whether the Parquet reader should run in multithreaded mode. This can improve performance by processing multiple columns in parallel. Long, default 0. - `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. -- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](#null-bitmap) for more details. Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](../null-bitmap.md) for more details. Long, default 0. ```q q)table:([]a:10000000#0;b:10000000#1) @@ -2250,8 +2250,8 @@ Supported options: - `PARQUET_MULTITHREADED_READ` - Flag indicating whether the Parquet reader should run in multithreaded mode. This can improve performance by processing multiple columns in parallel. Long, default 0. - `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. 
See [here](#null-mapping) for more details. -- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](#null-bitmap) for more details. Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](../null-bitmap.md) for more details. Long, default 0. ```q q)table:([]a:10000000#0;b:10000000#1) @@ -2288,7 +2288,7 @@ The mixed list of Arrow array data should be ordered in schema field number and Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. ```q q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -2320,7 +2320,7 @@ returns generic null on success Supported options: -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. > :warning: **Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors** > @@ -2376,8 +2376,8 @@ Supported options: - `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. 
- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. -- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](#null-bitmap) for more details. Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](../null-bitmap.md) for more details. Long, default 0. ```q q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -2412,8 +2412,8 @@ Supported options: - `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. -- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](#null-bitmap) for more details. Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](../null-bitmap.md) for more details. Long, default 0. 
```q q)table:([] int_field:(1 2 3); float_field:(4 5 6f); str_field:("aa";"bb";"cc")) @@ -2446,7 +2446,7 @@ The mixed list of Arrow array data should be ordered in schema field number and Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. ```q q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -2477,7 +2477,7 @@ returns a byte list containing the serialized stream data Supported options: -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. > :warning: **Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors** > @@ -2532,8 +2532,8 @@ returns the array data Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. -- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](#null-bitmap) for more details. Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. 
See [here](../null-bitmap.md) for more details. Long, default 0. ```q q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -2567,8 +2567,8 @@ Each schema field name is used as the column name and the Arrow array data is us Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](#null-mapping) for more details. -- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](#null-bitmap) for more details. Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](../null-bitmap.md) for more details. Long, default 0. ```q q)table:([] int_field:(1 2 3); float_field:(4 5 6f); str_field:("aa";"bb";"cc")) From 3acd049ca8c526fe7677daa1487b64f33a054f1f Mon Sep 17 00:00:00 2001 From: nmcdonnell-kx Date: Mon, 13 Mar 2023 17:03:54 +0000 Subject: [PATCH 258/276] KXI-22441 Recurse top level child lists of LIST, MAP, STRUCT, UNION, DICT * markdown fixes * More markdown fixes --- docs/reference.md | 54 +++++++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/docs/reference.md b/docs/reference.md index 4dc2967..b9040f2 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -1786,7 +1786,7 @@ the function Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. 
+- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. > :warning: **For debugging use only** > @@ -1822,7 +1822,7 @@ the function Supported options: -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. The kdb+ list type is mapped to an Arrow datatype as described [here](#inferreddatatypes). @@ -1865,7 +1865,7 @@ The mixed list of Arrow array data should be ordered in schema field number and Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. > :warning: **For debugging use only** > @@ -1929,7 +1929,7 @@ Each column in the table is mapped to a field in the schema. The column name is Supported options: -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. > :warning: **Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors** > @@ -1997,7 +1997,7 @@ Supported options: - `PARQUET_CHUNK_SIZE` - Controls the approximate size of encoded data pages within a column chunk. Long, default 1MB. - `PARQUET_VERSION` - Select the Parquet format version: `V1.0`, `V2.0`, `V2.4`, `V2.6` or `V2.LATEST`. 
Later versions are more fully featured but may be incompatible with older Parquet implementations. Default `V1.0` - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. > :warning: **The Parquet format is compressed and designed for for maximum space efficiency which may cause a performance overhead compared to Arrow. Parquet is also less fully featured than Arrow which can result in schema limitations** > @@ -2035,7 +2035,7 @@ Supported options: - `PARQUET_CHUNK_SIZE` - Controls the approximate size of encoded data pages within a column chunk. Long, default 1MB. - `PARQUET_VERSION` - Select the Parquet format version: `V1.0`, `V2.0`, `V2.4`, `V2.6` or `V2.LATEST`. Later versions are more fully featured but may be incompatible with older Parquet implementations. Default `V1.0` -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. > :warning: **Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors** > @@ -2092,8 +2092,8 @@ Supported options: - `PARQUET_MULTITHREADED_READ` - Flag indicating whether the Parquet reader should run in multithreaded mode. This can improve performance by processing multiple columns in parallel. Long, default 0. - `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. 
- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. -- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](../null-bitmap.md) for more details. Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md)for more details. Long, default 0. ```q q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -2126,7 +2126,7 @@ returns the array’s data Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. ```q q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -2162,8 +2162,8 @@ Supported options: - `PARQUET_MULTITHREADED_READ` - Flag indicating whether the Parquet reader should run in multithreaded mode. This can improve performance by processing multiple columns in parallel. Long, default 0. - `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. 
-- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. -- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](../null-bitmap.md) for more details. Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md)for more details. Long, default 0. ```q q)table:([] int_field:(1 2 3); float_field:(4 5 6f); str_field:("aa";"bb";"cc")) @@ -2214,8 +2214,8 @@ Supported options: - `PARQUET_MULTITHREADED_READ` - Flag indicating whether the Parquet reader should run in multithreaded mode. This can improve performance by processing multiple columns in parallel. Long, default 0. - `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. -- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](../null-bitmap.md) for more details. Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md)for more details. Long, default 0. 
```q q)table:([]a:10000000#0;b:10000000#1) @@ -2250,8 +2250,8 @@ Supported options: - `PARQUET_MULTITHREADED_READ` - Flag indicating whether the Parquet reader should run in multithreaded mode. This can improve performance by processing multiple columns in parallel. Long, default 0. - `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. -- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](../null-bitmap.md) for more details. Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md)for more details. Long, default 0. ```q q)table:([]a:10000000#0;b:10000000#1) @@ -2288,7 +2288,7 @@ The mixed list of Arrow array data should be ordered in schema field number and Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. 
```q q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -2320,7 +2320,7 @@ returns generic null on success Supported options: -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. > :warning: **Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors** > @@ -2376,8 +2376,8 @@ Supported options: - `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. -- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](../null-bitmap.md) for more details. Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md)for more details. Long, default 0. ```q q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -2412,8 +2412,8 @@ Supported options: - `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. 
-- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. -- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](../null-bitmap.md) for more details. Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md)for more details. Long, default 0. ```q q)table:([] int_field:(1 2 3); float_field:(4 5 6f); str_field:("aa";"bb";"cc")) @@ -2446,7 +2446,7 @@ The mixed list of Arrow array data should be ordered in schema field number and Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. ```q q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -2477,7 +2477,7 @@ returns a byte list containing the serialized stream data Supported options: -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. 
> :warning: **Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors** > @@ -2532,8 +2532,8 @@ returns the array data Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. -- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](../null-bitmap.md) for more details. Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md)for more details. Long, default 0. ```q q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -2567,8 +2567,8 @@ Each schema field name is used as the column name and the Arrow array data is us Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](../null-mapping.md) for more details. -- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](../null-bitmap.md) for more details. Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md)for more details. Long, default 0. 
```q q)table:([] int_field:(1 2 3); float_field:(4 5 6f); str_field:("aa";"bb";"cc")) From a77ff82375b1ddec91514a2f0c8e7f91d0a38cad Mon Sep 17 00:00:00 2001 From: nmcdonnell-kx Date: Mon, 13 Mar 2023 17:05:39 +0000 Subject: [PATCH 259/276] KXI-22441 Recurse top level child lists of LIST, MAP, STRUCT, UNION, DICT * markdown fixes * More markdown fixes * Yet another markdown tweak --- docs/reference.md | 54 +++++++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/docs/reference.md b/docs/reference.md index b9040f2..26cd844 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -1786,7 +1786,7 @@ the function Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. > :warning: **For debugging use only** > @@ -1822,7 +1822,7 @@ the function Supported options: -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. The kdb+ list type is mapped to an Arrow datatype as described [here](#inferreddatatypes). @@ -1865,7 +1865,7 @@ The mixed list of Arrow array data should be ordered in schema field number and Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. 
+- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. > :warning: **For debugging use only** > @@ -1929,7 +1929,7 @@ Each column in the table is mapped to a field in the schema. The column name is Supported options: -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. > :warning: **Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors** > @@ -1997,7 +1997,7 @@ Supported options: - `PARQUET_CHUNK_SIZE` - Controls the approximate size of encoded data pages within a column chunk. Long, default 1MB. - `PARQUET_VERSION` - Select the Parquet format version: `V1.0`, `V2.0`, `V2.4`, `V2.6` or `V2.LATEST`. Later versions are more fully featured but may be incompatible with older Parquet implementations. Default `V1.0` - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. > :warning: **The Parquet format is compressed and designed for for maximum space efficiency which may cause a performance overhead compared to Arrow. Parquet is also less fully featured than Arrow which can result in schema limitations** > @@ -2035,7 +2035,7 @@ Supported options: - `PARQUET_CHUNK_SIZE` - Controls the approximate size of encoded data pages within a column chunk. Long, default 1MB. - `PARQUET_VERSION` - Select the Parquet format version: `V1.0`, `V2.0`, `V2.4`, `V2.6` or `V2.LATEST`. 
Later versions are more fully featured but may be incompatible with older Parquet implementations. Default `V1.0` -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. > :warning: **Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors** > @@ -2092,8 +2092,8 @@ Supported options: - `PARQUET_MULTITHREADED_READ` - Flag indicating whether the Parquet reader should run in multithreaded mode. This can improve performance by processing multiple columns in parallel. Long, default 0. - `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. -- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md)for more details. Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md) for more details. Long, default 0. ```q q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -2126,7 +2126,7 @@ returns the array’s data Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). 
Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. ```q q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -2162,8 +2162,8 @@ Supported options: - `PARQUET_MULTITHREADED_READ` - Flag indicating whether the Parquet reader should run in multithreaded mode. This can improve performance by processing multiple columns in parallel. Long, default 0. - `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. -- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md)for more details. Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md) for more details. Long, default 0. ```q q)table:([] int_field:(1 2 3); float_field:(4 5 6f); str_field:("aa";"bb";"cc")) @@ -2214,8 +2214,8 @@ Supported options: - `PARQUET_MULTITHREADED_READ` - Flag indicating whether the Parquet reader should run in multithreaded mode. This can improve performance by processing multiple columns in parallel. Long, default 0. - `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. 
- `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. -- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md)for more details. Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md) for more details. Long, default 0. ```q q)table:([]a:10000000#0;b:10000000#1) @@ -2250,8 +2250,8 @@ Supported options: - `PARQUET_MULTITHREADED_READ` - Flag indicating whether the Parquet reader should run in multithreaded mode. This can improve performance by processing multiple columns in parallel. Long, default 0. - `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. -- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md)for more details. Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md) for more details. Long, default 0. 
```q q)table:([]a:10000000#0;b:10000000#1) @@ -2288,7 +2288,7 @@ The mixed list of Arrow array data should be ordered in schema field number and Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. ```q q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -2320,7 +2320,7 @@ returns generic null on success Supported options: -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. > :warning: **Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors** > @@ -2376,8 +2376,8 @@ Supported options: - `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. -- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md)for more details. Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. 
+- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md) for more details. Long, default 0. ```q q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -2412,8 +2412,8 @@ Supported options: - `USE_MMAP` - Flag indicating whether the Parquet file should be memory mapped in. This can improve performance on systems which support mmap. Long, default: 0. - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. -- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md)for more details. Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md) for more details. Long, default 0. ```q q)table:([] int_field:(1 2 3); float_field:(4 5 6f); str_field:("aa";"bb";"cc")) @@ -2446,7 +2446,7 @@ The mixed list of Arrow array data should be ordered in schema field number and Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. 
```q q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -2477,7 +2477,7 @@ returns a byte list containing the serialized stream data Supported options: -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. > :warning: **Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors** > @@ -2532,8 +2532,8 @@ returns the array data Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. -- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md)for more details. Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md) for more details. Long, default 0. ```q q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -2567,8 +2567,8 @@ Each schema field name is used as the column name and the Arrow array data is us Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. -- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md)for more details. 
-- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md)for more details. Long, default 0. +- `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. +- `WITH_NULL_BITMAP` - Flag indicating whether to return the data values and the null bitmap as separate structures. See [here](null-bitmap.md) for more details. Long, default 0. ```q q)table:([] int_field:(1 2 3); float_field:(4 5 6f); str_field:("aa";"bb";"cc")) From 82fa1f5e6baa900d3db5be27a906b246985743db Mon Sep 17 00:00:00 2001 From: nmcdonnell-kx Date: Mon, 13 Mar 2023 17:49:14 +0000 Subject: [PATCH 260/276] KXI-22441 Recurse top level child lists of LIST, MAP, STRUCT, UNION, DICT * null mapping has to support 0Nf, 0Ne - i.e. nan --- src/HelperFunctions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/HelperFunctions.h b/src/HelperFunctions.h index 63a4cef..dd06a86 100644 --- a/src/HelperFunctions.h +++ b/src/HelperFunctions.h @@ -86,7 +86,7 @@ inline bool is_equal( T lhs, T rhs ) { static const T epsilon = 2 * std::numeric_limits::epsilon(); - return ::fabs( lhs -= rhs ) <= epsilon; + return (std::isnan(lhs) && std::isnan(rhs)) || (std::fabs(lhs -= rhs) <= epsilon); } From 8c767e141f0f4d2dc23934c3ec0fb0300ad4b525 Mon Sep 17 00:00:00 2001 From: nmcdonnell-kx Date: Mon, 13 Mar 2023 18:08:19 +0000 Subject: [PATCH 261/276] KXI-22441 Recurse top level child lists of LIST, MAP, STRUCT, UNION, DICT * More cleanup --- docs/null-bitmap.md | 10 +++++----- docs/null-mapping.md | 44 +++++++++++++++++++++++++++++++++++++++++++- docs/reference.md | 38 +++++++++++++++++++------------------- 3 files changed, 67 insertions(+), 25 deletions(-) diff --git a/docs/null-bitmap.md b/docs/null-bitmap.md index f5f5ae1..55d7cb8 100755 --- a/docs/null-bitmap.md +++ b/docs/null-bitmap.md @@ -26,7 +26,7 @@ The null bitmap feature is supported when reading: To demonstrate 
it we first use the null mapping support to create a Parquet file containing nulls (although you can read null bitmaps from files generated by other writers such as PyArrow): ```q -q)options:(``NULL_MAPPING)!(::;`bool`uint8`int8`uint16`int16`uint32`int32`uint64`int64`float16`float32`float64`date32`date64`month_interval`day_time_interval`timestamp`time32`time64`duration`utf8`large_utf8`binary`large_binary`fixed_size_binary!(0b;0x00;0x00;0Nh;0Nh;0Ni;0Ni;0N;0N;0Nh;0Ne;0Nf;0Nd;0Np;0Nm;0Nn;0Np;0Nt;0Nn;0Nn;"";"";`byte$"";`byte$"";`byte$"")) +q)options:(``NULL_MAPPING)!(::;`bool`uint8`int8`uint16`int16`uint32`int32`uint64`int64`float16`float32`float64`date32`date64`month_interval`day_time_interval`timestamp`time32`time64`duration`utf8`large_utf8`binary`large_binary`fixed_size_binary!(0b;0x00;0x00;0Nh;0Nh;0Ni;0Ni;0N;0N;0Nh;0Ne;0n;0Nd;0Np;0Nm;0Nn;0Np;0Nt;0Nn;0Nn;"";"";`byte$"";`byte$"";`byte$"")) q)table:([]col1:0N 1 2; col2:1.1 0n 2.2; col3:("aa"; "bb"; "")) q).arrowkdb.pq.writeParquetFromTable["file.parquet";table;options] ``` @@ -37,8 +37,8 @@ Each reader function in arrowkdb takes an options dictionary.  
A new `WITH_NULL ```q q)read_results:.arrowkdb.pq.readParquetToTable["file.parquet";(enlist `WITH_NULL_BITMAP)!enlist 1] q)read_results -+`col1`col2`col3!(0 1 2;1.1 0n 2.2;("aa";"bb";"")) -+`col1`col2`col3!(100b;000b;001b) ++`col1`col2`col3!(0 1 2;1.1 0 2.2;("aa";"bb";"")) ++`col1`col2`col3!(100b;010b;001b) ``` The null bitmap is a separate structure to kdb: @@ -48,13 +48,13 @@ q)first read_results col1 col2 col3 -------------- 0 1.1 "aa" -1 "bb" +1 0 "bb" 2 2.2 "" q)last read_results col1 col2 col3 -------------- 1 0 0 -0 0 0 +0 1 0 0 0 1 ``` diff --git a/docs/null-mapping.md b/docs/null-mapping.md index 589ff6b..75f5928 100755 --- a/docs/null-mapping.md +++ b/docs/null-mapping.md @@ -23,7 +23,7 @@ When reading and writing an arrow array using arrowkdb the user can now choose w > :warning: **An identify function (::) may be required in the options dictionary values** > -> The options dictionary values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. Therefore if the only set option is NULL_MAPPING, an additional empty key and corresponding value identity function (::) must be included in the options. +> The options dictionary values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. Therefore if the only set option is NULL_MAPPING, an additional empty key and corresponding value identity function (::) must be included in the options to make the values a mixed list. The following Arrow datatype are supported, along with possible null mapping values: @@ -59,6 +59,48 @@ fixed_size_binary| `byte$() The type of each value in this dictionary must be the atomic type of the corresponding list representation for that datatype.  Where a datatype isn't present in this dictionary, arrowkdb will ignore the null bitmap (as per the previous behaviour). 
+## Example + +Using these null mapping we can pretty print an arrow arrow where the kdb nulls have been mapped to arrow nulls: + +```q +q)options:(``NULL_MAPPING)!(::;`bool`uint8`int8`uint16`int16`uint32`int32`uint64`int64`float16`float32`float64`date32`date64`month_interval`day_time_interval`timestamp`time32`time64`duration`utf8`large_utf8`binary`large_binary`fixed_size_binary!(0b;0x00;0x00;0Nh;0Nh;0Ni;0Ni;0N;0N;0Nh;0Ne;0n;0Nd;0Np;0Nm;0Nn;0Np;0Nt;0Nn;0Nn;"";"";`byte$"";`byte$"";`byte$"")) +q)table:([]col1:0N 1 2; col2:1.1 0n 2.2; col3:("aa"; "bb"; "")) +q).arrowkdb.tb.prettyPrintTableFromTable[table;options] +col1: int64 +col2: double +col3: string +---- +col1: + [ + [ + null, + 1, + 2 + ] + ] +col2: + [ + [ + 1.1, + null, + 2.2 + ] + ] +col3: + [ + [ + "aa", + "bb", + null + ] + ] + +q) +``` + + + ## Limitations diff --git a/docs/reference.md b/docs/reference.md index 26cd844..e2bb6dd 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -1776,7 +1776,7 @@ Where: - `datatype_id` is the datatype identifier of the array - `list` is the kdb+ list data to be displayed -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. the function @@ -1813,7 +1813,7 @@ q).arrowkdb.ar.prettyPrintArray[int_datatype;(1 2 3j);::] Where: - `list` is the kdb+ list data to be displayed -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. 
the function @@ -1853,7 +1853,7 @@ Where: - `schema_id` is the schema identifier of the table - `array_data` is a mixed list of array data -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list; values list can be `7h`, `11h` or mixed list of -7|-11|4|99h +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list; values list can be `7h`, `11h` or mixed list of -7|-11|4|99|101h the function @@ -1918,7 +1918,7 @@ str_field: Where: - `table` is a kdb+ table -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list; values list can be `7h`, `11h` or mixed list of -7|-11|4|99h +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list; values list can be `7h`, `11h` or mixed list of -7|-11|4|99|101h the function @@ -1986,7 +1986,7 @@ Where: - `parquet_file` is a string containing the Parquet file name - `schema_id` is the schema identifier to use for the table - `array_data` is a mixed list of array data -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. returns generic null on success @@ -2027,7 +2027,7 @@ Where: - `parquet_file` is a string containing the Parquet file name - `table` is a kdb+ table -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. 
Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. returns generic null on success @@ -2083,7 +2083,7 @@ q).arrowkdb.sc.equalSchemas[schema;.arrowkdb.pq.readParquetSchema["file.parquet" Where: - `parquet_file` is a string containing the Parquet file name -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. returns the array data @@ -2119,7 +2119,7 @@ Where: - `parquet_file` is a string containing the Parquet file name - `column_index` is the index of the column to read, relative to the schema field order -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. returns the array’s data @@ -2151,7 +2151,7 @@ q)col1~array_data[1] Where: - `parquet_file` is a string containing the Parquet file name -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. 
returns the kdb+ table @@ -2205,7 +2205,7 @@ Where: - `parquet_file` is a string containing the Parquet file name - `row_groups` is an integer list (6h) of row groups indices to read, or generic null (`::`) to read all row groups - `columns` is an integer list (6h) of column indices to read, or generic null (`::`) to read all columns -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. returns the array data @@ -2241,7 +2241,7 @@ Where: - `parquet_file` is a string containing the Parquet file name - `row_groups` is an integer list (6h) of row groups indices to read, or generic null (`::`) to read all row groups - `columns` is an integer list (6h) of column indices to read, or generic null (`::`) to read all columns -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. returns the kdb+ table @@ -2279,7 +2279,7 @@ Where: - `arrow_file` is a string containing the Arrow file name - `schema_id` is the schema identifier to use for the table - `array_data` is a mixed list of array data -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. 
Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. returns generic null on success @@ -2314,7 +2314,7 @@ Where: - `arrow_file` is a string containing the Arrow file name - `table` is a kdb+ table -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. returns generic null on success @@ -2368,7 +2368,7 @@ q).arrowkdb.sc.equalSchemas[schema;.arrowkdb.ipc.readArrowSchema["file.arrow"]] Where: - `arrow_file` is a string containing the Arrow file name -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. returns the array data @@ -2402,7 +2402,7 @@ q)read_data~array_data Where: - `arrow_file` is a string containing the Arrow file name -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. returns the kdb+ table @@ -2437,7 +2437,7 @@ Where: - `schema_id` is the schema identifier to use for the table - `array_data` is a mixed list of array data -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. 
Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. returns a byte list containing the serialized stream data @@ -2471,7 +2471,7 @@ q)read_data~array_data Where: - `table` is a kdb+ table -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. returns a byte list containing the serialized stream data @@ -2525,7 +2525,7 @@ q).arrowkdb.sc.equalSchemas[schema;.arrowkdb.ipc.parseArrowSchema[serialized]] Where: - `serialized` is a byte list containing the serialized stream data -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. returns the array data @@ -2558,7 +2558,7 @@ q)read_data~array_data Where: - `serialized` is a byte list containing the serialized stream data -- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99h`. +- `options` is a kdb+ dictionary of options or generic null (`::`) to use defaults. Dictionary key must be a `11h` list. Values list can be `7h`, `11h` or mixed list of `-7|-11|4|99|101h`. 
returns the kdb+ table From 271a3a96e7f8541e497396bb37b8e811afe926e4 Mon Sep 17 00:00:00 2001 From: nmcdonnell-kx Date: Mon, 13 Mar 2023 18:50:18 +0000 Subject: [PATCH 262/276] KXI-22441 Recurse top level child lists of LIST, MAP, STRUCT, UNION, DICT * Add ARROW_CHUNK_ROWS option --- docs/reference.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/reference.md b/docs/reference.md index e2bb6dd..0542bd9 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -2289,6 +2289,7 @@ Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. - `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. +- `ARROW_CHUNK_ROWS` - The number of rows to include in each arrow array. If the total rows in the kdb data are greater then the kdb lists are chunked into the arrow IPC writer. ```q q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -2321,6 +2322,7 @@ returns generic null on success Supported options: - `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. +- `ARROW_CHUNK_ROWS` - The number of rows to include in each arrow array. If the total rows in the kdb data are greater then the kdb lists are chunked into the arrow IPC writer. > :warning: **Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors** > @@ -2447,6 +2449,7 @@ Supported options: - `DECIMAL128_AS_DOUBLE` - Flag indicating whether to override the default type mapping for the Arrow decimal128 datatype and instead represent it as a double (9h). Long, default 0. - `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. 
+- `ARROW_CHUNK_ROWS` - The number of rows to include in each arrow array. If the total rows in the kdb data are greater then the kdb lists are chunked into the arrow IPC writer. ```q q)f1:.arrowkdb.fd.field[`int_field;.arrowkdb.dt.int64[]] @@ -2478,6 +2481,7 @@ returns a byte list containing the serialized stream data Supported options: - `NULL_MAPPING` - Sub-directory of null mapping datatypes and values. See [here](null-mapping.md) for more details. +- `ARROW_CHUNK_ROWS` - The number of rows to include in each arrow array. If the total rows in the kdb data are greater then the kdb lists are chunked into the arrow IPC writer. > :warning: **Inferred schemas only support a subset of the Arrow datatypes and is considerably less flexible than creating them with the datatype/field/schema constructors** > From 3397bc1041590c1785d5312e50d6d14e2b37d969 Mon Sep 17 00:00:00 2001 From: nmcdonnell-kx Date: Tue, 14 Mar 2023 10:42:57 +0000 Subject: [PATCH 263/276] KXI-22441 Recurse top level child lists of LIST, MAP, STRUCT, UNION, DICT * Minor improvement --- docs/null-mapping.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/null-mapping.md b/docs/null-mapping.md index 75f5928..0cbf4b1 100755 --- a/docs/null-mapping.md +++ b/docs/null-mapping.md @@ -4,7 +4,7 @@ Previously arrowkdb ignored the null bitmap when reading or writing an arrow array. Users have requested that arrowkdb maps arrow nulls into kdb. -Unlike arrow, not all kdb types have a null value and those that do overload one value in the range (the 0N* values typically map to INT_MIN or FLOAT_MIN).  +Unlike arrow, not all kdb types have a null value and those that do overload one value in the range (the 0N* values typically map to INT_MIN or NaN).  
For example: From cb0e1157fbda209e9ca958f1c0da2fb09c3ab047 Mon Sep 17 00:00:00 2001 From: nmcdonnell-kx Date: Tue, 14 Mar 2023 10:59:35 +0000 Subject: [PATCH 264/276] KXI-22441 Recurse top level child lists of LIST, MAP, STRUCT, UNION, DICT * Reenable windows and macos builds --- .travis.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 69269a2..0606daf 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,9 +4,9 @@ jobs: os: linux - dist: focal os: linux -# - osx_image: xcode12.5 -# os: osx -# - os: windows + - osx_image: xcode12.5 + os: osx + - os: windows language: c compiler: gcc os: linux From d80098b6dfe8b39b6da110bd4d3974d5616b7227 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 14 Mar 2023 12:02:32 +0000 Subject: [PATCH 265/276] Arrow chunk rows example in docs --- docs/reference.md | 7 +++++++ 1 file changed, 7 insertions(+) mode change 100644 => 100755 docs/reference.md diff --git a/docs/reference.md b/docs/reference.md old mode 100644 new mode 100755 index 0542bd9..f82aa1b --- a/docs/reference.md +++ b/docs/reference.md @@ -2335,6 +2335,13 @@ q)read_table:.arrowkdb.ipc.readArrowToTable["file.arrow";::] q)read_table~table 1b ``` +> :warning: With writing a large table Arrow may rize **'Capacity error: Cannot write arrays larger than 2^31 - 1 in length**. Preferable [way](https://arrow.apache.org/docs/python/ipc.html) of serializing of such a table is dividing it into chunks providing `ARROW_CHUNK_ROWS` option. 
+ +```q +table:([]col:2147483652#0x00) +options:(``ARROW_CHUNK_ROWS)!((::);214748365) +.arrowkdb.ipc.writeArrowFromTable["table.arrow";table;options] +``` ### `ipc.readArrowSchema` From 39ae5e28b41dc177914d52fd425a2585b30d9673 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 14 Mar 2023 12:27:38 +0000 Subject: [PATCH 266/276] Adding more batching warnings --- docs/reference.md | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/docs/reference.md b/docs/reference.md index f82aa1b..17fc866 100755 --- a/docs/reference.md +++ b/docs/reference.md @@ -2302,7 +2302,12 @@ q)read_data:.arrowkdb.ipc.readArrowData["file.arrow";::] q)read_data~array_data 1b ``` - +> :warning: With writing a large table Arrow may rize **'Capacity error: Cannot write arrays larger than 2^31 - 1 in length**. Preferable [way](https://arrow.apache.org/docs/python/ipc.html) of serializing of such a table is dividing it into chunks providing `ARROW_CHUNK_ROWS` option. +```q +table:([]col:2147483652#0x00) +options:(``ARROW_CHUNK_ROWS)!((::);214748365) +.arrowkdb.ipc.writeArrowFromTable["table.arrow";table;options] +``` ### `ipc.writeArrowFromTable` *Convert a kdb+ table to an Arrow table and write to an Arrow file, inferring the schema from the kdb+ table structure* @@ -2336,13 +2341,6 @@ q)read_table~table 1b ``` > :warning: With writing a large table Arrow may rize **'Capacity error: Cannot write arrays larger than 2^31 - 1 in length**. Preferable [way](https://arrow.apache.org/docs/python/ipc.html) of serializing of such a table is dividing it into chunks providing `ARROW_CHUNK_ROWS` option. 
- -```q -table:([]col:2147483652#0x00) -options:(``ARROW_CHUNK_ROWS)!((::);214748365) -.arrowkdb.ipc.writeArrowFromTable["table.arrow";table;options] -``` - ### `ipc.readArrowSchema` *Read the schema from an Arrow file* @@ -2469,6 +2467,7 @@ q)read_data:.arrowkdb.ipc.parseArrowData[serialized;::] q)read_data~array_data 1b ``` +> :warning: With writing a large table Arrow may rize **'Capacity error: Cannot write arrays larger than 2^31 - 1 in length**. Preferable [way](https://arrow.apache.org/docs/python/ipc.html) of serializing of such a table is dividing it into chunks providing `ARROW_CHUNK_ROWS` option. ### `ipc.serializeArrowFromTable` @@ -2501,7 +2500,7 @@ q)new_table:.arrowkdb.ipc.parseArrowToTable[serialized;::] q)new_table~table 1b ``` - +> :warning: With writing a large table Arrow may rize **'Capacity error: Cannot write arrays larger than 2^31 - 1 in length**. Preferable [way](https://arrow.apache.org/docs/python/ipc.html) of serializing of such a table is dividing it into chunks providing `ARROW_CHUNK_ROWS` option. ### `ipc.parseArrowSchema` *Parse the schema from an Arrow stream* From 847b0dc660009ce45c052c218ffd5c48c70508aa Mon Sep 17 00:00:00 2001 From: vgrechin-kx <122450037+vgrechin-kx@users.noreply.github.com> Date: Tue, 14 Mar 2023 15:31:16 +0300 Subject: [PATCH 267/276] Update docs/reference.md Co-authored-by: nmcdonnell-kx <63713601+nmcdonnell-kx@users.noreply.github.com> --- docs/reference.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/reference.md b/docs/reference.md index 17fc866..29e29f5 100755 --- a/docs/reference.md +++ b/docs/reference.md @@ -2340,7 +2340,9 @@ q)read_table:.arrowkdb.ipc.readArrowToTable["file.arrow";::] q)read_table~table 1b ``` -> :warning: With writing a large table Arrow may rize **'Capacity error: Cannot write arrays larger than 2^31 - 1 in length**. 
Preferable [way](https://arrow.apache.org/docs/python/ipc.html) of serializing of such a table is dividing it into chunks providing `ARROW_CHUNK_ROWS` option. +> :warning: **When writing a large table Arrow may raise 'Capacity error: Cannot write arrays larger than 2^31 - 1 in length**. +> +> Preferable [way](https://arrow.apache.org/docs/python/ipc.html) of serializing of such a table is dividing it into chunks by specifying `ARROW_CHUNK_ROWS` option. ### `ipc.readArrowSchema` *Read the schema from an Arrow file* From 66f69c0e1646bcc5dbfa3f9189a2d973a1c7a782 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 14 Mar 2023 12:32:34 +0000 Subject: [PATCH 268/276] Error message update --- docs/reference.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/reference.md b/docs/reference.md index 29e29f5..a251c9f 100755 --- a/docs/reference.md +++ b/docs/reference.md @@ -2302,7 +2302,7 @@ q)read_data:.arrowkdb.ipc.readArrowData["file.arrow";::] q)read_data~array_data 1b ``` -> :warning: With writing a large table Arrow may rize **'Capacity error: Cannot write arrays larger than 2^31 - 1 in length**. Preferable [way](https://arrow.apache.org/docs/python/ipc.html) of serializing of such a table is dividing it into chunks providing `ARROW_CHUNK_ROWS` option. +> :warning: With writing a large table Arrow may raise **'Capacity error: Cannot write arrays larger than 2^31 - 1 in length**. Preferable [way](https://arrow.apache.org/docs/python/ipc.html) of serializing of such a table is dividing it into chunks providing `ARROW_CHUNK_ROWS` option. ```q table:([]col:2147483652#0x00) options:(``ARROW_CHUNK_ROWS)!((::);214748365) @@ -2469,7 +2469,7 @@ q)read_data:.arrowkdb.ipc.parseArrowData[serialized;::] q)read_data~array_data 1b ``` -> :warning: With writing a large table Arrow may rize **'Capacity error: Cannot write arrays larger than 2^31 - 1 in length**. 
Preferable [way](https://arrow.apache.org/docs/python/ipc.html) of serializing of such a table is dividing it into chunks providing `ARROW_CHUNK_ROWS` option. +> :warning: With writing a large table Arrow may raise **'Capacity error: Cannot write arrays larger than 2^31 - 1 in length**. Preferable [way](https://arrow.apache.org/docs/python/ipc.html) of serializing of such a table is dividing it into chunks providing `ARROW_CHUNK_ROWS` option. ### `ipc.serializeArrowFromTable` @@ -2502,7 +2502,7 @@ q)new_table:.arrowkdb.ipc.parseArrowToTable[serialized;::] q)new_table~table 1b ``` -> :warning: With writing a large table Arrow may rize **'Capacity error: Cannot write arrays larger than 2^31 - 1 in length**. Preferable [way](https://arrow.apache.org/docs/python/ipc.html) of serializing of such a table is dividing it into chunks providing `ARROW_CHUNK_ROWS` option. +> :warning: With writing a large table Arrow may raise **'Capacity error: Cannot write arrays larger than 2^31 - 1 in length**. Preferable [way](https://arrow.apache.org/docs/python/ipc.html) of serializing of such a table is dividing it into chunks providing `ARROW_CHUNK_ROWS` option. ### `ipc.parseArrowSchema` *Parse the schema from an Arrow stream* From 438385249e090426ea8f8b5ca7253f9efaa2c244 Mon Sep 17 00:00:00 2001 From: Vyacheslav Grechin Date: Tue, 14 Mar 2023 12:56:38 +0000 Subject: [PATCH 269/276] Move batching example into corresponding chapter --- docs/reference.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/reference.md b/docs/reference.md index a251c9f..6534671 100755 --- a/docs/reference.md +++ b/docs/reference.md @@ -2303,11 +2303,7 @@ q)read_data~array_data 1b ``` > :warning: With writing a large table Arrow may raise **'Capacity error: Cannot write arrays larger than 2^31 - 1 in length**. Preferable [way](https://arrow.apache.org/docs/python/ipc.html) of serializing of such a table is dividing it into chunks providing `ARROW_CHUNK_ROWS` option. 
-```q -table:([]col:2147483652#0x00) -options:(``ARROW_CHUNK_ROWS)!((::);214748365) -.arrowkdb.ipc.writeArrowFromTable["table.arrow";table;options] -``` + ### `ipc.writeArrowFromTable` *Convert a kdb+ table to an Arrow table and write to an Arrow file, inferring the schema from the kdb+ table structure* @@ -2343,6 +2339,11 @@ q)read_table~table > :warning: **When writing a large table Arrow may raise 'Capacity error: Cannot write arrays larger than 2^31 - 1 in length**. > > Preferable [way](https://arrow.apache.org/docs/python/ipc.html) of serializing of such a table is dividing it into chunks by specifying `ARROW_CHUNK_ROWS` option. +```q +table:([]col:2147483652#0x00) +options:(``ARROW_CHUNK_ROWS)!((::);214748365) +.arrowkdb.ipc.writeArrowFromTable["table.arrow";table;options] +``` ### `ipc.readArrowSchema` *Read the schema from an Arrow file* From 33e712c9d32692470f4e072c644343abe64a1e71 Mon Sep 17 00:00:00 2001 From: vgrechin-kx <122450037+vgrechin-kx@users.noreply.github.com> Date: Tue, 14 Mar 2023 16:30:25 +0300 Subject: [PATCH 270/276] Update docs/reference.md Co-authored-by: nmcdonnell-kx <63713601+nmcdonnell-kx@users.noreply.github.com> --- docs/reference.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/reference.md b/docs/reference.md index 6534671..ef342da 100755 --- a/docs/reference.md +++ b/docs/reference.md @@ -2302,7 +2302,11 @@ q)read_data:.arrowkdb.ipc.readArrowData["file.arrow";::] q)read_data~array_data 1b ``` -> :warning: With writing a large table Arrow may raise **'Capacity error: Cannot write arrays larger than 2^31 - 1 in length**. Preferable [way](https://arrow.apache.org/docs/python/ipc.html) of serializing of such a table is dividing it into chunks providing `ARROW_CHUNK_ROWS` option. + +> :warning: **When writing a large table Arrow may raise 'Capacity error: Cannot write arrays larger than 2^31 - 1 in length**. 
+> +> Preferable [way](https://arrow.apache.org/docs/python/ipc.html) of serializing of such a table is dividing it into chunks by specifying `ARROW_CHUNK_ROWS` option. + ### `ipc.writeArrowFromTable` From 8e18cf1d88de8623730ce0371b08546ab9f3a5e3 Mon Sep 17 00:00:00 2001 From: vgrechin-kx <122450037+vgrechin-kx@users.noreply.github.com> Date: Tue, 14 Mar 2023 16:30:37 +0300 Subject: [PATCH 271/276] Update docs/reference.md Co-authored-by: nmcdonnell-kx <63713601+nmcdonnell-kx@users.noreply.github.com> --- docs/reference.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/reference.md b/docs/reference.md index ef342da..41efa2e 100755 --- a/docs/reference.md +++ b/docs/reference.md @@ -2340,6 +2340,7 @@ q)read_table:.arrowkdb.ipc.readArrowToTable["file.arrow";::] q)read_table~table 1b ``` + > :warning: **When writing a large table Arrow may raise 'Capacity error: Cannot write arrays larger than 2^31 - 1 in length**. > > Preferable [way](https://arrow.apache.org/docs/python/ipc.html) of serializing of such a table is dividing it into chunks by specifying `ARROW_CHUNK_ROWS` option. From 6517a7165b2c1755fc1f886f5722b99b9c7164f0 Mon Sep 17 00:00:00 2001 From: vgrechin-kx <122450037+vgrechin-kx@users.noreply.github.com> Date: Tue, 14 Mar 2023 16:30:46 +0300 Subject: [PATCH 272/276] Update docs/reference.md Co-authored-by: nmcdonnell-kx <63713601+nmcdonnell-kx@users.noreply.github.com> --- docs/reference.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/reference.md b/docs/reference.md index 41efa2e..0f45434 100755 --- a/docs/reference.md +++ b/docs/reference.md @@ -2475,7 +2475,10 @@ q)read_data:.arrowkdb.ipc.parseArrowData[serialized;::] q)read_data~array_data 1b ``` -> :warning: With writing a large table Arrow may raise **'Capacity error: Cannot write arrays larger than 2^31 - 1 in length**. 
Preferable [way](https://arrow.apache.org/docs/python/ipc.html) of serializing of such a table is dividing it into chunks providing `ARROW_CHUNK_ROWS` option. + +> :warning: **When writing a large table Arrow may raise 'Capacity error: Cannot write arrays larger than 2^31 - 1 in length**. +> +> Preferable [way](https://arrow.apache.org/docs/python/ipc.html) of serializing of such a table is dividing it into chunks by specifying `ARROW_CHUNK_ROWS` option. ### `ipc.serializeArrowFromTable` From 9af0f0154b00d6f555875461b3c1cfcc89a607ec Mon Sep 17 00:00:00 2001 From: vgrechin-kx <122450037+vgrechin-kx@users.noreply.github.com> Date: Tue, 14 Mar 2023 16:30:53 +0300 Subject: [PATCH 273/276] Update docs/reference.md Co-authored-by: nmcdonnell-kx <63713601+nmcdonnell-kx@users.noreply.github.com> --- docs/reference.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/reference.md b/docs/reference.md index 0f45434..4d6713c 100755 --- a/docs/reference.md +++ b/docs/reference.md @@ -2344,6 +2344,7 @@ q)read_table~table > :warning: **When writing a large table Arrow may raise 'Capacity error: Cannot write arrays larger than 2^31 - 1 in length**. > > Preferable [way](https://arrow.apache.org/docs/python/ipc.html) of serializing of such a table is dividing it into chunks by specifying `ARROW_CHUNK_ROWS` option. 
+ ```q table:([]col:2147483652#0x00) options:(``ARROW_CHUNK_ROWS)!((::);214748365) From ec74720d70b5fa1d638f62f0c83de61f288dfe05 Mon Sep 17 00:00:00 2001 From: vgrechin-kx <122450037+vgrechin-kx@users.noreply.github.com> Date: Tue, 14 Mar 2023 16:31:06 +0300 Subject: [PATCH 274/276] Update docs/reference.md Co-authored-by: nmcdonnell-kx <63713601+nmcdonnell-kx@users.noreply.github.com> --- docs/reference.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/reference.md b/docs/reference.md index 4d6713c..bf9b4f6 100755 --- a/docs/reference.md +++ b/docs/reference.md @@ -2512,7 +2512,11 @@ q)new_table:.arrowkdb.ipc.parseArrowToTable[serialized;::] q)new_table~table 1b ``` -> :warning: With writing a large table Arrow may raise **'Capacity error: Cannot write arrays larger than 2^31 - 1 in length**. Preferable [way](https://arrow.apache.org/docs/python/ipc.html) of serializing of such a table is dividing it into chunks providing `ARROW_CHUNK_ROWS` option. + +> :warning: **When writing a large table Arrow may raise 'Capacity error: Cannot write arrays larger than 2^31 - 1 in length**. +> +> Preferable [way](https://arrow.apache.org/docs/python/ipc.html) of serializing of such a table is dividing it into chunks by specifying `ARROW_CHUNK_ROWS` option. 
+ ### `ipc.parseArrowSchema` *Parse the schema from an Arrow stream* From cc030b8182870b1e5c0974c69d0c0646b4449a80 Mon Sep 17 00:00:00 2001 From: vgrechin-kx <122450037+vgrechin-kx@users.noreply.github.com> Date: Tue, 14 Mar 2023 16:31:35 +0300 Subject: [PATCH 275/276] Update docs/reference.md Co-authored-by: nmcdonnell-kx <63713601+nmcdonnell-kx@users.noreply.github.com> --- docs/reference.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/reference.md b/docs/reference.md index bf9b4f6..cec42f1 100755 --- a/docs/reference.md +++ b/docs/reference.md @@ -2517,7 +2517,6 @@ q)new_table~table > > Preferable [way](https://arrow.apache.org/docs/python/ipc.html) of serializing of such a table is dividing it into chunks by specifying `ARROW_CHUNK_ROWS` option. -### `ipc.parseArrowSchema` *Parse the schema from an Arrow stream* From e91519ade49a9447da3c805a6d62f912a13fb77a Mon Sep 17 00:00:00 2001 From: nmcdonnell-kx Date: Tue, 14 Mar 2023 13:49:34 +0000 Subject: [PATCH 276/276] KXI-0 Minor docs fix --- docs/reference.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/reference.md b/docs/reference.md index cec42f1..98efb53 100755 --- a/docs/reference.md +++ b/docs/reference.md @@ -2517,6 +2517,13 @@ q)new_table~table > > Preferable [way](https://arrow.apache.org/docs/python/ipc.html) of serializing of such a table is dividing it into chunks by specifying `ARROW_CHUNK_ROWS` option. +```q +table:([]col:2147483652#0x00) +options:(``ARROW_CHUNK_ROWS)!((::);214748365) +serialized:.arrowkdb.ipc.serializeArrowFromTable[table;options] +``` + +### `ipc.parseArrowSchema` *Parse the schema from an Arrow stream*