diff --git a/CMakeLists.txt b/CMakeLists.txt index be3cdd794e..4cb0749ce7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -247,8 +247,12 @@ if(ENABLE_CUDA) else() if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18.0") message(STATUS "CMake 3.18+, Setting CUDA_ARCHITECTURES.") - set(CMAKE_CUDA_ARCHITECTURES - 35-virtual + if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.0.0) + set(CMAKE_CUDA_ARCHITECTURES 35-virtual) + else() + set(CMAKE_CUDA_ARCHITECTURES "") + endif() + list(APPEND CMAKE_CUDA_ARCHITECTURES 50-virtual 60-virtual 70-virtual @@ -258,8 +262,12 @@ if(ENABLE_CUDA) message(STATUS "CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}") else() message(STATUS "CMake 3.17 or under, setting CUDA architecture flags manually.") - set(CUDA_COMPILATION_ARCH - -gencode=arch=compute_35,code=compute_35; + if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.0.0) + set(CUDA_COMPILATION_ARCH -gencode=arch=compute_35,code=compute_35;) + else() + set(CUDA_COMPILATION_ARCH "") + endif() + list(APPEND CUDA_COMPILATION_ARCH -gencode=arch=compute_50,code=compute_50; -gencode=arch=compute_60,code=compute_60; -gencode=arch=compute_70,code=compute_70; diff --git a/DataMgr/ForeignStorage/CsvFileBufferParser.cpp b/DataMgr/ForeignStorage/CsvFileBufferParser.cpp index 0937a4661f..58312016ee 100644 --- a/DataMgr/ForeignStorage/CsvFileBufferParser.cpp +++ b/DataMgr/ForeignStorage/CsvFileBufferParser.cpp @@ -168,6 +168,8 @@ ParseBufferResult CsvFileBufferParser::parseBuffer(ParseBufferRequest& request, std::vector> tmp_buffers; // holds string w/ removed escape chars, etc const char* line_start = p; + row_index_plus_one++; + bool incorrect_column_count = false; p = import_export::delimited_parser::get_row(p, thread_buf_end, buf_end, @@ -177,10 +179,6 @@ ParseBufferResult CsvFileBufferParser::parseBuffer(ParseBufferRequest& request, tmp_buffers, try_single_thread, !columns_are_pre_filtered); - - row_index_plus_one++; - - bool incorrect_column_count = false; try { validate_expected_column_count(row, num_cols, point_cols, file_path); } catch (const ForeignStorageException& e) { diff --git a/ImportExport/DelimitedParserUtils.cpp b/ImportExport/DelimitedParserUtils.cpp index 0231c12768..20cea36fd1 100644 --- a/ImportExport/DelimitedParserUtils.cpp +++ b/ImportExport/DelimitedParserUtils.cpp @@ -46,13 +46,18 @@ inline void trim_space(const char*& field_begin, const char*& field_end) { inline void trim_quotes(const char*& field_begin, const char*& field_end, const import_export::CopyParams& copy_params) { - if (copy_params.quoted && field_end - field_begin > 0 && - *field_begin == copy_params.quote) { - ++field_begin; + auto quote_begin = field_begin, quote_end = field_end; + if (copy_params.quoted) { + trim_space(quote_begin, quote_end); } - if (copy_params.quoted && field_end - field_begin > 0 && - *(field_end - 1) == copy_params.quote) { - --field_end; + if (copy_params.quoted && quote_end - quote_begin > 0) { + if (*quote_begin == copy_params.quote && *(quote_end - 1) == copy_params.quote) { + field_begin = ++quote_begin; + field_end = (quote_begin == quote_end) ? 
quote_end : --quote_end; + } else { + throw import_export::delimited_parser::DelimitedParserException( + "Unable to trim quotes."); + } } } } // namespace @@ -123,13 +128,26 @@ size_t find_end(const char* buffer, if (last_line_delim_pos <= 0) { size_t excerpt_length = std::min(50, size); std::string buffer_excerpt{buffer, buffer + excerpt_length}; - std::string error_message = - "Unable to find an end of line character after reading " + std::to_string(size) + - " characters. Please ensure that the correct \"line_delimiter\" option is " - "specified or update the \"buffer_size\" option appropriately. Row number: " + - std::to_string(buffer_first_row_index + 1) + - ". First few characters in row: " + buffer_excerpt; - throw InsufficientBufferSizeException{error_message}; + if (in_quote) { + std::string quote(1, copy_params.quote); + std::string error_message = + "Unable to find a matching end quote for the quote character '" + quote + + "' after reading " + std::to_string(size) + + " characters. Please ensure that all data fields are correctly formatted " + "or update the \"buffer_size\" option appropriately. Row number: " + + std::to_string(buffer_first_row_index + 1) + + ". First few characters in row: " + buffer_excerpt; + throw InsufficientBufferSizeException{error_message}; + } else { + std::string error_message = + "Unable to find an end of line character after reading " + + std::to_string(size) + + " characters. Please ensure that the correct \"line_delimiter\" option is " + "specified or update the \"buffer_size\" option appropriately. Row number: " + + std::to_string(buffer_first_row_index + 1) + + ". First few characters in row: " + buffer_excerpt; + throw InsufficientBufferSizeException{error_message}; + } } return last_line_delim_pos + 1; @@ -244,10 +262,10 @@ const char* get_row(const char* buf, } const char* field_begin = field_buf; const char* field_end = field_buf + j; + trim_quotes(field_begin, field_end, copy_params); if (copy_params.trim_spaces) { trim_space(field_begin, field_end); } - trim_quotes(field_begin, field_end, copy_params); row.emplace_back(field_begin, field_end - field_begin); } field = p + 1; diff --git a/ImportExport/DelimitedParserUtils.h b/ImportExport/DelimitedParserUtils.h index 032633d102..5aad86425f 100644 --- a/ImportExport/DelimitedParserUtils.h +++ b/ImportExport/DelimitedParserUtils.h @@ -37,6 +37,11 @@ class InsufficientBufferSizeException : public std::runtime_error { : std::runtime_error(message) {} }; +class DelimitedParserException : public std::runtime_error { + public: + DelimitedParserException(const std::string& message) : std::runtime_error(message) {} +}; + /** * @brief Finds the closest possible row beginning in the given buffer. 
* diff --git a/ImportExport/Importer.cpp b/ImportExport/Importer.cpp index 79f66ca0fa..f7029d1713 100644 --- a/ImportExport/Importer.cpp +++ b/ImportExport/Importer.cpp @@ -2020,6 +2020,7 @@ static ImportStatus import_thread_delimited( row.clear(); std::vector> tmp_buffers; // holds string w/ removed escape chars, etc + row_index_plus_one++; if (DEBUG_TIMING) { us = measure::execution([&]() { p = import_export::delimited_parser::get_row(p, @@ -2044,7 +2045,6 @@ static ImportStatus import_thread_delimited( try_single_thread, true); } - row_index_plus_one++; // Each POINT could consume two separate coords instead of a single WKT if (row.size() < num_cols || (num_cols + point_cols) < row.size()) { thread_import_status.rows_rejected++; diff --git a/QueryEngine/ColumnarResults.cpp b/QueryEngine/ColumnarResults.cpp index ae932777b7..97b41c0d69 100644 --- a/QueryEngine/ColumnarResults.cpp +++ b/QueryEngine/ColumnarResults.cpp @@ -204,9 +204,10 @@ int64_t countNumberOfValuesGeoPolygon(const ResultSet& rows, CHECK(ns); const auto s_ptr = boost::get(ns); if (s_ptr) { - // We count the number of commas and parenthesis in WKT representation - // of a polygon (e.g. POLYGON ((0 0,4 0,4 4,0 4,0 0),(1 1,1 2,2 2,2 1,1 1))) - // to get the number of points it contains: + // We count the number of commas in WKT representation + // of a polygon (e.g. POLYGON ((0 0,4 0,4 4,0 4,0 0),(1 + // 1,1 2,2 2,2 1,1 1))) to get the number of points it + // contains: running_count += std::count(s_ptr->begin(), s_ptr->end(), ',') + 1; } } else if (const auto tv = @@ -220,10 +221,60 @@ int64_t countNumberOfValuesGeoPolygon(const ResultSet& rows, : sizeof(double)) / 2; } else if (const auto tv = boost::get(&crt_row[column_idx])) { - const auto s = boost::get(tv->get()); - std::vector* d = s.coords.get(); - CHECK(d); - running_count += d->size(); + if (tv->get_ptr() != nullptr) { + const auto s = boost::get(tv->get()); + std::vector* d = s.coords.get(); + CHECK(d); + running_count += d->size(); + } // else s is NULL + } else { + UNREACHABLE(); + } + } + return running_count; + }, + std::plus()); +} + +int64_t countNumberOfValuesGeoMultiPolygon(const ResultSet& rows, + const SQLTypeInfo& ti, + const size_t column_idx) { + return tbb::parallel_reduce( + tbb::blocked_range(0, rows.rowCount()), + static_cast(0), + [&](tbb::blocked_range r, int64_t running_count) { + for (int i = r.begin(); i < r.end(); ++i) { + const auto crt_row = rows.getRowAtNoTranslations(i); + if (const auto tv = boost::get(&crt_row[column_idx])) { + const auto ns = boost::get(tv); + CHECK(ns); + const auto s_ptr = boost::get(ns); + if (s_ptr && *s_ptr != "NULL") { + // We count the number of commas in WKT representation + // of a multi-polygon (e.g. 
MULTIPOLYGON (((0 0,4 0,4 + // 4,0 4,0 0),(1 1,1 2,2 2,2 1,1 1)), (...))) to get + // the number of points it contains: + running_count += std::count(s_ptr->begin(), s_ptr->end(), ',') + 1; + } + } else if (const auto tv = + boost::get(&crt_row[column_idx])) { + const auto s = boost::get(tv); + CHECK(s); + if (s->coords_data != nullptr && s->coords_data->pointer != nullptr) { + VarlenDatum* d = s->coords_data.get(); + if (ti.get_compression() == kENCODING_GEOINT) { + running_count += d->length / (2 * sizeof(int32_t)); + } else { + running_count += d->length / (2 * sizeof(double)); + } + } // else s is NULL + } else if (const auto tv = boost::get(&crt_row[column_idx])) { + if (tv->get_ptr() != nullptr) { + const auto s = boost::get(tv->get()); + std::vector* d = s.coords.get(); + CHECK(d); + running_count += d->size() / 2; + } // else s is NULL } else { UNREACHABLE(); } @@ -283,10 +334,14 @@ ColumnarResults::ColumnarResults(std::shared_ptr row_set_mem_ case kPOLYGON: values_count = countNumberOfValuesGeoPolygon(rows, ti, i); break; + case kMULTIPOLYGON: + values_count = countNumberOfValuesGeoMultiPolygon(rows, ti, i); + break; default: UNREACHABLE() << "count number of values not implemented for " << ti.toString(); } + // TODO: include sizes count to optimize flatbuffer size const int64_t flatbuffer_size = getFlatBufferSize(num_rows_, values_count, ti); column_buffers_[i] = row_set_mem_owner->allocate(flatbuffer_size, thread_idx_); FlatBufferManager m{column_buffers_[i]}; @@ -670,7 +725,7 @@ inline void ColumnarResults::writeBackCell(const TargetValue& col_val, const auto ns = boost::get(tv); CHECK(ns); const auto s_ptr = boost::get(ns); - if (s_ptr == nullptr) { + if (s_ptr == nullptr || *s_ptr == "NULL") { auto lock_scope = (write_mutex == nullptr ? std::unique_lock() : std::unique_lock(*write_mutex)); @@ -725,26 +780,134 @@ inline void ColumnarResults::writeBackCell(const TargetValue& col_val, } CHECK_EQ(status, FlatBufferManager::Status::Success); } else if (const auto tv = boost::get(&col_val)) { - /* - Warning: the following code fails for NULL row values - because of the failure to detect the nullness correctly. - */ - const auto s = boost::get(tv->get()); - const std::vector* d = s.coords.get(); - const std::vector* r = s.ring_sizes.get(); - CHECK(d); - CHECK(r); - std::vector compressed_coords = - Geospatial::compress_coords(*d, type_info); - { + if (tv->get_ptr() == nullptr) { auto lock_scope = (write_mutex == nullptr ? std::unique_lock() : std::unique_lock(*write_mutex)); - status = m.setItemCountsAndData( - row_idx, - r->data(), - r->size(), - reinterpret_cast(compressed_coords.data())); + status = m.setNull(row_idx); + } else { + const auto s = boost::get(tv->get()); + const std::vector* d = s.coords.get(); + const std::vector* r = s.ring_sizes.get(); + CHECK(d); + CHECK(r); + if (d->size() == 0) { + CHECK_EQ(r->size(), 0); + auto lock_scope = + (write_mutex == nullptr ? std::unique_lock() + : std::unique_lock(*write_mutex)); + status = m.setNull(row_idx); + } else { + std::vector compressed_coords = + Geospatial::compress_coords(*d, type_info); + { + auto lock_scope = + (write_mutex == nullptr ? 
std::unique_lock() + : std::unique_lock(*write_mutex)); + status = m.setItemCountsAndData( + row_idx, + r->data(), + r->size(), + reinterpret_cast(compressed_coords.data())); + } + } + } + CHECK_EQ(status, FlatBufferManager::Status::Success); + } else { + UNREACHABLE(); + } + break; + } + case kMULTIPOLYGON: { + CHECK(FlatBufferManager::isFlatBuffer(column_buffers_[column_idx])); + FlatBufferManager m{column_buffers_[column_idx]}; + const SQLTypeInfoLite* ti_lite = + reinterpret_cast(m.get_user_data_buffer()); + if (ti_lite->compression == SQLTypeInfoLite::GEOINT) { + CHECK_EQ(type_info.get_compression(), kENCODING_GEOINT); + } else { + CHECK_EQ(type_info.get_compression(), kENCODING_NONE); + } + FlatBufferManager::Status status{}; + if (const auto tv = boost::get(&col_val)) { + const auto ns = boost::get(tv); + CHECK(ns); + const auto s_ptr = boost::get(ns); + if (s_ptr == nullptr || *s_ptr == "NULL") { + auto lock_scope = + (write_mutex == nullptr ? std::unique_lock() + : std::unique_lock(*write_mutex)); + status = m.setNull(row_idx); + } else { + std::vector coords; + std::vector ring_sizes; + std::vector polygon_sizes; + std::vector bounds; + int64_t approx_nof_coords = 2 * std::count(s_ptr->begin(), s_ptr->end(), ','); + int64_t approx_nof_rings = std::count(s_ptr->begin(), s_ptr->end(), '(') - 1; + int64_t approx_nof_polygons = approx_nof_rings; // upper bound + coords.reserve(approx_nof_coords); + ring_sizes.reserve(approx_nof_rings); + polygon_sizes.reserve(approx_nof_polygons); + bounds.reserve(4); + const auto gdal_wkt_ls = Geospatial::GeoMultiPolygon(*s_ptr); + gdal_wkt_ls.getColumns(coords, ring_sizes, polygon_sizes, bounds); + const std::vector compressed_coords = + Geospatial::compress_coords(coords, type_info); + { + auto lock_scope = + (write_mutex == nullptr ? std::unique_lock() + : std::unique_lock(*write_mutex)); + status = m.setItem(row_idx, compressed_coords, ring_sizes, polygon_sizes); + } + } + CHECK_EQ(status, FlatBufferManager::Status::Success); + } else if (const auto tv = boost::get(&col_val)) { + const auto s = boost::get(tv); + CHECK(s); + if (s->coords_data == nullptr || s->coords_data->pointer == nullptr) { + status = m.setNull(row_idx); + } else { + const VarlenDatum* d = s->coords_data.get(); + const VarlenDatum* r = s->ring_sizes_data.get(); + const VarlenDatum* p = s->poly_rings_data.get(); + CHECK(d); + CHECK(d->pointer); + { + auto lock_scope = + (write_mutex == nullptr ? std::unique_lock() + : std::unique_lock(*write_mutex)); + status = m.setItem(row_idx, + d->pointer, + d->length, + reinterpret_cast(r->pointer), + r->length / sizeof(int32_t), + reinterpret_cast(p->pointer), + p->length / sizeof(int32_t)); + } + } + CHECK_EQ(status, FlatBufferManager::Status::Success); + } else if (const auto tv = boost::get(&col_val)) { + if (tv->get_ptr() == nullptr) { + auto lock_scope = + (write_mutex == nullptr ? std::unique_lock() + : std::unique_lock(*write_mutex)); + status = m.setNull(row_idx); + } else { + const auto s = boost::get(tv->get()); + const std::vector* d = s.coords.get(); + const std::vector* r = s.ring_sizes.get(); + const std::vector* p = s.poly_rings.get(); + CHECK(d); + CHECK_NE(d->size(), 0); + std::vector compressed_coords = + Geospatial::compress_coords(*d, type_info); + { + auto lock_scope = + (write_mutex == nullptr ? 
std::unique_lock() + : std::unique_lock(*write_mutex)); + status = m.setItem(row_idx, compressed_coords, *r, *p); + } } CHECK_EQ(status, FlatBufferManager::Status::Success); } else { diff --git a/QueryEngine/Descriptors/InputDescriptors.h b/QueryEngine/Descriptors/InputDescriptors.h index ceca307d2d..34aa935a5b 100644 --- a/QueryEngine/Descriptors/InputDescriptors.h +++ b/QueryEngine/Descriptors/InputDescriptors.h @@ -69,6 +69,14 @@ class InputColDescriptor final { const InputDescriptor& getScanDesc() const { return input_desc_; } + shared::TableKey getTableKey() const { + return shared::TableKey{input_desc_.getTableKey()}; + } + + shared::ColumnKey getColumnKey() const { + return shared::ColumnKey{getTableKey(), col_id_}; + } + size_t hash() const { return input_desc_.hash() ^ (static_cast(col_id_) << 16); } diff --git a/QueryEngine/ExpressionRewrite.cpp b/QueryEngine/ExpressionRewrite.cpp index 1aba6e4c71..a85c276c0f 100644 --- a/QueryEngine/ExpressionRewrite.cpp +++ b/QueryEngine/ExpressionRewrite.cpp @@ -870,11 +870,9 @@ OverlapsJoinTranslationResult translate_overlaps_conjunction_with_reordering( // ordering for overlaps, the join builder will fail. std::set lhs_rte_idx; lhs->collect_rte_idx(lhs_rte_idx); - CHECK(!lhs_rte_idx.empty()); std::set rhs_rte_idx; rhs->collect_rte_idx(rhs_rte_idx); - CHECK(!rhs_rte_idx.empty()); - auto has_invalid_num_join_cols = lhs_rte_idx.size() > 1 || rhs_rte_idx.size() > 1; + auto has_invalid_num_join_cols = lhs_rte_idx.size() != 1 || rhs_rte_idx.size() != 1; auto has_invalid_rte_idx = lhs_rte_idx > rhs_rte_idx; return std::make_pair(has_invalid_num_join_cols || has_invalid_rte_idx, has_invalid_rte_idx); diff --git a/QueryEngine/GroupByRuntime.cpp b/QueryEngine/GroupByRuntime.cpp index c79e1d4fb3..33a0645415 100644 --- a/QueryEngine/GroupByRuntime.cpp +++ b/QueryEngine/GroupByRuntime.cpp @@ -272,7 +272,7 @@ bucketized_hash_join_idx(int64_t hash_buff, int64_t const max_key, const int64_t translated_null_val, int64_t bucket_normalization) { - if (key >= min_key && key <= max_key) { + if (hash_buff && key >= min_key && key <= max_key) { return *SUFFIX(get_bucketized_hash_slot)(reinterpret_cast(hash_buff), key, min_key / bucket_normalization, @@ -354,7 +354,7 @@ hash_join_idx_sharded(int64_t hash_buff, const uint32_t entry_count_per_shard, const uint32_t num_shards, const uint32_t device_count) { - if (key >= min_key && key <= max_key) { + if (hash_buff && key >= min_key && key <= max_key) { return *SUFFIX(get_hash_slot_sharded)(reinterpret_cast(hash_buff), key, min_key, diff --git a/QueryEngine/InputMetadata.cpp b/QueryEngine/InputMetadata.cpp index 23cf3f628a..82f3c759fb 100644 --- a/QueryEngine/InputMetadata.cpp +++ b/QueryEngine/InputMetadata.cpp @@ -266,14 +266,20 @@ ChunkMetadataMap synthesize_metadata_table_function(const ResultSet* rows) { FlatBufferManager m{const_cast(columnar_buffer)}; chunk_metadata->numBytes = m.getBufferSize(); if (is_geometry) { - // a geometry value is a pair of coordinates but its element - // type value is a int or double, hence multiplication by 2: - values_count = m.get_nof_values() * 2; + if (col_sql_type_info.get_type() == kMULTIPOLYGON) { + values_count = m.getValuesCount(); + values_buffer = m.get_values_buffer(); + } else { + // a geometry value is a pair of coordinates but its element + // type value is a int or double, hence multiplication by 2: + values_count = m.get_nof_values() * 2; + values_buffer = m.get_values(); + } } else { CHECK(is_array); values_count = m.get_nof_values(); + values_buffer = 
m.get_values();
       }
-      values_buffer = m.get_values();
     } else {
       chunk_metadata->numBytes = row_count * col_type_info.get_size();
       values_count = row_count;
diff --git a/QueryEngine/JoinHashTable/PerfectJoinHashTable.cpp b/QueryEngine/JoinHashTable/PerfectJoinHashTable.cpp
index e2cb46d983..07abd36b2d 100644
--- a/QueryEngine/JoinHashTable/PerfectJoinHashTable.cpp
+++ b/QueryEngine/JoinHashTable/PerfectJoinHashTable.cpp
@@ -243,15 +243,6 @@ std::shared_ptr PerfectJoinHashTable::getInstance(
     throw TooManyHashEntries(oss.str());
   }
 
-  auto const shard_count = get_shard_count(qual_bin_oper.get(), executor);
-  if (device_count > 1 && shard_count > 1) {
-    // use baseline hash join to compute this case until resolving related hash join logic
-    // todd(yoonmin): relax this after fixing related hashtable build/probe logic is fixed
-    throw TooManyHashEntries(
-        "Use baseline hash join: multiple GPUs process the input sharded table via "
-        "perfect hash can cause a wrong result");
-  }
-
   if (qual_bin_oper->get_optype() == kBW_EQ &&
       col_range.getIntMax() >= std::numeric_limits<int64_t>::max()) {
     throw HashJoinFail("Cannot translate null value for kBW_EQ");
diff --git a/QueryEngine/PlanState.cpp b/QueryEngine/PlanState.cpp
index 56d0ec6b48..aaf15b8956 100644
--- a/QueryEngine/PlanState.cpp
+++ b/QueryEngine/PlanState.cpp
@@ -67,17 +67,45 @@ void PlanState::allocateLocalColumnIds(
 
 int PlanState::getLocalColumnId(const Analyzer::ColumnVar* col_var,
                                 const bool fetch_column) {
+  // Previously, we considered the `rte_idx` of `col_var` together with its
+  // column key when identifying columns in `global_to_local_col_ids_`.
+  // However, the same column can have multiple `rte_idx`s.
+  // For instance, the same geometry column may be used not only as an input
+  // column of a geo join op, but also as an input column of a filter
+  // predicate. In such a case, the geometry column has two `rte_idx`s: one
+  // defined by the filter predicate and the other determined by the geo join
+  // operator. The previous logic could not cover this case because it allowed
+  // only one `rte_idx` per column, but it is safe to share an `rte_idx` among
+  // all uses of the same column.
   CHECK(col_var);
   const auto& global_col_key = col_var->getColumnKey();
-  const int scan_idx = col_var->get_rte_idx();
-  InputColDescriptor scan_col_desc(
-      global_col_key.column_id, global_col_key.table_id, global_col_key.db_id, scan_idx);
+  InputColDescriptor scan_col_desc(global_col_key.column_id,
+                                   global_col_key.table_id,
+                                   global_col_key.db_id,
+                                   col_var->get_rte_idx());
+  std::optional<int> col_id{std::nullopt};
+  // first, try to find a col_id while taking `rte_idx` into account
   const auto it = global_to_local_col_ids_.find(scan_col_desc);
-  CHECK(it != global_to_local_col_ids_.end()) << "Expected to find " << scan_col_desc;
-  if (fetch_column) {
-    columns_to_fetch_.insert(global_col_key);
+  if (it != global_to_local_col_ids_.end()) {
+    // we have a valid col_id
+    col_id = it->second;
+  } else {
+    // otherwise, try to find a col_id registered for the same column (but
+    // with a different `rte_idx`) so that it can be shared with `col_var`
+    for (auto const& kv : global_to_local_col_ids_) {
+      if (kv.first.getColumnKey() == global_col_key) {
+        col_id = kv.second;
+        break;
+      }
+    }
+  }
+  if (col_id && *col_id >= 0) {
+    if (fetch_column) {
+      columns_to_fetch_.insert(global_col_key);
+    }
+    return *col_id;
   }
-  return it->second;
+  CHECK(false) << "Expected to find " << global_col_key;
+  return {};
 }
 
 void PlanState::addNonHashtableQualForLeftJoin(size_t idx,
diff --git a/QueryEngine/ResultSetIteration.cpp
b/QueryEngine/ResultSetIteration.cpp index 909396c0a7..6ba47194fd 100644 --- a/QueryEngine/ResultSetIteration.cpp +++ b/QueryEngine/ResultSetIteration.cpp @@ -1875,6 +1875,113 @@ TargetValue ResultSet::makeGeoTargetValue(const int8_t* geo_target_ptr, varlen_buffer[getCoordsDataPtr(geo_target_ptr) + 2].size())); } else if (col_lazy_fetch && col_lazy_fetch->is_lazily_fetched) { const auto& frag_col_buffers = getFragColBuffers(); + auto ptr = frag_col_buffers[col_lazy_fetch->local_col_id]; + if (FlatBufferManager::isFlatBuffer(ptr)) { + FlatBufferManager m{const_cast(ptr)}; + const SQLTypeInfoLite* ti_lite = + reinterpret_cast(m.get_user_data_buffer()); + if (ti_lite->compression == SQLTypeInfoLite::GEOINT) { + CHECK_EQ(target_info.sql_type.get_compression(), kENCODING_GEOINT); + } else { + CHECK_EQ(target_info.sql_type.get_compression(), kENCODING_NONE); + } + int64_t index = getCoordsDataPtr(geo_target_ptr); + bool is_null; + std::vector coords; + std::vector rings; + std::vector poly_rings; + int8_t* points; + int32_t nof_points; + int8_t* ring_sizes; + int32_t nof_rings; + int8_t* polygon_sizes; + int32_t nof_polygons; + if (ResultSet::GeoReturnType::WktString == geo_return_type_) { + if (ti_lite->compression == SQLTypeInfoLite::GEOINT) { + std::vector ccoords; + m.getItem(index, ccoords, rings, poly_rings, is_null); + coords = *decompress_coords( + target_info.sql_type, + reinterpret_cast(ccoords.data()), + ccoords.size() * sizeof(int32_t)); + } else { + m.getItem(index, coords, rings, poly_rings, is_null); + } + if (is_null) { + return NullableString("NULL"); + } + Geospatial::GeoMultiPolygon mpoly(coords, rings, poly_rings); + return NullableString(mpoly.getWktString()); + } else if (ResultSet::GeoReturnType::GeoTargetValuePtr == geo_return_type_) { + m.getItem(index, + nof_points, + points, + nof_rings, + ring_sizes, + nof_polygons, + polygon_sizes, + is_null); + if (is_null) { + return GeoMultiPolyTargetValuePtr(); + } + auto coords = std::make_shared( + nof_points * m.getValueSize(), points, false); + auto rings = + std::make_shared(nof_rings * sizeof(int32_t), + reinterpret_cast(ring_sizes), + false); + auto poly_rings = + std::make_shared(nof_polygons * sizeof(int32_t), + reinterpret_cast(polygon_sizes), + false); + return GeoMultiPolyTargetValuePtr( + {std::move(coords), std::move(rings), std::move(poly_rings)}); + } else if (ResultSet::GeoReturnType::GeoTargetValue == geo_return_type_) { + if (ti_lite->compression == SQLTypeInfoLite::GEOINT) { + std::vector ccoords; + m.getItem(index, ccoords, rings, poly_rings, is_null); + if (is_null) { + return GeoTargetValue(); + } + coords = *decompress_coords( + target_info.sql_type, + reinterpret_cast(ccoords.data()), + ccoords.size() * sizeof(int32_t)); + } else { + m.getItem(index, coords, rings, poly_rings, is_null); + if (is_null) { + return GeoTargetValue(); + } + } + return GeoTargetValue(GeoMultiPolyTargetValue(coords, rings, poly_rings)); + } else if (ResultSet::GeoReturnType::GeoTargetValueGpuPtr == geo_return_type_) { + m.getItem(index, + nof_points, + points, + nof_rings, + ring_sizes, + nof_polygons, + polygon_sizes, + is_null); + if (is_null) { + return GeoMultiPolyTargetValuePtr(); + } + auto coords = std::make_shared( + nof_points * m.getValueSize(), points, false); + auto rings = + std::make_shared(nof_rings * sizeof(int32_t), + reinterpret_cast(ring_sizes), + false); + auto poly_rings = + std::make_shared(nof_polygons * sizeof(int32_t), + reinterpret_cast(polygon_sizes), + false); + return GeoMultiPolyTargetValuePtr( + 
{std::move(coords), std::move(rings), std::move(poly_rings)}); + } else { + UNREACHABLE(); + } + } return GeoTargetValueBuilder::build( target_info.sql_type, diff --git a/QueryEngine/TableFunctions/TableFunctionManager.h b/QueryEngine/TableFunctions/TableFunctionManager.h index be4116f46c..47fe1bc314 100644 --- a/QueryEngine/TableFunctions/TableFunctionManager.h +++ b/QueryEngine/TableFunctions/TableFunctionManager.h @@ -147,7 +147,8 @@ struct TableFunctionManager { switch (ti.get_type()) { case kARRAY: case kLINESTRING: - case kPOLYGON: { + case kPOLYGON: + case kMULTIPOLYGON: { if (output_item_values_total_number_[i] == -1) { throw std::runtime_error("set_output_item_values_total_number(" + std::to_string(i) + @@ -221,7 +222,8 @@ struct TableFunctionManager { switch (ti.get_type()) { case kARRAY: case kLINESTRING: - case kPOLYGON: { + case kPOLYGON: + case kMULTIPOLYGON: { total_number = output_item_values_total_number_[i]; break; } @@ -232,6 +234,10 @@ struct TableFunctionManager { << ti.toString(); } initializeFlatBuffer(m, output_num_rows_, total_number, ti); + CHECK(FlatBufferManager::isFlatBuffer(output_buffers_ptr)); + // Checks if the implementations of getFlatBufferSize and + // initializeFlatBuffer in sqltypes.h are in sync: + CHECK_EQ(m.getBufferSize(), query_mem_desc.getFlatBufferSize(i)); output_buffers_ptr = align_to_int64(output_buffers_ptr + m.getBufferSize()); } else { const size_t col_width = ti.get_size(); diff --git a/QueryEngine/TableFunctions/TestFunctions/GeoTestTableFunctions.cpp b/QueryEngine/TableFunctions/TestFunctions/GeoTestTableFunctions.cpp index e360745a35..29b9088fa4 100644 --- a/QueryEngine/TableFunctions/TestFunctions/GeoTestTableFunctions.cpp +++ b/QueryEngine/TableFunctions/TestFunctions/GeoTestTableFunctions.cpp @@ -127,10 +127,13 @@ EXTENSION_NOINLINE int32_t ct_make_polygon3__cpu_(TableFunctionManager& mgr, mgr.set_output_item_values_total_number( 0, rings.getNofValues() + holes1.getNofValues() + holes2.getNofValues()); mgr.set_output_row_size(size); + // Initialize polygons + int count_nulls = 0; for (int64_t i = 0; i < size; i++) { if (rings.isNull(i)) { polygons.setNull(i); sizes.setNull(i); + count_nulls++; } else { std::vector> polygon_coords; @@ -145,12 +148,124 @@ EXTENSION_NOINLINE int32_t ct_make_polygon3__cpu_(TableFunctionManager& mgr, return mgr.ERROR_MESSAGE("fromCoords failed: " + ::toString(status)); } int nofpoints = 0; - for (int j = 0; j < polygon.size(); j++) { + for (size_t j = 0; j < polygon.size(); j++) { nofpoints += polygon.size(j); } sizes[i] = nofpoints; } } + + // Check polygons content + if (count_nulls == 0) { + return mgr.ERROR_MESSAGE("counting null test failed: count_nulls=" + + ::toString(count_nulls) + ", expected non-zero."); + } + + for (int64_t i = 0; i < size; i++) { + if (polygons.isNull(i)) { + count_nulls--; + } else { + std::vector> polygon_coords; + polygon_coords.push_back(rings[i].toCoords()); + if (!holes1.isNull(i)) { + polygon_coords.push_back(holes1[i].toCoords()); + } + if (!holes2.isNull(i)) { + polygon_coords.push_back(holes2[i].toCoords()); + } + + // polygons[i] is Geo::Polygon instances + // polygons[i][j] is Geo::LineString instances + // polygons[i][j][k] is Geo::Point2D instances + + auto nof_lines = polygons[i].size(); + + if (nof_lines != polygon_coords.size()) { + return mgr.ERROR_MESSAGE( + "polygon size test failed: nof_lines=" + ::toString(nof_lines) + + ", expected " + ::toString(polygon_coords.size()) + "."); + } + std::vector> poly_coords = polygons[i].toCoords(); + if (nof_lines != 
poly_coords.size()) {
+        return mgr.ERROR_MESSAGE(
+            "polygon toCoords size test failed: poly_coords.size()=" +
+            ::toString(poly_coords.size()) + ", expected " + ::toString(nof_lines) + ".");
+      }
+
+      auto poly = polygons[i];
+
+      for (size_t j = 0; j < poly.size(); j++) {
+        Geo::LineString line = poly[j];
+        std::vector<double> line_coords = line.toCoords();
+        auto nof_points = polygon_coords[j].size() / 2;
+        if (poly.size(j) != nof_points) {
+          return mgr.ERROR_MESSAGE("polygon linestring size test failed: poly.size(" +
+                                   ::toString(j) + ")=" + ::toString(poly.size(j)) +
+                                   ", expected " + ::toString(nof_points) + ".");
+        }
+        if (line.size() != nof_points) {
+          return mgr.ERROR_MESSAGE("polygon linestring size test failed: line.size()=" +
+                                   ::toString(line.size()) + ", expected " +
+                                   ::toString(nof_points) + ".");
+        }
+        if (poly_coords[j].size() != nof_points * 2) {
+          return mgr.ERROR_MESSAGE(
+              "polygon linestring coords size test failed: poly_coords[j].size()=" +
+              ::toString(poly_coords[j].size()) + ", expected " +
+              ::toString(nof_points * 2) + ".");
+        }
+        if (line_coords.size() != nof_points * 2) {
+          return mgr.ERROR_MESSAGE(
+              "polygon linestring coords size test failed: line_coords.size()=" +
+              ::toString(line_coords.size()) + ", expected " +
+              ::toString(nof_points * 2) + ".");
+        }
+        for (size_t k = 0; k < nof_points; k++) {
+          if (std::abs(polygon_coords[j][2 * k] - line_coords[2 * k]) > 1e-7) {
+            return mgr.ERROR_MESSAGE(
+                "polygon linestring X coord test failed: line_coords[2*k]=" +
+                ::toString(line_coords[2 * k]) + ", expected " +
+                ::toString(polygon_coords[j][2 * k]) + ".");
+          }
+          if (std::abs(polygon_coords[j][2 * k] - poly_coords[j][2 * k]) > 1e-7) {
+            return mgr.ERROR_MESSAGE("polygon X coord test failed: poly_coords[j][2*k]=" +
+                                     ::toString(poly_coords[j][2 * k]) + ", expected " +
+                                     ::toString(polygon_coords[j][2 * k]) + ".");
+          }
+          if (std::abs(polygon_coords[j][2 * k] - line[k].x) > 1e-7) {
+            return mgr.ERROR_MESSAGE(
+                "polygon linestring X coord test failed: line[k].x=" +
+                ::toString(line[k].x) + ", expected " +
+                ::toString(polygon_coords[j][2 * k]) + ".");
+          }
+          if (std::abs(polygon_coords[j][2 * k + 1] - line_coords[2 * k + 1]) > 1e-7) {
+            return mgr.ERROR_MESSAGE(
+                "polygon linestring Y coord test failed: line_coords[2*k+1]=" +
+                ::toString(line_coords[2 * k + 1]) + ", expected " +
+                ::toString(polygon_coords[j][2 * k + 1]) + ".");
+          }
+          if (std::abs(polygon_coords[j][2 * k + 1] - poly_coords[j][2 * k + 1]) > 1e-7) {
+            return mgr.ERROR_MESSAGE(
+                "polygon Y coord test failed: poly_coords[j][2*k+1]=" +
+                ::toString(poly_coords[j][2 * k + 1]) + ", expected " +
+                ::toString(polygon_coords[j][2 * k + 1]) + ".");
+          }
+          if (std::abs(polygon_coords[j][2 * k + 1] - line[k].y) > 1e-7) {
+            return mgr.ERROR_MESSAGE(
+                "polygon linestring Y coord test failed: line[k].y=" +
+                ::toString(line[k].y) + ", expected " +
+                ::toString(polygon_coords[j][2 * k + 1]) + ".");
+          }
+        }
+      }
+    }
+  }
+
+  if (count_nulls != 0) {
+    return mgr.ERROR_MESSAGE("counting null test failed: count_nulls=" +
+                             ::toString(count_nulls) + ", expected 0.");
+  }
+
   return size;
 }
 
@@ -175,4 +290,175 @@ EXTENSION_NOINLINE int32_t ct_make_linestring2__cpu_(TableFunctionManager& mgr,
   return size;
 }
 
-#endif  // #ifndef __CUDACC__
\ No newline at end of file
+EXTENSION_NOINLINE int32_t
+ct_make_multipolygon__cpu_(TableFunctionManager& mgr,
+                           const Column<GeoPolygon>& polygons,
+                           Column<GeoMultiPolygon>& mpolygons) {
+  auto size = polygons.size();
+  mgr.set_output_item_values_total_number(0, polygons.getNofValues());
+  mgr.set_output_row_size(size);
+
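+  // What follows mirrors the write-then-verify structure of
+  // ct_make_polygon3__cpu_ above: a first pass writes one output row per
+  // input row, tracking NULL inputs in count_nulls; a second pass reads
+  // every row back and verifies the round-tripped sizes and coordinates
+  // (to within 1e-7) and that exactly count_nulls rows come back as NULL.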
// Initialize mpolygons + int count_nulls = 0; + for (int64_t i = 0; i < size; i++) { + if (polygons.isNull(i)) { + mpolygons.setNull(i); + count_nulls++; + } else { + std::vector>> mpolygon_coords; + mpolygon_coords.reserve(1); + std::vector> polygon_coords = polygons[i].toCoords(); + mpolygon_coords.push_back(polygon_coords); + auto status = mpolygons[i].fromCoords(mpolygon_coords); + if (status != FlatBufferManager::Status::Success) { + return mgr.ERROR_MESSAGE("fromCoords failed: " + ::toString(status)); + } + } + } + + // Check mpolygons content + if (count_nulls == 0) { + return mgr.ERROR_MESSAGE("counting null test failed: count_nulls=" + + ::toString(count_nulls) + ", expected non-zero."); + } + + for (int64_t i = 0; i < size; i++) { + if (mpolygons.isNull(i)) { + count_nulls--; + } else { + std::vector> polygon_coords = polygons[i].toCoords(); + + // mpolygons[i] is Geo::MultiPolygon instances + // mpolygons[i][j] is Geo::Polygon instances + // mpolygons[i][j][k] is Geo::LineString instances + // mpolygons[i][j][k][l] is Geo::Point2D instances + + auto nof_polygons = mpolygons[i].size(); + if (nof_polygons != 1) { + return mgr.ERROR_MESSAGE("multipolygon size test failed: nof_polygons=" + + ::toString(nof_polygons) + ", expected 1."); + } + + std::vector>> mpolygon_coords = + mpolygons[i].toCoords(); + if (nof_polygons != mpolygon_coords.size()) { + return mgr.ERROR_MESSAGE( + "multipolygon toCoords size test failed: mpolygon_coords.size()=" + + ::toString(mpolygon_coords.size()) + ", expected " + + ::toString(nof_polygons) + "."); + } + + Geo::Polygon poly = mpolygons[i][0]; + std::vector> poly_coords = mpolygon_coords[0]; + if (poly.size() != polygon_coords.size()) { + return mgr.ERROR_MESSAGE("multipolygon polygon size test failed: poly.size()=" + + ::toString(poly.size()) + ", expected " + + ::toString(polygon_coords.size()) + "."); + } + + if (poly_coords.size() != polygon_coords.size()) { + return mgr.ERROR_MESSAGE( + "multipolygon polygon coords size test failed: poly_coords.size()=" + + ::toString(poly_coords.size()) + ", expected " + + ::toString(polygon_coords.size()) + "."); + } + + for (size_t j = 0; j < poly.size(); j++) { + Geo::LineString line = poly[j]; + std::vector line_coords = line.toCoords(); + auto nof_points = polygon_coords[j].size() / 2; + if (poly.size(j) != nof_points) { + return mgr.ERROR_MESSAGE( + "multipolygon polygon linestring size test failed: poly.size(" + + ::toString(j) + ")=" + ::toString(poly.size(j)) + ", expected " + + ::toString(nof_points) + "."); + } + if (line.size() != nof_points) { + return mgr.ERROR_MESSAGE( + "multipolygon polygon linestring size test failed: line.size()=" + + ::toString(line.size()) + ", expected " + ::toString(nof_points) + "."); + } + if (poly_coords[j].size() != nof_points * 2) { + return mgr.ERROR_MESSAGE( + "multipolygon polygon linestring coords size test failed: " + "poly_coords[j].size()=" + + ::toString(poly_coords[j].size()) + ", expected " + + ::toString(nof_points * 2) + "."); + } + if (line_coords.size() != nof_points * 2) { + return mgr.ERROR_MESSAGE( + "multipolygon polygon linestring coords size test failed: " + "line_coords.size()=" + + ::toString(line_coords.size()) + ", expected " + + ::toString(nof_points * 2) + "."); + } + + for (size_t k = 0; k < nof_points; k++) { + if (std::abs(polygon_coords[j][2 * k] - line_coords[2 * k]) > 1e-7) { + return mgr.ERROR_MESSAGE( + "multipolygon polygon linestring X coord test failed: line_coords[2*k]=" + + ::toString(line_coords[2 * k]) + ", expected " + + 
::toString(polygon_coords[j][2 * k]) + ".");
+          }
+          if (std::abs(polygon_coords[j][2 * k] - poly_coords[j][2 * k]) > 1e-7) {
+            return mgr.ERROR_MESSAGE(
+                "multipolygon polygon X coord test failed: poly_coords[j][2*k]=" +
+                ::toString(poly_coords[j][2 * k]) + ", expected " +
+                ::toString(polygon_coords[j][2 * k]) + ".");
+          }
+          if (std::abs(polygon_coords[j][2 * k] - line[k].x) > 1e-7) {
+            return mgr.ERROR_MESSAGE(
+                "multipolygon polygon linestring X coord test failed: line[k].x=" +
+                ::toString(line[k].x) + ", expected " +
+                ::toString(polygon_coords[j][2 * k]) + ".");
+          }
+          if (std::abs(polygon_coords[j][2 * k + 1] - line_coords[2 * k + 1]) > 1e-7) {
+            return mgr.ERROR_MESSAGE(
+                "multipolygon polygon linestring Y coord test failed: "
+                "line_coords[2*k+1]=" +
+                ::toString(line_coords[2 * k + 1]) + ", expected " +
+                ::toString(polygon_coords[j][2 * k + 1]) + ".");
+          }
+          if (std::abs(polygon_coords[j][2 * k + 1] - poly_coords[j][2 * k + 1]) > 1e-7) {
+            return mgr.ERROR_MESSAGE(
+                "multipolygon polygon Y coord test failed: poly_coords[j][2*k+1]=" +
+                ::toString(poly_coords[j][2 * k + 1]) + ", expected " +
+                ::toString(polygon_coords[j][2 * k + 1]) + ".");
+          }
+          if (std::abs(polygon_coords[j][2 * k + 1] - line[k].y) > 1e-7) {
+            return mgr.ERROR_MESSAGE(
+                "multipolygon polygon linestring Y coord test failed: line[k].y=" +
+                ::toString(line[k].y) + ", expected " +
+                ::toString(polygon_coords[j][2 * k + 1]) + ".");
+          }
+        }
+      }
+    }
+  }
+
+  if (count_nulls != 0) {
+    return mgr.ERROR_MESSAGE("counting null test failed: count_nulls=" +
+                             ::toString(count_nulls) + ", expected 0.");
+  }
+
+  return size;
+}
+
+EXTENSION_NOINLINE int32_t ct_polygonn__cpu_(TableFunctionManager& mgr,
+                                             const Column<GeoMultiPolygon>& mpolygons,
+                                             int64_t n,
+                                             Column<GeoPolygon>& polygons) {
+  auto size = mpolygons.size();
+  mgr.set_output_item_values_total_number(0, mpolygons.getNofValues());
+  mgr.set_output_row_size(size);
+  for (int64_t i = 0; i < size; i++) {
+    if (mpolygons.isNull(i)) {
+      polygons.setNull(i);
+    } else {
+      polygons.setItem(i, mpolygons[i][n - 1]);
+    }
+  }
+  return size;
+}
+
+#endif  // #ifndef __CUDACC__
diff --git a/QueryEngine/TableFunctions/TestFunctions/TableFunctionsTesting.h b/QueryEngine/TableFunctions/TestFunctions/TableFunctionsTesting.h
index c92e2f0c52..f01b0e21e8 100644
--- a/QueryEngine/TableFunctions/TestFunctions/TableFunctionsTesting.h
+++ b/QueryEngine/TableFunctions/TestFunctions/TableFunctionsTesting.h
@@ -1317,6 +1317,8 @@ EXTENSION_NOINLINE int32_t ct_require_range__cpu_(const Column& input1,
   UDTF: ct_linestringn__cpu_(TableFunctionManager, Column<GeoPolygon> polygons, int64_t n) -> Column<GeoLineString> linestrings
   UDTF: ct_make_polygon3__cpu_(TableFunctionManager, Cursor<Column<GeoLineString> rings, Column<GeoLineString> holes1, Column<GeoLineString> holes2>) -> Column<GeoPolygon> polygons, Column<int32_t> sizes
   UDTF: ct_make_linestring2__cpu_(TableFunctionManager, Cursor<Column<double> x, Column<double> y>, double dx, double dy) -> Column<GeoLineString> linestrings
+  UDTF: ct_make_multipolygon__cpu_(TableFunctionManager, Column<GeoPolygon> polygons) -> Column<GeoMultiPolygon> mpolygons
+  UDTF: ct_polygonn__cpu_(TableFunctionManager, Column<GeoMultiPolygon> mpolygons, int64_t n) -> Column<GeoPolygon> polygons
 */
 // clang-format on
 
@@ -1360,6 +1362,15 @@ EXTENSION_NOINLINE int32_t ct_make_linestring2__cpu_(TableFunctionManager& mgr,
                                                      double dy,
                                                      Column<GeoLineString>& linestrings);
 
+EXTENSION_NOINLINE int32_t ct_make_multipolygon__cpu_(TableFunctionManager& mgr,
+                                                      const Column<GeoPolygon>& polygons,
+                                                      Column<GeoMultiPolygon>& mpolygons);
+
+EXTENSION_NOINLINE int32_t ct_polygonn__cpu_(TableFunctionManager& mgr,
+                                             const Column<GeoMultiPolygon>& mpolygons,
+                                             int64_t n,
+                                             Column<GeoPolygon>& polygons);
+
 #endif  // ifndef __CUDACC__
 
 // clang-format off
diff --git a/QueryEngine/TargetExprBuilder.cpp b/QueryEngine/TargetExprBuilder.cpp
index 60dbca2091..e4b956f890 100644
--- a/QueryEngine/TargetExprBuilder.cpp
+++ b/QueryEngine/TargetExprBuilder.cpp
@@ -134,7 +134,8 @@ void TargetExprCodegen::codegen(
   const bool varlen_projection = is_varlen_projection(target_expr, target_info.sql_type);
   /* TODO: find a better way to determine if target uses FlatBuffer storage or not. */
-  const bool uses_flatbuffer = target_info.sql_type.get_type() == kPOLYGON;
+  const bool uses_flatbuffer = (target_info.sql_type.get_type() == kPOLYGON ||
+                                target_info.sql_type.get_type() == kMULTIPOLYGON);
   const auto agg_fn_names = agg_fn_base_names(target_info, varlen_projection);
   const auto window_func = dynamic_cast<const Analyzer::WindowFunction*>(target_expr);
   WindowProjectNodeContext::resetWindowFunctionContext(executor);
diff --git a/QueryEngine/Utils/FlatBuffer.h b/QueryEngine/Utils/FlatBuffer.h
index c1130b5ae1..281cfce90e 100644
--- a/QueryEngine/Utils/FlatBuffer.h
+++ b/QueryEngine/Utils/FlatBuffer.h
@@ -50,7 +50,6 @@
 
   interpretation depends on the format parameters specified above.
   The size of the raw data buffer depends on the format id and the
  user-specified parameters in the data format metadata.
-  All buffers above are aligned to the 64-bit boundaries.
 
   In summary, the memory layout of a flatbuffer is:
@@ -170,6 +169,183 @@
   into rings and compressed_indices describes the partitioning of rings
   into polygons.
 
+  NestedArray format specification
+  --------------------------------
+
+  NestedArray represents a storage for zero, one, two, and three
+  dimensional ragged arrays. The storage format consists of sizes and
+  values buffers (plus offset buffers to optimize accessing
+  items). The sizes buffer stores the sizes of ragged arrays at
+  various levels and the values buffer stores the values of ragged
+  arrays.
+
+  The NestedArray storage is used as a uniform storage schema for
+  different types (variable-length arrays, geotypes, etc) with
+  variable dimensionality. For example, a GeoMultiPolygon
+
+    GeoMultiPolygon([
+        GeoPolygon([LineString([(x000, y000), (x001, y001), ...]),
+                    LineString([(x010, y010), (x011, y011), ...]),
+                    ...]),
+        GeoPolygon([LineString([(x100, y100), (x101, y101), ...]),
+                    LineString([(x110, y110), (x111, y111), ...]),
+                    ...]),
+        ...
+    ])
+
+  is represented as a three dimensional ragged array where the sizes
+  buffer contains the number of polygons in the multi-polygon, all the
+  numbers of linestrings in polygons, all the numbers of points in
+  linestrings, and finally, the values buffer contains all the
+  coordinates. Note that a "value" is defined as a point with two
+  coordinates.
+
+  The current implementation of NestedArray supports dimensionalities
+  up to 3 but the format can be extended to arbitrary dimensions.
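+
+  As an illustration of this decomposition (the concrete coordinates
+  below are made up for the example), the two-polygon multi-polygon
+
+    MULTIPOLYGON (((0 0,4 0,4 4),(1 1,2 1,2 2)),((10 10,14 10,14 14)))
+
+  would be stored with
+
+    sizes buffer:   2        -- number of polygons
+                    2, 1     -- numbers of linestrings per polygon
+                    3, 3, 3  -- numbers of points per linestring
+    values buffer:  (0,0) (4,0) (4,4) (1,1) (2,1) (2,2)
+                    (10,10) (14,10) (14,14)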
+
+  NestedArray API
+  ---------------
+
+  To compute the flatbuffer size required to represent a nested array with
+  the given dimensionality, total items count, estimated total
+  sizes and values counts, value type, and user data buffer size,
+  use::
+
+    int64_t compute_flatbuffer_size(dimensions,
+                                    total_items_count,
+                                    total_sizes_count,
+                                    total_values_count,
+                                    value_type,
+                                    user_data_size)
+
+  To initialize the provided buffer for nested array format, use::
+
+    Status .initialize(dimensions,
+                       total_items_count,
+                       total_sizes_count,
+                       total_values_count,
+                       value_type,
+                       null_value_ptr,
+                       user_data_ptr, user_data_size)
+
+  To test if the provided buffer contains an initialized FlatBuffer::
+
+    bool isFlatBuffer(buffer)
+
+  To get the size of an initialized FlatBuffer::
+
+    int64_t getBufferSize(buffer)
+    int64_t .getBufferSize()
+
+  To get the size of the values buffer::
+
+    size_t .getValuesBufferSize()
+
+  To get the size of a value::
+
+    size_t .getValueSize()
+
+  To get the number of specified values::
+
+    size_t .getValuesCount()
+
+  To get the dimensionality of a nested array::
+
+    size_t .getDimensions()
+
+  To get various buffers::
+
+    int8_t* .get_user_data_buffer()
+    int8_t* .get_values_buffer()
+    sizes_t* .get_sizes_buffer()
+    offsets_t* .get_values_offsets()
+    offsets_t* .get_sizes_offsets()
+    int8_t* .getNullValuePtr()
+
+  To test if the provided buffer contains a null value::
+
+    bool .containsNullValue()
+
+  To get the item and subitems of a nested array::
+
+    template <typename T>
+    Status .getItem(index,
+                    vector<T>& values,
+                    vector<sizes_t>& sizes,
+                    vector<sizes_t>& sizes_of_sizes,
+                    bool& is_null)          # ndims == 3
+
+    Status .getItem(index,
+                    subindex,
+                    vector<T>& values,
+                    vector<sizes_t>& sizes,
+                    bool& is_null)          # ndims == 3
+
+    Status .getItem(index,
+                    subindex,
+                    subsubindex,
+                    vector<T>& values,
+                    bool& is_null)          # ndims == 3
+
+    Status .getItem(index,
+                    int32_t& nof_values,
+                    int8_t*& values,
+                    int32_t& nof_sizes,
+                    int8_t*& sizes,
+                    int32_t& nof_sizes_of_sizes,
+                    int8_t*& sizes_of_sizes,
+                    bool& is_null)          # ndims == 3
+
+    Status .getItem(index,
+                    subindex,
+                    int32_t& nof_values,
+                    int8_t*& values,
+                    int32_t& nof_sizes,
+                    int8_t*& sizes,
+                    bool& is_null)          # ndims == 3
+
+    Status .getItem(index,
+                    subindex,
+                    subsubindex,
+                    int32_t& nof_values,
+                    int8_t*& values,
+                    int32_t& nof_sizes,
+                    int8_t*& sizes,
+                    bool& is_null)          # ndims == 3
+
+  To get the item or subitem lengths::
+
+    Status .getLength(index, size_t& length)                         # ndims == 3
+    Status .getLength(index, subindex, size_t& length)               # ndims == 3
+    Status .getLength(index, subindex, subsubindex, size_t& length)  # ndims == 3
+
+  To set an item of a nested array::
+
+    Status .setItem(index, vector<T>& arr)                  # ndims == 1
+
+    Status .setItem(index, vector<vector<T>>& arr)          # ndims == 2
+
+    template <typename T>
+    Status .setItem(index, vector<vector<vector<T>>>& arr)  # ndims == 3
+
+    template <typename T>
+    Status .setItem(index,
+                    vector<T>& values,
+                    vector<sizes_t>& sizes,
+                    vector<sizes_t>& sizes_of_sizes)        # ndims == 3
+
+    Status setItem(const int64_t index,
+                   int8_t* values_buf,
+                   size_t values_buf_size,
+                   int32_t* sizes_buf,
+                   int32_t nof_sizes,
+                   int32_t* sizes_of_sizes_buf,
+                   int32_t nof_sizes_of_sizes)              # ndims == 3
+
+  To test if an item is NULL::
+
+    Status isNull(index, bool& is_null)
+
   FlatBuffer usage
   ----------------
@@ -188,14 +364,30 @@
 */
 // clang-format on
 
+#ifdef FLATBUFFER_ERROR_ABORTS
+#include "../../Shared/toString.h"
+#define RETURN_ERROR(exc) \
+  {                       \
+    PRINT(exc);           \
+    abort();              \
+    return (exc);         \
+  }
+#else
+#define RETURN_ERROR(exc) return (exc)
+#endif
+
 #include
 
 #ifdef HAVE_TOSTRING
 #include
 #include
 #endif
 
+#include
 #include "../../Shared/funcannotations.h"
 
+#define FLATBUFFER_UNREACHABLE() \
+  { abort(); }
+
 // Notice that the format value is used to recognize if a memory
 // buffer uses some flat buffer format or not. To minimize chances for
 // false positive test results, use a non-trivial integer value when
@@ -207,7 +399,7 @@
 enum FlatBufferFormat {
   GeoPolygonFormatId = 0x67706f6c79676f6e,  // hex repr of 'gpolygon'
   // GeoMultiPointFormatId = 0x47656f706f696e74,       // hex repr of 'Geopoint'
   // GeoMultiLineStringFormatId = 0x476c696e65737472,  // hex repr of 'Glinestr'
-  // GeoMultiPolygonFormatId = 0x47706f6c79676f6e,     // hex repr of 'Gpolygon'
+  NestedArrayFormatId = 0x6e65737465644152  // hex repr of 'nestedAR'
 };
 
 inline int64_t _align_to_int64(int64_t addr) {
@@ -216,18 +408,80 @@ inline int64_t _align_to_int64(int64_t addr) {
 }
 
 struct FlatBufferManager {
+  enum ValueType {
+    Bool8,
+    Int8,
+    Int16,
+    Int32,
+    Int64,
+    UInt8,
+    UInt16,
+    UInt32,
+    UInt64,
+    Float32,
+    Float64,
+    PointInt32,
+    PointFloat64
+  };
+
+#ifdef HAVE_TOSTRING
+  static std::string toString(const ValueType& type);
+#endif
+
+  static size_t get_size(ValueType type) {
+    switch (type) {
+      case Bool8:
+      case Int8:
+      case UInt8:
+        return 1;
+      case Int16:
+      case UInt16:
+        return 2;
+      case Int32:
+      case UInt32:
+      case Float32:
+        return 4;
+      case Int64:
+      case UInt64:
+      case Float64:
+      case PointInt32:
+        return 8;
+      case PointFloat64:
+        return 16;
+    }
+    FLATBUFFER_UNREACHABLE();
+    return 0;
+  }
+
+  /*
+    sizes_t is the type of a container size. Here we use int32_t
+    because Geospatial uses it as the type for the vector of ring and
+    polygon sizes.
+
+    offsets_t is the type of offsets that is used to locate
+    sub-buffers within the FlatBuffer main buffer. Because NULL items
+    are encoded as negative offset values, the offsets type must be a
+    signed type. Hence, we define offsets_t as int64_t. (A NULL item is
+    marked by storing a negative entry in its values offset;
+    getValuesCount below, for example, decodes such an entry as
+    -(offset + 1).)
+ */ + + typedef int32_t sizes_t; + typedef int64_t offsets_t; + +#define FLATBUFFER_SIZES_T_VALUE_TYPE Int32 +#define FLATBUFFER_OFFSETS_T_VALUE_TYPE UInt64 + struct BaseWorker { - int64_t format_id; - int64_t flatbuffer_size; - int64_t format_metadata_offset; // the offset of the data format metadata buffer - int64_t format_worker_offset; // the offset of the data format worker buffer + FlatBufferFormat format_id; + offsets_t flatbuffer_size; + offsets_t format_metadata_offset; // the offset of the data format metadata buffer + offsets_t format_worker_offset; // the offset of the data format worker buffer #ifdef HAVE_TOSTRING std::string toString() const { std::string result = ::typeName(this) + "{"; result += "format_id=" + std::to_string(format_id); - result += ", flatbuffer_size=" + std::to_string(flatbuffer_size); - result += ", format_metadata_offset=" + std::to_string(format_metadata_offset); - result += ", format_worker_offset=" + std::to_string(format_worker_offset); + result += ",\n flatbuffer_size=" + std::to_string(flatbuffer_size); + result += ",\n format_metadata_offset=" + std::to_string(format_metadata_offset); + result += ",\n format_worker_offset=" + std::to_string(format_worker_offset); result += "}"; return result; } @@ -401,22 +655,80 @@ struct FlatBufferManager { #endif }; + struct NestedArrayWorker { + int64_t specified_items_count; + // all offsets are in bytes + offsets_t storage_indices_offset; + offsets_t sizes_offsets_offset; + offsets_t values_offsets_offset; + offsets_t sizes_buffer_offset; + offsets_t values_buffer_offset; + offsets_t user_data_buffer_offset; + size_t value_size; +#ifdef HAVE_TOSTRING + std::string toString() const { + std::string result = ::typeName(this) + "{"; + result += "specified_items_count=" + std::to_string(specified_items_count); + result += ",\n storage_indices_offset=" + std::to_string(storage_indices_offset); + result += ",\n sizes_offsets_offset=" + std::to_string(sizes_offsets_offset); + result += ",\n values_offsets_offset=" + std::to_string(values_offsets_offset); + result += ",\n sizes_buffer_offset=" + std::to_string(sizes_buffer_offset); + result += ",\n values_buffer_offset=" + std::to_string(values_buffer_offset); + result += + ",\n user_data_buffer_offset=" + std::to_string(user_data_buffer_offset); + result += ",\n value_size=" + std::to_string(value_size); + result += "}"; + return result; + } +#endif + }; + + struct NestedArray { + size_t dimensions; + int64_t total_items_count; + int64_t total_sizes_count; + int64_t total_values_count; + ValueType value_type; + size_t user_data_size; +#ifdef HAVE_TOSTRING + std::string toString() const { + std::string result = ::typeName(this) + "{"; + result += "dimensions=" + std::to_string(dimensions); + result += ",\n total_items_count=" + std::to_string(total_items_count); + result += ",\n total_sizes_count=" + std::to_string(total_sizes_count); + result += ",\n total_values_count=" + std::to_string(total_values_count); + result += ",\n value_type=" + FlatBufferManager::toString(value_type); + result += ",\n user_data_size=" + std::to_string(user_data_size); + result += "}"; + return result; + } +#endif + }; + enum Status { Success = 0, IndexError, SubIndexError, SizeError, + FlatbufferSizeError, ItemAlreadySpecifiedError, ItemUnspecifiedError, UnexpectedNullItemError, ValuesBufferTooSmallError, + SizesBufferTooSmallError, CompressedIndices2BufferTooSmallError, MemoryError, NotImplementedError, NotSupportedFormatError, + InvalidUserDataError, + DimensionalityError, + 
TypeError, + UserDataError, + InconsistentSizesError, UnknownFormatError }; + // FlatBuffer main buffer. It is the only member of the FlatBuffer struct. int8_t* buffer; // Check if a buffer contains FlatBuffer formatted data @@ -424,8 +736,9 @@ struct FlatBufferManager { if (buffer) { // warning: assume that buffer size is at least 8 bytes const auto* base = reinterpret_cast(buffer); - FlatBufferFormat header_format = static_cast(base->format_id); + FlatBufferFormat header_format = base->format_id; switch (header_format) { + case NestedArrayFormatId: case VarlenArrayFormatId: case GeoPointFormatId: case GeoLineStringFormatId: @@ -436,14 +749,17 @@ struct FlatBufferManager { ((int64_t*)buffer)[flatbuffer_size / sizeof(int64_t) - 1]); return footer_format == header_format; } - } break; - default:; + break; + } + default: + break; } } return false; } // Return the allocation size of the the FlatBuffer storage, in bytes + // TODO: return size_t value, 0 when not a flat buffer static int64_t getBufferSize(const void* buffer) { if (isFlatBuffer(buffer)) { return reinterpret_cast(buffer)->flatbuffer_size; @@ -453,17 +769,40 @@ struct FlatBufferManager { } // Return the allocation size of the the FlatBuffer storage, in bytes + // TODO: int64_t -> size_t inline int64_t getBufferSize() const { return reinterpret_cast(buffer)->flatbuffer_size; } + inline bool isNestedArray() const { return format() == NestedArrayFormatId; } + + inline size_t getValueSize() const { return getNestedArrayWorker()->value_size; } + + inline size_t getValuesBufferSize() const { + const auto* metadata = getNestedArrayMetadata(); + const auto* worker = getNestedArrayWorker(); + return worker->value_size * metadata->total_values_count; + } + + inline size_t getValuesCount() const { + const auto* worker = getNestedArrayWorker(); + const auto* values_offsets = get_values_offsets(); + const auto storage_index = worker->specified_items_count; + const auto values_offset = values_offsets[storage_index]; + if (values_offset < 0) { + return -(values_offset + 1); + } + return values_offset; + } + // Return the format of FlatBuffer HOST DEVICE inline FlatBufferFormat format() const { const auto* base = reinterpret_cast(buffer); - return static_cast(base->format_id); + return base->format_id; } // Return the number of items + // To be deprecated in favor of NestedArray format HOST DEVICE inline int64_t itemsCount() const { switch (format()) { case VarlenArrayFormatId: @@ -474,10 +813,15 @@ struct FlatBufferManager { return getGeoLineStringMetadata()->total_items_count; case GeoPolygonFormatId: return getGeoPolygonMetadata()->total_items_count; + case NestedArrayFormatId: + return getNestedArrayMetadata()->total_items_count; + default: + break; } return -1; // invalid value } + // To be deprecated in favor of NestedArray format HOST DEVICE inline int64_t items2Count() const { switch (format()) { case VarlenArrayFormatId: @@ -486,11 +830,33 @@ struct FlatBufferManager { break; case GeoPolygonFormatId: return getGeoPolygonMetadata()->max_nof_rings; + default: + break; } return -1; // invalid value } - HOST DEVICE inline int64_t dtypeSize() const { + // To be deprecated in favor of NestedArray format + HOST DEVICE inline int64_t valueByteSize() const { + switch (format()) { + case VarlenArrayFormatId: + return getVarlenArrayMetadata()->dtype_size; + case GeoPointFormatId: + return 2 * (getGeoPointMetadata()->is_geoint ? sizeof(int32_t) : sizeof(double)); + case GeoLineStringFormatId: + return 2 * + (getGeoLineStringMetadata()->is_geoint ? 
sizeof(int32_t) : sizeof(double)); + case GeoPolygonFormatId: + return 2 * + (getGeoPolygonMetadata()->is_geoint ? sizeof(int32_t) : sizeof(double)); + default: + break; + } + return -1; + } + + // To be deprecated in favor of NestedArray format + HOST DEVICE inline int64_t dtypeSize() const { // TODO: use valueByteSize instead switch (format()) { case VarlenArrayFormatId: return getVarlenArrayMetadata()->dtype_size; @@ -502,12 +868,15 @@ struct FlatBufferManager { case GeoPolygonFormatId: return 2 * (getGeoPolygonMetadata()->is_geoint ? sizeof(int32_t) : sizeof(double)); + default: + break; } return -1; } // VarlenArray support: + // To be deprecated in favor of NestedArray format static int64_t compute_flatbuffer_size(FlatBufferFormat format_id, const int8_t* format_metadata_ptr) { int64_t flatbuffer_size = _align_to_int64(sizeof(FlatBufferManager::BaseWorker)); @@ -578,7 +947,8 @@ struct FlatBufferManager { (format_metadata->total_items_count)); // storage_indices buffer size break; } - default:; + default: + FLATBUFFER_UNREACHABLE(); } flatbuffer_size += _align_to_int64(sizeof(int64_t)); // footer format id return flatbuffer_size; @@ -610,19 +980,175 @@ struct FlatBufferManager { return reinterpret_cast(buffer + base->format_metadata_offset); \ } + // To be deprecated in favor of NestedArray format FLATBUFFER_MANAGER_FORMAT_TOOLS(VarlenArray); FLATBUFFER_MANAGER_FORMAT_TOOLS(GeoPoint); FLATBUFFER_MANAGER_FORMAT_TOOLS(GeoLineString); FLATBUFFER_MANAGER_FORMAT_TOOLS(GeoPolygon); + /* + HOST DEVICE inline NestedArrayWorker* getNestedArrayWorker() { + auto* base = getBaseWorker(); + return reinterpret_cast(buffer + base->format_worker_offset); + } + HOST DEVICE inline const NestedArrayWorker* getNestedArrayWorker() const { + const auto* base = getBaseWorker(); + return reinterpret_cast(buffer + + base->format_worker_offset); + } + */ +#define FLATBUFFER_MANAGER_FORMAT_TOOLS_NEW(TYPENAME) \ + HOST DEVICE inline NestedArrayWorker* get##TYPENAME##Worker() { \ + auto* base = getBaseWorker(); \ + return reinterpret_cast(buffer + base->format_worker_offset); \ + } \ + HOST DEVICE inline TYPENAME* get##TYPENAME##Metadata() { \ + auto* base = getBaseWorker(); \ + return reinterpret_cast(buffer + base->format_metadata_offset); \ + } \ + HOST DEVICE inline const NestedArrayWorker* get##TYPENAME##Worker() const { \ + const auto* base = getBaseWorker(); \ + return reinterpret_cast(buffer + \ + base->format_worker_offset); \ + } \ + HOST DEVICE inline const TYPENAME* get##TYPENAME##Metadata() const { \ + const auto* base = getBaseWorker(); \ + return reinterpret_cast(buffer + base->format_metadata_offset); \ + } + + FLATBUFFER_MANAGER_FORMAT_TOOLS(NestedArray); #undef FLATBUFFER_MANAGER_FORMAT_TOOLS +#undef FLATBUFFER_MANAGER_FORMAT_TOOLS_NEW + +#define FLATBUFFER_MANAGER_SET_OFFSET(OBJ, NAME, SIZE) \ + offset = OBJ->NAME##_offset = offset + _align_to_int64(previous_size); \ + previous_size = SIZE; + + static int64_t compute_flatbuffer_size(int64_t dimensions, + int64_t total_items_count, + int64_t total_sizes_count, + int64_t total_values_count, + ValueType value_type, + size_t user_data_size) { + size_t value_size = get_size(value_type); + offsets_t flatbuffer_size = _align_to_int64(sizeof(FlatBufferManager::BaseWorker)); + flatbuffer_size += _align_to_int64(sizeof(NestedArray)); + flatbuffer_size += _align_to_int64(sizeof(NestedArrayWorker)); + flatbuffer_size += + _align_to_int64(value_size * (total_values_count + 1)); // values buffer + flatbuffer_size += + _align_to_int64(sizeof(sizes_t) * 
total_sizes_count); // sizes buffer + flatbuffer_size += + _align_to_int64(sizeof(offsets_t) * (total_items_count + 1)); // values offsets + flatbuffer_size += _align_to_int64( + sizeof(offsets_t) * (total_items_count * dimensions + 1)); // sizes offsets + flatbuffer_size += _align_to_int64( + sizeof(sizes_t) * total_items_count); // storage indices, must use signed type + flatbuffer_size += _align_to_int64(user_data_size); // user data + flatbuffer_size += _align_to_int64(sizeof(int64_t)); // format id + return flatbuffer_size; + } + + Status initialize(FlatBufferFormat format_id, // TODO: eliminate format_id or add it to + // compute_flatbuffer_size + int64_t dimensions, + int64_t total_items_count, + int64_t total_sizes_count, + int64_t total_values_count, + ValueType value_type, + const int8_t* null_value_ptr, + const int8_t* user_data_ptr, + size_t user_data_size) { + auto* base = getBaseWorker(); + base->format_id = format_id; + size_t value_size = get_size(value_type); + base->flatbuffer_size = compute_flatbuffer_size(dimensions, + total_items_count, + total_sizes_count, + total_values_count, + value_type, + user_data_size); + offsets_t offset = 0; + size_t previous_size = sizeof(FlatBufferManager::BaseWorker); + FLATBUFFER_MANAGER_SET_OFFSET(base, format_metadata, sizeof(NestedArray)); + FLATBUFFER_MANAGER_SET_OFFSET(base, format_worker, sizeof(NestedArrayWorker)); + + auto* metadata = getNestedArrayMetadata(); + metadata->dimensions = dimensions; + metadata->total_items_count = total_items_count; + metadata->total_sizes_count = total_sizes_count; + metadata->total_values_count = total_values_count; + metadata->value_type = value_type; + metadata->user_data_size = user_data_size; + + auto* worker = getNestedArrayWorker(); + worker->specified_items_count = 0; + worker->value_size = value_size; + + FLATBUFFER_MANAGER_SET_OFFSET( + worker, values_buffer, value_size * (total_values_count + 1)); + FLATBUFFER_MANAGER_SET_OFFSET( + worker, sizes_buffer, sizeof(sizes_t) * total_sizes_count); + FLATBUFFER_MANAGER_SET_OFFSET( + worker, values_offsets, sizeof(offsets_t) * (total_items_count + 1)); + FLATBUFFER_MANAGER_SET_OFFSET( + worker, sizes_offsets, sizeof(offsets_t) * (total_items_count * dimensions + 1)); + FLATBUFFER_MANAGER_SET_OFFSET( + worker, storage_indices, sizeof(sizes_t) * total_items_count); + FLATBUFFER_MANAGER_SET_OFFSET(worker, user_data_buffer, user_data_size); + + if (base->flatbuffer_size != + offset + _align_to_int64(previous_size) + _align_to_int64(sizeof(int64_t))) { + RETURN_ERROR(FlatbufferSizeError); + } + + offsets_t* values_offsets = get_values_offsets(); + offsets_t* sizes_offsets = get_sizes_offsets(); + values_offsets[0] = 0; + sizes_offsets[0] = 0; + sizes_t* storage_indices = get_storage_indices_new(); + for (int i = 0; i < total_items_count; i++) { + storage_indices[i] = -1; + } + + // the last value in values_buffer stores a null value: + int8_t* null_value_buffer = get_values_buffer() + value_size * total_values_count; + if (null_value_ptr != nullptr) { + if (memcpy(null_value_buffer, null_value_ptr, value_size) == nullptr) { + RETURN_ERROR(MemoryError); + } + } else { + if (memset(null_value_buffer, 0, value_size) == nullptr) { + RETURN_ERROR(MemoryError); + } + } + + if (user_data_size > 0 && user_data_ptr != nullptr) { + int8_t* user_data_buffer = get_user_data_buffer(); + if (memcpy(user_data_buffer, user_data_ptr, user_data_size) == nullptr) { + RETURN_ERROR(MemoryError); + } + } + + ((int64_t*)buffer)[base->flatbuffer_size / sizeof(int64_t) - 1] = 
+ static_cast(format_id); + + if (isFlatBuffer(buffer)) { + return Success; + } + RETURN_ERROR(UnknownFormatError); + } + // To be deprecated in favor of NestedArray format void initialize(FlatBufferFormat format_id, const int8_t* format_metadata_ptr) { auto* base = getBaseWorker(); base->format_id = format_id; base->flatbuffer_size = compute_flatbuffer_size(format_id, format_metadata_ptr); base->format_metadata_offset = _align_to_int64(sizeof(FlatBufferManager::BaseWorker)); switch (format_id) { + case NestedArrayFormatId: + FLATBUFFER_UNREACHABLE(); + break; case VarlenArrayFormatId: { base->format_worker_offset = base->format_metadata_offset + @@ -779,7 +1305,16 @@ struct FlatBufferManager { // Low-level API + inline size_t getDimensions() const { + if (isNestedArray()) { + return getNestedArrayMetadata()->dimensions; + } + FLATBUFFER_UNREACHABLE(); + return 0; + } + // Return the upper bound to the total number of points in all items + // To be deprecated in favor of NestedArray format inline int64_t get_max_nof_values() const { switch (format()) { case VarlenArrayFormatId: @@ -790,13 +1325,20 @@ struct FlatBufferManager { return getGeoLineStringMetadata()->max_nof_values; case GeoPolygonFormatId: return getGeoPolygonMetadata()->max_nof_values; + default: + break; } return -1; } // Return the total number of values in all specified items + // To be deprecated in favor of NestedArray format inline int64_t get_nof_values() const { switch (format()) { + case NestedArrayFormatId: { + FLATBUFFER_UNREACHABLE(); + break; + } case GeoPolygonFormatId: { const int64_t storage_count2 = get_storage_count2(); const int64_t* compressed_indices2 = get_compressed_indices2(); @@ -811,6 +1353,7 @@ struct FlatBufferManager { } // Return the number of specified items + // To be deprecated in favor of NestedArray format HOST DEVICE inline int64_t& get_storage_count() { switch (format()) { case VarlenArrayFormatId: @@ -821,11 +1364,14 @@ struct FlatBufferManager { return getGeoLineStringWorker()->items_count; case GeoPolygonFormatId: return getGeoPolygonWorker()->items_count; + default: + break; } static int64_t dummy_storage_count = -1; return dummy_storage_count; } + // To be deprecated in favor of NestedArray format inline const int64_t& get_storage_count() const { switch (format()) { case VarlenArrayFormatId: @@ -836,12 +1382,15 @@ struct FlatBufferManager { return getGeoLineStringWorker()->items_count; case GeoPolygonFormatId: return getGeoPolygonWorker()->items_count; + default: + break; } static int64_t dummy = -1; return dummy; } // Return the number of specified blocks + // To be deprecated in favor of NestedArray format HOST DEVICE inline int64_t& get_storage_count2() { switch (format()) { case GeoPolygonFormatId: @@ -849,12 +1398,14 @@ struct FlatBufferManager { case VarlenArrayFormatId: case GeoPointFormatId: case GeoLineStringFormatId: + default: break; } static int64_t dummy_storage_count = -1; return dummy_storage_count; } + // To be deprecated in favor of NestedArray format inline const int64_t& get_storage_count2() const { switch (format()) { case GeoPolygonFormatId: @@ -862,6 +1413,7 @@ struct FlatBufferManager { case VarlenArrayFormatId: case GeoPointFormatId: case GeoLineStringFormatId: + default: break; } static int64_t dummy_storage_count = -1; @@ -869,6 +1421,7 @@ struct FlatBufferManager { } // Return the size of values buffer in bytes + // To be deprecated in favor of NestedArray format inline int64_t get_values_buffer_size() const { switch (format()) { case VarlenArrayFormatId: { 
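// Hedged layout sketch (annotation, not part of the patch): the NestedArray
// flat buffer assembled by compute_flatbuffer_size()/initialize() above is,
// with every section 8-byte aligned:
//
//   [BaseWorker][NestedArray metadata][NestedArrayWorker]
//   [values buffer  : value_size * (total_values_count + 1)]   // extra slot = null value
//   [sizes buffer   : sizeof(sizes_t) * total_sizes_count]
//   [values offsets : sizeof(offsets_t) * (total_items_count + 1)]
//   [sizes offsets  : sizeof(offsets_t) * (total_items_count * dimensions + 1)]
//   [storage indices: sizeof(sizes_t) * total_items_count]     // signed; -1 = unspecified
//   [user data][footer: int64_t format id]
//
// initialize() re-derives the total from the per-section offsets and returns
// FlatbufferSizeError if that sum does not land exactly on flatbuffer_size.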
@@ -889,12 +1442,15 @@ struct FlatBufferManager { const auto* worker = getGeoPolygonWorker(); return worker->compressed_indices2_offset - worker->values_offset; } + default: + break; } static int64_t dummy = -1; return dummy; } // Return the size of compressed_indices2 buffer in bytes + // To be deprecated in favor of NestedArray format inline int64_t get_compressed_indices2_buffer_size() const { switch (format()) { case GeoPolygonFormatId: { @@ -904,6 +1460,7 @@ struct FlatBufferManager { case VarlenArrayFormatId: case GeoPointFormatId: case GeoLineStringFormatId: + default: break; } static int64_t dummy = -1; @@ -911,6 +1468,7 @@ struct FlatBufferManager { } // Return the pointer to values buffer + // To be deprecated in favor of NestedArray format HOST DEVICE inline int8_t* get_values() { int64_t offset = 0; switch (format()) { @@ -932,6 +1490,7 @@ struct FlatBufferManager { return buffer + offset; } + // To be deprecated in favor of NestedArray format inline const int8_t* get_values() const { int64_t offset = 0; switch (format()) { @@ -954,6 +1513,7 @@ struct FlatBufferManager { } // Return the pointer to counts2 buffer + // To be deprecated in favor of NestedArray format HOST DEVICE inline int32_t* get_counts2() { int64_t offset = 0; switch (format()) { @@ -966,6 +1526,7 @@ struct FlatBufferManager { return reinterpret_cast(buffer + offset); } + // To be deprecated in favor of NestedArray format inline const int32_t* get_counts2() const { int64_t offset = 0; switch (format()) { @@ -979,6 +1540,7 @@ struct FlatBufferManager { } // Return the pointer to compressed indices2 buffer + // To be deprecated in favor of NestedArray format HOST DEVICE inline int64_t* get_compressed_indices2() { int64_t offset = 0; switch (format()) { @@ -991,6 +1553,7 @@ struct FlatBufferManager { return reinterpret_cast(buffer + offset); } + // To be deprecated in favor of NestedArray format inline const int64_t* get_compressed_indices2() const { int64_t offset = 0; switch (format()) { @@ -1004,6 +1567,7 @@ struct FlatBufferManager { } // Return the pointer to compressed indices buffer + // To be deprecated in favor of NestedArray format HOST DEVICE inline int64_t* get_compressed_indices() { int64_t offset = 0; switch (format()) { @@ -1022,6 +1586,7 @@ struct FlatBufferManager { return reinterpret_cast(buffer + offset); } + // To be deprecated in favor of NestedArray format inline const int64_t* get_compressed_indices() const { int64_t offset = 0; switch (format()) { @@ -1040,7 +1605,73 @@ struct FlatBufferManager { return reinterpret_cast(buffer + offset); } - // Return the pointer to storage indices buffer +#define FLATBUFFER_GET_BUFFER_METHODS(BUFFERNAME, BUFFERTYPE) \ + HOST DEVICE inline BUFFERTYPE* get_##BUFFERNAME() { \ + int64_t offset = 0; \ + switch (format()) { \ + case NestedArrayFormatId: \ + offset = getNestedArrayWorker()->BUFFERNAME##_offset; \ + break; \ + default: \ + return nullptr; \ + } \ + return reinterpret_cast(buffer + offset); \ + } \ + inline const BUFFERTYPE* get_##BUFFERNAME() const { \ + int64_t offset = 0; \ + switch (format()) { \ + case NestedArrayFormatId: \ + offset = getNestedArrayWorker()->BUFFERNAME##_offset; \ + break; \ + default: \ + return nullptr; \ + } \ + return reinterpret_cast(buffer + offset); \ + } + + FLATBUFFER_GET_BUFFER_METHODS(user_data_buffer, int8_t); + FLATBUFFER_GET_BUFFER_METHODS(values_buffer, int8_t); + FLATBUFFER_GET_BUFFER_METHODS(sizes_buffer, sizes_t); + FLATBUFFER_GET_BUFFER_METHODS(values_offsets, offsets_t); + 
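// Illustrative expansion (shown only to document the macro; it is not emitted
// by the patch). For BUFFERNAME=values_buffer, BUFFERTYPE=int8_t,
// FLATBUFFER_GET_BUFFER_METHODS generates accessors equivalent to:
//
//   HOST DEVICE inline int8_t* get_values_buffer() {
//     int64_t offset = 0;
//     switch (format()) {
//       case NestedArrayFormatId:
//         offset = getNestedArrayWorker()->values_buffer_offset;
//         break;
//       default:
//         return nullptr;
//     }
//     return reinterpret_cast<int8_t*>(buffer + offset);
//   }
//
// plus a const twin: each buffer lives at a worker-recorded byte offset from
// the start of the flat buffer, and non-NestedArray formats report nullptr.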
FLATBUFFER_GET_BUFFER_METHODS(sizes_offsets, offsets_t); + +#undef FLATBUFFER_GET_BUFFER_METHODS + + inline const int8_t* getNullValuePtr() const { + if (isNestedArray()) { + return get_values_buffer() + getValuesBufferSize(); + } + return nullptr; + } + + inline bool containsNullValue(const int8_t* value_ptr) const { + const int8_t* null_value_ptr = getNullValuePtr(); + if (null_value_ptr != nullptr) { + switch (getValueSize()) { + case 1: + return *null_value_ptr == *value_ptr; + case 2: + return *reinterpret_cast(null_value_ptr) == + *reinterpret_cast(value_ptr); + case 4: + return *reinterpret_cast(null_value_ptr) == + *reinterpret_cast(value_ptr); + case 8: + return *reinterpret_cast(null_value_ptr) == + *reinterpret_cast(value_ptr); + case 16: + return (*reinterpret_cast(null_value_ptr) == + *reinterpret_cast(value_ptr) && + *(reinterpret_cast(null_value_ptr) + 1) == + *(reinterpret_cast(value_ptr) + 1)); + default: + break; + } + } + return false; + } + + // To be deprecated in favor of NestedArray format HOST DEVICE inline int64_t* get_storage_indices() { int64_t offset = 0; switch (format()) { @@ -1059,6 +1690,7 @@ struct FlatBufferManager { return reinterpret_cast(buffer + offset); } + // To be deprecated in favor of NestedArray format inline const int64_t* get_storage_indices() const { int64_t offset = 0; switch (format()) { @@ -1077,18 +1709,711 @@ struct FlatBufferManager { return reinterpret_cast(buffer + offset); } + // TODO: rename to get_storage_indices + HOST DEVICE inline sizes_t* get_storage_indices_new() { + offsets_t offset = 0; + switch (format()) { + case NestedArrayFormatId: + offset = getNestedArrayWorker()->storage_indices_offset; + break; + default: + return nullptr; + } + return reinterpret_cast(buffer + offset); + } + + inline const sizes_t* get_storage_indices_new() const { + offsets_t offset = 0; + switch (format()) { + case NestedArrayFormatId: + offset = getNestedArrayWorker()->storage_indices_offset; + break; + default: + return nullptr; + } + return reinterpret_cast(buffer + offset); + } + + Status getItemPrepare(const int64_t index, const size_t ndims) const { + if (index < 0 || index >= itemsCount()) { + RETURN_ERROR(IndexError); + } + if (format() != NestedArrayFormatId) { + RETURN_ERROR(NotSupportedFormatError); + } + if (getDimensions() != ndims) { + RETURN_ERROR(DimensionalityError); + } + return Success; + } + + inline sizes_t get_storage_index(const int64_t index) const { + return get_storage_indices_new()[index]; + } + // High-level API + Status getLength(const int64_t index, size_t& length) const { + const size_t ndims = getDimensions(); + Status status = getItemPrepare(index, ndims); + if (status != Success) { + return status; + } + const auto storage_index = get_storage_index(index); + if (ndims == 3) { + const auto* values_offsets = get_values_offsets(); + const auto values_offset = values_offsets[storage_index]; + if (values_offset < 0) { + length = 0; + return Success; + } + const auto* sizes_offsets = get_sizes_offsets(); + const auto* sizes_buffer = get_sizes_buffer(); + const auto sizes_offset = sizes_offsets[storage_index * ndims]; + length = sizes_buffer[sizes_offset]; + } else { + RETURN_ERROR(NotImplementedError); + } + + return Success; + } + + Status getLength(const int64_t index, const int64_t subindex, size_t& length) const { + const size_t ndims = getDimensions(); + Status status = getItemPrepare(index, ndims); + if (status != Success) { + return status; + } + const auto storage_index = get_storage_index(index); + if (ndims == 
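// Hedged sketch of the null-encoding convention used here (the helper names
// are invented for illustration): a null item stores -(values_offset + 1) in
// its values_offsets slot, so offset 0 stays representable and the sign bit
// carries the nullity:
//
//   inline offsets_t encode_null_offset(offsets_t values_offset) {
//     return -(values_offset + 1);
//   }
//   inline offsets_t decode_values_offset(offsets_t stored) {
//     return stored < 0 ? -(stored + 1) : stored;  // recover the real offset
//   }
//
// getValuesCount(), the getLength()/getItem() family and isNull() all branch
// on the sign of values_offsets[storage_index] following this convention.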
3) { + const auto* values_offsets = get_values_offsets(); + const auto values_offset = values_offsets[storage_index]; + if (values_offset < 0) { + RETURN_ERROR(IndexError); + } + const auto* sizes_offsets = get_sizes_offsets(); + const auto* sizes_buffer = get_sizes_buffer(); + const auto sizes2_offset = sizes_offsets[storage_index * ndims + 1]; + length = (sizes_buffer + sizes2_offset)[subindex]; + } else { + RETURN_ERROR(NotImplementedError); + } + return Success; + } + + Status getLength(const int64_t index, + const int64_t subindex, + const int64_t subsubindex, + size_t& length) const { + const size_t ndims = getDimensions(); + Status status = getItemPrepare(index, ndims); + if (status != Success) { + return status; + } + const auto storage_index = get_storage_index(index); + if (ndims == 3) { + const auto* values_offsets = get_values_offsets(); + const auto values_offset = values_offsets[storage_index]; + if (values_offset < 0) { + RETURN_ERROR(IndexError); + } + const auto* sizes_offsets = get_sizes_offsets(); + const auto* sizes_buffer = get_sizes_buffer(); + const auto sizes2_offset = sizes_offsets[storage_index * ndims + 1]; + const auto sizes3_offset = sizes_offsets[storage_index * ndims + 2]; + offsets_t soffset = 0; + for (int64_t i = 0; i < subindex; i++) { + soffset += (sizes_buffer + sizes2_offset)[i]; + } + length = (sizes_buffer + sizes3_offset + soffset)[subsubindex]; + } else { + RETURN_ERROR(NotImplementedError); + } + return Success; + } + + // TODO: parametrize sizes type + template + Status getItem(const int64_t index, + std::vector& values, + std::vector& sizes, + std::vector& sizes_of_sizes, + bool& is_null) { + if constexpr (!std::is_same::value) { + if constexpr (std::is_same::value) { + const auto* metadata = getNestedArrayMetadata(); + if (metadata->value_type != PointFloat64) { + RETURN_ERROR(TypeError); + } + } else if constexpr (std::is_same::value) { + const auto* metadata = getNestedArrayMetadata(); + if (metadata->value_type != PointInt32) { + RETURN_ERROR(TypeError); + } + } else { + RETURN_ERROR(NotImplementedError); + } + } + int32_t nof_values; + int8_t* values_ptr; + int32_t nof_sizes; + int8_t* sizes_ptr; + int32_t nof_sizes_of_sizes; + int8_t* sizes_of_sizes_ptr; + + Status status = getItem(index, + nof_values, + values_ptr, + nof_sizes, + sizes_ptr, + nof_sizes_of_sizes, + sizes_of_sizes_ptr, + is_null); + + if (status != Success) { + return status; + } + if (is_null) { + return Success; + } + const auto valuesize = getValueSize(); + const auto values_count = nof_values * valuesize / sizeof(CT); + values.reserve(values_count); + values.insert(values.end(), + reinterpret_cast(values_ptr), + reinterpret_cast(values_ptr) + values_count); + + sizes.reserve(nof_sizes); + sizes.insert(sizes.end(), + reinterpret_cast(sizes_ptr), + reinterpret_cast(sizes_ptr + nof_sizes * sizeof(sizes_t))); + + sizes_of_sizes.reserve(nof_sizes_of_sizes); + sizes_of_sizes.insert(sizes_of_sizes.end(), + reinterpret_cast(sizes_of_sizes_ptr), + reinterpret_cast( + sizes_of_sizes_ptr + nof_sizes_of_sizes * sizeof(sizes_t))); + + return Success; + } + + template + Status getItem(const int64_t index, + const int64_t subindex, + std::vector& values, + std::vector& sizes, + bool& is_null) { + if constexpr (!std::is_same::value) { + if constexpr (std::is_same::value) { + const auto* metadata = getNestedArrayMetadata(); + if (metadata->value_type != PointFloat64) { + RETURN_ERROR(TypeError); + } + } else if constexpr (std::is_same::value) { + const auto* metadata = 
getNestedArrayMetadata(); + if (metadata->value_type != PointInt32) { + RETURN_ERROR(TypeError); + } + } else { + RETURN_ERROR(NotImplementedError); + } + } + int32_t nof_values; + int8_t* values_ptr; + int32_t nof_sizes; + int8_t* sizes_ptr; + + Status status = + getItem(index, subindex, nof_values, values_ptr, nof_sizes, sizes_ptr, is_null); + + if (status != Success) { + return status; + } + if (is_null) { + return Success; + } + const auto valuesize = getValueSize(); + const auto values_count = nof_values * valuesize / sizeof(CT); + values.reserve(values_count); + values.insert(values.end(), + reinterpret_cast(values_ptr), + reinterpret_cast(values_ptr) + values_count); + + sizes.reserve(nof_sizes); + sizes.insert(sizes.end(), + reinterpret_cast(sizes_ptr), + reinterpret_cast(sizes_ptr + nof_sizes * sizeof(sizes_t))); + + return Success; + } + + template + Status getItem(const int64_t index, + const int64_t subindex, + const int64_t subsubindex, + std::vector& values, + bool& is_null) { + if constexpr (!std::is_same::value) { + if constexpr (std::is_same::value) { + const auto* metadata = getNestedArrayMetadata(); + if (metadata->value_type != PointFloat64) { + RETURN_ERROR(TypeError); + } + } else if constexpr (std::is_same::value) { + const auto* metadata = getNestedArrayMetadata(); + if (metadata->value_type != PointInt32) { + RETURN_ERROR(TypeError); + } + } else { + RETURN_ERROR(NotImplementedError); + } + } + int32_t nof_values; + int8_t* values_ptr; + + Status status = + getItem(index, subindex, subsubindex, nof_values, values_ptr, is_null); + if (status != Success) { + return status; + } + if (is_null) { + return Success; + } + const auto valuesize = getValueSize(); + const auto values_count = nof_values * valuesize / sizeof(CT); + values.reserve(values_count); + values.insert(values.end(), + reinterpret_cast(values_ptr), + reinterpret_cast(values_ptr) + values_count); + return Success; + } + + Status getItem(const int64_t index, + const int64_t subindex, + int32_t& nof_values, + int8_t*& values, + int32_t& nof_sizes, + int8_t*& sizes, + bool& is_null) { + const size_t ndims = 3; + Status status = getItemPrepare(index, ndims); + if (status != Success) { + return status; + } + const auto storage_index = get_storage_index(index); + auto* values_offsets = get_values_offsets(); + auto values_offset = values_offsets[storage_index]; + if (values_offset < 0) { + is_null = true; + nof_values = 0; + nof_sizes = 0; + values = nullptr; + sizes = nullptr; + return Success; + } + + const auto* sizes_offsets = get_sizes_offsets(); + auto* sizes_buffer = get_sizes_buffer(); + auto* values_buffer = get_values_buffer(); + const auto valuesize = getValueSize(); + const auto sizes2_offset = sizes_offsets[storage_index * ndims + 1]; + const auto sizes3_offset = sizes_offsets[storage_index * ndims + 2]; + + is_null = false; + offsets_t soffset = 0; + for (int64_t i = 0; i < subindex; i++) { + soffset += (sizes_buffer + sizes2_offset)[i]; + } + nof_sizes = (sizes_buffer + sizes2_offset)[subindex]; + values = values_buffer + (values_offset + soffset) * valuesize; + sizes = reinterpret_cast(sizes_buffer + sizes3_offset + soffset); + nof_values = 0; + for (int64_t i = 0; i < nof_sizes; i++) { + nof_values += sizes[i]; + } + return Success; + } + + Status getItem(const int64_t index, + const int64_t subindex, + const int64_t subsubindex, + int32_t& nof_values, + int8_t*& values, + bool& is_null) { + const size_t ndims = 3; + Status status = getItemPrepare(index, ndims); + if (status != Success) { + 
return status; + } + const auto storage_index = get_storage_index(index); + auto* values_offsets = get_values_offsets(); + auto values_offset = values_offsets[storage_index]; + if (values_offset < 0) { + is_null = true; + nof_values = 0; + values = nullptr; + return Success; + } + + const auto* sizes_offsets = get_sizes_offsets(); + auto* sizes_buffer = get_sizes_buffer(); + auto* values_buffer = get_values_buffer(); + const auto valuesize = getValueSize(); + const auto sizes2_offset = sizes_offsets[storage_index * ndims + 1]; + const auto sizes3_offset = sizes_offsets[storage_index * ndims + 2]; + + is_null = false; + offsets_t soffset = 0; + for (int64_t i = 0; i < subindex; i++) { + soffset += (sizes_buffer + sizes2_offset)[i]; + } + offsets_t soffset2 = 0; + for (int64_t i = 0; i < subsubindex; i++) { + soffset2 += (sizes_buffer + sizes3_offset + soffset)[i]; + } + values = values_buffer + (values_offset + soffset + soffset2) * valuesize; + nof_values = (sizes_buffer + sizes3_offset + soffset)[subsubindex]; + return Success; + } + + Status getItem(const int64_t index, + int32_t& nof_values, + int8_t*& values, + int32_t& nof_sizes, + int8_t*& sizes, + int32_t& nof_sizes_of_sizes, + int8_t*& sizes_of_sizes, + bool& is_null) { + const size_t ndims = 3; + Status status = getItemPrepare(index, ndims); + if (status != Success) { + return status; + } + const auto storage_index = get_storage_index(index); + + auto* values_offsets = get_values_offsets(); + auto values_offset = values_offsets[storage_index]; + if (values_offset < 0) { + is_null = true; + nof_values = 0; + nof_sizes = 0; + nof_sizes_of_sizes = 0; + values = nullptr; + sizes = nullptr; + sizes_of_sizes = nullptr; + return Success; + } + + const auto* sizes_offsets = get_sizes_offsets(); + auto* sizes_buffer = get_sizes_buffer(); + auto* values_buffer = get_values_buffer(); + const auto valuesize = getValueSize(); + const auto sizes_offset = sizes_offsets[storage_index * ndims]; + const auto sizes2_offset = sizes_offsets[storage_index * ndims + 1]; + const auto sizes3_offset = sizes_offsets[storage_index * ndims + 2]; + const auto next_values_offset = values_offsets[storage_index + 1]; + + is_null = false; + if (next_values_offset < 0) { + nof_values = -(next_values_offset + 1) - values_offset; + } else { + nof_values = next_values_offset - values_offset; + } + values = values_buffer + values_offset * valuesize; + + nof_sizes_of_sizes = sizes_buffer[sizes_offset]; + sizes_of_sizes = reinterpret_cast(sizes_buffer + sizes2_offset); + sizes = reinterpret_cast(sizes_buffer + sizes3_offset); + nof_sizes = 0; + for (int32_t i = 0; i < nof_sizes_of_sizes; i++) { + nof_sizes += (sizes_buffer + sizes2_offset)[i]; + } + return Success; + } + + Status setItemPrepare(const int64_t index, const size_t ndims) { + if (index < 0 || index >= itemsCount()) { + RETURN_ERROR(IndexError); + } + if (format() != NestedArrayFormatId) { + RETURN_ERROR(NotSupportedFormatError); + } + if (getDimensions() != ndims) { + RETURN_ERROR(DimensionalityError); + } + auto* storage_indices = get_storage_indices_new(); + if (storage_indices[index] >= 0) { + RETURN_ERROR(ItemAlreadySpecifiedError); + } + auto* worker = getNestedArrayWorker(); + auto storage_index = worker->specified_items_count; + storage_indices[index] = storage_index; + worker->specified_items_count++; + return Success; + } + + // TODO: rename to setNull + Status setNullNew(int64_t index) { + const size_t ndims = getDimensions(); + Status status = setItemPrepare(index, ndims); + if (status != Success) 
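// Hedged usage sketch (mgr, flatbuffer_ptr and the literal index are
// assumptions made for the example): reading one 3-dimensional item, e.g. a
// multipolygon-like value in a PointFloat64 NestedArray, via the vector-based
// getItem overload above:
//
//   FlatBufferManager mgr{flatbuffer_ptr};
//   std::vector<double> values;           // flattened x,y pairs
//   std::vector<int32_t> sizes;           // points per ring
//   std::vector<int32_t> sizes_of_sizes;  // rings per polygon
//   bool is_null = false;
//   if (mgr.getItem(0, values, sizes, sizes_of_sizes, is_null) ==
//           FlatBufferManager::Success &&
//       !is_null) {
//     // values.size() == 2 * sum(sizes); a CT/value_type mismatch would have
//     // returned TypeError before any copy.
//   }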
{ + RETURN_ERROR(status); + } + const auto storage_index = get_storage_index(index); + + auto* values_offsets = get_values_offsets(); + auto* sizes_offsets = get_sizes_offsets(); + + const auto values_offset = values_offsets[storage_index]; + const auto sizes_offset = sizes_offsets[storage_index * ndims]; + auto* sizes_buffer = get_sizes_buffer(); + + sizes_buffer[sizes_offset] = 0; + for (size_t i = 0; i < ndims; i++) { + sizes_offsets[storage_index * ndims + i + 1] = sizes_offset + 1; + } + values_offsets[storage_index] = -(values_offset + 1); + values_offsets[storage_index + 1] = values_offset; + return Success; + } + + Status setItem(const int64_t index, std::vector& arr) { + const size_t ndims = 1; + Status status = setItemPrepare(index, ndims); + if (status != Success) { + return status; + } + const auto storage_index = get_storage_index(index); + + auto* values_offsets = get_values_offsets(); + auto* sizes_offsets = get_sizes_offsets(); + auto* sizes_buffer = get_sizes_buffer(); + auto* values_buffer = get_values_buffer(); + const auto* metadata = getNestedArrayMetadata(); + const auto valuesize = getValueSize(); + + auto values_offset = values_offsets[storage_index]; + const auto sizes_offset = sizes_offsets[storage_index * ndims]; + + sizes_t sz = (arr.size() * sizeof(double)) / valuesize; + sizes_buffer[sizes_offset] = sz; + + if (values_offset + sz > metadata->total_values_count) { + RETURN_ERROR(ValuesBufferTooSmallError); + } + if (memcpy(values_buffer + values_offset * valuesize, arr.data(), sz * valuesize) == + nullptr) { + RETURN_ERROR(MemoryError); + } + values_offset += sz; + + sizes_offsets[storage_index * ndims + 1] = sizes_offset + 1; + values_offsets[storage_index + 1] = values_offset; + return Success; + } + + Status setItem(const int64_t index, const std::vector>& item) { + const size_t ndims = 2; + Status status = setItemPrepare(index, ndims); + if (status != Success) { + return status; + } + const auto storage_index = get_storage_index(index); + + auto* values_offsets = get_values_offsets(); + auto* sizes_offsets = get_sizes_offsets(); + auto* sizes_buffer = get_sizes_buffer(); + auto* values_buffer = get_values_buffer(); + const auto* metadata = getNestedArrayMetadata(); + const auto valuesize = getValueSize(); + const sizes_t size = item.size(); + + auto values_offset = values_offsets[storage_index]; + const auto sizes_offset = sizes_offsets[storage_index * ndims]; + const auto sizes2_offset = sizes_offset + 1; + if (sizes2_offset + size > metadata->total_sizes_count) { + RETURN_ERROR(SizesBufferTooSmallError); + } + sizes_buffer[sizes_offset] = size; + for (sizes_t i = 0; i < size; i++) { + std::vector arr = item[i]; + sizes_t sz = (arr.size() * sizeof(double)) / valuesize; + sizes_buffer[sizes2_offset + i] = sz; + if (values_offset + sz > metadata->total_values_count) { + RETURN_ERROR(ValuesBufferTooSmallError); + } + if (memcpy(values_buffer + values_offset * valuesize, arr.data(), sz * valuesize) == + nullptr) { + RETURN_ERROR(MemoryError); + } + values_offset += sz; + } + sizes_offsets[storage_index * ndims + 1] = sizes2_offset; + sizes_offsets[storage_index * ndims + 2] = sizes2_offset + size; + values_offsets[storage_index + 1] = values_offset; + return Success; + } + + template + Status setItem(const int64_t index, + const std::vector>>& item) { + const size_t ndims = 3; + Status status = setItemPrepare(index, ndims); + if (status != Success) { + return status; + } + const auto storage_index = get_storage_index(index); + + auto* values_offsets = 
get_values_offsets(); + auto* sizes_offsets = get_sizes_offsets(); + auto* sizes_buffer = get_sizes_buffer(); + auto* values_buffer = get_values_buffer(); + const auto* metadata = getNestedArrayMetadata(); + const auto valuesize = getValueSize(); + const sizes_t size = item.size(); + + auto values_offset = values_offsets[storage_index]; + const auto sizes_offset = sizes_offsets[storage_index * ndims]; + const auto sizes2_offset = sizes_offset + 1; + const auto sizes3_offset = sizes2_offset + size; + sizes_t i3 = 0; + sizes_buffer[sizes_offset] = size; + + for (sizes_t i = 0; i < size; i++) { + const std::vector>& item2 = item[i]; + sizes_t size2 = item2.size(); + if (sizes3_offset + i3 + size2 > metadata->total_sizes_count) { + RETURN_ERROR(SizesBufferTooSmallError); + } + sizes_buffer[sizes2_offset + i] = size2; + for (sizes_t j = 0; j < size2; j++) { + const std::vector& arr = item2[j]; + sizes_t sz = (arr.size() * sizeof(CT)) / valuesize; + sizes_buffer[sizes3_offset + i3] = sz; + i3 += 1; + if (values_offset + sz > metadata->total_values_count) { + RETURN_ERROR(ValuesBufferTooSmallError); + } + if (memcpy(values_buffer + values_offset * valuesize, + arr.data(), + sz * valuesize) == nullptr) { + RETURN_ERROR(MemoryError); + } + values_offset += sz; + } + } + sizes_offsets[storage_index * ndims + 1] = sizes2_offset; + sizes_offsets[storage_index * ndims + 2] = sizes3_offset; + sizes_offsets[storage_index * ndims + 3] = sizes3_offset + i3; + values_offsets[storage_index + 1] = values_offset; + return Success; + } + + template + Status setItem(const int64_t index, + const std::vector& values, + const std::vector& sizes, + const std::vector& sizes_of_sizes) { + const auto* metadata = getNestedArrayMetadata(); + if constexpr (!std::is_same::value) { + if constexpr (std::is_same::value) { + if (metadata->value_type != PointFloat64) { + RETURN_ERROR(TypeError); + } + } else if constexpr (std::is_same::value) { + if (metadata->value_type != PointInt32) { + RETURN_ERROR(TypeError); + } + } else { + RETURN_ERROR(NotImplementedError); + } + } + return setItem(index, + reinterpret_cast(values.data()), + values.size() * sizeof(CT), + sizes.data(), + sizes.size(), + sizes_of_sizes.data(), + sizes_of_sizes.size()); + } + + Status setItem(const int64_t index, + const int8_t* values_buf, + const size_t values_buf_size, // in bytes + const int32_t* sizes_buf, + const int32_t nof_sizes, + const int32_t* sizes_of_sizes_buf, + const int32_t nof_sizes_of_sizes) { + const size_t ndims = 3; + Status status = setItemPrepare(index, ndims); + if (status != Success) { + RETURN_ERROR(status); + } + const auto* metadata = getNestedArrayMetadata(); + const auto storage_index = get_storage_index(index); + + auto* values_offsets = get_values_offsets(); + auto* sizes_offsets = get_sizes_offsets(); + auto* sizes_buffer = get_sizes_buffer(); + auto* values_buffer = get_values_buffer(); + const auto valuesize = getValueSize(); + auto values_offset = values_offsets[storage_index]; + const auto sizes_offset = sizes_offsets[storage_index * ndims]; + const auto sizes2_offset = sizes_offset + 1; + const auto sizes3_offset = sizes2_offset + nof_sizes_of_sizes; + if (sizes_offset + 1 + nof_sizes_of_sizes + nof_sizes > metadata->total_sizes_count) { + RETURN_ERROR(SizesBufferTooSmallError); + } + sizes_t sum_sizes_of_sizes = 0; + for (sizes_t i = 0; i < nof_sizes_of_sizes; i++) { + sum_sizes_of_sizes += sizes_of_sizes_buf[i]; + } + if (sum_sizes_of_sizes != nof_sizes) { + RETURN_ERROR(InconsistentSizesError); + } + sizes_t 
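// Hedged worked example of the consistency checks around this point (numbers
// invented for illustration): for an item holding two polygons of 1 and 2
// rings,
//
//   sizes_of_sizes_buf = {1, 2}       // nof_sizes_of_sizes = 2
//   sizes_buf          = {4, 4, 3}    // nof_sizes = 3 == 1 + 2
//   values_buf_size / value_size == 11 == 4 + 4 + 3
//
// Either sum failing to match raises InconsistentSizesError before any
// memcpy touches the flat buffer.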
sum_sizes = 0; + for (sizes_t i = 0; i < nof_sizes; i++) { + sum_sizes += sizes_buf[i]; + } + sizes_t values_count = values_buf_size / valuesize; + if (sum_sizes != values_count) { + RETURN_ERROR(InconsistentSizesError); + } + if (values_offset + values_count > metadata->total_values_count) { + RETURN_ERROR(ValuesBufferTooSmallError); + } + sizes_buffer[sizes_offset] = nof_sizes_of_sizes; + if (memcpy(sizes_buffer + sizes2_offset, + sizes_of_sizes_buf, + nof_sizes_of_sizes * sizeof(int32_t)) == nullptr) { + RETURN_ERROR(MemoryError); + } + if (memcpy(sizes_buffer + sizes3_offset, sizes_buf, nof_sizes * sizeof(int32_t)) == + nullptr) { + RETURN_ERROR(MemoryError); + } + if (memcpy(values_buffer + values_offset * valuesize, values_buf, values_buf_size) == + nullptr) { + RETURN_ERROR(MemoryError); + } + sizes_offsets[storage_index * ndims + 1] = sizes2_offset; + sizes_offsets[storage_index * ndims + 2] = sizes3_offset; + sizes_offsets[storage_index * ndims + 3] = sizes3_offset + sum_sizes_of_sizes; + values_offsets[storage_index + 1] = values_offset + values_count; + return Success; + } + // Set a new item with index and size (in bytes) and initialize its // elements from source buffer. The item values will be // uninitialized when source buffer is nullptr. If dest != nullptr // then the item's buffer pointer will be stored in *dest. + // To be deprecated in favor of NestedArray format Status setItem(const int64_t index, const int8_t* src, const int64_t size, int8_t** dest = nullptr) { if (index < 0 || index >= itemsCount()) { - return IndexError; + RETURN_ERROR(IndexError); } switch (format()) { case VarlenArrayFormatId: @@ -1102,20 +2427,20 @@ struct FlatBufferManager { // bytes? } if (storage_indices[index] >= 0) { - return ItemAlreadySpecifiedError; + RETURN_ERROR(ItemAlreadySpecifiedError); } const int64_t cindex = compressed_indices[storage_count]; const int64_t values_buffer_size = get_values_buffer_size(); const int64_t csize = cindex * itemsize; if (csize + size > values_buffer_size) { - return ValuesBufferTooSmallError; + RETURN_ERROR(ValuesBufferTooSmallError); } break; } case GeoPointFormatId: { const int64_t itemsize = dtypeSize(); if (size != itemsize) { - return SizeError; + RETURN_ERROR(SizeError); } break; } @@ -1125,12 +2450,13 @@ struct FlatBufferManager { return setItemCountsAndData(index, &counts, 1, src, dest); } default: - return UnknownFormatError; + RETURN_ERROR(UnknownFormatError); } return setItemNoValidation(index, src, size, dest); } // Same as setItem but performs no input validation + // To be deprecated in favor of NestedArray format Status setItemNoValidation(const int64_t index, const int8_t* src, const int64_t size, @@ -1149,7 +2475,7 @@ struct FlatBufferManager { storage_indices[index] = storage_count; compressed_indices[storage_count + 1] = cindex + values_count; if (size > 0 && src != nullptr && memcpy(values + csize, src, size) == nullptr) { - return MemoryError; + RETURN_ERROR(MemoryError); } if (dest != nullptr) { *dest = values + csize; @@ -1162,7 +2488,7 @@ struct FlatBufferManager { const int64_t itemsize = dtypeSize(); const int64_t csize = index * itemsize; if (src != nullptr && memcpy(values + csize, src, size) == nullptr) { - return MemoryError; + RETURN_ERROR(MemoryError); } if (dest != nullptr) { *dest = values + csize; @@ -1175,25 +2501,26 @@ struct FlatBufferManager { return setItemCountsAndDataNoValidation(index, &counts, 1, src, dest); } default: - return UnknownFormatError; + RETURN_ERROR(UnknownFormatError); } return Success; } + // To 
be deprecated in favor of NestedArray format Status setItemCountsAndData(const int64_t index, const int32_t* counts, const int64_t nof_counts, const int8_t* src, int8_t** dest = nullptr) { if (index < 0 || index >= itemsCount()) { - return IndexError; + RETURN_ERROR(IndexError); } switch (format()) { case VarlenArrayFormatId: case GeoPointFormatId: case GeoLineStringFormatId: - return NotSupportedFormatError; + RETURN_ERROR(NotSupportedFormatError); case GeoPolygonFormatId: { const int64_t& storage_count = get_storage_count(); const int64_t& storage_count2 = get_storage_count2(); @@ -1202,14 +2529,14 @@ struct FlatBufferManager { const int64_t* storage_indices = get_storage_indices(); const int64_t valuesize = dtypeSize(); if (storage_indices[index] >= 0) { - return ItemAlreadySpecifiedError; + RETURN_ERROR(ItemAlreadySpecifiedError); } const int64_t compressed_indices2_buffer_size = get_compressed_indices2_buffer_size(); if (compressed_indices[storage_count] + nof_counts > compressed_indices2_buffer_size) { - return CompressedIndices2BufferTooSmallError; + RETURN_ERROR(CompressedIndices2BufferTooSmallError); } const int64_t offset = compressed_indices2[storage_count2] * valuesize; @@ -1219,16 +2546,17 @@ struct FlatBufferManager { } const int64_t values_buffer_size = get_values_buffer_size(); if (offset + size > values_buffer_size) { - return ValuesBufferTooSmallError; + RETURN_ERROR(ValuesBufferTooSmallError); } break; } default: - return UnknownFormatError; + RETURN_ERROR(UnknownFormatError); } return setItemCountsAndDataNoValidation(index, counts, nof_counts, src, dest); } + // To be deprecated in favor of NestedArray format // Same as setItem but performs no input validation Status setItemCountsAndDataNoValidation( const int64_t index, @@ -1240,7 +2568,7 @@ struct FlatBufferManager { case VarlenArrayFormatId: case GeoPointFormatId: case GeoLineStringFormatId: - return NotSupportedFormatError; + RETURN_ERROR(NotSupportedFormatError); case GeoPolygonFormatId: { int64_t& storage_count = get_storage_count(); int64_t& storage_count2 = get_storage_count2(); @@ -1266,7 +2594,7 @@ struct FlatBufferManager { compressed_indices2[storage_count2] = cindex2; } if (size > 0 && src != nullptr && memcpy(values + offset, src, size) == nullptr) { - return MemoryError; + RETURN_ERROR(MemoryError); } if (dest != nullptr) { *dest = values + offset; @@ -1275,19 +2603,20 @@ struct FlatBufferManager { break; } default: - return UnknownFormatError; + RETURN_ERROR(UnknownFormatError); } return Success; } + // To be deprecated in favor of NestedArray format Status setSubItem(const int64_t index, const int64_t subindex, const int8_t* src, const int64_t size, int8_t** dest = nullptr) { if (index < 0 || index >= itemsCount()) { - return IndexError; + RETURN_ERROR(IndexError); } switch (format()) { case VarlenArrayFormatId: @@ -1298,13 +2627,13 @@ struct FlatBufferManager { const int64_t* storage_indices = get_storage_indices(); const int64_t storage_index = storage_indices[index]; if (storage_index < 0) { - return ItemUnspecifiedError; + RETURN_ERROR(ItemUnspecifiedError); } int64_t* compressed_indices = get_compressed_indices(); const int64_t cindex = compressed_indices[storage_index]; if (cindex < 0) { if (size > 0) { - return UnexpectedNullItemError; + RETURN_ERROR(UnexpectedNullItemError); } } else { const int64_t next_cindex = compressed_indices[storage_index + 1]; @@ -1319,17 +2648,18 @@ struct FlatBufferManager { const int64_t next_cindex2 = compressed_indices2[cindex + subindex + 1]; const int64_t 
expected_size = (next_cindex2 - cindex2) * valuesize; if (expected_size != size) { - return SizeError; + RETURN_ERROR(SizeError); } } break; } default: - return UnknownFormatError; + RETURN_ERROR(UnknownFormatError); } return setSubItemNoValidation(index, subindex, src, size, dest); } + // To be deprecated in favor of NestedArray format Status setSubItemNoValidation(const int64_t index, const int64_t subindex, const int8_t* src, @@ -1339,7 +2669,7 @@ struct FlatBufferManager { case VarlenArrayFormatId: case GeoPointFormatId: case GeoLineStringFormatId: - return NotSupportedFormatError; + RETURN_ERROR(NotSupportedFormatError); case GeoPolygonFormatId: { const int64_t* storage_indices = get_storage_indices(); const int64_t storage_index = storage_indices[index]; @@ -1351,7 +2681,7 @@ struct FlatBufferManager { const int64_t cindex2 = compressed_indices2[cindex + subindex]; const int64_t offset = cindex2 * valuesize; if (size > 0 && src != nullptr && memcpy(values + offset, src, size) == nullptr) { - return MemoryError; + RETURN_ERROR(MemoryError); } if (dest != nullptr) { *dest = values + offset; @@ -1359,7 +2689,7 @@ struct FlatBufferManager { break; } default: - return UnknownFormatError; + RETURN_ERROR(UnknownFormatError); } return Success; } @@ -1371,9 +2701,10 @@ struct FlatBufferManager { return setItemNoValidation(index, nullptr, size, dest); } + // To be deprecated in favor of NestedArray format Status concatItem(int64_t index, const int8_t* src, int64_t size) { if (index < 0 || index >= itemsCount()) { - return IndexError; + RETURN_ERROR(IndexError); } switch (format()) { case VarlenArrayFormatId: @@ -1389,15 +2720,15 @@ struct FlatBufferManager { return setItem(index, src, size, nullptr); } if (size % itemsize != 0) { - return SizeError; + RETURN_ERROR(SizeError); } if (storage_index != storage_count) { - return IndexError; // index does not correspond to the last set - // item, only the last item can be - // concatenated + RETURN_ERROR(IndexError); // index does not correspond to the last set + // item, only the last item can be + // concatenated } if (compressed_indices[storage_index] < 0) { - return NotImplementedError; // todo: support concat to null when last + RETURN_ERROR(NotImplementedError); // todo: support concat to null when last } int64_t values_count = compressed_indices[next_storage_count] - compressed_indices[storage_index]; @@ -1406,19 +2737,20 @@ struct FlatBufferManager { int8_t* ptr = values + compressed_indices[storage_index] * itemsize; if (size > 0 && src != nullptr && memcpy(ptr + values_count * itemsize, src, size) == nullptr) { - return MemoryError; + RETURN_ERROR(MemoryError); } return Success; } default:; } - return UnknownFormatError; + RETURN_ERROR(UnknownFormatError); } // Set item with index as a null item + // To be deprecated in favor of NestedArray format Status setNull(int64_t index) { if (index < 0 || index >= itemsCount()) { - return IndexError; + RETURN_ERROR(IndexError); } switch (format()) { case VarlenArrayFormatId: @@ -1433,11 +2765,17 @@ struct FlatBufferManager { case GeoPointFormatId: { return setNullNoValidation(index); } + case NestedArrayFormatId: { + return setNullNew(index); + } + default: + break; } - return UnknownFormatError; + RETURN_ERROR(UnknownFormatError); } // Same as setNull but performs no input validation + // To be deprecated in favor of NestedArray format Status setNullNoValidation(int64_t index) { switch (format()) { case VarlenArrayFormatId: @@ -1471,16 +2809,29 @@ struct FlatBufferManager { break; } default: - 
return UnknownFormatError; + RETURN_ERROR(UnknownFormatError); } return Success; } // Check if the item is unspecified or null. Status isNull(int64_t index, bool& is_null) const { + if (isNestedArray()) { + const size_t ndims = getDimensions(); + Status status = getItemPrepare(index, ndims); + if (status != Success) { + return status; + } + const auto storage_index = get_storage_index(index); + const auto* values_offsets = get_values_offsets(); + const auto values_offset = values_offsets[storage_index]; + is_null = values_offset < 0; + return Success; + } if (index < 0 || index >= itemsCount()) { - return IndexError; + RETURN_ERROR(IndexError); } + // To be deprecated in favor of NestedArray format: switch (format()) { case VarlenArrayFormatId: case GeoLineStringFormatId: @@ -1489,7 +2840,7 @@ struct FlatBufferManager { const int64_t* storage_indices = get_storage_indices(); const int64_t storage_index = storage_indices[index]; if (storage_index < 0) { - return ItemUnspecifiedError; + RETURN_ERROR(ItemUnspecifiedError); } is_null = (compressed_indices[storage_index] < 0); return Success; @@ -1509,16 +2860,19 @@ struct FlatBufferManager { } return Success; } + default: + break; } - return UnknownFormatError; + RETURN_ERROR(UnknownFormatError); } // Get item at index by storing its size (in bytes), values buffer, // and nullity information to the corresponding pointer // arguments. + // To be deprecated in favor of NestedArray format HOST DEVICE Status getItem(int64_t index, int64_t& size, int8_t*& dest, bool& is_null) { if (index < 0 || index >= itemsCount()) { - return IndexError; + RETURN_ERROR(IndexError); } switch (format()) { case VarlenArrayFormatId: @@ -1557,10 +2911,13 @@ struct FlatBufferManager { } case GeoPolygonFormatId: break; + default: + break; } - return UnknownFormatError; + RETURN_ERROR(UnknownFormatError); } + // To be deprecated in favor of NestedArray format HOST DEVICE Status getItem(int64_t index, size_t& size, int8_t*& dest, bool& is_null) { int64_t sz{0}; Status status = getItem(index, sz, dest, is_null); @@ -1568,13 +2925,14 @@ struct FlatBufferManager { return status; } + // To be deprecated in favor of NestedArray format HOST DEVICE Status getItem2(int64_t index, int64_t*& cumcounts, int64_t& nof_counts, int8_t*& dest, bool& is_null) { if (index < 0 || index >= itemsCount()) { - return IndexError; + RETURN_ERROR(IndexError); } switch (format()) { case VarlenArrayFormatId: @@ -1589,7 +2947,7 @@ struct FlatBufferManager { const int64_t storage_index = storage_indices[index]; if (storage_index < 0) { - return ItemUnspecifiedError; + RETURN_ERROR(ItemUnspecifiedError); } const int64_t cindex = compressed_indices[storage_index]; if (cindex < 0) { @@ -1609,10 +2967,13 @@ struct FlatBufferManager { } return Success; } + default: + break; } - return UnknownFormatError; + RETURN_ERROR(UnknownFormatError); } + // To be deprecated in favor of NestedArray format HOST DEVICE Status getItemCountsAndData(const int64_t index, int32_t*& counts, int64_t& nof_counts, @@ -1620,7 +2981,7 @@ struct FlatBufferManager { int64_t& size, bool& is_null) { if (index < 0 || index >= itemsCount()) { - return IndexError; + RETURN_ERROR(IndexError); } switch (format()) { case VarlenArrayFormatId: @@ -1635,7 +2996,7 @@ struct FlatBufferManager { const int64_t storage_index = storage_indices[index]; if (storage_index < 0) { - return ItemUnspecifiedError; + RETURN_ERROR(ItemUnspecifiedError); } const int64_t cindex = compressed_indices[storage_index]; if (cindex < 0) { @@ -1658,13 +3019,16 @@ 
struct FlatBufferManager { } return Success; } + default: + break; } - return UnknownFormatError; + RETURN_ERROR(UnknownFormatError); } + // To be deprecated in favor of NestedArray format Status getItemLength(const int64_t index, int64_t& length) const { if (index < 0 || index >= itemsCount()) { - return IndexError; + RETURN_ERROR(IndexError); } switch (format()) { case GeoPointFormatId: @@ -1676,7 +3040,7 @@ struct FlatBufferManager { const int64_t* storage_indices = get_storage_indices(); const int64_t storage_index = storage_indices[index]; if (storage_index < 0) { - return ItemUnspecifiedError; + RETURN_ERROR(ItemUnspecifiedError); } const int64_t cindex = compressed_indices[storage_index]; if (cindex < 0) { @@ -1687,15 +3051,18 @@ struct FlatBufferManager { } return Success; } + default: + break; } - return UnknownFormatError; + RETURN_ERROR(UnknownFormatError); } + // To be deprecated in favor of NestedArray format Status getSubItemLength(const int64_t index, const int64_t subindex, int64_t& length) const { if (index < 0 || index >= itemsCount()) { - return IndexError; + RETURN_ERROR(IndexError); } switch (format()) { case VarlenArrayFormatId: @@ -1707,7 +3074,7 @@ struct FlatBufferManager { const int64_t* compressed_indices = get_compressed_indices(); const int64_t storage_index = storage_indices[index]; if (storage_index < 0) { - return ItemUnspecifiedError; + RETURN_ERROR(ItemUnspecifiedError); } const int64_t cindex = compressed_indices[storage_index]; if (cindex < 0) { @@ -1726,29 +3093,32 @@ struct FlatBufferManager { } return Success; } + default: + break; } - return UnknownFormatError; + RETURN_ERROR(UnknownFormatError); } // Get a subitem data of an item, e.g. a linestring within a polygon + // To be deprecated in favor of NestedArray format HOST DEVICE Status getSubItem(int64_t index, int64_t subindex, int64_t& size, int8_t*& dest, bool& is_null) { if (index < 0 || index >= itemsCount()) { - return IndexError; + RETURN_ERROR(IndexError); } switch (format()) { case VarlenArrayFormatId: case GeoPointFormatId: case GeoLineStringFormatId: - return NotSupportedFormatError; + RETURN_ERROR(NotSupportedFormatError); case GeoPolygonFormatId: { const int64_t* storage_indices = get_storage_indices(); const int64_t storage_index = storage_indices[index]; if (storage_index < 0) { - return ItemUnspecifiedError; + RETURN_ERROR(ItemUnspecifiedError); } int64_t* compressed_indices = get_compressed_indices(); const int64_t cindex = compressed_indices[storage_index]; @@ -1774,20 +3144,136 @@ struct FlatBufferManager { } return Success; } + default: + break; } - return UnknownFormatError; + RETURN_ERROR(UnknownFormatError); } #ifdef HAVE_TOSTRING #define HAVE_FLATBUFFER_TOSTRING + std::string bufferToString(const int8_t* buffer, + const size_t size, + ValueType value_type) const { + size_t value_size = get_size(value_type); + size_t count = size / value_size; + std::string result = ""; + for (size_t i = 0; i < count; i++) { + if (i > 0) { + result += ", "; + } + switch (value_type) { + case Bool8: + result += (buffer[i] ? 
"true" : "false"); + break; + case Int8: + result += std::to_string(buffer[i]); + break; + case Int16: + result += std::to_string(reinterpret_cast(buffer)[i]); + break; + case Int32: + result += std::to_string(reinterpret_cast(buffer)[i]); + break; + case Int64: + result += std::to_string(reinterpret_cast(buffer)[i]); + break; + case UInt8: + result += std::to_string(reinterpret_cast(buffer)[i]); + break; + case UInt16: + result += std::to_string(reinterpret_cast(buffer)[i]); + break; + case UInt32: + result += std::to_string(reinterpret_cast(buffer)[i]); + break; + case UInt64: + result += std::to_string(reinterpret_cast(buffer)[i]); + break; + case Float32: + result += std::to_string(reinterpret_cast(buffer)[i]); + break; + case Float64: + result += std::to_string(reinterpret_cast(buffer)[i]); + break; + case PointInt32: + result += "("; + if (containsNullValue(buffer + 2 * i * sizeof(int32_t))) { + result += "NULL"; + } else { + result += std::to_string(reinterpret_cast(buffer)[2 * i]); + result += ", "; + result += std::to_string(reinterpret_cast(buffer)[2 * i + 1]); + } + result += ")"; + break; + case PointFloat64: + result += "("; + if (containsNullValue(buffer + 2 * i * sizeof(double))) { + result += "NULL"; + } else { + result += std::to_string(reinterpret_cast(buffer)[2 * i]); + result += ", "; + result += std::to_string(reinterpret_cast(buffer)[2 * i + 1]); + } + result += ")"; + break; + } + } + return result; + } + std::string toString() const { if (buffer == nullptr) { return ::typeName(this) + "[UNINITIALIZED]"; } - std::string result = typeName(this) + "("; + std::string result = typeName(this) + "@" + ::toString((void*)buffer) + "("; result += "" + getBaseWorker()->toString(); - const auto fmt = format(); + if (isNestedArray()) { + const auto* metadata = getNestedArrayMetadata(); + const auto* worker = getNestedArrayWorker(); + result += ",\n " + metadata->toString(); + result += ",\n " + worker->toString(); + result += ",\n values_buffer=[" + + bufferToString( + get_values_buffer(), getValuesBufferSize(), metadata->value_type) + + "]"; + result += ",\n sizes_buffer=[" + + bufferToString( + reinterpret_cast(get_sizes_buffer()), + metadata->total_sizes_count * get_size(FLATBUFFER_SIZES_T_VALUE_TYPE), + FLATBUFFER_SIZES_T_VALUE_TYPE) + + "]"; + result += ",\n values_offsets=[" + + bufferToString(reinterpret_cast(get_values_offsets()), + (metadata->total_items_count + 1) * + get_size(FLATBUFFER_OFFSETS_T_VALUE_TYPE), + FLATBUFFER_OFFSETS_T_VALUE_TYPE) + + "]"; + result += ",\n sizes_offsets=[" + + bufferToString(reinterpret_cast(get_sizes_offsets()), + (metadata->total_items_count * metadata->dimensions + 1) * + get_size(FLATBUFFER_OFFSETS_T_VALUE_TYPE), + FLATBUFFER_OFFSETS_T_VALUE_TYPE) + + "]"; + result += ",\n storage_indices=[" + + bufferToString( + reinterpret_cast(get_storage_indices_new()), + metadata->total_items_count * get_size(FLATBUFFER_SIZES_T_VALUE_TYPE), + FLATBUFFER_SIZES_T_VALUE_TYPE) + + "]"; + result += ",\n user_data_buffer=[" + + bufferToString(get_user_data_buffer(), metadata->user_data_size, Int8) + + "]"; + result += ")"; + return result; + } + + // To be deprecated in favor of NestedArray format: + const FlatBufferFormat fmt = format(); + + std::cout << "fmt=" << static_cast(fmt) << ", " << sizeof(fmt) << std::endl; switch (fmt) { case VarlenArrayFormatId: { result += ", " + getVarlenArrayMetadata()->toString(); @@ -1809,6 +3295,8 @@ struct FlatBufferManager { result += ", " + getGeoPolygonWorker()->toString(); break; } + default: + break; } switch 
(fmt) { @@ -1904,6 +3392,64 @@ struct FlatBufferManager { }; #ifdef HAVE_TOSTRING +inline std::ostream& operator<<(std::ostream& os, + FlatBufferManager::ValueType const type) { + switch (type) { + case FlatBufferManager::Bool8: + os << "Bool8"; + break; + case FlatBufferManager::Int8: + os << "Int8"; + break; + case FlatBufferManager::Int16: + os << "Int16"; + break; + case FlatBufferManager::Int32: + os << "Int32"; + break; + case FlatBufferManager::Int64: + os << "Int64"; + break; + case FlatBufferManager::UInt8: + os << "UInt8"; + break; + case FlatBufferManager::UInt16: + os << "UInt16"; + break; + case FlatBufferManager::UInt32: + os << "UInt32"; + break; + case FlatBufferManager::UInt64: + os << "UInt64"; + break; + case FlatBufferManager::Float32: + os << "Float32"; + break; + case FlatBufferManager::Float64: + os << "Float64"; + break; + case FlatBufferManager::PointInt32: + os << "PointInt32"; + break; + case FlatBufferManager::PointFloat64: + os << "PointFloat64"; + break; + } + return os; +} + +inline std::string FlatBufferManager::toString(const FlatBufferManager::ValueType& type) { + std::ostringstream ss; + ss << type; + return ss.str(); +} + +inline std::string toString(const FlatBufferManager::ValueType& type) { + std::ostringstream ss; + ss << type; + return ss.str(); +} + inline std::ostream& operator<<(std::ostream& os, FlatBufferManager::Status const status) { switch (status) { @@ -1919,6 +3465,9 @@ inline std::ostream& operator<<(std::ostream& os, case FlatBufferManager::SizeError: os << "SizeError"; break; + case FlatBufferManager::FlatbufferSizeError: + os << "FlatbufferSizeError"; + break; case FlatBufferManager::ItemAlreadySpecifiedError: os << "ItemAlreadySpecifiedError"; break; @@ -1931,6 +3480,9 @@ inline std::ostream& operator<<(std::ostream& os, case FlatBufferManager::ValuesBufferTooSmallError: os << "ValuesBufferTooSmallError"; break; + case FlatBufferManager::SizesBufferTooSmallError: + os << "SizesBufferTooSmallError"; + break; case FlatBufferManager::CompressedIndices2BufferTooSmallError: os << "CompressedIndices2BufferTooSmallError"; break; @@ -1946,6 +3498,21 @@ inline std::ostream& operator<<(std::ostream& os, case FlatBufferManager::NotImplementedError: os << "NotImplementedError"; break; + case FlatBufferManager::InvalidUserDataError: + os << "InvalidUserDataError"; + break; + case FlatBufferManager::DimensionalityError: + os << "DimensionalityError"; + break; + case FlatBufferManager::UserDataError: + os << "UserDataError"; + break; + case FlatBufferManager::TypeError: + os << "TypeError"; + break; + case FlatBufferManager::InconsistentSizesError: + os << "InconsistentSizesError"; + break; default: os << "[Unknown FlatBufferManager::Status value]"; } @@ -1958,3 +3525,5 @@ inline std::string toString(const FlatBufferManager::Status& status) { return ss.str(); } #endif + +#undef RETURN_ERROR diff --git a/QueryEngine/heavydbTypes.h b/QueryEngine/heavydbTypes.h index 9ae3dcd024..b4bcbf77db 100644 --- a/QueryEngine/heavydbTypes.h +++ b/QueryEngine/heavydbTypes.h @@ -29,6 +29,8 @@ #include "DateAdd.h" +#include "../Shared/sqltypes_lite.h" + #if !(defined(__CUDACC__) || defined(NO_BOOST)) #include "../Shared/DateTimeParser.h" #endif @@ -725,7 +727,7 @@ struct GeoPolygonStruct { typedef struct GeoPolygonStruct GeoPolygon; -struct GeoMultiPolygon { +struct GeoMultiPolygonStruct { int8_t* ptr_coords; int32_t coords_size; int8_t* ring_sizes; @@ -752,6 +754,8 @@ struct GeoMultiPolygon { DEVICE int32_t getOutputSrid() const { return output_srid; } }; +typedef 
struct GeoMultiPolygonStruct GeoMultiPolygon; + // There are redundant #ifndef UDF_COMPILED inside // ifguard for StringDictionaryProxy to flag that // if we decide to adapt C++ UDF Compiler for table @@ -868,28 +872,56 @@ DEVICE inline double decompress_y_coord(const int8_t* data, } } -DEVICE inline Point2D get_point(const int8_t* data, - const int64_t index, - const int32_t input_srid, - const int32_t output_srid, - const bool is_geoint) { - Point2D point{decompress_x_coord(data, index, is_geoint), - decompress_y_coord(data, index, is_geoint)}; +DEVICE inline double decompress_x_coord(const int8_t* data, + const int64_t index, + const bool is_geoint, + const int32_t input_srid, + const int32_t output_srid) { + double x = decompress_x_coord(data, index, is_geoint); if (input_srid == output_srid || output_srid == 0) { - return point; + return x; } else if (input_srid == 4326 && output_srid == 900913) { // WGS 84 --> Web Mercator - point.x *= 111319.490778; - point.y = 6378136.99911 * log(tan(.00872664626 * point.y + .785398163397)); - return point; + x *= 111319.490778; + } else { +#ifndef __CUDACC__ + throw std::runtime_error("decompress_x_coord: unhandled geo transformation from " + + std::to_string(input_srid) + " to " + + std::to_string(output_srid) + '.'); +#endif } -#ifdef __CUDACC__ - return {}; // (NaN,NaN) -#else - throw std::runtime_error("Unhandled geo transformation from " + - std::to_string(input_srid) + " to " + - std::to_string(output_srid) + '.'); + return x; +} + +DEVICE inline double decompress_y_coord(const int8_t* data, + const int64_t index, + const bool is_geoint, + const int32_t input_srid, + const int32_t output_srid) { + double y = decompress_y_coord(data, index, is_geoint); + if (input_srid == output_srid || output_srid == 0) { + return y; + } else if (input_srid == 4326 && output_srid == 900913) { + // WGS 84 --> Web Mercator + y = 6378136.99911 * log(tan(.00872664626 * y + .785398163397)); + } else { +#ifndef __CUDACC__ + throw std::runtime_error("decompress_y_coord: unhandled geo transformation from " + + std::to_string(input_srid) + " to " + + std::to_string(output_srid) + '.'); #endif + } + return y; +} + +DEVICE inline Point2D get_point(const int8_t* data, + const int64_t index, + const int32_t input_srid, + const int32_t output_srid, + const bool is_geoint) { + Point2D point{decompress_x_coord(data, index, is_geoint, input_srid, output_srid), + decompress_y_coord(data, index, is_geoint, input_srid, output_srid)}; + return point; } #ifndef __CUDACC__ @@ -911,6 +943,7 @@ inline std::vector compress_coords(const int8_t* data, } return result; } + inline std::vector decompress_coords(const int8_t* data, const int64_t size, const bool is_geoint) { @@ -923,6 +956,71 @@ inline std::vector decompress_coords(const int8_t* data, } return result; } + +inline std::vector compress_coords(const std::vector& coords) { + std::vector result; + const size_t nofpoints = coords.size() / 2; + result.reserve(coords.size()); + const double* buf = coords.data(); + for (size_t i = 0; i < nofpoints; i++) { + result.push_back(compress_x_coord(buf, 2 * i)); + result.push_back(compress_y_coord(buf, 2 * i)); + } + return result; +} + +inline std::vector decompress_coords(const std::vector& coords) { + std::vector result; + const size_t nofpoints = coords.size() / 2; + result.reserve(coords.size()); + const int8_t* buf = reinterpret_cast(coords.data()); + for (size_t i = 0; i < nofpoints; i++) { + result.push_back(decompress_x_coord(buf, 2 * i, true)); + 
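// Hedged usage sketch (the coordinate values are invented for the example):
// these overloads make geoint compression composable over nested coordinate
// vectors, e.g.
//
//   std::vector<double> lonlat{-71.06, 42.36};            // one WGS 84 point
//   std::vector<int32_t> packed = compress_coords(lonlat);
//   std::vector<double> back = decompress_coords(packed);
//   // back ~= lonlat, up to geoint quantization error
//
// and element-wise likewise for the vector<vector<...>> and
// vector<vector<vector<...>>> forms below.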
result.push_back(decompress_y_coord(buf, 2 * i, true)); + } + return result; +} + +inline std::vector> compress_coords( + const std::vector>& coords) { + std::vector> result; + result.reserve(coords.size()); + for (size_t i = 0; i < coords.size(); i++) { + result.push_back(compress_coords(coords[i])); + } + return result; +} + +inline std::vector> decompress_coords( + const std::vector>& coords) { + std::vector> result; + result.reserve(coords.size()); + for (size_t i = 0; i < coords.size(); i++) { + result.push_back(decompress_coords(coords[i])); + } + return result; +} + +inline std::vector>> compress_coords( + const std::vector>>& coords) { + std::vector>> result; + result.reserve(coords.size()); + for (size_t i = 0; i < coords.size(); i++) { + result.push_back(compress_coords(coords[i])); + } + return result; +} + +inline std::vector>> decompress_coords( + const std::vector>>& coords) { + std::vector>> result; + result.reserve(coords.size()); + for (size_t i = 0; i < coords.size(); i++) { + result.push_back(decompress_coords(coords[i])); + } + return result; +} + #endif inline bool get_is_geoint(const int8_t* flatbuffer) { @@ -983,7 +1081,7 @@ inline int32_t get_output_srid(const int8_t* flatbuffer) { } struct LineString { - int8_t* flatbuffer_; // FlatBuffer of GeoLineStrings or GeoPolygon + int8_t* flatbuffer_; // FlatBuffer of GeoLineStrings or GeoPolygon or GeoMultiPolygon int64_t index_[3]; // line string index within a // Column/Column/Column @@ -995,7 +1093,7 @@ struct LineString { status = m.getItem(index_[0], size, dest, is_null); break; case GeoPolygonFormatId: - status = m.getSubItem(index_[0], index_[1], size, dest, is_null); + status = m.getSubItem(index_[1], index_[0], size, dest, is_null); break; default: status = FlatBufferManager::Status::NotImplementedError; @@ -1005,10 +1103,49 @@ struct LineString { // Get the index-th point of the line string DEVICE Geo::Point2D getItem(const int64_t index, const int32_t output_srid = 0) const { + FlatBufferManager m{flatbuffer_}; + FlatBufferManager::Status status = FlatBufferManager::Status::NotImplementedError; + if (m.isNestedArray()) { + const SQLTypeInfoLite* ti = + reinterpret_cast(m.get_user_data_buffer()); + if (ti == nullptr) { + status = FlatBufferManager::UserDataError; + } else { + if (m.getDimensions() == 3) { + if (index_[2] >= 0) { + bool is_null = false; + int32_t nof_points; + int8_t* coords_ptr; + status = m.getItem( + index_[2], index_[1], index_[0], nof_points, coords_ptr, is_null); + if (status == FlatBufferManager::Status::Success) { + if (is_null) { + status = FlatBufferManager::Status::UnexpectedNullItemError; + } else if (index < 0 || index >= nof_points) { + status = FlatBufferManager::Status::IndexError; + } else { + return Geo::get_point( + coords_ptr, + 2 * index, + ti->get_input_srid(), + (output_srid < 0 ? 
ti->get_output_srid() : output_srid), + ti->is_geoint()); + } + } + } + } + } + if (status != FlatBufferManager::Status::Success) { +#ifndef __CUDACC__ + throw std::runtime_error("LineString.getItem failed: " + ::toString(status)); +#endif + } + } + // to be deprecated in favor of NestedArray: int8_t* ptr; int64_t size; bool is_null; - auto status = getBuffer(size, ptr, is_null); + status = getBuffer(size, ptr, is_null); if (status != FlatBufferManager::Status::Success) { #ifndef __CUDACC__ throw std::runtime_error("LineString.getItem failed: " + ::toString(status)); @@ -1018,7 +1155,7 @@ struct LineString { int32_t this_input_srid = get_input_srid(flatbuffer_); int32_t this_output_srid = get_output_srid(flatbuffer_); return Geo::get_point(ptr, - index, + 2 * index, this_input_srid, (output_srid < 0 ? this_output_srid : output_srid), is_geoint); @@ -1032,14 +1169,39 @@ struct LineString { #ifndef __CUDACC__ std::vector toCoords() const { + FlatBufferManager m{flatbuffer_}; + FlatBufferManager::Status status = FlatBufferManager::Status::NotImplementedError; + if (m.isNestedArray()) { + const SQLTypeInfoLite* ti = + reinterpret_cast(m.get_user_data_buffer()); + if (ti == nullptr) { + status = FlatBufferManager::UserDataError; + } else { + if (m.getDimensions() == 3) { + if (index_[2] >= 0) { + bool is_null = false; + int32_t nof_points; + int8_t* coords_ptr; + FlatBufferManager::Status status = m.getItem( + index_[2], index_[1], index_[0], nof_points, coords_ptr, is_null); + if (status == FlatBufferManager::Status::Success) { + return decompress_coords( + coords_ptr, nof_points * m.getValueSize(), ti->is_geoint()); + } + } + } + } + if (status != FlatBufferManager::Status::Success) { + throw std::runtime_error("LineString.toCoords2 failed: " + ::toString(status)); + } + } int8_t* ptr; int64_t size; bool is_null; - auto status = getBuffer(size, ptr, is_null); + status = getBuffer(size, ptr, is_null); - std::vector result; if (status != FlatBufferManager::Status::Success) { - throw std::runtime_error("LineString.getBuffer failed: " + ::toString(status)); + throw std::runtime_error("LineString.toCoords failed: " + ::toString(status)); } if (is_null) { return {}; @@ -1051,19 +1213,49 @@ struct LineString { #endif // Return the number of points of the line string - int64_t size() const { + size_t size() const { FlatBufferManager m{flatbuffer_}; + FlatBufferManager::Status status = FlatBufferManager::Status::NotImplementedError; + if (m.isNestedArray()) { + size_t length = 0; + if (m.getDimensions() == 3) { + if (index_[2] >= 0) { + status = m.getLength(index_[2], index_[1], index_[0], length); + } else if (index_[1] >= 0) { + status = m.getLength(index_[1], index_[0], length); + } else { + status = m.getLength(index_[0], length); + } + } else { + status = FlatBufferManager::Status::NotImplementedError; + } + if (status != FlatBufferManager::Status::Success) { +#ifndef __CUDACC__ + throw std::runtime_error("Geo::LineString size failed: " + ::toString(status)); +#endif + } + return length; + } + // to be deprecated in favor of NestedArray: int64_t length = 0; switch (m.format()) { case GeoLineStringFormatId: - m.getItemLength(index_[0], length); + status = m.getItemLength(index_[0], length); + break; + case GeoPolygonFormatId: + status = m.getSubItemLength(index_[1], index_[0], length); break; default: #ifndef __CUDACC__ - throw std::runtime_error("LineString::size: not implemented for format " + - ::toString(m.format())); + throw std::runtime_error("Geo::LineString::size: not implemented for 
format " + + ::toString(static_cast(m.format()))); #else ; +#endif + } + if (status != FlatBufferManager::Status::Success) { +#ifndef __CUDACC__ + throw std::runtime_error("Geo::LineString size failed: " + ::toString(status)); #endif } return length; @@ -1169,9 +1361,21 @@ struct Polygon { int8_t* flatbuffer_; // FlatBuffer of GeoPolygons int64_t index_[2]; // polygon index in a Column/Column +#ifndef __CUDACC__ + std::vector> toCoords() const { + std::vector> result; + auto sz = size(); + result.reserve(sz); + for (size_t i = 0; i < sz; i++) { + result.push_back(getItem(i).toCoords()); + } + return result; + } +#endif + // Return the index-th linestring - Geo::LineString getItem(const int64_t index) const { - Geo::LineString linestring{flatbuffer_, {index_[0], index, -1}}; + Geo::LineString getItem(const int64_t index, const int32_t output_srid = 0) const { + Geo::LineString linestring{flatbuffer_, {index, index_[0], index_[1]}}; return linestring; } @@ -1244,19 +1448,71 @@ struct Polygon { } // Return the number of line strings in a polygon - int64_t size() const { + size_t size() const { FlatBufferManager m{flatbuffer_}; - int64_t length = 0; - m.getItemLength(index_[0], length); - return length; + FlatBufferManager::Status status; + if (m.isNestedArray()) { + size_t length = 0; + if (m.getDimensions() == 3) { + if (index_[1] >= 0) { + status = m.getLength(index_[1], index_[0], length); + } else { + status = m.getLength(index_[0], length); + } + } else { + status = FlatBufferManager::Status::NotImplementedError; + } + if (status != FlatBufferManager::Status::Success) { +#ifndef __CUDACC__ + throw std::runtime_error("Geo::Polygon size failed: " + ::toString(status)); +#endif + } + return length; + } else { + int64_t length = 0; + status = m.getItemLength(index_[0], length); + if (status != FlatBufferManager::Status::Success) { +#ifndef __CUDACC__ + throw std::runtime_error("Geo::Polygon size failed: " + ::toString(status)); +#endif + } + return length; + } } // Return the number of points in the index-th line string - int64_t size(const int64_t index) const { + size_t size(const int64_t index) const { FlatBufferManager m{flatbuffer_}; - int64_t length = 0; - m.getSubItemLength(index_[0], index, length); - return length; + FlatBufferManager::Status status; + if (m.isNestedArray()) { + size_t length = 0; + if (m.getDimensions() == 3) { + if (index_[1] >= 0) { + status = m.getLength(index_[1], index_[0], index, length); + } else { + status = m.getLength(index_[0], index, length); + } + } else { + status = FlatBufferManager::Status::NotImplementedError; + } + if (status != FlatBufferManager::Status::Success) { +#ifndef __CUDACC__ + throw std::runtime_error("Geo::Polygon size(lineindex) failed: " + + ::toString(status)); +#endif + } + return length; + } else { + int64_t length = 0; + status = m.getSubItemLength(index_[0], index, length); + if (status != FlatBufferManager::Status::Success) { +#ifndef __CUDACC__ + throw std::runtime_error("Geo::Polygon size(lineindex) failed: " + + ::toString(status)); +#endif + } + return length; + } } #ifndef __CUDACC__ @@ -1318,6 +1574,87 @@ struct Polygon { #endif }; +struct MultiPolygon { + int8_t* flatbuffer_; // FlatBuffer of GeoMultiPolygons + int64_t index_[1]; // multipolygon index of Column + + // Return the number of polygons in the multipolgon + size_t size() const { + FlatBufferManager m{flatbuffer_}; + FlatBufferManager::Status status; + size_t length; + // return the length of a multipolygon + status = m.getLength(index_[0], length); + if 
(status != FlatBufferManager::Status::Success) {
+#ifndef __CUDACC__
+      throw std::runtime_error("Geo::MultiPolygon size failed: " + ::toString(status));
+#endif
+    }
+    return length;
+  }
+
+  // Return the index-th polygon
+  Geo::Polygon getItem(const int64_t index, const int32_t output_srid = 0) const {
+    Geo::Polygon polygon{flatbuffer_, {index, index_[0]}};
+    return polygon;
+  }
+
+  DEVICE inline Geo::Polygon operator[](const unsigned int index) const {
+    /* Use getItem(index, output_srid) to enable user-specified
+       transformation. */
+    return getItem(index);
+  }
+
+#ifndef __CUDACC__
+  // Construct a multipolygon from a vector of vectors of coordinate vectors
+  template <typename CT>
+  FlatBufferManager::Status fromCoords(
+      const std::vector<std::vector<std::vector<CT>>>& coords) {
+    FlatBufferManager m{flatbuffer_};
+    const SQLTypeInfoLite* ti =
+        reinterpret_cast<const SQLTypeInfoLite*>(m.get_user_data_buffer());
+    if (ti == nullptr) {
+      return FlatBufferManager::UserDataError;
+    }
+    if (ti->compression == SQLTypeInfoLite::GEOINT) {
+      if constexpr (std::is_same<CT, double>::value) {
+        const auto ccoords = compress_coords(coords);
+        return m.setItem(index_[0], ccoords);
+      } else {
+        return m.setItem(index_[0], coords);
+      }
+    } else if (ti->compression == SQLTypeInfoLite::NONE) {
+      if constexpr (std::is_same<CT, double>::value) {
+        return m.setItem(index_[0], coords);
+      } else {
+        const auto dcoords = decompress_coords(coords);
+        return m.setItem(index_[0], dcoords);
+      }
+    } else {
+      UNREACHABLE();
+    }
+    return FlatBufferManager::NotImplementedError;
+  }
+
+  std::vector<std::vector<std::vector<double>>> toCoords() const {
+    std::vector<std::vector<std::vector<double>>> result;
+    auto sz = size();
+    result.reserve(sz);
+    for (size_t i = 0; i < sz; i++) {
+      result.push_back(getItem(i).toCoords());
+    }
+    return result;
+  }
+#endif
+
+#ifdef HAVE_TOSTRING
+  std::string toString() const {
+    return ::typeName(this) + "(..., {" + std::to_string(index_[0]) + "})";
+  }
+#endif
+};
+
 }  // namespace Geo
 
 template <>
@@ -1567,6 +1904,64 @@ struct Column<GeoPolygon> {
 #endif
   }
 
+  DEVICE inline void setItem(int64_t index, const Geo::Polygon& polygon) {
+    FlatBufferManager this_m{flatbuffer_};
+    FlatBufferManager m{polygon.flatbuffer_};
+    FlatBufferManager::Status status;
+    if (polygon.index_[1] == -1) {
+      // flatbuffer contains Column<GeoPolygon>
+      int32_t* ring_sizes;
+      int8_t* points;
+      int64_t nof_rings, size;
+      bool is_null;
+      status = m.getItemCountsAndData(
+          polygon.index_[0], ring_sizes, nof_rings, points, size, is_null);
+      if (status != FlatBufferManager::Status::Success) {
+        goto fail;
+      }
+      if (is_null) {
+        status = this_m.setNull(index);
+      } else {
+        status = this_m.setItemCountsAndData(index, ring_sizes, nof_rings, points);
+      }
+      if (status != FlatBufferManager::Status::Success) {
+        goto fail;
+      }
+    } else {
+      // flatbuffer contains Column<GeoMultiPolygon>
+      int32_t nof_points;
+      int32_t nof_rings;
+      int8_t* points;
+      int8_t* ring_sizes;
+      bool is_null;
+      status = m.getItem(polygon.index_[1],
+                         polygon.index_[0],
+                         nof_points,
+                         points,
+                         nof_rings,
+                         ring_sizes,
+                         is_null);
+      if (status != FlatBufferManager::Status::Success) {
+        goto fail;
+      }
+      if (is_null) {
+        status = this_m.setNull(index);
+      } else {
+        status = this_m.setItemCountsAndData(
+            index, reinterpret_cast<int32_t*>(ring_sizes), nof_rings, points);
+      }
+      if (status != FlatBufferManager::Status::Success) {
+        goto fail;
+      }
+    }
+    return;
+  fail:
+#ifndef __CUDACC__
+    throw std::runtime_error("setItem failed: " + ::toString(status));
+#endif
+    ;
+  }
+
   // Return the total number of points in a Column<GeoPolygon>
   inline int64_t getNofValues() const {
     FlatBufferManager m{flatbuffer_};
@@ -1582,6 +1977,60 @@ 
struct Column { #endif }; +template <> +struct Column { + int8_t* flatbuffer_; + int64_t num_rows_; + + DEVICE Geo::MultiPolygon getItem(const int64_t index) const { + Geo::MultiPolygon mpolygon{flatbuffer_, {index}}; + return mpolygon; + } + + DEVICE inline Geo::MultiPolygon operator[](const unsigned int index) const { + return getItem(static_cast(index)); + } + + DEVICE int64_t size() const { return num_rows_; } + + DEVICE inline bool isNull(int64_t index) const { + FlatBufferManager m{flatbuffer_}; + bool is_null = false; + auto status = m.isNull(index, is_null); +#ifndef __CUDACC__ + if (status != FlatBufferManager::Status::Success) { + throw std::runtime_error("isNull failed: " + ::toString(status)); + } +#endif + return is_null; + } + + DEVICE inline void setNull(int64_t index) { + FlatBufferManager m{flatbuffer_}; + auto status = m.setNull(index); +#ifndef __CUDACC__ + if (status != FlatBufferManager::Status::Success) { + throw std::runtime_error("setNull(" + std::to_string(index) + + ") failed: " + ::toString(status)); + } +#endif + } + + // Return the total number of points in a Column + inline int64_t getNofValues() const { + FlatBufferManager m{flatbuffer_}; + return m.getValuesCount(); + } + +#ifdef HAVE_FLATBUFFER_TOSTRING + std::string toString() const { + FlatBufferManager m{flatbuffer_}; + return ::typeName(this) + "(" + m.toString() + + ", num_rows=" + std::to_string(num_rows_) + ")"; + } +#endif +}; + template struct Column> { // A type for a column of variable length arrays diff --git a/Shared/sqltypes.h b/Shared/sqltypes.h index 1e508843e8..ae71b638a9 100644 --- a/Shared/sqltypes.h +++ b/Shared/sqltypes.h @@ -29,6 +29,7 @@ #include "../Logger/Logger.h" #include "Datum.h" #include "funcannotations.h" +#include "sqltypes_lite.h" #include #include @@ -1015,12 +1016,119 @@ class SQLTypeInfo { case kPOINT: case kLINESTRING: case kPOLYGON: + case kMULTIPOLYGON: return true; default:; } return false; } + SQLTypeInfoLite toLite() const { + SQLTypeInfoLite ti_lite; + switch (type) { + case kPOINT: + ti_lite.type = SQLTypeInfoLite::POINT; + ti_lite.subtype = SQLTypeInfoLite::UNSPECIFIED; + break; + case kLINESTRING: + ti_lite.type = SQLTypeInfoLite::LINESTRING; + ti_lite.subtype = SQLTypeInfoLite::UNSPECIFIED; + break; + case kPOLYGON: + ti_lite.type = SQLTypeInfoLite::POLYGON; + ti_lite.subtype = SQLTypeInfoLite::UNSPECIFIED; + break; + case kMULTIPOINT: + ti_lite.type = SQLTypeInfoLite::MULTIPOINT; + ti_lite.subtype = SQLTypeInfoLite::UNSPECIFIED; + break; + case kMULTILINESTRING: + ti_lite.type = SQLTypeInfoLite::MULTILINESTRING; + ti_lite.subtype = SQLTypeInfoLite::UNSPECIFIED; + break; + case kMULTIPOLYGON: + ti_lite.type = SQLTypeInfoLite::MULTIPOLYGON; + ti_lite.subtype = SQLTypeInfoLite::UNSPECIFIED; + break; + case kTEXT: + ti_lite.type = SQLTypeInfoLite::TEXT; + ti_lite.subtype = SQLTypeInfoLite::UNSPECIFIED; + break; + case kARRAY: + ti_lite.type = SQLTypeInfoLite::ARRAY; + switch (subtype) { + case kBOOLEAN: + ti_lite.subtype = SQLTypeInfoLite::BOOLEAN; + break; + case kTINYINT: + ti_lite.subtype = SQLTypeInfoLite::TINYINT; + break; + case kSMALLINT: + ti_lite.subtype = SQLTypeInfoLite::SMALLINT; + break; + case kINT: + ti_lite.subtype = SQLTypeInfoLite::INT; + break; + case kBIGINT: + ti_lite.subtype = SQLTypeInfoLite::BIGINT; + break; + case kFLOAT: + ti_lite.subtype = SQLTypeInfoLite::FLOAT; + break; + case kDOUBLE: + ti_lite.subtype = SQLTypeInfoLite::DOUBLE; + break; + case kTEXT: + ti_lite.subtype = SQLTypeInfoLite::TEXT; + break; + default: + UNREACHABLE(); + } 
+ break; + default: + UNREACHABLE(); + } + if (is_geometry()) { + switch (get_compression()) { + case kENCODING_NONE: + ti_lite.compression = SQLTypeInfoLite::NONE; + break; + case kENCODING_GEOINT: + ti_lite.compression = SQLTypeInfoLite::GEOINT; + break; + default: + UNREACHABLE(); + } + ti_lite.dimension = get_input_srid(); + ti_lite.scale = get_output_srid(); + ti_lite.db_id = 0; // unused + ti_lite.dict_id = 0; // unused + } else if (type == kTEXT) { + switch (get_compression()) { + case kENCODING_NONE: + ti_lite.compression = SQLTypeInfoLite::NONE; + break; + case kENCODING_DICT: + ti_lite.compression = SQLTypeInfoLite::DICT; + break; + default: + UNREACHABLE(); + } + ti_lite.dimension = 0; // unused + ti_lite.scale = 0; // unused + ti_lite.db_id = dict_key_.db_id; + ti_lite.dict_id = dict_key_.dict_id; + } else if (type == kARRAY) { + ti_lite.dimension = 0; // unused + ti_lite.scale = 0; // unused + ti_lite.db_id = dict_key_.db_id; + ti_lite.dict_id = dict_key_.dict_id; + } else { + UNREACHABLE(); + } + return ti_lite; + } + private: SQLTypes type; // type id SQLTypes subtype; // element type of arrays or columns @@ -1491,6 +1599,73 @@ DEVICE inline void VarlenArray_get_nth(int8_t* buf, inline int64_t getFlatBufferSize(int64_t items_count, int64_t max_nof_values, const SQLTypeInfo& ti) { + size_t dimensions = 0; + FlatBufferManager::ValueType value_type; + int64_t max_nof_sizes = 0; + switch (ti.get_type()) { + case kPOINT: + dimensions = 0; + break; + case kLINESTRING: + case kMULTIPOINT: + case kARRAY: + dimensions = 1; + max_nof_sizes = items_count; + break; + case kPOLYGON: + case kMULTILINESTRING: + dimensions = 2; + max_nof_sizes = items_count + max_nof_values / 3; + break; + case kMULTIPOLYGON: + dimensions = 3; + max_nof_sizes = items_count + 2 * (max_nof_values / 3 + 1); + break; + default: + UNREACHABLE(); + } + + if (ti.is_geometry()) { + if (ti.get_compression() == kENCODING_GEOINT) { + value_type = FlatBufferManager::PointInt32; + } else { + value_type = FlatBufferManager::PointFloat64; + } + } else if (ti.is_array()) { + switch (ti.get_subtype()) { + case kBOOLEAN: + value_type = FlatBufferManager::Bool8; + break; + case kTINYINT: + value_type = FlatBufferManager::Int8; + break; + case kSMALLINT: + value_type = FlatBufferManager::Int16; + break; + case kINT: + value_type = FlatBufferManager::Int32; + break; + case kTEXT: + CHECK_EQ(ti.get_compression(), kENCODING_DICT); + value_type = FlatBufferManager::Int32; + break; + case kBIGINT: + value_type = FlatBufferManager::Int64; + break; + case kFLOAT: + value_type = FlatBufferManager::Float32; + break; + case kDOUBLE: + value_type = FlatBufferManager::Float64; + break; + default: + UNREACHABLE(); + break; + } + } else { + UNREACHABLE(); + } + switch (ti.get_type()) { case kPOINT: { FlatBufferManager::GeoPoint metadata{items_count, @@ -1519,6 +1694,17 @@ inline int64_t getFlatBufferSize(int64_t items_count, return FlatBufferManager::compute_flatbuffer_size( GeoPolygonFormatId, reinterpret_cast(&metadata)); } + case kMULTIPOINT: + case kMULTILINESTRING: + case kMULTIPOLYGON: { + return FlatBufferManager::compute_flatbuffer_size( + /* dimensions= */ dimensions, + /* total_items_count= */ items_count, + /* total sizes count= */ max_nof_sizes, + /* total values count= */ max_nof_values, + value_type, + /* user data size= */ sizeof(SQLTypeInfoLite)); + } case kARRAY: { const size_t array_item_size = ti.get_elem_type().get_size(); const auto dict_key = ti.getStringDictKey(); @@ -1539,6 +1725,75 @@ inline void 
initializeFlatBuffer(FlatBufferManager& m, int64_t items_count, int64_t max_nof_values, const SQLTypeInfo& ti) { + size_t dimensions = 0; + FlatBufferManager::ValueType value_type; + int64_t max_nof_sizes = 0; + switch (ti.get_type()) { + case kPOINT: + dimensions = 0; + break; + case kLINESTRING: + case kMULTIPOINT: + case kARRAY: + dimensions = 1; + max_nof_sizes = items_count; + break; + case kPOLYGON: + case kMULTILINESTRING: + dimensions = 2; + max_nof_sizes = items_count + max_nof_values / 3; + break; + case kMULTIPOLYGON: + dimensions = 3; + max_nof_sizes = items_count + 2 * (max_nof_values / 3 + 1); + break; + default: + UNREACHABLE(); + } + + if (ti.is_geometry()) { + if (ti.get_compression() == kENCODING_GEOINT) { + value_type = FlatBufferManager::PointInt32; + } else { + value_type = FlatBufferManager::PointFloat64; + } + } else if (ti.is_array()) { + switch (ti.get_subtype()) { + case kBOOLEAN: + value_type = FlatBufferManager::Bool8; + break; + case kTINYINT: + value_type = FlatBufferManager::Int8; + break; + case kSMALLINT: + value_type = FlatBufferManager::Int16; + break; + case kINT: + value_type = FlatBufferManager::Int32; + break; + case kTEXT: + CHECK_EQ(ti.get_compression(), kENCODING_DICT); + value_type = FlatBufferManager::Int32; + break; + case kBIGINT: + value_type = FlatBufferManager::Int64; + break; + case kFLOAT: + value_type = FlatBufferManager::Float32; + break; + case kDOUBLE: + value_type = FlatBufferManager::Float64; + break; + default: + UNREACHABLE(); + break; + } + } else { + UNREACHABLE(); + } + + SQLTypeInfoLite ti_lite = ti.toLite(); + switch (ti.get_type()) { case kPOINT: { FlatBufferManager::GeoPoint metadata{items_count, @@ -1567,6 +1822,31 @@ inline void initializeFlatBuffer(FlatBufferManager& m, m.initialize(GeoPolygonFormatId, reinterpret_cast(&metadata)); break; } + case kMULTIPOINT: + case kMULTILINESTRING: + case kMULTIPOLYGON: { + int8_t* null_value_ptr = nullptr; + uint32_t geoint_null_value[2] = {0x80000000U, 0x80000000U}; + double null_point[2] = {2 * DBL_MIN, 2 * DBL_MIN}; + if (ti.get_compression() == kENCODING_GEOINT) { + null_value_ptr = reinterpret_cast(geoint_null_value); + } else { + null_value_ptr = reinterpret_cast(null_point); + } + auto status = + m.initialize(NestedArrayFormatId, + /* dimensions= */ dimensions, + /* total_items_count= */ items_count, + /* total_sizes_count= */ max_nof_sizes, + /* total_values_count= */ max_nof_values, + value_type, + /* null value buffer=*/null_value_ptr, // null value buffer size + // is defined by value type + /* user data buffer=*/reinterpret_cast(&ti_lite), + /* user data buffer size=*/sizeof(SQLTypeInfoLite)); + CHECK_EQ(status, FlatBufferManager::Success); + break; + } case kARRAY: { const size_t array_item_size = ti.get_elem_type().get_size(); const auto dict_key = ti.getStringDictKey(); diff --git a/Shared/sqltypes_lite.h b/Shared/sqltypes_lite.h new file mode 100644 index 0000000000..529351361b --- /dev/null +++ b/Shared/sqltypes_lite.h @@ -0,0 +1,66 @@ +/* + * Copyright 2023 HEAVY.AI, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + Provides a light-weight data structure SQLTypeInfoLite to serialize + SQLTypeInfo (from sqltypes.h) for the extension functions (in + heavydbTypes.h) by FlatBufferManager. + + Extend SQLTypeInfoLite struct as needed but keep it simple so that + both sqltypes.h and heavydbTypes.h are able to include it (recall, + the two header files cannot include each other). +*/ + +#pragma once + +#include + +struct SQLTypeInfoLite { + enum SQLTypes { + UNSPECIFIED = 0, + BOOLEAN, + TINYINT, + SMALLINT, + INT, + BIGINT, + FLOAT, + DOUBLE, + POINT, + LINESTRING, + POLYGON, + MULTIPOINT, + MULTILINESTRING, + MULTIPOLYGON, + TEXT, + ARRAY + }; + enum EncodingType { + NONE = 0, + DICT, // used by TEXT and ARRAY of TEXT + GEOINT // used by geotypes + }; + SQLTypes type; + SQLTypes subtype; // used by ARRAY + EncodingType compression; // used by geotypes and TEXT and ARRAY of TEXT + int32_t dimension; // input_srid + int32_t scale; // output_srid + int32_t db_id; // used by TEXT and ARRAY of TEXT + int32_t dict_id; // used by TEXT and ARRAY of TEXT + + inline bool is_geoint() const { return compression == GEOINT; } + inline int32_t get_input_srid() const { return dimension; } + inline int32_t get_output_srid() const { return scale; } +}; diff --git a/TableArchiver/TableArchiver.cpp b/TableArchiver/TableArchiver.cpp index 14afc7533d..3180099ace 100644 --- a/TableArchiver/TableArchiver.cpp +++ b/TableArchiver/TableArchiver.cpp @@ -54,9 +54,13 @@ extern bool g_cluster; extern std::string g_base_path; bool g_test_rollback_dump_restore{false}; +constexpr static int kDumpVersion = 1; +constexpr static int kDumpVersion_remove_render_group_columns = 1; + constexpr static char const* table_schema_filename = "_table.sql"; constexpr static char const* table_oldinfo_filename = "_table.oldinfo"; constexpr static char const* table_epoch_filename = "_table.epoch"; +constexpr static char const* table_dumpversion_filename = "_table.dumpversion"; #if BOOST_VERSION < 107300 namespace std { @@ -79,7 +83,9 @@ inline std::string abs_path(const File_Namespace::GlobalFileMgr* global_file_mgr return boost::filesystem::canonical(global_file_mgr->getBasePath()).string(); } -inline std::string run(const std::string& cmd, const std::string& chdir = "") { +inline std::string run(const std::string& cmd, + const std::string& chdir = "", + const bool log_failure = true) { VLOG(3) << "running cmd: " << cmd; int rcode; std::error_code ec; @@ -99,11 +105,13 @@ inline std::string run(const std::string& cmd, const std::string& chdir = "") { errors = ss_errors.str(); }); if (rcode || ec) { - LOG(ERROR) << "failed cmd: " << cmd; - LOG(ERROR) << "exit code: " << rcode; - LOG(ERROR) << "error code: " << ec.value() << " - " << ec.message(); - LOG(ERROR) << "stdout: " << output; - LOG(ERROR) << "stderr: " << errors; + if (log_failure) { + LOG(ERROR) << "failed cmd: " << cmd; + LOG(ERROR) << "exit code: " << rcode; + LOG(ERROR) << "error code: " << ec.value() << " - " << ec.message(); + LOG(ERROR) << "stdout: " << output; + LOG(ERROR) << "stderr: " << errors; + } #if defined(__APPLE__) // osx bsdtar options "--use-compress-program" and "--fast-read" together // run into pipe write error after tar extracts the first occurrence of a @@ -150,7 +158,8 @@ inline std::string run(const std::string& cmd, const std::string& chdir = "") { inline std::string simple_file_cat(const std::string& archive_path, const std::string& file_name, - const 
std::string& compression) { + const std::string& compression, + const bool log_failure = true) { ddl_utils::validate_allowed_file_path(archive_path, ddl_utils::DataTransferType::IMPORT); #if defined(__APPLE__) @@ -163,7 +172,8 @@ inline std::string simple_file_cat(const std::string& archive_path, boost::filesystem::create_directories(temp_dir); run("tar " + compression + " -xvf " + get_quoted_string(archive_path) + " " + opt_occurrence + " " + file_name, - temp_dir.string()); + temp_dir.string(), + log_failure); const auto output = run("cat " + (temp_dir / file_name).string()); boost::filesystem::remove_all(temp_dir); return output; @@ -180,10 +190,11 @@ inline std::string get_table_schema(const std::string& archive_path, // If a table was altered there may be a mapping from old column ids to new ones these // values need to be replaced in the page headers. -void rewrite_column_ids_in_page_headers( +void update_or_drop_column_ids_in_page_headers( const boost::filesystem::path& path, const std::unordered_map& column_ids_map, - const int32_t table_epoch) { + const int32_t table_epoch, + const bool drop_not_update) { const std::string file_path = path.string(); const std::string file_name = path.filename().string(); std::vector tokens; @@ -230,11 +241,31 @@ void rewrite_column_ids_in_page_headers( continue; } auto column_map_it = column_ids_map.find(col_id); - CHECK(column_map_it != column_ids_map.end()) << "could not find " << col_id; - // If a header contains a column id that is remapped to new location - // then write that change to the file. - if (const auto dest_col_id = column_map_it->second; col_id != dest_col_id) { - col_id = dest_col_id; + bool rewrite_header = false; + if (drop_not_update) { + // if the header contains a column ID that is a key of the map + // erase the entire header so that column is effectively dropped + // the value of the map is ignored, thus allowing us to use the + // same function for both operations + if (column_map_it != column_ids_map.end()) { + // clear the entire header + std::memset(header_info, 0, sizeof(header_info)); + rewrite_header = true; + } + } else { + if (column_map_it == column_ids_map.end()) { + throw std::runtime_error("Page " + std::to_string(page) + " in " + file_path + + " has unexpected Column ID " + std::to_string(col_id) + + ". Dump may be corrupt."); + } + // If a header contains a column id that is remapped to new location + // then write that change to the file. + if (const auto dest_col_id = column_map_it->second; col_id != dest_col_id) { + col_id = dest_col_id; + rewrite_header = true; + } + } + if (rewrite_header) { if (0 != std::fseek(fp.get(), page * page_size, SEEK_SET)) { throw std::runtime_error("Failed to seek to page# " + std::to_string(page) + file_path + " for write: " + std::strerror(errno)); @@ -248,12 +279,15 @@ void rewrite_column_ids_in_page_headers( } } -// Adjust column ids in chunk keys in a table's data files under a temp_data_dir, +// Rewrite column ids in chunk keys in a table's data files under a temp_data_dir, // including files of all shards of the table. Can be slow for big files but should // be scale faster than refragmentizing. Table altering should be rare for olap. -void adjust_altered_table_files(const int32_t table_epoch, - const std::string& temp_data_dir, - const std::unordered_map& column_ids_map) { +// Also used to erase page headers for columns that must be dropped completely. 
+void update_or_drop_column_ids_in_table_files( + const int32_t table_epoch, + const std::string& temp_data_dir, + const std::unordered_map& column_ids_map, + const bool drop_not_update) { boost::filesystem::path base_path(temp_data_dir); boost::filesystem::recursive_directory_iterator end_it; ThreadController_NS::SimpleThreadController<> thread_controller(cpu_threads()); @@ -261,8 +295,11 @@ void adjust_altered_table_files(const int32_t table_epoch, ++fit) { if (!boost::filesystem::is_symlink(fit->path()) && boost::filesystem::is_regular_file(fit->status())) { - thread_controller.startThread( - rewrite_column_ids_in_page_headers, fit->path(), column_ids_map, table_epoch); + thread_controller.startThread(update_or_drop_column_ids_in_page_headers, + fit->path(), + column_ids_map, + table_epoch, + drop_not_update); thread_controller.checkThreadsStatus(); } } @@ -330,6 +367,73 @@ void rename_table_directories(const File_Namespace::GlobalFileMgr* global_file_m } } +std::unordered_map find_render_group_columns( + const std::list& src_columns, + std::vector& src_oldinfo_strs, + const std::string& archive_path) { + // scan for poly or mpoly columns and collect their names + std::vector poly_column_names; + for (auto const& src_column : src_columns) { + auto const sqltype = src_column.columnType.get_type(); + if (sqltype == kPOLYGON || sqltype == kMULTIPOLYGON) { + poly_column_names.push_back(src_column.columnName); + } + } + + // remove any matching render group columns from the source list + // and capture their IDs in the keys of a map (value is ignored) + std::unordered_map column_ids_to_drop; + auto last_itr = std::remove_if( + src_oldinfo_strs.begin(), + src_oldinfo_strs.end(), + [&](const std::string& v) -> bool { + // tokenize + std::vector tokens; + boost::algorithm::split( + tokens, v, boost::is_any_of(":"), boost::token_compress_on); + // extract name and ID + if (tokens.size() < 2) { + throw std::runtime_error( + "Dump " + archive_path + + " has invalid oldinfo file contents. Dump may be corrupt."); + } + auto const& column_name = tokens[0]; + auto const column_id = std::stoi(tokens[1]); + for (auto const& poly_column_name : poly_column_names) { + // is it a render group column? 
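+        // [Editorial note, not part of this changeset.] Legacy render group
+        // columns follow the "<poly_column>_render_group" naming convention
+        // built just below, so a dump of a table with a "poly" column is
+        // expected to carry a companion oldinfo entry such as
+        // "poly_render_group:4" (this token layout is illustrative; only
+        // tokens[0], the name, and tokens[1], the numeric ID, are used here).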
+ auto const render_group_column_name = poly_column_name + "_render_group"; + if (column_name == render_group_column_name) { + LOG(INFO) << "RESTORE TABLE dropping render group column '" + << render_group_column_name << "' from dump " << archive_path; + // add to "set" + column_ids_to_drop[column_id] = -1; + return true; + } + } + return false; + }); + src_oldinfo_strs.erase(last_itr, src_oldinfo_strs.end()); + + return column_ids_to_drop; +} + +void drop_render_group_columns( + const std::unordered_map& render_group_column_ids, + const std::string& archive_path, + const std::string& temp_data_dir, + const std::string& compression) { + // rewrite page files to drop the columns with IDs that are the keys of the map + if (render_group_column_ids.size()) { + const auto epoch = boost::lexical_cast( + simple_file_cat(archive_path, table_epoch_filename, compression)); + const auto time_ms = measure<>::execution([&]() { + update_or_drop_column_ids_in_table_files( + epoch, temp_data_dir, render_group_column_ids, true /* drop */); + }); + VLOG(3) << "drop render group columns: " << time_ms << " ms"; + } +} + } // namespace void TableArchiver::dumpTable(const TableDescriptor* td, @@ -387,6 +491,10 @@ void TableArchiver::dumpTable(const TableDescriptor* td, const auto table_name = td->tableName; { + // - gen dumpversion file + const auto dumpversion_str = std::to_string(kDumpVersion); + file_writer( + uuid_dir / table_dumpversion_filename, "table dumpversion", dumpversion_str); // - gen schema file const auto schema_str = cat_->dumpSchema(td); file_writer(uuid_dir / table_schema_filename, "table schema", schema_str); @@ -512,6 +620,36 @@ void TableArchiver::restoreTable(const Catalog_Namespace::SessionInfo& session, all_src_oldinfo_str, boost::is_any_of(" "), boost::token_compress_on); + + // fetch dump version + int dump_version = -1; + try { + // attempt to read file, do not log if fail to read + auto const dump_version_str = + simple_file_cat(archive_path, table_dumpversion_filename, compression, false); + dump_version = std::stoi(dump_version_str); + } catch (std::runtime_error& e) { + // no dump version file found + dump_version = 0; + } + LOG(INFO) << "Dump Version: " << dump_version; + + // version-specific behavior + const bool do_drop_render_group_columns = + (dump_version < kDumpVersion_remove_render_group_columns); + + // remove any render group columns from the source columns so that the list of + // source columns matches the already-created table, and the removed ones will + // not have an entry in column_ids_map, and hence will not have their data + // mapped later (effectively dropping them), and return their IDs for when + // they are actually dropped later + std::unordered_map render_group_column_ids; + if (do_drop_render_group_columns) { + render_group_column_ids = + find_render_group_columns(src_columns, src_oldinfo_strs, archive_path); + } + + // compare with the destination columns auto all_dst_columns = cat_->getAllColumnMetadataForTable(td->tableId, true, true, true); if (src_oldinfo_strs.size() != all_dst_columns.size()) { @@ -559,20 +697,30 @@ void TableArchiver::restoreTable(const Catalog_Namespace::SessionInfo& session, was_table_altered = was_table_altered || it.first != it.second; }); VLOG(3) << "was_table_altered = " << was_table_altered; + // extract all data files to a temp dir. will swap with dst table dir after all set, // otherwise will corrupt table in case any bad thing happens in the middle. 
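+  // [Editorial sketch, not part of this changeset.] This is the usual
+  // "stage into a temporary directory, then swap" pattern: the archive is
+  // fully extracted next to the live table directory, and only after every
+  // step succeeds are the directories exchanged, e.g. (hypothetical helper
+  // and paths):
+  //
+  //   namespace fs = boost::filesystem;
+  //   fs::path tmp = table_dir.string() + ".tmp";
+  //   extract_archive_to(tmp);             // a failure leaves table_dir intact
+  //   fs::rename(table_dir, table_dir.string() + ".old");
+  //   fs::rename(tmp, table_dir);          // swap in the fully staged copy
+  //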
run("rm -rf " + temp_data_dir.string()); run("mkdir -p " + temp_data_dir.string()); run("tar " + compression + " -xvf " + get_quoted_string(archive_path), temp_data_dir); + // drop the render group columns here + if (do_drop_render_group_columns) { + drop_render_group_columns( + render_group_column_ids, archive_path, temp_data_dir, compression); + } + // if table was ever altered after it was created, update column ids in chunk headers. if (was_table_altered) { const auto epoch = boost::lexical_cast( simple_file_cat(archive_path, table_epoch_filename, compression)); - const auto time_ms = measure<>::execution( - [&]() { adjust_altered_table_files(epoch, temp_data_dir, column_ids_map); }); - VLOG(3) << "adjust_altered_table_files: " << time_ms << " ms"; + const auto time_ms = measure<>::execution([&]() { + update_or_drop_column_ids_in_table_files( + epoch, temp_data_dir, column_ids_map, false /* update */); + }); + VLOG(3) << "update_column_ids_table_files: " << time_ms << " ms"; } + // finally,,, swap table data/dict dirs! const auto data_file_dirs = cat_->getTableDataDirectories(td); const auto dict_file_dirs = cat_->getTableDictDirectories(td); diff --git a/Tests/DumpRestoreTest.cpp b/Tests/DumpRestoreTest.cpp index 2d58baf924..ff793a893b 100644 --- a/Tests/DumpRestoreTest.cpp +++ b/Tests/DumpRestoreTest.cpp @@ -250,6 +250,7 @@ class DumpAndRestoreTest : public ::testing::Test { boost::filesystem::remove_all(tar_ball_path); run_ddl_statement("DROP TABLE IF EXISTS test_table;"); run_ddl_statement("DROP TABLE IF EXISTS test_table_2;"); + run_ddl_statement("DROP TABLE IF EXISTS render_groups;"); g_test_rollback_dump_restore = false; } @@ -257,6 +258,7 @@ class DumpAndRestoreTest : public ::testing::Test { boost::filesystem::remove_all(tar_ball_path); run_ddl_statement("DROP TABLE IF EXISTS test_table;"); run_ddl_statement("DROP TABLE IF EXISTS test_table_2;"); + run_ddl_statement("DROP TABLE IF EXISTS render_groups;"); } void sqlAndCompareResult(const std::string& sql, @@ -581,6 +583,30 @@ TEST_F(DumpAndRestoreTest, DumpAlteredTable) { sqlAndCompareResult("SELECT * FROM test_table;", std::vector{1}); } +TEST_F(DumpAndRestoreTest, DropRenderGroupColumns) { + static constexpr int kNullInt = std::numeric_limits::min(); + auto file_path = boost::filesystem::canonical( + "../../Tests/Export/TableDump/dump_with_render_groups_good.gz") + .string(); + EXPECT_NO_THROW(run_ddl_statement("RESTORE TABLE render_groups FROM '" + file_path + + "' WITH (compression='gzip');")); + sqlAndCompareResult("SELECT id FROM render_groups;", + std::vector{1, 2, 3, 4, 5}); + sqlAndCompareResult("SELECT ST_NPOINTS(poly) FROM render_groups;", + std::vector{4, kNullInt, 3, 3, 0}); + sqlAndCompareResult("SELECT ST_NPOINTS(multipoly) FROM render_groups;", + std::vector{6, kNullInt, 9, 9, 0}); +} + +TEST_F(DumpAndRestoreTest, DropRenderGroupColumnsBad) { + auto file_path = boost::filesystem::canonical( + "../../Tests/Export/TableDump/dump_with_render_groups_bad.gz") + .string(); + EXPECT_THROW(run_ddl_statement("RESTORE TABLE render_groups FROM '" + file_path + + "' WITH (compression='gzip');"), + std::runtime_error); +} + #ifdef HAVE_AWS_S3 class S3RestoreTest : public DumpAndRestoreTest { protected: diff --git a/Tests/ExecuteTest.cpp b/Tests/ExecuteTest.cpp index 7069bcb69c..d7e6f722fc 100644 --- a/Tests/ExecuteTest.cpp +++ b/Tests/ExecuteTest.cpp @@ -12820,6 +12820,45 @@ TEST_F(Select, Joins_Negative_ShardKey) { } } +TEST_F(Select, Joins_One_Shard_Is_Empty) { + SKIP_ALL_ON_AGGREGATOR(); + if 
(skip_tests(ExecutorDeviceType::GPU)) { + return; + } + size_t num_shards = choose_shard_count(); + if (num_shards == 1) { + return; + } + auto drop_tbls = [] { + run_ddl_statement("DROP TABLE IF EXISTS SJ1;"); + run_ddl_statement("DROP TABLE IF EXISTS SJ2;"); + }; + drop_tbls(); + for (auto tbl_name : {"SJ1", "SJ2"}) { + std::ostringstream oss; + oss << "CREATE TABLE " << tbl_name + << " (v INT, SHARD KEY(v)) WITH (SHARD_COUNT=" << num_shards << ");"; + run_ddl_statement(oss.str()); + } + for (auto v : {2, 10}) { + std::ostringstream oss; + oss << "INSERT INTO SJ1 VALUES(" << v << ");"; + run_multiple_agg(oss.str(), ExecutorDeviceType::CPU); + } + for (auto v : {30, 2, 2, 4, 1, 3}) { + std::ostringstream oss; + oss << "INSERT INTO SJ2 VALUES(" << v << ");"; + run_multiple_agg(oss.str(), ExecutorDeviceType::CPU); + } + ScopeGuard reset = [drop_tbls] { drop_tbls(); }; + for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { + SKIP_NO_GPU(); + EXPECT_EQ(v(run_simple_agg( + "SELECT COUNT(1) FROM SJ1 R, SJ2 S WHERE R.v = S.v;", dt)), + int64_t(2)); + } +} + TEST_F(Select, Joins_InnerJoin_AtLeastThreeTables) { for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { SKIP_NO_GPU(); diff --git a/Tests/Export/TableDump/dump_with_render_groups_bad.gz b/Tests/Export/TableDump/dump_with_render_groups_bad.gz new file mode 100644 index 0000000000..b5d74b34b7 Binary files /dev/null and b/Tests/Export/TableDump/dump_with_render_groups_bad.gz differ diff --git a/Tests/Export/TableDump/dump_with_render_groups_good.gz b/Tests/Export/TableDump/dump_with_render_groups_good.gz new file mode 100644 index 0000000000..33b4147a1f Binary files /dev/null and b/Tests/Export/TableDump/dump_with_render_groups_good.gz differ diff --git a/Tests/Export/TableDump/truncate_test_dump_file.py b/Tests/Export/TableDump/truncate_test_dump_file.py new file mode 100644 index 0000000000..1ffbac8c0e --- /dev/null +++ b/Tests/Export/TableDump/truncate_test_dump_file.py @@ -0,0 +1,58 @@ +import os +import re +import shutil +import struct +import tarfile + +from argparse import ArgumentParser + +output_file_suffix = "_truncated" +def truncate_dump_file(file_path: str): + extract_path = "./dump_file_extract" + with tarfile.open(file_path, "r") as old_dump_tar: + old_dump_tar.extractall(path=extract_path) + + for root, _, files in os.walk(extract_path): + for file in files: + match = re.match("\d+\\.(\d+)\\.data", file) + if match: + truncate_data_file(os.path.join(root, file), int(match.group(1))) + + file_name = os.path.basename(file_path) + with tarfile.open(f"{file_name}{output_file_suffix}", "w:gz") as new_dump_tar: + new_dump_tar.add(extract_path, arcname="") + + shutil.rmtree(extract_path) + +def truncate_data_file(file_path: str, page_size: int): + print(f"Truncating data file: {file_path}") + + file_size = os.path.getsize(file_path) + last_empty_header_offset = None + with open(file_path, "r+b") as file: + file_offet = 0 + while file_offet < file_size: + file.seek(file_offet) + header_size = struct.unpack('i', file.read(4))[0] + + # Capture the first free page that is followed by only free pages. + if header_size == 0 and last_empty_header_offset is None: + last_empty_header_offset = file_offet + elif header_size != 0 and last_empty_header_offset is not None: + last_empty_header_offset = None + file_offet += page_size + + if last_empty_header_offset is not None: + # Keep at least one free page in order to ensure that code branches that check for free pages are executed. 
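+        # [Editorial note, not part of this changeset.] A header size of 0
+        # marks a free page, so the scan above leaves last_empty_header_offset
+        # at the first page of the trailing run of free pages. Truncating one
+        # page past it keeps exactly one free page: e.g. for per-page header
+        # sizes [8, 0, 8, 0, 0] and page size P, last_empty_header_offset ends
+        # up at 3*P and the file is cut to 4*P, dropping only the final page.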
+ new_file_size = last_empty_header_offset + page_size + if new_file_size < file_size: + file.truncate(new_file_size) + +if __name__ == '__main__': + parser = ArgumentParser(description=f"""Truncate the dump file at the specified path for tests. Truncation involves + removal of excess free pages that do not add to test coverage. Output + dump file is created in the current directory with a file name that matches + the input file name along with a {output_file_suffix} suffix.""") + parser.add_argument("file_path", type=str, help="Path to file file") + args = parser.parse_args() + truncate_dump_file(file_path=args.file_path) diff --git a/Tests/Import/datafiles/geospatial.csv b/Tests/Import/datafiles/geospatial.csv index d965daa959..251427d752 100644 --- a/Tests/Import/datafiles/geospatial.csv +++ b/Tests/Import/datafiles/geospatial.csv @@ -1,11 +1,11 @@ "point", "linestring", "polygon", "multipolygon", "point2", "point3", "point4", "trip_distance" -"POINT(0 0)", "LINESTRING(0 0, 0 0)", "POLYGON((0 0, 1 0, 0 1, 0 0))", "MULTIPOLYGON(((0 0, 1 0, 0 1, 0 0)))", "POINT(0 0)", "POINT(0 0)", "POINT(0 0)", "0.0" -"POINT(1 1)", "LINESTRING(1 0, 2 2, 3 3)", "POLYGON((0 0, 2 0, 0 2, 0 0))", "MULTIPOLYGON(((0 0, 2 0, 0 2, 0 0)))", "POINT(1 1)", "POINT(1 1)", "POINT(1 1)", "1.0" -"POINT(2 2)", "LINESTRING(2 0, 4 4)", "POLYGON((0 0, 3 0, 0 3, 0 0))", "MULTIPOLYGON(((0 0, 3 0, 0 3, 0 0)))", "POINT(2 2)", "POINT(2 2)", "POINT(2 2)", "2.0" -"POINT(3 3)", "LINESTRING(3 0, 6 6, 7 7)", "POLYGON((0 0, 4 0, 0 4, 0 0))", "MULTIPOLYGON(((0 0, 4 0, 0 4, 0 0)))", "POINT(3 3)", "POINT(3 3)", "POINT(3 3)", "3.0" -"POINT(4 4)", "LINESTRING(4 0, 8 8)", "POLYGON((0 0, 5 0, 0 5, 0 0))", "MULTIPOLYGON(((0 0, 5 0, 0 5, 0 0)))", "POINT(4 4)", "POINT(4 4)", "POINT(4 4)", "4.0" -"POINT(5 5)", "LINESTRING(5 0, 10 10, 11 11)", "POLYGON((0 0, 6 0, 0 6, 0 0))", "MULTIPOLYGON(((0 0, 6 0, 0 6, 0 0)))", "POINT(5 5)", "POINT(5 5)", "POINT(5 5)", "5.0" -"POINT(6 6)", "LINESTRING(6 0, 12 12)", "POLYGON((0 0,7 0, 0 7, 0 0))", "MULTIPOLYGON(((0 0, 7 0, 0 7, 0 0)))", "POINT(6 6)", "POINT(6 6)", "POINT(6 6)", "6.0" -"POINT(7 7)", "LINESTRING(7 0, 14 14, 15 15)", "POLYGON((0 0, 8 0, 0 8, 0 0))", "MULTIPOLYGON(((0 0, 8 0, 0 8, 0 0)))", "POINT(7 7)", "POINT(7 7)", "POINT(7 7)", "7.0" -"POINT(8 8)", "LINESTRING(8 0, 16 16)", "POLYGON((0 0,9 0, 0 9, 0 0))", "MULTIPOLYGON(((0 0, 9 0, 0 9, 0 0)))", "POINT(8 8)", "POINT(8 8)", "POINT(8 8)", "8.0" -"POINT(9 9)", "LINESTRING(9 0, 18 18, 19 19)", "POLYGON((0 0, 10 0, 0 10, 0 0))", "MULTIPOLYGON(((0 0, 10 0, 0 10, 0 0)))", "POINT(9 9)", "POINT(9 9)", "POINT(9 9)", "9.0" +"POINT(0 0)", "LINESTRING(0 0, 0 0)", "POLYGON((0 0, 1 0, 0 1, 0 0))", "MULTIPOLYGON(((0 0, 1 0, 0 1, 0 0)))", "POINT(0 0)", "POINT(0 0)", "POINT(0 0)", 0.0 +"POINT(1 1)", "LINESTRING(1 0, 2 2, 3 3)", "POLYGON((0 0, 2 0, 0 2, 0 0))", "MULTIPOLYGON(((0 0, 2 0, 0 2, 0 0)))", "POINT(1 1)", "POINT(1 1)", "POINT(1 1)", 1.0 +"POINT(2 2)", "LINESTRING(2 0, 4 4)", "POLYGON((0 0, 3 0, 0 3, 0 0))", "MULTIPOLYGON(((0 0, 3 0, 0 3, 0 0)))", "POINT(2 2)", "POINT(2 2)", "POINT(2 2)", 2.0 +"POINT(3 3)", "LINESTRING(3 0, 6 6, 7 7)", "POLYGON((0 0, 4 0, 0 4, 0 0))", "MULTIPOLYGON(((0 0, 4 0, 0 4, 0 0)))", "POINT(3 3)", "POINT(3 3)", "POINT(3 3)", 3.0 +"POINT(4 4)", "LINESTRING(4 0, 8 8)", "POLYGON((0 0, 5 0, 0 5, 0 0))", "MULTIPOLYGON(((0 0, 5 0, 0 5, 0 0)))", "POINT(4 4)", "POINT(4 4)", "POINT(4 4)", 4.0 +"POINT(5 5)", "LINESTRING(5 0, 10 10, 11 11)", "POLYGON((0 0, 6 0, 0 6, 0 0))", "MULTIPOLYGON(((0 0, 6 0, 0 6, 0 0)))", "POINT(5 5)", "POINT(5 5)", 
"POINT(5 5)", 5.0 +"POINT(6 6)", "LINESTRING(6 0, 12 12)", "POLYGON((0 0,7 0, 0 7, 0 0))", "MULTIPOLYGON(((0 0, 7 0, 0 7, 0 0)))", "POINT(6 6)", "POINT(6 6)", "POINT(6 6)", 6.0 +"POINT(7 7)", "LINESTRING(7 0, 14 14, 15 15)", "POLYGON((0 0, 8 0, 0 8, 0 0))", "MULTIPOLYGON(((0 0, 8 0, 0 8, 0 0)))", "POINT(7 7)", "POINT(7 7)", "POINT(7 7)", 7.0 +"POINT(8 8)", "LINESTRING(8 0, 16 16)", "POLYGON((0 0,9 0, 0 9, 0 0))", "MULTIPOLYGON(((0 0, 9 0, 0 9, 0 0)))", "POINT(8 8)", "POINT(8 8)", "POINT(8 8)", 8.0 +"POINT(9 9)", "LINESTRING(9 0, 18 18, 19 19)", "POLYGON((0 0, 10 0, 0 10, 0 0))", "MULTIPOLYGON(((0 0, 10 0, 0 10, 0 0)))", "POINT(9 9)", "POINT(9 9)", "POINT(9 9)", 9.0 diff --git a/Tests/Import/datafiles/random_strings_with_line_endings.7z b/Tests/Import/datafiles/random_strings_with_line_endings.7z index b2a6db7fbc..f307ab3499 100644 Binary files a/Tests/Import/datafiles/random_strings_with_line_endings.7z and b/Tests/Import/datafiles/random_strings_with_line_endings.7z differ diff --git a/Tests/Import/datafiles/with_quoted_fields_and_side_spaces.csv b/Tests/Import/datafiles/with_quoted_fields_and_side_spaces.csv new file mode 100644 index 0000000000..dd7c9f131b --- /dev/null +++ b/Tests/Import/datafiles/with_quoted_fields_and_side_spaces.csv @@ -0,0 +1,5 @@ +"id","str1","bool1","smallint1" +"1", " test1 ", "true" , "1" + "2","test2","false","2" +"3", "", "true", "3" +"4", test4 , "false", "4" \ No newline at end of file diff --git a/Tests/Import/datafiles/with_quoted_fields_unmatched_left_quote.csv b/Tests/Import/datafiles/with_quoted_fields_unmatched_left_quote.csv new file mode 100644 index 0000000000..9d3a623330 --- /dev/null +++ b/Tests/Import/datafiles/with_quoted_fields_unmatched_left_quote.csv @@ -0,0 +1,4 @@ +"id","str1","bool1","smallint1" +"1", " test1 ", "true" , "1" + "2","test2,"false","2" +"3", "", "true", "3" \ No newline at end of file diff --git a/Tests/Import/datafiles/with_quoted_fields_unmatched_right_quote.csv b/Tests/Import/datafiles/with_quoted_fields_unmatched_right_quote.csv new file mode 100644 index 0000000000..868b73573e --- /dev/null +++ b/Tests/Import/datafiles/with_quoted_fields_unmatched_right_quote.csv @@ -0,0 +1,4 @@ +"id","str1","bool1","smallint1" +"1", " test1 ", "true" , "1" + "2",test2","false","2" +"3", "", "true", "3" \ No newline at end of file diff --git a/Tests/ImportExportTest.cpp b/Tests/ImportExportTest.cpp index ccb802c933..3d23e3b3ec 100644 --- a/Tests/ImportExportTest.cpp +++ b/Tests/ImportExportTest.cpp @@ -2173,6 +2173,15 @@ const char* create_table_with_side_spaces = R"( ) WITH (FRAGMENT_SIZE=75000000); )"; +const char* create_table_with_quoted_fields_and_side_spaces = R"( + CREATE TABLE with_quoted_fields_and_side_spaces ( + id INTEGER, + str1 TEXT, + bool1 BOOLEAN, + smallint1 SMALLINT + ) WITH (FRAGMENT_SIZE=75000000); + )"; + const char* create_table_with_side_spaced_array = R"( CREATE TABLE array_with_side_spaces ( id INTEGER, @@ -2223,6 +2232,8 @@ class ImportTest : public ImportExportTestBase { sql(create_table_with_null_text_arrays); sql("drop table if exists with_side_spaces;"); sql(create_table_with_side_spaces); + sql("drop table if exists with_quoted_fields_and_side_spaces;"); + sql(create_table_with_quoted_fields_and_side_spaces); sql("drop table if exists array_with_side_spaces;"); sql(create_table_with_side_spaced_array); sql("drop table if exists null_table;"); @@ -2331,6 +2342,25 @@ class ImportTest : public ImportExportTestBase { return true; } + void importTestWithQuotedFieldsAndSideSpaces(const std::string& filename, + 
const std::string& quoted, + const std::string& trim) { + sql("TRUNCATE TABLE with_quoted_fields_and_side_spaces;"); + string query_str = + "COPY with_quoted_fields_and_side_spaces FROM '../../Tests/Import/datafiles/" + + filename + "' WITH (quoted='" + quoted + "',trim_spaces='" + trim + "');"; + sql(query_str); + string select_query_str = + "SELECT * FROM with_quoted_fields_and_side_spaces ORDER BY id;"; + string test1 = (trim == "true" ? "test1" : " test1 "); + string test4 = (trim == "true" ? "test4" : " test4 "); + sqlAndCompareResult(select_query_str, + {{i(1), test1, True, i(1)}, + {i(2), "test2", False, i(2)}, + {i(3), NULL, True, i(3)}, + {i(4), test4, False, i(4)}}); + } + bool importTestArrayWithSideSpaces(const string& filename, const string& trim) { sql("TRUNCATE TABLE array_with_side_spaces;"); string query_str = "COPY array_with_side_spaces FROM '../../Tests/Import/datafiles/" + @@ -2698,12 +2728,37 @@ TEST_F(ImportTest, with_quoted_fields) { } } +TEST_F(ImportTest, with_quoted_fields_unmatched_quote) { + std::string error_msg{ + "Unable to find a matching end quote for the quote character '\"' after reading 60 " + "characters. Please ensure that all data fields are correctly formatted or update " + "the \"buffer_size\" option appropriately. Row number: 2. First few characters in " + "row: "}; + queryAndAssertException( + "COPY with_quoted_fields FROM " + "'../../Tests/Import/datafiles/with_quoted_fields_unmatched_left_quote.csv' WITH " + "(header='true', quoted='true');", + error_msg + "\"2\",\"test2,\"false\",\"2\" \n\"3\", \"\", \"t"); + queryAndAssertException( + "COPY with_quoted_fields FROM " + "'../../Tests/Import/datafiles/with_quoted_fields_unmatched_right_quote.csv' WITH " + "(header='true', quoted='true');", + error_msg + "\"2\",test2\",\"false\",\"2\" \n\"3\", \"\", \"t"); +} + TEST_F(ImportTest, with_side_spaces) { for (auto trim : {"false", "true"}) { EXPECT_NO_THROW(importTestWithSideSpaces("with_side_spaces.csv", trim)); } } +TEST_F(ImportTest, with_quoted_fields_and_side_spaces) { + for (auto trim : {"false", "true"}) { + EXPECT_NO_THROW(importTestWithQuotedFieldsAndSideSpaces( + "with_quoted_fields_and_side_spaces.csv", "true", trim)); + } +} + TEST_F(ImportTest, with_side_spaced_array) { for (auto trim : {"false", "true"}) { EXPECT_NO_THROW(importTestArrayWithSideSpaces("array_with_side_spaces.csv", trim)); @@ -3027,7 +3082,7 @@ TEST_F(ImportTestGeo, CSV_Import_Buffer_Size_Less_Than_Row_Size) { } TEST_F(ImportTestGeo, CSV_Import_Max_Buffer_Resize_Less_Than_Row_Size) { - import_export::delimited_parser::set_max_buffer_resize(170); + import_export::delimited_parser::set_max_buffer_resize(168); const auto file_path = boost::filesystem::path("../../Tests/Import/datafiles/geospatial.csv"); @@ -3036,9 +3091,9 @@ TEST_F(ImportTestGeo, CSV_Import_Max_Buffer_Resize_Less_Than_Row_Size) { // adapt value based on which importer we're testing as they have different buffer size // management heuristics if (g_enable_legacy_delimited_import) { - expected_error_message += "170"; + expected_error_message += "168"; } else { - expected_error_message += "169"; + expected_error_message += "167"; } expected_error_message += " characters. 
" diff --git a/Tests/OverlapsJoinTest.cpp b/Tests/OverlapsJoinTest.cpp index b12fddbb96..de1cc96b65 100644 --- a/Tests/OverlapsJoinTest.cpp +++ b/Tests/OverlapsJoinTest.cpp @@ -2338,6 +2338,36 @@ TEST_F(OverlapsJoinRewriteTest, ArgumentOrderingAfterTableReordering) { } } +TEST_F(OverlapsJoinRewriteTest, ArgumentReorderingNonPointCol) { + // test logic is different compared with the previous test, + // so we do not use performTest function here + QR::get()->clearCpuMemory(); + g_enable_distance_rangejoin = true; + ScopeGuard reset_flag = [orig = g_from_table_reordering] { + g_from_table_reordering = orig; + }; + auto q1 = + "SELECT COUNT(*) FROM TEST_GEOPT R, TEST_GEOPT2 S WHERE ST_DISTANCE( " + "ST_GeomFromText('POINT(-87.653800724 41.839365536)', 4326), R.pt4326 ) <= 1.0 AND " + "ST_DISTANCE(R.pt4326, S.pt4326) < 0.01;"; + auto q2 = + "SELECT COUNT(*) FROM TEST_GEOPT R, TEST_GEOPT2 S WHERE ST_DISTANCE( S.pt4326, " + "ST_GeomFromText('POINT(-87.653800724 41.839365536)', 4326) ) <= 1.0 AND " + "ST_DISTANCE(R.pt4326, S.pt4326) < 0.01;"; + auto q3 = + "SELECT COUNT(*) FROM TEST_GEOPT R, TEST_GEOPT2 S WHERE ST_DISTANCE( R.pt4326, " + "ST_GeomFromText('POINT(-87.653800724 41.839365536)', 4326) ) <= 1.0 AND " + "ST_DISTANCE( S.pt4326, ST_GeomFromText('POINT(-87.653800724 41.839365536)', 4326) " + ") <= 1.0;"; + for (bool const table_reordering : {true, false}) { + g_from_table_reordering = table_reordering; + // check whether the query finishes without a crash + EXPECT_EQ((int64_t)0, v(execSQL(q1, ExecutorDeviceType::CPU))); + EXPECT_EQ((int64_t)0, v(execSQL(q2, ExecutorDeviceType::CPU))); + EXPECT_EQ((int64_t)0, v(execSQL(q3, ExecutorDeviceType::CPU))); + } +} + TEST_F(OverlapsJoinRewriteTest, TemporaryTable) { QR::get()->runDDLStatement("DROP TABLE IF EXISTS tp1;"); QR::get()->runDDLStatement("DROP TABLE IF EXISTS tp2;"); diff --git a/Tests/TableFunctionsTest.cpp b/Tests/TableFunctionsTest.cpp index ec66f41edb..4bac02dc0f 100644 --- a/Tests/TableFunctionsTest.cpp +++ b/Tests/TableFunctionsTest.cpp @@ -391,28 +391,38 @@ class TableFunctions : public ::testing::Test { "hh3 GEOMETRY(LINESTRING, 4326) ENCODING NONE, " "p4 GEOMETRY(POLYGON, 900913)," "r4 GEOMETRY(LINESTRING, 900913), h4 GEOMETRY(LINESTRING, 900913), hh4 " - "GEOMETRY(LINESTRING, 900913), sizes INT);"); + "GEOMETRY(LINESTRING, 900913), " + "mp1 MULTIPOLYGON, " + "mp2 GEOMETRY(MULTIPOLYGON, 4326), " + "mp3 GEOMETRY(MULTIPOLYGON, 4326) ENCODING NONE, " + "mp4 GEOMETRY(MULTIPOLYGON, 900913), " + "sizes INT);"); TestHelpers::ValuesGenerator gen("geo_polygon_test"); - run_multiple_agg(gen("'POLYGON((1 2,3 4,5 6,7 8,9 10),(2 3,3 4,1 2))'", - "'LINESTRING(1 2,3 4,5 6,7 8,9 10)'", - "'LINESTRING(2 3,3 4,1 2)'", - "'NULL'", - "'POLYGON((0 0,5 0,5 5,0 5,0 0),(2 2, 2 1,1 1,1 2,2 2))'", - "'LINESTRING(0 0,5 0,5 5,0 5)'", - "'LINESTRING(2 2,2 1,1 1,1 2)'", - "'NULL'", - "'POLYGON((0 0,6 0,6 6,0 6,0 0),(3 3,3 2,2 2,2 3,3 3))'", - "'LINESTRING(0 0,6 0,6 6,0 6))'", - "'LINESTRING(3 3,3 2,2 2,2 3)'", - "'NULL'", - "'POLYGON((0 0,7 0,7 7,0 7,0 0),(4 4,2 4, 2 3,4 2,4 4))'", - "'LINESTRING(0 0,7 0,7 7,0 7)'", - "'LINESTRING(4 4,4 2,2 3,2 4)'", - "'NULL'", - "8"), - ExecutorDeviceType::CPU); + run_multiple_agg( + gen("'POLYGON((1 2,3 4,5 6,7 8,9 10),(2 3,3 4,1 2))'", + "'LINESTRING(1 2,3 4,5 6,7 8,9 10)'", + "'LINESTRING(2 3,3 4,1 2)'", + "'NULL'", + "'POLYGON((0 0,5 0,5 5,0 5,0 0),(2 2, 2 1,1 1,1 2,2 2))'", + "'LINESTRING(0 0,5 0,5 5,0 5)'", + "'LINESTRING(2 2,2 1,1 1,1 2)'", + "'NULL'", + "'POLYGON((0 0,6 0,6 6,0 6,0 0),(3 3,3 2,2 2,2 3,3 3))'", + 
"'LINESTRING(0 0,6 0,6 6,0 6))'", + "'LINESTRING(3 3,3 2,2 2,2 3)'", + "'NULL'", + "'POLYGON((0 0,7 0,7 7,0 7,0 0),(4 4,2 4, 2 3,4 2,4 4))'", + "'LINESTRING(0 0,7 0,7 7,0 7)'", + "'LINESTRING(4 4,4 2,2 3,2 4)'", + "'NULL'", + "'MULTIPOLYGON(((1 2,3 4,5 6,7 8,9 10),(2 3,3 4,1 2)))'", + "'MULTIPOLYGON(((0 0,5 0,5 5,0 5,0 0),(2 2, 2 1,1 1,1 2,2 2)))'", + "'MULTIPOLYGON(((0 0,6 0,6 6,0 6,0 0),(3 3,3 2,2 2,2 3,3 3)))'", + "'MULTIPOLYGON(((0 0,7 0,7 7,0 7,0 0),(4 4,2 4, 2 3,4 2,4 4)))'", + "8"), + ExecutorDeviceType::CPU); run_multiple_agg(gen("'POLYGON((0 0,5 0,5 5,0 5,0 0))'", "'LINESTRING(0 0,5 0,5 5,0 5)'", @@ -430,6 +440,10 @@ class TableFunctions : public ::testing::Test { "'LINESTRING(0 0,4 0,4 4,0 4)'", "'NULL'", "'NULL'", + "'MULTIPOLYGON(((0 0,5 0,5 5,0 5,0 0)))'", + "'MULTIPOLYGON(((0 0,6 0,6 6,0 6,0 0)))'", + "'MULTIPOLYGON(((0 0,7 0,7 7,0 7,0 0)))'", + "'MULTIPOLYGON(((0 0,4 0,4 4,0 4,0 0)))'", "4"), ExecutorDeviceType::CPU); @@ -450,6 +464,13 @@ class TableFunctions : public ::testing::Test { "'LINESTRING(0 0,7 0,7 7,0 7)'", "'LINESTRING(4 4,4 2,2 3,2 4)'", "'LINESTRING(0 0,0 1,1 0)'", + "'MULTIPOLYGON(((1 2,3 4,5 6,7 8,9 10),(3 4,1 2,2 3),(5 6,7 8,9 10)))'", + "'MULTIPOLYGON(((0 0,5 0,5 5,0 5,0 0),(2 2,2 1,1 1,1 2,2 2),(0 0,0 1,1 " + "0)))'", + "'MULTIPOLYGON(((0 0,6 0,6 6,0 6,0 0),(3 3,3 2,2 2,2 3,3 3),(0 0,0 1,1 " + "0)))'", + "'MULTIPOLYGON(((0 0,7 0,7 7,0 7,0 0),(4 4,2 4, 2 3,4 2,4 4),(0 0,0 1,1 " + "0)))'", "11"), ExecutorDeviceType::CPU); run_multiple_agg(gen("'NULL'", @@ -468,6 +489,10 @@ class TableFunctions : public ::testing::Test { "'NULL'", "'NULL'", "'NULL'", + "'NULL'", + "'NULL'", + "'NULL'", + "'NULL'", "NULL"), ExecutorDeviceType::CPU); } @@ -2923,6 +2948,7 @@ void assert_equal(const TargetValue& val1, std::vector coords1, coords2; std::vector bounds1, bounds2; std::vector ring_sizes1, ring_sizes2; + std::vector poly_sizes1, poly_sizes2; switch (ti.get_type()) { case kLINESTRING: { const auto gdal_wkt_ls1 = Geospatial::GeoLineString(*s1); @@ -2940,9 +2966,20 @@ void assert_equal(const TargetValue& val1, gdal_wkt_poly2.getColumns(coords2, ring_sizes2, bounds2); break; } + case kMULTIPOLYGON: { + const auto gdal_wkt_mpoly1 = Geospatial::GeoMultiPolygon(*s1); + gdal_wkt_mpoly1.getColumns(coords1, ring_sizes1, poly_sizes1, bounds1); + const auto gdal_wkt_mpoly2 = Geospatial::GeoMultiPolygon(*s2); + gdal_wkt_mpoly2.getColumns(coords2, ring_sizes2, poly_sizes2, bounds2); + break; + } default: UNREACHABLE() << "ti=" << ti.to_string(); } + ASSERT_EQ(poly_sizes1.size(), poly_sizes2.size()); + for (size_t i = 0; i < poly_sizes1.size(); i++) { + ASSERT_EQ(poly_sizes1[i], poly_sizes2[i]); + } ASSERT_EQ(ring_sizes1.size(), ring_sizes2.size()); ASSERT_EQ(coords1.size(), coords2.size()); int64_t k = 0; @@ -2963,8 +3000,8 @@ void assert_equal(const TargetValue& val1, for (int32_t j = 0; j < sz; j++) { Point p1 = points1[j]; Point p2 = points2[j]; - ASSERT_NEAR(p1.x, p2.x, 1e-7); - ASSERT_NEAR(p1.y, p2.y, 1e-7); + ASSERT_NEAR(p1.x, p2.x, 1e-6); + ASSERT_NEAR(p1.y, p2.y, 1e-6); } } } @@ -3604,6 +3641,103 @@ TEST_F(TableFunctions, ColumnGeoPolygonOutput) { } } +TEST_F(TableFunctions, ColumnGeoPolygonInOutput) { + for (auto dt : {ExecutorDeviceType::CPU /*, ExecutorDeviceType::GPU*/}) { + SKIP_NO_GPU(); + for (int i = 1; i <= 4; i++) { + std::string pcol = "p" + std::to_string(i); + std::string rcol = "r" + std::to_string(i); + std::string hcol = "h" + std::to_string(i); + std::string hhcol = "hh" + std::to_string(i); + { + std::string q1 = "SELECT " + rcol + " FROM 
geo_polygon_test;"; + std::string q2 = + "SELECT linestrings FROM TABLE(CT_LINESTRINGN(CURSOR(" + "SELECT polygons FROM TABLE(CT_MAKE_POLYGON3(CURSOR(SELECT " + + rcol + ", " + hcol + ", " + hhcol + " FROM geo_polygon_test)))), 1));"; + const auto expected_rows = run_multiple_agg(q1, dt); + const auto rows = run_multiple_agg(q2, dt); + assert_equal(rows, expected_rows); + } + } + } +} + +TEST_F(TableFunctions, ColumnGeoMultiPolygonOutput) { + for (auto dt : {ExecutorDeviceType::CPU /*, ExecutorDeviceType::GPU*/}) { + SKIP_NO_GPU(); + for (int i = 1; i <= 4; i++) { + std::string pcol = "p" + std::to_string(i); + std::string mpcol = "mp" + std::to_string(i); + // Test Column output + { + std::string q1 = "SELECT " + mpcol + " FROM geo_polygon_test;"; + std::string q2 = + "SELECT mpolygons FROM TABLE(CT_MAKE_MULTIPOLYGON(CURSOR(SELECT " + pcol + + " FROM geo_polygon_test)));"; + const auto expected_rows = run_multiple_agg(q1, dt); + const auto rows = run_multiple_agg(q2, dt); + assert_equal(rows, expected_rows); + } + } + } +} + +TEST_F(TableFunctions, ColumnGeoMultiPolygonInput) { + for (auto dt : {ExecutorDeviceType::CPU /*, ExecutorDeviceType::GPU*/}) { + SKIP_NO_GPU(); + for (int i = 1; i <= 4; i++) { + std::string pcol = "p" + std::to_string(i); + std::string mpcol = "mp" + std::to_string(i); + // Test Column input + { + std::string q1 = "SELECT " + pcol + " FROM geo_polygon_test;"; + std::string q2 = "SELECT polygons FROM TABLE(CT_POLYGONN(CURSOR(SELECT " + mpcol + + " FROM geo_polygon_test), 1));"; + const auto expected_rows = run_multiple_agg(q1, dt); + const auto rows = run_multiple_agg(q2, dt); + assert_equal(rows, expected_rows); + } + } + } +} + +TEST_F(TableFunctions, ColumnGeoMultiPolygonInOutput) { + for (auto dt : {ExecutorDeviceType::CPU /*, ExecutorDeviceType::GPU*/}) { + SKIP_NO_GPU(); + for (int i = 1; i <= 4; i++) { + std::string pcol = "p" + std::to_string(i); + std::string mpcol = "mp" + std::to_string(i); + // Test Column input, Column + // output, Column input, Column + // output + { + std::string q1 = "SELECT " + pcol + " FROM geo_polygon_test;"; + std::string q2 = + "SELECT polygons FROM TABLE(CT_POLYGONN(CURSOR(SELECT" + " mpolygons FROM TABLE(CT_MAKE_MULTIPOLYGON(CURSOR(SELECT " + + pcol + " FROM geo_polygon_test)))), 1));"; + const auto expected_rows = run_multiple_agg(q1, dt); + const auto rows = run_multiple_agg(q2, dt); + assert_equal(rows, expected_rows); + } + // Test Column input, Column + // output, Column input, Column + // output: + { + std::string q1 = "SELECT " + mpcol + " FROM geo_polygon_test;"; + std::string q2 = + "SELECT mpolygons FROM TABLE(CT_MAKE_MULTIPOLYGON(" + "CURSOR(SELECT polygons FROM TABLE(CT_POLYGONN(CURSOR(SELECT " + + mpcol + " FROM geo_polygon_test), 1)))));"; + const auto expected_rows = run_multiple_agg(q1, dt); + const auto rows = run_multiple_agg(q2, dt); + assert_equal(rows, expected_rows); + } + } + } +} + TEST_F(TableFunctions, DefaultScalarValues) { for (auto dt : {ExecutorDeviceType::CPU, ExecutorDeviceType::GPU}) { SKIP_NO_GPU(); diff --git a/ThriftHandler/CommandLineOptions.cpp b/ThriftHandler/CommandLineOptions.cpp index 5d06d450f5..67acfac33f 100644 --- a/ThriftHandler/CommandLineOptions.cpp +++ b/ThriftHandler/CommandLineOptions.cpp @@ -18,12 +18,12 @@ #include #include -#include #include #include using namespace std::string_literals; +#include #include "CommandLineOptions.h" #include "ImportExport/ForeignDataImporter.h" #include "LeafHostInfo.h" @@ -93,27 +93,28 @@ void CommandLineOptions::init_logging() { 
log_options_.set_base_path(base_path); logger::init(log_options_); } - void CommandLineOptions::fillOptions() { - help_desc.add_options()("help,h", "Show available options."); - help_desc.add_options()( + po::options_description& desc = help_desc_; + + desc.add_options()("help,h", "Show available options."); + desc.add_options()( "allow-cpu-retry", po::value(&g_allow_cpu_retry) ->default_value(g_allow_cpu_retry) ->implicit_value(true), R"(Allow the queries which failed on GPU to retry on CPU, even when watchdog is enabled.)"); - help_desc.add_options()("allow-loop-joins", - po::value(&allow_loop_joins) - ->default_value(allow_loop_joins) - ->implicit_value(true), - "Enable loop joins."); - help_desc.add_options()("bigint-count", - po::value(&g_bigint_count) - ->default_value(g_bigint_count) - ->implicit_value(true), - "Use 64-bit count."); - - help_desc.add_options()( + desc.add_options()("allow-loop-joins", + po::value(&allow_loop_joins) + ->default_value(allow_loop_joins) + ->implicit_value(true), + "Enable loop joins."); + desc.add_options()("bigint-count", + po::value(&g_bigint_count) + ->default_value(g_bigint_count) + ->implicit_value(true), + "Use 64-bit count."); + + desc.add_options()( "enable-executor-resource-mgr", po::value(&g_enable_executor_resource_mgr) ->default_value(g_enable_executor_resource_mgr) @@ -124,7 +125,7 @@ void CommandLineOptions::fillOptions() { // Note we allow executor-cpu-result-mem-ratio to have values > 0 to allow // oversubscription of memory when warranted, but user should be careful with this as // too high a value can cause OOM errors. - help_desc.add_options()( + desc.add_options()( "executor-cpu-result-mem-ratio", po::value(&g_executor_resource_mgr_cpu_result_mem_ratio) ->default_value(g_executor_resource_mgr_cpu_result_mem_ratio), @@ -134,7 +135,7 @@ void CommandLineOptions::fillOptions() { "warranted, but too high a value can cause out-of-memory errors. Requires " "--executor-resource-mgr to be set"); - help_desc.add_options()( + desc.add_options()( "executor-cpu-result-mem-bytes", po::value(&g_executor_resource_mgr_cpu_result_mem_bytes) ->default_value(g_executor_resource_mgr_cpu_result_mem_bytes), @@ -147,7 +148,7 @@ void CommandLineOptions::fillOptions() { // oversubscription of threads when warranted, given we may be overly pessimistic about // kernel core occupation for some classes of queries. Care should be taken however with // setting this value too high as thrashing and thread starvation can result. - help_desc.add_options()( + desc.add_options()( "executor-per-query-max-cpu-threads-ratio", po::value(&g_executor_resource_mgr_per_query_max_cpu_slots_ratio) ->default_value(g_executor_resource_mgr_per_query_max_cpu_slots_ratio), @@ -157,7 +158,7 @@ void CommandLineOptions::fillOptions() { // Note we allow executor-per-query-max-cpu-result-mem-ratio to have values > 0 to allow // oversubscription of memory when warranted, but user should be careful with this as // too high a value can cause OOM errors. - help_desc.add_options()( + desc.add_options()( "executor-per-query-max-cpu-result-mem-ratio", po::value(&g_executor_resource_mgr_per_query_max_cpu_result_mem_ratio) ->default_value(g_executor_resource_mgr_per_query_max_cpu_result_mem_ratio), @@ -165,7 +166,7 @@ void CommandLineOptions::fillOptions() { "that can be " "allocated for a single query. 
Requires --enable-executor-resource-mgr to be set."); - help_desc.add_options()( + desc.add_options()( "allow-cpu-kernel-concurrency", po::value(&g_executor_resource_mgr_allow_cpu_kernel_concurrency) ->default_value(g_executor_resource_mgr_allow_cpu_kernel_concurrency) @@ -173,7 +174,7 @@ void CommandLineOptions::fillOptions() { "Allow for multiple queries to run execution kernels concurrently on CPU. Requires " "--enable-executor-resource-mgr to be set."); - help_desc.add_options()( + desc.add_options()( "allow-cpu-gpu-kernel-concurrency", po::value(&g_executor_resource_mgr_allow_cpu_gpu_kernel_concurrency) ->default_value(g_executor_resource_mgr_allow_cpu_gpu_kernel_concurrency) @@ -185,7 +186,7 @@ void CommandLineOptions::fillOptions() { // CPU slots/threads Single query CPU slot oversubscription should be controlled with // --executor-per-query-max-cpu-threads-ratio (i.e. by setting it to > 1.0) - help_desc.add_options()( + desc.add_options()( "allow-cpu-thread-oversubscription-concurrency", po::value( &g_executor_resource_mgr_allow_cpu_slot_oversubscription_concurrency) @@ -202,7 +203,7 @@ void CommandLineOptions::fillOptions() { // controlled with // --executor-per-query-cpu-result-mem-ratio (i.e. by setting it to > 1.0) - help_desc.add_options()( + desc.add_options()( "allow-cpu-result-mem-oversubscription-concurrency", po::value( &g_executor_resource_mgr_allow_cpu_result_mem_oversubscription_concurrency) @@ -214,342 +215,331 @@ void CommandLineOptions::fillOptions() { "can lead to out-of-memory errors. Requires --enable-executor-resource-mgr to be " "set."); - help_desc.add_options()( + desc.add_options()( "executor-max-available-resource-use-ratio", po::value(&g_executor_resource_mgr_max_available_resource_use_ratio) ->default_value(g_executor_resource_mgr_max_available_resource_use_ratio), "Set max proportion (0 < ratio <= 1.0) of available resources that should be " "granted to a query. 
Requires --executor-resource-mgr to be set"); - help_desc.add_options()("calcite-max-mem", - po::value(&system_parameters.calcite_max_mem) - ->default_value(system_parameters.calcite_max_mem), - "Max memory available to calcite JVM."); + desc.add_options()("calcite-max-mem", + po::value(&system_parameters.calcite_max_mem) + ->default_value(system_parameters.calcite_max_mem), + "Max memory available to calcite JVM."); if (!dist_v5_) { - help_desc.add_options()("calcite-port", - po::value(&system_parameters.calcite_port) - ->default_value(system_parameters.calcite_port), - "Calcite port number."); + desc.add_options()("calcite-port", + po::value(&system_parameters.calcite_port) + ->default_value(system_parameters.calcite_port), + "Calcite port number."); } - help_desc.add_options()("config", - po::value(&system_parameters.config_file), - "Path to server configuration file."); - help_desc.add_options()("cpu-buffer-mem-bytes", - po::value(&system_parameters.cpu_buffer_mem_bytes) - ->default_value(system_parameters.cpu_buffer_mem_bytes), - "Size of memory reserved for CPU buffers, in bytes."); - - help_desc.add_options()("cpu-only", - po::value(&system_parameters.cpu_only) - ->default_value(system_parameters.cpu_only) - ->implicit_value(true), - "Run on CPU only, even if GPUs are available."); - help_desc.add_options()("cuda-block-size", - po::value(&system_parameters.cuda_block_size) - ->default_value(system_parameters.cuda_block_size), - "Size of block to use on NVIDIA GPU."); - help_desc.add_options()("cuda-grid-size", - po::value(&system_parameters.cuda_grid_size) - ->default_value(system_parameters.cuda_grid_size), - "Size of grid to use on NVIDIA GPU."); - help_desc.add_options()("optimize-cuda-block-and-grid-sizes", - po::value(&optimize_cuda_block_and_grid_sizes) - ->default_value(false) - ->implicit_value(true)); + desc.add_options()("config", + po::value(&system_parameters.config_file), + "Path to server configuration file."); + desc.add_options()("cpu-buffer-mem-bytes", + po::value(&system_parameters.cpu_buffer_mem_bytes) + ->default_value(system_parameters.cpu_buffer_mem_bytes), + "Size of memory reserved for CPU buffers, in bytes."); + + desc.add_options()("cpu-only", + po::value(&system_parameters.cpu_only) + ->default_value(system_parameters.cpu_only) + ->implicit_value(true), + "Run on CPU only, even if GPUs are available."); + desc.add_options()("cuda-block-size", + po::value(&system_parameters.cuda_block_size) + ->default_value(system_parameters.cuda_block_size), + "Size of block to use on NVIDIA GPU."); + desc.add_options()("cuda-grid-size", + po::value(&system_parameters.cuda_grid_size) + ->default_value(system_parameters.cuda_grid_size), + "Size of grid to use on NVIDIA GPU."); + desc.add_options()("optimize-cuda-block-and-grid-sizes", + po::value(&optimize_cuda_block_and_grid_sizes) + ->default_value(false) + ->implicit_value(true)); if (!dist_v5_) { - help_desc.add_options()( + desc.add_options()( "data", po::value(&base_path)->required()->default_value("storage"), "Directory path to HeavyDB data storage (catalogs, raw data, log files, etc)."); positional_options.add("data", 1); } - help_desc.add_options()("db-query-list", - po::value(&db_query_file), - "Path to file containing HeavyDB warmup queries."); - help_desc.add_options()( + desc.add_options()("db-query-list", + po::value(&db_query_file), + "Path to file containing HeavyDB warmup queries."); + desc.add_options()( "exit-after-warmup", po::value(&exit_after_warmup)->default_value(false)->implicit_value(true), "Exit 
after HeavyDB warmup queries."); - help_desc.add_options()("dynamic-watchdog-time-limit", - po::value(&dynamic_watchdog_time_limit) - ->default_value(dynamic_watchdog_time_limit) - ->implicit_value(10000), - "Dynamic watchdog time limit, in milliseconds."); - help_desc.add_options()("enable-data-recycler", - po::value(&enable_data_recycler) - ->default_value(enable_data_recycler) - ->implicit_value(true), - "Use data recycler."); - help_desc.add_options()("use-hashtable-cache", - po::value(&use_hashtable_cache) - ->default_value(use_hashtable_cache) - ->implicit_value(true), - "Use hashtable cache."); - help_desc.add_options()("use-query-resultset-cache", - po::value(&g_use_query_resultset_cache) - ->default_value(g_use_query_resultset_cache) - ->implicit_value(true), - "Use query resultset cache."); - help_desc.add_options()("use-chunk-metadata-cache", - po::value(&g_use_chunk_metadata_cache) - ->default_value(g_use_chunk_metadata_cache) - ->implicit_value(true), - "Use chunk metadata cache."); - help_desc.add_options()( + desc.add_options()("dynamic-watchdog-time-limit", + po::value(&dynamic_watchdog_time_limit) + ->default_value(dynamic_watchdog_time_limit) + ->implicit_value(10000), + "Dynamic watchdog time limit, in milliseconds."); + desc.add_options()("enable-data-recycler", + po::value(&enable_data_recycler) + ->default_value(enable_data_recycler) + ->implicit_value(true), + "Use data recycler."); + desc.add_options()("use-hashtable-cache", + po::value(&use_hashtable_cache) + ->default_value(use_hashtable_cache) + ->implicit_value(true), + "Use hashtable cache."); + desc.add_options()("use-query-resultset-cache", + po::value(&g_use_query_resultset_cache) + ->default_value(g_use_query_resultset_cache) + ->implicit_value(true), + "Use query resultset cache."); + desc.add_options()("use-chunk-metadata-cache", + po::value(&g_use_chunk_metadata_cache) + ->default_value(g_use_chunk_metadata_cache) + ->implicit_value(true), + "Use chunk metadata cache."); + desc.add_options()( "hashtable-cache-total-bytes", po::value(&hashtable_cache_total_bytes) ->default_value(hashtable_cache_total_bytes) ->implicit_value(4294967296), "Size of total memory space for hashtable cache, in bytes (default: 4GB)."); - help_desc.add_options()("max-cacheable-hashtable-size-bytes", - po::value(&max_cacheable_hashtable_size_bytes) - ->default_value(max_cacheable_hashtable_size_bytes) - ->implicit_value(2147483648), - "The maximum size of hashtable that is available to cache, in " - "bytes (default: 2GB)."); - help_desc.add_options()( + desc.add_options()("max-cacheable-hashtable-size-bytes", + po::value(&max_cacheable_hashtable_size_bytes) + ->default_value(max_cacheable_hashtable_size_bytes) + ->implicit_value(2147483648), + "The maximum size of hashtable that is available to cache, in " + "bytes (default: 2GB)."); + desc.add_options()( "query-resultset-cache-total-bytes", po::value(&g_query_resultset_cache_total_bytes) ->default_value(g_query_resultset_cache_total_bytes), "Size of total memory space for query resultset cache, in bytes (default: 4GB)."); - help_desc.add_options()( - "max-query-resultset-size-bytes", - po::value(&g_max_cacheable_query_resultset_size_bytes) - ->default_value(g_max_cacheable_query_resultset_size_bytes), - "The maximum size of query resultset that is available to cache, in " - "bytes (default: 2GB)."); - help_desc.add_options()("allow-auto-query-resultset-caching", - po::value(&g_allow_auto_resultset_caching) - ->default_value(g_allow_auto_resultset_caching) - 
->implicit_value(true), - "Allow automatic query resultset caching when the size of " - "query resultset is smaller or equal to the threshold defined " - "by `auto-resultset-caching-threshold-bytes`, in bytes (to " - "enable this, query resultset recycler " - "should be enabled, default: 1048576 bytes (or 1MB))."); - help_desc.add_options()( + desc.add_options()("max-query-resultset-size-bytes", + po::value(&g_max_cacheable_query_resultset_size_bytes) + ->default_value(g_max_cacheable_query_resultset_size_bytes), + "The maximum size of query resultset that is available to cache, in " + "bytes (default: 2GB)."); + desc.add_options()("allow-auto-query-resultset-caching", + po::value(&g_allow_auto_resultset_caching) + ->default_value(g_allow_auto_resultset_caching) + ->implicit_value(true), + "Allow automatic query resultset caching when the size of " + "query resultset is smaller or equal to the threshold defined " + "by `auto-resultset-caching-threshold-bytes`, in bytes (to " + "enable this, query resultset recycler " + "should be enabled, default: 1048576 bytes (or 1MB))."); + desc.add_options()( "auto-resultset-caching-threshold-bytes", po::value(&g_auto_resultset_caching_threshold) ->default_value(g_auto_resultset_caching_threshold), "A threshold that allows caching query resultset automatically if the size of " "resultset is less than it, in bytes (default: 1MB)."); - help_desc.add_options()("allow-query-step-skipping", - po::value(&g_allow_query_step_skipping) - ->default_value(g_allow_query_step_skipping) - ->implicit_value(true), - "Allow query step skipping when multi-step query has at least " - "one cached query resultset."); - help_desc.add_options()("enable-debug-timer", - po::value(&g_enable_debug_timer) - ->default_value(g_enable_debug_timer) - ->implicit_value(true), - "Enable debug timer logging."); - help_desc.add_options()("enable-dynamic-watchdog", - po::value(&enable_dynamic_watchdog) - ->default_value(enable_dynamic_watchdog) - ->implicit_value(true), - "Enable dynamic watchdog."); - help_desc.add_options()("enable-filter-push-down", - po::value(&g_enable_filter_push_down) - ->default_value(g_enable_filter_push_down) - ->implicit_value(true), - "Enable filter push down through joins."); - help_desc.add_options()("enable-overlaps-hashjoin", - po::value(&g_enable_overlaps_hashjoin) - ->default_value(g_enable_overlaps_hashjoin) - ->implicit_value(true), - "Enable the overlaps hash join framework allowing for range " - "join (e.g. spatial overlaps) computation using a hash table."); - help_desc.add_options()("enable-hashjoin-many-to-many", - po::value(&g_enable_hashjoin_many_to_many) - ->default_value(g_enable_hashjoin_many_to_many) - ->implicit_value(true), - "Enable the overlaps hash join framework allowing for range " - "join (e.g. spatial overlaps) computation using a hash table."); - help_desc.add_options()("enable-distance-rangejoin", - po::value(&g_enable_distance_rangejoin) - ->default_value(g_enable_distance_rangejoin) - ->implicit_value(true), - "Enable accelerating point distance joins with a hash table. 
" - "This rewrites ST_Distance when using an upperbound (<= X)."); - help_desc.add_options()("enable-runtime-query-interrupt", - po::value(&enable_runtime_query_interrupt) - ->default_value(enable_runtime_query_interrupt) - ->implicit_value(true), - "Enable runtime query interrupt."); - help_desc.add_options()("enable-non-kernel-time-query-interrupt", - po::value(&enable_non_kernel_time_query_interrupt) - ->default_value(enable_non_kernel_time_query_interrupt) - ->implicit_value(true), - "Enable non-kernel time query interrupt."); - help_desc.add_options()("pending-query-interrupt-freq", - po::value(&pending_query_interrupt_freq) - ->default_value(pending_query_interrupt_freq) - ->implicit_value(1000), - "A frequency of checking the request of pending query " - "interrupt from user (in millisecond)."); - help_desc.add_options()( - "running-query-interrupt-freq", - po::value(&running_query_interrupt_freq) - ->default_value(running_query_interrupt_freq) - ->implicit_value(0.5), - "A frequency of checking the request of running query " - "interrupt from user (0.0 (less frequent) ~ (more frequent) 1.0)."); - help_desc.add_options()("use-estimator-result-cache", - po::value(&use_estimator_result_cache) - ->default_value(use_estimator_result_cache) - ->implicit_value(true), - "Use estimator result cache."); + + desc.add_options()("allow-query-step-skipping", + po::value(&g_allow_query_step_skipping) + ->default_value(g_allow_query_step_skipping) + ->implicit_value(true), + "Allow query step skipping when multi-step query has at least " + "one cached query resultset."); + desc.add_options()("enable-debug-timer", + po::value(&g_enable_debug_timer) + ->default_value(g_enable_debug_timer) + ->implicit_value(true), + "Enable debug timer logging."); + desc.add_options()("enable-dynamic-watchdog", + po::value(&enable_dynamic_watchdog) + ->default_value(enable_dynamic_watchdog) + ->implicit_value(true), + "Enable dynamic watchdog."); + desc.add_options()("enable-filter-push-down", + po::value(&g_enable_filter_push_down) + ->default_value(g_enable_filter_push_down) + ->implicit_value(true), + "Enable filter push down through joins."); + desc.add_options()("enable-overlaps-hashjoin", + po::value(&g_enable_overlaps_hashjoin) + ->default_value(g_enable_overlaps_hashjoin) + ->implicit_value(true), + "Enable the overlaps hash join framework allowing for range " + "join (e.g. spatial overlaps) computation using a hash table."); + desc.add_options()("enable-hashjoin-many-to-many", + po::value(&g_enable_hashjoin_many_to_many) + ->default_value(g_enable_hashjoin_many_to_many) + ->implicit_value(true), + "Enable the overlaps hash join framework allowing for range " + "join (e.g. spatial overlaps) computation using a hash table."); + desc.add_options()("enable-distance-rangejoin", + po::value(&g_enable_distance_rangejoin) + ->default_value(g_enable_distance_rangejoin) + ->implicit_value(true), + "Enable accelerating point distance joins with a hash table. 
" + "This rewrites ST_Distance when using an upperbound (<= X)."); + desc.add_options()("enable-runtime-query-interrupt", + po::value(&enable_runtime_query_interrupt) + ->default_value(enable_runtime_query_interrupt) + ->implicit_value(true), + "Enable runtime query interrupt."); + desc.add_options()("enable-non-kernel-time-query-interrupt", + po::value(&enable_non_kernel_time_query_interrupt) + ->default_value(enable_non_kernel_time_query_interrupt) + ->implicit_value(true), + "Enable non-kernel time query interrupt."); + desc.add_options()("pending-query-interrupt-freq", + po::value(&pending_query_interrupt_freq) + ->default_value(pending_query_interrupt_freq) + ->implicit_value(1000), + "A frequency of checking the request of pending query " + "interrupt from user (in millisecond)."); + desc.add_options()("running-query-interrupt-freq", + po::value(&running_query_interrupt_freq) + ->default_value(running_query_interrupt_freq) + ->implicit_value(0.5), + "A frequency of checking the request of running query " + "interrupt from user (0.0 (less frequent) ~ (more frequent) 1.0)."); + if (!dist_v5_) { - help_desc.add_options()( + desc.add_options()( "enable-string-dict-hash-cache", po::value(&g_cache_string_hash) ->default_value(g_cache_string_hash) ->implicit_value(true), "Cache string hash values in the string dictionary server during import."); } - help_desc.add_options()( - "enable-thrift-logs", - po::value(&g_enable_thrift_logs) - ->default_value(g_enable_thrift_logs) - ->implicit_value(true), - "Enable writing messages directly from thrift to stdout/stderr."); - help_desc.add_options()("enable-watchdog", - po::value(&enable_watchdog) - ->default_value(enable_watchdog) - ->implicit_value(true), - "Enable watchdog."); - help_desc.add_options()( - "watchdog-none-encoded-string-translation-limit", - po::value(&watchdog_none_encoded_string_translation_limit) - ->default_value(watchdog_none_encoded_string_translation_limit), - "Max number of none-encoded strings allowed to be translated " - "to dictionary-encoded with watchdog enabled"); - help_desc.add_options()( - "filter-push-down-low-frac", - po::value(&g_filter_push_down_low_frac) - ->default_value(g_filter_push_down_low_frac) - ->implicit_value(g_filter_push_down_low_frac), - "Lower threshold for selectivity of filters that are pushed down."); - help_desc.add_options()( - "filter-push-down-high-frac", - po::value(&g_filter_push_down_high_frac) - ->default_value(g_filter_push_down_high_frac) - ->implicit_value(g_filter_push_down_high_frac), - "Higher threshold for selectivity of filters that are pushed down."); - help_desc.add_options()("filter-push-down-passing-row-ubound", - po::value(&g_filter_push_down_passing_row_ubound) - ->default_value(g_filter_push_down_passing_row_ubound) - ->implicit_value(g_filter_push_down_passing_row_ubound), - "Upperbound on the number of rows that should pass the filter " - "if the selectivity is less than " - "the high fraction threshold."); - help_desc.add_options()("from-table-reordering", - po::value(&g_from_table_reordering) - ->default_value(g_from_table_reordering) - ->implicit_value(true), - "Enable automatic table reordering in FROM clause."); - help_desc.add_options()("gpu-buffer-mem-bytes", - po::value(&system_parameters.gpu_buffer_mem_bytes) - ->default_value(system_parameters.gpu_buffer_mem_bytes), - "Size of memory reserved for GPU buffers, in bytes, per GPU."); - help_desc.add_options()("gpu-input-mem-limit", - po::value(&system_parameters.gpu_input_mem_limit) - 
->default_value(system_parameters.gpu_input_mem_limit), - "Force query to CPU when input data memory usage exceeds this " - "percentage of available GPU memory."); - help_desc.add_options()( + desc.add_options()("enable-thrift-logs", + po::value(&g_enable_thrift_logs) + ->default_value(g_enable_thrift_logs) + ->implicit_value(true), + "Enable writing messages directly from thrift to stdout/stderr."); + desc.add_options()("enable-watchdog", + po::value(&enable_watchdog) + ->default_value(enable_watchdog) + ->implicit_value(true), + "Enable watchdog."); + desc.add_options()("watchdog-none-encoded-string-translation-limit", + po::value(&watchdog_none_encoded_string_translation_limit) + ->default_value(watchdog_none_encoded_string_translation_limit), + "Max number of none-encoded strings allowed to be translated " + "to dictionary-encoded with watchdog enabled"); + desc.add_options()("filter-push-down-low-frac", + po::value(&g_filter_push_down_low_frac) + ->default_value(g_filter_push_down_low_frac) + ->implicit_value(g_filter_push_down_low_frac), + "Lower threshold for selectivity of filters that are pushed down."); + desc.add_options()("filter-push-down-high-frac", + po::value(&g_filter_push_down_high_frac) + ->default_value(g_filter_push_down_high_frac) + ->implicit_value(g_filter_push_down_high_frac), + "Higher threshold for selectivity of filters that are pushed down."); + desc.add_options()("filter-push-down-passing-row-ubound", + po::value(&g_filter_push_down_passing_row_ubound) + ->default_value(g_filter_push_down_passing_row_ubound) + ->implicit_value(g_filter_push_down_passing_row_ubound), + "Upperbound on the number of rows that should pass the filter " + "if the selectivity is less than " + "the high fraction threshold."); + desc.add_options()("from-table-reordering", + po::value(&g_from_table_reordering) + ->default_value(g_from_table_reordering) + ->implicit_value(true), + "Enable automatic table reordering in FROM clause."); + desc.add_options()("gpu-buffer-mem-bytes", + po::value(&system_parameters.gpu_buffer_mem_bytes) + ->default_value(system_parameters.gpu_buffer_mem_bytes), + "Size of memory reserved for GPU buffers, in bytes, per GPU."); + desc.add_options()("gpu-input-mem-limit", + po::value(&system_parameters.gpu_input_mem_limit) + ->default_value(system_parameters.gpu_input_mem_limit), + "Force query to CPU when input data memory usage exceeds this " + "percentage of available GPU memory."); + desc.add_options()( "hll-precision-bits", po::value(&g_hll_precision_bits) ->default_value(g_hll_precision_bits) ->implicit_value(g_hll_precision_bits), "Number of bits used from the hash value used to specify the bucket number."); if (!dist_v5_) { - help_desc.add_options()("http-port", - po::value(&http_port)->default_value(http_port), - "HTTP port number."); - help_desc.add_options()( - "http-binary-port", - po::value(&http_binary_port)->default_value(http_binary_port), - "HTTP binary port number."); + desc.add_options()("http-port", + po::value(&http_port)->default_value(http_port), + "HTTP port number."); + desc.add_options()("http-binary-port", + po::value(&http_binary_port)->default_value(http_binary_port), + "HTTP binary port number."); } - help_desc.add_options()( + desc.add_options()( "idle-session-duration", po::value(&idle_session_duration)->default_value(idle_session_duration), "Maximum duration of idle session."); - help_desc.add_options()("inner-join-fragment-skipping", - po::value(&g_inner_join_fragment_skipping) - ->default_value(g_inner_join_fragment_skipping) - 
->implicit_value(true), - "Enable/disable inner join fragment skipping. This feature is " - "considered stable and is enabled by default. This " - "parameter will be removed in a future release."); - help_desc.add_options()( + desc.add_options()("inner-join-fragment-skipping", + po::value(&g_inner_join_fragment_skipping) + ->default_value(g_inner_join_fragment_skipping) + ->implicit_value(true), + "Enable/disable inner join fragment skipping. This feature is " + "considered stable and is enabled by default. This " + "parameter will be removed in a future release."); + desc.add_options()( "max-session-duration", po::value(&max_session_duration)->default_value(max_session_duration), "Maximum duration of active session."); - help_desc.add_options()("num-sessions", - po::value(&system_parameters.num_sessions) - ->default_value(system_parameters.num_sessions), - "Maximum number of active session."); - help_desc.add_options()( - "null-div-by-zero", - po::value(&g_null_div_by_zero) - ->default_value(g_null_div_by_zero) - ->implicit_value(true), - "Return null on division by zero instead of throwing an exception."); - help_desc.add_options()( + desc.add_options()("num-sessions", + po::value(&system_parameters.num_sessions) + ->default_value(system_parameters.num_sessions), + "Maximum number of active session."); + desc.add_options()("null-div-by-zero", + po::value(&g_null_div_by_zero) + ->default_value(g_null_div_by_zero) + ->implicit_value(true), + "Return null on division by zero instead of throwing an exception."); + desc.add_options()( "num-reader-threads", po::value(&num_reader_threads)->default_value(num_reader_threads), "Number of reader threads to use."); - help_desc.add_options()( + desc.add_options()( "max-import-threads", po::value(&g_max_import_threads)->default_value(g_max_import_threads), "Max number of default import threads to use (num hardware threads will be used " "instead if lower). 
Can be overriden with copy statement threads option)."); - help_desc.add_options()( + desc.add_options()( "overlaps-max-table-size-bytes", po::value(&g_overlaps_max_table_size_bytes) ->default_value(g_overlaps_max_table_size_bytes), "The maximum size in bytes of the hash table for an overlaps hash join."); - help_desc.add_options()("overlaps-target-entries-per-bin", - po::value(&g_overlaps_target_entries_per_bin) - ->default_value(g_overlaps_target_entries_per_bin), - "The target number of hash entries per bin for overlaps join"); + desc.add_options()("overlaps-target-entries-per-bin", + po::value(&g_overlaps_target_entries_per_bin) + ->default_value(g_overlaps_target_entries_per_bin), + "The target number of hash entries per bin for overlaps join"); if (!dist_v5_) { - help_desc.add_options()("port,p", - po::value(&system_parameters.omnisci_server_port) - ->default_value(system_parameters.omnisci_server_port), - "TCP Port number."); + desc.add_options()("port,p", + po::value(&system_parameters.omnisci_server_port) + ->default_value(system_parameters.omnisci_server_port), + "TCP Port number."); } - help_desc.add_options()("num-gpus", - po::value(&system_parameters.num_gpus) - ->default_value(system_parameters.num_gpus), - "Number of gpus to use."); - help_desc.add_options()( + desc.add_options()("num-gpus", + po::value(&system_parameters.num_gpus) + ->default_value(system_parameters.num_gpus), + "Number of gpus to use."); + desc.add_options()( "read-only", po::value(&read_only)->default_value(read_only)->implicit_value(true), "Enable read-only mode."); - help_desc.add_options()( + desc.add_options()( "res-gpu-mem", po::value(&reserved_gpu_mem)->default_value(reserved_gpu_mem), "Reduces GPU memory available to the HeavyDB allocator by this amount. Used for " "compiled code cache and ancillary GPU functions and other processes that may also " "be using the GPU concurrent with HeavyDB."); - help_desc.add_options()("start-gpu", - po::value(&system_parameters.start_gpu) - ->default_value(system_parameters.start_gpu), - "First gpu to use."); - help_desc.add_options()("trivial-loop-join-threshold", - po::value(&g_trivial_loop_join_threshold) - ->default_value(g_trivial_loop_join_threshold) - ->implicit_value(1000), - "The maximum number of rows in the inner table of a loop join " - "considered to be trivially small."); - help_desc.add_options()( + desc.add_options()("start-gpu", + po::value(&system_parameters.start_gpu) + ->default_value(system_parameters.start_gpu), + "First gpu to use."); + desc.add_options()("trivial-loop-join-threshold", + po::value(&g_trivial_loop_join_threshold) + ->default_value(g_trivial_loop_join_threshold) + ->implicit_value(1000), + "The maximum number of rows in the inner table of a loop join " + "considered to be trivially small."); + desc.add_options()( "uniform-request-ids-per-thrift-call", po::value(&g_uniform_request_ids_per_thrift_call) ->default_value(g_uniform_request_ids_per_thrift_call) @@ -557,19 +547,19 @@ void CommandLineOptions::fillOptions() { "If true (default) then assign the same request_id to thrift calls that were " "initiated by the same external thrift call. 
If false then assign different " "request_ids and log the parent/child relationships."); - help_desc.add_options()("verbose", - po::value(&verbose_logging) - ->default_value(verbose_logging) - ->implicit_value(true), - "Write additional debug log messages to server logs."); - help_desc.add_options()( + desc.add_options()("verbose", + po::value(&verbose_logging) + ->default_value(verbose_logging) + ->implicit_value(true), + "Write additional debug log messages to server logs."); + desc.add_options()( "enable-runtime-udf", po::value(&enable_runtime_udf) ->default_value(enable_runtime_udf) ->implicit_value(true), "DEPRECATED. Please use `enable-runtime-udfs` instead as this flag will be removed " "in the near future."); - help_desc.add_options()( + desc.add_options()( "enable-runtime-udfs", po::value(&enable_runtime_udfs) ->default_value(enable_runtime_udfs) @@ -577,70 +567,70 @@ void CommandLineOptions::fillOptions() { "Enable runtime UDF registration by passing signatures and corresponding LLVM IR " "to the `register_runtime_udf` endpoint. For use with the Python Remote Backend " "Compiler server, packaged separately."); - help_desc.add_options()("enable-udf-registration-for-all-users", - po::value(&enable_udf_registration_for_all_users) - ->default_value(enable_udf_registration_for_all_users) - ->implicit_value(true), - "Allow all users, not just superusers, to register runtime " - "UDFs/UDTFs. Option only valid if " - "`--enable-runtime-udfs` is set to true."); - help_desc.add_options()("version,v", "Print Version Number."); - help_desc.add_options()("enable-string-functions", - po::value(&g_enable_string_functions) - ->default_value(g_enable_string_functions) - ->implicit_value(true), - "Enable experimental string functions."); - help_desc.add_options()("enable-experimental-string-functions", - po::value(&g_enable_string_functions) - ->default_value(g_enable_string_functions) - ->implicit_value(true), - "DEPRECATED. String functions are now enabled by default, " - "but can still be controlled with --enable-string-functions."); - help_desc.add_options()( + desc.add_options()("enable-udf-registration-for-all-users", + po::value(&enable_udf_registration_for_all_users) + ->default_value(enable_udf_registration_for_all_users) + ->implicit_value(true), + "Allow all users, not just superusers, to register runtime " + "UDFs/UDTFs. Option only valid if " + "`--enable-runtime-udfs` is set to true."); + desc.add_options()("version,v", "Print Version Number."); + desc.add_options()("enable-string-functions", + po::value(&g_enable_string_functions) + ->default_value(g_enable_string_functions) + ->implicit_value(true), + "Enable experimental string functions."); + desc.add_options()("enable-experimental-string-functions", + po::value(&g_enable_string_functions) + ->default_value(g_enable_string_functions) + ->implicit_value(true), + "DEPRECATED. 
String functions are now enabled by default, " + "but can still be controlled with --enable-string-functions."); + desc.add_options()( "enable-fsi", po::value(&g_enable_fsi)->default_value(g_enable_fsi)->implicit_value(true), "Enable foreign storage interface."); - help_desc.add_options()("enable-legacy-delimited-import", - po::value(&g_enable_legacy_delimited_import) - ->default_value(g_enable_legacy_delimited_import) - ->implicit_value(true), - "Use legacy importer for delimited sources."); + desc.add_options()("enable-legacy-delimited-import", + po::value(&g_enable_legacy_delimited_import) + ->default_value(g_enable_legacy_delimited_import) + ->implicit_value(true), + "Use legacy importer for delimited sources."); #ifdef ENABLE_IMPORT_PARQUET - help_desc.add_options()("enable-legacy-parquet-import", - po::value(&g_enable_legacy_parquet_import) - ->default_value(g_enable_legacy_parquet_import) - ->implicit_value(true), - "Use legacy importer for parquet sources."); + desc.add_options()("enable-legacy-parquet-import", + po::value(&g_enable_legacy_parquet_import) + ->default_value(g_enable_legacy_parquet_import) + ->implicit_value(true), + "Use legacy importer for parquet sources."); #endif - help_desc.add_options()("enable-fsi-regex-import", - po::value(&g_enable_fsi_regex_import) - ->default_value(g_enable_fsi_regex_import) - ->implicit_value(true), - "Use FSI importer for regex parsed sources."); - - help_desc.add_options()("enable-add-metadata-columns", - po::value(&g_enable_add_metadata_columns) - ->default_value(g_enable_add_metadata_columns) - ->implicit_value(true), - "Enable add_metadata_columns COPY FROM WITH option (Beta)."); - - help_desc.add_options()("disk-cache-path", - po::value(&disk_cache_config.path), - "Specify the path for the disk cache."); - - help_desc.add_options()( + desc.add_options()("enable-fsi-regex-import", + po::value(&g_enable_fsi_regex_import) + ->default_value(g_enable_fsi_regex_import) + ->implicit_value(true), + "Use FSI importer for regex parsed sources."); + + desc.add_options()("enable-add-metadata-columns", + po::value(&g_enable_add_metadata_columns) + ->default_value(g_enable_add_metadata_columns) + ->implicit_value(true), + "Enable add_metadata_columns COPY FROM WITH option (Beta)."); + + desc.add_options()("disk-cache-path", + po::value(&disk_cache_config.path), + "Specify the path for the disk cache."); + + desc.add_options()( "disk-cache-level", po::value(&(disk_cache_level))->default_value("foreign_tables"), "Specify level of disk cache. 
Valid options are 'foreign_tables', " "'local_tables', 'none', and 'all'."); - help_desc.add_options()("disk-cache-size", - po::value(&(disk_cache_config.size_limit)), - "Specify a maximum size for the disk cache in bytes."); + desc.add_options()("disk-cache-size", + po::value(&(disk_cache_config.size_limit)), + "Specify a maximum size for the disk cache in bytes."); #ifdef HAVE_AWS_S3 - help_desc.add_options()( + desc.add_options()( "allow-s3-server-privileges", po::value(&g_allow_s3_server_privileges) ->default_value(g_allow_s3_server_privileges) @@ -651,141 +641,137 @@ void CommandLineOptions::fillOptions() { "an AWS credentials file, or when running on an EC2 instance, with an IAM role " "that is attached to the instance."); #endif // defined(HAVE_AWS_S3) - help_desc.add_options()( + desc.add_options()( "enable-interoperability", po::value(&g_enable_interop) ->default_value(g_enable_interop) ->implicit_value(true), "Enable offloading of query portions to an external execution engine."); - help_desc.add_options()("enable-union", - po::value(&g_enable_union) - ->default_value(g_enable_union) - ->implicit_value(true), - "DEPRECATED. UNION ALL is enabled by default. Please remove " - "use of this option, as it may be disabled in the future."); - help_desc.add_options()( + desc.add_options()("enable-union", + po::value(&g_enable_union) + ->default_value(g_enable_union) + ->implicit_value(true), + "DEPRECATED. UNION ALL is enabled by default. Please remove " + "use of this option, as it may be disabled in the future."); + desc.add_options()( "calcite-service-timeout", po::value(&system_parameters.calcite_timeout) ->default_value(system_parameters.calcite_timeout), "Calcite server timeout (milliseconds). Increase this on systems with frequent " "schema changes or when running large numbers of parallel queries."); - help_desc.add_options()("calcite-service-keepalive", - po::value(&system_parameters.calcite_keepalive) - ->default_value(system_parameters.calcite_keepalive) - ->implicit_value(true), - "Enable keepalive on Calcite connections."); - help_desc.add_options()( + desc.add_options()("calcite-service-keepalive", + po::value(&system_parameters.calcite_keepalive) + ->default_value(system_parameters.calcite_keepalive) + ->implicit_value(true), + "Enable keepalive on Calcite connections."); + desc.add_options()( "stringdict-parallelizm", po::value(&g_enable_stringdict_parallel) ->default_value(g_enable_stringdict_parallel) ->implicit_value(true), "Allow StringDictionary to parallelize loads using multiple threads"); - help_desc.add_options()( - "log-user-id", - po::value(&Catalog_Namespace::g_log_user_id) - ->default_value(Catalog_Namespace::g_log_user_id) - ->implicit_value(true), - "Log userId integer in place of the userName (when available)."); - help_desc.add_options()("log-user-origin", - po::value(&log_user_origin) - ->default_value(log_user_origin) - ->implicit_value(true), - "Lookup the origin of inbound connections by IP address/DNS " - "name, and print this information as part of stdlog."); - help_desc.add_options()( - "allowed-import-paths", - po::value(&allowed_import_paths), - "List of allowed root paths that can be used in import operations."); - help_desc.add_options()( - "allowed-export-paths", - po::value(&allowed_export_paths), - "List of allowed root paths that can be used in export operations."); - help_desc.add_options()("enable-system-tables", - po::value(&g_enable_system_tables) - ->default_value(g_enable_system_tables) - ->implicit_value(true), - "Enable use of 
system tables."); - help_desc.add_options()("enable-table-functions", - po::value(&g_enable_table_functions) - ->default_value(g_enable_table_functions) - ->implicit_value(true), - "Enable system table functions support."); - help_desc.add_options()("enable-logs-system-tables", - po::value(&g_enable_logs_system_tables) - ->default_value(g_enable_logs_system_tables) - ->implicit_value(true), - "Enable use of logs system tables."); - help_desc.add_options()( + desc.add_options()("log-user-id", + po::value(&Catalog_Namespace::g_log_user_id) + ->default_value(Catalog_Namespace::g_log_user_id) + ->implicit_value(true), + "Log userId integer in place of the userName (when available)."); + desc.add_options()("log-user-origin", + po::value(&log_user_origin) + ->default_value(log_user_origin) + ->implicit_value(true), + "Lookup the origin of inbound connections by IP address/DNS " + "name, and print this information as part of stdlog."); + desc.add_options()("allowed-import-paths", + po::value(&allowed_import_paths), + "List of allowed root paths that can be used in import operations."); + desc.add_options()("allowed-export-paths", + po::value(&allowed_export_paths), + "List of allowed root paths that can be used in export operations."); + desc.add_options()("enable-system-tables", + po::value(&g_enable_system_tables) + ->default_value(g_enable_system_tables) + ->implicit_value(true), + "Enable use of system tables."); + desc.add_options()("enable-table-functions", + po::value(&g_enable_table_functions) + ->default_value(g_enable_table_functions) + ->implicit_value(true), + "Enable system table functions support."); + desc.add_options()("enable-logs-system-tables", + po::value(&g_enable_logs_system_tables) + ->default_value(g_enable_logs_system_tables) + ->implicit_value(true), + "Enable use of logs system tables."); + desc.add_options()( "logs-system-tables-max-files-count", po::value(&g_logs_system_tables_max_files_count) ->default_value(g_logs_system_tables_max_files_count), "Maximum number of log files that will be processed by each logs system table."); #ifdef ENABLE_MEMKIND - help_desc.add_options()("enable-tiered-cpu-mem", - po::value(&g_enable_tiered_cpu_mem) - ->default_value(g_enable_tiered_cpu_mem) - ->implicit_value(true), - "Enable additional tiers of CPU memory (PMEM, etc...)"); - help_desc.add_options()("pmem-size", po::value(&g_pmem_size)->default_value(0)); - help_desc.add_options()("pmem-path", po::value(&g_pmem_path)); + desc.add_options()("enable-tiered-cpu-mem", + po::value(&g_enable_tiered_cpu_mem) + ->default_value(g_enable_tiered_cpu_mem) + ->implicit_value(true), + "Enable additional tiers of CPU memory (PMEM, etc...)"); + desc.add_options()("pmem-size", po::value(&g_pmem_size)->default_value(0)); + desc.add_options()("pmem-path", po::value(&g_pmem_path)); #endif - help_desc.add(log_options_.get_options()); + desc.add(log_options_.get_options()); } -void CommandLineOptions::fillAdvancedOptions() { - developer_desc.add_options()("dev-options", "Print internal developer options."); - developer_desc.add_options()( +void CommandLineOptions::fillDeveloperOptions() { + po::options_description& desc = developer_desc_; + + desc.add_options()("dev-options", "Print internal developer options."); + desc.add_options()( "enable-calcite-view-optimize", po::value(&system_parameters.enable_calcite_view_optimize) ->default_value(system_parameters.enable_calcite_view_optimize) ->implicit_value(true), "Enable additional calcite (query plan) optimizations when a view is part of the " 
"query."); - developer_desc.add_options()( - "enable-columnar-output", - po::value(&g_enable_columnar_output) - ->default_value(g_enable_columnar_output) - ->implicit_value(true), - "Enable columnar output for intermediate/final query steps."); - developer_desc.add_options()( - "enable-left-join-filter-hoisting", - po::value(&g_enable_left_join_filter_hoisting) - ->default_value(g_enable_left_join_filter_hoisting) - ->implicit_value(true), - "Enable hoisting left hand side filters through left joins."); - developer_desc.add_options()("optimize-row-init", - po::value(&g_optimize_row_initialization) - ->default_value(g_optimize_row_initialization) - ->implicit_value(true), - "Optimize row initialization."); - developer_desc.add_options()("enable-legacy-syntax", - po::value(&enable_legacy_syntax) - ->default_value(enable_legacy_syntax) - ->implicit_value(true), - "Enable legacy syntax."); - developer_desc.add_options()( + desc.add_options()("enable-columnar-output", + po::value(&g_enable_columnar_output) + ->default_value(g_enable_columnar_output) + ->implicit_value(true), + "Enable columnar output for intermediate/final query steps."); + desc.add_options()("enable-left-join-filter-hoisting", + po::value(&g_enable_left_join_filter_hoisting) + ->default_value(g_enable_left_join_filter_hoisting) + ->implicit_value(true), + "Enable hoisting left hand side filters through left joins."); + desc.add_options()("optimize-row-init", + po::value(&g_optimize_row_initialization) + ->default_value(g_optimize_row_initialization) + ->implicit_value(true), + "Optimize row initialization."); + desc.add_options()("enable-legacy-syntax", + po::value(&enable_legacy_syntax) + ->default_value(enable_legacy_syntax) + ->implicit_value(true), + "Enable legacy syntax."); + desc.add_options()( "enable-multifrag", po::value(&allow_multifrag) ->default_value(allow_multifrag) ->implicit_value(true), "Enable execution over multiple fragments in a single round-trip to GPU."); - developer_desc.add_options()("enable-lazy-fetch", - po::value(&g_enable_lazy_fetch) - ->default_value(g_enable_lazy_fetch) - ->implicit_value(true), - "Enable lazy fetch columns in query results."); - developer_desc.add_options()( - "enable-shared-mem-group-by", - po::value(&g_enable_smem_group_by) - ->default_value(g_enable_smem_group_by) - ->implicit_value(true), - "Enable using GPU shared memory for some GROUP BY queries."); - developer_desc.add_options()("num-executors", - po::value(&system_parameters.num_executors) - ->default_value(system_parameters.num_executors), - "Number of executors to run in parallel."); - developer_desc.add_options()( + desc.add_options()("enable-lazy-fetch", + po::value(&g_enable_lazy_fetch) + ->default_value(g_enable_lazy_fetch) + ->implicit_value(true), + "Enable lazy fetch columns in query results."); + desc.add_options()("enable-shared-mem-group-by", + po::value(&g_enable_smem_group_by) + ->default_value(g_enable_smem_group_by) + ->implicit_value(true), + "Enable using GPU shared memory for some GROUP BY queries."); + desc.add_options()("num-executors", + po::value(&system_parameters.num_executors) + ->default_value(system_parameters.num_executors), + "Number of executors to run in parallel."); + desc.add_options()( "num-tuple-threshold-switch-to-baseline", po::value(&g_num_tuple_threshold_switch_to_baseline) ->default_value(g_num_tuple_threshold_switch_to_baseline) @@ -796,7 +782,7 @@ void CommandLineOptions::fillAdvancedOptions() { "We switch hash table layout when this condition and the condition related to " 
"\'col-range-to-num-hash-entries-threshold-switch-to-baseline\' are satisfied " "together."); - developer_desc.add_options()( + desc.add_options()( "ratio-num-hash-entry-to-num-tuple-switch-to-baseline", po::value(&g_ratio_num_hash_entry_to_num_tuple_switch_to_baseline) ->default_value(g_ratio_num_hash_entry_to_num_tuple_switch_to_baseline) @@ -807,120 +793,114 @@ void CommandLineOptions::fillAdvancedOptions() { "{THIS_THRESHOLD}" "We switch hash table layout when this condition and the condition related to " "\'num-tuple-threshold-switch-to-baseline\' are satisfied together."); - developer_desc.add_options()( + desc.add_options()( "gpu-shared-mem-threshold", po::value(&g_gpu_smem_threshold)->default_value(g_gpu_smem_threshold), "GPU shared memory threshold (in bytes). If query requires larger buffers than " "this threshold, we disable those optimizations. 0 (default) means no static cap."); - developer_desc.add_options()( + desc.add_options()( "enable-shared-mem-grouped-non-count-agg", po::value(&g_enable_smem_grouped_non_count_agg) ->default_value(g_enable_smem_grouped_non_count_agg) ->implicit_value(true), "Enable using GPU shared memory for grouped non-count aggregate queries."); - developer_desc.add_options()( - "enable-shared-mem-non-grouped-agg", - po::value(&g_enable_smem_non_grouped_agg) - ->default_value(g_enable_smem_non_grouped_agg) - ->implicit_value(true), - "Enable using GPU shared memory for non-grouped aggregate queries."); - developer_desc.add_options()("enable-direct-columnarization", - po::value(&g_enable_direct_columnarization) - ->default_value(g_enable_direct_columnarization) - ->implicit_value(true), - "Enables/disables a more optimized columnarization method " - "for intermediate steps in multi-step queries."); - developer_desc.add_options()( + desc.add_options()("enable-shared-mem-non-grouped-agg", + po::value(&g_enable_smem_non_grouped_agg) + ->default_value(g_enable_smem_non_grouped_agg) + ->implicit_value(true), + "Enable using GPU shared memory for non-grouped aggregate queries."); + desc.add_options()("enable-direct-columnarization", + po::value(&g_enable_direct_columnarization) + ->default_value(g_enable_direct_columnarization) + ->implicit_value(true), + "Enables/disables a more optimized columnarization method " + "for intermediate steps in multi-step queries."); + desc.add_options()( "offset-device-by-table-id", po::value(&g_use_table_device_offset) ->default_value(g_use_table_device_offset) ->implicit_value(true), "Enables/disables offseting the chosen device ID by the table ID for a given " "fragment. 
This improves balance of fragments across GPUs."); - developer_desc.add_options()("enable-window-functions", - po::value(&g_enable_window_functions) - ->default_value(g_enable_window_functions) - ->implicit_value(true), - "Enable window function support."); - developer_desc.add_options()( - "enable-parallel-window-partition-compute", - po::value(&g_enable_parallel_window_partition_compute) - ->default_value(g_enable_parallel_window_partition_compute) - ->implicit_value(true), - "Enable parallel window function partition computation."); - developer_desc.add_options()( - "enable-parallel-window-partition-sort", - po::value(&g_enable_parallel_window_partition_sort) - ->default_value(g_enable_parallel_window_partition_sort) - ->implicit_value(true), - "Enable parallel window function partition sorting."); - developer_desc.add_options()( + desc.add_options()("enable-window-functions", + po::value(&g_enable_window_functions) + ->default_value(g_enable_window_functions) + ->implicit_value(true), + "Enable window function support."); + desc.add_options()("enable-parallel-window-partition-compute", + po::value(&g_enable_parallel_window_partition_compute) + ->default_value(g_enable_parallel_window_partition_compute) + ->implicit_value(true), + "Enable parallel window function partition computation."); + desc.add_options()("enable-parallel-window-partition-sort", + po::value(&g_enable_parallel_window_partition_sort) + ->default_value(g_enable_parallel_window_partition_sort) + ->implicit_value(true), + "Enable parallel window function partition sorting."); + desc.add_options()( "window-function-frame-aggregation-tree-fanout", po::value(&g_window_function_aggregation_tree_fanout)->default_value(8), "A tree fanout for aggregation tree used to compute aggregation over " "window frame"); - developer_desc.add_options()("enable-dev-table-functions", - po::value(&g_enable_dev_table_functions) - ->default_value(g_enable_dev_table_functions) - ->implicit_value(true), - "Enable dev (test or alpha) table functions. Also " - "requires --enable-table-functions to be turned on"); - - developer_desc.add_options()( - "enable-geo-ops-on-uncompressed-coords", - po::value(&g_enable_geo_ops_on_uncompressed_coords) - ->default_value(g_enable_geo_ops_on_uncompressed_coords) - ->implicit_value(true), - "Enable faster geo operations on uncompressed coords"); - developer_desc.add_options()( + desc.add_options()("enable-dev-table-functions", + po::value(&g_enable_dev_table_functions) + ->default_value(g_enable_dev_table_functions) + ->implicit_value(true), + "Enable dev (test or alpha) table functions. Also " + "requires --enable-table-functions to be turned on"); + + desc.add_options()("enable-geo-ops-on-uncompressed-coords", + po::value(&g_enable_geo_ops_on_uncompressed_coords) + ->default_value(g_enable_geo_ops_on_uncompressed_coords) + ->implicit_value(true), + "Enable faster geo operations on uncompressed coords"); + desc.add_options()( "jit-debug-ir", po::value(&jit_debug)->default_value(jit_debug)->implicit_value(true), "Enable runtime debugger support for the JIT. Note that this flag is " "incompatible " "with the `ENABLE_JIT_DEBUG` build flag. 
The generated code can be found at " "`/tmp/mapdquery`."); - developer_desc.add_options()( + desc.add_options()( "intel-jit-profile", po::value(&intel_jit_profile) ->default_value(intel_jit_profile) ->implicit_value(true), "Enable runtime support for the JIT code profiling using Intel VTune."); - developer_desc.add_options()( + desc.add_options()( "enable-cpu-sub-tasks", po::value(&g_enable_cpu_sub_tasks) ->default_value(g_enable_cpu_sub_tasks) ->implicit_value(true), "Enable parallel processing of a single data fragment on CPU. This can improve CPU " "load balance and decrease reduction overhead."); - developer_desc.add_options()( + desc.add_options()( "cpu-sub-task-size", po::value(&g_cpu_sub_task_size)->default_value(g_cpu_sub_task_size), "Set CPU sub-task size in rows."); - developer_desc.add_options()( + desc.add_options()( "cpu-threads", po::value(&g_cpu_threads_override)->default_value(g_cpu_threads_override), "Set max CPU concurrent threads. Values <= 0 will use default of 2X the number of " "hardware threads."); - developer_desc.add_options()( + desc.add_options()( "skip-intermediate-count", po::value(&g_skip_intermediate_count) ->default_value(g_skip_intermediate_count) ->implicit_value(true), "Skip pre-flight counts for intermediate projections with no filters."); - developer_desc.add_options()( - "strip-join-covered-quals", - po::value(&g_strip_join_covered_quals) - ->default_value(g_strip_join_covered_quals) - ->implicit_value(true), - "Remove quals from the filtered count if they are covered by a " - "join condition (currently only ST_Contains)."); - - developer_desc.add_options()( - "min-cpu-slab-size", - po::value(&system_parameters.min_cpu_slab_size) - ->default_value(system_parameters.min_cpu_slab_size), - "Min slab size (size of memory allocations) for CPU buffer pool."); - developer_desc.add_options()( + desc.add_options()("strip-join-covered-quals", + po::value(&g_strip_join_covered_quals) + ->default_value(g_strip_join_covered_quals) + ->implicit_value(true), + "Remove quals from the filtered count if they are covered by a " + "join condition (currently only ST_Contains)."); + + desc.add_options()("min-cpu-slab-size", + po::value(&system_parameters.min_cpu_slab_size) + ->default_value(system_parameters.min_cpu_slab_size), + "Min slab size (size of memory allocations) for CPU buffer pool."); + desc.add_options()( "max-cpu-slab-size", po::value(&system_parameters.max_cpu_slab_size) ->default_value(system_parameters.max_cpu_slab_size), @@ -928,12 +908,11 @@ void CommandLineOptions::fillAdvancedOptions() { "there is not enough free memory to accomodate the target slab size, smaller " "slabs will be allocated, down to the minimum size specified by " "min-cpu-slab-size."); - developer_desc.add_options()( - "min-gpu-slab-size", - po::value(&system_parameters.min_gpu_slab_size) - ->default_value(system_parameters.min_gpu_slab_size), - "Min slab size (size of memory allocations) for GPU buffer pools."); - developer_desc.add_options()( + desc.add_options()("min-gpu-slab-size", + po::value(&system_parameters.min_gpu_slab_size) + ->default_value(system_parameters.min_gpu_slab_size), + "Min slab size (size of memory allocations) for GPU buffer pools."); + desc.add_options()( "max-gpu-slab-size", po::value(&system_parameters.max_gpu_slab_size) ->default_value(system_parameters.max_gpu_slab_size), @@ -942,7 +921,7 @@ void CommandLineOptions::fillAdvancedOptions() { "slabs will be allocated, down to the minimum size speified by " "min-gpu-slab-size."); - developer_desc.add_options()( 
+ desc.add_options()( "max-output-projection-allocation-bytes", po::value(&g_max_memory_allocation_size) ->default_value(g_max_memory_allocation_size), @@ -950,7 +929,7 @@ void CommandLineOptions::fillAdvancedOptions() { "queries with no pre-flight count. Default is the maximum slab size (sizes " "greater " "than the maximum slab size have no affect). Requires bump allocator."); - developer_desc.add_options()( + desc.add_options()( "min-output-projection-allocation-bytes", po::value(&g_min_memory_allocation_size) ->default_value(g_min_memory_allocation_size), @@ -959,216 +938,214 @@ void CommandLineOptions::fillAdvancedOptions() { "obtained, the query will be retried with different execution parameters and/or " "on " "CPU (if allow-cpu-retry is enabled). Requires bump allocator."); - developer_desc.add_options()("enable-bump-allocator", - po::value(&g_enable_bump_allocator) - ->default_value(g_enable_bump_allocator) - ->implicit_value(true), - "Enable the bump allocator for projection queries on " - "GPU. The bump allocator will " - "allocate a fixed size buffer for each query, track the " - "number of rows passing the " - "kernel during query execution, and copy back only the " - "rows that passed the kernel " - "to CPU after execution. When disabled, pre-flight " - "count queries are used to size " - "the output buffer for projection queries."); - developer_desc.add_options()( + desc.add_options()("enable-bump-allocator", + po::value(&g_enable_bump_allocator) + ->default_value(g_enable_bump_allocator) + ->implicit_value(true), + "Enable the bump allocator for projection queries on " + "GPU. The bump allocator will " + "allocate a fixed size buffer for each query, track the " + "number of rows passing the " + "kernel during query execution, and copy back only the " + "rows that passed the kernel " + "to CPU after execution. 
When disabled, pre-flight "
+ "count queries are used to size "
+ "the output buffer for projection queries.");
+ desc.add_options()(
"code-cache-eviction-percent",
po::value(&g_fraction_code_cache_to_evict)
->default_value(g_fraction_code_cache_to_evict),
"Percentage of the GPU code cache to evict if an out of memory error is "
"encountered while attempting to place generated code on the GPU.");
- developer_desc.add_options()("ssl-cert",
- po::value(&system_parameters.ssl_cert_file)
- ->default_value(std::string("")),
- "SSL Validated public certficate.");
+ desc.add_options()("ssl-cert",
+ po::value(&system_parameters.ssl_cert_file)
+ ->default_value(std::string("")),
+ "SSL Validated public certificate.");

- developer_desc.add_options()("ssl-private-key",
- po::value(&system_parameters.ssl_key_file)
- ->default_value(std::string("")),
- "SSL private key file.");
+ desc.add_options()("ssl-private-key",
+ po::value(&system_parameters.ssl_key_file)
+ ->default_value(std::string("")),
+ "SSL private key file.");
// Note ssl_trust_store is passed through to Calcite via system_parameters
// todo(jack): add ensure ssl-trust-store exists if cert and private key in use
- developer_desc.add_options()("ssl-trust-store",
- po::value(&system_parameters.ssl_trust_store)
- ->default_value(std::string("")),
- "SSL public CA certifcates (java trust store) to validate "
- "TLS connections (passed through to the Calcite server).");
+ desc.add_options()("ssl-trust-store",
+ po::value(&system_parameters.ssl_trust_store)
+ ->default_value(std::string("")),
+ "SSL public CA certificates (java trust store) to validate "
+ "TLS connections (passed through to the Calcite server).");

- developer_desc.add_options()(
+ desc.add_options()(
"ssl-trust-password",
po::value(&system_parameters.ssl_trust_password)
->default_value(std::string("")),
"SSL password for java trust store provided via --ssl-trust-store parameter.");
- developer_desc.add_options()(
+ desc.add_options()(
"ssl-trust-ca",
po::value(&system_parameters.ssl_trust_ca_file)
->default_value(std::string("")),
"SSL public CA certificates to validate TLS connection(as a client).");
- developer_desc.add_options()(
+ desc.add_options()(
"ssl-trust-ca-server",
po::value(&authMetadata.ca_file_name)->default_value(std::string("")),
"SSL public CA certificates to validate TLS connection(as a server).");
- developer_desc.add_options()("ssl-keystore",
- po::value(&system_parameters.ssl_keystore)
- ->default_value(std::string("")),
- "SSL server credentials as a java key store (passed "
- "through to the Calcite server).");
+ desc.add_options()("ssl-keystore",
+ po::value(&system_parameters.ssl_keystore)
+ ->default_value(std::string("")),
+ "SSL server credentials as a java key store (passed "
+ "through to the Calcite server).");

- developer_desc.add_options()(
- "ssl-keystore-password",
- po::value(&system_parameters.ssl_keystore_password)
- ->default_value(std::string("")),
- "SSL password for java keystore, provide by via --ssl-keystore.");
+ desc.add_options()("ssl-keystore-password",
+ po::value(&system_parameters.ssl_keystore_password)
+ ->default_value(std::string("")),
+ "SSL password for java keystore, provided via --ssl-keystore.");

- developer_desc.add_options()(
+ desc.add_options()(
"udf",
po::value(&udf_file_name),
"Load user defined extension functions from this file at startup. 
The file is "
"expected to be a C/C++ file with extension .cpp.");
- developer_desc.add_options()(
- "udf-compiler-path",
- po::value(&udf_compiler_path),
- "Provide absolute path to clang++ used in udf compilation.");
+ desc.add_options()("udf-compiler-path",
+ po::value(&udf_compiler_path),
+ "Provide absolute path to clang++ used in udf compilation.");

- developer_desc.add_options()("udf-compiler-options",
- po::value<std::vector<std::string>>(&udf_compiler_options),
- "Specify compiler options to tailor udf compilation.");
+ desc.add_options()("udf-compiler-options",
+ po::value<std::vector<std::string>>(&udf_compiler_options),
+ "Specify compiler options to tailor udf compilation.");

#ifdef ENABLE_GEOS
- developer_desc.add_options()("libgeos-so-filename",
- po::value(&libgeos_so_filename),
- "Specify libgeos shared object filename to be used for "
- "geos-backed geo opertations.");
+ desc.add_options()("libgeos-so-filename",
+ po::value(&libgeos_so_filename),
+ "Specify libgeos shared object filename to be used for "
+ "geos-backed geo operations.");
#endif
- developer_desc.add_options()(
+ desc.add_options()(
"large-ndv-threshold",
po::value(&g_large_ndv_threshold)->default_value(g_large_ndv_threshold));
- developer_desc.add_options()(
+ desc.add_options()(
"large-ndv-multiplier",
po::value(&g_large_ndv_multiplier)->default_value(g_large_ndv_multiplier));
- developer_desc.add_options()("approx_quantile_buffer",
- po::value(&g_approx_quantile_buffer)
- ->default_value(g_approx_quantile_buffer));
- developer_desc.add_options()("approx_quantile_centroids",
- po::value(&g_approx_quantile_centroids)
- ->default_value(g_approx_quantile_centroids));
- developer_desc.add_options()(
+ desc.add_options()("approx_quantile_buffer",
+ po::value(&g_approx_quantile_buffer)
+ ->default_value(g_approx_quantile_buffer));
+ desc.add_options()("approx_quantile_centroids",
+ po::value(&g_approx_quantile_centroids)
+ ->default_value(g_approx_quantile_centroids));
+ desc.add_options()(
"bitmap-memory-limit",
po::value(&g_bitmap_memory_limit)->default_value(g_bitmap_memory_limit),
"Limit for count distinct bitmap memory use. The limit is computed by taking the "
"size of the group by buffer (entry count in Query Memory Descriptor) and "
"multiplying it by the number of count distinct expression and the size of bitmap "
"required for each. For approx_count_distinct this is typically 8192 bytes.");
- developer_desc.add_options()(
+ desc.add_options()(
"enable-filter-function",
po::value(&g_enable_filter_function)
->default_value(g_enable_filter_function)
->implicit_value(true),
"Enable the filter function protection feature for the SQL JIT compiler. "
"Normally should be on but techs might want to disable for troubleshooting.");
- developer_desc.add_options()(
+ desc.add_options()(
"enable-idp-temporary-users",
po::value(&g_enable_idp_temporary_users)
->default_value(g_enable_idp_temporary_users)
->implicit_value(true),
"Enable temporary users for SAML and LDAP logins on read-only servers. 
" "Normally should be on but techs might want to disable for troubleshooting."); - developer_desc.add_options()( + desc.add_options()( "enable-seconds-refresh-interval", po::value(&g_enable_seconds_refresh) ->default_value(g_enable_seconds_refresh) ->implicit_value(true), "Enable foreign table seconds refresh interval for testing purposes."); - developer_desc.add_options()("enable-auto-metadata-update", - po::value(&g_enable_auto_metadata_update) - ->default_value(g_enable_auto_metadata_update) - ->implicit_value(true), - "Enable automatic metadata update."); - developer_desc.add_options()( + desc.add_options()("enable-auto-metadata-update", + po::value(&g_enable_auto_metadata_update) + ->default_value(g_enable_auto_metadata_update) + ->implicit_value(true), + "Enable automatic metadata update."); + desc.add_options()( "parallel-top-min", po::value(&g_parallel_top_min)->default_value(g_parallel_top_min), "For ResultSets requiring a heap sort, the number of rows necessary to trigger " "parallelTop() to sort."); - developer_desc.add_options()( + desc.add_options()( "parallel-top-max", po::value(&g_parallel_top_max)->default_value(g_parallel_top_max), "For ResultSets requiring a heap sort, the maximum number of rows allowed by " "watchdog."); - developer_desc.add_options()( + desc.add_options()( "streaming-top-n-max", po::value(&g_streaming_topn_max)->default_value(g_streaming_topn_max), "The maximum number of rows allowing streaming top-N sorting."); - developer_desc.add_options()("vacuum-min-selectivity", - po::value(&g_vacuum_min_selectivity) - ->default_value(g_vacuum_min_selectivity), - "Minimum selectivity for automatic vacuuming. " - "This specifies the percentage (with a value of 0 " - "implying 0% and a value of 1 implying 100%) of " - "deleted rows in a fragment at which to perform " - "automatic vacuuming. A number greater than 1 can " - "be used to disable automatic vacuuming."); - developer_desc.add_options()("enable-automatic-ir-metadata", - po::value(&g_enable_automatic_ir_metadata) - ->default_value(g_enable_automatic_ir_metadata) - ->implicit_value(true), - "Enable automatic IR metadata (debug builds only)."); - developer_desc.add_options()( + desc.add_options()("vacuum-min-selectivity", + po::value(&g_vacuum_min_selectivity) + ->default_value(g_vacuum_min_selectivity), + "Minimum selectivity for automatic vacuuming. " + "This specifies the percentage (with a value of 0 " + "implying 0% and a value of 1 implying 100%) of " + "deleted rows in a fragment at which to perform " + "automatic vacuuming. A number greater than 1 can " + "be used to disable automatic vacuuming."); + desc.add_options()("enable-automatic-ir-metadata", + po::value(&g_enable_automatic_ir_metadata) + ->default_value(g_enable_automatic_ir_metadata) + ->implicit_value(true), + "Enable automatic IR metadata (debug builds only)."); + desc.add_options()( "max-log-length", po::value(&g_max_log_length)->default_value(g_max_log_length), "The maximum number of characters that a log message can has. If the log message " "is longer than this, we only record \'g_max_log_message_length\' characters."); - developer_desc.add_options()( + desc.add_options()( "estimator-failure-max-groupby-size", po::value(&g_estimator_failure_max_groupby_size) ->default_value(g_estimator_failure_max_groupby_size), "Maximum size of the groupby buffer if the estimator fails. 
By default we use the " "number of tuples in the table up to this value."); - developer_desc.add_options()("columnar-large-projections", - po::value(&g_columnar_large_projections) - ->default_value(g_columnar_large_projections) - ->implicit_value(true), - "Prefer columnar output if projection size is >= " - "threshold set by --columnar-large-projections-threshold " - "(default 1,000,000 rows)."); - developer_desc.add_options()( + desc.add_options()("columnar-large-projections", + po::value(&g_columnar_large_projections) + ->default_value(g_columnar_large_projections) + ->implicit_value(true), + "Prefer columnar output if projection size is >= " + "threshold set by --columnar-large-projections-threshold " + "(default 1,000,000 rows)."); + desc.add_options()( "columnar-large-projections-threshold", po::value(&g_columnar_large_projections_threshold) ->default_value(g_columnar_large_projections_threshold), "Threshold (in minimum number of rows) to prefer columnar output for projections. " "Requires --columnar-large-projections to be set."); - help_desc.add_options()( + desc.add_options()( "allow-query-step-cpu-retry", po::value(&g_allow_query_step_cpu_retry) ->default_value(g_allow_query_step_cpu_retry) ->implicit_value(true), R"(Allow certain query steps to retry on CPU, even when allow-cpu-retry is disabled)"); - help_desc.add_options()("enable-http-binary-server", - po::value(&g_enable_http_binary_server) - ->default_value(g_enable_http_binary_server) - ->implicit_value(true), - "Enable binary over HTTP Thrift server"); - - help_desc.add_options()("enable-query-engine-cuda-streams", - po::value(&g_query_engine_cuda_streams) - ->default_value(g_query_engine_cuda_streams) - ->implicit_value(true), - "Enable Query Engine CUDA streams"); - - help_desc.add_options()( + desc.add_options()("enable-http-binary-server", + po::value(&g_enable_http_binary_server) + ->default_value(g_enable_http_binary_server) + ->implicit_value(true), + "Enable binary over HTTP Thrift server"); + + desc.add_options()("enable-query-engine-cuda-streams", + po::value(&g_query_engine_cuda_streams) + ->default_value(g_query_engine_cuda_streams) + ->implicit_value(true), + "Enable Query Engine CUDA streams"); + + desc.add_options()( "allow-invalid-literal-buffer-reads", po::value(&g_allow_invalid_literal_buffer_reads) ->default_value(g_allow_invalid_literal_buffer_reads) ->implicit_value(true), "For backwards compatibility. 
Enabling may cause invalid query results."); - developer_desc.add_options()( + desc.add_options()( "enable-drop-render-group-columns-migration", po::value(&enable_drop_render_group_columns_migration) ->default_value(false) @@ -1607,7 +1584,7 @@ boost::optional CommandLineOptions::parse_command_line( char const* const* argv, const bool should_init_logging) { po::options_description all_desc("All options"); - all_desc.add(help_desc).add(developer_desc); + all_desc.add(help_desc_).add(developer_desc_); try { po::store(po::command_line_parser(argc, argv) @@ -1622,7 +1599,7 @@ boost::optional CommandLineOptions::parse_command_line( "[--http-port ] [--flush-log] [--version|-v]" << std::endl << std::endl; - std::cout << help_desc << std::endl; + std::cout << help_desc_ << std::endl; return 0; } if (vm.count("dev-options")) { @@ -1630,7 +1607,7 @@ boost::optional CommandLineOptions::parse_command_line( "[--http-port ] [--flush-log] [--version|-v]" << std::endl << std::endl; - std::cout << developer_desc << std::endl; + std::cout << developer_desc_ << std::endl; return 0; } if (vm.count("version")) { diff --git a/ThriftHandler/CommandLineOptions.h b/ThriftHandler/CommandLineOptions.h index 1b145d1e9e..8203ca572b 100644 --- a/ThriftHandler/CommandLineOptions.h +++ b/ThriftHandler/CommandLineOptions.h @@ -42,7 +42,7 @@ class CommandLineOptions { CommandLineOptions(char const* argv0, bool dist_v5_ = false) : log_options_(argv0), exe_name(argv0), dist_v5_(dist_v5_) { fillOptions(); - fillAdvancedOptions(); + fillDeveloperOptions(); } int http_port = 6278; int http_binary_port = 6276; @@ -126,12 +126,12 @@ class CommandLineOptions { bool enable_drop_render_group_columns_migration = false; void fillOptions(); - void fillAdvancedOptions(); + void fillDeveloperOptions(); std::string compressor = std::string(BLOSC_LZ4HC_COMPNAME); - po::options_description help_desc; - po::options_description developer_desc; + po::options_description help_desc_; + po::options_description developer_desc_; logger::LogOptions log_options_; std::string exe_name; po::positional_options_description positional_options;
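Reviewer note: every `add_options()` hunk above follows the same boost::program_options registration pattern, so a standalone sketch may be easier to study than any one hunk. Everything below is illustrative: the option name `enable-foo`, the globals `g_enable_foo` and `g_foo_size`, and the help strings are hypothetical stand-ins, not names from this patch, and the template arguments on `po::value` are spelled out here even though the flattened diff text above elides them.

```cpp
// Minimal sketch of the option-registration pattern used throughout
// fillDeveloperOptions(). All names here are hypothetical.
#include <boost/program_options.hpp>
#include <iostream>

namespace po = boost::program_options;

bool g_enable_foo{false};  // stands in for a global toggle
size_t g_foo_size{8};      // stands in for a numeric tunable

int main(int argc, char** argv) {
  po::options_description desc("Developer options");
  desc.add_options()
      ("enable-foo",
       po::value<bool>(&g_enable_foo)
           ->default_value(g_enable_foo)  // option absent: keep current value
           ->implicit_value(true),        // bare --enable-foo: set true
       "Enable the hypothetical foo feature.")
      ("foo-size",
       po::value<size_t>(&g_foo_size)->default_value(g_foo_size),
       "Foo size, in rows.");

  po::variables_map vm;
  po::store(po::parse_command_line(argc, argv, desc), vm);
  po::notify(vm);  // copies parsed values into the bound globals

  std::cout << "enable-foo=" << g_enable_foo << " foo-size=" << g_foo_size << '\n';
  return 0;
}
```

Two consequences of the pattern are worth keeping in mind while reviewing: `default_value(g_flag)` is read from the global at registration time, so the built-in default and the `--help` text stay in sync with the variable's initializer; and because `implicit_value` is set, an explicit argument must be attached to the option token (`--enable-foo=false`), since a detached `--enable-foo false` would leave `false` behind as a stray positional argument.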
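The `parse_command_line` hunks and the header rename at the end fit together as a two-tier help scheme: both description groups are merged into `all_desc` for parsing, while `--help` prints only `help_desc_` and `--dev-options` prints only `developer_desc_`. Below is a simplified, hedged reconstruction of that flow, using a free-standing `main` instead of the real `CommandLineOptions` class; the `enable-foo` flag is again a placeholder.

```cpp
// Sketch of the two-tier help scheme implied by the hunks above: all options
// parse, but each help flag prints only its own group. Not the actual code.
#include <boost/program_options.hpp>
#include <iostream>

namespace po = boost::program_options;

int main(int argc, char** argv) {
  po::options_description help_desc_("Options");  // mirrors the renamed member
  help_desc_.add_options()
      ("help,h", "Print help messages.")
      ("dev-options", "Print developer options.");

  po::options_description developer_desc_("Developer options");
  developer_desc_.add_options()
      ("enable-foo", po::value<bool>()->implicit_value(true),
       "Hypothetical developer-only flag.");

  // Parse against the union so developer flags are accepted even though
  // they are hidden from the default --help listing.
  po::options_description all_desc("All options");
  all_desc.add(help_desc_).add(developer_desc_);

  po::variables_map vm;
  po::store(po::command_line_parser(argc, argv).options(all_desc).run(), vm);
  po::notify(vm);

  if (vm.count("help")) {
    std::cout << help_desc_ << std::endl;       // user-facing group only
  } else if (vm.count("dev-options")) {
    std::cout << developer_desc_ << std::endl;  // developer group only
  }
  return 0;
}
```

Because everything is merged before `po::store`, an option that moves between the two groups keeps parsing exactly as before; only the listing it appears under in `--help` versus `--dev-options` output changes. That is relevant here: several options above previously registered against `help_desc` and now go through the same `desc` handle as the developer options (what `desc` binds to inside `fillDeveloperOptions` is not shown in these hunks), so existing command lines remain valid either way.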