diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt
index 600e6ac245..9b0d80ec21 100644
--- a/src/main/cpp/CMakeLists.txt
+++ b/src/main/cpp/CMakeLists.txt
@@ -154,6 +154,7 @@ add_library(
   src/DateTimeRebaseJni.cpp
   src/DecimalUtilsJni.cpp
   src/HashJni.cpp
+  src/HistogramJni.cpp
   src/MapUtilsJni.cpp
   src/NativeParquetJni.cpp
   src/RowConversionJni.cpp
@@ -165,6 +166,7 @@ add_library(
   src/cast_string_to_float.cu
   src/datetime_rebase.cu
   src/decimal_utils.cu
+  src/histogram.cu
   src/map_utils.cu
   src/murmur_hash.cu
   src/row_conversion.cu
diff --git a/src/main/cpp/src/HistogramJni.cpp b/src/main/cpp/src/HistogramJni.cpp
new file mode 100644
index 0000000000..950a005dcd
--- /dev/null
+++ b/src/main/cpp/src/HistogramJni.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "cudf_jni_apis.hpp"
+#include "histogram.hpp"
+
+extern "C" {
+
+JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_Histogram_createHistogramIfValid(
+    JNIEnv *env, jclass, jlong values_handle, jlong frequencies_handle, jboolean output_as_lists) {
+  JNI_NULL_CHECK(env, values_handle, "values_handle is null", 0);
+  JNI_NULL_CHECK(env, frequencies_handle, "frequencies_handle is null", 0);
+
+  try {
+    cudf::jni::auto_set_device(env);
+
+    auto const values = reinterpret_cast<cudf::column_view const *>(values_handle);
+    auto const frequencies = reinterpret_cast<cudf::column_view const *>(frequencies_handle);
+    return cudf::jni::ptr_as_jlong(
+        spark_rapids_jni::create_histogram_if_valid(*values, *frequencies, output_as_lists)
+            .release());
+  }
+  CATCH_STD(env, 0);
+}
+
+JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_Histogram_percentileFromHistogram(
+    JNIEnv *env, jclass, jlong input_handle, jdoubleArray jpercentages, jboolean output_as_lists) {
+  JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0);
+  JNI_NULL_CHECK(env, jpercentages, "jpercentages is null", 0);
+
+  try {
+    cudf::jni::auto_set_device(env);
+
+    auto const input = reinterpret_cast<cudf::column_view const *>(input_handle);
+    auto const percentages = [&] {
+      auto const native_percentages = cudf::jni::native_jdoubleArray(env, jpercentages);
+      return std::vector<double>(native_percentages.begin(), native_percentages.end());
+    }();
+    return cudf::jni::ptr_as_jlong(
+        spark_rapids_jni::percentile_from_histogram(*input, percentages, output_as_lists)
+            .release());
+  }
+  CATCH_STD(env, 0);
+}
+
+} // extern "C"
diff --git a/src/main/cpp/src/histogram.cu b/src/main/cpp/src/histogram.cu
new file mode 100644
index 0000000000..408b5f9f98
--- /dev/null
+++ b/src/main/cpp/src/histogram.cu
@@ -0,0 +1,436 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "histogram.hpp"
+
+//
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/copy.hpp>
+#include <cudf/detail/iterator.cuh>
+#include <cudf/detail/labeling/label_segments.cuh>
+#include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/sizes_to_offsets_iterator.cuh>
+#include <cudf/detail/sorting.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/detail/valid_if.cuh>
+#include <cudf/lists/detail/lists_column_factories.hpp>
+#include <cudf/lists/lists_column_view.hpp>
+#include <cudf/reduction/detail/histogram.hpp>
+#include <cudf/structs/structs_column_view.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/utilities/bit.hpp>
+#include <cudf/utilities/span.hpp>
+#include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
+
+//
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_buffer.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+//
+#include <thrust/binary_search.h>
+#include <thrust/execution_policy.h>
+#include <thrust/for_each.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/permutation_iterator.h>
+#include <thrust/scan.h>
+
+namespace spark_rapids_jni {
+
+namespace {
+
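+// For intuition: given a sorted histogram with values [10, 20, 30] and accumulated counts
+// [2, 3, 6] (i.e., frequencies [2, 1, 3]), a percentage of 0.5 gives
+// position = (6 - 1) * 0.5 = 2.5. The elements at positions 2 and 3 of the expanded sequence
+// [10, 10, 20, 30, 30, 30] are 20 and 30, so the interpolated result is
+// 0.5 * 20 + 0.5 * 30 = 25, matching Spark's `percentile` with a frequency column.
+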
+template <typename ElementIterator, typename ValidityIterator> //
+struct fill_percentile_fn {
+  __device__ void operator()(cudf::size_type const idx) const {
+    auto const histogram_idx = idx / percentages.size();
+
+    // If a histogram has a null element, it never has more than one null (as the histogram
+    // only stores unique elements), and that null is sorted to stay at the end.
+    // We need to ignore nulls, so we shift the end point if we see a null.
+    auto const start = offsets[histogram_idx];
+    auto const try_end = offsets[histogram_idx + 1];
+    auto const all_valid = sorted_validity[try_end - 1];
+    auto const end = all_valid ? try_end : try_end - 1;
+
+    // If the end point after shifting coincides with the start point, we don't have any
+    // valid element.
+    auto const has_all_nulls = start >= end;
+
+    auto const percentage_idx = idx % percentages.size();
+    if (out_validity && percentage_idx == 0) {
+      // If the histogram only contains null elements, the output percentile will be null.
+      out_validity[histogram_idx] = has_all_nulls ? 0 : 1;
+    }
+
+    if (has_all_nulls) {
+      return;
+    }
+
+    auto const max_positions = accumulated_counts[end - 1] - 1L;
+    auto const percentage = percentages[percentage_idx];
+    auto const position = static_cast<double>(max_positions) * percentage;
+    auto const lower = static_cast<int64_t>(floor(position));
+    auto const higher = static_cast<int64_t>(ceil(position));
+
+    auto const lower_index = search_counts(lower + 1, start, end);
+    auto const lower_element = sorted_input[lower_index];
+    if (higher == lower) {
+      output[idx] = lower_element;
+      return;
+    }
+
+    auto const higher_index = search_counts(higher + 1, start, end);
+    auto const higher_element = sorted_input[higher_index];
+    if (higher_element == lower_element) {
+      output[idx] = lower_element;
+      return;
+    }
+
+    output[idx] = (higher - position) * lower_element + (position - lower) * higher_element;
+  }
+
+  fill_percentile_fn(cudf::size_type const *const offsets_, ElementIterator const sorted_input_,
+                     ValidityIterator const sorted_validity_,
+                     cudf::device_span<int64_t const> const accumulated_counts_,
+                     cudf::device_span<double const> const percentages_, double *const output_,
+                     int8_t *const out_validity_)
+      : offsets{offsets_}, sorted_input{sorted_input_}, sorted_validity{sorted_validity_},
+        accumulated_counts{accumulated_counts_}, percentages{percentages_}, output{output_},
+        out_validity{out_validity_} {}
+
+private:
+  __device__ cudf::size_type search_counts(int64_t position, cudf::size_type start,
+                                           cudf::size_type end) const {
+    auto const it = thrust::lower_bound(thrust::seq, accumulated_counts.begin() + start,
+                                        accumulated_counts.begin() + end, position);
+    return static_cast<cudf::size_type>(thrust::distance(accumulated_counts.begin(), it));
+  }
+
+  cudf::size_type const *const offsets;
+  ElementIterator const sorted_input;
+  ValidityIterator const sorted_validity;
+  cudf::device_span<int64_t const> const accumulated_counts;
+  cudf::device_span<double const> const percentages;
+  double *const output;
+  int8_t *const out_validity;
+};
+
+struct percentile_dispatcher {
+  template <typename T> static constexpr bool is_supported() { return std::is_arithmetic_v<T>; }
+
+  // The output here is only an intermediate result, consisting of:
+  // 1. The output percentile values,
+  // 2. The null mask to apply to the final output column containing percentile values, and
+  // 3. The null count corresponding to that null mask.
+  using output_type =
+      std::tuple<std::unique_ptr<cudf::column>, rmm::device_buffer, cudf::size_type>;
+
+  template <typename T, typename... Args>
+  std::enable_if_t<!is_supported<T>(), output_type> operator()(Args &&...) const {
+    CUDF_FAIL("Unsupported type in histogram-to-percentile evaluation.");
+  }
+
+  template <typename T, CUDF_ENABLE_IF(is_supported<T>())>
+  output_type operator()(cudf::size_type const *const offsets,
+                         cudf::size_type const *const ordered_indices,
+                         cudf::column_device_view const &data,
+                         cudf::device_span<int64_t const> accumulated_counts,
+                         cudf::device_span<double const> percentages, bool has_null,
+                         cudf::size_type num_histograms, rmm::cuda_stream_view stream,
+                         rmm::mr::device_memory_resource *mr) const {
+    // Return all nulls for a totally empty input.
+    if (data.size() == 0 || percentages.size() == 0) {
+      return {
+          cudf::make_numeric_column(cudf::data_type{cudf::type_id::FLOAT64}, num_histograms,
+                                    cudf::mask_state::UNALLOCATED, stream, mr),
+          cudf::detail::create_null_mask(num_histograms, cudf::mask_state::ALL_NULL, stream, mr),
+          num_histograms};
+    }
+
+    auto percentiles = cudf::make_numeric_column(
+        cudf::data_type{cudf::type_id::FLOAT64},
+        num_histograms * static_cast<cudf::size_type>(percentages.size()),
+        cudf::mask_state::UNALLOCATED, stream, mr);
+
+    auto const fill_percentile = [&](auto const sorted_validity_it, auto const out_validity) {
+      auto const sorted_input_it =
+          thrust::make_permutation_iterator(data.begin<T>(), ordered_indices);
+      thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0),
+                         num_histograms * static_cast<cudf::size_type>(percentages.size()),
+                         fill_percentile_fn{offsets, sorted_input_it, sorted_validity_it,
+                                            accumulated_counts, percentages,
+                                            percentiles->mutable_view().begin<double>(),
+                                            out_validity});
+    };
+
+    if (!has_null) {
+      fill_percentile(thrust::make_constant_iterator(true), nullptr);
+    } else {
+      auto const sorted_validity_it = thrust::make_permutation_iterator(
+          cudf::detail::make_validity_iterator(data), ordered_indices);
+      auto out_validities = rmm::device_uvector<int8_t>(num_histograms, stream,
+                                                        rmm::mr::get_current_device_resource());
+      fill_percentile(sorted_validity_it, out_validities.begin());
+
+      auto [null_mask, null_count] = cudf::detail::valid_if(
+          out_validities.begin(), out_validities.end(), thrust::identity{}, stream, mr);
+      if (null_count > 0) {
+        return {std::move(percentiles), std::move(null_mask), null_count};
+      }
+    }
+
+    return {std::move(percentiles), rmm::device_buffer{}, 0};
+  }
+};
+
+void check_input(cudf::column_view const &input, std::vector<double> const &percentages) {
+  CUDF_EXPECTS(input.type().id() == cudf::type_id::LIST, "The input column must be of type LIST.",
+               std::invalid_argument);
+
+  auto const child = input.child(cudf::lists_column_view::child_column_index);
+  CUDF_EXPECTS(!child.has_nulls(), "Child of the input column must not have nulls.",
+               std::invalid_argument);
+  CUDF_EXPECTS(child.type().id() == cudf::type_id::STRUCT && child.num_children() == 2,
+               "Child of the input column must be of STRUCT type having two children.",
+               std::invalid_argument);
+  CUDF_EXPECTS(!child.child(1).has_nulls(),
+               "Child of the input column must have its second child containing non-null elements.",
+               std::invalid_argument);
+  CUDF_EXPECTS(child.child(1).type().id() == cudf::type_id::INT64,
+               "Child of the input column must have its second child of type INT64.",
+               std::invalid_argument);
+
+  CUDF_EXPECTS(static_cast<std::size_t>(input.size()) * percentages.size() <=
+                   static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
+               "Size of output exceeds cudf column size limit.", std::overflow_error);
+}
+
+// Wrap the input column in a lists column, to match the output type required by Spark.
+std::unique_ptr<cudf::column>
+wrap_in_list(std::unique_ptr<cudf::column> &&input, rmm::device_buffer &&null_mask,
+             cudf::size_type null_count, cudf::size_type num_histograms,
+             cudf::size_type num_percentages, rmm::cuda_stream_view stream,
+             rmm::mr::device_memory_resource *mr) {
+  if (input->size() == 0) {
+    return cudf::lists::detail::make_empty_lists_column(input->type(), stream, mr);
+  }
+
+  auto const sizes_itr = thrust::make_constant_iterator(num_percentages);
+  auto offsets = std::get<0>(
+      cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + num_histograms, stream, mr));
+  auto output = cudf::make_lists_column(num_histograms, std::move(offsets), std::move(input),
+                                        null_count, std::move(null_mask), stream, mr);
+  if (null_count > 0) {
+    return cudf::detail::purge_nonempty_nulls(output->view(), stream, mr);
+  }
+
+  return output;
+}
+
+} // namespace
+
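+// For example, given values [5, 6, 7] and frequencies [2, 0, 1]:
+//  - with `output_as_lists == true`, the output is [[{5, 2}], [], [{7, 1}]]: the zero-frequency
+//    row becomes an empty list;
+//  - with `output_as_lists == false`, the output is [{5, 2}, {null, 1}, {7, 1}]: the
+//    zero-frequency row is nullified and its frequency is set to 1 (see below).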
+std::unique_ptr<cudf::column> create_histogram_if_valid(cudf::column_view const &values,
+                                                        cudf::column_view const &frequencies,
+                                                        bool output_as_lists,
+                                                        rmm::cuda_stream_view stream,
+                                                        rmm::mr::device_memory_resource *mr) {
+  CUDF_EXPECTS(!frequencies.has_nulls(), "The input frequencies must not have nulls.",
+               std::invalid_argument);
+  CUDF_EXPECTS(frequencies.type().id() == cudf::type_id::INT64,
+               "The input frequencies must be of type INT64.", std::invalid_argument);
+  CUDF_EXPECTS(values.size() == frequencies.size(),
+               "The input values and frequencies must have the same size.", std::invalid_argument);
+
+  if (values.size() == 0) {
+    if (output_as_lists) {
+      return cudf::make_lists_column(
+          0, cudf::make_empty_column(cudf::type_to_id<cudf::size_type>()),
+          cudf::reduction::detail::make_empty_histogram_like(values), 0, {});
+    } else {
+      return cudf::reduction::detail::make_empty_histogram_like(values);
+    }
+  }
+
+  auto const default_mr = rmm::mr::get_current_device_resource();
+
+  // We only check if there is any row in frequencies that is negative (invalid) or zero.
+  auto check_invalid_and_zero =
+      cudf::detail::make_zeroed_device_uvector_async<int8_t>(2, stream, default_mr);
+
+  // We need to check and remember which rows are valid (positive) so we can do filtering later on.
+  auto check_valid = rmm::device_uvector<int8_t>(frequencies.size(), stream, default_mr);
+
+  thrust::for_each_n(
+      rmm::exec_policy(stream), thrust::make_counting_iterator(0), frequencies.size(),
+      [frequencies = frequencies.begin<int64_t>(), check_invalid = check_invalid_and_zero.begin(),
+       check_zero = check_invalid_and_zero.begin() + 1,
+       check_valid = check_valid.begin()] __device__(auto const idx) {
+        if (frequencies[idx] < 0) {
+          *check_invalid = 1;
+        }
+        if (frequencies[idx] == 0) {
+          *check_zero = 1;
+        }
+
+        check_valid[idx] = static_cast<int8_t>(frequencies[idx] > 0);
+      });
+
+  auto const h_checks = cudf::detail::make_std_vector_sync(check_invalid_and_zero, stream);
+  CUDF_EXPECTS(!h_checks.front(), // check invalid (negative) frequencies
+               "The input frequencies must not contain negative values.", std::invalid_argument);
+
+  auto const make_structs_histogram = [&](rmm::device_buffer &&null_mask,
+                                          cudf::size_type null_count) {
+    // Copy values and frequencies into a new structs column.
+    std::vector<std::unique_ptr<cudf::column>> values_and_frequencies;
+    values_and_frequencies.emplace_back(std::make_unique<cudf::column>(values, stream, mr));
+    values_and_frequencies.emplace_back(std::make_unique<cudf::column>(frequencies, stream, mr));
+
+    // Set the null mask for the output values.
+    // This null mask is the result of checking `frequencies > 0`.
+    if (null_count > 0) {
+      if (!values.has_nulls()) {
+        values_and_frequencies.front()->set_null_mask(std::move(null_mask), null_count);
+      } else {
+        // We need to AND the current null mask with the given null mask.
+        auto [new_null_mask, new_null_count] = cudf::detail::bitmask_and(
+            std::vector<cudf::bitmask_type const *>{
+                // Don't use values.null_mask(): the copied column's mask is guaranteed
+                // not to be sliced.
+                values_and_frequencies.front()->view().null_mask(),
+                reinterpret_cast<cudf::bitmask_type const *>(null_mask.data())},
+            std::vector<cudf::size_type>{0, 0}, values.size(), stream, mr);
+        values_and_frequencies.front()->set_null_mask(std::move(new_null_mask), new_null_count);
+      }
+
+      // Nulls will eventually be excluded from percentile computation.
+      // However, frequencies containing zeros (corresponding to these nulls) would crash the
+      // cudf MERGE_HISTOGRAM aggregation.
+      // Therefore, we manually set a frequency of `1` for the nulls.
+      thrust::for_each_n(
+          rmm::exec_policy(stream), thrust::make_counting_iterator(0), frequencies.size(),
+          [frequencies = values_and_frequencies.back()->mutable_view().begin<int64_t>(),
+           null_mask =
+               values_and_frequencies.front()->view().null_mask()] __device__(auto const idx) {
+            // If this is a null, set frequency 1.
+            if (!cudf::bit_is_set(null_mask, idx)) {
+              frequencies[idx] = int64_t{1};
+            }
+          });
+    }
+
+    return cudf::make_structs_column(values.size(), std::move(values_and_frequencies), 0,
+                                     rmm::device_buffer{}, stream, mr);
+  };
+
+  auto const make_lists_histograms = [&](cudf::size_type num_elements,
+                                         std::unique_ptr<cudf::column> &&structs_histogram) {
+    // Each output list will have size 1.
+    auto const sizes_itr = thrust::make_constant_iterator(1);
+    auto offsets = std::get<0>(
+        cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + num_elements, stream, mr));
+    return cudf::make_lists_column(num_elements, std::move(offsets), std::move(structs_histogram),
+                                   0, rmm::device_buffer{}, stream, mr);
+  };
+
+  if (output_as_lists) {
+    auto child = make_structs_histogram(rmm::device_buffer{}, 0);
+    auto lists_histograms = make_lists_histograms(values.size(), std::move(child));
+
+    if (!h_checks.back()) { // all frequencies are positive
+      return lists_histograms;
+    }
+
+    // The frequencies contain zeros.
+    // We generate a null mask in which the nulls correspond to the zero rows in frequencies.
+    // Then, apply it to the output lists column, empty out the null lists, and finally remove
+    // the null mask.
+    // By doing so, the input rows corresponding to zero frequencies will be output as empty lists.
+    auto [null_mask, null_count] = cudf::detail::valid_if(check_valid.begin(), check_valid.end(),
+                                                          thrust::identity{}, stream, default_mr);
+    lists_histograms->set_null_mask(std::move(null_mask), null_count);
+    lists_histograms = cudf::detail::purge_nonempty_nulls(lists_histograms->view(), stream, mr);
+    lists_histograms->set_null_mask(rmm::device_buffer{}, 0);
+    return lists_histograms;
+  } else { // output_as_lists == false
+    if (!h_checks.back()) { // all frequencies are positive
+      return make_structs_histogram(rmm::device_buffer{}, 0);
+    }
+
+    // We nullify the values corresponding to zero frequencies.
+    auto [null_mask, null_count] = cudf::detail::valid_if(check_valid.begin(), check_valid.end(),
+                                                          thrust::identity{}, stream, mr);
+    return make_structs_histogram(std::move(null_mask), null_count);
+  }
+}
+
+std::unique_ptr<cudf::column> percentile_from_histogram(cudf::column_view const &input,
+                                                        std::vector<double> const &percentages,
+                                                        bool output_as_list,
+                                                        rmm::cuda_stream_view stream,
+                                                        rmm::mr::device_memory_resource *mr) {
+  check_input(input, percentages);
+
+  auto const lcv_histograms = cudf::lists_column_view{input};
+  auto const histograms = lcv_histograms.get_sliced_child(stream);
+  auto const data_col = cudf::structs_column_view{histograms}.get_sliced_child(0);
+  auto const counts_col = cudf::structs_column_view{histograms}.get_sliced_child(1);
+
+  auto const default_mr = rmm::mr::get_current_device_resource();
+  auto const d_data = cudf::column_device_view::create(data_col, stream);
+  auto const d_percentages =
+      cudf::detail::make_device_uvector_sync(percentages, stream, default_mr);
+
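+  // For example, for the input [[{20, 1}, {10, 2}], [{7, 3}]], the histogram labels below will
+  // be [0, 0, 1]; after sorting values within each list, the accumulated counts will be
+  // [2, 3, 3].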
+  // Attach histogram labels to the input.
+  auto histogram_labels =
+      rmm::device_uvector<cudf::size_type>(histograms.size(), stream, default_mr);
+  cudf::detail::label_segments(lcv_histograms.offsets_begin(), lcv_histograms.offsets_end(),
+                               histogram_labels.begin(), histogram_labels.end(), stream);
+  auto const labels_cv = cudf::column_view{cudf::data_type{cudf::type_to_id<cudf::size_type>()},
+                                           static_cast<cudf::size_type>(histogram_labels.size()),
+                                           histogram_labels.data(), nullptr, 0};
+  auto const labeled_histograms = cudf::table_view{{labels_cv, histograms}};
+  // Find the order of the elements from a segmented sort within each histogram list.
+  // The null order must be `AFTER`.
+  auto const ordered_indices = cudf::detail::sorted_order(
+      labeled_histograms, std::vector<cudf::order>{cudf::order::ASCENDING, cudf::order::ASCENDING},
+      std::vector<cudf::null_order>{cudf::null_order::AFTER, cudf::null_order::AFTER}, stream,
+      default_mr);
+
+  auto const d_accumulated_counts = [&] {
+    auto const sorted_counts = thrust::make_permutation_iterator(
+        counts_col.begin<int64_t>(), ordered_indices->view().begin<cudf::size_type>());
+    auto accumulated_counts = rmm::device_uvector<int64_t>(counts_col.size(), stream, default_mr);
+    // We don't need a permutation iterator for the labels, since the same labels always
+    // stay together after sorting.
+    thrust::inclusive_scan_by_key(rmm::exec_policy(stream), histogram_labels.begin(),
+                                  histogram_labels.end(), sorted_counts,
+                                  accumulated_counts.begin());
+    return accumulated_counts;
+  }();
+
+  auto [percentiles, null_mask, null_count] = type_dispatcher(
+      data_col.type(), percentile_dispatcher{}, lcv_histograms.offsets_begin(),
+      ordered_indices->view().begin<cudf::size_type>(), *d_data, d_accumulated_counts,
+      d_percentages, data_col.has_nulls(), input.size(), stream, mr);
+
+  if (output_as_list) {
+    return wrap_in_list(std::move(percentiles), std::move(null_mask), null_count,
+                        lcv_histograms.size(), static_cast<cudf::size_type>(percentages.size()),
+                        stream, mr);
+  }
+  percentiles->set_null_mask(std::move(null_mask), null_count);
+  return std::move(percentiles);
+}
+
+} // namespace spark_rapids_jni
diff --git a/src/main/cpp/src/histogram.hpp b/src/main/cpp/src/histogram.hpp
new file mode 100644
index 0000000000..2625b8705a
--- /dev/null
+++ b/src/main/cpp/src/histogram.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+//
+#include <rmm/mr/per_device_resource.hpp>
+
+namespace spark_rapids_jni {
+
+/**
+ * @brief Check if the input columns are valid and create a histogram from them.
+ *
+ * Validity of the input columns is defined as follows:
+ *  - The values and frequencies columns must have the same size.
+ *  - The frequencies column must be of type INT64, must not have nulls, and must not contain
+ *    negative numbers.
+ *
+ * If the input columns are valid, a histogram will be created from them. Otherwise, an exception
+ * will be thrown.
+ *
+ * The output histogram is stored in a structs column in the form of `STRUCT<values, frequencies>`.
+ * If `output_as_lists == true`, each struct element is wrapped in a list, producing a
+ * lists-of-structs column.
+ *
+ * @param values The input values
+ * @param frequencies The frequencies corresponding to the input values
+ * @param output_as_lists Specify whether to wrap each pair of <value, frequency> in the output
+ *        histogram in a separate list
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return A histogram column with data copied from the input
+ */
+std::unique_ptr<cudf::column> create_histogram_if_valid(
+    cudf::column_view const &values, cudf::column_view const &frequencies, bool output_as_lists,
+    rmm::cuda_stream_view stream = cudf::get_default_stream(),
+    rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Compute percentiles from the given histograms and percentage values.
+ *
+ * The input histograms must be given in the form of `LIST<STRUCT<values, frequencies>>`.
+ *
+ * @param input The lists of input histograms
+ * @param percentages The input percentage values
+ * @param output_as_lists Specify whether the output percentiles will be wrapped in a list
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return A lists column, each list stores the percentile value(s) of the corresponding row in the
+ *         input column
+ */
+std::unique_ptr<cudf::column> percentile_from_histogram(
+    cudf::column_view const &input, std::vector<double> const &percentages, bool output_as_lists,
+    rmm::cuda_stream_view stream = cudf::get_default_stream(),
+    rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource());
+
+} // namespace spark_rapids_jni
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/Histogram.java b/src/main/java/com/nvidia/spark/rapids/jni/Histogram.java
new file mode 100644
index 0000000000..6606e340e6
--- /dev/null
+++ b/src/main/java/com/nvidia/spark/rapids/jni/Histogram.java
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids.jni;
+
+import ai.rapids.cudf.ColumnVector;
+import ai.rapids.cudf.ColumnView;
+import ai.rapids.cudf.NativeDepsLoader;
+
+public class Histogram {
+  static {
+    NativeDepsLoader.loadNativeDeps();
+  }
+
+  /**
+   * Create histograms from the given values and frequencies if the frequencies are valid.
+   * <p>
+   * The input is valid if it satisfies the following conditions:
+   *  - The values and frequencies columns must have the same size.
+   *  - The frequencies column must be of type INT64, must not have nulls, and must not contain
+   *    negative numbers.
+   * <p>
+   * If the input columns are valid, a histogram will be created from them. The histogram data is
+   * stored in a structs column in the form of `STRUCT<values, frequencies>`.
+   * If `outputAsLists == true`, each struct element is wrapped in a list, producing a
+   * lists-of-structs column.
+   *
+   * @param values The input values
+   * @param frequencies The frequencies corresponding to the input values
+   * @param outputAsLists Specify whether to wrap each pair of <value, frequency> in the output
+   *                      histogram in a separate list
+   * @return A histogram column with data copied from the input
+   */
+  public static ColumnVector createHistogramIfValid(ColumnView values, ColumnView frequencies,
+                                                    boolean outputAsLists) {
+    return new ColumnVector(createHistogramIfValid(values.getNativeView(),
+        frequencies.getNativeView(), outputAsLists));
+  }
+
+  /**
+   * Compute percentiles from the given histograms and percentage values.
+   * <p>
+   * The input histograms must be given in the format `LIST<STRUCT<values, frequencies>>`.
+   *
+   * @param input The lists of input histograms.
+   * @param percentages The input percentage values.
+   * @param outputAsLists Specify whether the output percentiles will be wrapped in a list.
+   * @return A lists column, each list stores the output percentile(s) computed for the
+   *         corresponding row in the input column.
+   */
+  public static ColumnVector percentileFromHistogram(ColumnView input, double[] percentages,
+                                                     boolean outputAsLists) {
+    return new ColumnVector(percentileFromHistogram(input.getNativeView(), percentages,
+        outputAsLists));
+  }
+
+  private static native long createHistogramIfValid(long valuesHandle, long frequenciesHandle,
+                                                    boolean outputAsLists);
+
+  private static native long percentileFromHistogram(long inputHandle, double[] percentages,
+                                                     boolean outputAsLists);
+}
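
For reference, a minimal usage sketch of the new Java API (the class name and column values below are hypothetical; in the Spark plugin, the per-row histograms produced by createHistogramIfValid are first merged with cudf's MERGE_HISTOGRAM aggregation before percentileFromHistogram is invoked):

    import ai.rapids.cudf.ColumnVector;
    import com.nvidia.spark.rapids.jni.Histogram;

    public class HistogramExample {
      public static void main(String[] args) {
        try (ColumnVector values = ColumnVector.fromInts(10, 20, 30);
             ColumnVector frequencies = ColumnVector.fromLongs(2, 1, 3);
             // One single-bucket histogram per input row, as LIST<STRUCT<values, frequencies>>.
             ColumnVector histograms = Histogram.createHistogramIfValid(values, frequencies, true);
             // One FLOAT64 percentile per histogram; 0.5 requests the median.
             ColumnVector medians =
                 Histogram.percentileFromHistogram(histograms, new double[]{0.5}, false)) {
          // Each histogram here holds a single bucket, so each median equals that bucket's value.
          assert medians.getRowCount() == 3;
        }
      }
    }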