Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement percentile computation from histogram data #1416

Merged
merged 36 commits into from
Oct 16, 2023
Merged
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
e5d18cc
Add files and empty functions
ttnghia Sep 15, 2023
c8257ea
Implementing histogram-to-percentile
ttnghia Sep 15, 2023
1e1b99e
Implement `percentile_from_histogram`
ttnghia Sep 16, 2023
17d7c9e
Add Java interface
ttnghia Sep 26, 2023
58969ea
Input `percentage` as GPU column
ttnghia Sep 26, 2023
ba1f543
Wrap in list if needed
ttnghia Sep 26, 2023
e958e51
Implementing groupby percentile
ttnghia Sep 27, 2023
fda35fe
Merge branch 'branch-23.10' into percentile
ttnghia Sep 29, 2023
9368413
Cleanup and docs
ttnghia Sep 29, 2023
a801a14
Fix null issue
ttnghia Sep 29, 2023
85e7465
Implement null check
ttnghia Sep 29, 2023
69c8df8
Fix typo
ttnghia Sep 30, 2023
981d1ca
Cleanup
ttnghia Sep 30, 2023
b54aa51
Cleanup
ttnghia Sep 30, 2023
43ead20
Change back to use the less error-prone segmented sort
ttnghia Sep 30, 2023
e762784
Fix all nulls cases
ttnghia Oct 1, 2023
1c5949a
Debugging
ttnghia Oct 1, 2023
b918a64
Fix null handling
ttnghia Oct 1, 2023
84390e1
Fix empty input handling
ttnghia Oct 1, 2023
16f71c7
Fix access indices
ttnghia Oct 1, 2023
3904468
Fix accumulated counts computation
ttnghia Oct 2, 2023
d24207b
WIP
ttnghia Oct 2, 2023
d8ddb4b
Implement input projection for values with frequencies
ttnghia Oct 3, 2023
6e00c5f
All tests passed!!!
ttnghia Oct 3, 2023
a8a5b5c
Cleanup and docs
ttnghia Oct 6, 2023
ef0026c
Merge branch 'branch-23.10' into percentile
ttnghia Oct 6, 2023
b1e6bfc
Rename files and class
ttnghia Oct 6, 2023
7466ddd
Change docs
ttnghia Oct 10, 2023
dc06cd4
Rename function, rewrite docs, and cleanup
ttnghia Oct 11, 2023
0b1ce8d
Rewrite comments
ttnghia Oct 11, 2023
96b08a0
Handling zero frequency
ttnghia Oct 13, 2023
8d5a8a8
Remove too detail comment
ttnghia Oct 13, 2023
d2e3fea
Update src/main/java/com/nvidia/spark/rapids/jni/Histogram.java
ttnghia Oct 13, 2023
683490a
Update src/main/java/com/nvidia/spark/rapids/jni/Histogram.java
ttnghia Oct 13, 2023
c9513b5
Merge branch 'branch-23.12' into percentile
ttnghia Oct 13, 2023
8c75439
Change error messages
ttnghia Oct 13, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/main/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ add_library(
src/DateTimeRebaseJni.cpp
src/DecimalUtilsJni.cpp
src/HashJni.cpp
src/HistogramJni.cpp
src/MapUtilsJni.cpp
src/NativeParquetJni.cpp
src/RowConversionJni.cpp
Expand All @@ -165,6 +166,7 @@ add_library(
src/cast_string_to_float.cu
src/datetime_rebase.cu
src/decimal_utils.cu
src/histogram.cu
src/map_utils.cu
src/murmur_hash.cu
src/row_conversion.cu
Expand Down
59 changes: 59 additions & 0 deletions src/main/cpp/src/HistogramJni.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "cudf_jni_apis.hpp"
#include "histogram.hpp"

extern "C" {

/**
 * JNI entry point for `Histogram.createHistogramsIfValid`.
 *
 * Interprets both handles as pointers to `cudf::column_view`, forwards them (together with
 * `output_as_lists`) to `spark_rapids_jni::create_histograms_if_valid`, and hands the resulting
 * object back to Java as a `jlong` handle.
 *
 * @param env                JNI environment, used for device selection and error reporting.
 * @param values_handle      Address of the `cudf::column_view` holding the values.
 * @param frequencies_handle Address of the `cudf::column_view` holding the frequencies.
 * @param output_as_lists    Forwarded unchanged to `create_histograms_if_valid`.
 * @return Handle to the released result; `0` is the fallback value given to the null-check and
 *         exception-translation macros.
 */
JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_Histogram_createHistogramsIfValid(
    JNIEnv *env, jclass, jlong values_handle, jlong frequencies_handle, jboolean output_as_lists) {
  JNI_NULL_CHECK(env, values_handle, "values_handle is null", 0);
  JNI_NULL_CHECK(env, frequencies_handle, "frequencies_handle is null", 0);

  try {
    cudf::jni::auto_set_device(env);

    // The Java side passes native addresses of column views; bind them as references.
    auto const &values_view = *reinterpret_cast<cudf::column_view const *>(values_handle);
    auto const &frequencies_view =
        *reinterpret_cast<cudf::column_view const *>(frequencies_handle);

    auto histograms = spark_rapids_jni::create_histograms_if_valid(values_view, frequencies_view,
                                                                   output_as_lists);
    // Give up ownership of the result so it survives past this call as a raw handle.
    return cudf::jni::ptr_as_jlong(histograms.release());
  }
  CATCH_STD(env, 0);
}

/**
 * JNI entry point for `Histogram.percentileFromHistogram`.
 *
 * Interprets `input_handle` as a pointer to `cudf::column_view`, copies the Java `double[]` of
 * percentages into a `std::vector<double>`, and forwards both (together with `output_as_lists`)
 * to `spark_rapids_jni::percentile_from_histogram`. The result is returned to Java as a `jlong`
 * handle.
 *
 * @param env             JNI environment, used for device selection, array access and error
 *                        reporting.
 * @param input_handle    Address of the `cudf::column_view` holding the input histograms.
 * @param jpercentages    Java array of percentage values.
 * @param output_as_lists Forwarded unchanged to `percentile_from_histogram`.
 * @return Handle to the released result; `0` is the fallback value given to the null-check and
 *         exception-translation macros.
 */
JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_Histogram_percentileFromHistogram(
    JNIEnv *env, jclass, jlong input_handle, jdoubleArray jpercentages, jboolean output_as_lists) {
  JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0);
  JNI_NULL_CHECK(env, jpercentages, "jpercentages is null", 0);

  try {
    cudf::jni::auto_set_device(env);

    auto const &histograms = *reinterpret_cast<cudf::column_view const *>(input_handle);

    // Copy the Java doubles into a host vector. The JNI array wrapper is confined to this inner
    // scope so that it goes out of scope before the percentile computation starts.
    std::vector<double> percentages;
    {
      cudf::jni::native_jdoubleArray const java_percentages(env, jpercentages);
      percentages.assign(java_percentages.begin(), java_percentages.end());
    }

    auto percentiles =
        spark_rapids_jni::percentile_from_histogram(histograms, percentages, output_as_lists);
    // Give up ownership of the result so it survives past this call as a raw handle.
    return cudf::jni::ptr_as_jlong(percentiles.release());
  }
  CATCH_STD(env, 0);
}

} // extern "C"
Loading