From 03c0f5a1ea332caac37f0d5b57b9838e80e691c1 Mon Sep 17 00:00:00 2001
From: Chong Gao
Date: Wed, 16 Oct 2024 10:01:15 +0800
Subject: [PATCH 01/12] Add HLL++ evaluation function

---
 src/main/cpp/CMakeLists.txt                   |   2 +
 src/main/cpp/src/HLLPP.cu                     | 102 ++++++++++++++++++
 src/main/cpp/src/HLLPP.hpp                    |  32 ++++++
 src/main/cpp/src/HLLPPJni.cpp                 |  34 ++++++
 .../com/nvidia/spark/rapids/jni/HLLPP.java    |  45 ++++++++
 .../nvidia/spark/rapids/jni/HLLPPTest.java    |  37 +++++++
 6 files changed, 252 insertions(+)
 create mode 100644 src/main/cpp/src/HLLPP.cu
 create mode 100644 src/main/cpp/src/HLLPP.hpp
 create mode 100644 src/main/cpp/src/HLLPPJni.cpp
 create mode 100644 src/main/java/com/nvidia/spark/rapids/jni/HLLPP.java
 create mode 100644 src/test/java/com/nvidia/spark/rapids/jni/HLLPPTest.java

diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt
index bfb9d55377..997e7bda15 100644
--- a/src/main/cpp/CMakeLists.txt
+++ b/src/main/cpp/CMakeLists.txt
@@ -196,6 +196,7 @@ add_library(
   src/HashJni.cpp
   src/HistogramJni.cpp
   src/HostTableJni.cpp
+  src/HLLPPJni.cpp
   src/JSONUtilsJni.cpp
   src/NativeParquetJni.cpp
   src/ParseURIJni.cpp
@@ -204,6 +205,7 @@ add_library(
   src/SparkResourceAdaptorJni.cpp
   src/SubStringIndexJni.cpp
   src/ZOrderJni.cpp
+  src/HLLPP.cu
   src/bloom_filter.cu
   src/case_when.cu
   src/cast_decimal_to_string.cu
diff --git a/src/main/cpp/src/HLLPP.cu b/src/main/cpp/src/HLLPP.cu
new file mode 100644
index 0000000000..439b9e1706
--- /dev/null
+++ b/src/main/cpp/src/HLLPP.cu
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "HLLPP.hpp"
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/detail/iterator.cuh>
+#include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <cuco/detail/hyperloglog/finalizer.cuh>
+#include <thrust/for_each.h>
+#include <thrust/iterator/counting_iterator.h>
+
+namespace spark_rapids_jni {
+
+namespace {
+
+// The number of bits required by a register value. A register value stores the number of leading
+// zeros. An XXHash64 value is 64 bits, so 6 bits are enough to store a register value.
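+// An illustration of the packing implied by `REGISTERS_PER_LONG` below: one 64-bit long
+// packs 10 six-bit registers (60 bits used, top 4 bits unused), so register i occupies
+// bits [6*i, 6*i + 6). e.g., if the low 12 bits of a long are 000111-000011, register 0
+// is 000011 = 3 and register 1 is (long >> 6) & 0b111111 = 000111 = 7.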
+constexpr int REGISTER_VALUE_BITS = 6;
+
+// MASK binary 6 bits: 111111
+constexpr uint64_t MASK = (1L << REGISTER_VALUE_BITS) - 1L;
+
+// One long stores 10 register values
+constexpr int REGISTERS_PER_LONG = 64 / REGISTER_VALUE_BITS;
+
+__device__ inline int get_register_value(int64_t const long_10_registers, int reg_idx)
+{
+  int64_t shift_mask = MASK << (REGISTER_VALUE_BITS * reg_idx);
+  int64_t v = (long_10_registers & shift_mask) >> (REGISTER_VALUE_BITS * reg_idx);
+  return static_cast<int>(v);
+}
+
+struct estimate_fn {
+  cudf::device_span<int64_t const* const> sketch_longs;
+  int const precision;
+  int64_t* const out;
+
+  __device__ void operator()(cudf::size_type const idx) const
+  {
+    auto const num_regs = 1ull << precision;
+    double sum = 0;
+    int zeroes = 0;
+
+    for (auto reg_idx = 0; reg_idx < num_regs; ++reg_idx) {
+      // each long contains 10 register values
+      int long_col_idx    = reg_idx / REGISTERS_PER_LONG;
+      int reg_idx_in_long = reg_idx % REGISTERS_PER_LONG;
+      int reg = get_register_value(sketch_longs[long_col_idx][idx], reg_idx_in_long);
+      sum += double{1} / static_cast<double>(1ull << reg);
+      zeroes += reg == 0;
+    }
+
+    auto const finalize = cuco::hyperloglog_ns::detail::finalizer(precision);
+    out[idx] = finalize(sum, zeroes);
+  }
+};
+
+}  // end anonymous namespace
+
+std::unique_ptr<cudf::column> estimate_from_hll_sketches(cudf::column_view const& input,
+                                                         int precision,
+                                                         rmm::cuda_stream_view stream,
+                                                         rmm::device_async_resource_ref mr)
+{
+  CUDF_EXPECTS(precision >= 4 && precision <= 18, "HLL++ requires precision in range: [4, 18]");
+  auto const input_iter = cudf::detail::make_counting_transform_iterator(
+    0, [&](int i) { return input.child(i).begin<int64_t>(); });
+  auto input_cols = std::vector<int64_t const*>(input_iter, input_iter + input.num_children());
+  auto d_inputs   = cudf::detail::make_device_uvector_async(input_cols, stream, mr);
+  auto result     = cudf::make_numeric_column(
+    cudf::data_type{cudf::type_id::INT64}, input.size(), cudf::mask_state::ALL_VALID, stream);
+  // evaluate from struct
+  thrust::for_each_n(rmm::exec_policy(stream),
+                     thrust::make_counting_iterator(0),
+                     input.size(),
+                     estimate_fn{d_inputs, precision, result->mutable_view().data<int64_t>()});
+  return result;
+}
+
+}  // namespace spark_rapids_jni
diff --git a/src/main/cpp/src/HLLPP.hpp b/src/main/cpp/src/HLLPP.hpp
new file mode 100644
index 0000000000..69e0b237e5
--- /dev/null
+++ b/src/main/cpp/src/HLLPP.hpp
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
+
+namespace spark_rapids_jni {
+
+std::unique_ptr<cudf::column> estimate_from_hll_sketches(
+  cudf::column_view const& input,
+  int precision,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+}  // namespace spark_rapids_jni
diff --git a/src/main/cpp/src/HLLPPJni.cpp b/src/main/cpp/src/HLLPPJni.cpp
new file mode 100644
index 0000000000..581af90a90
--- /dev/null
+++ b/src/main/cpp/src/HLLPPJni.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "HLLPP.hpp"
+#include "cudf_jni_apis.hpp"
+
+extern "C" {
+
+JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_HLLPP_estimateDistinctValueFromSketches(
+  JNIEnv* env, jclass, jlong sketches, jint precision)
+{
+  JNI_NULL_CHECK(env, sketches, "Sketch column is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto const sketch_view = reinterpret_cast<cudf::column_view const*>(sketches);
+    return cudf::jni::ptr_as_jlong(
+      spark_rapids_jni::estimate_from_hll_sketches(*sketch_view, precision).release());
+  }
+  CATCH_STD(env, 0);
+}
+}
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/HLLPP.java b/src/main/java/com/nvidia/spark/rapids/jni/HLLPP.java
new file mode 100644
index 0000000000..1be2c80512
--- /dev/null
+++ b/src/main/java/com/nvidia/spark/rapids/jni/HLLPP.java
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids.jni;
+
+import ai.rapids.cudf.ColumnVector;
+import ai.rapids.cudf.ColumnView;
+import ai.rapids.cudf.NativeDepsLoader;
+
+/**
+ * HyperLogLogPlusPlus
+ */
+public class HLLPP {
+  static {
+    NativeDepsLoader.loadNativeDeps();
+  }
+
+  /**
+   * Compute the approximate count distinct value from sketch values.
+   *
+   * <p>
+   * The input sketch values must be given in the format `LIST<INT8>`.
+   *
+   * @param input The sketch column which contains `LIST<INT8>` values.
+   * @param precision The number of bits for addressing.
+   * @return An INT64 column where each value is the approximate count distinct value.
+   */
+  public static ColumnVector estimateDistinctValueFromSketches(ColumnView input, int precision) {
+    return new ColumnVector(estimateDistinctValueFromSketches(input.getNativeView(), precision));
+  }
+
+  private static native long estimateDistinctValueFromSketches(long inputHandle, int precision);
+}
diff --git a/src/test/java/com/nvidia/spark/rapids/jni/HLLPPTest.java b/src/test/java/com/nvidia/spark/rapids/jni/HLLPPTest.java
new file mode 100644
index 0000000000..c14b565313
--- /dev/null
+++ b/src/test/java/com/nvidia/spark/rapids/jni/HLLPPTest.java
@@ -0,0 +1,37 @@
+/*
+* Copyright (c) 2024, NVIDIA CORPORATION.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package com.nvidia.spark.rapids.jni;
+
+import ai.rapids.cudf.GroupByAggregation;
+import ai.rapids.cudf.Table;
+
+import org.junit.jupiter.api.Test;
+
+
+public class HLLPPTest {
+
+  @Test
+  void testGroupByHLL() {
+    // A trivial test:
+    try (Table input = new Table.TestBuilder().column(1, 2, 3, 1, 2, 2, 1, 3, 3, 2)
+        .column(0, 1, -2, 3, -4, -5, -6, 7, -8, 9)
+        .build()){
+      input.groupBy(0).aggregate(GroupByAggregation.HLLPP(0)
+          .onColumn(1));
+    }
+  }
+}

From df8b223a6391dbd82c85bb2005b0d426a14ca304 Mon Sep 17 00:00:00 2001
From: Chong Gao
Date: Mon, 4 Nov 2024 10:18:19 +0800
Subject: [PATCH 02/12] Update function comments

---
 src/main/java/com/nvidia/spark/rapids/jni/HLLPP.java | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/main/java/com/nvidia/spark/rapids/jni/HLLPP.java b/src/main/java/com/nvidia/spark/rapids/jni/HLLPP.java
index 1be2c80512..9e51761f4a 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/HLLPP.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/HLLPP.java
@@ -31,9 +31,12 @@ public class HLLPP {
   /**
    * Compute the approximate count distinct value from sketch values.
    *
   * <p>
-   * The input sketch values must be given in the format `LIST<INT8>`.
+   * The input sketch values must be given in the format `Struct<INT64, INT64, ...>`.
+   * The number of children is: num_registers_per_sketch / 10 + 1, where 10 means one
+   * INT64 contains at most 10 registers and each register value takes 6 bits. The input
+   * is columnar data, e.g.: sketch 0 is composed of the data at index 0 of all the children.
   *
-   * @param input The sketch column which contains `LIST<INT8>` values.
+   * @param input The sketch column which contains `Struct<INT64, INT64, ...>` values.
   * @param precision The number of bits for addressing.
   * @return An INT64 column where each value is the approximate count distinct value.
   */

From 2daca3f536a847d25de7edc6555bd824d704df2f Mon Sep 17 00:00:00 2001
From: Chong Gao
Date: Tue, 19 Nov 2024 17:18:01 +0800
Subject: [PATCH 03/12] Fix

---
 src/main/cpp/src/HLLPP.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main/cpp/src/HLLPP.cu b/src/main/cpp/src/HLLPP.cu
index 439b9e1706..ca35e77861 100644
--- a/src/main/cpp/src/HLLPP.cu
+++ b/src/main/cpp/src/HLLPP.cu
@@ -84,7 +84,7 @@ std::unique_ptr<cudf::column> estimate_from_hll_sketches(cudf::column_view const
                                                          rmm::cuda_stream_view stream,
                                                          rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(precision >= 4 && precision <= 18, "HLL++ requires precision in range: [4, 18]");
+  CUDF_EXPECTS(precision >= 4 , "HyperLogLogPlusPlus requires precision >= 4.");
   auto const input_iter = cudf::detail::make_counting_transform_iterator(
     0, [&](int i) { return input.child(i).begin<int64_t>(); });
   auto input_cols = std::vector<int64_t const*>(input_iter, input_iter + input.num_children());

From 3afdfdef7ac93cda55267994f9865296e061c25c Mon Sep 17 00:00:00 2001
From: Chong Gao
Date: Tue, 26 Nov 2024 15:43:44 +0800
Subject: [PATCH 04/12] Use exec_policy_nosync instead of exec_policy

---
 src/main/cpp/compile_commands.json | 1 +
 src/main/cpp/src/HLLPP.cu          | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)
 create mode 120000 src/main/cpp/compile_commands.json

diff --git a/src/main/cpp/compile_commands.json b/src/main/cpp/compile_commands.json
new file mode 120000
index 0000000000..921c8b97d1
--- /dev/null
+++ b/src/main/cpp/compile_commands.json
@@ -0,0 +1 @@
+/home/chongg/code/spark-rapids-jni/target/jni/cmake-build/compile_commands.json
\ No newline at end of file
diff --git a/src/main/cpp/src/HLLPP.cu b/src/main/cpp/src/HLLPP.cu
index ca35e77861..939d8fe2e0 100644
--- a/src/main/cpp/src/HLLPP.cu
+++ b/src/main/cpp/src/HLLPP.cu
@@ -92,7 +92,7 @@ std::unique_ptr<cudf::column> estimate_from_hll_sketches(cudf::column_view const
   auto result = cudf::make_numeric_column(
     cudf::data_type{cudf::type_id::INT64}, input.size(), cudf::mask_state::ALL_VALID, stream);
   // evaluate from struct
-  thrust::for_each_n(rmm::exec_policy(stream),
+  thrust::for_each_n(rmm::exec_policy_nosync(stream),
                      thrust::make_counting_iterator(0),
                      input.size(),
                      estimate_fn{d_inputs, precision, result->mutable_view().data<int64_t>()});

From 956af394dba6d784efa36a1e5ccc943ed53eea2c Mon Sep 17 00:00:00 2001
From: Chong Gao
Date: Tue, 26 Nov 2024 15:48:51 +0800
Subject: [PATCH 05/12] Format code; Remove a useless file

Signed-off-by: Chong Gao
---
 src/main/cpp/compile_commands.json | 1 -
 src/main/cpp/src/HLLPP.cu          | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)
 delete mode 120000 src/main/cpp/compile_commands.json

diff --git a/src/main/cpp/compile_commands.json b/src/main/cpp/compile_commands.json
deleted file mode 120000
index 921c8b97d1..0000000000
--- a/src/main/cpp/compile_commands.json
+++ /dev/null
@@ -1 +0,0 @@
-/home/chongg/code/spark-rapids-jni/target/jni/cmake-build/compile_commands.json
\ No newline at end of file
diff --git a/src/main/cpp/src/HLLPP.cu b/src/main/cpp/src/HLLPP.cu
index 939d8fe2e0..d2d9493cf7 100644
--- a/src/main/cpp/src/HLLPP.cu
+++ b/src/main/cpp/src/HLLPP.cu
@@ -84,7 +84,7 @@ std::unique_ptr<cudf::column> estimate_from_hll_sketches(cudf::column_view const
                                                          rmm::cuda_stream_view stream,
                                                          rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(precision >= 4 , "HyperLogLogPlusPlus requires precision >= 4.");
+  CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision >= 4.");
   auto const input_iter = cudf::detail::make_counting_transform_iterator(
     0, [&](int i) { return input.child(i).begin<int64_t>(); });
   auto input_cols = std::vector<int64_t const*>(input_iter, input_iter + input.num_children());

From 5bfb54426a4cb137f1cce70d843681167c5f929b Mon Sep 17 00:00:00 2001
From: Chong Gao
Date: Sun, 15 Dec 2024 16:31:42 +0800
Subject: [PATCH 06/12] Use UDF

---
 src/main/cpp/CMakeLists.txt                   |   4 +-
 src/main/cpp/src/HLLPPHostUDFJni.cpp          |  66 ++
 src/main/cpp/src/hllpp.cu                     | 969 ++++++++++++++++++
 src/main/cpp/src/hllpp.hpp                    | 100 ++
 src/main/cpp/src/hllpp_host_udf.cu            | 183 ++++
 src/main/cpp/src/hllpp_host_udf.hpp           |  35 +
 .../nvidia/spark/rapids/jni/HLLPPHostUDF.java | 105 ++
 7 files changed, 1461 insertions(+), 1 deletion(-)
 create mode 100644 src/main/cpp/src/HLLPPHostUDFJni.cpp
 create mode 100644 src/main/cpp/src/hllpp.cu
 create mode 100644 src/main/cpp/src/hllpp.hpp
 create mode 100644 src/main/cpp/src/hllpp_host_udf.cu
 create mode 100644 src/main/cpp/src/hllpp_host_udf.hpp
 create mode 100644 src/main/java/com/nvidia/spark/rapids/jni/HLLPPHostUDF.java

diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt
index 8872303e73..b8b5f3a139 100644
--- a/src/main/cpp/CMakeLists.txt
+++ b/src/main/cpp/CMakeLists.txt
@@ -193,6 +193,7 @@ add_library(
   src/DateTimeRebaseJni.cpp
   src/DecimalUtilsJni.cpp
   src/GpuTimeZoneDBJni.cpp
+  src/HLLPPHostUDFJni.cpp
   src/HashJni.cpp
   src/HistogramJni.cpp
   src/HostTableJni.cpp
@@ -205,7 +206,6 @@ add_library(
   src/SparkResourceAdaptorJni.cpp
   src/SubStringIndexJni.cpp
   src/ZOrderJni.cpp
-  src/HLLPP.cu
   src/bloom_filter.cu
   src/case_when.cu
   src/cast_decimal_to_string.cu
@@ -219,6 +219,8 @@ add_library(
   src/from_json_to_structs.cu
   src/get_json_object.cu
   src/histogram.cu
+  src/hllpp_host_udf.cu
+  src/hllpp.cu
   src/json_utils.cu
   src/murmur_hash.cu
   src/parse_uri.cu
diff --git a/src/main/cpp/src/HLLPPHostUDFJni.cpp b/src/main/cpp/src/HLLPPHostUDFJni.cpp
new file mode 100644
index 0000000000..3132d088ac
--- /dev/null
+++ b/src/main/cpp/src/HLLPPHostUDFJni.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "cudf_jni_apis.hpp"
+#include "hllpp.hpp"
+#include "hllpp_host_udf.hpp"
+
+extern "C" {
+
+JNIEXPORT jlong JNICALL
+Java_com_nvidia_spark_rapids_jni_HLLPPHostUDF_createHLLPPHostUDF(
+    JNIEnv *env, jclass, jint agg_type, jint precision) {
+  try {
+    cudf::jni::auto_set_device(env);
+    auto udf_ptr = [&] {
+      // The value of agg_type must be in sync with
+      // `HLLPPHostUDF.java#AggregationType`.
+      switch (agg_type) {
+      case 0:
+        return spark_rapids_jni::create_hllpp_reduction_host_udf(precision);
+      case 1:
+        return spark_rapids_jni::create_hllpp_reduction_merge_host_udf(
+            precision);
+      case 2:
+        return spark_rapids_jni::create_hllpp_groupby_host_udf(precision);
+      default:
+        return spark_rapids_jni::create_hllpp_groupby_merge_host_udf(precision);
+      }
+    }();
+    CUDF_EXPECTS(udf_ptr != nullptr,
+                 "Invalid HyperLogLogPlusPlus(HLLPP) UDF instance.");
+
+    return reinterpret_cast<jlong>(udf_ptr.release());
+  }
+  CATCH_STD(env, 0);
+}
+
+JNIEXPORT jlong JNICALL
+Java_com_nvidia_spark_rapids_jni_HLLPPHostUDF_estimateDistinctValueFromSketches(
+    JNIEnv *env, jclass, jlong sketches, jint precision) {
+  JNI_NULL_CHECK(env, sketches, "Sketch column is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto const sketch_view =
+        reinterpret_cast<cudf::column_view const *>(sketches);
+    return cudf::jni::ptr_as_jlong(
+        spark_rapids_jni::estimate_from_hll_sketches(*sketch_view, precision)
+            .release());
+  }
+  CATCH_STD(env, 0);
+}
+
+} // extern "C"
diff --git a/src/main/cpp/src/hllpp.cu b/src/main/cpp/src/hllpp.cu
new file mode 100644
index 0000000000..08f452ad76
--- /dev/null
+++ b/src/main/cpp/src/hllpp.cu
@@ -0,0 +1,969 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "hash.hpp"
+#include "hllpp.hpp"
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/detail/iterator.cuh>
+#include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/detail/utilities/integer_utils.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/structs/structs_column_view.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <cuda/atomic>
+#include <cuda/std/bit>
+#include <cuco/detail/hyperloglog/finalizer.cuh> // TODO #include <cuco/hyperloglog.cuh> once available
+#include <cuda/std/limits>
+#include <thrust/for_each.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/tabulate.h>
+#include <thrust/transform.h>
+
+namespace spark_rapids_jni {
+
+namespace {
+
+/**
+ * @brief Get register value from a long which contains 10 register values;
+ * each register value in the long takes 6 bits.
+ */
+__device__ inline int get_register_value(int64_t const ten_registers,
+                                         int reg_idx) {
+  int64_t shift_mask = MASK << (REGISTER_VALUE_BITS * reg_idx);
+  int64_t v = (ten_registers & shift_mask) >> (REGISTER_VALUE_BITS * reg_idx);
+  return static_cast<int>(v);
+}
+
+/**
+ * @brief Computes HLLPP sketches(register values) from hash values and
+ * partially merges the sketches.
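+ * (Per hash: reg_idx = hash >> (64 - precision); the register value is
+ * countl_zero((hash << precision) | w_padding) + 1, i.e. the number of leading
+ * zeros after the index bits, plus one. Merging sketches takes the
+ * per-register max, as the kernel below does.)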
+ *
+ * `reduce_by_key` uses an intermediate cache of num_rows_input items:
+ * https://github.com/NVIDIA/thrust/blob/2.1.0/thrust/system/detail/generic/reduce_by_key.inl#L112
+ *
+ *   // scan the values by flag
+ *   thrust::detail::temporary_array<ValueType, ExecutionPolicy>
+ *     scanned_values(exec, n);
+ *
+ * Each sketch contains multiple integers, by default 512 integers (precision is
+ * 9); num_rows_input * 512 is huge, so this function uses a different approach
+ * to use less intermediate cache. The new approach uses 2-phase merges: partial
+ * merge and final merge.
+ *
+ * This function splits the input into multiple segments, each segment having
+ * num_hashs_per_thread items. The input is sorted by group labels, so each
+ * segment contains one or more consecutive groups. Each thread handles one
+ * segment with num_hashs_per_thread items in it:
+ * - Scan all the items in the segment, update the max value.
+ * - Output max value into registers_output_cache for the previous group when
+ * meets a new group.
+ * - Output max value into registers_thread_cache when reach the last item in
+ * the segment.
+ *
+ * In this way, we can save memory usage, caching fewer intermediate sketches:
+ * (num_hashs / num_hashs_per_thread) sketches.
+ * num_threads = div_round_up(num_hashs, num_hashs_per_thread).
+ *
+ * e.g.: num_registers_per_sketch = 512 and num_hashs_per_thread = 4;
+ *
+ * Input is hashes; compute and get pairs: register index -> register value
+ *
+ * reg_index, reg_value, group_label
+ * [
+ * ---------- segment 0 begin --------------------------
+ * (0, 1), g0
+ * (0, 2), g0
+ * // meets new group g1, save result for group g0 into registers_output_cache
+ * (1, 1), g1
+ * // outputs result at segment end for this thread to registers_thread_cache
+ * (1, 9), g1
+ * ---------- segment 1 begin --------------------------
+ * (1, 1), g1
+ * (1, 1), g1
+ * (1, 5), g1
+ * // outputs result at segment end for this thread to registers_thread_cache
+ * (1, 1), g1
+ * ---------- segment 2 begin --------------------------
+ * (1, 1), g1
+ * (1, 1), g1
+ * (1, 8), g1
+ * // outputs result at segment end for this thread to registers_thread_cache
+ * // assumes meets new group when at the end, save to registers_output_cache
+ * (1, 1), g1
+ * ]
+ * Output e.g.:
+ *
+ * group_lables_thread_cache:
+ * [
+ * g1
+ * g1
+ * g1
+ * ]
+ * Has num_threads rows.
+ *
+ * registers_thread_cache:
+ * [
+ * 512 values: [0, 9, 0, ... ] // register values for group 1
+ * 512 values: [0, 5, 0, ... ] // register values for group 1
+ * 512 values: [0, 8, 0, ... ] // register values for group 1
+ * ]
+ * Has num_threads rows, each row is corresponding to
+ * `group_lables_thread_cache`
+ *
+ * registers_output_cache:
+ * [
+ * 512 values: [2, 0, 0, ... ] // register values for group 0
+ * 512 values: [0, 8, 0, ... ] // register values for group 1
+ * ]
+ * Has num_groups rows.
+ *
+ * The next kernel will merge registers_output_cache and
+ * registers_thread_cache to get the final result.
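+ *
+ * The intermediate cache is therefore (num_hashs / num_hashs_per_thread) *
+ * num_registers_per_sketch ints, instead of num_hashs *
+ * num_registers_per_sketch ints for a direct `reduce_by_key`.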
+ */
+template <int num_hashs_per_thread>
+CUDF_KERNEL void partial_group_sketches_from_hashs_kernel(
+    cudf::column_device_view hashs,
+    cudf::device_span<cudf::size_type const> group_lables,
+    int64_t const precision, // num of bits for register addressing, e.g.: 9
+    int *const
+        registers_output_cache, // num is num_groups * num_registers_per_sketch
+    int *const
+        registers_thread_cache, // num is num_threads * num_registers_per_sketch
+    cudf::size_type *const
+        group_lables_thread_cache // save the group labels for each thread
+) {
+  auto const tid = cudf::detail::grid_1d::global_thread_id();
+  int64_t const num_hashs = hashs.size();
+  if (tid * num_hashs_per_thread >= hashs.size()) {
+    return;
+  }
+
+  // 2^precision = num_registers_per_sketch
+  int64_t num_registers_per_sketch = 1L << precision;
+  // e.g.: integer in binary: 1 0000 0000
+  uint64_t const w_padding = 1ULL << (precision - 1);
+  // e.g.: 64 - 9 = 55
+  int const idx_shift = 64 - precision;
+
+  auto const hash_first = tid * num_hashs_per_thread;
+  auto const hash_end =
+      cuda::std::min((tid + 1) * num_hashs_per_thread, num_hashs);
+
+  // init sketches for each thread
+  int *const sketch_ptr =
+      registers_thread_cache + tid * num_registers_per_sketch;
+  for (auto i = 0; i < num_registers_per_sketch; i++) {
+    sketch_ptr[i] = 0;
+  }
+
+  cudf::size_type prev_group = group_lables[hash_first];
+  for (auto hash_idx = hash_first; hash_idx < hash_end; hash_idx++) {
+    cudf::size_type curr_group = group_lables[hash_idx];
+
+    // cast to unsigned, then >> will shift without preserving the sign bit.
+    uint64_t const hash =
+        static_cast<uint64_t>(hashs.element<int64_t>(hash_idx));
+    auto const reg_idx = hash >> idx_shift;
+    int const reg_v = static_cast<int>(
+        cuda::std::countl_zero((hash << precision) | w_padding) + 1ULL);
+
+    if (curr_group == prev_group) {
+      // still in the same group, update the max value
+      if (reg_v > sketch_ptr[reg_idx]) {
+        sketch_ptr[reg_idx] = reg_v;
+      }
+    } else {
+      // meets new group, save output for the previous group and reset
+      for (auto i = 0; i < num_registers_per_sketch; i++) {
+        registers_output_cache[prev_group * num_registers_per_sketch + i] =
+            sketch_ptr[i];
+        sketch_ptr[i] = 0;
+      }
+      // save the result for current group
+      sketch_ptr[reg_idx] = reg_v;
+    }
+
+    if (hash_idx == hash_end - 1) {
+      // meets the last hash in the segment
+      if (hash_idx == num_hashs - 1) {
+        // meets the last segment, special logic: assume meets new group
+        for (auto i = 0; i < num_registers_per_sketch; i++) {
+          registers_output_cache[curr_group * num_registers_per_sketch + i] =
+              sketch_ptr[i];
+        }
+      } else {
+        // not the last segment, probe one item forward.
+        if (curr_group != group_lables[hash_idx + 1]) {
+          // meets a new group by checking the next item in the next segment
+          for (auto i = 0; i < num_registers_per_sketch; i++) {
+            registers_output_cache[curr_group * num_registers_per_sketch + i] =
+                sketch_ptr[i];
+          }
+        }
+      }
+    }
+
+    prev_group = curr_group;
+  }
+
+  // save the group label for this thread
+  group_lables_thread_cache[tid] = group_lables[hash_end - 1];
+}
+
+/*
+ * @brief Merges registers_thread_cache into registers_output_cache; both of
+ * them are produced by the above kernel. Merges sketches vertically.
+ *
+ * For each register index, starts a thread to merge registers in
+ * registers_thread_cache to registers_output_cache. num_threads =
+ * num_registers_per_sketch.
+ *
+ * Input e.g.:
+ *
+ * group_lables_thread_cache:
+ * [
+ * g0
+ * g0
+ * g1
+ * ...
+ * gN
+ * ]
+ * Has num_threads rows.
+ *
+ * registers_thread_cache:
+ * [
+ * r0_g0, r1_g0, r2_g0, r3_g0, ... , r511_g0 // register values for group 0
+ * r0_g0, r1_g0, r2_g0, r3_g0, ... , r511_g0 // register values for group 0
+ * r0_g1, r1_g1, r2_g1, r3_g1, ... , r511_g1 // register values for group 1
+ * ...
+ * r0_gN, r1_gN, r2_gN, r3_gN, ... , r511_gN // register values for group N
+ * ]
+ * Has num_threads rows, each row is corresponding to
+ * `group_lables_thread_cache`
+ *
+ * registers_output_cache:
+ * [
+ * r0_g0, r1_g0, r2_g0, r3_g0, ... , r511_g0 // register values for group 0
+ * r0_g1, r1_g1, r2_g1, r3_g1, ... , r511_g1 // register values for group 1
+ * ...
+ * r0_gN, r1_gN, r2_gN, r3_gN, ... , r511_gN // register values for group N
+ * ]
+ * registers_output_cache has num_groups rows.
+ *
+ * For each thread, scan from the first register to the last register, find the
+ * max value in the same group, and then update registers_output_cache
+ */
+template <int block_size>
+CUDF_KERNEL void merge_sketches_vertically(
+    int64_t num_sketches, int64_t num_registers_per_sketch,
+    int *const registers_output_cache, int const *const registers_thread_cache,
+    cudf::size_type const *const group_lables_thread_cache) {
+  __shared__ int8_t shared_data[block_size];
+  auto const tid = cudf::detail::grid_1d::global_thread_id();
+  int shared_idx = tid % block_size;
+
+  // register idx is tid
+  shared_data[shared_idx] = static_cast<int8_t>(0);
+  int prev_group = group_lables_thread_cache[0];
+  for (auto i = 0; i < num_sketches; i++) {
+    int curr_group = group_lables_thread_cache[i];
+    int8_t curr_reg_v = static_cast<int8_t>(
+        registers_thread_cache[i * num_registers_per_sketch + tid]);
+    if (curr_group == prev_group) {
+      if (curr_reg_v > shared_data[shared_idx]) {
+        shared_data[shared_idx] = curr_reg_v;
+      }
+    } else {
+      // meets a new group, store the result for previous group
+      int64_t result_reg_idx = prev_group * num_registers_per_sketch + tid;
+      int result_curr_reg_v = registers_output_cache[result_reg_idx];
+      if (shared_data[shared_idx] > result_curr_reg_v) {
+        registers_output_cache[result_reg_idx] = shared_data[shared_idx];
+      }
+
+      shared_data[shared_idx] = curr_reg_v;
+    }
+    prev_group = curr_group;
+  }
+
+  // handles the last register in this thread
+  int64_t reg_idx = prev_group * num_registers_per_sketch + tid;
+  int curr_reg_v = registers_output_cache[reg_idx];
+  if (shared_data[shared_idx] > curr_reg_v) {
+    registers_output_cache[reg_idx] = shared_data[shared_idx];
+  }
+}
+
+/**
+ * @brief Compacts register values: packs 10 register values
+ * (each register value is 6 bits) into one long.
+ * This is consistent with Spark.
+ * Output: long columns which will be composed into a struct column
+ *
+ * Number of threads is num_groups * num_long_cols.
+ *
+ * e.g., num_registers_per_sketch is 512 (precision is 9):
+ * Input:
+ * registers_output_cache:
+ * [
+ * r0_g0, r1_g0, r2_g0, r3_g0, ... , r511_g0 // register values for group 0
+ * r0_g1, r1_g1, r2_g1, r3_g1, ... , r511_g1 // register values for group 1
+ * ...
+ * r0_gN, r1_gN, r2_gN, r3_gN, ... , r511_gN // register values for group N
+ * ]
+ * Has num_groups rows.
+ *
+ * Output:
+ * 52 long columns
+ *
+ * e.g.: r0 to r9 integers are all: 00000000-00000000-00000000-00100001, trailing
+ * 6 bits: 100001. Compacted to one long this is:
+ * 100001-100001-100001-100001-100001-100001-100001-100001-100001-100001
+ */
+CUDF_KERNEL void
+compact_kernel(int64_t const num_groups, int64_t const num_registers_per_sketch,
+               cudf::device_span<int64_t *> sketches_output,
+               // num_groups * num_registers_per_sketch integers
+               cudf::device_span<int const> registers_output_cache) {
+  int64_t const tid = cudf::detail::grid_1d::global_thread_id();
+  int64_t const num_long_cols =
+      num_registers_per_sketch / REGISTERS_PER_LONG + 1;
+  if (tid >= num_groups * num_long_cols) {
+    return;
+  }
+
+  int64_t const group_idx = tid / num_long_cols;
+  int64_t const long_idx = tid % num_long_cols;
+
+  int64_t const reg_begin_idx =
+      group_idx * num_registers_per_sketch + long_idx * REGISTERS_PER_LONG;
+  int64_t num_regs = REGISTERS_PER_LONG;
+  if (long_idx == num_long_cols - 1) {
+    num_regs = num_registers_per_sketch % REGISTERS_PER_LONG;
+  }
+
+  int64_t ten_registers = 0;
+  for (auto i = 0; i < num_regs; i++) {
+    int64_t reg_v = registers_output_cache[reg_begin_idx + i];
+    int64_t tmp = reg_v << (REGISTER_VALUE_BITS * i);
+    ten_registers |= tmp;
+  }
+
+  sketches_output[long_idx][group_idx] = ten_registers;
+}
+
+std::unique_ptr<cudf::column>
+group_hllpp(cudf::column_view const &input, int64_t const num_groups,
+            cudf::device_span<cudf::size_type const> group_lables,
+            int64_t const precision, rmm::cuda_stream_view stream,
+            rmm::device_async_resource_ref mr) {
+  int64_t num_registers_per_sketch = 1 << precision;
+  constexpr int64_t block_size = 256;
+  constexpr int num_hashs_per_thread = 256; // handles 256 items per thread
+  int64_t num_threads_partial_kernel =
+      cudf::util::div_rounding_up_safe(input.size(), num_hashs_per_thread);
+
+  auto sketches_output = rmm::device_uvector<int>(
+      num_groups * num_registers_per_sketch, stream, mr);
+
+  { // add this block to release `registers_thread_cache` and
+    // `group_lables_thread_cache`
+    auto registers_thread_cache = rmm::device_uvector<int>(
+        num_threads_partial_kernel * num_registers_per_sketch, stream, mr);
+    auto group_lables_thread_cache =
+        rmm::device_uvector<cudf::size_type>(num_threads_partial_kernel, stream,
+                                             mr);
+
+    { // add this block to release `hash_col`
+      // 1. compute all the hashs
+      auto input_table_view = cudf::table_view{{input}};
+      auto hash_col = xxhash64(input_table_view, SEED, stream, mr);
+      auto d_hashs = cudf::column_device_view::create(hash_col->view(), stream);
+
+      // 2. execute partial group by
+      int64_t num_blocks_p1 = cudf::util::div_rounding_up_safe(
+          num_threads_partial_kernel, block_size);
+      partial_group_sketches_from_hashs_kernel<num_hashs_per_thread>
+          <<<num_blocks_p1, block_size, 0, stream.value()>>>(
+              *d_hashs, group_lables, precision, sketches_output.begin(),
+              registers_thread_cache.begin(),
+              group_lables_thread_cache.begin());
+    }
+    // 3. merge the intermediate results
+    auto num_merge_threads = num_registers_per_sketch;
+    auto num_merge_blocks =
+        cudf::util::div_rounding_up_safe(num_merge_threads, block_size);
+    merge_sketches_vertically<block_size>
+        <<<num_merge_blocks, block_size, 0, stream.value()>>>(
+            num_threads_partial_kernel, // num_sketches
+            num_registers_per_sketch, sketches_output.begin(),
+            registers_thread_cache.begin(), group_lables_thread_cache.begin());
+  }
+
+  // 4. create output columns
+  auto num_long_cols = num_registers_per_sketch / REGISTERS_PER_LONG + 1;
+  auto const results_iter =
+      cudf::detail::make_counting_transform_iterator(0, [&](int i) {
+        return cudf::make_numeric_column(
+            cudf::data_type{cudf::type_id::INT64}, num_groups,
+            cudf::mask_state::ALL_VALID, stream, mr);
+      });
+  auto children = std::vector<std::unique_ptr<cudf::column>>(
+      results_iter, results_iter + num_long_cols);
+  auto d_results = [&] {
+    auto host_results_pointer_iter = thrust::make_transform_iterator(
+        children.begin(), [](auto const &results_column) {
+          return results_column->mutable_view().template data<int64_t>();
+        });
+    auto host_results_pointers = std::vector<int64_t *>(
+        host_results_pointer_iter, host_results_pointer_iter + children.size());
+    return cudf::detail::make_device_uvector_async(host_results_pointers,
+                                                   stream, mr);
+  }();
+  auto result = cudf::make_structs_column(num_groups, std::move(children),
+                                          0,                    // null count
+                                          rmm::device_buffer{}, // null mask
+                                          stream);
+
+  // 5. compact sketches
+  auto num_phase3_threads = num_groups * num_long_cols;
+  auto num_phase3_blocks =
+      cudf::util::div_rounding_up_safe(num_phase3_threads, block_size);
+  compact_kernel<<<num_phase3_blocks, block_size, 0, stream.value()>>>(
+      num_groups, num_registers_per_sketch, d_results, sketches_output);
+
+  return result;
+}
+
+/**
+ * @brief Partially groups sketches in long columns, similar to
+ * `partial_group_sketches_from_hashs_kernel`. It splits longs into segments
+ * with each having `num_longs_per_threads` elements. e.g.:
+ * num_registers_per_sketch = 512. Each sketch uses 52 (512 / 10 + 1) longs.
+ *
+ * Input:
+ *            col_0  col_1       col_51
+ * sketch_0:  long,  long, ...,  long
+ * sketch_1:  long,  long, ...,  long
+ * sketch_2:  long,  long, ...,  long
+ *
+ * num_threads = 52 * div_round_up(num_sketches_input, num_longs_per_threads)
+ * Each thread scans and merges num_longs_per_threads longs,
+ * and outputs the max register value when it meets a new group.
+ * For the last long in a thread, outputs the result into
+ * `registers_thread_cache`.
+ *
+ * By splitting inputs into segments like `partial_group_sketches_from_hashs_kernel`
+ * and doing partial merges, it will use less memory.
+ * Then the kernel merge_sketches_vertically can be used to merge the
+ * intermediate results: registers_output_cache, registers_thread_cache
+ */
+template <int num_longs_per_threads>
+CUDF_KERNEL void partial_group_long_sketches_kernel(
+    cudf::device_span<int64_t const *> sketches_input,
+    int64_t const num_sketches_input, int64_t const num_threads_per_col,
+    int64_t const num_registers_per_sketch, int64_t const num_groups,
+    cudf::device_span<cudf::size_type const> group_lables,
+    // num_groups * num_registers_per_sketch integers
+    int *const registers_output_cache,
+    // num_threads * num_registers_per_sketch integers
+    int *const registers_thread_cache,
+    // num_threads integers
+    cudf::size_type *const group_lables_thread_cache) {
+  auto const tid = cudf::detail::grid_1d::global_thread_id();
+  auto const num_long_cols = sketches_input.size();
+  if (tid >= num_threads_per_col * num_long_cols) {
+    return;
+  }
+
+  auto const long_idx = tid / num_threads_per_col;
+  auto const thread_idx_in_cols = tid % num_threads_per_col;
+  int64_t const *const longs_ptr = sketches_input[long_idx];
+
+  int *const registers_thread_ptr =
+      registers_thread_cache + thread_idx_in_cols * num_registers_per_sketch;
+
+  auto const sketch_first = thread_idx_in_cols * num_longs_per_threads;
+  auto const sketch_end =
+      cuda::std::min(sketch_first + num_longs_per_threads, num_sketches_input);
+
+  int num_regs = REGISTERS_PER_LONG;
+  if (long_idx == num_long_cols - 1) {
+    num_regs = num_registers_per_sketch % REGISTERS_PER_LONG;
+  }
+
+  for (auto i = 0; i < num_regs; i++) {
+    cudf::size_type prev_group = group_lables[sketch_first];
+    int max_reg_v = 0;
+    int reg_idx_in_sketch = long_idx * REGISTERS_PER_LONG + i;
+    for (auto sketch_idx = sketch_first; sketch_idx < sketch_end;
+         sketch_idx++) {
+      cudf::size_type curr_group = group_lables[sketch_idx];
+      int curr_reg_v = get_register_value(longs_ptr[sketch_idx], i);
+      if (curr_group == prev_group) {
+        // still in the same group, update the max value
+        if (curr_reg_v > max_reg_v) {
+          max_reg_v = curr_reg_v;
+        }
+      } else {
+        // meets new group, save output for the previous group
+        int64_t output_idx_prev =
+            num_registers_per_sketch * prev_group + reg_idx_in_sketch;
+        registers_output_cache[output_idx_prev] = max_reg_v;
+
+        // reset
+        max_reg_v = curr_reg_v;
+      }
+
+      if (sketch_idx == sketch_end - 1) {
+        // last item in the segment
+        int64_t output_idx_curr =
+            num_registers_per_sketch * curr_group + reg_idx_in_sketch;
+        if (sketch_idx == num_sketches_input - 1) {
+          // last segment
+          registers_output_cache[output_idx_curr] = max_reg_v;
+          max_reg_v = curr_reg_v;
+        } else {
+          if (curr_group != group_lables[sketch_idx + 1]) {
+            // look at the first item in the next segment
+            registers_output_cache[output_idx_curr] = max_reg_v;
+            max_reg_v = curr_reg_v;
+          }
+        }
+      }
+
+      prev_group = curr_group;
+    }
+
+    // For each thread, output current max value
+    registers_thread_ptr[reg_idx_in_sketch] = max_reg_v;
+  }
+
+  if (long_idx == 0) {
+    group_lables_thread_cache[thread_idx_in_cols] =
+        group_lables[sketch_end - 1];
+  }
+}
+
+/**
+ * @brief Merge for struct column. Each long contains 10
+ * register values. Merges all rows in the same group.
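+ * i.e., for each group g and each register index i:
+ * out[g][i] = max over all rows r in g of get_register_value(row r, i).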
+ */
+std::unique_ptr<cudf::column> group_merge_hllpp(
+    cudf::column_view const &hll_input, // struct column
+    int64_t const num_groups,
+    cudf::device_span<cudf::size_type const> group_lables,
+    int64_t const precision, rmm::cuda_stream_view stream,
+    rmm::device_async_resource_ref mr) {
+  int64_t num_registers_per_sketch = 1 << precision;
+  int64_t const num_sketches = hll_input.size();
+  int64_t const num_long_cols =
+      num_registers_per_sketch / REGISTERS_PER_LONG + 1;
+  constexpr int64_t num_longs_per_threads = 256;
+  constexpr int64_t block_size = 256;
+
+  int64_t num_threads_per_col_phase1 =
+      cudf::util::div_rounding_up_safe(num_sketches, num_longs_per_threads);
+  int64_t num_threads_phase1 = num_threads_per_col_phase1 * num_long_cols;
+  int64_t num_blocks =
+      cudf::util::div_rounding_up_safe(num_threads_phase1, block_size);
+  auto registers_output_cache = rmm::device_uvector<int>(
+      num_registers_per_sketch * num_groups, stream, mr);
+  {
+    auto registers_thread_cache = rmm::device_uvector<int>(
+        num_registers_per_sketch * num_threads_phase1, stream, mr);
+    auto group_lables_thread_cache =
+        rmm::device_uvector<cudf::size_type>(num_threads_per_col_phase1, stream,
+                                             mr);
+
+    cudf::structs_column_view scv(hll_input);
+    auto const input_iter =
+        cudf::detail::make_counting_transform_iterator(0, [&](int i) {
+          return scv.get_sliced_child(i, stream).begin<int64_t>();
+        });
+    auto input_cols =
+        std::vector<int64_t const *>(input_iter, input_iter + num_long_cols);
+    auto d_inputs =
+        cudf::detail::make_device_uvector_async(input_cols, stream, mr);
+    // 1st kernel: partially group
+    partial_group_long_sketches_kernel<num_longs_per_threads>
+        <<<num_blocks, block_size, 0, stream.value()>>>(
+            d_inputs, num_sketches, num_threads_per_col_phase1,
+            num_registers_per_sketch, num_groups, group_lables,
+            registers_output_cache.begin(), registers_thread_cache.begin(),
+            group_lables_thread_cache.begin());
+    auto const num_phase2_threads = num_registers_per_sketch;
+    auto const num_phase2_blocks =
+        cudf::util::div_rounding_up_safe(num_phase2_threads, block_size);
+    // 2nd kernel: vertical merge
+    merge_sketches_vertically<block_size>
+        <<<num_phase2_blocks, block_size, 0, stream.value()>>>(
+            num_threads_per_col_phase1, // num_sketches
+            num_registers_per_sketch, registers_output_cache.begin(),
+            registers_thread_cache.begin(), group_lables_thread_cache.begin());
+  }
+
+  // create output columns
+  auto const results_iter =
+      cudf::detail::make_counting_transform_iterator(0, [&](int i) {
+        return cudf::make_numeric_column(
+            cudf::data_type{cudf::type_id::INT64}, num_groups,
+            cudf::mask_state::ALL_VALID, stream, mr);
+      });
+  auto results = std::vector<std::unique_ptr<cudf::column>>(
+      results_iter, results_iter + num_long_cols);
+  auto d_sketches_output = [&] {
+    auto host_results_pointer_iter = thrust::make_transform_iterator(
+        results.begin(), [](auto const &results_column) {
+          return results_column->mutable_view().template data<int64_t>();
+        });
+    auto host_results_pointers = std::vector<int64_t *>(
+        host_results_pointer_iter, host_results_pointer_iter + results.size());
+    return cudf::detail::make_device_uvector_async(host_results_pointers,
+                                                   stream, mr);
+  }();
+
+  // 3rd kernel: compact
+  auto num_phase3_threads = num_groups * num_long_cols;
+  auto num_phase3_blocks =
+      cudf::util::div_rounding_up_safe(num_phase3_threads, block_size);
+  compact_kernel<<<num_phase3_blocks, block_size, 0, stream.value()>>>(
+      num_groups, num_registers_per_sketch, d_sketches_output,
+      registers_output_cache);
+
+  return make_structs_column(num_groups, std::move(results), 0,
+                             rmm::device_buffer{});
+}
+
+/**
+ * Launches only 1 block, using at most 1M (2^18 * sizeof(int)) shared memory.
+ * For each hash, get a pair: (register index, register value).
+ * Uses shared memory to speed up the fetch-max atomic operation.
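+ * Dynamic shared memory: (1 << precision) * sizeof(int32_t) bytes; precision
+ * is capped at MAX_PRECISION = 18, so at most 2^18 * 4 = 1MB.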
+ */
+template <int block_size>
+CUDF_KERNEL void reduce_hllpp_kernel(cudf::column_device_view hashs,
+                                     cudf::device_span<int64_t *> output,
+                                     int precision) {
+  extern __shared__ int32_t shared_data[];
+
+  auto const tid = cudf::detail::grid_1d::global_thread_id();
+  auto const num_hashs = hashs.size();
+  uint64_t const num_registers_per_sketch = 1L << precision;
+  int const idx_shift = 64 - precision;
+  uint64_t const w_padding = 1ULL << (precision - 1);
+
+  // init tmp data
+  for (int i = tid; i < num_registers_per_sketch; i += block_size) {
+    shared_data[i] = 0;
+  }
+  __syncthreads();
+
+  // update max reg value for the reg index
+  for (int i = tid; i < num_hashs; i += block_size) {
+    uint64_t const hash = static_cast<uint64_t>(hashs.element<int64_t>(i));
+    // use unsigned int to avoid inserting 1 for the highest bit when doing
+    // right shift
+    uint64_t const reg_idx = hash >> idx_shift;
+    // get the leading zeros
+    int const reg_v = static_cast<int>(
+        cuda::std::countl_zero((hash << precision) | w_padding) + 1ULL);
+    cuda::atomic_ref<int32_t, cuda::thread_scope_block> register_ref(
+        shared_data[reg_idx]);
+    register_ref.fetch_max(reg_v, cuda::memory_order_relaxed);
+  }
+  __syncthreads();
+
+  // compact from register values (int array) to long array
+  // each long holds 10 integers, note reg value < 64 which means the bits from
+  // 7 to highest are all 0.
+  if (tid * REGISTERS_PER_LONG < num_registers_per_sketch) {
+    int start = tid * REGISTERS_PER_LONG;
+    int end = (tid + 1) * REGISTERS_PER_LONG;
+    if (end > num_registers_per_sketch) {
+      end = num_registers_per_sketch;
+    }
+
+    int64_t ret = 0;
+    for (int i = 0; i < end - start; i++) {
+      int shift = i * REGISTER_VALUE_BITS;
+      int64_t reg = shared_data[start + i];
+      ret |= (reg << shift);
+    }
+
+    output[tid][0] = ret;
+  }
+}
+
+std::unique_ptr<cudf::scalar> reduce_hllpp(cudf::column_view const &input,
+                                           int64_t const precision,
+                                           rmm::cuda_stream_view stream,
+                                           rmm::device_async_resource_ref mr) {
+  int64_t num_registers_per_sketch = 1L << precision;
+  // 1. compute all the hashs
+  auto input_table_view = cudf::table_view{{input}};
+  auto hash_col = xxhash64(input_table_view, SEED, stream, mr);
+  auto d_hashs = cudf::column_device_view::create(hash_col->view(), stream);
+
+  // 2. generate long columns, the size of each long column is 1
+  auto num_long_cols = num_registers_per_sketch / REGISTERS_PER_LONG + 1;
+  auto const results_iter =
+      cudf::detail::make_counting_transform_iterator(0, [&](int i) {
+        return cudf::make_numeric_column(
+            cudf::data_type{cudf::type_id::INT64}, 1 /**num_groups*/,
+            cudf::mask_state::ALL_VALID, stream, mr);
+      });
+  auto children = std::vector<std::unique_ptr<cudf::column>>(
+      results_iter, results_iter + num_long_cols);
+  auto d_results = [&] {
+    auto host_results_pointer_iter = thrust::make_transform_iterator(
+        children.begin(), [](auto const &results_column) {
+          return results_column->mutable_view().template data<int64_t>();
+        });
+    auto host_results_pointers = std::vector<int64_t *>(
+        host_results_pointer_iter, host_results_pointer_iter + children.size());
+    return cudf::detail::make_device_uvector_async(host_results_pointers,
+                                                   stream, mr);
+  }();
+
+  // 2. reduce and generate compacted long values
+  constexpr int64_t block_size = 256;
+  // max shared memory is 2^18 * 4 = 1M
+  auto const shared_mem_size = num_registers_per_sketch * sizeof(int32_t);
+  reduce_hllpp_kernel<block_size>
+      <<<1, block_size, shared_mem_size, stream.value()>>>(*d_hashs, d_results,
+                                                           precision);
+
+  // 3. create struct scalar
+  auto host_results_view_iter = thrust::make_transform_iterator(
+      children.begin(),
+      [](auto const &results_column) { return results_column->view(); });
+  auto views = std::vector<cudf::column_view>(
+      host_results_view_iter, host_results_view_iter + num_long_cols);
+  auto table_view = cudf::table_view{views};
+  auto table = cudf::table(table_view);
+  return std::make_unique<cudf::struct_scalar>(std::move(table), true, stream,
+                                               mr);
+}
+
+CUDF_KERNEL void reduce_merge_hll_kernel_vertically(
+    cudf::device_span<int64_t const *> sketch_longs,
+    cudf::size_type num_sketches, int num_registers_per_sketch,
+    int *const output) {
+  auto const tid = cudf::detail::grid_1d::global_thread_id();
+  if (tid >= num_registers_per_sketch) {
+    return;
+  }
+  auto long_idx = tid / REGISTERS_PER_LONG;
+  auto reg_idx_in_long = tid % REGISTERS_PER_LONG;
+  int max = 0;
+  for (auto row_idx = 0; row_idx < num_sketches; row_idx++) {
+    int reg_v =
+        get_register_value(sketch_longs[long_idx][row_idx], reg_idx_in_long);
+    if (reg_v > max) {
+      max = reg_v;
+    }
+  }
+  output[tid] = max;
+}
+
+std::unique_ptr<cudf::scalar>
+reduce_merge_hllpp(cudf::column_view const &input, int64_t const precision,
+                   rmm::cuda_stream_view stream,
+                   rmm::device_async_resource_ref mr) {
+  // create device input
+  int64_t num_registers_per_sketch = 1 << precision;
+  auto num_long_cols = num_registers_per_sketch / REGISTERS_PER_LONG + 1;
+  cudf::structs_column_view scv(input);
+  auto const input_iter =
+      cudf::detail::make_counting_transform_iterator(0, [&](int i) {
+        return scv.get_sliced_child(i, stream).begin<int64_t>();
+      });
+  auto input_cols =
+      std::vector<int64_t const *>(input_iter, input_iter + num_long_cols);
+  auto d_inputs =
+      cudf::detail::make_device_uvector_async(input_cols, stream, mr);
+
+  // create one row output
+  auto const results_iter =
+      cudf::detail::make_counting_transform_iterator(0, [&](int i) {
+        return cudf::make_numeric_column(
+            cudf::data_type{cudf::type_id::INT64}, 1 /** num_rows */,
+            cudf::mask_state::ALL_VALID, stream, mr);
+      });
+  auto children = std::vector<std::unique_ptr<cudf::column>>(
+      results_iter, results_iter + num_long_cols);
+  auto d_results = [&] {
+    auto host_results_pointer_iter = thrust::make_transform_iterator(
+        children.begin(), [](auto const &results_column) {
+          return results_column->mutable_view().template data<int64_t>();
+        });
+    auto host_results_pointers = std::vector<int64_t *>(
+        host_results_pointer_iter, host_results_pointer_iter + children.size());
+    return cudf::detail::make_device_uvector_async(host_results_pointers,
+                                                   stream, mr);
+  }();
+
+  // execute merge kernel
+  auto num_threads = num_registers_per_sketch;
+  constexpr int64_t block_size = 256;
+  auto num_blocks = cudf::util::div_rounding_up_safe(num_threads, block_size);
+  auto output_cache =
+      rmm::device_uvector<int>(num_registers_per_sketch, stream, mr);
+  reduce_merge_hll_kernel_vertically<<<num_blocks, block_size, 0,
+                                       stream.value()>>>(
+      d_inputs, input.size(), num_registers_per_sketch, output_cache.begin());
+
+  // compact to longs
+  auto const num_compact_threads = num_long_cols;
+  auto const num_compact_blocks =
+      cudf::util::div_rounding_up_safe(num_compact_threads, block_size);
+  compact_kernel<<<num_compact_blocks, block_size, 0, stream.value()>>>(
+      1 /** num_groups **/, num_registers_per_sketch, d_results, output_cache);
+
+  // create scalar
+  auto host_results_view_iter = thrust::make_transform_iterator(
+      children.begin(),
+      [](auto const &results_column) { return results_column->view(); });
+  auto views = std::vector<cudf::column_view>(
+      host_results_view_iter, host_results_view_iter + num_long_cols);
+  auto table_view = cudf::table_view{views};
+  auto table = cudf::table(table_view);
+  return std::make_unique<cudf::struct_scalar>(std::move(table), true, stream,
+                                               mr);
+}
+
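+// The estimation below follows standard HLL++ finalization, delegated to
+// cuco's finalizer: the raw estimate is alpha_m * m^2 / sum(2^-reg[i]) over
+// the m = 2^precision registers, and the finalizer applies the HLL++
+// small-/large-range bias corrections using `sum` and the zero-register count.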
+struct estimate_fn {
+  cudf::device_span<int64_t const *const> sketch_longs;
+  int const precision;
+  int64_t *const out;
+
+  __device__ void operator()(cudf::size_type const idx) const {
+    auto const num_regs = 1ull << precision;
+    double sum = 0;
+    int zeroes = 0;
+
+    for (auto reg_idx = 0; reg_idx < num_regs; ++reg_idx) {
+      // each long contains 10 register values
+      int long_col_idx = reg_idx / REGISTERS_PER_LONG;
+      int reg_idx_in_long = reg_idx % REGISTERS_PER_LONG;
+      int reg =
+          get_register_value(sketch_longs[long_col_idx][idx], reg_idx_in_long);
+      sum += double{1} / static_cast<double>(1ull << reg);
+      zeroes += reg == 0;
+    }
+
+    auto const finalize = cuco::hyperloglog_ns::detail::finalizer(precision);
+    out[idx] = finalize(sum, zeroes);
+  }
+};
+
+} // end anonymous namespace
+
+std::unique_ptr<cudf::column> group_hyper_log_log_plus_plus(
+    cudf::column_view const &input, int64_t const num_groups,
+    cudf::device_span<cudf::size_type const> group_lables,
+    int64_t const precision, rmm::cuda_stream_view stream,
+    rmm::device_async_resource_ref mr) {
+  CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision >= 4.");
+  auto adjust_precision = precision > MAX_PRECISION ? MAX_PRECISION : precision;
+  return group_hllpp(input, num_groups, group_lables, adjust_precision, stream,
+                     mr);
+}
+
+std::unique_ptr<cudf::column> group_merge_hyper_log_log_plus_plus(
+    cudf::column_view const &input, int64_t const num_groups,
+    cudf::device_span<cudf::size_type const> group_lables,
+    int64_t const precision, rmm::cuda_stream_view stream,
+    rmm::device_async_resource_ref mr) {
+  CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision >= 4.");
+  CUDF_EXPECTS(
+      input.type().id() == cudf::type_id::STRUCT,
+      "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns.");
+  for (auto i = 0; i < input.num_children(); i++) {
+    CUDF_EXPECTS(
+        input.child(i).type().id() == cudf::type_id::INT64,
+        "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns.");
+  }
+  auto adjust_precision = precision > MAX_PRECISION ? MAX_PRECISION : precision;
+  auto expected_num_longs = (1 << adjust_precision) / REGISTERS_PER_LONG + 1;
+  CUDF_EXPECTS(input.num_children() == expected_num_longs,
+               "The number of long columns in the input is incorrect.");
+  return group_merge_hllpp(input, num_groups, group_lables, adjust_precision,
+                           stream, mr);
+}
+
+std::unique_ptr<cudf::scalar> reduce_hyper_log_log_plus_plus(
+    cudf::column_view const &input, int64_t const precision,
+    rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) {
+  CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision >= 4.");
+  auto adjust_precision = precision > MAX_PRECISION ? MAX_PRECISION : precision;
+  return reduce_hllpp(input, adjust_precision, stream, mr);
+}
+
+std::unique_ptr<cudf::scalar> reduce_merge_hyper_log_log_plus_plus(
+    cudf::column_view const &input, int64_t const precision,
+    rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) {
+  CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision >= 4.");
+  CUDF_EXPECTS(
+      input.type().id() == cudf::type_id::STRUCT,
+      "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns.");
+  for (auto i = 0; i < input.num_children(); i++) {
+    CUDF_EXPECTS(
+        input.child(i).type().id() == cudf::type_id::INT64,
+        "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns.");
+  }
+  auto adjust_precision = precision > MAX_PRECISION ? MAX_PRECISION : precision;
+  auto expected_num_longs = (1 << adjust_precision) / REGISTERS_PER_LONG + 1;
+  CUDF_EXPECTS(input.num_children() == expected_num_longs,
+               "The number of long columns in the input is incorrect.");
+  return reduce_merge_hllpp(input, adjust_precision, stream, mr);
+}
+
+std::unique_ptr<cudf::column>
+estimate_from_hll_sketches(cudf::column_view const &input, int precision,
+                           rmm::cuda_stream_view stream,
+                           rmm::device_async_resource_ref mr) {
+  CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision >= 4.");
+  auto const input_iter = cudf::detail::make_counting_transform_iterator(
+      0, [&](int i) { return input.child(i).begin<int64_t>(); });
+  auto input_cols = std::vector<int64_t const *>(
+      input_iter, input_iter + input.num_children());
+  auto d_inputs =
+      cudf::detail::make_device_uvector_async(input_cols, stream, mr);
+  auto result = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT64},
+                                          input.size(),
+                                          cudf::mask_state::ALL_VALID, stream);
+  // evaluate from struct
+  thrust::for_each_n(
+      rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0),
+      input.size(),
+      estimate_fn{d_inputs, precision, result->mutable_view().data<int64_t>()});
+  return result;
+}
+
+} // namespace spark_rapids_jni
diff --git a/src/main/cpp/src/hllpp.hpp b/src/main/cpp/src/hllpp.hpp
new file mode 100644
index 0000000000..4dda342a4f
--- /dev/null
+++ b/src/main/cpp/src/hllpp.hpp
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/column/column.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
+
+namespace spark_rapids_jni {
+
+/**
+ * The number of bits that is required for a HLLPP register value.
+ *
+ * This number is determined by the maximum number of leading binary zeros a
+ * hashcode can produce. This is equal to the number of bits the hashcode
+ * returns. The current implementation uses a 64-bit hashcode; this means 6 bits
+ * are (at most) needed to store the number of leading zeros.
+ */
+constexpr int REGISTER_VALUE_BITS = 6;
+
+// MASK binary 6 bits: 111-111
+constexpr uint64_t MASK = (1L << REGISTER_VALUE_BITS) - 1L;
+
+// This value is 10: one long stores 10 register values
+constexpr int REGISTERS_PER_LONG = 64 / REGISTER_VALUE_BITS;
+
+// XXHash seed, consistent with Spark
+constexpr int64_t SEED = 42L;
+
+// max precision; if a precision bigger than 18 is requested, then use 18.
+constexpr int MAX_PRECISION = 18;
+
+/**
+ * Compute hash codes for the input, generate HyperLogLogPlusPlus(HLLPP)
+ * sketches from hash codes, and merge the sketches in the same group. Output is
+ * a struct column with multiple long columns which is consistent with Spark.
+ */
+std::unique_ptr<cudf::column> group_hyper_log_log_plus_plus(
+    cudf::column_view const &input, int64_t const num_groups,
+    cudf::device_span<cudf::size_type const> group_lables,
+    int64_t const precision, rmm::cuda_stream_view stream,
+    rmm::device_async_resource_ref mr);
+
+/**
+ * Merge HyperLogLogPlusPlus(HLLPP) sketches in the same group.
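+ * e.g.: precision 9 -> 512 registers -> 52 INT64 children (512 / 10 + 1),
+ * where the last long packs only 512 % 10 = 2 registers.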
+ * Input is a struct column with multiple long columns which is consistent with
+ * Spark.
+ */
+std::unique_ptr<cudf::column> group_merge_hyper_log_log_plus_plus(
+    cudf::column_view const &input, int64_t const num_groups,
+    cudf::device_span<cudf::size_type const> group_lables,
+    int64_t const precision, rmm::cuda_stream_view stream,
+    rmm::device_async_resource_ref mr);
+
+/**
+ * Compute hash codes for the input, generate HyperLogLogPlusPlus(HLLPP)
+ * sketches from hash codes, and merge all the sketches into one sketch; output
+ * is a struct scalar with multiple long values.
+ */
+std::unique_ptr<cudf::scalar> reduce_hyper_log_log_plus_plus(
+    cudf::column_view const &input, int64_t const precision,
+    rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr);
+
+/**
+ * Merge all HyperLogLogPlusPlus(HLLPP) sketches in the input column into one
+ * sketch. Input is a struct column with multiple long columns which is
+ * consistent with Spark. Output is a struct scalar with multiple long values.
+ */
+std::unique_ptr<cudf::scalar> reduce_merge_hyper_log_log_plus_plus(
+    cudf::column_view const &input, int64_t const precision,
+    rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr);
+
+/**
+ * Estimate count distinct values for the input which contains HLLPP sketches.
+ * Input is a struct column with multiple long columns which is consistent with
+ * Spark. Output is a long column where all values are non-null. Spark returns 0
+ * for null values when doing APPROX_COUNT_DISTINCT.
+ */
+std::unique_ptr<cudf::column> estimate_from_hll_sketches(
+    cudf::column_view const &input, int precision,
+    rmm::cuda_stream_view stream = cudf::get_default_stream(),
+    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+} // namespace spark_rapids_jni
diff --git a/src/main/cpp/src/hllpp_host_udf.cu b/src/main/cpp/src/hllpp_host_udf.cu
new file mode 100644
index 0000000000..c9ad271876
--- /dev/null
+++ b/src/main/cpp/src/hllpp_host_udf.cu
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "hllpp.hpp"
+#include "hllpp_host_udf.hpp"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+namespace spark_rapids_jni {
+
+namespace {
+
+template <typename cudf_aggregation> struct hllpp_udf : cudf::host_udf_base {
+  static_assert(std::is_same_v<cudf_aggregation, cudf::reduce_aggregation> ||
+                std::is_same_v<cudf_aggregation, cudf::groupby_aggregation>);
+
+  hllpp_udf(int precision_, bool is_merge_)
+      : precision(precision_), is_merge(is_merge_) {}
+
+  [[nodiscard]] input_data_attributes get_required_data() const override {
+    if constexpr (std::is_same_v<cudf_aggregation, cudf::reduce_aggregation>) {
+      return {reduction_data_attribute::INPUT_VALUES};
+    } else {
+      return {groupby_data_attribute::GROUPED_VALUES,
+              groupby_data_attribute::GROUP_OFFSETS,
+              groupby_data_attribute::GROUP_LABELS};
+    }
+  }
+
+  [[nodiscard]] output_type
+  operator()(host_udf_input const &udf_input, rmm::cuda_stream_view stream,
+             rmm::device_async_resource_ref mr) const override {
+    if constexpr (std::is_same_v<cudf_aggregation, cudf::reduce_aggregation>) {
+      // reduce
+      auto const &input_values = std::get<cudf::column_view>(
+          udf_input.at(reduction_data_attribute::INPUT_VALUES));
+      if (input_values.size() == 0) {
+        return get_empty_output(std::nullopt, stream, mr);
+      }
+      if (is_merge) {
+        // reduce intermediate results; input_values is a struct of long columns
+        return spark_rapids_jni::reduce_merge_hyper_log_log_plus_plus(
+            input_values, precision, stream, mr);
+      } else {
+        return spark_rapids_jni::reduce_hyper_log_log_plus_plus(
+            input_values, precision, stream, mr);
+      }
+    } else {
+      // groupby
+      auto const &group_values = std::get<cudf::column_view>(
+          udf_input.at(groupby_data_attribute::GROUPED_VALUES));
+      if (group_values.size() == 0) {
+        return get_empty_output(std::nullopt, stream, mr);
+      }
+      auto const group_offsets =
+          std::get<cudf::device_span<cudf::size_type const>>(
+              udf_input.at(groupby_data_attribute::GROUP_OFFSETS));
+      int num_groups = group_offsets.size() - 1;
+      auto const group_lables =
+          std::get<cudf::device_span<cudf::size_type const>>(
+              udf_input.at(groupby_data_attribute::GROUP_LABELS));
+      if (is_merge) {
+        // group-by intermediate results; group_values is a struct of long columns
+        return spark_rapids_jni::group_merge_hyper_log_log_plus_plus(
+            group_values, num_groups, group_lables, precision, stream, mr);
+      } else {
+        return spark_rapids_jni::group_hyper_log_log_plus_plus(
+            group_values, num_groups, group_lables, precision, stream, mr);
+      }
+    }
+  }
+
+  /**
+   * @brief Create an empty result: a struct scalar for reductions, or an
+   * empty struct column for group-by aggregations.
+   */
+  [[nodiscard]] output_type
+  get_empty_output([[maybe_unused]] std::optional<cudf::data_type> output_dtype,
+                   rmm::cuda_stream_view stream,
+                   rmm::device_async_resource_ref mr) const override {
+    int num_registers = 1 << precision;
+    int num_long_cols = num_registers / REGISTERS_PER_LONG + 1;
+    auto const results_iter =
+        cudf::detail::make_counting_transform_iterator(0, [&](int i) {
+          return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT64});
+        });
+    auto children = std::vector<std::unique_ptr<cudf::column>>(
+        results_iter, results_iter + num_long_cols);
+
+    if constexpr (std::is_same_v<cudf_aggregation, cudf::reduce_aggregation>) {
+      // reduce
+      auto host_results_view_iter = thrust::make_transform_iterator(
+          children.begin(),
+          [](auto const &results_column) { return results_column->view(); });
+      auto views = std::vector<cudf::column_view>(
+          host_results_view_iter, host_results_view_iter + num_long_cols);
+      auto table_view = cudf::table_view{views};
+      auto table = cudf::table(table_view);
+      return std::make_unique<cudf::struct_scalar>(std::move(table), true,
+                                                   stream, mr);
+    } else {
+      // groupby
+      return cudf::make_structs_column(0, std::move(children),
+                                       0,                    // null count
+                                       rmm::device_buffer{}, // null mask
+                                       stream);
+    }
+  }
+
+  [[nodiscard]] bool is_equal(host_udf_base const &other) const override {
+    auto o = dynamic_cast<hllpp_udf const *>(&other);
+    return o != nullptr && o->precision == this->precision &&
+           o->is_merge == this->is_merge;
+  }
+
+  [[nodiscard]] std::size_t do_hash() const override {
+    return 31 * (31 * std::hash<std::string>{}({"hllpp_udf"}) + precision) +
+           is_merge;
+  }
+
+  [[nodiscard]] std::unique_ptr<host_udf_base> clone() const override {
+    return std::make_unique<hllpp_udf>(precision, is_merge);
+  }
+
+  int precision;
+  bool is_merge;
+};
+
+} // namespace
+
+std::unique_ptr<cudf::host_udf_base>
+create_hllpp_reduction_host_udf(int precision) {
+  return std::make_unique<hllpp_udf<cudf::reduce_aggregation>>(
+      precision, /*is_merge*/ false);
+}
+
+std::unique_ptr<cudf::host_udf_base>
+create_hllpp_reduction_merge_host_udf(int precision) {
+  return std::make_unique<hllpp_udf<cudf::reduce_aggregation>>(
+      precision, /*is_merge*/ true);
+}
+
+std::unique_ptr<cudf::host_udf_base>
+create_hllpp_groupby_host_udf(int precision) {
+  return std::make_unique<hllpp_udf<cudf::groupby_aggregation>>(
+      precision, /*is_merge*/ false);
+}
+
+std::unique_ptr<cudf::host_udf_base>
+create_hllpp_groupby_merge_host_udf(int precision) {
+  return std::make_unique<hllpp_udf<cudf::groupby_aggregation>>(
+      precision, /*is_merge*/ true);
+}
+
+} // namespace spark_rapids_jni
diff --git a/src/main/cpp/src/hllpp_host_udf.hpp b/src/main/cpp/src/hllpp_host_udf.hpp
new file mode 100644
index 0000000000..fc4bb8b21b
--- /dev/null
+++ b/src/main/cpp/src/hllpp_host_udf.hpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+
+namespace spark_rapids_jni {
+
+std::unique_ptr<cudf::host_udf_base>
+create_hllpp_reduction_host_udf(int precision);
+
+std::unique_ptr<cudf::host_udf_base>
+create_hllpp_reduction_merge_host_udf(int precision);
+
+std::unique_ptr<cudf::host_udf_base>
+create_hllpp_groupby_host_udf(int precision);
+
+std::unique_ptr<cudf::host_udf_base>
+create_hllpp_groupby_merge_host_udf(int precision);
+
+} // namespace spark_rapids_jni
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/HLLPPHostUDF.java b/src/main/java/com/nvidia/spark/rapids/jni/HLLPPHostUDF.java
new file mode 100644
index 0000000000..9018474c27
--- /dev/null
+++ b/src/main/java/com/nvidia/spark/rapids/jni/HLLPPHostUDF.java
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids.jni;
+
+import ai.rapids.cudf.ColumnVector;
+import ai.rapids.cudf.ColumnView;
+import ai.rapids.cudf.NativeDepsLoader;
+
+/**
+ * HyperLogLogPlusPlus(HLLPP) host UDF aggregation utils.
+ */
+public class HLLPPHostUDF {
+  static {
+    NativeDepsLoader.loadNativeDeps();
+  }
+
+  /**
+   * HyperLogLogPlusPlus(HLLPP) aggregation types
+   */
+  public enum AggregationType {
+
+    /**
+     * Compute hash codes for the input, generate HyperLogLogPlusPlus(HLLPP)
+     * sketches from hash codes, and merge all the sketches into one sketch;
+     * the output is a struct scalar with multiple long values.
+     */
+    Reduction(0),
+
+    /**
+     * Merge all HyperLogLogPlusPlus(HLLPP) sketches in the input column into
+     * one sketch. Input is a struct column with multiple long columns which is
+     * consistent with Spark. Output is a struct scalar with multiple long
+     * values.
+     */
+    Reduction_MERGE(1),
+
+    /**
+     * Compute hash codes for the input, generate HyperLogLogPlusPlus(HLLPP)
+     * sketches from hash codes, and merge the sketches in the same group;
+     * the output is a struct column with multiple long columns which is
+     * consistent with Spark.
+     */
+    GroupBy(2),
+
+    /**
+     * Merge HyperLogLogPlusPlus(HLLPP) sketches in the same group.
+     * Input is a struct column with multiple long columns which is consistent
+     * with Spark.
+     */
+    GroupByMerge(3);
+
+    final int nativeId;
+
+    AggregationType(int nativeId) {
+      this.nativeId = nativeId;
+    }
+  }
+
+  /**
+   * Create a HyperLogLogPlusPlus(HLLPP) host UDF.
+   */
+  public static long createHLLPPHostUDF(AggregationType type, int precision) {
+    return createHLLPPHostUDF(type.nativeId, precision);
+  }
+
+  /**
+   * Compute the approximate count of distinct values from sketch values.
+   *
+   * The input must contain sketch values in the format
+   * `Struct<INT64, INT64, ...>`, where num_registers_per_sketch = 2^precision
+   * and the number of children of this Struct is
+   * num_registers_per_sketch / 10 + 1. Here 10 is the number of register
+   * values packed into one INT64, each register value taking 6 bits.
+   * A register value is the number of leading zero bits in the xxhash64 hash
+   * code; the hash code is 64 bits, and with precision >= 4 this count never
+   * exceeds 61, so 6 bits are enough to hold it.
+   *
+   * @param input The sketch column which contains Struct<INT64, INT64, ...>
+   *              values.
+   * @param precision The number of bits used for HLLPP register addressing.
+   * @return An INT64 column where each value is the approximate count of
+   *         distinct values.
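+   *
+   * A hypothetical usage sketch (`sketches` stands in for a
+   * Struct<INT64, ...> sketch column produced by the GroupByMerge
+   * aggregation with precision 9):
+   *
+   * <pre>
+   *   try (ColumnVector distinct =
+   *       HLLPPHostUDF.estimateDistinctValueFromSketches(sketches, 9)) {
+   *     // one INT64 approximate distinct count per input row
+   *   }
+   * </pre>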
+ */ + public static ColumnVector estimateDistinctValueFromSketches(ColumnView input, int precision) { + return new ColumnVector(estimateDistinctValueFromSketches(input.getNativeView(), precision)); + } + + private static native long createHLLPPHostUDF(int type, int precision); + + private static native long estimateDistinctValueFromSketches(long inputHandle, int precision); + +} From f8c6a02eecc3357fd6e3a784218ee38e1761b86a Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Tue, 17 Dec 2024 21:04:54 +0800 Subject: [PATCH 07/12] Use UDF --- src/main/cpp/src/HLLPP.cu | 102 --- src/main/cpp/src/HLLPP.hpp | 32 - src/main/cpp/src/HLLPPHostUDFJni.cpp | 37 +- src/main/cpp/src/HLLPPJni.cpp | 34 - src/main/cpp/src/hllpp.cu | 723 +++++++++--------- src/main/cpp/src/hllpp.hpp | 42 +- src/main/cpp/src/hllpp_host_udf.cu | 140 ++-- src/main/cpp/src/hllpp_host_udf.hpp | 14 +- .../com/nvidia/spark/rapids/jni/HLLPP.java | 48 -- .../nvidia/spark/rapids/jni/HLLPPTest.java | 37 - 10 files changed, 453 insertions(+), 756 deletions(-) delete mode 100644 src/main/cpp/src/HLLPP.cu delete mode 100644 src/main/cpp/src/HLLPP.hpp delete mode 100644 src/main/cpp/src/HLLPPJni.cpp delete mode 100644 src/main/java/com/nvidia/spark/rapids/jni/HLLPP.java delete mode 100644 src/test/java/com/nvidia/spark/rapids/jni/HLLPPTest.java diff --git a/src/main/cpp/src/HLLPP.cu b/src/main/cpp/src/HLLPP.cu deleted file mode 100644 index d2d9493cf7..0000000000 --- a/src/main/cpp/src/HLLPP.cu +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "HLLPP.hpp" - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include - -namespace spark_rapids_jni { - -namespace { - -// The number of bits required by register value. Register value stores num of zeros. -// XXHash64 value is 64 bits, it's safe to use 6 bits to store a register value. 
-constexpr int REGISTER_VALUE_BITS = 6; - -// MASK binary 6 bits: 111111 -constexpr uint64_t MASK = (1L << REGISTER_VALUE_BITS) - 1L; - -// One long stores 10 register values -constexpr int REGISTERS_PER_LONG = 64 / REGISTER_VALUE_BITS; - -__device__ inline int get_register_value(int64_t const long_10_registers, int reg_idx) -{ - int64_t shift_mask = MASK << (REGISTER_VALUE_BITS * reg_idx); - int64_t v = (long_10_registers & shift_mask) >> (REGISTER_VALUE_BITS * reg_idx); - return static_cast(v); -} - -struct estimate_fn { - cudf::device_span sketch_longs; - int const precision; - int64_t* const out; - - __device__ void operator()(cudf::size_type const idx) const - { - auto const num_regs = 1ull << precision; - double sum = 0; - int zeroes = 0; - - for (auto reg_idx = 0; reg_idx < num_regs; ++reg_idx) { - // each long contains 10 register values - int long_col_idx = reg_idx / REGISTERS_PER_LONG; - int reg_idx_in_long = reg_idx % REGISTERS_PER_LONG; - int reg = get_register_value(sketch_longs[long_col_idx][idx], reg_idx_in_long); - sum += double{1} / static_cast(1ull << reg); - zeroes += reg == 0; - } - - auto const finalize = cuco::hyperloglog_ns::detail::finalizer(precision); - out[idx] = finalize(sum, zeroes); - } -}; - -} // end anonymous namespace - -std::unique_ptr estimate_from_hll_sketches(cudf::column_view const& input, - int precision, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision is bigger than 4."); - auto const input_iter = cudf::detail::make_counting_transform_iterator( - 0, [&](int i) { return input.child(i).begin(); }); - auto input_cols = std::vector(input_iter, input_iter + input.num_children()); - auto d_inputs = cudf::detail::make_device_uvector_async(input_cols, stream, mr); - auto result = cudf::make_numeric_column( - cudf::data_type{cudf::type_id::INT64}, input.size(), cudf::mask_state::ALL_VALID, stream); - // evaluate from struct - thrust::for_each_n(rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - input.size(), - estimate_fn{d_inputs, precision, result->mutable_view().data()}); - return result; -} - -} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/HLLPP.hpp b/src/main/cpp/src/HLLPP.hpp deleted file mode 100644 index 69e0b237e5..0000000000 --- a/src/main/cpp/src/HLLPP.hpp +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include - -#include -#include - -namespace spark_rapids_jni { - -std::unique_ptr estimate_from_hll_sketches( - cudf::column_view const& input, - int precision, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); - -} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/HLLPPHostUDFJni.cpp b/src/main/cpp/src/HLLPPHostUDFJni.cpp index 3132d088ac..a80a78c6b8 100644 --- a/src/main/cpp/src/HLLPPHostUDFJni.cpp +++ b/src/main/cpp/src/HLLPPHostUDFJni.cpp @@ -20,28 +20,22 @@ extern "C" { -JNIEXPORT jlong JNICALL -Java_com_nvidia_spark_rapids_jni_HLLPPHostUDF_createHLLPPHostUDF( - JNIEnv *env, jclass, jint agg_type, int precision) { +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_HLLPPHostUDF_createHLLPPHostUDF( + JNIEnv* env, jclass, jint agg_type, int precision) +{ try { cudf::jni::auto_set_device(env); auto udf_ptr = [&] { // The value of agg_type must be sync with // `HLLPPHostUDF.java#AggregationType`. switch (agg_type) { - case 0: - return spark_rapids_jni::create_hllpp_reduction_host_udf(precision); - case 1: - return spark_rapids_jni::create_hllpp_reduction_merge_host_udf( - precision); - case 2: - return spark_rapids_jni::create_hllpp_groupby_host_udf(precision); - default: - return spark_rapids_jni::create_hllpp_groupby_merge_host_udf(precision); + case 0: return spark_rapids_jni::create_hllpp_reduction_host_udf(precision); + case 1: return spark_rapids_jni::create_hllpp_reduction_merge_host_udf(precision); + case 2: return spark_rapids_jni::create_hllpp_groupby_host_udf(precision); + default: return spark_rapids_jni::create_hllpp_groupby_merge_host_udf(precision); } }(); - CUDF_EXPECTS(udf_ptr != nullptr, - "Invalid HyperLogLogPlusPlus(HLLPP) UDF instance."); + CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HyperLogLogPlusPlus(HLLPP) UDF instance."); return reinterpret_cast(udf_ptr.release()); } @@ -49,18 +43,19 @@ Java_com_nvidia_spark_rapids_jni_HLLPPHostUDF_createHLLPPHostUDF( } JNIEXPORT jlong JNICALL -Java_com_nvidia_spark_rapids_jni_HLLPPHostUDF_estimateDistinctValueFromSketches( - JNIEnv *env, jclass, jlong sketches, jint precision) { +Java_com_nvidia_spark_rapids_jni_HLLPPHostUDF_estimateDistinctValueFromSketches(JNIEnv* env, + jclass, + jlong sketches, + jint precision) +{ JNI_NULL_CHECK(env, sketches, "Sketch column is null", 0); try { cudf::jni::auto_set_device(env); - auto const sketch_view = - reinterpret_cast(sketches); + auto const sketch_view = reinterpret_cast(sketches); return cudf::jni::ptr_as_jlong( - spark_rapids_jni::estimate_from_hll_sketches(*sketch_view, precision) - .release()); + spark_rapids_jni::estimate_from_hll_sketches(*sketch_view, precision).release()); } CATCH_STD(env, 0); } -} // extern "C" +} // extern "C" diff --git a/src/main/cpp/src/HLLPPJni.cpp b/src/main/cpp/src/HLLPPJni.cpp deleted file mode 100644 index 581af90a90..0000000000 --- a/src/main/cpp/src/HLLPPJni.cpp +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "HLLPP.hpp" -#include "cudf_jni_apis.hpp" - -extern "C" { - -JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_HLLPP_estimateDistinctValueFromSketches( - JNIEnv* env, jclass, jlong sketches, jint precision) -{ - JNI_NULL_CHECK(env, sketches, "Sketch column is null", 0); - try { - cudf::jni::auto_set_device(env); - auto const sketch_view = reinterpret_cast(sketches); - return cudf::jni::ptr_as_jlong( - spark_rapids_jni::estimate_from_hll_sketches(*sketch_view, precision).release()); - } - CATCH_STD(env, 0); -} -} diff --git a/src/main/cpp/src/hllpp.cu b/src/main/cpp/src/hllpp.cu index 08f452ad76..8d39c66865 100644 --- a/src/main/cpp/src/hllpp.cu +++ b/src/main/cpp/src/hllpp.cu @@ -37,7 +37,7 @@ #include #include -#include // TODO #include once available +#include // TODO #include once available #include #include #include @@ -53,10 +53,10 @@ namespace { * @brief Get register value from a long which contains 10 register values, * each register value in long is 6 bits. */ -__device__ inline int get_register_value(int64_t const ten_registers, - int reg_idx) { +__device__ inline int get_register_value(int64_t const ten_registers, int reg_idx) +{ int64_t shift_mask = MASK << (REGISTER_VALUE_BITS * reg_idx); - int64_t v = (ten_registers & shift_mask) >> (REGISTER_VALUE_BITS * reg_idx); + int64_t v = (ten_registers & shift_mask) >> (REGISTER_VALUE_BITS * reg_idx); return static_cast(v); } @@ -148,21 +148,17 @@ __device__ inline int get_register_value(int64_t const ten_registers, */ template CUDF_KERNEL void partial_group_sketches_from_hashs_kernel( - cudf::column_device_view hashs, - cudf::device_span group_lables, - int64_t const precision, // num of bits for register addressing, e.g.: 9 - int *const - registers_output_cache, // num is num_groups * num_registers_per_sketch - int *const - registers_thread_cache, // num is num_threads * num_registers_per_sketch - cudf::size_type *const - group_lables_thread_cache // save the group lables for each thread -) { - auto const tid = cudf::detail::grid_1d::global_thread_id(); + cudf::column_device_view hashs, + cudf::device_span group_lables, + int64_t const precision, // num of bits for register addressing, e.g.: 9 + int* const registers_output_cache, // num is num_groups * num_registers_per_sketch + int* const registers_thread_cache, // num is num_threads * num_registers_per_sketch + cudf::size_type* const group_lables_thread_cache // save the group lables for each thread +) +{ + auto const tid = cudf::detail::grid_1d::global_thread_id(); int64_t const num_hashs = hashs.size(); - if (tid * num_hashs_per_thread >= hashs.size()) { - return; - } + if (tid * num_hashs_per_thread >= hashs.size()) { return; } // 2^precision = num_registers_per_sketch int64_t num_registers_per_sketch = 1L << precision; @@ -172,12 +168,10 @@ CUDF_KERNEL void partial_group_sketches_from_hashs_kernel( int const idx_shift = 64 - precision; auto const hash_first = tid * num_hashs_per_thread; - auto const hash_end = - cuda::std::min((tid + 1) * num_hashs_per_thread, num_hashs); + auto const hash_end = cuda::std::min((tid + 1) * num_hashs_per_thread, num_hashs); // init sketches for each thread - int *const sketch_ptr = - registers_thread_cache + tid * num_registers_per_sketch; + int* const sketch_ptr = registers_thread_cache + tid * num_registers_per_sketch; for (auto i = 0; i < num_registers_per_sketch; i++) { sketch_ptr[i] = 0; } @@ -187,23 +181,19 @@ 
CUDF_KERNEL void partial_group_sketches_from_hashs_kernel( cudf::size_type curr_group = group_lables[hash_idx]; // cast to unsigned, then >> will shift without preserve the sign bit. - uint64_t const hash = - static_cast(hashs.element(hash_idx)); - auto const reg_idx = hash >> idx_shift; - int const reg_v = static_cast( - cuda::std::countl_zero((hash << precision) | w_padding) + 1ULL); + uint64_t const hash = static_cast(hashs.element(hash_idx)); + auto const reg_idx = hash >> idx_shift; + int const reg_v = + static_cast(cuda::std::countl_zero((hash << precision) | w_padding) + 1ULL); if (curr_group == prev_group) { // still in the same group, update the max value - if (reg_v > sketch_ptr[reg_idx]) { - sketch_ptr[reg_idx] = reg_v; - } + if (reg_v > sketch_ptr[reg_idx]) { sketch_ptr[reg_idx] = reg_v; } } else { // meets new group, save output for the previous group and reset for (auto i = 0; i < num_registers_per_sketch; i++) { - registers_output_cache[prev_group * num_registers_per_sketch + i] = - sketch_ptr[i]; - sketch_ptr[i] = 0; + registers_output_cache[prev_group * num_registers_per_sketch + i] = sketch_ptr[i]; + sketch_ptr[i] = 0; } // save the result for current group sketch_ptr[reg_idx] = reg_v; @@ -214,16 +204,14 @@ CUDF_KERNEL void partial_group_sketches_from_hashs_kernel( if (hash_idx == num_hashs - 1) { // meets the last segment, special logic: assume meets new group for (auto i = 0; i < num_registers_per_sketch; i++) { - registers_output_cache[curr_group * num_registers_per_sketch + i] = - sketch_ptr[i]; + registers_output_cache[curr_group * num_registers_per_sketch + i] = sketch_ptr[i]; } } else { // not the last segment, probe one item forward. if (curr_group != group_lables[hash_idx + 1]) { // meets a new group by checking the next item in the next segment for (auto i = 0; i < num_registers_per_sketch; i++) { - registers_output_cache[curr_group * num_registers_per_sketch + i] = - sketch_ptr[i]; + registers_output_cache[curr_group * num_registers_per_sketch + i] = sketch_ptr[i]; } } } @@ -280,29 +268,29 @@ CUDF_KERNEL void partial_group_sketches_from_hashs_kernel( * max value in the same group, and then update to registers_output_cache */ template -CUDF_KERNEL void merge_sketches_vertically( - int64_t num_sketches, int64_t num_registers_per_sketch, - int *const registers_output_cache, int const *const registers_thread_cache, - cudf::size_type const *const group_lables_thread_cache) { +CUDF_KERNEL void merge_sketches_vertically(int64_t num_sketches, + int64_t num_registers_per_sketch, + int* const registers_output_cache, + int const* const registers_thread_cache, + cudf::size_type const* const group_lables_thread_cache) +{ __shared__ int8_t shared_data[block_size]; auto const tid = cudf::detail::grid_1d::global_thread_id(); int shared_idx = tid % block_size; // register idx is tid shared_data[shared_idx] = static_cast(0); - int prev_group = group_lables_thread_cache[0]; + int prev_group = group_lables_thread_cache[0]; for (auto i = 0; i < num_sketches; i++) { int curr_group = group_lables_thread_cache[i]; - int8_t curr_reg_v = static_cast( - registers_thread_cache[i * num_registers_per_sketch + tid]); + int8_t curr_reg_v = + static_cast(registers_thread_cache[i * num_registers_per_sketch + tid]); if (curr_group == prev_group) { - if (curr_reg_v > shared_data[shared_idx]) { - shared_data[shared_idx] = curr_reg_v; - } + if (curr_reg_v > shared_data[shared_idx]) { shared_data[shared_idx] = curr_reg_v; } } else { // meets a new group, store the result for previous group int64_t 
result_reg_idx = prev_group * num_registers_per_sketch + tid; - int result_curr_reg_v = registers_output_cache[result_reg_idx]; + int result_curr_reg_v = registers_output_cache[result_reg_idx]; if (shared_data[shared_idx] > result_curr_reg_v) { registers_output_cache[result_reg_idx] = shared_data[shared_idx]; } @@ -314,7 +302,7 @@ CUDF_KERNEL void merge_sketches_vertically( // handles the last register in this thread int64_t reg_idx = prev_group * num_registers_per_sketch + tid; - int curr_reg_v = registers_output_cache[reg_idx]; + int curr_reg_v = registers_output_cache[reg_idx]; if (shared_data[shared_idx] > curr_reg_v) { registers_output_cache[reg_idx] = shared_data[shared_idx]; } @@ -346,116 +334,114 @@ CUDF_KERNEL void merge_sketches_vertically( * 6 bits: 100-001 Compact to one long is: * 100001-100001-100001-100001-100001-100001-100001-100001-100001-100001 */ -CUDF_KERNEL void -compact_kernel(int64_t const num_groups, int64_t const num_registers_per_sketch, - cudf::device_span sketches_output, - // num_groups * num_registers_per_sketch integers - cudf::device_span registers_output_cache) { - int64_t const tid = cudf::detail::grid_1d::global_thread_id(); - int64_t const num_long_cols = - num_registers_per_sketch / REGISTERS_PER_LONG + 1; - if (tid >= num_groups * num_long_cols) { - return; - } +CUDF_KERNEL void compact_kernel(int64_t const num_groups, + int64_t const num_registers_per_sketch, + cudf::device_span sketches_output, + // num_groups * num_registers_per_sketch integers + cudf::device_span registers_output_cache) +{ + int64_t const tid = cudf::detail::grid_1d::global_thread_id(); + int64_t const num_long_cols = num_registers_per_sketch / REGISTERS_PER_LONG + 1; + if (tid >= num_groups * num_long_cols) { return; } int64_t const group_idx = tid / num_long_cols; - int64_t const long_idx = tid % num_long_cols; + int64_t const long_idx = tid % num_long_cols; int64_t const reg_begin_idx = - group_idx * num_registers_per_sketch + long_idx * REGISTERS_PER_LONG; + group_idx * num_registers_per_sketch + long_idx * REGISTERS_PER_LONG; int64_t num_regs = REGISTERS_PER_LONG; - if (long_idx == num_long_cols - 1) { - num_regs = num_registers_per_sketch % REGISTERS_PER_LONG; - } + if (long_idx == num_long_cols - 1) { num_regs = num_registers_per_sketch % REGISTERS_PER_LONG; } int64_t ten_registers = 0; for (auto i = 0; i < num_regs; i++) { int64_t reg_v = registers_output_cache[reg_begin_idx + i]; - int64_t tmp = reg_v << (REGISTER_VALUE_BITS * i); + int64_t tmp = reg_v << (REGISTER_VALUE_BITS * i); ten_registers |= tmp; } sketches_output[long_idx][group_idx] = ten_registers; } -std::unique_ptr -group_hllpp(cudf::column_view const &input, int64_t const num_groups, - cudf::device_span group_lables, - int64_t const precision, rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) { - int64_t num_registers_per_sketch = 1 << precision; - constexpr int64_t block_size = 256; - constexpr int num_hashs_per_thread = 256; // handles 256 items per thread +std::unique_ptr group_hllpp(cudf::column_view const& input, + int64_t const num_groups, + cudf::device_span group_lables, + int64_t const precision, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + int64_t num_registers_per_sketch = 1 << precision; + constexpr int64_t block_size = 256; + constexpr int num_hashs_per_thread = 256; // handles 256 items per thread int64_t num_threads_partial_kernel = - cudf::util::div_rounding_up_safe(input.size(), num_hashs_per_thread); + 
cudf::util::div_rounding_up_safe(input.size(), num_hashs_per_thread); - auto sketches_output = rmm::device_uvector( - num_groups * num_registers_per_sketch, stream, mr); + auto sketches_output = + rmm::device_uvector(num_groups * num_registers_per_sketch, stream, mr); - { // add this block to release `registers_thread_cache` and + { // add this block to release `registers_thread_cache` and // `group_lables_thread_cache` auto registers_thread_cache = rmm::device_uvector( - num_threads_partial_kernel * num_registers_per_sketch, stream, mr); + num_threads_partial_kernel * num_registers_per_sketch, stream, mr); auto group_lables_thread_cache = - rmm::device_uvector(num_threads_partial_kernel, stream, mr); + rmm::device_uvector(num_threads_partial_kernel, stream, mr); - { // add this block to release `hash_col` + { // add this block to release `hash_col` // 1. compute all the hashs auto input_table_view = cudf::table_view{{input}}; - auto hash_col = xxhash64(input_table_view, SEED, stream, mr); - auto d_hashs = cudf::column_device_view::create(hash_col->view(), stream); + auto hash_col = xxhash64(input_table_view, SEED, stream, mr); + auto d_hashs = cudf::column_device_view::create(hash_col->view(), stream); // 2. execute partial group by - int64_t num_blocks_p1 = cudf::util::div_rounding_up_safe( - num_threads_partial_kernel, block_size); + int64_t num_blocks_p1 = + cudf::util::div_rounding_up_safe(num_threads_partial_kernel, block_size); partial_group_sketches_from_hashs_kernel - <<>>( - *d_hashs, group_lables, precision, sketches_output.begin(), - registers_thread_cache.begin(), - group_lables_thread_cache.begin()); + <<>>(*d_hashs, + group_lables, + precision, + sketches_output.begin(), + registers_thread_cache.begin(), + group_lables_thread_cache.begin()); } // 3. merge the intermidate result auto num_merge_threads = num_registers_per_sketch; - auto num_merge_blocks = - cudf::util::div_rounding_up_safe(num_merge_threads, block_size); + auto num_merge_blocks = cudf::util::div_rounding_up_safe(num_merge_threads, block_size); merge_sketches_vertically - <<>>( - num_threads_partial_kernel, // num_sketches - num_registers_per_sketch, sketches_output.begin(), - registers_thread_cache.begin(), group_lables_thread_cache.begin()); + <<>>( + num_threads_partial_kernel, // num_sketches + num_registers_per_sketch, + sketches_output.begin(), + registers_thread_cache.begin(), + group_lables_thread_cache.begin()); } // 4. 
create output columns - auto num_long_cols = num_registers_per_sketch / REGISTERS_PER_LONG + 1; - auto const results_iter = - cudf::detail::make_counting_transform_iterator(0, [&](int i) { - return cudf::make_numeric_column( - cudf::data_type{cudf::type_id::INT64}, num_groups, - cudf::mask_state::ALL_VALID, stream, mr); - }); - auto children = std::vector>( - results_iter, results_iter + num_long_cols); + auto num_long_cols = num_registers_per_sketch / REGISTERS_PER_LONG + 1; + auto const results_iter = cudf::detail::make_counting_transform_iterator(0, [&](int i) { + return cudf::make_numeric_column( + cudf::data_type{cudf::type_id::INT64}, num_groups, cudf::mask_state::ALL_VALID, stream, mr); + }); + auto children = + std::vector>(results_iter, results_iter + num_long_cols); auto d_results = [&] { - auto host_results_pointer_iter = thrust::make_transform_iterator( - children.begin(), [](auto const &results_column) { - return results_column->mutable_view().template data(); - }); - auto host_results_pointers = std::vector( - host_results_pointer_iter, host_results_pointer_iter + children.size()); - return cudf::detail::make_device_uvector_async(host_results_pointers, - stream, mr); + auto host_results_pointer_iter = + thrust::make_transform_iterator(children.begin(), [](auto const& results_column) { + return results_column->mutable_view().template data(); + }); + auto host_results_pointers = + std::vector(host_results_pointer_iter, host_results_pointer_iter + children.size()); + return cudf::detail::make_device_uvector_async(host_results_pointers, stream, mr); }(); - auto result = cudf::make_structs_column(num_groups, std::move(children), - 0, // null count - rmm::device_buffer{}, // null mask + auto result = cudf::make_structs_column(num_groups, + std::move(children), + 0, // null count + rmm::device_buffer{}, // null mask stream); // 5. 
compact sketches auto num_phase3_threads = num_groups * num_long_cols; - auto num_phase3_blocks = - cudf::util::div_rounding_up_safe(num_phase3_threads, block_size); + auto num_phase3_blocks = cudf::util::div_rounding_up_safe(num_phase3_threads, block_size); compact_kernel<<>>( - num_groups, num_registers_per_sketch, d_results, sketches_output); + num_groups, num_registers_per_sketch, d_results, sketches_output); return result; } @@ -485,55 +471,49 @@ group_hllpp(cudf::column_view const &input, int64_t const num_groups, */ template CUDF_KERNEL void partial_group_long_sketches_kernel( - cudf::device_span sketches_input, - int64_t const num_sketches_input, int64_t const num_threads_per_col, - int64_t const num_registers_per_sketch, int64_t const num_groups, - cudf::device_span group_lables, - // num_groups * num_registers_per_sketch integers - int *const registers_output_cache, - // num_threads * num_registers_per_sketch integers - int *const registers_thread_cache, - // num_threads integers - cudf::size_type *const group_lables_thread_cache) { - auto const tid = cudf::detail::grid_1d::global_thread_id(); + cudf::device_span sketches_input, + int64_t const num_sketches_input, + int64_t const num_threads_per_col, + int64_t const num_registers_per_sketch, + int64_t const num_groups, + cudf::device_span group_lables, + // num_groups * num_registers_per_sketch integers + int* const registers_output_cache, + // num_threads * num_registers_per_sketch integers + int* const registers_thread_cache, + // num_threads integers + cudf::size_type* const group_lables_thread_cache) +{ + auto const tid = cudf::detail::grid_1d::global_thread_id(); auto const num_long_cols = sketches_input.size(); - if (tid >= num_threads_per_col * num_long_cols) { - return; - } + if (tid >= num_threads_per_col * num_long_cols) { return; } - auto const long_idx = tid / num_threads_per_col; - auto const thread_idx_in_cols = tid % num_threads_per_col; - int64_t const *const longs_ptr = sketches_input[long_idx]; + auto const long_idx = tid / num_threads_per_col; + auto const thread_idx_in_cols = tid % num_threads_per_col; + int64_t const* const longs_ptr = sketches_input[long_idx]; - int *const registers_thread_ptr = - registers_thread_cache + thread_idx_in_cols * num_registers_per_sketch; + int* const registers_thread_ptr = + registers_thread_cache + thread_idx_in_cols * num_registers_per_sketch; auto const sketch_first = thread_idx_in_cols * num_longs_per_threads; - auto const sketch_end = - cuda::std::min(sketch_first + num_longs_per_threads, num_sketches_input); + auto const sketch_end = cuda::std::min(sketch_first + num_longs_per_threads, num_sketches_input); int num_regs = REGISTERS_PER_LONG; - if (long_idx == num_long_cols - 1) { - num_regs = num_registers_per_sketch % REGISTERS_PER_LONG; - } + if (long_idx == num_long_cols - 1) { num_regs = num_registers_per_sketch % REGISTERS_PER_LONG; } for (auto i = 0; i < num_regs; i++) { cudf::size_type prev_group = group_lables[sketch_first]; - int max_reg_v = 0; - int reg_idx_in_sketch = long_idx * REGISTERS_PER_LONG + i; - for (auto sketch_idx = sketch_first; sketch_idx < sketch_end; - sketch_idx++) { + int max_reg_v = 0; + int reg_idx_in_sketch = long_idx * REGISTERS_PER_LONG + i; + for (auto sketch_idx = sketch_first; sketch_idx < sketch_end; sketch_idx++) { cudf::size_type curr_group = group_lables[sketch_idx]; - int curr_reg_v = get_register_value(longs_ptr[sketch_idx], i); + int curr_reg_v = get_register_value(longs_ptr[sketch_idx], i); if (curr_group == prev_group) { // 
still in the same group, update the max value - if (curr_reg_v > max_reg_v) { - max_reg_v = curr_reg_v; - } + if (curr_reg_v > max_reg_v) { max_reg_v = curr_reg_v; } } else { // meets new group, save output for the previous group - int64_t output_idx_prev = - num_registers_per_sketch * prev_group + reg_idx_in_sketch; + int64_t output_idx_prev = num_registers_per_sketch * prev_group + reg_idx_in_sketch; registers_output_cache[output_idx_prev] = max_reg_v; // reset @@ -542,17 +522,16 @@ CUDF_KERNEL void partial_group_long_sketches_kernel( if (sketch_idx == sketch_end - 1) { // last item in the segment - int64_t output_idx_curr = - num_registers_per_sketch * curr_group + reg_idx_in_sketch; + int64_t output_idx_curr = num_registers_per_sketch * curr_group + reg_idx_in_sketch; if (sketch_idx == num_sketches_input - 1) { // last segment registers_output_cache[output_idx_curr] = max_reg_v; - max_reg_v = curr_reg_v; + max_reg_v = curr_reg_v; } else { if (curr_group != group_lables[sketch_idx + 1]) { // look the first item in the next segment registers_output_cache[output_idx_curr] = max_reg_v; - max_reg_v = curr_reg_v; + max_reg_v = curr_reg_v; } } } @@ -565,8 +544,7 @@ CUDF_KERNEL void partial_group_long_sketches_kernel( } if (long_idx == 0) { - group_lables_thread_cache[thread_idx_in_cols] = - group_lables[sketch_end - 1]; + group_lables_thread_cache[thread_idx_in_cols] = group_lables[sketch_end - 1]; } } @@ -575,88 +553,83 @@ CUDF_KERNEL void partial_group_long_sketches_kernel( * register values. Merge all rows in the same group. */ std::unique_ptr group_merge_hllpp( - cudf::column_view const &hll_input, // struct column - int64_t const num_groups, - cudf::device_span group_lables, - int64_t const precision, rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) { - int64_t num_registers_per_sketch = 1 << precision; - int64_t const num_sketches = hll_input.size(); - int64_t const num_long_cols = - num_registers_per_sketch / REGISTERS_PER_LONG + 1; + cudf::column_view const& hll_input, // struct column + int64_t const num_groups, + cudf::device_span group_lables, + int64_t const precision, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + int64_t num_registers_per_sketch = 1 << precision; + int64_t const num_sketches = hll_input.size(); + int64_t const num_long_cols = num_registers_per_sketch / REGISTERS_PER_LONG + 1; constexpr int64_t num_longs_per_threads = 256; - constexpr int64_t block_size = 256; + constexpr int64_t block_size = 256; int64_t num_threads_per_col_phase1 = - cudf::util::div_rounding_up_safe(num_sketches, num_longs_per_threads); + cudf::util::div_rounding_up_safe(num_sketches, num_longs_per_threads); int64_t num_threads_phase1 = num_threads_per_col_phase1 * num_long_cols; - int64_t num_blocks = - cudf::util::div_rounding_up_safe(num_threads_phase1, block_size); - auto registers_output_cache = rmm::device_uvector( - num_registers_per_sketch * num_groups, stream, mr); + int64_t num_blocks = cudf::util::div_rounding_up_safe(num_threads_phase1, block_size); + auto registers_output_cache = + rmm::device_uvector(num_registers_per_sketch * num_groups, stream, mr); { - auto registers_thread_cache = rmm::device_uvector( - num_registers_per_sketch * num_threads_phase1, stream, mr); + auto registers_thread_cache = + rmm::device_uvector(num_registers_per_sketch * num_threads_phase1, stream, mr); auto group_lables_thread_cache = - rmm::device_uvector(num_threads_per_col_phase1, stream, mr); + rmm::device_uvector(num_threads_per_col_phase1, stream, mr); 
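+    // Flatten the struct column into a device array of per-child INT64
+    // pointers so the kernels below can read the packed register longs as
+    // d_inputs[long_idx][sketch_idx].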
cudf::structs_column_view scv(hll_input); - auto const input_iter = - cudf::detail::make_counting_transform_iterator(0, [&](int i) { - return scv.get_sliced_child(i, stream).begin(); - }); - auto input_cols = - std::vector(input_iter, input_iter + num_long_cols); - auto d_inputs = - cudf::detail::make_device_uvector_async(input_cols, stream, mr); + auto const input_iter = cudf::detail::make_counting_transform_iterator( + 0, [&](int i) { return scv.get_sliced_child(i, stream).begin(); }); + auto input_cols = std::vector(input_iter, input_iter + num_long_cols); + auto d_inputs = cudf::detail::make_device_uvector_async(input_cols, stream, mr); // 1st kernel: partially group partial_group_long_sketches_kernel - <<>>( - d_inputs, num_sketches, num_threads_per_col_phase1, - num_registers_per_sketch, num_groups, group_lables, - registers_output_cache.begin(), registers_thread_cache.begin(), - group_lables_thread_cache.begin()); + <<>>(d_inputs, + num_sketches, + num_threads_per_col_phase1, + num_registers_per_sketch, + num_groups, + group_lables, + registers_output_cache.begin(), + registers_thread_cache.begin(), + group_lables_thread_cache.begin()); auto const num_phase2_threads = num_registers_per_sketch; - auto const num_phase2_blocks = - cudf::util::div_rounding_up_safe(num_phase2_threads, block_size); + auto const num_phase2_blocks = cudf::util::div_rounding_up_safe(num_phase2_threads, block_size); // 2nd kernel: vertical merge merge_sketches_vertically - <<>>( - num_threads_per_col_phase1, // num_sketches - num_registers_per_sketch, registers_output_cache.begin(), - registers_thread_cache.begin(), group_lables_thread_cache.begin()); + <<>>( + num_threads_per_col_phase1, // num_sketches + num_registers_per_sketch, + registers_output_cache.begin(), + registers_thread_cache.begin(), + group_lables_thread_cache.begin()); } // create output columns - auto const results_iter = - cudf::detail::make_counting_transform_iterator(0, [&](int i) { - return cudf::make_numeric_column( - cudf::data_type{cudf::type_id::INT64}, num_groups, - cudf::mask_state::ALL_VALID, stream, mr); - }); - auto results = std::vector>( - results_iter, results_iter + num_long_cols); + auto const results_iter = cudf::detail::make_counting_transform_iterator(0, [&](int i) { + return cudf::make_numeric_column( + cudf::data_type{cudf::type_id::INT64}, num_groups, cudf::mask_state::ALL_VALID, stream, mr); + }); + auto results = + std::vector>(results_iter, results_iter + num_long_cols); auto d_sketches_output = [&] { - auto host_results_pointer_iter = thrust::make_transform_iterator( - results.begin(), [](auto const &results_column) { - return results_column->mutable_view().template data(); - }); - auto host_results_pointers = std::vector( - host_results_pointer_iter, host_results_pointer_iter + results.size()); - return cudf::detail::make_device_uvector_async(host_results_pointers, - stream, mr); + auto host_results_pointer_iter = + thrust::make_transform_iterator(results.begin(), [](auto const& results_column) { + return results_column->mutable_view().template data(); + }); + auto host_results_pointers = + std::vector(host_results_pointer_iter, host_results_pointer_iter + results.size()); + return cudf::detail::make_device_uvector_async(host_results_pointers, stream, mr); }(); // 3rd kernel: compact auto num_phase3_threads = num_groups * num_long_cols; - auto num_phase3_blocks = - cudf::util::div_rounding_up_safe(num_phase3_threads, block_size); + auto num_phase3_blocks = cudf::util::div_rounding_up_safe(num_phase3_threads, 
block_size); compact_kernel<<>>( - num_groups, num_registers_per_sketch, d_sketches_output, - registers_output_cache); + num_groups, num_registers_per_sketch, d_sketches_output, registers_output_cache); - return make_structs_column(num_groups, std::move(results), 0, - rmm::device_buffer{}); + return make_structs_column(num_groups, std::move(results), 0, rmm::device_buffer{}); } /** @@ -666,15 +639,16 @@ std::unique_ptr group_merge_hllpp( */ template CUDF_KERNEL void reduce_hllpp_kernel(cudf::column_device_view hashs, - cudf::device_span output, - int precision) { + cudf::device_span output, + int precision) +{ __shared__ int32_t shared_data[block_size]; - auto const tid = cudf::detail::grid_1d::global_thread_id(); - auto const num_hashs = hashs.size(); + auto const tid = cudf::detail::grid_1d::global_thread_id(); + auto const num_hashs = hashs.size(); uint64_t const num_registers_per_sketch = 1L << precision; - int const idx_shift = 64 - precision; - uint64_t const w_padding = 1ULL << (precision - 1); + int const idx_shift = 64 - precision; + uint64_t const w_padding = 1ULL << (precision - 1); // init tmp data for (int i = tid; i < num_registers_per_sketch; i += block_size) { @@ -689,10 +663,9 @@ CUDF_KERNEL void reduce_hllpp_kernel(cudf::column_device_view hashs, // shift uint64_t const reg_idx = hash >> idx_shift; // get the leading zeros - int const reg_v = static_cast( - cuda::std::countl_zero((hash << precision) | w_padding) + 1ULL); - cuda::atomic_ref register_ref( - shared_data[reg_idx]); + int const reg_v = + static_cast(cuda::std::countl_zero((hash << precision) | w_padding) + 1ULL); + cuda::atomic_ref register_ref(shared_data[reg_idx]); register_ref.fetch_max(reg_v, cuda::memory_order_relaxed); } __syncthreads(); @@ -702,14 +675,12 @@ CUDF_KERNEL void reduce_hllpp_kernel(cudf::column_device_view hashs, // 7 to highest are all 0. if (tid * REGISTERS_PER_LONG < num_registers_per_sketch) { int start = tid * REGISTERS_PER_LONG; - int end = (tid + 1) * REGISTERS_PER_LONG; - if (end > num_registers_per_sketch) { - end = num_registers_per_sketch; - } + int end = (tid + 1) * REGISTERS_PER_LONG; + if (end > num_registers_per_sketch) { end = num_registers_per_sketch; } int64_t ret = 0; for (int i = 0; i < end - start; i++) { - int shift = i * REGISTER_VALUE_BITS; + int shift = i * REGISTER_VALUE_BITS; int64_t reg = shared_data[start + i]; ret |= (reg << shift); } @@ -718,35 +689,36 @@ CUDF_KERNEL void reduce_hllpp_kernel(cudf::column_device_view hashs, } } -std::unique_ptr reduce_hllpp(cudf::column_view const &input, +std::unique_ptr reduce_hllpp(cudf::column_view const& input, int64_t const precision, rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) { + rmm::device_async_resource_ref mr) +{ int64_t num_registers_per_sketch = 1L << precision; // 1. compute all the hashs auto input_table_view = cudf::table_view{{input}}; - auto hash_col = xxhash64(input_table_view, SEED, stream, mr); - auto d_hashs = cudf::column_device_view::create(hash_col->view(), stream); + auto hash_col = xxhash64(input_table_view, SEED, stream, mr); + auto d_hashs = cudf::column_device_view::create(hash_col->view(), stream); // 2. 
generate long columns, the size of each long column is 1 - auto num_long_cols = num_registers_per_sketch / REGISTERS_PER_LONG + 1; - auto const results_iter = - cudf::detail::make_counting_transform_iterator(0, [&](int i) { - return cudf::make_numeric_column( - cudf::data_type{cudf::type_id::INT64}, 1 /**num_groups*/, - cudf::mask_state::ALL_VALID, stream, mr); - }); - auto children = std::vector>( - results_iter, results_iter + num_long_cols); + auto num_long_cols = num_registers_per_sketch / REGISTERS_PER_LONG + 1; + auto const results_iter = cudf::detail::make_counting_transform_iterator(0, [&](int i) { + return cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT64}, + 1 /**num_groups*/, + cudf::mask_state::ALL_VALID, + stream, + mr); + }); + auto children = + std::vector>(results_iter, results_iter + num_long_cols); auto d_results = [&] { - auto host_results_pointer_iter = thrust::make_transform_iterator( - children.begin(), [](auto const &results_column) { - return results_column->mutable_view().template data(); - }); - auto host_results_pointers = std::vector( - host_results_pointer_iter, host_results_pointer_iter + children.size()); - return cudf::detail::make_device_uvector_async(host_results_pointers, - stream, mr); + auto host_results_pointer_iter = + thrust::make_transform_iterator(children.begin(), [](auto const& results_column) { + return results_column->mutable_view().template data(); + }); + auto host_results_pointers = + std::vector(host_results_pointer_iter, host_results_pointer_iter + children.size()); + return cudf::detail::make_device_uvector_async(host_results_pointers, stream, mr); }(); // 2. reduce and generate compacted long values @@ -754,216 +726,203 @@ std::unique_ptr reduce_hllpp(cudf::column_view const &input, // max shared memory is 2^18 * 4 = 1M auto const shared_mem_size = num_registers_per_sketch * sizeof(int32_t); reduce_hllpp_kernel - <<<1, block_size, shared_mem_size, stream.value()>>>(*d_hashs, d_results, - precision); + <<<1, block_size, shared_mem_size, stream.value()>>>(*d_hashs, d_results, precision); // 3. 
create struct scalar auto host_results_view_iter = thrust::make_transform_iterator( - children.begin(), - [](auto const &results_column) { return results_column->view(); }); - auto views = std::vector( - host_results_view_iter, host_results_view_iter + num_long_cols); + children.begin(), [](auto const& results_column) { return results_column->view(); }); + auto views = + std::vector(host_results_view_iter, host_results_view_iter + num_long_cols); auto table_view = cudf::table_view{views}; - auto table = cudf::table(table_view); - return std::make_unique(std::move(table), true, stream, - mr); + auto table = cudf::table(table_view); + return std::make_unique(std::move(table), true, stream, mr); } -CUDF_KERNEL void reduce_merge_hll_kernel_vertically( - cudf::device_span sketch_longs, - cudf::size_type num_sketches, int num_registers_per_sketch, - int *const output) { +CUDF_KERNEL void reduce_merge_hll_kernel_vertically(cudf::device_span sketch_longs, + cudf::size_type num_sketches, + int num_registers_per_sketch, + int* const output) +{ auto const tid = cudf::detail::grid_1d::global_thread_id(); - if (tid >= num_registers_per_sketch) { - return; - } - auto long_idx = tid / REGISTERS_PER_LONG; + if (tid >= num_registers_per_sketch) { return; } + auto long_idx = tid / REGISTERS_PER_LONG; auto reg_idx_in_long = tid % REGISTERS_PER_LONG; - int max = 0; + int max = 0; for (auto row_idx = 0; row_idx < num_sketches; row_idx++) { - int reg_v = - get_register_value(sketch_longs[long_idx][row_idx], reg_idx_in_long); - if (reg_v > max) { - max = reg_v; - } + int reg_v = get_register_value(sketch_longs[long_idx][row_idx], reg_idx_in_long); + if (reg_v > max) { max = reg_v; } } output[tid] = max; } -std::unique_ptr -reduce_merge_hllpp(cudf::column_view const &input, int64_t const precision, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) { +std::unique_ptr reduce_merge_hllpp(cudf::column_view const& input, + int64_t const precision, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ // create device input int64_t num_registers_per_sketch = 1 << precision; - auto num_long_cols = num_registers_per_sketch / REGISTERS_PER_LONG + 1; + auto num_long_cols = num_registers_per_sketch / REGISTERS_PER_LONG + 1; cudf::structs_column_view scv(input); - auto const input_iter = - cudf::detail::make_counting_transform_iterator(0, [&](int i) { - return scv.get_sliced_child(i, stream).begin(); - }); - auto input_cols = - std::vector(input_iter, input_iter + num_long_cols); - auto d_inputs = - cudf::detail::make_device_uvector_async(input_cols, stream, mr); + auto const input_iter = cudf::detail::make_counting_transform_iterator( + 0, [&](int i) { return scv.get_sliced_child(i, stream).begin(); }); + auto input_cols = std::vector(input_iter, input_iter + num_long_cols); + auto d_inputs = cudf::detail::make_device_uvector_async(input_cols, stream, mr); // create one row output - auto const results_iter = - cudf::detail::make_counting_transform_iterator(0, [&](int i) { - return cudf::make_numeric_column( - cudf::data_type{cudf::type_id::INT64}, 1 /** num_rows */, - cudf::mask_state::ALL_VALID, stream, mr); - }); - auto children = std::vector>( - results_iter, results_iter + num_long_cols); + auto const results_iter = cudf::detail::make_counting_transform_iterator(0, [&](int i) { + return cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT64}, + 1 /** num_rows */, + cudf::mask_state::ALL_VALID, + stream, + mr); + }); + auto children = + std::vector>(results_iter, results_iter 
+ num_long_cols); auto d_results = [&] { - auto host_results_pointer_iter = thrust::make_transform_iterator( - children.begin(), [](auto const &results_column) { - return results_column->mutable_view().template data(); - }); - auto host_results_pointers = std::vector( - host_results_pointer_iter, host_results_pointer_iter + children.size()); - return cudf::detail::make_device_uvector_async(host_results_pointers, - stream, mr); + auto host_results_pointer_iter = + thrust::make_transform_iterator(children.begin(), [](auto const& results_column) { + return results_column->mutable_view().template data(); + }); + auto host_results_pointers = + std::vector(host_results_pointer_iter, host_results_pointer_iter + children.size()); + return cudf::detail::make_device_uvector_async(host_results_pointers, stream, mr); }(); // execute merge kernel - auto num_threads = num_registers_per_sketch; + auto num_threads = num_registers_per_sketch; constexpr int64_t block_size = 256; - auto num_blocks = cudf::util::div_rounding_up_safe(num_threads, block_size); - auto output_cache = - rmm::device_uvector(num_registers_per_sketch, stream, mr); - reduce_merge_hll_kernel_vertically<<>>( - d_inputs, input.size(), num_registers_per_sketch, output_cache.begin()); + auto num_blocks = cudf::util::div_rounding_up_safe(num_threads, block_size); + auto output_cache = rmm::device_uvector(num_registers_per_sketch, stream, mr); + reduce_merge_hll_kernel_vertically<<>>( + d_inputs, input.size(), num_registers_per_sketch, output_cache.begin()); // compact to longs auto const num_compact_threads = num_long_cols; - auto const num_compact_blocks = - cudf::util::div_rounding_up_safe(num_compact_threads, block_size); + auto const num_compact_blocks = cudf::util::div_rounding_up_safe(num_compact_threads, block_size); compact_kernel<<>>( - 1 /** num_groups **/, num_registers_per_sketch, d_results, output_cache); + 1 /** num_groups **/, num_registers_per_sketch, d_results, output_cache); // create scalar auto host_results_view_iter = thrust::make_transform_iterator( - children.begin(), - [](auto const &results_column) { return results_column->view(); }); - auto views = std::vector( - host_results_view_iter, host_results_view_iter + num_long_cols); + children.begin(), [](auto const& results_column) { return results_column->view(); }); + auto views = + std::vector(host_results_view_iter, host_results_view_iter + num_long_cols); auto table_view = cudf::table_view{views}; - auto table = cudf::table(table_view); - return std::make_unique(std::move(table), true, stream, - mr); + auto table = cudf::table(table_view); + return std::make_unique(std::move(table), true, stream, mr); } struct estimate_fn { - cudf::device_span sketch_longs; + cudf::device_span sketch_longs; int const precision; - int64_t *const out; + int64_t* const out; - __device__ void operator()(cudf::size_type const idx) const { + __device__ void operator()(cudf::size_type const idx) const + { auto const num_regs = 1ull << precision; - double sum = 0; - int zeroes = 0; + double sum = 0; + int zeroes = 0; for (auto reg_idx = 0; reg_idx < num_regs; ++reg_idx) { // each long contains 10 register values - int long_col_idx = reg_idx / REGISTERS_PER_LONG; + int long_col_idx = reg_idx / REGISTERS_PER_LONG; int reg_idx_in_long = reg_idx % REGISTERS_PER_LONG; - int reg = - get_register_value(sketch_longs[long_col_idx][idx], reg_idx_in_long); + int reg = get_register_value(sketch_longs[long_col_idx][idx], reg_idx_in_long); sum += double{1} / static_cast(1ull << reg); zeroes += reg == 0; 
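+      // sum accumulates 2^-reg (the harmonic-mean term) and zeroes counts
+      // empty registers; both feed the HLL++ finalizer below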
} auto const finalize = cuco::hyperloglog_ns::detail::finalizer(precision); - out[idx] = finalize(sum, zeroes); + out[idx] = finalize(sum, zeroes); } }; -} // end anonymous namespace +} // end anonymous namespace std::unique_ptr group_hyper_log_log_plus_plus( - cudf::column_view const &input, int64_t const num_groups, - cudf::device_span group_lables, - int64_t const precision, rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) { + cudf::column_view const& input, + int64_t const num_groups, + cudf::device_span group_lables, + int64_t const precision, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision >= 4."); auto adjust_precision = precision > MAX_PRECISION ? MAX_PRECISION : precision; - return group_hllpp(input, num_groups, group_lables, adjust_precision, stream, - mr); + return group_hllpp(input, num_groups, group_lables, adjust_precision, stream, mr); } std::unique_ptr group_merge_hyper_log_log_plus_plus( - cudf::column_view const &input, int64_t const num_groups, - cudf::device_span group_lables, - int64_t const precision, rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) { + cudf::column_view const& input, + int64_t const num_groups, + cudf::device_span group_lables, + int64_t const precision, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision >= 4."); - CUDF_EXPECTS( - input.type().id() == cudf::type_id::STRUCT, - "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns."); + CUDF_EXPECTS(input.type().id() == cudf::type_id::STRUCT, + "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns."); for (auto i = 0; i < input.num_children(); i++) { - CUDF_EXPECTS( - input.child(i).type().id() == cudf::type_id::INT64, - "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns."); + CUDF_EXPECTS(input.child(i).type().id() == cudf::type_id::INT64, + "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns."); } - auto adjust_precision = precision > MAX_PRECISION ? MAX_PRECISION : precision; + auto adjust_precision = precision > MAX_PRECISION ? MAX_PRECISION : precision; auto expected_num_longs = (1 << adjust_precision) / REGISTERS_PER_LONG + 1; CUDF_EXPECTS(input.num_children() == expected_num_longs, "The num of long columns in input is incorrect."); - return group_merge_hllpp(input, num_groups, group_lables, adjust_precision, - stream, mr); + return group_merge_hllpp(input, num_groups, group_lables, adjust_precision, stream, mr); } -std::unique_ptr reduce_hyper_log_log_plus_plus( - cudf::column_view const &input, int64_t const precision, - rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { +std::unique_ptr reduce_hyper_log_log_plus_plus(cudf::column_view const& input, + int64_t const precision, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision >= 4."); auto adjust_precision = precision > MAX_PRECISION ? 
 
 std::unique_ptr<cudf::scalar> reduce_merge_hyper_log_log_plus_plus(
-    cudf::column_view const &input, int64_t const precision,
-    rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) {
+  cudf::column_view const& input,
+  int64_t const precision,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr)
+{
   CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision >= 4.");
-  CUDF_EXPECTS(
-      input.type().id() == cudf::type_id::STRUCT,
-      "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns.");
+  CUDF_EXPECTS(input.type().id() == cudf::type_id::STRUCT,
+               "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns.");
   for (auto i = 0; i < input.num_children(); i++) {
-    CUDF_EXPECTS(
-        input.child(i).type().id() == cudf::type_id::INT64,
-        "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns.");
+    CUDF_EXPECTS(input.child(i).type().id() == cudf::type_id::INT64,
+                 "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns.");
   }
-  auto adjust_precision = precision > MAX_PRECISION ? MAX_PRECISION : precision;
+  auto adjust_precision   = precision > MAX_PRECISION ? MAX_PRECISION : precision;
   auto expected_num_longs = (1 << adjust_precision) / REGISTERS_PER_LONG + 1;
   CUDF_EXPECTS(input.num_children() == expected_num_longs,
                "The num of long columns in input is incorrect.");
   return reduce_merge_hllpp(input, adjust_precision, stream, mr);
 }
 
-std::unique_ptr<cudf::column>
-estimate_from_hll_sketches(cudf::column_view const &input, int precision,
-                           rmm::cuda_stream_view stream,
-                           rmm::device_async_resource_ref mr) {
-  CUDF_EXPECTS(precision >= 4,
-               "HyperLogLogPlusPlus requires precision is bigger than 4.");
+std::unique_ptr<cudf::column> estimate_from_hll_sketches(cudf::column_view const& input,
+                                                         int precision,
+                                                         rmm::cuda_stream_view stream,
+                                                         rmm::device_async_resource_ref mr)
+{
+  CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision is bigger than 4.");
   auto const input_iter = cudf::detail::make_counting_transform_iterator(
-      0, [&](int i) { return input.child(i).begin<int64_t>(); });
-  auto input_cols = std::vector<int64_t const *>(
-      input_iter, input_iter + input.num_children());
-  auto d_inputs =
-      cudf::detail::make_device_uvector_async(input_cols, stream, mr);
-  auto result = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT64},
-                                          input.size(),
-                                          cudf::mask_state::ALL_VALID, stream);
+    0, [&](int i) { return input.child(i).begin<int64_t>(); });
+  auto input_cols = std::vector<int64_t const*>(input_iter, input_iter + input.num_children());
+  auto d_inputs   = cudf::detail::make_device_uvector_async(input_cols, stream, mr);
+  auto result     = cudf::make_numeric_column(
+    cudf::data_type{cudf::type_id::INT64}, input.size(), cudf::mask_state::ALL_VALID, stream);
   // evaluate from struct
-  thrust::for_each_n(
-      rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0),
-      input.size(),
-      estimate_fn{d_inputs, precision, result->mutable_view().data<int64_t>()});
+  thrust::for_each_n(rmm::exec_policy_nosync(stream),
+                     thrust::make_counting_iterator(0),
+                     input.size(),
+                     estimate_fn{d_inputs, precision, result->mutable_view().data<int64_t>()});
   return result;
 }
 
-} // namespace spark_rapids_jni
+}  // namespace spark_rapids_jni
diff --git a/src/main/cpp/src/hllpp.hpp b/src/main/cpp/src/hllpp.hpp
index 4dda342a4f..d93e1debdf 100644
--- a/src/main/cpp/src/hllpp.hpp
+++ b/src/main/cpp/src/hllpp.hpp
@@ -52,10 +52,12 @@ constexpr int MAX_PRECISION = 18;
  * a struct column with multiple long columns which is consistent with Spark.
  */
 std::unique_ptr<cudf::column> group_hyper_log_log_plus_plus(
-    cudf::column_view const &input, int64_t const num_groups,
-    cudf::device_span<cudf::size_type const> group_lables,
-    int64_t const precision, rmm::cuda_stream_view stream,
-    rmm::device_async_resource_ref mr);
+  cudf::column_view const& input,
+  int64_t const num_groups,
+  cudf::device_span<cudf::size_type const> group_lables,
+  int64_t const precision,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
 
 /**
  * Merge HyperLogLogPlusPlus(HLLPP) sketches in the same group.
@@ -63,19 +65,22 @@ std::unique_ptr<cudf::column> group_hyper_log_log_plus_plus(
  * Spark.
  */
 std::unique_ptr<cudf::column> group_merge_hyper_log_log_plus_plus(
-    cudf::column_view const &input, int64_t const num_groups,
-    cudf::device_span<cudf::size_type const> group_lables,
-    int64_t const precision, rmm::cuda_stream_view stream,
-    rmm::device_async_resource_ref mr);
+  cudf::column_view const& input,
+  int64_t const num_groups,
+  cudf::device_span<cudf::size_type const> group_lables,
+  int64_t const precision,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
 
 /**
  * Compute hash codes for the input, generate HyperLogLogPlusPlus(HLLPP)
 * sketches from hash codes, and merge all the sketches into one sketch, output
 * is a struct scalar with multiple long values.
 */
-std::unique_ptr<cudf::scalar> reduce_hyper_log_log_plus_plus(
-    cudf::column_view const &input, int64_t const precision,
-    rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr);
+std::unique_ptr<cudf::scalar> reduce_hyper_log_log_plus_plus(cudf::column_view const& input,
+                                                             int64_t const precision,
+                                                             rmm::cuda_stream_view stream,
+                                                             rmm::device_async_resource_ref mr);
 
 /**
  * Merge all HyperLogLogPlusPlus(HLLPP) sketches in the input column into one
@@ -83,8 +88,10 @@ std::unique_ptr<cudf::scalar> reduce_hyper_log_log_plus_plus(
 * consistent with Spark. Output is a struct scalar with multiple long values.
 */
 std::unique_ptr<cudf::scalar> reduce_merge_hyper_log_log_plus_plus(
-    cudf::column_view const &input, int64_t const precision,
-    rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr);
+  cudf::column_view const& input,
+  int64_t const precision,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
 
 /**
  * Estimate count distinct values for the input which contains
@@ -93,8 +100,9 @@ std::unique_ptr<cudf::scalar> reduce_merge_hyper_log_log_plus_plus(
 * for null values when doing APPROX_COUNT_DISTINCT.
 */
 std::unique_ptr<cudf::column> estimate_from_hll_sketches(
-    cudf::column_view const &input, int precision,
-    rmm::cuda_stream_view stream = cudf::get_default_stream(),
-    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  cudf::column_view const& input,
+  int precision,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
-} // namespace spark_rapids_jni
+}  // namespace spark_rapids_jni
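(For orientation, a hypothetical caller of the estimate API declared above; the `sketches` variable is illustrative, not something this series defines. With m = 2^precision registers the standard error of an HLL estimate is about 1.04 / sqrt(m), so precision 9 gives roughly 4.6%:)

  // sketches: a STRUCT<INT64, INT64, ...> column holding one HLLPP sketch per row,
  // as produced by the reduce/group-by paths (hypothetical source)
  cudf::column_view sketches = /* obtained elsewhere */;
  auto counts = spark_rapids_jni::estimate_from_hll_sketches(sketches, /*precision=*/9);
  // counts: an INT64 column, one approximate distinct count per input row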
diff --git a/src/main/cpp/src/hllpp_host_udf.cu b/src/main/cpp/src/hllpp_host_udf.cu
index c9ad271876..370b906b65 100644
--- a/src/main/cpp/src/hllpp_host_udf.cu
+++ b/src/main/cpp/src/hllpp_host_udf.cu
@@ -40,14 +40,15 @@ namespace spark_rapids_jni {
 
 namespace {
 
-template <typename cudf_aggregation> struct hllpp_udf : cudf::host_udf_base {
+template <typename cudf_aggregation>
+struct hllpp_udf : cudf::host_udf_base {
   static_assert(std::is_same_v<cudf_aggregation, cudf::reduce_aggregation> ||
                 std::is_same_v<cudf_aggregation, cudf::groupby_aggregation>);
 
-  hllpp_udf(int precision_, bool is_merge_)
-      : precision(precision_), is_merge(is_merge_) {}
+  hllpp_udf(int precision_, bool is_merge_) : precision(precision_), is_merge(is_merge_) {}
 
-  [[nodiscard]] input_data_attributes get_required_data() const override {
+  [[nodiscard]] input_data_attributes get_required_data() const override
+  {
     if constexpr (std::is_same_v<cudf_aggregation, cudf::reduce_aggregation>) {
       return {reduction_data_attribute::INPUT_VALUES};
     } else {
@@ -57,45 +58,40 @@ namespace {
     }
   }
 
-  [[nodiscard]] output_type
-  operator()(host_udf_input const &udf_input, rmm::cuda_stream_view stream,
-             rmm::device_async_resource_ref mr) const override {
+  [[nodiscard]] output_type operator()(host_udf_input const& udf_input,
+                                       rmm::cuda_stream_view stream,
+                                       rmm::device_async_resource_ref mr) const override
+  {
     if constexpr (std::is_same_v<cudf_aggregation, cudf::reduce_aggregation>) {
       // reduce
-      auto const &input_values = std::get<cudf::column_view>(
-          udf_input.at(reduction_data_attribute::INPUT_VALUES));
-      if (input_values.size() == 0) {
-        return get_empty_output(std::nullopt, stream, mr);
-      }
+      auto const& input_values =
+        std::get<cudf::column_view>(udf_input.at(reduction_data_attribute::INPUT_VALUES));
+      if (input_values.size() == 0) { return get_empty_output(std::nullopt, stream, mr); }
       if (is_merge) {
         // reduce intermediate result, input_values are struct of long columns
         return spark_rapids_jni::reduce_merge_hyper_log_log_plus_plus(
-            input_values, precision, stream, mr);
+          input_values, precision, stream, mr);
       } else {
         return spark_rapids_jni::reduce_hyper_log_log_plus_plus(
-            input_values, precision, stream, mr);
+          input_values, precision, stream, mr);
       }
     } else {
       // groupby
-      auto const &group_values = std::get<cudf::column_view>(
-          udf_input.at(groupby_data_attribute::GROUPED_VALUES));
-      if (group_values.size() == 0) {
-        return get_empty_output(std::nullopt, stream, mr);
-      }
-      auto const group_offsets =
-          std::get<cudf::device_span<cudf::size_type const>>(
-              udf_input.at(groupby_data_attribute::GROUP_OFFSETS));
-      int num_groups = group_offsets.size() - 1;
-      auto const group_lables =
-          std::get<cudf::device_span<cudf::size_type const>>(
-              udf_input.at(groupby_data_attribute::GROUP_LABELS));
+      auto const& group_values =
+        std::get<cudf::column_view>(udf_input.at(groupby_data_attribute::GROUPED_VALUES));
+      if (group_values.size() == 0) { return get_empty_output(std::nullopt, stream, mr); }
+      auto const group_offsets = std::get<cudf::device_span<cudf::size_type const>>(
+        udf_input.at(groupby_data_attribute::GROUP_OFFSETS));
+      int num_groups          = group_offsets.size() - 1;
+      auto const group_lables = std::get<cudf::device_span<cudf::size_type const>>(
+        udf_input.at(groupby_data_attribute::GROUP_LABELS));
       if (is_merge) {
         // group by intermediate result, group_values are struct of long columns
         return spark_rapids_jni::group_merge_hyper_log_log_plus_plus(
-            group_values, num_groups, group_lables, precision, stream, mr);
+          group_values, num_groups, group_lables, precision, stream, mr);
       } else {
         return spark_rapids_jni::group_hyper_log_log_plus_plus(
-            group_values, num_groups, group_lables, precision, stream, mr);
+          group_values, num_groups, group_lables, precision, stream, mr);
       }
     }
   }
@@ -103,50 +99,50 @@ namespace {
   /**
    * @brief create an empty struct scalar
    */
-  [[nodiscard]] output_type
-  get_empty_output([[maybe_unused]] std::optional<cudf::data_type> output_dtype,
-                   rmm::cuda_stream_view stream,
-                   rmm::device_async_resource_ref mr) const override {
-    int num_registers = 1 << precision;
-    int num_long_cols = num_registers / REGISTERS_PER_LONG + 1;
-    auto const results_iter =
-        cudf::detail::make_counting_transform_iterator(0, [&](int i) {
-          return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT64});
-        });
-    auto children = std::vector<std::unique_ptr<cudf::column>>(
-        results_iter, results_iter + num_long_cols);
+  [[nodiscard]] output_type get_empty_output(
+    [[maybe_unused]] std::optional<cudf::data_type> output_dtype,
+    rmm::cuda_stream_view stream,
+    rmm::device_async_resource_ref mr) const override
+  {
+    int num_registers       = 1 << precision;
+    int num_long_cols       = num_registers / REGISTERS_PER_LONG + 1;
+    auto const results_iter = cudf::detail::make_counting_transform_iterator(
+      0, [&](int i) { return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT64}); });
+    auto children =
+      std::vector<std::unique_ptr<cudf::column>>(results_iter, results_iter + num_long_cols);
 
     if constexpr (std::is_same_v<cudf_aggregation, cudf::reduce_aggregation>) {
       // reduce
       auto host_results_view_iter = thrust::make_transform_iterator(
-          children.begin(),
-          [](auto const &results_column) { return results_column->view(); });
-      auto views = std::vector<cudf::column_view>(
-          host_results_view_iter, host_results_view_iter + num_long_cols);
+        children.begin(), [](auto const& results_column) { return results_column->view(); });
+      auto views = std::vector<cudf::column_view>(host_results_view_iter,
+                                                  host_results_view_iter + num_long_cols);
       auto table_view = cudf::table_view{views};
-      auto table = cudf::table(table_view);
-      return std::make_unique<cudf::struct_scalar>(std::move(table), true,
-                                                   stream, mr);
+      auto table      = cudf::table(table_view);
+      return std::make_unique<cudf::struct_scalar>(std::move(table), true, stream, mr);
     } else {
       // groupby
-      return cudf::make_structs_column(0, std::move(children),
-                                       0,                    // null count
-                                       rmm::device_buffer{}, // null mask
+      return cudf::make_structs_column(0,
+                                       std::move(children),
+                                       0,                     // null count
+                                       rmm::device_buffer{},  // null mask
                                        stream);
     }
   }
 
-  [[nodiscard]] bool is_equal(host_udf_base const &other) const override {
-    auto o = dynamic_cast<hllpp_udf const *>(&other);
+  [[nodiscard]] bool is_equal(host_udf_base const& other) const override
+  {
+    auto o = dynamic_cast<hllpp_udf const*>(&other);
     return o != nullptr && o->precision == this->precision;
   }
 
-  [[nodiscard]] std::size_t do_hash() const override {
-    return 31 * (31 * std::hash<std::string>{}({"hllpp_udf"}) + precision) +
-           is_merge;
+  [[nodiscard]] std::size_t do_hash() const override
+  {
+    return 31 * (31 * std::hash<std::string>{}({"hllpp_udf"}) + precision) + is_merge;
   }
 
-  [[nodiscard]] std::unique_ptr<host_udf_base> clone() const override {
+  [[nodiscard]] std::unique_ptr<host_udf_base> clone() const override
+  {
     return std::make_unique<hllpp_udf>(precision, is_merge);
   }
 
@@ -154,30 +150,26 @@ namespace {
   int precision;
   bool is_merge;
 };
 
-} // namespace
+}  // namespace
 
-std::unique_ptr<cudf::host_udf_base>
-create_hllpp_reduction_host_udf(int precision) {
-  return std::make_unique<hllpp_udf<cudf::reduce_aggregation>>(
-      precision, /*is_merge*/ false);
+std::unique_ptr<cudf::host_udf_base> create_hllpp_reduction_host_udf(int precision)
+{
+  return std::make_unique<hllpp_udf<cudf::reduce_aggregation>>(precision, /*is_merge*/ false);
 }
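(A hypothetical wiring of the factory above into a libcudf aggregation, shown as an editorial sketch; `input_column` is illustrative, and cudf::make_host_udf_aggregation is assumed to be the host-UDF entry point in the cudf version this series targets:)

  auto udf = spark_rapids_jni::create_hllpp_reduction_host_udf(/*precision=*/9);
  auto agg = cudf::make_host_udf_aggregation<cudf::reduce_aggregation>(std::move(udf));
  // produces one STRUCT<INT64, ...> sketch scalar for the whole column
  auto sketch = cudf::reduce(input_column, *agg, cudf::data_type{cudf::type_id::STRUCT});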
 
-std::unique_ptr<cudf::host_udf_base>
-create_hllpp_reduction_merge_host_udf(int precision) {
-  return std::make_unique<hllpp_udf<cudf::reduce_aggregation>>(
-      precision, /*is_merge*/ true);
+std::unique_ptr<cudf::host_udf_base> create_hllpp_reduction_merge_host_udf(int precision)
+{
+  return std::make_unique<hllpp_udf<cudf::reduce_aggregation>>(precision, /*is_merge*/ true);
 }
 
-std::unique_ptr<cudf::host_udf_base>
-create_hllpp_groupby_host_udf(int precision) {
-  return std::make_unique<hllpp_udf<cudf::groupby_aggregation>>(
-      precision, /*is_merge*/ false);
+std::unique_ptr<cudf::host_udf_base> create_hllpp_groupby_host_udf(int precision)
+{
+  return std::make_unique<hllpp_udf<cudf::groupby_aggregation>>(precision, /*is_merge*/ false);
 }
 
-std::unique_ptr<cudf::host_udf_base>
-create_hllpp_groupby_merge_host_udf(int precision) {
-  return std::make_unique<hllpp_udf<cudf::groupby_aggregation>>(
-      precision, /*is_merge*/ true);
+std::unique_ptr<cudf::host_udf_base> create_hllpp_groupby_merge_host_udf(int precision)
+{
+  return std::make_unique<hllpp_udf<cudf::groupby_aggregation>>(precision, /*is_merge*/ true);
 }
 
-} // namespace spark_rapids_jni
+}  // namespace spark_rapids_jni
diff --git a/src/main/cpp/src/hllpp_host_udf.hpp b/src/main/cpp/src/hllpp_host_udf.hpp
index fc4bb8b21b..e89cdf4e5f 100644
--- a/src/main/cpp/src/hllpp_host_udf.hpp
+++ b/src/main/cpp/src/hllpp_host_udf.hpp
@@ -20,16 +20,12 @@
 
 namespace spark_rapids_jni {
 
-std::unique_ptr<cudf::host_udf_base>
-create_hllpp_reduction_host_udf(int precision);
+std::unique_ptr<cudf::host_udf_base> create_hllpp_reduction_host_udf(int precision);
 
-std::unique_ptr<cudf::host_udf_base>
-create_hllpp_reduction_merge_host_udf(int precision);
+std::unique_ptr<cudf::host_udf_base> create_hllpp_reduction_merge_host_udf(int precision);
 
-std::unique_ptr<cudf::host_udf_base>
-create_hllpp_groupby_host_udf(int precision);
+std::unique_ptr<cudf::host_udf_base> create_hllpp_groupby_host_udf(int precision);
 
-std::unique_ptr<cudf::host_udf_base>
-create_hllpp_groupby_merge_host_udf(int precision);
+std::unique_ptr<cudf::host_udf_base> create_hllpp_groupby_merge_host_udf(int precision);
 
-} // namespace spark_rapids_jni
+}  // namespace spark_rapids_jni
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/HLLPP.java b/src/main/java/com/nvidia/spark/rapids/jni/HLLPP.java
deleted file mode 100644
index 9e51761f4a..0000000000
--- a/src/main/java/com/nvidia/spark/rapids/jni/HLLPP.java
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.nvidia.spark.rapids.jni;
-
-import ai.rapids.cudf.ColumnVector;
-import ai.rapids.cudf.ColumnView;
-import ai.rapids.cudf.NativeDepsLoader;
-
-/**
- * HyperLogLogPlusPlus
- */
-public class HLLPP {
-  static {
-    NativeDepsLoader.loadNativeDeps();
-  }
-
-  /**
-   * Compute the approximate count distinct value from sketch values.
-   * <p>
-   * The input sketch values must be given in the format `Struct<INT64, INT64, ...>`,
-   * The num of children is: num_registers_per_sketch / 10 + 1, here 10 means a INT64 contains
-   * max 10 registers. Register value is 6 bits. The input is columnar data, e.g.: sketch 0
-   * is composed of by all the data of the children at index 0.
-   *
-   * @param input     The sketch column which constains Struct<INT64, INT64, ...> values.
-   * @param precision The num of bits for addressing.
-   * @return A INT64 column with each value indicates the approximate count distinct value.
-   */
-  public static ColumnVector estimateDistinctValueFromSketches(ColumnView input, int precision) {
-    return new ColumnVector(estimateDistinctValueFromSketches(input.getNativeView(), precision));
-  }
-
-  private static native long estimateDistinctValueFromSketches(long inputHandle, int precision);
-}
diff --git a/src/test/java/com/nvidia/spark/rapids/jni/HLLPPTest.java b/src/test/java/com/nvidia/spark/rapids/jni/HLLPPTest.java
deleted file mode 100644
index c14b565313..0000000000
--- a/src/test/java/com/nvidia/spark/rapids/jni/HLLPPTest.java
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
-* Copyright (c) 2024, NVIDIA CORPORATION.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-package com.nvidia.spark.rapids.jni;
-
-import ai.rapids.cudf.GroupByAggregation;
-import ai.rapids.cudf.Table;
-
-import org.junit.jupiter.api.Test;
-
-
-public class HLLPPTest {
-
-  @Test
-  void testGroupByHLL() {
-    // A trivial test:
-    try (Table input = new Table.TestBuilder().column(1, 2, 3, 1, 2, 2, 1, 3, 3, 2)
-        .column(0, 1, -2, 3, -4, -5, -6, 7, -8, 9)
-        .build()){
-      input.groupBy(0).aggregate(GroupByAggregation.HLLPP(0)
-          .onColumn(1));
-    }
-  }
-}

From 208d67e5e71696ad8f619aa82c78838acc7292c5 Mon Sep 17 00:00:00 2001
From: Chong Gao
Date: Tue, 17 Dec 2024 21:07:24 +0800
Subject: [PATCH 08/12] Use UDF

---
 src/main/cpp/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt
index b8b5f3a139..44863aa220 100644
--- a/src/main/cpp/CMakeLists.txt
+++ b/src/main/cpp/CMakeLists.txt
@@ -197,7 +197,6 @@ add_library(
   src/HashJni.cpp
   src/HistogramJni.cpp
   src/HostTableJni.cpp
-  src/HLLPPJni.cpp
   src/JSONUtilsJni.cpp
   src/NativeParquetJni.cpp
   src/ParseURIJni.cpp

From e29d5a12b97af8460d2a5bcf1fbeb67917e8a1ee Mon Sep 17 00:00:00 2001
From: Chong Gao
Date: Wed, 18 Dec 2024 17:17:15 +0800
Subject: [PATCH 09/12] Address comments

---
 src/main/cpp/CMakeLists.txt                   |  8 +--
 ....cpp => HyperLogLogPlusPlusHostUDFJni.cpp} | 19 +++----
 .../{hllpp.cu => hyper_log_log_plus_plus.cu}  | 49 +++++++++++--------
 ...{hllpp.hpp => hyper_log_log_plus_plus.hpp} | 26 +++++-----
 ...cu => hyper_log_log_plus_plus_host_udf.cu} |  8 +--
 ...p => hyper_log_log_plus_plus_host_udf.hpp} |  0
 ...F.java => HyperLogLogPlusPlusHostUDF.java} |  2 +-
 7 files changed, 61 insertions(+), 51 deletions(-)
 rename src/main/cpp/src/{HLLPPHostUDFJni.cpp => HyperLogLogPlusPlusHostUDFJni.cpp} (79%)
 rename src/main/cpp/src/{hllpp.cu => hyper_log_log_plus_plus.cu} (96%)
 rename src/main/cpp/src/{hllpp.hpp => hyper_log_log_plus_plus.hpp} (80%)
 rename src/main/cpp/src/{hllpp_host_udf.cu => hyper_log_log_plus_plus_host_udf.cu} (97%)
 rename src/main/cpp/src/{hllpp_host_udf.hpp => hyper_log_log_plus_plus_host_udf.hpp} (100%)
 rename src/main/java/com/nvidia/spark/rapids/jni/{HLLPPHostUDF.java => HyperLogLogPlusPlusHostUDF.java} (98%)

diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt
index 44863aa220..70c9cd2a59 100644
--- a/src/main/cpp/CMakeLists.txt
+++ b/src/main/cpp/CMakeLists.txt
@@ -193,10 +193,10 @@ add_library(
   src/DateTimeRebaseJni.cpp
   src/DecimalUtilsJni.cpp
   src/GpuTimeZoneDBJni.cpp
-  src/HLLPPHostUDFJni.cpp
   src/HashJni.cpp
   src/HistogramJni.cpp
   src/HostTableJni.cpp
+  src/HyperLogLogPlusPlusHostUDFJni.cpp
   src/JSONUtilsJni.cpp
   src/NativeParquetJni.cpp
   src/ParseURIJni.cpp
@@ -218,8 +218,9 @@ add_library(
   src/from_json_to_structs.cu
   src/get_json_object.cu
   src/histogram.cu
-  src/hllpp_host_udf.cu
-  src/hllpp.cu
+  src/hive_hash.cu
+  src/hyper_log_log_plus_plus.cu
+  src/hyper_log_log_plus_plus_host_udf.cu
   src/json_utils.cu
   src/murmur_hash.cu
   src/parse_uri.cu
@@ -229,7 +230,6 @@ add_library(
   src/timezones.cu
   src/utilities.cu
   src/xxhash64.cu
-  src/hive_hash.cu
   src/zorder.cu
 )
 
diff --git a/src/main/cpp/src/HLLPPHostUDFJni.cpp b/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp
similarity index 79%
rename from src/main/cpp/src/HLLPPHostUDFJni.cpp
rename to src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp
index a80a78c6b8..adf5da52f7 100644
--- a/src/main/cpp/src/HLLPPHostUDFJni.cpp
+++ b/src/main/cpp/src/HyperLogLogPlusPlusHostUDFJni.cpp
@@ -15,19 +15,22 @@
  */
 
 #include "cudf_jni_apis.hpp"
-#include "hllpp.hpp"
-#include "hllpp_host_udf.hpp"
+#include "hyper_log_log_plus_plus.hpp"
+#include "hyper_log_log_plus_plus_host_udf.hpp"
 
 extern "C" {
 
-JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_HLLPPHostUDF_createHLLPPHostUDF(
-    JNIEnv* env, jclass, jint agg_type, int precision)
+JNIEXPORT jlong JNICALL
+Java_com_nvidia_spark_rapids_jni_HyperLogLogPlusPlusHostUDF_createHLLPPHostUDF(JNIEnv* env,
+                                                                               jclass,
+                                                                               jint agg_type,
+                                                                               int precision)
 {
   try {
     cudf::jni::auto_set_device(env);
     auto udf_ptr = [&] {
       // The value of agg_type must be sync with
-      // `HLLPPHostUDF.java#AggregationType`.
+      // `HyperLogLogPlusPlusHostUDF.java#AggregationType`.
       switch (agg_type) {
         case 0: return spark_rapids_jni::create_hllpp_reduction_host_udf(precision);
        case 1: return spark_rapids_jni::create_hllpp_reduction_merge_host_udf(precision);
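(For reference, an editorial sketch of the jint contract this switch implements; values 0 and 1 appear in the hunk above, while 2 and 3 are inferred from the groupby factory functions and are an assumption here:)

  enum class AggregationType : int {
    Reduction      = 0,  // create_hllpp_reduction_host_udf
    ReductionMerge = 1,  // create_hllpp_reduction_merge_host_udf
    GroupBy        = 2,  // assumed: create_hllpp_groupby_host_udf
    GroupByMerge   = 3   // assumed: create_hllpp_groupby_merge_host_udf
  };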
@@ -43,10 +46,8 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_HLLPPHostUDF_createHLLP
 }
 
 JNIEXPORT jlong JNICALL
-Java_com_nvidia_spark_rapids_jni_HLLPPHostUDF_estimateDistinctValueFromSketches(JNIEnv* env,
-                                                                                jclass,
-                                                                                jlong sketches,
-                                                                                jint precision)
+Java_com_nvidia_spark_rapids_jni_HyperLogLogPlusPlusHostUDF_estimateDistinctValueFromSketches(
+  JNIEnv* env, jclass, jlong sketches, jint precision)
 {
   JNI_NULL_CHECK(env, sketches, "Sketch column is null", 0);
   try {
diff --git a/src/main/cpp/src/hllpp.cu b/src/main/cpp/src/hyper_log_log_plus_plus.cu
similarity index 96%
rename from src/main/cpp/src/hllpp.cu
rename to src/main/cpp/src/hyper_log_log_plus_plus.cu
index 8d39c66865..4ff7850558 100644
--- a/src/main/cpp/src/hllpp.cu
+++ b/src/main/cpp/src/hyper_log_log_plus_plus.cu
@@ -55,8 +55,9 @@ namespace {
  */
 __device__ inline int get_register_value(int64_t const ten_registers, int reg_idx)
 {
-  int64_t shift_mask = MASK << (REGISTER_VALUE_BITS * reg_idx);
-  int64_t v          = (ten_registers & shift_mask) >> (REGISTER_VALUE_BITS * reg_idx);
+  auto const shift_bits = REGISTER_VALUE_BITS * reg_idx;
+  auto const shift_mask = MASK << shift_bits;
+  auto const v          = (ten_registers & shift_mask) >> shift_bit;
   return static_cast<int>(v);
 }
 
@@ -418,7 +419,7 @@ std::unique_ptr<cudf::column> group_hllpp(cudf::column_view const& input,
   auto num_long_cols      = num_registers_per_sketch / REGISTERS_PER_LONG + 1;
   auto const results_iter = cudf::detail::make_counting_transform_iterator(0, [&](int i) {
     return cudf::make_numeric_column(
-      cudf::data_type{cudf::type_id::INT64}, num_groups, cudf::mask_state::ALL_VALID, stream, mr);
+      cudf::data_type{cudf::type_id::INT64}, num_groups, cudf::mask_state::UNALLOCATED, stream, mr);
   });
   auto children =
     std::vector<std::unique_ptr<cudf::column>>(results_iter, results_iter + num_long_cols);
@@ -609,7 +610,7 @@ std::unique_ptr<cudf::column> group_merge_hllpp(
   // create output columns
   auto const results_iter = cudf::detail::make_counting_transform_iterator(0, [&](int i) {
     return cudf::make_numeric_column(
-      cudf::data_type{cudf::type_id::INT64}, num_groups, cudf::mask_state::ALL_VALID, stream, mr);
+      cudf::data_type{cudf::type_id::INT64}, num_groups, cudf::mask_state::UNALLOCATED, stream, mr);
   });
   auto results =
     std::vector<std::unique_ptr<cudf::column>>(results_iter, results_iter + num_long_cols);
@@ -705,7 +706,7 @@ std::unique_ptr<cudf::scalar> reduce_hllpp(cudf::column_view const& input,
   auto const results_iter = cudf::detail::make_counting_transform_iterator(0, [&](int i) {
     return cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT64},
                                      1 /**num_groups*/,
-                                     cudf::mask_state::ALL_VALID,
+                                     cudf::mask_state::UNALLOCATED,
                                      stream,
                                      mr);
   });
@@ -773,7 +774,7 @@ std::unique_ptr<cudf::scalar> reduce_merge_hllpp(cudf::column_view const& input,
   auto const results_iter = cudf::detail::make_counting_transform_iterator(0, [&](int i) {
     return cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT64},
                                      1 /** num_rows */,
-                                     cudf::mask_state::ALL_VALID,
+                                     cudf::mask_state::UNALLOCATED,
                                      stream,
                                      mr);
   });
@@ -814,13 +815,13 @@ std::unique_ptr<cudf::scalar> reduce_merge_hllpp(cudf::column_view const& input,
 }
 
 struct estimate_fn {
-  cudf::device_span<int64_t const*> sketch_longs;
-  int const precision;
-  int64_t* const out;
+  cudf::device_span<int64_t const*> sketches;
+  int64_t* out;
+  int precision;
 
   __device__ void operator()(cudf::size_type const idx) const
   {
-    auto const num_regs = 1ull << precision;
+    auto const num_regs = 1 << precision;
     double sum          = 0;
     int zeroes          = 0;
 
@@ -828,7 +829,7 @@ struct estimate_fn {
       // each long contains 10 register values
       int long_col_idx    = reg_idx / REGISTERS_PER_LONG;
       int reg_idx_in_long = reg_idx % REGISTERS_PER_LONG;
-      int reg = get_register_value(sketch_longs[long_col_idx][idx], reg_idx_in_long);
+      int reg = get_register_value(sketches[long_col_idx][idx], reg_idx_in_long);
       sum += double{1} / static_cast<double>(1ull << reg);
       zeroes += reg == 0;
     }
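(A worked editorial example of the packing that get_register_value decodes, using the constants this file defines; not code from the series. Ten 6-bit registers fit in one INT64, so register 3 lives at bit offset 3 * 6 = 18. The same extraction, written shift-first:)

  #include <cstdint>

  constexpr int REGISTER_VALUE_BITS = 6;
  constexpr int64_t MASK            = (1L << REGISTER_VALUE_BITS) - 1L;  // 0b111111

  int get_register_value_example(int64_t ten_registers, int reg_idx)
  {
    auto const shift_bits = REGISTER_VALUE_BITS * reg_idx;  // reg_idx 3 -> 18
    return static_cast<int>((ten_registers >> shift_bits) & MASK);
  }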
@@ -848,7 +849,7 @@ std::unique_ptr<cudf::column> group_hyper_log_log_plus_plus(
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision >= 4.");
+  CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision bigger than 4.");
   auto adjust_precision = precision > MAX_PRECISION ? MAX_PRECISION : precision;
   return group_hllpp(input, num_groups, group_lables, adjust_precision, stream, mr);
 }
@@ -861,7 +862,7 @@ std::unique_ptr<cudf::column> group_merge_hyper_log_log_plus_plus(
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision >= 4.");
+  CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision bigger than 4.");
   CUDF_EXPECTS(input.type().id() == cudf::type_id::STRUCT,
                "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns.");
   for (auto i = 0; i < input.num_children(); i++) {
@@ -880,7 +881,7 @@ std::unique_ptr<cudf::scalar> reduce_hyper_log_log_plus_plus(cudf::column_view c
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision >= 4.");
+  CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision bigger than 4.");
   auto adjust_precision = precision > MAX_PRECISION ? MAX_PRECISION : precision;
   return reduce_hllpp(input, adjust_precision, stream, mr);
 }
@@ -891,7 +892,7 @@ std::unique_ptr<cudf::scalar> reduce_merge_hyper_log_log_plus_plus(
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision >= 4.");
+  CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision bigger than 4.");
   CUDF_EXPECTS(input.type().id() == cudf::type_id::STRUCT,
                "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns.");
   for (auto i = 0; i < input.num_children(); i++) {
@@ -910,13 +911,21 @@ std::unique_ptr<cudf::column> estimate_from_hll_sketches(cudf::column_view const
                                                          rmm::cuda_stream_view stream,
                                                          rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision is bigger than 4.");
+  CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision bigger than 4.");
+  CUDF_EXPECTS(input.type().id() == cudf::type_id::STRUCT,
+               "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns.");
+  for (auto i = 0; i < input.num_children(); i++) {
+    CUDF_EXPECTS(input.child(i).type().id() == cudf::type_id::INT64,
+                 "HyperLogLogPlusPlus buffer type must be a STRUCT of long columns.");
+  }
   auto const input_iter = cudf::detail::make_counting_transform_iterator(
     0, [&](int i) { return input.child(i).begin<int64_t>(); });
-  auto input_cols = std::vector<int64_t const*>(input_iter, input_iter + input.num_children());
-  auto d_inputs   = cudf::detail::make_device_uvector_async(input_cols, stream, mr);
-  auto result     = cudf::make_numeric_column(
-    cudf::data_type{cudf::type_id::INT64}, input.size(), cudf::mask_state::ALL_VALID, stream);
+  auto const h_input_ptrs =
+    std::vector<int64_t const*>(input_iter, input_iter + input.num_children());
+  auto d_inputs = cudf::detail::make_device_uvector_async(
+    h_input_ptrs, stream, cudf::get_current_device_resource_ref());
+  auto result = cudf::make_numeric_column(
+    cudf::data_type{cudf::type_id::INT64}, input.size(), cudf::mask_state::UNALLOCATED, stream, mr);
   // evaluate from struct
   thrust::for_each_n(rmm::exec_policy_nosync(stream),
                      thrust::make_counting_iterator(0),
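(Editorial aside on what the finalizer computes: estimate_fn accumulates the harmonic sum and the zero-register count, and cuco's finalizer turns them into the HLL++ estimate. The raw HyperLogLog formula it starts from, stated here for context rather than taken from this series, is

  E_{raw} = \alpha_m \, m^2 \Big/ \sum_{j=1}^{m} 2^{-M_j}, \qquad m = 2^{\text{precision}},

where M_j is the j-th register value; the finalizer then applies the HLL++ bias correction, switching to linear counting when many registers are zero.)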
diff --git a/src/main/cpp/src/hllpp.hpp b/src/main/cpp/src/hyper_log_log_plus_plus.hpp
similarity index 80%
rename from src/main/cpp/src/hllpp.hpp
rename to src/main/cpp/src/hyper_log_log_plus_plus.hpp
index d93e1debdf..33df3b37a4 100644
--- a/src/main/cpp/src/hllpp.hpp
+++ b/src/main/cpp/src/hyper_log_log_plus_plus.hpp
@@ -18,9 +18,9 @@
 #include <cudf/column/column.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/span.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace spark_rapids_jni {
 
@@ -56,8 +56,8 @@ std::unique_ptr<cudf::column> group_hyper_log_log_plus_plus(
   int64_t const num_groups,
   cudf::device_span<cudf::size_type const> group_lables,
   int64_t const precision,
-  rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr);
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * Merge HyperLogLogPlusPlus(HLLPP) sketches in the same group.
@@ -69,18 +69,19 @@ std::unique_ptr<cudf::column> group_merge_hyper_log_log_plus_plus(
   int64_t const num_groups,
   cudf::device_span<cudf::size_type const> group_lables,
   int64_t const precision,
-  rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr);
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * Compute hash codes for the input, generate HyperLogLogPlusPlus(HLLPP)
 * sketches from hash codes, and merge all the sketches into one sketch, output
 * is a struct scalar with multiple long values.
 */
-std::unique_ptr<cudf::scalar> reduce_hyper_log_log_plus_plus(cudf::column_view const& input,
-                                                             int64_t const precision,
-                                                             rmm::cuda_stream_view stream,
-                                                             rmm::device_async_resource_ref mr);
+std::unique_ptr<cudf::scalar> reduce_hyper_log_log_plus_plus(
+  cudf::column_view const& input,
+  int64_t const precision,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * Merge all HyperLogLogPlusPlus(HLLPP) sketches in the input column into one
@@ -90,8 +91,8 @@ std::unique_ptr<cudf::scalar> reduce_hyper_log_log_plus_plus(
 std::unique_ptr<cudf::scalar> reduce_merge_hyper_log_log_plus_plus(
   cudf::column_view const& input,
   int64_t const precision,
-  rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr);
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * Estimate count distinct values for the input which contains
@@ -103,6 +104,5 @@ std::unique_ptr<cudf::scalar> reduce_merge_hyper_log_log_plus_plus(
   cudf::column_view const& input,
   int precision,
   rmm::cuda_stream_view stream = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
-
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 }  // namespace spark_rapids_jni
diff --git a/src/main/cpp/src/hllpp_host_udf.cu b/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu
similarity index 97%
rename from src/main/cpp/src/hllpp_host_udf.cu
rename to src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu
index 370b906b65..a112117c35 100644
--- a/src/main/cpp/src/hllpp_host_udf.cu
+++ b/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu
@@ -13,9 +13,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
 */
-
-#include "hllpp.hpp"
-#include "hllpp_host_udf.hpp"
+#include "hyper_log_log_plus_plus.hpp"
+#include "hyper_log_log_plus_plus_host_udf.hpp"
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/iterator.cuh>
@@ -126,7 +125,8 @@ struct hllpp_udf : cudf::host_udf_base {
                                        std::move(children),
                                        0,                     // null count
                                        rmm::device_buffer{},  // null mask
-                                       stream);
+                                       stream,
+                                       mr);
     }
   }
 
diff --git a/src/main/cpp/src/hllpp_host_udf.hpp b/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.hpp
similarity index 100%
rename from src/main/cpp/src/hllpp_host_udf.hpp
rename to src/main/cpp/src/hyper_log_log_plus_plus_host_udf.hpp
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/HLLPPHostUDF.java b/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java
similarity index 98%
rename from src/main/java/com/nvidia/spark/rapids/jni/HLLPPHostUDF.java
rename to src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java
index 9018474c27..6d09be3de6 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/HLLPPHostUDF.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/HyperLogLogPlusPlusHostUDF.java
@@ -23,7 +23,7 @@
 /**
  * HyperLogLogPlusPlus(HLLPP) host UDF aggregation utils
 */
-public class HLLPPHostUDF {
+public class HyperLogLogPlusPlusHostUDF {
   static {
     NativeDepsLoader.loadNativeDeps();
   }

From 3e225129f21fbff240afa3e61857fe94a1469a4b Mon Sep 17 00:00:00 2001
From: Chong Gao
Date: Thu, 19 Dec 2024 19:21:32 +0800
Subject: [PATCH 10/12] Fix compile error

---
 src/main/cpp/src/hyper_log_log_plus_plus.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/main/cpp/src/hyper_log_log_plus_plus.cu b/src/main/cpp/src/hyper_log_log_plus_plus.cu
index 4ff7850558..0576988322 100644
--- a/src/main/cpp/src/hyper_log_log_plus_plus.cu
+++ b/src/main/cpp/src/hyper_log_log_plus_plus.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
 */
 #include "hash.hpp"
-#include "hllpp.hpp"
+#include "hyper_log_log_plus_plus.hpp"
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/iterator.cuh>
@@ -57,7 +57,7 @@ __device__ inline int get_register_value(int64_t const ten_registers, int reg_id
 {
   auto const shift_bits = REGISTER_VALUE_BITS * reg_idx;
   auto const shift_mask = MASK << shift_bits;
-  auto const v          = (ten_registers & shift_mask) >> shift_bit;
+  auto const v          = (ten_registers & shift_mask) >> shift_bits;
   return static_cast<int>(v);
 }
 
@@ -930,7 +930,7 @@ std::unique_ptr<cudf::column> estimate_from_hll_sketches(cudf::column_view const
   thrust::for_each_n(rmm::exec_policy_nosync(stream),
                      thrust::make_counting_iterator(0),
                      input.size(),
-                     estimate_fn{d_inputs, precision, result->mutable_view().data<int64_t>()});
+                     estimate_fn{d_inputs, result->mutable_view().data<int64_t>(), precision});
   return result;
 }
 

From aa7ca68f003a3722a98d526ae08db690b59e2dff Mon Sep 17 00:00:00 2001
From: Chong Gao
Date: Fri, 20 Dec 2024 19:40:26 +0800
Subject: [PATCH 11/12] Handle null inputs: must ignore the null input values

---
 src/main/cpp/src/hyper_log_log_plus_plus.cu | 40 +++++++++++++--------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/src/main/cpp/src/hyper_log_log_plus_plus.cu b/src/main/cpp/src/hyper_log_log_plus_plus.cu
index 0576988322..974f533987 100644
--- a/src/main/cpp/src/hyper_log_log_plus_plus.cu
+++ b/src/main/cpp/src/hyper_log_log_plus_plus.cu
@@ -21,6 +21,7 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/iterator.cuh>
+#include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/table/table_view.hpp>
@@ -91,6 +92,8 @@ __device__ inline int get_register_value(int64_t const ten_registers, int reg_id
 * (num_hashs / num_hashs_per_thread) sketches.
 * num_threads = div_round_up(num_hashs, num_hashs_per_thread).
 *
+ * Note: Must exclude null hash values from computing HLLPP sketches.
+ *
 * e.g.: num_registers_per_sketch = 512 and num_hashs_per_thread = 4;
 *
 * Input is hashs, compute and get pair: register index -> register value
@@ -181,11 +184,15 @@ CUDF_KERNEL void partial_group_sketches_from_hashs_kernel(
 
   for (auto hash_idx = hash_first; hash_idx < hash_end; hash_idx++) {
     cudf::size_type curr_group = group_lables[hash_idx];
 
-    // cast to unsigned, then >> will shift without preserve the sign bit.
-    uint64_t const hash = static_cast<uint64_t>(hashs.element<int64_t>(hash_idx));
-    auto const reg_idx  = hash >> idx_shift;
-    int const reg_v =
-        static_cast<int>(cuda::std::countl_zero((hash << precision) | w_padding) + 1ULL);
+    int reg_idx = 0;  // init value for null hash
+    int reg_v   = 0;  // init value for null hash
+    if (!hashs.is_null(hash_idx)) {
+      // cast to unsigned, then >> will shift without preserve the sign bit.
+      uint64_t const hash = static_cast<uint64_t>(hashs.element<int64_t>(hash_idx));
+      reg_idx             = hash >> idx_shift;
+      // get the leading zeros
+      reg_v = static_cast<int>(cuda::std::countl_zero((hash << precision) | w_padding) + 1ULL);
+    }
 
     if (curr_group == prev_group) {
       // still in the same group, update the max value
@@ -390,7 +397,8 @@ std::unique_ptr<cudf::column> group_hllpp(cudf::column_view const& input,
   // 1. compute all the hashs
   auto input_table_view = cudf::table_view{{input}};
   auto hash_col         = xxhash64(input_table_view, SEED, stream, mr);
-  auto d_hashs          = cudf::column_device_view::create(hash_col->view(), stream);
+  hash_col->set_null_mask(cudf::detail::copy_bitmask(input, stream, mr), input.null_count());
+  auto d_hashs = cudf::column_device_view::create(hash_col->view(), stream);
 
   // 2. execute partial group by
   int64_t num_blocks_p1 =
@@ -659,13 +667,16 @@ CUDF_KERNEL void reduce_hllpp_kernel(cudf::column_device_view hashs,
 
   // update max reg value for the reg index
   for (int i = tid; i < num_hashs; i += block_size) {
-    uint64_t const hash = static_cast<uint64_t>(hashs.element<int64_t>(i));
-    // use unsigned int to avoid insert 1 for the highest bit when do right
-    // shift
-    uint64_t const reg_idx = hash >> idx_shift;
-    // get the leading zeros
-    int const reg_v =
-        static_cast<int>(cuda::std::countl_zero((hash << precision) | w_padding) + 1ULL);
+    int reg_idx = 0;  // init value for null hash
+    int reg_v   = 0;  // init value for null hash
+    if (!hashs.is_null(i)) {
+      // cast to unsigned, then >> will shift without preserve the sign bit.
+      uint64_t const hash = static_cast<uint64_t>(hashs.element<int64_t>(i));
+      reg_idx             = hash >> idx_shift;
+      // get the leading zeros
+      reg_v = static_cast<int>(cuda::std::countl_zero((hash << precision) | w_padding) + 1ULL);
+    }
+
     cuda::atomic_ref<int, cuda::thread_scope_block> register_ref(shared_data[reg_idx]);
     register_ref.fetch_max(reg_v, cuda::memory_order_relaxed);
   }
@@ -699,7 +710,8 @@ std::unique_ptr<cudf::scalar> reduce_hllpp(cudf::column_view const& input,
   // 1. compute all the hashs
   auto input_table_view = cudf::table_view{{input}};
   auto hash_col         = xxhash64(input_table_view, SEED, stream, mr);
-  auto d_hashs          = cudf::column_device_view::create(hash_col->view(), stream);
+  hash_col->set_null_mask(cudf::detail::copy_bitmask(input, stream, mr), input.null_count());
+  auto d_hashs = cudf::column_device_view::create(hash_col->view(), stream);
 
   // 2. generate long columns, the size of each long column is 1
   auto num_long_cols = num_registers_per_sketch / REGISTERS_PER_LONG + 1;
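(Editorial sketch of the null convention this patch introduces, not code from the series: a null row contributes the pair (reg_idx = 0, reg_v = 0), and since registers only ever grow through max-merges, a zero value can never displace an existing one, so null inputs leave the sketch unchanged:)

  #include <algorithm>
  #include <vector>

  // fold one (reg_idx, reg_v) observation into a sketch of registers
  void update_register(std::vector<int>& registers, int reg_idx, int reg_v)
  {
    registers[reg_idx] = std::max(registers[reg_idx], reg_v);  // reg_v == 0 is a no-op
  }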

From f0970c05b631a02fdd9c4ab1ba04bfb08bedfb73 Mon Sep 17 00:00:00 2001
From: Chong Gao
Date: Mon, 23 Dec 2024 12:04:39 +0800
Subject: [PATCH 12/12] Rename refactor: Correct spelling errors

---
 src/main/cpp/src/hyper_log_log_plus_plus.cu  | 72 +++++++++----------
 src/main/cpp/src/hyper_log_log_plus_plus.hpp |  4 +-
 .../src/hyper_log_log_plus_plus_host_udf.cu  |  6 +-
 3 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/src/main/cpp/src/hyper_log_log_plus_plus.cu b/src/main/cpp/src/hyper_log_log_plus_plus.cu
index 974f533987..f83240f498 100644
--- a/src/main/cpp/src/hyper_log_log_plus_plus.cu
+++ b/src/main/cpp/src/hyper_log_log_plus_plus.cu
@@ -98,7 +98,7 @@ __device__ inline int get_register_value(int64_t const ten_registers, int reg_id
 *
 * Input is hashs, compute and get pair: register index -> register value
 *
- * reg_index, reg_value, group_lable
+ * reg_index, reg_value, group_label
 * [
 * ---------- segment 0 begin --------------------------
 * (0, 1), g0
@@ -123,7 +123,7 @@ __device__ inline int get_register_value(int64_t const ten_registers, int reg_id
 * ]
 * Output e.g.:
 *
- * group_lables_thread_cache:
+ * group_labels_thread_cache:
 * [
 * g1
 * g1
@@ -138,7 +138,7 @@ __device__ inline int get_register_value(int64_t const ten_registers, int reg_id
 *   512 values: [0, 8, 0, ... ] // register values for group 1
 * ]
 * Has num_threads rows, each row is corresponding to
- * `group_lables_thread_cache`
+ * `group_labels_thread_cache`
 *
 * registers_output_cache:
 * [
@@ -153,11 +153,11 @@ __device__ inline int get_register_value(int64_t const ten_registers, int reg_id
 template <int block_size>
 CUDF_KERNEL void partial_group_sketches_from_hashs_kernel(
   cudf::column_device_view hashs,
-  cudf::device_span<cudf::size_type const> group_lables,
+  cudf::device_span<cudf::size_type const> group_labels,
   int64_t const precision,            // num of bits for register addressing, e.g.: 9
   int* const registers_output_cache,  // num is num_groups * num_registers_per_sketch
   int* const registers_thread_cache,  // num is num_threads * num_registers_per_sketch
-  cudf::size_type* const group_lables_thread_cache  // save the group lables for each thread
+  cudf::size_type* const group_labels_thread_cache  // save the group labels for each thread
 )
 {
   auto const tid = cudf::detail::grid_1d::global_thread_id();
@@ -180,9 +184,9 @@ CUDF_KERNEL void partial_group_sketches_from_hashs_kernel(
     sketch_ptr[i] = 0;
   }
 
-  cudf::size_type prev_group = group_lables[hash_first];
+  cudf::size_type prev_group = group_labels[hash_first];
   for (auto hash_idx = hash_first; hash_idx < hash_end; hash_idx++) {
-    cudf::size_type curr_group = group_lables[hash_idx];
+    cudf::size_type curr_group = group_labels[hash_idx];
 
     int reg_idx = 0;  // init value for null hash
     int reg_v   = 0;  // init value for null hash
@@ -216,7 +220,7 @@ CUDF_KERNEL void partial_group_sketches_from_hashs_kernel(
       }
     } else {
       // not the last segment, probe one item forward.
-      if (curr_group != group_lables[hash_idx + 1]) {
+      if (curr_group != group_labels[hash_idx + 1]) {
        // meets a new group by checking the next item in the next segment
        for (auto i = 0; i < num_registers_per_sketch; i++) {
          registers_output_cache[curr_group * num_registers_per_sketch + i] = sketch_ptr[i];
@@ -228,8 +232,8 @@ CUDF_KERNEL void partial_group_sketches_from_hashs_kernel(
     prev_group = curr_group;
   }
 
-  // save the group lable for this thread
-  group_lables_thread_cache[tid] = group_lables[hash_end - 1];
+  // save the group label for this thread
+  group_labels_thread_cache[tid] = group_labels[hash_end - 1];
 }
 
 /*
@@ -242,7 +246,7 @@ CUDF_KERNEL void partial_group_sketches_from_hashs_kernel(
 *
 * Input e.g.:
 *
- * group_lables_thread_cache:
+ * group_labels_thread_cache:
 * [
 * g0
 * g0
@@ -261,7 +265,7 @@ CUDF_KERNEL void partial_group_sketches_from_hashs_kernel(
 * r0_gN, r1_gN, r2_gN, r3_gN, ... , r511_gN // register values for group N
 * ]
 * Has num_threads rows, each row is corresponding to
- * `group_lables_thread_cache`
+ * `group_labels_thread_cache`
 *
 * registers_output_cache:
 * [
@@ -280,7 +284,7 @@ CUDF_KERNEL void merge_sketches_vertically(int64_t num_sketches,
                                            int64_t num_registers_per_sketch,
                                            int* const registers_output_cache,
                                            int const* const registers_thread_cache,
-                                           cudf::size_type const* const group_lables_thread_cache)
+                                           cudf::size_type const* const group_labels_thread_cache)
 {
   __shared__ int8_t shared_data[block_size];
   auto const tid = cudf::detail::grid_1d::global_thread_id();
@@ -288,9 +292,9 @@ CUDF_KERNEL void merge_sketches_vertically(int64_t num_sketches,
   // register idx is tid
   shared_data[shared_idx] = static_cast<int8_t>(0);
 
-  int prev_group = group_lables_thread_cache[0];
+  int prev_group = group_labels_thread_cache[0];
   for (auto i = 0; i < num_sketches; i++) {
-    int curr_group = group_lables_thread_cache[i];
+    int curr_group = group_labels_thread_cache[i];
     int8_t curr_reg_v =
       static_cast<int8_t>(registers_thread_cache[i * num_registers_per_sketch + tid]);
     if (curr_group == prev_group) {
@@ -372,7 +376,7 @@ CUDF_KERNEL void compact_kernel(int64_t const num_groups,
 
 std::unique_ptr<cudf::column> group_hllpp(cudf::column_view const& input,
                                           int64_t const num_groups,
-                                          cudf::device_span<cudf::size_type const> group_lables,
+                                          cudf::device_span<cudf::size_type const> group_labels,
                                           int64_t const precision,
                                           rmm::cuda_stream_view stream,
                                           rmm::device_async_resource_ref mr)
@@ -387,10 +391,10 @@ std::unique_ptr<cudf::column> group_hllpp(cudf::column_view const& input,
     rmm::device_uvector<int>(num_groups * num_registers_per_sketch, stream, mr);
   {  // add this block to release `registers_thread_cache` and
      // `group_lables_thread_cache`
     auto registers_thread_cache = rmm::device_uvector<int>(
       num_threads_partial_kernel * num_registers_per_sketch, stream, mr);
-    auto group_lables_thread_cache =
+    auto group_labels_thread_cache =
       rmm::device_uvector<cudf::size_type>(num_threads_partial_kernel, stream, mr);
 
     {  // add this block to release `hash_col`
@@ -405,11 +409,11 @@ std::unique_ptr<cudf::column> group_hllpp(cudf::column_view const& input,
       hash_col->set_null_mask(cudf::detail::copy_bitmask(input, stream, mr), input.null_count());
       auto d_hashs = cudf::column_device_view::create(hash_col->view(), stream);
 
       // 2. execute partial group by
       int64_t num_blocks_p1 =
         cudf::util::div_rounding_up_safe(num_threads_partial_kernel, block_size);
       partial_group_sketches_from_hashs_kernel<block_size>
         <<<num_blocks_p1, block_size, 0, stream.value()>>>(*d_hashs,
-                                                           group_lables,
+                                                           group_labels,
                                                            precision,
                                                            sketches_output.begin(),
                                                            registers_thread_cache.begin(),
-                                                           group_lables_thread_cache.begin());
+                                                           group_labels_thread_cache.begin());
     }
 
     // 3. merge the intermediate result
@@ -420,7 +424,7 @@ std::unique_ptr<cudf::column> group_hllpp(cudf::column_view const& input,
     merge_sketches_vertically<block_size>
       <<<num_merge_blocks, block_size, block_size, stream.value()>>>(
         num_threads_partial_kernel,
        num_registers_per_sketch,
        sketches_output.begin(),
        registers_thread_cache.begin(),
-        group_lables_thread_cache.begin());
+        group_labels_thread_cache.begin());
   }
 
   // 4. create output columns
@@ -485,13 +489,13 @@ CUDF_KERNEL void partial_group_long_sketches_kernel(
   int64_t const num_threads_per_col,
   int64_t const num_registers_per_sketch,
   int64_t const num_groups,
-  cudf::device_span<cudf::size_type const> group_lables,
+  cudf::device_span<cudf::size_type const> group_labels,
   // num_groups * num_registers_per_sketch integers
   int* const registers_output_cache,
   // num_threads * num_registers_per_sketch integers
   int* const registers_thread_cache,
   // num_threads integers
-  cudf::size_type* const group_lables_thread_cache)
+  cudf::size_type* const group_labels_thread_cache)
 {
   auto const tid           = cudf::detail::grid_1d::global_thread_id();
   auto const num_long_cols = sketches_input.size();
@@ -511,11 +515,11 @@ CUDF_KERNEL void partial_group_long_sketches_kernel(
   if (long_idx == num_long_cols - 1) { num_regs = num_registers_per_sketch % REGISTERS_PER_LONG; }
 
   for (auto i = 0; i < num_regs; i++) {
-    cudf::size_type prev_group = group_lables[sketch_first];
+    cudf::size_type prev_group = group_labels[sketch_first];
     int max_reg_v              = 0;
     int reg_idx_in_sketch      = long_idx * REGISTERS_PER_LONG + i;
     for (auto sketch_idx = sketch_first; sketch_idx < sketch_end; sketch_idx++) {
-      cudf::size_type curr_group = group_lables[sketch_idx];
+      cudf::size_type curr_group = group_labels[sketch_idx];
       int curr_reg_v             = get_register_value(longs_ptr[sketch_idx], i);
       if (curr_group == prev_group) {
         // still in the same group, update the max value
@@ -537,7 +541,7 @@ CUDF_KERNEL void partial_group_long_sketches_kernel(
         registers_output_cache[output_idx_curr] = max_reg_v;
         max_reg_v                               = curr_reg_v;
       } else {
-        if (curr_group != group_lables[sketch_idx + 1]) {
+        if (curr_group != group_labels[sketch_idx + 1]) {
          // look the first item in the next segment
          registers_output_cache[output_idx_curr] = max_reg_v;
          max_reg_v                               = curr_reg_v;
@@ -553,7 +557,7 @@ CUDF_KERNEL void partial_group_long_sketches_kernel(
   }
 
   if (long_idx == 0) {
-    group_lables_thread_cache[thread_idx_in_cols] = group_lables[sketch_end - 1];
+    group_labels_thread_cache[thread_idx_in_cols] = group_labels[sketch_end - 1];
   }
 }
 
@@ -564,7 +568,7 @@ CUDF_KERNEL void partial_group_long_sketches_kernel(
 std::unique_ptr<cudf::column> group_merge_hllpp(
   cudf::column_view const& hll_input,  // struct column
   int64_t const num_groups,
-  cudf::device_span<cudf::size_type const> group_lables,
+  cudf::device_span<cudf::size_type const> group_labels,
   int64_t const precision,
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr)
@@ -584,7 +588,7 @@ std::unique_ptr<cudf::column> group_merge_hllpp(
   {
     auto registers_thread_cache =
       rmm::device_uvector<int>(num_registers_per_sketch * num_threads_phase1, stream, mr);
-    auto group_lables_thread_cache =
+    auto group_labels_thread_cache =
       rmm::device_uvector<cudf::size_type>(num_threads_per_col_phase1, stream, mr);
 
     cudf::structs_column_view scv(hll_input);
@@ -599,10 +603,10 @@ std::unique_ptr<cudf::column> group_merge_hllpp(
       num_threads_per_col_phase1,
      num_registers_per_sketch,
      num_groups,
-      group_lables,
+      group_labels,
      registers_output_cache.begin(),
      registers_thread_cache.begin(),
-      group_lables_thread_cache.begin());
+      group_labels_thread_cache.begin());
     auto const num_phase2_threads = num_registers_per_sketch;
     auto const num_phase2_blocks  = cudf::util::div_rounding_up_safe(num_phase2_threads, block_size);
     // 2nd kernel: vertical merge
@@ -612,7 +616,7 @@ std::unique_ptr<cudf::column> group_merge_hllpp(
     merge_sketches_vertically<block_size>
       <<<num_phase2_blocks, block_size, block_size, stream.value()>>>(
         num_threads_per_col_phase1,
        num_registers_per_sketch,
        registers_output_cache.begin(),
        registers_thread_cache.begin(),
-        group_lables_thread_cache.begin());
+        group_labels_thread_cache.begin());
   }
 
   // create output columns
@@ -856,20 +860,20 @@ struct estimate_fn {
 std::unique_ptr<cudf::column> group_hyper_log_log_plus_plus(
   cudf::column_view const& input,
   int64_t const num_groups,
-  cudf::device_span<cudf::size_type const> group_lables,
+  cudf::device_span<cudf::size_type const> group_labels,
   int64_t const precision,
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(precision >= 4, "HyperLogLogPlusPlus requires precision bigger than 4.");
   auto adjust_precision = precision > MAX_PRECISION ? MAX_PRECISION : precision;
-  return group_hllpp(input, num_groups, group_lables, adjust_precision, stream, mr);
+  return group_hllpp(input, num_groups, group_labels, adjust_precision, stream, mr);
 }
 
 std::unique_ptr<cudf::column> group_merge_hyper_log_log_plus_plus(
   cudf::column_view const& input,
   int64_t const num_groups,
-  cudf::device_span<cudf::size_type const> group_lables,
+  cudf::device_span<cudf::size_type const> group_labels,
   int64_t const precision,
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr)
@@ -885,7 +889,7 @@ std::unique_ptr<cudf::column> group_merge_hyper_log_log_plus_plus(
   auto expected_num_longs = (1 << adjust_precision) / REGISTERS_PER_LONG + 1;
   CUDF_EXPECTS(input.num_children() == expected_num_longs,
                "The num of long columns in input is incorrect.");
-  return group_merge_hllpp(input, num_groups, group_lables, adjust_precision, stream, mr);
+  return group_merge_hllpp(input, num_groups, group_labels, adjust_precision, stream, mr);
 }
 
 std::unique_ptr<cudf::scalar> reduce_hyper_log_log_plus_plus(cudf::column_view const& input,
diff --git a/src/main/cpp/src/hyper_log_log_plus_plus.hpp b/src/main/cpp/src/hyper_log_log_plus_plus.hpp
index 33df3b37a4..0489e67b1f 100644
--- a/src/main/cpp/src/hyper_log_log_plus_plus.hpp
+++ b/src/main/cpp/src/hyper_log_log_plus_plus.hpp
@@ -54,7 +54,7 @@ constexpr int MAX_PRECISION = 18;
 std::unique_ptr<cudf::column> group_hyper_log_log_plus_plus(
   cudf::column_view const& input,
   int64_t const num_groups,
-  cudf::device_span<cudf::size_type const> group_lables,
+  cudf::device_span<cudf::size_type const> group_labels,
   int64_t const precision,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
@@ -67,7 +67,7 @@ std::unique_ptr<cudf::column> group_hyper_log_log_plus_plus(
 std::unique_ptr<cudf::column> group_merge_hyper_log_log_plus_plus(
   cudf::column_view const& input,
   int64_t const num_groups,
-  cudf::device_span<cudf::size_type const> group_lables,
+  cudf::device_span<cudf::size_type const> group_labels,
   int64_t const precision,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
diff --git a/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu b/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu
index a112117c35..ac99018f0d 100644
--- a/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu
+++ b/src/main/cpp/src/hyper_log_log_plus_plus_host_udf.cu
@@ -82,15 +82,15 @@ struct hllpp_udf : cudf::host_udf_base {
       auto const group_offsets = std::get<cudf::device_span<cudf::size_type const>>(
         udf_input.at(groupby_data_attribute::GROUP_OFFSETS));
       int num_groups          = group_offsets.size() - 1;
-      auto const group_lables = std::get<cudf::device_span<cudf::size_type const>>(
+      auto const group_labels = std::get<cudf::device_span<cudf::size_type const>>(
         udf_input.at(groupby_data_attribute::GROUP_LABELS));
       if (is_merge) {
         // group by intermediate result, group_values are struct of long columns
         return spark_rapids_jni::group_merge_hyper_log_log_plus_plus(
-          group_values, num_groups, group_lables, precision, stream, mr);
+          group_values, num_groups, group_labels, precision, stream, mr);
       } else {
         return spark_rapids_jni::group_hyper_log_log_plus_plus(
-          group_values, num_groups, group_lables, precision, stream, mr);
+          group_values, num_groups, group_labels, precision, stream, mr);
       }
     }
   }