diff --git a/src/main/java/ai/rapids/cudf/CudfAccessor.java b/src/main/java/ai/rapids/cudf/CudfAccessor.java
deleted file mode 100644
index 3352ee1ae9..0000000000
--- a/src/main/java/ai/rapids/cudf/CudfAccessor.java
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package ai.rapids.cudf;
-
-// TODO: properly expose these functions in the actual Scalar API and remove this layer.
-// https://github.com/NVIDIA/spark-rapids-jni/issues/1307
-public class CudfAccessor {
- public static long getScalarHandle(Scalar s) {
- return s.getScalarHandle();
- }
-
- public static Scalar scalarFromHandle(DType type, long scalarHandle) {
- return new Scalar(type, scalarHandle);
- }
-}
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/BloomFilter.java b/src/main/java/com/nvidia/spark/rapids/jni/BloomFilter.java
index 46bf9a7f08..6a676a54bb 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/BloomFilter.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/BloomFilter.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -16,12 +16,8 @@
package com.nvidia.spark.rapids.jni;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
import ai.rapids.cudf.BaseDeviceMemoryBuffer;
import ai.rapids.cudf.ColumnVector;
-import ai.rapids.cudf.CudfAccessor;
import ai.rapids.cudf.CudfException;
import ai.rapids.cudf.DType;
import ai.rapids.cudf.Scalar;
@@ -46,7 +42,7 @@ public static Scalar create(int numHashes, long bloomFilterBits){
if(bloomFilterBits <= 0){
throw new IllegalArgumentException("Bloom filters must have a positive number of bits");
}
- return CudfAccessor.scalarFromHandle(DType.LIST, creategpu(numHashes, bloomFilterBits));
+ return new Scalar(DType.LIST, creategpu(numHashes, bloomFilterBits));
}
/**
@@ -55,39 +51,39 @@ public static Scalar create(int numHashes, long bloomFilterBits){
* @param cv The column containing the values to add.
*/
public static void put(Scalar bloomFilter, ColumnVector cv){
- put(CudfAccessor.getScalarHandle(bloomFilter), cv.getNativeView());
+ put(bloomFilter.getScalarHandle(), cv.getNativeView());
}
/**
* Merge one or more bloom filters into a new bloom filter.
- * @param bloomFilters A ColumnVector containing a bloom filter per row.
+ * @param bloomFilters A ColumnVector containing a bloom filter per row.
* @return A new bloom filter containing the merged inputs.
*/
public static Scalar merge(ColumnVector bloomFilters){
- return CudfAccessor.scalarFromHandle(DType.LIST, merge(bloomFilters.getNativeView()));
+ return new Scalar(DType.LIST, merge(bloomFilters.getNativeView()));
}
/**
- * Probe a bloom filter with a column of longs. Returns a column of booleans. For
+ * Probe a bloom filter with a column of longs. Returns a column of booleans. For
* each row in the output; a value of true indicates that the corresponding input value
* -may- be in the set of values used to build the bloom filter; a value of false indicates
* that the corresponding input value is conclusively not in the set of values used to build
- * the bloom filter.
+ * the bloom filter.
* @param bloomFilter The bloom filter to be probed.
* @param cv The column containing the values to check.
* @return A boolean column indicating the results of the probe.
*/
public static ColumnVector probe(Scalar bloomFilter, ColumnVector cv){
- return new ColumnVector(probe(CudfAccessor.getScalarHandle(bloomFilter), cv.getNativeView()));
+ return new ColumnVector(probe(bloomFilter.getScalarHandle(), cv.getNativeView()));
}
/**
- * Probe a bloom filter with a column of longs. Returns a column of booleans. For
+ * Probe a bloom filter with a column of longs. Returns a column of booleans. For
* each row in the output; a value of true indicates that the corresponding input value
* -may- be in the set of values used to build the bloom filter; a value of false indicates
* that the corresponding input value is conclusively not in the set of values used to build
- * the bloom filter.
- * @param bloomFilter The bloom filter to be probed. This buffer is expected to be the
+ * the bloom filter.
+ * @param bloomFilter The bloom filter to be probed. This buffer is expected to be the
* fully packed Spark bloom filter, including header.
* @param cv The column containing the values to check.
* @return A boolean column indicating the results of the probe.
@@ -95,10 +91,10 @@ public static ColumnVector probe(Scalar bloomFilter, ColumnVector cv){
public static ColumnVector probe(BaseDeviceMemoryBuffer bloomFilter, ColumnVector cv){
return new ColumnVector(probebuffer(bloomFilter.getAddress(), bloomFilter.getLength(), cv.getNativeView()));
}
-
+
private static native long creategpu(int numHashes, long bloomFilterBits) throws CudfException;
private static native int put(long bloomFilter, long cv) throws CudfException;
private static native long merge(long bloomFilters) throws CudfException;
- private static native long probe(long bloomFilter, long cv) throws CudfException;
- private static native long probebuffer(long bloomFilter, long bloomFilterSize, long cv) throws CudfException;
+ private static native long probe(long bloomFilter, long cv) throws CudfException;
+ private static native long probebuffer(long bloomFilter, long bloomFilterSize, long cv) throws CudfException;
}
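For context on the BloomFilter API touched above, here is a minimal usage sketch. It only relies on the methods whose signatures appear in this diff plus standard cudf-java ColumnVector factories; the hash count and bit count are arbitrary example values, not recommendations.

```java
import ai.rapids.cudf.ColumnVector;
import ai.rapids.cudf.Scalar;
import com.nvidia.spark.rapids.jni.BloomFilter;

public class BloomFilterExample {
  public static void main(String[] args) {
    // Build a bloom filter over a column of longs, then probe it with candidate values.
    try (Scalar bloomFilter = BloomFilter.create(3, 8L * 1024);
         ColumnVector buildKeys = ColumnVector.fromLongs(1L, 2L, 3L, 42L);
         ColumnVector probeKeys = ColumnVector.fromLongs(2L, 7L, 42L)) {
      BloomFilter.put(bloomFilter, buildKeys);
      try (ColumnVector maybePresent = BloomFilter.probe(bloomFilter, probeKeys)) {
        // maybePresent is a boolean column: true means the value may be in the build set,
        // false means it is definitely not.
      }
    }
  }
}
```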
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/DateTimeRebase.java b/src/main/java/com/nvidia/spark/rapids/jni/DateTimeRebase.java
index d73ee038d6..7ec98aa930 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/DateTimeRebase.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/DateTimeRebase.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -19,41 +19,18 @@
import ai.rapids.cudf.*;
/**
- * Utility class for converting between column major and row major data
+ * This class will be removed after the plugin picks up the DateTimeUtils class.
*/
public class DateTimeRebase {
static {
NativeDepsLoader.loadNativeDeps();
}
- /**
- * Convert the given timestamps as a number of days or microseconds since the epoch instant
- * 1970-01-01T00:00:00Z to a local date-time in Proleptic Gregorian calendar, reinterpreting
- * the result as in Julian calendar, then compute the number of days or microseconds since the
- * epoch from that Julian local date-time.
- *
- * This is to match with Apache Spark's `localRebaseGregorianToJulianDays` and
- * `rebaseGregorianToJulianMicros` functions with timezone fixed to UTC.
- */
public static ColumnVector rebaseGregorianToJulian(ColumnView input) {
- return new ColumnVector(rebaseGregorianToJulian(input.getNativeView()));
+ return DateTimeUtils.rebaseGregorianToJulian(input);
}
- /**
- * Convert the given timestamps as a number of days or microseconds since the epoch instant
- * 1970-01-01T00:00:00Z to a local date-time in Julian calendar, reinterpreting the result
- * as in Proleptic Gregorian calendar, then compute the number of days or microseconds since the
- * epoch from that Gregorian local date-time.
- *
- * This is to match with Apache Spark's `localRebaseJulianToGregorianDays` and
- * `rebaseJulianToGregorianMicros` functions with timezone fixed to UTC.
- */
public static ColumnVector rebaseJulianToGregorian(ColumnView input) {
- return new ColumnVector(rebaseJulianToGregorian(input.getNativeView()));
+ return DateTimeUtils.rebaseJulianToGregorian(input);
}
-
-
- private static native long rebaseGregorianToJulian(long nativeHandle);
-
- private static native long rebaseJulianToGregorian(long nativeHandle);
}
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/DateTimeUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/DateTimeUtils.java
new file mode 100644
index 0000000000..b02aa1eca6
--- /dev/null
+++ b/src/main/java/com/nvidia/spark/rapids/jni/DateTimeUtils.java
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids.jni;
+
+import ai.rapids.cudf.*;
+
+/**
+ * Utility class for date/time operations, such as calendar rebasing and truncation.
+ */
+public class DateTimeUtils {
+ static {
+ NativeDepsLoader.loadNativeDeps();
+ }
+
+ /**
+ * Convert the given timestamps as a number of days or microseconds since the epoch instant
+ * 1970-01-01T00:00:00Z to a local date-time in Proleptic Gregorian calendar, reinterpreting
+ * the result as in Julian calendar, then compute the number of days or microseconds since the
+ * epoch from that Julian local date-time.
+ *
+ * This is to match with Apache Spark's `localRebaseGregorianToJulianDays` and
+ * `rebaseGregorianToJulianMicros` functions with timezone fixed to UTC.
+ *
+ * @param input The input column
+ * @return A new column with the rebase applied
+ */
+ public static ColumnVector rebaseGregorianToJulian(ColumnView input) {
+ return new ColumnVector(rebaseGregorianToJulian(input.getNativeView()));
+ }
+
+ /**
+ * Convert the given timestamps as a number of days or microseconds since the epoch instant
+ * 1970-01-01T00:00:00Z to a local date-time in Julian calendar, reinterpreting the result
+ * as in Proleptic Gregorian calendar, then compute the number of days or microseconds since the
+ * epoch from that Gregorian local date-time.
+ *
+ * This is to match with Apache Spark's `localRebaseJulianToGregorianDays` and
+ * `rebaseJulianToGregorianMicros` functions with timezone fixed to UTC.
+ *
+ * @param input The input column
+ * @return A new column with the rebase applied
+ */
+ public static ColumnVector rebaseJulianToGregorian(ColumnView input) {
+ return new ColumnVector(rebaseJulianToGregorian(input.getNativeView()));
+ }
+
+ /**
+ * Truncate the given date or timestamp to the unit specified by the format string.
+ *
+ * The input date/time must be of type TIMESTAMP_DAYS or TIMESTAMP_MICROSECONDS, and the format
+ * must be of type STRING. The format strings are case-insensitive.
+ *
+ * For TIMESTAMP_DAYS, the valid formats are:
+ * - {@code "YEAR", "YYYY", "YY"}: truncate to the first date of the year.
+ * - {@code "QUARTER"}: truncate to the first date of the quarter.
+ * - {@code "MONTH", "MM", "MON"}: truncate to the first date of the month.
+ * - {@code "WEEK"}: truncate to the Monday of the week.
+ *
+ * For TIMESTAMP_MICROSECONDS, the valid formats are:
+ * - {@code "YEAR", "YYYY", "YY"}: truncate to the first date of the year.
+ * - {@code "QUARTER"}: truncate to the first date of the quarter.
+ * - {@code "MONTH", "MM", "MON"}: truncate to the first date of the month.
+ * - {@code "WEEK"}: truncate to the Monday of the week.
+ * - {@code "DAY", "DD"}: zero out the time part.
+ * - {@code "HOUR"}: zero out the minute and second with fraction part.
+ * - {@code "MINUTE"}: zero out the second with fraction part.
+ * - {@code "SECOND"}: zero out the second fraction part.
+ * - {@code "MILLISECOND"}: zero out the microseconds.
+ * - {@code "MICROSECOND"}: keep everything.
+ *
+ * @param datetime The input date/time
+ * @param format The time component to truncate to
+ * @return The truncated date/time
+ */
+ public static ColumnVector truncate(ColumnView datetime, ColumnView format) {
+ return new ColumnVector(truncateWithColumnFormat(datetime.getNativeView(),
+ format.getNativeView()));
+ }
+
+ /**
+ * Truncate the given date or timestamp to the unit specified by the format string.
+ *
+ * This function is similar to {@link #truncate(ColumnView, ColumnView)} but the input format
+ * is a string literal instead of a column.
+ *
+ * @param datetime The input date/time
+ * @param format The time component to truncate to
+ * @return The truncated date/time
+ */
+ public static ColumnVector truncate(ColumnView datetime, String format) {
+ return new ColumnVector(truncateWithScalarFormat(datetime.getNativeView(), format));
+ }
+
+
+ private static native long rebaseGregorianToJulian(long nativeHandle);
+
+ private static native long rebaseJulianToGregorian(long nativeHandle);
+
+ private static native long truncateWithColumnFormat(long datetimeHandle, long formatHandle);
+
+ private static native long truncateWithScalarFormat(long datetimeHandle, String format);
+}
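A small usage sketch of the new DateTimeUtils API described by the javadoc above, using the cudf-java timestamp factory methods that also appear in the tests later in this diff; the input values are arbitrary.

```java
import ai.rapids.cudf.ColumnVector;
import com.nvidia.spark.rapids.jni.DateTimeUtils;

public class DateTimeUtilsExample {
  public static void main(String[] args) {
    try (ColumnVector days = ColumnVector.timestampDaysFromBoxedInts(0, 18335, null);
         // Truncate each TIMESTAMP_DAYS value to the first day of its month; nulls stay null.
         ColumnVector truncated = DateTimeUtils.truncate(days, "MONTH");
         // Rebase the same values from the proleptic Gregorian calendar to the Julian calendar.
         ColumnVector rebased = DateTimeUtils.rebaseGregorianToJulian(days)) {
      // Both results are new TIMESTAMP_DAYS columns that the caller must close.
    }
  }
}
```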
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuSubstringIndexUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuSubstringIndexUtils.java
index a8750919c9..81d2da56bf 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/GpuSubstringIndexUtils.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuSubstringIndexUtils.java
@@ -24,7 +24,7 @@ public class GpuSubstringIndexUtils {
}
public static ColumnVector substringIndex(ColumnView cv, Scalar delimiter, int count){
- return new ColumnVector(substringIndex(cv.getNativeView(), CudfAccessor.getScalarHandle(delimiter), count));
+ return new ColumnVector(substringIndex(cv.getNativeView(), delimiter.getScalarHandle(), count));
}
private static native long substringIndex(long columnView, long delimiter, int count) throws CudfException;
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/Hash.java b/src/main/java/com/nvidia/spark/rapids/jni/Hash.java
index 16971c5bdb..2d23ae5256 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/Hash.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/Hash.java
@@ -22,13 +22,15 @@
import ai.rapids.cudf.NativeDepsLoader;
public class Hash {
- // there doesn't appear to be a useful constant in spark to reference. this could break.
- static final long DEFAULT_XXHASH64_SEED = 42;
-
static {
NativeDepsLoader.loadNativeDeps();
}
+ // there doesn't appear to be a useful constant in spark to reference. this could break.
+ static final long DEFAULT_XXHASH64_SEED = 42;
+
+ public static final int MAX_STACK_DEPTH = getMaxStackDepth();
+
/**
* Create a new vector containing spark's 32-bit murmur3 hash of each row in the table.
* Spark's murmur3 hash uses a different tail processing algorithm.
@@ -75,7 +77,6 @@ public static ColumnVector xxhash64(long seed, ColumnView columns[]) {
assert columns[i] != null : "Column vectors passed may not be null";
assert columns[i].getRowCount() == size : "Row count mismatch, all columns must be the same size";
assert !columns[i].getType().isDurationType() : "Unsupported column type Duration";
- assert !columns[i].getType().isNestedType() : "Unsupported column type Nested";
columnViews[i] = columns[i].getNativeView();
}
return new ColumnVector(xxhash64(seed, columnViews));
@@ -101,6 +102,8 @@ public static ColumnVector hiveHash(ColumnView columns[]) {
return new ColumnVector(hiveHash(columnViews));
}
+ private static native int getMaxStackDepth();
+
private static native long murmurHash32(int seed, long[] viewHandles) throws CudfException;
private static native long xxhash64(long seed, long[] viewHandles) throws CudfException;
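With the nested-type assert removed and the new public MAX_STACK_DEPTH constant, xxhash64 now accepts struct/list columns up to a native depth limit. A minimal sketch, assuming the no-seed xxhash64 overload exercised by HashTest later in this diff:

```java
import ai.rapids.cudf.ColumnVector;
import ai.rapids.cudf.ColumnView;
import com.nvidia.spark.rapids.jni.Hash;

public class XXHash64Example {
  public static void main(String[] args) {
    try (ColumnVector strs = ColumnVector.fromStrings("a", "b", null);
         ColumnVector ints = ColumnVector.fromBoxedInts(1, 2, null);
         ColumnView structs = ColumnView.makeStructView(strs, ints);
         ColumnVector hashes = Hash.xxhash64(new ColumnView[]{structs})) {
      // One 64-bit hash per row; columns nested deeper than Hash.MAX_STACK_DEPTH
      // are rejected by the native code with a CudfException.
    }
  }
}
```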
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/RegexRewriteUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/RegexRewriteUtils.java
index 9277c3e0f9..d509227cdf 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/RegexRewriteUtils.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/RegexRewriteUtils.java
@@ -28,7 +28,7 @@ public class RegexRewriteUtils {
* a literal string followed by a range of characters in the range of start to end, with at least
* len characters.
*
- * @param strings Column of strings to check for literal.
+ * @param input Column of strings to check for literal.
* @param literal UTF-8 encoded string to check in strings column.
* @param len Minimum number of characters to check after the literal.
* @param start Minimum UTF-8 codepoint value to check for in the range.
@@ -37,7 +37,7 @@ public class RegexRewriteUtils {
*/
public static ColumnVector literalRangePattern(ColumnVector input, Scalar literal, int len, int start, int end) {
assert(input.getType().equals(DType.STRING)) : "column must be a String";
- return new ColumnVector(literalRangePattern(input.getNativeView(), CudfAccessor.getScalarHandle(literal), len, start, end));
+ return new ColumnVector(literalRangePattern(input.getNativeView(), literal.getScalarHandle(), len, start, end));
}
private static native long literalRangePattern(long input, long literal, int len, int start, int end);
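For the literalRangePattern change above, a usage sketch with illustrative values: per the javadoc, this is roughly the check a pattern such as "abc[0-9]{2,}" would express (the literal followed by at least `len` characters in the given codepoint range).

```java
import ai.rapids.cudf.ColumnVector;
import ai.rapids.cudf.Scalar;
import com.nvidia.spark.rapids.jni.RegexRewriteUtils;

public class LiteralRangePatternExample {
  public static void main(String[] args) {
    try (ColumnVector input = ColumnVector.fromStrings("abc12", "abc1", "xyz", null);
         Scalar literal = Scalar.fromString("abc");
         // Look for the literal "abc" followed by at least 2 characters in the
         // codepoint range '0'..'9'.
         ColumnVector matches = RegexRewriteUtils.literalRangePattern(input, literal, 2, '0', '9')) {
      // matches is a boolean column with one result per input row.
    }
  }
}
```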
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/kudo/KudoSerializer.java b/src/main/java/com/nvidia/spark/rapids/jni/kudo/KudoSerializer.java
index 6370531428..7ae784e639 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/kudo/KudoSerializer.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/kudo/KudoSerializer.java
@@ -16,20 +16,29 @@
package com.nvidia.spark.rapids.jni.kudo;
-import ai.rapids.cudf.*;
+import static com.nvidia.spark.rapids.jni.Preconditions.ensure;
+import static java.util.Objects.requireNonNull;
+
+import ai.rapids.cudf.BufferType;
+import ai.rapids.cudf.Cuda;
+import ai.rapids.cudf.HostColumnVector;
+import ai.rapids.cudf.JCudfSerialization;
+import ai.rapids.cudf.Schema;
+import ai.rapids.cudf.Table;
import com.nvidia.spark.rapids.jni.Pair;
import com.nvidia.spark.rapids.jni.schema.Visitors;
-
-import java.io.*;
+import java.io.BufferedOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
import java.util.Arrays;
import java.util.List;
import java.util.function.LongConsumer;
import java.util.function.Supplier;
import java.util.stream.IntStream;
-import static com.nvidia.spark.rapids.jni.Preconditions.ensure;
-import static java.util.Objects.requireNonNull;
-
/**
* This class is used to serialize/deserialize a table using the Kudo format.
*
@@ -148,8 +157,9 @@
public class KudoSerializer {
private static final byte[] PADDING = new byte[64];
- private static final BufferType[] ALL_BUFFER_TYPES = new BufferType[]{BufferType.VALIDITY, BufferType.OFFSET,
- BufferType.DATA};
+ private static final BufferType[] ALL_BUFFER_TYPES =
+ new BufferType[] {BufferType.VALIDITY, BufferType.OFFSET,
+ BufferType.DATA};
static {
Arrays.fill(PADDING, (byte) 0);
@@ -176,7 +186,7 @@ public KudoSerializer(Schema schema) {
* @param numRows number of rows to write
* @return number of bytes written
*/
- long writeToStream(Table table, OutputStream out, int rowOffset, int numRows) {
+ WriteMetrics writeToStreamWithMetrics(Table table, OutputStream out, int rowOffset, int numRows) {
HostColumnVector[] columns = null;
try {
columns = IntStream.range(0, table.getNumberOfColumns())
@@ -185,7 +195,7 @@ long writeToStream(Table table, OutputStream out, int rowOffset, int numRows) {
.toArray(HostColumnVector[]::new);
Cuda.DEFAULT_STREAM.sync();
- return writeToStream(columns, out, rowOffset, numRows);
+ return writeToStreamWithMetrics(columns, out, rowOffset, numRows);
} finally {
if (columns != null) {
for (HostColumnVector column : columns) {
@@ -195,6 +205,18 @@ long writeToStream(Table table, OutputStream out, int rowOffset, int numRows) {
}
}
+ /**
+ * Write a partition of an array of {@link HostColumnVector} to an output stream.
+ * See {@link #writeToStreamWithMetrics(HostColumnVector[], OutputStream, int, int)} for more
+ * details.
+ *
+ * @return number of bytes written
+ */
+ public long writeToStream(HostColumnVector[] columns, OutputStream out, int rowOffset,
+ int numRows) {
+ return writeToStreamWithMetrics(columns, out, rowOffset, numRows).getWrittenBytes();
+ }
+
/**
* Write partition of an array of {@link HostColumnVector} to an output stream.
*
@@ -208,7 +230,8 @@ long writeToStream(Table table, OutputStream out, int rowOffset, int numRows) {
* @param numRows number of rows to write
* @return number of bytes written
*/
- public long writeToStream(HostColumnVector[] columns, OutputStream out, int rowOffset, int numRows) {
+ public WriteMetrics writeToStreamWithMetrics(HostColumnVector[] columns, OutputStream out,
+ int rowOffset, int numRows) {
ensure(numRows > 0, () -> "numRows must be > 0, but was " + numRows);
ensure(columns.length > 0, () -> "columns must not be empty, for row count only records " +
"please call writeRowCountToStream");
@@ -286,17 +309,25 @@ public Pair
mergeToTable(List kudoTables) throws
}
}
- private long writeSliced(HostColumnVector[] columns, DataWriter out, int rowOffset, int numRows) throws Exception {
- KudoTableHeaderCalc headerCalc = new KudoTableHeaderCalc(rowOffset, numRows, flattenedColumnCount);
- Visitors.visitColumns(columns, headerCalc);
+ private WriteMetrics writeSliced(HostColumnVector[] columns, DataWriter out, int rowOffset,
+ int numRows) throws Exception {
+ WriteMetrics metrics = new WriteMetrics();
+ KudoTableHeaderCalc headerCalc =
+ new KudoTableHeaderCalc(rowOffset, numRows, flattenedColumnCount);
+ withTime(() -> Visitors.visitColumns(columns, headerCalc), metrics::addCalcHeaderTime);
KudoTableHeader header = headerCalc.getHeader();
+ long currentTime = System.nanoTime();
header.writeTo(out);
+ metrics.addCopyHeaderTime(System.nanoTime() - currentTime);
+ metrics.addWrittenBytes(header.getSerializedSize());
long bytesWritten = 0;
for (BufferType bufferType : ALL_BUFFER_TYPES) {
- SlicedBufferSerializer serializer = new SlicedBufferSerializer(rowOffset, numRows, bufferType, out);
+ SlicedBufferSerializer serializer = new SlicedBufferSerializer(rowOffset, numRows, bufferType,
+ out, metrics);
Visitors.visitColumns(columns, serializer);
bytesWritten += serializer.getTotalDataLen();
+ metrics.addWrittenBytes(serializer.getTotalDataLen());
}
if (bytesWritten != header.getTotalDataLen()) {
@@ -307,7 +338,7 @@ private long writeSliced(HostColumnVector[] columns, DataWriter out, int rowOffs
out.flush();
- return header.getSerializedSize() + bytesWritten;
+ return metrics;
}
private static DataWriter writerFrom(OutputStream out) {
@@ -348,6 +379,12 @@ static T withTime(Supplier task, LongConsumer timeConsumer) {
return ret;
}
+ static void withTime(Runnable task, LongConsumer timeConsumer) {
+ long now = System.nanoTime();
+ task.run();
+ timeConsumer.accept(System.nanoTime() - now);
+ }
+
/**
* This method returns the length in bytes needed to represent X number of rows
* e.g. getValidityLengthInBytes(5) => 1 byte
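A sketch of the new public writeToStreamWithMetrics entry point and the WriteMetrics it returns, assuming the standard cudf-java HostColumnVector and Schema factories; the schema, column names, and data are arbitrary example values.

```java
import ai.rapids.cudf.DType;
import ai.rapids.cudf.HostColumnVector;
import ai.rapids.cudf.Schema;
import com.nvidia.spark.rapids.jni.kudo.KudoSerializer;
import com.nvidia.spark.rapids.jni.kudo.WriteMetrics;

import java.io.ByteArrayOutputStream;

public class KudoWriteMetricsExample {
  public static void main(String[] args) {
    Schema schema = Schema.builder()
        .column(DType.INT32, "id")
        .column(DType.STRING, "name")
        .build();
    KudoSerializer serializer = new KudoSerializer(schema);

    try (HostColumnVector ids = HostColumnVector.fromBoxedInts(1, 2, 3, 4);
         HostColumnVector names = HostColumnVector.fromStrings("a", "b", null, "d")) {
      ByteArrayOutputStream out = new ByteArrayOutputStream();
      // Write two rows starting at row offset 1 and collect per-phase timings plus bytes written.
      WriteMetrics metrics =
          serializer.writeToStreamWithMetrics(new HostColumnVector[]{ids, names}, out, 1, 2);
      System.out.println("bytes written:         " + metrics.getWrittenBytes());
      System.out.println("header calc time (ns): " + metrics.getCalcHeaderTime());
      System.out.println("buffer copy time (ns): " + metrics.getCopyBufferTime());
    }
  }
}
```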
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/kudo/SlicedBufferSerializer.java b/src/main/java/com/nvidia/spark/rapids/jni/kudo/SlicedBufferSerializer.java
index e22a523855..080cb5eda6 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/kudo/SlicedBufferSerializer.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/kudo/SlicedBufferSerializer.java
@@ -16,19 +16,18 @@
package com.nvidia.spark.rapids.jni.kudo;
+import static com.nvidia.spark.rapids.jni.kudo.KudoSerializer.padForHostAlignment;
+
import ai.rapids.cudf.BufferType;
import ai.rapids.cudf.DType;
import ai.rapids.cudf.HostColumnVectorCore;
import ai.rapids.cudf.HostMemoryBuffer;
import com.nvidia.spark.rapids.jni.schema.HostColumnsVisitor;
-
import java.io.IOException;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.List;
-import static com.nvidia.spark.rapids.jni.kudo.KudoSerializer.padForHostAlignment;
-
/**
* This class visits a list of columns and serialize one of the buffers (validity, offset, or data) into with kudo
* format.
@@ -48,13 +47,16 @@ class SlicedBufferSerializer implements HostColumnsVisitor {
private final DataWriter writer;
private final Deque sliceInfos = new ArrayDeque<>();
+ private final WriteMetrics metrics;
private long totalDataLen;
- SlicedBufferSerializer(int rowOffset, int numRows, BufferType bufferType, DataWriter writer) {
+ SlicedBufferSerializer(int rowOffset, int numRows, BufferType bufferType, DataWriter writer,
+ WriteMetrics metrics) {
this.root = new SliceInfo(rowOffset, numRows);
this.bufferType = bufferType;
this.writer = writer;
this.sliceInfos.addLast(root);
+ this.metrics = metrics;
this.totalDataLen = 0;
}
@@ -153,28 +155,26 @@ public Void visit(HostColumnVectorCore col) {
}
}
- private long copySlicedValidity(HostColumnVectorCore column, SliceInfo sliceInfo) throws IOException {
+ private long copySlicedValidity(HostColumnVectorCore column, SliceInfo sliceInfo)
+ throws IOException {
if (column.getValidity() != null && sliceInfo.getRowCount() > 0) {
HostMemoryBuffer buff = column.getValidity();
long len = sliceInfo.getValidityBufferInfo().getBufferLength();
- writer.copyDataFrom(buff, sliceInfo.getValidityBufferInfo().getBufferOffset(),
- len);
- return padForHostAlignment(writer, len);
+ return copyBufferAndPadForHost(buff, sliceInfo.getValidityBufferInfo().getBufferOffset(), len);
} else {
return 0;
}
}
- private long copySlicedOffset(HostColumnVectorCore column, SliceInfo sliceInfo) throws IOException {
+ private long copySlicedOffset(HostColumnVectorCore column, SliceInfo sliceInfo)
+ throws IOException {
if (sliceInfo.rowCount <= 0 || column.getOffsets() == null) {
// Don't copy anything, there are no rows
return 0;
}
long bytesToCopy = (sliceInfo.rowCount + 1) * Integer.BYTES;
long srcOffset = sliceInfo.offset * Integer.BYTES;
- HostMemoryBuffer buff = column.getOffsets();
- writer.copyDataFrom(buff, srcOffset, bytesToCopy);
- return padForHostAlignment(writer, bytesToCopy);
+ return copyBufferAndPadForHost(column.getOffsets(), srcOffset, bytesToCopy);
}
private long copySlicedData(HostColumnVectorCore column, SliceInfo sliceInfo) throws IOException {
@@ -182,7 +182,8 @@ private long copySlicedData(HostColumnVectorCore column, SliceInfo sliceInfo) th
DType type = column.getType();
if (type.equals(DType.STRING)) {
long startByteOffset = column.getOffsets().getInt(sliceInfo.offset * Integer.BYTES);
- long endByteOffset = column.getOffsets().getInt((sliceInfo.offset + sliceInfo.rowCount) * Integer.BYTES);
+ long endByteOffset =
+ column.getOffsets().getInt((sliceInfo.offset + sliceInfo.rowCount) * Integer.BYTES);
long bytesToCopy = endByteOffset - startByteOffset;
if (column.getData() == null) {
if (bytesToCopy != 0) {
@@ -192,14 +193,12 @@ private long copySlicedData(HostColumnVectorCore column, SliceInfo sliceInfo) th
return 0;
} else {
- writer.copyDataFrom(column.getData(), startByteOffset, bytesToCopy);
- return padForHostAlignment(writer, bytesToCopy);
+ return copyBufferAndPadForHost(column.getData(), startByteOffset, bytesToCopy);
}
} else if (type.getSizeInBytes() > 0) {
long bytesToCopy = sliceInfo.rowCount * type.getSizeInBytes();
long srcOffset = sliceInfo.offset * type.getSizeInBytes();
- writer.copyDataFrom(column.getData(), srcOffset, bytesToCopy);
- return padForHostAlignment(writer, bytesToCopy);
+ return copyBufferAndPadForHost(column.getData(), srcOffset, bytesToCopy);
} else {
return 0;
}
@@ -207,4 +206,13 @@ private long copySlicedData(HostColumnVectorCore column, SliceInfo sliceInfo) th
return 0;
}
}
+
+ private long copyBufferAndPadForHost(HostMemoryBuffer buffer, long offset, long length)
+ throws IOException {
+ long now = System.nanoTime();
+ writer.copyDataFrom(buffer, offset, length);
+ long ret = padForHostAlignment(writer, length);
+ metrics.addCopyBufferTime(System.nanoTime() - now);
+ return ret;
+ }
}
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/kudo/WriteMetrics.java b/src/main/java/com/nvidia/spark/rapids/jni/kudo/WriteMetrics.java
new file mode 100644
index 0000000000..d34564e776
--- /dev/null
+++ b/src/main/java/com/nvidia/spark/rapids/jni/kudo/WriteMetrics.java
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids.jni.kudo;
+
+/**
+ * This class contains metrics collected while serializing tables using the kudo format.
+ */
+public class WriteMetrics {
+ private long calcHeaderTime;
+ private long copyHeaderTime;
+ private long copyBufferTime;
+ private long writtenBytes;
+
+
+ public WriteMetrics() {
+ this.calcHeaderTime = 0;
+ this.copyHeaderTime = 0;
+ this.copyBufferTime = 0;
+ this.writtenBytes = 0;
+ }
+
+ /**
+ * Get the time spent on calculating the header.
+ */
+ public long getCalcHeaderTime() {
+ return calcHeaderTime;
+ }
+
+ /**
+ * Get the time spent on copying the buffer.
+ */
+ public long getCopyBufferTime() {
+ return copyBufferTime;
+ }
+
+ public void addCopyBufferTime(long time) {
+ copyBufferTime += time;
+ }
+
+ /**
+ * Get the time spent on copying the header.
+ */
+ public long getCopyHeaderTime() {
+ return copyHeaderTime;
+ }
+
+ public void addCalcHeaderTime(long time) {
+ calcHeaderTime += time;
+ }
+
+ public void addCopyHeaderTime(long time) {
+ copyHeaderTime += time;
+ }
+
+ /**
+ * Get the number of bytes written.
+ */
+ public long getWrittenBytes() {
+ return writtenBytes;
+ }
+
+ public void addWrittenBytes(long bytes) {
+ writtenBytes += bytes;
+ }
+}
diff --git a/src/test/java/com/nvidia/spark/rapids/jni/DateTimeRebaseTest.java b/src/test/java/com/nvidia/spark/rapids/jni/DateTimeRebaseTest.java
deleted file mode 100644
index 5508d56d4d..0000000000
--- a/src/test/java/com/nvidia/spark/rapids/jni/DateTimeRebaseTest.java
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.nvidia.spark.rapids.jni;
-
-import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual;
-
-import org.junit.jupiter.api.Test;
-
-import ai.rapids.cudf.ColumnVector;
-
-public class DateTimeRebaseTest {
- @Test
- void rebaseDaysToJulianTest() {
- try (ColumnVector input = ColumnVector.timestampDaysFromBoxedInts(-719162, -354285, null,
- -141714, -141438, -141437,
- null, null,
- -141432, -141427, -31463, -31453, -1, 0, 18335);
- ColumnVector expected = ColumnVector.timestampDaysFromBoxedInts(-719164, -354280, null,
- -141704, -141428, -141427,
- null, null,
- -141427, -141427, -31463, -31453, -1, 0, 18335);
- ColumnVector result = DateTimeRebase.rebaseGregorianToJulian(input)) {
- assertColumnsAreEqual(expected, result);
- }
- }
-
- @Test
- void rebaseDaysToGregorianTest() {
- try (ColumnVector input = ColumnVector.timestampDaysFromBoxedInts(-719164, -354280, null,
- -141704, -141428, -141427,
- null, null,
- -141427, -141427, -31463, -31453, -1, 0, 18335);
- ColumnVector expected = ColumnVector.timestampDaysFromBoxedInts(-719162, -354285, null,
- -141714, -141438, -141427,
- null, null,
- -141427, -141427, -31463, -31453, -1, 0, 18335);
- ColumnVector result = DateTimeRebase.rebaseJulianToGregorian(input)) {
- assertColumnsAreEqual(expected, result);
- }
- }
-
- @Test
- void rebaseMicroToJulian() {
- try (ColumnVector input = ColumnVector.timestampMicroSecondsFromBoxedLongs(-62135593076345679L,
- -30610213078876544L,
- null,
- -12244061221876544L,
- -12220243200000000L,
- -12219639001448163L,
- -12219292799000001L,
- -45446999900L,
- 1L,
- null,
- 1584178381500000L);
- ColumnVector expected =
- ColumnVector.timestampMicroSecondsFromBoxedLongs(-62135765876345679L,
- -30609781078876544L,
- null,
- -12243197221876544L,
- -12219379200000000L,
- -12219207001448163L,
- -12219292799000001L,
- -45446999900L,
- 1L,
- null,
- 1584178381500000L);
- ColumnVector result = DateTimeRebase.rebaseGregorianToJulian(input)) {
- assertColumnsAreEqual(expected, result);
- }
- }
-
- @Test
- void rebaseMicroToGregorian() {
- try (ColumnVector input = ColumnVector.timestampMicroSecondsFromBoxedLongs(-62135765876345679L,
- -30609781078876544L,
- null,
- -12243197221876544L,
- -12219379200000000L,
- -12219207001448163L,
- -12219292799000001L,
- -45446999900L,
- 1L,
- null,
- 1584178381500000L);
- ColumnVector expected =
- ColumnVector.timestampMicroSecondsFromBoxedLongs(-62135593076345679L,
- -30610213078876544L,
- null,
- -12244061221876544L,
- -12220243200000000L,
- -12219207001448163L,
- -12219292799000001L,
- -45446999900L,
- 1L,
- null,
- 1584178381500000L);
- ColumnVector result = DateTimeRebase.rebaseJulianToGregorian(input)) {
- assertColumnsAreEqual(expected, result);
- }
- }
-}
diff --git a/src/test/java/com/nvidia/spark/rapids/jni/DateTimeUtilsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/DateTimeUtilsTest.java
new file mode 100644
index 0000000000..6007dfe219
--- /dev/null
+++ b/src/test/java/com/nvidia/spark/rapids/jni/DateTimeUtilsTest.java
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids.jni;
+
+import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual;
+
+import org.junit.jupiter.api.Test;
+
+import ai.rapids.cudf.ColumnVector;
+
+public class DateTimeUtilsTest {
+ @Test
+ void rebaseDaysToJulianTest() {
+ try (
+ ColumnVector input = ColumnVector.timestampDaysFromBoxedInts(-719162, -354285, null,
+ -141714, -141438, -141437,
+ null, null,
+ -141432, -141427, -31463, -31453, -1, 0, 18335);
+ ColumnVector expected = ColumnVector.timestampDaysFromBoxedInts(-719164, -354280, null,
+ -141704, -141428, -141427,
+ null, null,
+ -141427, -141427, -31463, -31453, -1, 0, 18335);
+ ColumnVector result = DateTimeUtils.rebaseGregorianToJulian(input)) {
+ assertColumnsAreEqual(expected, result);
+ }
+ }
+
+ @Test
+ void rebaseDaysToGregorianTest() {
+ try (
+ ColumnVector input = ColumnVector.timestampDaysFromBoxedInts(-719164, -354280, null,
+ -141704, -141428, -141427,
+ null, null,
+ -141427, -141427, -31463, -31453, -1, 0, 18335);
+ ColumnVector expected = ColumnVector.timestampDaysFromBoxedInts(-719162, -354285, null,
+ -141714, -141438, -141427,
+ null, null,
+ -141427, -141427, -31463, -31453, -1, 0, 18335);
+ ColumnVector result = DateTimeUtils.rebaseJulianToGregorian(input)) {
+ assertColumnsAreEqual(expected, result);
+ }
+ }
+
+ @Test
+ void rebaseMicroToJulian() {
+ try (
+ ColumnVector input = ColumnVector.timestampMicroSecondsFromBoxedLongs(-62135593076345679L,
+ -30610213078876544L,
+ null,
+ -12244061221876544L,
+ -12220243200000000L,
+ -12219639001448163L,
+ -12219292799000001L,
+ -45446999900L,
+ 1L,
+ null,
+ 1584178381500000L);
+ ColumnVector expected =
+ ColumnVector.timestampMicroSecondsFromBoxedLongs(-62135765876345679L,
+ -30609781078876544L,
+ null,
+ -12243197221876544L,
+ -12219379200000000L,
+ -12219207001448163L,
+ -12219292799000001L,
+ -45446999900L,
+ 1L,
+ null,
+ 1584178381500000L);
+ ColumnVector result = DateTimeUtils.rebaseGregorianToJulian(input)) {
+ assertColumnsAreEqual(expected, result);
+ }
+ }
+
+ @Test
+ void rebaseMicroToGregorian() {
+ try (
+ ColumnVector input = ColumnVector.timestampMicroSecondsFromBoxedLongs(-62135765876345679L,
+ -30609781078876544L,
+ null,
+ -12243197221876544L,
+ -12219379200000000L,
+ -12219207001448163L,
+ -12219292799000001L,
+ -45446999900L,
+ 1L,
+ null,
+ 1584178381500000L);
+ ColumnVector expected =
+ ColumnVector.timestampMicroSecondsFromBoxedLongs(-62135593076345679L,
+ -30610213078876544L,
+ null,
+ -12244061221876544L,
+ -12220243200000000L,
+ -12219207001448163L,
+ -12219292799000001L,
+ -45446999900L,
+ 1L,
+ null,
+ 1584178381500000L);
+ ColumnVector result = DateTimeUtils.rebaseJulianToGregorian(input)) {
+ assertColumnsAreEqual(expected, result);
+ }
+ }
+
+ @Test
+ void truncateDateTest() {
+ try (ColumnVector input = ColumnVector.timestampDaysFromBoxedInts(-31463, -31453, null, 0, 18335);
+ ColumnVector format = ColumnVector.fromStrings("YEAR", "MONTH", "WEEK", "QUARTER", "YY");
+ ColumnVector expected = ColumnVector.timestampDaysFromBoxedInts(-31776, -31472, null, 0, 18262);
+ ColumnVector result = DateTimeUtils.truncate(input, format)) {
+ assertColumnsAreEqual(expected, result);
+ }
+ }
+
+ @Test
+ void truncateTimestampTest() {
+ try (
+ ColumnVector input = ColumnVector.timestampMicroSecondsFromBoxedLongs(
+ -12219292799000001L,
+ -45446999900L,
+ 1L,
+ null,
+ 1584178381500000L);
+ ColumnVector format = ColumnVector.fromStrings("YEAR", "HOUR", "WEEK", "QUARTER", "SECOND");
+ ColumnVector expected = ColumnVector.timestampMicroSecondsFromBoxedLongs(
+ -12244089600000000L,
+ -46800000000L,
+ -259200000000L,
+ null,
+ 1584178381000000L);
+ ColumnVector result = DateTimeUtils.truncate(input, format)) {
+ assertColumnsAreEqual(expected, result);
+ }
+ }
+}
diff --git a/src/test/java/com/nvidia/spark/rapids/jni/HashTest.java b/src/test/java/com/nvidia/spark/rapids/jni/HashTest.java
index 19172a8d33..874cb84b5e 100644
--- a/src/test/java/com/nvidia/spark/rapids/jni/HashTest.java
+++ b/src/test/java/com/nvidia/spark/rapids/jni/HashTest.java
@@ -387,6 +387,182 @@ void testXXHash64Mixed() {
}
}
+ @Test
+ void testXXHash64Struct() {
+ try (ColumnVector strings = ColumnVector.fromStrings(
+ "a", "B\n", "dE\"\u0100\t\u0101 \ud720\ud721",
+ "A very long (greater than 128 bytes/char string) to test a multi hash-step data point " +
+ "in the MD5 hash function. This string needed to be longer.",
+ null, null);
+ ColumnVector integers = ColumnVector.fromBoxedInts(0, 100, -100, Integer.MIN_VALUE, Integer.MAX_VALUE, null);
+ ColumnVector doubles = ColumnVector.fromBoxedDoubles(
+ 0.0, 100.0, -100.0, POSITIVE_DOUBLE_NAN_LOWER_RANGE, POSITIVE_DOUBLE_NAN_UPPER_RANGE, null);
+ ColumnVector floats = ColumnVector.fromBoxedFloats(
+ 0f, 100f, -100f, NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE, null);
+ ColumnVector bools = ColumnVector.fromBoxedBooleans(true, false, null, false, true, null);
+ ColumnView structs = ColumnView.makeStructView(strings, integers, doubles, floats, bools);
+ ColumnVector result = Hash.xxhash64(new ColumnView[]{structs});
+ ColumnVector expected = ColumnVector.fromBoxedLongs(7451748878409563026L, 6024043102550151964L, 3380664624738534402L, 8444697026100086329L, -5888679192448042852L, Hash.DEFAULT_XXHASH64_SEED)) {
+ assertColumnsAreEqual(expected, result);
+ }
+ }
+
+ @Test
+ void testXXHash64NestedStruct() {
+ try (ColumnVector strings = ColumnVector.fromStrings(
+ "a", "B\n", "dE\"\u0100\t\u0101 \ud720\ud721",
+ "A very long (greater than 128 bytes/char string) to test a multi hash-step data point " +
+ "in the MD5 hash function. This string needed to be longer.",
+ null, null);
+ ColumnVector integers = ColumnVector.fromBoxedInts(0, 100, -100, Integer.MIN_VALUE, Integer.MAX_VALUE, null);
+ ColumnVector doubles = ColumnVector.fromBoxedDoubles(
+ 0.0, 100.0, -100.0, POSITIVE_DOUBLE_NAN_LOWER_RANGE, POSITIVE_DOUBLE_NAN_UPPER_RANGE, null);
+ ColumnVector floats = ColumnVector.fromBoxedFloats(
+ 0f, 100f, -100f, NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE, null);
+ ColumnVector bools = ColumnVector.fromBoxedBooleans(true, false, null, false, true, null);
+ ColumnView structs1 = ColumnView.makeStructView(strings, integers);
+ ColumnView structs2 = ColumnView.makeStructView(structs1, doubles);
+ ColumnView structs3 = ColumnView.makeStructView(bools);
+ ColumnView structs = ColumnView.makeStructView(structs2, floats, structs3);
+ ColumnVector result = Hash.xxhash64(new ColumnView[]{structs});
+ ColumnVector expected = ColumnVector.fromBoxedLongs(7451748878409563026L, 6024043102550151964L, 3380664624738534402L, 8444697026100086329L, -5888679192448042852L, Hash.DEFAULT_XXHASH64_SEED)) {
+ assertColumnsAreEqual(expected, result);
+ }
+ }
+
+ @Test
+ void testXXHash64Lists() {
+ try (ColumnVector stringListCV = ColumnVector.fromLists(
+ new ListType(true, new BasicType(true, DType.STRING)),
+ Arrays.asList(null, "a"),
+ Arrays.asList("B\n", ""),
+ Arrays.asList("dE\"\u0100\t\u0101", " \ud720\ud721"),
+ Collections.singletonList("A very long (greater than 128 bytes/char string) to test a multi hash-step data point " +
+ "in the MD5 hash function. This string needed to be longer."),
+ Collections.singletonList(""),
+ null);
+ ColumnVector stringExpected = ColumnVector.fromBoxedLongs(-8582455328737087284L, 7160715839242204087L, -862482741676457612L, -3700309651391443614L, -7444071767201028348L, Hash.DEFAULT_XXHASH64_SEED);
+ ColumnVector stringResult = Hash.xxhash64(new ColumnView[]{stringListCV});
+ ColumnVector intListCV = ColumnVector.fromLists(
+ new ListType(true, new BasicType(true, DType.INT32)),
+ Collections.emptyList(),
+ Arrays.asList(0, -2, 3),
+ Collections.singletonList(Integer.MAX_VALUE),
+ Arrays.asList(5, -6, null),
+ Collections.singletonList(Integer.MIN_VALUE),
+ null);
+ ColumnVector intExpected = ColumnVector.fromBoxedLongs(Hash.DEFAULT_XXHASH64_SEED, -4022702357093761688L, 1508894993788531228L, 7329154841501342665L, 2073849959933241805L, Hash.DEFAULT_XXHASH64_SEED);
+ ColumnVector intResult = Hash.xxhash64(new ColumnVector[]{intListCV})) {
+ assertColumnsAreEqual(stringExpected, stringResult);
+ assertColumnsAreEqual(intExpected, intResult);
+ }
+ }
+
+ @Test
+ void testXXHash64NestedLists() {
+ try (ColumnVector nestedStringListCV = ColumnVector.fromLists(
+ new ListType(true, new ListType(true, new BasicType(true, DType.STRING))),
+ Arrays.asList(null, Collections.singletonList("a")),
+ Collections.singletonList(Arrays.asList("B\n", "")),
+ Arrays.asList(Collections.singletonList("dE\"\u0100\t\u0101"), Collections.singletonList(" \ud720\ud721")),
+ Collections.singletonList(Collections.singletonList("A very long (greater than 128 bytes/char string) to test a multi hash-step data point " +
+ "in the MD5 hash function. This string needed to be longer.")),
+ Collections.singletonList(Collections.singletonList("")),
+ null);
+ ColumnVector stringExpected = ColumnVector.fromBoxedLongs(-8582455328737087284L, 7160715839242204087L, -862482741676457612L, -3700309651391443614L, -7444071767201028348L, Hash.DEFAULT_XXHASH64_SEED);
+ ColumnVector stringResult = Hash.xxhash64(new ColumnView[]{nestedStringListCV});
+ ColumnVector nestedIntListCV = ColumnVector.fromLists(
+ new ListType(true, new ListType(true, new BasicType(true, DType.INT32))),
+ Collections.emptyList(),
+ Arrays.asList(Collections.singletonList(0), Collections.singletonList(-2), Collections.singletonList(3)),
+ Collections.singletonList(Collections.singletonList(Integer.MAX_VALUE)),
+ Arrays.asList(Collections.singletonList(5), Arrays.asList(-6, null)),
+ Collections.singletonList(Collections.singletonList(Integer.MIN_VALUE)),
+ null);
+ ColumnVector intExpected = ColumnVector.fromBoxedLongs(Hash.DEFAULT_XXHASH64_SEED, -4022702357093761688L, 1508894993788531228L, 7329154841501342665L, 2073849959933241805L, Hash.DEFAULT_XXHASH64_SEED);
+ ColumnVector intResult = Hash.xxhash64(new ColumnVector[]{nestedIntListCV});) {
+ assertColumnsAreEqual(stringExpected, stringResult);
+ assertColumnsAreEqual(intExpected, intResult);
+ }
+ }
+
+ @Test
+ void testXXHash64StructOfList() {
+ try (ColumnVector stringListCV = ColumnVector.fromLists(
+ new ListType(true, new BasicType(true, DType.STRING)),
+ Arrays.asList(null, "a"),
+ Arrays.asList("B\n", ""),
+ Arrays.asList("dE\"\u0100\t\u0101", " \ud720\ud721"),
+ Collections.singletonList("A very long (greater than 128 bytes/char string) to test a multi hash-step data point " +
+ "in the MD5 hash function. This string needed to be longer."),
+ Collections.singletonList(""),
+ null);
+ ColumnVector intListCV = ColumnVector.fromLists(
+ new ListType(true, new BasicType(true, DType.INT32)),
+ Collections.emptyList(),
+ Arrays.asList(0, -2, 3),
+ Collections.singletonList(Integer.MAX_VALUE),
+ Arrays.asList(5, -6, null),
+ Collections.singletonList(Integer.MIN_VALUE),
+ null);
+ ColumnVector doubles = ColumnVector.fromBoxedDoubles(
+ 0.0, 100.0, -100.0, POSITIVE_DOUBLE_NAN_LOWER_RANGE, POSITIVE_DOUBLE_NAN_UPPER_RANGE, null);
+ ColumnVector floats = ColumnVector.fromBoxedFloats(
+ 0f, 100f, -100f, NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE, null);
+ ColumnView structCV = ColumnView.makeStructView(intListCV, stringListCV, doubles, floats);
+ ColumnVector nestedExpected = ColumnVector.fromBoxedLongs(-8492741646850220468L, -6547737320918905493L, -8718220625378038731L, 5441580647216064522L, 3645801243834961127L, Hash.DEFAULT_XXHASH64_SEED);
+ ColumnVector nestedResult = Hash.xxhash64(new ColumnView[]{structCV})) {
+ assertColumnsAreEqual(nestedExpected, nestedResult);
+ }
+ }
+
+ @Test
+ void testXXHash64ListOfStruct() {
+ try (ColumnVector structListCV = ColumnVector.fromLists(new ListType(true, new StructType(true,
+ new BasicType(true, DType.STRING), new BasicType(true, DType.INT32), new BasicType(true, DType.FLOAT64), new BasicType(true, DType.FLOAT32), new BasicType(true, DType.BOOL8))),
+ Collections.emptyList(),
+ Collections.singletonList(new StructData("a", 0, 0.0, 0f, true)),
+ Arrays.asList(new StructData("B\n", 100, 100.0, 100f, false), new StructData("dE\"\u0100\t\u0101 \ud720\ud721", -100, -100.0, -100f, null)),
+ Collections.singletonList(new StructData("A very long (greater than 128 bytes/char string) to test a multi hash-step data point " +
+ "in the MD5 hash function. This string needed to be longer.", Integer.MIN_VALUE, POSITIVE_DOUBLE_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_LOWER_RANGE, false)),
+ Arrays.asList(new StructData(null, Integer.MAX_VALUE, POSITIVE_DOUBLE_NAN_UPPER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE, true), new StructData(null, null, null, null, null)),
+ null);
+ ColumnVector result = Hash.xxhash64(new ColumnView[]{structListCV});
+ ColumnVector expected = ColumnVector.fromBoxedLongs(Hash.DEFAULT_XXHASH64_SEED, 7451748878409563026L, 948372773124634350L, 8444697026100086329L, -5888679192448042852L, Hash.DEFAULT_XXHASH64_SEED)) {
+ assertColumnsAreEqual(expected, result);
+ }
+ }
+
+ @Test
+ void testXXHash64NestedDepthExceedsLimit() {
+ try (ColumnVector nestedIntListCV = ColumnVector.fromLists(
+ new ListType(true, new ListType(true, new BasicType(true, DType.INT32))),
+ Arrays.asList(Arrays.asList(null, null), null),
+ Arrays.asList(Collections.singletonList(0), Collections.singletonList(-2), Collections.singletonList(3)),
+ Arrays.asList(null, Collections.singletonList(Integer.MAX_VALUE)),
+ Arrays.asList(Collections.singletonList(5), Arrays.asList(-6, null)),
+ Arrays.asList(Collections.singletonList(Integer.MIN_VALUE), null),
+ null);
+ ColumnVector integers = ColumnVector.fromBoxedInts(
+ 0, 100, -100, Integer.MIN_VALUE, Integer.MAX_VALUE, null);
+ ColumnVector doubles = ColumnVector.fromBoxedDoubles(0.0, 100.0, -100.0,
+ POSITIVE_DOUBLE_NAN_LOWER_RANGE, POSITIVE_DOUBLE_NAN_UPPER_RANGE, null);
+ ColumnVector floats = ColumnVector.fromBoxedFloats(0f, 100f, -100f,
+ NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE, null);
+ ColumnVector bools = ColumnVector.fromBoxedBooleans(
+ true, false, null, false, true, null);
+ ColumnView structs1 = ColumnView.makeStructView(nestedIntListCV, integers);
+ ColumnView structs2 = ColumnView.makeStructView(structs1, doubles);
+ ColumnView structs3 = ColumnView.makeStructView(structs2, bools);
+ ColumnView structs4 = ColumnView.makeStructView(structs3);
+ ColumnView structs5 = ColumnView.makeStructView(structs4, floats);
+ ColumnView structs6 = ColumnView.makeStructView(structs5);
+ ColumnView structs7 = ColumnView.makeStructView(structs6);
+ ColumnView nestedResult = ColumnView.makeStructView(structs7);) {
+ assertThrows(CudfException.class, () -> Hash.xxhash64(new ColumnView[]{nestedResult}));
+ }
+ }
+
@Test
void testHiveHashBools() {
try (ColumnVector v0 = ColumnVector.fromBoxedBooleans(true, false, null);
diff --git a/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkTest.java b/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkTest.java
index 270a4266cd..f618f945b0 100644
--- a/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkTest.java
+++ b/src/test/java/com/nvidia/spark/rapids/jni/RmmSparkTest.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/src/test/java/com/nvidia/spark/rapids/jni/kudo/KudoSerializerTest.java b/src/test/java/com/nvidia/spark/rapids/jni/kudo/KudoSerializerTest.java
index 210777accf..3ffcb5e61b 100644
--- a/src/test/java/com/nvidia/spark/rapids/jni/kudo/KudoSerializerTest.java
+++ b/src/test/java/com/nvidia/spark/rapids/jni/kudo/KudoSerializerTest.java
@@ -75,7 +75,7 @@ public void testWriteSimple() throws Exception {
try (Table t = buildSimpleTable()) {
ByteArrayOutputStream out = new ByteArrayOutputStream();
- long bytesWritten = serializer.writeToStream(t, out, 0, 4);
+ long bytesWritten = serializer.writeToStreamWithMetrics(t, out, 0, 4).getWrittenBytes();
assertEquals(189, bytesWritten);
ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
@@ -365,7 +365,7 @@ private static void checkMergeTable(Table expected, List tableSlices
ByteArrayOutputStream bout = new ByteArrayOutputStream();
for (TableSlice slice : tableSlices) {
- serializer.writeToStream(slice.getBaseTable(), bout, slice.getStartRow(), slice.getNumRows());
+ serializer.writeToStreamWithMetrics(slice.getBaseTable(), bout, slice.getStartRow(), slice.getNumRows());
}
bout.flush();
diff --git a/thirdparty/cudf b/thirdparty/cudf
index fa62ff45ed..a081a573b6 160000
--- a/thirdparty/cudf
+++ b/thirdparty/cudf
@@ -1 +1 @@
-Subproject commit fa62ff45eddd8256f0a3e8cebf077970dd70cb67
+Subproject commit a081a573b6ca626f7b77ec21322acff5012e7ada
diff --git a/thirdparty/cudf-pins/rapids-cmake.sha b/thirdparty/cudf-pins/rapids-cmake.sha
index fad760dd73..005c3541bc 100644
--- a/thirdparty/cudf-pins/rapids-cmake.sha
+++ b/thirdparty/cudf-pins/rapids-cmake.sha
@@ -1 +1 @@
-fa61767d1584a9c8d083aa9ce8636af89cc63923
+faef975d488895186cf04a714faeb335f73757f0
diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json
index 752f515108..ec2b83f4ad 100644
--- a/thirdparty/cudf-pins/versions.json
+++ b/thirdparty/cudf-pins/versions.json
@@ -5,20 +5,10 @@
{
"always_download" : true,
"git_shallow" : false,
- "git_tag" : "e21d607157218540cd7c45461213fb96adf720b7",
+ "git_tag" : "05e019afe53f9b0e4454cbd822f9bdda18df49bb",
"git_url" : "https://github.com/NVIDIA/cccl.git",
"patches" :
[
- {
- "file" : "${current_json_dir}/../cudf/cpp/cmake/thirdparty/patches/cccl_symbol_visibility.diff",
- "fixed_in" : "2.6",
- "issue" : "Correct symbol visibility issues in libcudacxx [https://github.com/NVIDIA/cccl/pull/1832/]"
- },
- {
- "file" : "${current_json_dir}/../cudf/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff",
- "fixed_in" : "",
- "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]"
- },
{
"file" : "${current_json_dir}/../cudf/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff",
"fixed_in" : "",
@@ -30,7 +20,7 @@
"issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]"
}
],
- "version" : "2.5.0"
+ "version" : "2.7.0"
},
"GTest" :
{
@@ -52,7 +42,7 @@
{
"always_download" : true,
"git_shallow" : false,
- "git_tag" : "dc0f9fc20c2a544e53099e640a681b347532391a",
+ "git_tag" : "096346b739da3fb1d9c3b402190c0a3a7e554440",
"git_url" : "https://github.com/NVIDIA/cuCollections.git",
"version" : "0.0.1"
},
@@ -99,7 +89,7 @@
{
"always_download" : true,
"git_shallow" : false,
- "git_tag" : "e82574ba3787d5c6b1d1cd3f6aba02b52b233f45",
+ "git_tag" : "288535770abbe950ab8ec655d44f5aa9d6704cea",
"git_url" : "https://github.com/rapidsai/kvikio.git",
"version" : "25.02"
},
@@ -109,14 +99,6 @@
"git_shallow" : false,
"git_tag" : "1e2664a70ec14907409cadcceb14d79b9670bcdb",
"git_url" : "https://github.com/apache/arrow-nanoarrow.git",
- "patches" :
- [
- {
- "file" : "${current_json_dir}/../cudf/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff",
- "fixed_in" : "",
- "issue" : "https://github.com/apache/arrow-nanoarrow/issues/537"
- }
- ],
"version" : "0.6.0.dev"
},
"nvcomp" :
@@ -145,11 +127,19 @@
"git_url" : "https://github.com/NVIDIA/NVTX.git",
"version" : "3.1.0"
},
+ "rapids_logger" :
+ {
+ "always_download" : true,
+ "git_shallow" : false,
+ "git_tag" : "c510947ae9d3a67530cfe3e5eaccb5a3b8ea0e55",
+ "git_url" : "https://github.com/rapidsai/rapids-logger.git",
+ "version" : "c510947ae9d3a67530cfe3e5eaccb5a3b8ea0e55"
+ },
"rmm" :
{
"always_download" : true,
"git_shallow" : false,
- "git_tag" : "c9c6039ab71f91fb41376abea7ec36b8a2563de1",
+ "git_tag" : "1af03eb55ce51a376c3df2dc0cdf3c81738b2dd6",
"git_url" : "https://github.com/rapidsai/rmm.git",
"version" : "25.02"
},