Skip to content

Commit

Permalink
Merge branch 'branch-25.02' into multi-buffer-datasource
Browse files Browse the repository at this point in the history
  • Loading branch information
abellina authored Jan 8, 2025
2 parents 0571d2a + 421b26f commit a927711
Show file tree
Hide file tree
Showing 50 changed files with 150 additions and 2,608 deletions.
14 changes: 8 additions & 6 deletions .github/CODEOWNERS
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
# Copyright (c) 2020-2025, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
Expand All @@ -9,8 +9,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.


/jenkins/ @jlowe @revans2 @tgravescs @GaryShen2008 @NvTimLiu @gerashegalov
pom.xml @jlowe @revans2 @tgravescs @GaryShen2008 @NvTimLiu @gerashegalov
/dist/ @jlowe @revans2 @tgravescs @GaryShen2008 @NvTimLiu @gerashegalov
/.github/ @jlowe @revans2 @tgravescs @GaryShen2008 @NvTimLiu @gerashegalov
# Build-related
/jenkins/ @NVIDIA/sparkrapids-cicd-codeowners
pom.xml @NVIDIA/sparkrapids-cicd-codeowners
/dist/ @NVIDIA/sparkrapids-cicd-codeowners
/.github/ @NVIDIA/sparkrapids-cicd-codeowners
/build/ @NVIDIA/sparkrapids-cicd-codeowners
/scripts/ @NVIDIA/sparkrapids-cicd-codeowners
2 changes: 1 addition & 1 deletion NOTICE
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
RAPIDS plugin for Apache Spark
Copyright (c) 2019-2024, NVIDIA CORPORATION
Copyright (c) 2019-2025, NVIDIA CORPORATION

--------------------------------------------------------------------------------

Expand Down
2 changes: 1 addition & 1 deletion NOTICE-binary
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
RAPIDS plugin for Apache Spark
Copyright (c) 2019-2024, NVIDIA CORPORATION
Copyright (c) 2019-2025, NVIDIA CORPORATION

// ------------------------------------------------------------------
// NOTICE file corresponding to the section 4d of The Apache License,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
* Copyright (c) 2023-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -63,8 +63,7 @@ abstract class GpuDeltaParquetFileFormatBase extends GpuReadParquetFileFormat {
pushedFilters,
fileScan.rapidsConf,
fileScan.allMetrics,
fileScan.queryUsesInputFile,
fileScan.alluxioPathsMap)
fileScan.queryUsesInputFile)
}

override def buildReaderWithPartitionValuesAndMetrics(
Expand All @@ -75,8 +74,7 @@ abstract class GpuDeltaParquetFileFormatBase extends GpuReadParquetFileFormat {
filters: Seq[Filter],
options: Map[String, String],
hadoopConf: Configuration,
metrics: Map[String, GpuMetric],
alluxioPathReplacementMap: Option[Map[String, String]])
metrics: Map[String, GpuMetric])
: PartitionedFile => Iterator[InternalRow] = {
super.buildReaderWithPartitionValuesAndMetrics(
sparkSession,
Expand All @@ -86,8 +84,7 @@ abstract class GpuDeltaParquetFileFormatBase extends GpuReadParquetFileFormat {
filters,
options,
hadoopConf,
metrics,
alluxioPathReplacementMap)
metrics)
}

override def supportFieldName(name: String): Boolean = {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
* Copyright (c) 2023-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -63,8 +63,7 @@ trait GpuDeltaParquetFileFormat extends GpuReadParquetFileFormat {
pushedFilters,
fileScan.rapidsConf,
fileScan.allMetrics,
fileScan.queryUsesInputFile,
fileScan.alluxioPathsMap)
fileScan.queryUsesInputFile)
}

override def buildReaderWithPartitionValuesAndMetrics(
Expand All @@ -75,8 +74,7 @@ trait GpuDeltaParquetFileFormat extends GpuReadParquetFileFormat {
filters: Seq[Filter],
options: Map[String, String],
hadoopConf: Configuration,
metrics: Map[String, GpuMetric],
alluxioPathReplacementMap: Option[Map[String, String]])
metrics: Map[String, GpuMetric])
: PartitionedFile => Iterator[InternalRow] = {
super.buildReaderWithPartitionValuesAndMetrics(
sparkSession,
Expand All @@ -86,8 +84,7 @@ trait GpuDeltaParquetFileFormat extends GpuReadParquetFileFormat {
filters,
options,
hadoopConf,
metrics,
alluxioPathReplacementMap)
metrics)
}

override def supportFieldName(name: String): Boolean = {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
* Copyright (c) 2023-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -68,11 +68,8 @@ case class GpuDelta24xParquetFileFormat(
filters: Seq[Filter],
options: Map[String, String],
hadoopConf: Configuration,
metrics: Map[String, GpuMetric],
alluxioPathReplacementMap: Option[Map[String, String]])
metrics: Map[String, GpuMetric])
: PartitionedFile => Iterator[InternalRow] = {


val dataReader = super.buildReaderWithPartitionValuesAndMetrics(
sparkSession,
dataSchema,
Expand All @@ -81,8 +78,7 @@ case class GpuDelta24xParquetFileFormat(
if (disablePushDown) Seq.empty else filters,
options,
hadoopConf,
metrics,
alluxioPathReplacementMap)
metrics)

val delVecs = broadcastDvMap
val maxDelVecScatterBatchSize = RapidsConf
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
* Copyright (c) 2023-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -65,8 +65,7 @@ case class GpuDeltaParquetFileFormat(
filters: Seq[Filter],
options: Map[String, String],
hadoopConf: Configuration,
metrics: Map[String, GpuMetric],
alluxioPathReplacementMap: Option[Map[String, String]])
metrics: Map[String, GpuMetric])
: PartitionedFile => Iterator[InternalRow] = {

val dataReader = super.buildReaderWithPartitionValuesAndMetrics(
Expand All @@ -77,8 +76,7 @@ case class GpuDeltaParquetFileFormat(
filters,
options,
hadoopConf,
metrics,
alluxioPathReplacementMap)
metrics)

val delVecs = broadcastDvMap
val maxDelVecScatterBatchSize = RapidsConf
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -65,8 +65,7 @@ case class GpuDeltaParquetFileFormat(
filters: Seq[Filter],
options: Map[String, String],
hadoopConf: Configuration,
metrics: Map[String, GpuMetric],
alluxioPathReplacementMap: Option[Map[String, String]])
metrics: Map[String, GpuMetric])
: PartitionedFile => Iterator[InternalRow] = {

val dataReader = super.buildReaderWithPartitionValuesAndMetrics(
Expand All @@ -77,8 +76,7 @@ case class GpuDeltaParquetFileFormat(
filters,
options,
hadoopConf,
metrics,
alluxioPathReplacementMap)
metrics)

val delVecs = broadcastDvMap
val maxDelVecScatterBatchSize = RapidsConf
Expand Down
4 changes: 2 additions & 2 deletions dist/pom.xml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Copyright (c) 2020-2024, NVIDIA CORPORATION.
Copyright (c) 2020-2025, NVIDIA CORPORATION.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -190,7 +190,7 @@
<executions>
<execution>
<id>if_modified_files</id>
<phase>verify</phase>
<phase>package</phase>
<goals>
<goal>exec</goal>
</goals>
Expand Down
12 changes: 1 addition & 11 deletions docs/additional-functionality/advanced_configs.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,6 @@ For commonly used configurations and examples of setting options, please refer t

Name | Description | Default Value | Applicable at
-----|-------------|--------------|--------------
<a name="alluxio.automount.enabled"></a>spark.rapids.alluxio.automount.enabled|Enable the feature of auto mounting the cloud storage to Alluxio. It requires the Alluxio master is the same node of Spark driver node. The Alluxio master's host and port will be read from alluxio.master.hostname and alluxio.master.rpc.port(default: 19998) from ALLUXIO_HOME/conf/alluxio-site.properties, then replace a cloud path which matches spark.rapids.alluxio.bucket.regex like "s3://bar/b.csv" to "alluxio://0.1.2.3:19998/bar/b.csv", and the bucket "s3://bar" will be mounted to "/bar" in Alluxio automatically.|false|Runtime
<a name="alluxio.bucket.regex"></a>spark.rapids.alluxio.bucket.regex|A regex to decide which bucket should be auto-mounted to Alluxio. E.g. when setting as "^s3://bucket.*", the bucket which starts with "s3://bucket" will be mounted to Alluxio and the path "s3://bucket-foo/a.csv" will be replaced to "alluxio://0.1.2.3:19998/bucket-foo/a.csv". It's only valid when setting spark.rapids.alluxio.automount.enabled=true. The default value matches all the buckets in "s3://" or "s3a://" scheme.|^s3a{0,1}://.*|Runtime
<a name="alluxio.home"></a>spark.rapids.alluxio.home|The Alluxio installation home path or link to the installation home path. |/opt/alluxio|Startup
<a name="alluxio.large.file.threshold"></a>spark.rapids.alluxio.large.file.threshold|The threshold is used to identify whether average size of files is large when reading from S3. If reading large files from S3 and the disks used by Alluxio are slow, directly reading from S3 is better than reading caches from Alluxio, because S3 network bandwidth is faster than local disk. This improvement takes effect when spark.rapids.alluxio.slow.disk is enabled.|67108864|Runtime
<a name="alluxio.master"></a>spark.rapids.alluxio.master|The Alluxio master hostname. If not set, read Alluxio master URL from spark.rapids.alluxio.home locally. This config is useful when Alluxio master and Spark driver are not co-located.||Startup
<a name="alluxio.master.port"></a>spark.rapids.alluxio.master.port|The Alluxio master port. If not set, read Alluxio master port from spark.rapids.alluxio.home locally. This config is useful when Alluxio master and Spark driver are not co-located.|19998|Startup
<a name="alluxio.pathsToReplace"></a>spark.rapids.alluxio.pathsToReplace|List of paths to be replaced with corresponding Alluxio scheme. E.g. when configure is set to "s3://foo->alluxio://0.1.2.3:19998/foo,gs://bar->alluxio://0.1.2.3:19998/bar", it means: "s3://foo/a.csv" will be replaced to "alluxio://0.1.2.3:19998/foo/a.csv" and "gs://bar/b.csv" will be replaced to "alluxio://0.1.2.3:19998/bar/b.csv". To use this config, you have to mount the buckets to Alluxio by yourself. If you set this config, spark.rapids.alluxio.automount.enabled won't be valid.|None|Startup
<a name="alluxio.replacement.algo"></a>spark.rapids.alluxio.replacement.algo|The algorithm used when replacing the UFS path with the Alluxio path. CONVERT_TIME and TASK_TIME are the valid options. CONVERT_TIME indicates that we do it when we convert it to a GPU file read, this has extra overhead of creating an entirely new file index, which requires listing the files and getting all new file info from Alluxio. TASK_TIME replaces the path as late as possible inside of the task. By waiting and replacing it at task time, it just replaces the path without fetching the file information again, this is faster but doesn't update locality information if that has a bit impact on performance.|TASK_TIME|Runtime
<a name="alluxio.slow.disk"></a>spark.rapids.alluxio.slow.disk|Indicates whether the disks used by Alluxio are slow. If it's true and reading S3 large files, Rapids Accelerator reads from S3 directly instead of reading from Alluxio caches. Refer to spark.rapids.alluxio.large.file.threshold which defines a threshold that identifying whether files are large. Typically, it's slow disks if speed is less than 300M/second. If using convert time spark.rapids.alluxio.replacement.algo, this may not apply to all file types like Delta files|true|Runtime
<a name="alluxio.user"></a>spark.rapids.alluxio.user|Alluxio user is set on the Alluxio client, which is used to mount or get information. By default it should be the user that running the Alluxio processes. The default value is ubuntu.|ubuntu|Runtime
<a name="filecache.allowPathRegexp"></a>spark.rapids.filecache.allowPathRegexp|A regular expression to decide which paths will be cached when the file cache is enabled. If this is not set, then all paths are allowed to cache. If a path is allowed by this regexp but blocked by spark.rapids.filecache.blockPathRegexp, then the path is blocked to cache.|None|Startup
<a name="filecache.blockPathRegexp"></a>spark.rapids.filecache.blockPathRegexp|A regular expression to decide which paths will not be cached when the file cache is enabled. If a path is blocked by this regexp but is allowed by spark.rapids.filecache.allowPathRegexp, then the path is blocked.|None|Startup
<a name="filecache.checkStale"></a>spark.rapids.filecache.checkStale|Controls whether the cached is checked for being out of date with respect to the input file. When enabled, the data that has been cached locally for a file will be invalidated if the file is updated after being cached. This feature is only necessary if an input file for a Spark application can be changed during the lifetime of the application. If an individual input file will not be overwritten during the Spark application then performance may be improved by setting this to false.|true|Startup
Expand Down Expand Up @@ -69,7 +59,7 @@ Name | Description | Default Value | Applicable at
<a name="sql.castFloatToString.enabled"></a>spark.rapids.sql.castFloatToString.enabled|Casting from floating point types to string on the GPU returns results that have a different precision than the default results of Spark.|true|Runtime
<a name="sql.castStringToFloat.enabled"></a>spark.rapids.sql.castStringToFloat.enabled|When set to true, enables casting from strings to float types (float, double) on the GPU. Currently hex values aren't supported on the GPU. Also note that casting from string to float types on the GPU returns incorrect results when the string represents any number "1.7976931348623158E308" <= x < "1.7976931348623159E308" and "-1.7976931348623158E308" >= x > "-1.7976931348623159E308" in both these cases the GPU returns Double.MaxValue while CPU returns "+Infinity" and "-Infinity" respectively|true|Runtime
<a name="sql.castStringToTimestamp.enabled"></a>spark.rapids.sql.castStringToTimestamp.enabled|When set to true, casting from string to timestamp is supported on the GPU. The GPU only supports a subset of formats when casting strings to timestamps. Refer to the CAST documentation for more details.|false|Runtime
<a name="sql.coalescing.reader.numFilterParallel"></a>spark.rapids.sql.coalescing.reader.numFilterParallel|This controls the number of files the coalescing reader will run in each thread when it filters blocks for reading. If this value is greater than zero the files will be filtered in a multithreaded manner where each thread filters the number of files set by this config. If this is set to zero the files are filtered serially. This uses the same thread pool as the multithreaded reader, see spark.rapids.sql.multiThreadedRead.numThreads. Note that filtering multithreaded is useful with Alluxio.|0|Runtime
<a name="sql.coalescing.reader.numFilterParallel"></a>spark.rapids.sql.coalescing.reader.numFilterParallel|This controls the number of files the coalescing reader will run in each thread when it filters blocks for reading. If this value is greater than zero the files will be filtered in a multithreaded manner where each thread filters the number of files set by this config. If this is set to zero the files are filtered serially. This uses the same thread pool as the multithreaded reader, see spark.rapids.sql.multiThreadedRead.numThreads.|0|Runtime
<a name="sql.concurrentWriterPartitionFlushSize"></a>spark.rapids.sql.concurrentWriterPartitionFlushSize|The flush size of the concurrent writer cache in bytes for each partition. If specified spark.sql.maxConcurrentOutputFileWriters, use concurrent writer to write data. Concurrent writer first caches data for each partition and begins to flush the data if it finds one partition with a size that is greater than or equal to this config. The default value is 0, which will try to select a size based off of file type specific configs. E.g.: It uses `write.parquet.row-group-size-bytes` config for Parquet type and `orc.stripe.size` config for Orc type. If the value is greater than 0, will use this positive value.Max value may get better performance but not always, because concurrent writer uses spillable cache and big value may cause more IO swaps.|0|Runtime
<a name="sql.csv.read.decimal.enabled"></a>spark.rapids.sql.csv.read.decimal.enabled|CSV reading is not 100% compatible when reading decimals.|false|Runtime
<a name="sql.csv.read.double.enabled"></a>spark.rapids.sql.csv.read.double.enabled|CSV reading is not 100% compatible when reading doubles.|true|Runtime
Expand Down
15 changes: 11 additions & 4 deletions jenkins/deploy.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash
#
# Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -42,6 +42,9 @@ DIST_PL=${DIST_PL:-"dist"}
###### Build the path of jar(s) to be deployed ######
MVN_SETTINGS=${MVN_SETTINGS:-"jenkins/settings.xml"}
MVN="mvn -B -Dmaven.wagon.http.retryHandler.count=3 -DretryFailedDeploymentCount=3 -s $MVN_SETTINGS"
SCALA_BINARY_VER=${SCALA_BINARY_VER:-"2.12"}
[ $SCALA_BINARY_VER == "2.13" ] && MVN="$MVN -f scala2.13/"

function mvnEval {
$MVN help:evaluate -q -DforceStdout -pl $1 -Dexpression=$2
}
Expand Down Expand Up @@ -73,9 +76,13 @@ DEPLOY_FILES=$(echo $CLASSIFIERS | sed -e "s;\([^,]*\);${FPATH}-\1.jar;g")
SQL_ART_ID=$(mvnEval $SQL_PL project.artifactId)
SQL_ART_VER=$(mvnEval $SQL_PL project.version)
JS_FPATH="$(echo -n ${SQL_PL}/target/spark*)/${SQL_ART_ID}-${SQL_ART_VER}"
cp $JS_FPATH-sources.jar $FPATH-sources.jar
cp $JS_FPATH-javadoc.jar $FPATH-javadoc.jar

if [ $SCALA_BINARY_VER == "2.13" ]; then
cp scala2.13/$JS_FPATH-sources.jar scala2.13/$FPATH-sources.jar
cp scala2.13/$JS_FPATH-javadoc.jar scala2.13/$FPATH-javadoc.jar
else
cp $JS_FPATH-sources.jar $FPATH-sources.jar
cp $JS_FPATH-javadoc.jar $FPATH-javadoc.jar
fi
echo "Plan to deploy ${FPATH}.jar to $SERVER_URL (ID:$SERVER_ID)"

GPG_PLUGIN="org.apache.maven.plugins:maven-gpg-plugin:3.1.0:sign-and-deploy-file"
Expand Down
Loading

0 comments on commit a927711

Please sign in to comment.