diff --git a/docs/additional-functionality/advanced_configs.md b/docs/additional-functionality/advanced_configs.md
index 40acd48329c..4c5a721689c 100644
--- a/docs/additional-functionality/advanced_configs.md
+++ b/docs/additional-functionality/advanced_configs.md
@@ -142,6 +142,7 @@ Name | Description | Default Value | Applicable at
 spark.rapids.sql.stableSort.enabled|Enable or disable stable sorting. Apache Spark's sorting is typically a stable sort, but sort stability cannot be guaranteed in distributed work loads because the order in which upstream data arrives to a task is not guaranteed. Sort stability then only matters when reading and sorting data from a file using a single task/partition. Because of limitations in the plugin when you enable stable sorting all of the data for a single task will be combined into a single batch before sorting. This currently disables spilling from GPU memory if the data size is too large.|false|Runtime
 spark.rapids.sql.suppressPlanningFailure|Option to fallback an individual query to CPU if an unexpected condition prevents the query plan from being converted to a GPU-enabled one. Note this is different from a normal CPU fallback for a yet-to-be-supported Spark SQL feature. If this happens the error should be reported and investigated as a GitHub issue.|false|Runtime
 spark.rapids.sql.variableFloatAgg.enabled|Spark assumes that all operations produce the exact same result each time. This is not true for some floating point aggregations, which can produce slightly different results on the GPU as the aggregation is done in parallel. This can enable those operations if you know the query is only computing it once.|true|Runtime
+spark.rapids.sql.window.batched.bounded.row.max|Max value of the bounded-row window preceding/following extents for which the window can be evaluated in batched mode. The limit applies to the preceding and following bounds independently, so the window size permitted for batched execution can be up to twice this value.|100|Runtime
 spark.rapids.sql.window.range.byte.enabled|When the order-by column of a range based window is byte type and the range boundary calculated for a value has overflow, CPU and GPU will get the different results. When set to false disables the range window acceleration for the byte type order-by column|false|Runtime
 spark.rapids.sql.window.range.decimal.enabled|When set to false, this disables the range window acceleration for the DECIMAL type order-by column|true|Runtime
 spark.rapids.sql.window.range.double.enabled|When set to false, this disables the range window acceleration for the double type order-by column|true|Runtime
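Not part of the patch: a minimal usage sketch for the new knob. The session and table name (`my_table`) are assumptions; only the configuration key comes from this change.

```scala
// Raise the batched bounded-row window limit to 200.
spark.conf.set("spark.rapids.sql.window.batched.bounded.row.max", "200")

// This frame's extents (150 preceding, 150 following) now fall within the
// limit, so the window may be evaluated in batched mode:
spark.sql("""
  SELECT COUNT(c) OVER (PARTITION BY a ORDER BY b
                        ROWS BETWEEN 150 PRECEDING AND 150 FOLLOWING) AS cnt
  FROM my_table
""")
```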
diff --git a/integration_tests/src/main/python/window_function_test.py b/integration_tests/src/main/python/window_function_test.py
index 3de2d3de859..59c763c5505 100644
--- a/integration_tests/src/main/python/window_function_test.py
+++ b/integration_tests/src/main/python/window_function_test.py
@@ -1824,6 +1824,123 @@ def test_window_aggs_for_negative_rows_unpartitioned(data_gen, batch_size):
         conf=conf)
 
 
+@ignore_order(local=True)
+@pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn)
+@pytest.mark.parametrize('data_gen', [
+    _grpkey_short_with_nulls,
+    _grpkey_int_with_nulls,
+    _grpkey_long_with_nulls,
+    _grpkey_date_with_nulls,
+], ids=idfn)
+def test_window_aggs_for_batched_finite_row_windows_partitioned(data_gen, batch_size):
+    conf = {'spark.rapids.sql.batchSizeBytes': batch_size}
+    assert_gpu_and_cpu_are_equal_sql(
+        lambda spark: gen_df(spark, data_gen, length=2048),
+        'window_agg_table',
+        """
+        SELECT
+            COUNT(1) OVER (PARTITION BY a ORDER BY b,c ASC
+                           ROWS BETWEEN CURRENT ROW AND 100 FOLLOWING) AS count_1_asc,
+            COUNT(c) OVER (PARTITION BY a ORDER BY b,c ASC
+                           ROWS BETWEEN 100 PRECEDING AND CURRENT ROW) AS count_c_asc,
+            COUNT(c) OVER (PARTITION BY a ORDER BY b,c ASC
+                           ROWS BETWEEN -50 PRECEDING AND 100 FOLLOWING) AS count_c_negative,
+            COUNT(1) OVER (PARTITION BY a ORDER BY b,c ASC
+                           ROWS BETWEEN 50 PRECEDING AND -10 FOLLOWING) AS count_1_negative,
+            SUM(c) OVER (PARTITION BY a ORDER BY b,c ASC
+                         ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING) AS sum_c_asc,
+            AVG(c) OVER (PARTITION BY a ORDER BY b,c ASC
+                         ROWS BETWEEN 10 PRECEDING AND 30 FOLLOWING) AS avg_c_asc,
+            MAX(c) OVER (PARTITION BY a ORDER BY b,c DESC
+                         ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING) AS max_c_desc,
+            MIN(c) OVER (PARTITION BY a ORDER BY b,c ASC
+                         ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING) AS min_c_asc,
+            LAG(c, 30) OVER (PARTITION BY a ORDER BY b,c ASC) AS lag_c_30_asc,
+            LEAD(c, 40) OVER (PARTITION BY a ORDER BY b,c ASC) AS lead_c_40_asc
+        FROM window_agg_table
+        """,
+        validate_execs_in_gpu_plan=['GpuBatchedBoundedWindowExec'],
+        conf=conf)
+
+
+@ignore_order(local=True)
+@pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn)
+@pytest.mark.parametrize('data_gen', [
+    _grpkey_short_with_nulls,
+    _grpkey_int_with_nulls,
+    _grpkey_long_with_nulls,
+    _grpkey_date_with_nulls,
+], ids=idfn)
+def test_window_aggs_for_batched_finite_row_windows_unpartitioned(data_gen, batch_size):
+    conf = {'spark.rapids.sql.batchSizeBytes': batch_size}
+    assert_gpu_and_cpu_are_equal_sql(
+        lambda spark: gen_df(spark, data_gen, length=2048),
+        'window_agg_table',
+        """
+        SELECT
+            COUNT(1) OVER (ORDER BY b,c,a ASC
+                           ROWS BETWEEN CURRENT ROW AND 100 FOLLOWING) AS count_1_asc,
+            COUNT(c) OVER (PARTITION BY a ORDER BY b,c,a ASC
+                           ROWS BETWEEN 100 PRECEDING AND CURRENT ROW) AS count_c_asc,
+            COUNT(c) OVER (PARTITION BY a ORDER BY b,c,a ASC
+                           ROWS BETWEEN -50 PRECEDING AND 100 FOLLOWING) AS count_c_negative,
+            COUNT(1) OVER (PARTITION BY a ORDER BY b,c,a ASC
+                           ROWS BETWEEN 50 PRECEDING AND -10 FOLLOWING) AS count_1_negative,
+            SUM(c) OVER (PARTITION BY a ORDER BY b,c,a ASC
+                         ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING) AS sum_c_asc,
+            AVG(c) OVER (PARTITION BY a ORDER BY b,c,a ASC
+                         ROWS BETWEEN 10 PRECEDING AND 30 FOLLOWING) AS avg_c_asc,
+            MAX(c) OVER (PARTITION BY a ORDER BY b,c,a DESC
+                         ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING) AS max_c_desc,
+            MIN(c) OVER (PARTITION BY a ORDER BY b,c,a ASC
+                         ROWS BETWEEN 1 PRECEDING AND 3 FOLLOWING) AS min_c_asc,
+            LAG(c, 6) OVER (PARTITION BY a ORDER BY b,c,a ASC) AS lag_c_6,
+            LEAD(c,4) OVER (PARTITION BY a ORDER BY b,c,a ASC) AS lead_c_4
+        FROM window_agg_table
+        """,
+        validate_execs_in_gpu_plan=['GpuBatchedBoundedWindowExec'],
+        conf=conf)
+
+
+@ignore_order(local=True)
+@pytest.mark.parametrize('data_gen', [_grpkey_int_with_nulls,], ids=idfn)
+def test_window_aggs_for_batched_finite_row_windows_fallback(data_gen):
+    """
+    Verifies that batching is disabled for bounded windows whose extents
+    exceed the maximum specified via the RAPIDS config
+    (spark.rapids.sql.window.batched.bounded.row.max).
+    """
+
+    # Query with window extent = { 200 PRECEDING, 200 FOLLOWING }.
+    query = """
+    SELECT
+        COUNT(1) OVER (PARTITION BY a ORDER BY b,c ASC
+                       ROWS BETWEEN 200 PRECEDING AND 200 FOLLOWING) AS count_1_asc
+    FROM window_agg_table
+    """
+
+    def get_conf_with_extent(extent):
+        return {'spark.rapids.sql.batchSizeBytes': '1000',
+                'spark.rapids.sql.window.batched.bounded.row.max': extent}
+
+    def assert_query_runs_on(exec, conf):
+        assert_gpu_and_cpu_are_equal_sql(
+            lambda spark: gen_df(spark, data_gen, length=2048),
+            'window_agg_table',
+            query,
+            validate_execs_in_gpu_plan=[exec],
+            conf=conf)
+
+    # With the max window extent set to 100, the query must run without
+    # batching, i.e. on `GpuWindowExec`.
+    conf_100 = get_conf_with_extent(100)
+    assert_query_runs_on(exec='GpuWindowExec', conf=conf_100)
+
+    # With the max window extent set to 200, the query must run *with*
+    # batching, i.e. on `GpuBatchedBoundedWindowExec`.
+    conf_200 = get_conf_with_extent(200)
+    assert_query_runs_on(exec='GpuBatchedBoundedWindowExec', conf=conf_200)
+
+
 def test_lru_cache_datagen():
     # log cache info at the end of integration tests, not related to window functions
     info = gen_df_help.cache_info()
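Aside (not part of the patch): the tests above pin down the behavior that the iterator in the next file implements. The self-contained sketch below mirrors its carry-over arithmetic under the same assumptions: for a bounded ROWS frame `[minPreceding, maxFollowing]` (preceding encoded as non-positive), the last `maxFollowing` rows of a batch cannot be finalized until more input arrives, and `|minPreceding|` rows must be replayed as preceding context for the next batch. All names here are illustrative.

```scala
// Sketch of the carry-over arithmetic in GpuBatchedBoundedWindowIterator.next().
def carryOver(minPreceding: Int, maxFollowing: Int,
    inputRowCount: Int, isLastBatch: Boolean): (Int, Int) = {
  // Rows at the bottom whose windows extend into the next batch:
  val unprocessed =
    if (isLastBatch || maxFollowing < 0) 0
    else maxFollowing min inputRowCount
  // Rows replayed at the top of the next batch, purely for preceding context:
  val precedingContext =
    if (minPreceding > 0) 0
    else math.abs(minPreceding) min (inputRowCount - unprocessed)
  (unprocessed, precedingContext)
}

// E.g. a [-100, 100] frame over a 1000-row batch with more input pending:
// carryOver(-100, 100, 1000, isLastBatch = false) == (100, 100)
```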
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuBatchedBoundedWindowExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuBatchedBoundedWindowExec.scala
new file mode 100644
index 00000000000..0c288210dbe
--- /dev/null
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuBatchedBoundedWindowExec.scala
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids
+
+import ai.rapids.cudf.{ColumnVector => CudfColumnVector, NvtxColor, Table => CudfTable}
+import com.nvidia.spark.rapids.Arm.withResource
+import com.nvidia.spark.rapids.ScalableTaskCompletion.onTaskCompletion
+
+import org.apache.spark.TaskContext
+import org.apache.spark.internal.Logging
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.expressions.{Expression, NamedExpression, SortOrder}
+import org.apache.spark.sql.execution.SparkPlan
+import org.apache.spark.sql.types._
+import org.apache.spark.sql.vectorized.ColumnarBatch
+
+class GpuBatchedBoundedWindowIterator(
+    input: Iterator[ColumnarBatch],
+    override val boundWindowOps: Seq[GpuExpression],
+    override val boundPartitionSpec: Seq[GpuExpression],
+    override val boundOrderSpec: Seq[SortOrder],
+    val outputTypes: Array[DataType],
+    minPreceding: Int,
+    maxFollowing: Int,
+    numOutputBatches: GpuMetric,
+    numOutputRows: GpuMetric,
+    opTime: GpuMetric) extends Iterator[ColumnarBatch] with BasicWindowCalc with Logging {
+
+  override def isRunningBatched: Boolean = false // Not "running window" optimized.
+                                                 // This is strictly for batching.
+
+  override def hasNext: Boolean = numUnprocessedInCache > 0 || input.hasNext
+
+  var cached: Option[Array[CudfColumnVector]] = None // For processing with the next batch.
+
+  private var numUnprocessedInCache: Int = 0 // numRows at the bottom not processed completely.
+  private var numPrecedingRowsAdded: Int = 0 // numRows at the top, added for preceding context.
+
+  // Register a handler to clean up the cache when the task completes.
+  Option(TaskContext.get()).foreach { tc =>
+    onTaskCompletion(tc) {
+      clearCached()
+    }
+  }
+
+  // Caches the input column schema on first read.
+  var inputTypes: Option[Array[DataType]] = None
+
+  // Clears cached column vectors after consumption.
+  private def clearCached(): Unit = {
+    cached.foreach(_.foreach(_.close))
+    cached = None
+  }
+
+  private def getNextInputBatch: SpillableColumnarBatch = {
+    // Sets the column batch types using the types cached from the
+    // first input batch read.
+    def optionallySetInputTypes(inputCB: ColumnarBatch): Unit = {
+      if (inputTypes.isEmpty) {
+        inputTypes = Some(GpuColumnVector.extractTypes(inputCB))
+      }
+    }
+
+    // Reads a fresh batch from the iterator, initializing the input data types if necessary.
+    def getFreshInputBatch: ColumnarBatch = {
+      val freshBatch = input.next()
+      optionallySetInputTypes(freshBatch)
+      freshBatch
+    }
+
+    def concatenateColumns(cached: Array[CudfColumnVector],
+        freshBatchTable: CudfTable): Array[CudfColumnVector] = {
+      if (cached.length != freshBatchTable.getNumberOfColumns) {
+        throw new IllegalArgumentException("Expected the same number of columns " +
+          "in input batch and cached batch.")
+      }
+      cached.zipWithIndex.map { case (cachedCol, idx) =>
+        CudfColumnVector.concatenate(cachedCol, freshBatchTable.getColumn(idx))
+      }
+    }
+
+    // Either the cache has unprocessed rows, or input.hasNext().
+    if (input.hasNext) {
+      if (cached.isDefined) {
+        // Cached input AND new input rows exist. Return the concatenated rows.
+        withResource(getFreshInputBatch) { freshBatchCB =>
+          withResource(GpuColumnVector.from(freshBatchCB)) { freshBatchTable =>
+            withResource(concatenateColumns(cached.get, freshBatchTable)) { concat =>
+              clearCached()
+              SpillableColumnarBatch(convertToBatch(inputTypes.get, concat),
+                SpillPriorities.ACTIVE_BATCHING_PRIORITY)
+            }
+          }
+        }
+      } else {
+        // No cached input available. Return fresh input rows only.
+        SpillableColumnarBatch(getFreshInputBatch,
+          SpillPriorities.ACTIVE_BATCHING_PRIORITY)
+      }
+    } else {
+      // No fresh input available. Return the cached input.
+      val cachedCB = convertToBatch(inputTypes.get, cached.get)
+      clearCached()
+      SpillableColumnarBatch(cachedCB,
+        SpillPriorities.ACTIVE_BATCHING_PRIORITY)
+    }
+  }
+
+  /**
+   * Helper to trim the specified number of rows off the top and bottom
+   * of all the specified columns.
+   */
+  private def trim(columns: Array[CudfColumnVector],
+      offTheTop: Int,
+      offTheBottom: Int): Array[CudfColumnVector] = {
+
+    def checkValidSizes(col: CudfColumnVector): Unit =
+      if ((offTheTop + offTheBottom) > col.getRowCount) {
+        throw new IllegalArgumentException(s"Cannot trim column of size ${col.getRowCount} by " +
+          s"$offTheTop rows at the top, and $offTheBottom rows at the bottom.")
+      }
+
+    columns.map { col =>
+      checkValidSizes(col)
+      col.subVector(offTheTop, col.getRowCount.toInt - offTheBottom)
+    }
+  }
+
+  private def resetInputCache(newCache: Option[Array[CudfColumnVector]],
+      newPrecedingAdded: Int): Unit = {
+    cached.foreach(_.foreach(_.close))
+    cached = newCache
+    numPrecedingRowsAdded = newPrecedingAdded
+  }
+
+  override def next(): ColumnarBatch = {
+    var outputBatch: ColumnarBatch = null
+    while (outputBatch == null && hasNext) {
+      withResource(getNextInputBatch) { inputCbSpillable =>
+        withResource(inputCbSpillable.getColumnarBatch()) { inputCB =>
+
+          val inputRowCount = inputCB.numRows()
+          val noMoreInput = !input.hasNext
+          numUnprocessedInCache = if (noMoreInput) {
+            // If no more input rows are expected, this is the last output batch.
+            // Consider all rows in the batch as processed.
+            0
+          } else {
+            // More input rows are expected. The last `maxFollowing` rows can't be finalized.
+            // Cannot exceed `inputRowCount`.
+            if (maxFollowing < 0) { // E.g. LAG(3) => [preceding=-3, following=-3].
+              // A negative following bound means there is no need to wait for more
+              // following rows: all the "following" context is already in the current batch.
+              0
+            } else {
+              maxFollowing min inputRowCount
+            }
+          }
+
+          if (numPrecedingRowsAdded + numUnprocessedInCache >= inputRowCount) {
+            // No point calling the windowing kernel: the results would simply be ignored.
+            logWarning("Not enough rows! Cannot output a batch.")
+          } else {
+            withResource(new NvtxWithMetrics("window", NvtxColor.CYAN, opTime)) { _ =>
+              withResource(computeBasicWindow(inputCB)) { outputCols =>
+                outputBatch = withResource(
+                    trim(outputCols,
+                      numPrecedingRowsAdded, numUnprocessedInCache)) { trimmed =>
+                  convertToBatch(outputTypes, trimmed)
+                }
+              }
+            }
+          }
+
+          // Compute the new cache from the current input.
+          numPrecedingRowsAdded = if (minPreceding > 0) { // E.g. LEAD(3) => [prec=3, foll=3].
+            // A positive preceding bound means no "preceding" context rows need to be
+            // carried forward; only the unfinished rows at the bottom are recomputed.
+            0
+          } else {
+            Math.abs(minPreceding) min (inputRowCount - numUnprocessedInCache)
+          }
+          val inputCols = Range(0, inputCB.numCols()).map {
+            inputCB.column(_).asInstanceOf[GpuColumnVector].getBase
+          }.toArray
+
+          val newCached = trim(inputCols,
+            inputRowCount - (numPrecedingRowsAdded + numUnprocessedInCache),
+            0)
+          resetInputCache(Some(newCached), numPrecedingRowsAdded)
+        }
+      }
+    }
+    numOutputBatches += 1
+    numOutputRows += outputBatch.numRows()
+    outputBatch
+  }
+}
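Aside (not part of the patch): `trim()` above keeps only the rows whose windows are fully computable, dropping the replayed preceding-context rows from the top and the unfinished rows from the bottom. An array-based analogue, purely for illustration:

```scala
// Keep rows [offTheTop, rowCount - offTheBottom), like trim() does per column.
def trimRows[T](rows: Vector[T], offTheTop: Int, offTheBottom: Int): Vector[T] = {
  require(offTheTop + offTheBottom <= rows.length,
    s"Cannot trim ${rows.length} rows by $offTheTop + $offTheBottom")
  rows.slice(offTheTop, rows.length - offTheBottom)
}

// trimRows(Vector(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), 2, 3) == Vector(3, 4, 5, 6, 7)
```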
+
+// Window Exec used exclusively for batching bounded window functions.
+class GpuBatchedBoundedWindowExec(
+    override val windowOps: Seq[NamedExpression],
+    override val gpuPartitionSpec: Seq[Expression],
+    override val gpuOrderSpec: Seq[SortOrder],
+    override val child: SparkPlan)(
+    override val cpuPartitionSpec: Seq[Expression],
+    override val cpuOrderSpec: Seq[SortOrder],
+    minPreceding: Integer,
+    maxFollowing: Integer
+) extends GpuWindowExec(windowOps,
+    gpuPartitionSpec,
+    gpuOrderSpec,
+    child)(cpuPartitionSpec, cpuOrderSpec) {
+
+  override def otherCopyArgs: Seq[AnyRef] =
+    cpuPartitionSpec :: cpuOrderSpec :: minPreceding :: maxFollowing :: Nil
+
+  override def childrenCoalesceGoal: Seq[CoalesceGoal] = Seq.fill(children.size)(null)
+
+  override def outputBatching: CoalesceGoal = null
+
+  override protected def internalDoExecuteColumnar(): RDD[ColumnarBatch] = {
+    val numOutputBatches = gpuLongMetric(GpuMetric.NUM_OUTPUT_BATCHES)
+    val numOutputRows = gpuLongMetric(GpuMetric.NUM_OUTPUT_ROWS)
+    val opTime = gpuLongMetric(GpuMetric.OP_TIME)
+
+    val boundWindowOps = GpuBindReferences.bindGpuReferences(windowOps, child.output)
+    val boundPartitionSpec = GpuBindReferences.bindGpuReferences(gpuPartitionSpec, child.output)
+    val boundOrderSpec = GpuBindReferences.bindReferences(gpuOrderSpec, child.output)
+
+    child.executeColumnar().mapPartitions { iter =>
+      new GpuBatchedBoundedWindowIterator(iter, boundWindowOps, boundPartitionSpec,
+        boundOrderSpec, output.map(_.dataType).toArray, minPreceding, maxFollowing,
+        numOutputBatches, numOutputRows, opTime)
+    }
+  }
+}
\ No newline at end of file
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuWindowExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuWindowExec.scala
index 760196dbf94..6de2eb1f2de 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuWindowExec.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuWindowExec.scala
@@ -144,7 +144,7 @@ abstract class GpuBaseWindowExecMeta[WindowExecType <: SparkPlan] (windowExec: W
     val allBatched = fixedUpWindowOps.forall {
       case GpuAlias(GpuWindowExpression(func, spec), _) =>
-        GpuWindowExec.isBatchedFunc(func, spec)
+        GpuWindowExec.isBatchedFunc(func, spec, conf)
       case GpuAlias(_: AttributeReference, _) | _: AttributeReference =>
         // We allow pure result columns for running windows
         true
@@ -162,7 +162,7 @@ abstract class GpuBaseWindowExecMeta[WindowExecType <: SparkPlan] (windowExec: W
     }
 
     val windowExpr = if (allBatched) {
-      val batchedOps = GpuWindowExec.splitBatchedOps(fixedUpWindowOps)
+      val batchedOps = GpuWindowExec.splitBatchedOps(fixedUpWindowOps, conf)
       batchedOps.getWindowExec(
         partitionSpec.map(_.convertToGpu()),
         orderSpec.map(_.convertToGpu().asInstanceOf[SortOrder]),
@@ -170,7 +170,7 @@ abstract class GpuBaseWindowExecMeta[WindowExecType <: SparkPlan] (windowExec: W
         getPartitionSpecs,
         getOrderSpecs)
     } else {
-      GpuWindowExec(
+      new GpuWindowExec(
         fixedUpWindowOps,
         partitionSpec.map(_.convertToGpu()),
         orderSpec.map(_.convertToGpu().asInstanceOf[SortOrder]),
@@ -179,6 +179,8 @@ abstract class GpuBaseWindowExecMeta[WindowExecType <: SparkPlan] (windowExec: W
 
     if (isPostNeeded) {
       GpuProjectExec(post.toList, windowExpr)()
+    } else if (windowExpr.output != windowExec.output) {
+      GpuProjectExec(windowExec.output.toList, windowExpr)()
     } else {
       windowExpr
     }
@@ -233,11 +235,33 @@ class GpuWindowExecMeta(windowExec: WindowExec,
 }
 
 case class BatchedOps(running: Seq[NamedExpression],
-    unboundedToUnbounded: Seq[NamedExpression],
-    passThrough: Seq[NamedExpression]) {
+    unboundedToUnbounded: Seq[NamedExpression],
+    bounded: Seq[NamedExpression],
+    passThrough: Seq[NamedExpression]) {
 
   def getRunningExpressionsWithPassthrough: Seq[NamedExpression] = passThrough ++ running
 
+  def getDoublePassExpressionsWithRunningAsPassthrough: Seq[NamedExpression] =
+    passThrough ++ unboundedToUnbounded ++ running.map(_.toAttribute)
+
+  def getBoundedExpressionsWithTheRestAsPassthrough: Seq[NamedExpression] =
+    passThrough ++ bounded ++ (unboundedToUnbounded ++ running).map(_.toAttribute)
+
+  def getMinPrecedingMaxFollowingForBoundedWindows: (Int, Int) = {
+    // All bounded window expressions are expected to have bounded ROWS window specs.
+    val boundedWindowSpecs = bounded.map {
+      case GpuAlias(GpuWindowExpression(_, spec), _) => spec
+      case other => throw new IllegalArgumentException("Expected a window expression, " +
+        s"found $other")
+    }
+    val precedingAndFollowing = boundedWindowSpecs.map(
+      GpuWindowExec.getBoundedWindowPrecedingAndFollowing
+    )
+    (precedingAndFollowing.map { _._1 }.min, // Preceding bounds are non-positive (<= 0).
+     precedingAndFollowing.map { _._2 }.max)
+  }
+
   private def getRunningWindowExec(
       gpuPartitionSpec: Seq[Expression],
       gpuOrderSpec: Seq[SortOrder],
@@ -262,6 +286,18 @@ case class BatchedOps(running: Seq[NamedExpression],
       gpuOrderSpec,
       child)(cpuPartitionSpec, cpuOrderSpec)
 
+  private def getBatchedBoundedWindowExec(gpuPartitionSpec: Seq[Expression],
+      gpuOrderSpec: Seq[SortOrder],
+      child: SparkPlan,
+      cpuPartitionSpec: Seq[Expression],
+      cpuOrderSpec: Seq[SortOrder]): GpuExec = {
+    val (prec, foll) = getMinPrecedingMaxFollowingForBoundedWindows
+    new GpuBatchedBoundedWindowExec(getBoundedExpressionsWithTheRestAsPassthrough,
+      gpuPartitionSpec,
+      gpuOrderSpec,
+      child)(cpuPartitionSpec, cpuOrderSpec, prec, foll)
+  }
+
   def getWindowExec(
       gpuPartitionSpec: Seq[Expression],
       gpuOrderSpec: Seq[SortOrder],
@@ -270,26 +306,46 @@ case class BatchedOps(running: Seq[NamedExpression],
       cpuOrderSpec: Seq[SortOrder]): GpuExec = {
     // The order of these matter so we can pass the output of the first through the second one
     if (hasRunning) {
-      val running = getRunningWindowExec(gpuPartitionSpec, gpuOrderSpec, child,
+      val runningExec = getRunningWindowExec(gpuPartitionSpec, gpuOrderSpec, child,
         cpuPartitionSpec, cpuOrderSpec)
       if (hasDoublePass) {
-        getDoublePassWindowExec(gpuPartitionSpec, gpuOrderSpec, running,
+        val doublePassExec = getDoublePassWindowExec(gpuPartitionSpec, gpuOrderSpec, runningExec,
           cpuPartitionSpec, cpuOrderSpec)
+        if (hasBounded) {
+          getBatchedBoundedWindowExec(gpuPartitionSpec, gpuOrderSpec, doublePassExec,
+            cpuPartitionSpec, cpuOrderSpec)
+        } else {
+          doublePassExec
+        }
       } else {
-        running
+        if (hasBounded) {
+          getBatchedBoundedWindowExec(gpuPartitionSpec, gpuOrderSpec, runningExec,
+            cpuPartitionSpec, cpuOrderSpec)
+        } else {
+          runningExec
+        }
       }
     } else {
-      getDoublePassWindowExec(gpuPartitionSpec, gpuOrderSpec, child,
-        cpuPartitionSpec, cpuOrderSpec)
+      if (hasDoublePass) {
+        val doublePassExec = getDoublePassWindowExec(gpuPartitionSpec, gpuOrderSpec, child,
+          cpuPartitionSpec, cpuOrderSpec)
+        if (hasBounded) {
+          getBatchedBoundedWindowExec(gpuPartitionSpec, gpuOrderSpec, doublePassExec,
+            cpuPartitionSpec, cpuOrderSpec)
+        } else {
+          doublePassExec
+        }
+      } else {
+        getBatchedBoundedWindowExec(gpuPartitionSpec, gpuOrderSpec, child,
+          cpuPartitionSpec, cpuOrderSpec)
+      }
     }
   }
 
   def hasRunning: Boolean = running.nonEmpty
-
-  def getDoublePassExpressionsWithRunningAsPassthrough: Seq[NamedExpression] =
-    passThrough ++ unboundedToUnbounded ++ running.map(_.toAttribute)
-
   def hasDoublePass: Boolean = unboundedToUnbounded.nonEmpty
+
+  def hasBounded: Boolean = bounded.nonEmpty
 }
 
 object GpuWindowExec {
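Aside (not part of the patch): `getWindowExec` above chains up to three execs, each batched flavor wrapping the previous stage's output, with the bounded-window exec always outermost. A self-contained sketch of just the chaining order, with stand-in types (the real code threads partition/order specs through each constructor):

```scala
// Stand-in for a physical plan node wrapping a child stage.
case class Stage(name: String, child: Option[Stage])

def chain(hasRunning: Boolean, hasDoublePass: Boolean, hasBounded: Boolean): Stage = {
  var plan = Stage("child", None)
  if (hasRunning)    plan = Stage("runningExec", Some(plan))     // running windows first
  if (hasDoublePass) plan = Stage("doublePassExec", Some(plan))  // unbounded-to-unbounded next
  if (hasBounded)    plan = Stage("boundedExec", Some(plan))     // bounded ROWS frames last
  plan
}

// chain(hasRunning = true, hasDoublePass = false, hasBounded = true) yields
// boundedExec over runningExec over child, matching the branching above.
```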
@@ -484,6 +540,63 @@ object GpuWindowExec {
     case _ => false
   }
 
+  /**
+   * Checks whether the window spec is both ROWS-based and bounded.
+   * Window functions with such a spec may still be evaluated in batched mode.
+   */
+  private def isBoundedRowsWindowAndBatchable(spec: GpuWindowSpecDefinition,
+      conf: RapidsConf): Boolean = {
+
+    def inPermissibleRange(bounds: Int) =
+      Math.abs(bounds) <= conf.batchedBoundedRowsWindowMax
+
+    spec match {
+      case GpuWindowSpecDefinition(_, _, GpuSpecifiedWindowFrame(
+          RowFrame,
+          GpuLiteral(prec: Int, _),
+          GpuLiteral(foll: Int, _))) =>
+        inPermissibleRange(prec) && inPermissibleRange(foll)
+      case GpuWindowSpecDefinition(_, _, GpuSpecifiedWindowFrame(
+          RowFrame,
+          GpuSpecialFrameBoundary(CurrentRow),
+          GpuLiteral(foll: Int, _))) =>
+        inPermissibleRange(foll)
+      case GpuWindowSpecDefinition(_, _, GpuSpecifiedWindowFrame(
+          RowFrame,
+          GpuLiteral(prec: Int, _),
+          GpuSpecialFrameBoundary(CurrentRow))) =>
+        inPermissibleRange(prec)
+      case GpuWindowSpecDefinition(_, _, GpuSpecifiedWindowFrame(
+          RowFrame,
+          GpuSpecialFrameBoundary(CurrentRow),
+          GpuSpecialFrameBoundary(CurrentRow))) => true
+      case _ => false
+    }
+  }
+
+  def getBoundedWindowPrecedingAndFollowing(spec: GpuWindowSpecDefinition): (Int, Int) =
+    spec match {
+      case GpuWindowSpecDefinition(_, _, GpuSpecifiedWindowFrame(
+          RowFrame,
+          GpuLiteral(prec: Int, _),
+          GpuLiteral(foll: Int, _))) => (prec, foll)
+      case GpuWindowSpecDefinition(_, _, GpuSpecifiedWindowFrame(
+          RowFrame,
+          GpuSpecialFrameBoundary(CurrentRow),
+          GpuLiteral(foll: Int, _))) => (0, foll)
+      case GpuWindowSpecDefinition(_, _, GpuSpecifiedWindowFrame(
+          RowFrame,
+          GpuLiteral(prec: Int, _),
+          GpuSpecialFrameBoundary(CurrentRow))) => (prec, 0)
+      case GpuWindowSpecDefinition(_, _, GpuSpecifiedWindowFrame(
+          RowFrame,
+          GpuSpecialFrameBoundary(CurrentRow),
+          GpuSpecialFrameBoundary(CurrentRow))) => (0, 0)
+      case _ => throw new IllegalArgumentException("Expected bounded ROWS spec, " +
+        s"found $spec") // Unreachable: callers only pass bounded ROWS specs.
+    }
+
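Aside (not part of the patch): a minimal sketch of the range check above, with the sign conventions the matchers assume. Preceding bounds are encoded as non-positive ints (e.g. `100 PRECEDING` => -100); following bounds are taken as written. Per the code comments elsewhere in this patch, `LAG(c, 3)` uses frame (-3, -3) and `LEAD(c, 4)` uses (4, 4).

```scala
def inPermissibleRange(bound: Int, maxExtent: Int): Boolean =
  math.abs(bound) <= maxExtent

assert(inPermissibleRange(-100, maxExtent = 100))  // 100 PRECEDING, default max of 100
assert(!inPermissibleRange(200, maxExtent = 100))  // 200 FOLLOWING exceeds the default
```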
   def isUnboundedToUnboundedWindow(spec: GpuWindowSpecDefinition): Boolean = spec match {
     case GpuWindowSpecDefinition(_, _, GpuSpecifiedWindowFrame(_,
       GpuSpecialFrameBoundary(UnboundedPreceding),
@@ -510,12 +623,19 @@ object GpuWindowExec {
     case _ => false
   }
 
-  def isBatchedFunc(func: Expression, spec: GpuWindowSpecDefinition): Boolean =
-    isBatchedRunningFunc(func, spec) || isBatchedUnboundedToUnboundedFunc(func, spec)
+  def isBatchedFunc(func: Expression,
+      spec: GpuWindowSpecDefinition,
+      conf: RapidsConf): Boolean = {
+    isBatchedRunningFunc(func, spec) ||
+      isBatchedUnboundedToUnboundedFunc(func, spec) ||
+      isBoundedRowsWindowAndBatchable(spec, conf)
+  }
 
-  def splitBatchedOps(windowOps: Seq[NamedExpression]): BatchedOps = {
+  def splitBatchedOps(windowOps: Seq[NamedExpression],
+      conf: RapidsConf): BatchedOps = {
     val running = ArrayBuffer[NamedExpression]()
     val doublePass = ArrayBuffer[NamedExpression]()
+    val batchedBounded = ArrayBuffer[NamedExpression]()
     val passThrough = ArrayBuffer[NamedExpression]()
     windowOps.foreach {
       case expr@GpuAlias(GpuWindowExpression(func, spec), _) =>
@@ -523,6 +643,8 @@ object GpuWindowExec {
         running.append(expr)
       } else if (isBatchedUnboundedToUnboundedFunc(func, spec)) {
         doublePass.append(expr)
+      } else if (isBoundedRowsWindowAndBatchable(spec, conf)) {
+        batchedBounded.append(expr)
       } else {
         throw new IllegalArgumentException(
           s"Found unexpected expression $expr in window exec ${expr.getClass}")
@@ -535,7 +657,7 @@ object GpuWindowExec {
       throw new IllegalArgumentException(
         s"Found unexpected expression $other in window exec ${other.getClass}")
     }
-    BatchedOps(running.toSeq, doublePass.toSeq, passThrough.toSeq)
+    BatchedOps(running.toSeq, doublePass.toSeq, batchedBounded.toSeq, passThrough.toSeq)
   }
 }
@@ -1264,10 +1386,10 @@ trait BasicWindowCalc {
   // `orderByPositions` and `partByPositions` are the positions in `initialProjections` for
   // the order by columns and the part by columns respectively.
   private val (initialProjections,
-    passThrough,
-    aggregations,
-    orderByPositions,
-    partByPositions) = {
+      passThrough,
+      aggregations,
+      orderByPositions,
+      partByPositions) = {
     val initialProjections = ArrayBuffer[Expression]()
     val dedupedInitialProjections = mutable.HashMap[Expression, Int]()
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
index fa793ba84fa..fc8e0870395 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
@@ -1340,6 +1340,15 @@ object RapidsConf {
     .booleanConf
     .createWithDefault(true)
 
+  val BATCHED_BOUNDED_ROW_WINDOW_MAX: ConfEntryWithDefault[Integer] =
+    conf("spark.rapids.sql.window.batched.bounded.row.max")
+      .doc("Max value of the bounded-row window preceding/following extents " +
+        "for which the window can be evaluated in batched mode. The limit applies " +
+        "to the preceding and following bounds independently, so the window size " +
+        "permitted for batched execution can be up to twice this value.")
+      .integerConf
+      .createWithDefault(value = 100)
+
   val ENABLE_SINGLE_PASS_PARTIAL_SORT_AGG: ConfEntryWithDefault[Boolean] =
     conf("spark.rapids.sql.agg.singlePassPartialSortEnabled")
       .doc("Enable or disable a single pass partial sort optimization where if a heuristic " +
@@ -2743,6 +2752,8 @@ class RapidsConf(conf: Map[String, String]) extends Logging {
 
   lazy val isRangeWindowDecimalEnabled: Boolean = get(ENABLE_RANGE_WINDOW_DECIMAL)
 
+  lazy val batchedBoundedRowsWindowMax: Int = get(BATCHED_BOUNDED_ROW_WINDOW_MAX)
+
   lazy val allowSinglePassPartialSortAgg: Boolean = get(ENABLE_SINGLE_PASS_PARTIAL_SORT_AGG)
 
   lazy val forceSinglePassPartialSortAgg: Boolean = get(FORCE_SINGLE_PASS_PARTIAL_SORT_AGG)
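Closing aside (not part of the patch): a hedged end-to-end sketch of verifying that a qualifying query picks up the new exec. The session, table, and data are hypothetical; the conf key and exec name come from this patch, and the string check mirrors what `validate_execs_in_gpu_plan` does in the integration tests.

```scala
spark.conf.set("spark.rapids.sql.window.batched.bounded.row.max", "100")

val df = spark.sql("""
  SELECT SUM(c) OVER (PARTITION BY a ORDER BY b
                      ROWS BETWEEN 10 PRECEDING AND 30 FOLLOWING) AS s
  FROM window_agg_table
""")

// Extents (10, 30) are within the limit, so the batched exec should appear:
assert(df.queryExecution.executedPlan.toString.contains("GpuBatchedBoundedWindowExec"))
// Widening the frame past the limit (e.g. 200 PRECEDING with max=100) should
// instead plan the unbatched GpuWindowExec.
```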