
Implement percentile aggregation #9296

Merged: 90 commits, Oct 20, 2023
Changes from 6 commits
Commits (90)
e01233e
Implementing GpuPercentile
ttnghia Sep 25, 2023
3a17c02
WIP
ttnghia Sep 25, 2023
ff5acf5
Cleanup
ttnghia Sep 25, 2023
12280d1
Fix class serialization
ttnghia Sep 25, 2023
f366a60
Checking if in reduction context
ttnghia Sep 26, 2023
00d775f
Fix conversion of output
ttnghia Sep 26, 2023
36c301d
Working
ttnghia Sep 26, 2023
37b2f74
Implement GpuPercentileEvaluation
ttnghia Sep 26, 2023
ddbbace
Add `isReduction` parameter
ttnghia Sep 26, 2023
e4b926c
Single percentage value is working
ttnghia Sep 26, 2023
91fe05c
Working for both literal and array `percentages`
ttnghia Sep 26, 2023
e19777f
Fix conversion
ttnghia Sep 27, 2023
17066f7
Debugging
ttnghia Sep 27, 2023
b03790d
Fix running issues
ttnghia Sep 28, 2023
df412ab
Add initial test for reduction
ttnghia Sep 28, 2023
49034e3
Fix memory leak
ttnghia Sep 28, 2023
b1ddc78
Move `CudfHistogram` and `CudfMergeHistogram` classes
ttnghia Sep 28, 2023
22f85c0
Cleanup
ttnghia Sep 28, 2023
709c2a6
Fix initialization
ttnghia Sep 28, 2023
a3ee56d
Cleanup
ttnghia Sep 29, 2023
d6de152
Testing
ttnghia Sep 29, 2023
cffad08
Integration test passed
ttnghia Sep 29, 2023
afdc340
Fix data type
ttnghia Sep 29, 2023
2b6b6f7
Fix type
ttnghia Sep 29, 2023
bae641c
Add groupby test
ttnghia Oct 1, 2023
b04fe3e
Add tests
ttnghia Oct 2, 2023
1f60285
Cleanup
ttnghia Oct 2, 2023
25247d1
Add more tests
ttnghia Oct 2, 2023
b0e33a9
Add more tests with frequencies
ttnghia Oct 2, 2023
e93a7e8
WIP
ttnghia Oct 2, 2023
c42f6f1
Implement input projection for values with frequencies
ttnghia Oct 3, 2023
ecdeeaa
Add tests
ttnghia Oct 3, 2023
f2716b1
All tests passed!!!
ttnghia Oct 3, 2023
a02205c
Cleanup tests
ttnghia Oct 3, 2023
41b8991
Add more tests (not pass yet)
ttnghia Oct 3, 2023
77d9242
Fix tests
ttnghia Oct 3, 2023
3781e61
Adding fallback test
ttnghia Oct 3, 2023
9a21cd0
Add error check for percentage input
ttnghia Oct 4, 2023
0fc2f1f
Declare incompat
ttnghia Oct 4, 2023
33137bb
Implement buffer conversion
ttnghia Oct 4, 2023
dc9ac69
Fix compile issue
ttnghia Oct 4, 2023
c7c2f6b
Change test
ttnghia Oct 4, 2023
80aec1f
Update code
ttnghia Oct 5, 2023
1b41ddb
WIP for buffer conversion
ttnghia Oct 5, 2023
89bb7c2
Debugging
ttnghia Oct 6, 2023
082b74a
Change class name, and testing...
ttnghia Oct 6, 2023
eb004a8
Fix CPU-to-GPU deserialization
ttnghia Oct 10, 2023
f255020
Fix empty input projection issue
ttnghia Oct 10, 2023
89edc6f
Rename function and change test
ttnghia Oct 11, 2023
8815911
Fix null handling
ttnghia Oct 11, 2023
7c47a35
Cleanup
ttnghia Oct 11, 2023
e2b238b
Change type check
ttnghia Oct 11, 2023
96710ab
Add comment
ttnghia Oct 11, 2023
70aa9a0
Change comment
ttnghia Oct 11, 2023
1f0c813
WIP
ttnghia Oct 12, 2023
524ab1c
Merge branch 'branch-23.10' into percentile
ttnghia Oct 12, 2023
c2c2950
Complete tests
ttnghia Oct 13, 2023
3e71019
Fix all bugs
ttnghia Oct 13, 2023
1d7dbd2
Merge branch 'branch-23.12' into percentile
ttnghia Oct 13, 2023
6e81087
Move file and cleanup
ttnghia Oct 13, 2023
85cd162
Further cleanup
ttnghia Oct 13, 2023
8233d6e
Finish cleaning up
ttnghia Oct 13, 2023
a5eb50f
Add comments
ttnghia Oct 13, 2023
48f5292
Revert test
ttnghia Oct 13, 2023
741cdf0
Simplify code
ttnghia Oct 13, 2023
73db187
Add generated docs
ttnghia Oct 13, 2023
1df7a94
Some more cleaning up
ttnghia Oct 14, 2023
f1117ba
Cleanup test
ttnghia Oct 16, 2023
e7043ac
Add comment
ttnghia Oct 16, 2023
22c7959
Use `extractLit`
ttnghia Oct 17, 2023
53c433d
Revert "Use `extractLit`"
ttnghia Oct 17, 2023
9e30443
Merge branch 'branch-23.12' into percentile
ttnghia Oct 17, 2023
55da2fb
Use `extractLit`
ttnghia Oct 17, 2023
7db16a9
Merge branch 'branch-23.12' into percentile
ttnghia Oct 17, 2023
32040ce
Fix tests
ttnghia Oct 17, 2023
3729da0
Merge branch 'branch-23.12' into percentile
ttnghia Oct 18, 2023
c188955
Remove incompat
ttnghia Oct 18, 2023
f46f94b
Merge branch 'branch-23.12' into percentile
ttnghia Oct 18, 2023
24c227f
Updated docs
ttnghia Oct 18, 2023
0963303
Remove `@incompat` and `approximate_float`
ttnghia Oct 18, 2023
1ccdcdc
Temporarily run only new tests
ttnghia Oct 18, 2023
d78e7f2
Testing fallback
ttnghia Oct 18, 2023
7836bdb
Revert "Testing fallback"
ttnghia Oct 18, 2023
2b019bd
Revert "Temporarily run only new tests"
ttnghia Oct 18, 2023
2afab91
Revert "Revert "Testing fallback""
ttnghia Oct 18, 2023
6b93243
Cleanup test
ttnghia Oct 18, 2023
3533ee9
Merge branch 'branch-23.12' into percentile
ttnghia Oct 18, 2023
e1136bc
Revert "Testing fallback"
ttnghia Oct 18, 2023
3757b60
xfail tests on databricks
ttnghia Oct 19, 2023
141c19d
Merge branch 'branch-23.12' into percentile
ttnghia Oct 19, 2023
@@ -3328,13 +3328,13 @@ object GpuOverrides extends Logging {
"Collect a set of unique elements, not supported in reduction",
ExprChecks.fullAgg(
TypeSig.ARRAY.nested(TypeSig.commonCudfTypes + TypeSig.DECIMAL_128 +
TypeSig.NULL + TypeSig.STRUCT + TypeSig.ARRAY),
TypeSig.ARRAY.nested(TypeSig.all),
Seq(ParamCheck("input",
(TypeSig.commonCudfTypes + TypeSig.DECIMAL_128 +
TypeSig.NULL +
TypeSig.STRUCT +
TypeSig.ARRAY).nested(),
TypeSig.all))),
(c, conf, p, r) => new TypedImperativeAggExprMeta[CollectSet](c, conf, p, r) {

@@ -3402,6 +3402,53 @@ object GpuOverrides extends Logging {
GpuVarianceSamp(childExprs.head, !legacyStatisticalAggregate)
}
}),
expr[Percentile](
"Aggregation computing exact percentile",
ExprChecks.reductionAndGroupByAgg(
// The output can be a single number or array depending on whether percentiles param
// is a single number or an array.
TypeSig.gpuNumeric +
TypeSig.ARRAY.nested(TypeSig.gpuNumeric),
TypeSig.cpuNumeric + TypeSig.DATE + TypeSig.TIMESTAMP + TypeSig.ARRAY.nested(
TypeSig.cpuNumeric + TypeSig.DATE + TypeSig.TIMESTAMP),
Seq(
ParamCheck("input",
TypeSig.gpuNumeric,
TypeSig.cpuNumeric + TypeSig.DATE + TypeSig.TIMESTAMP),
ParamCheck("percentage",
TypeSig.DOUBLE + TypeSig.ARRAY.nested(TypeSig.DOUBLE),
TypeSig.DOUBLE + TypeSig.ARRAY.nested(TypeSig.DOUBLE)),
ParamCheck("frequency",
TypeSig.LONG + TypeSig.ARRAY.nested(TypeSig.LONG),
TypeSig.LONG + TypeSig.ARRAY.nested(TypeSig.LONG)))),
(c, conf, p, r) => new TypedImperativeAggExprMeta[Percentile](c, conf, p, r) {
override def convertToGpu(childExprs: Seq[Expression]): GpuExpression = {
val exprMeta = p.get.asInstanceOf[BaseExprMeta[_]]
val context = exprMeta.context
context match {
case ReductionAggExprContext => GpuPercentile(childExprs, isReduction = true)
case GroupByAggExprContext => GpuPercentile(childExprs, isReduction = false)
case _ => throw new IllegalStateException(s"Invalid aggregation context: $context")
}
// The choice between GpuPercentileDefault and GpuPercentileWithFrequency is made inside
// GpuPercentile.apply, based on the frequency child expression.
}
override def aggBufferAttribute: AttributeReference = {
val aggBuffer = c.aggBufferAttributes.head
aggBuffer.copy(dataType = c.dataType)(aggBuffer.exprId, aggBuffer.qualifier)
}
override def createCpuToGpuBufferConverter(): CpuToGpuAggregateBufferConverter =
new CpuToGpuPercentileBufferConverter(c.child.dataType)
override def createGpuToCpuBufferConverter(): GpuToCpuAggregateBufferConverter =
new GpuToCpuPercentileBufferConverter()
override val supportBufferConversion: Boolean = true
override val needsAnsiCheck: Boolean = false
}),
expr[ApproximatePercentile](
"Approximate percentile",
ExprChecks.reductionAndGroupByAgg(
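For context, the `Percentile` override above routes Spark's exact percentile aggregate onto the GPU for both reduction and group-by plans. Below is a minimal, illustrative Spark snippet exercising the shapes described by the type checks; the table name, column names, and object name are made up for this sketch:

// Illustrative usage only; any DataFrame with a numeric column works the same way.
import org.apache.spark.sql.SparkSession

object PercentileUsageSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("percentile-usage").getOrCreate()
    import spark.implicits._

    Seq((1, 10L), (1, 20L), (2, 30L)).toDF("k", "v").createOrReplaceTempView("t")

    // Reduction with a single percentage: the result is a single DOUBLE.
    spark.sql("SELECT percentile(v, 0.5) FROM t").show()

    // Group-by with an array of percentages: the result is an ARRAY<DOUBLE> per group.
    spark.sql("SELECT k, percentile(v, array(0.25, 0.5, 0.75)) FROM t GROUP BY k").show()

    // Optional frequency argument, matching the "frequency" parameter checked above.
    spark.sql("SELECT percentile(v, 0.5, 2) FROM t").show()

    spark.stop()
  }
}
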
@@ -18,8 +18,9 @@ package org.apache.spark.sql.rapids

import ai.rapids.cudf
import ai.rapids.cudf.{Aggregation128Utils, BinaryOp, ColumnVector, DType, GroupByAggregation, GroupByScanAggregation, NaNEquality, NullEquality, NullPolicy, NvtxColor, NvtxRange, ReductionAggregation, ReplacePolicy, RollingAggregation, RollingAggregationOnColumn, Scalar, ScanAggregation}
import com.nvidia.spark.rapids._
import com.nvidia.spark.rapids.Arm.withResource
import com.nvidia.spark.rapids.Arm.{withResource, withResourceIfAllowed}
import com.nvidia.spark.rapids.RapidsPluginImplicits.ReallyAGpuExpression
import com.nvidia.spark.rapids.shims.{GpuDeterministicFirstLastCollectShim, ShimExpression, ShimUnaryExpression, TypeUtilsShims}

@@ -388,9 +389,9 @@ class CudfMergeLists(override val dataType: DataType) extends CudfAggregate {
}

/**
* Spark handles NaN equality differently for non-nested float/double and for float/double
* in nested types. When we use non-nested versions of floats and doubles, NaN values are
* considered unequal, but when we collect sets of nested versions, NaNs are considered equal
* on the CPU. So we set NaNEquality dynamically in CudfCollectSet and CudfMergeSets.
* Note that dataType is ArrayType(child.dataType) here.
*/
@@ -401,7 +402,7 @@ class CudfCollectSet(override val dataType: DataType) extends CudfAggregate {
case ArrayType(FloatType | DoubleType, _) =>
ReductionAggregation.collectSet(
NullPolicy.EXCLUDE, NullEquality.EQUAL, NaNEquality.UNEQUAL)
case _: DataType =>
ReductionAggregation.collectSet(
NullPolicy.EXCLUDE, NullEquality.EQUAL, NaNEquality.ALL_EQUAL)
}
@@ -1981,17 +1982,17 @@ case class GpuCollectSet(
override def aggBufferAttributes: Seq[AttributeReference] = outputBuf :: Nil

override def prettyName: String = "collect_set"

// Spark handles NaN equality differently for non-nested float/double and for float/double
// in nested types. When we use non-nested versions of floats and doubles, NaN values are
// considered unequal, but when we collect sets of nested versions, NaNs are considered equal
// on the CPU. So we set NaNEquality dynamically here.
override def windowAggregation(
inputs: Seq[(ColumnVector, Int)]): RollingAggregationOnColumn = child.dataType match {
case FloatType | DoubleType =>
RollingAggregation.collectSet(NullPolicy.EXCLUDE, NullEquality.EQUAL,
NaNEquality.UNEQUAL).onColumn(inputs.head._2)
case _ =>
RollingAggregation.collectSet(NullPolicy.EXCLUDE, NullEquality.EQUAL,
NaNEquality.ALL_EQUAL).onColumn(inputs.head._2)
}
@@ -2011,16 +2012,16 @@ trait GpuToCpuBufferTransition extends ShimUnaryExpression with CodegenFallback
override def dataType: DataType = BinaryType
}

class CpuToGpuCollectBufferConverter(elementType: DataType)
extends CpuToGpuAggregateBufferConverter {
def createExpression(child: Expression): CpuToGpuBufferTransition = {
CpuToGpuCollectBufferTransition(child, elementType)
}
}

case class CpuToGpuCollectBufferTransition(override val child: Expression,
private val elementType: DataType)
extends CpuToGpuBufferTransition {

private lazy val row = new UnsafeRow(1)

@@ -2044,8 +2045,199 @@ class GpuToCpuCollectBufferConverter extends GpuToCpuAggregateBufferConverter {
}
}

case class GpuToCpuCollectBufferTransition(override val child: Expression)
extends GpuToCpuBufferTransition {

private lazy val projection = UnsafeProjection.create(Array(child.dataType))

override protected def nullSafeEval(input: Any): Array[Byte] = {
// Converts UnSafeArrayData into binary buffer, according to the serialize method of Collect.
// The binary buffer is the binary view of a UnsafeRow, which only contains single field
// with ArrayType of elementType. As Collect.serialize, we create an UnsafeProjection to
// transform ArrayData to binary view of the single field UnsafeRow. Unlike Collect.serialize,
// we don't have to build ArrayData from on-heap array, since the input is already formatted
// in ArrayData(UnsafeArrayData).
val arrayData = input.asInstanceOf[ArrayData]
projection.apply(InternalRow.apply(arrayData)).getBytes
}
}

case class GpuIdentity(child: Expression) extends GpuUnaryExpression {
override def prettyName: String = "identity"

override def dataType: DataType = child.dataType

override def doColumnar(input: GpuColumnVector): ColumnVector = {
// Pass the input column through unchanged; just bump its reference count.
input.getBase.incRefCount()
}

override def nullable: Boolean = child.nullable

}

/**
* Base class for computing exact percentile(s) of the input values.
*
* The aggregation buffer is a histogram of the input values; the percentile(s) are evaluated
* from that histogram in the final step.
*/
abstract class GpuPercentile(childExprs: Seq[Expression], isReduction: Boolean)
extends GpuAggregateFunction with Serializable {
protected class CudfHistogram(override val dataType: DataType) extends CudfAggregate {
override lazy val reductionAggregate: cudf.ColumnVector => cudf.Scalar =
(col: cudf.ColumnVector) => col.reduce(ReductionAggregation.histogram(), DType.LIST)
override lazy val groupByAggregate: GroupByAggregation = GroupByAggregation.histogram()
override val name: String = "CudfHistogram"
}

protected class CudfMergeHistogram(override val dataType: DataType) extends CudfAggregate {
override lazy val reductionAggregate: cudf.ColumnVector => cudf.Scalar =
(col: cudf.ColumnVector) => col.reduce(ReductionAggregation.mergeHistogram(), DType.LIST)
override lazy val groupByAggregate: GroupByAggregation = GroupByAggregation.mergeHistogram()
override val name: String = "CudfMergeHistogram"
}

override lazy val mergeAggregates: Seq[CudfAggregate] = Seq(new CudfMergeHistogram(dataType))
override lazy val evaluateExpression: Expression = {
// TODO: compute the percentile(s) from the histogram buffer (a percentileFromHistogram-style
// helper is still to be implemented). For now this returns a placeholder result.
GpuLiteral(1.0, DoubleType)
}
private final lazy val histogramBuff: AttributeReference =
AttributeReference("histogramBuff", dataType)()

override def dataType: DataType = childExprs(1).dataType match {
case _: ArrayType => ArrayType(DoubleType, containsNull = false)
case _ => DoubleType
}
override def aggBufferAttributes: Seq[AttributeReference] = histogramBuff :: Nil
override def prettyName: String = "percentile"
override def nullable: Boolean = false

override val initialValues: Seq[Expression] = Seq(GpuLiteral.create(null, dataType))
override def children: Seq[Expression] = childExprs
}
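
The `evaluateExpression` above is still a placeholder at this point in the PR. For reference, here is a plain-Scala sketch of the closest-ranks linear interpolation that Spark's CPU `Percentile` applies to a sorted (value, frequency) histogram, which the final GPU evaluation is expected to mirror; `percentileFromHistogram` and the object name are illustrative, not the plugin's API:

// Illustrative CPU-side sketch: exact percentile over a (value, frequency) histogram using
// linear interpolation between the closest ranks.
object PercentileFromHistogramSketch {
  def percentileFromHistogram(histogram: Seq[(Double, Long)], percentage: Double): Double = {
    require(percentage >= 0.0 && percentage <= 1.0, "percentage must be in [0, 1]")
    val sorted = histogram.sortBy(_._1)
    val count = sorted.map(_._2).sum
    val position = percentage * (count - 1) // fractional rank into the expanded, sorted values
    val lower = math.floor(position).toLong
    val higher = math.ceil(position).toLong

    // Value whose cumulative frequency first exceeds the given rank.
    def valueAtRank(rank: Long): Double = {
      var cumulative = 0L
      var result = sorted.head._1
      var found = false
      for ((value, freq) <- sorted if !found) {
        cumulative += freq
        if (cumulative > rank) { result = value; found = true }
      }
      result
    }

    val lowerValue = valueAtRank(lower)
    if (higher == lower) lowerValue
    else lowerValue + (position - lower) * (valueAtRank(higher) - lowerValue)
  }

  def main(args: Array[String]): Unit = {
    // Median of {10, 20, 20, 30}: position 1.5 falls between two 20s, so the result is 20.0.
    println(percentileFromHistogram(Seq((10.0, 1L), (20.0, 2L), (30.0, 1L)), 0.5))
  }
}

An array of percentages simply maps this computation over each requested percentage, which is why the output type in the override above becomes an array when the percentage argument is an array.
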

case class GpuGetListChild(child: Expression) extends GpuExpression {


override def columnarEvalAny(batch: ColumnarBatch): Any = {
val dt = dataType
withResourceIfAllowed(child.columnarEvalAny(batch)) {
case cv: GpuColumnVector =>
withResource(cv.getBase.getChildColumnView(0)) { view =>
GpuColumnVector.from(view.copyToColumnVector(), dt)
}

case other =>
throw new IllegalArgumentException(s"Got an unexpected type out of columnarEvalAny $other")
}
}

override def columnarEval(batch: ColumnarBatch): GpuColumnVector =
GpuExpressionsUtils.resolveColumnVector(columnarEvalAny(batch), batch.numRows())

override def nullable: Boolean = child.nullable

// The result is the list's child column, so expose the list's element type.
override def dataType: DataType = child.dataType match {
case ArrayType(elementType, _) => elementType
case other => other
}

override def children: Seq[Expression] = Seq(child)
}

/**
* Compute exact percentile(s) when every frequency is the literal 1 (the common case):
* the histogram is built directly from the input values.
*/
case class GpuPercentileDefault(childExprs: Seq[Expression], isReduction: Boolean)
extends GpuPercentile(childExprs, isReduction) {

override val inputProjection: Seq[Expression] = Seq(childExprs.head)

private lazy val histogramUpdate = new CudfHistogram(dataType)
override lazy val updateAggregates: Seq[CudfAggregate] = Seq(histogramUpdate)

override lazy val postUpdate: Seq[Expression] = {
if (isReduction) {
val reductionResult = histogramUpdate.attr
Seq(GpuGetListChild(reductionResult))
} else {
Seq(histogramUpdate.attr)
}
}
}
/**
* Compute exact percentile(s) when an explicit frequency child is provided: each input value
* is packed together with its frequency into a struct before being merged into the histogram.
*/
case class GpuPercentileWithFrequency(childExprs: Seq[Expression], isReduction: Boolean)
extends GpuPercentile(childExprs, isReduction) {

override val inputProjection: Seq[Expression] = {
val childrenWithNames = GpuLiteral("value", StringType) :: childExprs.head ::
GpuLiteral("frequency", StringType) :: childExprs(2) :: Nil
GpuCreateNamedStruct(childrenWithNames) :: Nil
}
override lazy val updateAggregates: Seq[CudfAggregate] = Seq(new CudfMergeHistogram(dataType))
}

object GpuPercentile {
def apply(childExprs: Seq[Expression], isReduction: Boolean): GpuPercentile = {
val Seq(_, _, frequency) = childExprs
frequency match {
case GpuLiteral(freq, LongType) if freq == 1 =>
GpuPercentileDefault(childExprs, isReduction)
case _: Any =>
GpuPercentileWithFrequency(childExprs, isReduction)
}
}
}

class CpuToGpuPercentileBufferConverter(elementType: DataType)
extends CpuToGpuAggregateBufferConverter {
def createExpression(child: Expression): CpuToGpuBufferTransition = {
CpuToGpuPercentileBufferTransition(child, elementType)
}
}

case class CpuToGpuPercentileBufferTransition(override val child: Expression,
private val elementType: DataType)
extends CpuToGpuBufferTransition {

private lazy val row = new UnsafeRow(1)

override def dataType: DataType = ArrayType(elementType, containsNull = false)

override protected def nullSafeEval(input: Any): ArrayData = {
// Converts binary buffer into UnSafeArrayData, according to the deserialize method of Collect.
// The input binary buffer is the binary view of a UnsafeRow, which only contains single field
// with ArrayType of elementType. Since array of elements exactly matches the GPU format, we
// don't need to do any conversion in memory level. Instead, we simply bind the binary data to
// a reused UnsafeRow. Then, fetch the only field as ArrayData.
val bytes = input.asInstanceOf[Array[Byte]]
row.pointTo(bytes, bytes.length)
row.getArray(0).copy()
}
}

class GpuToCpuPercentileBufferConverter extends GpuToCpuAggregateBufferConverter {
def createExpression(child: Expression): GpuToCpuBufferTransition = {
GpuToCpuPercentileBufferTransition(child)
}
}

case class GpuToCpuPercentileBufferTransition(override val child: Expression)
extends GpuToCpuBufferTransition {

private lazy val projection = UnsafeProjection.create(Array(child.dataType))

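
To make the buffer format used by the percentile converters above concrete, here is a small standalone sketch of the round trip their comments describe: the GPU-to-CPU direction projects an `ArrayData` into the binary view of a single-field `UnsafeRow`, and the CPU-to-GPU direction points a reused `UnsafeRow` at those bytes and reads the only field back. This is an illustration against Spark's catalyst classes, with `DoubleType` assumed as the element type; it is not plugin code:

// Standalone illustration of the UnsafeRow-based buffer round trip described above.
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{UnsafeProjection, UnsafeRow}
import org.apache.spark.sql.catalyst.util.ArrayData
import org.apache.spark.sql.types.{ArrayType, DataType, DoubleType}

object PercentileBufferRoundTripSketch {
  def main(args: Array[String]): Unit = {
    val elementType: DataType = DoubleType
    val bufferType: DataType = ArrayType(elementType, containsNull = false)

    // GPU -> CPU: project the ArrayData into a single-field UnsafeRow and keep its bytes.
    val projection = UnsafeProjection.create(Array(bufferType))
    val gpuSide: ArrayData = ArrayData.toArrayData(Array(1.0, 2.0, 3.0))
    val bytes: Array[Byte] = projection.apply(InternalRow(gpuSide)).getBytes

    // CPU -> GPU: bind a reused UnsafeRow to the bytes and fetch its only field as ArrayData.
    val row = new UnsafeRow(1)
    row.pointTo(bytes, bytes.length)
    val restored: ArrayData = row.getArray(0).copy()

    println(restored.toDoubleArray().mkString(", ")) // 1.0, 2.0, 3.0
  }
}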