Commit 9606780: Merge master

zeotuan committed Oct 5, 2024
2 parents 8369ce5 + c59d416
Showing 7 changed files with 159 additions and 8 deletions.
build.sbt (7 changes: 5 additions & 2 deletions)
@@ -1,10 +1,13 @@
-scalafmtOnCompile in Compile := true
+Compile / scalafmtOnCompile := true

organization := "com.github.mrpowers"
name := "spark-daria"

version := "1.2.3"

crossScalaVersions := Seq("2.12.15", "2.13.8")
scalaVersion := "2.12.15"

+val versionRegex = """^(.*)\.(.*)\.(.*)$""".r
+
+val scala2_13 = "2.13.14"
@@ -50,7 +53,7 @@ testFrameworks += new TestFramework("com.github.mrpowers.spark.daria.CustomFrame

credentials += Credentials(Path.userHome / ".sbt" / "sonatype_credentials")

-fork in Test := true
+Test / fork := true

licenses := Seq("MIT" -> url("http://opensource.org/licenses/MIT"))

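Aside: the build.sbt changes above migrate from sbt's deprecated `key in Scope` DSL to the unified slash syntax that has been standard since sbt 1.1. The rewrite pattern, sketched with one setting from this diff plus a hypothetical multi-scoped one:

    // old scoping DSL (deprecated)
    fork in Test := true
    scalacOptions in (Compile, console) += "-nowarn"

    // unified slash syntax (equivalent)
    Test / fork := true
    Compile / console / scalacOptions += "-nowarn"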
src/main/scala/com/github/mrpowers/spark/daria/sql/functions.scala
@@ -1,5 +1,6 @@
package com.github.mrpowers.spark.daria.sql

+import com.github.mrpowers.spark.daria.sql.udafs.ArrayConcatAggregator
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions._
@@ -750,4 +751,8 @@ object functions {
  def excelEpochToDate(colName: String): Column = {
    excelEpochToDate(col(colName))
  }
+
+  def arrayConcat(col: Column): Column = {
+    flatten(collect_list(col))
+  }
}
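For orientation, a minimal usage sketch of the new helper (the local session, column name, and data are illustrative; flatten and collect_list require Spark 2.4+):

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.functions.col
    import com.github.mrpowers.spark.daria.sql.functions.arrayConcat

    val spark = SparkSession.builder().master("local").getOrCreate()
    import spark.implicits._

    // one array per input row, collapsed by the aggregation into a single
    // concatenated array: collect_list gathers the rows, flatten joins them
    Seq(Seq("a", "b"), Seq("c"))
      .toDF("letters")
      .agg(arrayConcat(col("letters")).as("letters"))
    // => one row holding [a, b, c] (collect_list order is only guaranteed
    //    within a single partition)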
src/main/scala/com/github/mrpowers/spark/daria/sql/udafs/ArrayConcat.scala
@@ -1,8 +1,10 @@
package com.github.mrpowers.spark.daria.sql.udafs

-import org.apache.spark.sql.Row
-import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
-import org.apache.spark.sql.types.{ArrayType, DataType, StructField, StructType}
+import scala.reflect.runtime.universe.TypeTag
+import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
+import org.apache.spark.sql.{Encoder, Row}
+import org.apache.spark.sql.expressions.{Aggregator, MutableAggregationBuffer, UserDefinedAggregateFunction}
+import org.apache.spark.sql.types._

class ArrayConcat(elementSchema: DataType, nullable: Boolean = true) extends UserDefinedAggregateFunction {

@@ -43,3 +45,27 @@ class ArrayConcat(elementSchema: DataType, nullable: Boolean = true) extends UserDefinedAggregateFunction {
    buffer.getAs[Seq[Any]](0)
  }
}
+
+case class ArrayConcatAggregator[T: TypeTag]() extends Aggregator[Seq[T], Seq[T], Seq[T]] {
+  override def zero: Seq[T] = Seq.empty[T]
+
+  override def reduce(b: Seq[T], a: Seq[T]): Seq[T] = {
+    if (a == null) {
+      return b
+    }
+    b ++ a
+  }
+
+  override def merge(b1: Seq[T], b2: Seq[T]): Seq[T] = {
+    if (b2 == null) {
+      return b1
+    }
+    b1 ++ b2
+  }
+
+  override def finish(reduction: Seq[T]): Seq[T] = reduction
+
+  override def bufferEncoder: Encoder[Seq[T]] = ExpressionEncoder[Seq[T]]()
+
+  override def outputEncoder: Encoder[Seq[T]] = ExpressionEncoder[Seq[T]]()
+}
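For context: ArrayConcatAggregator is the typed, encoder-based Aggregator counterpart of the UserDefinedAggregateFunction-style ArrayConcat above (UDAF has been deprecated since Spark 3.0), and the null guards in reduce and merge mean null input rows contribute nothing to the result. A sketch of wiring it up; the SQL function name and table are illustrative:

    import org.apache.spark.sql.functions.{col, udaf}

    // lift the typed Aggregator into an untyped Column function
    val arrayConcatAgg = udaf(new ArrayConcatAggregator[String]())

    // given some DataFrame df with an array<string> column "array"
    df.agg(arrayConcatAgg(col("array")))

    // or expose it to Spark SQL under an illustrative name
    spark.udf.register("array_concat_agg", arrayConcatAgg)
    spark.sql("SELECT array_concat_agg(tags) FROM articles")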
src/test/scala/com/github/mrpowers/spark/daria/sql/FunctionsTest.scala
@@ -1,12 +1,12 @@
package com.github.mrpowers.spark.daria.sql

import java.sql.{Date, Timestamp}

import utest._
import com.github.mrpowers.spark.fast.tests.{ColumnComparer, DataFrameComparer}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import SparkSessionExt._
+import com.github.mrpowers.spark.daria.sql.functions.arrayConcat

object FunctionsTest extends TestSuite with DataFrameComparer with ColumnComparer with SparkSessionTestWrapper {

@@ -1429,5 +1429,81 @@ object FunctionsTest extends TestSuite with DataFrameComparer with ColumnComparer with SparkSessionTestWrapper {
}
}

"arrayConcat" - {
"arrayConcat array of string type" - {

val actualDF = spark
.createDF(
List(
Array(
"snake",
"rat"
),
null,
Array(
"cat",
"crazy"
)
),
List(("array", ArrayType(StringType), true))
)
.agg(arrayConcat(col("array")).as("array"))

val expectedDF = spark
.createDF(
List(
Array(
"snake",
"rat",
"cat",
"crazy"
)
),
List(("array", ArrayType(StringType), true))
)

assertSmallDataFrameEquality(
actualDF,
expectedDF,
ignoreNullable = true
)

}

"arrayConcat array of Int type" - {

val actualDF = spark
.createDF(
List(
Array(
1,
2
),
null,
Array(
3,
4
)
),
List(("array", ArrayType(IntegerType), true))
)
.agg(arrayConcat(col("array")).as("array"))

val expectedDF = spark
.createDF(
List(Array(1, 2, 3, 4)),
List(("array", ArrayType(IntegerType), true))
)

assertSmallDataFrameEquality(
actualDF,
expectedDF,
ignoreNullable = true
)

}

}

}
}
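Worth noting: both tests feed a null row and expect it to be absent from the result. For the column-based arrayConcat this falls out of collect_list, which skips nulls before flatten ever runs; a minimal illustration (spark.implicits._ assumed in scope):

    import org.apache.spark.sql.functions.{col, collect_list, flatten}

    // the None row is dropped by collect_list, so flatten only sees two arrays
    Seq(Some(Seq("a")), None, Some(Seq("b", "c")))
      .toDF("xs")
      .agg(flatten(collect_list(col("xs"))))
    // => one row holding [a, b, c]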
src/test/scala/com/github/mrpowers/spark/daria/sql/udafs/ArrayConcatTest.scala
@@ -54,6 +54,47 @@ object ArrayConcatTest extends TestSuite with DataFrameComparer with SparkSessionTestWrapper {

}

"concatenates rows of arrays using aggregator" - {

val arrayConcat = udaf(new ArrayConcatAggregator[String]())

val actualDF = spark
.createDF(
List(
Array(
"snake",
"rat"
),
null,
Array(
"cat",
"crazy"
)
),
List(("array", ArrayType(StringType), true))
)
.agg(arrayConcat(col("array")).as("array"))

val expectedDF = spark
.createDF(
List(
Array(
"snake",
"rat",
"cat",
"crazy"
)
),
List(("array", ArrayType(StringType), true))
)

assertSmallDataFrameEquality(
actualDF,
expectedDF
)

}

}

}
project/build.properties (2 changes: 1 addition & 1 deletion)
@@ -1 +1 @@
-sbt.version = 1.4.3
+sbt.version = 1.10.2
project/plugins.sbt (2 changes: 1 addition & 1 deletion)
@@ -8,5 +8,5 @@ resolvers += Resolver.bintrayIvyRepo(
addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.0")

addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0")
addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.9.10")
addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.11.3")
addSbtPlugin("com.jsuereth" % "sbt-pgp" % "2.0.1")
