
Fix streaming issue
The RDD keys are not serializable, which can cause some RDD operations to fail.

We now create the RDD element keys _after_ repartitioning, so that they are never serialized across partitions.

This change allowed me to successfully run a data migration with stream changes enabled. Such a scenario cannot be added to our test suite, though, because KCL only works with the real AWS servers (see scylladb#113).
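
For illustration, here is a minimal self-contained sketch of the failure mode and the fix. All the data is a hypothetical stand-in; only the use of Hadoop's Text keys and the repartition-then-key ordering reflect the actual change:

import org.apache.hadoop.io.Text
import org.apache.spark.sql.SparkSession

// Sketch: org.apache.hadoop.io.Text implements Writable but not
// java.io.Serializable, so shuffling (Text, value) pairs with Spark's
// default Java serializer throws a NotSerializableException.
object KeyAfterRepartitionSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("sketch").getOrCreate()
    val sc    = spark.sparkContext

    val items = sc.parallelize(Seq("a", "b", "c"))

    // Fails at shuffle time: the non-serializable Text keys would have to
    // cross partition boundaries.
    // items.map(item => (new Text(), item)).repartition(4).count()

    // Works: only the serializable values are shuffled; the keys are created
    // locally inside each partition after the shuffle.
    items.repartition(4).map(item => (new Text(), item)).count()

    spark.stop()
  }
}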
julienrf committed Mar 13, 2024
1 parent 1db4c85 commit afcbfd8
Showing 2 changed files with 32 additions and 18 deletions.
14 changes: 12 additions & 2 deletions migrator/src/main/scala/com/scylladb/migrator/DynamoUtils.scala
@@ -64,7 +64,8 @@ object DynamoUtils {
 
   def enableDynamoStream(source: SourceSettings.DynamoDB): Unit = {
     val sourceClient = buildDynamoClient(source.endpoint, source.credentials, source.region)
-    val sourceStreamsClient = buildDynamoStreamsClient(source.credentials, source.region)
+    val sourceStreamsClient =
+      buildDynamoStreamsClient(source.endpoint, source.credentials, source.region)
 
     sourceClient
       .updateTable(
@@ -114,9 +115,18 @@ object DynamoUtils {
     builder.build()
   }
 
-  def buildDynamoStreamsClient(creds: Option[AWSCredentialsProvider], region: Option[String]) = {
+  def buildDynamoStreamsClient(endpoint: Option[DynamoDBEndpoint],
+                               creds: Option[AWSCredentialsProvider],
+                               region: Option[String]) = {
     val builder = AmazonDynamoDBStreamsClientBuilder.standard()
 
+    endpoint.foreach { endpoint =>
+      builder
+        .withEndpointConfiguration(
+          new AwsClientBuilder.EndpointConfiguration(
+            endpoint.renderEndpoint,
+            region.getOrElse("empty")))
+    }
     creds.foreach(builder.withCredentials)
     region.foreach(builder.withRegion)
 
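For reference, this is the AWS SDK v1 pattern that buildDynamoStreamsClient now relies on, shown as a minimal sketch; the endpoint URL and signing region below are illustrative, not values from the migrator's configuration:

import com.amazonaws.client.builder.AwsClientBuilder
import com.amazonaws.services.dynamodbv2.AmazonDynamoDBStreamsClientBuilder

// Sketch: an explicit endpoint configuration replaces region-based endpoint
// resolution, which is what lets the streams client target a
// DynamoDB-compatible server such as DynamoDB Local. The second argument of
// EndpointConfiguration is used only as the signing region.
object StreamsClientSketch {
  def main(args: Array[String]): Unit = {
    val streamsClient = AmazonDynamoDBStreamsClientBuilder
      .standard()
      .withEndpointConfiguration(
        new AwsClientBuilder.EndpointConfiguration("http://localhost:8000", "us-east-1"))
      .build()
    println(streamsClient)
  }
}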
@@ -66,26 +66,30 @@ object DynamoStreamReplication {
         }
         .foreachRDD { msgs =>
           val rdd = msgs
-            .collect {
-              case Some(item) => (new Text(), new DynamoDBItemWritable(item))
-            }
+            .collect { case Some(item) => new DynamoDBItemWritable(item) }
             .repartition(Runtime.getRuntime.availableProcessors() * 2)
+            .map(item => (new Text, item)) // Create the key after repartitioning to avoid Serialization issues
 
-          log.info("Changes to be applied:")
-          rdd
-            .map(_._2) // Remove keys because they are not serializable
-            .groupBy { itemWritable =>
-              itemWritable.getItem.get(operationTypeColumn) match {
-                case `putOperation`    => "UPSERT"
-                case `deleteOperation` => "DELETE"
-                case _                 => "UNKNOWN"
-              }
-            }
-            .mapValues(_.size)
-            .foreach {
-              case (operation, count) =>
-                log.info(s"${operation}: ${count}")
-            }
+          val changes =
+            rdd
+              .map(_._2) // Remove keys because they are not serializable
+              .groupBy { itemWritable =>
+                itemWritable.getItem.get(operationTypeColumn) match {
+                  case `putOperation`    => "UPSERT"
+                  case `deleteOperation` => "DELETE"
+                  case _                 => "UNKNOWN"
+                }
+              }
+              .mapValues(_.size)
+              .collect()
+          if (changes.nonEmpty) {
+            log.info("Changes to be applied:")
+            for ((operation, count) <- changes) {
+              log.info(s"${operation}: ${count}")
+            }
+          } else {
+            log.info("No changes to apply")
+          }
 
           DynamoDB.writeRDD(target, renames, rdd, Some(targetTableDesc))(spark)
         }
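A side note on the logging change above: aggregating the per-operation counts and collecting them before logging moves the log calls from the executors to the driver (an executor-side rdd.foreach would write to the worker logs instead). Below is a minimal sketch of that pattern, with hypothetical stand-in data:

import org.apache.spark.sql.SparkSession

// Sketch: aggregate on the executors, collect the small per-operation
// summary, and log it on the driver.
object ChangeSummarySketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("sketch").getOrCreate()
    val ops   = spark.sparkContext.parallelize(Seq("UPSERT", "UPSERT", "DELETE"))

    val changes = ops.groupBy(identity).mapValues(_.size).collect()

    if (changes.nonEmpty) {
      for ((operation, count) <- changes) println(s"$operation: $count")
    } else {
      println("No changes to apply")
    }

    spark.stop()
  }
}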
