From 8b2bdabe760d3bd1e9e9cd7610f1585f3e6f4d0b Mon Sep 17 00:00:00 2001
From: Julien Richard-Foy <julien@richard-foy.fr>
Date: Tue, 16 Apr 2024 10:32:54 +0200
Subject: [PATCH] Copy content of `KinesisReceiver` implementation from Spark
 original repository at the following point in time (`branch-2.4`):

https://github.com/apache/spark/blob/4be566062defa249435c4d72eb106fe7b933e023/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala

This commit can be used as a point of reference to easily see the changes we applied to the original implementation.
---
 .../kinesis/KinesisCheckpointer.scala         | 146 +++++++
 .../streaming/kinesis/KinesisReceiver.scala   | 366 ++++++++++++++++++
 .../kinesis/KinesisRecordProcessor.scala      | 186 +++++++++
 3 files changed, 698 insertions(+)
 create mode 100644 spark-kinesis-dynamodb/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointer.scala
 create mode 100644 spark-kinesis-dynamodb/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala
 create mode 100644 spark-kinesis-dynamodb/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala

diff --git a/spark-kinesis-dynamodb/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointer.scala b/spark-kinesis-dynamodb/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointer.scala
new file mode 100644
index 00000000..11e94953
--- /dev/null
+++ b/spark-kinesis-dynamodb/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointer.scala
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.streaming.kinesis
+
+import java.util.concurrent._
+
+import scala.util.control.NonFatal
+
+import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessorCheckpointer
+import com.amazonaws.services.kinesis.clientlibrary.lib.worker.ShutdownReason
+
+import org.apache.spark.internal.Logging
+import org.apache.spark.streaming.Duration
+import org.apache.spark.streaming.util.RecurringTimer
+import org.apache.spark.util.{Clock, SystemClock}
+
+/**
+ * This is a helper class for managing Kinesis checkpointing.
+ *
+ * @param receiver The receiver that keeps track of which sequence numbers we can checkpoint
+ * @param checkpointInterval How frequently we will checkpoint to DynamoDB
+ * @param workerId Worker Id of KCL worker for logging purposes
+ * @param clock In order to use ManualClocks for the purpose of testing
+ */
+private[kinesis] class KinesisCheckpointer(
+    receiver: KinesisReceiver[_],
+    checkpointInterval: Duration,
+    workerId: String,
+    clock: Clock = new SystemClock) extends Logging {
+
+  // a map from shardId's to checkpointers
+  private val checkpointers = new ConcurrentHashMap[String, IRecordProcessorCheckpointer]()
+
+  private val lastCheckpointedSeqNums = new ConcurrentHashMap[String, String]()
+
+  private val checkpointerThread: RecurringTimer = startCheckpointerThread()
+
+  /** Update the checkpointer instance to the most recent one for the given shardId. */
+  def setCheckpointer(shardId: String, checkpointer: IRecordProcessorCheckpointer): Unit = {
+    checkpointers.put(shardId, checkpointer)
+  }
+
+  /**
+   * Stop tracking the specified shardId.
+   *
+   * If a checkpointer is provided, e.g. on IRecordProcessor.shutdown [[ShutdownReason.TERMINATE]],
+   * we will use that to make the final checkpoint. If `null` is provided, we will not make the
+   * checkpoint, e.g. in case of [[ShutdownReason.ZOMBIE]].
+   */
+  def removeCheckpointer(shardId: String, checkpointer: IRecordProcessorCheckpointer): Unit = {
+    synchronized {
+      checkpointers.remove(shardId)
+    }
+    if (checkpointer != null) {
+      try {
+        // We must call `checkpoint()` with no parameter to finish reading shards.
+        // See a URL below for details:
+        // https://forums.aws.amazon.com/thread.jspa?threadID=244218
+        KinesisRecordProcessor.retryRandom(checkpointer.checkpoint(), 4, 100)
+      } catch {
+        case NonFatal(e) =>
+          logError(s"Exception:  WorkerId $workerId encountered an exception while checkpointing" +
+            s"to finish reading a shard of $shardId.", e)
+          // Rethrow the exception to the Kinesis Worker that is managing this RecordProcessor
+          throw e
+      }
+    }
+  }
+
+  /** Perform the checkpoint. */
+  private def checkpoint(shardId: String, checkpointer: IRecordProcessorCheckpointer): Unit = {
+    try {
+      if (checkpointer != null) {
+        receiver.getLatestSeqNumToCheckpoint(shardId).foreach { latestSeqNum =>
+          val lastSeqNum = lastCheckpointedSeqNums.get(shardId)
+          // Kinesis sequence numbers are monotonically increasing strings, therefore we can do
+          // safely do the string comparison
+          if (lastSeqNum == null || latestSeqNum > lastSeqNum) {
+            /* Perform the checkpoint */
+            KinesisRecordProcessor.retryRandom(checkpointer.checkpoint(latestSeqNum), 4, 100)
+            logDebug(s"Checkpoint:  WorkerId $workerId completed checkpoint at sequence number" +
+              s" $latestSeqNum for shardId $shardId")
+            lastCheckpointedSeqNums.put(shardId, latestSeqNum)
+          }
+        }
+      } else {
+        logDebug(s"Checkpointing skipped for shardId $shardId. Checkpointer not set.")
+      }
+    } catch {
+      case NonFatal(e) =>
+        logWarning(s"Failed to checkpoint shardId $shardId to DynamoDB.", e)
+    }
+  }
+
+  /** Checkpoint the latest saved sequence numbers for all active shardId's. */
+  private def checkpointAll(): Unit = synchronized {
+    // if this method throws an exception, then the scheduled task will not run again
+    try {
+      val shardIds = checkpointers.keys()
+      while (shardIds.hasMoreElements) {
+        val shardId = shardIds.nextElement()
+        checkpoint(shardId, checkpointers.get(shardId))
+      }
+    } catch {
+      case NonFatal(e) =>
+        logWarning("Failed to checkpoint to DynamoDB.", e)
+    }
+  }
+
+  /**
+   * Start the checkpointer thread with the given checkpoint duration.
+   */
+  private def startCheckpointerThread(): RecurringTimer = {
+    val period = checkpointInterval.milliseconds
+    val threadName = s"Kinesis Checkpointer - Worker $workerId"
+    val timer = new RecurringTimer(clock, period, _ => checkpointAll(), threadName)
+    timer.start()
+    logDebug(s"Started checkpointer thread: $threadName")
+    timer
+  }
+
+  /**
+   * Shutdown the checkpointer. Should be called on the onStop of the Receiver.
+   */
+  def shutdown(): Unit = {
+    // the recurring timer checkpoints for us one last time.
+    checkpointerThread.stop(interruptTimer = false)
+    checkpointers.clear()
+    lastCheckpointedSeqNums.clear()
+    logInfo("Successfully shutdown Kinesis Checkpointer.")
+  }
+}
diff --git a/spark-kinesis-dynamodb/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala b/spark-kinesis-dynamodb/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala
new file mode 100644
index 00000000..69c52365
--- /dev/null
+++ b/spark-kinesis-dynamodb/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala
@@ -0,0 +1,366 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.streaming.kinesis
+
+import java.util.UUID
+import java.util.concurrent.ConcurrentHashMap
+
+import scala.collection.JavaConverters._
+import scala.collection.mutable
+import scala.util.control.NonFatal
+
+import com.amazonaws.services.kinesis.clientlibrary.interfaces.{IRecordProcessor, IRecordProcessorCheckpointer, IRecordProcessorFactory}
+import com.amazonaws.services.kinesis.clientlibrary.lib.worker.{KinesisClientLibConfiguration, Worker}
+import com.amazonaws.services.kinesis.model.Record
+
+import org.apache.spark.internal.Logging
+import org.apache.spark.storage.{StorageLevel, StreamBlockId}
+import org.apache.spark.streaming.Duration
+import org.apache.spark.streaming.kinesis.KinesisInitialPositions.AtTimestamp
+import org.apache.spark.streaming.receiver.{BlockGenerator, BlockGeneratorListener, Receiver}
+import org.apache.spark.util.Utils
+
+/**
+ * Custom AWS Kinesis-specific implementation of Spark Streaming's Receiver.
+ * This implementation relies on the Kinesis Client Library (KCL) Worker as described here:
+ * https://github.com/awslabs/amazon-kinesis-client
+ *
+ * The way this Receiver works is as follows:
+ *
+ *  - The receiver starts a KCL Worker, which is essentially runs a threadpool of multiple
+ *    KinesisRecordProcessor
+ *  - Each KinesisRecordProcessor receives data from a Kinesis shard in batches. Each batch is
+ *    inserted into a Block Generator, and the corresponding range of sequence numbers is recorded.
+ *  - When the block generator defines a block, then the recorded sequence number ranges that were
+ *    inserted into the block are recorded separately for being used later.
+ *  - When the block is ready to be pushed, the block is pushed and the ranges are reported as
+ *    metadata of the block. In addition, the ranges are used to find out the latest sequence
+ *    number for each shard that can be checkpointed through the DynamoDB.
+ *  - Periodically, each KinesisRecordProcessor checkpoints the latest successfully stored sequence
+ *    number for it own shard.
+ *
+ * @param streamName   Kinesis stream name
+ * @param endpointUrl  Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com)
+ * @param regionName  Region name used by the Kinesis Client Library for
+ *                    DynamoDB (lease coordination and checkpointing) and CloudWatch (metrics)
+ * @param initialPosition  Instance of [[KinesisInitialPosition]]
+ *                         In the absence of Kinesis checkpoint info, this is the
+ *                         worker's initial starting position in the stream.
+ *                         The values are either the beginning of the stream
+ *                         per Kinesis' limit of 24 hours
+ *                         ([[KinesisInitialPositions.TrimHorizon]]) or
+ *                         the tip of the stream ([[KinesisInitialPositions.Latest]]).
+ * @param checkpointAppName  Kinesis application name. Kinesis Apps are mapped to Kinesis Streams
+ *                 by the Kinesis Client Library.  If you change the App name or Stream name,
+ *                 the KCL will throw errors.  This usually requires deleting the backing
+ *                 DynamoDB table with the same name this Kinesis application.
+ * @param checkpointInterval  Checkpoint interval for Kinesis checkpointing.
+ *                            See the Kinesis Spark Streaming documentation for more
+ *                            details on the different types of checkpoints.
+ * @param storageLevel Storage level to use for storing the received objects
+ * @param kinesisCreds SparkAWSCredentials instance that will be used to generate the
+ *                     AWSCredentialsProvider passed to the KCL to authorize Kinesis API calls.
+ * @param cloudWatchCreds Optional SparkAWSCredentials instance that will be used to generate the
+ *                        AWSCredentialsProvider passed to the KCL to authorize CloudWatch API
+ *                        calls. Will use kinesisCreds if value is None.
+ * @param dynamoDBCreds Optional SparkAWSCredentials instance that will be used to generate the
+ *                      AWSCredentialsProvider passed to the KCL to authorize DynamoDB API calls.
+ *                      Will use kinesisCreds if value is None.
+ */
+private[kinesis] class KinesisReceiver[T](
+    val streamName: String,
+    endpointUrl: String,
+    regionName: String,
+    initialPosition: KinesisInitialPosition,
+    checkpointAppName: String,
+    checkpointInterval: Duration,
+    storageLevel: StorageLevel,
+    messageHandler: Record => T,
+    kinesisCreds: SparkAWSCredentials,
+    dynamoDBCreds: Option[SparkAWSCredentials],
+    cloudWatchCreds: Option[SparkAWSCredentials])
+  extends Receiver[T](storageLevel) with Logging { receiver =>
+
+  /*
+   * =================================================================================
+   * The following vars are initialize in the onStart() method which executes in the
+   * Spark worker after this Receiver is serialized and shipped to the worker.
+   * =================================================================================
+   */
+
+  /**
+   * workerId is used by the KCL should be based on the ip address of the actual Spark Worker
+   * where this code runs (not the driver's IP address.)
+   */
+  @volatile private var workerId: String = null
+
+  /**
+   * Worker is the core client abstraction from the Kinesis Client Library (KCL).
+   * A worker can process more than one shards from the given stream.
+   * Each shard is assigned its own IRecordProcessor and the worker run multiple such
+   * processors.
+   */
+  @volatile private var worker: Worker = null
+  @volatile private var workerThread: Thread = null
+
+  /** BlockGenerator used to generates blocks out of Kinesis data */
+  @volatile private var blockGenerator: BlockGenerator = null
+
+  /**
+   * Sequence number ranges added to the current block being generated.
+   * Accessing and updating of this map is synchronized by locks in BlockGenerator.
+   */
+  private val seqNumRangesInCurrentBlock = new mutable.ArrayBuffer[SequenceNumberRange]
+
+  /** Sequence number ranges of data added to each generated block */
+  private val blockIdToSeqNumRanges = new ConcurrentHashMap[StreamBlockId, SequenceNumberRanges]
+
+  /**
+   * The centralized kinesisCheckpointer that checkpoints based on the given checkpointInterval.
+   */
+  @volatile private var kinesisCheckpointer: KinesisCheckpointer = null
+
+  /**
+   * Latest sequence number ranges that have been stored successfully.
+   * This is used for checkpointing through KCL */
+  private val shardIdToLatestStoredSeqNum = new ConcurrentHashMap[String, String]
+
+  /**
+   * This is called when the KinesisReceiver starts and must be non-blocking.
+   * The KCL creates and manages the receiving/processing thread pool through Worker.run().
+   */
+  override def onStart() {
+    blockGenerator = supervisor.createBlockGenerator(new GeneratedBlockHandler)
+
+    workerId = Utils.localHostName() + ":" + UUID.randomUUID()
+
+    kinesisCheckpointer = new KinesisCheckpointer(receiver, checkpointInterval, workerId)
+    val kinesisProvider = kinesisCreds.provider
+
+    val kinesisClientLibConfiguration = {
+      val baseClientLibConfiguration = new KinesisClientLibConfiguration(
+        checkpointAppName,
+        streamName,
+        kinesisProvider,
+        dynamoDBCreds.map(_.provider).getOrElse(kinesisProvider),
+        cloudWatchCreds.map(_.provider).getOrElse(kinesisProvider),
+        workerId)
+        .withKinesisEndpoint(endpointUrl)
+        .withTaskBackoffTimeMillis(500)
+        .withRegionName(regionName)
+
+      // Update the Kinesis client lib config with timestamp
+      // if InitialPositionInStream.AT_TIMESTAMP is passed
+      initialPosition match {
+        case ts: AtTimestamp =>
+          baseClientLibConfiguration.withTimestampAtInitialPositionInStream(ts.getTimestamp)
+        case _ =>
+          baseClientLibConfiguration.withInitialPositionInStream(initialPosition.getPosition)
+      }
+    }
+
+   /*
+    *  RecordProcessorFactory creates impls of IRecordProcessor.
+    *  IRecordProcessor adapts the KCL to our Spark KinesisReceiver via the
+    *  IRecordProcessor.processRecords() method.
+    *  We're using our custom KinesisRecordProcessor in this case.
+    */
+    val recordProcessorFactory = new IRecordProcessorFactory {
+      override def createProcessor: IRecordProcessor =
+        new KinesisRecordProcessor(receiver, workerId)
+    }
+
+    worker = new Worker(recordProcessorFactory, kinesisClientLibConfiguration)
+    workerThread = new Thread() {
+      override def run(): Unit = {
+        try {
+          worker.run()
+        } catch {
+          case NonFatal(e) =>
+            restart("Error running the KCL worker in Receiver", e)
+        }
+      }
+    }
+
+    blockIdToSeqNumRanges.clear()
+    blockGenerator.start()
+
+    workerThread.setName(s"Kinesis Receiver ${streamId}")
+    workerThread.setDaemon(true)
+    workerThread.start()
+
+    logInfo(s"Started receiver with workerId $workerId")
+  }
+
+  /**
+   * This is called when the KinesisReceiver stops.
+   * The KCL worker.shutdown() method stops the receiving/processing threads.
+   * The KCL will do its best to drain and checkpoint any in-flight records upon shutdown.
+   */
+  override def onStop() {
+    if (workerThread != null) {
+      if (worker != null) {
+        worker.shutdown()
+        worker = null
+      }
+      workerThread.join()
+      workerThread = null
+      logInfo(s"Stopped receiver for workerId $workerId")
+    }
+    workerId = null
+    if (kinesisCheckpointer != null) {
+      kinesisCheckpointer.shutdown()
+      kinesisCheckpointer = null
+    }
+  }
+
+  /** Add records of the given shard to the current block being generated */
+  private[kinesis] def addRecords(shardId: String, records: java.util.List[Record]): Unit = {
+    if (records.size > 0) {
+      val dataIterator = records.iterator().asScala.map(messageHandler)
+      val metadata = SequenceNumberRange(streamName, shardId,
+        records.get(0).getSequenceNumber(), records.get(records.size() - 1).getSequenceNumber(),
+        records.size())
+      blockGenerator.addMultipleDataWithCallback(dataIterator, metadata)
+    }
+  }
+
+  /** Return the current rate limit defined in [[BlockGenerator]]. */
+  private[kinesis] def getCurrentLimit: Int = {
+    assert(blockGenerator != null)
+    math.min(blockGenerator.getCurrentLimit, Int.MaxValue).toInt
+  }
+
+  /** Get the latest sequence number for the given shard that can be checkpointed through KCL */
+  private[kinesis] def getLatestSeqNumToCheckpoint(shardId: String): Option[String] = {
+    Option(shardIdToLatestStoredSeqNum.get(shardId))
+  }
+
+  /**
+   * Set the checkpointer that will be used to checkpoint sequence numbers to DynamoDB for the
+   * given shardId.
+   */
+  def setCheckpointer(shardId: String, checkpointer: IRecordProcessorCheckpointer): Unit = {
+    assert(kinesisCheckpointer != null, "Kinesis Checkpointer not initialized!")
+    kinesisCheckpointer.setCheckpointer(shardId, checkpointer)
+  }
+
+  /**
+   * Remove the checkpointer for the given shardId. The provided checkpointer will be used to
+   * checkpoint one last time for the given shard. If `checkpointer` is `null`, then we will not
+   * checkpoint.
+   */
+  def removeCheckpointer(shardId: String, checkpointer: IRecordProcessorCheckpointer): Unit = {
+    assert(kinesisCheckpointer != null, "Kinesis Checkpointer not initialized!")
+    kinesisCheckpointer.removeCheckpointer(shardId, checkpointer)
+  }
+
+  /**
+   * Remember the range of sequence numbers that was added to the currently active block.
+   * Internally, this is synchronized with `finalizeRangesForCurrentBlock()`.
+   */
+  private def rememberAddedRange(range: SequenceNumberRange): Unit = {
+    seqNumRangesInCurrentBlock += range
+  }
+
+  /**
+   * Finalize the ranges added to the block that was active and prepare the ranges buffer
+   * for next block. Internally, this is synchronized with `rememberAddedRange()`.
+   */
+  private def finalizeRangesForCurrentBlock(blockId: StreamBlockId): Unit = {
+    blockIdToSeqNumRanges.put(blockId, SequenceNumberRanges(seqNumRangesInCurrentBlock.toArray))
+    seqNumRangesInCurrentBlock.clear()
+    logDebug(s"Generated block $blockId has $blockIdToSeqNumRanges")
+  }
+
+  /** Store the block along with its associated ranges */
+  private def storeBlockWithRanges(
+      blockId: StreamBlockId, arrayBuffer: mutable.ArrayBuffer[T]): Unit = {
+    val rangesToReportOption = Option(blockIdToSeqNumRanges.remove(blockId))
+    if (rangesToReportOption.isEmpty) {
+      stop("Error while storing block into Spark, could not find sequence number ranges " +
+        s"for block $blockId")
+      return
+    }
+
+    val rangesToReport = rangesToReportOption.get
+    var attempt = 0
+    var stored = false
+    var throwable: Throwable = null
+    while (!stored && attempt <= 3) {
+      try {
+        store(arrayBuffer, rangesToReport)
+        stored = true
+      } catch {
+        case NonFatal(th) =>
+          attempt += 1
+          throwable = th
+      }
+    }
+    if (!stored) {
+      stop("Error while storing block into Spark", throwable)
+    }
+
+    // Update the latest sequence number that have been successfully stored for each shard
+    // Note that we are doing this sequentially because the array of sequence number ranges
+    // is assumed to be
+    rangesToReport.ranges.foreach { range =>
+      shardIdToLatestStoredSeqNum.put(range.shardId, range.toSeqNumber)
+    }
+  }
+
+  /**
+   * Class to handle blocks generated by this receiver's block generator. Specifically, in
+   * the context of the Kinesis Receiver, this handler does the following.
+   *
+   * - When an array of records is added to the current active block in the block generator,
+   *   this handler keeps track of the corresponding sequence number range.
+   * - When the currently active block is ready to sealed (not more records), this handler
+   *   keep track of the list of ranges added into this block in another H
+   */
+  private class GeneratedBlockHandler extends BlockGeneratorListener {
+
+    /**
+     * Callback method called after a data item is added into the BlockGenerator.
+     * The data addition, block generation, and calls to onAddData and onGenerateBlock
+     * are all synchronized through the same lock.
+     */
+    def onAddData(data: Any, metadata: Any): Unit = {
+      rememberAddedRange(metadata.asInstanceOf[SequenceNumberRange])
+    }
+
+    /**
+     * Callback method called after a block has been generated.
+     * The data addition, block generation, and calls to onAddData and onGenerateBlock
+     * are all synchronized through the same lock.
+     */
+    def onGenerateBlock(blockId: StreamBlockId): Unit = {
+      finalizeRangesForCurrentBlock(blockId)
+    }
+
+    /** Callback method called when a block is ready to be pushed / stored. */
+    def onPushBlock(blockId: StreamBlockId, arrayBuffer: mutable.ArrayBuffer[_]): Unit = {
+      storeBlockWithRanges(blockId,
+        arrayBuffer.asInstanceOf[mutable.ArrayBuffer[T]])
+    }
+
+    /** Callback called in case of any error in internal of the BlockGenerator */
+    def onError(message: String, throwable: Throwable): Unit = {
+      reportError(message, throwable)
+    }
+  }
+}
diff --git a/spark-kinesis-dynamodb/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala b/spark-kinesis-dynamodb/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala
new file mode 100644
index 00000000..8c6a399d
--- /dev/null
+++ b/spark-kinesis-dynamodb/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala
@@ -0,0 +1,186 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.streaming.kinesis
+
+import java.util.List
+
+import scala.util.Random
+import scala.util.control.NonFatal
+
+import com.amazonaws.services.kinesis.clientlibrary.exceptions.{InvalidStateException, KinesisClientLibDependencyException, ShutdownException, ThrottlingException}
+import com.amazonaws.services.kinesis.clientlibrary.interfaces.{IRecordProcessor, IRecordProcessorCheckpointer}
+import com.amazonaws.services.kinesis.clientlibrary.lib.worker.ShutdownReason
+import com.amazonaws.services.kinesis.model.Record
+
+import org.apache.spark.internal.Logging
+
+/**
+ * Kinesis-specific implementation of the Kinesis Client Library (KCL) IRecordProcessor.
+ * This implementation operates on the Array[Byte] from the KinesisReceiver.
+ * The Kinesis Worker creates an instance of this KinesisRecordProcessor for each
+ * shard in the Kinesis stream upon startup.  This is normally done in separate threads,
+ * but the KCLs within the KinesisReceivers will balance themselves out if you create
+ * multiple Receivers.
+ *
+ * @param receiver Kinesis receiver
+ * @param workerId for logging purposes
+ */
+private[kinesis] class KinesisRecordProcessor[T](receiver: KinesisReceiver[T], workerId: String)
+  extends IRecordProcessor with Logging {
+
+  // shardId populated during initialize()
+  @volatile
+  private var shardId: String = _
+
+  /**
+   * The Kinesis Client Library calls this method during IRecordProcessor initialization.
+   *
+   * @param shardId assigned by the KCL to this particular RecordProcessor.
+   */
+  override def initialize(shardId: String) {
+    this.shardId = shardId
+    logInfo(s"Initialized workerId $workerId with shardId $shardId")
+  }
+
+  /**
+   * This method is called by the KCL when a batch of records is pulled from the Kinesis stream.
+   * This is the record-processing bridge between the KCL's IRecordProcessor.processRecords()
+   * and Spark Streaming's Receiver.store().
+   *
+   * @param batch list of records from the Kinesis stream shard
+   * @param checkpointer used to update Kinesis when this batch has been processed/stored
+   *   in the DStream
+   */
+  override def processRecords(batch: List[Record], checkpointer: IRecordProcessorCheckpointer) {
+    if (!receiver.isStopped()) {
+      try {
+        // Limit the number of processed records from Kinesis stream. This is because the KCL cannot
+        // control the number of aggregated records to be fetched even if we set `MaxRecords`
+        // in `KinesisClientLibConfiguration`. For example, if we set 10 to the number of max
+        // records in a worker and a producer aggregates two records into one message, the worker
+        // possibly 20 records every callback function called.
+        val maxRecords = receiver.getCurrentLimit
+        for (start <- 0 until batch.size by maxRecords) {
+          val miniBatch = batch.subList(start, math.min(start + maxRecords, batch.size))
+          receiver.addRecords(shardId, miniBatch)
+          logDebug(s"Stored: Worker $workerId stored ${miniBatch.size} records " +
+            s"for shardId $shardId")
+        }
+        receiver.setCheckpointer(shardId, checkpointer)
+      } catch {
+        case NonFatal(e) =>
+          /*
+           *  If there is a failure within the batch, the batch will not be checkpointed.
+           *  This will potentially cause records since the last checkpoint to be processed
+           *     more than once.
+           */
+          logError(s"Exception:  WorkerId $workerId encountered and exception while storing " +
+              s" or checkpointing a batch for workerId $workerId and shardId $shardId.", e)
+
+          /* Rethrow the exception to the Kinesis Worker that is managing this RecordProcessor. */
+          throw e
+      }
+    } else {
+      /* RecordProcessor has been stopped. */
+      logInfo(s"Stopped:  KinesisReceiver has stopped for workerId $workerId" +
+          s" and shardId $shardId.  No more records will be processed.")
+    }
+  }
+
+  /**
+   * Kinesis Client Library is shutting down this Worker for 1 of 2 reasons:
+   * 1) the stream is resharding by splitting or merging adjacent shards
+   *     (ShutdownReason.TERMINATE)
+   * 2) the failed or latent Worker has stopped sending heartbeats for whatever reason
+   *     (ShutdownReason.ZOMBIE)
+   *
+   * @param checkpointer used to perform a Kinesis checkpoint for ShutdownReason.TERMINATE
+   * @param reason for shutdown (ShutdownReason.TERMINATE or ShutdownReason.ZOMBIE)
+   */
+  override def shutdown(
+      checkpointer: IRecordProcessorCheckpointer,
+      reason: ShutdownReason): Unit = {
+    logInfo(s"Shutdown:  Shutting down workerId $workerId with reason $reason")
+    // null if not initialized before shutdown:
+    if (shardId == null) {
+      logWarning(s"No shardId for workerId $workerId?")
+    } else {
+      reason match {
+        /*
+         * TERMINATE Use Case.  Checkpoint.
+         * Checkpoint to indicate that all records from the shard have been drained and processed.
+         * It's now OK to read from the new shards that resulted from a resharding event.
+         */
+        case ShutdownReason.TERMINATE => receiver.removeCheckpointer(shardId, checkpointer)
+
+        /*
+         * ZOMBIE Use Case or Unknown reason.  NoOp.
+         * No checkpoint because other workers may have taken over and already started processing
+         *    the same records.
+         * This may lead to records being processed more than once.
+         * Return null so that we don't checkpoint
+         */
+        case _ => receiver.removeCheckpointer(shardId, null)
+      }
+    }
+  }
+}
+
+private[kinesis] object KinesisRecordProcessor extends Logging {
+  /**
+   * Retry the given amount of times with a random backoff time (millis) less than the
+   *   given maxBackOffMillis
+   *
+   * @param expression expression to evaluate
+   * @param numRetriesLeft number of retries left
+   * @param maxBackOffMillis: max millis between retries
+   *
+   * @return evaluation of the given expression
+   * @throws Unretryable exception, unexpected exception,
+   *  or any exception that persists after numRetriesLeft reaches 0
+   */
+  @annotation.tailrec
+  def retryRandom[T](expression: => T, numRetriesLeft: Int, maxBackOffMillis: Int): T = {
+    util.Try { expression } match {
+      /* If the function succeeded, evaluate to x. */
+      case util.Success(x) => x
+      /* If the function failed, either retry or throw the exception */
+      case util.Failure(e) => e match {
+        /* Retry:  Throttling or other Retryable exception has occurred */
+        case _: ThrottlingException | _: KinesisClientLibDependencyException
+            if numRetriesLeft > 1 =>
+          val backOffMillis = Random.nextInt(maxBackOffMillis)
+          Thread.sleep(backOffMillis)
+          logError(s"Retryable Exception:  Random backOffMillis=${backOffMillis}", e)
+          retryRandom(expression, numRetriesLeft - 1, maxBackOffMillis)
+        /* Throw:  Shutdown has been requested by the Kinesis Client Library. */
+        case _: ShutdownException =>
+          logError(s"ShutdownException:  Caught shutdown exception, skipping checkpoint.", e)
+          throw e
+        /* Throw:  Non-retryable exception has occurred with the Kinesis Client Library */
+        case _: InvalidStateException =>
+          logError(s"InvalidStateException:  Cannot save checkpoint to the DynamoDB table used" +
+              s" by the Amazon Kinesis Client Library.  Table likely doesn't exist.", e)
+          throw e
+        /* Throw:  Unexpected exception has occurred */
+        case _ =>
+          logError(s"Unexpected, non-retryable exception.", e)
+          throw e
+      }
+    }
+  }
+}