KAFKA-17002: Integrated partition leader epoch for Persister APIs (KIP-932) #16842

Merged: 23 commits, Oct 30, 2024
Changes from 15 of 23 commits

Commits
b31e4e4
KAFKA-17002: Integrated partition leader epoch for Persister APIs (KI…
apoorvmittal10 Aug 8, 2024
44d537c
Merge remote-tracking branch 'upstream/trunk' into KAFKA-17002
apoorvmittal10 Aug 23, 2024
8030ae8
Merge remote-tracking branch 'upstream/trunk' into KAFKA-17002
Aug 28, 2024
4e1598d
Merge remote-tracking branch 'upstream/trunk' into KAFKA-17002
apoorvmittal10 Oct 15, 2024
a0ee7a6
Build fixes
apoorvmittal10 Oct 16, 2024
4b4b226
Handling leader epoch changes
apoorvmittal10 Oct 17, 2024
d8e38c4
Merge remote-tracking branch 'upstream/trunk' into KAFKA-17002
apoorvmittal10 Oct 17, 2024
57f3b99
Merge remote-tracking branch 'upstream/trunk' into KAFKA-17002
apoorvmittal10 Oct 21, 2024
2694ee2
Merge remote-tracking branch 'upstream/trunk' into KAFKA-17002
apoorvmittal10 Oct 22, 2024
1103acb
Throwing and handling exception
apoorvmittal10 Oct 22, 2024
f08f14e
Additional fixes
apoorvmittal10 Oct 22, 2024
a6cb2de
Handling fetch exception
apoorvmittal10 Oct 22, 2024
c91fb93
Simplifying remove
apoorvmittal10 Oct 22, 2024
edd56a9
Correcting log line
apoorvmittal10 Oct 22, 2024
b27c805
Merge remote-tracking branch 'upstream/trunk' into KAFKA-17002
apoorvmittal10 Oct 22, 2024
d8bda03
Merge remote-tracking branch 'upstream/trunk' into KAFKA-17002
apoorvmittal10 Oct 24, 2024
24a8098
Changed method name, moved partition fetch to utils
apoorvmittal10 Oct 24, 2024
203d685
Merge remote-tracking branch 'upstream/trunk' into KAFKA-17002
apoorvmittal10 Oct 26, 2024
a1c3dc8
Merge remote-tracking branch 'upstream/trunk' into KAFKA-17002
apoorvmittal10 Oct 26, 2024
99ba4a7
Handling partition exceptions
apoorvmittal10 Oct 28, 2024
ceb5720
Merge remote-tracking branch 'upstream/trunk' into KAFKA-17002
apoorvmittal10 Oct 29, 2024
8e61c98
Addressing review comments
apoorvmittal10 Oct 30, 2024
f6145f8
Merge remote-tracking branch 'upstream/trunk' into KAFKA-17002
apoorvmittal10 Oct 30, 2024
@@ -127,7 +127,7 @@ public void onComplete() {
shareFetchData.future().complete(result);
} catch (Exception e) {
log.error("Error processing delayed share fetch request", e);
shareFetchData.future().completeExceptionally(e);
sharePartitionManager.handleFetchException(shareFetchData.groupId(), topicPartitionData.keySet(), shareFetchData.future(), e);
Contributor:

Typically, the error code is returned in responseLogResult. So we need to handle the error there too.

Collaborator Author:

As we are going through a massive refactor of delayed share fetch with the min-bytes PR, and the handling of this error will have quite a few possible cases of its own, I have created a JIRA to take it up in a follow-up PR: https://issues.apache.org/jira/browse/KAFKA-17887

Collaborator Author:

@junrao I think the error from LogReadResult is already handled, as we set the respective partition-level error while parsing the response in the ShareFetchUtils.processFetchResponse method. Am I missing something?

Contributor:

Yes, it seems this is no longer an issue.

Collaborator Author:

Thanks for confirming @junrao, I have closed the ticket.
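
(For context on the partition-level error handling referred to above: the idea is that a read error for one partition is mapped onto that partition's entry in the share fetch response rather than failing the whole request. The small sketch below illustrates this; the helper name and the readError parameter are illustrative, not the actual ShareFetchUtils code.)

import org.apache.kafka.common.TopicIdPartition;
import org.apache.kafka.common.message.ShareFetchResponseData.PartitionData;
import org.apache.kafka.common.protocol.Errors;

final class PartitionErrorSketch {
    // Illustrative only: build the response entry for one partition, surfacing a
    // per-partition read error instead of failing the whole share fetch.
    static PartitionData toPartitionData(TopicIdPartition tp, Errors readError) {
        PartitionData partitionData = new PartitionData().setPartitionIndex(tp.partition());
        if (readError != Errors.NONE) {
            partitionData.setErrorCode(readError.code()).setErrorMessage(readError.message());
        }
        return partitionData;
    }
}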

} finally {
// Releasing the lock to move ahead with the next request in queue.
releasePartitionLocks(shareFetchData.groupId(), topicPartitionData.keySet());
11 changes: 9 additions & 2 deletions core/src/main/java/kafka/server/share/SharePartition.java
@@ -180,6 +180,11 @@ public static RecordState forId(byte id) {
*/
private final TopicIdPartition topicIdPartition;

/**
* The leader epoch is used to track the partition epoch.
*/
private final int leaderEpoch;
Contributor:

Making this final implies the SharePartition is now scoped to the lifetime of a partition's leader epoch. Since SPs are managed by the node which is the leader for that partition, I guess this is already the case (and not really a problem). We normally expect the leader to move when the leader epoch increases, but I'm not sure if that's always the case.

Hypothetically, if a leader epoch increased but the leader did not move, would it be possible to reuse the SharePartition state? Or would we need to re-load its state from the persister anyways?

Collaborator Author:

Yeah, it's for the share partition's lifetime. We mark the SharePartition fenced and unusable if an error occurs, which means the state has to be re-loaded from the persister before it can be used again.


/**
* The in-flight record is used to track the state of a record that has been fetched from the
* leader. The state of the record is used to determine if the record should be re-fetched or if it
@@ -279,6 +284,7 @@ public static RecordState forId(byte id) {
SharePartition(
String groupId,
TopicIdPartition topicIdPartition,
int leaderEpoch,
int maxInFlightMessages,
int maxDeliveryCount,
int defaultRecordLockDurationMs,
@@ -290,6 +296,7 @@
) {
this.groupId = groupId;
this.topicIdPartition = topicIdPartition;
this.leaderEpoch = leaderEpoch;
this.maxInFlightMessages = maxInFlightMessages;
this.maxDeliveryCount = maxDeliveryCount;
this.cachedState = new ConcurrentSkipListMap<>();
@@ -340,7 +347,7 @@ public CompletableFuture<Void> maybeInitialize() {
.setGroupTopicPartitionData(new GroupTopicPartitionData.Builder<PartitionIdLeaderEpochData>()
.setGroupId(this.groupId)
.setTopicsData(Collections.singletonList(new TopicData<>(topicIdPartition.topicId(),
Collections.singletonList(PartitionFactory.newPartitionIdLeaderEpochData(topicIdPartition.partition(), 0)))))
Collections.singletonList(PartitionFactory.newPartitionIdLeaderEpochData(topicIdPartition.partition(), leaderEpoch)))))
.build())
.build()
).whenComplete((result, exception) -> {
@@ -1666,7 +1673,7 @@ CompletableFuture<Void> writeShareGroupState(List<PersisterStateBatch> stateBatc
.setGroupId(this.groupId)
.setTopicsData(Collections.singletonList(new TopicData<>(topicIdPartition.topicId(),
Collections.singletonList(PartitionFactory.newPartitionStateBatchData(
topicIdPartition.partition(), stateEpoch, startOffset, 0, stateBatches))))
topicIdPartition.partition(), stateEpoch, startOffset, leaderEpoch, stateBatches))))
Contributor:

If we get an error like UNKNOWN_TOPIC_OR_PARTITION or FENCED_STATE_EPOCH, should we remove the sharePartition too?

Collaborator Author (@apoorvmittal10, Oct 29, 2024):

I have handled it.

).build()).build())
.whenComplete((result, exception) -> {
if (exception != null) {
96 changes: 80 additions & 16 deletions core/src/main/java/kafka/server/share/SharePartitionManager.java
@@ -16,6 +16,7 @@
*/
package kafka.server.share;

import kafka.cluster.Partition;
import kafka.server.ReplicaManager;

import org.apache.kafka.clients.consumer.AcknowledgeType;
@@ -69,8 +70,10 @@
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
import java.util.stream.Collectors;

import scala.jdk.javaapi.CollectionConverters;
import scala.util.Either;

/**
* The SharePartitionManager is responsible for managing the SharePartitions and ShareSessions.
@@ -260,11 +263,13 @@ public CompletableFuture<Map<TopicIdPartition, ShareAcknowledgeResponseData.Part
this.shareGroupMetrics.shareAcknowledgement();
Map<TopicIdPartition, CompletableFuture<Errors>> futures = new HashMap<>();
acknowledgeTopics.forEach((topicIdPartition, acknowledgePartitionBatches) -> {
SharePartition sharePartition = partitionCacheMap.get(sharePartitionKey(groupId, topicIdPartition));
SharePartitionKey sharePartitionKey = sharePartitionKey(groupId, topicIdPartition);
SharePartition sharePartition = partitionCacheMap.get(sharePartitionKey);
if (sharePartition != null) {
CompletableFuture<Errors> future = new CompletableFuture<>();
sharePartition.acknowledge(memberId, acknowledgePartitionBatches).whenComplete((result, throwable) -> {
if (throwable != null) {
handleSharePartitionException(sharePartitionKey, throwable);
Contributor:

To be consistent, we want to add the same logic for shareFetch too.

To do this, we need to extend FetchParams such that fetchOnlyLeader() is true for share fetch and handle NotLeaderOrFollowerException accordingly.

Collaborator Author:

Done.
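
(The KafkaApis.scala hunk at the end of this diff passes the extra boolean into FetchParams for exactly this purpose. As a rough sketch of the intent — the surrounding helper and field names are approximations, not the verbatim FetchParams source:)

// Approximate sketch only; field and helper names are assumptions, not the verbatim Kafka source.
public boolean fetchOnlyLeader() {
    // Share fetch requests must always be served by the partition leader, so the new
    // shareFetchRequest flag forces leader-only reads in addition to the existing rules.
    return isFromFollower() || (isFromConsumer() && clientMetadata.isEmpty()) || shareFetchRequest;
}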

future.complete(Errors.forException(throwable));
return;
}
@@ -328,14 +333,16 @@ public CompletableFuture<Map<TopicIdPartition, ShareAcknowledgeResponseData.Part

Map<TopicIdPartition, CompletableFuture<Errors>> futuresMap = new HashMap<>();
topicIdPartitions.forEach(topicIdPartition -> {
SharePartition sharePartition = partitionCacheMap.get(sharePartitionKey(groupId, topicIdPartition));
SharePartitionKey sharePartitionKey = sharePartitionKey(groupId, topicIdPartition);
SharePartition sharePartition = partitionCacheMap.get(sharePartitionKey);
if (sharePartition == null) {
log.error("No share partition found for groupId {} topicPartition {} while releasing acquired topic partitions", groupId, topicIdPartition);
futuresMap.put(topicIdPartition, CompletableFuture.completedFuture(Errors.UNKNOWN_TOPIC_OR_PARTITION));
} else {
CompletableFuture<Errors> future = new CompletableFuture<>();
sharePartition.releaseAcquiredRecords(memberId).whenComplete((result, throwable) -> {
if (throwable != null) {
handleSharePartitionException(sharePartitionKey, throwable);
future.complete(Errors.forException(throwable));
return;
}
@@ -479,6 +486,30 @@ public void acknowledgeSessionUpdate(String groupId, ShareRequestMetadata reqMet
}
}

/**
* The handleFetchException method is used to handle the exception that occurred while reading from log.
* The method will handle the exception for each topic-partition in the request. The share partition
* might get removed from the cache.
* <p>
* The replica read request might error out for one share partition
* but as we cannot determine which share partition errored out, we might remove all the share partitions
* in the request.
*
* @param groupId The group id in the share fetch request.
* @param topicIdPartitions The topic-partitions in the replica read request.
* @param future The future to complete with the exception.
* @param throwable The exception that occurred while fetching messages.
*/
public void handleFetchException(
String groupId,
Set<TopicIdPartition> topicIdPartitions,
CompletableFuture<Map<TopicIdPartition, PartitionData>> future,
Throwable throwable
) {
topicIdPartitions.forEach(topicIdPartition -> handleSharePartitionException(sharePartitionKey(groupId, topicIdPartition), throwable));
maybeCompleteShareFetchExceptionally(future, topicIdPartitions, throwable);
}

/**
* The cachedTopicIdPartitionsInShareSession method is used to get the cached topic-partitions in the share session.
*
@@ -543,7 +574,7 @@ void processShareFetch(ShareFetchData shareFetchData) {
// TopicPartitionData list will be populated only if the share partition is already initialized.
sharePartition.maybeInitialize().whenComplete((result, throwable) -> {
if (throwable != null) {
maybeCompleteInitializationWithException(sharePartitionKey, shareFetchData.future(), throwable);
maybeCompleteInitializationWithException(sharePartitionKey, shareFetchData.future(), topicIdPartition, throwable);
}
});
});
@@ -565,21 +596,24 @@ void processShareFetch(ShareFetchData shareFetchData) {
addDelayedShareFetch(new DelayedShareFetch(shareFetchData, replicaManager, this),
delayedShareFetchWatchKeys);
} catch (Exception e) {
// In case exception occurs then release the locks so queue can be further processed.
log.error("Error processing fetch queue for share partitions", e);
if (!shareFetchData.future().isDone()) {
shareFetchData.future().completeExceptionally(e);
}
// Complete the whole fetch request with an exception if there is an error processing.
// The exception currently can be thrown only if there is an error while initializing
// the share partition. But skip the processing for other share partitions in the request
// as this situation is not expected.
log.error("Error processing share fetch request", e);
maybeCompleteShareFetchExceptionally(shareFetchData.future(), shareFetchData.partitionMaxBytes().keySet(), e);
}
}

private SharePartition getOrCreateSharePartition(SharePartitionKey sharePartitionKey) {
return partitionCacheMap.computeIfAbsent(sharePartitionKey,
k -> {
long start = time.hiResClockMs();
int leaderEpoch = leaderEpoch(sharePartitionKey.topicIdPartition().topicPartition());
SharePartition partition = new SharePartition(
sharePartitionKey.groupId(),
sharePartitionKey.topicIdPartition(),
leaderEpoch,
Contributor:

ShareFetchUtils.leaderEpoch can return exceptions like NOT_LEADER_OR_FOLLOWER and UNKNOWN_TOPIC_OR_PARTITION. Should we handle that at the partition level?

Collaborator Author:

Handled at the partition level. But I have added a couple of TODOs in the PR to take them up in follow-ups, as this PR is getting bigger. I am of the opinion that we can take the changes incrementally.

maxInFlightMessages,
maxDeliveryCount,
defaultRecordLockDurationMs,
@@ -597,6 +631,7 @@ private SharePartition getOrCreateSharePartition(SharePartitio
private void maybeCompleteInitializationWithException(
SharePartitionKey sharePartitionKey,
CompletableFuture<Map<TopicIdPartition, PartitionData>> future,
TopicIdPartition topicIdPartition,
Contributor:

Do we need this since it's part of SharePartitionKey?

Collaborator Author:

Thanks, removed.

Throwable throwable) {
if (throwable instanceof LeaderNotAvailableException) {
log.debug("The share partition with key {} is not initialized yet", sharePartitionKey);
@@ -606,22 +641,51 @@ private void maybeCompleteInitializationWithException(
return;
}

// Remove the partition from the cache as it's failed to initialize.
Contributor:

At the beginning of this method, we check for LeaderNotAvailableException. When do we get that exception? ReadShareGroupStateResponse doesn't seem to have that error.

Collaborator Author:

LeaderNotAvailableException is an internal exception from SharePartition to SharePartitionManager which can occur only while the SharePartition is still in the process of initializing and not yet complete. Hence, for that period we just keep the requests in purgatory.
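
(Pulling the two halves of this hunk together, the initialization error handling reads roughly as follows; this is a simplified restatement of the surrounding diff, not verbatim code.)

// Simplified consolidation of maybeCompleteInitializationWithException from this diff.
if (throwable instanceof LeaderNotAvailableException) {
    // The share partition is still initializing: keep the request in purgatory and keep
    // the SharePartition cached so a later attempt can complete the initialization.
    log.debug("The share partition with key {} is not initialized yet", sharePartitionKey);
    return;
}
// Any other failure means initialization did not succeed: drop the cached entry and
// complete the share fetch with a per-partition error.
partitionCacheMap.remove(sharePartitionKey);
log.error("Error initializing share partition with key {}", sharePartitionKey, throwable);
maybeCompleteShareFetchExceptionally(future, Collections.singletonList(topicIdPartition), throwable);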

partitionCacheMap.remove(sharePartitionKey);
Contributor:

  1. We probably want to verify the leader epoch in SharePartition before removal, to avoid a newly created SharePartition being removed by an old request.
  2. We piggyback the removal on an error response. The downside is that there are quite a few error places to handle, and it is reactive. An alternative is to have a partitionLeaderEpochChange listener; we could then proactively remove a SharePartition when the leader epoch changes.

Collaborator Author:

Both are great suggestions. Do you think it would be right to take them up in follow-up PRs?
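
(For illustration, the proactive alternative suggested above could look something like the sketch below. The listener interface, its callback, and the leaderEpoch() accessor are hypothetical and are not part of this PR or of ReplicaManager's existing API.)

// Hypothetical sketch of a leader-epoch-change listener; the listener API is invented for illustration.
interface PartitionLeaderEpochListener {
    void onLeaderEpochChange(TopicPartition topicPartition, int newLeaderEpoch);
}

// SharePartitionManager could register such a listener and proactively evict SharePartitions
// whose leader epoch is stale, instead of reacting to persister or fetch errors.
PartitionLeaderEpochListener listener = (tp, newEpoch) -> {
    partitionCacheMap.forEach((key, sharePartition) -> {
        if (key.topicIdPartition().topicPartition().equals(tp)
                && sharePartition.leaderEpoch() < newEpoch) {      // leaderEpoch() accessor assumed
            partitionCacheMap.remove(key, sharePartition);          // remove only if still the same instance
        }
    });
};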

// The partition initialization failed, so complete the request with the exception.
// The server should not be in this state, so log the error on broker and surface the same
// to the client. The broker should not be in this state, investigate the root cause of the error.
log.error("Error initializing share partition with key {}", sharePartitionKey, throwable);
maybeCompleteShareFetchExceptionally(future, Collections.singletonList(topicIdPartition), throwable);
}

private void handleSharePartitionException(
Collaborator:

I suggest "fenced" should appear in the method name. This is essentially seeing if the exception indicates fencing and then discarding the share partition from the cache.

Collaborator Author:

My idea was to have a general exception-handling method, as we might find other exceptions that need some handling as well. Do you think we should strictly have a fenced-specific handling method?

Collaborator:

My point was that you're really only handling a couple of exceptions at the moment, and they're both fencing-related. Anyway, just a suggestion.

Collaborator Author:

Thanks, I have changed the method name. We might move to a generic one if required in the future.

SharePartitionKey sharePartitionKey,
Throwable throwable
) {
Contributor:

Should we include UNKNOWN_TOPIC_OR_PARTITION below too?

Collaborator Author:

Added.

if (throwable instanceof NotLeaderOrFollowerException || throwable instanceof FencedStateEpochException) {
log.info("The share partition with key {} is fenced: {}", sharePartitionKey, throwable.getMessage());
// The share partition is fenced hence remove the partition from map and let the client retry.
// But surface the error to the client so client might take some action i.e. re-fetch
// the metadata and retry the fetch on new leader.
partitionCacheMap.remove(sharePartitionKey);
Collaborator:

Does the SharePartition need to be put into a fenced state? Removing it from the cache is good for future requests, but is that enough for the object which is already in existence?

Collaborator Author:

So we remove a share partition from the cache in two places: 1) when initialization fails, and 2) when a fenced error occurs.
For 1, it's safe as the partition is still in the initialization state.

For 2, I was of mixed opinion. As all interactions with a share partition currently happen by fetching the instance from the cache, once it is removed or re-initialized the new state should be picked up. But if an old share partition instance is already held by some other thread, then acknowledge will fail anyway, though a fetch could still succeed. Do you think it would be sensible to have another state in SharePartition, such as Fenced, which once set prevents the fetch lock on that share partition from being acquired? Do you think we should have an active-status check on all SharePartition APIs as well?
cc: @adixitconfluent

Contributor (@adixitconfluent, Oct 25, 2024):

I think it's better to have a new state. Should we have a state such as TERMINATED/FENCED, because a share partition should also be removed from the SPM in case the topic partition is deleted or becomes a follower (https://issues.apache.org/jira/browse/KAFKA-17783)? WDYT @apoorvmittal10 @AndrewJSchofield

Collaborator:

Yes, I think these states sound sensible.

Contributor (quoting the question above):

Do you think we should have an active status check on all Share Partition

Yes, I agree on this as well

Contributor:

Do we have a state machine written down somewhere for SharePartition?

Collaborator Author:

Hmmm, not yet. Let me write down the state transitions in a single place where we can see them all.
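
(As a rough illustration of the state idea being discussed: a fencing flag could be checked before the fetch lock is granted, so that a stale instance already held by another thread can no longer serve fetches. Everything below — the fenced flag, markFenced(), and the fetchLock field — is a hypothetical sketch, not code from this PR.)

// Hypothetical sketch of a fenced state inside SharePartition; names are illustrative.
private volatile boolean fenced = false;

// Could be called when the persister or replica manager reports NOT_LEADER_OR_FOLLOWER
// or FENCED_STATE_EPOCH for this share partition.
void markFenced() {
    fenced = true;
}

// Guard the fetch lock so a fenced share partition stops serving fetches.
boolean maybeAcquireFetchLock() {
    if (fenced) {
        return false;
    }
    return fetchLock.compareAndSet(false, true);    // fetchLock: AtomicBoolean, assumed
}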

future.completeExceptionally(throwable);
return;
}
}

// The partition initialization failed, so complete the request with the exception.
// The server should not be in this state, so log the error on broker and surface the same
// to the client. As of now this state is in-recoverable for the broker, and we should
// investigate the root cause of the error.
log.error("Error initializing share partition with key {}", sharePartitionKey, throwable);
future.completeExceptionally(throwable);
private void maybeCompleteShareFetchExceptionally(CompletableFuture<Map<TopicIdPartition, PartitionData>> future,
Collection<TopicIdPartition> topicIdPartitions, Throwable throwable) {
if (!future.isDone()) {
Errors error = Errors.forException(throwable);
future.complete(topicIdPartitions.stream().collect(Collectors.toMap(
tp -> tp, tp -> new PartitionData().setErrorCode(error.code()).setErrorMessage(error.message()))));
Contributor:

We lose the error message from the throwable when converting it to Errors.

Collaborator Author:

Done.
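
(The fix acknowledged here presumably keeps the original exception's message rather than the generic mapped one; a small sketch of the difference, reusing the PartitionData builder from this hunk — the exact final code may differ.)

// Sketch: preserve the throwable's message instead of the generic Errors message.
Errors error = Errors.forException(throwable);
PartitionData partitionData = new PartitionData()
    .setErrorCode(error.code())
    .setErrorMessage(throwable.getMessage() != null ? throwable.getMessage() : error.message());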

}
}

private int leaderEpoch(TopicPartition tp) {
Either<Errors, Partition> partitionOrError = replicaManager.getPartitionOrError(tp);
Collaborator:

I've checked the error codes here and the KIP looks right to me.

if (partitionOrError.isLeft()) {
log.debug("Failed to get partition leader for topic partition: {}-{} due to error: {}",
tp.topic(), tp.partition(), partitionOrError.left().get());
throw partitionOrError.left().get().exception();
}

Partition partition = partitionOrError.right().get();
if (!partition.isLeader()) {
log.debug("The broker is not the leader for topic partition: {}-{}", tp.topic(), tp.partition());
throw new NotLeaderOrFollowerException();
}
return partition.getLeaderEpoch();
}

private SharePartitionKey sharePartitionKey(String groupId, TopicIdPartition topicIdPartition) {
3 changes: 2 additions & 1 deletion core/src/main/scala/kafka/server/KafkaApis.scala
@@ -4249,7 +4249,8 @@ class KafkaApis(val requestChannel: RequestChannel,
fetchMinBytes,
fetchMaxBytes,
FetchIsolation.HIGH_WATERMARK,
clientMetadata
clientMetadata,
true
)

// call the share partition manager to fetch messages from the local replica.