Commit d7bced9

joshuadeng authored and facebook-github-bot committed
prevent calling next on exhausted dataloader in train pipeline (#1778)
Summary:
Pull Request resolved: #1778

Calling `next` on an already exhausted dataloader can cause the dataloader to hang. This diff prevents that from occurring while respecting the train pipeline API, which allows the user to pass in a different pipeline.

Reviewed By: sarckk, lequytra

Differential Revision: D54753344

fbshipit-source-id: 64a5ec3b5fa39cbfe3206b7993608c42c81039ee
1 parent 2edb86c commit d7bced9

File tree

1 file changed: +20 -1 lines changed

torchrec/distributed/train_pipeline/train_pipeline.py (+20 -1)
@@ -178,6 +178,8 @@ def __init__(
         self._batch_ip2: Optional[In] = None
         self._context = TrainPipelineContext()
         self._pipelined_modules: List[ShardedModule] = []
+        self._dataloader_iter: Optional[Iterator[In]] = None
+        self._dataloader_exhausted: bool = False

     def _fill_pipeline(self, dataloader_iter: Iterator[In]) -> None:
         # pipeline is already filled
@@ -262,13 +264,30 @@ def _copy_batch_to_gpu(self, dataloader_iter: Iterator[In]) -> Optional[In]:
         """
         with record_function("## copy_batch_to_gpu ##"):
             with torch.cuda.stream(self._memcpy_stream):
-                batch = next(dataloader_iter, None)
+                batch = self._next_batch(dataloader_iter)
                 if batch is not None:
                     batch = _to_device(batch, self._device, non_blocking=True)
                 elif not self._execute_all_batches:
                     raise StopIteration
                 return batch

+    def _next_batch(self, dataloader_iter: Iterator[In]) -> Optional[In]:
+        """
+        Retrieves next batch from dataloader and prevents calling `next` on an already
+        exhausted dataloader, which can cause hanging.
+        """
+        if dataloader_iter is not self._dataloader_iter:
+            self._dataloader_iter = dataloader_iter
+            self._dataloader_exhausted = False
+
+        if self._dataloader_exhausted:
+            batch = None
+        else:
+            batch = next(dataloader_iter, None)
+            if batch is None:
+                self._dataloader_exhausted = True
+        return batch
+
     def _start_sparse_data_dist(self, batch: Optional[In]) -> None:
         """
         Waits for batch to finish getting copied to GPU, then starts the input dist.
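For illustration, here is a minimal, self-contained sketch of the exhaustion guard this commit introduces, extracted from the pipeline so it can be run against a plain Python iterator. The ExhaustionGuard class and next_item method are hypothetical names used only for this example; the actual change is the _next_batch helper shown in the diff above.

from typing import Generic, Iterator, Optional, TypeVar

T = TypeVar("T")


class ExhaustionGuard(Generic[T]):
    """Illustrative wrapper: remembers when an iterator has been exhausted so
    that `next` is never called on it again (mirrors _next_batch above)."""

    def __init__(self) -> None:
        self._iter: Optional[Iterator[T]] = None
        self._exhausted: bool = False

    def next_item(self, it: Iterator[T]) -> Optional[T]:
        # A new iterator resets the exhaustion flag, just as _next_batch does
        # when a different dataloader_iter is passed in.
        if it is not self._iter:
            self._iter = it
            self._exhausted = False

        if self._exhausted:
            # Short-circuit: never touch an iterator that was already drained.
            return None

        item = next(it, None)
        if item is None:
            self._exhausted = True
        return item


if __name__ == "__main__":
    guard: ExhaustionGuard[int] = ExhaustionGuard()
    data = iter([1, 2, 3])
    # Ask for more items than exist: after the third call the guard returns
    # None without calling `next` on the exhausted iterator again.
    for _ in range(5):
        print(guard.next_item(data))  # 1, 2, 3, None, None

As in the original _next_batch, the sentinel is None, so a genuinely None element would be indistinguishable from exhaustion; the pipeline treats None as end-of-data.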
