Commit a90d3ed

sarckk authored and facebook-github-bot committed
Add missing event wait for last stage in StagedTrainPipeline (#1770)
Summary:
Pull Request resolved: #1770

StagedTrainPipeline expects model forward() to happen outside of the pipeline, which means that we need to wait for the last pre-forward stage to finish before progressing in the main compute stream. Also changes `wait_sparse_data_dist` to happen in the SDD stream instead of the main stream.

Reviewed By: dracifer, joshuadeng

Differential Revision: D54685704

fbshipit-source-id: cad14e1a67fb06bf56be359ef4face6877ee794b
1 parent 5856c4d commit a90d3ed
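
For reference, the calling pattern this fix targets: `progress()` hands back a pre-forward batch and the caller runs model forward() on the compute stream, so that stream must first wait on the event recorded by the last pipeline stage. Below is a minimal driver-loop sketch, not code from this commit: `pipeline_stages`, `model`, `optimizer`, and `dataloader` are placeholders, and only the `compute_stream` argument mirrors the change.

import torch
from torchrec.distributed.train_pipeline.train_pipeline import StagedTrainPipeline

# `pipeline_stages`, `model`, `optimizer`, and `dataloader` are placeholders.
pipeline = StagedTrainPipeline(
    pipeline_stages=pipeline_stages,
    compute_stream=torch.cuda.current_stream(),  # stream that runs model forward()
)

dataloader_iter = iter(dataloader)
while True:
    # progress() now makes compute_stream wait on the event recorded by the
    # last pre-forward stage before returning the batch to the caller.
    batch = pipeline.progress(dataloader_iter)
    if batch is None:
        break  # dataloader exhausted
    loss = model(batch)  # forward() happens outside the pipeline
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()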

3 files changed, +55 −13 lines changed

torchrec/distributed/train_pipeline/tests/test_train_pipeline.py

+3 −1

@@ -875,7 +875,9 @@ def gpu_preproc(x: StageOut) -> StageOut:
                 fill_callback=sdd.wait_sparse_data_dist,
             ),
         ]
-        pipeline = StagedTrainPipeline(pipeline_stages=pipeline_stages)
+        pipeline = StagedTrainPipeline(
+            pipeline_stages=pipeline_stages, compute_stream=torch.cuda.current_stream()
+        )
         dataloader = iter(data)

         pipelined_out = []

torchrec/distributed/train_pipeline/train_pipeline.py

+43 −7

@@ -591,6 +591,9 @@ class StagedTrainPipeline(TrainPipeline[In, Optional[StageOut]]):
     Args:
         pipeline_stages (List[PipelineStage]): A list of stages to execute.
         debug_mode (bool): Whether to enable debug mode.
+        compute_stream (Optional[torch.cuda.Stream]): The main compute stream in which
+            model forward is run, usually torch.cuda.default_stream(). Defaults to the
+            current cuda stream.

     Example::
         train_pipeline = StagedTrainPipeline(

@@ -619,6 +622,7 @@ def __init__(
         self,
         pipeline_stages: List[PipelineStage],
         debug_mode: bool = False,
+        compute_stream: Optional[torch.cuda.Stream] = None,
     ) -> None:
         self._pipeline_stages = pipeline_stages
         self._debug_mode = debug_mode

@@ -627,20 +631,23 @@ def __init__(
         )
         self._initialized = False
         self._num_steps = 0
+        self._dataloader_iter: Optional[Iterator[In]] = None
+        self._dataloader_exhausted: bool = False
+        self._compute_stream: torch.cuda.streams.Stream = (
+            compute_stream or torch.cuda.current_stream()
+        )

     @property
     def num_stages(self) -> int:
         return len(self._pipeline_stages)

-    def _advance(self) -> Optional[StageOut]:
+    def _advance(self) -> Optional[StageOutputWithEvent]:
         # left shifts all batch results.
         out = self._stage_outputs[0]
         for idx in range(self.num_stages - 1):
             self._stage_outputs[idx] = self._stage_outputs[idx + 1]
         self._stage_outputs[-1] = None
-        if out is None:
-            return out
-        return out[0]
+        return out

     def _run_with_event(
         self,

@@ -662,6 +669,23 @@ def _run_with_event(
             new_event.record(stream)
         return (output, new_event)

+    def _next_batch(self, dataloader_iter: Iterator[In]) -> Optional[In]:
+        """
+        Retrieves next batch from dataloader and prevents calling `next` on an already
+        exhausted dataloader, which can cause hanging.
+        """
+        if dataloader_iter is not self._dataloader_iter:
+            self._dataloader_iter = dataloader_iter
+            self._dataloader_exhausted = False
+
+        if self._dataloader_exhausted:
+            batch = None
+        else:
+            batch = next(dataloader_iter, None)
+            if batch is None:
+                self._dataloader_exhausted = True
+        return batch
+
     def _run_stage(
         self,
         batch_offset: int,

@@ -680,7 +704,7 @@ def _run_stage(
             f"## Pipeline Stage {stage_idx} : {stage.name} for batch {batch_offset + self._num_steps} ##"
         ):
             if stage_idx == 0:
-                batch_to_wait = next(dataloader_iter, None)
+                batch_to_wait = self._next_batch(dataloader_iter)
                 event = None
             else:
                 batch_to_wait_with_event = self._stage_outputs[batch_offset]

@@ -765,7 +789,12 @@ def progress(
         if not self._initialized:
             self._fill_pipeline(dataloader_iter)

-        output = self._advance()
+        output_with_event = self._advance()
+
+        if output_with_event is None:
+            # All data consumed, exit early
+            return None
+
         self._num_steps += 1

         for stage_idx in range(self.num_stages):

@@ -776,4 +805,11 @@ def progress(
                 dataloader_iter=dataloader_iter,
             )

-        return output
+        out, event = output_with_event
+        if event is not None:
+            # Since model forward() is expected to run outside the pipeline,
+            # we need to explicitly wait for the last stage to finish
+            event.wait(self._compute_stream)
+            out.record_stream(self._compute_stream)
+
+        return out
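
The wait added at the end of `progress()` is the standard CUDA stream-handshake pattern: producer work runs on a side stream, an event is recorded there, the consuming stream waits on that event, and `record_stream` tells the caching allocator that the output is also used on the consuming stream. A self-contained sketch of the same pattern follows; it is not torchrec code, and the tensors are made up.

import torch

# placeholder host batch; in the pipeline this would be a Pipelineable batch
batch_cpu = torch.randn(1024, 16, pin_memory=True)

side_stream = torch.cuda.Stream()            # e.g. the copy or SDD stream
compute_stream = torch.cuda.current_stream()

with torch.cuda.stream(side_stream):
    out = batch_cpu.to("cuda", non_blocking=True)  # work enqueued on side_stream
    event = torch.cuda.Event()
    event.record(side_stream)

# order compute_stream after the side-stream work (does not block the host)
event.wait(compute_stream)
# mark `out` as used on compute_stream so its memory is not recycled while
# compute_stream may still be reading it
out.record_stream(compute_stream)

result = out.sum()  # safe: compute_stream is ordered after the copy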

torchrec/distributed/train_pipeline/utils.py

+9 −5

@@ -802,8 +802,12 @@ def start_sparse_data_dist(self, batch: In) -> In:
         return batch

     def wait_sparse_data_dist(self) -> None:
-        self.context.module_contexts = self.context.module_contexts_next_batch.copy()
-        self.context.input_dist_tensors_requests.clear()
-        for names, awaitable in self.context.fused_splits_awaitables:
-            for name, request in zip(names, awaitable.wait()):
-                self.context.input_dist_tensors_requests[name] = request
+        with record_function("## wait_sparse_data_dist ##"):
+            with torch.cuda.stream(self.stream):
+                self.context.module_contexts = (
+                    self.context.module_contexts_next_batch.copy()
+                )
+                self.context.input_dist_tensors_requests.clear()
+                for names, awaitable in self.context.fused_splits_awaitables:
+                    for name, request in zip(names, awaitable.wait()):
+                        self.context.input_dist_tensors_requests[name] = request
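
Scoping the waits under `torch.cuda.stream(self.stream)` means any GPU work issued while the awaitables complete is enqueued on the sparse data dist stream rather than the main stream, and `record_function` labels that span in profiler traces. A generic sketch of the same composition, with a made-up `awaitables` list standing in for `fused_splits_awaitables`:

import torch
from torch.profiler import record_function

sdd_stream = torch.cuda.Stream()

def wait_on_sdd_stream(awaitables):
    # the range shows up as "## wait_sparse_data_dist ##" in profiler traces
    with record_function("## wait_sparse_data_dist ##"):
        # GPU work issued by .wait() lands on sdd_stream, so the main
        # (compute) stream is not serialized behind it
        with torch.cuda.stream(sdd_stream):
            return [a.wait() for a in awaitables]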
