Disable Traces on Benchmark when in ServiceLab (#1804)

PaulZhang12 · facebook-github-bot · commit 71cc224af62f · 2024-03-20T06:20:22.000-07:00
Summary: Pull Request resolved: #1804. Training benchmarks have been broken on trunk due to multiprocessing issues. This diff seems to have fixed the breakage once and for all. We will wait for the periodic experiment to complete successfully before modifying TARGETS so that the benchmark runs on all diffs. Reviewed By: sarckk. Differential Revision: D55036955. fbshipit-source-id: add8b9617d6ae86217908bc1e9c123cfddb5bdaa
diff --git a/torchrec/distributed/benchmark/benchmark_utils.py b/torchrec/distributed/benchmark/benchmark_utils.py
@@ -506,53 +506,54 @@ def benchmark(
         b = torch.cuda.max_memory_allocated(rank)
         max_mem_allocated.append(b // 1024 // 1024)
 
-    # pyre-ignore[2]
-    def trace_handler(prof) -> None:
-        total_average = prof.profiler.total_average()
-        logger.info(f" TOTAL_AVERAGE:\n{name}\n{total_average}")
-        dir_path: str = output_dir
-
-        # Don't output trace files if dir_path is empty
-        # or rank != 0, rank=-1 in no pg case, only 1 rank should output
-        # in pg case, so rank=0
-        if dir_path == "" or rank > 0:
-            return
-
-        trace_file: str = f"{dir_path}/trace-{name}.json"
-        stacks_cpu_file = f"{dir_path}/stacks-cpu-{name}.stacks"
-        stacks_cuda_file = f"{dir_path}/stacks-cuda-{name}.stacks"
-        logger.info(f" PROFILE[{name}].chrome_trace:{trace_file}")
-
-        prof.export_chrome_trace(trace_file)
-        prof.export_stacks(stacks_cpu_file, "self_cpu_time_total")
-        prof.export_stacks(stacks_cuda_file, "self_cuda_time_total")
-
-    # - git clone https://github.com/brendangregg/FlameGraph
-    # - cd FlameGraph
-    # - ./flamegraph.pl --title "CPU time" --countname "us." profiler.stacks > perf_viz.svg
-
-    with torch.profiler.profile(
-        activities=[
-            torch.profiler.ProfilerActivity.CPU,
-            torch.profiler.ProfilerActivity.CUDA,
-        ],
-        record_shapes=True,
-        profile_memory=True,
-        with_stack=True,
-        with_flops=True,
-        with_modules=True,
-        on_trace_ready=trace_handler,
-    ) as p:
-        for _input in prof_inputs:
-            with record_function("## forward ##"):
-                model(_input)
-                p.step()
-
-    if rank == -1:
-        for di in range(world_size):
-            torch.cuda.synchronize(di)
-    else:
-        torch.cuda.synchronize(rank)
+    if output_dir != "":
+        # Only do profiling if output_dir is set
+
+        # pyre-ignore[2]
+        def trace_handler(prof) -> None:
+            total_average = prof.profiler.total_average()
+            logger.info(f" TOTAL_AVERAGE:\n{name}\n{total_average}")
+            dir_path: str = output_dir
+
+            # only 1 rank should output in pg case, rank = 0
+            if rank > 0:
+                return
+
+            trace_file: str = f"{dir_path}/trace-{name}.json"
+            stacks_cpu_file = f"{dir_path}/stacks-cpu-{name}.stacks"
+            stacks_cuda_file = f"{dir_path}/stacks-cuda-{name}.stacks"
+            logger.info(f" PROFILE[{name}].chrome_trace:{trace_file}")
+
+            prof.export_chrome_trace(trace_file)
+            prof.export_stacks(stacks_cpu_file, "self_cpu_time_total")
+            prof.export_stacks(stacks_cuda_file, "self_cuda_time_total")
+
+        # - git clone https://github.com/brendangregg/FlameGraph
+        # - cd FlameGraph
+        # - ./flamegraph.pl --title "CPU time" --countname "us." profiler.stacks > perf_viz.svg
+
+        with torch.profiler.profile(
+            activities=[
+                torch.profiler.ProfilerActivity.CPU,
+                torch.profiler.ProfilerActivity.CUDA,
+            ],
+            record_shapes=True,
+            profile_memory=True,
+            with_stack=True,
+            with_flops=True,
+            with_modules=True,
+            on_trace_ready=trace_handler,
+        ) as p:
+            for _input in prof_inputs:
+                with record_function("## forward ##"):
+                    model(_input)
+                    p.step()
+
+            if rank == -1:
+                for di in range(torch.cuda.device_count()):
+                    torch.cuda.synchronize(torch.device(f"cuda:{di}"))
+            else:
+                torch.cuda.synchronize()
 
     return BenchmarkResult(
         short_name=name,
@@ -754,6 +755,8 @@ def benchmark_module(
         output_dir: Directory to output profiler outputs (traces, stacks)
         pooling_configs: The pooling factor for the tables.
             (Optional; if not set, we'll use 10 as default)
+        func_to_benchmark: Custom function to benchmark, check out default_func_to_benchmark for default
+        benchmark_func_kwargs: Custom keyword arguments to pass to func_to_benchmark
 
     Returns:
         A list of BenchmarkResults