use monitored barrier
IlyasMoutawwakil committed Feb 19, 2024
1 parent 8e0ad46 commit ff729ba
Showing 7 changed files with 25 additions and 16 deletions.
8 changes: 8 additions & 0 deletions optimum_benchmark/experiment.py
@@ -19,6 +19,7 @@
 from .launchers.base import Launcher
 from .backends.base import Backend
 
+import pandas as pd
 from json import dump
 from flatten_dict import flatten
 from hydra.utils import get_class

@@ -64,6 +65,13 @@ def to_json(self, path: str, flat: bool = False) -> None:
         with open(path, "w") as f:
             dump(self.to_dict(), f, indent=4)
 
+    def to_dataframe(self) -> pd.DataFrame:
+        flat_report_dict = self.to_flat_dict()
+        return pd.DataFrame.from_dict(flat_report_dict, orient="index")
+
+    def to_csv(self, path: str) -> None:
+        self.to_dataframe().to_csv(path, index=False)
+
     def save_pretrained(
         self,
         save_directory: Union[str, os.PathLike],
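
As a standalone illustration (not part of this commit), this is what the new to_dataframe()/to_csv() helpers boil down to: a flattened report dict becomes a one-column DataFrame indexed by the flat keys. The sample dict and file name below are invented for the example.

    import pandas as pd

    # a flattened report dict of the kind to_flat_dict() would return (values invented here)
    flat_report_dict = {"latency.mean": 0.012, "latency.stdev": 0.001, "memory.max_ram_mb": 1024}

    # orient="index" makes each flattened key a row, with the values in a single column
    df = pd.DataFrame.from_dict(flat_report_dict, orient="index")
    print(df)

    # index=False omits the index (the flattened keys) from the CSV, as in the diff
    df.to_csv("experiment_report.csv", index=False)
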
7 changes: 5 additions & 2 deletions optimum_benchmark/launchers/process/launcher.py
@@ -42,6 +42,9 @@ def launch(self, worker: Callable, *worker_args) -> BenchmarkReport:
         while not process_context.join():
             pass
 
+        # restore the original logging configuration
+        setup_logging(log_level)
+
         try:
             report: BenchmarkReport = queue.get()
         except EOFError:

@@ -50,13 +53,13 @@ def launch(self, worker: Callable, *worker_args) -> BenchmarkReport:
         return report
 
 
-def entrypoint(_, worker, queue, lock, log_level, *worker_args):
+def entrypoint(i, worker, queue, lock, log_level, *worker_args):
     """
     This a pickalable function that correctly sets up the logging configuration for the worker process,
     and puts the output of the worker function into a lock-protected queue.
     """
 
-    setup_logging(log_level)
+    setup_logging(log_level, prefix=f"PROC-{i}")
 
     worker_output = worker(*worker_args)
 
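
The library's actual setup_logging helper is not shown in this diff; as a rough, hypothetical sketch of what passing prefix=f"PROC-{i}" enables, a prefix-aware setup could look like the following (the function body and format string are assumptions, not the library's real implementation).

    import logging

    def setup_logging(level: str = "INFO", prefix: str = "") -> None:
        # tag every record emitted by this process with its prefix, e.g. [PROC-0]
        fmt = "[%(asctime)s]" + (f"[{prefix}]" if prefix else "") + "[%(levelname)s] - %(message)s"
        logging.basicConfig(level=level, format=fmt, force=True)  # force=True replaces existing handlers

    # in worker i of the spawned process group:
    # setup_logging("INFO", prefix=f"PROC-{i}")
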
4 changes: 3 additions & 1 deletion optimum_benchmark/launchers/torchrun/launcher.py
@@ -59,6 +59,9 @@ def launch(self, worker: Callable, *worker_args) -> Dict[str, Any]:
             entrypoint=entrypoint, args=(worker, queue, lock, log_level, *worker_args), config=launch_config
         )
 
+        # restore the original logging configuration
+        setup_logging(log_level)
+
         outputs: List[BenchmarkReport] = []
         while not queue.empty():
             outputs.append(queue.get())

@@ -71,7 +74,6 @@ def launch(self, worker: Callable, *worker_args) -> Dict[str, Any]:
         else:
             raise ValueError("No benchmark report was returned by the workers")
 
-        setup_logging(level=log_level)
         report.log()
 
         return report

4 changes: 2 additions & 2 deletions optimum_benchmark/trackers/energy.py
@@ -134,14 +134,14 @@ def track(self, interval=1, file_prefix="method"):
         )
 
         if self.distributed:
-            torch.distributed.barrier()
+            torch.distributed.monitored_barrier()
 
         self.emission_tracker.start()
         yield
         self.emission_tracker.stop()
 
         if self.distributed:
-            torch.distributed.barrier()
+            torch.distributed.monitored_barrier()
 
         self.cpu_energy = self.emission_tracker._total_cpu_energy.kWh
         self.gpu_energy = self.emission_tracker._total_gpu_energy.kWh
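
For context, a minimal standalone sketch (not taken from this repo) of why monitored_barrier is used here: unlike barrier(), it raises a descriptive error naming the ranks that never reached the synchronization point instead of hanging indefinitely. Note that monitored_barrier is only supported with the GLOO backend (or a gloo process group); the helper name below is invented for the example.

    from datetime import timedelta

    import torch.distributed as dist

    def sync_all_ranks() -> None:
        if dist.is_available() and dist.is_initialized():
            # raises a RuntimeError listing the unresponsive ranks if the timeout expires,
            # rather than blocking forever the way barrier() can
            dist.monitored_barrier(timeout=timedelta(seconds=300))
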
4 changes: 2 additions & 2 deletions optimum_benchmark/trackers/latency.py
@@ -109,15 +109,15 @@ def reset(self):
     @contextmanager
     def track(self):
         if self.distributed:
-            torch.distributed.barrier()
+            torch.distributed.monitored_barrier()
 
         if self.backend == "pytorch" and self.device == "cuda":
             yield from self._pytorch_cuda_latency()
         else:
             yield from self._cpu_latency()
 
         if self.distributed:
-            torch.distributed.barrier()
+            torch.distributed.monitored_barrier()
 
     def _pytorch_cuda_latency(self):
         start = torch.cuda.Event(enable_timing=True)
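
The _pytorch_cuda_latency context above relies on CUDA events; as a general PyTorch pattern (a sketch, not the repo's exact implementation), timing a GPU workload with events looks like this. cuda_elapsed_seconds and fn are illustrative names.

    import torch

    def cuda_elapsed_seconds(fn) -> float:
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        fn()  # the GPU work being measured
        end.record()
        torch.cuda.synchronize()  # wait for all queued kernels before reading the timers
        return start.elapsed_time(end) / 1e3  # elapsed_time returns milliseconds
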
4 changes: 2 additions & 2 deletions optimum_benchmark/trackers/memory.py
@@ -103,7 +103,7 @@ def reset(self):
     @contextmanager
     def track(self):
         if self.distributed:
-            torch.distributed.barrier()
+            torch.distributed.monitored_barrier()
 
         if self.device == "cuda" and self.backend == "pytorch":
             yield from self._cuda_pytorch_memory()

@@ -113,7 +113,7 @@ def track(self):
             yield from self._cpu_memory()
 
         if self.distributed:
-            torch.distributed.barrier()
+            torch.distributed.monitored_barrier()
 
     def _cuda_pytorch_memory(self):
         torch.cuda.empty_cache()
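
All three trackers touched by this commit share the same barrier-fenced track() shape; a self-contained sketch of that pattern (simplified relative to the real trackers, and assuming torch.distributed is already initialized with a gloo-compatible group) is:

    from contextlib import contextmanager

    import torch.distributed as dist

    @contextmanager
    def track(distributed: bool):
        if distributed:
            dist.monitored_barrier()  # all ranks line up before the measured region
        yield  # the caller's measured work runs here
        if distributed:
            dist.monitored_barrier()  # and line up again once it is done
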
10 changes: 3 additions & 7 deletions tests/test_api.py
@@ -147,14 +147,8 @@ def test_api_dataset_generator(library, task, model):
 @pytest.mark.parametrize("launcher_config", LAUNCHER_CONFIGS)
 @pytest.mark.parametrize("device", DEVICES)
 def test_api_launch(launcher_config, device):
-    device_ids = None
-
-    if device == "cuda":
-        device_ids = ",".join(str(i) for i in range(torch.cuda.device_count()))
-
-    # only inference cuz training is slow
     benchmark_config = InferenceConfig(latency=True, memory=True)
-    # only pytorch backend cuz default
+    device_ids = ",".join(str(i) for i in range(torch.cuda.device_count())) if device == "cuda" else None
     backend_config = PyTorchConfig(model="bert-base-uncased", device_ids=device_ids, no_weights=True, device=device)
     experiment_config = ExperimentConfig(
         experiment_name="api-experiment", benchmark=benchmark_config, launcher=launcher_config, backend=backend_config

@@ -164,6 +158,8 @@ def test_api_launch(launcher_config, device):
     with TemporaryDirectory() as tempdir:
         experiment_config.to_dict()
         experiment_config.to_flat_dict()
+        experiment_config.to_dataframe()
+        experiment_config.to_csv(f"{tempdir}/experiment_config.csv")
         experiment_config.to_json(f"{tempdir}/experiment_config.json")
 
         benchmark_report.to_dict()
