Commit
sync processes before each measure to avoid hanging
IlyasMoutawwakil committed Feb 19, 2024
1 parent 37a0b32 commit 8e0ad46
Showing 3 changed files with 23 additions and 9 deletions.
10 changes: 7 additions & 3 deletions optimum_benchmark/trackers/energy.py
@@ -80,6 +80,7 @@ class EnergyTracker:
     def __init__(self, device: str, device_ids: Optional[str] = None):
         self.device = device
         self.device_ids = device_ids
+        self.distributed = is_torch_distributed_available() and torch.distributed.is_initialized()
 
         if self.device == "cuda":
             if self.device_ids is None:
@@ -89,9 +90,6 @@ def __init__(self, device: str, device_ids: Optional[str] = None):
                 self.device_ids = list(map(int, self.device_ids.split(",")))
             LOGGER.info(f"\t+ Tracking GPU energy on devices {self.device_ids}")
 
-        if is_torch_distributed_available() and torch.distributed.is_initialized():
-            torch.distributed.barrier()
-
         self.reset()
 
     def reset(self):
@@ -135,10 +133,16 @@ def track(self, interval=1, file_prefix="method"):
             country_iso_code=os.environ.get("COUNTRY_ISO_CODE", "FRA"),
         )
 
+        if self.distributed:
+            torch.distributed.barrier()
+
         self.emission_tracker.start()
         yield
         self.emission_tracker.stop()
 
+        if self.distributed:
+            torch.distributed.barrier()
+
         self.cpu_energy = self.emission_tracker._total_cpu_energy.kWh
         self.gpu_energy = self.emission_tracker._total_gpu_energy.kWh
         self.ram_energy = self.emission_tracker._total_ram_energy.kWh
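
The change is the same in all three trackers: every rank hits a barrier right before the measurement starts and again right after it stops, so no rank begins or finalizes a measurement while another rank is still inside the previous one. Below is a minimal sketch of that pattern, assuming a torch.distributed process group may or may not be initialized; the helper name sync_tracked is illustrative and not part of the repository.

    # Minimal sketch of the barrier-synchronized measurement pattern (illustrative names).
    from contextlib import contextmanager

    import torch.distributed as dist


    @contextmanager
    def sync_tracked():
        distributed = dist.is_available() and dist.is_initialized()

        if distributed:
            dist.barrier()  # wait until every rank is ready to start measuring

        yield  # the measured region runs here on every rank

        if distributed:
            dist.barrier()  # wait until every rank has finished before reading results

Guarding on is_initialized() keeps the same code path usable in single-process runs, where the barriers are simply skipped.
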
10 changes: 7 additions & 3 deletions optimum_benchmark/trackers/latency.py
@@ -92,15 +92,13 @@ class LatencyTracker:
     def __init__(self, device: str, backend: str):
         self.device = device
         self.backend = backend
+        self.distributed = is_torch_distributed_available() and torch.distributed.is_initialized()
 
         if self.backend == "pytorch" and self.device == "cuda":
             LOGGER.info("\t+ Tracking Pytorch CUDA latency")
         else:
             LOGGER.info("\t+ Tracking CPU latency")
 
-        if is_torch_distributed_available() and torch.distributed.is_initialized():
-            torch.distributed.barrier()
-
         self.reset()
 
     def reset(self):
@@ -110,11 +108,17 @@ def reset(self):
 
     @contextmanager
     def track(self):
+        if self.distributed:
+            torch.distributed.barrier()
+
         if self.backend == "pytorch" and self.device == "cuda":
             yield from self._pytorch_cuda_latency()
         else:
             yield from self._cpu_latency()
 
+        if self.distributed:
+            torch.distributed.barrier()
+
     def _pytorch_cuda_latency(self):
         start = torch.cuda.Event(enable_timing=True)
         start.record()
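
Note that track() delegates to a backend-specific generator with yield from, so the barriers live in track() itself and wrap whichever timing helper is chosen. A rough sketch of that composition using CUDA events follows; the class and attribute names are made up for illustration and the repository's helpers differ in detail.

    # Rough sketch of yield-from delegation with CUDA-event timing (illustrative only).
    from contextlib import contextmanager

    import torch


    class TinyLatencyTracker:
        def __init__(self):
            self.latencies = []

        @contextmanager
        def track(self):
            # in the real trackers, the distributed barriers wrap this delegation
            yield from self._cuda_latency()

        def _cuda_latency(self):
            # requires a CUDA-capable device
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)
            start.record()
            yield  # the timed region runs between the two events
            end.record()
            torch.cuda.synchronize()  # make sure both events have completed
            self.latencies.append(start.elapsed_time(end) / 1e3)  # milliseconds -> seconds

Because the barrier is taken before delegation and again after the generator resumes, every rank enters and leaves the timed region together regardless of which helper is used.
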
12 changes: 9 additions & 3 deletions optimum_benchmark/trackers/memory.py
@@ -71,6 +71,7 @@ def __init__(self, device: str, backend: str, device_ids: Optional[str] = None):
         self.device = device
         self.backend = backend
         self.device_ids = device_ids
+        self.distributed = is_torch_distributed_available() and torch.distributed.is_initialized()
 
         LOGGER.info("\t+ Tracking RAM memory")
 
@@ -91,9 +92,6 @@ def __init__(self, device: str, backend: str, device_ids: Optional[str] = None):
                 )
             LOGGER.info(f"\t+ Tracking Allocated/Reserved memory of {num_pytorch_devices} Pytorch CUDA devices")
 
-        if is_torch_distributed_available() and torch.distributed.is_initialized():
-            torch.distributed.barrier()
-
         self.reset()
 
     def reset(self):
@@ -104,13 +102,19 @@ def reset(self):
 
     @contextmanager
     def track(self):
+        if self.distributed:
+            torch.distributed.barrier()
+
         if self.device == "cuda" and self.backend == "pytorch":
             yield from self._cuda_pytorch_memory()
         elif self.device == "cuda":
             yield from self._cuda_memory()
         else:
             yield from self._cpu_memory()
 
+        if self.distributed:
+            torch.distributed.barrier()
+
     def _cuda_pytorch_memory(self):
         torch.cuda.empty_cache()
 
@@ -129,6 +133,8 @@ def _cuda_pytorch_memory(self):
             torch.cuda.max_memory_reserved(device=device) / 1e6 for device in range(torch.cuda.device_count())
         )
 
+        torch.cuda.empty_cache()
+
     def _cuda_memory(self):
         child_connection, parent_connection = Pipe()
         memory_process = Process(
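
To see why the barriers matter, consider two ranks where one finishes its workload earlier than the other: without synchronization, the faster rank can reset counters or read results for the next measurement while the slower rank is still working, and any collective issued in that window can hang. The toy usage sketch below runs under torchrun with made-up names and a dummy workload, assuming one CUDA device per rank.

    # Toy illustration of per-rank peak CUDA memory measurement with barrier syncing.
    # Run with: torchrun --nproc_per_node=2 this_script.py  (names and workload are illustrative)
    import os
    from contextlib import contextmanager

    import torch
    import torch.distributed as dist


    @contextmanager
    def tracked_peak_memory(device: int):
        if dist.is_initialized():
            dist.barrier()  # align ranks before resetting the counters
        torch.cuda.reset_peak_memory_stats(device)
        yield
        if dist.is_initialized():
            dist.barrier()  # make sure every rank finished its workload
        peak_mb = torch.cuda.max_memory_allocated(device) / 1e6
        print(f"rank {dist.get_rank()}: peak allocated {peak_mb:.1f} MB")


    if __name__ == "__main__":
        local_rank = int(os.environ.get("LOCAL_RANK", 0))
        torch.cuda.set_device(local_rank)
        dist.init_process_group(backend="nccl")

        with tracked_peak_memory(local_rank):
            x = torch.randn(1024, 1024, device=local_rank)
            y = x @ x  # some work whose memory footprint we want to measure

        dist.destroy_process_group()
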
