catch overlapping port from find_free_port
xrsrke committed Feb 15, 2024
1 parent 558b341 commit 98046f8
Showing 4 changed files with 16 additions and 9 deletions.
1 change: 1 addition & 0 deletions .github/workflows/3d_parallelism_unit_tests.yaml
@@ -58,5 +58,6 @@ jobs:
   --color=yes \
   --durations=0 \
   --ignore tests/kernels \
+  --ignore tests/fp8 \
   --verbose \
   tests/
11 changes: 9 additions & 2 deletions src/nanotron/distributed.py
@@ -240,7 +240,7 @@ def get_rank(group: Optional[ProcessGroup] = None) -> int: # pylint: disable=fu
     return result


-def initialize_torch_distributed(port: Optional[int] = None):
+def initialize_torch_distributed():
     """Initializes torch distributed with the environment variables"""
     rank = int(os.getenv("RANK", "0"))
     world_size = int(os.getenv("WORLD_SIZE", "1"))
@@ -259,7 +259,14 @@ def initialize_torch_distributed(port: Optional[int] = None):
         backend = "gloo"

     # Call the init process.
-    port = find_free_port() if port is None else port
+    # port = find_free_port() if port is None else port
+
+    port = os.getenv("MASTER_PORT")
+    if port is None:
+        port = find_free_port()
+    else:
+        port = int(port)
+
     init_method = f"env://localhost:{port}"
     dist.init_process_group(
         init_method=init_method, backend=backend, world_size=world_size, rank=rank, timeout=dist.default_pg_timeout
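
In isolation, the new branch in initialize_torch_distributed simply prefers an externally provided MASTER_PORT over a locally probed one. A minimal standalone sketch of that resolution order; find_free_port below is a simplified stand-in for the helper in src/nanotron/utils.py, and resolve_master_port is an illustrative name, not part of the commit:

# Standalone sketch (not nanotron code): MASTER_PORT wins if set, otherwise a
# random free port is probed locally.
import os
import random
import socket


def find_free_port(min_port: int = 2000, max_port: int = 65000) -> int:
    port = random.randint(min_port, max_port)
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        sock.bind(("localhost", port))  # raises OSError if the port is already taken
        return port


def resolve_master_port() -> int:
    port = os.getenv("MASTER_PORT")
    return find_free_port() if port is None else int(port)


if __name__ == "__main__":
    os.environ["MASTER_PORT"] = "29500"   # e.g. exported by the test launcher
    print(resolve_master_port())          # 29500 on every process
    del os.environ["MASTER_PORT"]
    print(resolve_master_port())          # an independently probed port per process
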
5 changes: 2 additions & 3 deletions src/nanotron/parallel/context.py
@@ -1,5 +1,5 @@
 import os
-from typing import Literal, Optional, Tuple
+from typing import Literal, Tuple

 import numpy as np
 import torch
@@ -15,7 +15,6 @@ def __init__(
         tensor_parallel_size: int,
         pipeline_parallel_size: int,
         data_parallel_size: int,
-        port: Optional[int] = None,
         backend: DistributedBackend = "nccl",
     ):
         """Initialize parallel context."""
@@ -49,7 +48,7 @@ def __init__(
         assert backend == "nccl", "Only nccl backend is supported for now."

         if not dist.is_initialized():
-            dist.initialize_torch_distributed(port)
+            dist.initialize_torch_distributed()

         world_size = int(os.getenv("WORLD_SIZE", "1"))
         ranks = list(range(world_size))
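
For callers, the visible change is that ParallelContext no longer accepts a port argument; the rendezvous port is now resolved inside initialize_torch_distributed from MASTER_PORT (or probed). A hypothetical caller sketch, assuming the usual nanotron.parallel import path and a torchrun launch whose world size matches the product of the three parallel sizes:

# Hypothetical caller sketch; run under torchrun with WORLD_SIZE=8 (2 * 1 * 4).
import os

from nanotron.parallel import ParallelContext

os.environ.setdefault("MASTER_PORT", "29500")  # optional: pin the same port on every rank

parallel_context = ParallelContext(
    tensor_parallel_size=2,
    pipeline_parallel_size=1,
    data_parallel_size=4,
    # port=1234,  # removed by this commit; no longer accepted
)
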
8 changes: 4 additions & 4 deletions src/nanotron/utils.py
@@ -2,10 +2,10 @@
 import inspect
 import math
 import os
-from contextlib import ExitStack, contextmanager
-from typing import Callable, ContextManager, List, Optional
 import random
 import socket
+from contextlib import ExitStack, contextmanager
+from typing import Callable, ContextManager, List, Optional

 import torch
 from packaging import version
@@ -159,5 +159,5 @@ def find_free_port(min_port: int = 2000, max_port: int = 65000) -> int:
             sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
             sock.bind(("localhost", port))
             return port
-    except OSError as e:
-        raise e
+    except OSError:
+        raise Exception("Address already in use")
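
Behaviourally, find_free_port now surfaces a port collision as a plain Exception("Address already in use") instead of re-raising the OSError. An illustrative caller-side sketch of how a test harness could retry the probe instead of failing outright; find_free_port_with_retries is not part of the commit:

# Illustrative caller-side retry wrapper; not part of the commit.
from typing import Optional

from nanotron.utils import find_free_port


def find_free_port_with_retries(attempts: int = 10) -> int:
    """Retry the random-port probe a few times before giving up."""
    last_error: Optional[Exception] = None
    for _ in range(attempts):
        try:
            return find_free_port()
        except Exception as exc:  # the helper now raises "Address already in use"
            last_error = exc
    raise RuntimeError(f"no free port found after {attempts} attempts") from last_error
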
