Revert "add supports_coalescing property in c10d::Backend to determin…
Browse files Browse the repository at this point in the history
…e whether backend supports coalescing (pytorch#135338)"

This reverts commit e557444.

Reverted pytorch#135338 on behalf of https://github.com/ZainRizvi due to Sorry but this is failing internally. Please see D65663382 for more details ([comment](pytorch#135338 (comment)))
pytorchmergebot authored and Ryo-not-rio committed Dec 2, 2024
1 parent 5abe9e2 commit 2d1981d
Showing 5 changed files with 8 additions and 33 deletions.
4 changes: 0 additions & 4 deletions torch/csrc/distributed/c10d/Backend.hpp
@@ -58,10 +58,6 @@ class TORCH_API Backend : public torch::CustomClassHolder {
return false;
}

virtual bool supportsCoalescing() const {
return false;
}

virtual void startCoalescing() {
TORCH_CHECK(
false,
6 changes: 2 additions & 4 deletions torch/csrc/distributed/c10d/ProcessGroup.hpp
@@ -811,7 +811,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder {
TORCH_CHECK(
backendTypeToBackend_.find(backendType_) != backendTypeToBackend_.end(),
"Could not find the default backend type ",
uint16_t(backendType_),
backendType_,
" for Process Group with name ",
getBackendName(),
".");
@@ -832,9 +832,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder {
TORCH_CHECK(
backendTypeToBackend_.find(backendType) != backendTypeToBackend_.end(),
"Could not find backend type ",
uint16_t(backendType),
" for Process Group with name ",
backendTypeToString(backendType),
backendType,
".");
return backendTypeToBackend_.at(backendType);
}
4 changes: 0 additions & 4 deletions torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
@@ -598,10 +598,6 @@ class TORCH_API ProcessGroupNCCL : public Backend {
return true;
}

bool supportsCoalescing() const override {
return true;
}

void startCoalescing() override;

c10::intrusive_ptr<Work> endCoalescing() override;
4 changes: 0 additions & 4 deletions torch/csrc/distributed/c10d/init.cpp
@@ -2321,10 +2321,6 @@ The hook must have the following signature:
"supports_splitting",
&::c10d::Backend::supportsSplitting,
"(test whether the backend supports splitting)")
.def_property_readonly(
"supports_coalescing",
&::c10d::Backend::supportsCoalescing,
"(test whether the backend supports coalescing)")
.def(
"broadcast",
&::c10d::Backend::broadcast,
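This hunk drops the pybind11 binding that exposed the capability to Python as a read-only property, sitting right next to the `supports_splitting` property that remains. A minimal sketch of how it surfaced on a backend handle before the revert, assuming an NCCL group is already initialized; `_get_default_group` and `_get_backend` are the internal helpers visible elsewhere in this diff, and the `getattr` fallback is only there to show the attribute disappears after this commit:

```python
import torch
import torch.distributed as dist
from torch.distributed.distributed_c10d import _get_default_group

# Assumes dist.init_process_group("nccl", ...) has already run on a CUDA machine.
group = _get_default_group()
backend = group._get_backend(torch.device("cuda"))

print(backend.supports_splitting)  # still bound after this revert
# Before the revert this reported True for NCCL; afterwards the attribute is gone.
print(getattr(backend, "supports_coalescing", "removed by this revert"))
```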
23 changes: 6 additions & 17 deletions torch/distributed/distributed_c10d.py
@@ -1806,22 +1806,13 @@ def _new_process_group_helper(
group_rank,
group_size,
)
backend_config = BackendConfig(backend)
# Set the default backend when only single backend is passed in.
if "," not in str(backend) and ":" not in str(backend):
assert backend in Backend.backend_type_map, f"Unknown backend type {backend}"
if backend == Backend.UNDEFINED:
# Currently when backend is UNDEFINED, both ``gloo`` and ``nccl`` backends
# will be created, we use nccl(if cuda is available) or gloo as default
# backend so we can correctly call getDefaultBackend which in ProcessGroup.
if Backend.NCCL in backend_config.get_device_backend_map().values():
pg._set_default_backend(Backend.backend_type_map[Backend.NCCL])
else:
pg._set_default_backend(Backend.backend_type_map[Backend.GLOO])
else:
pg._set_default_backend(Backend.backend_type_map[backend])
pg._set_default_backend(Backend.backend_type_map[backend])
if device_id:
pg.bound_device_id = device_id
backend_config = BackendConfig(backend)
backend_class: torch._C._distributed_c10d.Backend
for device, backend_str in backend_config.get_device_backend_map().items():
# Use the group name as prefix in the default store, such that
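The comment block deleted above carries the reasoning this revert removes: when the caller passes `Backend.UNDEFINED`, both `gloo` and `nccl` backends may be created, and `nccl` (when present in the device-to-backend map, i.e. CUDA is in play) was preferred as the group's default backend, falling back to `gloo` otherwise. A standalone restatement of that selection rule, where `pick_default_backend` is a hypothetical name used only for illustration:

```python
def pick_default_backend(device_backend_map: dict) -> str:
    """Illustrative restatement of the reverted rule: prefer nccl when it
    appears in the device->backend map, otherwise fall back to gloo.
    Not a torch.distributed API."""
    if "nccl" in device_backend_map.values():
        return "nccl"
    return "gloo"

print(pick_default_backend({"cpu": "gloo", "cuda": "nccl"}))  # nccl
print(pick_default_backend({"cpu": "gloo"}))                  # gloo
```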
@@ -2028,7 +2019,7 @@ def destroy_process_group(group: Optional[ProcessGroup] = None):
# alive until all works and hooks are done. The current implementation does the
# latter. Therefore, we explicitly call _wait_for_pending_works() here to wait
# for the pending hooks to finish.
if type(pg) == ProcessGroup and pg._has_hooks():
if pg.name().lower() == "nccl" and pg._has_hooks():
pg._wait_for_pending_works()

if group is None or group == GroupMember.WORLD:
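The surrounding comments explain why this wait exists at all: pending communication hooks must be allowed to finish before the process group is torn down. With the revert applied, the wait is again gated on the backend name being `nccl` rather than on any plain `ProcessGroup` that has hooks. A minimal teardown sketch showing where `destroy_process_group` sits in a typical run, assuming the usual rendezvous environment variables are set:

```python
import torch
import torch.distributed as dist

# Assumes MASTER_ADDR / MASTER_PORT / RANK / WORLD_SIZE are set in the environment.
dist.init_process_group(backend="gloo")

t = torch.ones(4)
dist.all_reduce(t)  # some work on the default group

# Runs the guarded wait above: with this revert, pending hook work is awaited
# only when the backend reports itself as "nccl".
dist.destroy_process_group()
```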
Expand Down Expand Up @@ -2568,17 +2559,15 @@ def batch_isend_irecv(p2p_op_list):
"""
_check_p2p_op_list(p2p_op_list)
group = p2p_op_list[0].group
if group is None:
group = _get_default_group()
device = p2p_op_list[0].tensor.device
if group._get_backend(device).supports_coalescing:
# backend support coalescing
if device.type == "cuda":
# NCCL style coalescing
with _coalescing_manager(group, device, async_ops=True) as cm:
for p2p_op in p2p_op_list:
p2p_op.op(p2p_op.tensor, p2p_op.peer, p2p_op.group, p2p_op.tag)
return cm.works
else:
# backend not support coalescing
# Backward support for Gloo
reqs = []
for p2p_op in p2p_op_list:
work = p2p_op.op(p2p_op.tensor, p2p_op.peer, p2p_op.group, p2p_op.tag)
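Callers invoke `batch_isend_irecv` exactly as before; the hunk only changes how the function chooses between the coalesced path and the per-op fallback (a `device.type == "cuda"` check instead of the removed backend capability). A typical two-rank usage sketch on CPU with gloo, assuming the group is already initialized:

```python
import torch
import torch.distributed as dist

# Assumes a 2-rank gloo group is already initialized.
rank = dist.get_rank()
peer = (rank + 1) % 2

send_tensor = torch.arange(4, dtype=torch.float32) + rank * 4
recv_tensor = torch.empty(4)

ops = [
    dist.P2POp(dist.isend, send_tensor, peer),
    dist.P2POp(dist.irecv, recv_tensor, peer),
]

# On CUDA devices the call takes the coalesced branch above; on CPU it falls
# back to issuing each P2P op individually.
reqs = dist.batch_isend_irecv(ops)
for req in reqs:
    req.wait()
```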
