diff --git a/torch/csrc/distributed/c10d/Backend.hpp b/torch/csrc/distributed/c10d/Backend.hpp index 56ac58119009c..06efbcac29712 100644 --- a/torch/csrc/distributed/c10d/Backend.hpp +++ b/torch/csrc/distributed/c10d/Backend.hpp @@ -58,10 +58,6 @@ class TORCH_API Backend : public torch::CustomClassHolder { return false; } - virtual bool supportsCoalescing() const { - return false; - } - virtual void startCoalescing() { TORCH_CHECK( false, diff --git a/torch/csrc/distributed/c10d/ProcessGroup.hpp b/torch/csrc/distributed/c10d/ProcessGroup.hpp index 0000e7088fa84..5cba3a39629d4 100644 --- a/torch/csrc/distributed/c10d/ProcessGroup.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroup.hpp @@ -811,7 +811,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { TORCH_CHECK( backendTypeToBackend_.find(backendType_) != backendTypeToBackend_.end(), "Could not find the default backend type ", - uint16_t(backendType_), + backendType_, " for Process Group with name ", getBackendName(), "."); @@ -832,9 +832,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder { TORCH_CHECK( backendTypeToBackend_.find(backendType) != backendTypeToBackend_.end(), "Could not find backend type ", - uint16_t(backendType), - " for Process Group with name ", - backendTypeToString(backendType), + backendType, "."); return backendTypeToBackend_.at(backendType); } diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp index 9f9034f4daf5c..1d51792adb974 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp @@ -598,10 +598,6 @@ class TORCH_API ProcessGroupNCCL : public Backend { return true; } - bool supportsCoalescing() const override { - return true; - } - void startCoalescing() override; c10::intrusive_ptr endCoalescing() override; diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 12f361f7e3755..0a89690522e2d 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -2321,10 +2321,6 @@ The hook must have the following signature: "supports_splitting", &::c10d::Backend::supportsSplitting, "(test whether the backend supports splitting)") - .def_property_readonly( - "supports_coalescing", - &::c10d::Backend::supportsCoalescing, - "(test whether the backend supports coalescing)") .def( "broadcast", &::c10d::Backend::broadcast, diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 5e3071051c535..61cb222233108 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -1806,22 +1806,13 @@ def _new_process_group_helper( group_rank, group_size, ) - backend_config = BackendConfig(backend) # Set the default backend when only single backend is passed in. if "," not in str(backend) and ":" not in str(backend): assert backend in Backend.backend_type_map, f"Unknown backend type {backend}" - if backend == Backend.UNDEFINED: - # Currently when backend is UNDEFINED, both ``gloo`` and ``nccl`` backends - # will be created, we use nccl(if cuda is available) or gloo as default - # backend so we can correctly call getDefaultBackend which in ProcessGroup. - if Backend.NCCL in backend_config.get_device_backend_map().values(): - pg._set_default_backend(Backend.backend_type_map[Backend.NCCL]) - else: - pg._set_default_backend(Backend.backend_type_map[Backend.GLOO]) - else: - pg._set_default_backend(Backend.backend_type_map[backend]) + pg._set_default_backend(Backend.backend_type_map[backend]) if device_id: pg.bound_device_id = device_id + backend_config = BackendConfig(backend) backend_class: torch._C._distributed_c10d.Backend for device, backend_str in backend_config.get_device_backend_map().items(): # Use the group name as prefix in the default store, such that @@ -2028,7 +2019,7 @@ def destroy_process_group(group: Optional[ProcessGroup] = None): # alive until all works and hooks are done. The current implementation does the # latter. Therefore, we explicitly call _wait_for_pending_works() here to wait # for the pending hooks to finish. - if type(pg) == ProcessGroup and pg._has_hooks(): + if pg.name().lower() == "nccl" and pg._has_hooks(): pg._wait_for_pending_works() if group is None or group == GroupMember.WORLD: @@ -2568,17 +2559,15 @@ def batch_isend_irecv(p2p_op_list): """ _check_p2p_op_list(p2p_op_list) group = p2p_op_list[0].group - if group is None: - group = _get_default_group() device = p2p_op_list[0].tensor.device - if group._get_backend(device).supports_coalescing: - # backend support coalescing + if device.type == "cuda": + # NCCL style coalescing with _coalescing_manager(group, device, async_ops=True) as cm: for p2p_op in p2p_op_list: p2p_op.op(p2p_op.tensor, p2p_op.peer, p2p_op.group, p2p_op.tag) return cm.works else: - # backend not support coalescing + # Backward support for Gloo reqs = [] for p2p_op in p2p_op_list: work = p2p_op.op(p2p_op.tensor, p2p_op.peer, p2p_op.group, p2p_op.tag)