Revert "add supports_coalescing property in c10d::Backend to determin…
Browse files Browse the repository at this point in the history
…e whether backend supports coalescing (pytorch#135338)"

This reverts commit e557444.

Reverted pytorch#135338 on behalf of https://github.com/ZainRizvi due to Sorry but this is failing internally. Please see D65663382 for more details ([comment](pytorch#135338 (comment)))
pytorchmergebot authored and Ryo-not-rio committed Dec 2, 2024
1 parent 5abe9e2 commit 2d1981d
Showing 5 changed files with 8 additions and 33 deletions.
4 changes: 0 additions & 4 deletions torch/csrc/distributed/c10d/Backend.hpp
@@ -58,10 +58,6 @@ class TORCH_API Backend : public torch::CustomClassHolder {
return false;
}

virtual bool supportsCoalescing() const {
return false;
}

virtual void startCoalescing() {
TORCH_CHECK(
false,
6 changes: 2 additions & 4 deletions torch/csrc/distributed/c10d/ProcessGroup.hpp
@@ -811,7 +811,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder {
TORCH_CHECK(
backendTypeToBackend_.find(backendType_) != backendTypeToBackend_.end(),
"Could not find the default backend type ",
uint16_t(backendType_),
backendType_,
" for Process Group with name ",
getBackendName(),
".");
@@ -832,9 +832,7 @@ class TORCH_API ProcessGroup : public torch::CustomClassHolder {
TORCH_CHECK(
backendTypeToBackend_.find(backendType) != backendTypeToBackend_.end(),
"Could not find backend type ",
uint16_t(backendType),
" for Process Group with name ",
backendTypeToString(backendType),
backendType,
".");
return backendTypeToBackend_.at(backendType);
}
4 changes: 0 additions & 4 deletions torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
@@ -598,10 +598,6 @@ class TORCH_API ProcessGroupNCCL : public Backend {
return true;
}

bool supportsCoalescing() const override {
return true;
}

void startCoalescing() override;

c10::intrusive_ptr<Work> endCoalescing() override;
4 changes: 0 additions & 4 deletions torch/csrc/distributed/c10d/init.cpp
@@ -2321,10 +2321,6 @@ The hook must have the following signature:
"supports_splitting",
&::c10d::Backend::supportsSplitting,
"(test whether the backend supports splitting)")
.def_property_readonly(
"supports_coalescing",
&::c10d::Backend::supportsCoalescing,
"(test whether the backend supports coalescing)")
.def(
"broadcast",
&::c10d::Backend::broadcast,
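This hunk drops the pybind11 binding that exposed the capability to Python as a read-only property, sitting right next to the `supports_splitting` property that remains. A minimal sketch of how it surfaced on a backend handle before the revert, assuming an NCCL group is already initialized; `_get_default_group` and `_get_backend` are the internal helpers visible elsewhere in this diff, and the `getattr` fallback is only there to show the attribute disappears after this commit:

```python
import torch
import torch.distributed as dist
from torch.distributed.distributed_c10d import _get_default_group

# Assumes dist.init_process_group("nccl", ...) has already run on a CUDA machine.
group = _get_default_group()
backend = group._get_backend(torch.device("cuda"))

print(backend.supports_splitting)  # still bound after this revert
# Before the revert this reported True for NCCL; afterwards the attribute is gone.
print(getattr(backend, "supports_coalescing", "removed by this revert"))
```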
23 changes: 6 additions & 17 deletions torch/distributed/distributed_c10d.py
@@ -1806,22 +1806,13 @@ def _new_process_group_helper(
group_rank,
group_size,
)
backend_config = BackendConfig(backend)
# Set the default backend when only single backend is passed in.
if "," not in str(backend) and ":" not in str(backend):
assert backend in Backend.backend_type_map, f"Unknown backend type {backend}"
if backend == Backend.UNDEFINED:
# Currently when backend is UNDEFINED, both ``gloo`` and ``nccl`` backends
# will be created, we use nccl(if cuda is available) or gloo as default
# backend so we can correctly call getDefaultBackend which in ProcessGroup.
if Backend.NCCL in backend_config.get_device_backend_map().values():
pg._set_default_backend(Backend.backend_type_map[Backend.NCCL])
else:
pg._set_default_backend(Backend.backend_type_map[Backend.GLOO])
else:
pg._set_default_backend(Backend.backend_type_map[backend])
pg._set_default_backend(Backend.backend_type_map[backend])
if device_id:
pg.bound_device_id = device_id
backend_config = BackendConfig(backend)
backend_class: torch._C._distributed_c10d.Backend
for device, backend_str in backend_config.get_device_backend_map().items():
# Use the group name as prefix in the default store, such that
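The comment block deleted above carries the reasoning this revert removes: when the caller passes `Backend.UNDEFINED`, both `gloo` and `nccl` backends may be created, and `nccl` (when present in the device-to-backend map, i.e. CUDA is in play) was preferred as the group's default backend, falling back to `gloo` otherwise. A standalone restatement of that selection rule, where `pick_default_backend` is a hypothetical name used only for illustration:

```python
def pick_default_backend(device_backend_map: dict) -> str:
    """Illustrative restatement of the reverted rule: prefer nccl when it
    appears in the device->backend map, otherwise fall back to gloo.
    Not a torch.distributed API."""
    if "nccl" in device_backend_map.values():
        return "nccl"
    return "gloo"

print(pick_default_backend({"cpu": "gloo", "cuda": "nccl"}))  # nccl
print(pick_default_backend({"cpu": "gloo"}))                  # gloo
```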
@@ -2028,7 +2019,7 @@ def destroy_process_group(group: Optional[ProcessGroup] = None):
# alive until all works and hooks are done. The current implementation does the
# latter. Therefore, we explicitly call _wait_for_pending_works() here to wait
# for the pending hooks to finish.
if type(pg) == ProcessGroup and pg._has_hooks():
if pg.name().lower() == "nccl" and pg._has_hooks():
pg._wait_for_pending_works()

if group is None or group == GroupMember.WORLD:
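The surrounding comments explain why this wait exists at all: pending communication hooks must be allowed to finish before the process group is torn down. With the revert applied, the wait is again gated on the backend name being `nccl` rather than on any plain `ProcessGroup` that has hooks. A minimal teardown sketch showing where `destroy_process_group` sits in a typical run, assuming the usual rendezvous environment variables are set:

```python
import torch
import torch.distributed as dist

# Assumes MASTER_ADDR / MASTER_PORT / RANK / WORLD_SIZE are set in the environment.
dist.init_process_group(backend="gloo")

t = torch.ones(4)
dist.all_reduce(t)  # some work on the default group

# Runs the guarded wait above: with this revert, pending hook work is awaited
# only when the backend reports itself as "nccl".
dist.destroy_process_group()
```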
Expand Down Expand Up @@ -2568,17 +2559,15 @@ def batch_isend_irecv(p2p_op_list):
"""
_check_p2p_op_list(p2p_op_list)
group = p2p_op_list[0].group
if group is None:
group = _get_default_group()
device = p2p_op_list[0].tensor.device
if group._get_backend(device).supports_coalescing:
# backend support coalescing
if device.type == "cuda":
# NCCL style coalescing
with _coalescing_manager(group, device, async_ops=True) as cm:
for p2p_op in p2p_op_list:
p2p_op.op(p2p_op.tensor, p2p_op.peer, p2p_op.group, p2p_op.tag)
return cm.works
else:
# backend not support coalescing
# Backward support for Gloo
reqs = []
for p2p_op in p2p_op_list:
work = p2p_op.op(p2p_op.tensor, p2p_op.peer, p2p_op.group, p2p_op.tag)
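Callers invoke `batch_isend_irecv` exactly as before; the hunk only changes how the function chooses between the coalesced path and the per-op fallback (a `device.type == "cuda"` check instead of the removed backend capability). A typical two-rank usage sketch on CPU with gloo, assuming the group is already initialized:

```python
import torch
import torch.distributed as dist

# Assumes a 2-rank gloo group is already initialized.
rank = dist.get_rank()
peer = (rank + 1) % 2

send_tensor = torch.arange(4, dtype=torch.float32) + rank * 4
recv_tensor = torch.empty(4)

ops = [
    dist.P2POp(dist.isend, send_tensor, peer),
    dist.P2POp(dist.irecv, recv_tensor, peer),
]

# On CUDA devices the call takes the coalesced branch above; on CPU it falls
# back to issuing each P2P op individually.
reqs = dist.batch_isend_irecv(ops)
for req in reqs:
    req.wait()
```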
