[T170073014] Rewrite distributed examples for Tensor Parallel, Sequence Parallel, 2D (FSDP + TP) #1201
From L47-L62, I don't think we need these lines at all; `init_device_mesh` covers all of the setup done there, so users don't need this sophisticated setup.
Will reduce most of these, but the `_rank` variable and the `rank_print` function are there to make the training info output cleaner for the user, rather than having every print fire on every GPU and overwhelm the log, so I do want to keep those.
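A minimal sketch of such a helper, under the assumption (the exact signature is not shown in the thread) that `rank_print` takes the caller's rank and only emits output from rank 0:

```python
def rank_print(msg: str, rank: int) -> None:
    """Print `msg` only from rank 0, so training info appears once
    instead of being duplicated by every GPU's process."""
    if rank == 0:
        print(f"[rank {rank}] {msg}", flush=True)
```

In a real script the rank would come from the device mesh (e.g. `device_mesh.get_rank()`) rather than being passed in by hand.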
After `init_device_mesh`, you get a device mesh object and can call `device_mesh.get_rank()` to obtain the rank information. I would recommend not using the `init_process_group` call anymore.
Oh, I did not realize we can now skip `init_process_group`. That will definitely help clean things up here. Will update to that.