[fp8] support fp8 communication and fp8 training for Colossalai #6012

Merged: 75 commits, merged Aug 27, 2024

Commits (75)
f5a52e1
fp8 operators for compressed communication
BurkeHulk Jul 1, 2024
6991819
Merge branch 'hpcaitech:main' into feature/fp8_comm
BurkeHulk Jul 4, 2024
e17f835
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 4, 2024
dbfa7d3
fix typo
GuangyaoZhang Jul 10, 2024
1e19594
fix scaling algorithm in FP8 casting
BurkeHulk Jul 12, 2024
e881901
support fp8 communication in pipeline parallelism
BurkeHulk Jul 12, 2024
6601874
add fp8_communication flag in the script
BurkeHulk Jul 12, 2024
1f1b856
Merge remote-tracking branch 'origin/feature/fp8_comm' into feature/f…
BurkeHulk Jul 12, 2024
51f916b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 12, 2024
9470701
Merge pull request #5885 from BurkeHulk/feature/fp8_comm
BurkeHulk Jul 16, 2024
457a0de
shardformer fp8
GuangyaoZhang Jul 8, 2024
5a310b9
fix rebase
GuangyaoZhang Jul 17, 2024
6a20f07
remove all to all
GuangyaoZhang Jul 17, 2024
d0bdb51
Merge pull request #5899 from BurkeHulk/SP_fp8
GuangyaoZhang Jul 18, 2024
5b969fd
fix shardformer fp8 communication training degradation
GuangyaoZhang Jul 18, 2024
62661cd
Merge pull request #5921 from BurkeHulk/fp8_fix
GuangyaoZhang Jul 18, 2024
5fd0592
[fp8] support all-gather flat tensor (#5932)
ver217 Jul 24, 2024
ae486ce
[fp8] add fp8 comm for low level zero
ver217 Aug 2, 2024
91e596d
[test] add zero fp8 test case
ver217 Aug 2, 2024
c297e21
Merge pull request #5961 from ver217/feature/zeor-fp8
BurkeHulk Aug 2, 2024
53cb960
[Feature] llama shardformer fp8 support (#5938)
GuangyaoZhang Aug 5, 2024
0c10afd
[FP8] rebase main (#5963)
flybird11111 Aug 6, 2024
afb26de
[fp8]support all2all fp8 (#5953)
flybird11111 Aug 6, 2024
76ea164
[fp8] add fp8 linear (#5967)
ver217 Aug 7, 2024
ccabcf6
[fp8] support fp8 amp for hybrid parallel plugin (#5975)
ver217 Aug 7, 2024
7739629
fix (#5976)
flybird11111 Aug 7, 2024
b480eec
[Feature]: support FP8 communication in DDP, FSDP, Gemini (#5928)
BurkeHulk Aug 8, 2024
4b9bec8
[test ci]Feature/fp8 comm (#5981)
flybird11111 Aug 8, 2024
8241c0c
[fp8] support gemini plugin (#5978)
ver217 Aug 9, 2024
e4aadee
[fp8] use torch compile (torch >= 2.3.0) (#5979)
botbw Aug 9, 2024
f1a3a32
[fp8]Moe support fp8 communication (#5977)
flybird11111 Aug 9, 2024
b2483c8
[fp8] support hybrid parallel plugin (#5982)
wangbluo Aug 12, 2024
0978080
[fp8] refactor fp8 linear with compile (#5993)
ver217 Aug 13, 2024
597b206
[fp8] support asynchronous FP8 communication (#5997)
flybird11111 Aug 14, 2024
88fa096
[fp8] update torch.compile for linear_fp8 to >= 2.4.0 (#6004)
botbw Aug 15, 2024
1a2e90d
[fp8] linear perf enhancement
botbw Aug 15, 2024
20722a8
[fp8]update reduce-scatter test (#6002)
flybird11111 Aug 15, 2024
3f09a61
[fp8] add use_fp8 option for MoeHybridParallelPlugin (#6009)
wangbluo Aug 16, 2024
0a51319
[fp8] zero support fp8 linear. (#6006)
flybird11111 Aug 16, 2024
4cf79fa
merge
wangbluo Aug 17, 2024
81272e9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 17, 2024
02636c5
fix the merge
wangbluo Aug 19, 2024
52289e4
Merge branch 'fp8_merge' of https://github.com/wangbluo/ColossalAI in…
wangbluo Aug 19, 2024
1a5847e
fix the merge
wangbluo Aug 19, 2024
3353042
fix the merge
wangbluo Aug 19, 2024
64aad96
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 19, 2024
4c82bfc
fix the merge
wangbluo Aug 19, 2024
0d8e82a
Merge branch 'fp8_merge' of https://github.com/wangbluo/ColossalAI in…
wangbluo Aug 19, 2024
12b4401
fix
wangbluo Aug 19, 2024
2eb3683
fix
wangbluo Aug 19, 2024
88b3f06
fix the merge
wangbluo Aug 19, 2024
1f703e0
fix
wangbluo Aug 19, 2024
5382311
fix
wangbluo Aug 20, 2024
f7acfa1
fix
wangbluo Aug 20, 2024
2ee6235
fix
wangbluo Aug 20, 2024
2e4cbe3
fix
wangbluo Aug 20, 2024
2d362ac
fix merge
wangbluo Aug 20, 2024
eb5ba40
fix the merge
wangbluo Aug 21, 2024
193030f
fix
wangbluo Aug 21, 2024
6aface9
fix
wangbluo Aug 21, 2024
698c8b9
fix
wangbluo Aug 21, 2024
8b8e282
fix
wangbluo Aug 21, 2024
eea37da
[fp8] Merge feature/fp8_comm to main branch of Colossalai (#6016)
wangbluo Aug 22, 2024
d77e66a
Merge pull request #6023 from wangbluo/fp8_merge
wangbluo Aug 22, 2024
971b16a
fix
wangbluo Aug 22, 2024
a292554
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 22, 2024
afe845f
Merge pull request #6024 from wangbluo/fix_merge
wangbluo Aug 22, 2024
caab4a3
Merge branch 'main' into feature/fp8_comm
ver217 Aug 22, 2024
0bc9a87
Update train_dpo.py
flybird11111 Aug 23, 2024
3b0df30
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 23, 2024
9e76764
Update low_level_zero_plugin.py
flybird11111 Aug 23, 2024
0bf46c5
Merge pull request #6029 from hpcaitech/flybird11111-patch-1
wangbluo Aug 23, 2024
dae3999
fix
wangbluo Aug 26, 2024
80d24ae
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 26, 2024
4a6f31e
Merge pull request #6033 from wangbluo/fix
wangbluo Aug 26, 2024
Files changed
3 changes: 2 additions & 1 deletion .github/workflows/example_check_on_pr.yml
@@ -9,6 +9,7 @@ on:
paths:
- "examples/**"
- "!examples/**.md"
- ".github/workflows/example_check_on_pr.yml"

jobs:
# This is for changed example files detect and output a matrix containing all the corresponding directory name.
@@ -107,7 +108,7 @@ jobs:

- name: Install Colossal-AI
run: |
BUILD_EXT=1 pip install -v .
BUILD_EXT=1 pip install -v -e .

- name: Store Colossal-AI Cache
run: |
4 changes: 4 additions & 0 deletions colossalai/booster/plugin/gemini_plugin.py
@@ -366,7 +366,9 @@ def __init__(
enable_jit_fused: bool = False,
enable_sequence_overlap: bool = False,
enable_async_reduce: bool = True,
use_fp8: bool = False,
verbose: bool = False,
fp8_communication: bool = False,
) -> None:
super().__init__()
assert precision in SUPPORTED_PRECISION, f"precision {precision} is not supported"
@@ -401,6 +403,8 @@ def __init__(
master_weights=master_weights,
max_prefetch=max_prefetch,
enable_async_reduce=enable_async_reduce,
fp8_communication=fp8_communication,
use_fp8=use_fp8,
)
self.zero_optim_config = dict(
gpu_margin_mem_ratio=gpu_margin_mem_ratio,
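The two flags added to GeminiPlugin above are independent: use_fp8 routes eligible matmuls through the FP8 linear kernel, while fp8_communication compresses Gemini's collectives. Below is a minimal, hypothetical usage sketch; the toy model and optimizer are placeholders, and a launched distributed environment plus FP8-capable GPUs are assumed.

```python
# Sketch only: toy model and optimizer; a launched distributed environment,
# FP8-capable GPUs, and a recent PyTorch (>= 2.4 for the compiled FP8 linear,
# per the commit history above) are assumed.
import torch
import torch.nn as nn
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin

plugin = GeminiPlugin(
    precision="bf16",
    use_fp8=True,            # run eligible linear layers through the FP8 kernel
    fp8_communication=True,  # compress Gemini's collective communication to FP8
)
booster = Booster(plugin=plugin)

model = nn.Sequential(nn.Linear(1024, 4096), nn.GELU(), nn.Linear(4096, 1024))
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
model, optimizer, *_ = booster.boost(model, optimizer)
```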
23 changes: 21 additions & 2 deletions colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -31,6 +31,7 @@
from colossalai.pipeline.schedule import InterleavedSchedule, OneForwardOneBackwardSchedule
from colossalai.pipeline.stage_manager import PipelineStageManager
from colossalai.quantization import BnbQuantizationConfig, quantize_model
from colossalai.quantization.fp8_hook import FP8Hook
from colossalai.shardformer import GradientCheckpointConfig, ShardConfig, ShardFormer
from colossalai.shardformer.layer.utils import SeqParallelUtils, is_share_sp_tp
from colossalai.shardformer.policies.base_policy import Policy
@@ -66,6 +67,7 @@ def __init__(
ddp_config: dict,
custom_policy: Policy,
overlap_allgather: bool = False,
use_fp8: bool = False,
) -> None:
self.stage_manager = shard_config.pipeline_stage_manager
self.shard_config = shard_config
@@ -75,6 +77,7 @@
self.use_ddp = use_ddp
self.require_grad_sync = True
self.overlap_allgather = overlap_allgather
self.use_fp8 = use_fp8

shardformer = ShardFormer(shard_config)
if custom_policy is not None:
@@ -112,6 +115,9 @@ def __init__(
module = DDP(module, process_group=dp_group, **ddp_config)

super().__init__(module)
self.op_hooks = []
if use_fp8:
self.op_hooks.append(FP8Hook())
if overlap_allgather:
self.op_hook = ZeroOpHook()
for p in module.parameters():
@@ -223,7 +229,11 @@ def _force_wait_all_gather(self):
wait_all_gather_handle(p)

def _wait_all_gather(self):
return ColoParamOpHookManager.use_hooks(self.op_hook) if self.overlap_allgather else nullcontext()
return (
ColoParamOpHookManager.use_hooks(*self.op_hooks)
if (self.overlap_allgather or self.use_fp8)
else nullcontext()
)


def get_param_info(optim: Optimizer):
@@ -969,6 +979,7 @@ class HybridParallelPlugin(PipelinePluginBase):
gradient_checkpoint_config (GradientCheckpointConfig, optional): Configuration for gradient checkpointing. Defaults to None.
enable_metadata_cache (bool, optional): Whether to enable metadata cache for pipeline parallelism. Defaults to True.
make_vocab_size_divisible_by (int, optional): it's used when padding the vocabulary size, to make it choose a faster kernel. Defaults to 64.
fp8_communication (bool, optional): Whether to enable fp8 communication in model parallelism
overlap_p2p (bool, optional): Whether to overlap the p2p communication in pipeline parallelism
inner_ring_size (int, optional): The inner ring size of 2D Ring Attention when sp mode is "ring_attn".
It's advisable to not tune this (especially in single-node settings) and let it be heuristically set based on topology by default.
@@ -1020,6 +1031,8 @@ def __init__(
dp_outside: bool = True,
overlap_p2p: bool = True,
overlap_allgather: bool = False,
fp8_communication: bool = False,
use_fp8: bool = False,
inner_ring_size: int = None,
) -> None:
super().__init__()
@@ -1069,8 +1082,10 @@ def __init__(
self.enable_flash_attention = enable_flash_attention
self.enable_jit_fused = enable_jit_fused
self.enable_sequence_parallelism = enable_sequence_parallelism
self.use_fp8 = use_fp8
if dp_outside:
self.dp_axis, self.pp_axis, self.tp_axis, self.sp_axis = 0, 1, 2, 3
self.pg_mesh = ProcessGroupMesh(self.dp_size, self.pp_size, self.tp_size, self.sp_size)
if sequence_parallelism_mode == "ring_attn":
# Swap tp and sp since 2D Ring has better inter-node latency
self.pg_mesh = ProcessGroupMesh(self.dp_size, self.pp_size, self.sp_size, self.tp_size)
@@ -1117,13 +1132,15 @@ def __init__(
microbatch_size=microbatch_size,
enable_metadata_cache=enable_metadata_cache,
overlap_p2p=overlap_p2p,
fp8_communication=fp8_communication,
)
elif pp_style == "1f1b":
self.schedule = OneForwardOneBackwardSchedule(
stage_manager=self.stage_manager,
num_microbatches=num_microbatches,
microbatch_size=microbatch_size,
enable_metadata_cache=enable_metadata_cache,
fp8_communication=fp8_communication,
)
else:
raise NotImplementedError()
@@ -1158,6 +1175,7 @@ def __init__(
parallel_output=parallel_output,
make_vocab_size_divisible_by=make_vocab_size_divisible_by,
gradient_checkpoint_config=gradient_checkpoint_config,
fp8_communication=fp8_communication,
inner_ring_size=inner_ring_size,
)
self.amp_config = dict(
@@ -1250,7 +1268,7 @@ def configure(
use_ddp = (self.dp_size > 1 and self.pp_size == 1 and self.zero_stage == 0) or (
self.dp_size == 1 and self.pp_size == 1
)

# sync gradients across DP * SP ranks
# Apply Hybrid ZeRO across DP * SP ranks
if self.enable_sequence_parallelism and not is_share_sp_tp(self.sequence_parallelism_mode):
dp_group = self.pg_mesh.create_group_along_axis([self.dp_axis, self.sp_axis])
@@ -1268,6 +1286,7 @@
ddp_config=self.ddp_config,
custom_policy=self.custom_policy,
overlap_allgather=(self.zero_stage > 0 and self.zero_config["overlap_allgather"]),
use_fp8=self.use_fp8,
)
if optimizer is not None and not isinstance(optimizer, OptimizerWrapper):
if zero_stage == 0:
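In HybridParallelPlugin, fp8_communication is threaded into the pipeline schedules and the ShardConfig, while use_fp8 attaches an FP8Hook to the wrapped module so parameter ops take the FP8 linear path. A hedged sketch of passing the new options; the Llama model from transformers, the parallel sizes, and the surrounding launch are illustrative assumptions, not part of this PR.

```python
# Sketch only: 2-way tensor parallelism on a toy Llama; all values are illustrative.
import torch
from transformers import LlamaConfig, LlamaForCausalLM
from colossalai.booster import Booster
from colossalai.booster.plugin import HybridParallelPlugin

plugin = HybridParallelPlugin(
    tp_size=2,
    pp_size=1,
    zero_stage=1,
    precision="bf16",
    use_fp8=True,            # register FP8Hook on the wrapped module
    fp8_communication=True,  # FP8-compressed TP/SP/PP communication
)
booster = Booster(plugin=plugin)

model = LlamaForCausalLM(LlamaConfig(hidden_size=1024, num_hidden_layers=4))
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
model, optimizer, *_ = booster.boost(model, optimizer)
```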
20 changes: 18 additions & 2 deletions colossalai/booster/plugin/low_level_zero_plugin.py
@@ -34,6 +34,7 @@
from colossalai.logging import get_dist_logger
from colossalai.nn.optimizer import DistGaloreAwamW, cast_to_distributed
from colossalai.quantization import BnbQuantizationConfig, quantize_model
from colossalai.quantization.fp8_hook import FP8Hook
from colossalai.tensor.colo_parameter import ColoParameter
from colossalai.tensor.param_op_hook import ColoParamOpHookManager
from colossalai.zero import LowLevelZeroOptimizer
@@ -62,7 +63,12 @@ class OptimizerParamCheckState(enum.Enum):

class LowLevelZeroModel(ModelWrapper, AMPModelMixin):
def __init__(
self, module: nn.Module, precision: str, overlap_allgather: bool = False, cast_inputs: bool = True
self,
module: nn.Module,
precision: str,
overlap_allgather: bool = False,
cast_inputs: bool = True,
use_fp8: bool = False,
) -> None:
super().__init__(module)
self.dtype = None
@@ -75,11 +81,16 @@ def __init__(
module = module.to(get_accelerator().get_current_device())
self.module = module
self.convert_fn = None
self.use_fp8 = use_fp8
if self.dtype is not None and cast_inputs:
self.convert_fn = partial(_convert_floating_point, dtype=self.dtype)
self.overlap_allgather = overlap_allgather
self.op_hooks = []
if overlap_allgather:
self.op_hook = ZeroOpHook()
self.op_hooks.append(ZeroOpHook())
if use_fp8:
self.op_hooks.append(FP8Hook())
if overlap_allgather or use_fp8:
for p in module.parameters():
if p.requires_grad and type(p) is not ColoParameter:
p.__class__ = ColoParameter
@@ -337,6 +348,8 @@ def __init__(
master_weights: bool = True,
verbose: bool = False,
cast_inputs: bool = True,
fp8_communication: bool = False,
use_fp8: bool = False,
) -> None:
super().__init__()
assert stage in (1, 2), f"LowLevelZeroPlugin only supports stage 1/2 training"
@@ -360,12 +373,14 @@ def __init__(
cpu_offload=cpu_offload,
master_weights=master_weights,
overlap_allgather=overlap_allgather,
fp8_communication=fp8_communication,
)
self.lora_enabled = False
self.verbose = verbose
self.logger = get_dist_logger()
self.cast_inputs = cast_inputs

self.use_fp8 = use_fp8
# set class name with stage, for better error message
setattr(self.__class__, "__name__", f"LowLevelZeroPlugin_ZeRO-{stage}")

@@ -484,6 +499,7 @@ def configure(
self.precision,
overlap_allgather=self.zero_optim_kwargs["overlap_allgather"],
cast_inputs=self.cast_inputs,
use_fp8=self.use_fp8,
)

# TODO: Support Galore + ZeRO
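For the low-level ZeRO plugin, fp8_communication is forwarded into the ZeRO optimizer kwargs (FP8-compressed reduce-scatter/all-gather), and use_fp8 adds an FP8Hook alongside the optional ZeroOpHook. A minimal sketch under the same assumptions as above; the model and hyperparameters are placeholders.

```python
# Sketch only: placeholder model/optimizer; distributed launch assumed.
import torch
import torch.nn as nn
from colossalai.booster import Booster
from colossalai.booster.plugin import LowLevelZeroPlugin

plugin = LowLevelZeroPlugin(
    stage=2,                 # the plugin asserts stage in (1, 2)
    precision="bf16",
    fp8_communication=True,  # FP8-compress ZeRO's gradient/parameter collectives
    use_fp8=True,            # FP8Hook for the FP8 linear kernel
)
booster = Booster(plugin=plugin)

model = nn.Sequential(nn.Linear(2048, 8192), nn.GELU(), nn.Linear(8192, 2048))
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
model, optimizer, *_ = booster.boost(model, optimizer)
```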
5 changes: 5 additions & 0 deletions colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
@@ -214,6 +214,8 @@ def __init__(
moe_dp_outside: bool = True,
overlap_p2p: bool = True,
overlap_allgather: bool = False,
fp8_communication: bool = False,
use_fp8: bool = False,
) -> None:
self.logger = get_dist_logger()
if overlap_communication or zero_stage == 2:
@@ -327,6 +329,7 @@ def __init__(
self.sp_group = self.pg_mesh.get_group_along_axis(self.tp_axis)
else:
self.sp_group = self.pg_mesh.get_group_along_axis(self.sp_axis)
self.use_fp8 = use_fp8

self.shard_config = ShardConfig(
tensor_parallel_process_group=self.tp_group,
@@ -345,6 +348,7 @@ def __init__(
parallel_output=parallel_output,
make_vocab_size_divisible_by=make_vocab_size_divisible_by,
gradient_checkpoint_config=gradient_checkpoint_config,
fp8_communication=fp8_communication,
)
self.amp_config = dict(
initial_scale=initial_scale,
@@ -431,6 +435,7 @@ def configure(
use_ddp=use_ddp,
ddp_config=self.ddp_config,
custom_policy=self.custom_policy,
use_fp8=self.use_fp8,
)
if optimizer is not None and not isinstance(optimizer, OptimizerWrapper):
if self.ep_size > 1:
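MoeHybridParallelPlugin gains the same pair of options: fp8_communication ends up in its ShardConfig (covering the expert all-to-all), and use_fp8 is passed to the wrapped module. A hedged sketch; the ep_size/zero_stage values and the MoE model are placeholders for whatever your setup actually uses.

```python
# Sketch only: assumes an MoE model supported by shardformer and at least 2 GPUs.
from colossalai.booster import Booster
from colossalai.booster.plugin import MoeHybridParallelPlugin

plugin = MoeHybridParallelPlugin(
    tp_size=1,
    pp_size=1,
    ep_size=2,               # expert parallelism degree (placeholder)
    zero_stage=1,
    precision="bf16",
    use_fp8=True,            # FP8 linear kernels inside the sharded model
    fp8_communication=True,  # FP8 all-to-all for expert dispatch/combine
)
booster = Booster(plugin=plugin)
# moe_model, optimizer, *_ = booster.boost(moe_model, optimizer)
```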
7 changes: 7 additions & 0 deletions colossalai/booster/plugin/torch_ddp_plugin.py
@@ -179,6 +179,7 @@ def __init__(
check_reduction: bool = False,
gradient_as_bucket_view: bool = False,
static_graph: bool = False,
fp8_communication: bool = False,
) -> None:
super().__init__()
self.ddp_kwargs = dict(
@@ -189,6 +190,7 @@
gradient_as_bucket_view=gradient_as_bucket_view,
static_graph=static_graph,
)
self.fp8_communication = fp8_communication

def support_no_sync(self) -> bool:
return True
@@ -228,6 +230,11 @@ def configure(
if optimizer is not None and not isinstance(optimizer, OptimizerWrapper):
optimizer = OptimizerWrapper(optimizer)

if self.fp8_communication:
from colossalai.quantization.fp8 import fp8_compress_ddp_grad_comm_hook_async

model.module.register_comm_hook(None, fp8_compress_ddp_grad_comm_hook_async)

return model, optimizer, criterion, dataloader, lr_scheduler

def control_checkpoint_io(self) -> bool:
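With TorchDDPPlugin, the new flag simply registers the asynchronous FP8 gradient-compression hook (fp8_compress_ddp_grad_comm_hook_async) on the wrapped DDP module during configure. A sketch of the plugin path, with the roughly equivalent manual registration shown in comments; the process-group setup and the toy model are assumptions.

```python
# Sketch only: assumes an initialized NCCL process group and one GPU per rank.
import torch
import torch.nn as nn
from colossalai.booster import Booster
from colossalai.booster.plugin import TorchDDPPlugin

plugin = TorchDDPPlugin(fp8_communication=True)  # gradients FP8-compressed during all-reduce
booster = Booster(plugin=plugin)

model = nn.Linear(4096, 4096).cuda()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
model, optimizer, *_ = booster.boost(model, optimizer)

# Roughly equivalent manual registration on a plain DDP model:
# from torch.nn.parallel import DistributedDataParallel as DDP
# from colossalai.quantization.fp8 import fp8_compress_ddp_grad_comm_hook_async
# ddp = DDP(nn.Linear(4096, 4096).cuda())
# ddp.register_comm_hook(None, fp8_compress_ddp_grad_comm_hook_async)
```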
15 changes: 15 additions & 0 deletions colossalai/booster/plugin/torch_fsdp_plugin.py
@@ -298,6 +298,7 @@ def __init__(
ignored_modules: Optional[Iterable[torch.nn.Module]] = None,
param_init_fn: Optional[Callable[[nn.Module], None]] = None,
sync_module_states: bool = False,
fp8_communication: bool = False,
):
super().__init__()
self.fsdp_kwargs = dict(
@@ -311,6 +312,7 @@
param_init_fn=param_init_fn,
sync_module_states=sync_module_states,
)
self.fp8_communication = fp8_communication
self.logger = get_dist_logger()

else:
@@ -348,6 +350,19 @@ def configure(
# wrap the model with PyTorch FSDP
fsdp_model = TorchFSDPModel(model, device_id=torch.cuda.current_device(), **self.fsdp_kwargs)

if self.fp8_communication:
from colossalai.quantization.utils import patch_fsdp_params_comm_hook

patch_fsdp_params_comm_hook()

from colossalai.quantization.fp8 import fp8_compress_fsdp_params_comm_hook

fsdp_model.module.register_params_comm_hook(None, fp8_compress_fsdp_params_comm_hook)

from colossalai.quantization.fp8 import fp8_compress_fsdp_grad_comm_hook

fsdp_model.module.register_comm_hook(None, fp8_compress_fsdp_grad_comm_hook)

if optimizer is not None:
if len(optimizer.param_groups) > 1:
self.logger.warning(
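For FSDP, the flag first patches FSDP so a parameter comm hook can be registered (patch_fsdp_params_comm_hook), then installs FP8 hooks for both parameter all-gather and gradient reduce-scatter traffic. Usage follows the same booster pattern; the model choice and the surrounding launch are assumptions.

```python
# Sketch only: assumes a recent PyTorch with FSDP and an initialized process group.
import torch
import torch.nn as nn
from colossalai.booster import Booster
from colossalai.booster.plugin import TorchFSDPPlugin

plugin = TorchFSDPPlugin(fp8_communication=True)  # FP8 hooks for param + grad traffic
booster = Booster(plugin=plugin)

model = nn.Sequential(nn.Linear(4096, 4096), nn.GELU(), nn.Linear(4096, 4096)).cuda()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
model, optimizer, *_ = booster.boost(model, optimizer)
```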
23 changes: 18 additions & 5 deletions colossalai/moe/_operation.py
@@ -6,6 +6,8 @@
from torch.cuda.amp import custom_bwd, custom_fwd
from torch.distributed import ProcessGroup

from colossalai.quantization.fp8 import all_to_all_single_fp8

MOE_KERNEL = None


@@ -380,6 +382,7 @@ def _all_to_all(
output_split_sizes: Optional[List[int]] = None,
group=None,
async_op: bool = False,
fp8_communication: bool = False,
):
"""
Returns:
@@ -392,9 +395,14 @@
outputs = torch.empty(outputs_shape, dtype=inputs.dtype, device=inputs.device)
inputs = inputs.contiguous()
outputs = outputs.contiguous()
handle = dist.all_to_all_single(
outputs, inputs, output_split_sizes, input_split_sizes, group=group, async_op=async_op
)
if fp8_communication:
handle = all_to_all_single_fp8(
outputs, inputs, output_split_sizes, input_split_sizes, group=group, async_op=False
)
else:
handle = dist.all_to_all_single(
outputs, inputs, output_split_sizes, input_split_sizes, group=group, async_op=async_op
)
return outputs, handle


@@ -407,6 +415,7 @@ def forward(
output_split_sizes=None,
group=None,
overlap: bool = False,
fp8_communication: bool = False,
):
"""
Returns:
@@ -416,7 +425,9 @@
ctx.input_split_sizes = input_split_sizes
ctx.output_split_sizes = output_split_sizes
ctx.group = group
return _all_to_all(inputs, input_split_sizes, output_split_sizes, group, overlap)
return _all_to_all(
inputs, input_split_sizes, output_split_sizes, group, overlap, fp8_communication=fp8_communication
)

@staticmethod
def backward(ctx: Any, *grad_outputs):
@@ -426,6 +437,7 @@ def backward(ctx: Any, *grad_outputs):
None,
None,
None,
None,
)


@@ -435,8 +447,9 @@
output_split_sizes: Optional[List[int]] = None,
group=None,
overlap: bool = False,
fp8_communication: bool = False,
):
assert (
inputs.requires_grad
), "Input must require grad to assure that backward is executed, otherwise it might hang the program."
return AllToAllUneven.apply(inputs, input_split_sizes, output_split_sizes, group, overlap)
return AllToAllUneven.apply(inputs, input_split_sizes, output_split_sizes, group, overlap, fp8_communication)
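The MoE path threads fp8_communication down to _all_to_all, which swaps dist.all_to_all_single for all_to_all_single_fp8 (note the FP8 branch currently runs with async_op=False). A hedged sketch of calling all_to_all_uneven with the new flag: the 4-rank split sizes are placeholders, and in a real MoE layer output_split_sizes would come from exchanging the split metadata first.

```python
# Sketch only: 4 ranks assumed, NCCL process group already initialized.
import torch
import torch.distributed as dist
from colossalai.moe._operation import all_to_all_uneven

input_split_sizes = [1, 2, 3, 4]    # tokens this rank sends to ranks 0..3 (placeholder)
output_split_sizes = [2, 2, 2, 2]   # tokens this rank receives (placeholder)

# requires_grad=True is mandatory: the function asserts it so that backward runs
tokens = torch.randn(sum(input_split_sizes), 1024, device="cuda", requires_grad=True)

outputs, _ = all_to_all_uneven(
    tokens,
    input_split_sizes,
    output_split_sizes,
    group=dist.group.WORLD,
    fp8_communication=True,  # route through all_to_all_single_fp8
)
```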