From d751006b385e06230ac0ed267a03831366cd9245 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 24 Nov 2024 07:42:43 -0800 Subject: [PATCH] Rename triton_fused_moe -> fused_moe_triton --- python/sglang/srt/layers/fused_moe/__init__.py | 1 - python/sglang/srt/layers/fused_moe_grok/__init__.py | 1 + ...N=4096,device_name=AMD_Instinct_MI300X,dtype=float8.json | 0 ...N=8192,device_name=AMD_Instinct_MI300X,dtype=float8.json | 0 .../srt/layers/{fused_moe => fused_moe_grok}/fused_moe.py | 0 .../srt/layers/{fused_moe => fused_moe_grok}/layer.py | 6 +++--- .../{triton_fused_moe => fused_moe_triton}/__init__.py | 6 +++--- ...,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json | 0 .../E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json | 0 ...,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json | 0 .../E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json | 0 ...,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json | 0 ...,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json | 0 .../E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json | 0 ...,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json | 0 .../E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json | 0 ...,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json | 0 .../E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json | 0 .../E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json | 0 .../E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json | 0 .../E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json | 0 ...,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json | 0 .../E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json | 0 ...,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json | 0 .../E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json | 0 .../E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json | 0 .../E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json | 0 ...,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json | 0 ...,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json | 0 ...00,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 0 ...,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json | 0 .../E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json | 0 ...00,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 0 ...,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json | 0 .../E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json | 0 ...,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json | 0 ...00,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 0 .../E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json | 0 .../E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json | 0 .../E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json | 0 .../E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json | 0 .../E=8,N=14336,device_name=AMD_Instinct_MI300X.json | 0 ...36,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 0 .../configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json | 0 .../E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json | 0 .../E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json | 0 .../E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json | 0 .../E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json | 0 ...48,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 0 .../E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json | 0 .../configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json | 0 .../E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json | 0 .../E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json | 0 ...84,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 0 .../E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json | 0 .../configs/E=8,N=3584,device_name=NVIDIA_L40S.json | 0 .../E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json | 0 ...96,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 0 .../E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json | 0 .../configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json | 0 .../E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json | 0 ...68,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 0 .../E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json | 0 ...92,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json | 0 .../{triton_fused_moe => fused_moe_triton}/configs/README | 0 .../{triton_fused_moe => fused_moe_triton}/fused_moe.py | 2 +- .../layers/{triton_fused_moe => fused_moe_triton}/layer.py | 4 ++-- python/sglang/srt/layers/quantization/__init__.py | 2 +- python/sglang/srt/models/dbrx.py | 2 +- python/sglang/srt/models/deepseek.py | 2 +- python/sglang/srt/models/deepseek_v2.py | 2 +- python/sglang/srt/models/grok.py | 2 +- python/sglang/srt/models/mixtral.py | 2 +- python/sglang/srt/models/olmoe.py | 2 +- python/sglang/srt/models/qwen2_moe.py | 2 +- python/sglang/srt/models/xverse_moe.py | 2 +- 76 files changed, 19 insertions(+), 19 deletions(-) delete mode 100644 python/sglang/srt/layers/fused_moe/__init__.py create mode 100644 python/sglang/srt/layers/fused_moe_grok/__init__.py rename python/sglang/srt/layers/{fused_moe => fused_moe_grok}/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=float8.json (100%) rename python/sglang/srt/layers/{fused_moe => fused_moe_grok}/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=float8.json (100%) rename python/sglang/srt/layers/{fused_moe => fused_moe_grok}/fused_moe.py (100%) rename python/sglang/srt/layers/{fused_moe => fused_moe_grok}/layer.py (99%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/__init__.py (80%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=8,N=3584,device_name=NVIDIA_L40S.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/configs/README (100%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/fused_moe.py (99%) rename python/sglang/srt/layers/{triton_fused_moe => fused_moe_triton}/layer.py (99%) diff --git a/python/sglang/srt/layers/fused_moe/__init__.py b/python/sglang/srt/layers/fused_moe/__init__.py deleted file mode 100644 index 5f7691c09..000000000 --- a/python/sglang/srt/layers/fused_moe/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from sglang.srt.layers.fused_moe.layer import FusedMoE, FusedMoEMethodBase diff --git a/python/sglang/srt/layers/fused_moe_grok/__init__.py b/python/sglang/srt/layers/fused_moe_grok/__init__.py new file mode 100644 index 000000000..c915c960d --- /dev/null +++ b/python/sglang/srt/layers/fused_moe_grok/__init__.py @@ -0,0 +1 @@ +from sglang.srt.layers.fused_moe_grok.layer import FusedMoE, FusedMoEMethodBase diff --git a/python/sglang/srt/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=float8.json b/python/sglang/srt/layers/fused_moe_grok/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=float8.json similarity index 100% rename from python/sglang/srt/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=float8.json rename to python/sglang/srt/layers/fused_moe_grok/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=float8.json diff --git a/python/sglang/srt/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=float8.json b/python/sglang/srt/layers/fused_moe_grok/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=float8.json similarity index 100% rename from python/sglang/srt/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=float8.json rename to python/sglang/srt/layers/fused_moe_grok/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=float8.json diff --git a/python/sglang/srt/layers/fused_moe/fused_moe.py b/python/sglang/srt/layers/fused_moe_grok/fused_moe.py similarity index 100% rename from python/sglang/srt/layers/fused_moe/fused_moe.py rename to python/sglang/srt/layers/fused_moe_grok/fused_moe.py diff --git a/python/sglang/srt/layers/fused_moe/layer.py b/python/sglang/srt/layers/fused_moe_grok/layer.py similarity index 99% rename from python/sglang/srt/layers/fused_moe/layer.py rename to python/sglang/srt/layers/fused_moe_grok/layer.py index df91ba117..89cc33d11 100644 --- a/python/sglang/srt/layers/fused_moe/layer.py +++ b/python/sglang/srt/layers/fused_moe_grok/layer.py @@ -20,7 +20,7 @@ from vllm.model_executor.layers.quantization.fp8 import Fp8Config from vllm.model_executor.utils import set_weight_attrs -from sglang.srt.layers.fused_moe.fused_moe import padding_size +from sglang.srt.layers.fused_moe_grok.fused_moe import padding_size from sglang.srt.utils import is_hip logger = init_logger(__name__) @@ -123,7 +123,7 @@ def forward_cuda( num_expert_group: Optional[int], topk_group: Optional[int], ) -> torch.Tensor: - from sglang.srt.layers.fused_moe.fused_moe import fused_moe + from sglang.srt.layers.fused_moe_grok.fused_moe import fused_moe return fused_moe( x, @@ -609,7 +609,7 @@ def apply( topk_group: Optional[int] = None, ) -> torch.Tensor: - from sglang.srt.layers.fused_moe.fused_moe import fused_moe + from sglang.srt.layers.fused_moe_grok.fused_moe import fused_moe return fused_moe( x, diff --git a/python/sglang/srt/layers/triton_fused_moe/__init__.py b/python/sglang/srt/layers/fused_moe_triton/__init__.py similarity index 80% rename from python/sglang/srt/layers/triton_fused_moe/__init__.py rename to python/sglang/srt/layers/fused_moe_triton/__init__.py index b2eb11835..b895b9e48 100644 --- a/python/sglang/srt/layers/triton_fused_moe/__init__.py +++ b/python/sglang/srt/layers/fused_moe_triton/__init__.py @@ -1,14 +1,14 @@ from contextlib import contextmanager from typing import Any, Dict, Optional -import sglang.srt.layers.triton_fused_moe.fused_moe # noqa -from sglang.srt.layers.triton_fused_moe.fused_moe import ( +import sglang.srt.layers.fused_moe_triton.fused_moe # noqa +from sglang.srt.layers.fused_moe_triton.fused_moe import ( fused_experts, fused_topk, get_config_file_name, grouped_topk, ) -from sglang.srt.layers.triton_fused_moe.layer import ( +from sglang.srt.layers.fused_moe_triton.layer import ( FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported, diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=3584,device_name=NVIDIA_L40S.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json b/python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json rename to python/sglang/srt/layers/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json diff --git a/python/sglang/srt/layers/triton_fused_moe/configs/README b/python/sglang/srt/layers/fused_moe_triton/configs/README similarity index 100% rename from python/sglang/srt/layers/triton_fused_moe/configs/README rename to python/sglang/srt/layers/fused_moe_triton/configs/README diff --git a/python/sglang/srt/layers/triton_fused_moe/fused_moe.py b/python/sglang/srt/layers/fused_moe_triton/fused_moe.py similarity index 99% rename from python/sglang/srt/layers/triton_fused_moe/fused_moe.py rename to python/sglang/srt/layers/fused_moe_triton/fused_moe.py index 8a289a5c6..4f92512b2 100644 --- a/python/sglang/srt/layers/triton_fused_moe/fused_moe.py +++ b/python/sglang/srt/layers/fused_moe_triton/fused_moe.py @@ -376,7 +376,7 @@ def try_get_optimal_moe_config( M: int, is_marlin: bool = False, ): - from sglang.srt.layers.triton_fused_moe import get_config + from sglang.srt.layers.fused_moe_triton import get_config override_config = get_config() if override_config: diff --git a/python/sglang/srt/layers/triton_fused_moe/layer.py b/python/sglang/srt/layers/fused_moe_triton/layer.py similarity index 99% rename from python/sglang/srt/layers/triton_fused_moe/layer.py rename to python/sglang/srt/layers/fused_moe_triton/layer.py index 93a6e5506..d9503fe20 100644 --- a/python/sglang/srt/layers/triton_fused_moe/layer.py +++ b/python/sglang/srt/layers/fused_moe_triton/layer.py @@ -20,7 +20,7 @@ from sglang.srt.utils import set_weight_attrs if torch.cuda.is_available() or torch.hip.is_available(): - from sglang.srt.layers.triton_fused_moe.fused_moe import fused_experts + from sglang.srt.layers.fused_moe_triton.fused_moe import fused_experts else: fused_experts = None # type: ignore @@ -514,7 +514,7 @@ def select_experts( num_expert_group: Optional[int] = None, custom_routing_function: Optional[Callable] = None, ): - from sglang.srt.layers.triton_fused_moe.fused_moe import ( + from sglang.srt.layers.fused_moe_triton.fused_moe import ( fused_topk, grouped_topk, ) diff --git a/python/sglang/srt/layers/quantization/__init__.py b/python/sglang/srt/layers/quantization/__init__.py index 584ae0d89..78d9f99b5 100644 --- a/python/sglang/srt/layers/quantization/__init__.py +++ b/python/sglang/srt/layers/quantization/__init__.py @@ -68,7 +68,7 @@ def fp8_get_quant_method(self, layer, prefix): is_layer_skipped, ) - from sglang.srt.layers.triton_fused_moe.layer import FusedMoE + from sglang.srt.layers.fused_moe_triton.layer import FusedMoE if isinstance(layer, LinearBase): if is_layer_skipped(prefix, self.ignored_layers): diff --git a/python/sglang/srt/models/dbrx.py b/python/sglang/srt/models/dbrx.py index cfbf21c70..b8dad0248 100644 --- a/python/sglang/srt/models/dbrx.py +++ b/python/sglang/srt/models/dbrx.py @@ -28,6 +28,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.transformers_utils.configs.dbrx import DbrxConfig +from sglang.srt.layers.fused_moe_triton import fused_moe from sglang.srt.layers.linear import ( QKVParallelLinear, ReplicatedLinear, @@ -36,7 +37,6 @@ from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.layers.triton_fused_moe import fused_moe from sglang.srt.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, diff --git a/python/sglang/srt/models/deepseek.py b/python/sglang/srt/models/deepseek.py index e8e163dfc..cdebafa2f 100644 --- a/python/sglang/srt/models/deepseek.py +++ b/python/sglang/srt/models/deepseek.py @@ -30,6 +30,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from sglang.srt.layers.activation import SiluAndMul +from sglang.srt.layers.fused_moe_triton import fused_moe from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import ( MergedColumnParallelLinear, @@ -40,7 +41,6 @@ from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.layers.triton_fused_moe import fused_moe from sglang.srt.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding, diff --git a/python/sglang/srt/models/deepseek_v2.py b/python/sglang/srt/models/deepseek_v2.py index 73ab9c059..85467c12c 100644 --- a/python/sglang/srt/models/deepseek_v2.py +++ b/python/sglang/srt/models/deepseek_v2.py @@ -31,6 +31,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from sglang.srt.layers.activation import SiluAndMul +from sglang.srt.layers.fused_moe_triton import FusedMoE from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import ( ColumnParallelLinear, @@ -41,7 +42,6 @@ from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.layers.triton_fused_moe import FusedMoE from sglang.srt.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding, diff --git a/python/sglang/srt/models/grok.py b/python/sglang/srt/models/grok.py index 40f50785a..f8326c72d 100644 --- a/python/sglang/srt/models/grok.py +++ b/python/sglang/srt/models/grok.py @@ -31,7 +31,7 @@ from vllm.model_executor.model_loader.loader import DefaultModelLoader from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from sglang.srt.layers.fused_moe import FusedMoE +from sglang.srt.layers.fused_moe_grok import FusedMoE from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import ( QKVParallelLinear, diff --git a/python/sglang/srt/models/mixtral.py b/python/sglang/srt/models/mixtral.py index 46a6b6ac7..98d5ab332 100644 --- a/python/sglang/srt/models/mixtral.py +++ b/python/sglang/srt/models/mixtral.py @@ -25,6 +25,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from sglang.srt.layers.fused_moe_triton import FusedMoE from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import ( QKVParallelLinear, @@ -35,7 +36,6 @@ from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.layers.torchao_utils import apply_torchao_config_ -from sglang.srt.layers.triton_fused_moe import FusedMoE from sglang.srt.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding, diff --git a/python/sglang/srt/models/olmoe.py b/python/sglang/srt/models/olmoe.py index 984638d5b..407eb98cb 100644 --- a/python/sglang/srt/models/olmoe.py +++ b/python/sglang/srt/models/olmoe.py @@ -38,11 +38,11 @@ from vllm.utils import print_warning_once from sglang.srt.layers.activation import SiluAndMul +from sglang.srt.layers.fused_moe_triton import FusedMoE from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.layers.triton_fused_moe import FusedMoE from sglang.srt.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding, diff --git a/python/sglang/srt/models/qwen2_moe.py b/python/sglang/srt/models/qwen2_moe.py index d363ec6a0..febd6d748 100644 --- a/python/sglang/srt/models/qwen2_moe.py +++ b/python/sglang/srt/models/qwen2_moe.py @@ -30,6 +30,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from sglang.srt.layers.activation import SiluAndMul +from sglang.srt.layers.fused_moe_triton import FusedMoE from sglang.srt.layers.layernorm import RMSNorm from sglang.srt.layers.linear import ( MergedColumnParallelLinear, @@ -41,7 +42,6 @@ from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.radix_attention import RadixAttention from sglang.srt.layers.torchao_utils import apply_torchao_config_ -from sglang.srt.layers.triton_fused_moe import FusedMoE from sglang.srt.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding, diff --git a/python/sglang/srt/models/xverse_moe.py b/python/sglang/srt/models/xverse_moe.py index 8cdd4c570..c6458f7f5 100644 --- a/python/sglang/srt/models/xverse_moe.py +++ b/python/sglang/srt/models/xverse_moe.py @@ -34,10 +34,10 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from sglang.srt.layers.fused_moe_triton import fused_moe from sglang.srt.layers.logits_processor import LogitsProcessor from sglang.srt.layers.quantization.base_config import QuantizationConfig from sglang.srt.layers.radix_attention import RadixAttention -from sglang.srt.layers.triton_fused_moe import fused_moe from sglang.srt.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding,