From 57317b15d990a5862705c89a63202ca7652795b8 Mon Sep 17 00:00:00 2001 From: Pravin Jagtap Date: Mon, 8 Apr 2024 01:10:37 -0400 Subject: [PATCH] AMDGPU: MC support for v_cvt_scalef32_pk32_f32_[fp|bf]6 of gfx950 Co-authored-by: Pravin Jagtap --- llvm/lib/Target/AMDGPU/AMDGPU.td | 14 +++++++++-- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 3 +++ .../Disassembler/AMDGPUDisassembler.cpp | 1 + .../AMDGPU/Disassembler/AMDGPUDisassembler.h | 1 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 10 ++++++-- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 1 + llvm/lib/Target/AMDGPU/VOP3Instructions.td | 22 +++++++++++++++++ llvm/test/MC/AMDGPU/gfx950_asm_features.s | 10 +++++++- llvm/test/MC/AMDGPU/gfx950_err.s | 24 +++++++++++++++++++ .../Disassembler/AMDGPU/gfx950_dasm_vop3.txt | 6 +++++ 10 files changed, 87 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 64e88cf03b429b..15a1bb799804d7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -402,11 +402,17 @@ def FeatureFP4ConversionScaleInsts : SubtargetFeature<"fp4-cvt-scale-insts", "Has fp4 conversion scale instructions" >; +def FeatureFP6BF6ConversionScaleInsts : SubtargetFeature<"fp6bf6-cvt-scale-insts", + "HasFP6BF6ConversionScaleInsts", + "true", + "Has fp6 and bf6 conversion scale instructions" +>; + def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts", "GFX950Insts", "true", "Additional instructions for GFX950+", - [FeaturePermlane16Swap, FeaturePermlane32Swap, FeatureFP8ConversionScaleInsts, FeatureBF8ConversionScaleInsts, FeatureFP4ConversionScaleInsts] + [FeaturePermlane16Swap, FeaturePermlane32Swap, FeatureFP8ConversionScaleInsts, FeatureBF8ConversionScaleInsts, FeatureFP4ConversionScaleInsts, FeatureFP6BF6ConversionScaleInsts] >; def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts", @@ -1552,7 +1558,8 @@ def FeatureISAVersion9_5_Common : FeatureSet< FeatureBitOp3Insts, FeatureFP8ConversionScaleInsts, FeatureBF8ConversionScaleInsts, - FeatureFP4ConversionScaleInsts + FeatureFP4ConversionScaleInsts, + FeatureFP6BF6ConversionScaleInsts ])>; def FeatureISAVersion9_4_0 : FeatureSet< @@ -2435,6 +2442,9 @@ def HasBF8ConversionScaleInsts : Predicate<"Subtarget->hasBF8ConversionScaleInst def HasFP4ConversionScaleInsts : Predicate<"Subtarget->hasFP4ConversionScaleInsts()">, AssemblerPredicate<(all_of FeatureFP4ConversionScaleInsts)>; +def HasFP6BF6ConversionScaleInsts : Predicate<"Subtarget->hasFP6BF6ConversionScaleInsts()">, + AssemblerPredicate<(all_of FeatureFP6BF6ConversionScaleInsts)>; + def HasGDS : Predicate<"Subtarget->hasGDS()">; def HasGWS : Predicate<"Subtarget->hasGWS()">; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 20f573da0ec82b..1a09f55dfdb28a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -53,6 +53,7 @@ class AMDGPUSubtarget { bool HasFP8ConversionScaleInsts = false; bool HasBF8ConversionScaleInsts = false; bool HasFP4ConversionScaleInsts = false; + bool HasFP6BF6ConversionScaleInsts = false; bool EnableRealTrue16Insts = false; bool HasBF16ConversionInsts = false; bool HasMadMixInsts = false; @@ -184,6 +185,8 @@ class AMDGPUSubtarget { bool hasFP4ConversionScaleInsts() const { return HasFP4ConversionScaleInsts; } + bool hasFP6BF6ConversionScaleInsts() const { return HasFP6BF6ConversionScaleInsts; } + bool hasMadMacF32Insts() const { return HasMadMacF32Insts || !isGCN(); } diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 136fe2e3f90d02..fa5f86b0788cc2 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -1530,6 +1530,7 @@ unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const { case OPWV232: return VReg_64RegClassID; case OPW96: return VReg_96RegClassID; case OPW128: return VReg_128RegClassID; + case OPW192: return VReg_192RegClassID; case OPW160: return VReg_160RegClassID; case OPW256: return VReg_256RegClassID; case OPW288: return VReg_288RegClassID; diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 3e20a2ab9e66ca..b19e4b74a394cb 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -219,6 +219,7 @@ class AMDGPUDisassembler : public MCDisassembler { OPW96, OPW128, OPW160, + OPW192, OPW256, OPW288, OPW320, diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index acb703dba6a980..f20d6526e20b2c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1696,7 +1696,8 @@ class getVALUDstForVT { defvar op16 = !if(IsTrue16, !if (IsVOP3Encoding, VOPDstOperand_t16, VOPDstOperand_t16Lo128), VOPDstOperand); - RegisterOperand ret = !cond(!eq(VT.Size, 256) : VOPDstOperand, + RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VOPDstOperand, + !eq(VT.Size, 256) : VOPDstOperand, !eq(VT.Size, 128) : VOPDstOperand, !eq(VT.Size, 64) : VOPDstOperand, !eq(VT.Size, 32) : VOPDstOperand, @@ -1752,7 +1753,8 @@ class getSOPSrcForVT { // Returns the vreg register class to use for source operand given VT class getVregSrcForVT { RegisterOperand ret = - !cond(!eq(VT.Size, 128) : RegisterOperand, + !cond(!eq(VT.Size, 192) : RegisterOperand, + !eq(VT.Size, 128) : RegisterOperand, !eq(VT.Size, 96) : RegisterOperand, !eq(VT.Size, 64) : RegisterOperand, !eq(VT.Size, 48) : RegisterOperand, @@ -1785,6 +1787,7 @@ class getVOP3SrcForVT { !eq(VT, v2i16) : VSrc_v2b16, !eq(VT, v4f16) : AVSrc_64, !eq(VT, v4bf16) : AVSrc_64, + !eq(VT.Size, 192) : VRegSrc_192, !eq(VT.Size, 128) : VRegSrc_128, !eq(VT.Size, 96) : VRegSrc_96, !eq(VT.Size, 64) : VSrc_b64, @@ -2828,6 +2831,9 @@ def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], /*EnableClamp= def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>; def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>; def VOP_V2BF16_F32_F32 : VOPProfile <[v2bf16, f32, f32, untyped]>; +def VOP_V32F32_V6I32_F32 : VOPProfile <[v32f32, v6i32, f32, untyped]>; +def VOP_V32F16_V6I32_F32 : VOPProfile <[v32f16, v6i32, f32, untyped]>; +def VOP_V32BF16_V6I32_F32 : VOPProfile <[v32bf16, v6i32, f32, untyped]>; def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>; def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index e3baeed01841ab..11ca4df6e9f445 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1249,6 +1249,7 @@ def VRegSrc_32 : SrcReg9; def VRegSrc_64 : SrcReg9; def VRegSrc_96 : SrcReg9; def VRegSrc_128: SrcReg9; +def VRegSrc_192: SrcReg9; def VRegSrc_256: SrcReg9; def VRegOrLdsSrc_32 : SrcReg9; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index cf00910210e0bd..1009f2d9593609 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -925,6 +925,19 @@ def VOP3_CVT_SCALE_PK_FP8BF8_F16BF16_Profile : VOP3_Profile : VOP3_Profile

{ + let HasModifiers = 0; + let HasSrc0IntMods = 0; + let HasSrc1IntMods = 0; + let HasOMod = 0; + let HasOpSel = 0; + let HasClamp = 0; + let HasExtDPP = 0; + let HasExt32BitDPP = 0; + let HasExtVOP3DPP = 0; + let HasExt64BitDPP = 0; +} + let SubtargetPredicate = HasFP8ConversionScaleInsts, mayRaiseFPException = 0 in { defm V_CVT_SCALEF32_F16_FP8 : VOP3Inst<"v_cvt_scalef32_f16_fp8", VOP3_CVT_SCALE_F1632_FP8BF8_Profile>; defm V_CVT_SCALEF32_F32_FP8 : VOP3Inst<"v_cvt_scalef32_f32_fp8", VOP3_CVT_SCALE_F1632_FP8BF8_Profile>; @@ -950,6 +963,11 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in defm V_CVT_SCALEF32_PK_BF16_FP4 : VOP3Inst<"v_cvt_scalef32_pk_bf16_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile>; } +let SubtargetPredicate = HasFP6BF6ConversionScaleInsts, mayRaiseFPException = 0 in { + defm V_CVT_SCALEF32_PK32_F32_FP6 : VOP3Inst<"v_cvt_scalef32_pk32_f32_fp6", VOP3_CVT_SCALEF32_PK_F864_Profile>; + defm V_CVT_SCALEF32_PK32_F32_BF6 : VOP3Inst<"v_cvt_scalef32_pk32_f32_bf6", VOP3_CVT_SCALEF32_PK_F864_Profile>; +} + let SubtargetPredicate = isGFX10Plus in { let isCommutable = 1, isReMaterializable = 1 in { defm V_XOR3_B32 : VOP3Inst <"v_xor3_b32", VOP3_Profile>; @@ -1894,3 +1912,7 @@ defm V_CVT_SCALEF32_PK_FP4_F32 : VOP3OpSel_Real_gfx9 <0x23d>; defm V_CVT_SCALEF32_PK_F16_FP4 : VOP3OpSel_Real_gfx9 <0x250>; defm V_CVT_SCALEF32_PK_BF16_FP4 : VOP3OpSel_Real_gfx9 <0x251>; } +let OtherPredicates = [HasFP6BF6ConversionScaleInsts] in { +defm V_CVT_SCALEF32_PK32_F32_FP6 : VOP3_Real_gfx9<0x256, "v_cvt_scalef32_pk32_f32_fp6">; +defm V_CVT_SCALEF32_PK32_F32_BF6 : VOP3_Real_gfx9<0x257, "v_cvt_scalef32_pk32_f32_bf6">; +} diff --git a/llvm/test/MC/AMDGPU/gfx950_asm_features.s b/llvm/test/MC/AMDGPU/gfx950_asm_features.s index 85cd02aa714175..95d31d2293075f 100644 --- a/llvm/test/MC/AMDGPU/gfx950_asm_features.s +++ b/llvm/test/MC/AMDGPU/gfx950_asm_features.s @@ -884,4 +884,12 @@ v_cvt_scalef32_pk_bf16_fp4 v1, v2, s3 op_sel:[1,1,0] // NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: // GFX950: v_cvt_scalef32_pk_bf16_fp4 v1, s2, 3 op_sel:[1,1,0] ; encoding: [0x01,0x18,0x51,0xd2,0x02,0x06,0x01,0x00] -v_cvt_scalef32_pk_bf16_fp4 v1, s2, 3 op_sel:[1,1,0] \ No newline at end of file +v_cvt_scalef32_pk_bf16_fp4 v1, s2, 3 op_sel:[1,1,0] + +// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: +// GFX950: v_cvt_scalef32_pk32_f32_fp6 v[2:33], v[2:7], v6 ; encoding: [0x02,0x00,0x56,0xd2,0x02,0x0d,0x02,0x00] +v_cvt_scalef32_pk32_f32_fp6 v[2:33], v[2:7], v6 + +// NOT-GFX950: :[[@LINE+2]]:{{[0-9]+}}: error: +// GFX950: v_cvt_scalef32_pk32_f32_bf6 v[2:33], v[2:7], v6 ; encoding: [0x02,0x00,0x57,0xd2,0x02,0x0d,0x02,0x00] +v_cvt_scalef32_pk32_f32_bf6 v[2:33], v[2:7], v6 \ No newline at end of file diff --git a/llvm/test/MC/AMDGPU/gfx950_err.s b/llvm/test/MC/AMDGPU/gfx950_err.s index 89167ae35e2967..6eebd4f7ccd76b 100644 --- a/llvm/test/MC/AMDGPU/gfx950_err.s +++ b/llvm/test/MC/AMDGPU/gfx950_err.s @@ -125,3 +125,27 @@ v_cvt_scalef32_pk_bf16_fp4 v1, v2, v3 div:2 // GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand v_cvt_scalef32_pk_bf16_fp4 v1, v2, v3 clamp div:2 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction +v_cvt_scalef32_pk32_f32_fp6 v[2:33], v[2:7], v6 clamp + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand +v_cvt_scalef32_pk32_f32_fp6 v[2:33], v[2:7], v6 mul:2 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand +v_cvt_scalef32_pk32_f32_fp6 v[2:33], v[2:7], v6 div:2 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand +v_cvt_scalef32_pk32_f32_fp6 v[2:33], v[2:7], v6 clamp div:2 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction +v_cvt_scalef32_pk32_f32_bf6 v[2:33], v[2:7], v6 clamp + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand +v_cvt_scalef32_pk32_f32_bf6 v[2:33], v[2:7], v6 mul:2 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand +v_cvt_scalef32_pk32_f32_bf6 v[2:33], v[2:7], v6 div:2 + +// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: not a valid operand +v_cvt_scalef32_pk32_f32_bf6 v[2:33], v[2:7], v6 clamp div:2 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt index 80b5835fab1084..73fd3edfbad486 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt @@ -611,3 +611,9 @@ # GFX950: v_cvt_scalef32_pk_bf16_fp4 v1, s2, 3 op_sel:[1,1,0] ; encoding: [0x01,0x18,0x51,0xd2,0x02,0x06,0x01,0x00] 0x01,0x18,0x51,0xd2,0x02,0x06,0x01,0x00 + +# GFX950: v_cvt_scalef32_pk32_f32_fp6 v[2:33], v[2:7], v6 ; encoding: [0x02,0x00,0x56,0xd2,0x02,0x0d,0x02,0x00] +0x02,0x00,0x56,0xd2,0x02,0x0d,0x02,0x00 + +# GFX950: v_cvt_scalef32_pk32_f32_bf6 v[2:33], v[2:7], v6 ; encoding: [0x02,0x00,0x57,0xd2,0x02,0x0d,0x02,0x00] +0x02,0x00,0x57,0xd2,0x02,0x0d,0x02,0x00