From 8552268bfe1d655a2c7cc472f06fa497dfad4264 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Sun, 13 Oct 2024 11:57:55 +0900 Subject: [PATCH] add vmpsadbw for avx10.2 --- gen/gen_avx512.cpp | 3 +-- gen/gen_code.cpp | 2 +- test/avx10/misc.txt | 9 ++++++++- test/avx10_test.cpp | 24 ++++++++++++++++++++++++ test/test_by_xed.cpp | 3 +++ test/test_by_xed.py | 5 +++++ xbyak/xbyak.h | 16 ++++++++-------- xbyak/xbyak_mnemonic.h | 2 +- 8 files changed, 51 insertions(+), 13 deletions(-) diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index 109afc6d..98408445 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -447,7 +447,6 @@ void putX_X_XM_IMM() { 0x1B, "vcvtne2ph2hf8s", T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_B16 | T_N1, false }, { 0x52, "vdpphps", T_MUST_EVEX | T_0F38 | T_EW0 | T_YMM | T_B32, false }, -// { 0x42, "vmpsadbw", T_MUST_EVEX | T_F3 | T_0F3A | T_EW0 | T_YMM | T_B32, true }, }; for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { const Tbl *p = &tbl[i]; @@ -455,7 +454,7 @@ void putX_X_XM_IMM() printf("void %s(const Xmm& x1, const Xmm& x2, const Operand& op%s) { opAVX_X_X_XM(x1, x2, op, %s, 0x%02X%s); }\n" , p->name, p->hasIMM ? 
", imm" : ""); } -// puts("void vmpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F3A | T_YMM, 0x42, encoding, imm, T_66 | T_W0 | T_YMM, T_F3 | T_EW0 | T_B32); }"); + puts("void vmpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F3A | T_YMM, 0x42, encoding, imm, T_66 | T_W0 | T_YMM, T_F3 | T_0F3A | T_EW0 | T_B32, 1); }"); /* emitted verbatim into xbyak_mnemonic.h; the trailing 1 is the sel argument of opEncoding, i.e. the defaultEncoding_[1] (mpsadbw) slot */ } void putShift() diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp index 764d1180..58c176a6 100644 --- a/gen/gen_code.cpp +++ b/gen/gen_code.cpp @@ -57,7 +57,7 @@ void putX_X_XM(bool omitOnly) { 0x0C, "blendps", T_0F3A | T_66 | T_W0 | T_YMM, true, true, 3 }, { 0x41, "dppd", T_0F3A | T_66 | T_W0, true, true, 3 }, { 0x40, "dpps", T_0F3A | T_66 | T_W0 | T_YMM, true, true, 3 }, - { 0x42, "mpsadbw", T_0F3A | T_66 | T_W0 | T_YMM, true, true, 3 }, + { 0x42, "mpsadbw", T_0F3A | T_66 | T_W0 | T_YMM, true, true, 1 }, /* NOTE(review): last field changed from 3 to 1; presumably this stops emitting the old vmpsadbw wrapper from this table now that gen_avx512.cpp emits it. Confirm against the Tbl definition. */ { 0x0E, "pblendw", T_0F3A | T_66 | T_W0 | T_YMM, true, true, 3 }, { 0x02, "pblendd", T_0F3A | T_66 | T_W0 | T_YMM, true, true, 2 }, { 0x0B, "roundsd", T_0F3A | T_66 | T_W0, true, true, 3 }, diff --git a/test/avx10/misc.txt b/test/avx10/misc.txt index f7d1351b..5c39e819 100644 --- a/test/avx10/misc.txt +++ b/test/avx10/misc.txt @@ -10,4 +10,11 @@ vdpphps(zm1, zm2, zm3); vdpphps(zm1, zm2, ptr[rax+128]); vdpphps(zm1, zm2, ptr_b[rax+128]); -// skip vmpsadbw +vmpsadbw(xm1, xm3, xm15, 3); +vmpsadbw(xm1|T_z, xm4, ptr[rax+128], 5); + +vmpsadbw(ym1|k4, ym3, ym15, 3); +vmpsadbw(ym1, ym4, ptr[rax+128], 5); + +vmpsadbw(zm1|k4, zm3, zm15, 3); +vmpsadbw(zm1, zm4, ptr[rax+128], 5); diff --git a/test/avx10_test.cpp b/test/avx10_test.cpp index 9a4a8480..5f742fe7 100644 --- a/test/avx10_test.cpp +++ b/test/avx10_test.cpp @@ -228,3 +228,27 @@ CYBOZU_TEST_AUTO(ymm_with_sae) CYBOZU_TEST_EQUAL(c.getSize(), n); CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); } + 
+CYBOZU_TEST_AUTO(vmpsadbw) +{ + struct Code : Xbyak::CodeGenerator { + Code() + { + setDefaultEncoding(); /* library defaults: vmpsadbw is emitted in its VEX form */ + vmpsadbw(xm1, xm3, xm15, 3); // vex(avx) + vmpsadbw(ym1, ym3, ptr[rax+128], 3); // vex(avx2) + setDefaultEncoding(VexEncoding, EvexEncoding); /* second argument (mpsadbwEnc) switches vmpsadbw to EVEX */ + vmpsadbw(ym1, ym3, ym15, 3); // evex(avx10.2) + vmpsadbw(ym1, ym3, ptr[rax+128], 3); // evex(avx10.2) + } + } c; + const uint8_t tbl[] = { /* expected bytes, one row per vmpsadbw call above, in emission order */ + 0xc4, 0xc3, 0x61, 0x42, 0xcf, 0x03, /* VEX: vmpsadbw(xm1, xm3, xm15, 3) */ + 0xc4, 0xe3, 0x65, 0x42, 0x88, 0x80, 0x00, 0x00, 0x00, 0x03, /* VEX: vmpsadbw(ym1, ym3, ptr[rax+128], 3), 4-byte displacement */ + 0x62, 0xd3, 0x66, 0x28, 0x42, 0xcf, 0x03, /* EVEX (0x62 prefix): vmpsadbw(ym1, ym3, ym15, 3) */ + 0x62, 0xf3, 0x66, 0x28, 0x42, 0x48, 0x04, 0x03, /* EVEX: vmpsadbw(ym1, ym3, ptr[rax+128], 3); displacement 128 is the single byte 0x04 */ + }; + const size_t n = sizeof(tbl) / sizeof(tbl[0]); + CYBOZU_TEST_EQUAL(c.getSize(), n); + CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); +} diff --git a/test/test_by_xed.cpp b/test/test_by_xed.cpp index 93c370cc..ddac779a 100644 --- a/test/test_by_xed.cpp +++ b/test/test_by_xed.cpp @@ -1,10 +1,13 @@ #include #include +using namespace Xbyak; + struct Code : Xbyak::CodeGenerator { Code() : Xbyak::CodeGenerator(4096*8) { + setDefaultEncoding(VexEncoding, EvexEncoding); #include "tmp.cpp" } }; diff --git a/test/test_by_xed.py b/test/test_by_xed.py index 5b84995b..afd77d8a 100644 --- a/test/test_by_xed.py +++ b/test/test_by_xed.py @@ -210,6 +210,11 @@ def parseNmemonic(s): args = [] attrs = [] + # remove Xbyak::{Evex,Vex}Encoding + r = re.search(r'(,[^,]*Encoding)', s) + if r: + s = s.replace(r.group(1), '') + (s, broadcast) = parseBroadcast(s) # replace xm0 with xmm0 diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index 16422900..c5de0086 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ -2661,11 +2661,11 @@ class CodeGenerator : public CodeArray { if (addr.getRegExp().getIndex().getKind() != kind) XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING) opVex(x, 0, addr, type, code); } - void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, uint64_t type, int code, PreferredEncoding encoding, int sel = 0) + void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, uint64_t type, int code, 
PreferredEncoding encoding, int imm = NONE, uint64_t typeVex = 0, uint64_t typeEvex = 0, int sel = 0) { - opAVX_X_X_XM(x1, x2, op, type | orEvexIf(encoding, sel), code); + opAVX_X_X_XM(x1, x2, op, type | orEvexIf(encoding, typeVex, typeEvex, sel), code, imm); } - int orEvexIf(PreferredEncoding encoding, int sel = 0) { + uint64_t orEvexIf(PreferredEncoding encoding, uint64_t typeVex, uint64_t typeEvex, int sel) { /* returns the extra type flags for the chosen encoding (T_MUST_EVEX | typeEvex, or typeVex); must be uint64_t like the flags it is OR-ed into, an int return would narrow them */ if (encoding == DefaultEncoding) { encoding = defaultEncoding_[sel]; } @@ -2673,9 +2673,9 @@ class CodeGenerator : public CodeArray { #ifdef XBYAK_DISABLE_AVX512 XBYAK_THROW(ERR_EVEX_IS_INVALID) #endif - return T_MUST_EVEX; + return T_MUST_EVEX | typeEvex; } - return 0; + return typeVex; } void opInOut(const Reg& a, const Reg& d, uint8_t code) { @@ -3132,8 +3132,8 @@ class CodeGenerator : public CodeArray { #endif , isDefaultJmpNEAR_(false) { - defaultEncoding_[0] = EvexEncoding; // use avx512-vnni not avx-vnni - defaultEncoding_[1] = VexEncoding; // use vmpsadbw(avx) not avx10.2 + // defaults: EVEX for vnni (avx512-vnni), VEX for vmpsadbw (avx) + setDefaultEncoding(); labelMgr_.set(this); } void reset() @@ -3171,7 +3171,7 @@ class CodeGenerator : public CodeArray { #endif // set default encoding to select Vex or Evex - void setDefaultEncoding(PreferredEncoding vnniEnc, PreferredEncoding mpsadbwEnc = VexEncoding) + void setDefaultEncoding(PreferredEncoding vnniEnc = EvexEncoding, PreferredEncoding mpsadbwEnc = VexEncoding) { defaultEncoding_[0] = vnniEnc; defaultEncoding_[1] = mpsadbwEnc; } void sha1msg12(const Xmm& x, const Operand& op) diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h index daafcd1e..8515e41a 100644 --- a/xbyak/xbyak_mnemonic.h +++ b/xbyak/xbyak_mnemonic.h @@ -1369,7 +1369,6 @@ void vmovupd(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_ void vmovupd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F|T_EW1|T_YMM|T_EVEX, 0x10); } void vmovups(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, 
T_0F|T_EW0|T_YMM|T_EVEX|T_M_K, 0x11); } void vmovups(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F|T_EW0|T_YMM|T_EVEX, 0x10); } -void vmpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_W0|T_YMM, 0x42, imm); } void vmulpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x59); } void vmulps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x59); } void vmulsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX | T_ER_X | T_N8, 0x59); } @@ -2408,6 +2407,7 @@ void vmovsh(const Xmm& x1, const Xmm& x2, const Xmm& x3) { opAVX_X_X_XM(x1, x2, void vmovw(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x7E); } void vmovw(const Reg32e& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, r, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x7E); } void vmovw(const Xmm& x, const Operand& op) { if (!op.isREG(32|64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x6E); } +void vmpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F3A | T_YMM, 0x42, encoding, imm, T_66 | T_W0 | T_YMM, T_F3 | T_0F3A | T_EW0 | T_B32, 1); } void vmulnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x59); } void vmulph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x59); } void vmulsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | 
T_MUST_EVEX | T_ER_X | T_N2, 0x59); }