From e7db8e6655cfe1779171ccc9c2077fe717339b09 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Tue, 8 Oct 2024 16:59:02 +0900 Subject: [PATCH] support YMM embedded rounding of AVX10.2 --- test/Makefile | 10 +- test/avx10_test.cpp | 230 ++++++++++++++++++++++++++++++++++++++++++++ xbyak/xbyak.h | 23 +++-- 3 files changed, 252 insertions(+), 11 deletions(-) create mode 100644 test/avx10_test.cpp diff --git a/test/Makefile b/test/Makefile index d2d83099..862c110a 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,4 +1,4 @@ -TARGET = make_nm normalize_prefix bad_address misc cvt_test cvt_test32 noexception misc32 detect_x32 +TARGET = make_nm normalize_prefix bad_address misc cvt_test cvt_test32 noexception misc32 detect_x32 avx10_test XBYAK_INC=../xbyak/xbyak.h ../xbyak/xbyak_mnemonic.h UNAME_S=$(shell uname -s) ifeq ($(shell ./detect_x32),x32) @@ -57,6 +57,8 @@ noexception: noexception.cpp $(XBYAK_INC) $(CXX) $(CFLAGS) $< -o $@ -fno-exceptions apx: apx.cpp $(XBYAK_INC) $(CXX) $(CFLAGS) apx.cpp -o $@ +avx10_test: avx10_test.cpp $(XBYAK_INC) + $(CXX) $(CFLAGS) avx10_test.cpp -o $@ -DXBYAK64 test_nm: normalize_prefix $(TARGET) $(MAKE) -C ../gen @@ -81,6 +83,7 @@ ifneq ($(X32),1) endif ./jmp64 ./apx + ./avx10_test endif test_avx: normalize_prefix @@ -103,6 +106,9 @@ ifeq ($(BIT),64) CXX=$(CXX) ./test_avx512.sh 64 endif +test_avx10: avx10_test + ./avx10_test + detect_x32: detect_x32.c $(CC) $< -o $@ @@ -112,7 +118,7 @@ test: detect_x32 $(MAKE) test_avx512 clean: - $(RM) a.asm *.lst *.obj *.o $(TARGET) lib_run nm.cpp nm_frame make_512 + $(RM) a.asm *.lst *.obj *.o $(TARGET) lib_run nm.cpp nm_frame make_512 avx10_test lib_run: lib_test.cpp lib_run.cpp lib.h $(CXX) $(CFLAGS) lib_run.cpp lib_test.cpp -o lib_run diff --git a/test/avx10_test.cpp b/test/avx10_test.cpp new file mode 100644 index 00000000..9a4a8480 --- /dev/null +++ b/test/avx10_test.cpp @@ -0,0 +1,230 @@ +#include <stdio.h> +#include <stdint.h> +#include <xbyak/xbyak.h> +#include <cybozu/test.hpp> + +using namespace Xbyak; + +CYBOZU_TEST_AUTO(ymm_with_sae) +{ + struct Code : Xbyak::CodeGenerator { + Code() + { + vaddpd(ymm1, ymm2, ymm3 |T_rn_sae); + vaddph(ymm1, ymm2, ymm3 |T_rn_sae); + vaddps(ymm1, ymm2, ymm3 |T_rn_sae); + vcmppd(k1, ymm2, ymm3 |T_sae, 3); + vcmpph(k1, ymm2, ymm3 |T_sae, 3); + vcmpps(k1, ymm2, ymm3 |T_sae, 3); + vcvtdq2ph(xmm1, ymm2 |T_rn_sae); + vcvtdq2ps(ymm1, ymm2 |T_rn_sae); + vcvtpd2dq(xmm1, ymm2 |T_rn_sae); + vcvtpd2ph(xmm1, ymm2 |T_rn_sae); + vcvtpd2ps(xmm1, ymm2 |T_rn_sae); + vcvtpd2qq(ymm1, ymm2 |T_rn_sae); + vcvtpd2udq(xmm1, ymm2 |T_rn_sae); + vcvtpd2uqq(ymm1, ymm2 |T_rn_sae); + vcvtph2dq(ymm1, xmm2 |T_rn_sae); + vcvtph2pd(ymm1, xmm2 |T_sae); + vcvtph2ps(ymm1, xmm2 |T_sae); + vcvtph2psx(ymm1, xmm2 |T_sae); + vcvtph2qq(ymm1, xmm2 |T_rn_sae); + vcvtph2udq(ymm1, xmm2 |T_rn_sae); + vcvtph2uqq(ymm1, xmm2 |T_rn_sae); + vcvtph2uw(ymm1, ymm2 |T_rn_sae); + vcvtph2w(ymm1, ymm2 |T_rn_sae); + vcvtps2dq(ymm1, ymm2 |T_rn_sae); + vcvtps2pd(ymm1, xmm2 |T_sae); + vcvtps2ph(xmm1, ymm2 |T_sae, 3); + vcvtps2phx(xmm1, ymm2 |T_rn_sae); + vcvtps2qq(ymm1, xmm2 |T_rn_sae); + vcvtps2udq(ymm1, ymm2 |T_rn_sae); + vcvtps2uqq(ymm1, xmm2 |T_rn_sae); + vcvtqq2pd(ymm1, ymm2 |T_rn_sae); + vcvtqq2ph(xmm1, ymm2 |T_rn_sae); + vcvtqq2ps(xmm1, ymm2 |T_rn_sae); + vcvttpd2dq(xmm1, ymm2 |T_sae); + vcvttpd2qq(ymm1, ymm2 |T_sae); + vcvttpd2udq(xmm1, ymm2 |T_sae); + vcvttpd2uqq(ymm1, ymm2 |T_sae); + vcvttph2dq(ymm1, xmm2 |T_sae); + vcvttph2qq(ymm1, xmm2 |T_sae); + vcvttph2udq(ymm1, xmm2 |T_sae); + vcvttph2uqq(ymm1, xmm2 |T_sae); + vcvttph2uw(ymm1, ymm2 |T_sae); +
vcvttph2w(ymm1, ymm2 |T_sae); + vcvttps2dq(ymm1, ymm2 |T_sae); + vcvttps2qq(ymm1, xmm2 |T_sae); + vcvttps2udq(ymm1, ymm2 |T_sae); + vcvttps2uqq(ymm1, xmm2 |T_sae); + vcvtudq2ph(xmm1, ymm2 |T_rn_sae); + vcvtudq2ps(ymm1, ymm2 |T_rn_sae); + vcvtuqq2pd(ymm1, ymm2 |T_rn_sae); + vcvtuqq2ph(xmm1, ymm2 |T_rn_sae); + vcvtuqq2ps(xmm1, ymm2 |T_rn_sae); + vcvtuw2ph(ymm1, ymm2 |T_rn_sae); + vcvtw2ph(ymm1, ymm2 |T_rn_sae); + vdivpd(ymm1, ymm2, ymm3 |T_rn_sae); + vdivph(ymm1, ymm2, ymm3 |T_rn_sae); + vdivps(ymm1, ymm2, ymm3 |T_rn_sae); + vfcmaddcph(ymm1, ymm2, ymm3 |T_rn_sae); + vfcmulcph(ymm1, ymm2, ymm3 |T_rn_sae); + vfixupimmpd(ymm1, ymm2, ymm3 |T_sae, 3); + vfixupimmps(ymm1, ymm2, ymm3 |T_sae, 3); + vfmadd132pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfmadd132ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfmadd132ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfmadd213pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfmadd213ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfmadd213ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfmadd231pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfmadd231ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfmadd231ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfmaddcph(ymm1, ymm2, ymm3 |T_rn_sae); + vfmaddsub132pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfmaddsub132ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfmaddsub132ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfmaddsub213pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfmaddsub213ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfmaddsub213ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfmaddsub231pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfmaddsub231ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfmaddsub231ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsub132pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsub132ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsub132ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsub213pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsub213ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsub213ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsub231pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsub231ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsub231ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsubadd132pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsubadd132ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsubadd132ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsubadd213pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsubadd213ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsubadd213ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsubadd231pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsubadd231ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfmsubadd231ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfmulcph(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmadd132pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmadd132ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmadd132ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmadd213pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmadd213ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmadd213ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmadd231pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmadd231ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmadd231ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmsub132pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmsub132ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmsub132ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmsub213pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmsub213ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmsub213ps(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmsub231pd(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmsub231ph(ymm1, ymm2, ymm3 |T_rn_sae); + vfnmsub231ps(ymm1, ymm2, ymm3 |T_rn_sae); + vgetexppd(ymm1, ymm2 |T_sae); + vgetexpph(ymm1, ymm2 |T_sae); + vgetexpps(ymm1, ymm2 |T_sae); + vgetmantpd(ymm1, ymm2 |T_sae, 3); + vgetmantph(ymm1, ymm2 |T_sae, 3); + vgetmantps(ymm1, ymm2 |T_sae, 3); + vmaxpd(ymm1, ymm2, ymm3 |T_sae); + vmaxph(ymm1, ymm2, ymm3 |T_sae); + vmaxps(ymm1, ymm2, ymm3 |T_sae); + vminpd(ymm1, ymm2, ymm3 |T_sae); + vminph(ymm1, ymm2, ymm3 |T_sae); + 
vminps(ymm1, ymm2, ymm3 |T_sae); + vmulpd(ymm1, ymm2, ymm3 |T_rn_sae); + vmulph(ymm1, ymm2, ymm3 |T_rn_sae); + vmulps(ymm1, ymm2, ymm3 |T_rn_sae); + vrangepd(ymm1, ymm2, ymm3 |T_sae, 3); + vrangeps(ymm1, ymm2, ymm3 |T_sae, 3); + vreducepd(ymm1, ymm2 |T_sae, 3); + vreduceph(ymm1, ymm2 |T_sae, 3); + vreduceps(ymm1, ymm2 |T_sae, 3); + vrndscalepd(ymm1, ymm2 |T_sae, 3); + vrndscaleph(ymm1, ymm2 |T_sae, 3); + vrndscaleps(ymm1, ymm2 |T_sae, 3); + vscalefpd(ymm1, ymm2, ymm3 |T_rn_sae); + vscalefph(ymm1, ymm2, ymm3 |T_rn_sae); + vscalefps(ymm1, ymm2, ymm3 |T_rn_sae); + vsqrtpd(ymm1, ymm2 |T_rn_sae); + vsqrtph(ymm1, ymm2 |T_rn_sae); + vsqrtps(ymm1, ymm2 |T_rn_sae); + vsubpd(ymm1, ymm2, ymm3 |T_rn_sae); + vsubph(ymm1, ymm2, ymm3 |T_rn_sae); + vsubps(ymm1, ymm2, ymm3 |T_rn_sae); + } + } c; + const uint8_t tbl[] = { + 0x62, 0xf1, 0xe9, 0x18, 0x58, 0xcb, 0x62, 0xf5, 0x68, 0x18, 0x58, 0xcb, 0x62, 0xf1, 0x68, 0x18, + 0x58, 0xcb, 0x62, 0xf1, 0xe9, 0x18, 0xc2, 0xcb, 0x03, 0x62, 0xf3, 0x68, 0x18, 0xc2, 0xcb, 0x03, + 0x62, 0xf1, 0x68, 0x18, 0xc2, 0xcb, 0x03, 0x62, 0xf5, 0x78, 0x18, 0x5b, 0xca, 0x62, 0xf1, 0x78, + 0x18, 0x5b, 0xca, 0x62, 0xf1, 0xfb, 0x18, 0xe6, 0xca, 0x62, 0xf5, 0xf9, 0x18, 0x5a, 0xca, 0x62, + 0xf1, 0xf9, 0x18, 0x5a, 0xca, 0x62, 0xf1, 0xf9, 0x18, 0x7b, 0xca, 0x62, 0xf1, 0xf8, 0x18, 0x79, + 0xca, 0x62, 0xf1, 0xf9, 0x18, 0x79, 0xca, 0x62, 0xf5, 0x79, 0x18, 0x5b, 0xca, 0x62, 0xf5, 0x78, + 0x18, 0x5a, 0xca, 0x62, 0xf2, 0x79, 0x18, 0x13, 0xca, 0x62, 0xf6, 0x79, 0x18, 0x13, 0xca, 0x62, + 0xf5, 0x79, 0x18, 0x7b, 0xca, 0x62, 0xf5, 0x78, 0x18, 0x79, 0xca, 0x62, 0xf5, 0x79, 0x18, 0x79, + 0xca, 0x62, 0xf5, 0x78, 0x18, 0x7d, 0xca, 0x62, 0xf5, 0x79, 0x18, 0x7d, 0xca, 0x62, 0xf1, 0x79, + 0x18, 0x5b, 0xca, 0x62, 0xf1, 0x78, 0x18, 0x5a, 0xca, 0x62, 0xf3, 0x79, 0x18, 0x1d, 0xd1, 0x03, + 0x62, 0xf5, 0x79, 0x18, 0x1d, 0xca, 0x62, 0xf1, 0x79, 0x18, 0x7b, 0xca, 0x62, 0xf1, 0x78, 0x18, + 0x79, 0xca, 0x62, 0xf1, 0x79, 0x18, 0x79, 0xca, 0x62, 0xf1, 0xfa, 0x18, 0xe6, 0xca, 0x62, 0xf5, + 0xf8, 0x18, 0x5b, 0xca, 0x62, 0xf1, 0xf8, 0x18, 0x5b, 0xca, 0x62, 0xf1, 0xf9, 0x18, 0xe6, 0xca, + 0x62, 0xf1, 0xf9, 0x18, 0x7a, 0xca, 0x62, 0xf1, 0xf8, 0x18, 0x78, 0xca, 0x62, 0xf1, 0xf9, 0x18, + 0x78, 0xca, 0x62, 0xf5, 0x7a, 0x18, 0x5b, 0xca, 0x62, 0xf5, 0x79, 0x18, 0x7a, 0xca, 0x62, 0xf5, + 0x78, 0x18, 0x78, 0xca, 0x62, 0xf5, 0x79, 0x18, 0x78, 0xca, 0x62, 0xf5, 0x78, 0x18, 0x7c, 0xca, + 0x62, 0xf5, 0x79, 0x18, 0x7c, 0xca, 0x62, 0xf1, 0x7a, 0x18, 0x5b, 0xca, 0x62, 0xf1, 0x79, 0x18, + 0x7a, 0xca, 0x62, 0xf1, 0x78, 0x18, 0x78, 0xca, 0x62, 0xf1, 0x79, 0x18, 0x78, 0xca, 0x62, 0xf5, + 0x7b, 0x18, 0x7a, 0xca, 0x62, 0xf1, 0x7b, 0x18, 0x7a, 0xca, 0x62, 0xf1, 0xfa, 0x18, 0x7a, 0xca, + 0x62, 0xf5, 0xfb, 0x18, 0x7a, 0xca, 0x62, 0xf1, 0xfb, 0x18, 0x7a, 0xca, 0x62, 0xf5, 0x7b, 0x18, + 0x7d, 0xca, 0x62, 0xf5, 0x7a, 0x18, 0x7d, 0xca, 0x62, 0xf1, 0xe9, 0x18, 0x5e, 0xcb, 0x62, 0xf5, + 0x68, 0x18, 0x5e, 0xcb, 0x62, 0xf1, 0x68, 0x18, 0x5e, 0xcb, 0x62, 0xf6, 0x6b, 0x18, 0x56, 0xcb, + 0x62, 0xf6, 0x6b, 0x18, 0xd6, 0xcb, 0x62, 0xf3, 0xe9, 0x18, 0x54, 0xcb, 0x03, 0x62, 0xf3, 0x69, + 0x18, 0x54, 0xcb, 0x03, 0x62, 0xf2, 0xe9, 0x18, 0x98, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0x98, 0xcb, + 0x62, 0xf2, 0x69, 0x18, 0x98, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xa8, 0xcb, 0x62, 0xf6, 0x69, 0x18, + 0xa8, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0xa8, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xb8, 0xcb, 0x62, 0xf6, + 0x69, 0x18, 0xb8, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0xb8, 0xcb, 0x62, 0xf6, 0x6a, 0x18, 0x56, 0xcb, + 0x62, 0xf2, 0xe9, 0x18, 0x96, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0x96, 0xcb, 0x62, 0xf2, 0x69, 
0x18, + 0x96, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xa6, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0xa6, 0xcb, 0x62, 0xf2, + 0x69, 0x18, 0xa6, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xb6, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0xb6, 0xcb, + 0x62, 0xf2, 0x69, 0x18, 0xb6, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0x9a, 0xcb, 0x62, 0xf6, 0x69, 0x18, + 0x9a, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0x9a, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xaa, 0xcb, 0x62, 0xf6, + 0x69, 0x18, 0xaa, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0xaa, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xba, 0xcb, + 0x62, 0xf6, 0x69, 0x18, 0xba, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0xba, 0xcb, 0x62, 0xf2, 0xe9, 0x18, + 0x97, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0x97, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0x97, 0xcb, 0x62, 0xf2, + 0xe9, 0x18, 0xa7, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0xa7, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0xa7, 0xcb, + 0x62, 0xf2, 0xe9, 0x18, 0xb7, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0xb7, 0xcb, 0x62, 0xf2, 0x69, 0x18, + 0xb7, 0xcb, 0x62, 0xf6, 0x6a, 0x18, 0xd6, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0x9c, 0xcb, 0x62, 0xf6, + 0x69, 0x18, 0x9c, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0x9c, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xac, 0xcb, + 0x62, 0xf6, 0x69, 0x18, 0xac, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0xac, 0xcb, 0x62, 0xf2, 0xe9, 0x18, + 0xbc, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0xbc, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0xbc, 0xcb, 0x62, 0xf2, + 0xe9, 0x18, 0x9e, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0x9e, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0x9e, 0xcb, + 0x62, 0xf2, 0xe9, 0x18, 0xae, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0xae, 0xcb, 0x62, 0xf2, 0x69, 0x18, + 0xae, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xbe, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0xbe, 0xcb, 0x62, 0xf2, + 0x69, 0x18, 0xbe, 0xcb, 0x62, 0xf2, 0xf9, 0x18, 0x42, 0xca, 0x62, 0xf6, 0x79, 0x18, 0x42, 0xca, + 0x62, 0xf2, 0x79, 0x18, 0x42, 0xca, 0x62, 0xf3, 0xf9, 0x18, 0x26, 0xca, 0x03, 0x62, 0xf3, 0x78, + 0x18, 0x26, 0xca, 0x03, 0x62, 0xf3, 0x79, 0x18, 0x26, 0xca, 0x03, 0x62, 0xf1, 0xe9, 0x18, 0x5f, + 0xcb, 0x62, 0xf5, 0x68, 0x18, 0x5f, 0xcb, 0x62, 0xf1, 0x68, 0x18, 0x5f, 0xcb, 0x62, 0xf1, 0xe9, + 0x18, 0x5d, 0xcb, 0x62, 0xf5, 0x68, 0x18, 0x5d, 0xcb, 0x62, 0xf1, 0x68, 0x18, 0x5d, 0xcb, 0x62, + 0xf1, 0xe9, 0x18, 0x59, 0xcb, 0x62, 0xf5, 0x68, 0x18, 0x59, 0xcb, 0x62, 0xf1, 0x68, 0x18, 0x59, + 0xcb, 0x62, 0xf3, 0xe9, 0x18, 0x50, 0xcb, 0x03, 0x62, 0xf3, 0x69, 0x18, 0x50, 0xcb, 0x03, 0x62, + 0xf3, 0xf9, 0x18, 0x56, 0xca, 0x03, 0x62, 0xf3, 0x78, 0x18, 0x56, 0xca, 0x03, 0x62, 0xf3, 0x79, + 0x18, 0x56, 0xca, 0x03, 0x62, 0xf3, 0xf9, 0x18, 0x09, 0xca, 0x03, 0x62, 0xf3, 0x78, 0x18, 0x08, + 0xca, 0x03, 0x62, 0xf3, 0x79, 0x18, 0x08, 0xca, 0x03, 0x62, 0xf2, 0xe9, 0x18, 0x2c, 0xcb, 0x62, + 0xf6, 0x69, 0x18, 0x2c, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0x2c, 0xcb, 0x62, 0xf1, 0xf9, 0x18, 0x51, + 0xca, 0x62, 0xf5, 0x78, 0x18, 0x51, 0xca, 0x62, 0xf1, 0x78, 0x18, 0x51, 0xca, 0x62, 0xf1, 0xe9, + 0x18, 0x5c, 0xcb, 0x62, 0xf5, 0x68, 0x18, 0x5c, 0xcb, 0x62, 0xf1, 0x68, 0x18, 0x5c, 0xcb, + }; + const size_t n = sizeof(tbl) / sizeof(tbl[0]); + CYBOZU_TEST_EQUAL(c.getSize(), n); + CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); +} diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h index d53b37bf..57ec8b68 100644 --- a/xbyak/xbyak.h +++ b/xbyak/xbyak.h @@ -1867,16 +1867,19 @@ class CodeGenerator : public CodeArray { } db(code); } - void verifySAE(const Reg& r, uint64_t type) const + // Allow YMM embedded rounding for AVX10.2 to minimize flag modifications + bool verifySAE(const Reg& r, const Reg& b, uint64_t type) const { - if (((type & T_SAE_X) && r.isXMM()) || ((type & T_SAE_Y) && r.isYMM()) || ((type & T_SAE_Z) && r.isZMM())) return; - XBYAK_THROW(ERR_SAE_IS_INVALID) + if (((type & T_SAE_X) && 
(r.isYMM() && b.isXMM())) || ((type & T_SAE_Y) && b.isXMM()) || ((type & T_SAE_Z) && b.isYMM())) return true; + if (((type & T_SAE_X) && b.isXMM()) || ((type & T_SAE_Y) && b.isYMM()) || ((type & T_SAE_Z) && b.isZMM())) return false; + XBYAK_THROW_RET(ERR_SAE_IS_INVALID, false) } - void verifyER(const Reg& r, uint64_t type) const + bool verifyER(const Reg& r, const Reg& b, uint64_t type) const { - if ((type & T_ER_R) && r.isREG(32|64)) return; - if (((type & T_ER_X) && r.isXMM()) || ((type & T_ER_Y) && r.isYMM()) || ((type & T_ER_Z) && r.isZMM())) return; - XBYAK_THROW(ERR_ER_IS_INVALID) + if ((type & T_ER_R) && b.isREG(32|64)) return false; + if (((type & T_ER_X) && (r.isYMM() && b.isXMM())) || ((type & T_ER_Y) && b.isXMM()) || ((type & T_ER_Z) && b.isYMM())) return true; + if (((type & T_ER_X) && b.isXMM()) || ((type & T_ER_Y) && b.isYMM()) || ((type & T_ER_Z) && b.isZMM())) return false; + XBYAK_THROW_RET(ERR_ER_IS_INVALID, false) } // (a, b, c) contains non zero two or three values then err int verifyDuplicate(int a, int b, int c, int err) @@ -1905,11 +1908,13 @@ class CodeGenerator : public CodeArray { int rounding = verifyDuplicate(reg.getRounding(), base.getRounding(), v ? v->getRounding() : 0, ERR_ROUNDING_IS_ALREADY_SET); int disp8N = 1; if (rounding) { + bool isUzero = false; if (rounding == EvexModifierRounding::T_SAE) { - verifySAE(base, type); LL = 0; + isUzero = verifySAE(reg, base, type); LL = 0; } else { - verifyER(base, type); LL = rounding - 1; + isUzero = verifyER(reg, base, type); LL = rounding - 1; } + if (isUzero) U = 0; // avx10.2 Evex.U b = true; } else { if (v) VL = (std::max)(VL, v->getBit());
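
Usage note (not part of the patch): a minimal standalone sketch of the syntax this change enables, assuming a 64-bit build with the xbyak headers on the include path. With this patch, attaching |T_rn_sae (or the other embedded-rounding/SAE modifiers) to a YMM operand encodes the AVX10.2 form with EVEX.U = 0 instead of throwing ERR_SAE_IS_INVALID; the bytes printed below should match the first six entries of tbl[] in avx10_test.cpp above (0x62, 0xf1, 0xe9, 0x18, 0x58, 0xcb).

#include <stdio.h>
#include <xbyak/xbyak.h>

// Emit one AVX10.2 YMM instruction with embedded rounding and dump its encoding.
struct RoundingDemo : Xbyak::CodeGenerator {
	RoundingDemo()
	{
		// round-to-nearest embedded rounding on a 256-bit operation
		vaddpd(ymm1, ymm2, ymm3 | T_rn_sae);
	}
};

int main()
{
	RoundingDemo c;
	for (size_t i = 0; i < c.getSize(); i++) {
		printf("%02x ", c.getCode()[i]);
	}
	printf("\n");
	return 0;
}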