Skip to content

Commit

Permalink
support YMM embedded rounding of AVX10.2
Browse files Browse the repository at this point in the history
  • Loading branch information
herumi committed Oct 8, 2024
1 parent d993342 commit e7db8e6
Show file tree
Hide file tree
Showing 3 changed files with 252 additions and 11 deletions.
10 changes: 8 additions & 2 deletions test/Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
TARGET = make_nm normalize_prefix bad_address misc cvt_test cvt_test32 noexception misc32 detect_x32
TARGET = make_nm normalize_prefix bad_address misc cvt_test cvt_test32 noexception misc32 detect_x32 avx10_test
XBYAK_INC=../xbyak/xbyak.h ../xbyak/xbyak_mnemonic.h
UNAME_S=$(shell uname -s)
ifeq ($(shell ./detect_x32),x32)
Expand Down Expand Up @@ -57,6 +57,8 @@ noexception: noexception.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) $< -o $@ -fno-exceptions
apx: apx.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) apx.cpp -o $@
# Build the avx10_test binary from avx10_test.cpp; rebuilt when the xbyak headers in XBYAK_INC change.
# XBYAK64 is defined on the compiler command line for this target.
avx10_test: avx10_test.cpp $(XBYAK_INC)
$(CXX) $(CFLAGS) avx10_test.cpp -o $@ -DXBYAK64

test_nm: normalize_prefix $(TARGET)
$(MAKE) -C ../gen
Expand All @@ -81,6 +83,7 @@ ifneq ($(X32),1)
endif
./jmp64
./apx
./avx10_test
endif

test_avx: normalize_prefix
Expand All @@ -103,6 +106,9 @@ ifeq ($(BIT),64)
CXX=$(CXX) ./test_avx512.sh 64
endif

# Convenience target: build avx10_test if it is out of date, then run it.
test_avx10: avx10_test
./avx10_test

detect_x32: detect_x32.c
$(CC) $< -o $@

Expand All @@ -112,7 +118,7 @@ test: detect_x32
$(MAKE) test_avx512

clean:
$(RM) a.asm *.lst *.obj *.o $(TARGET) lib_run nm.cpp nm_frame make_512
$(RM) a.asm *.lst *.obj *.o $(TARGET) lib_run nm.cpp nm_frame make_512 avx10_test

lib_run: lib_test.cpp lib_run.cpp lib.h
$(CXX) $(CFLAGS) lib_run.cpp lib_test.cpp -o lib_run
Expand Down
230 changes: 230 additions & 0 deletions test/avx10_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
#include <stdio.h>
#include <string.h>
#include <string>
#include <xbyak/xbyak.h>
#include <xbyak/xbyak_util.h>
#include <cybozu/inttype.hpp>
#include <cybozu/test.hpp>
#include <algorithm>

using namespace Xbyak;

// Generates a fixed sequence of instructions whose last operand carries a
// T_rn_sae or T_sae modifier together with ymm registers, then checks that
// the produced bytes match the expected table exactly.
CYBOZU_TEST_AUTO(ymm_with_sae)
{
// The constructor emits every instruction once; the emission order must stay
// in sync with the order of the bytes in tbl below.
struct Code : Xbyak::CodeGenerator {
Code()
{
vaddpd(ymm1, ymm2, ymm3 |T_rn_sae);
vaddph(ymm1, ymm2, ymm3 |T_rn_sae);
vaddps(ymm1, ymm2, ymm3 |T_rn_sae);
vcmppd(k1, ymm2, ymm3 |T_sae, 3);
vcmpph(k1, ymm2, ymm3 |T_sae, 3);
vcmpps(k1, ymm2, ymm3 |T_sae, 3);
vcvtdq2ph(xmm1, ymm2 |T_rn_sae);
vcvtdq2ps(ymm1, ymm2 |T_rn_sae);
vcvtpd2dq(xmm1, ymm2 |T_rn_sae);
vcvtpd2ph(xmm1, ymm2 |T_rn_sae);
vcvtpd2ps(xmm1, ymm2 |T_rn_sae);
vcvtpd2qq(ymm1, ymm2 |T_rn_sae);
vcvtpd2udq(xmm1, ymm2 |T_rn_sae);
vcvtpd2uqq(ymm1, ymm2 |T_rn_sae);
vcvtph2dq(ymm1, xmm2 |T_rn_sae);
vcvtph2pd(ymm1, xmm2 |T_sae);
vcvtph2ps(ymm1, xmm2 |T_sae);
vcvtph2psx(ymm1, xmm2 |T_sae);
vcvtph2qq(ymm1, xmm2 |T_rn_sae);
vcvtph2udq(ymm1, xmm2 |T_rn_sae);
vcvtph2uqq(ymm1, xmm2 |T_rn_sae);
vcvtph2uw(ymm1, ymm2 |T_rn_sae);
vcvtph2w(ymm1, ymm2 |T_rn_sae);
vcvtps2dq(ymm1, ymm2 |T_rn_sae);
vcvtps2pd(ymm1, xmm2 |T_sae);
vcvtps2ph(xmm1, ymm2 |T_sae, 3);
vcvtps2phx(xmm1, ymm2 |T_rn_sae);
vcvtps2qq(ymm1, xmm2 |T_rn_sae);
vcvtps2udq(ymm1, ymm2 |T_rn_sae);
vcvtps2uqq(ymm1, xmm2 |T_rn_sae);
vcvtqq2pd(ymm1, ymm2 |T_rn_sae);
vcvtqq2ph(xmm1, ymm2 |T_rn_sae);
vcvtqq2ps(xmm1, ymm2 |T_rn_sae);
vcvttpd2dq(xmm1, ymm2 |T_sae);
vcvttpd2qq(ymm1, ymm2 |T_sae);
vcvttpd2udq(xmm1, ymm2 |T_sae);
vcvttpd2uqq(ymm1, ymm2 |T_sae);
vcvttph2dq(ymm1, xmm2 |T_sae);
vcvttph2qq(ymm1, xmm2 |T_sae);
vcvttph2udq(ymm1, xmm2 |T_sae);
vcvttph2uqq(ymm1, xmm2 |T_sae);
vcvttph2uw(ymm1, ymm2 |T_sae);
vcvttph2w(ymm1, ymm2 |T_sae);
vcvttps2dq(ymm1, ymm2 |T_sae);
vcvttps2qq(ymm1, xmm2 |T_sae);
vcvttps2udq(ymm1, ymm2 |T_sae);
vcvttps2uqq(ymm1, xmm2 |T_sae);
vcvtudq2ph(xmm1, ymm2 |T_rn_sae);
vcvtudq2ps(ymm1, ymm2 |T_rn_sae);
vcvtuqq2pd(ymm1, ymm2 |T_rn_sae);
vcvtuqq2ph(xmm1, ymm2 |T_rn_sae);
vcvtuqq2ps(xmm1, ymm2 |T_rn_sae);
vcvtuw2ph(ymm1, ymm2 |T_rn_sae);
vcvtw2ph(ymm1, ymm2 |T_rn_sae);
vdivpd(ymm1, ymm2, ymm3 |T_rn_sae);
vdivph(ymm1, ymm2, ymm3 |T_rn_sae);
vdivps(ymm1, ymm2, ymm3 |T_rn_sae);
vfcmaddcph(ymm1, ymm2, ymm3 |T_rn_sae);
vfcmulcph(ymm1, ymm2, ymm3 |T_rn_sae);
vfixupimmpd(ymm1, ymm2, ymm3 |T_sae, 3);
vfixupimmps(ymm1, ymm2, ymm3 |T_sae, 3);
vfmadd132pd(ymm1, ymm2, ymm3 |T_rn_sae);
vfmadd132ph(ymm1, ymm2, ymm3 |T_rn_sae);
vfmadd132ps(ymm1, ymm2, ymm3 |T_rn_sae);
vfmadd213pd(ymm1, ymm2, ymm3 |T_rn_sae);
vfmadd213ph(ymm1, ymm2, ymm3 |T_rn_sae);
vfmadd213ps(ymm1, ymm2, ymm3 |T_rn_sae);
vfmadd231pd(ymm1, ymm2, ymm3 |T_rn_sae);
vfmadd231ph(ymm1, ymm2, ymm3 |T_rn_sae);
vfmadd231ps(ymm1, ymm2, ymm3 |T_rn_sae);
vfmaddcph(ymm1, ymm2, ymm3 |T_rn_sae);
vfmaddsub132pd(ymm1, ymm2, ymm3 |T_rn_sae);
vfmaddsub132ph(ymm1, ymm2, ymm3 |T_rn_sae);
vfmaddsub132ps(ymm1, ymm2, ymm3 |T_rn_sae);
vfmaddsub213pd(ymm1, ymm2, ymm3 |T_rn_sae);
vfmaddsub213ph(ymm1, ymm2, ymm3 |T_rn_sae);
vfmaddsub213ps(ymm1, ymm2, ymm3 |T_rn_sae);
vfmaddsub231pd(ymm1, ymm2, ymm3 |T_rn_sae);
vfmaddsub231ph(ymm1, ymm2, ymm3 |T_rn_sae);
vfmaddsub231ps(ymm1, ymm2, ymm3 |T_rn_sae);
vfmsub132pd(ymm1, ymm2, ymm3 |T_rn_sae);
vfmsub132ph(ymm1, ymm2, ymm3 |T_rn_sae);
vfmsub132ps(ymm1, ymm2, ymm3 |T_rn_sae);
vfmsub213pd(ymm1, ymm2, ymm3 |T_rn_sae);
vfmsub213ph(ymm1, ymm2, ymm3 |T_rn_sae);
vfmsub213ps(ymm1, ymm2, ymm3 |T_rn_sae);
vfmsub231pd(ymm1, ymm2, ymm3 |T_rn_sae);
vfmsub231ph(ymm1, ymm2, ymm3 |T_rn_sae);
vfmsub231ps(ymm1, ymm2, ymm3 |T_rn_sae);
vfmsubadd132pd(ymm1, ymm2, ymm3 |T_rn_sae);
vfmsubadd132ph(ymm1, ymm2, ymm3 |T_rn_sae);
vfmsubadd132ps(ymm1, ymm2, ymm3 |T_rn_sae);
vfmsubadd213pd(ymm1, ymm2, ymm3 |T_rn_sae);
vfmsubadd213ph(ymm1, ymm2, ymm3 |T_rn_sae);
vfmsubadd213ps(ymm1, ymm2, ymm3 |T_rn_sae);
vfmsubadd231pd(ymm1, ymm2, ymm3 |T_rn_sae);
vfmsubadd231ph(ymm1, ymm2, ymm3 |T_rn_sae);
vfmsubadd231ps(ymm1, ymm2, ymm3 |T_rn_sae);
vfmulcph(ymm1, ymm2, ymm3 |T_rn_sae);
vfnmadd132pd(ymm1, ymm2, ymm3 |T_rn_sae);
vfnmadd132ph(ymm1, ymm2, ymm3 |T_rn_sae);
vfnmadd132ps(ymm1, ymm2, ymm3 |T_rn_sae);
vfnmadd213pd(ymm1, ymm2, ymm3 |T_rn_sae);
vfnmadd213ph(ymm1, ymm2, ymm3 |T_rn_sae);
vfnmadd213ps(ymm1, ymm2, ymm3 |T_rn_sae);
vfnmadd231pd(ymm1, ymm2, ymm3 |T_rn_sae);
vfnmadd231ph(ymm1, ymm2, ymm3 |T_rn_sae);
vfnmadd231ps(ymm1, ymm2, ymm3 |T_rn_sae);
vfnmsub132pd(ymm1, ymm2, ymm3 |T_rn_sae);
vfnmsub132ph(ymm1, ymm2, ymm3 |T_rn_sae);
vfnmsub132ps(ymm1, ymm2, ymm3 |T_rn_sae);
vfnmsub213pd(ymm1, ymm2, ymm3 |T_rn_sae);
vfnmsub213ph(ymm1, ymm2, ymm3 |T_rn_sae);
vfnmsub213ps(ymm1, ymm2, ymm3 |T_rn_sae);
vfnmsub231pd(ymm1, ymm2, ymm3 |T_rn_sae);
vfnmsub231ph(ymm1, ymm2, ymm3 |T_rn_sae);
vfnmsub231ps(ymm1, ymm2, ymm3 |T_rn_sae);
vgetexppd(ymm1, ymm2 |T_sae);
vgetexpph(ymm1, ymm2 |T_sae);
vgetexpps(ymm1, ymm2 |T_sae);
vgetmantpd(ymm1, ymm2 |T_sae, 3);
vgetmantph(ymm1, ymm2 |T_sae, 3);
vgetmantps(ymm1, ymm2 |T_sae, 3);
vmaxpd(ymm1, ymm2, ymm3 |T_sae);
vmaxph(ymm1, ymm2, ymm3 |T_sae);
vmaxps(ymm1, ymm2, ymm3 |T_sae);
vminpd(ymm1, ymm2, ymm3 |T_sae);
vminph(ymm1, ymm2, ymm3 |T_sae);
vminps(ymm1, ymm2, ymm3 |T_sae);
vmulpd(ymm1, ymm2, ymm3 |T_rn_sae);
vmulph(ymm1, ymm2, ymm3 |T_rn_sae);
vmulps(ymm1, ymm2, ymm3 |T_rn_sae);
vrangepd(ymm1, ymm2, ymm3 |T_sae, 3);
vrangeps(ymm1, ymm2, ymm3 |T_sae, 3);
vreducepd(ymm1, ymm2 |T_sae, 3);
vreduceph(ymm1, ymm2 |T_sae, 3);
vreduceps(ymm1, ymm2 |T_sae, 3);
vrndscalepd(ymm1, ymm2 |T_sae, 3);
vrndscaleph(ymm1, ymm2 |T_sae, 3);
vrndscaleps(ymm1, ymm2 |T_sae, 3);
vscalefpd(ymm1, ymm2, ymm3 |T_rn_sae);
vscalefph(ymm1, ymm2, ymm3 |T_rn_sae);
vscalefps(ymm1, ymm2, ymm3 |T_rn_sae);
vsqrtpd(ymm1, ymm2 |T_rn_sae);
vsqrtph(ymm1, ymm2 |T_rn_sae);
vsqrtps(ymm1, ymm2 |T_rn_sae);
vsubpd(ymm1, ymm2, ymm3 |T_rn_sae);
vsubph(ymm1, ymm2, ymm3 |T_rn_sae);
vsubps(ymm1, ymm2, ymm3 |T_rn_sae);
}
} c;
// Expected bytes for the whole sequence emitted by Code, concatenated in
// emission order. Adding, removing, or reordering an instruction above
// requires updating this table to match.
const uint8_t tbl[] = {
0x62, 0xf1, 0xe9, 0x18, 0x58, 0xcb, 0x62, 0xf5, 0x68, 0x18, 0x58, 0xcb, 0x62, 0xf1, 0x68, 0x18,
0x58, 0xcb, 0x62, 0xf1, 0xe9, 0x18, 0xc2, 0xcb, 0x03, 0x62, 0xf3, 0x68, 0x18, 0xc2, 0xcb, 0x03,
0x62, 0xf1, 0x68, 0x18, 0xc2, 0xcb, 0x03, 0x62, 0xf5, 0x78, 0x18, 0x5b, 0xca, 0x62, 0xf1, 0x78,
0x18, 0x5b, 0xca, 0x62, 0xf1, 0xfb, 0x18, 0xe6, 0xca, 0x62, 0xf5, 0xf9, 0x18, 0x5a, 0xca, 0x62,
0xf1, 0xf9, 0x18, 0x5a, 0xca, 0x62, 0xf1, 0xf9, 0x18, 0x7b, 0xca, 0x62, 0xf1, 0xf8, 0x18, 0x79,
0xca, 0x62, 0xf1, 0xf9, 0x18, 0x79, 0xca, 0x62, 0xf5, 0x79, 0x18, 0x5b, 0xca, 0x62, 0xf5, 0x78,
0x18, 0x5a, 0xca, 0x62, 0xf2, 0x79, 0x18, 0x13, 0xca, 0x62, 0xf6, 0x79, 0x18, 0x13, 0xca, 0x62,
0xf5, 0x79, 0x18, 0x7b, 0xca, 0x62, 0xf5, 0x78, 0x18, 0x79, 0xca, 0x62, 0xf5, 0x79, 0x18, 0x79,
0xca, 0x62, 0xf5, 0x78, 0x18, 0x7d, 0xca, 0x62, 0xf5, 0x79, 0x18, 0x7d, 0xca, 0x62, 0xf1, 0x79,
0x18, 0x5b, 0xca, 0x62, 0xf1, 0x78, 0x18, 0x5a, 0xca, 0x62, 0xf3, 0x79, 0x18, 0x1d, 0xd1, 0x03,
0x62, 0xf5, 0x79, 0x18, 0x1d, 0xca, 0x62, 0xf1, 0x79, 0x18, 0x7b, 0xca, 0x62, 0xf1, 0x78, 0x18,
0x79, 0xca, 0x62, 0xf1, 0x79, 0x18, 0x79, 0xca, 0x62, 0xf1, 0xfa, 0x18, 0xe6, 0xca, 0x62, 0xf5,
0xf8, 0x18, 0x5b, 0xca, 0x62, 0xf1, 0xf8, 0x18, 0x5b, 0xca, 0x62, 0xf1, 0xf9, 0x18, 0xe6, 0xca,
0x62, 0xf1, 0xf9, 0x18, 0x7a, 0xca, 0x62, 0xf1, 0xf8, 0x18, 0x78, 0xca, 0x62, 0xf1, 0xf9, 0x18,
0x78, 0xca, 0x62, 0xf5, 0x7a, 0x18, 0x5b, 0xca, 0x62, 0xf5, 0x79, 0x18, 0x7a, 0xca, 0x62, 0xf5,
0x78, 0x18, 0x78, 0xca, 0x62, 0xf5, 0x79, 0x18, 0x78, 0xca, 0x62, 0xf5, 0x78, 0x18, 0x7c, 0xca,
0x62, 0xf5, 0x79, 0x18, 0x7c, 0xca, 0x62, 0xf1, 0x7a, 0x18, 0x5b, 0xca, 0x62, 0xf1, 0x79, 0x18,
0x7a, 0xca, 0x62, 0xf1, 0x78, 0x18, 0x78, 0xca, 0x62, 0xf1, 0x79, 0x18, 0x78, 0xca, 0x62, 0xf5,
0x7b, 0x18, 0x7a, 0xca, 0x62, 0xf1, 0x7b, 0x18, 0x7a, 0xca, 0x62, 0xf1, 0xfa, 0x18, 0x7a, 0xca,
0x62, 0xf5, 0xfb, 0x18, 0x7a, 0xca, 0x62, 0xf1, 0xfb, 0x18, 0x7a, 0xca, 0x62, 0xf5, 0x7b, 0x18,
0x7d, 0xca, 0x62, 0xf5, 0x7a, 0x18, 0x7d, 0xca, 0x62, 0xf1, 0xe9, 0x18, 0x5e, 0xcb, 0x62, 0xf5,
0x68, 0x18, 0x5e, 0xcb, 0x62, 0xf1, 0x68, 0x18, 0x5e, 0xcb, 0x62, 0xf6, 0x6b, 0x18, 0x56, 0xcb,
0x62, 0xf6, 0x6b, 0x18, 0xd6, 0xcb, 0x62, 0xf3, 0xe9, 0x18, 0x54, 0xcb, 0x03, 0x62, 0xf3, 0x69,
0x18, 0x54, 0xcb, 0x03, 0x62, 0xf2, 0xe9, 0x18, 0x98, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0x98, 0xcb,
0x62, 0xf2, 0x69, 0x18, 0x98, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xa8, 0xcb, 0x62, 0xf6, 0x69, 0x18,
0xa8, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0xa8, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xb8, 0xcb, 0x62, 0xf6,
0x69, 0x18, 0xb8, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0xb8, 0xcb, 0x62, 0xf6, 0x6a, 0x18, 0x56, 0xcb,
0x62, 0xf2, 0xe9, 0x18, 0x96, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0x96, 0xcb, 0x62, 0xf2, 0x69, 0x18,
0x96, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xa6, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0xa6, 0xcb, 0x62, 0xf2,
0x69, 0x18, 0xa6, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xb6, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0xb6, 0xcb,
0x62, 0xf2, 0x69, 0x18, 0xb6, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0x9a, 0xcb, 0x62, 0xf6, 0x69, 0x18,
0x9a, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0x9a, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xaa, 0xcb, 0x62, 0xf6,
0x69, 0x18, 0xaa, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0xaa, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xba, 0xcb,
0x62, 0xf6, 0x69, 0x18, 0xba, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0xba, 0xcb, 0x62, 0xf2, 0xe9, 0x18,
0x97, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0x97, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0x97, 0xcb, 0x62, 0xf2,
0xe9, 0x18, 0xa7, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0xa7, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0xa7, 0xcb,
0x62, 0xf2, 0xe9, 0x18, 0xb7, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0xb7, 0xcb, 0x62, 0xf2, 0x69, 0x18,
0xb7, 0xcb, 0x62, 0xf6, 0x6a, 0x18, 0xd6, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0x9c, 0xcb, 0x62, 0xf6,
0x69, 0x18, 0x9c, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0x9c, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xac, 0xcb,
0x62, 0xf6, 0x69, 0x18, 0xac, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0xac, 0xcb, 0x62, 0xf2, 0xe9, 0x18,
0xbc, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0xbc, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0xbc, 0xcb, 0x62, 0xf2,
0xe9, 0x18, 0x9e, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0x9e, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0x9e, 0xcb,
0x62, 0xf2, 0xe9, 0x18, 0xae, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0xae, 0xcb, 0x62, 0xf2, 0x69, 0x18,
0xae, 0xcb, 0x62, 0xf2, 0xe9, 0x18, 0xbe, 0xcb, 0x62, 0xf6, 0x69, 0x18, 0xbe, 0xcb, 0x62, 0xf2,
0x69, 0x18, 0xbe, 0xcb, 0x62, 0xf2, 0xf9, 0x18, 0x42, 0xca, 0x62, 0xf6, 0x79, 0x18, 0x42, 0xca,
0x62, 0xf2, 0x79, 0x18, 0x42, 0xca, 0x62, 0xf3, 0xf9, 0x18, 0x26, 0xca, 0x03, 0x62, 0xf3, 0x78,
0x18, 0x26, 0xca, 0x03, 0x62, 0xf3, 0x79, 0x18, 0x26, 0xca, 0x03, 0x62, 0xf1, 0xe9, 0x18, 0x5f,
0xcb, 0x62, 0xf5, 0x68, 0x18, 0x5f, 0xcb, 0x62, 0xf1, 0x68, 0x18, 0x5f, 0xcb, 0x62, 0xf1, 0xe9,
0x18, 0x5d, 0xcb, 0x62, 0xf5, 0x68, 0x18, 0x5d, 0xcb, 0x62, 0xf1, 0x68, 0x18, 0x5d, 0xcb, 0x62,
0xf1, 0xe9, 0x18, 0x59, 0xcb, 0x62, 0xf5, 0x68, 0x18, 0x59, 0xcb, 0x62, 0xf1, 0x68, 0x18, 0x59,
0xcb, 0x62, 0xf3, 0xe9, 0x18, 0x50, 0xcb, 0x03, 0x62, 0xf3, 0x69, 0x18, 0x50, 0xcb, 0x03, 0x62,
0xf3, 0xf9, 0x18, 0x56, 0xca, 0x03, 0x62, 0xf3, 0x78, 0x18, 0x56, 0xca, 0x03, 0x62, 0xf3, 0x79,
0x18, 0x56, 0xca, 0x03, 0x62, 0xf3, 0xf9, 0x18, 0x09, 0xca, 0x03, 0x62, 0xf3, 0x78, 0x18, 0x08,
0xca, 0x03, 0x62, 0xf3, 0x79, 0x18, 0x08, 0xca, 0x03, 0x62, 0xf2, 0xe9, 0x18, 0x2c, 0xcb, 0x62,
0xf6, 0x69, 0x18, 0x2c, 0xcb, 0x62, 0xf2, 0x69, 0x18, 0x2c, 0xcb, 0x62, 0xf1, 0xf9, 0x18, 0x51,
0xca, 0x62, 0xf5, 0x78, 0x18, 0x51, 0xca, 0x62, 0xf1, 0x78, 0x18, 0x51, 0xca, 0x62, 0xf1, 0xe9,
0x18, 0x5c, 0xcb, 0x62, 0xf5, 0x68, 0x18, 0x5c, 0xcb, 0x62, 0xf1, 0x68, 0x18, 0x5c, 0xcb,
};
const size_t n = sizeof(tbl) / sizeof(tbl[0]);
// Check the total length first, then compare the generated code with tbl
// element by element.
CYBOZU_TEST_EQUAL(c.getSize(), n);
CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
}
23 changes: 14 additions & 9 deletions xbyak/xbyak.h
Original file line number Diff line number Diff line change
Expand Up @@ -1867,16 +1867,19 @@ class CodeGenerator : public CodeArray {
}
db(code);
}
void verifySAE(const Reg& r, uint64_t type) const
// Allow YMM embedded rounding for AVX10.2 to minimize flag modifications
// Validates a T_SAE modifier against the operand sizes of r and b and the
// T_SAE_X / T_SAE_Y / T_SAE_Z bits of type.
// Returns true for the size combinations accepted by the first bool check below
// (the visible caller stores this as isUzero and then sets U = 0), false for
// the combinations accepted by the second bool check, and reports
// ERR_SAE_IS_INVALID for anything else.
bool verifySAE(const Reg& r, const Reg& b, uint64_t type) const
{
if (((type & T_SAE_X) && r.isXMM()) || ((type & T_SAE_Y) && r.isYMM()) || ((type & T_SAE_Z) && r.isZMM())) return;
XBYAK_THROW(ERR_SAE_IS_INVALID)
// T_SAE_X with (r = YMM, b = XMM), T_SAE_Y with b = XMM, or T_SAE_Z with b = YMM.
if (((type & T_SAE_X) && (r.isYMM() && b.isXMM())) || ((type & T_SAE_Y) && b.isXMM()) || ((type & T_SAE_Z) && b.isYMM())) return true;
// T_SAE_X with b = XMM, T_SAE_Y with b = YMM, or T_SAE_Z with b = ZMM.
if (((type & T_SAE_X) && b.isXMM()) || ((type & T_SAE_Y) && b.isYMM()) || ((type & T_SAE_Z) && b.isZMM())) return false;
XBYAK_THROW_RET(ERR_SAE_IS_INVALID, false)
}
void verifyER(const Reg& r, uint64_t type) const
// Validates an embedded-rounding modifier against the operand sizes of r and b
// and the T_ER_R / T_ER_X / T_ER_Y / T_ER_Z bits of type.
// Returns true for the size combinations accepted by the first vector check
// below (the visible caller stores this as isUzero and then sets U = 0), false
// for a 32/64-bit general register with T_ER_R or for the combinations accepted
// by the second vector check, and reports ERR_ER_IS_INVALID for anything else.
bool verifyER(const Reg& r, const Reg& b, uint64_t type) const
{
if ((type & T_ER_R) && r.isREG(32|64)) return;
if (((type & T_ER_X) && r.isXMM()) || ((type & T_ER_Y) && r.isYMM()) || ((type & T_ER_Z) && r.isZMM())) return;
XBYAK_THROW(ERR_ER_IS_INVALID)
if ((type & T_ER_R) && b.isREG(32|64)) return false;
// T_ER_X with (r = YMM, b = XMM), T_ER_Y with b = XMM, or T_ER_Z with b = YMM.
if (((type & T_ER_X) && (r.isYMM() && b.isXMM())) || ((type & T_ER_Y) && b.isXMM()) || ((type & T_ER_Z) && b.isYMM())) return true;
// T_ER_X with b = XMM, T_ER_Y with b = YMM, or T_ER_Z with b = ZMM.
if (((type & T_ER_X) && b.isXMM()) || ((type & T_ER_Y) && b.isYMM()) || ((type & T_ER_Z) && b.isZMM())) return false;
// Keep reporting the ER-specific error code, as the void version did;
// ERR_SAE_IS_INVALID belongs to verifySAE.
XBYAK_THROW_RET(ERR_ER_IS_INVALID, false)
}
// (a, b, c) contains non zero two or three values then err
int verifyDuplicate(int a, int b, int c, int err)
Expand Down Expand Up @@ -1905,11 +1908,13 @@ class CodeGenerator : public CodeArray {
int rounding = verifyDuplicate(reg.getRounding(), base.getRounding(), v ? v->getRounding() : 0, ERR_ROUNDING_IS_ALREADY_SET);
int disp8N = 1;
if (rounding) {
bool isUzero = false;
if (rounding == EvexModifierRounding::T_SAE) {
verifySAE(base, type); LL = 0;
isUzero = verifySAE(reg, base, type); LL = 0;
} else {
verifyER(base, type); LL = rounding - 1;
isUzero = verifyER(reg, base, type); LL = rounding - 1;
}
if (isUzero) U = 0; // avx10.2 Evex.U
b = true;
} else {
if (v) VL = (std::max)(VL, v->getBit());
Expand Down

0 comments on commit e7db8e6

Please sign in to comment.